diff --git a/.github/workflows/lint-xml.yml b/.github/workflows/lint-xml.yml
new file mode 100644
index 0000000..64ae822
--- /dev/null
+++ b/.github/workflows/lint-xml.yml
@@ -0,0 +1,36 @@
+name: Lint XML
+
+on:
+  push:
+    branches:
+      - '**'  # '**' matches every branch, including main
+  pull_request:
+    branches:
+      - '**'
+
+jobs:
+  lint-xml:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+
+      - name: Install dependencies
+        run: pip install lxml
+
+      - name: Lint XML references
+        run: |
+          # Adjust the command/path to your script as needed
+          python scripts/lint_verweise.py \
+            meta.xml \
+            references.xml \
+            briefe.xml \
+            edits.xml \
+            traditions.xml \
+            Marginal-Kommentar.xml \
+            --register Register-Kommentar.xml Bibel-Kommentar.xml forschung.xml
diff --git a/scripts/lint_verweise.py b/scripts/lint_verweise.py
new file mode 100644
index 0000000..0c3b99a
--- /dev/null
+++ b/scripts/lint_verweise.py
@@ -0,0 +1,577 @@
+#!/usr/bin/env python3
+
+"""
+Single integrated validator script with structured error handling
+(GitHub Actions annotations), plus checks for:
+
+  1) Cross-references (sender/receiver/location) in meta.xml
+  2) page/line merges from briefe.xml & traditions.xml for
+     <intlink letter= page= line=>
+  3) <hand>/<edit>/<app> references
+  4) <kommentar> and <subsection> in registers (must have a <lemma>)
+  5) IDs must be unique among all kommentars or all subsections
+     across registers
+  6) <link ref=/subref=> must match a valid kommentar/subsection ID
+"""
+
+import argparse
+import re  # NOTE(review): appears unused here — confirm before removing
+import sys
+from collections import defaultdict
+
+from lxml import etree
+
+##############################################################################
+# Basic parse / line info
+##############################################################################
+
+def parse_xml(file_path):
+    """
+    Parse *file_path* with lxml.etree (keeping line numbers) and return the
+    ElementTree.
+
+    Exits with status 1 on XML syntax errors or unreadable files — the
+    desired behaviour for a CI lint step.
+    """
+    try:
+        parser = etree.XMLParser(remove_blank_text=False)
+        return etree.parse(file_path, parser)
+    except etree.XMLSyntaxError as exc:
+        print(f"Error parsing {file_path}: {exc}")
+        sys.exit(1)
+    except OSError:
+        print(f"Error: File not found - {file_path}")
+        sys.exit(1)
+
+def get_line_number(elem):
+    """Return the sourceline of an lxml element, or 'Unknown'."""
+    return elem.sourceline if getattr(elem, 'sourceline', None) else "Unknown"
+
+def record_error(errors, file_path, line_no, message):
+    """Append one structured error record ({file, line, message})."""
+    errors.append({"file": file_path, "line": line_no, "message": message})
+
+##############################################################################
+# Merging letter/page/line from briefe.xml + traditions.xml
+##############################################################################
+
+def build_letter_page_line_map_brief(doc_root):
+    """
+    Build letter_pages[letter][page] = {line, ...} from briefe.xml.
+
+    A <page> is allowed to continue across <letterText> transitions unless
+    a new <page> is encountered (current_page is deliberately NOT reset on
+    a new letter).
+    """
+    letter_pages = defaultdict(lambda: defaultdict(set))
+
+    # Prefer an explicit <document> wrapper; fall back to the root itself.
+    doc_elem = doc_root.find(".//document")
+    if doc_elem is None:
+        doc_elem = doc_root
+
+    current_letter = None
+    current_page = None
+
+    for elem in doc_elem.iter():
+        tag = elem.tag
+        if tag == 'letterText':
+            current_letter = elem.get('letter')
+        elif tag == 'page':
+            page_index = elem.get('index')
+            if page_index:
+                current_page = page_index
+        elif tag == 'line':
+            line_index = elem.get('index')
+            if current_letter and current_page and line_index:
+                letter_pages[current_letter][current_page].add(line_index)
+
+    return letter_pages
+
+def build_letter_page_line_map_trad(trad_root):
+    """
+    Same mapping for traditions.xml: walk every <letterTradition> and
+    collect its <page>/<line> pairs.  The result is later merged with the
+    briefe.xml data.
+    """
+    letter_pages = defaultdict(lambda: defaultdict(set))
+
+    for letter_trad in trad_root.findall(".//letterTradition"):
+        letter_id = letter_trad.get('letter')
+        if not letter_id:
+            continue
+        current_page = None
+
+        for elem in letter_trad.iter():
+            if elem is letter_trad:
+                continue
+            if elem.tag == 'page':
+                pidx = elem.get('index')
+                if pidx:
+                    current_page = pidx
+            elif elem.tag == 'line':
+                lidx = elem.get('index')
+                if current_page and lidx:
+                    letter_pages[letter_id][current_page].add(lidx)
+
+    return letter_pages
+
+def merge_page_line_maps(map_a, map_b):
+    """Merge two letter->page->lines maps (map_b into map_a); return map_a."""
+    for letter_id, pages_dict in map_b.items():
+        for page_id, line_set in pages_dict.items():
+            map_a[letter_id][page_id].update(line_set)
+    return map_a
+
+##############################################################################
+# <intlink> check
+##############################################################################
+
+def validate_intlinks(xml_root, file_path, letter_pages, letter_refs, errors):
+    """
+    Validate every <intlink> in *xml_root*:
+
+      - letter= is mandatory and must be a known letter (letter_refs)
+      - page=  (if present) must exist for that letter in letter_pages
+      - line=  (if present) requires page= and must exist on that page
+
+    A bare letter reference (no page/line) is valid even when no pages are
+    known for that letter.
+    """
+    for intlink in xml_root.findall(".//intlink"):
+        line_no = get_line_number(intlink)
+        letter_id = intlink.get('letter')
+        page_id = intlink.get('page')
+        line_id = intlink.get('line')
+
+        if not letter_id or letter_id not in letter_refs:
+            record_error(errors, file_path, line_no,
+                         f"Invalid intlink letter={letter_id}")
+            continue
+
+        if not page_id:
+            # line= without page= cannot be resolved.
+            if line_id:
+                record_error(errors, file_path, line_no,
+                             f"intlink has line={line_id} but no page= "
+                             f"for letter={letter_id}")
+            continue
+
+        if letter_id not in letter_pages:
+            record_error(errors, file_path, line_no,
+                         f"No pages known for letter={letter_id} in intlink")
+            continue
+
+        if page_id not in letter_pages[letter_id]:
+            record_error(errors, file_path, line_no,
+                         f"Invalid page={page_id} for letter={letter_id} "
+                         f"in intlink")
+            continue
+
+        if line_id and line_id not in letter_pages[letter_id][page_id]:
+            record_error(errors, file_path, line_no,
+                         f"Invalid line={line_id} for letter={letter_id}, "
+                         f"page={page_id} in intlink")
+
+##############################################################################
+# <kommentar> and <subsection> check
+##############################################################################
+
+def gather_commentaries_and_subsections(xml_root, file_path, errors,
+                                        global_kommentar_ids,
+                                        global_subsection_ids):
+    """
+    Collect <kommentar> and <subsection> IDs from one register file.
+
+    Each element must carry an @id that is unique within the file AND
+    across all registers, and must contain a <lemma> child.  The global
+    ID sets are updated in place, so later register files are checked
+    against everything seen so far.
+    """
+
+    def check(tag_name, global_ids):
+        # Shared @id / uniqueness / <lemma> validation for one element type.
+        local_ids = set()
+        for elem in xml_root.findall(f".//{tag_name}"):
+            eid = elem.get('id')
+            ln = get_line_number(elem)
+            if not eid:
+                record_error(errors, file_path, ln,
+                             f"<{tag_name}> missing @id")
+                continue
+
+            # Duplicates within this file.
+            if eid in local_ids:
+                record_error(errors, file_path, ln,
+                             f"Duplicate <{tag_name} id='{eid}'> "
+                             f"in this file")
+            else:
+                local_ids.add(eid)
+
+            # Duplicates across all register files.
+            if eid in global_ids:
+                record_error(errors, file_path, ln,
+                             f"Duplicate <{tag_name} id='{eid}'> "
+                             f"across multiple registers")
+            else:
+                global_ids.add(eid)
+
+            # Each entry must have a <lemma> child.
+            if elem.find(".//lemma") is None:
+                record_error(errors, file_path, ln,
+                             f"<{tag_name} id='{eid}'> missing "
+                             f"<lemma> child")
+
+    check('kommentar', global_kommentar_ids)
+    check('subsection', global_subsection_ids)
+
+##############################################################################
+# <link> check
+##############################################################################
+
+def validate_links_for_commentary(xml_root, file_path,
+                                  kommentar_ids, subsection_ids,
+                                  errors):
+    """
+    Validate every <link> element:
+
+      - ref="X"    => X must be a known kommentar OR subsection ID
+      - subref="Y" => Y must be a known subsection ID
+    """
+    for link_elem in xml_root.findall(".//link"):
+        ln = get_line_number(link_elem)
+        refval = link_elem.get('ref')
+        subrefval = link_elem.get('subref')
+
+        if refval and refval not in kommentar_ids \
+                and refval not in subsection_ids:
+            record_error(errors, file_path, ln,
+                         f"Invalid <link ref='{refval}'> "
+                         f"(not in kommentar/subsection IDs)")
+        if subrefval and subrefval not in subsection_ids:
+            record_error(errors, file_path, ln,
+                         f"Invalid <link subref='{subrefval}'> "
+                         f"(not in subsection IDs)")
+
+##############################################################################
+# The Main Validator
+##############################################################################
+
+def _validate_meta(meta_xml, meta_file, person_refs, location_refs, errors):
+    """Check sender/receiver/location refs in every <letterDesc>."""
+    for letter in meta_xml.findall(".//letterDesc"):
+        letter_id = letter.get('letter')
+
+        for sender in letter.findall(".//sender"):
+            ref = sender.get('ref')
+            if ref and ref not in person_refs:
+                record_error(errors, meta_file, get_line_number(sender),
+                             f"Invalid sender ref: {ref} "
+                             f"in letter={letter_id}")
+
+        for receiver in letter.findall(".//receiver"):
+            ref = receiver.get('ref')
+            if ref and ref not in person_refs:
+                record_error(errors, meta_file, get_line_number(receiver),
+                             f"Invalid receiver ref: {ref} "
+                             f"in letter={letter_id}")
+
+        loc_elem = letter.find(".//location")
+        if loc_elem is not None:
+            ref = loc_elem.get('ref')
+            if ref and ref not in location_refs:
+                record_error(errors, meta_file, get_line_number(loc_elem),
+                             f"Invalid location ref: {ref} "
+                             f"in letter={letter_id}")
+
+def _validate_briefe(briefe_xml, briefe_file, letter_refs, hand_refs,
+                     edit_refs, errors):
+    """Check letter/hand/edit refs in every <letterText> of briefe.xml."""
+    for letter_text in briefe_xml.findall(".//letterText"):
+        letter_id = letter_text.get('letter')
+
+        if letter_id and letter_id not in letter_refs:
+            record_error(errors, briefe_file, get_line_number(letter_text),
+                         f"Invalid letter reference: {letter_id} "
+                         f"in briefe.xml")
+
+        for hand_elem in letter_text.findall(".//hand"):
+            ref = hand_elem.get('ref')
+            if ref and ref not in hand_refs:
+                record_error(errors, briefe_file, get_line_number(hand_elem),
+                             f"Invalid hand ref: {ref} "
+                             f"in letter {letter_id}")
+
+        for edit_elem in letter_text.findall(".//edit"):
+            ref = edit_elem.get('ref')
+            if ref and ref not in edit_refs:
+                record_error(errors, briefe_file, get_line_number(edit_elem),
+                             f"Invalid edit ref: {ref} "
+                             f"in letter {letter_id}")
+
+def _validate_traditions(traditions_xml, traditions_file, letter_refs,
+                         app_refs, hand_refs, errors):
+    """Check letter/app/hand refs in every <letterTradition>."""
+    for tradition in traditions_xml.findall(".//letterTradition"):
+        letter_id = tradition.get('letter')
+        if letter_id and letter_id not in letter_refs:
+            record_error(errors, traditions_file, get_line_number(tradition),
+                         f"Invalid letterTradition reference: {letter_id}")
+
+        for app_elem in tradition.findall(".//app"):
+            ref = app_elem.get('ref')
+            if ref and ref not in app_refs:
+                record_error(errors, traditions_file,
+                             get_line_number(app_elem),
+                             f"Invalid app ref: {ref} "
+                             f"in letterTradition {letter_id}")
+
+        for hand_elem in tradition.findall(".//hand"):
+            ref = hand_elem.get('ref')
+            if ref and ref not in hand_refs:
+                record_error(errors, traditions_file,
+                             get_line_number(hand_elem),
+                             f"Invalid hand ref: {ref} "
+                             f"in letterTradition {letter_id}")
+
+def _validate_marginals(marginalien_xml, marginalien_file, letter_refs,
+                        letter_pages, errors):
+    """Check <marginal letter= page= line=> against known pages/lines."""
+    for marginal_elem in marginalien_xml.findall(".//marginal"):
+        letter_id = marginal_elem.get('letter')
+        page_id = marginal_elem.get('page')
+        line_id = marginal_elem.get('line')
+        ln = get_line_number(marginal_elem)
+
+        if letter_id not in letter_refs:
+            record_error(errors, marginalien_file, ln,
+                         f"Invalid marginal letter reference: {letter_id} "
+                         f"(not in meta.xml)")
+        elif letter_id not in letter_pages:
+            record_error(errors, marginalien_file, ln,
+                         f"No pages/lines known for letter={letter_id} "
+                         f"in briefe/traditions")
+        elif page_id not in letter_pages[letter_id]:
+            record_error(errors, marginalien_file, ln,
+                         f"Invalid page reference: letter={letter_id}, "
+                         f"page={page_id}")
+        elif line_id not in letter_pages[letter_id][page_id]:
+            record_error(errors, marginalien_file, ln,
+                         f"Invalid line reference: letter={letter_id}, "
+                         f"page={page_id}, line={line_id}")
+
+def validate_references(meta_file, references_file, briefe_file,
+                        edits_file, traditions_file, marginalien_file,
+                        extra_registers=None):
+    """
+    Run all checks across the six core files plus any register files.
+
+    On failure, prints one GitHub Actions annotation per problem
+    (::error file=...,line=...::message) and exits with status 1;
+    otherwise prints a success message.
+    """
+    # 1) Parse the core files (parse_xml exits on failure).
+    meta_xml = parse_xml(meta_file).getroot()
+    references_xml = parse_xml(references_file).getroot()
+    briefe_xml = parse_xml(briefe_file).getroot()
+    edits_xml = parse_xml(edits_file).getroot()
+    traditions_xml = parse_xml(traditions_file).getroot()
+    marginalien_xml = parse_xml(marginalien_file).getroot()
+
+    # Optional register files, kept as (path, root) pairs for reporting.
+    register_roots = [(rfile, parse_xml(rfile).getroot())
+                      for rfile in (extra_registers or [])]
+
+    # 2) Reference sets from references.xml + edits.xml + meta.xml.
+    person_refs = {p.get('index')
+                   for p in references_xml.findall(".//personDef")}
+    location_refs = {l.get('index')
+                     for l in references_xml.findall(".//locationDef")}
+    hand_refs = {h.get('index')
+                 for h in references_xml.findall(".//handDef")}
+    app_refs = {a.get('index')
+                for a in references_xml.findall(".//appDef")}
+    edit_refs = {e.get('index')
+                 for e in edits_xml.findall(".//editreason")}
+    letter_refs = {desc.get('letter')
+                   for desc in meta_xml.findall(".//letterDesc")}
+
+    # We accumulate all errors as a list of dicts: {file, line, message}.
+    errors = []
+
+    # 3) Gather all <kommentar>/<subsection> IDs from each register and
+    #    check their uniqueness and the presence of <lemma>.
+    global_kommentar_ids = set()
+    global_subsection_ids = set()
+    for rfile, rroot in register_roots:
+        gather_commentaries_and_subsections(
+            rroot, rfile, errors,
+            global_kommentar_ids, global_subsection_ids
+        )
+    # (If references.xml or traditions.xml also contain <kommentar> or
+    #  <subsection>, call gather_commentaries_and_subsections on them too.)
+
+    # 4) Per-file reference checks.
+    _validate_meta(meta_xml, meta_file, person_refs, location_refs, errors)
+    _validate_briefe(briefe_xml, briefe_file, letter_refs, hand_refs,
+                     edit_refs, errors)
+    _validate_traditions(traditions_xml, traditions_file, letter_refs,
+                         app_refs, hand_refs, errors)
+
+    # 5) Merge letter->page->lines from briefe.xml & traditions.xml.
+    letter_pages = merge_page_line_maps(
+        build_letter_page_line_map_brief(briefe_xml),
+        build_letter_page_line_map_trad(traditions_xml)
+    )
+
+    # 6) Validate <intlink> in traditions.xml, marginalien.xml and all
+    #    registers.
+    validate_intlinks(traditions_xml, traditions_file,
+                      letter_pages, letter_refs, errors)
+    validate_intlinks(marginalien_xml, marginalien_file,
+                      letter_pages, letter_refs, errors)
+    for rfile, rroot in register_roots:
+        validate_intlinks(rroot, rfile, letter_pages, letter_refs, errors)
+
+    # 7) <marginal> references in Marginal-Kommentar.xml.
+    _validate_marginals(marginalien_xml, marginalien_file, letter_refs,
+                        letter_pages, errors)
+
+    # 8) Validate every <link> across all files against the commentary IDs.
+    link_targets = [
+        (meta_xml, meta_file),
+        (references_xml, references_file),
+        (briefe_xml, briefe_file),
+        (edits_xml, edits_file),
+        (traditions_xml, traditions_file),
+        (marginalien_xml, marginalien_file),
+    ] + [(rroot, rfile) for rfile, rroot in register_roots]
+    for root, path in link_targets:
+        validate_links_for_commentary(root, path, global_kommentar_ids,
+                                      global_subsection_ids, errors)
+
+    ############################################################################
+    # Final: Print errors or success
+    ############################################################################
+    if errors:
+        # GitHub annotation format: ::error file=...,line=...::{message}
+        for err in errors:
+            print(f"::error file={err['file']},"
+                  f"line={err['line']}::{err['message']}")
+        sys.exit(1)
+    else:
+        print("All references are valid.")
+
+##############################################################################
+# Entry Point
+##############################################################################
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the six positional file paths plus
+    # optional register files, then run all checks.
+    parser = argparse.ArgumentParser(
+        description=(
+            "Validates cross-references among multiple XML files, merges "
+            "page/line info from briefe.xml & traditions.xml, handles "
+            "<intlink>, checks <kommentar>/<subsection> for unique IDs and "
+            "a <lemma> child, and resolves <link> elements to these IDs. "
+            "Produces GitHub Actions annotation errors."
+        )
+    )
+
+    parser.add_argument("meta_file", help="Path to meta.xml")
+    parser.add_argument("references_file", help="Path to references.xml")
+    parser.add_argument("briefe_file", help="Path to briefe.xml")
+    parser.add_argument("edits_file", help="Path to edits.xml")
+    parser.add_argument("traditions_file", help="Path to traditions.xml")
+    parser.add_argument("marginalien_file",
+                        help="Path to Marginal-Kommentar.xml")
+    parser.add_argument(
+        "--register",
+        dest="registers",
+        nargs="*",
+        default=None,
+        help=(
+            "One or more register XML files, containing <kommentar>/"
+            "<subsection> plus possible <intlink> or <link> elements."
+        ),
+    )
+
+    args = parser.parse_args()
+
+    validate_references(
+        args.meta_file,
+        args.references_file,
+        args.briefe_file,
+        args.edits_file,
+        args.traditions_file,
+        args.marginalien_file,
+        extra_registers=args.registers,
+    )