#!/usr/bin/env python3
"""
Single integrated validator script with structured error handling (GitHub Actions annotations),
plus checks for:
1) Cross-references (sender/receiver/location) in meta.xml
2) page/line merges from briefe.xml & traditions.xml for letter=, page=, line=
3) <intlink> references
4) <kommentar id="..."> & <subsection id="..."> in registers (must have <lemma>)
5) IDs must be unique among all kommentars or all subsections across registers
6) <link ref="..." subref="..."> must match a valid kommentar/subsection ID
"""

import sys
import argparse
from collections import defaultdict

from lxml import etree

##############################################################################
# Basic parse / line info
##############################################################################

def parse_xml(file_path):
    """
    Parse XML using lxml.etree (keeping line numbers).
    Exit on syntax/file errors.
    """
    try:
        parser = etree.XMLParser(remove_blank_text=False)
        return etree.parse(file_path, parser)
    except etree.XMLSyntaxError as e:
        print(f"Error parsing {file_path}: {e}")
        sys.exit(1)
    except OSError:
        print(f"Error: File not found - {file_path}")
        sys.exit(1)


def get_line_number(elem):
    """
    Return the sourceline of an lxml element, or 'Unknown'.
    """
    return elem.sourceline if hasattr(elem, 'sourceline') and elem.sourceline else "Unknown"

##############################################################################
# Merging letter/page/line from briefe.xml + traditions.xml
##############################################################################

def build_letter_page_line_map_brief(doc_root):
    """
    letter_pages[letter][page] = set(line)

    We allow <page> to continue across <letterText> transitions unless
    a new <page> is encountered.
    """
    letter_pages = defaultdict(lambda: defaultdict(set))

    doc_elem = doc_root.find(".//document")
    if doc_elem is None:
        doc_elem = doc_root

    current_letter = None
    current_page = None

    for elem in doc_elem.iter():
        tag = elem.tag
        if tag == 'letterText':
            current_letter = elem.get('letter')
        elif tag == 'page':
            page_index = elem.get('index')
            if page_index:
                current_page = page_index
        elif tag == 'line':
            line_index = elem.get('index')
            if current_letter and current_page and line_index:
                letter_pages[current_letter][current_page].add(line_index)

    return letter_pages


def build_letter_page_line_map_trad(trad_root):
    """
    Similarly for traditions.xml, reading <letterTradition letter="...">
    to find any <page> and <line>. We'll unify them with briefe.xml data.
    """
    letter_pages = defaultdict(lambda: defaultdict(set))

    for letter_trad in trad_root.findall(".//letterTradition"):
        letter_id = letter_trad.get('letter')
        if not letter_id:
            continue
        current_page = None

        for elem in letter_trad.iter():
            if elem is letter_trad:
                continue
            if elem.tag == 'page':
                pidx = elem.get('index')
                if pidx:
                    current_page = pidx
            elif elem.tag == 'line':
                lidx = elem.get('index')
                if current_page and lidx:
                    letter_pages[letter_id][current_page].add(lidx)

    return letter_pages


def merge_page_line_maps(map_a, map_b):
    """
    Merge two letter->page->lines maps (map_b into map_a).
    """
    for letter_id, pages_dict in map_b.items():
        for page_id, line_set in pages_dict.items():
            map_a[letter_id][page_id].update(line_set)
    return map_a
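
# Illustrative shape of the merged mapping (the concrete values are invented):
#   letter_pages["123"]["45"] == {"1", "2", "17"}
# i.e. letter "123" has page "45" with lines 1, 2 and 17 available as link targets.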

##############################################################################
# <intlink> check
##############################################################################

def validate_intlinks(xml_root, file_path, letter_pages, letter_refs, errors):
    """
    For each <intlink letter=... page=... line=.../>:
      - letter is mandatory, must be in letter_refs
      - if page is present => must exist in letter_pages
      - if line is present => must also have that page
      - line w/o page => error
    """
    for intlink in xml_root.findall(".//intlink"):
        line_no = get_line_number(intlink)
        letter_id = intlink.get('letter')
        page_id = intlink.get('page')
        line_id = intlink.get('line')

        if not letter_id or letter_id not in letter_refs:
            errors.append({
                "file": file_path,
                "line": line_no,
                "message": f"Invalid intlink letter={letter_id}"
            })
            continue

        if letter_id not in letter_pages:
            errors.append({
                "file": file_path,
                "line": line_no,
                "message": f"No pages known for letter={letter_id} in intlink"
            })
            continue

        if page_id:
            if page_id not in letter_pages[letter_id]:
                errors.append({
                    "file": file_path,
                    "line": line_no,
                    "message": f"Invalid page={page_id} for letter={letter_id} in intlink"
                })
            elif line_id:
                if line_id not in letter_pages[letter_id][page_id]:
                    errors.append({
                        "file": file_path,
                        "line": line_no,
                        "message": f"Invalid line={line_id} for letter={letter_id}, page={page_id} in intlink"
                    })
        else:
            # line without page is not resolvable
            if line_id:
                errors.append({
                    "file": file_path,
                    "line": line_no,
                    "message": f"intlink has line={line_id} but no page= for letter={letter_id}"
                })
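
# A reference this check accepts looks like the following (attribute values invented):
#   <intlink letter="123" page="45" line="7"/>
# where letter "123" exists in meta.xml and page 45 / line 7 occur in briefe.xml
# or traditions.xml.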

##############################################################################
# <kommentar> and <subsection> check
##############################################################################

def gather_commentaries_and_subsections(xml_root, file_path, errors,
                                        global_kommentar_ids, global_subsection_ids):
    """
    Each <kommentar id="XYZ"> must have a <lemma>; its ID must be globally unique among kommentars.
    Each <subsection id="ABC"> must have a <lemma>; its ID must be globally unique among subsections.
    """
    local_komm_ids = set()
    local_sub_ids = set()

    # <kommentar id="...">
    for kom in xml_root.findall(".//kommentar"):
        kid = kom.get('id')
        ln = get_line_number(kom)
        if not kid:
            errors.append({
                "file": file_path,
                "line": ln,
                "message": "<kommentar> missing @id"
            })
            continue

        # check local duplicates
        if kid in local_komm_ids:
            errors.append({
                "file": file_path,
                "line": ln,
                "message": f"Duplicate <kommentar id='{kid}'> in this file"
            })
        else:
            local_komm_ids.add(kid)

        # check global duplicates
        if kid in global_kommentar_ids:
            errors.append({
                "file": file_path,
                "line": ln,
                "message": f"Duplicate <kommentar id='{kid}'> across multiple registers"
            })
        else:
            global_kommentar_ids.add(kid)

        # must have a <lemma> child
        lemma_elem = kom.find(".//lemma")
        if lemma_elem is None:
            errors.append({
                "file": file_path,
                "line": ln,
                "message": f"<kommentar id='{kid}'> missing <lemma> child"
            })

    # <subsection id="...">
    for sub in xml_root.findall(".//subsection"):
        sid = sub.get('id')
        ln = get_line_number(sub)
        if not sid:
            errors.append({
                "file": file_path,
                "line": ln,
                "message": "<subsection> missing @id"
            })
            continue

        # local duplicates
        if sid in local_sub_ids:
            errors.append({
                "file": file_path,
                "line": ln,
                "message": f"Duplicate <subsection id='{sid}'> in this file"
            })
        else:
            local_sub_ids.add(sid)

        # global duplicates
        if sid in global_subsection_ids:
            errors.append({
                "file": file_path,
                "line": ln,
                "message": f"Duplicate <subsection id='{sid}'> across multiple registers"
            })
        else:
            global_subsection_ids.add(sid)

        # must have <lemma>
        lemma_elem = sub.find(".//lemma")
        if lemma_elem is None:
            errors.append({
                "file": file_path,
                "line": ln,
                "message": f"<subsection id='{sid}'> missing <lemma> child"
            })
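
# Register entries that pass these checks look roughly like this (IDs and text invented):
#   <kommentar id="person-1">
#     <lemma>Hamann, Johann Georg</lemma>
#   </kommentar>
#   <subsection id="person-1-works">
#     <lemma>Werke</lemma>
#   </subsection>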

##############################################################################
# <link ref="..." subref="..."> check
##############################################################################

def validate_links_for_commentary(xml_root, file_path,
                                  kommentar_ids, subsection_ids,
                                  errors):
    """
    For each <link ref="X" subref="Y">:
      - if ref="X" is given => X must be a kommentar OR subsection ID
      - if subref="Y" is given => Y must be a subsection ID
    """
    for link_elem in xml_root.findall(".//link"):
        ln = get_line_number(link_elem)
        refval = link_elem.get('ref')
        subrefval = link_elem.get('subref')

        # check ref
        if refval:
            if refval not in kommentar_ids and refval not in subsection_ids:
                errors.append({
                    "file": file_path,
                    "line": ln,
                    "message": f"Invalid <link ref='{refval}'> (not in kommentar/subsection IDs)"
                })

        # check subref
        if subrefval:
            if subrefval not in subsection_ids:
                errors.append({
                    "file": file_path,
                    "line": ln,
                    "message": f"Invalid <link subref='{subrefval}'> (not in <subsection> IDs)"
                })
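
# A matching cross-reference would then look like (IDs invented, see the register sketch above):
#   <link ref="person-1" subref="person-1-works"/>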

##############################################################################
# The Main Validator
##############################################################################

def validate_references(meta_file, references_file, briefe_file,
                        edits_file, traditions_file, marginalien_file,
                        extra_registers=None):
    """All checks in one place."""

    # 1) Parse main files
    meta_tree = parse_xml(meta_file)
    references_tree = parse_xml(references_file)
    briefe_tree = parse_xml(briefe_file)
    edits_tree = parse_xml(edits_file)
    traditions_tree = parse_xml(traditions_file)
    marginal_tree = parse_xml(marginalien_file)

    # parse optional register files
    register_trees = []
    if extra_registers:
        for rfile in extra_registers:
            rtree = parse_xml(rfile)
            register_trees.append((rfile, rtree))

    # 2) Get roots
    meta_xml = meta_tree.getroot()
    references_xml = references_tree.getroot()
    briefe_xml = briefe_tree.getroot()
    edits_xml = edits_tree.getroot()
    traditions_xml = traditions_tree.getroot()
    marginalien_xml = marginal_tree.getroot()

    # 3) Reference sets from references.xml + edits.xml + meta.xml
    person_refs = {p.get('index') for p in references_xml.findall(".//personDef")}
    location_refs = {loc.get('index') for loc in references_xml.findall(".//locationDef")}
    hand_refs = {h.get('index') for h in references_xml.findall(".//handDef")}
    app_refs = {a.get('index') for a in references_xml.findall(".//appDef")}
    edit_refs = {e.get('index') for e in edits_xml.findall(".//editreason")}
    letter_refs = {desc.get('letter') for desc in meta_xml.findall(".//letterDesc")}

    # We'll accumulate all errors as a list of dicts: {file, line, message}
    errors = []

    # 4) Gather all <kommentar> and <subsection> IDs from each register
    #    to check their uniqueness and presence of <lemma>.
    global_kommentar_ids = set()
    global_subsection_ids = set()

    for (rfile, rtree) in register_trees:
        rroot = rtree.getroot()
        gather_commentaries_and_subsections(
            rroot, rfile, errors,
            global_kommentar_ids, global_subsection_ids
        )

    # (If references.xml or traditions.xml also contain <kommentar> or <subsection>,
    #  call gather_commentaries_and_subsections on them in the same way.)

    # 5) Validate meta.xml references
    for letter in meta_xml.findall(".//letterDesc"):
        letter_id = letter.get('letter')

        # <sender ref="...">
        for sender in letter.findall(".//sender"):
            ref = sender.get('ref')
            if ref and ref not in person_refs:
                errors.append({
                    "file": meta_file,
                    "line": get_line_number(sender),
                    "message": f"Invalid sender ref: {ref} in letter={letter_id}"
                })

        # <receiver ref="...">
        for receiver in letter.findall(".//receiver"):
            ref = receiver.get('ref')
            if ref and ref not in person_refs:
                errors.append({
                    "file": meta_file,
                    "line": get_line_number(receiver),
                    "message": f"Invalid receiver ref: {ref} in letter={letter_id}"
                })

        # <location ref="...">
        loc_elem = letter.find(".//location")
        if loc_elem is not None:
            r = loc_elem.get('ref')
            if r and r not in location_refs:
                errors.append({
                    "file": meta_file,
                    "line": get_line_number(loc_elem),
                    "message": f"Invalid location ref: {r} in letter={letter_id}"
                })

    # 6) Validate briefe.xml references
    for letter_text in briefe_xml.findall(".//letterText"):
        letter_id = letter_text.get('letter')
        ln = get_line_number(letter_text)

        if letter_id and letter_id not in letter_refs:
            errors.append({
                "file": briefe_file,
                "line": ln,
                "message": f"Invalid letter reference: {letter_id} in briefe.xml"
            })

        for hand_elem in letter_text.findall(".//hand"):
            ref = hand_elem.get('ref')
            if ref and ref not in hand_refs:
                errors.append({
                    "file": briefe_file,
                    "line": get_line_number(hand_elem),
                    "message": f"Invalid hand ref: {ref} in letter {letter_id}"
                })

        for edit_elem in letter_text.findall(".//edit"):
            ref = edit_elem.get('ref')
            if ref and ref not in edit_refs:
                errors.append({
                    "file": briefe_file,
                    "line": get_line_number(edit_elem),
                    "message": f"Invalid edit ref: {ref} in letter {letter_id}"
                })

    # 7) Validate traditions.xml references (besides page/line)
    for tradition in traditions_xml.findall(".//letterTradition"):
        letter_id = tradition.get('letter')
        ln = get_line_number(tradition)
        if letter_id and letter_id not in letter_refs:
            errors.append({
                "file": traditions_file,
                "line": ln,
                "message": f"Invalid letterTradition reference: {letter_id}"
            })

        # <app ref="...">
        for app_elem in tradition.findall(".//app"):
            ref = app_elem.get('ref')
            if ref and ref not in app_refs:
                errors.append({
                    "file": traditions_file,
                    "line": get_line_number(app_elem),
                    "message": f"Invalid app ref: {ref} in letterTradition {letter_id}"
                })

        # <hand ref="...">
        for hand_elem in tradition.findall(".//hand"):
            ref = hand_elem.get('ref')
            if ref and ref not in hand_refs:
                errors.append({
                    "file": traditions_file,
                    "line": get_line_number(hand_elem),
                    "message": f"Invalid hand ref: {ref} in letterTradition {letter_id}"
                })

    # 8) Merge letter->page->lines from briefe.xml & traditions.xml
    letter_pages_brief = build_letter_page_line_map_brief(briefe_xml)
    letter_pages_trad = build_letter_page_line_map_trad(traditions_xml)
    letter_pages = merge_page_line_maps(letter_pages_brief, letter_pages_trad)

    # 9) Validate <intlink> in traditions.xml, marginalien.xml, and all registers
    validate_intlinks(traditions_xml, traditions_file, letter_pages, letter_refs, errors)
    validate_intlinks(marginalien_xml, marginalien_file, letter_pages, letter_refs, errors)
    for (rfile, rtree) in register_trees:
        rroot = rtree.getroot()
        validate_intlinks(rroot, rfile, letter_pages, letter_refs, errors)

    # 10) <marginal letter="..." page="..." line="..."> in Marginal-Kommentar.xml
    for marginal_elem in marginalien_xml.findall(".//marginal"):
        letter_id = marginal_elem.get('letter')
        page_id = marginal_elem.get('page')
        line_id = marginal_elem.get('line')
        ln = get_line_number(marginal_elem)

        if letter_id not in letter_refs:
            errors.append({
                "file": marginalien_file,
                "line": ln,
                "message": f"Invalid marginal letter reference: {letter_id} (not in meta.xml)"
            })
        elif letter_id not in letter_pages:
            errors.append({
                "file": marginalien_file,
                "line": ln,
                "message": f"No pages/lines known for letter={letter_id} in briefe/traditions"
            })
        elif page_id not in letter_pages[letter_id]:
            errors.append({
                "file": marginalien_file,
                "line": ln,
                "message": f"Invalid page reference: letter={letter_id}, page={page_id}"
            })
        elif line_id not in letter_pages[letter_id][page_id]:
            errors.append({
                "file": marginalien_file,
                "line": ln,
                "message": f"Invalid line reference: letter={letter_id}, page={page_id}, line={line_id}"
            })

    # 11) Validate all <link ref="..." subref="..."> across every file against the
    #     commentary IDs collected above by gather_commentaries_and_subsections:
    #     global_kommentar_ids, global_subsection_ids
    def validate_links_in_tree(root, path):
        validate_links_for_commentary(root, path, global_kommentar_ids, global_subsection_ids, errors)

    # meta.xml
    validate_links_in_tree(meta_xml, meta_file)
    # references.xml
    validate_links_in_tree(references_xml, references_file)
    # briefe.xml
    validate_links_in_tree(briefe_xml, briefe_file)
    # edits.xml
    validate_links_in_tree(edits_xml, edits_file)
    # traditions.xml
    validate_links_in_tree(traditions_xml, traditions_file)
    # marginalien.xml
    validate_links_in_tree(marginalien_xml, marginalien_file)
    # registers
    for (rfile, rtree) in register_trees:
        validate_links_in_tree(rtree.getroot(), rfile)

    ############################################################################
    # Final: Print errors or success
    ############################################################################
    if errors:
        # Print them in GitHub annotation format: ::error file=...,line=...::{message}
        for err in errors:
            file_name = err["file"]
            line_no = err["line"]
            message = err["message"]
            print(f"::error file={file_name},line={line_no}::{message}")
        sys.exit(1)
    else:
        print("All references are valid.")

##############################################################################
# Entry Point
##############################################################################

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="""
Validates cross-references among multiple XML files, merges page/line data from briefe.xml & traditions.xml,
handles <intlink>, checks <kommentar>/<subsection> for unique IDs and a <lemma>, and validates <link ref/subref> against these IDs.
Produces GitHub Actions annotation errors.
""")

    parser.add_argument("meta_file", help="Path to meta.xml")
    parser.add_argument("references_file", help="Path to references.xml")
    parser.add_argument("briefe_file", help="Path to briefe.xml")
    parser.add_argument("edits_file", help="Path to edits.xml")
    parser.add_argument("traditions_file", help="Path to traditions.xml")
    parser.add_argument("marginalien_file", help="Path to Marginal-Kommentar.xml")
    parser.add_argument(
        "--register",
        dest="registers",
        nargs="*",
        default=None,
        help="One or more register.xml files containing <kommentar>/<subsection> plus possible <link> or <intlink>."
    )

    args = parser.parse_args()

    validate_references(
        args.meta_file,
        args.references_file,
        args.briefe_file,
        args.edits_file,
        args.traditions_file,
        args.marginalien_file,
        extra_registers=args.registers
    )