"""Add ``order`` hints to ambiguous single-page ``stueck`` references.

This is the script ``scripts/orderfix/add_order_hints.py``. It was introduced
together with a change to ``XSD/common.xsd`` whose annotation reads (in
translation) "The order of the contribution on a page, if ambiguous."
NOTE(review): presumably that schema change declares the ``order`` attribute
written below -- confirm against the XSD.

For every XML file in ``XML/beitraege`` the script groups the first ``stueck``
child of each ``beitrag`` by ``(when, nr, von)``. When more than one
single-page reference shares a key, each gets ``order="1"``, ``order="2"``, ...
in document order.
"""

import os
import sys
from collections import defaultdict

from lxml import etree


def _qualify(tag, namespace):
    """Return *tag* in Clark notation (``{namespace}tag``) when a namespace is set."""
    return f"{{{namespace}}}{tag}" if namespace else tag


def _group_single_page_stuecke(root):
    """Group single-page ``stueck`` elements under *root* by ``(when, nr, von)``.

    Args:
        root: The root element of a parsed document.

    Returns:
        dict: Maps ``(when, nr, von)`` to the list of matching ``stueck``
        elements in document order.
    """
    # Unprefixed names in lxml lookups never match elements that live in a
    # default namespace, so the tag names must be qualified explicitly.
    namespace = root.nsmap.get(None)
    beitrag_tag = _qualify('beitrag', namespace)
    stueck_tag = _qualify('stueck', namespace)

    groups = defaultdict(list)
    for beitrag in root.iter(beitrag_tag):
        # NOTE(review): only the first stueck child of a beitrag is considered,
        # as before -- confirm whether further stueck children need hints too.
        stueck = beitrag.find(stueck_tag)
        if stueck is None:
            continue

        von = stueck.get('von')
        bis = stueck.get('bis')

        # Ambiguity arises for single-page entries only. A multi-page entry is
        # one where `bis` is present and differs from `von`.
        is_multi_page = bis is not None and bis != von
        if von and not is_multi_page:
            groups[(stueck.get('when'), stueck.get('nr'), von)].append(stueck)
    return groups


def process_xml_file(filepath):
    """Add ``order`` attributes to ambiguous ``stueck`` elements of one file.

    The file is rewritten in place, and only if at least one attribute value
    actually changed. Parse errors and any other exception are reported on
    stderr and not propagated, so one bad file does not stop a batch run.

    Args:
        filepath (str): The path to the XML file.
    """
    try:
        # Use a parser that preserves comments and whitespace as much as possible
        parser = etree.XMLParser(remove_blank_text=False, remove_comments=False, strip_cdata=False)
        tree = etree.parse(filepath, parser)
        groups = _group_single_page_stuecke(tree.getroot())

        modified = False
        for stuecke in groups.values():
            if len(stuecke) < 2:
                continue  # a lone entry on a page is unambiguous
            for position, stueck in enumerate(stuecke, start=1):
                value = str(position)
                # Only touch (and later rewrite) what is not already correct,
                # so re-running the script leaves finished files alone.
                if stueck.get('order') != value:
                    stueck.set('order', value)
                    modified = True

        if modified:
            # Keep the document's own encoding; fall back to UTF-8 in case the
            # parser reports none.
            encoding = tree.docinfo.encoding or 'UTF-8'
            tree.write(filepath,
                       encoding=encoding,
                       xml_declaration=True,
                       pretty_print=False)
            print(f"Updated order attributes in: {filepath}")
        else:
            print(f"No changes needed for: {filepath}")

    except etree.XMLSyntaxError as e:
        print(f"Error parsing {filepath}: {e}", file=sys.stderr)
    except Exception as e:
        print(f"An unexpected error occurred with {filepath}: {e}", file=sys.stderr)


def main():
    """Find and process every ``.xml`` file in the project's ``XML/beitraege``."""
    # The script is in scripts/orderfix, so we go up two levels
    # to the project root, then into XML/beitraege.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    beitraege_dir = os.path.abspath(os.path.join(script_dir, '../../XML/beitraege'))

    if not os.path.isdir(beitraege_dir):
        print(f"Error: Directory not found at {beitraege_dir}", file=sys.stderr)
        return

    print(f"Scanning for XML files in: {beitraege_dir}")
    # Sorted so the processing log is reproducible across runs and platforms.
    for filename in sorted(os.listdir(beitraege_dir)):
        if filename.endswith('.xml'):
            process_xml_file(os.path.join(beitraege_dir, filename))
    print("Processing complete.")


if __name__ == "__main__":
    main()