mirror of
https://github.com/Theodor-Springmann-Stiftung/KGPZ.git
synced 2025-10-29 09:05:30 +00:00
95 lines
3.3 KiB
Python
95 lines
3.3 KiB
Python
|
|
import os
|
|
from lxml import etree
|
|
from collections import defaultdict
|
|
import sys
|
|
|
|
def process_xml_file(filepath):
|
|
"""
|
|
Processes a single XML file to add 'order' attributes to <stueck> elements where necessary.
|
|
|
|
Args:
|
|
filepath (str): The path to the XML file.
|
|
"""
|
|
try:
|
|
# Use a parser that preserves comments and whitespace as much as possible
|
|
parser = etree.XMLParser(remove_blank_text=False, remove_comments=False, strip_cdata=False)
|
|
tree = etree.parse(filepath, parser)
|
|
root = tree.getroot()
|
|
|
|
# Default namespace from the document, if it exists
|
|
ns = {'kgpz': root.nsmap.get(None)} if root.nsmap.get(None) else {}
|
|
|
|
# Group stueck elements by (when, nr, von)
|
|
stueck_groups = defaultdict(list)
|
|
|
|
# XPath to find all beitrag elements
|
|
beitraege = root.xpath('//beitrag')
|
|
|
|
for beitrag in beitraege:
|
|
stueck = beitrag.find('stueck')
|
|
if stueck is not None:
|
|
when = stueck.get('when')
|
|
nr = stueck.get('nr')
|
|
von = stueck.get('von')
|
|
bis = stueck.get('bis')
|
|
|
|
# Ambiguity arises for single-page entries.
|
|
# A multi-page entry is one where `bis` is present and `bis` != `von`.
|
|
is_multi_page = bis is not None and bis != von
|
|
|
|
if von and not is_multi_page:
|
|
key = (when, nr, von)
|
|
stueck_groups[key].append(stueck)
|
|
|
|
# Add order attribute where there is ambiguity
|
|
modified = False
|
|
for key, stuecks in stueck_groups.items():
|
|
if len(stuecks) > 1:
|
|
modified = True
|
|
for i, stueck in enumerate(stuecks):
|
|
stueck.set('order', str(i + 1))
|
|
|
|
# Write back to the file only if changes were made
|
|
if modified:
|
|
# Get the original XML declaration details
|
|
xml_declaration = tree.docinfo.xml_version
|
|
encoding = tree.docinfo.encoding
|
|
|
|
tree.write(filepath,
|
|
encoding=encoding,
|
|
xml_declaration=True,
|
|
pretty_print=False)
|
|
print(f"Updated order attributes in: {filepath}")
|
|
else:
|
|
print(f"No changes needed for: {filepath}")
|
|
|
|
except etree.XMLSyntaxError as e:
|
|
print(f"Error parsing {filepath}: {e}", file=sys.stderr)
|
|
except Exception as e:
|
|
print(f"An unexpected error occurred with {filepath}: {e}", file=sys.stderr)
|
|
|
|
|
|
def main():
|
|
"""
|
|
Main function to find and process all relevant XML files.
|
|
"""
|
|
# The script is in scripts/orderfix, so we go up two levels
|
|
# to the project root, then into XML/beitraege.
|
|
script_dir = os.path.dirname(os.path.abspath(__file__))
|
|
beitraege_dir = os.path.abspath(os.path.join(script_dir, '../../XML/beitraege'))
|
|
|
|
if not os.path.isdir(beitraege_dir):
|
|
print(f"Error: Directory not found at {beitraege_dir}", file=sys.stderr)
|
|
return
|
|
|
|
print(f"Scanning for XML files in: {beitraege_dir}")
|
|
for filename in os.listdir(beitraege_dir):
|
|
if filename.endswith('.xml'):
|
|
filepath = os.path.join(beitraege_dir, filename)
|
|
process_xml_file(filepath)
|
|
print("Processing complete.")
|
|
|
|
if __name__ == "__main__":
|
|
main()
|