mirror of
https://github.com/Theodor-Springmann-Stiftung/KGPZ.git
synced 2025-10-28 16:45:31 +00:00
292 lines
9.7 KiB
Python
292 lines
9.7 KiB
Python
#!/usr/bin/env python3
|
|
|
|
import os
|
|
import glob
|
|
import re
|
|
from lxml import etree
|
|
|
|
def int_to_roman(num):
    """Return the Roman-numeral spelling of a positive integer.

    Works through the value/symbol table from largest to smallest,
    emitting each symbol as many times as its value fits into what is
    left of ``num``. Zero and negative inputs produce an empty string.
    """
    # Subtractive pairs (CM, CD, XC, ...) sit in the table next to the
    # plain symbols so a single greedy pass yields the canonical form.
    numerals = (
        (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
        (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
        (10, "X"), (9, "IX"), (5, "V"), (4, "IV"),
        (1, "I"),
    )
    pieces = []
    for value, symbol in numerals:
        if num <= 0:
            break
        repeats, num = divmod(num, value)
        pieces.append(symbol * repeats)
    return "".join(pieces)
|
|
|
|
def normalize_text_for_url(text):
    """Turn free text into a lowercase, hyphen-separated slug.

    Steps, in order: compose the text to Unicode NFC, keep only the part
    before the first of ``. , : ; ! ?``, lowercase, transliterate German
    umlauts and sharp s (ae/ue/oe/ss), turn whitespace runs into hyphens,
    drop every character that is neither a word character nor a hyphen,
    collapse repeated hyphens and trim hyphens from both ends.

    Args:
        text: Source string; ``None`` or an empty string is accepted.

    Returns:
        The slug, or ``""`` when ``text`` is falsy. The result can also be
        empty when nothing usable precedes the first punctuation mark.
    """
    # Function-scope import so this block does not depend on the file's
    # import header.
    import unicodedata

    if not text:
        return ""

    # Decomposed input (base letter + combining diaeresis) would otherwise
    # bypass the umlaut replacement below and lose the mark entirely,
    # producing "a" instead of "ae". Composing first makes both spellings
    # of the same text yield the same slug.
    text = unicodedata.normalize("NFC", text)

    # Take text up to first punctuation mark if present
    match = re.search(r'[.,:;!?]', text)
    if match:
        text = text[:match.start()]

    # Lowercase before transliterating so only lowercase forms need handling
    text = text.lower()

    # Replace German umlauts
    text = text.replace('ä', 'ae').replace('ü', 'ue').replace('ö', 'oe').replace('ß', 'ss')

    # Replace spaces with hyphens
    text = re.sub(r'\s+', '-', text)

    # Remove all punctuation and special characters except hyphens
    text = re.sub(r'[^\w\-]', '', text)

    # Remove multiple consecutive hyphens
    text = re.sub(r'-+', '-', text)

    # Remove leading/trailing hyphens
    return text.strip('-')
|
|
|
|
def get_element_info(element):
    """Describe an element for diagnostics: its source line plus its XML.

    The line number is read from the element's ``sourceline`` attribute,
    falling back to the string ``'unknown'`` when the attribute is absent.
    The XML is the element serialized by ``etree.tostring`` as pretty-printed
    text with surrounding whitespace stripped.
    """
    markup = etree.tostring(element, encoding='unicode', pretty_print=True)
    markup = markup.strip()
    source_line = getattr(element, 'sourceline', 'unknown')
    return f"Line {source_line}: {markup}"
|
|
|
|
def generate_id_for_beitrag(beitrag, existing_ids):
    """Generate a unique ID for a ``beitrag`` element.

    The ID is ``base + additional part``:

    * Base: with exactly one ``stueck`` child it is ``"{when}-{nr}-"``
      (plus ``"beil-"`` when the ``beilage`` attribute is non-empty); with
      several ``stueck`` children it is ``"{when}-"`` taken from the first.
      The ``ref`` values of all ``akteur`` children that have ``ref`` but
      no ``kat`` are then appended, hyphen-separated.
    * Additional part, first match wins: normalized ``titel`` text,
      normalized ``incipit`` text, ``kategorie`` ref, ``werk`` ref
      (prefixed with its ``kat`` when present), first ``akteur`` carrying
      both ``ref`` and ``kat``, normalized ``anmerkung`` text. A nested
      ``beitrag`` with ``ref`` and ``kat`` is used alone when nothing else
      matched, or appended when neither ``titel`` nor ``incipit`` has text.

    If the resulting ID is already in ``existing_ids``, a roman-numeral
    suffix starting at ``-II`` is appended until it is free.

    Args:
        beitrag: Element offering ``xpath()``; matched children must offer
            ``get()`` and ``.text``.
        existing_ids: Container of IDs already taken. Only membership is
            tested here; it is not modified.

    Returns:
        ``(generated_id, None)`` on success, or ``(None, error_message)``
        when required data is missing.
    """
    # local-name() is used throughout so the lookups ignore XML namespaces.
    stuecke = beitrag.xpath('./*[local-name()="stueck"]')
    if not stuecke:
        element_info = get_element_info(beitrag)
        return None, f"No stueck elements found in beitrag:\n{element_info}"

    # Determine base ID based on number of stueck elements
    if len(stuecke) == 1:
        stueck = stuecke[0]
        when = stueck.get('when')
        nr = stueck.get('nr')
        if not when or not nr:
            return None, f"Missing when ({when}) or nr ({nr}) in stueck"
        base_id = f"{when}-{nr}-"
        if stueck.get('beilage'):
            base_id += "beil-"
    else:
        # Multiple stueck elements - only the first one's `when` is used
        when = stuecke[0].get('when')
        if not when:
            return None, "Missing when attribute in first stueck"
        base_id = f"{when}-"

    # akteur without kat always go into the base; akteur with kat are only
    # a fallback identifier further down.
    akteur_no_kat = beitrag.xpath('./*[local-name()="akteur"][@ref and not(@kat)]')
    akteur_with_kat = beitrag.xpath('./*[local-name()="akteur"][@ref and @kat]')
    if akteur_no_kat:
        # Include all akteur without kat (multiple authors)
        base_id += '-'.join(akteur.get('ref') for akteur in akteur_no_kat) + '-'

    # Try to find additional identifier in order of priority
    additional_part = ""

    # 1. Title. Remember whether text was present at all: that, not the
    # normalized result, decides later whether a nested beitrag is appended.
    titel = beitrag.xpath('./*[local-name()="titel"]')
    has_titel_text = bool(titel and titel[0].text)
    if has_titel_text:
        additional_part = normalize_text_for_url(titel[0].text)

    # 2. Incipit if the title produced nothing
    has_incipit_text = False
    if not additional_part:
        incipit = beitrag.xpath('./*[local-name()="incipit"]')
        has_incipit_text = bool(incipit and incipit[0].text)
        if has_incipit_text:
            additional_part = normalize_text_for_url(incipit[0].text)

    # 3. kategorie ref if no title/incipit
    if not additional_part:
        kategorie = beitrag.xpath('./*[local-name()="kategorie"][@ref]')
        if kategorie:
            additional_part = kategorie[0].get('ref')

    # 4. werk if no title/incipit/kategorie (kat="provenienz" is ignored)
    if not additional_part:
        werk = beitrag.xpath('./*[local-name()="werk"][@ref and @kat != "provenienz"]')
        if not werk:
            # No werk with a kat other than provenienz: try werk without kat
            werk = beitrag.xpath('./*[local-name()="werk"][@ref and not(@kat)]')
        if werk:
            kat = werk[0].get('kat')
            ref = werk[0].get('ref')
            additional_part = f"{kat}-{ref}" if kat else ref

    # 5. akteur with kat if no title/incipit/kategorie/werk
    if not additional_part and akteur_with_kat:
        akteur_ref = akteur_with_kat[0].get('ref')
        akteur_kat = akteur_with_kat[0].get('kat')
        additional_part = f"{akteur_ref}-{akteur_kat}"

    # 6. anmerkung if all else fails
    if not additional_part:
        anmerkung = beitrag.xpath('./*[local-name()="anmerkung"]')
        if anmerkung and anmerkung[0].text:
            additional_part = normalize_text_for_url(anmerkung[0].text)

    # 7. Nested beitrag reference: sole identifier when nothing else was
    # found, otherwise a suffix - but only if neither titel nor incipit
    # carried text.
    nested_beitrag = beitrag.xpath('./*[local-name()="beitrag"][@ref and @kat]')
    if nested_beitrag:
        nested_ref = nested_beitrag[0].get('ref')
        nested_kat = nested_beitrag[0].get('kat')
        if not additional_part:
            additional_part = f"{nested_ref}-{nested_kat}"
        elif not has_titel_text and not has_incipit_text:
            additional_part += f"-{nested_ref}-{nested_kat}"

    if not additional_part:
        # Report the failure together with the offending element
        element_info = get_element_info(beitrag)
        return None, f"No identifier found for beitrag:\n{element_info}"

    # Construct final ID
    original_id = base_id + additional_part
    final_id = original_id

    # Ensure uniqueness with roman numerals; the first duplicate gets "II"
    counter = 2
    while final_id in existing_ids:
        final_id = f"{original_id}-{int_to_roman(counter)}"
        counter += 1

    return final_id, None
|
|
|
|
def process_xml_file(file_path, existing_ids):
    """Assign IDs to the ID-less ``beitrag`` children of one XML file's root.

    The file is parsed with CDATA, blank text and comments retained. Every
    direct child of the root whose local name is ``beitrag`` and which has
    no ``id`` attribute is passed to ``generate_id_for_beitrag``. Each
    generated ID is set on the element and added to ``existing_ids``. The
    file is rewritten in place (UTF-8, with XML declaration, no
    pretty-printing) only when at least one ID was assigned. Progress and
    errors are printed as they occur.

    Args:
        file_path: Path of the XML file to read and possibly overwrite.
        existing_ids: Set of IDs already in use; updated in place.

    Returns:
        A tuple ``(candidates, error_count, errors)``: the number of
        elements examined, the number of non-empty error messages, and the
        list of error messages.
    """
    print(f"Processing {file_path}...")

    # Parser options are all "keep as is" so the rewrite disturbs as little
    # of the document as possible.
    xml_parser = etree.XMLParser(strip_cdata=False, remove_blank_text=False, remove_comments=False)
    document = etree.parse(file_path, xml_parser)
    pending = document.getroot().xpath('./*[local-name()="beitrag"][not(@id)]')

    failures = []
    assigned = 0
    for entry in pending:
        new_id, problem = generate_id_for_beitrag(entry, existing_ids)
        if not new_id:
            failures.append(problem)
            print(f" ERROR: {problem}")
            continue

        entry.set('id', new_id)
        # Register immediately so later elements cannot receive the same ID.
        existing_ids.add(new_id)
        assigned += 1
        print(f" Added ID: {new_id}")

    # Leave files untouched when nothing was assigned.
    if assigned:
        document.write(file_path, encoding='utf-8', xml_declaration=True, pretty_print=False)
        print(f" Updated {file_path}")

    failure_count = sum(1 for problem in failures if problem)
    return len(pending), failure_count, failures
|
|
|
|
def main():
    """Generate missing ``beitrag`` IDs in every ``XML/beitraege/*.xml`` file.

    Changes the working directory to two levels above this script's
    directory, then runs two passes over the matching files: the first
    collects every ``id`` already present on ``beitrag`` children of each
    file's root, the second calls ``process_xml_file`` to assign IDs where
    they are missing. A summary and the list of all errors are printed at
    the end. Returns ``None``.
    """
    # Change to project root directory
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.join(script_dir, '..', '..')
    os.chdir(project_root)

    # glob yields matches in arbitrary order. Sorting makes the processing
    # order - and therefore which duplicate receives which roman-numeral
    # suffix - reproducible between runs and machines.
    xml_files = sorted(glob.glob('XML/beitraege/*.xml'))

    if not xml_files:
        print("No XML files found in XML/beitraege/")
        return

    # First pass: collect all existing IDs to ensure uniqueness
    existing_ids = set()

    print("Collecting existing IDs...")
    for file_path in xml_files:
        try:
            parser = etree.XMLParser(strip_cdata=False, remove_blank_text=False, remove_comments=False)
            tree = etree.parse(file_path, parser)
            root = tree.getroot()

            # Existing IDs are taken from direct children of the root only
            existing_beitraege = root.xpath('./*[local-name()="beitrag"][@id]')
            for beitrag in existing_beitraege:
                existing_ids.add(beitrag.get('id'))
        except Exception as e:
            # Best effort at the script's top level: report the unreadable
            # file and keep going with the rest.
            print(f"Error reading {file_path}: {e}")

    print(f"Found {len(existing_ids)} existing IDs")

    # Second pass: generate IDs for beitraege without IDs
    total_processed = 0
    total_errors = 0
    all_errors = []

    for file_path in xml_files:
        try:
            processed, errors, error_list = process_xml_file(file_path, existing_ids)
            total_processed += processed
            total_errors += errors
            all_errors.extend(error_list)
        except Exception as e:
            # A file-level failure is counted and also listed, so the error
            # count in the summary matches the errors printed below it.
            message = f"Error processing {file_path}: {e}"
            print(message)
            total_errors += 1
            all_errors.append(message)

    print("\nSummary:")
    print(f"Total beitraege processed: {total_processed}")
    print(f"Total errors: {total_errors}")

    if all_errors:
        print("\nErrors encountered:")
        for error in all_errors:
            print(f" - {error}")
|
|
|
|
# Run the ID generation only when executed as a script, not on import.
if __name__ == "__main__":
    main()