Files
KGPZ/Scripts/beitragids/generate_ids.py

292 lines
9.7 KiB
Python

#!/usr/bin/env python3
import os
import glob
import re
from lxml import etree
def int_to_roman(num):
    """Return the roman-numeral spelling of a positive integer.

    Zero and negative values produce an empty string.
    """
    # Ordered from largest to smallest, including the subtractive pairs
    # (CM, CD, XC, XL, IX, IV), so a single greedy pass is sufficient.
    numerals = (
        (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
        (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
        (10, "X"), (9, "IX"), (5, "V"), (4, "IV"),
        (1, "I"),
    )
    pieces = []
    for value, symbol in numerals:
        if num <= 0:
            break
        repeat, num = divmod(num, value)
        pieces.append(symbol * repeat)
    return "".join(pieces)
def normalize_text_for_url(text):
    """Turn free text into a lowercase, hyphen-separated URL slug.

    Only the part in front of the first punctuation mark (one of ``. , : ; ! ?``)
    is used. The text is lowercased, German umlauts and sharp s are
    transliterated, whitespace runs become hyphens, and every remaining
    character that is neither a word character nor a hyphen is dropped.
    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if not text:
        return ""

    # Keep only the leading segment before the first punctuation mark.
    head = re.split(r'[.,:;!?]', text, maxsplit=1)[0]

    slug = head.lower()

    # Transliterate German special characters (input is already lowercase).
    for special, ascii_form in (('ä', 'ae'), ('ü', 'ue'), ('ö', 'oe'), ('ß', 'ss')):
        slug = slug.replace(special, ascii_form)

    slug = re.sub(r'\s+', '-', slug)      # whitespace runs -> single hyphen
    slug = re.sub(r'[^\w\-]', '', slug)   # drop anything but word chars and hyphens
    slug = re.sub(r'-+', '-', slug)       # collapse repeated hyphens

    return slug.strip('-')
def get_element_info(element):
    """Describe an element for diagnostics: its source line and serialized XML.

    The line number is read from the element's ``sourceline`` attribute and
    falls back to the string ``'unknown'`` when that attribute does not exist.
    """
    source_line = getattr(element, 'sourceline', 'unknown')
    # Serialize to a str (not bytes) and trim surrounding whitespace.
    serialized = etree.tostring(element, encoding='unicode', pretty_print=True)
    return f"Line {source_line}: {serialized.strip()}"
def _children_named(parent, tag, predicate=""):
    """Return direct children of *parent* whose local name is *tag*.

    Uses ``local-name()`` so the element namespace is ignored. *predicate* is an
    optional XPath predicate (brackets included) appended to the query.
    """
    return parent.xpath(f'./*[local-name()="{tag}"]{predicate}')


def _base_id_from_stuecke(stuecke):
    """Build the ID prefix from a non-empty list of stueck elements.

    Returns ``(base_id, None)`` on success or ``(None, error_message)`` when a
    required attribute is missing.
    """
    first = stuecke[0]
    when = first.get('when')

    if len(stuecke) == 1:
        nr = first.get('nr')
        if not when or not nr:
            return None, f"Missing when ({when}) or nr ({nr}) in stueck"
        base_id = f"{when}-{nr}-"
        if first.get('beilage'):
            base_id += "beil-"
        return base_id, None

    # Several stueck elements: only the `when` value of the first one is used.
    if not when:
        return None, "Missing when attribute in first stueck"
    return f"{when}-", None


def _slug_of_first(elements):
    """Return the URL slug of the first element's text, or "" if there is none."""
    if elements and elements[0].text:
        return normalize_text_for_url(elements[0].text)
    return ""


def _werk_identifier(beitrag):
    """Return an identifier derived from a werk child, or "" if none qualifies.

    A werk with a ``kat`` other than "provenienz" is preferred; otherwise a werk
    without any ``kat`` attribute is used.
    """
    werk = (
        _children_named(beitrag, "werk", '[@ref and @kat != "provenienz"]')
        or _children_named(beitrag, "werk", '[@ref and not(@kat)]')
    )
    if not werk:
        return ""
    kat = werk[0].get('kat')
    ref = werk[0].get('ref')
    return f"{kat}-{ref}" if kat else ref


def generate_id_for_beitrag(beitrag, existing_ids):
    """Generate a unique ID for a beitrag element.

    The ID is ``<base><identifier>`` where

    * the base comes from the stueck children (``when-nr-`` plus ``beil-`` for a
      single stueck with a ``beilage`` attribute, or just ``when-`` when there
      are several), followed by the refs of all akteur children that have a
      ``ref`` but no ``kat``;
    * the identifier is the first non-empty candidate, in this order: titel
      text, incipit text, kategorie ref, werk ref, akteur ref with kat,
      anmerkung text, nested beitrag ref+kat. A nested beitrag is additionally
      appended to an identifier found earlier, unless the beitrag has titel or
      incipit text.

    If the resulting ID is already in *existing_ids*, a roman-numeral suffix
    (``-II``, ``-III``, ...) is appended until it is unused. *existing_ids* is
    only read here; the caller is responsible for adding the returned ID.

    Args:
        beitrag: The beitrag element (must support ``xpath`` and ``get``).
        existing_ids: Collection of IDs that are already taken.

    Returns:
        ``(id, None)`` on success, or ``(None, error_message)`` if no ID could
        be built.
    """
    stuecke = _children_named(beitrag, "stueck")
    if not stuecke:
        element_info = get_element_info(beitrag)
        return None, f"No stueck elements found in beitrag:\n{element_info}"

    base_id, error = _base_id_from_stuecke(stuecke)
    if error is not None:
        return None, error

    # akteur without kat are treated as authors and always become part of the
    # base; akteur with kat only serve as a fallback identifier further down.
    akteur_no_kat = _children_named(beitrag, "akteur", '[@ref and not(@kat)]')
    akteur_with_kat = _children_named(beitrag, "akteur", '[@ref and @kat]')
    if akteur_no_kat:
        base_id += "-".join(akteur.get('ref') for akteur in akteur_no_kat) + "-"

    # Queried once: needed both for the slug and for the nested-beitrag rule.
    titel = _children_named(beitrag, "titel")
    incipit = _children_named(beitrag, "incipit")
    has_titel_text = bool(titel and titel[0].text)
    has_incipit_text = bool(incipit and incipit[0].text)

    # 1. Title, 2. incipit (a slug may be empty even if the text is not).
    additional_part = _slug_of_first(titel)
    if not additional_part:
        additional_part = _slug_of_first(incipit)

    # 3. kategorie ref
    if not additional_part:
        kategorie = _children_named(beitrag, "kategorie", '[@ref]')
        if kategorie:
            additional_part = kategorie[0].get('ref')

    # 4. werk (ignoring kat="provenienz")
    if not additional_part:
        additional_part = _werk_identifier(beitrag)

    # 5. akteur with kat
    if not additional_part and akteur_with_kat:
        akteur_ref = akteur_with_kat[0].get('ref')
        akteur_kat = akteur_with_kat[0].get('kat')
        additional_part = f"{akteur_ref}-{akteur_kat}"

    # 6. anmerkung text
    if not additional_part:
        additional_part = _slug_of_first(_children_named(beitrag, "anmerkung"))

    # 7. Nested beitrag: sole identifier if nothing else was found, otherwise a
    #    suffix - but only when the beitrag carries no titel/incipit text.
    nested_beitrag = _children_named(beitrag, "beitrag", '[@ref and @kat]')
    if nested_beitrag:
        nested_ref = nested_beitrag[0].get('ref')
        nested_kat = nested_beitrag[0].get('kat')
        if not additional_part:
            additional_part = f"{nested_ref}-{nested_kat}"
        elif not has_titel_text and not has_incipit_text:
            additional_part += f"-{nested_ref}-{nested_kat}"

    if not additional_part:
        element_info = get_element_info(beitrag)
        return None, f"No identifier found for beitrag:\n{element_info}"

    # Disambiguate duplicates with roman numerals; the first duplicate gets II.
    original_id = base_id + additional_part
    final_id = original_id
    counter = 2
    while final_id in existing_ids:
        final_id = f"{original_id}-{int_to_roman(counter)}"
        counter += 1

    return final_id, None
def process_xml_file(file_path, existing_ids):
    """Assign generated IDs to every id-less beitrag in one XML file.

    Only beitrag elements that are direct children of the document root and
    lack an ``id`` attribute are considered. Each generated ID is written to
    the element and added to *existing_ids*. The file is rewritten in place
    only if at least one ID was assigned.

    Returns:
        A tuple ``(candidate_count, error_count, errors)`` where *errors* is
        the list of error messages for elements that could not get an ID.
    """
    print(f"Processing {file_path}...")

    # Keep CDATA, blank text and comments so the rewritten file stays close
    # to the original.
    xml_parser = etree.XMLParser(strip_cdata=False, remove_blank_text=False, remove_comments=False)
    document = etree.parse(file_path, xml_parser)

    pending = document.getroot().xpath('./*[local-name()="beitrag"][not(@id)]')

    errors = []
    assigned = 0
    for beitrag in pending:
        new_id, problem = generate_id_for_beitrag(beitrag, existing_ids)
        if not new_id:
            errors.append(problem)
            print(f" ERROR: {problem}")
            continue

        beitrag.set('id', new_id)
        existing_ids.add(new_id)
        assigned += 1
        print(f" Added ID: {new_id}")

    if assigned:
        document.write(file_path, encoding='utf-8', xml_declaration=True, pretty_print=False)
        print(f" Updated {file_path}")

    error_count = sum(1 for problem in errors if problem)
    return len(pending), error_count, errors
def main():
    """Generate IDs for all beitrag elements in ``XML/beitraege/*.xml``.

    Changes the working directory to the project root (two levels above this
    script), collects the IDs that already exist across all files, then
    processes each file and prints a summary of processed elements and errors.
    """
    # Change to project root directory
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.join(script_dir, '..', '..')
    os.chdir(project_root)

    # glob returns matches in arbitrary, filesystem-dependent order. Sort so
    # that files are always processed in the same sequence; otherwise which of
    # several colliding beitraege receives the "-II"/"-III" suffix could differ
    # from run to run or machine to machine.
    xml_files = sorted(glob.glob('XML/beitraege/*.xml'))
    if not xml_files:
        print("No XML files found in XML/beitraege/")
        return

    # First pass: collect all existing IDs to ensure uniqueness
    existing_ids = set()
    print("Collecting existing IDs...")
    for file_path in xml_files:
        try:
            parser = etree.XMLParser(strip_cdata=False, remove_blank_text=False, remove_comments=False)
            tree = etree.parse(file_path, parser)
            root = tree.getroot()
            # Only direct children of the root element are considered
            for beitrag in root.xpath('./*[local-name()="beitrag"][@id]'):
                existing_ids.add(beitrag.get('id'))
        except Exception as e:
            # Top-level boundary: report and continue with the remaining files.
            print(f"Error reading {file_path}: {e}")
    print(f"Found {len(existing_ids)} existing IDs")

    # Second pass: generate IDs for beitraege without IDs
    total_processed = 0
    total_errors = 0
    all_errors = []
    for file_path in xml_files:
        try:
            processed, errors, error_list = process_xml_file(file_path, existing_ids)
        except Exception as e:
            # Top-level boundary: report, count and remember the failure so the
            # error list in the summary matches the reported error total.
            message = f"Error processing {file_path}: {e}"
            print(message)
            total_errors += 1
            all_errors.append(message)
        else:
            total_processed += processed
            total_errors += errors
            all_errors.extend(error_list)

    print("\nSummary:")
    print(f"Total beitraege processed: {total_processed}")
    print(f"Total errors: {total_errors}")
    if all_errors:
        print("\nErrors encountered:")
        for error in all_errors:
            print(f" - {error}")
# Run the ID generation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()