KGPZ/Scripts/beitragids/generate_ids.py

#!/usr/bin/env python3

import os
import glob
import re
from lxml import etree

def int_to_roman(num):
    """Return the roman-numeral spelling of a positive integer.

    Values of zero or below yield an empty string.
    """
    # Value/symbol pairs in descending order, subtractive forms included
    numerals = (
        (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
        (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
        (10, "X"), (9, "IX"), (5, "V"), (4, "IV"),
        (1, "I"),
    )
    pieces = []
    remaining = num
    for value, symbol in numerals:
        if remaining <= 0:
            break
        # Emit the symbol as often as its value fits into what is left
        count = remaining // value
        pieces.append(symbol * count)
        remaining -= value * count
    return ''.join(pieces)

def normalize_text_for_url(text):
    """Turn free text into a lowercase, hyphen-separated slug.

    Only the part before the first of ``. , : ; ! ?`` is used. German umlauts
    and sharp s are transliterated, whitespace runs become single hyphens,
    and every character that is neither a word character nor a hyphen is
    dropped. Empty or missing input gives an empty string.
    """
    if not text:
        return ""

    # Keep only what precedes the first punctuation mark
    head = re.split(r'[.,:;!?]', text, maxsplit=1)[0]

    # Lowercase first so capital umlauts are transliterated as well
    umlaut_map = str.maketrans({'ä': 'ae', 'ü': 'ue', 'ö': 'oe', 'ß': 'ss'})
    transliterated = head.lower().translate(umlaut_map)

    # Whitespace -> hyphens, then drop anything that is not a word char or hyphen
    hyphenated = re.sub(r'\s+', '-', transliterated)
    cleaned = re.sub(r'[^\w\-]', '', hyphenated)

    # Collapse hyphen runs and trim hyphens at both ends
    return re.sub(r'-+', '-', cleaned).strip('-')

def get_element_info(element):
    """Build a debugging string for an element: its source line and its XML.

    The line comes from the element's ``sourceline`` attribute ('unknown' when
    the attribute does not exist); the XML is the pretty-printed serialization
    of the element with surrounding whitespace stripped.
    """
    serialized = etree.tostring(element, encoding='unicode', pretty_print=True)
    source_line = getattr(element, 'sourceline', 'unknown')
    return "Line {}: {}".format(source_line, serialized.strip())

def _first_child_text(beitrag, tag):
    """Return the text of the first direct child named ``tag``, or None if there is no such child."""
    # local-name() is used so the lookup ignores any namespace
    children = beitrag.xpath(f'./*[local-name()="{tag}"]')
    return children[0].text if children else None


def _base_id_from_stuecke(stuecke):
    """Build the leading ID part from a non-empty list of stueck elements.

    Returns ``(base_id, None)`` on success or ``(None, error_message)`` when a
    required attribute is missing.
    """
    first_stueck = stuecke[0]
    when = first_stueck.get('when')

    if len(stuecke) > 1:
        # Several stueck elements: only the first one's `when` goes into the ID
        if not when:
            element_info = get_element_info(first_stueck)
            return None, f"Missing when attribute in first stueck:\n{element_info}"
        return f"{when}-", None

    nr = first_stueck.get('nr')
    if not when or not nr:
        element_info = get_element_info(first_stueck)
        return None, f"Missing when ({when}) or nr ({nr}) in stueck:\n{element_info}"

    base_id = f"{when}-{nr}-"
    if first_stueck.get('beilage'):
        base_id += "beil-"
    return base_id, None


def generate_id_for_beitrag(beitrag, existing_ids):
    """Generate a unique ID for a beitrag element.

    The ID is built as ``<base><authors><identifier>``:

    * base: ``when-nr-`` (plus ``beil-`` if ``beilage`` is set) for a single
      stueck child, or ``when-`` of the first stueck when there are several;
    * authors: the ``ref`` values of all akteur children that have ``ref`` but
      no ``kat``, joined by hyphens and followed by a hyphen;
    * identifier: the first non-empty of normalized titel text, normalized
      incipit text, kategorie ``ref``, werk (``kat-ref`` or ``ref``), akteur
      with kat (``ref-kat``), normalized anmerkung text. A nested beitrag with
      ``ref`` and ``kat`` is used when nothing else was found, and is appended
      when neither titel nor incipit carries text.

    If the resulting ID is already in ``existing_ids``, a roman-numeral suffix
    (``-II``, ``-III``, ...) is added until it is free.

    Args:
        beitrag: element providing ``xpath()``; its children provide ``get()``
            and ``text``.
        existing_ids: container of IDs already in use. It is only read here;
            recording the new ID is left to the caller.

    Returns:
        ``(final_id, None)`` on success, ``(None, error_message)`` otherwise.
    """
    # Get all stueck elements - use local-name() to ignore namespace
    stuecke = beitrag.xpath('./*[local-name()="stueck"]')
    if not stuecke:
        element_info = get_element_info(beitrag)
        return None, f"No stueck elements found in beitrag:\n{element_info}"

    base_id, error = _base_id_from_stuecke(stuecke)
    if error is not None:
        return None, error

    # All akteur without kat are treated as (possibly multiple) authors
    akteur_no_kat = beitrag.xpath('./*[local-name()="akteur"][@ref and not(@kat)]')
    if akteur_no_kat:
        akteur_refs = '-'.join(akteur.get('ref') for akteur in akteur_no_kat)
        base_id += f"{akteur_refs}-"

    # Raw titel/incipit text is looked up once: it feeds steps 1-2 and also
    # decides in step 7 whether a nested beitrag gets appended.
    titel_text = _first_child_text(beitrag, 'titel')
    incipit_text = _first_child_text(beitrag, 'incipit')

    additional_part = ""

    # 1. Try title
    if titel_text:
        additional_part = normalize_text_for_url(titel_text)

    # 2. Try incipit if no title
    if not additional_part and incipit_text:
        additional_part = normalize_text_for_url(incipit_text)

    # 3. Try kategorie ref if no title/incipit
    if not additional_part:
        kategorie = beitrag.xpath('./*[local-name()="kategorie"][@ref]')
        if kategorie:
            additional_part = kategorie[0].get('ref')

    # 4. Try werk if no title/incipit/kategorie (ignore provenienz).
    # The first query only matches werk elements that have a kat attribute,
    # so werk without kat is tried separately as a fallback.
    if not additional_part:
        werk = (
            beitrag.xpath('./*[local-name()="werk"][@ref and @kat != "provenienz"]')
            or beitrag.xpath('./*[local-name()="werk"][@ref and not(@kat)]')
        )
        if werk:
            kat = werk[0].get('kat')
            ref = werk[0].get('ref')
            additional_part = f"{kat}-{ref}" if kat else ref

    # 5. Try akteur with kat if no title/incipit/kategorie/werk
    if not additional_part:
        akteur_with_kat = beitrag.xpath('./*[local-name()="akteur"][@ref and @kat]')
        if akteur_with_kat:
            akteur_ref = akteur_with_kat[0].get('ref')
            akteur_kat = akteur_with_kat[0].get('kat')
            additional_part = f"{akteur_ref}-{akteur_kat}"

    # 6. Try anmerkung if all else fails
    if not additional_part:
        anmerkung_text = _first_child_text(beitrag, 'anmerkung')
        if anmerkung_text:
            additional_part = normalize_text_for_url(anmerkung_text)

    # 7. Nested beitrag: sole identifier if nothing else was found, otherwise
    # appended only when neither titel nor incipit carries any text.
    nested_beitrag = beitrag.xpath('./*[local-name()="beitrag"][@ref and @kat]')
    if nested_beitrag:
        nested_ref = nested_beitrag[0].get('ref')
        nested_kat = nested_beitrag[0].get('kat')
        if not additional_part:
            additional_part = f"{nested_ref}-{nested_kat}"
        elif not titel_text and not incipit_text:
            additional_part += f"-{nested_ref}-{nested_kat}"

    if not additional_part:
        # Log failure with element info
        element_info = get_element_info(beitrag)
        return None, f"No identifier found for beitrag:\n{element_info}"

    # Construct final ID
    original_id = base_id + additional_part
    final_id = original_id

    # Ensure uniqueness with roman numerals; the first duplicate gets -II
    counter = 2
    while final_id in existing_ids:
        final_id = f"{original_id}-{int_to_roman(counter)}"
        counter += 1

    return final_id, None

def process_xml_file(file_path, existing_ids):
    """Add generated IDs to the id-less beitrag elements of one XML file.

    Every beitrag that is a direct child of the root and has no ``id``
    attribute is passed to ``generate_id_for_beitrag``. Each new ID is set on
    the element and added to ``existing_ids``. The file is rewritten in place
    only if at least one ID was assigned.

    Returns a tuple ``(candidates, error_count, errors)``: the number of
    id-less beitrag elements found, the number of non-empty error messages,
    and the list of error messages.
    """
    print(f"Processing {file_path}...")

    # Keep CDATA sections, blank text and comments when parsing
    xml_parser = etree.XMLParser(strip_cdata=False, remove_blank_text=False, remove_comments=False)
    document = etree.parse(file_path, xml_parser)

    # Direct children of the root named beitrag that carry no id attribute
    pending = document.getroot().xpath('./*[local-name()="beitrag"][not(@id)]')

    failures = []
    assigned = 0
    for entry in pending:
        new_id, problem = generate_id_for_beitrag(entry, existing_ids)
        if not new_id:
            failures.append(problem)
            print(f"  ERROR: {problem}")
            continue

        entry.set('id', new_id)
        # Register immediately so later entries cannot receive the same ID
        existing_ids.add(new_id)
        assigned += 1
        print(f"  Added ID: {new_id}")

    # Only touch the file on disk when something actually changed
    if assigned:
        document.write(file_path, encoding='utf-8', xml_declaration=True, pretty_print=False)
        print(f"  Updated {file_path}")

    return len(pending), sum(1 for problem in failures if problem), failures

def main():
    """Generate IDs for all beitrag elements in XML/beitraege/*.xml.

    Changes the working directory to two levels above this script, collects
    the IDs already present in every file, then processes each file with
    ``process_xml_file`` and prints a summary with all error messages.
    Files that cannot be read or processed are reported and counted as errors
    instead of aborting the run.
    """
    # Change to project root directory (two levels above the script directory)
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.join(script_dir, '..', '..')
    os.chdir(project_root)

    # glob returns matches in arbitrary order; sort them so that the
    # assignment of -II/-III suffixes to duplicates is reproducible.
    xml_files = sorted(glob.glob('XML/beitraege/*.xml'))

    if not xml_files:
        print("No XML files found in XML/beitraege/")
        return

    # First pass: collect all existing IDs to ensure uniqueness across files
    existing_ids = set()

    print("Collecting existing IDs...")
    for file_path in xml_files:
        try:
            parser = etree.XMLParser(strip_cdata=False, remove_blank_text=False, remove_comments=False)
            tree = etree.parse(file_path, parser)
            root = tree.getroot()

            # Existing IDs are read from beitrag elements directly below the root
            existing_beitraege = root.xpath('./*[local-name()="beitrag"][@id]')
            for beitrag in existing_beitraege:
                existing_ids.add(beitrag.get('id'))
        except Exception as e:
            # Best effort: report the unreadable file and continue with the rest
            print(f"Error reading {file_path}: {e}")

    print(f"Found {len(existing_ids)} existing IDs")

    # Second pass: generate IDs for beitraege without IDs
    total_processed = 0
    total_errors = 0
    all_errors = []

    for file_path in xml_files:
        try:
            processed, errors, error_list = process_xml_file(file_path, existing_ids)
            total_processed += processed
            total_errors += errors
            all_errors.extend(error_list)
        except Exception as e:
            # A file that fails as a whole counts as one error
            print(f"Error processing {file_path}: {e}")
            total_errors += 1

    print("\nSummary:")
    print(f"Total beitraege processed: {total_processed}")
    print(f"Total errors: {total_errors}")

    if all_errors:
        print("\nErrors encountered:")
        for error in all_errors:
            print(f"  - {error}")

# Run the ID generation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()