mirror of
https://github.com/Theodor-Springmann-Stiftung/KGPZ.git
synced 2025-10-30 09:25:29 +00:00
+ Beitrag-IDs
This commit is contained in:
292
Scripts/beitragids/generate_ids.py
Normal file
292
Scripts/beitragids/generate_ids.py
Normal file
@@ -0,0 +1,292 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import glob
|
||||
import re
|
||||
from lxml import etree
|
||||
|
||||
def int_to_roman(num):
|
||||
"""Convert integer to roman numeral"""
|
||||
val = [
|
||||
1000, 900, 500, 400,
|
||||
100, 90, 50, 40,
|
||||
10, 9, 5, 4,
|
||||
1
|
||||
]
|
||||
syb = [
|
||||
"M", "CM", "D", "CD",
|
||||
"C", "XC", "L", "XL",
|
||||
"X", "IX", "V", "IV",
|
||||
"I"
|
||||
]
|
||||
roman_num = ''
|
||||
i = 0
|
||||
while num > 0:
|
||||
for _ in range(num // val[i]):
|
||||
roman_num += syb[i]
|
||||
num -= val[i]
|
||||
i += 1
|
||||
return roman_num
|
||||
|
||||
def normalize_text_for_url(text):
|
||||
"""Normalize text for URL usage according to specifications"""
|
||||
if not text:
|
||||
return ""
|
||||
|
||||
# Take text up to first punctuation mark if present
|
||||
match = re.search(r'[.,:;!?]', text)
|
||||
if match:
|
||||
text = text[:match.start()]
|
||||
|
||||
# Lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Replace German umlauts
|
||||
text = text.replace('ä', 'ae').replace('ü', 'ue').replace('ö', 'oe').replace('ß', 'ss')
|
||||
|
||||
# Replace spaces with hyphens
|
||||
text = re.sub(r'\s+', '-', text)
|
||||
|
||||
# Remove all punctuation and special characters except hyphens
|
||||
text = re.sub(r'[^\w\-]', '', text)
|
||||
|
||||
# Remove multiple consecutive hyphens
|
||||
text = re.sub(r'-+', '-', text)
|
||||
|
||||
# Remove leading/trailing hyphens
|
||||
text = text.strip('-')
|
||||
|
||||
return text
|
||||
|
||||
def get_element_info(element):
|
||||
"""Get debugging info for an element including line number and XML content"""
|
||||
line_num = getattr(element, 'sourceline', 'unknown')
|
||||
|
||||
# Get the XML representation of the element
|
||||
xml_content = etree.tostring(element, encoding='unicode', pretty_print=True).strip()
|
||||
|
||||
return f"Line {line_num}: {xml_content}"
|
||||
|
||||
def generate_id_for_beitrag(beitrag, existing_ids):
|
||||
"""Generate unique ID for a beitrag element"""
|
||||
|
||||
# Get all stueck elements - use local-name() to ignore namespace
|
||||
stuecke = beitrag.xpath('./*[local-name()="stueck"]')
|
||||
|
||||
if not stuecke:
|
||||
element_info = get_element_info(beitrag)
|
||||
return None, f"No stueck elements found in beitrag:\n{element_info}"
|
||||
|
||||
base_id = ""
|
||||
|
||||
# Determine base ID based on number of stueck elements
|
||||
if len(stuecke) == 1:
|
||||
stueck = stuecke[0]
|
||||
when = stueck.get('when')
|
||||
nr = stueck.get('nr')
|
||||
beilage = stueck.get('beilage')
|
||||
|
||||
if not when or not nr:
|
||||
return None, f"Missing when ({when}) or nr ({nr}) in stueck"
|
||||
|
||||
base_id = f"{when}-{nr}-"
|
||||
if beilage:
|
||||
base_id += "beil-"
|
||||
else:
|
||||
# Multiple stueck elements - use year from first one
|
||||
first_stueck = stuecke[0]
|
||||
when = first_stueck.get('when')
|
||||
if not when:
|
||||
return None, "Missing when attribute in first stueck"
|
||||
base_id = f"{when}-"
|
||||
|
||||
# Check for akteur with ref attribute - prefer ones without kat attribute first
|
||||
akteur_no_kat = beitrag.xpath('./*[local-name()="akteur"][@ref and not(@kat)]')
|
||||
akteur_with_kat = beitrag.xpath('./*[local-name()="akteur"][@ref and @kat]')
|
||||
|
||||
akteur_used_as_identifier = False
|
||||
|
||||
if akteur_no_kat:
|
||||
# Include all akteur without kat (multiple authors)
|
||||
akteur_refs = [akteur.get('ref') for akteur in akteur_no_kat]
|
||||
base_id += f"{'-'.join(akteur_refs)}-"
|
||||
|
||||
# Try to find additional identifier in order of priority
|
||||
additional_part = ""
|
||||
|
||||
# 1. Try title
|
||||
titel = beitrag.xpath('./*[local-name()="titel"]')
|
||||
if titel and titel[0].text:
|
||||
additional_part = normalize_text_for_url(titel[0].text)
|
||||
|
||||
# 2. Try incipit if no title
|
||||
if not additional_part:
|
||||
incipit = beitrag.xpath('./*[local-name()="incipit"]')
|
||||
if incipit and incipit[0].text:
|
||||
additional_part = normalize_text_for_url(incipit[0].text)
|
||||
|
||||
# 3. Try kategorie ref if no title/incipit
|
||||
if not additional_part:
|
||||
kategorie = beitrag.xpath('./*[local-name()="kategorie"][@ref]')
|
||||
if kategorie:
|
||||
additional_part = kategorie[0].get('ref')
|
||||
|
||||
# 4. Try werk if no title/incipit/kategorie (ignore provinienz)
|
||||
if not additional_part:
|
||||
werk = beitrag.xpath('./*[local-name()="werk"][@ref and @kat != "provinienz"]')
|
||||
if not werk:
|
||||
# If no werk with kat != provinienz, try werk without kat
|
||||
werk = beitrag.xpath('./*[local-name()="werk"][@ref and not(@kat)]')
|
||||
|
||||
if werk:
|
||||
kat = werk[0].get('kat')
|
||||
ref = werk[0].get('ref')
|
||||
if kat:
|
||||
additional_part = f"{kat}-{ref}"
|
||||
else:
|
||||
additional_part = ref
|
||||
|
||||
# 5. Try akteur with kat if no title/incipit/kategorie/werk
|
||||
if not additional_part and akteur_with_kat:
|
||||
akteur_ref = akteur_with_kat[0].get('ref')
|
||||
akteur_kat = akteur_with_kat[0].get('kat')
|
||||
additional_part = f"{akteur_ref}-{akteur_kat}"
|
||||
akteur_used_as_identifier = True
|
||||
|
||||
# 6. Try anmerkung if all else fails
|
||||
if not additional_part:
|
||||
anmerkung = beitrag.xpath('./*[local-name()="anmerkung"]')
|
||||
if anmerkung and anmerkung[0].text:
|
||||
additional_part = normalize_text_for_url(anmerkung[0].text)
|
||||
|
||||
# 7. Check for nested beitrag tag and append its ref+kat (only if no title/incipit was found)
|
||||
nested_beitrag = beitrag.xpath('./*[local-name()="beitrag"][@ref and @kat]')
|
||||
if nested_beitrag and additional_part:
|
||||
# Only append if we don't already have title/incipit
|
||||
titel = beitrag.xpath('./*[local-name()="titel"]')
|
||||
incipit = beitrag.xpath('./*[local-name()="incipit"]')
|
||||
|
||||
# If we have title or incipit, don't append nested beitrag
|
||||
if not (titel and titel[0].text) and not (incipit and incipit[0].text):
|
||||
nested_ref = nested_beitrag[0].get('ref')
|
||||
nested_kat = nested_beitrag[0].get('kat')
|
||||
additional_part += f"-{nested_ref}-{nested_kat}"
|
||||
elif nested_beitrag and not additional_part:
|
||||
# Use nested beitrag as identifier if nothing else was found
|
||||
nested_ref = nested_beitrag[0].get('ref')
|
||||
nested_kat = nested_beitrag[0].get('kat')
|
||||
additional_part = f"{nested_ref}-{nested_kat}"
|
||||
|
||||
if not additional_part:
|
||||
# Log failure with element info
|
||||
element_info = get_element_info(beitrag)
|
||||
return None, f"No identifier found for beitrag:\n{element_info}"
|
||||
|
||||
# Construct final ID
|
||||
final_id = base_id + additional_part
|
||||
|
||||
# Ensure uniqueness with roman numerals
|
||||
original_id = final_id
|
||||
counter = 2 # Start with II for first duplicate
|
||||
while final_id in existing_ids:
|
||||
final_id = f"{original_id}-{int_to_roman(counter)}"
|
||||
counter += 1
|
||||
|
||||
return final_id, None
|
||||
|
||||
def process_xml_file(file_path, existing_ids):
|
||||
"""Process a single XML file and add IDs to beitrag elements"""
|
||||
|
||||
print(f"Processing {file_path}...")
|
||||
|
||||
# Parse with lxml preserving whitespace, comments, and line numbers
|
||||
parser = etree.XMLParser(strip_cdata=False, remove_blank_text=False, remove_comments=False)
|
||||
tree = etree.parse(file_path, parser)
|
||||
root = tree.getroot()
|
||||
|
||||
# Find all beitrag elements that are direct children of beitraege and don't have an id attribute
|
||||
beitraege = root.xpath('./*[local-name()="beitrag"][not(@id)]')
|
||||
|
||||
modified = False
|
||||
errors = []
|
||||
|
||||
for beitrag in beitraege:
|
||||
generated_id, error = generate_id_for_beitrag(beitrag, existing_ids)
|
||||
|
||||
if generated_id:
|
||||
beitrag.set('id', generated_id)
|
||||
existing_ids.add(generated_id)
|
||||
modified = True
|
||||
print(f" Added ID: {generated_id}")
|
||||
else:
|
||||
errors.append(error)
|
||||
print(f" ERROR: {error}")
|
||||
|
||||
# Save the file if modified
|
||||
if modified:
|
||||
# Write back with original formatting preserved
|
||||
tree.write(file_path, encoding='utf-8', xml_declaration=True, pretty_print=False)
|
||||
print(f" Updated {file_path}")
|
||||
|
||||
return len(beitraege), len([e for e in errors if e]), errors
|
||||
|
||||
def main():
|
||||
"""Main function to process all XML files"""
|
||||
|
||||
# Change to project root directory
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
project_root = os.path.join(script_dir, '..', '..')
|
||||
os.chdir(project_root)
|
||||
|
||||
# Find all beitraege XML files
|
||||
xml_files = glob.glob('XML/beitraege/*.xml')
|
||||
|
||||
if not xml_files:
|
||||
print("No XML files found in XML/beitraege/")
|
||||
return
|
||||
|
||||
# First pass: collect all existing IDs to ensure uniqueness
|
||||
existing_ids = set()
|
||||
|
||||
print("Collecting existing IDs...")
|
||||
for file_path in xml_files:
|
||||
try:
|
||||
parser = etree.XMLParser(strip_cdata=False, remove_blank_text=False, remove_comments=False)
|
||||
tree = etree.parse(file_path, parser)
|
||||
root = tree.getroot()
|
||||
|
||||
# Find all existing IDs from direct children of beitraege
|
||||
existing_beitraege = root.xpath('./*[local-name()="beitrag"][@id]')
|
||||
for beitrag in existing_beitraege:
|
||||
existing_ids.add(beitrag.get('id'))
|
||||
except Exception as e:
|
||||
print(f"Error reading {file_path}: {e}")
|
||||
|
||||
print(f"Found {len(existing_ids)} existing IDs")
|
||||
|
||||
# Second pass: generate IDs for beitraege without IDs
|
||||
total_processed = 0
|
||||
total_errors = 0
|
||||
all_errors = []
|
||||
|
||||
for file_path in xml_files:
|
||||
try:
|
||||
processed, errors, error_list = process_xml_file(file_path, existing_ids)
|
||||
total_processed += processed
|
||||
total_errors += errors
|
||||
all_errors.extend(error_list)
|
||||
except Exception as e:
|
||||
print(f"Error processing {file_path}: {e}")
|
||||
total_errors += 1
|
||||
|
||||
print(f"\nSummary:")
|
||||
print(f"Total beitraege processed: {total_processed}")
|
||||
print(f"Total errors: {total_errors}")
|
||||
|
||||
if all_errors:
|
||||
print(f"\nErrors encountered:")
|
||||
for error in all_errors:
|
||||
print(f" - {error}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user