diff --git a/Scripts/lint_validation.py b/Scripts/lint_validation.py index 21033d1..04fd82a 100644 --- a/Scripts/lint_validation.py +++ b/Scripts/lint_validation.py @@ -1,6 +1,5 @@ import os from lxml import etree -from urllib.parse import urljoin REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) XML_DIR = os.path.join(REPO_ROOT, 'XML') @@ -11,42 +10,52 @@ def validate_xml(xml_file): tree = etree.parse(xml_file, parser) root = tree.getroot() - # Get the schema location schema_location = root.get('{http://www.w3.org/2001/XMLSchema-instance}schemaLocation') if schema_location: namespace, xsd_path = schema_location.split() - - # Convert relative path to absolute xsd_path = os.path.normpath(os.path.join(os.path.dirname(xml_file), xsd_path)) if os.path.exists(xsd_path): xsd_doc = etree.parse(xsd_path) schema = etree.XMLSchema(xsd_doc) - - # Validate the XML against the schema schema.assertValid(tree) - print(f"Validation successful: {xml_file}") + print(f"Validation erfolgreich: {xml_file}") else: - print(f"Schema file not found: {xsd_path} for {xml_file}") + print(f"Schema-Datei nicht gefunden: {xsd_path} für {xml_file}") + return False else: - print(f"No schema location found in {xml_file}") + print(f"Keine Schema-Location gefunden in {xml_file}") + return False except etree.DocumentInvalid as e: - print(f"Validation error in {xml_file}:") + print(f"Validierungsfehler in {xml_file}:") for error in e.error_log: - print(f" Line {error.line}, Column {error.column}: {error.message}") + print(f" Zeile {error.line}, Spalte {error.column}: {error.message}") + return False except etree.XMLSyntaxError as e: - print(f"XML syntax error in {xml_file}:") - print(f" Line {e.lineno}, Column {e.offset}: {e.msg}") + print(f"XML-Syntaxfehler in {xml_file}:") + print(f" Zeile {e.lineno}, Spalte {e.offset}: {e.msg}") + return False except Exception as e: - print(f"Error processing {xml_file}: {str(e)}") + print(f"Fehler bei der Verarbeitung von {xml_file}: {str(e)}") + return False + + return True def main(): + validation_failed = False for root, dirs, files in os.walk(XML_DIR): for file in files: if file.endswith('.xml'): xml_file = os.path.join(root, file) - validate_xml(xml_file) + if not validate_xml(xml_file): + validation_failed = True + + if validation_failed: + print("Validierung fehlgeschlagen. Bitte korrigieren Sie die oben genannten Fehler.") + exit(1) + else: + print("Alle XML-Dateien wurden erfolgreich validiert.") if __name__ == "__main__": main() diff --git a/generate_html.py b/generate_html.py new file mode 100644 index 0000000..a9e8bd7 --- /dev/null +++ b/generate_html.py @@ -0,0 +1,201 @@ +import xml.etree.ElementTree as ET +from pathlib import Path +from collections import defaultdict + +BASE_DIR = Path('XML') +BEITRAEGE_DIR = BASE_DIR / 'beitraege' +OUTPUT_DIR = Path('output') +NAMESPACES = { + 'kgpz': 'https://www.koenigsberger-zeitungen.de', + 'xsd': 'http://www.w3.org/2001/XMLSchema' +} + +def parse_xml_file(file_path): + return ET.parse(file_path).getroot() + +def load_werke(): + werke_root = parse_xml_file(BASE_DIR / 'werke.xml') + return {werk.attrib['id']: werk.find('kgpz:zitation', NAMESPACES).text + for werk in werke_root.findall('.//kgpz:werk', NAMESPACES)} + +def load_akteure(): + akteure_root = parse_xml_file(BASE_DIR / 'akteure.xml') + return {akteur.attrib['id']: akteur.find('kgpz:name', NAMESPACES).text + for akteur in akteure_root.findall('.//kgpz:akteur', NAMESPACES)} + +def load_orte(): + orte_root = parse_xml_file(BASE_DIR / 'orte.xml') + return {ort.attrib['id']: ort.find('kgpz:name', NAMESPACES).text + for ort in orte_root.findall('.//kgpz:ort', NAMESPACES)} + +def load_kategorien(): + kategorien_root = parse_xml_file(BASE_DIR / 'kategorien.xml') + return {kategorie.attrib['id']: kategorie.find('kgpz:name', NAMESPACES).text + for kategorie in kategorien_root.findall('.//kgpz:kategorie', NAMESPACES)} + +def load_reference_types(): + schema_root = parse_xml_file(BASE_DIR.parent / 'XSD' / 'common.xsd') + + def get_types(element_name): + complex_type = schema_root.find(f".//xsd:complexType[@name='{element_name}ref']", NAMESPACES) + simple_type = complex_type.find(".//xsd:simpleType", NAMESPACES) + return [e.attrib['value'] for e in simple_type.findall(".//xsd:enumeration", NAMESPACES)] + + def get_default(element_name): + complex_type = schema_root.find(f".//xsd:complexType[@name='{element_name}ref']", NAMESPACES) + attribute = complex_type.find(".//xsd:attribute[@name='kat']", NAMESPACES) + return attribute.attrib.get('default', '') + + return { + 'werk': {'types': get_types('werk'), 'default': get_default('werk')}, + 'akteur': {'types': get_types('akteur'), 'default': get_default('akteur')}, + 'ort': {'types': get_types('ort'), 'default': get_default('ort')} + } + +WERKE = load_werke() +AKTEURE = load_akteure() +ORTE = load_orte() +KATEGORIEN = load_kategorien() +REFERENCE_TYPES = load_reference_types() + +def process_stueck_or_beilage(element, current_year, current_issue): + tag_name = element.tag.split('}')[-1] + datum = element.attrib.get('datum', '') + nr = element.attrib.get('nr', '') + year = datum.split('-')[0] + + if tag_name == 'stueck': + von = element.attrib.get('von', '') + bis = element.attrib.get('bis', '') + pages = f"{von}" if von == bis or not bis else f"{von}-{bis}" + + if year == current_year and nr == current_issue: + return f"
{pages}
", (year, nr, 0) + else: + return f"Ausgabe {nr} ({datum}), {pages}
", (year, nr, 0) + else: + beilage_nr = element.attrib.get('beilage', '') + if year == current_year and nr == current_issue: + return f"Beilage {beilage_nr}
", (year, nr, 1) + else: + return f"Beilage {beilage_nr} zu Ausgabe {nr} ({datum})
", (year, nr, 1) + +def process_kategorie(element): + kategorie_id = element.attrib.get('ref', '') + kategorie_name = KATEGORIEN.get(kategorie_id, kategorie_id) + return f'{kategorie_name}' + +def process_titel_or_incipit(element): + return f'{element.text}
' + +def process_reference(element, ref_type): + ref_id = element.attrib.get('ref', '') + content = WERKE.get(ref_id, '') if ref_type == 'werk' else AKTEURE.get(ref_id, '') if ref_type == 'akteur' else ORTE.get(ref_id, '') + if not content: + content = f"{ref_type.capitalize()} nicht gefunden: {ref_id}" + + kat_id = element.attrib.get('kat', REFERENCE_TYPES[ref_type]['default']) + kat_name = KATEGORIEN.get(kat_id, kat_id) + return f'{kat_name} {content}
' + +def process_anmerkung_or_vermerk(element): + return f'{element.text}
' + +def process_beitrag(beitrag, current_year, current_issue): + content = f'