mirror of
https://github.com/Theodor-Springmann-Stiftung/KGPZ.git
synced 2025-10-29 00:55:31 +00:00
Generation Script + BUGFIX, XSD für Beilagen
This commit is contained in:
201
Scripts/generate_html.py
Normal file
201
Scripts/generate_html.py
Normal file
@@ -0,0 +1,201 @@
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
# Input locations; both are relative paths.
BASE_DIR = Path('XML')
BEITRAEGE_DIR = BASE_DIR.joinpath('beitraege')
# Directory the generated HTML page is written to.
OUTPUT_DIR = Path('output')
# Prefix -> namespace URI map handed to the ElementTree find()/findall()
# calls in this script.
NAMESPACES = dict(
    kgpz='https://www.koenigsberger-zeitungen.de',
    xsd='http://www.w3.org/2001/XMLSchema',
)
|
||||
|
||||
def parse_xml_file(file_path):
    """Parse the XML document at *file_path* and return its root element."""
    tree = ET.parse(file_path)
    return tree.getroot()
|
||||
|
||||
def _load_id_text_map(filename, element_tag, child_tag):
    """Build an ``id`` -> child-text lookup table from one register file.

    Reads ``BASE_DIR / filename``, visits every ``kgpz:<element_tag>`` element
    at any depth and maps its ``id`` attribute to the text of its first direct
    ``kgpz:<child_tag>`` child (``None`` when that child has no text).

    Raises:
        KeyError: if an element has no ``id`` attribute.
        ValueError: if an element lacks the expected child element.
    """
    root = parse_xml_file(BASE_DIR / filename)
    mapping = {}
    for entry in root.findall(f'.//kgpz:{element_tag}', NAMESPACES):
        child = entry.find(f'kgpz:{child_tag}', NAMESPACES)
        if child is None:
            # Name the offending entry instead of failing with an opaque
            # "'NoneType' object has no attribute 'text'".
            raise ValueError(
                f"{filename}: <{element_tag} id={entry.attrib.get('id')!r}> "
                f"has no <{child_tag}> child"
            )
        mapping[entry.attrib['id']] = child.text
    return mapping


def load_werke():
    """Return a dict mapping each werk id in ``werke.xml`` to its zitation text."""
    return _load_id_text_map('werke.xml', 'werk', 'zitation')


def load_akteure():
    """Return a dict mapping each akteur id in ``akteure.xml`` to its name text."""
    return _load_id_text_map('akteure.xml', 'akteur', 'name')


def load_orte():
    """Return a dict mapping each ort id in ``orte.xml`` to its name text."""
    return _load_id_text_map('orte.xml', 'ort', 'name')


def load_kategorien():
    """Return a dict mapping each kategorie id in ``kategorien.xml`` to its name text."""
    return _load_id_text_map('kategorien.xml', 'kategorie', 'name')
|
||||
|
||||
def load_reference_types():
    """Read the reference-type definitions for werk, akteur and ort from the schema.

    Parses ``XSD/common.xsd`` (a sibling of ``BASE_DIR``) and, for each of the
    three names, inspects the ``xsd:complexType`` named ``<name>ref``.

    Returns:
        dict keyed by ``'werk'``, ``'akteur'`` and ``'ort'``; each value is
        ``{'types': [...], 'default': ...}`` where ``types`` lists the
        enumeration values found under the first nested ``xsd:simpleType`` and
        ``default`` is the ``default`` of the ``kat`` attribute declaration
        (empty string when it declares none).
    """
    schema_root = parse_xml_file(BASE_DIR.parent / 'XSD' / 'common.xsd')

    def describe(element_name):
        # Resolve the complexType once and read both facts from it.
        ref_type = schema_root.find(
            f".//xsd:complexType[@name='{element_name}ref']", NAMESPACES)
        enum_holder = ref_type.find(".//xsd:simpleType", NAMESPACES)
        allowed = [
            enum.attrib['value']
            for enum in enum_holder.findall(".//xsd:enumeration", NAMESPACES)
        ]
        kat_attribute = ref_type.find(".//xsd:attribute[@name='kat']", NAMESPACES)
        return {'types': allowed, 'default': kat_attribute.attrib.get('default', '')}

    return {name: describe(name) for name in ('werk', 'akteur', 'ort')}
|
||||
|
||||
# Lookup tables, filled once when the module is loaded.
# Each of the first four maps an id to the text read by its loader.
WERKE = load_werke()
AKTEURE = load_akteure()
ORTE = load_orte()
KATEGORIEN = load_kategorien()
# Allowed 'kat' values and the schema default per reference kind.
REFERENCE_TYPES = load_reference_types()
|
||||
|
||||
def process_stueck_or_beilage(element, current_year, current_issue):
    """Render a ``stueck`` or ``beilage`` element as an HTML paragraph.

    The year is the part of the ``datum`` attribute before the first ``-``.
    When the element's year and ``nr`` equal *current_year* / *current_issue*
    the short form is produced; otherwise issue number and date are spelled
    out as well.

    Returns:
        ``(html, (year, nr, kind))`` where ``kind`` is 0 for a stueck and 1
        for anything else (treated as a beilage).
    """
    attrs = element.attrib
    kind = element.tag.rsplit('}', 1)[-1]  # drop the '{namespace}' prefix
    datum = attrs.get('datum', '')
    nr = attrs.get('nr', '')
    year = datum.partition('-')[0]
    same_issue = (year, nr) == (current_year, current_issue)

    if kind != 'stueck':
        beilage_nr = attrs.get('beilage', '')
        if same_issue:
            html = f"<p class='beilage'>Beilage {beilage_nr}</p>"
        else:
            html = f"<p class='beilage'>Beilage {beilage_nr} zu Ausgabe {nr} ({datum})</p>"
        return html, (year, nr, 1)

    von = attrs.get('von', '')
    bis = attrs.get('bis', '')
    # A range is shown only when an end page exists and differs from the start.
    pages = f"{von}-{bis}" if bis and von != bis else f"{von}"
    if same_issue:
        html = f"<p class='stueck'>{pages}</p>"
    else:
        html = f"<p class='stueck'>Ausgabe {nr} ({datum}), {pages}</p>"
    return html, (year, nr, 0)
|
||||
|
||||
def process_kategorie(element):
    """Render a kategorie reference as a "pill" span.

    The ``ref`` attribute is looked up in ``KATEGORIEN``; an id that is not
    in the table is shown as-is.
    """
    ref = element.attrib.get('ref', '')
    label = KATEGORIEN.get(ref, ref)
    return f'<span class="pill"><strong>{label}</strong></span>'
|
||||
|
||||
def process_titel_or_incipit(element):
    """Render the text of a titel or incipit element as an HTML paragraph.

    An element without text yields an empty paragraph rather than the literal
    string "None", and ``&``, ``<`` and ``>`` are escaped so that text parsed
    from XML cannot break the generated markup.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    # quote=False: the text lands in element content, not inside an attribute.
    text = escape(element.text or '', quote=False)
    return f'<p class="titel-incipit">{text}</p>'
|
||||
|
||||
def process_reference(element, ref_type):
    """Render a werk, akteur or ort reference as an HTML paragraph.

    *ref_type* selects the lookup table (``'werk'`` -> ``WERKE``,
    ``'akteur'`` -> ``AKTEURE``, anything else -> ``ORTE``). An unknown or
    empty entry is replaced by a "nicht gefunden" notice. The ``kat``
    attribute, or the default from ``REFERENCE_TYPES`` when it is absent, is
    shown as a pill in front of the resolved text.
    """
    ref_id = element.attrib.get('ref', '')

    if ref_type == 'werk':
        register = WERKE
    elif ref_type == 'akteur':
        register = AKTEURE
    else:
        register = ORTE

    content = register.get(ref_id, '')
    if not content:
        content = f"{ref_type.capitalize()} nicht gefunden: {ref_id}"

    default_kat = REFERENCE_TYPES[ref_type]['default']
    kat_id = element.attrib.get('kat', default_kat)
    kat_name = KATEGORIEN.get(kat_id, kat_id)
    return f'<p class="{ref_type}"><span class="pill"><strong>{kat_name}</strong></span> {content}</p>'
|
||||
|
||||
def process_anmerkung_or_vermerk(element):
    """Render an anmerkung or vermerk element as an HTML paragraph.

    The CSS class is the element's tag name without its namespace prefix. An
    element without text yields an empty paragraph rather than the literal
    string "None", and ``&``, ``<`` and ``>`` in the text are escaped.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    css_class = element.tag.split('}')[-1]
    # quote=False: the text lands in element content, not inside an attribute.
    text = escape(element.text or '', quote=False)
    return f'<p class="{css_class}">{text}</p>'
|
||||
|
||||
def process_beitrag(beitrag, current_year, current_issue):
    """Render one beitrag element as a ``<div class="beitrag">`` block.

    Children are rendered in document order by the handler matching their tag
    name; children with any other tag are ignored. *current_year* and
    *current_issue* are passed on to the stueck/beilage handler.

    Returns:
        ``(html, stueck_infos)`` where ``stueck_infos`` collects the info
        tuple returned for every stueck/beilage child.
    """
    parts = [f'<div class="beitrag" id="{beitrag.attrib.get("id", "")}">\n']
    stueck_infos = []

    for child in beitrag:
        tag = child.tag.split('}')[-1]
        if tag in ('stueck', 'beilage'):
            fragment, info = process_stueck_or_beilage(child, current_year, current_issue)
            stueck_infos.append(info)
        elif tag == 'kategorie':
            fragment = process_kategorie(child)
        elif tag in ('titel', 'incipit'):
            fragment = process_titel_or_incipit(child)
        elif tag in ('werk', 'akteur', 'ort'):
            fragment = process_reference(child, tag)
        elif tag in ('anmerkung', 'vermerk'):
            fragment = process_anmerkung_or_vermerk(child)
        else:
            continue
        parts.append(fragment)

    parts.append('</div>\n')
    return ''.join(parts), stueck_infos
|
||||
|
||||
def create_html_content(root):
    """Render every beitrag below *root*, grouped by year and issue.

    A beitrag is rendered once for each of its direct stueck children and then
    once for each of its direct beilage children, and filed under that child's
    year (``datum`` up to the first ``-``) and ``nr``. Years are emitted in
    string order; within a year, numeric issue numbers come first in numeric
    order, followed by non-numeric ones in string order.
    """
    grouped = defaultdict(lambda: defaultdict(list))

    for beitrag in root.findall('.//kgpz:beitrag', NAMESPACES):
        placements = beitrag.findall('./kgpz:stueck', NAMESPACES)
        placements += beitrag.findall('./kgpz:beilage', NAMESPACES)
        for placement in placements:
            year = placement.attrib.get('datum', '').split('-')[0]
            issue = placement.attrib.get('nr', '')
            rendered, _ = process_beitrag(beitrag, year, issue)
            grouped[year][issue].append(rendered)

    def issue_order(issue):
        try:
            number = int(issue)
        except ValueError:
            return (1, issue)
        return (0, number)

    chunks = []
    for year in sorted(grouped):
        chunks.append(f'<h2>Jahr {year}</h2>\n')
        issues = grouped[year]
        for issue in sorted(issues, key=issue_order):
            chunks.append(f'<h3>Ausgabe {issue}</h3>\n')
            chunks.extend(issues[issue])

    return ''.join(chunks)
|
||||
|
||||
def generate_html(content):
    """Wrap *content* in the complete HTML page and return it as a string.

    The page declares German as its language, UTF-8 as its charset, and embeds
    the stylesheet for the classes used by the rendered Beiträge. *content* is
    inserted verbatim into the body.
    """
    # Kept as a plain (non-f) string so the CSS braces need no doubling.
    stylesheet = '''body { font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; }
.beitrag { margin-bottom: 20px; border: 1px solid #ccc; padding: 10px; }
h2 { color: #333; margin-top: 30px; }
h3 { color: #666; margin-top: 20px; }
p { margin: 5px 0; }
.titel-incipit { font-size: 1.1em; font-weight: bold; margin-top: 10px; margin-bottom: 10px; }
.anmerkung, .vermerk { background-color: #f0f0f0; padding: 5px; margin-top: 10px; }
.werk, .akteur, .ort { background-color: #f9f9f9; padding: 5px; }
.pill {
display: inline-block;
padding: 2px 8px;
background-color: #e0e0e0;
border-radius: 12px;
font-size: 0.8em;
margin-right: 5px;
}
.pill strong { font-weight: bold; }
.stueck, .beilage { font-weight: bold; color: #444; }
.beilage { color: #666; }'''

    return f'''
<!DOCTYPE html>
<html lang="de">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>KGPZ Beiträge</title>
<style>
{stylesheet}
</style>
</head>
<body>
{content}
</body>
</html>
'''
|
||||
|
||||
def main():
    """Render all Beiträge files into a single HTML page.

    Creates ``OUTPUT_DIR`` if needed, renders every ``*.xml`` file in
    ``BEITRAEGE_DIR`` in sorted path order, and writes the combined page to
    ``kgpz_beitraege.html`` in ``OUTPUT_DIR`` as UTF-8.
    """
    OUTPUT_DIR.mkdir(exist_ok=True)

    sections = [
        create_html_content(parse_xml_file(xml_file))
        for xml_file in sorted(BEITRAEGE_DIR.glob('*.xml'))
    ]
    html_output = generate_html(''.join(sections))

    target = OUTPUT_DIR / 'kgpz_beitraege.html'
    with target.open('w', encoding='utf-8') as f:
        f.write(html_output)


if __name__ == '__main__':
    main()
|
||||
108
Scripts/generate_stuecke_from_images.py
Normal file
108
Scripts/generate_stuecke_from_images.py
Normal file
@@ -0,0 +1,108 @@
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
from xml.etree.ElementTree import Element, SubElement, tostring
|
||||
from xml.dom import minidom
|
||||
|
||||
def get_files(directory):
    """Collect page numbers per issue from the image file names in *directory*.

    Recognised names have the form ``<year>-<stueck>[b[<digit>]]-<page>.jpg``
    with a four-digit year. A ``b`` marks a Beilage page; ``b`` without a
    digit counts as Beilage 1. Names that do not match the whole pattern are
    ignored.

    Returns:
        ``(files, year)`` where ``files`` maps each stueck number to
        ``{'main': [page, ...], 'beilage': [(beilage_num, page), ...]}`` and
        ``year`` is the year shared by all matching files, or ``None`` when
        nothing matched.

    Raises:
        ValueError: if matching files carry different years.
    """
    pattern = re.compile(r'(\d{4})-(\d+)(b\d?)?-(\d+)\.jpg')
    files = {}
    year = None

    for filename in os.listdir(directory):
        # fullmatch, not match: match() only anchors the start, so names such
        # as "1764-1-9.jpg.bak" would otherwise be counted as pages.
        match = pattern.fullmatch(filename)
        if match is None:
            continue

        file_year, stueck, beilage_info, page = match.groups()
        file_year = int(file_year)
        stueck = int(stueck)
        page = int(page)

        if year is None:
            year = file_year
        elif year != file_year:
            raise ValueError(f"Inconsistent years found: {year} and {file_year}")

        entry = files.setdefault(stueck, {'main': [], 'beilage': []})
        if beilage_info:
            # beilage_info is "b" or "b<digit>"; a bare "b" means Beilage 1.
            beilage_num = int(beilage_info[1:] or '1')
            entry['beilage'].append((beilage_num, page))
        else:
            entry['main'].append(page)

    return files, year
|
||||
|
||||
def calculate_date(year, stueck_number):
    """Compute the date assigned to issue *stueck_number* of *year*.

    Issue 1 falls on the first Friday of the year. Each later pair of issues
    advances one week: odd-numbered issues land on a Friday, even-numbered
    ones three days later.

    Returns:
        A ``datetime`` at midnight of the computed day.
    """
    new_year = datetime(year, 1, 1)
    # weekday() == 4 is Friday; the modulo yields 0 when Jan 1st already is one.
    first_friday = new_year + timedelta(days=(4 - new_year.weekday()) % 7)
    weeks, second_of_week = divmod(stueck_number - 1, 2)
    return first_friday + timedelta(days=7 * weeks + 3 * second_of_week)
|
||||
|
||||
def create_xml(files, year):
    """Build the ``<stuecke>`` element tree for one year.

    *files* is the mapping produced by ``get_files``. For every stueck, in
    ascending order, a ``<stueck>`` element is emitted with ``<nummer>``,
    ``<datum when="YYYY-MM-DD">`` (from ``calculate_date``) and, when main
    pages exist, ``<von>``/``<bis>`` holding the lowest and highest page.
    Beilage pages are sorted and split into runs of consecutive pages that
    share a Beilage number; each run becomes one
    ``<beilage nummer="..."><von/><bis/></beilage>``.

    Returns:
        The root ``Element``.
    """
    root = Element('stuecke')
    for attr_name, attr_value in (
        ('xmlns', 'https://www.koenigsberger-zeitungen.de'),
        ('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'),
        ('xsi:schemaLocation', 'https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd'),
    ):
        root.set(attr_name, attr_value)

    def add_text_child(parent, tag, value):
        # Append <tag>value</tag> to parent.
        SubElement(parent, tag).text = str(value)

    def add_beilage(parent, number, first_page, last_page):
        # Append one <beilage> covering the page run first_page..last_page.
        beilage_elem = SubElement(parent, 'beilage')
        beilage_elem.set('nummer', str(number))
        add_text_child(beilage_elem, 'von', first_page)
        add_text_child(beilage_elem, 'bis', last_page)

    for stueck, pages in sorted(files.items()):
        stueck_elem = SubElement(root, 'stueck')
        add_text_child(stueck_elem, 'nummer', stueck)

        date = calculate_date(year, stueck)
        SubElement(stueck_elem, 'datum').set('when', date.strftime('%Y-%m-%d'))

        main_pages = pages['main']
        if main_pages:
            add_text_child(stueck_elem, 'von', min(main_pages))
            add_text_child(stueck_elem, 'bis', max(main_pages))

        beilage_pages = sorted(pages['beilage'])
        if not beilage_pages:
            continue

        # A run only ever grows by exactly one page, so tracking its first and
        # last page is enough to know its lowest and highest page.
        run_num, run_start = beilage_pages[0]
        run_end = run_start
        for beilage_num, page in beilage_pages[1:]:
            if beilage_num == run_num and page == run_end + 1:
                run_end = page
                continue
            add_beilage(stueck_elem, run_num, run_start, run_end)
            run_num, run_start, run_end = beilage_num, page, page

        # Flush the run that was still open when the pages ran out.
        add_beilage(stueck_elem, run_num, run_start, run_end)

    return root
|
||||
|
||||
def pretty_print(elem):
    """Return *elem* serialized as an indented XML document string.

    The element is serialized to UTF-8 bytes, re-parsed with ``minidom`` and
    pretty-printed from there.
    """
    serialized = tostring(elem, encoding='utf-8')
    return minidom.parseString(serialized).toprettyxml(indent=" ")
|
||||
|
||||
# Command-line entry point: python <script> <directory_path>
# Scans the directory for page images and writes '<year>-stuecke.xml' into the
# current working directory.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python script.py <directory_path>")
        sys.exit(1)

    directory = sys.argv[1]
    files, year = get_files(directory)

    # get_files() leaves year as None when no file name matched; without this
    # guard the script would write an empty 'None-stuecke.xml'.
    if year is None:
        print(f"No matching image files found in '{directory}'.", file=sys.stderr)
        sys.exit(1)

    xml_root = create_xml(files, year)

    output_filename = f'{year}-stuecke.xml'
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(pretty_print(xml_root))

    print(f"XML file '{output_filename}' has been generated.")
|
||||
Reference in New Issue
Block a user