mirror of
https://github.com/Theodor-Springmann-Stiftung/KGPZ.git
synced 2025-10-29 09:05:30 +00:00
Generation Script + BUGFIX, XSD für Beilagen
This commit is contained in:
108
Scripts/generate_stuecke_from_images.py
Normal file
108
Scripts/generate_stuecke_from_images.py
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from xml.etree.ElementTree import Element, SubElement, tostring
|
||||||
|
from xml.dom import minidom
|
||||||
|
|
||||||
|
def get_files(directory):
    """Collect the page numbers of the scanned images in *directory*.

    Recognised file names have the form ``YYYY-S-P.jpg`` for a main page and
    ``YYYY-Sb-P.jpg`` or ``YYYY-SbN-P.jpg`` for a Beilage page, where ``S`` is
    the Stueck number, ``N`` an optional single Beilage digit (1 when omitted)
    and ``P`` the page number.  Every other directory entry is ignored.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A tuple ``(files, year)``.  ``files`` maps each Stueck number to
        ``{'main': [page, ...], 'beilage': [(beilage_num, page), ...]}`` with
        both lists sorted ascending.  ``year`` is the year shared by all
        recognised files, or ``None`` when no file was recognised.

    Raises:
        ValueError: If recognised files carry different years.
    """
    pattern = re.compile(r'(\d{4})-(\d+)(b\d?)?-(\d+)\.jpg')
    files = {}
    year = None

    # Sort the listing so the result (and which pair of years an error
    # reports) does not depend on the file system's enumeration order.
    for filename in sorted(os.listdir(directory)):
        # fullmatch: the whole name must fit, so leftovers such as
        # "1764-1-1.jpg.bak" are not mistaken for page images.
        match = pattern.fullmatch(filename)
        if not match:
            continue

        file_year, stueck, beilage_info, page = match.groups()
        file_year = int(file_year)
        stueck = int(stueck)
        page = int(page)

        if year is None:
            year = file_year
        elif year != file_year:
            raise ValueError(f"Inconsistent years found: {year} and {file_year}")

        entry = files.setdefault(stueck, {'main': [], 'beilage': []})
        if beilage_info:
            # beilage_info is "b" or "b<digit>"; a bare "b" means Beilage 1.
            beilage_num = int(beilage_info[1:] or '1')
            entry['beilage'].append((beilage_num, page))
        else:
            entry['main'].append(page)

    # Numeric order (the listing above is only sorted lexicographically).
    for entry in files.values():
        entry['main'].sort()
        entry['beilage'].sort()

    return files, year
|
||||||
|
|
||||||
|
def calculate_date(year, stueck_number):
    """Return the computed date of issue *stueck_number* within *year*.

    Issue 1 falls on the first Friday on or after January 1st.  Each later
    issue follows its predecessor by alternately three and four days, so the
    dates alternate between Fridays and Mondays.

    Args:
        year: Calendar year as an integer.
        stueck_number: 1-based issue number.

    Returns:
        A ``datetime`` at midnight of the computed day.
    """
    new_year = datetime(year, 1, 1)
    # weekday() numbers Monday as 0, so Friday is 4.
    first_friday = new_year + timedelta(days=(4 - new_year.weekday()) % 7)

    preceding = stueck_number - 1
    # Three days per preceding issue plus one extra day per completed pair.
    offset = 3 * preceding + preceding // 2
    return first_friday + timedelta(days=offset)
|
||||||
|
|
||||||
|
def create_xml(files, year):
    """Build the ``<stuecke>`` element tree for the collected image pages.

    For every Stueck (in ascending number order) a ``<stueck>`` element is
    created with ``<nummer>``, ``<datum when="YYYY-MM-DD">`` (computed by
    ``calculate_date``) and, if main pages exist, ``<von>``/``<bis>`` holding
    the lowest and highest main page.  Beilage pages are grouped into runs of
    consecutive page numbers belonging to the same Beilage number; each run
    becomes one ``<beilage nummer="N">`` element with its own
    ``<von>``/``<bis>``.

    Args:
        files: Mapping ``stueck -> {'main': [page, ...],
            'beilage': [(beilage_num, page), ...]}``.
        year: Year passed on to ``calculate_date`` for every Stueck.

    Returns:
        The root ``Element`` named ``stuecke``.
    """
    root = Element('stuecke')
    # The namespace declarations are written as plain attributes so they
    # appear verbatim on the serialised root element.
    root.set('xmlns', 'https://www.koenigsberger-zeitungen.de')
    root.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
    root.set('xsi:schemaLocation', 'https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd')

    for stueck, pages in sorted(files.items()):
        stueck_elem = SubElement(root, 'stueck')

        nummer = SubElement(stueck_elem, 'nummer')
        nummer.text = str(stueck)

        date = calculate_date(year, stueck)
        datum = SubElement(stueck_elem, 'datum')
        datum.set('when', date.strftime('%Y-%m-%d'))

        if pages['main']:
            von = SubElement(stueck_elem, 'von')
            von.text = str(min(pages['main']))

            bis = SubElement(stueck_elem, 'bis')
            bis.text = str(max(pages['main']))

        if pages['beilage']:
            run_num = None
            run_pages = []
            for beilage_num, page in sorted(pages['beilage']):
                continues_run = (
                    run_pages
                    and beilage_num == run_num
                    and page == run_pages[-1] + 1
                )
                if continues_run:
                    run_pages.append(page)
                    continue
                # A new Beilage number or a gap in the page numbers closes
                # the current run (if there is one) and opens a new one.
                if run_pages:
                    _append_beilage(stueck_elem, run_num, run_pages)
                run_num = beilage_num
                run_pages = [page]

            # Emit the run that was still open when the pages ran out.
            _append_beilage(stueck_elem, run_num, run_pages)

    return root


def _append_beilage(stueck_elem, beilage_num, run_pages):
    """Append one ``<beilage>`` element covering *run_pages* to *stueck_elem*."""
    beilage_elem = SubElement(stueck_elem, 'beilage')
    beilage_elem.set('nummer', str(beilage_num))
    beilage_von = SubElement(beilage_elem, 'von')
    beilage_von.text = str(min(run_pages))
    beilage_bis = SubElement(beilage_elem, 'bis')
    beilage_bis.text = str(max(run_pages))
|
||||||
|
|
||||||
|
def pretty_print(elem):
    """Serialise *elem* to an indented XML document string.

    The element is first dumped to UTF-8 bytes with ElementTree and then
    re-parsed with minidom, whose ``toprettyxml`` produces the line breaks,
    the indentation and the leading XML declaration.

    Args:
        elem: The ElementTree element to serialise.

    Returns:
        The pretty-printed document as ``str``.
    """
    document = minidom.parseString(tostring(elem, 'utf-8'))
    return document.toprettyxml(indent=" ")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: scan the directory given as the only
    # argument and write "<year>-stuecke.xml" into the current working
    # directory.
    if len(sys.argv) != 2:
        print("Usage: python script.py <directory_path>")
        sys.exit(1)

    directory = sys.argv[1]
    files, year = get_files(directory)

    if not files:
        # Without a single recognised image there is no year to name the
        # output after; previously this produced an empty "None-stuecke.xml".
        print(f"No matching image files found in '{directory}'.", file=sys.stderr)
        sys.exit(1)

    xml_root = create_xml(files, year)

    output_filename = f'{year}-stuecke.xml'
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(pretty_print(xml_root))

    print(f"XML file '{output_filename}' has been generated.")
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
<?xml version="1.0"?>
|
<?xml version="1.0"?>
|
||||||
<?xml version="1.0" ?>
|
<stuecke xmlns="https://www.koenigsberger-zeitungen.de"
|
||||||
<stuecke xmlns="https://www.koenigsberger-zeitungen.de" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
||||||
<stueck>
|
<stueck>
|
||||||
<nummer>2</nummer>
|
<nummer>2</nummer>
|
||||||
<datum when="1764-01-09" />
|
<datum when="1764-01-09" />
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
<?xml version="1.0"?>
|
<?xml version="1.0"?>
|
||||||
<?xml version="1.0" ?>
|
<stuecke xmlns="https://www.koenigsberger-zeitungen.de"
|
||||||
<stuecke xmlns="https://www.koenigsberger-zeitungen.de" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
||||||
<stueck>
|
<stueck>
|
||||||
<nummer>1</nummer>
|
<nummer>1</nummer>
|
||||||
<datum when="1765-01-04" />
|
<datum when="1765-01-04" />
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
<?xml version="1.0"?>
|
<?xml version="1.0"?>
|
||||||
<?xml version="1.0" ?>
|
<stuecke xmlns="https://www.koenigsberger-zeitungen.de"
|
||||||
<stuecke xmlns="https://www.koenigsberger-zeitungen.de" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
||||||
<stueck>
|
<stueck>
|
||||||
<nummer>1</nummer>
|
<nummer>1</nummer>
|
||||||
<datum when="1766-01-03" />
|
<datum when="1766-01-03" />
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
<?xml version="1.0"?>
|
<?xml version="1.0"?>
|
||||||
<?xml version="1.0" ?>
|
<stuecke xmlns="https://www.koenigsberger-zeitungen.de"
|
||||||
<stuecke xmlns="https://www.koenigsberger-zeitungen.de" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
||||||
<stueck>
|
<stueck>
|
||||||
<nummer>1</nummer>
|
<nummer>1</nummer>
|
||||||
<datum when="1767-01-02" />
|
<datum when="1767-01-02" />
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
<?xml version="1.0"?>
|
<?xml version="1.0"?>
|
||||||
<?xml version="1.0" ?>
|
<stuecke xmlns="https://www.koenigsberger-zeitungen.de"
|
||||||
<stuecke xmlns="https://www.koenigsberger-zeitungen.de" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
||||||
<stueck>
|
<stueck>
|
||||||
<nummer>1</nummer>
|
<nummer>1</nummer>
|
||||||
<datum when="1768-01-01" />
|
<datum when="1768-01-01" />
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
<?xml version="1.0"?>
|
<?xml version="1.0"?>
|
||||||
<?xml version="1.0" ?>
|
<stuecke xmlns="https://www.koenigsberger-zeitungen.de"
|
||||||
<stuecke xmlns="https://www.koenigsberger-zeitungen.de" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
||||||
<stueck>
|
<stueck>
|
||||||
<nummer>1</nummer>
|
<nummer>1</nummer>
|
||||||
<datum when="1771-01-04" />
|
<datum when="1771-01-04" />
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
<?xml version="1.0"?>
|
<?xml version="1.0"?>
|
||||||
<?xml version="1.0" ?>
|
<stuecke xmlns="https://www.koenigsberger-zeitungen.de"
|
||||||
<stuecke xmlns="https://www.koenigsberger-zeitungen.de" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="https://www.koenigsberger-zeitungen.de ../../XSD/KGPZ.xsd">
|
||||||
<stueck>
|
<stueck>
|
||||||
<nummer>1</nummer>
|
<nummer>1</nummer>
|
||||||
<datum when="1772-01-03" />
|
<datum when="1772-01-03" />
|
||||||
|
|||||||
@@ -22,31 +22,56 @@
|
|||||||
|
|
||||||
<xsd:element name="datum" type="kgpz:date" minOccurs="0" maxOccurs="1">
|
<xsd:element name="datum" type="kgpz:date" minOccurs="0" maxOccurs="1">
|
||||||
<xsd:annotation>
|
<xsd:annotation>
|
||||||
<xsd:documentation>Das Datum des Stücks, wie auf dem Titel angegeben, falls bekannt.</xsd:documentation>
|
<xsd:documentation>Das Datum des Stücks, wie auf dem Titel
|
||||||
|
angegeben, falls bekannt.</xsd:documentation>
|
||||||
</xsd:annotation>
|
</xsd:annotation>
|
||||||
</xsd:element>
|
</xsd:element>
|
||||||
|
|
||||||
<xsd:element name="von" type="xsd:positiveInteger" minOccurs="0"
|
<xsd:element name="von" type="xsd:positiveInteger" minOccurs="0"
|
||||||
maxOccurs="1">
|
maxOccurs="1">
|
||||||
<xsd:annotation>
|
<xsd:annotation>
|
||||||
<xsd:documentation>Die erste Seitenzahl des Stücks, falls bekannt.</xsd:documentation>
|
<xsd:documentation>Die erste Seitenzahl des Stücks, falls
|
||||||
|
bekannt.</xsd:documentation>
|
||||||
</xsd:annotation>
|
</xsd:annotation>
|
||||||
</xsd:element>
|
</xsd:element>
|
||||||
|
|
||||||
<xsd:element name="bis" type="xsd:positiveInteger" minOccurs="0"
|
<xsd:element name="bis" type="xsd:positiveInteger" minOccurs="0"
|
||||||
maxOccurs="1">
|
maxOccurs="1">
|
||||||
<xsd:annotation>
|
<xsd:annotation>
|
||||||
<xsd:documentation>Die letzte Seitenzahl des Stücks, falls bekannt.</xsd:documentation>
|
<xsd:documentation>Die letzte Seitenzahl des Stücks, falls
|
||||||
|
bekannt.</xsd:documentation>
|
||||||
</xsd:annotation>
|
</xsd:annotation>
|
||||||
</xsd:element>
|
</xsd:element>
|
||||||
|
|
||||||
<!-- Kann von Stücke abgeleitet werden -->
|
<!-- Kann von Stücke abgeleitet werden -->
|
||||||
<xsd:element name="beilagen" type="xsd:nonNegativeInteger" default="0"
|
<xsd:element name="beilage" minOccurs="0" maxOccurs="unbounded">
|
||||||
minOccurs="0" maxOccurs="1">
|
|
||||||
<xsd:annotation>
|
<xsd:annotation>
|
||||||
<xsd:documentation> Optional: Die Anzahl der Beilagen des
|
<xsd:documentation> Optional: Die Anzahl der Beilagen des
|
||||||
Stücks. </xsd:documentation>
|
Stücks. </xsd:documentation>
|
||||||
</xsd:annotation>
|
</xsd:annotation>
|
||||||
|
<xsd:complexType>
|
||||||
|
<xsd:sequence>
|
||||||
|
<xsd:element name="von" type="xsd:positiveInteger"
|
||||||
|
minOccurs="0"
|
||||||
|
maxOccurs="1">
|
||||||
|
<xsd:annotation>
|
||||||
|
<xsd:documentation>Die erste Seitenzahl der Beilage,
|
||||||
|
falls bekannt.</xsd:documentation>
|
||||||
|
</xsd:annotation>
|
||||||
|
</xsd:element>
|
||||||
|
|
||||||
|
<xsd:element name="bis" type="xsd:positiveInteger"
|
||||||
|
minOccurs="0"
|
||||||
|
maxOccurs="1">
|
||||||
|
<xsd:annotation>
|
||||||
|
<xsd:documentation>Die letzte Seitenzahl der
|
||||||
|
Beilage, falls bekannt.</xsd:documentation>
|
||||||
|
</xsd:annotation>
|
||||||
|
</xsd:element>
|
||||||
|
</xsd:sequence>
|
||||||
|
<xsd:attribute name="nummer" type="xsd:positiveInteger"
|
||||||
|
use="required"></xsd:attribute>
|
||||||
|
</xsd:complexType>
|
||||||
</xsd:element>
|
</xsd:element>
|
||||||
|
|
||||||
<!-- Vielleicht nötig -->
|
<!-- Vielleicht nötig -->
|
||||||
|
|||||||
Reference in New Issue
Block a user