mirror of
https://github.com/Theodor-Springmann-Stiftung/KGPZ.git
synced 2025-10-29 00:55:31 +00:00
better validation
This commit is contained in:
@@ -5,21 +5,20 @@ from collections import defaultdict
|
|||||||
REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
||||||
XML_DIR = os.path.join(REPO_ROOT, 'XML')
|
XML_DIR = os.path.join(REPO_ROOT, 'XML')
|
||||||
|
|
||||||
def check_order_attribute_conditionally(tree):
|
def validate_stueck_constraints(tree):
|
||||||
"""Checks for conditional presence of the 'order' attribute."""
|
"""Performs all custom validation checks for <stueck> elements."""
|
||||||
errors = []
|
errors = []
|
||||||
root = tree.getroot()
|
root = tree.getroot()
|
||||||
|
|
||||||
ns = {'kgpz': 'https://www.koenigsberger-zeitungen.de'}
|
ns = {'kgpz': 'https://www.koenigsberger-zeitungen.de'}
|
||||||
|
|
||||||
|
# --- Rule 1: Check for conditional 'order' attribute on single-page entries ---
|
||||||
stuecke_without_bis = root.xpath('//kgpz:beitrag/kgpz:stueck[not(@bis)]', namespaces=ns)
|
stuecke_without_bis = root.xpath('//kgpz:beitrag/kgpz:stueck[not(@bis)]', namespaces=ns)
|
||||||
|
groups_single_page = defaultdict(list)
|
||||||
groups = defaultdict(list)
|
|
||||||
for stueck in stuecke_without_bis:
|
for stueck in stuecke_without_bis:
|
||||||
key = (stueck.get('when'), stueck.get('nr'), stueck.get('von'))
|
key = (stueck.get('when'), stueck.get('nr'), stueck.get('von'))
|
||||||
groups[key].append(stueck)
|
groups_single_page[key].append(stueck)
|
||||||
|
|
||||||
for key, stuecks in groups.items():
|
for key, stuecks in groups_single_page.items():
|
||||||
if len(stuecks) > 1:
|
if len(stuecks) > 1:
|
||||||
for stueck in stuecks:
|
for stueck in stuecks:
|
||||||
if stueck.get('order') is None:
|
if stueck.get('order') is None:
|
||||||
@@ -28,6 +27,23 @@ def check_order_attribute_conditionally(tree):
|
|||||||
f"(when='{key[0]}', nr='{key[1]}', von='{key[2]}') but is missing the 'order' attribute."
|
f"(when='{key[0]}', nr='{key[1]}', von='{key[2]}') but is missing the 'order' attribute."
|
||||||
)
|
)
|
||||||
errors.append(error_msg)
|
errors.append(error_msg)
|
||||||
|
|
||||||
|
# --- Rule 2: Check for uniqueness of multi-page jumps ---
|
||||||
|
stuecke_with_bis = root.xpath('//kgpz:beitrag/kgpz:stueck[@bis]', namespaces=ns)
|
||||||
|
groups_multi_page = defaultdict(list)
|
||||||
|
for stueck in stuecke_with_bis:
|
||||||
|
key = (stueck.get('when'), stueck.get('nr'), stueck.get('von'), stueck.get('bis'))
|
||||||
|
groups_multi_page[key].append(stueck)
|
||||||
|
|
||||||
|
for key, stuecks in groups_multi_page.items():
|
||||||
|
if len(stuecks) > 1:
|
||||||
|
error_msg = (
|
||||||
|
f"Custom Validation Error: Found {len(stuecks)} identical multi-page <stueck> tags. "
|
||||||
|
f"The combination of attributes (when='{key[0]}', nr='{key[1]}', von='{key[2]}', bis='{key[3]}') must be unique. "
|
||||||
|
f"Duplicate found on line {stuecks[1].sourceline}."
|
||||||
|
)
|
||||||
|
errors.append(error_msg)
|
||||||
|
|
||||||
return errors
|
return errors
|
||||||
|
|
||||||
def validate_xml(xml_file):
|
def validate_xml(xml_file):
|
||||||
@@ -49,7 +65,7 @@ def validate_xml(xml_file):
|
|||||||
print(f"XSD Validation successful: {xml_file}")
|
print(f"XSD Validation successful: {xml_file}")
|
||||||
|
|
||||||
# Perform custom validation after XSD check
|
# Perform custom validation after XSD check
|
||||||
custom_errors = check_order_attribute_conditionally(tree)
|
custom_errors = validate_stueck_constraints(tree)
|
||||||
if custom_errors:
|
if custom_errors:
|
||||||
errors.extend(custom_errors)
|
errors.extend(custom_errors)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -104,15 +104,5 @@
|
|||||||
<xsd:selector xpath="kgpz:beitrag" />
|
<xsd:selector xpath="kgpz:beitrag" />
|
||||||
<xsd:field xpath="@id" />
|
<xsd:field xpath="@id" />
|
||||||
</xsd:unique>
|
</xsd:unique>
|
||||||
<xsd:unique name="uniqueStueckJump">
|
|
||||||
<xsd:annotation>
|
|
||||||
<xsd:documentation>Ein Beitrag kann nur einmal von einer Seite auf eine andere springen (Kombination aus when, nr, von und bis muss eindeutig sein).</xsd:documentation>
|
|
||||||
</xsd:annotation>
|
|
||||||
<xsd:selector xpath="kgpz:beitrag/kgpz:stueck"/>
|
|
||||||
<xsd:field xpath="@when"/>
|
|
||||||
<xsd:field xpath="@nr"/>
|
|
||||||
<xsd:field xpath="@von"/>
|
|
||||||
<xsd:field xpath="@bis"/>
|
|
||||||
</xsd:unique>
|
|
||||||
</xsd:element>
|
</xsd:element>
|
||||||
</xsd:schema>
|
</xsd:schema>
|
||||||
Reference in New Issue
Block a user