Files
kgpz_web/scripts/order/add_order_hints.py
2025-09-16 17:34:48 +02:00

243 lines
8.8 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Script to add order hints to Beiträge XML files.
This script processes XML files in cache/git/XML/beitraege/ and adds order="N"
attributes to <stueck> elements when multiple pieces appear on the same page.
"""
import argparse
import os
import shutil
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
from datetime import datetime
try:
from lxml import etree
except ImportError:
print("Error: lxml is required. Install with: pip install lxml")
print("Or activate the virtual environment: source venv/bin/activate")
sys.exit(1)
class OrderHintProcessor:
"""Processes XML files to add order hints to pieces on the same page."""
def __init__(self, xml_dir: Path, backup: bool = True, dry_run: bool = False):
self.xml_dir = xml_dir
self.backup = backup
self.dry_run = dry_run
self.stats = {
'files_processed': 0,
'pieces_with_order_added': 0,
'pages_with_multiple_pieces': 0,
'existing_order_hints': 0
}
def process_all_files(self) -> None:
"""Process all XML files in the beitraege directory."""
xml_files = list(self.xml_dir.glob("*-beitraege.xml"))
if not xml_files:
print(f"No XML files found in {self.xml_dir}")
return
print(f"Found {len(xml_files)} XML files to process")
for xml_file in sorted(xml_files):
try:
self.process_file(xml_file)
except Exception as e:
print(f"Error processing {xml_file}: {e}")
continue
self.print_stats()
def process_file(self, xml_file: Path, year: Optional[int] = None) -> None:
"""Process a single XML file to add order hints."""
if year and not xml_file.name.startswith(f"{year}-"):
return
print(f"Processing {xml_file.name}...")
# Parse XML with lxml to preserve comments and formatting
try:
parser = etree.XMLParser(remove_comments=False, remove_blank_text=False)
tree = etree.parse(str(xml_file), parser)
root = tree.getroot()
except etree.XMLSyntaxError as e:
print(f" XML syntax error: {e}")
return
# Group pieces by (year, issue_nr, page_nr, beilage)
page_groups = self._group_pieces_by_page(root)
# Track changes
changes_made = False
# Debug: Show all groups found
print(f" Found {len(page_groups)} page groups")
for page_key, pieces in page_groups.items():
if len(pieces) > 1:
print(f" Multi-piece page: {page_key} has {len(pieces)} pieces")
# Process each group that has multiple pieces
for page_key, pieces in page_groups.items():
if len(pieces) > 1:
self.stats['pages_with_multiple_pieces'] += 1
# Check if any pieces already have order hints
has_existing_order = any(
stueck.get('order') is not None
for piece in pieces
for stueck in piece.findall('.//stueck')
)
if has_existing_order:
self.stats['existing_order_hints'] += len(pieces)
print(f" Page {page_key}: Already has order hints, skipping")
continue
# Add order hints
for order_num, piece in enumerate(pieces, 1):
for stueck in piece.findall('.//stueck'):
# Check if this stueck matches our page grouping
if self._stueck_matches_page(stueck, page_key):
if not self.dry_run:
stueck.set('order', str(order_num))
changes_made = True
self.stats['pieces_with_order_added'] += 1
print(f" Page {page_key}: Added order hints to {len(pieces)} pieces")
# Save file if changes were made
if changes_made and not self.dry_run:
self._save_file(xml_file, tree)
self.stats['files_processed'] += 1
def _group_pieces_by_page(self, root) -> Dict[Tuple, List]:
"""Group beitrag elements by the pages they appear on."""
page_groups = defaultdict(list)
# Handle namespace - the XML has a default namespace
namespace = {'ns': 'https://www.koenigsberger-zeitungen.de'}
# Try with namespace first, then without
beitrag_elements = root.findall('.//ns:beitrag', namespace)
if not beitrag_elements:
# Fallback: try without namespace
beitrag_elements = root.findall('.//beitrag')
print(f" Found {len(beitrag_elements)} beitrag elements")
for beitrag in beitrag_elements:
# Get all stueck elements for this piece
stueck_elements = beitrag.findall('.//ns:stueck', namespace)
if not stueck_elements:
stueck_elements = beitrag.findall('.//stueck')
for stueck in stueck_elements:
# Extract page information
year = stueck.get('when')
issue_nr = stueck.get('nr')
page_von = stueck.get('von')
beilage = stueck.get('beilage', '0') # Default to 0 for main pages
# Skip if essential attributes are missing
if not all([year, issue_nr, page_von]):
continue
# Create page key
page_key = (year, issue_nr, page_von, beilage)
# Add this piece to the group (only once per page)
if beitrag not in page_groups[page_key]:
page_groups[page_key].append(beitrag)
return page_groups
def _stueck_matches_page(self, stueck, page_key: Tuple) -> bool:
"""Check if a stueck element matches the given page key."""
year, issue_nr, page_von, beilage = page_key
return (
stueck.get('when') == year and
stueck.get('nr') == issue_nr and
stueck.get('von') == page_von and
stueck.get('beilage', '0') == beilage
)
def _save_file(self, xml_file: Path, tree) -> None:
"""Save the modified XML file with backup if requested."""
if self.backup:
backup_file = xml_file.with_suffix('.xml.backup')
shutil.copy2(xml_file, backup_file)
print(f" Created backup: {backup_file.name}")
# Write the modified XML
tree.write(
str(xml_file),
encoding='utf-8',
xml_declaration=True,
pretty_print=True
)
def print_stats(self) -> None:
"""Print processing statistics."""
print("\n" + "="*50)
print("PROCESSING STATISTICS")
print("="*50)
print(f"Files processed: {self.stats['files_processed']}")
print(f"Pages with multiple pieces: {self.stats['pages_with_multiple_pieces']}")
print(f"Pieces with order hints added: {self.stats['pieces_with_order_added']}")
print(f"Existing order hints found: {self.stats['existing_order_hints']}")
if self.dry_run:
print("\n*** DRY RUN MODE - No files were modified ***")
def main():
parser = argparse.ArgumentParser(description='Add order hints to Beiträge XML files')
parser.add_argument('--year', type=int, help='Process only files for specific year')
parser.add_argument('--all', action='store_true', help='Process all years')
parser.add_argument('--dry-run', action='store_true', help='Show what would be changed without modifying files')
parser.add_argument('--no-backup', action='store_true', help='Skip creating backup files')
parser.add_argument('--xml-dir', type=Path, default=Path('../../cache/git/XML/beitraege'),
help='Directory containing XML files (default: ../../cache/git/XML/beitraege)')
args = parser.parse_args()
# Validation
if not args.year and not args.all:
parser.error("Must specify either --year YYYY or --all")
if not args.xml_dir.exists():
print(f"Error: XML directory not found: {args.xml_dir}")
sys.exit(1)
# Initialize processor
processor = OrderHintProcessor(
xml_dir=args.xml_dir,
backup=not args.no_backup,
dry_run=args.dry_run
)
# Process files
if args.all:
processor.process_all_files()
else:
# Find the specific year file
xml_file = args.xml_dir / f"{args.year}-beitraege.xml"
if not xml_file.exists():
print(f"Error: File not found: {xml_file}")
sys.exit(1)
processor.process_file(xml_file, args.year)
processor.print_stats()
if __name__ == '__main__':
main()