kgpz_web/scripts/order/add_order_hints.py

#!/usr/bin/env python3
"""
Script to add order hints to Beiträge XML files.

This script processes XML files in cache/git/XML/beitraege/ and adds order="N"
attributes to <stueck> elements when multiple pieces appear on the same page.
"""

import argparse
import os
import shutil
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
from datetime import datetime

try:
    from lxml import etree
except ImportError:
    print("Error: lxml is required. Install with: pip install lxml")
    print("Or activate the virtual environment: source venv/bin/activate")
    sys.exit(1)


class OrderHintProcessor:
    """Add ``order`` attributes to <stueck> elements that share a page.

    Pieces (<beitrag> elements) are grouped by the page referenced by each of
    their <stueck> children. When more than one piece lands on the same page,
    every <stueck> pointing at that page receives ``order="N"``, where N is
    the 1-based position of its piece in document order.

    Attributes:
        xml_dir: Directory searched for ``*-beitraege.xml`` files.
        backup: Whether a ``.xml.backup`` copy is made before a file is
            rewritten.
        dry_run: When true, nothing is modified or written; statistics are
            still collected.
        stats: Running counters reported by :meth:`print_stats`.
    """

    # Default namespace used by the XML files; documents without it are
    # handled through an un-namespaced fallback lookup.
    _NAMESPACE = {'ns': 'https://www.koenigsberger-zeitungen.de'}

    def __init__(self, xml_dir: Path, backup: bool = True, dry_run: bool = False):
        self.xml_dir = xml_dir
        self.backup = backup
        self.dry_run = dry_run
        self.stats = {
            'files_processed': 0,
            'pieces_with_order_added': 0,
            'pages_with_multiple_pieces': 0,
            'existing_order_hints': 0
        }

    def process_all_files(self) -> None:
        """Process every ``*-beitraege.xml`` file in ``xml_dir`` and print stats.

        Files are handled in sorted name order. A failure in one file is
        reported and does not stop the remaining files from being processed.
        """
        xml_files = list(self.xml_dir.glob("*-beitraege.xml"))

        if not xml_files:
            print(f"No XML files found in {self.xml_dir}")
            return

        print(f"Found {len(xml_files)} XML files to process")

        for xml_file in sorted(xml_files):
            try:
                self.process_file(xml_file)
            except Exception as e:
                # Batch boundary: report the failure and keep going.
                print(f"Error processing {xml_file}: {e}")
                continue

        self.print_stats()

    def process_file(self, xml_file: Path, year: Optional[int] = None) -> None:
        """Add order hints to a single XML file.

        Args:
            xml_file: Path of the XML file to process.
            year: If given, the file is skipped unless its name starts with
                ``"<year>-"``.

        The file is rewritten only when hints were added and ``dry_run`` is
        off. A file with an XML syntax error is reported and skipped without
        being counted as processed.
        """
        if year and not xml_file.name.startswith(f"{year}-"):
            return

        print(f"Processing {xml_file.name}...")

        # Parse XML with lxml to preserve comments and formatting
        try:
            parser = etree.XMLParser(remove_comments=False, remove_blank_text=False)
            tree = etree.parse(str(xml_file), parser)
            root = tree.getroot()
        except etree.XMLSyntaxError as e:
            print(f"  XML syntax error: {e}")
            return

        changes_made = self._apply_order_hints(root)

        # Save file if changes were made
        if changes_made and not self.dry_run:
            self._save_file(xml_file, tree)

        self.stats['files_processed'] += 1

    def _apply_order_hints(self, root) -> bool:
        """Set ``order`` on the <stueck> elements of every multi-piece page.

        Args:
            root: Root element of a parsed XML document.

        Returns:
            True when at least one <stueck> received (or, in dry-run mode,
            would receive) an order hint.
        """
        # Group pieces by (year, issue_nr, page_nr, beilage)
        page_groups = self._group_pieces_by_page(root)
        multi_piece_pages = [
            (page_key, pieces)
            for page_key, pieces in page_groups.items()
            if len(pieces) > 1
        ]

        # Debug: Show all groups found
        print(f"  Found {len(page_groups)} page groups")
        for page_key, pieces in multi_piece_pages:
            print(f"  Multi-piece page: {page_key} has {len(pieces)} pieces")

        changes_made = False
        for page_key, pieces in multi_piece_pages:
            self.stats['pages_with_multiple_pieces'] += 1

            # For each piece, only the <stueck> elements that point at this
            # page are relevant. A piece spanning several pages carries
            # independent hints per page, so hints on its other pages must
            # not cause this page to be skipped.
            stuecke_per_piece = [
                [
                    stueck
                    for stueck in self._find_stueck(piece)
                    if self._stueck_matches_page(stueck, page_key)
                ]
                for piece in pieces
            ]

            has_existing_order = any(
                stueck.get('order') is not None
                for stuecke in stuecke_per_piece
                for stueck in stuecke
            )
            if has_existing_order:
                self.stats['existing_order_hints'] += len(pieces)
                print(f"  Page {page_key}: Already has order hints, skipping")
                continue

            # Add order hints
            for order_num, stuecke in enumerate(stuecke_per_piece, 1):
                for stueck in stuecke:
                    if not self.dry_run:
                        stueck.set('order', str(order_num))
                    changes_made = True
                    self.stats['pieces_with_order_added'] += 1

            print(f"  Page {page_key}: Added order hints to {len(pieces)} pieces")

        return changes_made

    def _find_stueck(self, beitrag) -> List:
        """Return the <stueck> descendants of ``beitrag``.

        The namespaced lookup is tried first; if it finds nothing, the
        un-namespaced tag name is used instead.
        """
        stueck_elements = beitrag.findall('.//ns:stueck', self._NAMESPACE)
        if not stueck_elements:
            stueck_elements = beitrag.findall('.//stueck')
        return stueck_elements

    def _group_pieces_by_page(self, root) -> Dict[Tuple, List]:
        """Group beitrag elements by the pages they appear on.

        Returns:
            Mapping from ``(when, nr, von, beilage)`` attribute tuples to the
            list of <beitrag> elements having a <stueck> with those values,
            in document order. ``beilage`` defaults to ``'0'``. <stueck>
            elements lacking ``when``, ``nr`` or ``von`` are ignored.
        """
        page_groups = defaultdict(list)

        # Try with namespace first, then without
        beitrag_elements = root.findall('.//ns:beitrag', self._NAMESPACE)
        if not beitrag_elements:
            # Fallback: try without namespace
            beitrag_elements = root.findall('.//beitrag')

        print(f"  Found {len(beitrag_elements)} beitrag elements")

        for beitrag in beitrag_elements:
            for stueck in self._find_stueck(beitrag):
                # Extract page information
                year = stueck.get('when')
                issue_nr = stueck.get('nr')
                page_von = stueck.get('von')
                beilage = stueck.get('beilage', '0')  # Default to 0 for main pages

                # Skip if essential attributes are missing
                if not all([year, issue_nr, page_von]):
                    continue

                # Create page key
                page_key = (year, issue_nr, page_von, beilage)

                # Add this piece to the group (only once per page)
                if beitrag not in page_groups[page_key]:
                    page_groups[page_key].append(beitrag)

        return page_groups

    def _stueck_matches_page(self, stueck, page_key: Tuple) -> bool:
        """Check if a stueck element matches the given page key.

        Args:
            stueck: Element whose ``when``, ``nr``, ``von`` and ``beilage``
                attributes are compared.
            page_key: ``(when, nr, von, beilage)`` tuple as built by
                :meth:`_group_pieces_by_page`.
        """
        year, issue_nr, page_von, beilage = page_key

        return (
            stueck.get('when') == year and
            stueck.get('nr') == issue_nr and
            stueck.get('von') == page_von and
            stueck.get('beilage', '0') == beilage
        )

    def _save_file(self, xml_file: Path, tree) -> None:
        """Write ``tree`` back to ``xml_file``, backing it up first if requested.

        The backup is stored next to the file with the suffix ``.xml.backup``
        and is created from the on-disk content before it is overwritten.
        """
        if self.backup:
            backup_file = xml_file.with_suffix('.xml.backup')
            shutil.copy2(xml_file, backup_file)
            print(f"  Created backup: {backup_file.name}")

        # Write the modified XML
        tree.write(
            str(xml_file),
            encoding='utf-8',
            xml_declaration=True,
            pretty_print=True
        )

    def print_stats(self) -> None:
        """Print processing statistics."""
        print("\n" + "="*50)
        print("PROCESSING STATISTICS")
        print("="*50)
        print(f"Files processed: {self.stats['files_processed']}")
        print(f"Pages with multiple pieces: {self.stats['pages_with_multiple_pieces']}")
        print(f"Pieces with order hints added: {self.stats['pieces_with_order_added']}")
        print(f"Existing order hints found: {self.stats['existing_order_hints']}")

        if self.dry_run:
            print("\n*** DRY RUN MODE - No files were modified ***")


def main():
    """Command-line entry point.

    Parses the options, checks that a scope (``--year`` or ``--all``) was
    given and that the XML directory exists, then runs the processor either
    over every file or over the single ``<year>-beitraege.xml`` file.
    Exits with status 1 when the directory or the requested file is missing.
    """
    cli = argparse.ArgumentParser(description='Add order hints to Beiträge XML files')
    cli.add_argument('--year', type=int, help='Process only files for specific year')
    cli.add_argument('--all', action='store_true', help='Process all years')
    cli.add_argument('--dry-run', action='store_true', help='Show what would be changed without modifying files')
    cli.add_argument('--no-backup', action='store_true', help='Skip creating backup files')
    cli.add_argument(
        '--xml-dir',
        type=Path,
        default=Path('../../cache/git/XML/beitraege'),
        help='Directory containing XML files (default: ../../cache/git/XML/beitraege)',
    )
    opts = cli.parse_args()

    # A scope is mandatory: one year or everything.
    if not (opts.year or opts.all):
        cli.error("Must specify either --year YYYY or --all")

    source_dir = opts.xml_dir
    if not source_dir.exists():
        print(f"Error: XML directory not found: {source_dir}")
        sys.exit(1)

    keep_backup = not opts.no_backup
    processor = OrderHintProcessor(
        xml_dir=source_dir,
        backup=keep_backup,
        dry_run=opts.dry_run,
    )

    # --all takes precedence; it prints its own statistics when done.
    if opts.all:
        processor.process_all_files()
        return

    # Single-year mode: the file name is derived from the year.
    xml_file = source_dir / f"{opts.year}-beitraege.xml"
    if not xml_file.exists():
        print(f"Error: File not found: {xml_file}")
        sys.exit(1)

    processor.process_file(xml_file, opts.year)
    processor.print_stats()


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()