diff --git a/scripts/order/.gitignore b/scripts/order/.gitignore new file mode 100644 index 0000000..c9c7429 --- /dev/null +++ b/scripts/order/.gitignore @@ -0,0 +1,56 @@ +# Python Virtual Environment +venv/ +env/ +.env + +# Python cache files +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python + +# Generated/processed images +demo_* +cleaned_* +comparison_* +*_cleaned_* +*_comparison_* + +# Processing outputs +cleaned/ +output/ +results/ + +# Configuration files (may contain sensitive settings) +config.json +*.config.json +custom_*.json + +# Temporary files +*.tmp +*.temp +.DS_Store +Thumbs.db + +# IDE files +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Logs +*.log +logs/ + +# Test outputs +test_* +sample_output/ + +# Large source images (uncomment if you don't want to track originals) +# *.jpg +# *.jpeg +# *.png +# *.tif +# *.tiff \ No newline at end of file diff --git a/scripts/order/add_order_hints.py b/scripts/order/add_order_hints.py new file mode 100755 index 0000000..c28aa6d --- /dev/null +++ b/scripts/order/add_order_hints.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +""" +Script to add order hints to Beiträge XML files. + +This script processes XML files in cache/git/XML/beitraege/ and adds order="N" +attributes to elements when multiple pieces appear on the same page. +""" + +import argparse +import os +import shutil +import sys +from pathlib import Path +from typing import Dict, List, Tuple, Optional +from collections import defaultdict +from datetime import datetime + +try: + from lxml import etree +except ImportError: + print("Error: lxml is required. 
class OrderHintProcessor:
    """Add ``order`` attributes to ``<stueck>`` elements of pieces sharing a page.

    A "page" is identified by the ``when``, ``nr``, ``von`` and ``beilage``
    attributes of a ``<stueck>`` element.  When more than one ``<beitrag>``
    has a ``<stueck>`` on the same page, every matching ``<stueck>`` receives
    ``order="N"``, where N is the 1-based document-order position of its
    ``<beitrag>`` within that page group.

    Attributes:
        xml_dir: Directory that is searched for ``*-beitraege.xml`` files.
        backup: Whether to copy a file aside before overwriting it.
        dry_run: When true, nothing is modified; only statistics are collected.
        stats: Running counters reported by :meth:`print_stats`.
    """

    # Default namespace used by the Beiträge XML files; lookups fall back to
    # un-namespaced tag names when nothing is found under this namespace.
    _NAMESPACES = {'ns': 'https://www.koenigsberger-zeitungen.de'}

    def __init__(self, xml_dir: Path, backup: bool = True, dry_run: bool = False):
        self.xml_dir = xml_dir
        self.backup = backup
        self.dry_run = dry_run
        self.stats = {
            'files_processed': 0,
            'pieces_with_order_added': 0,
            'pages_with_multiple_pieces': 0,
            'existing_order_hints': 0
        }

    def process_all_files(self) -> None:
        """Process every ``*-beitraege.xml`` file in ``xml_dir``, then print stats.

        Files are handled in sorted order.  A failure in one file is reported
        and does not stop the remaining files from being processed.
        """
        xml_files = sorted(self.xml_dir.glob("*-beitraege.xml"))

        if not xml_files:
            print(f"No XML files found in {self.xml_dir}")
            return

        print(f"Found {len(xml_files)} XML files to process")

        for xml_file in xml_files:
            try:
                self.process_file(xml_file)
            except Exception as e:
                # Batch boundary: report the problem and keep going so one
                # broken file does not abort the whole run.
                print(f"Error processing {xml_file}: {e}")
                continue

        self.print_stats()

    def process_file(self, xml_file: Path, year: Optional[int] = None) -> None:
        """Add order hints to a single XML file.

        Args:
            xml_file: Path of the XML file to process.
            year: If given (and non-zero), the file is skipped unless its
                name starts with ``"<year>-"``.

        The file is rewritten only when at least one attribute was set and
        ``dry_run`` is false.  Files with XML syntax errors are reported and
        skipped without being counted as processed.
        """
        if year and not xml_file.name.startswith(f"{year}-"):
            return

        print(f"Processing {xml_file.name}...")

        # Parse with lxml so comments and whitespace survive the round trip.
        try:
            parser = etree.XMLParser(remove_comments=False, remove_blank_text=False)
            tree = etree.parse(str(xml_file), parser)
        except etree.XMLSyntaxError as e:
            print(f" XML syntax error: {e}")
            return

        changes_made = self._apply_order_hints(tree.getroot())

        if changes_made and not self.dry_run:
            self._save_file(xml_file, tree)

        self.stats['files_processed'] += 1

    def _apply_order_hints(self, root) -> bool:
        """Annotate all multi-piece pages below ``root``; return True if modified.

        Pages on which any matching ``<stueck>`` already carries an ``order``
        attribute are left untouched.  In dry-run mode the statistics are
        updated as if the hints were written, but the tree is not changed and
        the return value is False.
        """
        page_groups = self._group_pieces_by_page(root)

        print(f" Found {len(page_groups)} page groups")
        shared_pages = {
            page_key: pieces
            for page_key, pieces in page_groups.items()
            if len(pieces) > 1
        }
        for page_key, pieces in shared_pages.items():
            print(f" Multi-piece page: {page_key} has {len(pieces)} pieces")

        changes_made = False

        for page_key, pieces in shared_pages.items():
            self.stats['pages_with_multiple_pieces'] += 1

            # For each piece, only the <stueck> elements located on THIS page.
            # Looking at a piece's other pages would let a hint written for
            # one page suppress processing of another page sharing the piece.
            stuecke_per_piece = [
                [
                    stueck
                    for stueck in self._find_descendants(piece, 'stueck')
                    if self._stueck_matches_page(stueck, page_key)
                ]
                for piece in pieces
            ]

            has_existing_order = any(
                stueck.get('order') is not None
                for stuecke in stuecke_per_piece
                for stueck in stuecke
            )
            if has_existing_order:
                self.stats['existing_order_hints'] += len(pieces)
                print(f" Page {page_key}: Already has order hints, skipping")
                continue

            for order_num, stuecke in enumerate(stuecke_per_piece, 1):
                for stueck in stuecke:
                    if not self.dry_run:
                        stueck.set('order', str(order_num))
                        changes_made = True
                    self.stats['pieces_with_order_added'] += 1

            print(f" Page {page_key}: Added order hints to {len(pieces)} pieces")

        return changes_made

    def _find_descendants(self, element, tag: str) -> List:
        """Return descendants of ``element`` named ``tag``.

        The namespaced name is tried first; if nothing matches, the plain
        (un-namespaced) name is used instead.
        """
        found = element.findall(f'.//ns:{tag}', self._NAMESPACES)
        if not found:
            found = element.findall(f'.//{tag}')
        return found

    def _group_pieces_by_page(self, root) -> Dict[Tuple, List]:
        """Group ``<beitrag>`` elements by the pages they appear on.

        Returns:
            A mapping from ``(when, nr, von, beilage)`` to the list of
            ``<beitrag>`` elements having a ``<stueck>`` on that page, in
            document order and without duplicates.  ``beilage`` defaults to
            ``'0'``; ``<stueck>`` elements lacking ``when``, ``nr`` or
            ``von`` are ignored.
        """
        page_groups = defaultdict(list)

        beitrag_elements = self._find_descendants(root, 'beitrag')
        print(f" Found {len(beitrag_elements)} beitrag elements")

        for beitrag in beitrag_elements:
            for stueck in self._find_descendants(beitrag, 'stueck'):
                year = stueck.get('when')
                issue_nr = stueck.get('nr')
                page_von = stueck.get('von')
                beilage = stueck.get('beilage', '0')  # '0' stands for the main pages

                # Skip entries that cannot be assigned to a page.
                if not all([year, issue_nr, page_von]):
                    continue

                page_key = (year, issue_nr, page_von, beilage)

                # A piece may have several <stueck> on one page; list it once.
                group = page_groups[page_key]
                if beitrag not in group:
                    group.append(beitrag)

        return page_groups

    def _stueck_matches_page(self, stueck, page_key: Tuple) -> bool:
        """Return True if ``stueck`` lies on the page described by ``page_key``.

        ``page_key`` is a ``(when, nr, von, beilage)`` tuple; a missing
        ``beilage`` attribute on the element is treated as ``'0'``.
        """
        year, issue_nr, page_von, beilage = page_key

        return (
            stueck.get('when') == year and
            stueck.get('nr') == issue_nr and
            stueck.get('von') == page_von and
            stueck.get('beilage', '0') == beilage
        )

    def _save_file(self, xml_file: Path, tree) -> None:
        """Write ``tree`` back to ``xml_file``, backing the file up first if enabled.

        The backup is written next to the original with ``.backup`` appended
        to the ``.xml`` name; an existing backup is overwritten.
        """
        if self.backup:
            backup_file = xml_file.with_suffix('.xml.backup')
            shutil.copy2(xml_file, backup_file)
            print(f" Created backup: {backup_file.name}")

        tree.write(
            str(xml_file),
            encoding='utf-8',
            xml_declaration=True,
            pretty_print=True
        )

    def print_stats(self) -> None:
        """Print the counters collected so far, plus a dry-run notice if applicable."""
        print("\n" + "="*50)
        print("PROCESSING STATISTICS")
        print("="*50)
        print(f"Files processed: {self.stats['files_processed']}")
        print(f"Pages with multiple pieces: {self.stats['pages_with_multiple_pieces']}")
        print(f"Pieces with order hints added: {self.stats['pieces_with_order_added']}")
        print(f"Existing order hints found: {self.stats['existing_order_hints']}")

        if self.dry_run:
            print("\n*** DRY RUN MODE - No files were modified ***")
def main(argv=None):
    """Command-line entry point for adding order hints to Beiträge XML files.

    Args:
        argv: Optional list of command-line arguments.  When None, argparse
            reads ``sys.argv[1:]``, which keeps ``main()`` working as before.

    Exits with status 2 on invalid arguments (via argparse) and with status 1
    when the XML directory or the requested year file does not exist.
    """
    parser = argparse.ArgumentParser(description='Add order hints to Beiträge XML files')
    parser.add_argument('--year', type=int, help='Process only files for specific year')
    parser.add_argument('--all', action='store_true', help='Process all years')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be changed without modifying files')
    parser.add_argument('--no-backup', action='store_true', help='Skip creating backup files')
    # NOTE: the default is a relative path, so it is resolved against the
    # current working directory of the process.
    parser.add_argument('--xml-dir', type=Path, default=Path('../../cache/git/XML/beitraege'),
                        help='Directory containing XML files (default: ../../cache/git/XML/beitraege)')

    args = parser.parse_args(argv)

    # Compare against None so that an explicit ``--year 0`` counts as given.
    if args.year is None and not args.all:
        parser.error("Must specify either --year YYYY or --all")

    # is_dir() rather than exists(): a regular file is not a usable XML directory.
    if not args.xml_dir.is_dir():
        print(f"Error: XML directory not found: {args.xml_dir}", file=sys.stderr)
        sys.exit(1)

    processor = OrderHintProcessor(
        xml_dir=args.xml_dir,
        backup=not args.no_backup,
        dry_run=args.dry_run
    )

    if args.all:
        # process_all_files() prints the statistics itself.
        processor.process_all_files()
    else:
        xml_file = args.xml_dir / f"{args.year}-beitraege.xml"
        if not xml_file.exists():
            print(f"Error: File not found: {xml_file}", file=sys.stderr)
            sys.exit(1)
        processor.process_file(xml_file, args.year)
        processor.print_stats()


if __name__ == '__main__':
    main()