import xml.etree.ElementTree as ET from collections import Counter, defaultdict import os def parse_beitraege(file_paths): namespace = {'kgpz': 'https://www.koenigsberger-zeitungen.de'} stats = Counter() usage = defaultdict(set) # Track where each category is used missing_kat_counts = Counter({"werk": 0, "akteur": 0, "ort": 0, "beitr": 0}) # Track missing 'kat' multiple_categories = 0 # Count of beitraege with more than one category category_combinations = Counter() # Track category combinations for file_path in file_paths: print(f"Processing file: {file_path}") # Debugging: Log file being processed try: tree = ET.parse(file_path) root = tree.getroot() # Iterate over direct children of for beitrag in root.findall("./kgpz:beitrag", namespace): print(f"Processing in file: {file_path}") # Debugging: Log each # Collect categories for this beitrag categories = set() # Process elements within each for kategorie in beitrag.findall("kgpz:kategorie", namespace): if 'ref' in kategorie.attrib: stats[kategorie.attrib['ref']] += 1 usage[kategorie.attrib['ref']].add("kategorie") if kategorie.attrib['ref'] != "provenienz": categories.add(kategorie.attrib['ref']) # Process elements within each for werk in beitrag.findall("kgpz:werk", namespace): if 'kat' in werk.attrib: stats[werk.attrib['kat']] += 1 usage[werk.attrib['kat']].add("werk") if werk.attrib['kat'] != "provenienz": categories.add(werk.attrib['kat']) else: missing_kat_counts["werk"] += 1 # Process elements within each for akteur in beitrag.findall("kgpz:akteur", namespace): if 'kat' in akteur.attrib: stats[akteur.attrib['kat']] += 1 usage[akteur.attrib['kat']].add("akteur") if akteur.attrib['kat'] != "provenienz": categories.add(akteur.attrib['kat']) else: missing_kat_counts["akteur"] += 1 # Process elements within each for ort in beitrag.findall("kgpz:ort", namespace): if 'kat' in ort.attrib: stats[ort.attrib['kat']] += 1 usage[ort.attrib['kat']].add("ort") if ort.attrib['kat'] != "provenienz": categories.add(ort.attrib['kat']) else: missing_kat_counts["ort"] += 1 # Process elements within each for beitr in beitrag.findall("kgpz:beitrag", namespace): if 'kat' in beitr.attrib: stats[beitr.attrib['kat']] += 1 usage[beitr.attrib['kat']].add("beitr") if beitr.attrib['kat'] != "provenienz": categories.add(beitr.attrib['kat']) else: missing_kat_counts["beitr"] += 1 # Check for multiple categories and track combinations if len(categories) > 1: multiple_categories += 1 category_combinations[frozenset(categories)] += 1 except ET.ParseError as e: print(f"Warning: Failed to parse {file_path}: {e}") # Error handling: Skip malformed XML return stats, usage, missing_kat_counts, multiple_categories, category_combinations def write_stats(stats, usage, missing_kat_counts, multiple_categories, category_combinations, output_file): sorted_stats = sorted(stats.items(), key=lambda x: x[1], reverse=True) with open(output_file, 'w') as f: f.write("Category Usage:\n") for category, count in sorted_stats: usages = ", ".join(sorted(usage[category])) f.write(f"{category}: {count} (used in: {usages})\n") f.write("\nMissing 'kat' Counts:\n") for tag, count in missing_kat_counts.items(): f.write(f"<{tag}> missing 'kat': {count}\n") f.write("\nBeitraege with Multiple Categories:\n") f.write(f"Total: {multiple_categories}\n") f.write("\nCategory Combinations (Ordered by Most Used):\n") sorted_combinations = sorted(category_combinations.items(), key=lambda x: x[1], reverse=True) for combination, count in sorted_combinations: combination_str = ", ".join(sorted(combination)) f.write(f"{combination_str}: {count}\n") def main(): # Define file paths input_dir = os.getenv("INPUT_DIR", "./XML/beitraege") output_file = os.getenv("OUTPUT_FILE", "./stats.txt") # Get all XML files in the input directory file_paths = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.xml')] stats, usage, missing_kat_counts, multiple_categories, category_combinations = parse_beitraege(file_paths) write_stats(stats, usage, missing_kat_counts, multiple_categories, category_combinations, output_file) if __name__ == "__main__": main()