From ac1965a1bfd448d740b0c723a2ec649a63ee3770 Mon Sep 17 00:00:00 2001 From: Simon Martens Date: Mon, 20 Jan 2025 12:53:42 +0100 Subject: [PATCH] Stats script updates --- .gitignore | 1 + Scripts/stats.py | 121 ++++++++++++++++++++++++++++++----------------- 2 files changed, 79 insertions(+), 43 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ab56126 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +stats.txt diff --git a/Scripts/stats.py b/Scripts/stats.py index ec63bbf..25be3bd 100644 --- a/Scripts/stats.py +++ b/Scripts/stats.py @@ -2,60 +2,86 @@ import xml.etree.ElementTree as ET from collections import Counter, defaultdict import os -def parse_categories(file_paths): +def parse_beitraege(file_paths): namespace = {'kgpz': 'https://www.koenigsberger-zeitungen.de'} stats = Counter() usage = defaultdict(set) # Track where each category is used - missing_kat_counts = Counter({"werk": 0, "akteur": 0, "ort": 0}) # Track missing 'kat' + missing_kat_counts = Counter({"werk": 0, "akteur": 0, "ort": 0, "beitr": 0}) # Track missing 'kat' + multiple_categories = 0 # Count of beitraege with more than one category + category_combinations = Counter() # Track category combinations for file_path in file_paths: - tree = ET.parse(file_path) - root = tree.getroot() + print(f"Processing file: {file_path}") # Debugging: Log file being processed + try: + tree = ET.parse(file_path) + root = tree.getroot() - # Count categories from - for category in root.findall(".//kgpz:kategorie", namespace): - stats[category.get('ref')] += 1 - usage[category.get('ref')].add("kategorie") + # Iterate over direct children of + for beitrag in root.findall("./kgpz:beitrag", namespace): + print(f"Processing in file: {file_path}") # Debugging: Log each - # Count categories from - for werk in root.findall(".//kgpz:werk", namespace): - if 'kat' in werk.attrib: - stats[werk.attrib['kat']] += 1 - usage[werk.attrib['kat']].add("werk") - else: - missing_kat_counts["werk"] += 1 + # Collect categories for this beitrag + categories = set() - # Count categories from - for beitrag in root.findall(".//kgpz:beitrag", namespace): - if 'kat' in beitrag.attrib: - stats[beitrag.get('kat')] += 1 - usage[beitrag.get('kat')].add("beitrag") + # Process elements within each + for kategorie in beitrag.findall("kgpz:kategorie", namespace): + if 'ref' in kategorie.attrib: + stats[kategorie.attrib['ref']] += 1 + usage[kategorie.attrib['ref']].add("kategorie") + if kategorie.attrib['ref'] != "provinienz": + categories.add(kategorie.attrib['ref']) - # Count categories from - for akteur in root.findall(".//kgpz:akteur", namespace): - if 'kat' in akteur.attrib: - stats[akteur.attrib['kat']] += 1 - usage[akteur.attrib['kat']].add("akteur") - else: - missing_kat_counts["akteur"] += 1 + # Process elements within each + for werk in beitrag.findall("kgpz:werk", namespace): + if 'kat' in werk.attrib: + stats[werk.attrib['kat']] += 1 + usage[werk.attrib['kat']].add("werk") + if werk.attrib['kat'] != "provinienz": + categories.add(werk.attrib['kat']) + else: + missing_kat_counts["werk"] += 1 - # Count categories from - for ort in root.findall(".//kgpz:ort", namespace): - if 'kat' in ort.attrib: - stats[ort.attrib['kat']] += 1 - usage[ort.attrib['kat']].add("ort") - else: - missing_kat_counts["ort"] += 1 + # Process elements within each + for akteur in beitrag.findall("kgpz:akteur", namespace): + if 'kat' in akteur.attrib: + stats[akteur.attrib['kat']] += 1 + usage[akteur.attrib['kat']].add("akteur") + if akteur.attrib['kat'] != "provinienz": + categories.add(akteur.attrib['kat']) + else: + missing_kat_counts["akteur"] += 1 - # Count categories from - for issue in root.findall(".//kgpz:issue", namespace): - if 'kat' in issue.attrib: - stats[issue.attrib['kat']] += 1 - usage[issue.attrib['kat']].add("issue") + # Process elements within each + for ort in beitrag.findall("kgpz:ort", namespace): + if 'kat' in ort.attrib: + stats[ort.attrib['kat']] += 1 + usage[ort.attrib['kat']].add("ort") + if ort.attrib['kat'] != "provinienz": + categories.add(ort.attrib['kat']) + else: + missing_kat_counts["ort"] += 1 - return stats, usage, missing_kat_counts + # Process elements within each + for beitr in beitrag.findall("kgpz:beitrag", namespace): + if 'kat' in beitr.attrib: + stats[beitr.attrib['kat']] += 1 + usage[beitr.attrib['kat']].add("beitr") + if beitr.attrib['kat'] != "provinienz": + categories.add(beitr.attrib['kat']) + else: + missing_kat_counts["beitr"] += 1 -def write_stats(stats, usage, missing_kat_counts, output_file): + # Check for multiple categories and track combinations + if len(categories) > 1: + multiple_categories += 1 + category_combinations[frozenset(categories)] += 1 + + except ET.ParseError as e: + print(f"Warning: Failed to parse {file_path}: {e}") # Error handling: Skip malformed XML + + return stats, usage, missing_kat_counts, multiple_categories, category_combinations + +def write_stats(stats, usage, missing_kat_counts, multiple_categories, category_combinations, output_file): sorted_stats = sorted(stats.items(), key=lambda x: x[1], reverse=True) with open(output_file, 'w') as f: @@ -68,6 +94,15 @@ def write_stats(stats, usage, missing_kat_counts, output_file): for tag, count in missing_kat_counts.items(): f.write(f"<{tag}> missing 'kat': {count}\n") + f.write("\nBeitraege with Multiple Categories:\n") + f.write(f"Total: {multiple_categories}\n") + + f.write("\nCategory Combinations (Ordered by Most Used):\n") + sorted_combinations = sorted(category_combinations.items(), key=lambda x: x[1], reverse=True) + for combination, count in sorted_combinations: + combination_str = ", ".join(sorted(combination)) + f.write(f"{combination_str}: {count}\n") + def main(): # Define file paths input_dir = os.getenv("INPUT_DIR", "./XML/beitraege") @@ -76,8 +111,8 @@ def main(): # Get all XML files in the input directory file_paths = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.xml')] - stats, usage, missing_kat_counts = parse_categories(file_paths) - write_stats(stats, usage, missing_kat_counts, output_file) + stats, usage, missing_kat_counts, multiple_categories, category_combinations = parse_beitraege(file_paths) + write_stats(stats, usage, missing_kat_counts, multiple_categories, category_combinations, output_file) if __name__ == "__main__": main()