Report missing 'kat' attributes in the category stats and publish the report
as a workflow artifact.

.github/workflows/Stats.yml
  * Pass OUTPUT_FILE='./stats.txt' to the "Run stats script" step.
  * Add an "Upload stats artifact" step that uploads ./stats.txt under the
    artifact name "category-stats".

Scripts/stats.py
  * parse_categories() additionally returns missing_kat_counts, a Counter
    pre-seeded with the keys "werk", "akteur" and "ort" and incremented once
    for every such element that has no 'kat' attribute.
  * write_stats() takes missing_kat_counts as a new third argument, writes a
    "Category Usage:" heading before the per-category lines and appends a
    "Missing 'kat' Counts:" section with one line per tracked element.
  * main() unpacks the third return value and forwards it to write_stats().

Review notes:
  * NOTE(review): this copy of the patch had lost its line breaks and
    indentation. They are rebuilt here so that every hunk matches the line
    counts in its header; the exact whitespace of context lines (YAML step
    indentation, spaces before inline comments) is inferred - verify against
    the target files, or apply with whitespace tolerance.
  * NOTE(review): the element names in the three "Count categories from"
    context comments were missing from this copy; they are restored from the
    loop each comment precedes - confirm against Scripts/stats.py.
  * The upload step uses actions/upload-artifact@v4 rather than v3, because
    GitHub has deprecated v3 of the artifact actions. The index line of the
    workflow diff still carries the blob ids of the original patch.

diff --git a/.github/workflows/Stats.yml b/.github/workflows/Stats.yml
index cb512a4..597beaa 100644
--- a/.github/workflows/Stats.yml
+++ b/.github/workflows/Stats.yml
@@ -15,6 +15,13 @@ jobs:
       - name: Run stats script
         env:
           INPUT_DIR: './XML/beitraege'  # Path to your XML files directory
+          OUTPUT_FILE: './stats.txt'  # Output file for stats
         run: |
           python Scripts/stats.py
           cat ./stats.txt  # Output stats to console
+
+      - name: Upload stats artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: category-stats
+          path: ./stats.txt  # Path to the stats file
diff --git a/Scripts/stats.py b/Scripts/stats.py
index 3ce08ec..ec63bbf 100644
--- a/Scripts/stats.py
+++ b/Scripts/stats.py
@@ -6,6 +6,7 @@ def parse_categories(file_paths):
     namespace = {'kgpz': 'https://www.koenigsberger-zeitungen.de'}
     stats = Counter()
     usage = defaultdict(set)  # Track where each category is used
+    missing_kat_counts = Counter({"werk": 0, "akteur": 0, "ort": 0})  # Track missing 'kat'
 
     for file_path in file_paths:
         tree = ET.parse(file_path)
@@ -21,6 +22,8 @@ def parse_categories(file_paths):
             if 'kat' in werk.attrib:
                 stats[werk.attrib['kat']] += 1
                 usage[werk.attrib['kat']].add("werk")
+            else:
+                missing_kat_counts["werk"] += 1
 
         # Count categories from <beitrag>
         for beitrag in root.findall(".//kgpz:beitrag", namespace):
@@ -33,12 +36,16 @@ def parse_categories(file_paths):
             if 'kat' in akteur.attrib:
                 stats[akteur.attrib['kat']] += 1
                 usage[akteur.attrib['kat']].add("akteur")
+            else:
+                missing_kat_counts["akteur"] += 1
 
         # Count categories from <ort>
         for ort in root.findall(".//kgpz:ort", namespace):
             if 'kat' in ort.attrib:
                 stats[ort.attrib['kat']] += 1
                 usage[ort.attrib['kat']].add("ort")
+            else:
+                missing_kat_counts["ort"] += 1
 
         # Count categories from <issue>
         for issue in root.findall(".//kgpz:issue", namespace):
@@ -46,16 +53,21 @@ def parse_categories(file_paths):
                 stats[issue.attrib['kat']] += 1
                 usage[issue.attrib['kat']].add("issue")
 
-    return stats, usage
+    return stats, usage, missing_kat_counts
 
 
-def write_stats(stats, usage, output_file):
+def write_stats(stats, usage, missing_kat_counts, output_file):
     sorted_stats = sorted(stats.items(), key=lambda x: x[1], reverse=True)
     with open(output_file, 'w') as f:
+        f.write("Category Usage:\n")
         for category, count in sorted_stats:
             usages = ", ".join(sorted(usage[category]))
             f.write(f"{category}: {count} (used in: {usages})\n")
 
+        f.write("\nMissing 'kat' Counts:\n")
+        for tag, count in missing_kat_counts.items():
+            f.write(f"<{tag}> missing 'kat': {count}\n")
+
 def main():
     # Define file paths
     input_dir = os.getenv("INPUT_DIR", "./XML/beitraege")
@@ -64,8 +76,8 @@ def main():
 
     # Get all XML files in the input directory
     file_paths = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.xml')]
-    stats, usage = parse_categories(file_paths)
-    write_stats(stats, usage, output_file)
+    stats, usage, missing_kat_counts = parse_categories(file_paths)
+    write_stats(stats, usage, missing_kat_counts, output_file)
 
 if __name__ == "__main__":
     main()