mirror of
https://github.com/Theodor-Springmann-Stiftung/KGPZ.git
synced 2025-10-29 09:05:30 +00:00
Stats script updates
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
stats.txt
|
||||||
121
Scripts/stats.py
121
Scripts/stats.py
@@ -2,60 +2,86 @@ import xml.etree.ElementTree as ET
|
|||||||
from collections import Counter, defaultdict
|
from collections import Counter, defaultdict
|
||||||
import os
|
import os
|
||||||
|
|
||||||
def parse_categories(file_paths):
|
def parse_beitraege(file_paths):
|
||||||
namespace = {'kgpz': 'https://www.koenigsberger-zeitungen.de'}
|
namespace = {'kgpz': 'https://www.koenigsberger-zeitungen.de'}
|
||||||
stats = Counter()
|
stats = Counter()
|
||||||
usage = defaultdict(set) # Track where each category is used
|
usage = defaultdict(set) # Track where each category is used
|
||||||
missing_kat_counts = Counter({"werk": 0, "akteur": 0, "ort": 0}) # Track missing 'kat'
|
missing_kat_counts = Counter({"werk": 0, "akteur": 0, "ort": 0, "beitr": 0}) # Track missing 'kat'
|
||||||
|
multiple_categories = 0 # Count of beitraege with more than one category
|
||||||
|
category_combinations = Counter() # Track category combinations
|
||||||
|
|
||||||
for file_path in file_paths:
|
for file_path in file_paths:
|
||||||
tree = ET.parse(file_path)
|
print(f"Processing file: {file_path}") # Debugging: Log file being processed
|
||||||
root = tree.getroot()
|
try:
|
||||||
|
tree = ET.parse(file_path)
|
||||||
|
root = tree.getroot()
|
||||||
|
|
||||||
# Count categories from <kategorie>
|
# Iterate over direct children <beitrag> of <beitraege>
|
||||||
for category in root.findall(".//kgpz:kategorie", namespace):
|
for beitrag in root.findall("./kgpz:beitrag", namespace):
|
||||||
stats[category.get('ref')] += 1
|
print(f"Processing <beitrag> in file: {file_path}") # Debugging: Log each <beitrag>
|
||||||
usage[category.get('ref')].add("kategorie")
|
|
||||||
|
|
||||||
# Count categories from <werk>
|
# Collect categories for this beitrag
|
||||||
for werk in root.findall(".//kgpz:werk", namespace):
|
categories = set()
|
||||||
if 'kat' in werk.attrib:
|
|
||||||
stats[werk.attrib['kat']] += 1
|
|
||||||
usage[werk.attrib['kat']].add("werk")
|
|
||||||
else:
|
|
||||||
missing_kat_counts["werk"] += 1
|
|
||||||
|
|
||||||
# Count categories from <beitrag>
|
# Process <kategorie> elements within each <beitrag>
|
||||||
for beitrag in root.findall(".//kgpz:beitrag", namespace):
|
for kategorie in beitrag.findall("kgpz:kategorie", namespace):
|
||||||
if 'kat' in beitrag.attrib:
|
if 'ref' in kategorie.attrib:
|
||||||
stats[beitrag.get('kat')] += 1
|
stats[kategorie.attrib['ref']] += 1
|
||||||
usage[beitrag.get('kat')].add("beitrag")
|
usage[kategorie.attrib['ref']].add("kategorie")
|
||||||
|
if kategorie.attrib['ref'] != "provinienz":
|
||||||
|
categories.add(kategorie.attrib['ref'])
|
||||||
|
|
||||||
# Count categories from <akteur>
|
# Process <werk> elements within each <beitrag>
|
||||||
for akteur in root.findall(".//kgpz:akteur", namespace):
|
for werk in beitrag.findall("kgpz:werk", namespace):
|
||||||
if 'kat' in akteur.attrib:
|
if 'kat' in werk.attrib:
|
||||||
stats[akteur.attrib['kat']] += 1
|
stats[werk.attrib['kat']] += 1
|
||||||
usage[akteur.attrib['kat']].add("akteur")
|
usage[werk.attrib['kat']].add("werk")
|
||||||
else:
|
if werk.attrib['kat'] != "provinienz":
|
||||||
missing_kat_counts["akteur"] += 1
|
categories.add(werk.attrib['kat'])
|
||||||
|
else:
|
||||||
|
missing_kat_counts["werk"] += 1
|
||||||
|
|
||||||
# Count categories from <ort>
|
# Process <akteur> elements within each <beitrag>
|
||||||
for ort in root.findall(".//kgpz:ort", namespace):
|
for akteur in beitrag.findall("kgpz:akteur", namespace):
|
||||||
if 'kat' in ort.attrib:
|
if 'kat' in akteur.attrib:
|
||||||
stats[ort.attrib['kat']] += 1
|
stats[akteur.attrib['kat']] += 1
|
||||||
usage[ort.attrib['kat']].add("ort")
|
usage[akteur.attrib['kat']].add("akteur")
|
||||||
else:
|
if akteur.attrib['kat'] != "provinienz":
|
||||||
missing_kat_counts["ort"] += 1
|
categories.add(akteur.attrib['kat'])
|
||||||
|
else:
|
||||||
|
missing_kat_counts["akteur"] += 1
|
||||||
|
|
||||||
# Count categories from <issue>
|
# Process <ort> elements within each <beitrag>
|
||||||
for issue in root.findall(".//kgpz:issue", namespace):
|
for ort in beitrag.findall("kgpz:ort", namespace):
|
||||||
if 'kat' in issue.attrib:
|
if 'kat' in ort.attrib:
|
||||||
stats[issue.attrib['kat']] += 1
|
stats[ort.attrib['kat']] += 1
|
||||||
usage[issue.attrib['kat']].add("issue")
|
usage[ort.attrib['kat']].add("ort")
|
||||||
|
if ort.attrib['kat'] != "provinienz":
|
||||||
|
categories.add(ort.attrib['kat'])
|
||||||
|
else:
|
||||||
|
missing_kat_counts["ort"] += 1
|
||||||
|
|
||||||
return stats, usage, missing_kat_counts
|
# Process <beitrag> elements within each <beitrag>
|
||||||
|
for beitr in beitrag.findall("kgpz:beitrag", namespace):
|
||||||
|
if 'kat' in beitr.attrib:
|
||||||
|
stats[beitr.attrib['kat']] += 1
|
||||||
|
usage[beitr.attrib['kat']].add("beitr")
|
||||||
|
if beitr.attrib['kat'] != "provinienz":
|
||||||
|
categories.add(beitr.attrib['kat'])
|
||||||
|
else:
|
||||||
|
missing_kat_counts["beitr"] += 1
|
||||||
|
|
||||||
def write_stats(stats, usage, missing_kat_counts, output_file):
|
# Check for multiple categories and track combinations
|
||||||
|
if len(categories) > 1:
|
||||||
|
multiple_categories += 1
|
||||||
|
category_combinations[frozenset(categories)] += 1
|
||||||
|
|
||||||
|
except ET.ParseError as e:
|
||||||
|
print(f"Warning: Failed to parse {file_path}: {e}") # Error handling: Skip malformed XML
|
||||||
|
|
||||||
|
return stats, usage, missing_kat_counts, multiple_categories, category_combinations
|
||||||
|
|
||||||
|
def write_stats(stats, usage, missing_kat_counts, multiple_categories, category_combinations, output_file):
|
||||||
sorted_stats = sorted(stats.items(), key=lambda x: x[1], reverse=True)
|
sorted_stats = sorted(stats.items(), key=lambda x: x[1], reverse=True)
|
||||||
|
|
||||||
with open(output_file, 'w') as f:
|
with open(output_file, 'w') as f:
|
||||||
@@ -68,6 +94,15 @@ def write_stats(stats, usage, missing_kat_counts, output_file):
|
|||||||
for tag, count in missing_kat_counts.items():
|
for tag, count in missing_kat_counts.items():
|
||||||
f.write(f"<{tag}> missing 'kat': {count}\n")
|
f.write(f"<{tag}> missing 'kat': {count}\n")
|
||||||
|
|
||||||
|
f.write("\nBeitraege with Multiple Categories:\n")
|
||||||
|
f.write(f"Total: {multiple_categories}\n")
|
||||||
|
|
||||||
|
f.write("\nCategory Combinations (Ordered by Most Used):\n")
|
||||||
|
sorted_combinations = sorted(category_combinations.items(), key=lambda x: x[1], reverse=True)
|
||||||
|
for combination, count in sorted_combinations:
|
||||||
|
combination_str = ", ".join(sorted(combination))
|
||||||
|
f.write(f"{combination_str}: {count}\n")
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Define file paths
|
# Define file paths
|
||||||
input_dir = os.getenv("INPUT_DIR", "./XML/beitraege")
|
input_dir = os.getenv("INPUT_DIR", "./XML/beitraege")
|
||||||
@@ -76,8 +111,8 @@ def main():
|
|||||||
# Get all XML files in the input directory
|
# Get all XML files in the input directory
|
||||||
file_paths = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.xml')]
|
file_paths = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.xml')]
|
||||||
|
|
||||||
stats, usage, missing_kat_counts = parse_categories(file_paths)
|
stats, usage, missing_kat_counts, multiple_categories, category_combinations = parse_beitraege(file_paths)
|
||||||
write_stats(stats, usage, missing_kat_counts, output_file)
|
write_stats(stats, usage, missing_kat_counts, multiple_categories, category_combinations, output_file)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
Reference in New Issue
Block a user