Modified statistics script: also report how many <werk>, <akteur> and <ort> elements are missing a 'kat' attribute

This commit is contained in:
Simon Martens
2025-01-18 13:56:17 +01:00
parent e9ad04af41
commit 494bcde989
2 changed files with 23 additions and 4 deletions

View File

@@ -6,6 +6,7 @@ def parse_categories(file_paths):
namespace = {'kgpz': 'https://www.koenigsberger-zeitungen.de'}
stats = Counter()
usage = defaultdict(set) # Track where each category is used
missing_kat_counts = Counter({"werk": 0, "akteur": 0, "ort": 0}) # Track missing 'kat'
for file_path in file_paths:
tree = ET.parse(file_path)
@@ -21,6 +22,8 @@ def parse_categories(file_paths):
if 'kat' in werk.attrib:
stats[werk.attrib['kat']] += 1
usage[werk.attrib['kat']].add("werk")
else:
missing_kat_counts["werk"] += 1
# Count categories from <beitrag>
for beitrag in root.findall(".//kgpz:beitrag", namespace):
@@ -33,12 +36,16 @@ def parse_categories(file_paths):
if 'kat' in akteur.attrib:
stats[akteur.attrib['kat']] += 1
usage[akteur.attrib['kat']].add("akteur")
else:
missing_kat_counts["akteur"] += 1
# Count categories from <ort>
for ort in root.findall(".//kgpz:ort", namespace):
if 'kat' in ort.attrib:
stats[ort.attrib['kat']] += 1
usage[ort.attrib['kat']].add("ort")
else:
missing_kat_counts["ort"] += 1
# Count categories from <issue>
for issue in root.findall(".//kgpz:issue", namespace):
@@ -46,16 +53,21 @@ def parse_categories(file_paths):
stats[issue.attrib['kat']] += 1
usage[issue.attrib['kat']].add("issue")
return stats, usage
return stats, usage, missing_kat_counts
def write_stats(stats, usage, output_file):
def write_stats(stats, usage, missing_kat_counts, output_file):
    """Write the category statistics report to *output_file*.

    The report has two sections: one line per category, ordered by
    descending count, listing the element types it was used in; then one
    line per element tag giving how many elements had no 'kat' attribute.

    Args:
        stats: Mapping of category name to occurrence count.
        usage: Mapping of category name to the collection of element tag
            names the category was seen on.
        missing_kat_counts: Mapping of element tag name to the number of
            elements of that tag without a 'kat' attribute.
        output_file: Path of the text file to create or overwrite.
    """
    # sorted() is stable, so categories with equal counts keep the order
    # in which they appear in `stats`.
    sorted_stats = sorted(stats.items(), key=lambda x: x[1], reverse=True)

    # Explicit UTF-8 instead of the platform's locale encoding: category
    # names come from the XML input and may contain non-ASCII characters
    # (presumably German text -- confirm against the data), which would
    # otherwise fail or be written differently depending on the machine.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("Category Usage:\n")
        for category, count in sorted_stats:
            usages = ", ".join(sorted(usage[category]))
            f.write(f"{category}: {count} (used in: {usages})\n")

        f.write("\nMissing 'kat' Counts:\n")
        for tag, count in missing_kat_counts.items():
            f.write(f"<{tag}> missing 'kat': {count}\n")
def main():
# Define file paths
input_dir = os.getenv("INPUT_DIR", "./XML/beitraege")
@@ -64,8 +76,8 @@ def main():
# Get all XML files in the input directory
file_paths = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.xml')]
stats, usage = parse_categories(file_paths)
write_stats(stats, usage, output_file)
stats, usage, missing_kat_counts = parse_categories(file_paths)
write_stats(stats, usage, missing_kat_counts, output_file)
# Call main() only when this module is executed as a script, not when it is imported.
if __name__ == "__main__":
main()