Stats script updated

This commit is contained in:
Simon Martens
2025-01-16 13:19:15 +01:00
parent 5b0d80b8c7
commit f341dbfb33
2 changed files with 75 additions and 20 deletions

View File

@@ -1,4 +1,4 @@
name: Update Category Stats name: Output Category Stats
on: on:
push: push:
@@ -6,31 +6,15 @@ on:
- main - main
jobs: jobs:
update-stats: output-stats:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@v3 uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.x'
- name: Install dependencies
run: pip install xml
- name: Run stats script - name: Run stats script
env: env:
INPUT_DIR: './XML/beitraege' # Path to your XML files directory INPUT_DIR: './XML/beitraege' # Path to your XML files directory
OUTPUT_FILE: './stats.txt' # Output stats file
run: python Scripts/stats.py
- name: Commit and push changes
run: | run: |
git config --local user.name "GitHub Actions Bot" python Scripts/stats.py
git config --local user.email "actions@github.com" cat ./stats.txt # Output stats to console
git add stats.txt
git commit -m "Update stats file on commit"
git push
continue-on-error: true

71
Scripts/stats.py Normal file
View File

@@ -0,0 +1,71 @@
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict
import os
def parse_categories(file_paths):
    """Count category references across a set of XML files.

    Every file is parsed and searched, at any depth, for these elements in
    the ``kgpz`` namespace: ``kategorie`` (category id in its ``ref``
    attribute) and ``werk``, ``beitrag``, ``akteur``, ``ort`` and ``issue``
    (category id in their ``kat`` attribute). Elements that lack the
    relevant attribute are skipped, so no ``None`` key is ever recorded.

    Args:
        file_paths: Iterable of XML sources accepted by ``ET.parse``
            (file names or file objects).

    Returns:
        A ``(stats, usage)`` tuple. ``stats`` is a ``Counter`` mapping each
        category id to its number of occurrences; ``usage`` is a
        ``defaultdict(set)`` mapping each category id to the names of the
        elements it was found on.

    Raises:
        xml.etree.ElementTree.ParseError: If a source is not well-formed XML.
        OSError: If a file cannot be opened.
    """
    namespace = {'kgpz': 'https://www.koenigsberger-zeitungen.de'}
    # (element name, attribute that carries the category id)
    sources = (
        ('kategorie', 'ref'),
        ('werk', 'kat'),
        ('beitrag', 'kat'),
        ('akteur', 'kat'),
        ('ort', 'kat'),
        ('issue', 'kat'),
    )
    stats = Counter()
    usage = defaultdict(set)  # Track where each category is used

    for file_path in file_paths:
        root = ET.parse(file_path).getroot()
        for tag, attribute in sources:
            for element in root.findall(f".//kgpz:{tag}", namespace):
                category = element.get(attribute)
                if category is None:
                    # Attribute absent: nothing to attribute this element to.
                    continue
                stats[category] += 1
                usage[category].add(tag)

    return stats, usage
def write_stats(stats, usage, output_file):
    """Write one line per category to *output_file*, most frequent first.

    Each line has the form ``<category>: <count> (used in: <a>, <b>)`` where
    the element names are listed alphabetically. Categories with the same
    count are ordered by name so that repeated runs over the same data
    produce identical output. The file is always written as UTF-8 so that
    non-ASCII category names do not depend on the platform locale.

    Args:
        stats: Mapping of category id to occurrence count.
        usage: Mapping of category id to a collection of element names.
            A category missing from this mapping is written with an empty
            usage list.
        output_file: Path of the file to create or overwrite.

    Raises:
        OSError: If *output_file* cannot be opened for writing.
    """
    # Highest count first; the name is a tie-breaker for reproducible output.
    ranked = sorted(stats.items(), key=lambda item: (-item[1], str(item[0])))
    with open(output_file, 'w', encoding='utf-8') as f:
        for category, count in ranked:
            # .get() avoids a KeyError on plain dicts and avoids inserting
            # empty entries into a defaultdict as a side effect.
            usages = ", ".join(sorted(usage.get(category, ())))
            f.write(f"{category}: {count} (used in: {usages})\n")
def main():
    """Collect category statistics from the XML directory and write them out.

    The input directory is read from the ``INPUT_DIR`` environment variable
    and the output path from ``OUTPUT_FILE``; each falls back to a default
    when unset. Only directory entries whose name ends in ``.xml`` are
    processed.
    """
    # Resolve configuration from the environment
    source_dir = os.getenv("INPUT_DIR", "./XML/beitraege")
    target_file = os.getenv("OUTPUT_FILE", "./stats.txt")

    # Gather the XML files found directly inside the input directory
    xml_files = []
    for entry in os.listdir(source_dir):
        if entry.endswith('.xml'):
            xml_files.append(os.path.join(source_dir, entry))

    counts, where_used = parse_categories(xml_files)
    write_stats(counts, where_used, target_file)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()