diff --git a/.github/workflows/Stats.yml b/.github/workflows/Stats.yml index 7d6871b..cb512a4 100644 --- a/.github/workflows/Stats.yml +++ b/.github/workflows/Stats.yml @@ -1,4 +1,4 @@ -name: Update Category Stats +name: Output Category Stats on: push: @@ -6,31 +6,15 @@ on: - main jobs: - update-stats: + output-stats: runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - - name: Install dependencies - run: pip install xml - - name: Run stats script env: INPUT_DIR: './XML/beitraege' # Path to your XML files directory - OUTPUT_FILE: './stats.txt' # Output stats file - run: python Scripts/stats.py - - - name: Commit and push changes run: | - git config --local user.name "GitHub Actions Bot" - git config --local user.email "actions@github.com" - git add stats.txt - git commit -m "Update stats file on commit" - git push - continue-on-error: true + python Scripts/stats.py + cat ./stats.txt # Output stats to console diff --git a/Scripts/stats.py b/Scripts/stats.py new file mode 100644 index 0000000..3ce08ec --- /dev/null +++ b/Scripts/stats.py @@ -0,0 +1,71 @@ +import xml.etree.ElementTree as ET +from collections import Counter, defaultdict +import os + +def parse_categories(file_paths): + namespace = {'kgpz': 'https://www.koenigsberger-zeitungen.de'} + stats = Counter() + usage = defaultdict(set) # Track where each category is used + + for file_path in file_paths: + tree = ET.parse(file_path) + root = tree.getroot() + + # Count categories from + for category in root.findall(".//kgpz:kategorie", namespace): + stats[category.get('ref')] += 1 + usage[category.get('ref')].add("kategorie") + + # Count categories from + for werk in root.findall(".//kgpz:werk", namespace): + if 'kat' in werk.attrib: + stats[werk.attrib['kat']] += 1 + usage[werk.attrib['kat']].add("werk") + + # Count categories from + for beitrag in root.findall(".//kgpz:beitrag", namespace): + if 'kat' in beitrag.attrib: + stats[beitrag.get('kat')] += 1 + usage[beitrag.get('kat')].add("beitrag") + + # Count categories from + for akteur in root.findall(".//kgpz:akteur", namespace): + if 'kat' in akteur.attrib: + stats[akteur.attrib['kat']] += 1 + usage[akteur.attrib['kat']].add("akteur") + + # Count categories from + for ort in root.findall(".//kgpz:ort", namespace): + if 'kat' in ort.attrib: + stats[ort.attrib['kat']] += 1 + usage[ort.attrib['kat']].add("ort") + + # Count categories from + for issue in root.findall(".//kgpz:issue", namespace): + if 'kat' in issue.attrib: + stats[issue.attrib['kat']] += 1 + usage[issue.attrib['kat']].add("issue") + + return stats, usage + +def write_stats(stats, usage, output_file): + sorted_stats = sorted(stats.items(), key=lambda x: x[1], reverse=True) + + with open(output_file, 'w') as f: + for category, count in sorted_stats: + usages = ", ".join(sorted(usage[category])) + f.write(f"{category}: {count} (used in: {usages})\n") + +def main(): + # Define file paths + input_dir = os.getenv("INPUT_DIR", "./XML/beitraege") + output_file = os.getenv("OUTPUT_FILE", "./stats.txt") + + # Get all XML files in the input directory + file_paths = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.xml')] + + stats, usage = parse_categories(file_paths) + write_stats(stats, usage, output_file) + +if __name__ == "__main__": + main()