mirror of
https://github.com/Theodor-Springmann-Stiftung/KGPZ.git
synced 2025-10-29 17:15:31 +00:00
Stats script updated
This commit is contained in:
24
.github/workflows/Stats.yml
vendored
24
.github/workflows/Stats.yml
vendored
@@ -1,4 +1,4 @@
|
|||||||
name: Update Category Stats
|
name: Output Category Stats
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
@@ -6,31 +6,15 @@ on:
|
|||||||
- main
|
- main
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
update-stats:
|
output-stats:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v3
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
- name: Set up Python
|
|
||||||
uses: actions/setup-python@v4
|
|
||||||
with:
|
|
||||||
python-version: '3.x'
|
|
||||||
|
|
||||||
- name: Install dependencies
|
|
||||||
run: pip install xml
|
|
||||||
|
|
||||||
- name: Run stats script
|
- name: Run stats script
|
||||||
env:
|
env:
|
||||||
INPUT_DIR: './XML/beitraege' # Path to your XML files directory
|
INPUT_DIR: './XML/beitraege' # Path to your XML files directory
|
||||||
OUTPUT_FILE: './stats.txt' # Output stats file
|
|
||||||
run: python Scripts/stats.py
|
|
||||||
|
|
||||||
- name: Commit and push changes
|
|
||||||
run: |
|
run: |
|
||||||
git config --local user.name "GitHub Actions Bot"
|
python Scripts/stats.py
|
||||||
git config --local user.email "actions@github.com"
|
cat ./stats.txt # Output stats to console
|
||||||
git add stats.txt
|
|
||||||
git commit -m "Update stats file on commit"
|
|
||||||
git push
|
|
||||||
continue-on-error: true
|
|
||||||
|
|||||||
71
Scripts/stats.py
Normal file
71
Scripts/stats.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from collections import Counter, defaultdict
|
||||||
|
import os
|
||||||
|
|
||||||
|
def parse_categories(file_paths):
    """Tally category references across a set of KGPZ XML files.

    Each file is parsed and searched (at any depth below the root) for the
    following namespaced elements; the value of the listed attribute is
    taken as the category identifier:

    * ``<kategorie>`` -- attribute ``ref``
    * ``<werk>``, ``<beitrag>``, ``<akteur>``, ``<ort>``, ``<issue>`` --
      attribute ``kat``

    Elements that do not carry the attribute are skipped, so a missing
    ``ref`` is never counted under the key ``None``.

    Args:
        file_paths: Iterable of paths to the XML files to scan.

    Returns:
        A ``(stats, usage)`` tuple. ``stats`` is a ``Counter`` mapping each
        category to its number of occurrences; ``usage`` is a
        ``defaultdict(set)`` mapping each category to the names of the
        elements it was found on.

    Raises:
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
        OSError: If a file cannot be opened.
    """
    namespace = {'kgpz': 'https://www.koenigsberger-zeitungen.de'}

    # (element name, attribute holding the category). The order is kept
    # stable because it determines the insertion order of the counter,
    # which in turn decides how ties are ordered by consumers.
    sources = (
        ('kategorie', 'ref'),
        ('werk', 'kat'),
        ('beitrag', 'kat'),
        ('akteur', 'kat'),
        ('ort', 'kat'),
        ('issue', 'kat'),
    )

    stats = Counter()
    usage = defaultdict(set)  # Track where each category is used

    for file_path in file_paths:
        root = ET.parse(file_path).getroot()

        for tag, attribute in sources:
            for element in root.findall(f".//kgpz:{tag}", namespace):
                category = element.get(attribute)
                if category is None:
                    continue
                stats[category] += 1
                usage[category].add(tag)

    return stats, usage
|
||||||
|
|
||||||
|
def write_stats(stats, usage, output_file):
    """Write a per-category usage report, most frequent category first.

    One line is written per category in the form
    ``<category>: <count> (used in: <elements>)``, where ``<elements>`` is
    the alphabetically sorted, comma-separated list of element names.
    Categories with equal counts keep the order in which they appear in
    ``stats`` (the sort is stable).

    Args:
        stats: Mapping of category to occurrence count.
        usage: Mapping of category to an iterable of element names; a
            category missing from this mapping is reported with an empty
            element list.
        output_file: Path of the report file; it is overwritten.
    """
    ranked = sorted(stats.items(), key=lambda item: item[1], reverse=True)

    # Explicit UTF-8 so that non-ASCII category names are written the same
    # way regardless of the platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as report:
        for category, count in ranked:
            # .get() avoids inserting keys into a defaultdict and avoids a
            # KeyError when a plain dict is passed.
            usages = ", ".join(sorted(usage.get(category, ())))
            report.write(f"{category}: {count} (used in: {usages})\n")
|
||||||
|
|
||||||
|
def main():
    """Generate the category report for the configured XML directory.

    The input directory is read from the ``INPUT_DIR`` environment variable
    (default ``./XML/beitraege``) and the report path from ``OUTPUT_FILE``
    (default ``./stats.txt``). Only directory entries whose name ends in
    ``.xml`` are processed; subdirectories are not descended into.
    """
    input_dir = os.getenv("INPUT_DIR", "./XML/beitraege")
    output_file = os.getenv("OUTPUT_FILE", "./stats.txt")

    # os.listdir() returns entries in arbitrary order. Sorting the names
    # makes the processing order, and therefore the order of categories
    # with equal counts in the report, reproducible between runs.
    file_paths = [
        os.path.join(input_dir, name)
        for name in sorted(os.listdir(input_dir))
        if name.endswith('.xml')
    ]

    stats, usage = parse_categories(file_paths)
    write_stats(stats, usage, output_file)
|
||||||
|
|
||||||
|
# Run the report only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user