mirror of
				https://github.com/Theodor-Springmann-Stiftung/KGPZ.git
				synced 2025-10-31 01:35:31 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			64 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			64 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import os
 | |
| from lxml import etree
 | |
| 
 | |
# Prefix-to-URI map handed to the XPath queries in this script as `namespaces=`.
NAMESPACE = dict(kgpz='https://www.koenigsberger-zeitungen.de')
# Absolute path of the directory that holds this script.
SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]
# Absolute path of the `XML` directory that sits next to the script's directory.
XML_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir, 'XML'))
 | |
| 
 | |
def parse_xml_file(filepath):
    """Parse the XML file at *filepath* and return its root element.

    The file is read with an lxml parser created with
    ``remove_blank_text=True``. If lxml raises ``etree.ParseError`` the
    error is printed and ``None`` is returned; any other exception
    propagates to the caller.
    """
    blank_stripping_parser = etree.XMLParser(remove_blank_text=True)
    try:
        document = etree.parse(filepath, blank_stripping_parser)
    except etree.ParseError as exc:
        print(f"Error parsing {filepath}: {exc}")
        return None
    return document.getroot()
 | |
| 
 | |
def get_all_ids(root, tag):
    """Return the set of ``id`` values of all ``kgpz:<tag>`` descendants of *root*.

    Args:
        root: Element to search below; it must provide an lxml-style
            ``xpath`` method.
        tag: Local element name, looked up in the ``kgpz`` namespace.

    Returns:
        A set with the ``id`` attribute value of every matching element.
        Elements that have no ``id`` attribute contribute nothing.
    """
    ids = {
        element.get('id')
        for element in root.xpath(f'.//kgpz:{tag}', namespaces=NAMESPACE)
    }
    # `get` yields None for an element without an `id`. Keeping that None in
    # the set would make a reference that lacks its `ref` attribute (also
    # None) look valid, so drop it.
    ids.discard(None)
    return ids
 | |
| 
 | |
def check_references(beitrag_root, reference_data, filename):
    """Report every reference in a parsed file that points to an unknown id.

    For each of the element names ``akteur``, ``kategorie``, ``ort`` and
    ``werk`` (in that order), all elements of that name in the ``kgpz``
    namespace are looked up and their ``ref`` attribute is tested against
    ``reference_data[<name>]``.

    Args:
        beitrag_root: Root element of the parsed file to check.
        reference_data: Mapping from element name to the collection of
            known ids for that name.
        filename: Name recorded in each reported error.

    Returns:
        A list of ``(filename, line_number, message)`` tuples, one per
        reference whose ``ref`` value is not among the known ids.
    """
    problems = []
    for ref_type in ('akteur', 'kategorie', 'ort', 'werk'):
        for node in beitrag_root.xpath(f'//kgpz:{ref_type}', namespaces=NAMESPACE):
            target = node.get('ref')
            if target in reference_data[ref_type]:
                continue
            problems.append(
                (filename, node.sourceline, f"INVALID REFERENCE ({ref_type}:{target})")
            )
    return problems
 | |
| 
 | |
def main():
    """Check the references in every ``*-beitraege.xml`` file and report them.

    The known ids are collected from ``akteure.xml``, ``kategorien.xml``,
    ``orte.xml`` and ``werke.xml`` in ``XML_DIR``. Every file in
    ``XML_DIR/beitraege`` whose name ends in ``-beitraege.xml`` is then
    checked with ``check_references``. The sorted findings are written to
    ``linter_results.txt`` in the current working directory and printed.

    Raises:
        SystemExit: With status 1 when a reference file cannot be parsed
            or when at least one invalid reference was found.
    """
    reference_files = (
        ('akteur', 'akteure.xml'),
        ('kategorie', 'kategorien.xml'),
        ('ort', 'orte.xml'),
        ('werk', 'werke.xml'),
    )
    reference_data = {}
    for ref_type, xml_name in reference_files:
        root = parse_xml_file(os.path.join(XML_DIR, xml_name))
        if root is None:
            # parse_xml_file has already printed the parse error. Without
            # this file's ids nothing can be validated, so stop with a clear
            # message instead of failing later on `None.xpath`.
            print(f"Cannot check references: {xml_name} could not be parsed.")
            raise SystemExit(1)
        reference_data[ref_type] = get_all_ids(root, ref_type)

    all_errors = []

    beitraege_dir = os.path.join(XML_DIR, 'beitraege')
    for filename in os.listdir(beitraege_dir):
        if not filename.endswith('-beitraege.xml'):
            continue
        beitrag_root = parse_xml_file(os.path.join(beitraege_dir, filename))
        # NOTE(review): a file that fails to parse is only reported by
        # parse_xml_file and does not affect the exit status - confirm that
        # this is intended.
        if beitrag_root is not None:
            all_errors.extend(check_references(beitrag_root, reference_data, filename))

    # Order by file name, then by line number, so the report is stable
    # regardless of the order in which os.listdir returned the files.
    all_errors.sort(key=lambda x: (x[0], x[1]))

    # Explicit UTF-8 so ids containing non-ASCII characters can always be
    # written, independent of the platform's default encoding.
    with open('linter_results.txt', 'w', encoding='utf-8') as f:
        f.writelines(
            f"{filename}:{line_number}:{error_message}\n"
            for filename, line_number, error_message in all_errors
        )

    if all_errors:
        for filename, line_number, error_message in all_errors:
            print(f"{filename}, Line {line_number}: {error_message}")
        # `exit()` is injected by the `site` module for interactive use and
        # may be missing; raising SystemExit has the same effect everywhere.
        raise SystemExit(1)  # Exit with error code if there are any errors
    else:
        print("No errors found.")
 | |
| 
 | |
# Run the linter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
 | 
