// Mirror of https://github.com/Theodor-Springmann-Stiftung/jacoblenz.git
// (synced 2025-10-28 16:55:33 +00:00; 114 lines, 4.2 KiB, C#)
using System.Linq;
|
|
using System.Text;
|
|
using System.Text.RegularExpressions;
|
|
using System.Xml.Linq;
|
|
|
|
// Program entry point: all work happens inside the parser's constructor,
// so the constructed instance itself is discarded.
_ = new SekundaerLiteraturParser("./old_data/sekundaer.xml");
|
|
|
|
/// <summary>
/// Stateless helpers for loading an XML file and inspecting element attributes.
/// </summary>
static class Helpers {
    // Built once and reused; the pattern never changes between calls.
    private static readonly Regex WhitespaceRun = new Regex(@"\s+");

    /// <summary>
    /// Reads <paramref name="filepath"/> as UTF-8, collapses every run of
    /// whitespace into a single space and parses the result as XML.
    /// </summary>
    /// <param name="filepath">Path of the XML file to load.</param>
    /// <returns>The parsed document, with the remaining whitespace preserved.</returns>
    public static XDocument ParseFile(string filepath) {
        var text = File.ReadAllText(filepath, System.Text.Encoding.UTF8);
        text = ReplaceWhiteSpaces(text);
        return XDocument.Parse(text, LoadOptions.PreserveWhitespace);
    }

    /// <summary>
    /// Tells whether the <c>class</c> attribute of <paramref name="element"/>
    /// contains <paramref name="classes"/>.
    /// </summary>
    /// <param name="classes">Text to look for inside the class attribute.</param>
    /// <param name="element">Element to inspect; <c>null</c> yields <c>false</c>.</param>
    /// <returns>
    /// <c>true</c> when the attribute exists, is not blank and contains the text.
    /// </returns>
    public static bool HasClass(string classes, XElement element) {
        if (String.IsNullOrEmpty(classes) || element == null) {
            return false;
        }

        var value = element.Attribute("class")?.Value;
        if (String.IsNullOrWhiteSpace(value)) {
            return false;
        }

        // NOTE(review): this is a substring match, so "Einzug" also matches a
        // class such as "KeinEinzug". Kept as-is because existing input may
        // rely on it; confirm against the data before switching to token matching.
        return value.Contains(classes);
    }

    /// <summary>
    /// Tells whether <paramref name="element"/> carries the attribute
    /// <paramref name="attributename"/> with a non-blank value.
    /// </summary>
    /// <param name="attributename">Name of the attribute to look up.</param>
    /// <param name="element">Element to inspect; <c>null</c> yields <c>false</c>.</param>
    /// <returns><c>true</c> when the attribute exists and is not empty or whitespace.</returns>
    public static bool HasAttribute(string attributename, XElement element) {
        if (String.IsNullOrEmpty(attributename) || element == null) {
            return false;
        }

        // Checked first so that an element without attributes never reaches
        // the string-to-XName conversion below.
        if (!element.HasAttributes) {
            return false;
        }

        var attribute = element.Attribute(attributename);
        return attribute != null && !String.IsNullOrWhiteSpace(attribute.Value);
    }

    /// <summary>
    /// Replaces every run of whitespace (spaces, tabs, line breaks) in
    /// <paramref name="str"/> with a single space.
    /// </summary>
    /// <param name="str">Text to normalise.</param>
    /// <returns>The text with whitespace runs collapsed.</returns>
    public static string ReplaceWhiteSpaces(string str) {
        return WhitespaceRun.Replace(str, " ");
    }
}
|
|
|
|
|
|
/// <summary>
/// Splits an XML file into individual entries and writes each one as an
/// HTML file below <c>./output/&lt;source file name&gt;/</c>.
/// </summary>
/// <remarks>
/// Elements are recognised by their <c>class</c> attribute:
/// "Jahreszahl" sets the current year, "Einzug" starts a new entry (its first
/// <c>&lt;b&gt;</c> descendant, if any, supplies the name) and "Kommentar" is
/// appended to the entry currently being collected. All work is done by the
/// constructor.
/// </remarks>
class SekundaerLiteraturParser {
    private readonly string filepath;
    private readonly string filename;
    private readonly string outputDirectory;
    private readonly XDocument document;
    private readonly List<(string Jahr, string Name, string Text, string Sort)> ParsedFiles;
    private readonly StringBuilder CurrentText;

    // Year and name that apply to the entry currently being collected.
    private string? Jahreszahl;
    private string? Name;

    /// <summary>
    /// Parses <paramref name="filepath"/> and writes one file per entry.
    /// An existing output directory for this source file is removed first.
    /// </summary>
    /// <param name="filepath">Path of the XML file to convert.</param>
    public SekundaerLiteraturParser(string filepath) {
        this.filepath = filepath;
        this.filename = Path.GetFileName(filepath);
        this.outputDirectory = "./output/" + filename;
        this.ParsedFiles = new List<(string Jahr, string Name, string Text, string Sort)>();
        this.CurrentText = new StringBuilder();

        // Delete the same directory that is checked and recreated; the
        // previous code deleted "./<filename>" instead.
        if (Directory.Exists(outputDirectory)) {
            Directory.Delete(outputDirectory, true);
        }
        Directory.CreateDirectory(outputDirectory);

        this.document = Helpers.ParseFile(filepath);

        foreach (var element in document.Descendants()) {
            if (Helpers.HasClass("Jahreszahl", element)) {
                // Close the pending entry under the old year before switching.
                // Guarding on the buffer avoids emitting empty entries.
                if (CurrentText.Length != 0) {
                    InsertInto();
                }
                this.Jahreszahl = element.Value.Trim();
            }

            if (Helpers.HasClass("Einzug", element)) {
                if (CurrentText.Length != 0) {
                    InsertInto();
                }

                var bold = element.Descendants("b").FirstOrDefault();
                if (bold != null) {
                    // Without a <b> descendant the previous name stays in effect.
                    this.Name = bold.Value.Trim();
                }

                CurrentText.Append(element.ToString());
            }

            if (Helpers.HasClass("Kommentar", element)) {
                CurrentText.Append("\n\n");
                CurrentText.Append(element.ToString());
            }
        }

        // The last entry has no following element to trigger its insertion.
        if (CurrentText.Length != 0) {
            InsertInto();
        }

        Flush();
    }

    /// <summary>
    /// Moves the collected text into <c>ParsedFiles</c> under the current year
    /// and name, then clears the buffer. When an entry with the same year and
    /// name already exists, the first one is marked with Sort "1" and the new
    /// one receives the next running number.
    /// </summary>
    private void InsertInto() {
        var jahr = Jahreszahl ?? string.Empty;
        var name = Name ?? string.Empty;
        var text = CurrentText.ToString();
        CurrentText.Clear();

        int firstIndex = ParsedFiles.FindIndex(x => x.Jahr == jahr && x.Name == name);
        if (firstIndex < 0) {
            ParsedFiles.Add((jahr, name, text, string.Empty));
            return;
        }

        // Tuples are value types: the element must be written back into the
        // list, otherwise only a local copy would be changed.
        var first = ParsedFiles[firstIndex];
        if (String.IsNullOrEmpty(first.Sort)) {
            ParsedFiles[firstIndex] = (first.Jahr, first.Name, first.Text, "1");
        }

        int existing = ParsedFiles.Count(x => x.Jahr == jahr && x.Name == name);
        ParsedFiles.Add((jahr, name, text, (existing + 1).ToString()));
    }

    /// <summary>
    /// Writes every collected entry to
    /// <c>&lt;output dir&gt;/&lt;Jahr&gt;_&lt;Name&gt;[_&lt;Sort&gt;].html</c>,
    /// prefixed with a "---" delimited header holding Jahr, Autor and, when
    /// set, Sort. A notice is printed when a file is about to be overwritten.
    /// </summary>
    private void Flush() {
        foreach (var entry in this.ParsedFiles) {
            bool hasSort = !String.IsNullOrWhiteSpace(entry.Sort);

            var fn = outputDirectory + "/" + entry.Jahr + "_" + entry.Name;
            if (hasSort) {
                fn += "_" + entry.Sort;
            }
            fn += ".html";

            var sb = new StringBuilder();
            sb.AppendLine("---");
            sb.AppendLine("Jahr: " + entry.Jahr);
            sb.AppendLine("Autor: " + entry.Name);
            if (hasSort) {
                sb.AppendLine("Sort: " + entry.Sort);
            }
            sb.AppendLine("---");
            sb.Append(entry.Text);

            if (File.Exists(fn)) {
                Console.WriteLine("Überschreibt: " + fn);
            }

            System.IO.File.WriteAllText(fn, sb.ToString());
        }
    }
}