Files
jacoblenz/old_files_transfer/Program.cs
2023-02-06 16:00:47 +01:00

272 lines
10 KiB
C#

using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;
// Entry point: constructing the parser runs the whole conversion; its constructor
// reads the input document and writes the per-entry files below ./output/.
_ = new NeueSekLitParser("./old_data/sekn.xml");
/// <summary>
/// XML and string helpers shared by the parser classes in this file.
/// </summary>
static class Helpers {
    // Built once and reused; a Regex instance is safe to share between calls.
    private static readonly Regex WhiteSpaceRun = new Regex(@"\s+");

    /// <summary>
    /// Reads <paramref name="filepath"/> as UTF-8, collapses every run of whitespace
    /// (line breaks included) into a single space and parses the result as XML.
    /// </summary>
    /// <param name="filepath">Path of the file to load.</param>
    /// <returns>The parsed document.</returns>
    public static XDocument ParseFile(string filepath) {
        var text = File.ReadAllText(filepath, System.Text.Encoding.UTF8);
        text = ReplaceWhiteSpaces(text, " ");
        return XDocument.Parse(text, LoadOptions.PreserveWhitespace);
    }

    /// <summary>
    /// Tells whether the <c>class</c> attribute of <paramref name="element"/> contains
    /// <paramref name="classes"/>.
    /// </summary>
    /// <remarks>
    /// This is a plain substring test on the attribute value, not a token match:
    /// "Einzug" also matches a class attribute such as "EinzugKlein".
    /// </remarks>
    /// <returns>
    /// <c>false</c> when either argument is null/empty or the element has no non-blank
    /// <c>class</c> attribute; otherwise the result of the substring test.
    /// </returns>
    public static bool HasClass(string classes, XElement element) {
        if (string.IsNullOrEmpty(classes) || element == null) {
            return false;
        }
        if (!HasAttribute("class", element)) {
            return false;
        }
        return element.Attribute("class")!.Value.Contains(classes);
    }

    /// <summary>
    /// Tells whether <paramref name="element"/> carries an attribute named
    /// <paramref name="attributename"/> whose value is not null, empty or whitespace-only.
    /// </summary>
    public static bool HasAttribute(string attributename, XElement element) {
        if (string.IsNullOrEmpty(attributename) || element == null) {
            return false;
        }
        // Checked before the name lookup so attribute-less elements return early.
        if (!element.HasAttributes) {
            return false;
        }
        var attribute = element.Attribute(attributename);
        return attribute != null && !string.IsNullOrWhiteSpace(attribute.Value);
    }

    /// <summary>
    /// Replaces every run of one or more whitespace characters in <paramref name="str"/>
    /// with <paramref name="replacement"/>.
    /// </summary>
    public static string ReplaceWhiteSpaces(string str, string replacement) {
        return WhiteSpaceRun.Replace(str, replacement);
    }
}
/// <summary>
/// Splits a document into one HTML file per entry, written to
/// <c>./output/[input file name]/</c>.
/// </summary>
/// <remarks>
/// All work happens in the constructor. Elements are visited in document order:
/// an element whose class contains "Jahreszahl" sets the current year, one whose class
/// contains "Einzug" starts a new entry (its first <c>b</c> descendant, if any, supplies
/// the author name), and one whose class contains "Kommentar" is appended to the
/// current entry.
/// </remarks>
class SekundaerLiteraturParser {
    private readonly string filepath;
    private readonly string filename;
    private readonly string outputDirectory;
    private readonly XDocument document;
    private readonly List<(string? Jahr, string? Name, string Text, string Sort)> ParsedFiles;
    private readonly StringBuilder CurrentText;
    private string? Jahreszahl;
    private string? Name;

    /// <summary>
    /// Parses <paramref name="filepath"/> and writes the resulting entry files.
    /// An existing output directory for this input file is deleted and recreated.
    /// </summary>
    /// <param name="filepath">Path of the document to convert.</param>
    public SekundaerLiteraturParser(string filepath) {
        this.filepath = filepath;
        this.filename = Path.GetFileName(filepath);
        this.outputDirectory = "./output/" + filename;
        this.ParsedFiles = new List<(string? Jahr, string? Name, string Text, string Sort)>();
        this.CurrentText = new StringBuilder();

        if (Directory.Exists(outputDirectory)) {
            Directory.Delete(outputDirectory, true);
        }
        Directory.CreateDirectory(outputDirectory);

        this.document = Helpers.ParseFile(filepath);
        Parse();
        Flush();
    }

    // Walks the document and collects all entries into ParsedFiles.
    private void Parse() {
        foreach (var element in document.Descendants()) {
            if (Helpers.HasClass("Jahreszahl", element)) {
                // Close the running entry under the previous year before switching.
                // Text seen before the first year heading stays pending on purpose.
                if (Jahreszahl != null) {
                    InsertInto();
                }
                Jahreszahl = element.Value.Trim();
            }
            if (Helpers.HasClass("Einzug", element)) {
                // Commit the previous entry first: it must be stored under the
                // previous name, which is only overwritten below.
                InsertInto();
                var bold = element.Descendants("b").FirstOrDefault();
                if (bold != null) {
                    Name = bold.Value.Trim();
                }
                CurrentText.Append(element.ToString());
            }
            if (Helpers.HasClass("Kommentar", element)) {
                CurrentText.Append("\n\n");
                CurrentText.Append(element.ToString());
            }
        }
        // The entry that is still open when the document ends has to be committed too;
        // otherwise the last entry of every document is lost.
        InsertInto();
    }

    // Stores the collected text as an entry for the current year and name, then resets
    // the buffer. Does nothing when no text has been collected.
    private void InsertInto() {
        if (CurrentText.Length == 0) {
            return;
        }
        // Sort is the 1-based position among entries sharing the same year and name.
        var sort = ParsedFiles.Count(x => x.Jahr == Jahreszahl && x.Name == Name) + 1;
        ParsedFiles.Add((Jahreszahl, Name, CurrentText.ToString(), sort.ToString()));
        CurrentText.Clear();
    }

    // Writes every collected entry as "<Jahr>_<Name>_<Sort>.html", prefixed with a
    // "---" delimited header block holding Jahr, Autor and Sort.
    private void Flush() {
        foreach (var entry in ParsedFiles) {
            // NOTE(review): Jahr and Name go into the file name verbatim; a value
            // containing a path separator would make the write fail.
            var fn = outputDirectory + "/" + entry.Jahr + "_" + entry.Name + "_" + entry.Sort + ".html";

            var sb = new StringBuilder();
            sb.AppendLine("---");
            sb.AppendLine("Jahr: " + entry.Jahr);
            sb.AppendLine("Autor: " + entry.Name);
            sb.AppendLine("Sort: " + entry.Sort);
            sb.AppendLine("---");
            sb.Append(entry.Text);

            if (File.Exists(fn)) {
                Console.WriteLine("Überschreibt: " + fn);
            }
            File.WriteAllText(fn, sb.ToString());
        }
    }
}
/// <summary>
/// Splits a document into one HTML file per entry, grouped by heading, written to
/// <c>./output/[input file name]/</c>.
/// </summary>
/// <remarks>
/// All work happens in the constructor. Elements are visited in document order:
/// an element whose class contains "Jahreszahl" sets the current heading, one whose
/// class contains "Einzug" starts a new entry, and one whose class contains
/// "Kommentar" is appended to the current entry. The heading text is written to each
/// file's header block under the key given as <c>catname</c>.
/// </remarks>
class Parser {
    private readonly string filepath;
    private readonly string catname;
    private readonly string filename;
    private readonly string outputDirectory;
    private readonly XDocument document;
    private readonly List<(string? Jahr, string Name, string Text, string Sort)> ParsedFiles;
    private readonly StringBuilder CurrentText;
    private string? Jahreszahl;

    /// <summary>
    /// Parses <paramref name="filepath"/> and writes the resulting entry files.
    /// An existing output directory for this input file is deleted and recreated.
    /// </summary>
    /// <param name="filepath">Path of the document to convert.</param>
    /// <param name="catname">Header key under which the heading text is written.</param>
    public Parser(string filepath, string catname) {
        this.filepath = filepath;
        this.catname = catname;
        this.filename = Path.GetFileName(filepath);
        this.outputDirectory = "./output/" + filename;
        this.ParsedFiles = new List<(string? Jahr, string Name, string Text, string Sort)>();
        this.CurrentText = new StringBuilder();

        if (Directory.Exists(outputDirectory)) {
            Directory.Delete(outputDirectory, true);
        }
        Directory.CreateDirectory(outputDirectory);

        this.document = Helpers.ParseFile(filepath);
        Parse();
        Flush();
    }

    // Walks the document and collects all entries into ParsedFiles.
    private void Parse() {
        foreach (var element in document.Descendants()) {
            if (Helpers.HasClass("Jahreszahl", element)) {
                // Close the running entry under the previous heading before switching.
                // Text seen before the first heading stays pending on purpose.
                if (Jahreszahl != null) {
                    InsertInto();
                }
                Jahreszahl = element.Value.Trim();
            }
            if (Helpers.HasClass("Einzug", element)) {
                // A new entry begins: commit whatever the previous one collected.
                InsertInto();
                CurrentText.Append(element.ToString());
            }
            if (Helpers.HasClass("Kommentar", element)) {
                CurrentText.Append("\n\n");
                CurrentText.Append(element.ToString());
            }
        }
        // The entry that is still open when the document ends has to be committed too;
        // otherwise the last entry of every document is lost.
        InsertInto();
    }

    // Stores the collected text as an entry for the current heading, then resets the
    // buffer. Does nothing when no text has been collected.
    private void InsertInto() {
        if (CurrentText.Length == 0) {
            return;
        }
        // Sort is the 1-based position among entries sharing the same heading.
        var sort = ParsedFiles.Count(x => x.Jahr == Jahreszahl) + 1;
        ParsedFiles.Add((Jahreszahl, string.Empty, CurrentText.ToString(), sort.ToString()));
        CurrentText.Clear();
    }

    // Writes every collected entry as "<heading>_<Sort>.html" (whitespace runs in the
    // heading become "-"), prefixed with a "---" delimited header block.
    private void Flush() {
        foreach (var entry in ParsedFiles) {
            // Jahr is null for an entry committed before any heading was seen;
            // the regex replacement would throw on null.
            var heading = Helpers.ReplaceWhiteSpaces(entry.Jahr ?? string.Empty, "-");
            var fn = outputDirectory + "/" + heading + "_" + entry.Sort + ".html";

            var sb = new StringBuilder();
            sb.AppendLine("---");
            sb.AppendLine(catname + ": " + entry.Jahr);
            sb.AppendLine("Sort: " + entry.Sort);
            sb.AppendLine("---");
            sb.Append(entry.Text);

            if (File.Exists(fn)) {
                Console.WriteLine("Überschreibt: " + fn);
            }
            File.WriteAllText(fn, sb.ToString());
        }
    }
}
/// <summary>
/// Splits a document into one HTML file per entry, written to
/// <c>./output/[input file name]/</c>.
/// </summary>
/// <remarks>
/// All work happens in the constructor. Elements are visited in document order:
/// an <c>h1</c> element sets the current year, an element whose class contains
/// "Einzug" starts a new entry (its first <c>b</c> descendant, if any, supplies the
/// author name after surrounding brackets, colons and commas are stripped), and one
/// whose class contains "Kommentar" is appended to the current entry.
/// </remarks>
class NeueSekLitParser {
    // Characters stripped from both ends of an author name.
    private static readonly char[] NameTrimChars = { '[', ']', ':', ',' };

    private readonly string filepath;
    private readonly string filename;
    private readonly string outputDirectory;
    private readonly XDocument document;
    private readonly List<(string? Jahr, string? Name, string Text, string Sort)> ParsedFiles;
    private readonly StringBuilder CurrentText;
    private string? Jahreszahl;
    private string? Name;

    /// <summary>
    /// Parses <paramref name="filepath"/> and writes the resulting entry files.
    /// An existing output directory for this input file is deleted and recreated.
    /// </summary>
    /// <param name="filepath">Path of the document to convert.</param>
    public NeueSekLitParser(string filepath) {
        this.filepath = filepath;
        this.filename = Path.GetFileName(filepath);
        this.outputDirectory = "./output/" + filename;
        this.ParsedFiles = new List<(string? Jahr, string? Name, string Text, string Sort)>();
        this.CurrentText = new StringBuilder();

        if (Directory.Exists(outputDirectory)) {
            Directory.Delete(outputDirectory, true);
        }
        Directory.CreateDirectory(outputDirectory);

        this.document = Helpers.ParseFile(filepath);
        Parse();
        Flush();
    }

    // Walks the document and collects all entries into ParsedFiles.
    private void Parse() {
        foreach (var element in document.Descendants()) {
            if (element.Name == "h1") {
                // Close the running entry under the previous year before switching.
                // Text seen before the first heading stays pending on purpose.
                if (Jahreszahl != null) {
                    InsertInto();
                }
                Jahreszahl = element.Value.Trim();
            }
            if (Helpers.HasClass("Einzug", element)) {
                // Commit the previous entry first: it must be stored under the
                // previous name, which is only overwritten below.
                InsertInto();
                var bold = element.Descendants("b").FirstOrDefault();
                if (bold != null) {
                    Name = bold.Value.Trim().Trim(NameTrimChars);
                }
                CurrentText.Append(element.ToString());
            }
            if (Helpers.HasClass("Kommentar", element)) {
                CurrentText.Append("\n\n");
                CurrentText.Append(element.ToString());
            }
        }
        // The entry that is still open when the document ends has to be committed too;
        // otherwise the last entry of every document is lost.
        InsertInto();
    }

    // Stores the collected text as an entry for the current year and name, then resets
    // the buffer. Does nothing when no text has been collected.
    private void InsertInto() {
        if (CurrentText.Length == 0) {
            return;
        }
        // Sort is the 1-based position among entries sharing the same year and name.
        var sort = ParsedFiles.Count(x => x.Jahr == Jahreszahl && x.Name == Name) + 1;
        ParsedFiles.Add((Jahreszahl, Name, CurrentText.ToString(), sort.ToString()));
        CurrentText.Clear();
    }

    // Writes every collected entry as "<Jahr>_<Name>_<Sort>.html", prefixed with a
    // "---" delimited header block holding Jahr, Autor and Sort.
    private void Flush() {
        foreach (var entry in ParsedFiles) {
            // NOTE(review): Jahr and Name go into the file name verbatim; a value
            // containing a path separator would make the write fail.
            var fn = outputDirectory + "/" + entry.Jahr + "_" + entry.Name + "_" + entry.Sort + ".html";

            var sb = new StringBuilder();
            sb.AppendLine("---");
            sb.AppendLine("Jahr: " + entry.Jahr);
            sb.AppendLine("Autor: " + entry.Name);
            sb.AppendLine("Sort: " + entry.Sort);
            sb.AppendLine("---");
            sb.Append(entry.Text);

            if (File.Exists(fn)) {
                Console.WriteLine("Überschreibt: " + fn);
            }
            File.WriteAllText(fn, sb.ToString());
        }
    }
}