mirror of
https://github.com/Theodor-Springmann-Stiftung/hamann-ausgabe-core.git
synced 2025-10-29 09:15:33 +00:00
Added Transformation Script
This commit is contained in:
125
Transformation-2023-9-15/AutopsicNumberTransform.cs
Normal file
125
Transformation-2023-9-15/AutopsicNumberTransform.cs
Normal file
@@ -0,0 +1,125 @@
|
||||
using System.Xml;
|
||||
using System.Xml.Linq;
|
||||
|
||||
/// <summary>
/// Rewrites the letter numbers used throughout the XML sources.
/// <see cref="Collect"/> loads the files and indexes every element that carries a letter
/// number; <see cref="Transform"/> then replaces each old number with the value found in the
/// <c>autopsic</c> child of the matching <c>letterDesc</c>. All changes happen in memory on
/// the documents stored in <see cref="Documents"/>.
/// </summary>
public static class AutopsicNumberTransform {
    // Only intlinks whose letter number is greater than this value are indexed.
    // NOTE(review): presumably numbering is unchanged up to this letter -- confirm.
    const int IntlinkThreshold = 368;

    // State shared between Collect and Transform.
    // Documents: (source path, loaded document, whether anything relevant was found in it).
    public static List<(string, XDocument, bool)> Documents = new();
    // Old number (letterDesc/@ref) -> new number (letterDesc/autopsic/@value).
    static Dictionary<string, string> OldNewIndex = new();
    // Elements indexed by the letter number they carried when they were collected.
    static Dictionary<string, List<XElement>> Intlinks = new();
    public static Dictionary<string, List<XElement>> Marginals = new();
    static Dictionary<string, List<XElement>> LetterTexts = new();
    static Dictionary<string, List<XElement>> LetterTraditions = new();
    static Dictionary<string, List<XElement>> LetterDescs = new();

    /// <summary>
    /// Loads every file in <paramref name="xmls"/> (whitespace preserved), indexes the
    /// elements that reference a letter number and appends the document to
    /// <see cref="Documents"/>. Results accumulate across calls.
    /// </summary>
    /// <param name="xmls">Paths of the XML files to load.</param>
    /// <exception cref="ArgumentException">
    /// Two <c>letterDesc</c> elements with an <c>autopsic</c> value share the same <c>ref</c>.
    /// </exception>
    public static void Collect(string[] xmls) {
        foreach (var f in xmls) {
            // Read-only access is enough; the stream is closed as soon as the document is loaded.
            XDocument d;
            using (FileStream fs = File.OpenRead(f)) {
                d = XDocument.Load(fs, LoadOptions.PreserveWhitespace);
            }
            var affected = false;

            foreach (var e in d.Descendants("intlink")) {
                var letter = e.Attribute("letter")?.Value;
                if (letter != null && Int32.TryParse(letter, out var number) && number > IntlinkThreshold) {
                    Register(Intlinks, letter, e);
                    Console.WriteLine("intlink: " + e.ToString() + ", document: " + f);
                    affected = true;
                }
            }

            // "|=" always evaluates its right-hand side, so every element kind is indexed.
            affected |= RegisterAll(d, "marginal", "letter", Marginals);
            affected |= RegisterAll(d, "letterText", "index", LetterTexts);
            affected |= RegisterAll(d, "letterTradition", "ref", LetterTraditions);

            foreach (var e in d.Descendants("letterDesc")) {
                var reference = e.Attribute("ref")?.Value;
                if (reference == null) continue;
                Register(LetterDescs, reference, e);

                var newNumber = e.Element("autopsic")?.Attribute("value")?.Value;
                if (newNumber != null) {
                    // Add (not the indexer) throws on a duplicate ref, so conflicting
                    // mappings surface instead of silently overwriting each other.
                    OldNewIndex.Add(reference, newNumber);
                    affected = true;
                }
            }

            Documents.Add((f, d, affected));
        }
    }

    /// <summary>
    /// Applies every collected old -> new mapping to the indexed elements:
    /// an existing <c>letter</c> attribute gets the new value; otherwise a <c>ref</c> or
    /// <c>index</c> attribute is replaced by a new <c>letter</c> attribute. For elements
    /// with <c>ref</c> the <c>autopsic</c> child is removed as well, and any
    /// <c>autopsic</c> attribute is dropped.
    /// </summary>
    public static void Transform() {
        List<Dictionary<string, List<XElement>>> collections = new() { Intlinks, Marginals, LetterTexts, LetterTraditions, LetterDescs };
        foreach (var (oldNumber, newNumber) in OldNewIndex) {
            Console.WriteLine(oldNumber + " -> " + newNumber);
            foreach (var c in collections) {
                if (!c.TryGetValue(oldNumber, out var elements)) continue;
                foreach (var v in elements) {
                    if (v.Attribute("letter") is XAttribute letter) {
                        letter.Value = newNumber;
                    } else if (v.Attribute("ref") is XAttribute reference) {
                        v.Add(new XAttribute("letter", newNumber));
                        reference.Remove();
                        v.Element("autopsic")?.Remove();
                    } else if (v.Attribute("index") is XAttribute index) {
                        v.Add(new XAttribute("letter", newNumber));
                        index.Remove();
                    }
                    v.Attribute("autopsic")?.Remove();
                }
            }
        }
    }

    // Indexes every <elementName> descendant that has the given attribute; returns true if any was found.
    static bool RegisterAll(XDocument document, string elementName, string attributeName, Dictionary<string, List<XElement>> index) {
        var found = false;
        foreach (var e in document.Descendants(elementName)) {
            var key = e.Attribute(attributeName)?.Value;
            if (key == null) continue;
            Register(index, key, e);
            found = true;
        }
        return found;
    }

    // Appends element to the list stored under key, creating the list on first use.
    static void Register(Dictionary<string, List<XElement>> index, string key, XElement element) {
        if (!index.TryGetValue(key, out var bucket)) {
            bucket = new List<XElement>();
            index.Add(key, bucket);
        }
        bucket.Add(element);
    }
}
|
||||
63
Transformation-2023-9-15/CharacterEntitiesReferences.cs
Normal file
63
Transformation-2023-9-15/CharacterEntitiesReferences.cs
Normal file
@@ -0,0 +1,63 @@
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
/// <summary>
/// Helpers for numeric character references (<c>&amp;#xA0;</c>, <c>&amp;#160;</c>):
/// <see cref="GetCodePoints"/> gathers the characters that files refer to by number, and
/// <see cref="Replace"/> turns literal occurrences of given characters back into
/// hexadecimal references.
/// </summary>
public static class CharacterEntityReferences {
    // Unicode ends at U+10FFFF: at most 6 hex digits or 7 decimal digits.
    static readonly Regex HexReference = new Regex(@"&#x([0-9a-fA-F]{1,6});");
    static readonly Regex DecimalReference = new Regex(@"&#([0-9]{1,7});");

    // Characters that are never reported by GetCodePoints.
    static readonly HashSet<string> XMLForbidden = new HashSet<string>() { "<", "&", ">", "'", "\"" };

    /// <summary>
    /// Rewrites each file in place, replacing every occurrence of each string in
    /// <paramref name="codepoints"/> by its hexadecimal character references.
    /// </summary>
    /// <param name="files">Paths of the files to rewrite.</param>
    /// <param name="codepoints">Strings to replace; <c>null</c> and empty entries are skipped.</param>
    public static void Replace(IEnumerable<string> files, IEnumerable<string?> codepoints) {
        // Materialise once: the argument may be a lazy query, and the reference text
        // for a given string is the same for every file.
        var replacements = new List<(string Character, string Reference)>();
        foreach (var s in codepoints) {
            // string.Replace rejects an empty search string.
            if (string.IsNullOrEmpty(s)) continue;
            replacements.Add((s, ConvertStringToCodepoint(s)));
        }

        foreach (var f in files) {
            Console.WriteLine("Replacing file " + f);
            var t = File.ReadAllText(f);
            foreach (var (character, reference) in replacements) {
                t = t.Replace(character, reference);
            }
            File.WriteAllText(f, t);
        }
    }

    // Encodes every code point of input as "&#xHEX;" (surrogate pairs count as one code point).
    static string ConvertStringToCodepoint(string input) {
        var sb = new StringBuilder();
        for (int i = 0; i < input.Length; i += char.IsSurrogatePair(input, i) ? 2 : 1) {
            int codepoint = char.ConvertToUtf32(input, i);
            sb.Append("&#x").Append($"{codepoint:X}").Append(';');
        }
        return sb.ToString();
    }

    /// <summary>
    /// Scans the files for hexadecimal and decimal character references and returns the
    /// distinct characters they denote, excluding <c>&lt; &amp; &gt; ' "</c>.
    /// References to values that are not valid Unicode code points are ignored.
    /// Each decimal reference is echoed to the console together with its character.
    /// </summary>
    /// <param name="files">Paths of the files to scan.</param>
    /// <returns>The set of referenced characters, each as a string.</returns>
    public static HashSet<string?> GetCodePoints(string[] files) {
        HashSet<string?> res = new();
        foreach (var f in files) {
            var t = File.ReadAllText(f);

            foreach (Match m in HexReference.Matches(t)) {
                AddCodePoint(res, Convert.ToInt32(m.Groups[1].Value, 16));
            }

            foreach (Match m in DecimalReference.Matches(t)) {
                var c = AddCodePoint(res, Convert.ToInt32(m.Groups[1].Value));
                Console.WriteLine(m.Value + " " + c);
            }
        }
        return res;
    }

    // Adds the character for value to res unless it is XML-forbidden.
    // Returns the character, or null when value is not a valid code point
    // (char.ConvertFromUtf32 throws for surrogates and values above U+10FFFF).
    static string? AddCodePoint(HashSet<string?> res, int value) {
        if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) return null;
        var c = char.ConvertFromUtf32(value);
        if (!XMLForbidden.Contains(c)) res.Add(c);
        return c;
    }
}
|
||||
47
Transformation-2023-9-15/FileOperations.cs
Normal file
47
Transformation-2023-9-15/FileOperations.cs
Normal file
@@ -0,0 +1,47 @@
|
||||
using System.Xml;
|
||||
using System.Xml.Linq;
|
||||
|
||||
/// <summary>
/// File-system helpers for the transformation run: locating the XML sources and
/// writing the transformed documents.
/// </summary>
public static class FileOperations {
    /// <summary>
    /// Writes every document whose flag (third tuple item) is set into
    /// <paramref name="dest"/>, keeping the file name of its source path.
    /// The directory is created when the first flagged document is encountered.
    /// Entries whose source path has no file name are skipped.
    /// </summary>
    /// <param name="Documents">Tuples of (source path, document, write flag).</param>
    /// <param name="dest">Destination directory.</param>
    public static void SaveFile(List<(string, XDocument, bool)> Documents, string dest) {
        XmlWriterSettings settings = new XmlWriterSettings() {
            CheckCharacters = false,
            Encoding = System.Text.Encoding.UTF8,
        };

        foreach (var (source, document, affected) in Documents) {
            if (!affected) continue;

            // No-op when the directory already exists.
            Directory.CreateDirectory(dest);

            // Path.GetFileName understands the platform's separators ('/' and, on Windows, '\').
            var filename = Path.GetFileName(source);
            // Skip only this entry; the remaining documents must still be written.
            if (string.IsNullOrEmpty(filename)) continue;

            using XmlWriter writer = XmlWriter.Create(Path.Combine(dest, filename), settings);
            document.Save(writer);
        }
    }

    /// <summary>
    /// Returns the <c>*.xml</c> files directly inside <paramref name="path"/>.
    /// When <paramref name="branch_name"/> is given, first verifies that
    /// <c>.git/HEAD</c> under <paramref name="git_path"/> ends with that name
    /// (guards against transforming the wrong data).
    /// </summary>
    /// <param name="path">Directory containing the XML files.</param>
    /// <param name="git_path">Repository root; only used when <paramref name="branch_name"/> is not null.</param>
    /// <param name="branch_name">Expected branch, or <c>null</c> to skip the check.</param>
    /// <returns>The paths of the XML files found (never empty).</returns>
    /// <exception cref="InvalidOperationException">HEAD does not end with the expected branch name.</exception>
    /// <exception cref="FileNotFoundException">HEAD is missing, or the directory contains no XML files.</exception>
    /// <exception cref="DirectoryNotFoundException"><paramref name="path"/> does not exist.</exception>
    public static string[] GetXMLs(string path, string? git_path, string? branch_name) {
        // BASIC CHECKS
        if (branch_name != null) {
            // Path.Combine works whether or not git_path ends with a separator.
            var head = Path.Combine(git_path ?? string.Empty, ".git", "HEAD");
            if (!File.Exists(head)) throw new FileNotFoundException("Specified Paths do not exist");

            var text = File.ReadAllText(head).Trim();
            if (!text.EndsWith(branch_name, StringComparison.Ordinal))
                throw new InvalidOperationException("Not it the Branch " + branch_name);
        }

        if (!Directory.Exists(path)) throw new DirectoryNotFoundException("Directory does not exist!");

        var xmls = Directory.GetFiles(path, "*.xml");
        if (xmls.Length == 0) throw new FileNotFoundException("No XML Data fonund!");

        return xmls;
    }
}
|
||||
38
Transformation-2023-9-15/MarginalsTransform.cs
Normal file
38
Transformation-2023-9-15/MarginalsTransform.cs
Normal file
@@ -0,0 +1,38 @@
|
||||
using System.Xml.Linq;
|
||||
|
||||
/// <summary>
/// Replaces the <c>index</c> attribute of marginals by a <c>sort</c> attribute where
/// several marginals share the same letter, page and line, and drops <c>index</c>
/// where a marginal is alone at its position.
/// </summary>
public static class MarginalsTransform {
    // Elements to process; assigned by the caller before Transform is invoked.
    // Defaults to an empty list so that Transform is safe to call at any time.
    public static List<XElement> Marginals = new();

    /// <summary>
    /// Groups the marginals that have <c>letter</c>, <c>page</c> and <c>line</c> attributes by
    /// those three values. In a group of several, the members with a numeric <c>index</c>
    /// are ordered by it, lose <c>index</c> and receive <c>sort</c> = 1, 2, 3, ...;
    /// members without a numeric <c>index</c> are left untouched. A marginal that is alone
    /// in its group just loses its <c>index</c> attribute.
    /// </summary>
    public static void Transform() {
        if (Marginals == null) return;

        // A tuple key cannot collide the way a "-"-joined string can when a value contains '-'.
        var groups = Marginals
            .Where(x => x.Attribute("line") != null &&
                        x.Attribute("page") != null &&
                        x.Attribute("letter") != null)
            .GroupBy(x => (
                Letter: x.Attribute("letter")!.Value,
                Page: x.Attribute("page")!.Value,
                Line: x.Attribute("line")!.Value));

        foreach (var group in groups) {
            var members = group.ToList();

            if (members.Count == 1) {
                members[0].Attribute("index")?.Remove();
                continue;
            }

            // Parse each index once; OrderBy is stable, so equal indices keep document order.
            var ordered = members
                .Select(x => (Element: x, Parsed: Int32.TryParse(x.Attribute("index")?.Value, out var n) ? n : (int?)null))
                .Where(x => x.Parsed != null)
                .OrderBy(x => x.Parsed!.Value)
                .Select(x => x.Element)
                .ToList();

            var i = 1;
            foreach (var e in ordered) {
                e.Attribute("index")!.Remove();
                // SetAttributeValue overwrites an existing "sort" instead of throwing on a duplicate.
                e.SetAttributeValue("sort", i.ToString());
                i++;
            }
        }
    }
}
|
||||
@@ -1,5 +1,7 @@
|
||||
using System.IO;
|
||||
using System.Security;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Xml;
|
||||
using System.Xml.Linq;
|
||||
// See https://aka.ms/new-console-template for more information
|
||||
@@ -8,158 +10,16 @@ const string DEST_PATH = "C:/Users/simon/source/hamann-xml/transformations_2023-
|
||||
const string GIT_PATH = "C:/Users/simon/source/hamann-xml/";
|
||||
const string BRANCH_NAME = "testdata";
|
||||
|
||||
if (File.Exists(GIT_PATH + ".git/HEAD") || !Directory.Exists(XML_PATH)) {
|
||||
var text = File.ReadAllText(GIT_PATH + ".git/HEAD").Trim();
|
||||
if (!text.EndsWith(BRANCH_NAME)) {
|
||||
throw new("Not it the Branch " + BRANCH_NAME);
|
||||
}
|
||||
} else {
|
||||
throw new("Specified Paths do not exist");
|
||||
}
|
||||
var xmls = FileOperations.GetXMLs(XML_PATH, GIT_PATH, BRANCH_NAME);
|
||||
var cp = CharacterEntityReferences.GetCodePoints(xmls);
|
||||
|
||||
var xmls = Directory.GetFiles(XML_PATH, "*.xml");
|
||||
AutopsicNumberTransform.Collect(xmls);
|
||||
AutopsicNumberTransform.Transform();
|
||||
MarginalsTransform.Marginals = AutopsicNumberTransform.Marginals.SelectMany(x => x.Value).ToList();
|
||||
MarginalsTransform.Transform();
|
||||
FileOperations.SaveFile(AutopsicNumberTransform.Documents, DEST_PATH);
|
||||
|
||||
if (xmls == null || !xmls.Any()) {
|
||||
throw new("No XML Data fonund!");
|
||||
}
|
||||
xmls = FileOperations.GetXMLs(DEST_PATH, null, null);
|
||||
|
||||
// Checks are done, we begin here
|
||||
// State
|
||||
List<(string, XDocument, bool)> Documents = new();
|
||||
Dictionary<string, string> OldNewIndex = new();
|
||||
Dictionary<string, List<XElement>> Intlinks = new();
|
||||
Dictionary<string, List<XElement>> Marginals = new();
|
||||
Dictionary<string, List<XElement>> LetterTexts = new();
|
||||
Dictionary<string, List<XElement>> LetterTraditions = new();
|
||||
Dictionary<string, List<XElement>> LetterDescs = new();
|
||||
|
||||
List<XElement> Autopsic = new();
|
||||
|
||||
foreach (var f in xmls) {
|
||||
XmlReaderSettings set = new XmlReaderSettings();
|
||||
set.IgnoreWhitespace = false;
|
||||
set.CheckCharacters = false;
|
||||
using (FileStream fs = File.Open(f, FileMode.Open)) {
|
||||
using (var r = new XmlTextReader(fs) { Normalization = false, WhitespaceHandling = WhitespaceHandling.All, EntityHandling = EntityHandling.ExpandCharEntities}) {
|
||||
|
||||
var d = XDocument.Load(r);
|
||||
var affected = false;
|
||||
|
||||
var intlinks = d.Descendants("intlink");
|
||||
if (intlinks != null && intlinks.Any()) {
|
||||
foreach (var e in intlinks) {
|
||||
if (e.HasAttributes && e.Attribute("letter") != null) {
|
||||
int letter = -1;
|
||||
if (Int32.TryParse(e.Attribute("letter").Value, out letter) && letter > 368) {
|
||||
if (!Intlinks.ContainsKey(e.Attribute("letter").Value)) Intlinks.Add(e.Attribute("letter").Value, new());
|
||||
Intlinks[e.Attribute("letter").Value].Add(e);
|
||||
Console.WriteLine(e.ToString());
|
||||
affected = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var marginals = d.Descendants("marginal");
|
||||
if (marginals != null && marginals.Any()) {
|
||||
foreach (var e in marginals) {
|
||||
if (e.HasAttributes && e.Attribute("letter") != null) {
|
||||
if (!Marginals.ContainsKey(e.Attribute("letter").Value)) Marginals.Add(e.Attribute("letter").Value, new());
|
||||
Marginals[e.Attribute("letter").Value].Add(e);
|
||||
affected = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var lettertexts = d.Descendants("letterText");
|
||||
if (lettertexts != null && lettertexts.Any()) {
|
||||
foreach (var e in lettertexts) {
|
||||
if (e.HasAttributes && e.Attribute("index") != null) {
|
||||
if (!LetterTexts.ContainsKey(e.Attribute("index").Value)) LetterTexts.Add(e.Attribute("index").Value, new());
|
||||
LetterTexts[e.Attribute("index").Value].Add(e);
|
||||
affected = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var lettertraditions = d.Descendants("letterTradition");
|
||||
if (lettertraditions != null && lettertraditions.Any()) {
|
||||
foreach (var e in lettertraditions) {
|
||||
if (e.HasAttributes && e.Attribute("ref") != null) {
|
||||
if (!LetterTraditions.ContainsKey(e.Attribute("ref").Value)) LetterTraditions.Add(e.Attribute("ref").Value, new());
|
||||
LetterTraditions[e.Attribute("ref").Value].Add(e);
|
||||
affected = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var letterdescs = d.Descendants("letterDesc");
|
||||
if (letterdescs != null && letterdescs.Any()) {
|
||||
foreach (var e in letterdescs) {
|
||||
if (e.HasAttributes && e.Attribute("ref") != null) {
|
||||
if (!LetterDescs.ContainsKey(e.Attribute("ref").Value)) LetterDescs.Add(e.Attribute("ref").Value, new());
|
||||
LetterDescs[e.Attribute("ref").Value].Add(e);
|
||||
|
||||
if (e.Element("autopsic") != null && e.Element("autopsic").HasAttributes && e.Element("autopsic").Attribute("value") != null) {
|
||||
OldNewIndex.Add(e.Attribute("ref").Value, e.Element("autopsic").Attribute("value").Value);
|
||||
Autopsic.Add(e.Element("autopsic"));
|
||||
affected = true;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Documents.Add((f, d, affected));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
List<Dictionary<string, List<XElement>>> Collections = new() { Intlinks, Marginals, LetterTexts, LetterTraditions, LetterDescs };
|
||||
foreach (var number in OldNewIndex) {
|
||||
if (number.Key == number.Value) continue;
|
||||
foreach (var c in Collections) {
|
||||
if (c != null && c.ContainsKey(number.Key)) {
|
||||
foreach (var v in c[number.Key]) {
|
||||
if (v.HasAttributes && v.Attribute("letter") != null) {
|
||||
v.Attribute("letter").Value = number.Value;
|
||||
} else if (v.HasAttributes && v.Attribute("ref") != null) {
|
||||
v.Attribute("ref").Value = number.Value;
|
||||
} else if (v.HasAttributes && v.Attribute("index") != null) {
|
||||
v.Attribute("index").Value = number.Value;
|
||||
}
|
||||
// NOT POSSIBLE:
|
||||
// if (v.HasAttributes && v.Attribute("autopsic") != null) {
|
||||
// v.Attribute("autopsic").Remove();
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
foreach (var d in Documents) {
|
||||
//if (d.Item3) SaveFile(d.Item2, DEST_PATH, d.Item1);
|
||||
}
|
||||
|
||||
void SaveFile(XDocument element, string basefilepath, string oldfile) {
|
||||
if (!Directory.Exists(basefilepath)) {
|
||||
Directory.CreateDirectory(basefilepath);
|
||||
}
|
||||
var filenameold = oldfile.Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).LastOrDefault();
|
||||
if (filenameold == null) return;
|
||||
var filename = oldfile;
|
||||
var path = Path.Combine(basefilepath, filename);
|
||||
|
||||
if (!Directory.Exists(basefilepath))
|
||||
Directory.CreateDirectory(basefilepath);
|
||||
File.WriteAllText(path, element.ToString());
|
||||
// XmlWriterSettings set = new XmlWriterSettings() {
|
||||
// CheckCharacters = false
|
||||
// };
|
||||
// using (XmlTextWriter wr = new XmlTextWriter(path, System.Text.Encoding.UTF8) { Formatting = System.Xml.Formatting.None }) {
|
||||
// element.Save(wr);
|
||||
// }
|
||||
// using (var targetStream = System.IO.File.Create(path)) {
|
||||
// element.Save(targetStream, SaveOptions.DisableFormatting);
|
||||
//}
|
||||
}
|
||||
// Sets only whitespace entities
|
||||
CharacterEntityReferences.Replace(xmls, cp.Where(x => x != null && String.IsNullOrWhiteSpace(x)));
|
||||
Reference in New Issue
Block a user