diff --git a/Transformation-2023-9-15/AutopsicNumberTransform.cs b/Transformation-2023-9-15/AutopsicNumberTransform.cs
new file mode 100644
index 0000000..3806062
--- /dev/null
+++ b/Transformation-2023-9-15/AutopsicNumberTransform.cs
@@ -0,0 +1,116 @@
+using System.Xml;
+using System.Xml.Linq;
+
+/// <summary>
+/// Collects the elements that refer to a letter by number and renumbers those references
+/// with the <c>autopsic</c> value found in the matching <c>letterDesc</c>.
+/// </summary>
+public static class AutopsicNumberTransform {
+    // Only intlinks whose numeric letter attribute exceeds this value are collected.
+    const int IntlinkLetterThreshold = 368;
+
+    // State
+    // Every loaded file as (path, parsed document, affected flag).
+    public static List<(string, XDocument, bool)> Documents = new();
+    // letterDesc ref -> value attribute of its autopsic child.
+    static Dictionary<string, string> OldNewIndex = new();
+    // Collected elements, keyed by the attribute value they carried when they were read.
+    static Dictionary<string, List<XElement>> Intlinks = new();
+    public static Dictionary<string, List<XElement>> Marginals = new();
+    static Dictionary<string, List<XElement>> LetterTexts = new();
+    static Dictionary<string, List<XElement>> LetterTraditions = new();
+    static Dictionary<string, List<XElement>> LetterDescs = new();
+
+    /// <summary>
+    /// Loads each file with whitespace preserved and files its intlink, marginal, letterText,
+    /// letterTradition and letterDesc elements under the value of their key attribute.
+    /// </summary>
+    /// <param name="xmls">Paths of the XML files to read.</param>
+    /// <exception cref="ArgumentException">Two letterDesc elements with an autopsic value share a ref.</exception>
+    public static void Collect(string[] xmls) {
+        foreach (var f in xmls) {
+            // OpenRead asks for read access only, so read-only files load too.
+            using var fs = File.OpenRead(f);
+            var d = XDocument.Load(fs, LoadOptions.PreserveWhitespace);
+            var affected = false;
+
+            foreach (var e in d.Descendants("intlink")) {
+                var letter = e.Attribute("letter")?.Value;
+                if (letter != null && Int32.TryParse(letter, out var number) && number > IntlinkLetterThreshold) {
+                    Register(Intlinks, letter, e);
+                    Console.WriteLine("intlink: " + e.ToString() + ", document: " + f);
+                    affected = true;
+                }
+            }
+
+            affected |= RegisterAll(d, "marginal", "letter", Marginals);
+            affected |= RegisterAll(d, "letterText", "index", LetterTexts);
+            affected |= RegisterAll(d, "letterTradition", "ref", LetterTraditions);
+
+            foreach (var e in d.Descendants("letterDesc")) {
+                var reference = e.Attribute("ref")?.Value;
+                if (reference == null) continue;
+                Register(LetterDescs, reference, e);
+
+                // A letterDesc without an autopsic value is filed but does not mark the document.
+                var autopsic = e.Element("autopsic")?.Attribute("value")?.Value;
+                if (autopsic != null) {
+                    OldNewIndex.Add(reference, autopsic);
+                    affected = true;
+                }
+            }
+
+            Documents.Add((f, d, affected));
+        }
+    }
+
+    /// <summary>
+    /// Rewrites every collected element whose key has an autopsic replacement: an existing
+    /// <c>letter</c> attribute gets the new number; otherwise <c>ref</c> (together with an
+    /// <c>autopsic</c> child) or <c>index</c> is replaced by a new <c>letter</c> attribute.
+    /// </summary>
+    public static void Transform() {
+        List<Dictionary<string, List<XElement>>> collections = new() { Intlinks, Marginals, LetterTexts, LetterTraditions, LetterDescs };
+        foreach (var (oldNumber, newNumber) in OldNewIndex) {
+            // WriteLine keeps each mapping on its own line of the log.
+            Console.WriteLine(oldNumber + " -> " + newNumber);
+            foreach (var c in collections) {
+                if (!c.TryGetValue(oldNumber, out var elements)) continue;
+                foreach (var v in elements) {
+                    if (v.Attribute("letter") is XAttribute letter) {
+                        letter.Value = newNumber;
+                    } else if (v.Attribute("ref") is XAttribute reference) {
+                        v.Add(new XAttribute("letter", newNumber));
+                        reference.Remove();
+                        v.Element("autopsic")?.Remove();
+                    } else if (v.Attribute("index") is XAttribute index) {
+                        v.Add(new XAttribute("letter", newNumber));
+                        index.Remove();
+                    }
+                    v.Attribute("autopsic")?.Remove();
+                }
+            }
+        }
+    }
+
+    // Files every element called `name` that carries `attribute` under that attribute's value; true if any was filed.
+    static bool RegisterAll(XDocument d, string name, string attribute, Dictionary<string, List<XElement>> target) {
+        var any = false;
+        foreach (var e in d.Descendants(name)) {
+            var key = e.Attribute(attribute)?.Value;
+            if (key == null) continue;
+            Register(target, key, e);
+            any = true;
+        }
+        return any;
+    }
+
+    // Appends the element to the list stored under key, creating the list on first use.
+    static void Register(Dictionary<string, List<XElement>> target, string key, XElement e) {
+        if (!target.TryGetValue(key, out var list)) {
+            list = new();
+            target.Add(key, list);
+        }
+        list.Add(e);
+    }
+}
\ No newline at end of file
diff --git a/Transformation-2023-9-15/CharacterEntitiesReferences.cs b/Transformation-2023-9-15/CharacterEntitiesReferences.cs
new file mode 100644
index 0000000..15d99f6
--- /dev/null
+++ b/Transformation-2023-9-15/CharacterEntitiesReferences.cs
@@ -0,0 +1,88 @@
+using System.Text;
+using System.Text.RegularExpressions;
+
+/// <summary>
+/// Finds the characters a set of files writes as numeric character references and turns
+/// literal occurrences of chosen characters back into hexadecimal references.
+/// </summary>
+public static class CharacterEntityReferences {
+    // Up to six hex / seven decimal digits, enough for every code point up to U+10FFFF.
+    static readonly Regex HexReference = new(@"&#x([0-9a-fA-F]{1,6});");
+    static readonly Regex DecimalReference = new(@"&#([0-9]{1,7});");
+
+    // Never reported by GetCodePoints: markup characters, and the whitespace XML itself uses
+    // between tags and attributes. Replacing their literal form file-wide would break the markup.
+    static readonly HashSet<string> XMLForbidden = new() { "<", "&", ">", "'", "\"", " ", "\t", "\n", "\r" };
+
+    /// <summary>
+    /// Rewrites each file so that every literal occurrence of one of the given strings becomes
+    /// a sequence of hexadecimal character references.
+    /// </summary>
+    /// <param name="files">Paths of the files to rewrite in place.</param>
+    /// <param name="codepoints">Literal strings to replace; null and empty entries are ignored.</param>
+    public static void Replace(IEnumerable<string> files, IEnumerable<string> codepoints) {
+        // Build the table once: the caller may pass a lazy query, and it is the same for every file.
+        var references = new Dictionary<string, string>(StringComparer.Ordinal);
+        foreach (var s in codepoints) {
+            if (!String.IsNullOrEmpty(s)) references[s] = ConvertStringToCodepoint(s);
+        }
+        // A single pass never rescans text it has just inserted, so a literal such as "#" or ";"
+        // cannot corrupt a reference produced for another literal.
+        var pattern = references.Count == 0
+            ? null
+            : new Regex(String.Join("|", references.Keys.OrderByDescending(k => k.Length).Select(Regex.Escape)), RegexOptions.CultureInvariant);
+
+        foreach (var f in files) {
+            Console.WriteLine("Replacing file " + f);
+            var t = File.ReadAllText(f);
+            if (pattern != null) t = pattern.Replace(t, m => references[m.Value]);
+            File.WriteAllText(f, t);
+        }
+    }
+
+    // Spells every code point of the input as a hexadecimal reference, e.g. U+2009 as "&#x2009;".
+    static string ConvertStringToCodepoint(string input) {
+        var sb = new StringBuilder();
+        for (int i = 0; i < input.Length; i += char.IsSurrogatePair(input, i) ? 2 : 1) {
+            int codepoint = char.ConvertToUtf32(input, i);
+            sb.Append("&#x").Append($"{codepoint:X}").Append(';');
+        }
+        return sb.ToString();
+    }
+
+    /// <summary>
+    /// Scans the files for hexadecimal and decimal character references and returns the
+    /// characters they stand for, leaving out the ones in <c>XMLForbidden</c>.
+    /// </summary>
+    /// <param name="files">Paths of the files to scan.</param>
+    /// <returns>The distinct referenced characters, each as a string of one code point.</returns>
+    public static HashSet<string> GetCodePoints(string[] files) {
+        HashSet<string> res = new();
+        foreach (var f in files) {
+            var t = File.ReadAllText(f);
+
+            foreach (Match m in HexReference.Matches(t)) {
+                if (TryDecode(m.Groups[1].Value, 16, out var c) && !XMLForbidden.Contains(c)) res.Add(c);
+            }
+
+            foreach (Match m in DecimalReference.Matches(t)) {
+                if (!TryDecode(m.Groups[1].Value, 10, out var c)) continue;
+                Console.WriteLine(m.Value + " " + c);
+                if (!XMLForbidden.Contains(c)) res.Add(c);
+            }
+        }
+        return res;
+    }
+
+    // Converts the digits of a reference to its character. False for surrogates and values above
+    // U+10FFFF, which char.ConvertFromUtf32 rejects with an exception.
+    static bool TryDecode(string digits, int fromBase, out string character) {
+        int value = Convert.ToInt32(digits, fromBase);
+        if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) {
+            character = String.Empty;
+            return false;
+        }
+        character = char.ConvertFromUtf32(value);
+        return true;
+    }
+}
\ No newline at end of file
diff --git a/Transformation-2023-9-15/FileOperations.cs b/Transformation-2023-9-15/FileOperations.cs
new file mode 100644
index 0000000..ad999e6
--- /dev/null
+++ b/Transformation-2023-9-15/FileOperations.cs
@@ -0,0 +1,60 @@
+using System.Xml;
+using System.Xml.Linq;
+
+/// <summary>Reading the list of source XML files and writing transformed documents.</summary>
+public static class FileOperations {
+    /// <summary>
+    /// Writes every document flagged as affected into <paramref name="dest"/>, under the file
+    /// name it was loaded from. The directory is created when the first document is written.
+    /// </summary>
+    /// <param name="Documents">Tuples of (source path, document, affected flag).</param>
+    /// <param name="dest">Directory that receives the files.</param>
+    public static void SaveFile(List<(string, XDocument, bool)> Documents, string dest) {
+        XmlWriterSettings set = new XmlWriterSettings() {
+            CheckCharacters = false,
+            Encoding = System.Text.Encoding.UTF8,
+        };
+        foreach (var (source, document, affected) in Documents) {
+            if (!affected) continue;
+            // GetFileName honours both '/' and '\' on Windows, so a path joined with a backslash
+            // still yields the bare file name.
+            var filename = Path.GetFileName(source);
+            // Skip a path without a file name instead of abandoning the remaining documents.
+            if (String.IsNullOrEmpty(filename)) continue;
+            Directory.CreateDirectory(dest);
+            using XmlWriter writer = XmlWriter.Create(Path.Combine(dest, filename), set);
+            document.Save(writer);
+        }
+    }
+
+    /// <summary>
+    /// Returns the <c>*.xml</c> files directly inside <paramref name="path"/>. When a branch name
+    /// is given, first verifies that the repository's HEAD points at that branch (to prevent data errors).
+    /// </summary>
+    /// <param name="path">Directory to list.</param>
+    /// <param name="git_path">Repository root containing <c>.git</c>; only read when <paramref name="branch_name"/> is set.</param>
+    /// <param name="branch_name">Branch that must be checked out, or null to skip the check.</param>
+    /// <returns>The matching file paths; never empty.</returns>
+    /// <exception cref="FileNotFoundException">HEAD is missing, or the directory holds no XML file.</exception>
+    /// <exception cref="InvalidOperationException">HEAD does not point at the requested branch.</exception>
+    /// <exception cref="DirectoryNotFoundException"><paramref name="path"/> does not exist.</exception>
+    public static string[] GetXMLs(string path, string? git_path, string? branch_name) {
+        if (branch_name != null) {
+            // Path.Combine works whether or not git_path ends with a separator.
+            var head = Path.Combine(git_path ?? String.Empty, ".git", "HEAD");
+            if (!File.Exists(head)) throw new FileNotFoundException("Specified Paths do not exist");
+            // On a branch HEAD reads "ref: refs/heads/<name>". Comparing the whole reference keeps
+            // a branch whose name merely ends with the requested one from passing.
+            var text = File.ReadAllText(head).Trim();
+            if (!String.Equals(text, "ref: refs/heads/" + branch_name, StringComparison.Ordinal))
+                throw new InvalidOperationException("Not it the Branch " + branch_name);
+        }
+
+        if (!Directory.Exists(path)) throw new DirectoryNotFoundException("Directory does not exist!");
+
+        var xmls = Directory.GetFiles(path, "*.xml");
+        if (xmls.Length == 0) throw new FileNotFoundException("No XML Data fonund!");
+
+        return xmls;
+    }
+}
\ No newline at end of file
diff --git a/Transformation-2023-9-15/MarginalsTransform.cs b/Transformation-2023-9-15/MarginalsTransform.cs
new file mode 100644
index 0000000..5395515
--- /dev/null
+++ b/Transformation-2023-9-15/MarginalsTransform.cs
@@ -0,0 +1,42 @@
+using System.Xml.Linq;
+
+/// <summary>Replaces the <c>index</c> attribute of marginals by a <c>sort</c> position where one is needed.</summary>
+public static class MarginalsTransform {
+    // The marginals to process; set by the caller before Transform runs.
+    public static List<XElement> Marginals = new();
+
+    /// <summary>
+    /// Groups the marginals that carry letter, page and line by those three values. A marginal
+    /// alone in its group just loses <c>index</c>. In a larger group the marginals with a numeric
+    /// <c>index</c> are ordered by it and get <c>sort</c> = 1, 2, ... in place of <c>index</c>;
+    /// members without a numeric <c>index</c> are left as they are.
+    /// </summary>
+    public static void Transform() {
+        var groups = Marginals
+            .Where(x => x.Attribute("line") != null && x.Attribute("page") != null && x.Attribute("letter") != null)
+            // A tuple key cannot collide the way "1-2" + "-" + "3" and "1" + "-" + "2-3" do.
+            .GroupBy(x => (x.Attribute("letter")!.Value, x.Attribute("page")!.Value, x.Attribute("line")!.Value));
+
+        foreach (var g in groups) {
+            var members = g.ToList();
+            if (members.Count == 1) {
+                members[0].Attribute("index")?.Remove();
+                continue;
+            }
+
+            // Parse each index once; TryParse returns false for a missing (null) value.
+            var numbered = new List<(int Index, XElement Element)>();
+            foreach (var e in members) {
+                if (Int32.TryParse(e.Attribute("index")?.Value, out var n)) numbered.Add((n, e));
+            }
+
+            var i = 1;
+            // OrderBy is stable, so equal indices keep their collection order.
+            foreach (var (_, e) in numbered.OrderBy(p => p.Index)) {
+                e.Attribute("index")!.Remove();
+                e.Add(new XAttribute("sort", i.ToString()));
+                i++;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/Transformation-2023-9-15/Program.cs b/Transformation-2023-9-15/Program.cs index 776ee0f..dbff364 100644 --- a/Transformation-2023-9-15/Program.cs +++ b/Transformation-2023-9-15/Program.cs @@ -1,5 +1,7 @@ using System.IO; using System.Security; +using System.Text; +using System.Text.RegularExpressions; using System.Xml; using System.Xml.Linq; // See https://aka.ms/new-console-template for more information @@ -8,158 +10,16 @@ const string DEST_PATH = "C:/Users/simon/source/hamann-xml/transformations_2023- const string GIT_PATH = "C:/Users/simon/source/hamann-xml/"; const string BRANCH_NAME = "testdata"; -if (File.Exists(GIT_PATH + ".git/HEAD") || !Directory.Exists(XML_PATH)) { - var text = File.ReadAllText(GIT_PATH + ".git/HEAD").Trim(); - if (!text.EndsWith(BRANCH_NAME)) { - throw new("Not it the Branch " + BRANCH_NAME); - } -} else { - throw new("Specified Paths do not exist"); -} +var xmls = FileOperations.GetXMLs(XML_PATH, GIT_PATH, BRANCH_NAME); +var cp = CharacterEntityReferences.GetCodePoints(xmls); -var xmls = Directory.GetFiles(XML_PATH, "*.xml"); +AutopsicNumberTransform.Collect(xmls); +AutopsicNumberTransform.Transform(); +MarginalsTransform.Marginals = AutopsicNumberTransform.Marginals.SelectMany(x => x.Value).ToList(); +MarginalsTransform.Transform(); +FileOperations.SaveFile(AutopsicNumberTransform.Documents, DEST_PATH); -if (xmls == null || !xmls.Any()) { - throw new("No XML Data fonund!"); -} +xmls = FileOperations.GetXMLs(DEST_PATH, null, null); -// Checks are done, we begin here -// State -List<(string, XDocument, bool)> Documents = new(); -Dictionary OldNewIndex = new(); -Dictionary> Intlinks = new(); -Dictionary> Marginals = new(); -Dictionary> LetterTexts = new(); -Dictionary> LetterTraditions = new(); -Dictionary> LetterDescs = new(); -
-List Autopsic = new(); - -foreach (var f in xmls) { - XmlReaderSettings set = new XmlReaderSettings(); - set.IgnoreWhitespace = false; - set.CheckCharacters = false; - using (FileStream fs = File.Open(f, FileMode.Open)) { - using (var r = new XmlTextReader(fs) { Normalization = false, WhitespaceHandling = WhitespaceHandling.All, EntityHandling = EntityHandling.ExpandCharEntities}) { - - var d = XDocument.Load(r); - var affected = false; - - var intlinks = d.Descendants("intlink"); - if (intlinks != null && intlinks.Any()) { - foreach (var e in intlinks) { - if (e.HasAttributes && e.Attribute("letter") != null) { - int letter = -1; - if (Int32.TryParse(e.Attribute("letter").Value, out letter) && letter > 368) { - if (!Intlinks.ContainsKey(e.Attribute("letter").Value)) Intlinks.Add(e.Attribute("letter").Value, new()); - Intlinks[e.Attribute("letter").Value].Add(e); - Console.WriteLine(e.ToString()); - affected = true; - } - } - } - } - - var marginals = d.Descendants("marginal"); - if (marginals != null && marginals.Any()) { - foreach (var e in marginals) { - if (e.HasAttributes && e.Attribute("letter") != null) { - if (!Marginals.ContainsKey(e.Attribute("letter").Value)) Marginals.Add(e.Attribute("letter").Value, new()); - Marginals[e.Attribute("letter").Value].Add(e); - affected = true; - } - } - } - - var lettertexts = d.Descendants("letterText"); - if (lettertexts != null && lettertexts.Any()) { - foreach (var e in lettertexts) { - if (e.HasAttributes && e.Attribute("index") != null) { - if (!LetterTexts.ContainsKey(e.Attribute("index").Value)) LetterTexts.Add(e.Attribute("index").Value, new()); - LetterTexts[e.Attribute("index").Value].Add(e); - affected = true; - } - } - } - - var lettertraditions = d.Descendants("letterTradition"); - if (lettertraditions != null && lettertraditions.Any()) { - foreach (var e in lettertraditions) { - if (e.HasAttributes && e.Attribute("ref") != null) { - if (!LetterTraditions.ContainsKey(e.Attribute("ref").Value)) 
LetterTraditions.Add(e.Attribute("ref").Value, new()); - LetterTraditions[e.Attribute("ref").Value].Add(e); - affected = true; - } - } - } - - var letterdescs = d.Descendants("letterDesc"); - if (letterdescs != null && letterdescs.Any()) { - foreach (var e in letterdescs) { - if (e.HasAttributes && e.Attribute("ref") != null) { - if (!LetterDescs.ContainsKey(e.Attribute("ref").Value)) LetterDescs.Add(e.Attribute("ref").Value, new()); - LetterDescs[e.Attribute("ref").Value].Add(e); - - if (e.Element("autopsic") != null && e.Element("autopsic").HasAttributes && e.Element("autopsic").Attribute("value") != null) { - OldNewIndex.Add(e.Attribute("ref").Value, e.Element("autopsic").Attribute("value").Value); - Autopsic.Add(e.Element("autopsic")); - affected = true; - } - - } - } - } - - Documents.Add((f, d, affected)); - } - } -} - -List>> Collections = new() { Intlinks, Marginals, LetterTexts, LetterTraditions, LetterDescs }; -foreach (var number in OldNewIndex) { - if (number.Key == number.Value) continue; - foreach (var c in Collections) { - if (c != null && c.ContainsKey(number.Key)) { - foreach (var v in c[number.Key]) { - if (v.HasAttributes && v.Attribute("letter") != null) { - v.Attribute("letter").Value = number.Value; - } else if (v.HasAttributes && v.Attribute("ref") != null) { - v.Attribute("ref").Value = number.Value; - } else if (v.HasAttributes && v.Attribute("index") != null) { - v.Attribute("index").Value = number.Value; - } - // NOT POSSIBLE: - // if (v.HasAttributes && v.Attribute("autopsic") != null) { - // v.Attribute("autopsic").Remove(); - // } - } - } - } -} - -foreach (var d in Documents) { - //if (d.Item3) SaveFile(d.Item2, DEST_PATH, d.Item1); -} - -void SaveFile(XDocument element, string basefilepath, string oldfile) { - if (!Directory.Exists(basefilepath)) { - Directory.CreateDirectory(basefilepath); - } - var filenameold = oldfile.Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).LastOrDefault(); - if 
(filenameold == null) return; - var filename = oldfile; - var path = Path.Combine(basefilepath, filename); - - if (!Directory.Exists(basefilepath)) - Directory.CreateDirectory(basefilepath); - File.WriteAllText(path, element.ToString()); - // XmlWriterSettings set = new XmlWriterSettings() { - // CheckCharacters = false - // }; - // using (XmlTextWriter wr = new XmlTextWriter(path, System.Text.Encoding.UTF8) { Formatting = System.Xml.Formatting.None }) { - // element.Save(wr); - // } - // using (var targetStream = System.IO.File.Create(path)) { - // element.Save(targetStream, SaveOptions.DisableFormatting); - //} -} \ No newline at end of file +// Sets only whitespace entities +CharacterEntityReferences.Replace(xmls, cp.Where(x => x != null && String.IsNullOrWhiteSpace(x))); \ No newline at end of file