Added Transformation Script

This commit is contained in:
Simon Martens
2023-09-15 20:46:57 +02:00
parent 89b55be98d
commit f054c8913d
5 changed files with 285 additions and 152 deletions

View File

@@ -0,0 +1,125 @@
using System.Xml;
using System.Xml.Linq;
/// <summary>
/// Collects every element that refers to a letter by number and rewrites those
/// references to the number stored in the <c>value</c> attribute of the
/// <c>autopsic</c> child of the matching <c>letterDesc</c> element.
/// </summary>
/// <remarks>
/// All state is static and accumulates across calls to <see cref="Collect"/>.
/// Elements are indexed by their ORIGINAL number, so <see cref="Transform"/> never
/// renumbers an element twice, even if a new number equals another old number.
/// </remarks>
public static class AutopsicNumberTransform {
    // Only intlinks whose numeric "letter" attribute is greater than this value are collected.
    // NOTE(review): the reason for the 368 cut-off is not visible in this file -- confirm.
    const int IntlinkLetterThreshold = 368;

    // State
    /// <summary>
    /// One entry per loaded file: (file path, parsed document, whether anything that
    /// may be rewritten was collected from it).
    /// </summary>
    public static List<(string, XDocument, bool)> Documents = new();
    // Old number (letterDesc "ref") -> new number (autopsic "value").
    static Dictionary<string, string> OldNewIndex = new();
    // <intlink> elements keyed by their "letter" attribute.
    static Dictionary<string, List<XElement>> Intlinks = new();
    /// <summary><c>marginal</c> elements keyed by their <c>letter</c> attribute.</summary>
    public static Dictionary<string, List<XElement>> Marginals = new();
    // <letterText> elements keyed by their "index" attribute.
    static Dictionary<string, List<XElement>> LetterTexts = new();
    // <letterTradition> elements keyed by their "ref" attribute.
    static Dictionary<string, List<XElement>> LetterTraditions = new();
    // <letterDesc> elements keyed by their "ref" attribute.
    static Dictionary<string, List<XElement>> LetterDescs = new();

    /// <summary>
    /// Loads every file in <paramref name="xmls"/> (whitespace preserved), indexes the
    /// elements that carry a letter number and records the old -> new number mapping.
    /// </summary>
    /// <param name="xmls">Paths of the XML files to load.</param>
    /// <exception cref="ArgumentException">
    /// Two <c>letterDesc</c> elements with an <c>autopsic</c> value share the same <c>ref</c>.
    /// </exception>
    public static void Collect(string[] xmls) {
        foreach (var f in xmls) {
            XDocument d;
            // OpenRead asks for read access only and allows other readers; the files are
            // never written through this handle, so read-only files must not fail here.
            using (FileStream fs = File.OpenRead(f)) {
                d = XDocument.Load(fs, LoadOptions.PreserveWhitespace);
            }

            var affected = false;

            foreach (var e in d.Descendants("intlink")) {
                var key = e.Attribute("letter")?.Value;
                if (key != null && Int32.TryParse(key, out var letter) && letter > IntlinkLetterThreshold) {
                    Register(Intlinks, key, e);
                    Console.WriteLine("intlink: " + e.ToString() + ", document: " + f);
                    affected = true;
                }
            }

            if (CollectByAttribute(d, "marginal", "letter", Marginals)) {
                affected = true;
            }
            if (CollectByAttribute(d, "letterText", "index", LetterTexts)) {
                affected = true;
            }
            if (CollectByAttribute(d, "letterTradition", "ref", LetterTraditions)) {
                affected = true;
            }

            foreach (var e in d.Descendants("letterDesc")) {
                var reference = e.Attribute("ref")?.Value;
                if (reference == null) {
                    continue;
                }
                Register(LetterDescs, reference, e);
                var newNumber = e.Element("autopsic")?.Attribute("value")?.Value;
                if (newNumber != null) {
                    // Add (not the indexer): a ref mapped to two values must fail loudly.
                    OldNewIndex.Add(reference, newNumber);
                    affected = true;
                }
            }

            Documents.Add((f, d, affected));
        }
    }

    /// <summary>
    /// Applies the collected old -> new mapping to every indexed element.
    /// An existing <c>letter</c> attribute is overwritten; otherwise a <c>ref</c> or
    /// <c>index</c> attribute is replaced by a new <c>letter</c> attribute (and, for
    /// <c>ref</c>, the <c>autopsic</c> child element is removed). An <c>autopsic</c>
    /// attribute is removed in every case.
    /// </summary>
    public static void Transform() {
        Dictionary<string, List<XElement>>[] collections = { Intlinks, Marginals, LetterTexts, LetterTraditions, LetterDescs };
        foreach (var number in OldNewIndex) {
            // One mapping per line; Console.Write glued all mappings together.
            Console.WriteLine(number.Key + " -> " + number.Value);
            foreach (var c in collections) {
                if (!c.TryGetValue(number.Key, out var elements)) {
                    continue;
                }
                foreach (var v in elements) {
                    if (v.Attribute("letter") is XAttribute letter) {
                        letter.Value = number.Value;
                    } else if (v.Attribute("ref") is XAttribute reference) {
                        v.Add(new XAttribute("letter", number.Value));
                        reference.Remove();
                        v.Element("autopsic")?.Remove();
                    } else if (v.Attribute("index") is XAttribute index) {
                        v.Add(new XAttribute("letter", number.Value));
                        index.Remove();
                    }
                    v.Attribute("autopsic")?.Remove();
                }
            }
        }
    }

    // Indexes every <elementName> descendant that has <attributeName>; returns true if any was found.
    static bool CollectByAttribute(XDocument document, string elementName, string attributeName, Dictionary<string, List<XElement>> index) {
        var found = false;
        foreach (var element in document.Descendants(elementName)) {
            var key = element.Attribute(attributeName)?.Value;
            if (key == null) {
                continue;
            }
            Register(index, key, element);
            found = true;
        }
        return found;
    }

    // Appends element to the list stored under key, creating the list on first use.
    static void Register(Dictionary<string, List<XElement>> index, string key, XElement element) {
        if (!index.TryGetValue(key, out var bucket)) {
            bucket = new List<XElement>();
            index.Add(key, bucket);
        }
        bucket.Add(element);
    }
}

View File

@@ -0,0 +1,63 @@
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// Finds numeric character references (<c>&amp;#x...;</c> / <c>&amp;#...;</c>) in text files
/// and can turn the corresponding literal characters back into hexadecimal references.
/// </summary>
public static class CharacterEntityReferences {
    // Up to 6 hex / 7 decimal digits covers the full Unicode range (max U+10FFFF = 1114111).
    // The previous 4-digit limit silently ignored references such as &#12288; or &#x1F600;.
    static readonly Regex HexReference = new Regex(@"&#x([0-9a-fA-F]{1,6});");
    static readonly Regex DecimalReference = new Regex(@"&#([0-9]{1,7});");

    // Characters that are never reported by GetCodePoints.
    static readonly HashSet<string> XMLForbidden = new HashSet<string>() {
        "<",
        "&",
        ">",
        "'",
        "\""
    };

    /// <summary>
    /// Rewrites each file in place, replacing every occurrence of each given string with
    /// its <c>&amp;#xHEX;</c> form. Replacements are applied in the order given.
    /// </summary>
    /// <param name="files">Paths of the files to rewrite.</param>
    /// <param name="codepoints">Literal strings to encode; null and empty entries are skipped.</param>
    public static void Replace(IEnumerable<string> files, IEnumerable<string?> codepoints) {
        // Materialise once: the sequence may be a lazy query and the encoded form
        // does not depend on the file being processed.
        var replacements = new List<(string Literal, string Reference)>();
        foreach (var s in codepoints) {
            // string.Replace throws on an empty search string, so skip it along with null.
            if (!string.IsNullOrEmpty(s)) {
                replacements.Add((s, ConvertStringToCodepoint(s)));
            }
        }

        foreach (var f in files) {
            Console.WriteLine("Replacing file " + f);
            var t = File.ReadAllText(f);
            foreach (var (literal, reference) in replacements) {
                t = t.Replace(literal, reference);
            }
            File.WriteAllText(f, t);
        }
    }

    // Encodes every code point of input as an upper-case hexadecimal character reference.
    static string ConvertStringToCodepoint(string input) {
        var sb = new StringBuilder();
        for (int i = 0; i < input.Length; i += char.IsSurrogatePair(input, i) ? 2 : 1) {
            int codepoint = char.ConvertToUtf32(input, i);
            sb.Append("&#x").Append($"{codepoint:X}").Append(';');
        }
        return sb.ToString();
    }

    /// <summary>
    /// Returns the distinct characters that occur as numeric character references in the
    /// given files, excluding <c>&lt; &amp; &gt; ' "</c>. References that do not denote a
    /// valid Unicode scalar value are reported on the console and skipped.
    /// </summary>
    /// <param name="files">Paths of the files to scan.</param>
    /// <returns>The decoded characters, each as a string (one or two UTF-16 units).</returns>
    public static HashSet<string?> GetCodePoints(string[] files) {
        HashSet<string?> res = new();
        foreach (var f in files) {
            var t = File.ReadAllText(f);
            foreach (Match mat in HexReference.Matches(t)) {
                if (TryDecode(mat, 16, out var c) && !XMLForbidden.Contains(c)) {
                    res.Add(c);
                }
            }
            foreach (Match mat in DecimalReference.Matches(t)) {
                if (!TryDecode(mat, 10, out var c)) {
                    continue;
                }
                Console.WriteLine(mat.Value + " " + c);
                if (!XMLForbidden.Contains(c)) {
                    res.Add(c);
                }
            }
        }
        return res;
    }

    // Decodes the digits captured in group 1; false for surrogates and values above U+10FFFF,
    // for which char.ConvertFromUtf32 would throw.
    static bool TryDecode(Match reference, int numberBase, out string character) {
        int value = Convert.ToInt32(reference.Groups[1].Value, numberBase);
        if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) {
            Console.WriteLine("Skipping invalid character reference " + reference.Value);
            character = string.Empty;
            return false;
        }
        character = char.ConvertFromUtf32(value);
        return true;
    }
}

View File

@@ -0,0 +1,47 @@
using System.Xml;
using System.Xml.Linq;
/// <summary>
/// File-system helpers: locating the XML input files and writing transformed documents.
/// </summary>
public static class FileOperations {
    /// <summary>
    /// Writes every document whose flag (third tuple item) is true into
    /// <paramref name="dest"/>, using the file name of its original path (first tuple item).
    /// The directory is created when the first flagged document is encountered.
    /// </summary>
    /// <param name="Documents">(original path, document, save flag) triples.</param>
    /// <param name="dest">Target directory.</param>
    public static void SaveFile(List<(string, XDocument, bool)> Documents, string dest) {
        XmlWriterSettings set = new XmlWriterSettings() {
            CheckCharacters = false,
            Encoding = System.Text.Encoding.UTF8,
        };
        foreach (var (source, document, affected) in Documents) {
            if (!affected) {
                continue;
            }
            Directory.CreateDirectory(dest); // no-op when the directory already exists

            // Path.GetFileName understands every separator of the current platform;
            // splitting on '/' alone kept parent folders of '\'-separated paths.
            var filename = Path.GetFileName(source).Trim();
            if (filename.Length == 0) {
                // Skip only this entry; returning here would drop all remaining documents.
                continue;
            }

            using (XmlWriter writer = XmlWriter.Create(Path.Combine(dest, filename), set)) {
                document.Save(writer);
            }
        }
    }

    /// <summary>
    /// Returns the <c>*.xml</c> files directly inside <paramref name="path"/>. When
    /// <paramref name="branch_name"/> is given, first verifies (to prevent data errors)
    /// that <c>.git/HEAD</c> below <paramref name="git_path"/> points at that branch.
    /// </summary>
    /// <param name="path">Directory containing the XML files.</param>
    /// <param name="git_path">Repository root; only used when a branch name is given.</param>
    /// <param name="branch_name">Expected branch, or null to skip the check.</param>
    /// <returns>Full paths of the XML files found; never empty.</returns>
    /// <exception cref="FileNotFoundException">HEAD file missing, or no XML file found.</exception>
    /// <exception cref="InvalidOperationException">HEAD does not point at the branch.</exception>
    /// <exception cref="DirectoryNotFoundException"><paramref name="path"/> does not exist.</exception>
    public static string[] GetXMLs(string path, string? git_path, string? branch_name) {
        // BASIC CHECKS
        if (branch_name != null) {
            // Path.Combine works with or without a trailing separator on git_path.
            var head = Path.Combine(git_path ?? string.Empty, ".git", "HEAD");
            if (!File.Exists(head)) {
                throw new FileNotFoundException("Specified Paths do not exist");
            }
            var text = File.ReadAllText(head).Trim();
            if (!PointsAtBranch(text, branch_name)) {
                throw new InvalidOperationException("Not it the Branch " + branch_name);
            }
        }
        if (!Directory.Exists(path)) {
            throw new DirectoryNotFoundException("Directory does not exist!");
        }
        var xmls = Directory.GetFiles(path, "*.xml");
        if (xmls.Length == 0) {
            throw new FileNotFoundException("No XML Data fonund!");
        }
        return xmls;
    }

    // True when head ends with branch at a ref boundary ('/' or ' ' before it, or the whole
    // text), so that "testdata" no longer matches a HEAD pointing at "mytestdata".
    static bool PointsAtBranch(string head, string branch) {
        if (branch.Length == 0) {
            return true; // an empty name matched everything before; keep that
        }
        if (!head.EndsWith(branch, StringComparison.Ordinal)) {
            return false;
        }
        var boundary = head.Length - branch.Length;
        return boundary == 0 || head[boundary - 1] == '/' || head[boundary - 1] == ' ';
    }
}

View File

@@ -0,0 +1,38 @@
using System.Xml.Linq;
/// <summary>
/// Replaces the <c>index</c> attribute of marginals by a <c>sort</c> attribute that is
/// only present where several marginals share the same letter, page and line.
/// </summary>
public static class MarginalsTransform {
    /// <summary>The marginal elements to process; assigned by the caller before <see cref="Transform"/>.</summary>
    public static List<XElement> Marginals = new();

    /// <summary>
    /// Groups the marginals that have <c>letter</c>, <c>page</c> and <c>line</c> attributes
    /// by those three values. In a group of one, <c>index</c> is removed. In a larger group,
    /// the members whose <c>index</c> is an integer are ordered by it, lose <c>index</c> and
    /// receive <c>sort</c> = 1, 2, 3...; other members of such a group are left untouched.
    /// </summary>
    public static void Transform() {
        if (Marginals is null) {
            return;
        }

        var groups = Marginals
            .Where(x =>
                x.Attribute("line") != null &&
                x.Attribute("page") != null &&
                x.Attribute("letter") != null)
            // Tuple key: a "-"-joined string made ("1-2","3",..) and ("1","2-3",..) collide.
            .GroupBy(x => (
                x.Attribute("letter")!.Value,
                x.Attribute("page")!.Value,
                x.Attribute("line")!.Value));

        foreach (var group in groups) {
            var members = group.ToList();
            if (members.Count == 1) {
                members[0].Attribute("index")?.Remove();
                continue;
            }

            // Parse each index once and keep the attribute for removal.
            var indexed = new List<(XElement Element, XAttribute Index, int Position)>();
            foreach (var e in members) {
                var index = e.Attribute("index");
                if (index != null && Int32.TryParse(index.Value, out var position)) {
                    indexed.Add((e, index, position));
                }
            }

            var sort = 1;
            // OrderBy is stable, so equal indexes keep their document order.
            foreach (var (element, index, _) in indexed.OrderBy(p => p.Position)) {
                index.Remove();
                // SetAttributeValue overwrites an existing "sort"; Add would throw on a duplicate.
                element.SetAttributeValue("sort", sort.ToString());
                sort++;
            }
        }
    }
}

View File

@@ -1,5 +1,7 @@
using System.IO; using System.IO;
using System.Security; using System.Security;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml; using System.Xml;
using System.Xml.Linq; using System.Xml.Linq;
// See https://aka.ms/new-console-template for more information // See https://aka.ms/new-console-template for more information
@@ -8,158 +10,16 @@ const string DEST_PATH = "C:/Users/simon/source/hamann-xml/transformations_2023-
const string GIT_PATH = "C:/Users/simon/source/hamann-xml/"; const string GIT_PATH = "C:/Users/simon/source/hamann-xml/";
const string BRANCH_NAME = "testdata"; const string BRANCH_NAME = "testdata";
if (File.Exists(GIT_PATH + ".git/HEAD") || !Directory.Exists(XML_PATH)) { var xmls = FileOperations.GetXMLs(XML_PATH, GIT_PATH, BRANCH_NAME);
var text = File.ReadAllText(GIT_PATH + ".git/HEAD").Trim(); var cp = CharacterEntityReferences.GetCodePoints(xmls);
if (!text.EndsWith(BRANCH_NAME)) {
throw new("Not it the Branch " + BRANCH_NAME);
}
} else {
throw new("Specified Paths do not exist");
}
var xmls = Directory.GetFiles(XML_PATH, "*.xml"); AutopsicNumberTransform.Collect(xmls);
AutopsicNumberTransform.Transform();
MarginalsTransform.Marginals = AutopsicNumberTransform.Marginals.SelectMany(x => x.Value).ToList();
MarginalsTransform.Transform();
FileOperations.SaveFile(AutopsicNumberTransform.Documents, DEST_PATH);
if (xmls == null || !xmls.Any()) { xmls = FileOperations.GetXMLs(DEST_PATH, null, null);
throw new("No XML Data fonund!");
}
// Checks are done, we begin here // Sets only whitespace entities
// State CharacterEntityReferences.Replace(xmls, cp.Where(x => x != null && String.IsNullOrWhiteSpace(x)));
List<(string, XDocument, bool)> Documents = new();
Dictionary<string, string> OldNewIndex = new();
Dictionary<string, List<XElement>> Intlinks = new();
Dictionary<string, List<XElement>> Marginals = new();
Dictionary<string, List<XElement>> LetterTexts = new();
Dictionary<string, List<XElement>> LetterTraditions = new();
Dictionary<string, List<XElement>> LetterDescs = new();
List<XElement> Autopsic = new();
foreach (var f in xmls) {
XmlReaderSettings set = new XmlReaderSettings();
set.IgnoreWhitespace = false;
set.CheckCharacters = false;
using (FileStream fs = File.Open(f, FileMode.Open)) {
using (var r = new XmlTextReader(fs) { Normalization = false, WhitespaceHandling = WhitespaceHandling.All, EntityHandling = EntityHandling.ExpandCharEntities}) {
var d = XDocument.Load(r);
var affected = false;
var intlinks = d.Descendants("intlink");
if (intlinks != null && intlinks.Any()) {
foreach (var e in intlinks) {
if (e.HasAttributes && e.Attribute("letter") != null) {
int letter = -1;
if (Int32.TryParse(e.Attribute("letter").Value, out letter) && letter > 368) {
if (!Intlinks.ContainsKey(e.Attribute("letter").Value)) Intlinks.Add(e.Attribute("letter").Value, new());
Intlinks[e.Attribute("letter").Value].Add(e);
Console.WriteLine(e.ToString());
affected = true;
}
}
}
}
var marginals = d.Descendants("marginal");
if (marginals != null && marginals.Any()) {
foreach (var e in marginals) {
if (e.HasAttributes && e.Attribute("letter") != null) {
if (!Marginals.ContainsKey(e.Attribute("letter").Value)) Marginals.Add(e.Attribute("letter").Value, new());
Marginals[e.Attribute("letter").Value].Add(e);
affected = true;
}
}
}
var lettertexts = d.Descendants("letterText");
if (lettertexts != null && lettertexts.Any()) {
foreach (var e in lettertexts) {
if (e.HasAttributes && e.Attribute("index") != null) {
if (!LetterTexts.ContainsKey(e.Attribute("index").Value)) LetterTexts.Add(e.Attribute("index").Value, new());
LetterTexts[e.Attribute("index").Value].Add(e);
affected = true;
}
}
}
var lettertraditions = d.Descendants("letterTradition");
if (lettertraditions != null && lettertraditions.Any()) {
foreach (var e in lettertraditions) {
if (e.HasAttributes && e.Attribute("ref") != null) {
if (!LetterTraditions.ContainsKey(e.Attribute("ref").Value)) LetterTraditions.Add(e.Attribute("ref").Value, new());
LetterTraditions[e.Attribute("ref").Value].Add(e);
affected = true;
}
}
}
var letterdescs = d.Descendants("letterDesc");
if (letterdescs != null && letterdescs.Any()) {
foreach (var e in letterdescs) {
if (e.HasAttributes && e.Attribute("ref") != null) {
if (!LetterDescs.ContainsKey(e.Attribute("ref").Value)) LetterDescs.Add(e.Attribute("ref").Value, new());
LetterDescs[e.Attribute("ref").Value].Add(e);
if (e.Element("autopsic") != null && e.Element("autopsic").HasAttributes && e.Element("autopsic").Attribute("value") != null) {
OldNewIndex.Add(e.Attribute("ref").Value, e.Element("autopsic").Attribute("value").Value);
Autopsic.Add(e.Element("autopsic"));
affected = true;
}
}
}
}
Documents.Add((f, d, affected));
}
}
}
List<Dictionary<string, List<XElement>>> Collections = new() { Intlinks, Marginals, LetterTexts, LetterTraditions, LetterDescs };
foreach (var number in OldNewIndex) {
if (number.Key == number.Value) continue;
foreach (var c in Collections) {
if (c != null && c.ContainsKey(number.Key)) {
foreach (var v in c[number.Key]) {
if (v.HasAttributes && v.Attribute("letter") != null) {
v.Attribute("letter").Value = number.Value;
} else if (v.HasAttributes && v.Attribute("ref") != null) {
v.Attribute("ref").Value = number.Value;
} else if (v.HasAttributes && v.Attribute("index") != null) {
v.Attribute("index").Value = number.Value;
}
// NOT POSSIBLE:
// if (v.HasAttributes && v.Attribute("autopsic") != null) {
// v.Attribute("autopsic").Remove();
// }
}
}
}
}
foreach (var d in Documents) {
//if (d.Item3) SaveFile(d.Item2, DEST_PATH, d.Item1);
}
void SaveFile(XDocument element, string basefilepath, string oldfile) {
if (!Directory.Exists(basefilepath)) {
Directory.CreateDirectory(basefilepath);
}
var filenameold = oldfile.Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).LastOrDefault();
if (filenameold == null) return;
var filename = oldfile;
var path = Path.Combine(basefilepath, filename);
if (!Directory.Exists(basefilepath))
Directory.CreateDirectory(basefilepath);
File.WriteAllText(path, element.ToString());
// XmlWriterSettings set = new XmlWriterSettings() {
// CheckCharacters = false
// };
// using (XmlTextWriter wr = new XmlTextWriter(path, System.Text.Encoding.UTF8) { Formatting = System.Xml.Formatting.None }) {
// element.Save(wr);
// }
// using (var targetStream = System.IO.File.Create(path)) {
// element.Save(targetStream, SaveOptions.DisableFormatting);
//}
}