mirror of
https://github.com/Theodor-Springmann-Stiftung/hamann-ausgabe-core.git
synced 2025-10-28 16:55:32 +00:00
347 lines
15 KiB
C#
347 lines
15 KiB
C#
using System.Collections.Concurrent;
|
|
using System.Diagnostics;
|
|
using System.Text;
|
|
using System.Xml;
|
|
using System.Xml.Linq;
|
|
using System.Xml.XPath;
|
|
using HaDocument.Interfaces;
|
|
using HaDocument.Models;
|
|
using HaWeb.FileHelpers;
|
|
using HaWeb.Models;
|
|
using HaWeb.SearchHelpers;
|
|
using HaWeb.XMLParser;
|
|
using HaWeb.XMLTests;
|
|
using HaXMLReader.Interfaces;
|
|
using Microsoft.Extensions.FileProviders;
|
|
|
|
// Conditions for Successful create
|
|
// All types there
|
|
// Merging Success
|
|
// Saving Success
|
|
// Loading Success
|
|
|
|
// Startup (BEFORE IXMLFileProvider, After IHaDocumentWrapper)
|
|
public class XMLInteractionService : IXMLInteractionService {
|
|
// Runs the syntax checks requested through Test.
private readonly IXMLTestService _testService;

// Size limit handed to XMLFileHelpers.ProcessFile; read from the "FileSizeLimit" configuration value.
private readonly long _fileSizeLimit;

// Extensions handed to XMLFileHelpers.ProcessFile as the allowed set.
private readonly string[] _allowedExtensions = new[] { ".xml" };

// Reader settings shared by every XML parse in Collect.
private static readonly XmlReaderSettings _xmlSettings = new() {
    CloseInput = true,
    CheckCharacters = false,
    ConformanceLevel = ConformanceLevel.Fragment,
    IgnoreComments = true,
    IgnoreProcessingInstructions = true,
    IgnoreWhitespace = false
};

// Root definitions keyed by their Prefix; filled in the constructor.
private Dictionary<string, IXMLRoot>? _RootDefs;

// Global collection definitions keyed by their Key; filled in the constructor.
private Dictionary<string, IXMLCollection>? _CollectionDefs;

// Collections produced by the last CreateCollections call, keyed by collection key.
private Dictionary<string, ItemsCollection>? _Collection;

// Raised by OnSyntaxCheck with the result of a Test run.
public event EventHandler<Dictionary<string, SyntaxCheckModel>?> SyntaxCheck;

// Parsing state as last stored through SetState.
private XMLParsingState? _State;

// Syntax check cache as last stored through SetSCCache.
private Dictionary<string, SyntaxCheckModel>? _SCCache;
|
|
|
|
/// <summary>
/// Stores the test service, reads the "FileSizeLimit" configuration value and instantiates every
/// <see cref="IXMLRoot"/> and every global <see cref="IXMLCollection"/> type that
/// <c>_GetAllTypesThatImplementInterface</c> reports.
/// </summary>
/// <param name="config">Configuration providing the "FileSizeLimit" value.</param>
/// <param name="testService">Service that <c>Test</c> delegates the syntax check to.</param>
/// <exception cref="InvalidOperationException">
/// No root definition, or no global collection definition, could be instantiated.
/// </exception>
public XMLInteractionService(IConfiguration config, IXMLTestService testService) {
    _testService = testService;
    _fileSizeLimit = config.GetValue<long>("FileSizeLimit");

    // Root definitions are keyed by their prefix. Add (not the indexer) is kept on purpose,
    // so two definitions with the same prefix still fail loudly.
    foreach (var type in _GetAllTypesThatImplementInterface<IXMLRoot>()) {
        if (Activator.CreateInstance(type) is IXMLRoot root) {
            _RootDefs ??= new Dictionary<string, IXMLRoot>();
            _RootDefs.Add(root.Prefix, root);
        }
    }

    // Only collections that declare themselves global are registered.
    foreach (var type in _GetAllTypesThatImplementInterface<IXMLCollection>()) {
        if (Activator.CreateInstance(type) is IXMLCollection collection && collection.IsGlobal()) {
            _CollectionDefs ??= new Dictionary<string, IXMLCollection>();
            _CollectionDefs.Add(collection.Key, collection);
        }
    }

    if (_RootDefs == null || _RootDefs.Count == 0) {
        throw new InvalidOperationException("No classes for upload endpoints were found!");
    }

    if (_CollectionDefs == null || _CollectionDefs.Count == 0) {
        throw new InvalidOperationException("No classes for object collection were found!");
    }
}
|
|
|
|
// Getters and Setters
|
|
/// <summary>Returns the parsing state last stored through <see cref="SetState"/>; null if none is stored.</summary>
public XMLParsingState? GetState() {
    return _State;
}

/// <summary>Replaces the stored parsing state; null clears it.</summary>
public void SetState(XMLParsingState? state) {
    _State = state;
}

/// <summary>Returns the root definitions created in the constructor, keyed by prefix.</summary>
public Dictionary<string, IXMLRoot>? GetRootDefs() {
    return _RootDefs;
}

/// <summary>Returns the syntax check cache last stored through <see cref="SetSCCache"/>; null if none is stored.</summary>
public Dictionary<string, SyntaxCheckModel>? GetSCCache() {
    return _SCCache;
}

/// <summary>Replaces the stored syntax check cache; null clears it.</summary>
public void SetSCCache(Dictionary<string, SyntaxCheckModel>? cache) {
    _SCCache = cache;
}
|
|
|
|
// Functions
|
|
/// <summary>
/// Checks and parses every given file and sorts the recognised root documents into a new
/// <see cref="XMLParsingState"/>.
/// </summary>
/// <param name="files">Files to inspect; each one is added to the state's managed files.</param>
/// <param name="rootDefs">Root definitions, handed on to <c>_ProbeFile</c>.</param>
/// <returns>
/// The new state; its <c>ValidState</c> is false if at least one managed file is not valid.
/// Null when <paramref name="files"/> or <paramref name="rootDefs"/> is null or empty.
/// </returns>
public XMLParsingState? Collect(List<IFileInfo> files, Dictionary<string, IXMLRoot>? rootDefs) {
    if (files == null || files.Count == 0) return null;
    if (rootDefs == null || rootDefs.Count == 0) return null;

    var state = new XMLParsingState() { ValidState = true };

    foreach (var fileInfo in files) {
        var model = _CreateFileModel(fileInfo, null);
        state.ManagedFiles!.Add(model);

        // 1. Open the file for reading.
        try {
            using Stream stream = fileInfo.CreateReadStream();
            // 2. Security checks: empty file, wrong start, wrong extension, too big.
            if (!XMLFileHelpers.ProcessFile(stream, fileInfo.Name, model.Log, _allowedExtensions, _fileSizeLimit)) continue;
        } catch {
            model.Log( "Datei konnte nicht geöffnet werden.");
            continue;
        }

        // 3. Check the validity of the XML. The reader settings set CloseInput, so disposing
        //    the reader also closes the second read stream.
        try {
            using var xmlReader = XmlReader.Create(fileInfo.CreateReadStream(), _xmlSettings);
            var xml = XDocument.Load(xmlReader, LoadOptions.PreserveWhitespace | LoadOptions.SetLineInfo);

            // 4. Check whether this is an opus document.
            // TODO: under the hood, _ProbeFile still builds file objects of its own!
            var recognised = _ProbeFile(xml, model, rootDefs);
            if (recognised == null || !recognised.Any()) continue;

            // Success: the file can be recognised and parsed.
            model.Validate();
            foreach (var rootDocument in recognised) {
                if (!state.Loaded!.ContainsKey(rootDocument.Prefix)) {
                    state.Loaded.Add(rootDocument.Prefix, new FileList(rootDocument.XMLRoot));
                }
                state.Loaded[rootDocument.Prefix]!.Add(rootDocument);
            }
        } catch (Exception ex) {
            model.Log($"Ungültiges XML: {ex.Message}");
        }
    }

    // A single invalid file invalidates the whole state.
    if (state.ManagedFiles!.Any(managed => !managed.IsValid)) {
        state.ValidState = false;
    }
    return state;
}
|
|
|
|
// Every caller should ask the cache above first
|
|
/// <summary>
/// Runs the syntax check of the test service over all files of the given state, sorts the errors
/// of every result and raises <see cref="SyntaxCheck"/> with the outcome.
/// </summary>
/// <remarks>
/// This method always runs the full check; it neither reads nor writes the cache exposed by
/// <c>GetSCCache</c>/<c>SetSCCache</c>.
/// </remarks>
/// <param name="state">Parsing state whose loaded documents are checked.</param>
/// <param name="gitcommit">Commit identifier passed to every created <see cref="SyntaxCheckModel"/>.</param>
/// <returns>
/// The result of the test service keyed by file name; null when <paramref name="state"/> or its
/// loaded documents are null (no event is raised in that case).
/// </returns>
public Dictionary<string, SyntaxCheckModel>? Test(XMLParsingState? state, string gitcommit) {
    if (state == null || state.Loaded == null) return null;
    // TODO: Speed up this, move it into a background task:
    var sw = Stopwatch.StartNew();

    // Categories without a file list are skipped: returning null from a SelectMany selector
    // would throw while the sequence is enumerated.
    var fileNames = state.Loaded
        .Select(entry => entry.Value?.GetFileList())
        .Where(list => list != null)
        .SelectMany(list => list!)
        .Select(document => document.File)
        .Distinct()
        .Select(file => file.FileName)
        // The names become dictionary keys below, so they must be unique as well.
        .Distinct();

    var ret = _testService.Test(
        state.Loaded,
        fileNames.ToDictionary(name => name, name => new SyntaxCheckModel(name, gitcommit)));

    if (ret != null) {
        foreach (var r in ret) {
            r.Value.SortErrors();
        }
    }

    sw.Stop();
    Console.WriteLine("Syntaxcheck " + sw.ElapsedMilliseconds.ToString() + " ms");
    OnSyntaxCheck(ret);
    return ret;
}
|
|
|
|
/// <summary>
/// Merges all loaded documents of a valid state into one new <c>opus</c> element.
/// </summary>
/// <param name="state">Parsing state providing the loaded documents.</param>
/// <returns>
/// The merged element; null when the state has nothing loaded, is not valid, no root definitions
/// exist, or one of the loaded categories has no files.
/// </returns>
public XElement? TryCreate(XMLParsingState state) {
    if (state.Loaded == null || !state.Loaded.Any()) return null;
    if (_RootDefs == null || !_RootDefs.Any()) return null;
    if (!state.ValidState) return null;

    var opus = new XElement("opus");

    // TODO: Workaround for bug in HaDocument: roots have to be added in a specific order
    foreach (var (_, fileList) in state.Loaded.OrderByDescending(entry => entry.Key)) {
        var documents = fileList?.GetFileList();
        // One empty category aborts the whole merge.
        if (documents == null || !documents.Any()) return null;

        foreach (var document in documents) {
            document.XMLRoot.MergeIntoFile(opus, document);
        }
    }
    return opus;
}
|
|
|
|
/// <summary>
/// Produces, for every given place, the preview text of the referenced page/line positions of
/// the matching item in the "letters" collection.
/// </summary>
/// <param name="places">Pairs of an item key in the "letters" collection and the marginals to preview.</param>
/// <param name="reader">Reader service that supplies a string reader over an item's search text.</param>
/// <param name="lib">Library handed to the <see cref="SearchState"/> of every parse.</param>
/// <returns>
/// One entry per place that could be resolved, in no particular order (the places are processed
/// in parallel). Each preview is the text of the first parsed line with the same page and line,
/// or an empty string; the identifier is always null. Null when there is no "letters" collection.
/// </returns>
public List<(CollectedItem Item, List<(string Page, string Line, string Preview, string? Identifier)> Results)>? GetPreviews(List<(string, List<Marginal>)> places, IReaderService reader, ILibrary lib) {
    if (_Collection == null || !_Collection.TryGetValue("letters", out var letters)) return null;
    var searchableObjects = letters.Items;
    if (searchableObjects == null) return null;
    var res = new ConcurrentBag<(CollectedItem Item, List<(string Page, string Line, string Preview, string? Identifier)> Results)>();

    Parallel.ForEach(places, (place) => {
        var (key, marginals) = place;
        // An unknown key is skipped like an item without search text. The indexer would throw
        // for it instead, and that exception would escape from the parallel loop.
        if (!searchableObjects.TryGetValue(key, out var text)) return;
        if (text == null || text.SearchText == null || marginals == null || !marginals.Any()) return;

        // No search word and no rules: the parser is only run to collect the lines.
        var state = new SearchState(String.Empty, false, lib);
        var rd = reader.RequestStringReader(text.SearchText);
        var parser = new HaWeb.HTMLParser.LineXMLHelper<SearchState>(state, rd, new StringBuilder(), null, null, null, null, null);
        rd.Read();

        res.Add((
            text,
            marginals.Select(marginal => (
                marginal.Page,
                marginal.Line,
                parser.Lines != null ?
                    parser.Lines
                        .Where(line => line.Page == marginal.Page && line.Line == marginal.Line)
                        .Select(line => line.Text)
                        .FirstOrDefault(string.Empty)
                    : string.Empty,
                (string?)null
            )).ToList()
        ));
    });

    return res.ToList();
}
|
|
|
|
/// <summary>Looks up a single item of a collection.</summary>
/// <param name="collection">Key of the collection.</param>
/// <param name="id">Key of the item within that collection.</param>
/// <returns>The item; null when the collection, its item table or the item does not exist.</returns>
public CollectedItem? GetCollectedItem(string collection, string id) {
    if (_Collection == null) return null;
    if (!_Collection.TryGetValue(collection, out var found)) return null;

    var objects = found.Items;
    return objects != null && objects.ContainsKey(id) ? objects[id] : null;
}
|
|
|
|
/// <summary>
/// Runs a search for <paramref name="searchword"/> over every item of a collection that has
/// search text. The items are processed in parallel.
/// </summary>
/// <param name="collection">Key of the collection to search.</param>
/// <param name="searchword">Search term; it is trimmed and whitespace-normalised before use.</param>
/// <param name="reader">Reader service that supplies a string reader over an item's search text.</param>
/// <param name="lib">Library handed to the <see cref="SearchState"/> of every parse.</param>
/// <returns>
/// One entry per item whose search state holds results, in no particular order. Each result
/// carries page, line, the text of the first parsed line with that page and line (or an empty
/// string) and the result's identifier. Null when the collection does not exist.
/// </returns>
public List<(CollectedItem Item, List<(string Page, string Line, string Preview, string? Identifier)> Results)>? SearchCollection(string collection, string searchword, IReaderService reader, ILibrary? lib) {
    if (_Collection == null || !_Collection.ContainsKey(collection)) return null;

    var searchableObjects = _Collection[collection].Items;
    var res = new ConcurrentBag<(CollectedItem item, List<(string Page, string Line, string preview, string? identifier)> Results)>();
    var sw = StringHelpers.NormalizeWhiteSpace(searchword.Trim());

    Parallel.ForEach(searchableObjects, (obj) => {
        var candidate = obj.Value;
        if (candidate.SearchText == null) return;

        var state = new SearchState(sw, false, lib);
        var rd = reader.RequestStringReader(candidate.SearchText);
        // NOTE(review): SearchRules.OTagRules is passed for both the fourth and the fifth
        // argument, while the former sequential variant passed null as the fifth — confirm
        // that this is intended.
        var parser = new HaWeb.HTMLParser.LineXMLHelper<SearchState>(state, rd, new StringBuilder(), SearchRules.OTagRules, SearchRules.OTagRules, null, SearchRules.TextRules, SearchRules.WhitespaceRules);
        rd.Read();

        // The results are filled while the reader runs; nothing found means nothing to add.
        if (state.Results == null) return;

        var hits = state.Results.Select(hit => (
            hit.Page,
            hit.Line,
            parser.Lines != null
                ? parser.Lines
                    .Where(line => line.Page == hit.Page && line.Line == hit.Line)
                    .Select(line => line.Text)
                    .FirstOrDefault(string.Empty)
                : "",
            hit.Identifier
        )).ToList();

        res.Add((candidate, hits));
    });

    return res.ToList();
}
|
|
|
|
/// <summary>
/// Builds the item collections for every registered collection definition from the given
/// document and replaces the stored collections with the result.
/// </summary>
/// <param name="document">Document the XPath expressions of the definitions are evaluated on.</param>
public void CreateCollections(XDocument document) {
    // NOTE(review): _RootDefs is not used below, yet a missing value skips the rebuild — confirm this is intended.
    if (document == null || _RootDefs == null) return;

    int concurrencyLevel = Environment.ProcessorCount * 2;
    const int itemCapacity = 2909;
    const int collectionCapacity = 23;
    var collected = new ConcurrentDictionary<string, ItemsCollection>(concurrencyLevel, collectionCapacity);

    if (_CollectionDefs != null) {
        Parallel.ForEach(_CollectionDefs, (definition) => {
            // All elements matched by any of the definition's XPath expressions.
            var elements = definition.Value.xPath
                .SelectMany(path => document.XPathSelectElements(path))
                .ToList();
            if (!elements.Any()) return;

            // NOTE(review): this dictionary is only touched by the worker handling this definition.
            var items = new ConcurrentDictionary<string, CollectedItem>(concurrencyLevel, itemCapacity);
            foreach (var element in elements) {
                var key = definition.Value.GenerateKey(element);
                if (key == null) continue;

                // Search text is only produced for searchable collections.
                var searchText = definition.Value.Searchable
                    ? StringHelpers.NormalizeWhiteSpace(element.ToString(), ' ', false)
                    : null;
                // A later element with the same key replaces the earlier one.
                items[key] = new CollectedItem(key, element, definition.Value, searchText);
            }
            if (items.IsEmpty) return;

            var target = collected.GetOrAdd(definition.Key, name => new ItemsCollection(name, definition.Value));
            foreach (var item in items) {
                target.Items.Add(item.Key, item.Value);
            }
        });
    }

    if (collected.Any()) {
        Parallel.ForEach(collected, (entry) => {
            entry.Value.GenerateGroupings();
        });
    }

    _Collection = collected.ToDictionary(entry => entry.Key, entry => entry.Value);
}
|
|
|
|
/// <summary>
/// Finds the instantiable types of the executing assembly that are assignable to
/// <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Interface (or base type) the returned types must be assignable to.</typeparam>
/// <returns>A lazily evaluated sequence of matching types, without interfaces and abstract classes.</returns>
private IEnumerable<Type> _GetAllTypesThatImplementInterface<T>() {
    var contract = typeof(T);
    return System.Reflection.Assembly.GetExecutingAssembly()
        .GetTypes()
        // The constructor passes every result to Activator.CreateInstance, which cannot
        // instantiate abstract classes, so those are excluded together with interfaces.
        .Where(type => contract.IsAssignableFrom(type) && !type.IsInterface && !type.IsAbstract);
}
|
|
|
|
/// <summary>
/// Checks that the document starts with an <c>opus</c> element and collects the root documents
/// that the given root definitions recognise in it.
/// </summary>
/// <param name="document">Parsed document to inspect.</param>
/// <param name="file">File model that receives the log messages and is attached to every result.</param>
/// <param name="rootDefs">Root definitions to try; null yields no results.</param>
/// <returns>
/// The recognised root documents; null (after logging the reason to <paramref name="file"/>)
/// when the root element is missing or not <c>opus</c>, or when no definition matched.
/// </returns>
private List<XMLRootDocument>? _ProbeFile(XDocument document, FileModel file, Dictionary<string, IXMLRoot>? rootDefs) {
    // A document without a root element is rejected like a wrong root, not dereferenced.
    var rootElement = document.Root;
    if (rootElement == null || rootElement.Name != "opus") {
        file.Log("Ein gültiges Dokument muss mit <opus> beginnen.");
        return null;
    }

    List<XMLRootDocument>? res = null;
    if (rootDefs != null) {
        foreach (var (_, root) in rootDefs) {
            var elements = root.IsTypeOf(rootElement);
            if (elements == null) continue;

            foreach (var elem in elements) {
                // The list is created lazily so that "nothing recognised" stays null.
                res ??= new List<XMLRootDocument>();
                res.Add(_createXMLRootDocument(root, elem, file));
            }
        }
    }

    if (res == null) file.Log("Dokumenten-Typ nicht erkannt.");
    return res;
}
|
|
|
|
/// <summary>
/// Wraps an element recognised by a root definition into an <see cref="XMLRootDocument"/> and
/// fills its fields from that definition.
/// </summary>
/// <param name="Root">Root definition that recognised the element.</param>
/// <param name="element">The recognised element.</param>
/// <param name="file">File model the element was read from.</param>
/// <returns>The new root document with its <c>Fields</c> set.</returns>
private XMLRootDocument _createXMLRootDocument(IXMLRoot Root, XElement element, FileModel file) {
    var prefix = Root.Prefix;
    var identification = Root.GenerateIdentificationString(element);
    var rootDocument = new XMLRootDocument(Root, prefix, identification, element, file);

    // The fields are generated from the finished document, so they are assigned afterwards.
    rootDocument.Fields = Root.GenerateFields(rootDocument);
    return rootDocument;
}
|
|
|
|
/// <summary>Creates the file model for a file and optionally logs a first message to it.</summary>
/// <param name="file">File the model is created for; its name becomes the model's file name.</param>
/// <param name="message">Message to log; null, empty or whitespace-only messages are ignored.</param>
/// <returns>The new file model.</returns>
private FileModel _CreateFileModel(IFileInfo file, string? message) {
    var model = new FileModel(file.Name, file);
    if (String.IsNullOrWhiteSpace(message)) return model;

    model.Log(message);
    return model;
}
|
|
|
|
/// <summary>Raises <see cref="SyntaxCheck"/> with this service as the sender.</summary>
/// <param name="state">Result of the syntax check that is passed to the subscribers; may be null.</param>
protected virtual void OnSyntaxCheck(Dictionary<string, SyntaxCheckModel>? state) {
    // The null-conditional call reads the delegate once, so a concurrent unsubscribe cannot
    // turn it null between the check and the invocation.
    SyntaxCheck?.Invoke(this, state);
}
|
|
} |