Parsing upon deserialising

This commit is contained in:
Simon Martens
2025-11-14 15:29:51 +01:00
parent a46c171de7
commit 2e251f446f
9 changed files with 633 additions and 331 deletions

View File

@@ -3,15 +3,44 @@ package xmlmodels
import (
"encoding/json"
"encoding/xml"
"github.com/Theodor-Springmann-Stiftung/lenz-web/xmlparsing"
)
// Letter is the decoded form of a <letterText> element.
//
// NOTE(review): this listing appears to be a rendered diff hunk in which the
// removed and the added lines are interleaved: XMLName, Letter, Pages and
// Hands are each declared twice below. Confirm the real field list against
// the repository before relying on it.
type Letter struct {
// Presumably the pre-change side of the hunk (raw inner XML and chardata
// kept as strings) — TODO confirm.
XMLName xml.Name `xml:"letterText"`
Letter int `xml:"letter,attr"`
Pages []Page `xml:"page"`
Hands []RefElement `xml:"hand"`
Content string `xml:",innerxml"`
Chardata string `xml:",chardata"`
// Presumably the post-change side of the hunk — TODO confirm.
XMLName xml.Name `xml:"letterText"`
Letter int `xml:"letter,attr"`
Pages []Page `xml:"page"`
Hands []RefElement `xml:"hand"`
// HTML is excluded from XML (un)marshalling by its tag; UnmarshalXML fills
// it from the element's inner XML.
HTML xmlparsing.Parsed[LenzTextHandler, *LenzParseState] `xml:"-"`
}
// UnmarshalXML implements xml.Unmarshaler for Letter.
//
// The element is first decoded into a method-less local struct (so the
// decoder uses its default struct handling instead of calling back into this
// method), then the captured inner XML is run through parseText and the
// result is stored in l.HTML.
//
// The receiver is only written after both the XML decode and the text parse
// have succeeded, so a failing call never leaves l half-populated.
func (l *Letter) UnmarshalXML(dec *xml.Decoder, start xml.StartElement) error {
	type alias struct {
		XMLName xml.Name     `xml:"letterText"`
		Letter  int          `xml:"letter,attr"`
		Pages   []Page       `xml:"page"`
		Hands   []RefElement `xml:"hand"`
		Inner   string       `xml:",innerxml"`
	}

	var data alias
	if err := dec.DecodeElement(&data, &start); err != nil {
		return err
	}

	// Parse before touching the receiver: this is the second fallible step.
	parsed, err := parseText(Get(), data.Inner)
	if err != nil {
		return err
	}

	l.XMLName = data.XMLName
	l.Letter = data.Letter
	l.Pages = data.Pages
	l.Hands = data.Hands
	l.HTML = parsed
	return nil
}
func (l Letter) Keys() []any {

273
xmlmodels/letter_text.go Normal file
View File

@@ -0,0 +1,273 @@
package xmlmodels
import (
"math/rand"
"strconv"
"strings"
"github.com/Theodor-Springmann-Stiftung/lenz-web/xmlparsing"
)
// charset is the alphabet randString draws from: ASCII letters and digits.
const charset = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"

// randString returns a string of exactly length bytes, each picked from
// charset with math/rand's package-level generator.
func randString(length int) string {
	buf := make([]byte, 0, length)
	for len(buf) < length {
		buf = append(buf, charset[rand.Intn(len(charset))])
	}
	return string(buf)
}
// Note is a fragment of output that is rendered outside the running text
// (see LenzParseState.Notes and LenzParseState.Count).
type Note struct {
// Id identifies the note; the handler uses either a random string or the
// current page index here.
Id string
// Tokens holds the note's own output tokens.
Tokens Tokens
}
// LenzParseState is the per-document state that LenzTextHandler mutates while
// walking a letter's XML. Its String method renders the collected tokens.
type LenzParseState struct {
// Tokens is the main text output; String wraps it in the "fulltext" div.
Tokens Tokens
// Notes collects sidenote and hand notes; String renders them inside the
// "notes" div.
Notes []Note
// Count collects page markers; String renders them inside the "count" div.
Count []Note
// LC counts the non-empty <line> elements seen so far; together with PC it
// forms the id of the <br/> emitted for a line.
LC int
// PC is the current page index, taken from the "index" attribute of <page>.
// NewState initialises it to "1".
PC string
// CloseElement tells OnCloseElement whether to emit an end tag. <page> and
// empty <line> elements clear it so that their closing tag is skipped once.
CloseElement bool
// Break is set by OnText once non-blank text was seen and cleared by
// <sidenote>; a <line> only emits a <br/> while it is set.
Break bool
// PageBreak is set by <page> and cleared when OnText emits the page marker.
PageBreak bool
// LineBreak is set by <line> and by a closing <sidenote>, and cleared by the
// next non-blank text.
LineBreak bool
// Lib is consulted to resolve the person referenced by <hand>; may be nil.
Lib *Library
// rendered caches the result of String.
rendered string
}
// String renders the state as three consecutive divs: "count" (page markers),
// "fulltext" (the running text) and "notes" (sidenotes and hands).
//
// A nil receiver renders as the empty string. The first non-empty result is
// stored in s.rendered and returned unchanged by later calls.
func (s *LenzParseState) String() string {
	switch {
	case s == nil:
		return ""
	case s.rendered != "":
		return s.rendered
	}

	var out strings.Builder

	// section writes <div class="CLASS">, every note's tokens, then </div>.
	section := func(class string, notes []Note) {
		out.WriteString(outToken{Name: "div", Classes: []string{class}, Type: Element}.String())
		for i := range notes {
			out.WriteString(notes[i].Tokens.String())
		}
		out.WriteString(outToken{Name: "div", Classes: []string{class}, Type: EndElement}.String())
	}

	section("count", s.Count)

	// Work on a copy of the token list: Prepend assigns a freshly built slice,
	// so wrapping the text here does not alter s.Tokens.
	body := s.Tokens
	body.Prepend(outToken{Name: "div", Classes: []string{"fulltext"}, Type: Element})
	body.AppendEndElement()
	out.WriteString(body.String())

	section("notes", s.Notes)

	s.rendered = out.String()
	return s.rendered
}
// AppendNote adds note to the end of s.Notes.
func (s *LenzParseState) AppendNote(note Note) {
s.Notes = append(s.Notes, note)
}
// LenzTextHandler provides the parse callbacks (OnOpenElement, OnCloseElement,
// OnText, OnComment, Result) that turn letter XML into output tokens.
type LenzTextHandler struct {
// Lib is handed to every state created by NewState.
Lib *Library
}
// NewState returns a fresh parse state: page counter "1", end tags enabled,
// and the handler's library attached. All other fields are zero.
func (h LenzTextHandler) NewState() *LenzParseState {
	state := new(LenzParseState)
	state.Lib = h.Lib
	state.PC = "1"
	state.CloseElement = true
	return state
}
// OnOpenElement converts an opening XML element into output tokens and
// updates the parse state. It always returns nil.
//
// Elements without a dedicated case are emitted through AppendDefaultElement.
func (h LenzTextHandler) OnOpenElement(state *xmlparsing.ParseState[*LenzParseState], elem *xmlparsing.Token) error {
	ps := state.Data()

	switch elem.Name {
	case "page":
		// Pages emit nothing here; OnText writes the marker with the next text.
		ps.PC = elem.Attributes["index"]
		ps.PageBreak = true
		ps.CloseElement = false

	case "line":
		if elem.Attributes["type"] == "empty" {
			ps.Tokens.AppendEmptyElement("br", "", "empty")
			// The self-closing <br/> must not get an end tag on close.
			ps.CloseElement = false
		} else {
			ps.LC++
			if ps.Break {
				ps.Tokens.AppendEmptyElement("br", ps.PC+"-"+strconv.Itoa(ps.LC))
			}
			ps.Tokens.AppendDefaultElement(elem)
		}
		ps.LineBreak = true

	case "hand":
		openHand(ps, elem)

	case "sidenote":
		openSidenote(ps, elem)

	case "note":
		id := randString(8)
		// An empty anchor pointing at the note, then the note container itself.
		ps.Tokens.AppendLink("#"+id, "nanchor-note")
		ps.Tokens.AppendEndElement()
		ps.Tokens.AppendDivElement(id, "note", "note-note")

	case "nr":
		// A missing or non-numeric "extent" counts as 1.
		extent, err := strconv.Atoi(elem.Attributes["extent"])
		if err != nil {
			extent = 1
		}
		ps.Tokens.AppendDefaultElement(elem)
		for ; extent > 0; extent-- {
			ps.Tokens.AppendText("&nbsp;")
		}

	case "insertion":
		ps.Tokens.AppendDefaultElement(elem)
		ps.Tokens.AppendDivElement("", "insertion-marker")
		ps.Tokens.AppendEndElement()

	default:
		ps.Tokens.AppendDefaultElement(elem)
	}

	return nil
}

// openSidenote emits the opening tokens for <sidenote>. When at least one of
// the "page", "annotation" or "pos" attributes is set, their values are
// written twice: inline into the text and into a Note appended to ps.Notes.
func openSidenote(ps *LenzParseState, elem *xmlparsing.Token) {
	id := randString(8)
	ps.Tokens.AppendDefaultElement(elem)
	ps.Break = false
	ps.Tokens.AppendCustomAttribute("aria-describedby", id)

	meta := []struct{ class, text string }{
		{"sidenote-page", elem.Attributes["page"]},
		{"sidenote-note", elem.Attributes["annotation"]},
		{"sidenote-pos", elem.Attributes["pos"]},
	}
	hasMeta := false
	for _, m := range meta {
		if m.text != "" {
			hasMeta = true
			break
		}
	}
	if !hasMeta {
		return
	}

	note := Note{Id: id}
	note.Tokens.AppendDivElement(id, "note-sidenote-meta")
	ps.Tokens.AppendDivElement(id, "inline-sidenote-meta")

	for _, m := range meta {
		if m.text == "" {
			continue
		}
		for _, dst := range []*Tokens{&note.Tokens, &ps.Tokens} {
			dst.AppendDivElement("", m.class)
			dst.AppendText(m.text)
			dst.AppendEndElement()
		}
	}

	note.Tokens.AppendEndElement()
	ps.Tokens.AppendEndElement()
	ps.AppendNote(note)
}

// openHand emits the opening tokens for <hand>. The "ref" attribute is looked
// up in ps.Lib.Persons; if it is not numeric, the library is nil, or no person
// is found, the label "N/A" is used instead of the person's name.
func openHand(ps *LenzParseState, elem *xmlparsing.Token) {
	id := randString(8)

	hand := "N/A"
	if ref, err := strconv.Atoi(elem.Attributes["ref"]); err == nil && ps.Lib != nil {
		if person := ps.Lib.Persons.Item(ref); person != nil {
			hand = person.Name
		}
	}

	note := Note{Id: id}
	note.Tokens.AppendDivElement(id, "note-hand")
	note.Tokens.AppendText(hand)
	note.Tokens.AppendEndElement()
	ps.AppendNote(note)

	ps.Tokens.AppendDivElement(id, "inline-hand")
	ps.Tokens.AppendText(hand)
	ps.Tokens.AppendEndElement()

	// This div stays open; it is closed when the </hand> tag arrives.
	ps.Tokens.AppendDivElement("", "hand")
	ps.Tokens.AppendCustomAttribute("aria-describedby", id)
}
// OnCloseElement emits an end tag for the innermost open output element,
// unless the matching open handler cleared CloseElement; in that case the end
// tag is skipped once and the flag is re-armed. Closing a <sidenote> also
// sets LineBreak. It always returns nil.
func (h LenzTextHandler) OnCloseElement(state *xmlparsing.ParseState[*LenzParseState], elem *xmlparsing.Token) error {
	ps := state.Data()

	if elem.Name == "sidenote" {
		ps.LineBreak = true
	}

	if !ps.CloseElement {
		ps.CloseElement = true
		return nil
	}

	ps.Tokens.AppendEndElement()
	return nil
}
// OnText appends character data to the output. Whitespace-only text is
// ignored entirely. If a page break is pending and the page is not "1", a
// page anchor and counter are emitted first and a matching marker is added to
// ps.Count. It always returns nil.
func (h LenzTextHandler) OnText(state *xmlparsing.ParseState[*LenzParseState], elem *xmlparsing.Token) error {
	ps := state.Data()
	if strings.TrimSpace(elem.Data) == "" {
		return nil
	}

	ps.Break = true

	if ps.PageBreak && ps.PC != "1" {
		ps.PageBreak = false

		// "outside" when the page changed at a line boundary, "inside" when
		// it changed in the middle of a line.
		quality := "inside"
		if ps.LineBreak {
			quality = "outside"
		}
		page := ps.PC

		ps.Tokens.AppendDivElement("", "eanchor-page", "eanchor-page-"+quality)
		ps.Tokens.AppendCustomAttribute("aria-describedby", page)
		ps.Tokens.AppendEndElement()

		ps.Tokens.AppendDivElement("", "page-counter", "page-"+quality)
		ps.Tokens.AppendText(page)
		ps.Tokens.AppendEndElement()

		marker := Note{Id: page}
		marker.Tokens.AppendDivElement(page, "page", "page-"+quality)
		marker.Tokens.AppendText(page)
		marker.Tokens.AppendEndElement()
		ps.Count = append(ps.Count, marker)
	}

	ps.LineBreak = false
	ps.Tokens.AppendDefaultElement(elem)
	return nil
}
// OnComment ignores XML comments: it emits nothing and returns nil.
func (h LenzTextHandler) OnComment(*xmlparsing.ParseState[*LenzParseState], *xmlparsing.Token) error {
return nil
}
// Result returns the rendered output of the parse state. The error is always
// nil.
func (h LenzTextHandler) Result(state *xmlparsing.ParseState[*LenzParseState]) (string, error) {
	rendered := state.Data().String()
	return rendered, nil
}
// parseText runs raw through a LenzTextHandler bound to lib and returns the
// resulting Parsed value together with any error from ParseString.
//
// ParseString is called in its own statement before parsed is returned. The
// Go spec does not order a plain variable read relative to a function call in
// the same expression list, so `return parsed, parsed.ParseString(raw)` could
// legally copy parsed before the parse had filled it in.
// NOTE(review): this matters if ParseString has a pointer receiver and Parsed
// is a struct value — confirm in xmlparsing.
func parseText(lib *Library, raw string) (xmlparsing.Parsed[LenzTextHandler, *LenzParseState], error) {
	handler := LenzTextHandler{Lib: lib}
	parsed := xmlparsing.NewParsed[LenzTextHandler, *LenzParseState](handler)
	if err := parsed.ParseString(raw); err != nil {
		// Keep returning the value alongside the error, as before.
		return parsed, err
	}
	return parsed, nil
}
// TemplateParse exposes the legacy text-parsing helper to Go templates
// (e.g. for traditions). The returned function ignores its *Meta argument,
// parses s against lib and yields the rendered output; if parsing fails it
// yields the error message instead.
func TemplateParse(lib *Library) func(letter *Meta, s string) string {
	render := func(_ *Meta, s string) string {
		parsed, err := parseText(lib, s)
		if err == nil {
			return parsed.Data().String()
		}
		return err.Error()
	}
	return render
}

View File

@@ -107,77 +107,41 @@ func (l *Library) Parse(source xmlparsing.ParseSource, baseDir, commit string) e
l.prepare()
wg.Add(1)
go func() {
err := l.Persons.Serialize(&PersonDefs{}, filepath.Join(meta.BaseDir, REFERENCES_PATH), meta)
if err != nil {
metamu.Lock()
slog.Error("Failed to serialize persons:", "error", err)
meta.FailedPaths = append(meta.FailedPaths, filepath.Join(meta.BaseDir, REFERENCES_PATH))
metamu.Unlock()
}
wg.Done()
}()
parse := func(fn func() error, path string, label string) {
wg.Add(1)
go func() {
if err := fn(); err != nil {
metamu.Lock()
slog.Error("Failed to serialize "+label+":", "error", err)
meta.FailedPaths = append(meta.FailedPaths, filepath.Join(meta.BaseDir, path))
metamu.Unlock()
}
wg.Done()
}()
}
wg.Add(1)
go func() {
err := l.Places.Serialize(&LocationDefs{}, filepath.Join(meta.BaseDir, REFERENCES_PATH), meta)
if err != nil {
metamu.Lock()
slog.Error("Failed to serialize places:", "error", err)
meta.FailedPaths = append(meta.FailedPaths, filepath.Join(meta.BaseDir, REFERENCES_PATH))
metamu.Unlock()
}
wg.Done()
}()
// References must be ready before dependent documents (hands etc.) resolve correctly.
parse(func() error {
return l.Persons.Serialize(&PersonDefs{}, filepath.Join(meta.BaseDir, REFERENCES_PATH), meta)
}, REFERENCES_PATH, "persons")
parse(func() error {
return l.Places.Serialize(&LocationDefs{}, filepath.Join(meta.BaseDir, REFERENCES_PATH), meta)
}, REFERENCES_PATH, "places")
parse(func() error {
return l.AppDefs.Serialize(&AppDefs{}, filepath.Join(meta.BaseDir, REFERENCES_PATH), meta)
}, REFERENCES_PATH, "appdefs")
wg.Wait()
wg.Add(1)
go func() {
err := l.AppDefs.Serialize(&AppDefs{}, filepath.Join(meta.BaseDir, REFERENCES_PATH), meta)
if err != nil {
metamu.Lock()
slog.Error("Failed to serialize appdefs:", "error", err)
meta.FailedPaths = append(meta.FailedPaths, filepath.Join(meta.BaseDir, REFERENCES_PATH))
metamu.Unlock()
}
wg.Done()
}()
wg.Add(1)
go func() {
err := l.Letters.Serialize(&DocumentsRoot{}, filepath.Join(meta.BaseDir, LETTERS_PATH), meta)
if err != nil {
metamu.Lock()
slog.Error("Failed to serialize letters:", "error", err)
meta.FailedPaths = append(meta.FailedPaths, filepath.Join(meta.BaseDir, LETTERS_PATH))
metamu.Unlock()
}
wg.Done()
}()
wg.Add(1)
go func() {
err := l.Traditions.Serialize(&TraditionsRoot{}, filepath.Join(meta.BaseDir, TRADITIONS_PATH), meta)
if err != nil {
metamu.Lock()
slog.Error("Failed to serialize traditions:", "error", err)
meta.FailedPaths = append(meta.FailedPaths, filepath.Join(meta.BaseDir, TRADITIONS_PATH))
metamu.Unlock()
}
wg.Done()
}()
wg.Add(1)
go func() {
err := l.Metas.Serialize(&MetaRoot{}, filepath.Join(meta.BaseDir, META_PATH), meta)
if err != nil {
metamu.Lock()
slog.Error("Failed to serialize meta:", "error", err)
meta.FailedPaths = append(meta.FailedPaths, filepath.Join(meta.BaseDir, META_PATH))
metamu.Unlock()
}
wg.Done()
}()
// Remaining documents can be parsed once references are available.
parse(func() error {
return l.Letters.Serialize(&DocumentsRoot{}, filepath.Join(meta.BaseDir, LETTERS_PATH), meta)
}, LETTERS_PATH, "letters")
parse(func() error {
return l.Traditions.Serialize(&TraditionsRoot{}, filepath.Join(meta.BaseDir, TRADITIONS_PATH), meta)
}, TRADITIONS_PATH, "traditions")
parse(func() error {
return l.Metas.Serialize(&MetaRoot{}, filepath.Join(meta.BaseDir, META_PATH), meta)
}, META_PATH, "meta")
wg.Wait()

219
xmlmodels/text_tokens.go Normal file
View File

@@ -0,0 +1,219 @@
package xmlmodels
import (
	"sort"
	"strings"

	"github.com/Theodor-Springmann-Stiftung/lenz-web/xmlparsing"
)
// outType identifies what an outToken renders to.
type outType int

const (
	// NA is the zero value; such tokens render to the empty string.
	NA outType = iota
	// Text renders the token's Value verbatim.
	Text
	// Element renders an opening tag.
	Element
	// EmptyElement renders a self-closing tag.
	EmptyElement
	// EndElement renders a closing tag.
	EndElement
)

// outToken is one piece of generated markup: a tag (opening, closing or
// self-closing) or a run of text.
type outToken struct {
	Type       outType
	Name       string            // tag name; unused for Text
	Classes    []string          // joined with spaces into the class attribute
	Id         string            // id attribute; omitted when empty
	Value      string            // text content; only used for Text
	Attributes map[string]string // further attributes, rendered sorted by name
}

// String renders the token as markup. Tokens of an unknown type, including
// NA, render to the empty string.
//
// Attributes are written in ascending key order so that the same token always
// yields the same string; ranging over the map directly would make the
// attribute order differ from run to run.
//
// NOTE(review): names, ids, classes, attribute values and text are written
// without HTML escaping — callers must supply markup-safe values.
func (o outToken) String() string {
	switch o.Type {
	case Text:
		return o.Value
	case Element:
		return o.tag(">")
	case EmptyElement:
		return o.tag("/>")
	case EndElement:
		return "</" + o.Name + ">"
	}
	return ""
}

// tag renders "<name class=... id=... attrs" followed by closer, which is ">"
// for an opening tag and "/>" for a self-closing one.
func (o outToken) tag(closer string) string {
	var b strings.Builder
	b.WriteString("<")
	b.WriteString(o.Name)
	if len(o.Classes) > 0 {
		b.WriteString(" class=\"")
		b.WriteString(strings.Join(o.Classes, " "))
		b.WriteString("\"")
	}
	if len(o.Id) > 0 {
		b.WriteString(" id=\"")
		b.WriteString(o.Id)
		b.WriteString("\"")
	}
	for _, key := range sortedAttrKeys(o.Attributes) {
		b.WriteString(" ")
		b.WriteString(key)
		b.WriteString("=\"")
		b.WriteString(o.Attributes[key])
		b.WriteString("\"")
	}
	b.WriteString(closer)
	return b.String()
}

// ClassesFromAttrs appends one class of the form "key-value" per entry of
// attrs, in ascending key order so the resulting class list is deterministic.
// A nil or empty map leaves o.Classes untouched.
func (o *outToken) ClassesFromAttrs(attrs map[string]string) {
	for _, key := range sortedAttrKeys(attrs) {
		o.Classes = append(o.Classes, key+"-"+attrs[key])
	}
}

// sortedAttrKeys returns the keys of m in ascending order, or nil when m is
// empty.
func sortedAttrKeys(m map[string]string) []string {
	if len(m) == 0 {
		return nil
	}
	keys := make([]string, 0, len(m))
	for key := range m {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	return keys
}
// Default maps a parser token to its standard output token:
//
//   - a start element becomes a <div> whose classes are the element name
//     followed by one "attr-value" class per attribute,
//   - an end element becomes an end token (without a name),
//   - character data becomes a text token carrying the token's data,
//   - any other token type yields the zero outToken.
func Default(token *xmlparsing.Token) outToken {
	switch token.Type {
	case xmlparsing.StartElement:
		out := outToken{Name: "div", Type: Element, Classes: []string{token.Name}}
		out.ClassesFromAttrs(token.Attributes)
		return out
	case xmlparsing.EndElement:
		return outToken{Type: EndElement}
	case xmlparsing.CharData:
		return outToken{Type: Text, Value: token.Data}
	default:
		return outToken{}
	}
}
// Tokens is an ordered list of output tokens; String renders them back to
// back.
type Tokens struct {
// Out holds the tokens in output order.
Out []outToken
}
// Prepend puts token in front of all existing tokens. The list is rebuilt in
// a newly allocated slice, so slices that shared the old backing array are
// not modified.
func (s *Tokens) Prepend(token outToken) {
	out := make([]outToken, 0, len(s.Out)+1)
	out = append(out, token)
	s.Out = append(out, s.Out...)
}
// AppendDefaultElement appends the Default rendering of token. If any ids are
// given, the first one becomes the token's id; the rest are ignored.
func (s *Tokens) AppendDefaultElement(token *xmlparsing.Token, ids ...string) {
	out := Default(token)
	if len(ids) != 0 {
		out.Id = ids[0]
	}
	s.Out = append(s.Out, out)
}
// AppendCustomAttribute sets attribute name=value on the most recently
// appended token, whatever its type, creating the attribute map on demand.
// It does nothing when the list is empty.
func (s *Tokens) AppendCustomAttribute(name, value string) {
	n := len(s.Out)
	if n == 0 {
		return
	}
	last := &s.Out[n-1]
	if last.Attributes == nil {
		last.Attributes = make(map[string]string)
	}
	last.Attributes[name] = value
}
// AppendElement appends an opening tag called name with the given id and
// classes.
func (s *Tokens) AppendElement(name string, id string, classes ...string) {
	open := outToken{Type: Element, Name: name, Id: id, Classes: classes}
	s.Out = append(s.Out, open)
}
// AppendEndElement closes the innermost element that is still open. It walks
// the list backwards, pairing each end token it passes with the next opening
// token, and appends an end token named after the first opening token left
// unpaired. Opening tokens named "p" or "br" are never considered. Nothing is
// appended when no open element is found.
func (s *Tokens) AppendEndElement() {
	pending := 0 // end tokens seen that still need their opening token
	for i := len(s.Out) - 1; i >= 0; i-- {
		tok := s.Out[i]
		switch {
		case tok.Type == EndElement:
			pending++
		case tok.Type == Element && tok.Name != "p" && tok.Name != "br":
			if pending > 0 {
				pending--
				continue
			}
			s.Out = append(s.Out, outToken{Name: tok.Name, Type: EndElement})
			return
		}
	}
}
// AppendDivElement appends an opening <div> with the given id and classes.
func (s *Tokens) AppendDivElement(id string, classes ...string) {
	s.AppendElement("div", id, classes...)
}
// AppendEmptyElement appends a self-closing tag called name with the given id
// and classes.
func (s *Tokens) AppendEmptyElement(name string, id string, classes ...string) {
	empty := outToken{Type: EmptyElement, Name: name, Id: id, Classes: classes}
	s.Out = append(s.Out, empty)
}
// AppendLink appends an opening <a> tag with the given href and classes.
func (s *Tokens) AppendLink(href string, classes ...string) {
	link := outToken{Type: Element, Name: "a", Classes: classes}
	link.Attributes = map[string]string{"href": href}
	s.Out = append(s.Out, link)
}
// AppendText appends text as a text token; it is rendered verbatim.
func (s *Tokens) AppendText(text string) {
	run := outToken{Type: Text, Value: text}
	s.Out = append(s.Out, run)
}
// String renders every token in order and concatenates the results.
func (s *Tokens) String() string {
	var b strings.Builder
	for i := range s.Out {
		b.WriteString(s.Out[i].String())
	}
	return b.String()
}