Neuer Parser

This commit is contained in:
Simon Martens
2026-02-20 12:59:33 +01:00
parent a0e1d61f74
commit 1fa5f52eef
6 changed files with 1453 additions and 569 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,6 @@
IGNORE: IGNORE:
address address
tabs
BASE: BASE:
aq aq
@@ -16,6 +17,7 @@ pe
ru ru
tl // Textverlust tl // Textverlust
ul ul
note
fn[@index='1'] & anchor // keine ref? irgendwie nur die anchors in den footnotes? fn[@index='1'] & anchor // keine ref? irgendwie nur die anchors in den footnotes?
@@ -31,8 +33,6 @@ align center|right
tab 2|12|8 tab 2|12|8
BLOCK: BLOCK:
note? evtl.
tabs
letterText (wie line type="break" falls kein line) letterText (wie line type="break" falls kein line)
line (Fälle: empty, tab 1-2|4-8, break) line (Fälle: empty, tab 1-2|4-8, break)
page[@index='1-14'] page[@index='1-14']

106
xmlmodels/helpers.go Normal file
View File

@@ -0,0 +1,106 @@
package xmlmodels
import (
"encoding/xml"
"strconv"
)
// isASCIISpaceByte reports whether b is one of the four ASCII whitespace
// bytes this package treats as insignificant: space, tab, line feed or
// carriage return.
func isASCIISpaceByte(b byte) bool {
	switch b {
	case ' ', '\t', '\n', '\r':
		return true
	}
	return false
}
// trimLeftASCIISpace returns s without its leading run of ASCII whitespace
// bytes (as defined by isASCIISpaceByte). Non-ASCII whitespace is kept.
func trimLeftASCIISpace(s string) string {
	for len(s) > 0 && isASCIISpaceByte(s[0]) {
		s = s[1:]
	}
	return s
}
// trimRightASCIISpace returns s without its trailing run of ASCII whitespace
// bytes (as defined by isASCIISpaceByte). Non-ASCII whitespace is kept.
func trimRightASCIISpace(s string) string {
	end := len(s)
	for ; end > 0; end-- {
		if !isASCIISpaceByte(s[end-1]) {
			break
		}
	}
	return s[:end]
}
func trimASCIISpace(s string) string {
return trimRightASCIISpace(trimLeftASCIISpace(s))
}
// isOnlyASCIISpace reports whether every byte of s is ASCII whitespace.
// The empty string counts as whitespace-only.
func isOnlyASCIISpace(s string) bool {
	for _, b := range []byte(s) {
		if !isASCIISpaceByte(b) {
			return false
		}
	}
	return true
}
// hasLeadingASCIISpace reports whether s is non-empty and its first byte is
// ASCII whitespace.
func hasLeadingASCIISpace(s string) bool {
	if s == "" {
		return false
	}
	return isASCIISpaceByte(s[0])
}
// hasTrailingASCIISpace reports whether s is non-empty and its last byte is
// ASCII whitespace.
func hasTrailingASCIISpace(s string) bool {
	if s == "" {
		return false
	}
	return isASCIISpaceByte(s[len(s)-1])
}
// attrsToMap converts XML attributes into a map keyed by the attribute's
// local name. The namespace is ignored, so when two attributes share a local
// name the later one wins. It returns nil for an empty attribute list.
func attrsToMap(attrs []xml.Attr) map[string]string {
	n := len(attrs)
	if n == 0 {
		return nil
	}
	out := make(map[string]string, n)
	for i := range attrs {
		out[attrs[i].Name.Local] = attrs[i].Value
	}
	return out
}
// isInline reports whether the element name belongs to the set of tags the
// parser treats as inline. Besides the BASE markup tags this includes note,
// a few special elements and the inline-block tags align/tab, which are
// treated as inline for stack correctness.
func isInline(name string) bool {
	switch name {
	// BASE
	case "aq", "b", "del", "dul", "tul", "er", "gr", "hb", "ink", "it", "pe", "ru", "tl", "ul":
		return true
	// note
	case "note":
		return true
	// specials
	case "fn", "nr", "subst", "insertion", "hand":
		return true
	// inline-block things
	case "align", "tab":
		return true
	}
	return false
}
// isTransparentWrapper reports whether name is a wrapper element whose own
// start/end tokens are dropped by the parser.
//
// IMPORTANT: the subtree of such an element (e.g. address) is NOT skipped;
// only the wrapper tokens themselves are ignored.
func isTransparentWrapper(name string) bool {
	switch name {
	case "tabs", "address":
		return true
	}
	return false
}
// parseLineMarker classifies a <line> start element by its "tab" and "type"
// attributes (values are trimmed of ASCII whitespace).
//
// It returns the line type, the indent and a flag telling the caller to emit
// an empty line. Precedence:
//   - type="empty"          -> Empty, 0, true
//   - tab parses to n > 0   -> Indent, n, false
//   - type="break"          -> Semantic, 0, false
//   - anything else         -> Continuation, 0, false
//
// A "tab" value that is not a positive integer is ignored.
func parseLineMarker(se xml.StartElement) (LineType, int, bool) {
	indent := 0
	kind := ""
	for _, attr := range se.Attr {
		value := trimASCIISpace(attr.Value)
		switch attr.Name.Local {
		case "tab":
			n, err := strconv.Atoi(value)
			if err == nil && n > 0 {
				indent = n
			}
		case "type":
			kind = value
		}
	}

	switch {
	case kind == "empty":
		return Empty, 0, true
	case indent > 0:
		return Indent, indent, false
	case kind == "break":
		return Semantic, 0, false
	default:
		return Continuation, 0, false
	}
}

View File

@@ -3,14 +3,16 @@ package xmlmodels
import ( import (
"encoding/json" "encoding/json"
"encoding/xml" "encoding/xml"
"fmt"
"io"
"strconv"
) )
type Letter struct { type Letter struct {
XMLName xml.Name `xml:"letterText"` XMLName xml.Name `xml:"letterText"`
Letter int `xml:"letter,attr"` Letter int `xml:"letter,attr"`
Pages []Page `xml:"page"` Hands []int `xml:"-"`
Hands []RefElement `xml:"hand"` Data []Page
Inner string `xml:",innerxml"`
} }
func (l Letter) Keys() []any { func (l Letter) Keys() []any {
@@ -29,7 +31,192 @@ func (l Letter) String() string {
return string(json) return string(json)
} }
type Page struct { // NOTE: parseSidenote und unten UnmarshalXML sind die beiden haupstächlichen Kontexte, in denen Text gehalten wird.
XMLName xml.Name `xml:"page"` // Wir unterteilen Briefe in Brief - Seite - Zeilen und Sidenotes in Sidenote - Zeilen (weil eine Sidenote nicht über
Index int `xml:"index,attr"` // mehrere Seiten gehen kann).
// NOTE: Zeilen sind geschlossene Einheiten, die auch als HTML einen selbstständigen Block bilden können. Dazu werden
// in parseBlockLines synthetisch Elemente entweder am Anfang oder Ende der Zeile hinzugefügt, um einen offenen Stack
// zu schließen oder den Stack der vorhergehenden Zeile wieder zu öffnen, weil die Auszeichnungen fortgehen.
// NOTE: Wichtige synthetische Tags:
// - Am Beginn oder Ende einer Zeile, wenn der Kontext in der XML über die Zeilen geöffnet bleibt (Token.Synth = true)
// - Am Beginn von letterText und Sidenote kann eine synthetische erste Zeile eingefügt sein (Line.Type = First)
// - Am Beginn einer Seite kann eine Zeile eingefügt sein, wenn der Kontext beispielsweise eines offenen
// Absatzes über die Seitengrenze fortgeführt wird (Line.Type = Continuation)
// NOTE: Whitespace-Handling
// - Als Whitespace gilt hier nur ASCII-Whitespace, also TAB, LF, CR, SPACE. Alles andere kann semantisch bedeutsam sein.
// - Am Anfang von letterText, Sidenote oder Page: alle Whitespace-Token werden ignoriert, bis Text kommt
// - Am Anfang und Ende von Zeilen: alle Whitespace-Token werden ignoriert, bis Text bzw. die neue Zeile kommt.
// parseSidenote reads a <sidenote> element whose start tag se has already
// been consumed from dec.
//
// The attributes "pos" and "annotation" are copied verbatim into the
// Sidenote. The "page" attribute is parsed as an integer; if it is missing
// or not a number the returned page number is 0. The element content is
// split into lines by parseBlockLines, which consumes tokens up to and
// including the closing </sidenote>.
//
// On error the Sidenote carries the attributes but no lines.
func parseSidenote(dec *xml.Decoder, se xml.StartElement) (Sidenote, int, error) {
	var (
		note    Sidenote
		pageNum int
	)
	for _, attr := range se.Attr {
		switch attr.Name.Local {
		case "pos":
			note.Position = attr.Value
		case "annotation":
			note.Annotation = attr.Value
		case "page":
			n, err := strconv.Atoi(trimASCIISpace(attr.Value))
			if err != nil {
				// Best effort: an unparsable page leaves pageNum at 0.
				continue
			}
			pageNum = n
		}
	}

	lines, err := parseBlockLines(dec, "sidenote")
	if err != nil {
		return note, pageNum, err
	}
	note.Lines = lines
	return note, pageNum, nil
}
func (l *Letter) UnmarshalXML(dec *xml.Decoder, start xml.StartElement) error {
// INFO: Brifnummer extrahieren, main Loop below
for _, a := range start.Attr {
if a.Name.Local == "letter" {
n, err := strconv.Atoi(trimASCIISpace(a.Value))
if err != nil {
return fmt.Errorf("letterText@letter: %w", err)
}
l.Letter = n
break
}
}
var (
pages []Page
curPage *Page
)
ensurePage := func(num int) *Page {
for i := range pages {
if pages[i].Number == num {
return &pages[i]
}
}
pages = append(pages, Page{Number: num})
return &pages[len(pages)-1]
}
acc := newLineAccumulator(First, func(line Line) {
if curPage == nil {
curPage = ensurePage(1)
}
curPage.Lines = append(curPage.Lines, line)
})
handlePage := func(se xml.StartElement) error {
idx := 1
for _, a := range se.Attr {
if a.Name.Local == "index" {
n, err := strconv.Atoi(trimASCIISpace(a.Value))
if err != nil {
return fmt.Errorf("page@index: %w", err)
}
if n > 0 {
idx = n
}
break
}
}
if acc.curLine != nil {
acc.closeLine()
}
curPage = ensurePage(idx)
if acc.hasAnyLine {
acc.setImplicitType(Continuation)
} else {
acc.setImplicitType(First)
}
return nil
}
// INFO: Main Loop
for {
tok, err := dec.Token()
if err == io.EOF {
break
}
if err != nil {
return err
}
switch t := tok.(type) {
case xml.StartElement:
name := t.Name.Local
if isTransparentWrapper(name) {
continue
}
switch name {
case "page":
if err := handlePage(t); err != nil {
return err
}
continue
case "line":
acc.handleLineMarker(t)
continue
case "sidenote":
sn, pageNum, err := parseSidenote(dec, t)
if err != nil {
return err
}
if pageNum == 0 {
if curPage != nil {
pageNum = curPage.Number
} else {
pageNum = 1
}
}
p := ensurePage(pageNum)
p.Sidenotes = append(p.Sidenotes, sn)
continue
}
acc.appendStart(name, attrsToMap(t.Attr))
case xml.EndElement:
name := t.Name.Local
if isTransparentWrapper(name) {
continue
}
// INFO: Exit-Bedingung
if name == start.Name.Local {
if acc.curLine != nil {
acc.closeLine()
}
l.Data = pages
return nil
}
// INFO: Selbst-schließende tags werden vom Go-Parser expandiert, deswegen:
if name == "page" || name == "line" {
continue
}
acc.appendEnd(name)
case xml.CharData:
s := string([]byte(t))
if isOnlyASCIISpace(s) {
if acc.isAtLineStart() {
continue
}
s = " "
}
acc.appendText(s)
}
}
l.Data = pages
return nil
} }

View File

@@ -0,0 +1,278 @@
package xmlmodels
import (
"encoding/xml"
"os"
"slices"
"strings"
"testing"
)
// examplesRoot is the unmarshalling target for example.xml: it collects every
// letterText child of the document root as a Letter.
type examplesRoot struct {
Letters []Letter `xml:"letterText"`
}
// loadExampleLetters reads example.xml from the current or the parent
// directory, unmarshals it and returns the contained letters. The test is
// aborted if the file cannot be read or parsed, or if it holds no letterText
// element.
func loadExampleLetters(t *testing.T) []Letter {
	t.Helper()

	var (
		raw []byte
		err error
	)
	for _, candidate := range []string{"example.xml", "../example.xml"} {
		if raw, err = os.ReadFile(candidate); err == nil {
			break
		}
	}
	if err != nil {
		t.Fatalf("read example.xml: %v", err)
	}

	var doc examplesRoot
	if err := xml.Unmarshal(raw, &doc); err != nil {
		t.Fatalf("unmarshal example.xml: %v", err)
	}
	if len(doc.Letters) == 0 {
		t.Fatalf("example.xml contained no letterText elements")
	}
	return doc.Letters
}
// TestLettersFromExampleXMLRespectLineInvariants checks that the example
// corpus exercises the structural cases the parser is meant to produce: a
// page starting with a First line, a sidenote starting with a First line, a
// non-initial page starting with a Continuation line, and at least one
// synthetic close/reopen carry between two consecutive lines.
func TestLettersFromExampleXMLRespectLineInvariants(t *testing.T) {
	letters := loadExampleLetters(t)

	var sawLetterFirst, sawSidenoteFirst, sawPageContinuation, sawCarry bool

	// startsWith reports whether the first line exists and has type lt.
	startsWith := func(lines []Line, lt LineType) bool {
		return len(lines) > 0 && lines[0].Type == lt
	}
	// noteCarries records whether any adjacent pair of lines carries markup.
	noteCarries := func(lines []Line) {
		for i := 1; i < len(lines); i++ {
			if linePairHasValidSyntheticCarry(lines[i-1], lines[i]) {
				sawCarry = true
			}
		}
	}

	for _, letter := range letters {
		for pageIdx, page := range letter.Data {
			if startsWith(page.Lines, First) {
				sawLetterFirst = true
			}
			if pageIdx > 0 && startsWith(page.Lines, Continuation) {
				sawPageContinuation = true
			}
			noteCarries(page.Lines)

			for _, sn := range page.Sidenotes {
				if startsWith(sn.Lines, First) {
					sawSidenoteFirst = true
				}
				noteCarries(sn.Lines)
			}
		}
	}

	switch {
	case !sawLetterFirst:
		t.Fatalf("expected at least one letter page to start with synthetic First line")
	case !sawSidenoteFirst:
		t.Fatalf("expected at least one sidenote to start with synthetic First line")
	case !sawPageContinuation:
		t.Fatalf("expected at least one non-initial page to start with Continuation line")
	case !sawCarry:
		t.Fatalf("expected at least one synthetic close/reopen carry between consecutive lines")
	}
}
// TestLettersFromExampleXMLSyntheticContinuationsAreConsistent validates
// every line of the example corpus on its own (assertLineLocallyValid) and
// every pair of neighbouring lines (assertCarryPair): within a page, across
// the boundary to the previous page in letter.Data, and within each sidenote.
func TestLettersFromExampleXMLSyntheticContinuationsAreConsistent(t *testing.T) {
	letters := loadExampleLetters(t)

	for _, letter := range letters {
		// checkBlock validates all lines of one block first, then all
		// adjacent pairs.
		checkBlock := func(pageNum int, where string, lines []Line) {
			t.Helper()
			for idx, line := range lines {
				assertLineLocallyValid(t, letter.Letter, pageNum, where, idx, line)
			}
			for i := 1; i < len(lines); i++ {
				assertCarryPair(t, letter.Letter, pageNum, where, i-1, lines[i-1], lines[i])
			}
		}

		for pageIdx, page := range letter.Data {
			checkBlock(page.Number, "page", page.Lines)

			if pageIdx > 0 && len(page.Lines) > 0 {
				if prev := letter.Data[pageIdx-1].Lines; len(prev) > 0 {
					assertCarryPair(
						t,
						letter.Letter,
						page.Number,
						"page-boundary",
						0,
						prev[len(prev)-1],
						page.Lines[0],
					)
				}
			}

			for _, sn := range page.Sidenotes {
				checkBlock(page.Number, "sidenote", sn.Lines)
			}
		}
	}
}
// assertLineLocallyValid fails the test unless line satisfies, in this order:
//  1. synthetic start tokens appear only as a prefix of the token list,
//  2. no token follows a synthetic end token,
//  3. start/end tokens are properly nested and balanced,
//  4. whitespace-only CharData is never at the line start and is exactly " ",
//  5. line.Text equals the concatenation of all CharData values,
//  6. line.Text neither starts nor ends with ASCII whitespace.
//
// letter, page, where and lineIdx are only used to label failure messages.
func assertLineLocallyValid(t *testing.T, letter, page int, where string, lineIdx int, line Line) {
t.Helper()
// Pass 1: synthetic openers must form a prefix.
sawNonContToken := false
for tokIdx, tok := range line.Tokens {
if tok.Type == StartElement && tok.Synth {
if sawNonContToken {
t.Fatalf("letter %d page %d %s line %d has synthetic opener after non-prefix token at token %d", letter, page, where, lineIdx, tokIdx)
}
continue
}
sawNonContToken = true
}
// Pass 2: once a synthetic closer was seen, only synthetic closers may follow.
sawContCloser := false
for tokIdx, tok := range line.Tokens {
if tok.Type == EndElement && tok.Synth {
sawContCloser = true
continue
}
if sawContCloser {
t.Fatalf("letter %d page %d %s line %d has token after synthetic closer at token %d", letter, page, where, lineIdx, tokIdx)
}
}
// Pass 3: nesting check plus text reconstruction and whitespace rules.
var stack []string
var textFromTokens strings.Builder
for tokIdx, tok := range line.Tokens {
switch tok.Type {
case StartElement:
stack = append(stack, tok.Name)
case EndElement:
if len(stack) == 0 || stack[len(stack)-1] != tok.Name {
t.Fatalf("letter %d page %d %s line %d has unbalanced end token %q at token %d", letter, page, where, lineIdx, tok.Name, tokIdx)
}
stack = stack[:len(stack)-1]
case CharData:
textFromTokens.WriteString(tok.Value)
if isOnlyASCIISpace(tok.Value) {
if isLineStartPosition(line, tokIdx) {
t.Fatalf("letter %d page %d %s line %d contains leading whitespace-only chardata token at token %d", letter, page, where, lineIdx, tokIdx)
}
if tok.Value != " " {
t.Fatalf("letter %d page %d %s line %d contains non-normalized whitespace token %q at token %d", letter, page, where, lineIdx, tok.Value, tokIdx)
}
}
}
}
if len(stack) != 0 {
t.Fatalf("letter %d page %d %s line %d ended with %d unclosed tags", letter, page, where, lineIdx, len(stack))
}
if line.Text != textFromTokens.String() {
t.Fatalf("letter %d page %d %s line %d has Text mismatch: %q != %q", letter, page, where, lineIdx, line.Text, textFromTokens.String())
}
// The line text as a whole must already be trimmed.
if line.Text != "" {
if hasLeadingASCIISpace(line.Text) {
t.Fatalf("letter %d page %d %s line %d has Text starting with whitespace: %q", letter, page, where, lineIdx, line.Text)
}
if hasTrailingASCIISpace(line.Text) {
t.Fatalf("letter %d page %d %s line %d has Text ending with whitespace: %q", letter, page, where, lineIdx, line.Text)
}
}
}
// isLineStartPosition reports whether every token before index idx in line
// is a synthetic start element, i.e. whether idx is still at the logical
// start of the line.
func isLineStartPosition(line Line, idx int) bool {
	for i := 0; i < idx; i++ {
		if tok := line.Tokens[i]; tok.Type != StartElement || !tok.Synth {
			return false
		}
	}
	return true
}
// assertCarryPair fails the test unless the synthetic end tokens of prev,
// read in reverse order, equal the synthetic start-token prefix of next.
// If prev has no synthetic closers, next must not reopen anything.
// letter, page, where and lineIdx only label the failure message.
func assertCarryPair(t *testing.T, letter, page int, where string, lineIdx int, prev, next Line) {
	t.Helper()

	closed := syntheticClosedNames(prev)
	reopened := syntheticReopenedPrefixNames(next)

	if len(closed) > 0 {
		// Closers are emitted innermost-first; reopeners outermost-first.
		slices.Reverse(closed)
		if !slices.Equal(closed, reopened) {
			t.Fatalf("letter %d page %d %s line %d->%d synthetic carry mismatch: closed=%v reopened=%v", letter, page, where, lineIdx, lineIdx+1, closed, reopened)
		}
		return
	}

	if len(reopened) != 0 {
		t.Fatalf("letter %d page %d %s line %d->%d reopens %d tags with no synthetic closes in previous line", letter, page, where, lineIdx, lineIdx+1, len(reopened))
	}
}
// syntheticClosedNames returns the names of all synthetic end tokens of
// line, in token order. The result is nil when there are none.
func syntheticClosedNames(line Line) []string {
	var names []string
	for i := range line.Tokens {
		if tok := line.Tokens[i]; tok.Synth && tok.Type == EndElement {
			names = append(names, tok.Name)
		}
	}
	return names
}
// syntheticReopenedPrefixNames returns the names of the synthetic start
// tokens that form the leading prefix of line. Collection stops at the first
// token that is not a synthetic start element.
func syntheticReopenedPrefixNames(line Line) []string {
	var names []string
	for _, tok := range line.Tokens {
		if tok.Type != StartElement || !tok.Synth {
			break
		}
		names = append(names, tok.Name)
	}
	return names
}
// linePairHasValidSyntheticCarry reports whether prev closes at least one
// tag synthetically and next reopens those tags, in reverse closing order,
// at the beginning of its synthetic start-token prefix.
func linePairHasValidSyntheticCarry(prev, next Line) bool {
	closed := syntheticClosedNames(prev)
	if len(closed) == 0 {
		return false
	}
	slices.Reverse(closed)

	reopened := syntheticReopenedPrefixNames(next)
	if len(reopened) < len(closed) {
		return false
	}
	return slices.Equal(closed, reopened[:len(closed)])
}

282
xmlmodels/textparse.go Normal file
View File

@@ -0,0 +1,282 @@
package xmlmodels
import (
"encoding/xml"
"strings"
)
// TokenType identifies the kind of a Token stored in a Line.
type TokenType int
const (
// StartElement marks an opening tag.
StartElement TokenType = iota
// EndElement marks a closing tag.
EndElement
// CharData marks a run of text.
CharData
)
// LineType classifies how a Line was started.
type LineType int

const (
	// Continuation is a line without indent or type="break" marker, or a
	// line started implicitly after the first one.
	Continuation LineType = iota
	// First is the implicit first line of a letter text or sidenote.
	First
	// Semantic is a line with Indent=0 and type="break".
	Semantic
	// Indent is a line with Indent>0; the type attribute does not matter.
	Indent
	// Empty is a line without content that stands for an empty line.
	Empty
)

// Fist is a backward-compatible alias for the historical typo of First.
//
// It is declared outside the iota block on purpose: inside the block the
// constants that follow it would implicitly repeat "= First" and all end up
// equal to First.
//
// Deprecated: use First instead.
const Fist = First
// Token is one element of a Line: an opening tag, a closing tag or text.
type Token struct {
// Type selects which of the remaining fields are meaningful.
Type TokenType
// Name is the local element name of a start or end token.
Name string
// Attrs holds the attributes of a start token, keyed by local name.
Attrs map[string]string
// Value is the text of a CharData token.
Value string
// INFO: true means synthetic token without corresponding XML token.
Synth bool
}
// Line is one closed unit of text together with its markup tokens.
type Line struct {
// Type records how the line was started.
Type LineType
// Indent is the indent passed when the line was started; 0 for implicit lines.
Indent int
// Text is the concatenation of the values of all CharData tokens.
Text string
// Tokens is the token sequence of the line, including synthetic tokens.
Tokens []Token
}
// Page groups the lines and sidenotes that belong to one page of a letter.
type Page struct {
// Number is the page number used to look the page up.
Number int
// Lines are the text lines of the page in document order.
Lines []Line
// Sidenotes are the sidenotes attached to the page.
Sidenotes []Sidenote
}
// Sidenote is a note attached to a page, split into lines like the main text.
type Sidenote struct {
// Position is the value of the sidenote's "pos" attribute.
Position string
// Annotation is the value of the sidenote's "annotation" attribute.
Annotation string
// Lines are the text lines of the sidenote.
Lines []Line
}
// lineAccumulator collects XML tokens into Lines and hands each finished line
// to a callback. It remembers which elements are still open so that they can
// be closed synthetically at the end of a line and reopened on the next one.
type lineAccumulator struct {
// curLine is the line currently being built; nil when no line is open.
curLine *Line
// openStack holds the start tokens of the elements that are currently open.
openStack []Token
// implicitType is the type given to a line that is started implicitly.
implicitType LineType
// hasAnyLine is true once at least one line has been closed.
hasAnyLine bool
// appendLine receives every finished line.
appendLine func(Line)
// hasCharData is true once the current line holds a non-empty text token.
hasCharData bool
}
// newLineAccumulator returns an accumulator whose first implicitly started
// line gets the type implicitType and which passes every finished line to
// appendLine.
func newLineAccumulator(implicitType LineType, appendLine func(Line)) *lineAccumulator {
	acc := new(lineAccumulator)
	acc.implicitType = implicitType
	acc.appendLine = appendLine
	return acc
}
// setImplicitType sets the type that the next implicitly started line
// (see ensureLine) will receive.
func (a *lineAccumulator) setImplicitType(next LineType) {
	a.implicitType = next
}
// startLine opens a new current line of the given type and indent. For every
// element that is still open it prepends a synthetic start token (same name
// and attributes), outermost first, so the line forms a closed unit.
// Any previous current line is replaced without being emitted.
func (a *lineAccumulator) startLine(lt LineType, indent int) {
	line := &Line{Type: lt, Indent: indent}
	for i := range a.openStack {
		open := a.openStack[i]
		line.Tokens = append(line.Tokens, Token{
			Type:  StartElement,
			Name:  open.Name,
			Attrs: open.Attrs,
			Synth: true,
		})
	}
	a.curLine = line
	a.hasCharData = false
}
// ensureLine makes sure a current line exists. If none is open, it starts
// one with the implicit type and indent 0; an implicit First line is only
// produced once, after which the implicit type becomes Continuation.
func (a *lineAccumulator) ensureLine() {
	if a.curLine == nil {
		lt := a.implicitType
		a.startLine(lt, 0)
		if lt == First {
			a.implicitType = Continuation
		}
	}
}
// closeLine finishes the current line (creating an implicit one if none is
// open): trailing whitespace is trimmed, a synthetic end token is appended
// for every element still open (innermost first), Text is computed from the
// tokens, and the line is passed to the appendLine callback. The open-element
// stack is left untouched so the next line can reopen the same elements.
func (a *lineAccumulator) closeLine() {
	a.ensureLine()
	a.trimRightWhitespace()

	line := a.curLine
	for i := len(a.openStack); i > 0; i-- {
		line.Tokens = append(line.Tokens, Token{
			Type:  EndElement,
			Name:  a.openStack[i-1].Name,
			Synth: true,
		})
	}
	line.Text = lineTextFromTokens(line.Tokens)

	a.appendLine(*line)
	a.hasAnyLine = true
	a.curLine = nil
}
// handleLineMarker processes a <line> start element: the open line, if any,
// is closed; then either an Empty line is emitted immediately or a new line
// of the parsed type and indent is started. In both cases lines started
// implicitly afterwards are Continuation lines.
func (a *lineAccumulator) handleLineMarker(se xml.StartElement) {
	lt, indent, emitEmpty := parseLineMarker(se)

	if a.curLine != nil {
		a.closeLine()
	}

	if emitEmpty {
		a.startLine(Empty, 0)
		a.closeLine()
	} else {
		a.startLine(lt, indent)
	}
	a.implicitType = Continuation
}
// appendStart records an opening tag: the start token is appended to the
// current line (created implicitly if needed) and pushed onto the stack of
// open elements.
func (a *lineAccumulator) appendStart(name string, attrs map[string]string) {
	a.ensureLine()

	open := Token{
		Type:  StartElement,
		Name:  name,
		Attrs: attrs,
	}
	a.curLine.Tokens = append(a.curLine.Tokens, open)
	a.openStack = append(a.openStack, open)
}
// appendEnd records a closing tag: the end token is always appended to the
// current line (created implicitly if needed). The innermost open element
// with the same name, if any, is then removed from the stack of open
// elements; an unmatched name leaves the stack unchanged.
func (a *lineAccumulator) appendEnd(name string) {
	a.ensureLine()
	a.curLine.Tokens = append(a.curLine.Tokens, Token{Type: EndElement, Name: name})

	// Search from the top of the stack; the common case (top matches) is
	// simply the first iteration.
	for i := len(a.openStack) - 1; i >= 0; i-- {
		if a.openStack[i].Name != name {
			continue
		}
		a.openStack = append(a.openStack[:i], a.openStack[i+1:]...)
		return
	}
}
// appendText adds s as a CharData token to the current line (created
// implicitly if needed). Until the line has received its first text token,
// leading ASCII whitespace is stripped; text that is empty after that is
// dropped.
func (a *lineAccumulator) appendText(s string) {
	a.ensureLine()

	text := s
	if !a.hasCharData {
		text = trimLeftASCIISpace(text)
	}
	if len(text) == 0 {
		return
	}

	a.curLine.Tokens = append(a.curLine.Tokens, Token{Type: CharData, Value: text})
	a.hasCharData = true
}
// isAtLineStart reports whether nothing but synthetic start tokens has been
// added to the current line yet. It is also true when no line is open.
func (a *lineAccumulator) isAtLineStart() bool {
	if a.curLine == nil {
		return true
	}
	for i := range a.curLine.Tokens {
		tok := &a.curLine.Tokens[i]
		if !tok.Synth || tok.Type != StartElement {
			return false
		}
	}
	return true
}
// trimRightWhitespace strips trailing ASCII whitespace from the text of the
// current line. Walking backwards over the tokens, CharData tokens that
// consist only of whitespace are removed; the first CharData token with
// remaining content is right-trimmed and ends the walk. Element tokens are
// never touched.
func (a *lineAccumulator) trimRightWhitespace() {
	if a.curLine == nil {
		return
	}

	toks := a.curLine.Tokens
	for i := len(toks) - 1; i >= 0; i-- {
		if toks[i].Type != CharData {
			continue
		}
		trimmed := trimRightASCIISpace(toks[i].Value)
		if trimmed != "" {
			toks[i].Value = trimmed
			break
		}
		// Whitespace-only token: drop it. Everything behind index i has
		// already been visited and is not CharData, so moving on to i-1
		// is safe.
		toks = append(toks[:i], toks[i+1:]...)
	}
	a.curLine.Tokens = toks
}
// lineTextFromTokens concatenates the values of all CharData tokens in
// order; element tokens contribute nothing.
func lineTextFromTokens(tokens []Token) string {
	var text strings.Builder
	for i := range tokens {
		if tokens[i].Type != CharData {
			continue
		}
		text.WriteString(tokens[i].Value)
	}
	return text.String()
}
// parseBlockLines reads tokens from dec until the end element named
// endLocalName and returns the content split into lines.
//
// <line> start elements act as line markers, transparent wrappers are
// dropped (their content is kept), and all other elements are recorded as
// tokens. Whitespace-only character data is skipped at the start of a line
// and collapsed to a single space elsewhere. Any decoder error, including
// io.EOF before the end element, is returned with a nil slice.
func parseBlockLines(dec *xml.Decoder, endLocalName string) ([]Line, error) {
	out := make([]Line, 0, 8)
	acc := newLineAccumulator(First, func(l Line) {
		out = append(out, l)
	})

	for {
		raw, err := dec.Token()
		if err != nil {
			return nil, err
		}

		switch tok := raw.(type) {
		case xml.StartElement:
			name := tok.Name.Local
			switch {
			case name == "line":
				acc.handleLineMarker(tok)
			case isTransparentWrapper(name):
				// Wrapper token is ignored; its children are still parsed.
			default:
				acc.appendStart(name, attrsToMap(tok.Attr))
			}

		case xml.EndElement:
			name := tok.Name.Local
			switch {
			case isTransparentWrapper(name):
				// Ignored, see above.
			case name == endLocalName:
				if acc.curLine != nil {
					acc.closeLine()
				}
				return out, nil
			case name == "line":
				// End half of a self-closing line marker.
			default:
				acc.appendEnd(name)
			}

		case xml.CharData:
			text := string(tok)
			onlySpace := isOnlyASCIISpace(text)
			if onlySpace && acc.isAtLineStart() {
				continue
			}
			if onlySpace {
				text = " "
			}
			acc.appendText(text)
		}
	}
}