Files
lenz-web/xmlmodels/textparse.go
2026-02-20 14:53:05 +01:00

306 lines
5.7 KiB
Go

package xmlmodels
import (
"encoding/xml"
"strings"
)
// TokenType identifies what a Token represents: the start of an element, the
// end of an element, or a run of character data.
type TokenType int
// The TokenType values are numbered from zero in declaration order.
const (
// StartElement marks a token that opens an element.
StartElement TokenType = iota
// EndElement marks a token that closes an element.
EndElement
// CharData marks a token that carries text in its Value field.
CharData
)
// LineType classifies a Line.
type LineType int
// Numeric values: Continuation=0, First=1 (Fist is the same value),
// Semantic=3, Indent=4, Empty=5. The value 2 is unused because the alias line
// below occupies that iota index.
const (
// Continuation is the zero value of LineType.
Continuation LineType = iota
// First is the implicit type parseBlockLines starts its accumulator with.
First
// Fist equals First; prefer First in new code.
Fist = First // backward-compatible alias for the historical typo
Semantic LineType = iota // Indent=0, still type="break"
Indent // Indent>0, type doesn't matter
Empty // no line content; after that, an empty line
)
// Token is one entry of a Line's token list: an element start, an element end,
// or a run of character data.
type Token struct {
// Type tells which kind of token this is.
Type TokenType
// Name is the element's local name; it is set on StartElement and EndElement
// tokens.
Name string
// Attrs holds the element's attributes; it is set on StartElement tokens.
// Synthetic start tokens share the map of the element they re-open.
Attrs map[string]string
// Value is the text of a CharData token.
Value string
// INFO: true means synthetic token without corresponding XML token.
Synth bool
}
// Line is one finished output line together with the tokens it was built from.
type Line struct {
// Type is the line's classification.
Type LineType
// Indent is the indentation given when the line was started; implicitly
// started and Empty lines use 0.
Indent int
// AlignCtx is true when the line contains a start token (real or synthetic)
// named "align".
AlignCtx bool
// TabCtx is true when the line contains a start token (real or synthetic)
// named "tab".
TabCtx bool
// Text is the concatenation of the Value of every CharData token in Tokens.
Text string
// Tokens is the line's token sequence, including the synthetic start/end
// tokens added for elements that span line boundaries.
Tokens []Token
}
// Page groups lines and sidenotes under a page number.
// NOTE(review): the code that fills Page is not visible here; field meanings
// below are taken from the names only — confirm against the parser.
type Page struct {
// Number is the page number.
Number int
// Lines are the page's text lines.
Lines []Line
// Sidenotes are the sidenotes attached to the page.
Sidenotes []Sidenote
}
// Sidenote is a note with its own lines.
// NOTE(review): the code that fills Sidenote is not visible here; Position and
// Annotation presumably mirror XML attributes — confirm against the parser.
type Sidenote struct {
Position string
Annotation string
// Lines are the sidenote's text lines.
Lines []Line
}
// lineAccumulator builds Line values token by token and hands each finished
// line to a callback. Elements that are still open when a line ends are closed
// with synthetic end tokens and re-opened with synthetic start tokens on the
// next line.
type lineAccumulator struct {
// curLine is the line under construction, or nil when no line is open.
curLine *Line
// openStack lists the elements currently open, innermost last.
openStack []Token
// implicitType is the type given to a line that is started implicitly by
// ensureLine rather than by a line marker.
implicitType LineType
// hasAnyLine becomes true once closeLine has emitted a line.
hasAnyLine bool
// appendLine receives every finished line; closeLine calls it
// unconditionally, so it must not be nil.
appendLine func(Line)
// hasCharData reports whether the current line already holds a CharData
// token; until then appendText strips leading ASCII space.
hasCharData bool
}
// newLineAccumulator returns an accumulator with no open line and an empty
// element stack. implicitType is the type used for the first implicitly
// started line; appendLine is called with every finished line.
func newLineAccumulator(implicitType LineType, appendLine func(Line)) *lineAccumulator {
	acc := new(lineAccumulator)
	acc.implicitType = implicitType
	acc.appendLine = appendLine
	return acc
}
// setImplicitType overrides the line type that ensureLine will give to the
// next implicitly started line.
func (a *lineAccumulator) setImplicitType(lt LineType) {
a.implicitType = lt
}
// startLine replaces the current line with a fresh one of the given type and
// indent. Every element still on the open stack is re-opened on the new line
// with a synthetic start token, outermost first.
func (a *lineAccumulator) startLine(lt LineType, indent int) {
	line := &Line{Type: lt, Indent: indent}
	for i := range a.openStack {
		open := a.openStack[i]
		reopened := Token{
			Type:  StartElement,
			Name:  open.Name,
			Attrs: open.Attrs,
			Synth: true,
		}
		line.Tokens = append(line.Tokens, reopened)
	}
	a.curLine = line
	// A new line has seen no text yet, so leading whitespace is trimmed again.
	a.hasCharData = false
}
// ensureLine opens a line of the implicit type with indent 0 when none is
// open; otherwise it does nothing. After an implicit First or Continuation
// line has been opened, later implicit lines are Semantic.
func (a *lineAccumulator) ensureLine() {
	if a.curLine != nil {
		return
	}
	a.startLine(a.implicitType, 0)
	switch a.implicitType {
	case First, Continuation:
		a.implicitType = Semantic
	}
}
// closeLine finishes the current line and passes a copy to appendLine. If no
// line is open, an implicit one is opened first, so a line is always emitted.
// Trailing whitespace is trimmed, still-open elements are closed with
// synthetic end tokens (innermost first), and the context flags and Text are
// computed before the line is handed over.
func (a *lineAccumulator) closeLine() {
	// ensureLine returns immediately when a line is already open.
	a.ensureLine()
	a.trimRightWhitespace()

	line := a.curLine
	for depth := len(a.openStack); depth > 0; depth-- {
		line.Tokens = append(line.Tokens, Token{
			Type:  EndElement,
			Name:  a.openStack[depth-1].Name,
			Synth: true,
		})
	}

	a.applyContextFlags()
	line.Text = lineTextFromTokens(line.Tokens)
	a.appendLine(*line)

	a.hasAnyLine = true
	a.curLine = nil
}
// handleLineMarker processes a line-marker start element. Any open line is
// closed first. Depending on what parseLineMarker reports, either a single
// Empty line is emitted, or a new line with the parsed type and indent is
// opened. In both cases the implicit type becomes Semantic.
func (a *lineAccumulator) handleLineMarker(se xml.StartElement) {
	kind, indent, blank := parseLineMarker(se)

	if a.curLine != nil {
		a.closeLine()
	}

	if blank {
		a.startLine(Empty, 0)
		a.closeLine()
	} else {
		a.startLine(kind, indent)
	}
	a.implicitType = Semantic
}
// appendStart records a real (non-synthetic) start token on the current line,
// opening a line if necessary, and pushes the element onto the open stack.
func (a *lineAccumulator) appendStart(name string, attrs map[string]string) {
	a.ensureLine()

	// The same token value serves both as the line entry and the stack entry.
	opened := Token{Type: StartElement, Name: name, Attrs: attrs}
	a.curLine.Tokens = append(a.curLine.Tokens, opened)
	a.openStack = append(a.openStack, opened)
}
// appendEnd records a real end token on the current line, opening a line if
// necessary, and removes the innermost open element with that name from the
// open stack. The token is recorded even when no such element is open.
func (a *lineAccumulator) appendEnd(name string) {
	a.ensureLine()
	a.curLine.Tokens = append(a.curLine.Tokens, Token{Type: EndElement, Name: name})

	// Search from the innermost element outwards; the common case of a match
	// at the top of the stack is simply the first iteration.
	for depth := len(a.openStack) - 1; depth >= 0; depth-- {
		if a.openStack[depth].Name != name {
			continue
		}
		a.openStack = append(a.openStack[:depth], a.openStack[depth+1:]...)
		return
	}
}
// appendText adds s to the current line as a CharData token, opening a line if
// necessary. Until the line holds its first CharData token, leading ASCII
// space is removed from s; text that is empty after that is dropped.
func (a *lineAccumulator) appendText(s string) {
	a.ensureLine()

	text := s
	if !a.hasCharData {
		text = trimLeftASCIISpace(text)
	}
	if text == "" {
		return
	}

	a.hasCharData = true
	a.curLine.Tokens = append(a.curLine.Tokens, Token{Type: CharData, Value: text})
}
// isAtLineStart reports whether nothing real has been added to the current
// line yet: either no line is open, or the line holds only synthetic start
// tokens.
func (a *lineAccumulator) isAtLineStart() bool {
	if a.curLine == nil {
		return true
	}
	tokens := a.curLine.Tokens
	for i := range tokens {
		reopened := tokens[i].Type == StartElement && tokens[i].Synth
		if !reopened {
			return false
		}
	}
	return true
}
// trimRightWhitespace strips trailing ASCII space from the text at the end of
// the current line. Working backwards, CharData tokens that become empty are
// removed; the first one that keeps some text is shortened and ends the trim.
// Non-CharData tokens are left in place.
func (a *lineAccumulator) trimRightWhitespace() {
	line := a.curLine
	if line == nil {
		return
	}

	tokens := line.Tokens
	for idx := len(tokens) - 1; idx >= 0; idx-- {
		if tokens[idx].Type != CharData {
			continue
		}
		kept := trimRightASCIISpace(tokens[idx].Value)
		if kept != "" {
			tokens[idx].Value = kept
			break
		}
		// Everything above idx is non-CharData, so after the removal the next
		// candidate can only be below idx.
		tokens = append(tokens[:idx], tokens[idx+1:]...)
	}
	line.Tokens = tokens
}
// lineTextFromTokens returns the Value of every CharData token in tokens,
// concatenated in order. Other token types contribute nothing.
func lineTextFromTokens(tokens []Token) string {
	var text strings.Builder
	for i := range tokens {
		if tokens[i].Type != CharData {
			continue
		}
		text.WriteString(tokens[i].Value)
	}
	return text.String()
}
// applyContextFlags sets AlignCtx and TabCtx on the current line when it
// contains a start token (real or synthetic) named "align" or "tab". It does
// nothing when no line is open.
func (a *lineAccumulator) applyContextFlags() {
	line := a.curLine
	if line == nil {
		return
	}
	for i := range line.Tokens {
		tok := &line.Tokens[i]
		if tok.Type != StartElement {
			continue
		}
		if tok.Name == "align" {
			line.AlignCtx = true
		} else if tok.Name == "tab" {
			line.TabCtx = true
		}
		// Both flags only ever go from false to true, so stop once both are set.
		if line.AlignCtx && line.TabCtx {
			break
		}
	}
}
// parseBlockLines reads tokens from dec until it meets an end element whose
// local name equals endLocalName, and returns the lines accumulated up to that
// point (closing a still-open line first).
//
// "line" start elements act as line markers; their end elements are ignored.
// Start and end tags for which isTransparentWrapper reports true are skipped
// while their content is still processed. Every other element is recorded on
// the current line. Whitespace-only character data is dropped at the start of
// a line and replaced by a single space elsewhere.
//
// Any error from dec.Token is returned unchanged together with a nil slice.
//
// NOTE(review): the first end element named endLocalName ends the block, so a
// nested element of the same name would end it early — confirm that the
// schema rules this out.
func parseBlockLines(dec *xml.Decoder, endLocalName string) ([]Line, error) {
	lines := make([]Line, 0, 8)
	acc := newLineAccumulator(First, func(done Line) {
		lines = append(lines, done)
	})

	for {
		raw, err := dec.Token()
		if err != nil {
			return nil, err
		}

		switch t := raw.(type) {
		case xml.StartElement:
			switch local := t.Name.Local; {
			case local == "line":
				acc.handleLineMarker(t)
			case isTransparentWrapper(local):
				// Wrapper tags leave no trace in the token stream.
			default:
				acc.appendStart(local, attrsToMap(t.Attr))
			}

		case xml.EndElement:
			switch local := t.Name.Local; {
			case isTransparentWrapper(local):
				// Counterpart of the start tag skipped above.
			case local == endLocalName:
				if acc.curLine != nil {
					acc.closeLine()
				}
				return lines, nil
			case local == "line":
				// Line markers are handled entirely on their start tag.
			default:
				acc.appendEnd(local)
			}

		case xml.CharData:
			// string(t) copies the bytes, which the decoder may reuse on the
			// next Token call.
			text := string(t)
			switch {
			case !isOnlyASCIISpace(text):
				acc.appendText(text)
			case !acc.isAtLineStart():
				// Inter-token whitespace inside a line collapses to one space.
				acc.appendText(" ")
			}
		}
	}
}