mirror of
https://github.com/Theodor-Springmann-Stiftung/lenz-web.git
synced 2026-03-21 13:55:30 +00:00
Neuer Parser
This commit is contained in:
106
xmlmodels/helpers.go
Normal file
106
xmlmodels/helpers.go
Normal file
@@ -0,0 +1,106 @@
|
||||
package xmlmodels
|
||||
|
||||
import (
|
||||
"encoding/xml"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// isASCIISpaceByte reports whether b is one of the four ASCII whitespace
// bytes this package treats as whitespace: space, tab, line feed or
// carriage return.
func isASCIISpaceByte(b byte) bool {
	switch b {
	case ' ', '\t', '\n', '\r':
		return true
	}
	return false
}
|
||||
|
||||
func trimLeftASCIISpace(s string) string {
|
||||
i := 0
|
||||
for i < len(s) && isASCIISpaceByte(s[i]) {
|
||||
i++
|
||||
}
|
||||
return s[i:]
|
||||
}
|
||||
|
||||
func trimRightASCIISpace(s string) string {
|
||||
i := len(s)
|
||||
for i > 0 && isASCIISpaceByte(s[i-1]) {
|
||||
i--
|
||||
}
|
||||
return s[:i]
|
||||
}
|
||||
|
||||
func trimASCIISpace(s string) string {
|
||||
return trimRightASCIISpace(trimLeftASCIISpace(s))
|
||||
}
|
||||
|
||||
func isOnlyASCIISpace(s string) bool {
|
||||
if len(s) == 0 {
|
||||
return true
|
||||
}
|
||||
for i := 0; i < len(s); i++ {
|
||||
if !isASCIISpaceByte(s[i]) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func hasLeadingASCIISpace(s string) bool {
|
||||
return len(s) > 0 && isASCIISpaceByte(s[0])
|
||||
}
|
||||
|
||||
func hasTrailingASCIISpace(s string) bool {
|
||||
return len(s) > 0 && isASCIISpaceByte(s[len(s)-1])
|
||||
}
|
||||
|
||||
// attrsToMap converts XML attributes into a map keyed by the attribute's
// local name (the namespace is ignored). It returns nil when attrs is empty.
// If several attributes share a local name, the last one wins.
func attrsToMap(attrs []xml.Attr) map[string]string {
	count := len(attrs)
	if count == 0 {
		return nil
	}
	byName := make(map[string]string, count)
	for i := range attrs {
		byName[attrs[i].Name.Local] = attrs[i].Value
	}
	return byName
}
|
||||
|
||||
// isInline reports whether name is an element treated as inline. The set is
// made up of the base markup elements, "note", a few special elements, and
// inline-block elements that are treated as inline for stack correctness.
func isInline(name string) bool {
	switch name {
	case "aq", "b", "del", "dul", "tul", "er", "gr", "hb", "ink", "it", "pe", "ru", "tl", "ul":
		// Base inline markup.
		return true
	case "note":
		return true
	case "fn", "nr", "subst", "insertion", "hand":
		// Special elements.
		return true
	case "align", "tab":
		// Inline-block elements, handled as inline so the open-element
		// stack stays correct.
		return true
	}
	return false
}
|
||||
|
||||
// isTransparentWrapper reports whether name is a wrapper element ("tabs" or
// "address") whose own start and end tags are ignored by the parsers.
//
// IMPORTANT: only the wrapper tokens themselves are dropped; the subtree
// inside the wrapper (notably inside "address") is NOT skipped.
func isTransparentWrapper(name string) bool {
	switch name {
	case "tabs", "address":
		return true
	default:
		return false
	}
}
|
||||
|
||||
func parseLineMarker(se xml.StartElement) (LineType, int, bool) {
|
||||
var (
|
||||
indent int
|
||||
typ string
|
||||
)
|
||||
for _, a := range se.Attr {
|
||||
switch a.Name.Local {
|
||||
case "tab":
|
||||
if n, err := strconv.Atoi(trimASCIISpace(a.Value)); err == nil && n > 0 {
|
||||
indent = n
|
||||
}
|
||||
case "type":
|
||||
typ = trimASCIISpace(a.Value)
|
||||
}
|
||||
}
|
||||
if typ == "empty" {
|
||||
return Empty, 0, true
|
||||
}
|
||||
if indent > 0 {
|
||||
return Indent, indent, false
|
||||
}
|
||||
if typ == "break" {
|
||||
return Semantic, 0, false
|
||||
}
|
||||
return Continuation, 0, false
|
||||
}
|
||||
@@ -3,14 +3,16 @@ package xmlmodels
|
||||
import (
|
||||
"encoding/json"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
type Letter struct {
|
||||
XMLName xml.Name `xml:"letterText"`
|
||||
Letter int `xml:"letter,attr"`
|
||||
Pages []Page `xml:"page"`
|
||||
Hands []RefElement `xml:"hand"`
|
||||
Inner string `xml:",innerxml"`
|
||||
XMLName xml.Name `xml:"letterText"`
|
||||
Letter int `xml:"letter,attr"`
|
||||
Hands []int `xml:"-"`
|
||||
Data []Page
|
||||
}
|
||||
|
||||
func (l Letter) Keys() []any {
|
||||
@@ -29,7 +31,192 @@ func (l Letter) String() string {
|
||||
return string(json)
|
||||
}
|
||||
|
||||
type Page struct {
|
||||
XMLName xml.Name `xml:"page"`
|
||||
Index int `xml:"index,attr"`
|
||||
// NOTE: parseSidenote und unten UnmarshalXML sind die beiden hauptsächlichen Kontexte, in denen Text gehalten wird.
|
||||
// Wir unterteilen Briefe in Brief - Seite - Zeilen und Sidenotes in Sidenote - Zeilen (weil eine Sidenote nicht über
|
||||
// mehrere Seiten gehen kann).
|
||||
|
||||
// NOTE: Zeilen sind geschlossene Einheiten, die auch als HTML einen selbstständigen Block bilden können. Dazu werden
|
||||
// in parseBlockLines synthetisch Elemente entweder am Anfang oder Ende der Zeile hinzugefügt, um einen offenen Stack
|
||||
// zu schließen oder den Stack der vorhergehenden Zeile wieder zu öffnen, weil die Auszeichnungen fortgehen.
|
||||
|
||||
// NOTE: Wichtige synthetische Tags:
|
||||
// - Am Beginn oder Ende einer Zeile, wenn der Kontext in der XML über die Zeilen geöffnet bleibt (Token.Synth = true)
|
||||
// - Am Beginn von letterText und Sidenote kann eine synthetische erste Zeile eingefügt sein (Line.Type = First)
|
||||
// - Am Beginn einer Seite kann eine Zeile eingefügt sein, wenn der Kontext beispielsweise eines offenen
|
||||
// Absatzes über die Seitengrenze fortgeführt wird (Line.Type = Continuation)
|
||||
|
||||
// NOTE: Whitespace-Handling
|
||||
// - Als Whitespace gilt hier nur ASCII-Whitespace, also TAB, LF, CR, SPACE. Alles andere kann semantisch bedeutsam sein.
|
||||
// - Am Anfang von letterText, Sidenote oder Page: alle Whitespace-Token werden ignoriert, bis Text kommt
|
||||
// - Am Anfang und Ende von Zeilen: alle Whitespace-Token werden ignoriert, bis Text bzw. die neue Zeile kommt.
|
||||
func parseSidenote(dec *xml.Decoder, se xml.StartElement) (Sidenote, int, error) {
|
||||
var sn Sidenote
|
||||
pageNum := 0
|
||||
|
||||
for _, a := range se.Attr {
|
||||
switch a.Name.Local {
|
||||
case "pos":
|
||||
sn.Position = a.Value
|
||||
case "annotation":
|
||||
sn.Annotation = a.Value
|
||||
case "page":
|
||||
if n, err := strconv.Atoi(trimASCIISpace(a.Value)); err == nil {
|
||||
pageNum = n
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lines, err := parseBlockLines(dec, "sidenote")
|
||||
if err != nil {
|
||||
return sn, pageNum, err
|
||||
}
|
||||
sn.Lines = lines
|
||||
return sn, pageNum, nil
|
||||
}
|
||||
|
||||
func (l *Letter) UnmarshalXML(dec *xml.Decoder, start xml.StartElement) error {
|
||||
// INFO: Brifnummer extrahieren, main Loop below
|
||||
for _, a := range start.Attr {
|
||||
if a.Name.Local == "letter" {
|
||||
n, err := strconv.Atoi(trimASCIISpace(a.Value))
|
||||
if err != nil {
|
||||
return fmt.Errorf("letterText@letter: %w", err)
|
||||
}
|
||||
l.Letter = n
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
var (
|
||||
pages []Page
|
||||
curPage *Page
|
||||
)
|
||||
|
||||
ensurePage := func(num int) *Page {
|
||||
for i := range pages {
|
||||
if pages[i].Number == num {
|
||||
return &pages[i]
|
||||
}
|
||||
}
|
||||
pages = append(pages, Page{Number: num})
|
||||
return &pages[len(pages)-1]
|
||||
}
|
||||
|
||||
acc := newLineAccumulator(First, func(line Line) {
|
||||
if curPage == nil {
|
||||
curPage = ensurePage(1)
|
||||
}
|
||||
curPage.Lines = append(curPage.Lines, line)
|
||||
})
|
||||
|
||||
handlePage := func(se xml.StartElement) error {
|
||||
idx := 1
|
||||
for _, a := range se.Attr {
|
||||
if a.Name.Local == "index" {
|
||||
n, err := strconv.Atoi(trimASCIISpace(a.Value))
|
||||
if err != nil {
|
||||
return fmt.Errorf("page@index: %w", err)
|
||||
}
|
||||
if n > 0 {
|
||||
idx = n
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if acc.curLine != nil {
|
||||
acc.closeLine()
|
||||
}
|
||||
curPage = ensurePage(idx)
|
||||
if acc.hasAnyLine {
|
||||
acc.setImplicitType(Continuation)
|
||||
} else {
|
||||
acc.setImplicitType(First)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// INFO: Main Loop
|
||||
for {
|
||||
tok, err := dec.Token()
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
switch t := tok.(type) {
|
||||
|
||||
case xml.StartElement:
|
||||
name := t.Name.Local
|
||||
|
||||
if isTransparentWrapper(name) {
|
||||
continue
|
||||
}
|
||||
|
||||
switch name {
|
||||
case "page":
|
||||
if err := handlePage(t); err != nil {
|
||||
return err
|
||||
}
|
||||
continue
|
||||
case "line":
|
||||
acc.handleLineMarker(t)
|
||||
continue
|
||||
case "sidenote":
|
||||
sn, pageNum, err := parseSidenote(dec, t)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if pageNum == 0 {
|
||||
if curPage != nil {
|
||||
pageNum = curPage.Number
|
||||
} else {
|
||||
pageNum = 1
|
||||
}
|
||||
}
|
||||
p := ensurePage(pageNum)
|
||||
p.Sidenotes = append(p.Sidenotes, sn)
|
||||
continue
|
||||
}
|
||||
|
||||
acc.appendStart(name, attrsToMap(t.Attr))
|
||||
|
||||
case xml.EndElement:
|
||||
name := t.Name.Local
|
||||
|
||||
if isTransparentWrapper(name) {
|
||||
continue
|
||||
}
|
||||
|
||||
// INFO: Exit-Bedingung
|
||||
if name == start.Name.Local {
|
||||
if acc.curLine != nil {
|
||||
acc.closeLine()
|
||||
}
|
||||
l.Data = pages
|
||||
return nil
|
||||
}
|
||||
|
||||
// INFO: Selbst-schließende tags werden vom Go-Parser expandiert, deswegen:
|
||||
if name == "page" || name == "line" {
|
||||
continue
|
||||
}
|
||||
|
||||
acc.appendEnd(name)
|
||||
|
||||
case xml.CharData:
|
||||
s := string([]byte(t))
|
||||
if isOnlyASCIISpace(s) {
|
||||
if acc.isAtLineStart() {
|
||||
continue
|
||||
}
|
||||
s = " "
|
||||
}
|
||||
acc.appendText(s)
|
||||
}
|
||||
}
|
||||
|
||||
l.Data = pages
|
||||
return nil
|
||||
}
|
||||
|
||||
278
xmlmodels/letter_examples_test.go
Normal file
278
xmlmodels/letter_examples_test.go
Normal file
@@ -0,0 +1,278 @@
|
||||
package xmlmodels
|
||||
|
||||
import (
|
||||
"encoding/xml"
|
||||
"os"
|
||||
"slices"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
type examplesRoot struct {
|
||||
Letters []Letter `xml:"letterText"`
|
||||
}
|
||||
|
||||
func loadExampleLetters(t *testing.T) []Letter {
|
||||
t.Helper()
|
||||
|
||||
paths := []string{"example.xml", "../example.xml"}
|
||||
var data []byte
|
||||
var err error
|
||||
for _, p := range paths {
|
||||
data, err = os.ReadFile(p)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("read example.xml: %v", err)
|
||||
}
|
||||
|
||||
var root examplesRoot
|
||||
if err := xml.Unmarshal(data, &root); err != nil {
|
||||
t.Fatalf("unmarshal example.xml: %v", err)
|
||||
}
|
||||
if len(root.Letters) == 0 {
|
||||
t.Fatalf("example.xml contained no letterText elements")
|
||||
}
|
||||
return root.Letters
|
||||
}
|
||||
|
||||
func TestLettersFromExampleXMLRespectLineInvariants(t *testing.T) {
|
||||
letters := loadExampleLetters(t)
|
||||
|
||||
var foundLetterFirst bool
|
||||
var foundSidenoteFirst bool
|
||||
var foundPageStartContinuation bool
|
||||
var foundSyntheticCarry bool
|
||||
|
||||
for _, letter := range letters {
|
||||
for pageIdx, page := range letter.Data {
|
||||
if len(page.Lines) > 0 && page.Lines[0].Type == First {
|
||||
foundLetterFirst = true
|
||||
}
|
||||
if pageIdx > 0 && len(page.Lines) > 0 && page.Lines[0].Type == Continuation {
|
||||
foundPageStartContinuation = true
|
||||
}
|
||||
|
||||
for i := 0; i+1 < len(page.Lines); i++ {
|
||||
if linePairHasValidSyntheticCarry(page.Lines[i], page.Lines[i+1]) {
|
||||
foundSyntheticCarry = true
|
||||
}
|
||||
}
|
||||
|
||||
for _, sn := range page.Sidenotes {
|
||||
if len(sn.Lines) > 0 && sn.Lines[0].Type == First {
|
||||
foundSidenoteFirst = true
|
||||
}
|
||||
for i := 0; i+1 < len(sn.Lines); i++ {
|
||||
if linePairHasValidSyntheticCarry(sn.Lines[i], sn.Lines[i+1]) {
|
||||
foundSyntheticCarry = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !foundLetterFirst {
|
||||
t.Fatalf("expected at least one letter page to start with synthetic First line")
|
||||
}
|
||||
if !foundSidenoteFirst {
|
||||
t.Fatalf("expected at least one sidenote to start with synthetic First line")
|
||||
}
|
||||
if !foundPageStartContinuation {
|
||||
t.Fatalf("expected at least one non-initial page to start with Continuation line")
|
||||
}
|
||||
if !foundSyntheticCarry {
|
||||
t.Fatalf("expected at least one synthetic close/reopen carry between consecutive lines")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLettersFromExampleXMLSyntheticContinuationsAreConsistent(t *testing.T) {
|
||||
letters := loadExampleLetters(t)
|
||||
|
||||
for _, letter := range letters {
|
||||
for pageIdx, page := range letter.Data {
|
||||
for lineIdx, line := range page.Lines {
|
||||
assertLineLocallyValid(t, letter.Letter, page.Number, "page", lineIdx, line)
|
||||
}
|
||||
|
||||
for i := 0; i+1 < len(page.Lines); i++ {
|
||||
assertCarryPair(
|
||||
t,
|
||||
letter.Letter,
|
||||
page.Number,
|
||||
"page",
|
||||
i,
|
||||
page.Lines[i],
|
||||
page.Lines[i+1],
|
||||
)
|
||||
}
|
||||
|
||||
if pageIdx > 0 && len(page.Lines) > 0 && len(letter.Data[pageIdx-1].Lines) > 0 {
|
||||
prevPage := letter.Data[pageIdx-1]
|
||||
assertCarryPair(
|
||||
t,
|
||||
letter.Letter,
|
||||
page.Number,
|
||||
"page-boundary",
|
||||
0,
|
||||
prevPage.Lines[len(prevPage.Lines)-1],
|
||||
page.Lines[0],
|
||||
)
|
||||
}
|
||||
|
||||
for _, sn := range page.Sidenotes {
|
||||
for lineIdx, line := range sn.Lines {
|
||||
assertLineLocallyValid(t, letter.Letter, page.Number, "sidenote", lineIdx, line)
|
||||
}
|
||||
for i := 0; i+1 < len(sn.Lines); i++ {
|
||||
assertCarryPair(
|
||||
t,
|
||||
letter.Letter,
|
||||
page.Number,
|
||||
"sidenote",
|
||||
i,
|
||||
sn.Lines[i],
|
||||
sn.Lines[i+1],
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func assertLineLocallyValid(t *testing.T, letter, page int, where string, lineIdx int, line Line) {
|
||||
t.Helper()
|
||||
|
||||
sawNonContToken := false
|
||||
for tokIdx, tok := range line.Tokens {
|
||||
if tok.Type == StartElement && tok.Synth {
|
||||
if sawNonContToken {
|
||||
t.Fatalf("letter %d page %d %s line %d has synthetic opener after non-prefix token at token %d", letter, page, where, lineIdx, tokIdx)
|
||||
}
|
||||
continue
|
||||
}
|
||||
sawNonContToken = true
|
||||
}
|
||||
|
||||
sawContCloser := false
|
||||
for tokIdx, tok := range line.Tokens {
|
||||
if tok.Type == EndElement && tok.Synth {
|
||||
sawContCloser = true
|
||||
continue
|
||||
}
|
||||
if sawContCloser {
|
||||
t.Fatalf("letter %d page %d %s line %d has token after synthetic closer at token %d", letter, page, where, lineIdx, tokIdx)
|
||||
}
|
||||
}
|
||||
|
||||
var stack []string
|
||||
var textFromTokens strings.Builder
|
||||
for tokIdx, tok := range line.Tokens {
|
||||
switch tok.Type {
|
||||
case StartElement:
|
||||
stack = append(stack, tok.Name)
|
||||
case EndElement:
|
||||
if len(stack) == 0 || stack[len(stack)-1] != tok.Name {
|
||||
t.Fatalf("letter %d page %d %s line %d has unbalanced end token %q at token %d", letter, page, where, lineIdx, tok.Name, tokIdx)
|
||||
}
|
||||
stack = stack[:len(stack)-1]
|
||||
case CharData:
|
||||
textFromTokens.WriteString(tok.Value)
|
||||
if isOnlyASCIISpace(tok.Value) {
|
||||
if isLineStartPosition(line, tokIdx) {
|
||||
t.Fatalf("letter %d page %d %s line %d contains leading whitespace-only chardata token at token %d", letter, page, where, lineIdx, tokIdx)
|
||||
}
|
||||
if tok.Value != " " {
|
||||
t.Fatalf("letter %d page %d %s line %d contains non-normalized whitespace token %q at token %d", letter, page, where, lineIdx, tok.Value, tokIdx)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(stack) != 0 {
|
||||
t.Fatalf("letter %d page %d %s line %d ended with %d unclosed tags", letter, page, where, lineIdx, len(stack))
|
||||
}
|
||||
if line.Text != textFromTokens.String() {
|
||||
t.Fatalf("letter %d page %d %s line %d has Text mismatch: %q != %q", letter, page, where, lineIdx, line.Text, textFromTokens.String())
|
||||
}
|
||||
if line.Text != "" {
|
||||
if hasLeadingASCIISpace(line.Text) {
|
||||
t.Fatalf("letter %d page %d %s line %d has Text starting with whitespace: %q", letter, page, where, lineIdx, line.Text)
|
||||
}
|
||||
if hasTrailingASCIISpace(line.Text) {
|
||||
t.Fatalf("letter %d page %d %s line %d has Text ending with whitespace: %q", letter, page, where, lineIdx, line.Text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func isLineStartPosition(line Line, idx int) bool {
|
||||
for i := 0; i < idx; i++ {
|
||||
tok := line.Tokens[i]
|
||||
if tok.Type == StartElement && tok.Synth {
|
||||
continue
|
||||
}
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func assertCarryPair(t *testing.T, letter, page int, where string, lineIdx int, prev, next Line) {
|
||||
t.Helper()
|
||||
|
||||
closed := syntheticClosedNames(prev)
|
||||
reopened := syntheticReopenedPrefixNames(next)
|
||||
|
||||
if len(closed) == 0 {
|
||||
if len(reopened) != 0 {
|
||||
t.Fatalf("letter %d page %d %s line %d->%d reopens %d tags with no synthetic closes in previous line", letter, page, where, lineIdx, lineIdx+1, len(reopened))
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
slices.Reverse(closed)
|
||||
if !slices.Equal(closed, reopened) {
|
||||
t.Fatalf("letter %d page %d %s line %d->%d synthetic carry mismatch: closed=%v reopened=%v", letter, page, where, lineIdx, lineIdx+1, closed, reopened)
|
||||
}
|
||||
}
|
||||
|
||||
func syntheticClosedNames(line Line) []string {
|
||||
var out []string
|
||||
for _, tok := range line.Tokens {
|
||||
if tok.Type == EndElement && tok.Synth {
|
||||
out = append(out, tok.Name)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func syntheticReopenedPrefixNames(line Line) []string {
|
||||
var out []string
|
||||
for _, tok := range line.Tokens {
|
||||
if tok.Type == StartElement && tok.Synth {
|
||||
out = append(out, tok.Name)
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func linePairHasValidSyntheticCarry(prev, next Line) bool {
|
||||
closed := syntheticClosedNames(prev)
|
||||
if len(closed) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
slices.Reverse(closed)
|
||||
reopened := syntheticReopenedPrefixNames(next)
|
||||
if len(reopened) < len(closed) {
|
||||
return false
|
||||
}
|
||||
for i := range closed {
|
||||
if reopened[i] != closed[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
282
xmlmodels/textparse.go
Normal file
282
xmlmodels/textparse.go
Normal file
@@ -0,0 +1,282 @@
|
||||
package xmlmodels
|
||||
|
||||
import (
|
||||
"encoding/xml"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// TokenType identifies the kind of a Token within a Line.
type TokenType int

// Token kinds. The numeric values follow declaration order starting at 0.
const (
	// StartElement marks an opening tag.
	StartElement TokenType = iota
	// EndElement marks a closing tag.
	EndElement
	// CharData marks a run of text.
	CharData
)
|
||||
|
||||
// LineType classifies a Line.
type LineType int

// Line types. Each constant has a distinct value.
//
// The alias Fist is declared outside this block on purpose: inside a const
// block a spec without an expression repeats the previous expression, so a
// "Fist = First" entry placed before Semantic would silently turn Semantic,
// Indent and Empty into further aliases of First.
const (
	// Continuation is a line without a type or indent of its own.
	Continuation LineType = iota
	// First is the line implicitly started at the beginning of a block.
	First
	// Semantic is a line marked type="break" with no indent.
	Semantic
	// Indent is a line with an indent greater than zero; its type attribute
	// does not matter.
	Indent
	// Empty is a line without content (type="empty").
	Empty
)

// Fist is a backward-compatible alias for First, kept for a historical typo.
//
// Deprecated: use First instead.
const Fist = First
|
||||
|
||||
type Token struct {
|
||||
Type TokenType
|
||||
Name string
|
||||
Attrs map[string]string
|
||||
Value string
|
||||
// INFO: true means synthetic token without corresponding XML token.
|
||||
Synth bool
|
||||
}
|
||||
|
||||
type Line struct {
|
||||
Type LineType
|
||||
Indent int
|
||||
Text string
|
||||
Tokens []Token
|
||||
}
|
||||
|
||||
type Page struct {
|
||||
Number int
|
||||
Lines []Line
|
||||
Sidenotes []Sidenote
|
||||
}
|
||||
|
||||
type Sidenote struct {
|
||||
Position string
|
||||
Annotation string
|
||||
Lines []Line
|
||||
}
|
||||
|
||||
type lineAccumulator struct {
|
||||
curLine *Line
|
||||
openStack []Token
|
||||
implicitType LineType
|
||||
hasAnyLine bool
|
||||
appendLine func(Line)
|
||||
hasCharData bool
|
||||
}
|
||||
|
||||
func newLineAccumulator(implicitType LineType, appendLine func(Line)) *lineAccumulator {
|
||||
return &lineAccumulator{
|
||||
implicitType: implicitType,
|
||||
appendLine: appendLine,
|
||||
}
|
||||
}
|
||||
|
||||
func (a *lineAccumulator) setImplicitType(lt LineType) {
|
||||
a.implicitType = lt
|
||||
}
|
||||
|
||||
func (a *lineAccumulator) startLine(lt LineType, indent int) {
|
||||
a.curLine = &Line{Type: lt, Indent: indent}
|
||||
a.hasCharData = false
|
||||
for _, st := range a.openStack {
|
||||
a.curLine.Tokens = append(a.curLine.Tokens, Token{
|
||||
Type: StartElement,
|
||||
Name: st.Name,
|
||||
Attrs: st.Attrs,
|
||||
Synth: true,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func (a *lineAccumulator) ensureLine() {
|
||||
if a.curLine != nil {
|
||||
return
|
||||
}
|
||||
a.startLine(a.implicitType, 0)
|
||||
if a.implicitType == First {
|
||||
a.implicitType = Continuation
|
||||
}
|
||||
}
|
||||
|
||||
func (a *lineAccumulator) closeLine() {
|
||||
if a.curLine == nil {
|
||||
a.ensureLine()
|
||||
}
|
||||
a.trimRightWhitespace()
|
||||
for i := len(a.openStack) - 1; i >= 0; i-- {
|
||||
a.curLine.Tokens = append(a.curLine.Tokens, Token{
|
||||
Type: EndElement,
|
||||
Name: a.openStack[i].Name,
|
||||
Synth: true,
|
||||
})
|
||||
}
|
||||
a.curLine.Text = lineTextFromTokens(a.curLine.Tokens)
|
||||
a.appendLine(*a.curLine)
|
||||
a.hasAnyLine = true
|
||||
a.curLine = nil
|
||||
}
|
||||
|
||||
func (a *lineAccumulator) handleLineMarker(se xml.StartElement) {
|
||||
lt, indent, emitEmpty := parseLineMarker(se)
|
||||
if a.curLine != nil {
|
||||
a.closeLine()
|
||||
}
|
||||
if emitEmpty {
|
||||
a.startLine(Empty, 0)
|
||||
a.closeLine()
|
||||
a.implicitType = Continuation
|
||||
return
|
||||
}
|
||||
a.startLine(lt, indent)
|
||||
a.implicitType = Continuation
|
||||
}
|
||||
|
||||
func (a *lineAccumulator) appendStart(name string, attrs map[string]string) {
|
||||
a.ensureLine()
|
||||
a.curLine.Tokens = append(a.curLine.Tokens, Token{
|
||||
Type: StartElement,
|
||||
Name: name,
|
||||
Attrs: attrs,
|
||||
})
|
||||
a.openStack = append(a.openStack, Token{
|
||||
Type: StartElement,
|
||||
Name: name,
|
||||
Attrs: attrs,
|
||||
})
|
||||
}
|
||||
|
||||
func (a *lineAccumulator) appendEnd(name string) {
|
||||
a.ensureLine()
|
||||
a.curLine.Tokens = append(a.curLine.Tokens, Token{
|
||||
Type: EndElement,
|
||||
Name: name,
|
||||
})
|
||||
if len(a.openStack) == 0 {
|
||||
return
|
||||
}
|
||||
if a.openStack[len(a.openStack)-1].Name == name {
|
||||
a.openStack = a.openStack[:len(a.openStack)-1]
|
||||
return
|
||||
}
|
||||
for i := len(a.openStack) - 1; i >= 0; i-- {
|
||||
if a.openStack[i].Name == name {
|
||||
a.openStack = append(a.openStack[:i], a.openStack[i+1:]...)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (a *lineAccumulator) appendText(s string) {
|
||||
a.ensureLine()
|
||||
if !a.hasCharData {
|
||||
s = trimLeftASCIISpace(s)
|
||||
}
|
||||
if s == "" {
|
||||
return
|
||||
}
|
||||
a.curLine.Tokens = append(a.curLine.Tokens, Token{
|
||||
Type: CharData,
|
||||
Value: s,
|
||||
})
|
||||
a.hasCharData = true
|
||||
}
|
||||
|
||||
func (a *lineAccumulator) isAtLineStart() bool {
|
||||
if a.curLine == nil {
|
||||
return true
|
||||
}
|
||||
for _, tok := range a.curLine.Tokens {
|
||||
if tok.Type == StartElement && tok.Synth {
|
||||
continue
|
||||
}
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (a *lineAccumulator) trimRightWhitespace() {
|
||||
if a.curLine == nil {
|
||||
return
|
||||
}
|
||||
toks := a.curLine.Tokens
|
||||
for {
|
||||
lastCharIdx := -1
|
||||
for i := len(toks) - 1; i >= 0; i-- {
|
||||
if toks[i].Type == CharData {
|
||||
lastCharIdx = i
|
||||
break
|
||||
}
|
||||
}
|
||||
if lastCharIdx < 0 {
|
||||
break
|
||||
}
|
||||
trimmed := trimRightASCIISpace(toks[lastCharIdx].Value)
|
||||
if trimmed == "" {
|
||||
toks = append(toks[:lastCharIdx], toks[lastCharIdx+1:]...)
|
||||
continue
|
||||
}
|
||||
toks[lastCharIdx].Value = trimmed
|
||||
break
|
||||
}
|
||||
a.curLine.Tokens = toks
|
||||
}
|
||||
|
||||
func lineTextFromTokens(tokens []Token) string {
|
||||
var b strings.Builder
|
||||
for _, tok := range tokens {
|
||||
if tok.Type == CharData {
|
||||
b.WriteString(tok.Value)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func parseBlockLines(dec *xml.Decoder, endLocalName string) ([]Line, error) {
|
||||
lines := make([]Line, 0, 8)
|
||||
acc := newLineAccumulator(First, func(line Line) {
|
||||
lines = append(lines, line)
|
||||
})
|
||||
|
||||
for {
|
||||
tok, err := dec.Token()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
switch t := tok.(type) {
|
||||
case xml.StartElement:
|
||||
name := t.Name.Local
|
||||
if name == "line" {
|
||||
acc.handleLineMarker(t)
|
||||
continue
|
||||
}
|
||||
if isTransparentWrapper(name) {
|
||||
continue
|
||||
}
|
||||
acc.appendStart(name, attrsToMap(t.Attr))
|
||||
|
||||
case xml.EndElement:
|
||||
name := t.Name.Local
|
||||
if isTransparentWrapper(name) {
|
||||
continue
|
||||
}
|
||||
if name == endLocalName {
|
||||
if acc.curLine != nil {
|
||||
acc.closeLine()
|
||||
}
|
||||
return lines, nil
|
||||
}
|
||||
if name == "line" {
|
||||
continue
|
||||
}
|
||||
acc.appendEnd(name)
|
||||
|
||||
case xml.CharData:
|
||||
s := string([]byte(t))
|
||||
if isOnlyASCIISpace(s) {
|
||||
if acc.isAtLineStart() {
|
||||
continue
|
||||
}
|
||||
s = " "
|
||||
}
|
||||
acc.appendText(s)
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user