mirror of
https://github.com/Theodor-Springmann-Stiftung/lenz-web.git
synced 2025-10-29 09:15:33 +00:00
361 lines
8.2 KiB
Go
361 lines
8.2 KiB
Go
package xmlmodels
|
|
|
|
import (
|
|
"encoding/xml"
|
|
"iter"
|
|
"strings"
|
|
)
|
|
|
|
// Token wraps xml.Token with additional parsing context
|
|
type Token struct {
|
|
Index int // Position in token array
|
|
Stack []string // Element names in the stack at this token
|
|
Attributes map[string]string // Attributes for StartElement tokens
|
|
}
|
|
|
|
// NewTokenFromXMLToken creates a Token from xml.Token with context
|
|
func NewTokenFromXMLToken(xmlToken xml.Token, stack []string, index int) Token {
|
|
token := Token{
|
|
Index: index,
|
|
Stack: make([]string, len(stack)),
|
|
Attributes: make(map[string]string),
|
|
}
|
|
|
|
copy(token.Stack, stack)
|
|
|
|
// Extract attributes if this is a StartElement
|
|
if startElement, ok := xmlToken.(xml.StartElement); ok {
|
|
for _, attr := range startElement.Attr {
|
|
token.Attributes[attr.Name.Local] = attr.Value
|
|
}
|
|
}
|
|
|
|
return token
|
|
}
|
|
|
|
type LetterTokenType int
|
|
|
|
const (
|
|
LetterStartElement LetterTokenType = iota
|
|
LetterEndElement
|
|
LetterCharData
|
|
LetterComment
|
|
LetterProcInst
|
|
LetterDirective
|
|
)
|
|
|
|
// LetterToken wraps xml.Token with additional context for Letter/Page parsing
|
|
type LetterToken struct {
|
|
Name string
|
|
Attributes map[string]string
|
|
Inner xml.Token
|
|
Type LetterTokenType
|
|
Data string
|
|
Stack []*LetterToken
|
|
Index int
|
|
PageIndex int // Which page this token belongs to
|
|
Letter int // Which letter this token belongs to
|
|
|
|
// Navigation fields
|
|
charData string
|
|
children []*LetterToken
|
|
childrenParsed bool
|
|
chardataParsed bool
|
|
parser *LetterParser
|
|
}
|
|
|
|
// LetterParser wraps a slice of LetterTokens with navigation capabilities
|
|
type LetterParser struct {
|
|
Stack []*LetterToken
|
|
pipeline []*LetterToken
|
|
letter int
|
|
pageMap map[int]int // Maps page number to starting token index
|
|
}
|
|
|
|
// NewLetterParser creates a parser from xml.Token slice
|
|
func NewLetterParser(tokens []xml.Token, letter int, pageIndex int) *LetterParser {
|
|
parser := &LetterParser{
|
|
Stack: make([]*LetterToken, 0),
|
|
letter: letter,
|
|
pageMap: make(map[int]int),
|
|
}
|
|
|
|
stack := make([]*LetterToken, 0)
|
|
|
|
for i, token := range tokens {
|
|
letterToken := &LetterToken{
|
|
Inner: xml.CopyToken(token),
|
|
Index: i,
|
|
PageIndex: pageIndex,
|
|
Letter: letter,
|
|
Stack: make([]*LetterToken, len(stack)),
|
|
parser: parser,
|
|
}
|
|
|
|
// Copy current stack
|
|
copy(letterToken.Stack, stack)
|
|
|
|
switch t := token.(type) {
|
|
case xml.StartElement:
|
|
letterToken.Name = t.Name.Local
|
|
letterToken.Attributes = mapXMLAttributes(t.Attr)
|
|
letterToken.Type = LetterStartElement
|
|
|
|
// Add to parent's children if not parsed yet
|
|
if len(stack) > 0 && !stack[len(stack)-1].childrenParsed {
|
|
stack[len(stack)-1].children = append(stack[len(stack)-1].children, letterToken)
|
|
}
|
|
stack = append(stack, letterToken)
|
|
|
|
case xml.EndElement:
|
|
if len(stack) > 0 {
|
|
element := stack[len(stack)-1]
|
|
element.childrenParsed = true
|
|
element.chardataParsed = true
|
|
stack = stack[:len(stack)-1]
|
|
}
|
|
letterToken.Name = t.Name.Local
|
|
letterToken.Attributes = make(map[string]string)
|
|
letterToken.Type = LetterEndElement
|
|
|
|
case xml.CharData:
|
|
text := string(t)
|
|
if text != "" && len(stack) > 0 {
|
|
for i := range stack {
|
|
if !stack[i].chardataParsed {
|
|
stack[i].charData += text
|
|
}
|
|
}
|
|
}
|
|
letterToken.Data = text
|
|
letterToken.Type = LetterCharData
|
|
|
|
case xml.Comment:
|
|
letterToken.Type = LetterComment
|
|
letterToken.Data = string(t)
|
|
|
|
case xml.ProcInst:
|
|
letterToken.Name = t.Target
|
|
letterToken.Data = string(t.Inst)
|
|
letterToken.Type = LetterProcInst
|
|
|
|
case xml.Directive:
|
|
letterToken.Data = string(t)
|
|
letterToken.Type = LetterDirective
|
|
}
|
|
|
|
parser.pipeline = append(parser.pipeline, letterToken)
|
|
}
|
|
|
|
return parser
|
|
}
|
|
|
|
// GetStack returns current parsing stack
|
|
func (p *LetterParser) GetStack() []*LetterToken {
|
|
return p.Stack
|
|
}
|
|
|
|
// Pipeline returns all tokens
|
|
func (p *LetterParser) Pipeline() []*LetterToken {
|
|
return p.pipeline
|
|
}
|
|
|
|
// TokenAt returns token at specific index
|
|
func (p *LetterParser) TokenAt(index int) *LetterToken {
|
|
if index < 0 || index >= len(p.pipeline) {
|
|
return nil
|
|
}
|
|
return p.pipeline[index]
|
|
}
|
|
|
|
// IterateFrom creates iterator starting from specific index
|
|
func (p *LetterParser) IterateFrom(index int) iter.Seq2[*LetterToken, error] {
|
|
return func(yield func(*LetterToken, error) bool) {
|
|
for i := index; i < len(p.pipeline); i++ {
|
|
if !yield(p.pipeline[i], nil) {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Iterate over all tokens
|
|
func (p *LetterParser) Iterate() iter.Seq2[*LetterToken, error] {
|
|
return p.IterateFrom(0)
|
|
}
|
|
|
|
// Previous returns tokens before given index
|
|
func (p *LetterParser) Previous(index int) []*LetterToken {
|
|
if index <= 0 || index > len(p.pipeline) {
|
|
return nil
|
|
}
|
|
return p.pipeline[:index]
|
|
}
|
|
|
|
// LetterToken methods
|
|
|
|
// String returns string representation
|
|
func (t *LetterToken) String() string {
|
|
builder := strings.Builder{}
|
|
switch t.Type {
|
|
case LetterStartElement:
|
|
builder.WriteString("<" + t.Name)
|
|
for k, v := range t.Attributes {
|
|
builder.WriteString(" " + k + `="` + v + `"`)
|
|
}
|
|
builder.WriteString(">")
|
|
case LetterEndElement:
|
|
builder.WriteString("</" + t.Name + ">")
|
|
case LetterCharData:
|
|
builder.WriteString(t.Data)
|
|
case LetterComment:
|
|
builder.WriteString("<!--" + t.Data + "-->")
|
|
}
|
|
return builder.String()
|
|
}
|
|
|
|
// Element returns all tokens from start to matching end element
|
|
func (t *LetterToken) Element() []*LetterToken {
|
|
if t.Type != LetterStartElement {
|
|
return nil
|
|
}
|
|
|
|
var tokens []*LetterToken
|
|
depth := 0
|
|
|
|
for token, _ := range t.parser.IterateFrom(t.Index) {
|
|
tokens = append(tokens, token)
|
|
|
|
if token.Type == LetterStartElement && token.Name == t.Name {
|
|
depth++
|
|
} else if token.Type == LetterEndElement && token.Name == t.Name {
|
|
depth--
|
|
if depth == 0 {
|
|
return tokens
|
|
}
|
|
}
|
|
}
|
|
|
|
return tokens
|
|
}
|
|
|
|
// Children returns direct child elements
|
|
func (t *LetterToken) Children() []*LetterToken {
|
|
if t.childrenParsed {
|
|
return t.children
|
|
}
|
|
|
|
if t.Type != LetterStartElement {
|
|
return nil
|
|
}
|
|
|
|
element := t.Element()
|
|
if len(element) <= 1 {
|
|
return nil
|
|
}
|
|
|
|
// Skip first (self) and find direct children
|
|
depth := 0
|
|
for _, token := range element[1:] { // Skip self
|
|
if token.Type == LetterStartElement {
|
|
if depth == 0 {
|
|
t.children = append(t.children, token)
|
|
}
|
|
depth++
|
|
} else if token.Type == LetterEndElement {
|
|
depth--
|
|
}
|
|
}
|
|
|
|
t.childrenParsed = true
|
|
return t.children
|
|
}
|
|
|
|
// CharData returns character data content
|
|
func (t *LetterToken) CharData() string {
|
|
if t.Type == LetterCharData || t.Type == LetterComment {
|
|
return t.Data
|
|
}
|
|
|
|
if t.chardataParsed {
|
|
return t.charData
|
|
}
|
|
|
|
if t.Type != LetterStartElement {
|
|
return ""
|
|
}
|
|
|
|
element := t.Element()
|
|
if len(element) == 0 {
|
|
return ""
|
|
}
|
|
|
|
var builder strings.Builder
|
|
for _, token := range element {
|
|
if token.Type == LetterCharData {
|
|
builder.WriteString(token.Data)
|
|
}
|
|
}
|
|
|
|
t.chardataParsed = true
|
|
t.charData = builder.String()
|
|
return t.charData
|
|
}
|
|
|
|
// Next returns iterator from next token
|
|
func (t *LetterToken) Next() iter.Seq2[*LetterToken, error] {
|
|
return t.parser.IterateFrom(t.Index + 1)
|
|
}
|
|
|
|
// Previous returns tokens before this one
|
|
func (t *LetterToken) Previous() []*LetterToken {
|
|
return t.parser.Previous(t.Index)
|
|
}
|
|
|
|
// FindByName finds first child element with given name
|
|
func (t *LetterToken) FindByName(name string) *LetterToken {
|
|
for _, child := range t.Children() {
|
|
if child.Name == name {
|
|
return child
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// FindAllByName finds all child elements with given name
|
|
func (t *LetterToken) FindAllByName(name string) []*LetterToken {
|
|
var result []*LetterToken
|
|
for _, child := range t.Children() {
|
|
if child.Name == name {
|
|
result = append(result, child)
|
|
}
|
|
}
|
|
return result
|
|
}
|
|
|
|
// GetAttribute returns attribute value
|
|
func (t *LetterToken) GetAttribute(name string) string {
|
|
if t.Attributes == nil {
|
|
return ""
|
|
}
|
|
return t.Attributes[name]
|
|
}
|
|
|
|
// GetStackDepth returns current nesting depth
|
|
func (t *LetterToken) GetStackDepth() int {
|
|
return len(t.Stack)
|
|
}
|
|
|
|
// InPage checks if token belongs to specific page
|
|
func (t *LetterToken) InPage(pageNo int) bool {
|
|
return t.PageIndex == pageNo
|
|
}
|
|
|
|
// mapXMLAttributes converts xml.Attr to map[string]string
|
|
func mapXMLAttributes(attrs []xml.Attr) map[string]string {
|
|
attrMap := make(map[string]string)
|
|
for _, attr := range attrs {
|
|
attrMap[attr.Name.Local] = attr.Value
|
|
}
|
|
return attrMap
|
|
} |