mirror of
https://github.com/Theodor-Springmann-Stiftung/lenz-web.git
synced 2025-10-28 16:55:32 +00:00
Simple SAX parser
This commit is contained in:
130
xml/parser.go
Normal file
130
xml/parser.go
Normal file
@@ -0,0 +1,130 @@
|
||||
package xmlparsing
|
||||
|
||||
import (
|
||||
"encoding/xml"
|
||||
"io"
|
||||
"iter"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// TokenType identifies which kind of XML token a Token carries.
type TokenType int

// The token kinds mirror the concrete token types of encoding/xml.
const (
	// StartElement marks an opening tag.
	StartElement TokenType = iota
	// EndElement marks a closing tag.
	EndElement
	// CharData marks character data between tags.
	CharData
	// Comment marks an XML comment.
	Comment
	// ProcInst marks a processing instruction.
	ProcInst
	// Directive marks a directive of the form <!text>.
	Directive
)
|
||||
|
||||
// Element describes one element that is currently open while parsing.
type Element struct {
	// Name is the local (namespace-less) name of the element.
	Name string
	// Attributes maps each attribute's local name to its value.
	Attributes map[string]string
	// CharData accumulates the element's direct text so far: every non-blank
	// text segment is trimmed and appended followed by a single space.
	CharData string
}
|
||||
|
||||
type Token struct {
|
||||
Name string
|
||||
Attributes map[string]string
|
||||
Inner xml.Token
|
||||
Type TokenType
|
||||
Data string
|
||||
}
|
||||
|
||||
type TokenResult[T any] struct {
|
||||
State T
|
||||
Token Token
|
||||
Stack []Element
|
||||
}
|
||||
|
||||
func Iterate[T any](xmlData string, initialState T) iter.Seq2[*TokenResult[T], error] {
|
||||
decoder := xml.NewDecoder(strings.NewReader(xmlData))
|
||||
stack := []Element{}
|
||||
state := initialState
|
||||
return iter.Seq2[*TokenResult[T], error](func(yield func(*TokenResult[T], error) bool) {
|
||||
for {
|
||||
token, err := decoder.Token()
|
||||
if err == io.EOF {
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
yield(nil, err)
|
||||
return
|
||||
}
|
||||
|
||||
var customToken Token
|
||||
switch t := token.(type) {
|
||||
case xml.StartElement:
|
||||
elem := Element{
|
||||
Name: t.Name.Local,
|
||||
Attributes: mapAttributes(t.Attr),
|
||||
CharData: "",
|
||||
}
|
||||
stack = append(stack, elem)
|
||||
customToken = Token{
|
||||
Name: t.Name.Local,
|
||||
Attributes: elem.Attributes,
|
||||
Inner: t,
|
||||
Type: StartElement,
|
||||
}
|
||||
case xml.EndElement:
|
||||
if len(stack) > 0 {
|
||||
stack = stack[:len(stack)-1]
|
||||
}
|
||||
customToken = Token{Name: t.Name.Local, Inner: t, Type: EndElement}
|
||||
case xml.CharData:
|
||||
text := strings.TrimSpace(string(t))
|
||||
if text != "" && len(stack) > 0 {
|
||||
stack[len(stack)-1].CharData += text + " "
|
||||
}
|
||||
customToken = Token{
|
||||
Name: "CharData",
|
||||
Inner: t,
|
||||
Data: text,
|
||||
Type: CharData,
|
||||
}
|
||||
case xml.Comment:
|
||||
customToken = Token{
|
||||
Name: "Comment",
|
||||
Inner: t,
|
||||
Data: string(t),
|
||||
Type: Comment,
|
||||
}
|
||||
case xml.ProcInst:
|
||||
customToken = Token{
|
||||
Name: t.Target,
|
||||
Inner: t,
|
||||
Data: string(t.Inst),
|
||||
Type: ProcInst,
|
||||
}
|
||||
case xml.Directive:
|
||||
customToken = Token{
|
||||
Name: "Directive",
|
||||
Inner: t,
|
||||
Data: string(t),
|
||||
Type: Directive,
|
||||
}
|
||||
}
|
||||
|
||||
result := &TokenResult[T]{
|
||||
State: state,
|
||||
Token: customToken,
|
||||
Stack: append([]Element{}, stack...),
|
||||
}
|
||||
|
||||
if !yield(result, nil) {
|
||||
return
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// mapAttributes flattens attrs into a map keyed by each attribute's local
// name. The namespace part of the name is dropped, so when two attributes
// share a local name the one that appears later in attrs wins. The result is
// never nil, even when attrs is nil or empty.
func mapAttributes(attrs []xml.Attr) map[string]string {
	// The number of attributes is an upper bound on the number of keys.
	attrMap := make(map[string]string, len(attrs))
	for _, attr := range attrs {
		attrMap[attr.Name.Local] = attr.Value
	}
	return attrMap
}
|
||||
97
xml/parser_test.go
Normal file
97
xml/parser_test.go
Normal file
@@ -0,0 +1,97 @@
|
||||
package xmlparsing
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestState is the state value the tests pass to Iterate.
type TestState struct {
	// ParsedElements collects the token names seen by a test.
	ParsedElements []string
}
|
||||
|
||||
func TestIterate_ValidXML(t *testing.T) {
|
||||
xmlData := `<root>
|
||||
<child attr="value">Text</child>
|
||||
<!-- This is a comment -->
|
||||
<?xml-stylesheet type="text/css" href="style.css"?>
|
||||
<!DOCTYPE note>
|
||||
</root>`
|
||||
|
||||
state := TestState{}
|
||||
for tokenResult, err := range Iterate(xmlData, state) {
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error: %v", err)
|
||||
}
|
||||
if tokenResult == nil {
|
||||
t.Fatal("Received nil token result")
|
||||
}
|
||||
state.ParsedElements = append(state.ParsedElements, tokenResult.Token.Name)
|
||||
}
|
||||
|
||||
if len(state.ParsedElements) == 0 {
|
||||
t.Fatal("No elements were parsed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIterate_InvalidXML(t *testing.T) {
|
||||
xmlData := `<root><child></root>`
|
||||
state := TestState{}
|
||||
var global error
|
||||
for _, err := range Iterate(xmlData, state) {
|
||||
if err != nil {
|
||||
global = err
|
||||
}
|
||||
}
|
||||
if global == nil {
|
||||
t.Fatal("Expected error, but got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIterate_EmptyXML(t *testing.T) {
|
||||
xmlData := ""
|
||||
state := TestState{}
|
||||
for _, err := range Iterate(xmlData, state) {
|
||||
if err != nil {
|
||||
t.Fatalf("Expected iter.ErrEnd, but got: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestIterate_CharDataTracking(t *testing.T) {
|
||||
xmlData := `<root>
|
||||
<child>First</child>
|
||||
<child>Second</child>
|
||||
</root>`
|
||||
|
||||
state := TestState{}
|
||||
charDataCount := 0
|
||||
for tokenResult, err := range Iterate(xmlData, state) {
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error: %v", err)
|
||||
}
|
||||
if tokenResult.Token.Name == "CharData" {
|
||||
charDataCount++
|
||||
}
|
||||
}
|
||||
|
||||
if charDataCount != 5 {
|
||||
t.Fatalf("Expected 2 CharData elements, got %d", charDataCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIterate_AttributeParsing(t *testing.T) {
|
||||
xmlData := `<root>
|
||||
<child attr1="value1" attr2="value2">Content</child>
|
||||
</root>`
|
||||
|
||||
state := TestState{}
|
||||
for tokenResult, err := range Iterate(xmlData, state) {
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error: %v", err)
|
||||
}
|
||||
if tokenResult.Token.Name == "child" && tokenResult.Token.Type == StartElement {
|
||||
if tokenResult.Token.Attributes["attr1"] != "value1" || tokenResult.Token.Attributes["attr2"] != "value2" {
|
||||
t.Fatalf("Incorrect attributes parsed: %v", tokenResult.Token.Attributes)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user