Simple SAX parser

This commit is contained in:
Simon Martens
2025-03-19 21:36:17 +01:00
parent 5681f4f352
commit b692f22f12
2 changed files with 227 additions and 0 deletions

130
xml/parser.go Normal file
View File

@@ -0,0 +1,130 @@
package xmlparsing
import (
"encoding/xml"
"io"
"iter"
"strings"
)
// TokenType identifies which kind of XML token a Token represents.
type TokenType int

// The token kinds, numbered consecutively from zero in declaration order.
const (
	// StartElement is an opening tag.
	StartElement TokenType = iota
	// EndElement is a closing tag.
	EndElement
	// CharData is character data (text) between tags.
	CharData
	// Comment is an XML comment.
	Comment
	// ProcInst is a processing instruction.
	ProcInst
	// Directive is an XML directive.
	Directive
)
// Element describes one XML element that is currently open while a document
// is being walked.
type Element struct {
	// Name is the element's local name.
	Name string
	// Attributes holds the element's attribute values keyed by attribute name.
	Attributes map[string]string
	// CharData collects text encountered directly inside the element.
	CharData string
}
// Token is a simplified view of a single encoding/xml token.
type Token struct {
	// Name is the local element name for start and end tags, the target for
	// processing instructions, and a fixed label ("CharData", "Comment",
	// "Directive") for the remaining kinds.
	Name string
	// Attributes is populated for start tags only.
	Attributes map[string]string
	// Inner is the encoding/xml token this Token was derived from.
	Inner xml.Token
	// Type tells which kind of token this is.
	Type TokenType
	// Data is the textual payload of character data, comments, processing
	// instructions and directives; it is empty for start and end tags.
	Data string
}
// TokenResult is the value produced for every step of an Iterate traversal.
type TokenResult[T any] struct {
	// State is the caller-supplied state value handed to Iterate.
	State T
	// Token is the token that was just read.
	Token Token
	// Stack is a copy of the open elements, outermost first, taken after the
	// token was processed.
	Stack []Element
}
// Iterate tokenizes xmlData with encoding/xml and returns an iterator that
// yields one *TokenResult per XML token, in document order.
//
// Every result carries:
//   - State: the initialState value, passed through unchanged;
//   - Token: the current token in simplified form;
//   - Stack: a copy of the currently open elements, outermost first, taken
//     after the token has been applied (a start tag is already pushed, an end
//     tag is already popped).
//
// Non-blank character data is additionally appended, trimmed and followed by a
// single space, to the CharData of the innermost open element.
//
// Iteration ends silently at end of input. On a decoding error the iterator
// yields (nil, err) exactly once and then stops. The returned sequence may be
// ranged over more than once; each traversal parses xmlData from the start.
func Iterate[T any](xmlData string, initialState T) iter.Seq2[*TokenResult[T], error] {
	return func(yield func(*TokenResult[T], error) bool) {
		// The decoder and the element stack live inside the closure so that a
		// second traversal starts fresh instead of reading from an already
		// exhausted decoder.
		decoder := xml.NewDecoder(strings.NewReader(xmlData))
		var stack []Element

		for {
			token, err := decoder.Token()
			if err == io.EOF {
				return
			}
			if err != nil {
				yield(nil, err)
				return
			}

			var customToken Token
			switch t := token.(type) {
			case xml.StartElement:
				elem := Element{
					Name:       t.Name.Local,
					Attributes: mapAttributes(t.Attr),
				}
				stack = append(stack, elem)
				customToken = Token{
					Name:       t.Name.Local,
					Attributes: elem.Attributes,
					Inner:      t,
					Type:       StartElement,
				}
			case xml.EndElement:
				if len(stack) > 0 {
					stack = stack[:len(stack)-1]
				}
				customToken = Token{Name: t.Name.Local, Inner: t, Type: EndElement}
			case xml.CharData:
				text := strings.TrimSpace(string(t))
				if text != "" && len(stack) > 0 {
					stack[len(stack)-1].CharData += text + " "
				}
				customToken = Token{
					Name: "CharData",
					// The decoder reuses the bytes behind t on the next call
					// to Token, so Inner must hold its own copy.
					Inner: t.Copy(),
					Data:  text,
					Type:  CharData,
				}
			case xml.Comment:
				customToken = Token{
					Name:  "Comment",
					Inner: t.Copy(), // detach from the decoder's buffer
					Data:  string(t),
					Type:  Comment,
				}
			case xml.ProcInst:
				customToken = Token{
					Name:  t.Target,
					Inner: t.Copy(), // detach Inst from the decoder's buffer
					Data:  string(t.Inst),
					Type:  ProcInst,
				}
			case xml.Directive:
				customToken = Token{
					Name:  "Directive",
					Inner: t.Copy(), // detach from the decoder's buffer
					Data:  string(t),
					Type:  Directive,
				}
			}

			result := &TokenResult[T]{
				State: initialState,
				Token: customToken,
				// Snapshot the stack so later pushes and pops cannot alter a
				// result the caller has already received.
				Stack: append([]Element{}, stack...),
			}
			if !yield(result, nil) {
				return
			}
		}
	}
}
// mapAttributes flattens a list of XML attributes into a map from each
// attribute's local name to its value. Namespaces are ignored, so when two
// attributes share a local name the later one wins. The returned map is never
// nil, even when attrs is nil or empty.
func mapAttributes(attrs []xml.Attr) map[string]string {
	// Size the map up front; the attribute count is an upper bound on entries.
	attrMap := make(map[string]string, len(attrs))
	for _, attr := range attrs {
		attrMap[attr.Name.Local] = attr.Value
	}
	return attrMap
}

97
xml/parser_test.go Normal file
View File

@@ -0,0 +1,97 @@
package xmlparsing
import (
"testing"
)
// TestState is the state value the tests hand to Iterate.
type TestState struct {
	// ParsedElements accumulates the names of the tokens a test has seen.
	ParsedElements []string
}
// TestIterate_ValidXML walks a well-formed document containing an element, a
// comment, a processing instruction and a directive, and checks that every
// step succeeds with a non-nil result and that at least one token is seen.
func TestIterate_ValidXML(t *testing.T) {
	const doc = `<root>
<child attr="value">Text</child>
<!-- This is a comment -->
<?xml-stylesheet type="text/css" href="style.css"?>
<!DOCTYPE note>
</root>`

	seen := TestState{}
	for res, err := range Iterate(doc, seen) {
		switch {
		case err != nil:
			t.Fatalf("Unexpected error: %v", err)
		case res == nil:
			t.Fatal("Received nil token result")
		}
		seen.ParsedElements = append(seen.ParsedElements, res.Token.Name)
	}

	if len(seen.ParsedElements) == 0 {
		t.Fatal("No elements were parsed")
	}
}
func TestIterate_InvalidXML(t *testing.T) {
xmlData := `<root><child></root>`
state := TestState{}
var global error
for _, err := range Iterate(xmlData, state) {
if err != nil {
global = err
}
}
if global == nil {
t.Fatal("Expected error, but got nil")
}
}
// TestIterate_EmptyXML checks that empty input ends the iteration immediately:
// no error is reported and no token is yielded.
func TestIterate_EmptyXML(t *testing.T) {
	xmlData := ""
	state := TestState{}

	tokens := 0
	for _, err := range Iterate(xmlData, state) {
		if err != nil {
			t.Fatalf("Expected no error for empty input, but got: %v", err)
		}
		tokens++
	}

	if tokens != 0 {
		t.Fatalf("Expected no tokens for empty input, got %d", tokens)
	}
}
// TestIterate_CharDataTracking counts the character-data tokens produced for
// a small document. Whitespace-only runs between tags are reported as tokens
// too, so the two text nodes plus the three line breaks around the child
// elements give five tokens in total.
func TestIterate_CharDataTracking(t *testing.T) {
	xmlData := `<root>
<child>First</child>
<child>Second</child>
</root>`
	state := TestState{}

	charDataCount := 0
	for tokenResult, err := range Iterate(xmlData, state) {
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		// Match on Type rather than Name so that an element that happens to
		// be named "CharData" could never be miscounted.
		if tokenResult.Token.Type == CharData {
			charDataCount++
		}
	}

	if charDataCount != 5 {
		t.Fatalf("Expected 5 CharData tokens (2 text, 3 whitespace-only), got %d", charDataCount)
	}
}
// TestIterate_AttributeParsing checks that the attributes of a start tag are
// exposed on its token, and that the start tag is actually encountered so the
// test cannot pass without having inspected anything.
func TestIterate_AttributeParsing(t *testing.T) {
	xmlData := `<root>
<child attr1="value1" attr2="value2">Content</child>
</root>`
	state := TestState{}

	sawChild := false
	for tokenResult, err := range Iterate(xmlData, state) {
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		if tokenResult.Token.Name != "child" || tokenResult.Token.Type != StartElement {
			continue
		}
		sawChild = true
		attrs := tokenResult.Token.Attributes
		if attrs["attr1"] != "value1" || attrs["attr2"] != "value2" {
			t.Fatalf("Incorrect attributes parsed: %v", attrs)
		}
	}

	if !sawChild {
		t.Fatal("Start element <child> was never yielded")
	}
}