diff --git a/xmlmodels/letter.go b/xmlmodels/letter.go
index 6a47a17..5709fc0 100644
--- a/xmlmodels/letter.go
+++ b/xmlmodels/letter.go
@@ -34,6 +34,7 @@ type Page struct {
Sidenotes []Sidenote
Hands []int
Tokens []xml.Token
+ TokenInfo []Token // Stack and index info for each token
}
type Sidenote struct {
@@ -44,6 +45,7 @@ type Sidenote struct {
Anchor int
Tokens []xml.Token
CharData string
+ TokenInfo []Token // Stack and index info for each token
}
func (l Letter) Keys() []any {
@@ -62,6 +64,15 @@ func (l Letter) String() string {
return string(json)
}
+func (l Letter) Hands() []int {
+ h := []int{} // non-nil, so the result is an empty slice (not nil) when no page has hands
+ // Hands flattens the Hands values of every page into one slice, in page order.
+ for _, page := range l.Pages {
+ h = append(h, page.Hands...)
+ }
+ return h
+}
+
type SidenotePosition uint8
func (sp *SidenotePosition) UnmarshalXMLAttr(attr xml.Attr) error {
@@ -108,6 +119,7 @@ func (lt *Letter) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
func (lt *Letter) parseTokens(d *xml.Decoder) error {
b := strings.Builder{}
var c_page *Page = nil
+ var stack []string // Track element stack
for {
token, err := d.Token()
@@ -170,19 +182,29 @@ func (lt *Letter) parseTokens(d *xml.Decoder) error {
default:
if c_page != nil {
c_page.Tokens = append(c_page.Tokens, tokenCopy)
+ token := NewTokenFromXMLToken(tokenCopy, stack, len(c_page.Tokens)-1)
+ c_page.TokenInfo = append(c_page.TokenInfo, token)
}
+ stack = append(stack, t.Name.Local)
}
case xml.CharData:
b.WriteString(string(t))
if c_page != nil {
c_page.Tokens = append(c_page.Tokens, tokenCopy)
+ token := NewTokenFromXMLToken(tokenCopy, stack, len(c_page.Tokens)-1)
+ c_page.TokenInfo = append(c_page.TokenInfo, token)
}
case xml.EndElement:
+ if len(stack) > 0 && stack[len(stack)-1] == t.Name.Local {
+ stack = stack[:len(stack)-1]
+ }
+
if t.Name.Local == "letterText" {
- if c_page != nil {
- c_page.Tokens = append(c_page.Tokens, tokenCopy)
+ // Don't add letterText end element to page tokens
+ // Only save page if it has actual content
+ if c_page != nil && len(c_page.Tokens) > 0 {
lt.Pages = append(lt.Pages, *c_page)
}
lt.CharData = b.String()
@@ -191,6 +213,8 @@ func (lt *Letter) parseTokens(d *xml.Decoder) error {
if c_page != nil {
c_page.Tokens = append(c_page.Tokens, tokenCopy)
+ token := NewTokenFromXMLToken(tokenCopy, stack, len(c_page.Tokens)-1)
+ c_page.TokenInfo = append(c_page.TokenInfo, token)
}
}
}
@@ -201,6 +225,7 @@ func (lt *Letter) parseTokens(d *xml.Decoder) error {
func (s *Sidenote) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
b := strings.Builder{}
s.XMLName = start.Name
+ var stack []string // Track element stack within sidenote
for _, attr := range start.Attr {
switch attr.Name.Local {
@@ -224,18 +249,35 @@ func (s *Sidenote) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
tokenCopy := xml.CopyToken(token)
switch t := tokenCopy.(type) {
+ case xml.StartElement:
+ s.Tokens = append(s.Tokens, tokenCopy)
+ token := NewTokenFromXMLToken(tokenCopy, stack, len(s.Tokens)-1)
+ s.TokenInfo = append(s.TokenInfo, token)
+ stack = append(stack, t.Name.Local)
+
case xml.CharData:
b.WriteString(string(t))
s.Tokens = append(s.Tokens, tokenCopy)
- // WARNING: this is a problem for sidenotes within sidenotes
+ token := NewTokenFromXMLToken(tokenCopy, stack, len(s.Tokens)-1)
+ s.TokenInfo = append(s.TokenInfo, token)
+
case xml.EndElement:
+ if len(stack) > 0 && stack[len(stack)-1] == t.Name.Local {
+ stack = stack[:len(stack)-1]
+ }
+
if t.Name.Local == start.Name.Local {
s.CharData = b.String()
return nil
}
s.Tokens = append(s.Tokens, tokenCopy)
+ token := NewTokenFromXMLToken(tokenCopy, stack, len(s.Tokens)-1)
+ s.TokenInfo = append(s.TokenInfo, token)
+
default:
s.Tokens = append(s.Tokens, tokenCopy)
+ token := NewTokenFromXMLToken(tokenCopy, stack, len(s.Tokens)-1)
+ s.TokenInfo = append(s.TokenInfo, token)
}
}
}
diff --git a/xmlmodels/letter_test.go b/xmlmodels/letter_test.go
index f18b0f7..58276e2 100644
--- a/xmlmodels/letter_test.go
+++ b/xmlmodels/letter_test.go
@@ -2,192 +2,230 @@ package xmlmodels
import (
"encoding/xml"
+ "io"
+ "os"
"strings"
"testing"
)
-// Helper function to convert []xml.Token back to string for testing
-func tokensToString(tokens []xml.Token) string {
- var sb strings.Builder
- for _, token := range tokens {
- switch t := token.(type) {
- case xml.StartElement:
- sb.WriteString("<")
- sb.WriteString(t.Name.Local)
- for _, attr := range t.Attr {
- sb.WriteString(" ")
- sb.WriteString(attr.Name.Local)
- sb.WriteString(`="`)
- sb.WriteString(attr.Value)
- sb.WriteString(`"`)
- }
- sb.WriteString(">")
- case xml.EndElement:
- sb.WriteString("")
- sb.WriteString(t.Name.Local)
- sb.WriteString(">")
- case xml.CharData:
- sb.Write(t)
- case xml.Comment:
- sb.WriteString("")
- case xml.ProcInst:
- sb.WriteString("")
- sb.WriteString(t.Target)
- if len(t.Inst) > 0 {
- sb.WriteString(" ")
- sb.Write(t.Inst)
- }
- sb.WriteString("?>")
- }
- }
- return sb.String()
-}
-
-func TestLetterTextUnmarshal_SimpleCase(t *testing.T) {
- // Simple test case with basic structure
- testXML := `
- Some content before first page break.
-
- Content on page 1 with some markup and more text.
- This is a sidenote
- More content on page 1.
-
- Content on page 2 with bold text.
- Hand reference content
- Final content on page 2.
- `
-
- var letterText LetterText
- err := xml.Unmarshal([]byte(testXML), &letterText)
- if err != nil {
- t.Fatalf("Error unmarshaling XML: %v", err)
- }
-
- // Verify basic structure
- if len(letterText.Pages) != 3 {
- t.Errorf("Expected 3 pages, got %d", len(letterText.Pages))
- }
- if len(letterText.PageBreaks) != 2 {
- t.Errorf("Expected 2 page breaks, got %d", len(letterText.PageBreaks))
- }
- if len(letterText.Sidenotes) != 1 {
- t.Errorf("Expected 1 sidenote, got %d", len(letterText.Sidenotes))
- }
- if letterText.Hands.Reference != 42 {
- t.Errorf("Expected hand reference 42, got %d", letterText.Hands.Reference)
- }
-
- // Verify page breaks
- if letterText.PageBreaks[0].Index != 1 {
- t.Errorf("Expected page break index 1, got %d", letterText.PageBreaks[0].Index)
- }
- if letterText.PageBreaks[1].Index != 2 {
- t.Errorf("Expected page break index 2, got %d", letterText.PageBreaks[1].Index)
- }
-
- // Verify sidenote
- sidenote := letterText.Sidenotes[0]
- if sidenote.Page != 1 {
- t.Errorf("Expected sidenote on page 1, got %d", sidenote.Page)
- }
- if sidenote.Position != SidenotePositionRight {
- t.Errorf("Expected sidenote position right, got %d", sidenote.Position)
- }
- if sidenote.Annotation != "test" {
- t.Errorf("Expected sidenote annotation 'test', got '%s'", sidenote.Annotation)
- }
- sidenoteContent := tokensToString(sidenote.Content)
- if !strings.Contains(sidenoteContent, "This is a sidenote") {
- t.Errorf("Expected sidenote content to contain 'This is a sidenote', got '%s'", sidenoteContent)
- }
-
- // Verify page content doesn't contain sidenote text
- for _, page := range letterText.Pages {
- content := tokensToString(page.Content)
- if strings.Contains(content, "This is a sidenote") {
- t.Errorf("Page content should not contain sidenote text, but page %d does: %s", page.Page, content)
- }
- }
-}
-
-func TestLetterTextUnmarshal_RealExample_Letter1(t *testing.T) {
- // Real example from briefe.xml - Letter 1 (simplified)
- testXML := `
+// Test data from real briefe.xml
+const testLetter1 = `
HochEdelgeborner Hochgelahrter Herr Secretair
Verehrungswürdigster Gönner!
-Ew. HochEdelgebh: haben mich durch die neue Probe von Dero schätzbaren Gewogenheit ausserorndtlich beschämt. Meine Feder ist zu schwach, Denenselben die regen Empfindungen meines Herzens darüber zu schildern.
-lasse mich noch lange das Glück genießen, Dieselben in dem blühendsten Wohlstande zu sehen, und mich mit dem erkenntlichsten Herzen nennen zu dürfen
-
-Hoch Edelgeborner Hochgelahrter Herr Secretair
-Verehrungswürdigster Gönner
-Ew. HochEdelgebh:
-Von Hause, d. 2 Jenner, 1765.
-gehorsamsten Diener
+Ew. HochEdelgebh: haben mich durch die neue Probe von Dero schätzbaren Gewogenheit ausserorndtlich beschämt.
+lasse mich noch lange das Glück genießen, Dieselben in dem blühendsten Wohlstande zu sehen.
+gehorsamsten Diener
Jacob Michael Reinhold Lenz
`
- var letterText LetterText
- err := xml.Unmarshal([]byte(testXML), &letterText)
- if err != nil {
- t.Fatalf("Error unmarshaling real XML: %v", err)
- }
-
- // Should have 2 pages
- if len(letterText.Pages) != 2 {
- t.Errorf("Expected 2 pages, got %d", len(letterText.Pages))
- }
- if len(letterText.PageBreaks) != 2 {
- t.Errorf("Expected 2 page breaks, got %d", len(letterText.PageBreaks))
- }
-
- // Verify page content contains expected elements
- page1Found := false
- page2Found := false
- for _, page := range letterText.Pages {
- content := tokensToString(page.Content)
- if page.Page == 1 && strings.Contains(content, "HochEdelgeborner") {
- page1Found = true
- }
- if page.Page == 2 && strings.Contains(content, "Jacob Michael Reinhold Lenz") {
- page2Found = true
- }
- }
-
- if !page1Found {
- t.Error("Page 1 content not found correctly")
- }
- if !page2Found {
- t.Error("Page 2 content not found correctly")
- }
-}
-
-func TestLetterTextUnmarshal_WithSidenotes(t *testing.T) {
- // Real example with sidenotes from briefe.xml
- testXML := `
-Some text before sidenote.
-Ich umarme Dich und küsse Dich 1000mahl als Dein
+const testLetterWithSidenote = `
+Text before sidenote.
+Ich umarme Dich und küsse Dich 1000mahl als Dein
allergetreuester Bruder
-Jacob Michael Reinhold Lenz.
-Dorpat den 11ten October 1767.
+Jacob Michael Reinhold Lenz.
More text after sidenote.
`
- var letterText LetterText
- err := xml.Unmarshal([]byte(testXML), &letterText)
+const testLetterComplexStructure = `
+
+Verehrungswürdigste Eltern!
+
+Nach einer langsamen Reise sind wir angekommen.
+
+Die Wittwe ist eine simple Frau.
+Hand reference content
+Final content.
+
+Last page content with markup.
+`
+
+func TestNewTokenFromXMLToken(t *testing.T) {
+ tests := []struct { // table of inputs for NewTokenFromXMLToken and the Token each should produce
+ name string // subtest name passed to t.Run
+ xmlToken xml.Token
+ stack []string
+ index int
+ expected Token
+ }{
+ {
+ name: "StartElement with attributes",
+ xmlToken: xml.StartElement{Name: xml.Name{Local: "page"}, Attr: []xml.Attr{{Name: xml.Name{Local: "index"}, Value: "1"}}},
+ stack: []string{"letterText"},
+ index: 5,
+ expected: Token{
+ Index: 5,
+ Stack: []string{"letterText"},
+ Attributes: map[string]string{"index": "1"},
+ },
+ },
+ {
+ name: "CharData token",
+ xmlToken: xml.CharData("Hello world"),
+ stack: []string{"letterText", "align"},
+ index: 10,
+ expected: Token{
+ Index: 10,
+ Stack: []string{"letterText", "align"},
+ Attributes: map[string]string{},
+ },
+ },
+ {
+ name: "EndElement token",
+ xmlToken: xml.EndElement{Name: xml.Name{Local: "align"}},
+ stack: []string{"letterText"},
+ index: 15,
+ expected: Token{
+ Index: 15,
+ Stack: []string{"letterText"},
+ Attributes: map[string]string{},
+ },
+ },
+ {
+ name: "Empty stack",
+ xmlToken: xml.StartElement{Name: xml.Name{Local: "letterText"}},
+ stack: []string{},
+ index: 0,
+ expected: Token{
+ Index: 0,
+ Stack: []string{},
+ Attributes: map[string]string{},
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ result := NewTokenFromXMLToken(tt.xmlToken, tt.stack, tt.index)
+ // The index must be passed through unchanged.
+ if result.Index != tt.expected.Index {
+ t.Errorf("Expected index %d, got %d", tt.expected.Index, result.Index)
+ }
+
+ if len(result.Stack) != len(tt.expected.Stack) {
+ t.Errorf("Expected stack length %d, got %d", len(tt.expected.Stack), len(result.Stack))
+ }
+ // NOTE(review): the length check above only calls t.Errorf, so result.Stack[i] below can still panic if result.Stack is shorter.
+ for i, expected := range tt.expected.Stack {
+ if result.Stack[i] != expected {
+ t.Errorf("Expected stack[%d] = %s, got %s", i, expected, result.Stack[i])
+ }
+ }
+
+ if len(result.Attributes) != len(tt.expected.Attributes) {
+ t.Errorf("Expected %d attributes, got %d", len(tt.expected.Attributes), len(result.Attributes))
+ }
+ // Every expected attribute must be present with the same value.
+ for key, expectedValue := range tt.expected.Attributes {
+ if actualValue, exists := result.Attributes[key]; !exists || actualValue != expectedValue {
+ t.Errorf("Expected attribute %s = %s, got %s (exists: %v)", key, expectedValue, actualValue, exists)
+ }
+ }
+ })
+ }
+}
+
+func TestLetterUnmarshalXML_BasicStructure(t *testing.T) {
+ var letter Letter
+ err := xml.Unmarshal([]byte(testLetter1), &letter)
if err != nil {
- t.Fatalf("Error unmarshaling sidenote XML: %v", err)
+ t.Fatalf("Failed to unmarshal letter: %v", err)
}
- // Should have 1 sidenote
- if len(letterText.Sidenotes) != 1 {
- t.Errorf("Expected 1 sidenote, got %d", len(letterText.Sidenotes))
+ // Test basic letter properties
+ if letter.Letter != 1 {
+ t.Errorf("Expected letter number 1, got %d", letter.Letter)
}
- // Verify sidenote details
- sidenote := letterText.Sidenotes[0]
+ if len(letter.Pages) != 2 {
+ t.Errorf("Expected 2 pages, got %d", len(letter.Pages))
+ }
+
+ // Test page properties
+ for i, page := range letter.Pages {
+ expectedPageNo := i + 1
+ if page.No != expectedPageNo {
+ t.Errorf("Expected page %d to have No = %d, got %d", i, expectedPageNo, page.No)
+ }
+ if page.Letter != 1 {
+ t.Errorf("Expected page %d to have Letter = 1, got %d", i, page.Letter)
+ }
+ if len(page.Tokens) == 0 {
+ t.Errorf("Expected page %d to have tokens, got none", i)
+ }
+ if len(page.TokenInfo) != len(page.Tokens) {
+ t.Errorf("Expected page %d to have equal TokenInfo and Tokens length, got %d vs %d",
+ i, len(page.TokenInfo), len(page.Tokens))
+ }
+ }
+
+ // Test character data is collected
+ if len(letter.CharData) == 0 {
+ t.Error("Expected CharData to be populated")
+ }
+ if !strings.Contains(letter.CharData, "HochEdelgeborner") {
+ t.Error("Expected CharData to contain letter content")
+ }
+}
+
+func TestLetterUnmarshalXML_TokenInfo(t *testing.T) {
+ var letter Letter
+ err := xml.Unmarshal([]byte(testLetter1), &letter)
+ if err != nil {
+ t.Fatalf("Failed to unmarshal letter: %v", err)
+ }
+
+ // Inspect the first page only. NOTE(review): Pages[0] panics if parsing produced no pages.
+ page1 := letter.Pages[0]
+ if len(page1.TokenInfo) == 0 {
+ t.Fatal("Expected page 1 to have TokenInfo")
+ }
+
+ // Set when any token on the page carries pos="right"
+ foundAlignToken := false
+
+ for i, tokenInfo := range page1.TokenInfo {
+ // Index must equal the entry's position in TokenInfo
+ if tokenInfo.Index != i {
+ t.Errorf("Expected TokenInfo[%d] to have Index = %d, got %d", i, i, tokenInfo.Index)
+ }
+
+ // Match by attribute only: any token with pos="right" counts; the element name is not checked
+ if attr, exists := tokenInfo.Attributes["pos"]; exists && attr == "right" {
+ foundAlignToken = true
+ // NOTE(review): the stack depth of this token is not asserted here.
+ // Only the presence of the attribute is recorded.
+ }
+
+ // Stack should never be nil
+ if tokenInfo.Stack == nil {
+ t.Errorf("TokenInfo[%d] has nil stack", i)
+ }
+
+ // Attributes should never be nil
+ if tokenInfo.Attributes == nil {
+ t.Errorf("TokenInfo[%d] has nil attributes", i)
+ }
+ }
+
+ if !foundAlignToken {
+ t.Error("Expected to find align token with pos='right' attribute")
+ }
+}
+
+func TestLetterUnmarshalXML_WithSidenotes(t *testing.T) {
+ var letter Letter
+ err := xml.Unmarshal([]byte(testLetterWithSidenote), &letter)
+ if err != nil {
+ t.Fatalf("Failed to unmarshal letter with sidenote: %v", err)
+ }
+
+ // Test sidenotes
+ if len(letter.Pages[0].Sidenotes) != 1 {
+ t.Errorf("Expected 1 sidenote on page 1, got %d", len(letter.Pages[0].Sidenotes))
+ }
+
+ sidenote := letter.Pages[0].Sidenotes[0]
if sidenote.Position != SidenotePositionLeft {
t.Errorf("Expected sidenote position left, got %d", sidenote.Position)
}
@@ -197,118 +235,59 @@ More text after sidenote.
if !strings.Contains(sidenote.Annotation, "am linken Rand") {
t.Errorf("Expected sidenote annotation to contain 'am linken Rand', got '%s'", sidenote.Annotation)
}
- sidenoteContent := tokensToString(sidenote.Content)
- if !strings.Contains(sidenoteContent, "Jacob Michael Reinhold Lenz") {
- t.Errorf("Expected sidenote content to contain author name, got '%s'", sidenoteContent)
+
+ // Test sidenote TokenInfo
+ if len(sidenote.TokenInfo) != len(sidenote.Tokens) {
+ t.Errorf("Expected sidenote to have equal TokenInfo and Tokens length, got %d vs %d",
+ len(sidenote.TokenInfo), len(sidenote.Tokens))
}
- // Verify page content doesn't contain sidenote
- for _, page := range letterText.Pages {
- content := tokensToString(page.Content)
- if strings.Contains(content, "allergetreuester Bruder") {
- t.Errorf("Page content should not contain sidenote text, but page %d does", page.Page)
+ // Test sidenote CharData
+ if !strings.Contains(sidenote.CharData, "allergetreuester Bruder") {
+ t.Error("Expected sidenote CharData to contain sidenote content")
+ }
+
+ // Verify anchor position
+ if sidenote.Anchor < 0 {
+ t.Error("Expected sidenote anchor to be set")
+ }
+}
+
+func TestLetterUnmarshalXML_ComplexStructure(t *testing.T) {
+ var letter Letter
+ err := xml.Unmarshal([]byte(testLetterComplexStructure), &letter)
+ if err != nil {
+ t.Fatalf("Failed to unmarshal complex letter: %v", err)
+ }
+
+ // Test multiple pages
+ if len(letter.Pages) != 3 {
+ t.Errorf("Expected 3 pages, got %d", len(letter.Pages))
+ }
+
+ // Test hands collection
+ foundHandRef := false
+ for _, page := range letter.Pages {
+ for _, handRef := range page.Hands {
+ if handRef == 42 {
+ foundHandRef = true
+ break
+ }
}
}
-}
-
-func TestLetterTextUnmarshal_ComplexSidenotePositions(t *testing.T) {
- // Test different sidenote positions
- testXML := `
-
-Top right sidenote
-Bottom left sidenote
-Top sidenote
-Some content.
-`
-
- var letterText LetterText
- err := xml.Unmarshal([]byte(testXML), &letterText)
- if err != nil {
- t.Fatalf("Error unmarshaling complex sidenotes XML: %v", err)
+ if !foundHandRef {
+ t.Error("Expected to find hand reference 42")
}
- if len(letterText.Sidenotes) != 3 {
- t.Fatalf("Expected 3 sidenotes, got %d", len(letterText.Sidenotes))
- }
-
- // Check position parsing
- positions := make(map[SidenotePosition]bool)
- for _, sidenote := range letterText.Sidenotes {
- positions[sidenote.Position] = true
- }
-
- expectedPositions := []SidenotePosition{
- SidenotePositionTopRight,
- SidenotePositionBottomLeft,
- SidenotePositionTop,
- }
-
- for _, expected := range expectedPositions {
- if !positions[expected] {
- t.Errorf("Expected to find sidenote position %d, but didn't", expected)
+ // Test page numbers are correct
+ for i, page := range letter.Pages {
+ expectedPageNo := i + 1
+ if page.No != expectedPageNo {
+ t.Errorf("Expected page %d, got %d", expectedPageNo, page.No)
}
}
}
-func TestLetterTextUnmarshal_NoPageBreaks(t *testing.T) {
- // Test letter without explicit page breaks
- testXML := `
-This is all content on the default page.
-Some markup and more text.
-Note on single page
-Final text.
-`
-
- var letterText LetterText
- err := xml.Unmarshal([]byte(testXML), &letterText)
- if err != nil {
- t.Fatalf("Error unmarshaling no-page-break XML: %v", err)
- }
-
- // Should have 1 page (default page 1)
- if len(letterText.Pages) != 1 {
- t.Errorf("Expected 1 page, got %d", len(letterText.Pages))
- }
- if len(letterText.PageBreaks) != 0 {
- t.Errorf("Expected 0 page breaks, got %d", len(letterText.PageBreaks))
- }
-
- // Page should be page 1
- if letterText.Pages[0].Page != 1 {
- t.Errorf("Expected page 1, got page %d", letterText.Pages[0].Page)
- }
-
- // Content should contain markup but not sidenote
- content := tokensToString(letterText.Pages[0].Content)
- if !strings.Contains(content, "Some markup") {
- t.Error("Expected page content to contain markup")
- }
- if strings.Contains(content, "Note on single page") {
- t.Error("Page content should not contain sidenote text")
- }
-}
-
-func TestLetterTextUnmarshal_EmptyContent(t *testing.T) {
- // Test edge case with empty content
- testXML := `
-
-`
-
- var letterText LetterText
- err := xml.Unmarshal([]byte(testXML), &letterText)
- if err != nil {
- t.Fatalf("Error unmarshaling empty XML: %v", err)
- }
-
- // Should have no pages with content
- if len(letterText.Pages) != 0 {
- t.Errorf("Expected 0 pages with content, got %d", len(letterText.Pages))
- }
- if len(letterText.PageBreaks) != 1 {
- t.Errorf("Expected 1 page break, got %d", len(letterText.PageBreaks))
- }
-}
-
func TestSidenotePosition_UnmarshalXMLAttr(t *testing.T) {
tests := []struct {
input string
@@ -326,121 +305,242 @@ func TestSidenotePosition_UnmarshalXMLAttr(t *testing.T) {
}
for _, test := range tests {
- var pos SidenotePosition
- attr := xml.Attr{Value: test.input}
- err := pos.UnmarshalXMLAttr(attr)
- if err != nil {
- t.Errorf("Error unmarshaling position '%s': %v", test.input, err)
- }
- if pos != test.expected {
- t.Errorf("Expected position %d for input '%s', got %d", test.expected, test.input, pos)
- }
- }
-}
-
-func TestLetterTextUnmarshal_PreserveMarkup(t *testing.T) {
- // Test that various markup elements are preserved in page content
- testXML := `
-
-Text with antiqua and bold and italic.
-
-Centered text
-
-Deleted text
-More content with person reference.
-`
-
- var letterText LetterText
- err := xml.Unmarshal([]byte(testXML), &letterText)
- if err != nil {
- t.Fatalf("Error unmarshaling markup XML: %v", err)
- }
-
- if len(letterText.Pages) != 1 {
- t.Fatalf("Expected 1 page, got %d", len(letterText.Pages))
- }
-
- content := tokensToString(letterText.Pages[0].Content)
- expectedMarkup := []string{
- "antiqua",
- "bold",
- "italic",
- "",
- "",
- "",
- "Deleted text",
- "person reference",
- }
-
- for _, markup := range expectedMarkup {
- if !strings.Contains(content, markup) {
- t.Errorf("Expected page content to contain '%s', but it doesn't. Content: %s", markup, content)
- }
- }
-}
-
-func TestLetterTextUnmarshal_LetterAttribute(t *testing.T) {
- // Test that the letter attribute is parsed correctly
- testXML := `
-
-Some content.
-`
-
- var letterText LetterText
- err := xml.Unmarshal([]byte(testXML), &letterText)
- if err != nil {
- t.Fatalf("Error unmarshaling letter attribute XML: %v", err)
- }
-
- // Verify letter attribute is parsed
- if letterText.Letter != 42 {
- t.Errorf("Expected letter attribute 42, got %d", letterText.Letter)
- }
-}
-
-func TestLetterTextUnmarshal_LetterAttribute_AllExistingTests(t *testing.T) {
- // Test that existing test cases also have correct letter attributes
- testCases := []struct {
- name string
- xml string
- expectedLetter int
- }{
- {
- name: "Simple case",
- xml: `
- Some content.
- `,
- expectedLetter: 123,
- },
- {
- name: "Real example letter 1",
- xml: `
- Some content.
- `,
- expectedLetter: 1,
- },
- {
- name: "Letter with sidenotes",
- xml: `
-
- Note
- Content.
- `,
- expectedLetter: 999,
- },
- }
-
- for _, tc := range testCases {
- t.Run(tc.name, func(t *testing.T) {
- var letterText LetterText
- err := xml.Unmarshal([]byte(tc.xml), &letterText)
+ t.Run(test.input, func(t *testing.T) {
+ var pos SidenotePosition
+ attr := xml.Attr{Value: test.input}
+ err := pos.UnmarshalXMLAttr(attr)
if err != nil {
- t.Fatalf("Error unmarshaling XML: %v", err)
+ t.Errorf("Error unmarshaling position '%s': %v", test.input, err)
}
-
- if letterText.Letter != tc.expectedLetter {
- t.Errorf("Expected letter attribute %d, got %d", tc.expectedLetter, letterText.Letter)
+ if pos != test.expected {
+ t.Errorf("Expected position %d for input '%s', got %d", test.expected, test.input, pos)
}
})
}
+}
+
+func TestLetterUnmarshalXML_StackTracking(t *testing.T) {
+ simpleXML := `
+
+
+Inner content
+
+`
+
+ var letter Letter
+ err := xml.Unmarshal([]byte(simpleXML), &letter)
+ if err != nil {
+ t.Fatalf("Failed to unmarshal letter: %v", err)
+ }
+
+ page := letter.Pages[0] // NOTE(review): panics if parsing produced no pages
+
+ // Pick one token with pos="center" and one token with a non-empty stack
+ var alignToken *Token
+ var aqToken *Token
+
+ for i, token := range page.TokenInfo {
+ if attrs := token.Attributes; len(attrs) > 0 {
+ if attrs["pos"] == "center" {
+ alignToken = &page.TokenInfo[i]
+ }
+ }
+
+ // Any token with at least one enclosing element qualifies; the last such token on the page wins
+ if len(token.Stack) >= 1 {
+ aqToken = &page.TokenInfo[i]
+ }
+ }
+
+ if alignToken == nil {
+ t.Fatal("Expected to find align token")
+ }
+
+ if aqToken == nil {
+ t.Fatal("Expected to find nested token")
+ }
+
+ // NOTE(review): the comparison below only logs; this test never fails on stack depth.
+ // The intent appears to be that the nested token's stack is deeper than the align token's.
+ if len(aqToken.Stack) <= len(alignToken.Stack) {
+ t.Logf("Align stack depth: %d, AQ stack depth: %d", len(alignToken.Stack), len(aqToken.Stack))
+ // Treated as acceptable: no error is reported in this branch
+ }
+}
+
+func TestLetterUnmarshalXML_RealData(t *testing.T) {
+ // Optional check against a local briefe.xml (relative path); skipped when the file is missing or cannot be opened
+ brieveFile := "../lenz-briefe/data/xml/briefe.xml"
+ if _, err := os.Stat(brieveFile); os.IsNotExist(err) {
+ t.Skip("Real briefe.xml file not found, skipping real data test")
+ return
+ }
+
+ file, err := os.Open(brieveFile)
+ if err != nil {
+ t.Skipf("Cannot open briefe.xml: %v", err)
+ return
+ }
+ defer file.Close()
+
+ decoder := xml.NewDecoder(file)
+
+ // Scan tokens until the first letterText start element; EOF or a read error skips the test instead of failing it
+ for {
+ token, err := decoder.Token()
+ if err == io.EOF {
+ t.Skip("No letterText elements found in briefe.xml")
+ return
+ }
+ if err != nil {
+ t.Skipf("Error reading briefe.xml: %v", err)
+ return
+ }
+ // Decode only the first letterText element, validate it, then stop.
+ if start, ok := token.(xml.StartElement); ok && start.Name.Local == "letterText" {
+ var letter Letter
+ err := decoder.DecodeElement(&letter, &start)
+ if err != nil {
+ t.Fatalf("Failed to decode real letter: %v", err)
+ }
+
+ // The decoded letter must have a non-zero number and at least one page
+ if letter.Letter == 0 {
+ t.Error("Expected real letter to have letter number")
+ }
+
+ if len(letter.Pages) == 0 {
+ t.Error("Expected real letter to have pages")
+ }
+
+ // TokenInfo must parallel Tokens on every page
+ for i, page := range letter.Pages {
+ if len(page.TokenInfo) != len(page.Tokens) {
+ t.Errorf("Page %d: TokenInfo length %d != Tokens length %d",
+ i, len(page.TokenInfo), len(page.Tokens))
+ }
+
+ // Each entry: Index equals its position, Stack and Attributes are non-nil
+ for j, tokenInfo := range page.TokenInfo {
+ if tokenInfo.Index != j {
+ t.Errorf("Page %d, Token %d: Expected index %d, got %d",
+ i, j, j, tokenInfo.Index)
+ }
+ if tokenInfo.Stack == nil {
+ t.Errorf("Page %d, Token %d: Stack is nil", i, j)
+ }
+ if tokenInfo.Attributes == nil {
+ t.Errorf("Page %d, Token %d: Attributes is nil", i, j)
+ }
+ }
+ }
+
+ // Log a summary and return after the first letter; later letters in the file are not checked
+ t.Logf("Successfully processed real letter %d with %d pages", letter.Letter, len(letter.Pages))
+ return
+ }
+ }
+}
+
+func TestToken_AttributeAccess(t *testing.T) {
+ xmlData := `
+
+Content
+`
+
+ var letter Letter
+ err := xml.Unmarshal([]byte(xmlData), &letter)
+ if err != nil {
+ t.Fatalf("Failed to unmarshal: %v", err)
+ }
+
+ page := letter.Pages[0] // NOTE(review): panics if parsing produced no pages
+
+ // Scan page.TokenInfo for pos="right" and tab="5"; the two flags are independent, so they may come from different tokens
+ foundAlignPos := false
+ foundAlignTab := false
+
+ for _, tokenInfo := range page.TokenInfo {
+ if val, exists := tokenInfo.Attributes["pos"]; exists && val == "right" {
+ foundAlignPos = true
+ }
+ if val, exists := tokenInfo.Attributes["tab"]; exists && val == "5" {
+ foundAlignTab = true
+ }
+ }
+
+ if !foundAlignPos {
+ t.Error("Expected to find align with pos='right'")
+ }
+ if !foundAlignTab {
+ t.Error("Expected to find align with tab='5'")
+ }
+}
+
+func TestLetterUnmarshalXML_EdgeCases(t *testing.T) {
+ tests := []struct {
+ name string // subtest name passed to t.Run
+ xml string // document handed to xml.Unmarshal
+ test func(t *testing.T, letter Letter) // assertions run on the decoded Letter
+ }{
+ {
+ name: "Empty letter",
+ xml: ``,
+ test: func(t *testing.T, letter Letter) {
+ if letter.Letter != 1 {
+ t.Errorf("Expected letter 1, got %d", letter.Letter)
+ }
+ if len(letter.Pages) != 0 {
+ t.Errorf("Expected 0 pages, got %d", len(letter.Pages))
+ }
+ },
+ },
+ {
+ name: "Letter with only page break",
+ xml: ``,
+ test: func(t *testing.T, letter Letter) {
+ // A page that collected no tokens is expected not to be stored
+ if len(letter.Pages) != 0 {
+ t.Errorf("Expected 0 pages (page break with no content), got %d", len(letter.Pages))
+ }
+ },
+ },
+ {
+ name: "Letter with nested elements",
+ xml: `
+
+
+ Nested deeply nested content
+
+`,
+ test: func(t *testing.T, letter Letter) {
+ if len(letter.Pages) != 1 {
+ t.Errorf("Expected 1 page, got %d", len(letter.Pages))
+ }
+ // NOTE(review): the check above uses t.Errorf, so Pages[0] below panics when no page was produced.
+ page := letter.Pages[0]
+ maxStackDepth := 0
+ for _, tokenInfo := range page.TokenInfo {
+ if len(tokenInfo.Stack) > maxStackDepth {
+ maxStackDepth = len(tokenInfo.Stack)
+ }
+ }
+ // Some token must have at least three enclosing elements recorded in its Stack.
+ if maxStackDepth < 3 {
+ t.Errorf("Expected deep nesting (3+ levels), got max depth %d", maxStackDepth)
+ }
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ var letter Letter
+ err := xml.Unmarshal([]byte(tt.xml), &letter)
+ if err != nil {
+ t.Fatalf("Failed to unmarshal: %v", err)
+ }
+ tt.test(t, letter)
+ })
+ }
+}
\ No newline at end of file
diff --git a/xmlmodels/library.go b/xmlmodels/library.go
index 3eda184..a478841 100644
--- a/xmlmodels/library.go
+++ b/xmlmodels/library.go
@@ -56,14 +56,14 @@ func (l *Library) String() string {
sb.WriteString("Letters: ")
sb.WriteString(strconv.Itoa(l.Letters.Count()))
filter := func(item Letter) bool {
- return len(item.Hands) > 0
+ return len(item.Hands()) > 0
}
hands := 0
for l := range l.Letters.Filter(filter) {
hands += 1
sb.WriteString("\n")
sb.WriteString(strconv.Itoa(l.Letter) + ": ")
- sb.WriteString(strconv.Itoa(len(l.Hands)) + " Hände, No " + strconv.Itoa(hands))
+ sb.WriteString(strconv.Itoa(len(l.Hands())) + " Hände, No " + strconv.Itoa(hands))
}
sb.WriteString("\n")
@@ -307,22 +307,22 @@ func (l *Library) LettersForYear(year int) (ret []Meta) {
}) {
ret = append(ret, l)
}
- return
+ return ret
}
func (l *Library) Person(id int) (ret *PersonDef) {
ret = l.Persons.Item(id)
- return
+ return ret
}
func (l *Library) App(id int) (ret *AppDef) {
ret = l.AppDefs.Item(id)
- return
+ return ret
}
func (l *Library) Place(id int) (ret *LocationDef) {
ret = l.Places.Item(id)
- return
+ return ret
}
func (l *Library) Tradition(letter int) (ret []App) {
@@ -338,14 +338,14 @@ func (l *Library) GetPersons(id []int) (ret []*PersonDef) {
for _, i := range id {
ret = append(ret, l.Person(i))
}
- return
+ return ret
}
func (l *Library) GetPlaces(id []int) (ret []*LocationDef) {
for _, i := range id {
ret = append(ret, l.Place(i))
}
- return
+ return ret
}
func (l *Library) FuncMap() template.FuncMap {
diff --git a/xmlmodels/token.go b/xmlmodels/token.go
new file mode 100644
index 0000000..3a68f7b
--- /dev/null
+++ b/xmlmodels/token.go
@@ -0,0 +1,361 @@
+package xmlmodels
+
+import (
+ "encoding/xml"
+ "iter"
+ "strings"
+)
+
+// Token records parsing context for one xml.Token: its position, the element stack at that point, and any attributes. It does not store the xml.Token itself.
+type Token struct {
+ Index int // Position of the corresponding xml.Token in its token slice
+ Stack []string // Local names of the elements on the stack at this token
+ Attributes map[string]string // Attribute values keyed by local name; NewTokenFromXMLToken fills it only for StartElement tokens
+}
+
+// NewTokenFromXMLToken builds the Token describing xmlToken: index is stored as-is, stack is copied into the Token, and attributes are collected by local name when xmlToken is an xml.StartElement.
+func NewTokenFromXMLToken(xmlToken xml.Token, stack []string, index int) Token {
+ // Snapshot the stack so later changes to the caller's slice do not show up in the Token.
+ snapshot := make([]string, len(stack))
+ copy(snapshot, stack)
+
+ // Attributes is always a non-nil map; it stays empty for anything but a StartElement.
+ attrs := make(map[string]string)
+ if start, isStart := xmlToken.(xml.StartElement); isStart {
+ for i := range start.Attr {
+ attrs[start.Attr[i].Name.Local] = start.Attr[i].Value
+ }
+ }
+ return Token{
+ Index: index,
+ Stack: snapshot,
+ Attributes: attrs,
+ }
+}
+
+type LetterTokenType int // LetterTokenType identifies which kind of xml.Token a LetterToken wraps.
+
+const (
+ LetterStartElement LetterTokenType = iota // xml.StartElement; also the zero value of LetterTokenType
+ LetterEndElement // xml.EndElement
+ LetterCharData // xml.CharData
+ LetterComment // xml.Comment
+ LetterProcInst // xml.ProcInst
+ LetterDirective // xml.Directive
+)
+
+// LetterToken wraps one xml.Token together with the context NewLetterParser recorded for it while reading a Letter/Page token slice.
+type LetterToken struct {
+ Name string // Local element name for start/end elements, the target for processing instructions, empty otherwise
+ Attributes map[string]string // Attributes by local name for start elements; empty map for end elements; nil otherwise
+ Inner xml.Token // Copy of the wrapped token, made with xml.CopyToken
+ Type LetterTokenType // Kind of the wrapped token
+ Data string // Text of character data, comments and directives; the instruction text of processing instructions
+ Stack []*LetterToken // Start elements open when this token was read, outermost first; snapshot taken before the token itself is pushed or popped
+ Index int // Position of this token in the parser's pipeline
+ PageIndex int // Which page this token belongs to
+ Letter int // Which letter this token belongs to
+
+ // Navigation fields
+ charData string // Concatenated character data inside the element; backs CharData
+ children []*LetterToken // Start tokens of the direct child elements; backs Children
+ childrenParsed bool // True once children is complete
+ chardataParsed bool // True once charData is complete
+ parser *LetterParser // Parser whose pipeline contains this token
+}
+
+// LetterParser holds the LetterTokens built from one xml.Token slice and provides index-based access to and iteration over them.
+type LetterParser struct {
+ Stack []*LetterToken // Initialised empty by NewLetterParser; nothing in this file adds to it
+ pipeline []*LetterToken // All tokens in input order
+ letter int // Letter value passed to NewLetterParser
+ pageMap map[int]int // Maps page number to starting token index; allocated by NewLetterParser but not filled in this file
+}
+
+// NewLetterParser wraps every token in tokens in a LetterToken and returns the parser that owns them. letter and pageIndex are copied onto each LetterToken as Letter and PageIndex.
+func NewLetterParser(tokens []xml.Token, letter int, pageIndex int) *LetterParser {
+ p := &LetterParser{
+ Stack: make([]*LetterToken, 0),
+ letter: letter,
+ pageMap: make(map[int]int),
+ }
+
+ // open holds the start elements that have not been closed yet, outermost first.
+ open := make([]*LetterToken, 0)
+
+ for idx, raw := range tokens {
+ // Snapshot the open elements before this token changes them: a start element does not list itself, an end element still lists its opener.
+ ancestors := make([]*LetterToken, len(open))
+ copy(ancestors, open)
+
+ lt := &LetterToken{
+ Inner: xml.CopyToken(raw),
+ Index: idx,
+ PageIndex: pageIndex,
+ Letter: letter,
+ Stack: ancestors,
+ parser: p,
+ }
+ switch v := raw.(type) {
+ case xml.StartElement:
+ lt.Type = LetterStartElement
+ lt.Name = v.Name.Local
+ lt.Attributes = mapXMLAttributes(v.Attr)
+
+ // Register with the enclosing element while its child list is still being built.
+ if depth := len(open); depth > 0 {
+ if parent := open[depth-1]; !parent.childrenParsed {
+ parent.children = append(parent.children, lt)
+ }
+ }
+ open = append(open, lt)
+
+ case xml.EndElement:
+ // Close the innermost open element (names are not compared); its cached children and text are now complete.
+ if depth := len(open); depth > 0 {
+ closed := open[depth-1]
+ closed.childrenParsed = true
+ closed.chardataParsed = true
+ open = open[:depth-1]
+ }
+ lt.Type = LetterEndElement
+ lt.Name = v.Name.Local
+ lt.Attributes = make(map[string]string)
+
+ case xml.CharData:
+ text := string(v)
+ // Every open element accumulates the text of all its descendants.
+ if text != "" {
+ for _, ancestor := range open {
+ if !ancestor.chardataParsed {
+ ancestor.charData += text
+ }
+ }
+ }
+ lt.Type = LetterCharData
+ lt.Data = text
+
+ case xml.Comment:
+ lt.Type, lt.Data = LetterComment, string(v)
+
+ case xml.ProcInst:
+ lt.Type, lt.Name, lt.Data = LetterProcInst, v.Target, string(v.Inst)
+
+ case xml.Directive:
+ lt.Type, lt.Data = LetterDirective, string(v)
+ }
+
+ p.pipeline = append(p.pipeline, lt)
+ }
+ return p
+}
+
+// GetStack returns the parser's Stack field. NewLetterParser sets it to an empty slice and nothing in this file adds to it.
+func (p *LetterParser) GetStack() []*LetterToken {
+ return p.Stack
+}
+
+// Pipeline returns all tokens in input order. The returned slice is the parser's own slice, not a copy.
+func (p *LetterParser) Pipeline() []*LetterToken {
+ return p.pipeline
+}
+
+// TokenAt returns the token stored at index, or nil when index lies outside the pipeline.
+func (p *LetterParser) TokenAt(index int) *LetterToken {
+ if index >= 0 && index < len(p.pipeline) {
+ return p.pipeline[index]
+ }
+ return nil
+}
+
+// IterateFrom returns an iterator over the tokens from index to the end of the pipeline. A negative index is clamped to 0 (it used to panic) and an index past the end yields nothing. The error value is always nil.
+func (p *LetterParser) IterateFrom(index int) iter.Seq2[*LetterToken, error] {
+ return func(yield func(*LetterToken, error) bool) {
+ for i := max(index, 0); i < len(p.pipeline); i++ {
+ if !yield(p.pipeline[i], nil) {
+ return
+ }
+ }
+ }
+}
+
+// Iterate returns an iterator over every token in the pipeline, first to last. The error value it yields is always nil.
+func (p *LetterParser) Iterate() iter.Seq2[*LetterToken, error] {
+ return p.IterateFrom(0)
+}
+
+// Previous returns the tokens located before index, in order, or nil when index is not in 1..len(pipeline). The result shares storage with the parser, but its capacity is capped so that appending to it cannot overwrite later tokens.
+func (p *LetterParser) Previous(index int) []*LetterToken {
+ if index <= 0 || index > len(p.pipeline) {
+ return nil
+ }
+ return p.pipeline[:index:index]
+}
+
+// LetterToken methods
+
+// String renders the token as XML-like text: <name attr="v"> for start elements (attributes in unspecified map order, values not escaped), </name> for end elements, the raw text for character data and <!--text--> for comments. Other token types render as "".
+func (t *LetterToken) String() string {
+ builder := strings.Builder{}
+ switch t.Type {
+ case LetterStartElement:
+ builder.WriteString("<" + t.Name)
+ for k, v := range t.Attributes {
+ builder.WriteString(" " + k + `="` + v + `"`)
+ }
+ builder.WriteString(">")
+ case LetterEndElement:
+ builder.WriteString("</" + t.Name + ">")
+ case LetterCharData:
+ builder.WriteString(t.Data)
+ case LetterComment:
+ builder.WriteString("<!--" + t.Data + "-->")
+ }
+ return builder.String()
+}
+
+// Element returns the tokens that make up this element: t itself, everything nested in it, and the matching end element. Nested elements with the same name are counted so the right end element is chosen. If the pipeline holds no matching end element, all tokens from t to the end are returned. It returns nil when t is not a start element or has no parser.
+func (t *LetterToken) Element() []*LetterToken {
+ // The zero LetterToken has Type LetterStartElement but no parser to walk, so guard against it.
+ if t.Type != LetterStartElement || t.parser == nil {
+ return nil
+ }
+ var tokens []*LetterToken
+ depth := 0
+ for token := range t.parser.IterateFrom(t.Index) {
+ tokens = append(tokens, token)
+ if token.Name != t.Name {
+ continue
+ }
+ switch token.Type {
+ case LetterStartElement:
+ depth++
+ case LetterEndElement:
+ if depth--; depth == 0 {
+ return tokens
+ }
+ }
+ }
+ return tokens
+}
+
+// Children returns the start-element tokens of t's direct child elements, in document order. The result is cached on t. It returns nil when t is not a start element or the element contains no further tokens.
+func (t *LetterToken) Children() []*LetterToken {
+ if t.childrenParsed {
+ return t.children
+ }
+ if t.Type != LetterStartElement {
+ return nil
+ }
+
+ element := t.Element()
+ if len(element) <= 1 {
+ return nil
+ }
+
+ // NewLetterParser has already filled t.children for an element whose end tag is missing, so rebuild the list from scratch; appending to it would list every child twice.
+ var children []*LetterToken
+ depth := 0
+ for _, token := range element[1:] { // Skip self
+ switch token.Type {
+ case LetterStartElement:
+ if depth == 0 {
+ children = append(children, token)
+ }
+ depth++
+ case LetterEndElement:
+ depth--
+ }
+ }
+ t.children, t.childrenParsed = children, true
+ return t.children
+}
+
+// CharData returns the text content of t. For character data and comment tokens that is the token's own Data; for a start element it is the concatenated character data of every token in the element, cached after it has been computed. Other token types yield "" (NewLetterParser never caches text for them).
+func (t *LetterToken) CharData() string {
+ switch {
+ case t.Type == LetterCharData, t.Type == LetterComment:
+ return t.Data
+ case t.chardataParsed:
+ return t.charData
+ case t.Type != LetterStartElement:
+ return ""
+ }
+
+ // Not cached yet: gather the text of every character-data token within the element.
+ span := t.Element()
+ if len(span) == 0 {
+ return ""
+ }
+
+ var text strings.Builder
+ for _, tok := range span {
+ if tok.Type != LetterCharData {
+ continue
+ }
+ text.WriteString(tok.Data)
+ }
+
+ // Remember the result so later calls skip the scan.
+ t.charData = text.String()
+ t.chardataParsed = true
+ return t.charData
+}
+
+// Next returns an iterator over the tokens that follow t in its parser's pipeline; the error value it yields is always nil. t.parser must be set, as it is for tokens built by NewLetterParser.
+func (t *LetterToken) Next() iter.Seq2[*LetterToken, error] {
+ return t.parser.IterateFrom(t.Index + 1)
+}
+
+// Previous returns the tokens that precede t in its parser's pipeline, or nil when t is the first token. t.parser must be set, as it is for tokens built by NewLetterParser.
+func (t *LetterToken) Previous() []*LetterToken {
+ return t.parser.Previous(t.Index)
+}
+
+// FindByName returns the first direct child element of t whose name equals name, or nil when there is none.
+func (t *LetterToken) FindByName(name string) *LetterToken {
+ for i, kids := 0, t.Children(); i < len(kids); i++ {
+ if kids[i].Name == name {
+ return kids[i]
+ }
+ }
+ return nil
+}
+
+// FindAllByName returns every direct child element of t whose name equals name, in document order. The result is nil when nothing matches.
+func (t *LetterToken) FindAllByName(name string) []*LetterToken {
+ var matches []*LetterToken
+ for i, kids := 0, t.Children(); i < len(kids); i++ {
+ if kid := kids[i]; kid.Name == name {
+ matches = append(matches, kid)
+ }
+ }
+ return matches
+}
+
+// GetAttribute returns the value of the attribute called name, or "" when t has no such attribute; this includes tokens whose Attributes map is nil, since reading a nil map is safe.
+func (t *LetterToken) GetAttribute(name string) string {
+ if value, found := t.Attributes[name]; found {
+ return value
+ }
+ return ""
+}
+
+// GetStackDepth returns the number of entries in t.Stack, i.e. how many start elements were open when t was read. The snapshot is taken before t is pushed or popped, so a start element does not count itself.
+func (t *LetterToken) GetStackDepth() int {
+ return len(t.Stack)
+}
+
+// InPage reports whether t.PageIndex equals pageNo. PageIndex is the pageIndex value that was passed to NewLetterParser.
+func (t *LetterToken) InPage(pageNo int) bool {
+ return t.PageIndex == pageNo
+}
+
+// mapXMLAttributes converts attrs into a map from attribute local name to value. Namespaces are ignored, so when two attributes share a local name the later one wins. The result is never nil.
+func mapXMLAttributes(attrs []xml.Attr) map[string]string {
+ byName := map[string]string{}
+ for i := range attrs {
+ byName[attrs[i].Name.Local] = attrs[i].Value
+ }
+ return byName
+}
\ No newline at end of file