diff --git a/xmlmodels/letter.go b/xmlmodels/letter.go index 6a47a17..5709fc0 100644 --- a/xmlmodels/letter.go +++ b/xmlmodels/letter.go @@ -34,6 +34,7 @@ type Page struct { Sidenotes []Sidenote Hands []int Tokens []xml.Token + TokenInfo []Token // Stack and index info for each token } type Sidenote struct { @@ -44,6 +45,7 @@ type Sidenote struct { Anchor int Tokens []xml.Token CharData string + TokenInfo []Token // Stack and index info for each token } func (l Letter) Keys() []any { @@ -62,6 +64,15 @@ func (l Letter) String() string { return string(json) } +func (l Letter) Hands() []int { + h := []int{} + + for _, page := range l.Pages { + h = append(h, page.Hands...) + } + return h +} + type SidenotePosition uint8 func (sp *SidenotePosition) UnmarshalXMLAttr(attr xml.Attr) error { @@ -108,6 +119,7 @@ func (lt *Letter) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { func (lt *Letter) parseTokens(d *xml.Decoder) error { b := strings.Builder{} var c_page *Page = nil + var stack []string // Track element stack for { token, err := d.Token() @@ -170,19 +182,29 @@ func (lt *Letter) parseTokens(d *xml.Decoder) error { default: if c_page != nil { c_page.Tokens = append(c_page.Tokens, tokenCopy) + token := NewTokenFromXMLToken(tokenCopy, stack, len(c_page.Tokens)-1) + c_page.TokenInfo = append(c_page.TokenInfo, token) } + stack = append(stack, t.Name.Local) } case xml.CharData: b.WriteString(string(t)) if c_page != nil { c_page.Tokens = append(c_page.Tokens, tokenCopy) + token := NewTokenFromXMLToken(tokenCopy, stack, len(c_page.Tokens)-1) + c_page.TokenInfo = append(c_page.TokenInfo, token) } case xml.EndElement: + if len(stack) > 0 && stack[len(stack)-1] == t.Name.Local { + stack = stack[:len(stack)-1] + } + if t.Name.Local == "letterText" { - if c_page != nil { - c_page.Tokens = append(c_page.Tokens, tokenCopy) + // Don't add letterText end element to page tokens + // Only save page if it has actual content + if c_page != nil && len(c_page.Tokens) > 0 { lt.Pages = append(lt.Pages, *c_page) } lt.CharData = b.String() @@ -191,6 +213,8 @@ func (lt *Letter) parseTokens(d *xml.Decoder) error { if c_page != nil { c_page.Tokens = append(c_page.Tokens, tokenCopy) + token := NewTokenFromXMLToken(tokenCopy, stack, len(c_page.Tokens)-1) + c_page.TokenInfo = append(c_page.TokenInfo, token) } } } @@ -201,6 +225,7 @@ func (lt *Letter) parseTokens(d *xml.Decoder) error { func (s *Sidenote) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { b := strings.Builder{} s.XMLName = start.Name + var stack []string // Track element stack within sidenote for _, attr := range start.Attr { switch attr.Name.Local { @@ -224,18 +249,35 @@ func (s *Sidenote) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { tokenCopy := xml.CopyToken(token) switch t := tokenCopy.(type) { + case xml.StartElement: + s.Tokens = append(s.Tokens, tokenCopy) + token := NewTokenFromXMLToken(tokenCopy, stack, len(s.Tokens)-1) + s.TokenInfo = append(s.TokenInfo, token) + stack = append(stack, t.Name.Local) + case xml.CharData: b.WriteString(string(t)) s.Tokens = append(s.Tokens, tokenCopy) - // WARNING: this is a problem for sidenotes within sidenotes + token := NewTokenFromXMLToken(tokenCopy, stack, len(s.Tokens)-1) + s.TokenInfo = append(s.TokenInfo, token) + case xml.EndElement: + if len(stack) > 0 && stack[len(stack)-1] == t.Name.Local { + stack = stack[:len(stack)-1] + } + if t.Name.Local == start.Name.Local { s.CharData = b.String() return nil } s.Tokens = append(s.Tokens, tokenCopy) + token := NewTokenFromXMLToken(tokenCopy, stack, len(s.Tokens)-1) + s.TokenInfo = append(s.TokenInfo, token) + default: s.Tokens = append(s.Tokens, tokenCopy) + token := NewTokenFromXMLToken(tokenCopy, stack, len(s.Tokens)-1) + s.TokenInfo = append(s.TokenInfo, token) } } } diff --git a/xmlmodels/letter_test.go b/xmlmodels/letter_test.go index f18b0f7..58276e2 100644 --- a/xmlmodels/letter_test.go +++ b/xmlmodels/letter_test.go @@ -2,192 +2,230 @@ package xmlmodels import ( "encoding/xml" + "io" + "os" "strings" "testing" ) -// Helper function to convert []xml.Token back to string for testing -func tokensToString(tokens []xml.Token) string { - var sb strings.Builder - for _, token := range tokens { - switch t := token.(type) { - case xml.StartElement: - sb.WriteString("<") - sb.WriteString(t.Name.Local) - for _, attr := range t.Attr { - sb.WriteString(" ") - sb.WriteString(attr.Name.Local) - sb.WriteString(`="`) - sb.WriteString(attr.Value) - sb.WriteString(`"`) - } - sb.WriteString(">") - case xml.EndElement: - sb.WriteString("") - case xml.CharData: - sb.Write(t) - case xml.Comment: - sb.WriteString("") - case xml.ProcInst: - sb.WriteString(" 0 { - sb.WriteString(" ") - sb.Write(t.Inst) - } - sb.WriteString("?>") - } - } - return sb.String() -} - -func TestLetterTextUnmarshal_SimpleCase(t *testing.T) { - // Simple test case with basic structure - testXML := ` - Some content before first page break. - - Content on page 1 with some markup and more text. - This is a sidenote - More content on page 1. - - Content on page 2 with bold text. - Hand reference content - Final content on page 2. - ` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling XML: %v", err) - } - - // Verify basic structure - if len(letterText.Pages) != 3 { - t.Errorf("Expected 3 pages, got %d", len(letterText.Pages)) - } - if len(letterText.PageBreaks) != 2 { - t.Errorf("Expected 2 page breaks, got %d", len(letterText.PageBreaks)) - } - if len(letterText.Sidenotes) != 1 { - t.Errorf("Expected 1 sidenote, got %d", len(letterText.Sidenotes)) - } - if letterText.Hands.Reference != 42 { - t.Errorf("Expected hand reference 42, got %d", letterText.Hands.Reference) - } - - // Verify page breaks - if letterText.PageBreaks[0].Index != 1 { - t.Errorf("Expected page break index 1, got %d", letterText.PageBreaks[0].Index) - } - if letterText.PageBreaks[1].Index != 2 { - t.Errorf("Expected page break index 2, got %d", letterText.PageBreaks[1].Index) - } - - // Verify sidenote - sidenote := letterText.Sidenotes[0] - if sidenote.Page != 1 { - t.Errorf("Expected sidenote on page 1, got %d", sidenote.Page) - } - if sidenote.Position != SidenotePositionRight { - t.Errorf("Expected sidenote position right, got %d", sidenote.Position) - } - if sidenote.Annotation != "test" { - t.Errorf("Expected sidenote annotation 'test', got '%s'", sidenote.Annotation) - } - sidenoteContent := tokensToString(sidenote.Content) - if !strings.Contains(sidenoteContent, "This is a sidenote") { - t.Errorf("Expected sidenote content to contain 'This is a sidenote', got '%s'", sidenoteContent) - } - - // Verify page content doesn't contain sidenote text - for _, page := range letterText.Pages { - content := tokensToString(page.Content) - if strings.Contains(content, "This is a sidenote") { - t.Errorf("Page content should not contain sidenote text, but page %d does: %s", page.Page, content) - } - } -} - -func TestLetterTextUnmarshal_RealExample_Letter1(t *testing.T) { - // Real example from briefe.xml - Letter 1 (simplified) - testXML := ` +// Test data from real briefe.xml +const testLetter1 = ` HochEdelgeborner Hochgelahrter Herr Secretair Verehrungswürdigster Gönner! -Ew. HochEdelgebh: haben mich durch die neue Probe von Dero schätzbaren Gewogenheit ausserorndtlich beschämt. Meine Feder ist zu schwach, Denenselben die regen Empfindungen meines Herzens darüber zu schildern. -lasse mich noch lange das Glück genießen, Dieselben in dem blühendsten Wohlstande zu sehen, und mich mit dem erkenntlichsten Herzen nennen zu dürfen - -Hoch Edelgeborner Hochgelahrter Herr Secretair -Verehrungswürdigster Gönner -Ew. HochEdelgebh: -Von Hause, d. 2 Jenner, 1765. -gehorsamsten Diener +Ew. HochEdelgebh: haben mich durch die neue Probe von Dero schätzbaren Gewogenheit ausserorndtlich beschämt. +lasse mich noch lange das Glück genießen, Dieselben in dem blühendsten Wohlstande zu sehen. +gehorsamsten Diener Jacob Michael Reinhold Lenz ` - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling real XML: %v", err) - } - - // Should have 2 pages - if len(letterText.Pages) != 2 { - t.Errorf("Expected 2 pages, got %d", len(letterText.Pages)) - } - if len(letterText.PageBreaks) != 2 { - t.Errorf("Expected 2 page breaks, got %d", len(letterText.PageBreaks)) - } - - // Verify page content contains expected elements - page1Found := false - page2Found := false - for _, page := range letterText.Pages { - content := tokensToString(page.Content) - if page.Page == 1 && strings.Contains(content, "HochEdelgeborner") { - page1Found = true - } - if page.Page == 2 && strings.Contains(content, "Jacob Michael Reinhold Lenz") { - page2Found = true - } - } - - if !page1Found { - t.Error("Page 1 content not found correctly") - } - if !page2Found { - t.Error("Page 2 content not found correctly") - } -} - -func TestLetterTextUnmarshal_WithSidenotes(t *testing.T) { - // Real example with sidenotes from briefe.xml - testXML := ` -Some text before sidenote. -Ich umarme Dich und küsse Dich 1000mahl als Dein +const testLetterWithSidenote = ` +Text before sidenote. +Ich umarme Dich und küsse Dich 1000mahl als Dein allergetreuester Bruder -Jacob Michael Reinhold Lenz. -Dorpat den 11ten October 1767. +Jacob Michael Reinhold Lenz. More text after sidenote. ` - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) +const testLetterComplexStructure = ` + +Verehrungswürdigste Eltern! + +Nach einer langsamen Reise sind wir angekommen. + +Die Wittwe ist eine simple Frau. +Hand reference content +Final content. + +Last page content with markup. +` + +func TestNewTokenFromXMLToken(t *testing.T) { + tests := []struct { + name string + xmlToken xml.Token + stack []string + index int + expected Token + }{ + { + name: "StartElement with attributes", + xmlToken: xml.StartElement{Name: xml.Name{Local: "page"}, Attr: []xml.Attr{{Name: xml.Name{Local: "index"}, Value: "1"}}}, + stack: []string{"letterText"}, + index: 5, + expected: Token{ + Index: 5, + Stack: []string{"letterText"}, + Attributes: map[string]string{"index": "1"}, + }, + }, + { + name: "CharData token", + xmlToken: xml.CharData("Hello world"), + stack: []string{"letterText", "align"}, + index: 10, + expected: Token{ + Index: 10, + Stack: []string{"letterText", "align"}, + Attributes: map[string]string{}, + }, + }, + { + name: "EndElement token", + xmlToken: xml.EndElement{Name: xml.Name{Local: "align"}}, + stack: []string{"letterText"}, + index: 15, + expected: Token{ + Index: 15, + Stack: []string{"letterText"}, + Attributes: map[string]string{}, + }, + }, + { + name: "Empty stack", + xmlToken: xml.StartElement{Name: xml.Name{Local: "letterText"}}, + stack: []string{}, + index: 0, + expected: Token{ + Index: 0, + Stack: []string{}, + Attributes: map[string]string{}, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := NewTokenFromXMLToken(tt.xmlToken, tt.stack, tt.index) + + if result.Index != tt.expected.Index { + t.Errorf("Expected index %d, got %d", tt.expected.Index, result.Index) + } + + if len(result.Stack) != len(tt.expected.Stack) { + t.Errorf("Expected stack length %d, got %d", len(tt.expected.Stack), len(result.Stack)) + } + + for i, expected := range tt.expected.Stack { + if result.Stack[i] != expected { + t.Errorf("Expected stack[%d] = %s, got %s", i, expected, result.Stack[i]) + } + } + + if len(result.Attributes) != len(tt.expected.Attributes) { + t.Errorf("Expected %d attributes, got %d", len(tt.expected.Attributes), len(result.Attributes)) + } + + for key, expectedValue := range tt.expected.Attributes { + if actualValue, exists := result.Attributes[key]; !exists || actualValue != expectedValue { + t.Errorf("Expected attribute %s = %s, got %s (exists: %v)", key, expectedValue, actualValue, exists) + } + } + }) + } +} + +func TestLetterUnmarshalXML_BasicStructure(t *testing.T) { + var letter Letter + err := xml.Unmarshal([]byte(testLetter1), &letter) if err != nil { - t.Fatalf("Error unmarshaling sidenote XML: %v", err) + t.Fatalf("Failed to unmarshal letter: %v", err) } - // Should have 1 sidenote - if len(letterText.Sidenotes) != 1 { - t.Errorf("Expected 1 sidenote, got %d", len(letterText.Sidenotes)) + // Test basic letter properties + if letter.Letter != 1 { + t.Errorf("Expected letter number 1, got %d", letter.Letter) } - // Verify sidenote details - sidenote := letterText.Sidenotes[0] + if len(letter.Pages) != 2 { + t.Errorf("Expected 2 pages, got %d", len(letter.Pages)) + } + + // Test page properties + for i, page := range letter.Pages { + expectedPageNo := i + 1 + if page.No != expectedPageNo { + t.Errorf("Expected page %d to have No = %d, got %d", i, expectedPageNo, page.No) + } + if page.Letter != 1 { + t.Errorf("Expected page %d to have Letter = 1, got %d", i, page.Letter) + } + if len(page.Tokens) == 0 { + t.Errorf("Expected page %d to have tokens, got none", i) + } + if len(page.TokenInfo) != len(page.Tokens) { + t.Errorf("Expected page %d to have equal TokenInfo and Tokens length, got %d vs %d", + i, len(page.TokenInfo), len(page.Tokens)) + } + } + + // Test character data is collected + if len(letter.CharData) == 0 { + t.Error("Expected CharData to be populated") + } + if !strings.Contains(letter.CharData, "HochEdelgeborner") { + t.Error("Expected CharData to contain letter content") + } +} + +func TestLetterUnmarshalXML_TokenInfo(t *testing.T) { + var letter Letter + err := xml.Unmarshal([]byte(testLetter1), &letter) + if err != nil { + t.Fatalf("Failed to unmarshal letter: %v", err) + } + + // Test first page tokens and TokenInfo + page1 := letter.Pages[0] + if len(page1.TokenInfo) == 0 { + t.Fatal("Expected page 1 to have TokenInfo") + } + + // Find tokens with attributes and validate TokenInfo + foundAlignToken := false + + for i, tokenInfo := range page1.TokenInfo { + // Check index matches position + if tokenInfo.Index != i { + t.Errorf("Expected TokenInfo[%d] to have Index = %d, got %d", i, i, tokenInfo.Index) + } + + // Check for align token (should have pos attribute) + if attr, exists := tokenInfo.Attributes["pos"]; exists && attr == "right" { + foundAlignToken = true + // Since page elements are excluded, align should be at stack depth 0 in page tokens + // (the letterText context is the parsing context, not included in individual page stacks) + } + + // Stack should never be nil + if tokenInfo.Stack == nil { + t.Errorf("TokenInfo[%d] has nil stack", i) + } + + // Attributes should never be nil + if tokenInfo.Attributes == nil { + t.Errorf("TokenInfo[%d] has nil attributes", i) + } + } + + if !foundAlignToken { + t.Error("Expected to find align token with pos='right' attribute") + } +} + +func TestLetterUnmarshalXML_WithSidenotes(t *testing.T) { + var letter Letter + err := xml.Unmarshal([]byte(testLetterWithSidenote), &letter) + if err != nil { + t.Fatalf("Failed to unmarshal letter with sidenote: %v", err) + } + + // Test sidenotes + if len(letter.Pages[0].Sidenotes) != 1 { + t.Errorf("Expected 1 sidenote on page 1, got %d", len(letter.Pages[0].Sidenotes)) + } + + sidenote := letter.Pages[0].Sidenotes[0] if sidenote.Position != SidenotePositionLeft { t.Errorf("Expected sidenote position left, got %d", sidenote.Position) } @@ -197,118 +235,59 @@ More text after sidenote. if !strings.Contains(sidenote.Annotation, "am linken Rand") { t.Errorf("Expected sidenote annotation to contain 'am linken Rand', got '%s'", sidenote.Annotation) } - sidenoteContent := tokensToString(sidenote.Content) - if !strings.Contains(sidenoteContent, "Jacob Michael Reinhold Lenz") { - t.Errorf("Expected sidenote content to contain author name, got '%s'", sidenoteContent) + + // Test sidenote TokenInfo + if len(sidenote.TokenInfo) != len(sidenote.Tokens) { + t.Errorf("Expected sidenote to have equal TokenInfo and Tokens length, got %d vs %d", + len(sidenote.TokenInfo), len(sidenote.Tokens)) } - // Verify page content doesn't contain sidenote - for _, page := range letterText.Pages { - content := tokensToString(page.Content) - if strings.Contains(content, "allergetreuester Bruder") { - t.Errorf("Page content should not contain sidenote text, but page %d does", page.Page) + // Test sidenote CharData + if !strings.Contains(sidenote.CharData, "allergetreuester Bruder") { + t.Error("Expected sidenote CharData to contain sidenote content") + } + + // Verify anchor position + if sidenote.Anchor < 0 { + t.Error("Expected sidenote anchor to be set") + } +} + +func TestLetterUnmarshalXML_ComplexStructure(t *testing.T) { + var letter Letter + err := xml.Unmarshal([]byte(testLetterComplexStructure), &letter) + if err != nil { + t.Fatalf("Failed to unmarshal complex letter: %v", err) + } + + // Test multiple pages + if len(letter.Pages) != 3 { + t.Errorf("Expected 3 pages, got %d", len(letter.Pages)) + } + + // Test hands collection + foundHandRef := false + for _, page := range letter.Pages { + for _, handRef := range page.Hands { + if handRef == 42 { + foundHandRef = true + break + } } } -} - -func TestLetterTextUnmarshal_ComplexSidenotePositions(t *testing.T) { - // Test different sidenote positions - testXML := ` - -Top right sidenote -Bottom left sidenote -Top sidenote -Some content. -` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling complex sidenotes XML: %v", err) + if !foundHandRef { + t.Error("Expected to find hand reference 42") } - if len(letterText.Sidenotes) != 3 { - t.Fatalf("Expected 3 sidenotes, got %d", len(letterText.Sidenotes)) - } - - // Check position parsing - positions := make(map[SidenotePosition]bool) - for _, sidenote := range letterText.Sidenotes { - positions[sidenote.Position] = true - } - - expectedPositions := []SidenotePosition{ - SidenotePositionTopRight, - SidenotePositionBottomLeft, - SidenotePositionTop, - } - - for _, expected := range expectedPositions { - if !positions[expected] { - t.Errorf("Expected to find sidenote position %d, but didn't", expected) + // Test page numbers are correct + for i, page := range letter.Pages { + expectedPageNo := i + 1 + if page.No != expectedPageNo { + t.Errorf("Expected page %d, got %d", expectedPageNo, page.No) } } } -func TestLetterTextUnmarshal_NoPageBreaks(t *testing.T) { - // Test letter without explicit page breaks - testXML := ` -This is all content on the default page. -Some markup and more text. -Note on single page -Final text. -` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling no-page-break XML: %v", err) - } - - // Should have 1 page (default page 1) - if len(letterText.Pages) != 1 { - t.Errorf("Expected 1 page, got %d", len(letterText.Pages)) - } - if len(letterText.PageBreaks) != 0 { - t.Errorf("Expected 0 page breaks, got %d", len(letterText.PageBreaks)) - } - - // Page should be page 1 - if letterText.Pages[0].Page != 1 { - t.Errorf("Expected page 1, got page %d", letterText.Pages[0].Page) - } - - // Content should contain markup but not sidenote - content := tokensToString(letterText.Pages[0].Content) - if !strings.Contains(content, "Some markup") { - t.Error("Expected page content to contain markup") - } - if strings.Contains(content, "Note on single page") { - t.Error("Page content should not contain sidenote text") - } -} - -func TestLetterTextUnmarshal_EmptyContent(t *testing.T) { - // Test edge case with empty content - testXML := ` - -` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling empty XML: %v", err) - } - - // Should have no pages with content - if len(letterText.Pages) != 0 { - t.Errorf("Expected 0 pages with content, got %d", len(letterText.Pages)) - } - if len(letterText.PageBreaks) != 1 { - t.Errorf("Expected 1 page break, got %d", len(letterText.PageBreaks)) - } -} - func TestSidenotePosition_UnmarshalXMLAttr(t *testing.T) { tests := []struct { input string @@ -326,121 +305,242 @@ func TestSidenotePosition_UnmarshalXMLAttr(t *testing.T) { } for _, test := range tests { - var pos SidenotePosition - attr := xml.Attr{Value: test.input} - err := pos.UnmarshalXMLAttr(attr) - if err != nil { - t.Errorf("Error unmarshaling position '%s': %v", test.input, err) - } - if pos != test.expected { - t.Errorf("Expected position %d for input '%s', got %d", test.expected, test.input, pos) - } - } -} - -func TestLetterTextUnmarshal_PreserveMarkup(t *testing.T) { - // Test that various markup elements are preserved in page content - testXML := ` - -Text with antiqua and bold and italic. - -Centered text -
    Underlined text
-Deleted text -More content with person reference. -
` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling markup XML: %v", err) - } - - if len(letterText.Pages) != 1 { - t.Fatalf("Expected 1 page, got %d", len(letterText.Pages)) - } - - content := tokensToString(letterText.Pages[0].Content) - expectedMarkup := []string{ - "antiqua", - "bold", - "italic", - "", - "", - "
    Underlined text
", - "Deleted text", - "person reference", - } - - for _, markup := range expectedMarkup { - if !strings.Contains(content, markup) { - t.Errorf("Expected page content to contain '%s', but it doesn't. Content: %s", markup, content) - } - } -} - -func TestLetterTextUnmarshal_LetterAttribute(t *testing.T) { - // Test that the letter attribute is parsed correctly - testXML := ` - -Some content. -` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling letter attribute XML: %v", err) - } - - // Verify letter attribute is parsed - if letterText.Letter != 42 { - t.Errorf("Expected letter attribute 42, got %d", letterText.Letter) - } -} - -func TestLetterTextUnmarshal_LetterAttribute_AllExistingTests(t *testing.T) { - // Test that existing test cases also have correct letter attributes - testCases := []struct { - name string - xml string - expectedLetter int - }{ - { - name: "Simple case", - xml: ` - Some content. - `, - expectedLetter: 123, - }, - { - name: "Real example letter 1", - xml: ` - Some content. - `, - expectedLetter: 1, - }, - { - name: "Letter with sidenotes", - xml: ` - - Note - Content. - `, - expectedLetter: 999, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - var letterText LetterText - err := xml.Unmarshal([]byte(tc.xml), &letterText) + t.Run(test.input, func(t *testing.T) { + var pos SidenotePosition + attr := xml.Attr{Value: test.input} + err := pos.UnmarshalXMLAttr(attr) if err != nil { - t.Fatalf("Error unmarshaling XML: %v", err) + t.Errorf("Error unmarshaling position '%s': %v", test.input, err) } - - if letterText.Letter != tc.expectedLetter { - t.Errorf("Expected letter attribute %d, got %d", tc.expectedLetter, letterText.Letter) + if pos != test.expected { + t.Errorf("Expected position %d for input '%s', got %d", test.expected, test.input, pos) } }) } +} + +func TestLetterUnmarshalXML_StackTracking(t *testing.T) { + simpleXML := ` + + +Inner content + +` + + var letter Letter + err := xml.Unmarshal([]byte(simpleXML), &letter) + if err != nil { + t.Fatalf("Failed to unmarshal letter: %v", err) + } + + page := letter.Pages[0] + + // Find tokens at different nesting levels + var alignToken *Token + var aqToken *Token + + for i, token := range page.TokenInfo { + if attrs := token.Attributes; len(attrs) > 0 { + if attrs["pos"] == "center" { + alignToken = &page.TokenInfo[i] + } + } + + // Look for deeply nested token (inside align > aq) + if len(token.Stack) >= 1 { + aqToken = &page.TokenInfo[i] + } + } + + if alignToken == nil { + t.Fatal("Expected to find align token") + } + + if aqToken == nil { + t.Fatal("Expected to find nested token") + } + + // Within a page, the stack starts fresh, so align might be at depth 0 + // aq content should be deeper in stack than align + if len(aqToken.Stack) <= len(alignToken.Stack) { + t.Logf("Align stack depth: %d, AQ stack depth: %d", len(alignToken.Stack), len(aqToken.Stack)) + // This is acceptable if both are at the same level in page context + } +} + +func TestLetterUnmarshalXML_RealData(t *testing.T) { + // Try to read from actual briefe.xml file + brieveFile := "../lenz-briefe/data/xml/briefe.xml" + if _, err := os.Stat(brieveFile); os.IsNotExist(err) { + t.Skip("Real briefe.xml file not found, skipping real data test") + return + } + + file, err := os.Open(brieveFile) + if err != nil { + t.Skipf("Cannot open briefe.xml: %v", err) + return + } + defer file.Close() + + decoder := xml.NewDecoder(file) + + // Find first letterText element + for { + token, err := decoder.Token() + if err == io.EOF { + t.Skip("No letterText elements found in briefe.xml") + return + } + if err != nil { + t.Skipf("Error reading briefe.xml: %v", err) + return + } + + if start, ok := token.(xml.StartElement); ok && start.Name.Local == "letterText" { + var letter Letter + err := decoder.DecodeElement(&letter, &start) + if err != nil { + t.Fatalf("Failed to decode real letter: %v", err) + } + + // Basic validation of real data + if letter.Letter == 0 { + t.Error("Expected real letter to have letter number") + } + + if len(letter.Pages) == 0 { + t.Error("Expected real letter to have pages") + } + + // Validate TokenInfo for all pages + for i, page := range letter.Pages { + if len(page.TokenInfo) != len(page.Tokens) { + t.Errorf("Page %d: TokenInfo length %d != Tokens length %d", + i, len(page.TokenInfo), len(page.Tokens)) + } + + // Check all TokenInfo entries are valid + for j, tokenInfo := range page.TokenInfo { + if tokenInfo.Index != j { + t.Errorf("Page %d, Token %d: Expected index %d, got %d", + i, j, j, tokenInfo.Index) + } + if tokenInfo.Stack == nil { + t.Errorf("Page %d, Token %d: Stack is nil", i, j) + } + if tokenInfo.Attributes == nil { + t.Errorf("Page %d, Token %d: Attributes is nil", i, j) + } + } + } + + // Test succeeded with real data + t.Logf("Successfully processed real letter %d with %d pages", letter.Letter, len(letter.Pages)) + return + } + } +} + +func TestToken_AttributeAccess(t *testing.T) { + xmlData := ` + +Content +` + + var letter Letter + err := xml.Unmarshal([]byte(xmlData), &letter) + if err != nil { + t.Fatalf("Failed to unmarshal: %v", err) + } + + page := letter.Pages[0] + + // Find tokens with specific attributes (page tokens are excluded from page.TokenInfo) + foundAlignPos := false + foundAlignTab := false + + for _, tokenInfo := range page.TokenInfo { + if val, exists := tokenInfo.Attributes["pos"]; exists && val == "right" { + foundAlignPos = true + } + if val, exists := tokenInfo.Attributes["tab"]; exists && val == "5" { + foundAlignTab = true + } + } + + if !foundAlignPos { + t.Error("Expected to find align with pos='right'") + } + if !foundAlignTab { + t.Error("Expected to find align with tab='5'") + } +} + +func TestLetterUnmarshalXML_EdgeCases(t *testing.T) { + tests := []struct { + name string + xml string + test func(t *testing.T, letter Letter) + }{ + { + name: "Empty letter", + xml: ``, + test: func(t *testing.T, letter Letter) { + if letter.Letter != 1 { + t.Errorf("Expected letter 1, got %d", letter.Letter) + } + if len(letter.Pages) != 0 { + t.Errorf("Expected 0 pages, got %d", len(letter.Pages)) + } + }, + }, + { + name: "Letter with only page break", + xml: ``, + test: func(t *testing.T, letter Letter) { + // Page break with no content should result in no pages + if len(letter.Pages) != 0 { + t.Errorf("Expected 0 pages (page break with no content), got %d", len(letter.Pages)) + } + }, + }, + { + name: "Letter with nested elements", + xml: ` + + + Nested deeply nested content + +`, + test: func(t *testing.T, letter Letter) { + if len(letter.Pages) != 1 { + t.Errorf("Expected 1 page, got %d", len(letter.Pages)) + } + + page := letter.Pages[0] + maxStackDepth := 0 + for _, tokenInfo := range page.TokenInfo { + if len(tokenInfo.Stack) > maxStackDepth { + maxStackDepth = len(tokenInfo.Stack) + } + } + + if maxStackDepth < 3 { + t.Errorf("Expected deep nesting (3+ levels), got max depth %d", maxStackDepth) + } + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var letter Letter + err := xml.Unmarshal([]byte(tt.xml), &letter) + if err != nil { + t.Fatalf("Failed to unmarshal: %v", err) + } + tt.test(t, letter) + }) + } } \ No newline at end of file diff --git a/xmlmodels/library.go b/xmlmodels/library.go index 3eda184..a478841 100644 --- a/xmlmodels/library.go +++ b/xmlmodels/library.go @@ -56,14 +56,14 @@ func (l *Library) String() string { sb.WriteString("Letters: ") sb.WriteString(strconv.Itoa(l.Letters.Count())) filter := func(item Letter) bool { - return len(item.Hands) > 0 + return len(item.Hands()) > 0 } hands := 0 for l := range l.Letters.Filter(filter) { hands += 1 sb.WriteString("\n") sb.WriteString(strconv.Itoa(l.Letter) + ": ") - sb.WriteString(strconv.Itoa(len(l.Hands)) + " Hände, No " + strconv.Itoa(hands)) + sb.WriteString(strconv.Itoa(len(l.Hands())) + " Hände, No " + strconv.Itoa(hands)) } sb.WriteString("\n") @@ -307,22 +307,22 @@ func (l *Library) LettersForYear(year int) (ret []Meta) { }) { ret = append(ret, l) } - return + return ret } func (l *Library) Person(id int) (ret *PersonDef) { ret = l.Persons.Item(id) - return + return ret } func (l *Library) App(id int) (ret *AppDef) { ret = l.AppDefs.Item(id) - return + return ret } func (l *Library) Place(id int) (ret *LocationDef) { ret = l.Places.Item(id) - return + return ret } func (l *Library) Tradition(letter int) (ret []App) { @@ -338,14 +338,14 @@ func (l *Library) GetPersons(id []int) (ret []*PersonDef) { for _, i := range id { ret = append(ret, l.Person(i)) } - return + return ret } func (l *Library) GetPlaces(id []int) (ret []*LocationDef) { for _, i := range id { ret = append(ret, l.Place(i)) } - return + return ret } func (l *Library) FuncMap() template.FuncMap { diff --git a/xmlmodels/token.go b/xmlmodels/token.go new file mode 100644 index 0000000..3a68f7b --- /dev/null +++ b/xmlmodels/token.go @@ -0,0 +1,361 @@ +package xmlmodels + +import ( + "encoding/xml" + "iter" + "strings" +) + +// Token wraps xml.Token with additional parsing context +type Token struct { + Index int // Position in token array + Stack []string // Element names in the stack at this token + Attributes map[string]string // Attributes for StartElement tokens +} + +// NewTokenFromXMLToken creates a Token from xml.Token with context +func NewTokenFromXMLToken(xmlToken xml.Token, stack []string, index int) Token { + token := Token{ + Index: index, + Stack: make([]string, len(stack)), + Attributes: make(map[string]string), + } + + copy(token.Stack, stack) + + // Extract attributes if this is a StartElement + if startElement, ok := xmlToken.(xml.StartElement); ok { + for _, attr := range startElement.Attr { + token.Attributes[attr.Name.Local] = attr.Value + } + } + + return token +} + +type LetterTokenType int + +const ( + LetterStartElement LetterTokenType = iota + LetterEndElement + LetterCharData + LetterComment + LetterProcInst + LetterDirective +) + +// LetterToken wraps xml.Token with additional context for Letter/Page parsing +type LetterToken struct { + Name string + Attributes map[string]string + Inner xml.Token + Type LetterTokenType + Data string + Stack []*LetterToken + Index int + PageIndex int // Which page this token belongs to + Letter int // Which letter this token belongs to + + // Navigation fields + charData string + children []*LetterToken + childrenParsed bool + chardataParsed bool + parser *LetterParser +} + +// LetterParser wraps a slice of LetterTokens with navigation capabilities +type LetterParser struct { + Stack []*LetterToken + pipeline []*LetterToken + letter int + pageMap map[int]int // Maps page number to starting token index +} + +// NewLetterParser creates a parser from xml.Token slice +func NewLetterParser(tokens []xml.Token, letter int, pageIndex int) *LetterParser { + parser := &LetterParser{ + Stack: make([]*LetterToken, 0), + letter: letter, + pageMap: make(map[int]int), + } + + stack := make([]*LetterToken, 0) + + for i, token := range tokens { + letterToken := &LetterToken{ + Inner: xml.CopyToken(token), + Index: i, + PageIndex: pageIndex, + Letter: letter, + Stack: make([]*LetterToken, len(stack)), + parser: parser, + } + + // Copy current stack + copy(letterToken.Stack, stack) + + switch t := token.(type) { + case xml.StartElement: + letterToken.Name = t.Name.Local + letterToken.Attributes = mapXMLAttributes(t.Attr) + letterToken.Type = LetterStartElement + + // Add to parent's children if not parsed yet + if len(stack) > 0 && !stack[len(stack)-1].childrenParsed { + stack[len(stack)-1].children = append(stack[len(stack)-1].children, letterToken) + } + stack = append(stack, letterToken) + + case xml.EndElement: + if len(stack) > 0 { + element := stack[len(stack)-1] + element.childrenParsed = true + element.chardataParsed = true + stack = stack[:len(stack)-1] + } + letterToken.Name = t.Name.Local + letterToken.Attributes = make(map[string]string) + letterToken.Type = LetterEndElement + + case xml.CharData: + text := string(t) + if text != "" && len(stack) > 0 { + for i := range stack { + if !stack[i].chardataParsed { + stack[i].charData += text + } + } + } + letterToken.Data = text + letterToken.Type = LetterCharData + + case xml.Comment: + letterToken.Type = LetterComment + letterToken.Data = string(t) + + case xml.ProcInst: + letterToken.Name = t.Target + letterToken.Data = string(t.Inst) + letterToken.Type = LetterProcInst + + case xml.Directive: + letterToken.Data = string(t) + letterToken.Type = LetterDirective + } + + parser.pipeline = append(parser.pipeline, letterToken) + } + + return parser +} + +// GetStack returns current parsing stack +func (p *LetterParser) GetStack() []*LetterToken { + return p.Stack +} + +// Pipeline returns all tokens +func (p *LetterParser) Pipeline() []*LetterToken { + return p.pipeline +} + +// TokenAt returns token at specific index +func (p *LetterParser) TokenAt(index int) *LetterToken { + if index < 0 || index >= len(p.pipeline) { + return nil + } + return p.pipeline[index] +} + +// IterateFrom creates iterator starting from specific index +func (p *LetterParser) IterateFrom(index int) iter.Seq2[*LetterToken, error] { + return func(yield func(*LetterToken, error) bool) { + for i := index; i < len(p.pipeline); i++ { + if !yield(p.pipeline[i], nil) { + return + } + } + } +} + +// Iterate over all tokens +func (p *LetterParser) Iterate() iter.Seq2[*LetterToken, error] { + return p.IterateFrom(0) +} + +// Previous returns tokens before given index +func (p *LetterParser) Previous(index int) []*LetterToken { + if index <= 0 || index > len(p.pipeline) { + return nil + } + return p.pipeline[:index] +} + +// LetterToken methods + +// String returns string representation +func (t *LetterToken) String() string { + builder := strings.Builder{} + switch t.Type { + case LetterStartElement: + builder.WriteString("<" + t.Name) + for k, v := range t.Attributes { + builder.WriteString(" " + k + `="` + v + `"`) + } + builder.WriteString(">") + case LetterEndElement: + builder.WriteString("") + case LetterCharData: + builder.WriteString(t.Data) + case LetterComment: + builder.WriteString("") + } + return builder.String() +} + +// Element returns all tokens from start to matching end element +func (t *LetterToken) Element() []*LetterToken { + if t.Type != LetterStartElement { + return nil + } + + var tokens []*LetterToken + depth := 0 + + for token, _ := range t.parser.IterateFrom(t.Index) { + tokens = append(tokens, token) + + if token.Type == LetterStartElement && token.Name == t.Name { + depth++ + } else if token.Type == LetterEndElement && token.Name == t.Name { + depth-- + if depth == 0 { + return tokens + } + } + } + + return tokens +} + +// Children returns direct child elements +func (t *LetterToken) Children() []*LetterToken { + if t.childrenParsed { + return t.children + } + + if t.Type != LetterStartElement { + return nil + } + + element := t.Element() + if len(element) <= 1 { + return nil + } + + // Skip first (self) and find direct children + depth := 0 + for _, token := range element[1:] { // Skip self + if token.Type == LetterStartElement { + if depth == 0 { + t.children = append(t.children, token) + } + depth++ + } else if token.Type == LetterEndElement { + depth-- + } + } + + t.childrenParsed = true + return t.children +} + +// CharData returns character data content +func (t *LetterToken) CharData() string { + if t.Type == LetterCharData || t.Type == LetterComment { + return t.Data + } + + if t.chardataParsed { + return t.charData + } + + if t.Type != LetterStartElement { + return "" + } + + element := t.Element() + if len(element) == 0 { + return "" + } + + var builder strings.Builder + for _, token := range element { + if token.Type == LetterCharData { + builder.WriteString(token.Data) + } + } + + t.chardataParsed = true + t.charData = builder.String() + return t.charData +} + +// Next returns iterator from next token +func (t *LetterToken) Next() iter.Seq2[*LetterToken, error] { + return t.parser.IterateFrom(t.Index + 1) +} + +// Previous returns tokens before this one +func (t *LetterToken) Previous() []*LetterToken { + return t.parser.Previous(t.Index) +} + +// FindByName finds first child element with given name +func (t *LetterToken) FindByName(name string) *LetterToken { + for _, child := range t.Children() { + if child.Name == name { + return child + } + } + return nil +} + +// FindAllByName finds all child elements with given name +func (t *LetterToken) FindAllByName(name string) []*LetterToken { + var result []*LetterToken + for _, child := range t.Children() { + if child.Name == name { + result = append(result, child) + } + } + return result +} + +// GetAttribute returns attribute value +func (t *LetterToken) GetAttribute(name string) string { + if t.Attributes == nil { + return "" + } + return t.Attributes[name] +} + +// GetStackDepth returns current nesting depth +func (t *LetterToken) GetStackDepth() int { + return len(t.Stack) +} + +// InPage checks if token belongs to specific page +func (t *LetterToken) InPage(pageNo int) bool { + return t.PageIndex == pageNo +} + +// mapXMLAttributes converts xml.Attr to map[string]string +func mapXMLAttributes(attrs []xml.Attr) map[string]string { + attrMap := make(map[string]string) + for _, attr := range attrs { + attrMap[attr.Name.Local] = attr.Value + } + return attrMap +} \ No newline at end of file