diff --git a/xmlmodels/letter.go b/xmlmodels/letter.go index 860a957..ffb7d72 100644 --- a/xmlmodels/letter.go +++ b/xmlmodels/letter.go @@ -3,10 +3,17 @@ package xmlmodels import ( "encoding/json" "encoding/xml" - "io" - "strconv" ) +type Letter struct { + XMLName xml.Name `xml:"letterText"` + Letter int `xml:"letter,attr"` + Pages []Page `xml:"page"` + Hands []RefElement `xml:"hand"` + Content string `xml:",innerxml"` + Chardata string `xml:",chardata"` +} + func (l Letter) Keys() []any { return []any{l.Letter} } @@ -23,220 +30,7 @@ func (l Letter) String() string { return string(json) } -type SidenotePosition uint8 - -const ( - SidenotePositionLeft SidenotePosition = iota - SidenotePositionRight - SidenotePositionTop - SidenotePositionTopLeft - SidenotePositionTopRight - SidenotePositionBottom - SidenotePositionBottomLeft - SidenotePositionBottomRight -) - -func (sp *SidenotePosition) UnmarshalXMLAttr(attr xml.Attr) error { - switch attr.Value { - case "left": - *sp = SidenotePositionLeft - case "right": - *sp = SidenotePositionRight - case "top": - *sp = SidenotePositionTop - case "top left": - *sp = SidenotePositionTopLeft - case "top right": - *sp = SidenotePositionTopRight - case "bottom": - *sp = SidenotePositionBottom - case "bottom left": - *sp = SidenotePositionBottomLeft - case "bottom right": - *sp = SidenotePositionBottomRight - default: - *sp = SidenotePositionLeft // Default fallback - } - return nil -} - -type Letter struct { - XMLName xml.Name `xml:"letterText"` - Letter int - Pages []Page -} - type Page struct { - No int - Letter int - Sidenotes []Sidenote - Hands []int - Tokens []xml.Token - CharData string -} - -type Sidenote struct { - XMLName xml.Name - Position SidenotePosition - Page int - Annotation string - Anchor int - Tokens []xml.Token - CharData string -} - -type Char struct { - Stack []xml.Token - Value string -} - -func (c Char) String() string { - return c.Value -} - -func (lt *Letter) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { - lt.XMLName = start.Name - for _, attr := range start.Attr { - if attr.Name.Local == "letter" { - if letterNum, err := strconv.Atoi(attr.Value); err == nil { - lt.Letter = letterNum - } - } - } - - if err := lt.parseTokens(d); err != nil { - return err - } - - return nil -} - -func (lt *Letter) parseTokens(d *xml.Decoder) error { - stack := []xml.Token{} - var c_page *Page = nil - - for { - token, err := d.Token() - if err == io.EOF { - break - } - if err != nil { - return err - } - - // INFO: Make a copy of the token since Token() reuses the underlying data - tokenCopy := xml.CopyToken(token) - if c_page != nil { - c_page.Tokens = append(c_page.Tokens, tokenCopy) - } - - switch t := tokenCopy.(type) { - case xml.StartElement: - switch t.Name.Local { - case "page": - if c_page != nil { - lt.Pages = append(lt.Pages, *c_page) - } - - c_page = &Page{} - - for _, attr := range t.Attr { - if attr.Name.Local == "index" { - if idx, err := strconv.Atoi(attr.Value); err == nil { - c_page.No = idx - } - } - } - - d.Skip() - - // WARNING: UnmarshalXML continues and changes the state of the parser - case "sidenote": - var sidenote Sidenote = Sidenote{ - Anchor: len(c_page.Tokens), - } - if err := sidenote.UnmarshalXML(d, t); err == nil && c_page != nil { - c_page.Sidenotes = append(c_page.Sidenotes, sidenote) - } - - // INFO: We create a list of all hand in a letter - case "hand": - for _, attr := range t.Attr { - if attr.Name.Local == "ref" && c_page != nil { - if ref, err := strconv.Atoi(attr.Value); err == nil { - c_page.Hands = append(c_page.Hands, ref) - } - } - } - fallthrough - - default: - if c_page != nil { - c_page.Tokens = append(c_page.Tokens, tokenCopy) - } - } - - case xml.CharData: - if c_page != nil { - c_page.CharData = string(t) - c_page.Tokens = append(c_page.Tokens, tokenCopy) - } - - case xml.EndElement: - if t.Name.Local == "letterText" { - if c_page != nil { - lt.Pages = append(lt.Pages, *c_page) - } - return nil - } - - if c_page != nil { - c_page.Tokens = append(c_page.Tokens, tokenCopy) - } - } - } - - return nil -} - -func (s *Sidenote) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { - // Set the XMLName - s.XMLName = start.Name - - // Parse attributes - for _, attr := range start.Attr { - switch attr.Name.Local { - case "pos": - s.Position.UnmarshalXMLAttr(attr) - case "page": - if page, err := strconv.Atoi(attr.Value); err == nil { - s.Page = page - } - case "annotation": - s.Annotation = attr.Value - } - } - - // Collect all content tokens - for { - token, err := d.Token() - if err != nil { - return err - } - - tokenCopy := xml.CopyToken(token) - - switch t := tokenCopy.(type) { - case xml.EndElement: - if t.Name.Local == start.Name.Local { - // End of sidenote element - return nil - } - // Add the end element token to content - s.Content = append(s.Content, tokenCopy) - case xml.StartElement, xml.CharData, xml.Comment, xml.ProcInst: - // Add all other tokens to content - s.Content = append(s.Content, tokenCopy) - } - } + XMLName xml.Name `xml:"page"` + Index int `xml:"index,attr"` } diff --git a/xmlmodels/letter_test.go b/xmlmodels/letter_test.go deleted file mode 100644 index f18b0f7..0000000 --- a/xmlmodels/letter_test.go +++ /dev/null @@ -1,446 +0,0 @@ -package xmlmodels - -import ( - "encoding/xml" - "strings" - "testing" -) - -// Helper function to convert []xml.Token back to string for testing -func tokensToString(tokens []xml.Token) string { - var sb strings.Builder - for _, token := range tokens { - switch t := token.(type) { - case xml.StartElement: - sb.WriteString("<") - sb.WriteString(t.Name.Local) - for _, attr := range t.Attr { - sb.WriteString(" ") - sb.WriteString(attr.Name.Local) - sb.WriteString(`="`) - sb.WriteString(attr.Value) - sb.WriteString(`"`) - } - sb.WriteString(">") - case xml.EndElement: - sb.WriteString("") - case xml.CharData: - sb.Write(t) - case xml.Comment: - sb.WriteString("") - case xml.ProcInst: - sb.WriteString(" 0 { - sb.WriteString(" ") - sb.Write(t.Inst) - } - sb.WriteString("?>") - } - } - return sb.String() -} - -func TestLetterTextUnmarshal_SimpleCase(t *testing.T) { - // Simple test case with basic structure - testXML := ` - Some content before first page break. - - Content on page 1 with some markup and more text. - This is a sidenote - More content on page 1. - - Content on page 2 with bold text. - Hand reference content - Final content on page 2. - ` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling XML: %v", err) - } - - // Verify basic structure - if len(letterText.Pages) != 3 { - t.Errorf("Expected 3 pages, got %d", len(letterText.Pages)) - } - if len(letterText.PageBreaks) != 2 { - t.Errorf("Expected 2 page breaks, got %d", len(letterText.PageBreaks)) - } - if len(letterText.Sidenotes) != 1 { - t.Errorf("Expected 1 sidenote, got %d", len(letterText.Sidenotes)) - } - if letterText.Hands.Reference != 42 { - t.Errorf("Expected hand reference 42, got %d", letterText.Hands.Reference) - } - - // Verify page breaks - if letterText.PageBreaks[0].Index != 1 { - t.Errorf("Expected page break index 1, got %d", letterText.PageBreaks[0].Index) - } - if letterText.PageBreaks[1].Index != 2 { - t.Errorf("Expected page break index 2, got %d", letterText.PageBreaks[1].Index) - } - - // Verify sidenote - sidenote := letterText.Sidenotes[0] - if sidenote.Page != 1 { - t.Errorf("Expected sidenote on page 1, got %d", sidenote.Page) - } - if sidenote.Position != SidenotePositionRight { - t.Errorf("Expected sidenote position right, got %d", sidenote.Position) - } - if sidenote.Annotation != "test" { - t.Errorf("Expected sidenote annotation 'test', got '%s'", sidenote.Annotation) - } - sidenoteContent := tokensToString(sidenote.Content) - if !strings.Contains(sidenoteContent, "This is a sidenote") { - t.Errorf("Expected sidenote content to contain 'This is a sidenote', got '%s'", sidenoteContent) - } - - // Verify page content doesn't contain sidenote text - for _, page := range letterText.Pages { - content := tokensToString(page.Content) - if strings.Contains(content, "This is a sidenote") { - t.Errorf("Page content should not contain sidenote text, but page %d does: %s", page.Page, content) - } - } -} - -func TestLetterTextUnmarshal_RealExample_Letter1(t *testing.T) { - // Real example from briefe.xml - Letter 1 (simplified) - testXML := ` -HochEdelgeborner Hochgelahrter Herr Secretair -Verehrungswürdigster Gönner! - -Ew. HochEdelgebh: haben mich durch die neue Probe von Dero schätzbaren Gewogenheit ausserorndtlich beschämt. Meine Feder ist zu schwach, Denenselben die regen Empfindungen meines Herzens darüber zu schildern. -lasse mich noch lange das Glück genießen, Dieselben in dem blühendsten Wohlstande zu sehen, und mich mit dem erkenntlichsten Herzen nennen zu dürfen - -Hoch Edelgeborner Hochgelahrter Herr Secretair -Verehrungswürdigster Gönner -Ew. HochEdelgebh: -Von Hause, d. 2 Jenner, 1765. -gehorsamsten Diener -Jacob Michael Reinhold Lenz -` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling real XML: %v", err) - } - - // Should have 2 pages - if len(letterText.Pages) != 2 { - t.Errorf("Expected 2 pages, got %d", len(letterText.Pages)) - } - if len(letterText.PageBreaks) != 2 { - t.Errorf("Expected 2 page breaks, got %d", len(letterText.PageBreaks)) - } - - // Verify page content contains expected elements - page1Found := false - page2Found := false - for _, page := range letterText.Pages { - content := tokensToString(page.Content) - if page.Page == 1 && strings.Contains(content, "HochEdelgeborner") { - page1Found = true - } - if page.Page == 2 && strings.Contains(content, "Jacob Michael Reinhold Lenz") { - page2Found = true - } - } - - if !page1Found { - t.Error("Page 1 content not found correctly") - } - if !page2Found { - t.Error("Page 2 content not found correctly") - } -} - -func TestLetterTextUnmarshal_WithSidenotes(t *testing.T) { - // Real example with sidenotes from briefe.xml - testXML := ` -Some text before sidenote. -Ich umarme Dich und küsse Dich 1000mahl als Dein -allergetreuester Bruder -Jacob Michael Reinhold Lenz. -Dorpat den 11ten October 1767. -More text after sidenote. -` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling sidenote XML: %v", err) - } - - // Should have 1 sidenote - if len(letterText.Sidenotes) != 1 { - t.Errorf("Expected 1 sidenote, got %d", len(letterText.Sidenotes)) - } - - // Verify sidenote details - sidenote := letterText.Sidenotes[0] - if sidenote.Position != SidenotePositionLeft { - t.Errorf("Expected sidenote position left, got %d", sidenote.Position) - } - if sidenote.Page != 1 { - t.Errorf("Expected sidenote on page 1, got %d", sidenote.Page) - } - if !strings.Contains(sidenote.Annotation, "am linken Rand") { - t.Errorf("Expected sidenote annotation to contain 'am linken Rand', got '%s'", sidenote.Annotation) - } - sidenoteContent := tokensToString(sidenote.Content) - if !strings.Contains(sidenoteContent, "Jacob Michael Reinhold Lenz") { - t.Errorf("Expected sidenote content to contain author name, got '%s'", sidenoteContent) - } - - // Verify page content doesn't contain sidenote - for _, page := range letterText.Pages { - content := tokensToString(page.Content) - if strings.Contains(content, "allergetreuester Bruder") { - t.Errorf("Page content should not contain sidenote text, but page %d does", page.Page) - } - } -} - -func TestLetterTextUnmarshal_ComplexSidenotePositions(t *testing.T) { - // Test different sidenote positions - testXML := ` - -Top right sidenote -Bottom left sidenote -Top sidenote -Some content. -` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling complex sidenotes XML: %v", err) - } - - if len(letterText.Sidenotes) != 3 { - t.Fatalf("Expected 3 sidenotes, got %d", len(letterText.Sidenotes)) - } - - // Check position parsing - positions := make(map[SidenotePosition]bool) - for _, sidenote := range letterText.Sidenotes { - positions[sidenote.Position] = true - } - - expectedPositions := []SidenotePosition{ - SidenotePositionTopRight, - SidenotePositionBottomLeft, - SidenotePositionTop, - } - - for _, expected := range expectedPositions { - if !positions[expected] { - t.Errorf("Expected to find sidenote position %d, but didn't", expected) - } - } -} - -func TestLetterTextUnmarshal_NoPageBreaks(t *testing.T) { - // Test letter without explicit page breaks - testXML := ` -This is all content on the default page. -Some markup and more text. -Note on single page -Final text. -` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling no-page-break XML: %v", err) - } - - // Should have 1 page (default page 1) - if len(letterText.Pages) != 1 { - t.Errorf("Expected 1 page, got %d", len(letterText.Pages)) - } - if len(letterText.PageBreaks) != 0 { - t.Errorf("Expected 0 page breaks, got %d", len(letterText.PageBreaks)) - } - - // Page should be page 1 - if letterText.Pages[0].Page != 1 { - t.Errorf("Expected page 1, got page %d", letterText.Pages[0].Page) - } - - // Content should contain markup but not sidenote - content := tokensToString(letterText.Pages[0].Content) - if !strings.Contains(content, "Some markup") { - t.Error("Expected page content to contain markup") - } - if strings.Contains(content, "Note on single page") { - t.Error("Page content should not contain sidenote text") - } -} - -func TestLetterTextUnmarshal_EmptyContent(t *testing.T) { - // Test edge case with empty content - testXML := ` - -` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling empty XML: %v", err) - } - - // Should have no pages with content - if len(letterText.Pages) != 0 { - t.Errorf("Expected 0 pages with content, got %d", len(letterText.Pages)) - } - if len(letterText.PageBreaks) != 1 { - t.Errorf("Expected 1 page break, got %d", len(letterText.PageBreaks)) - } -} - -func TestSidenotePosition_UnmarshalXMLAttr(t *testing.T) { - tests := []struct { - input string - expected SidenotePosition - }{ - {"left", SidenotePositionLeft}, - {"right", SidenotePositionRight}, - {"top", SidenotePositionTop}, - {"top left", SidenotePositionTopLeft}, - {"top right", SidenotePositionTopRight}, - {"bottom", SidenotePositionBottom}, - {"bottom left", SidenotePositionBottomLeft}, - {"bottom right", SidenotePositionBottomRight}, - {"unknown", SidenotePositionLeft}, // Default fallback - } - - for _, test := range tests { - var pos SidenotePosition - attr := xml.Attr{Value: test.input} - err := pos.UnmarshalXMLAttr(attr) - if err != nil { - t.Errorf("Error unmarshaling position '%s': %v", test.input, err) - } - if pos != test.expected { - t.Errorf("Expected position %d for input '%s', got %d", test.expected, test.input, pos) - } - } -} - -func TestLetterTextUnmarshal_PreserveMarkup(t *testing.T) { - // Test that various markup elements are preserved in page content - testXML := ` - -Text with antiqua and bold and italic. - -Centered text -
    Underlined text
-Deleted text -More content with person reference. -
` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling markup XML: %v", err) - } - - if len(letterText.Pages) != 1 { - t.Fatalf("Expected 1 page, got %d", len(letterText.Pages)) - } - - content := tokensToString(letterText.Pages[0].Content) - expectedMarkup := []string{ - "antiqua", - "bold", - "italic", - "", - "", - "", - "Deleted text", - "person reference", - } - - for _, markup := range expectedMarkup { - if !strings.Contains(content, markup) { - t.Errorf("Expected page content to contain '%s', but it doesn't. Content: %s", markup, content) - } - } -} - -func TestLetterTextUnmarshal_LetterAttribute(t *testing.T) { - // Test that the letter attribute is parsed correctly - testXML := ` - -Some content. -` - - var letterText LetterText - err := xml.Unmarshal([]byte(testXML), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling letter attribute XML: %v", err) - } - - // Verify letter attribute is parsed - if letterText.Letter != 42 { - t.Errorf("Expected letter attribute 42, got %d", letterText.Letter) - } -} - -func TestLetterTextUnmarshal_LetterAttribute_AllExistingTests(t *testing.T) { - // Test that existing test cases also have correct letter attributes - testCases := []struct { - name string - xml string - expectedLetter int - }{ - { - name: "Simple case", - xml: ` - Some content. - `, - expectedLetter: 123, - }, - { - name: "Real example letter 1", - xml: ` - Some content. - `, - expectedLetter: 1, - }, - { - name: "Letter with sidenotes", - xml: ` - - Note - Content. - `, - expectedLetter: 999, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - var letterText LetterText - err := xml.Unmarshal([]byte(tc.xml), &letterText) - if err != nil { - t.Fatalf("Error unmarshaling XML: %v", err) - } - - if letterText.Letter != tc.expectedLetter { - t.Errorf("Expected letter attribute %d, got %d", tc.expectedLetter, letterText.Letter) - } - }) - } -} \ No newline at end of file