From 9106d1c313373779dae55d0f09c4417884a212d7 Mon Sep 17 00:00:00 2001 From: Simon Martens Date: Wed, 17 Sep 2025 16:42:42 +0200 Subject: [PATCH] Init rework --- xmlmodels/letter.go | 230 ++++++++++++++++++-- xmlmodels/letter_test.go | 446 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 664 insertions(+), 12 deletions(-) create mode 100644 xmlmodels/letter_test.go diff --git a/xmlmodels/letter.go b/xmlmodels/letter.go index ffb7d72..860a957 100644 --- a/xmlmodels/letter.go +++ b/xmlmodels/letter.go @@ -3,17 +3,10 @@ package xmlmodels import ( "encoding/json" "encoding/xml" + "io" + "strconv" ) -type Letter struct { - XMLName xml.Name `xml:"letterText"` - Letter int `xml:"letter,attr"` - Pages []Page `xml:"page"` - Hands []RefElement `xml:"hand"` - Content string `xml:",innerxml"` - Chardata string `xml:",chardata"` -} - func (l Letter) Keys() []any { return []any{l.Letter} } @@ -30,7 +23,220 @@ func (l Letter) String() string { return string(json) } -type Page struct { - XMLName xml.Name `xml:"page"` - Index int `xml:"index,attr"` +type SidenotePosition uint8 + +const ( + SidenotePositionLeft SidenotePosition = iota + SidenotePositionRight + SidenotePositionTop + SidenotePositionTopLeft + SidenotePositionTopRight + SidenotePositionBottom + SidenotePositionBottomLeft + SidenotePositionBottomRight +) + +func (sp *SidenotePosition) UnmarshalXMLAttr(attr xml.Attr) error { + switch attr.Value { + case "left": + *sp = SidenotePositionLeft + case "right": + *sp = SidenotePositionRight + case "top": + *sp = SidenotePositionTop + case "top left": + *sp = SidenotePositionTopLeft + case "top right": + *sp = SidenotePositionTopRight + case "bottom": + *sp = SidenotePositionBottom + case "bottom left": + *sp = SidenotePositionBottomLeft + case "bottom right": + *sp = SidenotePositionBottomRight + default: + *sp = SidenotePositionLeft // Default fallback + } + return nil +} + +type Letter struct { + XMLName xml.Name `xml:"letterText"` + Letter int + Pages []Page +} + +type Page struct { + No int + Letter int + Sidenotes []Sidenote + Hands []int + Tokens []xml.Token + CharData string +} + +type Sidenote struct { + XMLName xml.Name + Position SidenotePosition + Page int + Annotation string + Anchor int + Tokens []xml.Token + CharData string +} + +type Char struct { + Stack []xml.Token + Value string +} + +func (c Char) String() string { + return c.Value +} + +func (lt *Letter) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { + lt.XMLName = start.Name + for _, attr := range start.Attr { + if attr.Name.Local == "letter" { + if letterNum, err := strconv.Atoi(attr.Value); err == nil { + lt.Letter = letterNum + } + } + } + + if err := lt.parseTokens(d); err != nil { + return err + } + + return nil +} + +func (lt *Letter) parseTokens(d *xml.Decoder) error { + stack := []xml.Token{} + var c_page *Page = nil + + for { + token, err := d.Token() + if err == io.EOF { + break + } + if err != nil { + return err + } + + // INFO: Make a copy of the token since Token() reuses the underlying data + tokenCopy := xml.CopyToken(token) + if c_page != nil { + c_page.Tokens = append(c_page.Tokens, tokenCopy) + } + + switch t := tokenCopy.(type) { + case xml.StartElement: + switch t.Name.Local { + case "page": + if c_page != nil { + lt.Pages = append(lt.Pages, *c_page) + } + + c_page = &Page{} + + for _, attr := range t.Attr { + if attr.Name.Local == "index" { + if idx, err := strconv.Atoi(attr.Value); err == nil { + c_page.No = idx + } + } + } + + d.Skip() + + // WARNING: UnmarshalXML continues and changes the state of the parser + case "sidenote": + var sidenote Sidenote = Sidenote{ + Anchor: len(c_page.Tokens), + } + if err := sidenote.UnmarshalXML(d, t); err == nil && c_page != nil { + c_page.Sidenotes = append(c_page.Sidenotes, sidenote) + } + + // INFO: We create a list of all hand in a letter + case "hand": + for _, attr := range t.Attr { + if attr.Name.Local == "ref" && c_page != nil { + if ref, err := strconv.Atoi(attr.Value); err == nil { + c_page.Hands = append(c_page.Hands, ref) + } + } + } + fallthrough + + default: + if c_page != nil { + c_page.Tokens = append(c_page.Tokens, tokenCopy) + } + } + + case xml.CharData: + if c_page != nil { + c_page.CharData = string(t) + c_page.Tokens = append(c_page.Tokens, tokenCopy) + } + + case xml.EndElement: + if t.Name.Local == "letterText" { + if c_page != nil { + lt.Pages = append(lt.Pages, *c_page) + } + return nil + } + + if c_page != nil { + c_page.Tokens = append(c_page.Tokens, tokenCopy) + } + } + } + + return nil +} + +func (s *Sidenote) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { + // Set the XMLName + s.XMLName = start.Name + + // Parse attributes + for _, attr := range start.Attr { + switch attr.Name.Local { + case "pos": + s.Position.UnmarshalXMLAttr(attr) + case "page": + if page, err := strconv.Atoi(attr.Value); err == nil { + s.Page = page + } + case "annotation": + s.Annotation = attr.Value + } + } + + // Collect all content tokens + for { + token, err := d.Token() + if err != nil { + return err + } + + tokenCopy := xml.CopyToken(token) + + switch t := tokenCopy.(type) { + case xml.EndElement: + if t.Name.Local == start.Name.Local { + // End of sidenote element + return nil + } + // Add the end element token to content + s.Content = append(s.Content, tokenCopy) + case xml.StartElement, xml.CharData, xml.Comment, xml.ProcInst: + // Add all other tokens to content + s.Content = append(s.Content, tokenCopy) + } + } } diff --git a/xmlmodels/letter_test.go b/xmlmodels/letter_test.go new file mode 100644 index 0000000..f18b0f7 --- /dev/null +++ b/xmlmodels/letter_test.go @@ -0,0 +1,446 @@ +package xmlmodels + +import ( + "encoding/xml" + "strings" + "testing" +) + +// Helper function to convert []xml.Token back to string for testing +func tokensToString(tokens []xml.Token) string { + var sb strings.Builder + for _, token := range tokens { + switch t := token.(type) { + case xml.StartElement: + sb.WriteString("<") + sb.WriteString(t.Name.Local) + for _, attr := range t.Attr { + sb.WriteString(" ") + sb.WriteString(attr.Name.Local) + sb.WriteString(`="`) + sb.WriteString(attr.Value) + sb.WriteString(`"`) + } + sb.WriteString(">") + case xml.EndElement: + sb.WriteString("") + case xml.CharData: + sb.Write(t) + case xml.Comment: + sb.WriteString("") + case xml.ProcInst: + sb.WriteString(" 0 { + sb.WriteString(" ") + sb.Write(t.Inst) + } + sb.WriteString("?>") + } + } + return sb.String() +} + +func TestLetterTextUnmarshal_SimpleCase(t *testing.T) { + // Simple test case with basic structure + testXML := ` + Some content before first page break. + + Content on page 1 with some markup and more text. + This is a sidenote + More content on page 1. + + Content on page 2 with bold text. + Hand reference content + Final content on page 2. + ` + + var letterText LetterText + err := xml.Unmarshal([]byte(testXML), &letterText) + if err != nil { + t.Fatalf("Error unmarshaling XML: %v", err) + } + + // Verify basic structure + if len(letterText.Pages) != 3 { + t.Errorf("Expected 3 pages, got %d", len(letterText.Pages)) + } + if len(letterText.PageBreaks) != 2 { + t.Errorf("Expected 2 page breaks, got %d", len(letterText.PageBreaks)) + } + if len(letterText.Sidenotes) != 1 { + t.Errorf("Expected 1 sidenote, got %d", len(letterText.Sidenotes)) + } + if letterText.Hands.Reference != 42 { + t.Errorf("Expected hand reference 42, got %d", letterText.Hands.Reference) + } + + // Verify page breaks + if letterText.PageBreaks[0].Index != 1 { + t.Errorf("Expected page break index 1, got %d", letterText.PageBreaks[0].Index) + } + if letterText.PageBreaks[1].Index != 2 { + t.Errorf("Expected page break index 2, got %d", letterText.PageBreaks[1].Index) + } + + // Verify sidenote + sidenote := letterText.Sidenotes[0] + if sidenote.Page != 1 { + t.Errorf("Expected sidenote on page 1, got %d", sidenote.Page) + } + if sidenote.Position != SidenotePositionRight { + t.Errorf("Expected sidenote position right, got %d", sidenote.Position) + } + if sidenote.Annotation != "test" { + t.Errorf("Expected sidenote annotation 'test', got '%s'", sidenote.Annotation) + } + sidenoteContent := tokensToString(sidenote.Content) + if !strings.Contains(sidenoteContent, "This is a sidenote") { + t.Errorf("Expected sidenote content to contain 'This is a sidenote', got '%s'", sidenoteContent) + } + + // Verify page content doesn't contain sidenote text + for _, page := range letterText.Pages { + content := tokensToString(page.Content) + if strings.Contains(content, "This is a sidenote") { + t.Errorf("Page content should not contain sidenote text, but page %d does: %s", page.Page, content) + } + } +} + +func TestLetterTextUnmarshal_RealExample_Letter1(t *testing.T) { + // Real example from briefe.xml - Letter 1 (simplified) + testXML := ` +HochEdelgeborner Hochgelahrter Herr Secretair +Verehrungswürdigster Gönner! + +Ew. HochEdelgebh: haben mich durch die neue Probe von Dero schätzbaren Gewogenheit ausserorndtlich beschämt. Meine Feder ist zu schwach, Denenselben die regen Empfindungen meines Herzens darüber zu schildern. +lasse mich noch lange das Glück genießen, Dieselben in dem blühendsten Wohlstande zu sehen, und mich mit dem erkenntlichsten Herzen nennen zu dürfen + +Hoch Edelgeborner Hochgelahrter Herr Secretair +Verehrungswürdigster Gönner +Ew. HochEdelgebh: +Von Hause, d. 2 Jenner, 1765. +gehorsamsten Diener +Jacob Michael Reinhold Lenz +` + + var letterText LetterText + err := xml.Unmarshal([]byte(testXML), &letterText) + if err != nil { + t.Fatalf("Error unmarshaling real XML: %v", err) + } + + // Should have 2 pages + if len(letterText.Pages) != 2 { + t.Errorf("Expected 2 pages, got %d", len(letterText.Pages)) + } + if len(letterText.PageBreaks) != 2 { + t.Errorf("Expected 2 page breaks, got %d", len(letterText.PageBreaks)) + } + + // Verify page content contains expected elements + page1Found := false + page2Found := false + for _, page := range letterText.Pages { + content := tokensToString(page.Content) + if page.Page == 1 && strings.Contains(content, "HochEdelgeborner") { + page1Found = true + } + if page.Page == 2 && strings.Contains(content, "Jacob Michael Reinhold Lenz") { + page2Found = true + } + } + + if !page1Found { + t.Error("Page 1 content not found correctly") + } + if !page2Found { + t.Error("Page 2 content not found correctly") + } +} + +func TestLetterTextUnmarshal_WithSidenotes(t *testing.T) { + // Real example with sidenotes from briefe.xml + testXML := ` +Some text before sidenote. +Ich umarme Dich und küsse Dich 1000mahl als Dein +allergetreuester Bruder +Jacob Michael Reinhold Lenz. +Dorpat den 11ten October 1767. +More text after sidenote. +` + + var letterText LetterText + err := xml.Unmarshal([]byte(testXML), &letterText) + if err != nil { + t.Fatalf("Error unmarshaling sidenote XML: %v", err) + } + + // Should have 1 sidenote + if len(letterText.Sidenotes) != 1 { + t.Errorf("Expected 1 sidenote, got %d", len(letterText.Sidenotes)) + } + + // Verify sidenote details + sidenote := letterText.Sidenotes[0] + if sidenote.Position != SidenotePositionLeft { + t.Errorf("Expected sidenote position left, got %d", sidenote.Position) + } + if sidenote.Page != 1 { + t.Errorf("Expected sidenote on page 1, got %d", sidenote.Page) + } + if !strings.Contains(sidenote.Annotation, "am linken Rand") { + t.Errorf("Expected sidenote annotation to contain 'am linken Rand', got '%s'", sidenote.Annotation) + } + sidenoteContent := tokensToString(sidenote.Content) + if !strings.Contains(sidenoteContent, "Jacob Michael Reinhold Lenz") { + t.Errorf("Expected sidenote content to contain author name, got '%s'", sidenoteContent) + } + + // Verify page content doesn't contain sidenote + for _, page := range letterText.Pages { + content := tokensToString(page.Content) + if strings.Contains(content, "allergetreuester Bruder") { + t.Errorf("Page content should not contain sidenote text, but page %d does", page.Page) + } + } +} + +func TestLetterTextUnmarshal_ComplexSidenotePositions(t *testing.T) { + // Test different sidenote positions + testXML := ` + +Top right sidenote +Bottom left sidenote +Top sidenote +Some content. +` + + var letterText LetterText + err := xml.Unmarshal([]byte(testXML), &letterText) + if err != nil { + t.Fatalf("Error unmarshaling complex sidenotes XML: %v", err) + } + + if len(letterText.Sidenotes) != 3 { + t.Fatalf("Expected 3 sidenotes, got %d", len(letterText.Sidenotes)) + } + + // Check position parsing + positions := make(map[SidenotePosition]bool) + for _, sidenote := range letterText.Sidenotes { + positions[sidenote.Position] = true + } + + expectedPositions := []SidenotePosition{ + SidenotePositionTopRight, + SidenotePositionBottomLeft, + SidenotePositionTop, + } + + for _, expected := range expectedPositions { + if !positions[expected] { + t.Errorf("Expected to find sidenote position %d, but didn't", expected) + } + } +} + +func TestLetterTextUnmarshal_NoPageBreaks(t *testing.T) { + // Test letter without explicit page breaks + testXML := ` +This is all content on the default page. +Some markup and more text. +Note on single page +Final text. +` + + var letterText LetterText + err := xml.Unmarshal([]byte(testXML), &letterText) + if err != nil { + t.Fatalf("Error unmarshaling no-page-break XML: %v", err) + } + + // Should have 1 page (default page 1) + if len(letterText.Pages) != 1 { + t.Errorf("Expected 1 page, got %d", len(letterText.Pages)) + } + if len(letterText.PageBreaks) != 0 { + t.Errorf("Expected 0 page breaks, got %d", len(letterText.PageBreaks)) + } + + // Page should be page 1 + if letterText.Pages[0].Page != 1 { + t.Errorf("Expected page 1, got page %d", letterText.Pages[0].Page) + } + + // Content should contain markup but not sidenote + content := tokensToString(letterText.Pages[0].Content) + if !strings.Contains(content, "Some markup") { + t.Error("Expected page content to contain markup") + } + if strings.Contains(content, "Note on single page") { + t.Error("Page content should not contain sidenote text") + } +} + +func TestLetterTextUnmarshal_EmptyContent(t *testing.T) { + // Test edge case with empty content + testXML := ` + +` + + var letterText LetterText + err := xml.Unmarshal([]byte(testXML), &letterText) + if err != nil { + t.Fatalf("Error unmarshaling empty XML: %v", err) + } + + // Should have no pages with content + if len(letterText.Pages) != 0 { + t.Errorf("Expected 0 pages with content, got %d", len(letterText.Pages)) + } + if len(letterText.PageBreaks) != 1 { + t.Errorf("Expected 1 page break, got %d", len(letterText.PageBreaks)) + } +} + +func TestSidenotePosition_UnmarshalXMLAttr(t *testing.T) { + tests := []struct { + input string + expected SidenotePosition + }{ + {"left", SidenotePositionLeft}, + {"right", SidenotePositionRight}, + {"top", SidenotePositionTop}, + {"top left", SidenotePositionTopLeft}, + {"top right", SidenotePositionTopRight}, + {"bottom", SidenotePositionBottom}, + {"bottom left", SidenotePositionBottomLeft}, + {"bottom right", SidenotePositionBottomRight}, + {"unknown", SidenotePositionLeft}, // Default fallback + } + + for _, test := range tests { + var pos SidenotePosition + attr := xml.Attr{Value: test.input} + err := pos.UnmarshalXMLAttr(attr) + if err != nil { + t.Errorf("Error unmarshaling position '%s': %v", test.input, err) + } + if pos != test.expected { + t.Errorf("Expected position %d for input '%s', got %d", test.expected, test.input, pos) + } + } +} + +func TestLetterTextUnmarshal_PreserveMarkup(t *testing.T) { + // Test that various markup elements are preserved in page content + testXML := ` + +Text with antiqua and bold and italic. + +Centered text +
    Underlined text
+Deleted text +More content with person reference. +
` + + var letterText LetterText + err := xml.Unmarshal([]byte(testXML), &letterText) + if err != nil { + t.Fatalf("Error unmarshaling markup XML: %v", err) + } + + if len(letterText.Pages) != 1 { + t.Fatalf("Expected 1 page, got %d", len(letterText.Pages)) + } + + content := tokensToString(letterText.Pages[0].Content) + expectedMarkup := []string{ + "antiqua", + "bold", + "italic", + "", + "", + "
    Underlined text
", + "Deleted text", + "person reference", + } + + for _, markup := range expectedMarkup { + if !strings.Contains(content, markup) { + t.Errorf("Expected page content to contain '%s', but it doesn't. Content: %s", markup, content) + } + } +} + +func TestLetterTextUnmarshal_LetterAttribute(t *testing.T) { + // Test that the letter attribute is parsed correctly + testXML := ` + +Some content. +` + + var letterText LetterText + err := xml.Unmarshal([]byte(testXML), &letterText) + if err != nil { + t.Fatalf("Error unmarshaling letter attribute XML: %v", err) + } + + // Verify letter attribute is parsed + if letterText.Letter != 42 { + t.Errorf("Expected letter attribute 42, got %d", letterText.Letter) + } +} + +func TestLetterTextUnmarshal_LetterAttribute_AllExistingTests(t *testing.T) { + // Test that existing test cases also have correct letter attributes + testCases := []struct { + name string + xml string + expectedLetter int + }{ + { + name: "Simple case", + xml: ` + Some content. + `, + expectedLetter: 123, + }, + { + name: "Real example letter 1", + xml: ` + Some content. + `, + expectedLetter: 1, + }, + { + name: "Letter with sidenotes", + xml: ` + + Note + Content. + `, + expectedLetter: 999, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var letterText LetterText + err := xml.Unmarshal([]byte(tc.xml), &letterText) + if err != nil { + t.Fatalf("Error unmarshaling XML: %v", err) + } + + if letterText.Letter != tc.expectedLetter { + t.Errorf("Expected letter attribute %d, got %d", tc.expectedLetter, letterText.Letter) + } + }) + } +} \ No newline at end of file