From 8f5338c0b873508b529d4cadde559e4519df6ff2 Mon Sep 17 00:00:00 2001 From: Simon Martens Date: Fri, 20 Feb 2026 13:39:34 +0100 Subject: [PATCH] block contexts --- uniq.txt | 49 +++++++++++++++---------------- xmlmodels/helpers.go | 33 ++++----------------- xmlmodels/letter_examples_test.go | 8 +++++ xmlmodels/textparse.go | 39 +++++++++++++++++++----- 4 files changed, 69 insertions(+), 60 deletions(-) diff --git a/uniq.txt b/uniq.txt index e368775..6872260 100644 --- a/uniq.txt +++ b/uniq.txt @@ -1,39 +1,38 @@ IGNORE: -address +address tabs BASE: -aq -b // im Druck -del -dul -tul -er -gr -hb -ink // wechsel der Tinte ref="2" -it // im Druck -pe -ru -tl // Textverlust -ul -note -fn[@index='1'] & anchor // keine ref? irgendwie nur die anchors in den footnotes? +aq --- Sans-Serif +b --- bold +del --- deletion, can be cascaded for double deletion del del +dul - double underline +tul - triple underline +er - erased text +gr -- greek -- no style changes +hb -- hebrew -- no style changes +ink // wechsel der Tinte ref="2" -- different color (blue?) +it -- italic +pe -- different color (blue-grey) +ru -- russian -- no style changes +tl -- text loss +ul -- underline +note -- smaller, bold and sans-serif in grey +fn[@index='1'] & anchor // keine ref? irgendwie nur die anchors in den footnotes? -- no style change for now INLINE-SONDERFÄLLE: -nr[@extent='1-4|8|30'] oder ohne default = 1 -subst (+kinder) = Überschreibungen -insertion -del in subst -hand +nr[@extent='1-4|8|30'] oder ohne default = 1 one middot before, one after. 
content = extent amount of spaces +subst (+kinder) = Überschreibungen -- subst no style for now +insertion -- no style for now +hand no style for now INLINE-BLOCK: -align center|right -tab 2|12|8 +align pos= center|right align context divides the line in 3: normal text right aligned center text center and left text left (can be inside the same line) +tab value= 2|12|8 opens a tab context: value="1-2" means divide this context into two and place the content of this into the first compartment, value="4-8" means divide this context into 8 and place the content of this into the fourth compartment. BLOCK: letterText (wie line type="break" falls kein line) -line (Fälle: empty, tab 1-2|4-8, break) +line (Fälle: empty, tab 1-2|4-8, break) tab is number of indents page[@index='1-14'] sidenote[@pos='left|right|(bottom|top [left|right])' and @page='1-14' and @annotation='[.*]' diff --git a/xmlmodels/helpers.go b/xmlmodels/helpers.go index 130f098..d97e746 100644 --- a/xmlmodels/helpers.go +++ b/xmlmodels/helpers.go @@ -3,6 +3,7 @@ package xmlmodels import ( "encoding/xml" "strconv" + "strings" ) func isASCIISpaceByte(b byte) bool { @@ -41,14 +42,6 @@ func isOnlyASCIISpace(s string) bool { return true } -func hasLeadingASCIISpace(s string) bool { - return len(s) > 0 && isASCIISpaceByte(s[0]) -} - -func hasTrailingASCIISpace(s string) bool { - return len(s) > 0 && isASCIISpaceByte(s[len(s)-1]) -} - func attrsToMap(attrs []xml.Attr) map[string]string { if len(attrs) == 0 { return nil @@ -60,21 +53,8 @@ func attrsToMap(attrs []xml.Attr) map[string]string { return m } -func isInline(name string) bool { - switch name { - // BASE + note + specials + inline-block things treated as inline for stack correctness - case "aq", "b", "del", "dul", "tul", "er", "gr", "hb", "ink", "it", "pe", "ru", "tl", "ul", - "note", - "fn", "nr", "subst", "insertion", "hand", - "align", "tab": - return true - default: - return false - } -} - +// INFO: list of tags ignored func isTransparentWrapper(name 
string) bool { - // IMPORTANT: address subtree is NOT skipped; wrapper tokens are ignored only. return name == "tabs" || name == "address" } @@ -90,7 +70,7 @@ func parseLineMarker(se xml.StartElement) (LineType, int, bool) { indent = n } case "type": - typ = trimASCIISpace(a.Value) + typ = strings.ToLower(trimASCIISpace(a.Value)) } } if typ == "empty" { @@ -99,8 +79,7 @@ func parseLineMarker(se xml.StartElement) (LineType, int, bool) { if indent > 0 { return Indent, indent, false } - if typ == "break" { - return Semantic, 0, false - } - return Continuation, 0, false + + // INFO: we don't check for break here, it's the default + return Semantic, 0, false } diff --git a/xmlmodels/letter_examples_test.go b/xmlmodels/letter_examples_test.go index a776ea7..3e8c21d 100644 --- a/xmlmodels/letter_examples_test.go +++ b/xmlmodels/letter_examples_test.go @@ -276,3 +276,11 @@ func linePairHasValidSyntheticCarry(prev, next Line) bool { } return true } + +func hasLeadingASCIISpace(s string) bool { + return len(s) > 0 && isASCIISpaceByte(s[0]) +} + +func hasTrailingASCIISpace(s string) bool { + return len(s) > 0 && isASCIISpaceByte(s[len(s)-1]) +} diff --git a/xmlmodels/textparse.go b/xmlmodels/textparse.go index f6dcaad..e006c10 100644 --- a/xmlmodels/textparse.go +++ b/xmlmodels/textparse.go @@ -34,10 +34,12 @@ type Token struct { } type Line struct { - Type LineType - Indent int - Text string - Tokens []Token + Type LineType + Indent int + AlignCtx bool + TabCtx bool + Text string + Tokens []Token } type Page struct { @@ -90,8 +92,8 @@ func (a *lineAccumulator) ensureLine() { return } a.startLine(a.implicitType, 0) - if a.implicitType == First { - a.implicitType = Continuation + if a.implicitType == First || a.implicitType == Continuation { + a.implicitType = Semantic } } @@ -107,6 +109,7 @@ func (a *lineAccumulator) closeLine() { Synth: true, }) } + a.applyContextFlags() a.curLine.Text = lineTextFromTokens(a.curLine.Tokens) a.appendLine(*a.curLine) a.hasAnyLine = true @@ 
-121,11 +124,11 @@ func (a *lineAccumulator) handleLineMarker(se xml.StartElement) { if emitEmpty { a.startLine(Empty, 0) a.closeLine() - a.implicitType = Continuation + a.implicitType = Semantic return } a.startLine(lt, indent) - a.implicitType = Continuation + a.implicitType = Semantic } func (a *lineAccumulator) appendStart(name string, attrs map[string]string) { @@ -228,6 +231,26 @@ func lineTextFromTokens(tokens []Token) string { return b.String() } +func (a *lineAccumulator) applyContextFlags() { + if a.curLine == nil { + return + } + for _, tok := range a.curLine.Tokens { + if tok.Type != StartElement { + continue + } + switch tok.Name { + case "align": + a.curLine.AlignCtx = true + case "tab": + a.curLine.TabCtx = true + } + if a.curLine.AlignCtx && a.curLine.TabCtx { + return + } + } +} + func parseBlockLines(dec *xml.Decoder, endLocalName string) ([]Line, error) { lines := make([]Line, 0, 8) acc := newLineAccumulator(First, func(line Line) {