Lesekabinett & Startseite

2025-12-16 22:25:30 +00:00 · 2025-03-02 00:27:16 +01:00
parent 6e286857d5
commit 0a86833a9f
56 changed files with 771 additions and 445 deletions
--- a/dbmodels/collectionhelper.go
+++ b/dbmodels/collectionhelper.go
@@ -3,16 +3,30 @@ package dbmodels
 import (
 	"encoding/json"
 	"regexp"
-	"sort"
 	"strconv"
 	"strings"
 )

-// CollectionInfo holds only the ID, a list of single references, and the Recorded flag.
+// INFO: tries to parse the Sammlungen field of contents.
+// It does not do a good job, but the task is hard: the data contains many errors.
+// The compiled regular expressions below are safe for concurrent use:
+var inrex = regexp.MustCompile(`(?is)inr[.:,;]?\s*([\d,\-(?:u.?)\v\f\t –—;\.]+)`)
+var onrex = regexp.MustCompile(`(?is)obj[.:,;]?\s*([\d,\-(?:u.?)\v\f\t –—;\.]+)`)
+var dashRegex = regexp.MustCompile(`[–—−‒]`)
+var delims = regexp.MustCompile(`[;/]+`)
+var reno = regexp.MustCompile(`\b\d+\b`)
+
 type CollectionInfo struct {
-	Collection *Content
-	Singles    []int
-	Recorded   bool
+	Annotation string
+	Collection int
+	Obj        []string
+	INr        []int
+	Obj_Unsure []string
+	INr_Unsure []int
+
+	ObjRanges []Range[string]
+	INrRanges []Range[int]
+	Recorded  bool
 }

 func (ci CollectionInfo) String() string {
@@ -20,107 +34,296 @@ func (ci CollectionInfo) String() string {
 	return string(marshalled)
 }

-// parseAnnotation detects "nicht erfasst" references (Recorded=false),
-// then finds all "INr" references (both single values and ranges).
-// Ranges like "100-105" are fully expanded to singles. Duplicates are removed.
-// Any references not in `inos` are ignored.
-func ParseAnnotation(c *Content, annotation string, inos []int) CollectionInfo {
+func (ci CollectionInfo) ShortString() string {
+	s := strings.Builder{}
+	s.WriteString(strconv.Itoa(ci.Collection))
+	s.WriteString(": ")
+	s.WriteString(ci.Annotation)
+	s.WriteString("\n")
+
+	if ci.Recorded {
+		s.WriteString("recorded")
+	} else {
+		s.WriteString("not recorded")
+	}
+
+	s.WriteString("\n")
+
+	if len(ci.INrRanges) > 0 {
+		s.WriteString("INr-Ranges: ")
+		for _, r := range ci.INrRanges {
+			s.WriteString(strconv.Itoa(r.From))
+			s.WriteString("-")
+			s.WriteString(strconv.Itoa(r.To))
+			s.WriteString("; ")
+		}
+		s.WriteString("\n")
+	}
+
+	if len(ci.INr) > 0 {
+		s.WriteString("INr-Singles: ")
+		for _, i := range ci.INr {
+			s.WriteString(strconv.Itoa(i))
+			s.WriteString("; ")
+		}
+	}
+
+	if len(ci.INr_Unsure) > 0 {
+		s.WriteString("INr-Unsure: ")
+		if len(ci.INr_Unsure) > 100 {
+			s.WriteString("many")
+		} else {
+			for _, i := range ci.INr_Unsure {
+				s.WriteString(strconv.Itoa(i))
+				s.WriteString("; ")
+			}
+			s.WriteString("\n")
+		}
+	}
+
+	if len(ci.ObjRanges) > 0 {
+		s.WriteString("Obj-Ranges: ")
+		for _, r := range ci.ObjRanges {
+			s.WriteString(r.From)
+			s.WriteString("-")
+			s.WriteString(r.To)
+			s.WriteString("; ")
+		}
+		s.WriteString("\n")
+	}
+
+	if len(ci.Obj) > 0 {
+		s.WriteString("Obj-Singles: ")
+		for _, i := range ci.Obj {
+			s.WriteString(i)
+			s.WriteString("; ")
+		}
+		s.WriteString("\n")
+	}
+
+	if len(ci.Obj_Unsure) > 0 {
+		s.WriteString("Obj-Unsure: ")
+		for _, i := range ci.Obj_Unsure {
+			s.WriteString(i)
+			s.WriteString("; ")
+		}
+		s.WriteString("\n")
+	}
+
+	return s.String()
+}
+
+type Range[T any] struct {
+	From T
+	To   T
+}
+
+func ParseAnnotation(c int, annotation string, inos []int, objnos []string) CollectionInfo {
 	ci := CollectionInfo{
+		Annotation: annotation,
 		Collection: c,
-		Singles:    []int{},
-		Recorded:   true, // Default
+		Recorded:   true,
 	}

-	// 1) Detect phrases like "nicht erfasst", "nicht aufgenommen", etc.
-	notRecordedPatterns := []string{"erfasst", "aufgenommen", "verzeichnet", "registriert"}
-	lowerAnn := strings.ToLower(annotation)
-	if strings.Contains(lowerAnn, "nicht") {
-		for _, kw := range notRecordedPatterns {
-			if strings.Contains(lowerAnn, kw) {
-				ci.Recorded = false
-				break
+	split := strings.Split(annotation, "/)")
+
+	inomap := make(map[int]bool)
+	for _, i := range inos {
+		inomap[i] = true
+	}
+
+	objnomap := make(map[string]bool)
+	for _, o := range objnos {
+		objnomap[o] = true
+	}
+
+	unsure_inr := func(in int) {
+		instr := strconv.Itoa(in)
+		if _, ok := objnomap[instr]; ok {
+			ci.Obj = append(ci.Obj, instr)
+		} else {
+			ci.INr_Unsure = append(ci.INr_Unsure, in)
+		}
+	}
+
+	unsure_inr_range := func(r Range[int]) {
+		cfrom := strconv.Itoa(r.From)
+		cto := strconv.Itoa(r.To)
+		_, ok := objnomap[cfrom]
+		_, ok2 := objnomap[cto]
+		if ok && ok2 {
+			ci.ObjRanges = append(ci.ObjRanges, Range[string]{From: cfrom, To: cto})
+		} else {
+			for i := r.From; i <= r.To; i++ {
+				unsure_inr(i)
 			}
 		}
 	}

-	// We'll keep singles in a map for deduplication
-	singlesMap := make(map[int]struct{})
+	for _, s := range split {
+		l := strings.ToLower(s)

-	// 2) Regex that matches "INr" plus the numeric portion (including dash / punctuation).
-	re := regexp.MustCompile(`(?i)\bINr[.:]?\s+([\d,\-\s–—;/.]+)`)
-	matches := re.FindAllStringSubmatch(annotation, -1)
-
-	// Regex to unify different dash characters into a simple '-'
-	dashRegex := regexp.MustCompile(`[–—−‒]`)
-
-	// Helper to expand a range, e.g. 10615–10621 => 10615..10621
-	expandRange := func(fromVal, toVal int) {
-		// If reversed, its a typo
-		if fromVal > toVal {
-			return
-		}
-		for v := fromVal; v <= toVal; v++ {
-			if inList(v, inos) {
-				singlesMap[v] = struct{}{}
+		// TODO: before this, we may cut the annotation into /) pieces
+		notRecordedPatterns := []string{"erfasst", "aufgenommen", "verzeichnet", "registriert"}
+		if strings.Contains(l, "nicht") {
+			for _, kw := range notRecordedPatterns {
+				if strings.Contains(l, kw) {
+					ci.Recorded = false
+					break
+				}
 			}
 		}
+
+		matches := inrex.FindAllStringSubmatch(s, -1)
+		inRanges, inSingles := findINrRangesSingles(matches)
+
+		// INFO: Heuristics
+		for _, in := range inSingles {
+			if _, ok := inomap[in]; ok {
+				ci.INr = append(ci.INr, in)
+			} else {
+				unsure_inr(in)
+			}
+		}
+
+		for _, r := range inRanges {
+			if r.From < r.To {
+				_, ok := inomap[r.From]
+				_, ok2 := inomap[r.To]
+				if ok && ok2 {
+					ci.INrRanges = append(ci.INrRanges, r)
+					continue
+				}
+
+				unsure_inr_range(r)
+			} else {
+				for i := r.From; i <= r.To; i++ {
+					ci.INr_Unsure = append(ci.INr_Unsure, i)
+				}
+			}
+		}
+
+		matches = onrex.FindAllStringSubmatch(s, -1)
+		objRanges, objSingles := findONrRangesSingles(matches)
+
+		for _, o := range objSingles {
+			if _, ok := objnomap[o]; ok {
+				ci.Obj = append(ci.Obj, o)
+			} else {
+				ci.Obj_Unsure = append(ci.Obj_Unsure, o)
+			}
+		}
+
+		for _, r := range objRanges {
+			if r.From < r.To {
+				_, ok := objnomap[r.From]
+				_, ok2 := objnomap[r.To]
+				if ok && ok2 {
+					ci.ObjRanges = append(ci.ObjRanges, r)
+					continue
+				}
+			}
+		}
+
 	}

-	for _, m := range matches {
-		numericChunk := m[1]
+	return ci
+}

-		// Replace typographic dashes with ASCII hyphen
-		numericChunk = dashRegex.ReplaceAllString(numericChunk, "-")
+func findINrRangesSingles(matches [][]string) ([]Range[int], []int) {
+	ranges := make([]Range[int], 0)
+	singles := make([]int, 0)

-		// Also unify semicolons or slashes to commas
-		extraDelims := regexp.MustCompile(`[;/]+`)
-		numericChunk = extraDelims.ReplaceAllString(numericChunk, ",")
-
-		// Now split on commas
-		parts := strings.Split(numericChunk, ",")
+	for _, match := range matches {
+		chunk := match[1]
+		normalized := dashRegex.ReplaceAllString(chunk, "-")
+		// WARNING: Replacing the Obj and INr delimiter ; with a comma here.
+		// It's a problem if the Obj was left out: INr 323345-323398; 23-53
+		//                      Here is an Obj often, but not always ^
+		// We do some heuristics later on to differentiate INr from Obj.
+		normalized = delims.ReplaceAllString(normalized, ",")
+		parts := strings.Split(normalized, ",")
 		for _, p := range parts {
 			p = strings.TrimSpace(p)
 			if p == "" {
 				continue
 			}
-			// If we see a hyphen, treat it as a range
-			if strings.Contains(p, "-") {
-				rangeParts := strings.SplitN(p, "-", 2)
-				if len(rangeParts) == 2 {
-					fromStr := strings.TrimSpace(rangeParts[0])
-					toStr := strings.TrimSpace(rangeParts[1])
-					if fromVal, errFrom := strconv.Atoi(fromStr); errFrom == nil {
-						if toVal, errTo := strconv.Atoi(toStr); errTo == nil {
-							expandRange(fromVal, toVal)
+
+			rangeParts := strings.Split(p, "-")
+			if len(rangeParts) == 2 {
+				// INFO: we most probably have a range
+				fromStr := strings.TrimSpace(rangeParts[0])
+				toStr := strings.TrimSpace(rangeParts[1])
+				if fromVal, errFrom := strconv.Atoi(fromStr); errFrom == nil {
+					if toVal, errTo := strconv.Atoi(toStr); errTo == nil {
+						ranges = append(ranges, Range[int]{From: fromVal, To: toVal})
+						continue
+					}
+					to := reno.FindAllString(toStr, -1)
+					if len(to) >= 1 {
+						if val, err := strconv.Atoi(to[0]); err == nil {
+							ranges = append(ranges, Range[int]{From: fromVal, To: val})
+						}
+						if len(to) > 1 {
+							if val, err := strconv.Atoi(to[1]); err == nil {
+								singles = append(singles, val)
+							}
 						}
 					}
 				}
-			} else {
-				// Single integer reference
-				if val, err := strconv.Atoi(p); err == nil {
-					if inList(val, inos) {
-						singlesMap[val] = struct{}{}
+				continue
+			}
+
+			rangeParts = strings.Split(p, " u")
+			for _, r := range rangeParts {
+				trimmed := strings.TrimSpace(r)
+				matches := reno.FindAllString(trimmed, -1)
+				for _, m := range matches {
+					if val, err := strconv.Atoi(m); err == nil {
+						singles = append(singles, val)
 					}
 				}
 			}
 		}
 	}

-	// Flatten the map into a sorted slice
-	for s := range singlesMap {
-		ci.Singles = append(ci.Singles, s)
-	}
-	sort.Ints(ci.Singles)
-
-	return ci
+	return ranges, singles
 }

-// inList checks membership in `inos`
-func inList(x int, list []int) bool {
-	for _, item := range list {
-		if item == x {
-			return true
+func findONrRangesSingles(matches [][]string) ([]Range[string], []string) {
+	ranges := make([]Range[string], 0)
+	singles := make([]string, 0)
+
+	for _, match := range matches {
+		chunk := match[1]
+		normalized := dashRegex.ReplaceAllString(chunk, "-")
+		normalized = delims.ReplaceAllString(normalized, ",")
+		parts := strings.Split(normalized, ",")
+		for _, p := range parts {
+			p = strings.TrimSpace(p)
+			if p == "" {
+				continue
+			}
+
+			rangeParts := strings.Split(p, "-")
+			if len(rangeParts) == 2 {
+				// INFO: we most probably have a range
+				fromStr := strings.TrimSpace(rangeParts[0])
+				toStr := strings.TrimSpace(rangeParts[1])
+				ranges = append(ranges, Range[string]{From: fromStr, To: toStr})
+				continue
+			}
+
+			rangeParts = strings.Split(p, " u")
+			for _, r := range rangeParts {
+				trimmed := strings.TrimSpace(r)
+				matches := reno.FindAllString(trimmed, -1)
+				for _, m := range matches {
+					singles = append(singles, m)
+				}
+			}
 		}
 	}
-	return false
+
+	return ranges, singles
 }