Files
musenalm/dbmodels/collectionhelper.go
2025-03-02 00:27:16 +01:00

330 lines
7.4 KiB
Go

package dbmodels
import (
"encoding/json"
"regexp"
"strconv"
"strings"
)
// Patterns used for the best-effort parsing of the free-text "Sammlungen"
// field of contents. The source data contains many irregularities, so the
// results are heuristic and far from complete.
//
// Compiled regular expressions are safe for concurrent use.
var (
	// inrex matches "inr" (any case) with optional trailing punctuation and
	// captures the following run of digits, separators and dashes. Note that
	// inside the bracket expression the characters "(?:u.?)" are literals.
	inrex = regexp.MustCompile(`(?is)inr[.:,;]?\s*([\d,\-(?:u.?)\v\f\t –—;\.]+)`)

	// onrex is the counterpart of inrex for the "obj" marker.
	onrex = regexp.MustCompile(`(?is)obj[.:,;]?\s*([\d,\-(?:u.?)\v\f\t –—;\.]+)`)

	// dashRegex matches the typographic dash variants that get normalized to "-".
	dashRegex = regexp.MustCompile(`[–—−‒]`)

	// delims matches runs of ";" and "/" that separate list entries.
	delims = regexp.MustCompile(`[;/]+`)

	// reno matches a standalone run of digits.
	reno = regexp.MustCompile(`\b\d+\b`)
)
// CollectionInfo holds what ParseAnnotation extracted from the annotation of
// one collection. All number lists are the result of heuristics and may be
// incomplete.
type CollectionInfo struct {
// Annotation is the annotation text exactly as it was passed to ParseAnnotation.
Annotation string
// Collection is the collection number passed to ParseAnnotation.
Collection int
// Obj lists single object numbers that are present in the known object
// numbers. This includes numbers found after an "INr" marker that are not
// known inventory numbers but are known object numbers.
Obj []string
// INr lists single inventory numbers that are present in the known
// inventory numbers.
INr []int
// Obj_Unsure lists single numbers found after an "Obj" marker that are not
// among the known object numbers.
Obj_Unsure []string
// INr_Unsure lists numbers found after an "INr" marker that match neither
// the known inventory numbers nor the known object numbers.
INr_Unsure []int
// ObjRanges lists ranges whose two endpoints are both known object numbers.
ObjRanges []Range[string]
// INrRanges lists ranges whose two endpoints are both known inventory numbers.
INrRanges []Range[int]
// Recorded is true unless a segment of the annotation contains "nicht"
// together with one of the keywords checked in ParseAnnotation.
Recorded bool
}
// String returns ci encoded as JSON. Should encoding ever fail, the error is
// deliberately discarded and the empty string is returned.
func (ci CollectionInfo) String() string {
	if encoded, err := json.Marshal(ci); err == nil {
		return string(encoded)
	}
	return ""
}
// ShortString renders ci as a compact, line-oriented summary for humans.
//
// The first line is "<Collection>: <Annotation>", the second is either
// "recorded" or "not recorded". After that every non-empty list gets exactly
// one line of the form "<label>: item; item; ", in the order INr-Ranges,
// INr-Singles, INr-Unsure, Obj-Ranges, Obj-Singles, Obj-Unsure. When there are
// more than 100 unsure inventory numbers, the list is replaced by "many".
//
// Every section is terminated by a newline, including INr-Singles and the
// "many" form of INr-Unsure, so consecutive sections never share a line.
func (ci CollectionInfo) ShortString() string {
	// Above this many entries the unsure inventory numbers are not listed.
	const maxUnsureListed = 100

	var s strings.Builder

	// writeList emits "label" followed by n items, each terminated by "; ",
	// and a final newline. It writes nothing at all when n is zero.
	writeList := func(label string, n int, item func(i int) string) {
		if n == 0 {
			return
		}
		s.WriteString(label)
		for i := 0; i < n; i++ {
			s.WriteString(item(i))
			s.WriteString("; ")
		}
		s.WriteString("\n")
	}

	s.WriteString(strconv.Itoa(ci.Collection))
	s.WriteString(": ")
	s.WriteString(ci.Annotation)
	s.WriteString("\n")
	if ci.Recorded {
		s.WriteString("recorded")
	} else {
		s.WriteString("not recorded")
	}
	s.WriteString("\n")

	writeList("INr-Ranges: ", len(ci.INrRanges), func(i int) string {
		r := ci.INrRanges[i]
		return strconv.Itoa(r.From) + "-" + strconv.Itoa(r.To)
	})
	writeList("INr-Singles: ", len(ci.INr), func(i int) string {
		return strconv.Itoa(ci.INr[i])
	})
	if len(ci.INr_Unsure) > maxUnsureListed {
		s.WriteString("INr-Unsure: many\n")
	} else {
		writeList("INr-Unsure: ", len(ci.INr_Unsure), func(i int) string {
			return strconv.Itoa(ci.INr_Unsure[i])
		})
	}
	writeList("Obj-Ranges: ", len(ci.ObjRanges), func(i int) string {
		r := ci.ObjRanges[i]
		return r.From + "-" + r.To
	})
	writeList("Obj-Singles: ", len(ci.Obj), func(i int) string {
		return ci.Obj[i]
	})
	writeList("Obj-Unsure: ", len(ci.Obj_Unsure), func(i int) string {
		return ci.Obj_Unsure[i]
	})

	return s.String()
}
// Range is a pair of endpoints, From and To, of the same type T. The type
// itself does not enforce any ordering between the two.
type Range[T any] struct {
	From, To T
}
// ParseAnnotation extracts inventory numbers (INr) and object numbers (Obj)
// from the free-text annotation of collection c and sorts them into the
// fields of the returned CollectionInfo.
//
// inos and objnos are the known inventory and object numbers; they decide
// whether a parsed number counts as confirmed or as unsure. The annotation is
// split at every "/)" and each segment is processed on its own:
//
//   - Recorded starts as true and becomes false when a segment contains
//     "nicht" together with one of the "recorded" keywords.
//   - A single number after an INr marker goes to INr if it is a known
//     inventory number, to Obj if it is a known object number, and to
//     INr_Unsure otherwise.
//   - An ascending INr range goes to INrRanges if both endpoints are known
//     inventory numbers, to ObjRanges if both endpoints are known object
//     numbers, and is otherwise expanded number by number like singles.
//   - A single number after an Obj marker goes to Obj or Obj_Unsure.
//   - An ascending Obj range goes to ObjRanges if both endpoints are known
//     object numbers; all other Obj ranges are dropped.
//
// Obj range endpoints are compared numerically when both parse as integers,
// so that "98-102" counts as ascending; otherwise they are compared as strings.
func ParseAnnotation(c int, annotation string, inos []int, objnos []string) CollectionInfo {
	ci := CollectionInfo{
		Annotation: annotation,
		Collection: c,
		Recorded:   true,
	}

	knownINr := make(map[int]struct{}, len(inos))
	for _, i := range inos {
		knownINr[i] = struct{}{}
	}
	knownObj := make(map[string]struct{}, len(objnos))
	for _, o := range objnos {
		knownObj[o] = struct{}{}
	}
	hasINr := func(n int) bool {
		_, ok := knownINr[n]
		return ok
	}
	hasObj := func(o string) bool {
		_, ok := knownObj[o]
		return ok
	}

	// objAscending reports whether from sorts before to. Plain string
	// comparison would call "98" greater than "102", so numbers are compared
	// by value whenever both endpoints are integers.
	objAscending := func(from, to string) bool {
		lo, errFrom := strconv.Atoi(from)
		hi, errTo := strconv.Atoi(to)
		if errFrom == nil && errTo == nil {
			return lo < hi
		}
		return from < to
	}

	// unsureINr files a number that is not a known inventory number: it is
	// taken as an object number if it is a known one, else it stays unsure.
	unsureINr := func(n int) {
		if s := strconv.Itoa(n); hasObj(s) {
			ci.Obj = append(ci.Obj, s)
			return
		}
		ci.INr_Unsure = append(ci.INr_Unsure, n)
	}

	// unsureINrRange files a range whose endpoints are not both known
	// inventory numbers: it becomes an object range if both endpoints are known
	// object numbers, else every number in it is filed individually.
	unsureINrRange := func(r Range[int]) {
		from, to := strconv.Itoa(r.From), strconv.Itoa(r.To)
		if hasObj(from) && hasObj(to) {
			ci.ObjRanges = append(ci.ObjRanges, Range[string]{From: from, To: to})
			return
		}
		for i := r.From; i <= r.To; i++ {
			unsureINr(i)
		}
	}

	notRecordedKeywords := []string{"erfasst", "aufgenommen", "verzeichnet", "registriert"}

	for _, segment := range strings.Split(annotation, "/)") {
		lower := strings.ToLower(segment)
		if strings.Contains(lower, "nicht") {
			for _, kw := range notRecordedKeywords {
				if strings.Contains(lower, kw) {
					ci.Recorded = false
					break
				}
			}
		}

		// INFO: Heuristics to tell inventory numbers from object numbers.
		inRanges, inSingles := findINrRangesSingles(inrex.FindAllStringSubmatch(segment, -1))
		for _, in := range inSingles {
			if hasINr(in) {
				ci.INr = append(ci.INr, in)
			} else {
				unsureINr(in)
			}
		}
		for _, r := range inRanges {
			switch {
			case r.From < r.To && hasINr(r.From) && hasINr(r.To):
				ci.INrRanges = append(ci.INrRanges, r)
			case r.From < r.To:
				unsureINrRange(r)
			case r.From == r.To:
				// A degenerate range is kept as one unsure number.
				ci.INr_Unsure = append(ci.INr_Unsure, r.From)
			}
			// Descending ranges (From > To) are dropped.
		}

		objRanges, objSingles := findONrRangesSingles(onrex.FindAllStringSubmatch(segment, -1))
		for _, o := range objSingles {
			if hasObj(o) {
				ci.Obj = append(ci.Obj, o)
			} else {
				ci.Obj_Unsure = append(ci.Obj_Unsure, o)
			}
		}
		for _, r := range objRanges {
			if objAscending(r.From, r.To) && hasObj(r.From) && hasObj(r.To) {
				ci.ObjRanges = append(ci.ObjRanges, r)
			}
		}
	}

	return ci
}
// findINrRangesSingles turns the submatches produced by inrex into integer
// ranges and single integers. Only element 1 of every match (the captured
// number list) is looked at.
//
// Each list has its dashes normalized to "-" and is cut into entries at ",",
// ";" and "/". An entry with exactly one "-" is read as a range; every other
// entry contributes all standalone numbers it contains as singles. Both
// returned slices are non-nil even when empty.
func findINrRangesSingles(matches [][]string) ([]Range[int], []int) {
	ranges := []Range[int]{}
	singles := []int{}

	// collectSingles appends every standalone number found in entry, looking
	// at the pieces around " u" separately.
	collectSingles := func(entry string) {
		for _, piece := range strings.Split(entry, " u") {
			for _, digits := range reno.FindAllString(strings.TrimSpace(piece), -1) {
				if v, err := strconv.Atoi(digits); err == nil {
					singles = append(singles, v)
				}
			}
		}
	}

	for _, match := range matches {
		// WARNING: ";" usually separates the INr list from the Obj list, yet it
		// is folded into "," here. When the Obj marker was left out, as in
		// "INr 323345-323398; 23-53", the part after ";" is often (but not
		// always) an Obj. The caller applies heuristics to tell them apart.
		list := dashRegex.ReplaceAllString(match[1], "-")
		list = delims.ReplaceAllString(list, ",")

		for _, entry := range strings.Split(list, ",") {
			entry = strings.TrimSpace(entry)
			if entry == "" {
				continue
			}
			if strings.Count(entry, "-") != 1 {
				collectSingles(entry)
				continue
			}

			// INFO: exactly one dash, so this is most probably a range.
			left, right, _ := strings.Cut(entry, "-")
			right = strings.TrimSpace(right)
			from, err := strconv.Atoi(strings.TrimSpace(left))
			if err != nil {
				// Without a usable start the whole entry is discarded.
				continue
			}
			if to, err := strconv.Atoi(right); err == nil {
				ranges = append(ranges, Range[int]{From: from, To: to})
				continue
			}

			// The end is not a plain number: the first number in it closes the
			// range, a second one is kept as a single, anything further is ignored.
			found := reno.FindAllString(right, -1)
			if len(found) == 0 {
				continue
			}
			if to, err := strconv.Atoi(found[0]); err == nil {
				ranges = append(ranges, Range[int]{From: from, To: to})
			}
			if len(found) > 1 {
				if extra, err := strconv.Atoi(found[1]); err == nil {
					singles = append(singles, extra)
				}
			}
		}
	}

	return ranges, singles
}
// findONrRangesSingles turns the submatches produced by onrex into string
// ranges and single number strings. Only element 1 of every match (the
// captured number list) is looked at.
//
// Each list has its dashes normalized to "-" and is cut into entries at ",",
// ";" and "/". An entry with exactly one "-" becomes a range of its two
// trimmed halves, without further validation; every other entry contributes
// all standalone numbers it contains as singles. Both returned slices are
// non-nil even when empty.
func findONrRangesSingles(matches [][]string) ([]Range[string], []string) {
	ranges := []Range[string]{}
	singles := []string{}

	for _, match := range matches {
		list := dashRegex.ReplaceAllString(match[1], "-")
		list = delims.ReplaceAllString(list, ",")

		for _, entry := range strings.Split(list, ",") {
			entry = strings.TrimSpace(entry)
			switch {
			case entry == "":
				// Nothing between two separators.
			case strings.Count(entry, "-") == 1:
				// INFO: exactly one dash, so this is most probably a range.
				left, right, _ := strings.Cut(entry, "-")
				ranges = append(ranges, Range[string]{
					From: strings.TrimSpace(left),
					To:   strings.TrimSpace(right),
				})
			default:
				for _, piece := range strings.Split(entry, " u") {
					singles = append(singles, reno.FindAllString(strings.TrimSpace(piece), -1)...)
				}
			}
		}
	}

	return ranges, singles
}