// musenalm/dbmodels/collectionhelper.go

package dbmodels

import (
	"encoding/json"
	"regexp"
	"sort"
	"strconv"
	"strings"
)

// CollectionInfo is the result of parsing one annotation with ParseAnnotation.
// Its String method renders it as JSON using the field names below.
type CollectionInfo struct {
	// Collection is the *Content pointer handed to ParseAnnotation, stored as is.
	Collection *Content
	// Singles lists the referenced "INr" numbers that were accepted by
	// ParseAnnotation: ranges expanded, duplicates removed, sorted ascending.
	Singles []int
	// Recorded is true by default; ParseAnnotation clears it when the
	// annotation contains a "nicht erfasst"-style phrase.
	Recorded bool
}

// String returns the JSON encoding of ci. If encoding fails the result is
// the empty string, because String has no way to report an error.
func (ci CollectionInfo) String() string {
	encoded, err := json.Marshal(ci)
	if err != nil {
		return ""
	}
	return string(encoded)
}

// annotationRefRegex matches an "INr" marker (case-insensitive, optionally
// followed by '.' or ':'), mandatory whitespace, and the run of digits,
// separators and dashes after it. Every dash variant rewritten by
// annotationNormalizer is part of the character class, so a range written
// with any of them is captured in full.
var annotationRefRegex = regexp.MustCompile(`(?i)\bINr[.:]?\s+([\d,\-\s–—−‒;/.]+)`)

// annotationNormalizer rewrites typographic dashes to ASCII '-' and the
// alternative list separators ';' and '/' to ','.
var annotationNormalizer = strings.NewReplacer(
	"–", "-", "—", "-", "−", "-", "‒", "-",
	";", ",", "/", ",",
)

// annotationTrimCutset holds the characters stripped from both ends of a
// reference: the whitespace the capture class can contain, plus '.', which
// is usually sentence punctuation right after the last number.
const annotationTrimCutset = " \t\n\f\r."

// ParseAnnotation extracts collection information from a free-text annotation.
//
// Recorded starts out true and is cleared when the annotation contains
// "nicht" together with one of "erfasst", "aufgenommen", "verzeichnet" or
// "registriert" (case-insensitive, anywhere in the text).
//
// Every "INr" reference list is then parsed. Entries may be separated by
// ',', ';' or '/', and may be single numbers or ranges such as "100-105"
// (any of the dashes - – — − ‒). A range contributes each member of inos
// that lies within its bounds; a reversed range is treated as a typo and
// ignored. Numbers that are not in inos are dropped, as are entries that do
// not parse as integers.
//
// The returned Singles slice is never nil, contains no duplicates and is
// sorted ascending. c is stored in the result unchanged.
func ParseAnnotation(c *Content, annotation string, inos []int) CollectionInfo {
	ci := CollectionInfo{
		Collection: c,
		Singles:    []int{}, // non-nil so the JSON form is [] rather than null
		Recorded:   true,
	}

	// 1) Detect phrases like "nicht erfasst", "nicht aufgenommen", etc.
	lowerAnn := strings.ToLower(annotation)
	if strings.Contains(lowerAnn, "nicht") {
		for _, kw := range []string{"erfasst", "aufgenommen", "verzeichnet", "registriert"} {
			if strings.Contains(lowerAnn, kw) {
				ci.Recorded = false
				break
			}
		}
	}

	// 2) Collect the "INr" reference lists. Without references, or without
	// any known numbers to accept, nothing can end up in Singles.
	matches := annotationRefRegex.FindAllStringSubmatch(annotation, -1)
	if len(matches) == 0 || len(inos) == 0 {
		return ci
	}

	// Set of accepted numbers for constant-time lookups of single references.
	known := make(map[int]struct{}, len(inos))
	for _, ino := range inos {
		known[ino] = struct{}{}
	}

	// Matched numbers are gathered in a set for deduplication.
	found := make(map[int]struct{})

	for _, m := range matches {
		chunk := annotationNormalizer.Replace(m[1])
		for _, part := range strings.Split(chunk, ",") {
			// Strip surrounding whitespace and sentence punctuation so that
			// "200." at the end of a sentence still parses as 200.
			part = strings.Trim(part, annotationTrimCutset)
			if part == "" {
				continue
			}

			dash := strings.Index(part, "-")
			if dash < 0 {
				// Single integer reference.
				if val, ok := parseAnnotationNumber(part); ok {
					if _, isKnown := known[val]; isKnown {
						found[val] = struct{}{}
					}
				}
				continue
			}

			// Range: split at the first hyphen only, so input such as
			// "1-2-3" fails to parse and is skipped.
			from, okFrom := parseAnnotationNumber(part[:dash])
			to, okTo := parseAnnotationNumber(part[dash+1:])
			if !okFrom || !okTo {
				continue
			}
			// Walk the known numbers instead of the range itself: the cost
			// is bounded by len(inos) however wide the range is, and a
			// reversed range matches nothing.
			for _, ino := range inos {
				if from <= ino && ino <= to {
					found[ino] = struct{}{}
				}
			}
		}
	}

	// Flatten the set into a sorted slice.
	for v := range found {
		ci.Singles = append(ci.Singles, v)
	}
	sort.Ints(ci.Singles)

	return ci
}

// parseAnnotationNumber trims whitespace and '.' from both ends of s and
// parses the rest as a base-10 integer; ok is false if that fails.
func parseAnnotationNumber(s string) (n int, ok bool) {
	n, err := strconv.Atoi(strings.Trim(s, annotationTrimCutset))
	return n, err == nil
}

// inList reports whether x occurs anywhere in list. A nil or empty list
// never contains x.
func inList(x int, list []int) bool {
	found := false
	for i := 0; i < len(list) && !found; i++ {
		found = list[i] == x
	}
	return found
}