mirror of
				https://github.com/Theodor-Springmann-Stiftung/musenalm.git
				synced 2025-10-31 10:15:32 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			330 lines
		
	
	
		
			7.4 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			330 lines
		
	
	
		
			7.4 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package dbmodels
 | |
| 
 | |
| import (
 | |
| 	"encoding/json"
 | |
| 	"regexp"
 | |
| 	"strconv"
 | |
| 	"strings"
 | |
| )
 | |
| 
 | |
// INFO: Patterns used to pull numbers out of the free-text Sammlungen field of
// contents. The parsing is heuristic and far from perfect: the field contains
// many inconsistencies and plain errors.
//
// Every pattern is compiled once at package initialisation. Compiled regexps
// are safe for concurrent use.
var (
	// inrex captures, case-insensitively, the run of digits and list
	// punctuation that follows an "inr" marker (optionally followed by one of
	// . : , ;). Group 1 is the raw number list.
	// NOTE: inside the brackets "(?:u.?)" is not a group; it only adds the
	// literal characters ( ? : u . ) to the accepted set.
	inrex = regexp.MustCompile(`(?is)inr[.:,;]?\s*([\d,\-(?:u.?)\v\f\t –—;\.]+)`)

	// onrex is the same pattern for the "obj" marker.
	onrex = regexp.MustCompile(`(?is)obj[.:,;]?\s*([\d,\-(?:u.?)\v\f\t –—;\.]+)`)

	// dashRegex matches the typographic dashes that are normalised to "-".
	dashRegex = regexp.MustCompile(`[–—−‒]`)

	// delims matches runs of ";" and "/" that separate list entries.
	delims = regexp.MustCompile(`[;/]+`)

	// reno matches a standalone run of digits.
	reno = regexp.MustCompile(`\b\d+\b`)
)
 | |
| 
 | |
// CollectionInfo holds what ParseAnnotation could extract from one annotation.
// Field order is significant: String marshals the struct to JSON in
// declaration order.
type CollectionInfo struct {
	// Annotation is the raw annotation text that was parsed.
	Annotation string
	// Collection is the collection identifier passed to ParseAnnotation.
	Collection int
	// Obj lists single Obj numbers that matched a known Obj number.
	Obj []string
	// INr lists single INr numbers that matched a known INr number.
	INr []int
	// Obj_Unsure lists single Obj numbers that matched no known Obj number.
	Obj_Unsure []string
	// INr_Unsure lists INr candidates that matched neither a known INr nor a
	// known Obj number.
	INr_Unsure []int

	// ObjRanges lists Obj ranges whose two endpoints are both known.
	ObjRanges []Range[string]
	// INrRanges lists INr ranges whose two endpoints are both known.
	INrRanges []Range[int]
	// Recorded is false when the annotation states that the item was not
	// recorded.
	Recorded bool
}

// String returns the JSON encoding of ci, or "" if encoding fails.
func (ci CollectionInfo) String() string {
	// Marshal is not expected to fail here: the struct only holds strings,
	// ints, bools and slices of those.
	marshalled, err := json.Marshal(ci)
	if err != nil {
		return ""
	}
	return string(marshalled)
}

// ShortString returns a compact, human-readable, multi-line summary of ci.
//
// The first line is "<Collection>: <Annotation>", the second "recorded" or
// "not recorded". Each non-empty list then gets its own line, terminated by a
// newline. When INr_Unsure holds more than 100 numbers, the word "many" is
// printed instead of the numbers.
func (ci CollectionInfo) ShortString() string {
	// Above this many unsure INr numbers the list is summarised as "many".
	const maxListedUnsure = 100

	var s strings.Builder

	s.WriteString(strconv.Itoa(ci.Collection))
	s.WriteString(": ")
	s.WriteString(ci.Annotation)
	s.WriteString("\n")

	if ci.Recorded {
		s.WriteString("recorded\n")
	} else {
		s.WriteString("not recorded\n")
	}

	if len(ci.INrRanges) > 0 {
		s.WriteString("INr-Ranges: ")
		for _, r := range ci.INrRanges {
			s.WriteString(strconv.Itoa(r.From))
			s.WriteString("-")
			s.WriteString(strconv.Itoa(r.To))
			s.WriteString("; ")
		}
		s.WriteString("\n")
	}

	if len(ci.INr) > 0 {
		s.WriteString("INr-Singles: ")
		for _, i := range ci.INr {
			s.WriteString(strconv.Itoa(i))
			s.WriteString("; ")
		}
		// The newline was missing here, which glued the next section onto
		// this line.
		s.WriteString("\n")
	}

	if len(ci.INr_Unsure) > 0 {
		s.WriteString("INr-Unsure: ")
		if len(ci.INr_Unsure) > maxListedUnsure {
			s.WriteString("many")
		} else {
			for _, i := range ci.INr_Unsure {
				s.WriteString(strconv.Itoa(i))
				s.WriteString("; ")
			}
		}
		// Terminate the line in both cases; "many" used to lack the newline.
		s.WriteString("\n")
	}

	if len(ci.ObjRanges) > 0 {
		s.WriteString("Obj-Ranges: ")
		for _, r := range ci.ObjRanges {
			s.WriteString(r.From)
			s.WriteString("-")
			s.WriteString(r.To)
			s.WriteString("; ")
		}
		s.WriteString("\n")
	}

	if len(ci.Obj) > 0 {
		s.WriteString("Obj-Singles: ")
		for _, o := range ci.Obj {
			s.WriteString(o)
			s.WriteString("; ")
		}
		s.WriteString("\n")
	}

	if len(ci.Obj_Unsure) > 0 {
		s.WriteString("Obj-Unsure: ")
		for _, o := range ci.Obj_Unsure {
			s.WriteString(o)
			s.WriteString("; ")
		}
		s.WriteString("\n")
	}

	return s.String()
}

// Range is a pair of endpoints as written in an annotation ("From-To").
type Range[T any] struct {
	From T
	To   T
}
 | |
| 
 | |
| func ParseAnnotation(c int, annotation string, inos []int, objnos []string) CollectionInfo {
 | |
| 	ci := CollectionInfo{
 | |
| 		Annotation: annotation,
 | |
| 		Collection: c,
 | |
| 		Recorded:   true,
 | |
| 	}
 | |
| 
 | |
| 	split := strings.Split(annotation, "/)")
 | |
| 
 | |
| 	inomap := make(map[int]bool)
 | |
| 	for _, i := range inos {
 | |
| 		inomap[i] = true
 | |
| 	}
 | |
| 
 | |
| 	objnomap := make(map[string]bool)
 | |
| 	for _, o := range objnos {
 | |
| 		objnomap[o] = true
 | |
| 	}
 | |
| 
 | |
| 	unsure_inr := func(in int) {
 | |
| 		instr := strconv.Itoa(in)
 | |
| 		if _, ok := objnomap[instr]; ok {
 | |
| 			ci.Obj = append(ci.Obj, instr)
 | |
| 		} else {
 | |
| 			ci.INr_Unsure = append(ci.INr_Unsure, in)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	unsure_inr_range := func(r Range[int]) {
 | |
| 		cfrom := strconv.Itoa(r.From)
 | |
| 		cto := strconv.Itoa(r.To)
 | |
| 		_, ok := objnomap[cfrom]
 | |
| 		_, ok2 := objnomap[cto]
 | |
| 		if ok && ok2 {
 | |
| 			ci.ObjRanges = append(ci.ObjRanges, Range[string]{From: cfrom, To: cto})
 | |
| 		} else {
 | |
| 			for i := r.From; i <= r.To; i++ {
 | |
| 				unsure_inr(i)
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	for _, s := range split {
 | |
| 		l := strings.ToLower(s)
 | |
| 
 | |
| 		// TODO: before this, we may cut the annotation into /) pieces
 | |
| 		notRecordedPatterns := []string{"erfasst", "aufgenommen", "verzeichnet", "registriert"}
 | |
| 		if strings.Contains(l, "nicht") {
 | |
| 			for _, kw := range notRecordedPatterns {
 | |
| 				if strings.Contains(l, kw) {
 | |
| 					ci.Recorded = false
 | |
| 					break
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		matches := inrex.FindAllStringSubmatch(s, -1)
 | |
| 		inRanges, inSingles := findINrRangesSingles(matches)
 | |
| 
 | |
| 		// INFO: Heuristics
 | |
| 		for _, in := range inSingles {
 | |
| 			if _, ok := inomap[in]; ok {
 | |
| 				ci.INr = append(ci.INr, in)
 | |
| 			} else {
 | |
| 				unsure_inr(in)
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		for _, r := range inRanges {
 | |
| 			if r.From < r.To {
 | |
| 				_, ok := inomap[r.From]
 | |
| 				_, ok2 := inomap[r.To]
 | |
| 				if ok && ok2 {
 | |
| 					ci.INrRanges = append(ci.INrRanges, r)
 | |
| 					continue
 | |
| 				}
 | |
| 
 | |
| 				unsure_inr_range(r)
 | |
| 			} else {
 | |
| 				for i := r.From; i <= r.To; i++ {
 | |
| 					ci.INr_Unsure = append(ci.INr_Unsure, i)
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		matches = onrex.FindAllStringSubmatch(s, -1)
 | |
| 		objRanges, objSingles := findONrRangesSingles(matches)
 | |
| 
 | |
| 		for _, o := range objSingles {
 | |
| 			if _, ok := objnomap[o]; ok {
 | |
| 				ci.Obj = append(ci.Obj, o)
 | |
| 			} else {
 | |
| 				ci.Obj_Unsure = append(ci.Obj_Unsure, o)
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		for _, r := range objRanges {
 | |
| 			if r.From < r.To {
 | |
| 				_, ok := objnomap[r.From]
 | |
| 				_, ok2 := objnomap[r.To]
 | |
| 				if ok && ok2 {
 | |
| 					ci.ObjRanges = append(ci.ObjRanges, r)
 | |
| 					continue
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 	}
 | |
| 
 | |
| 	return ci
 | |
| }
 | |
| 
 | |
| func findINrRangesSingles(matches [][]string) ([]Range[int], []int) {
 | |
| 	ranges := make([]Range[int], 0)
 | |
| 	singles := make([]int, 0)
 | |
| 
 | |
| 	for _, match := range matches {
 | |
| 		chunk := match[1]
 | |
| 		normalized := dashRegex.ReplaceAllString(chunk, "-")
 | |
| 		// WARNING: Replacing the OBj and INr delimiter ; with a comma here.
 | |
| 		// It's a problem if the Obj was left out: INr 323345-323398; 23-53
 | |
| 		//                      Here is an Obj often, but not always ^
 | |
| 		// We do some heuristics later on to differentiate INr from Obj.
 | |
| 		normalized = delims.ReplaceAllString(normalized, ",")
 | |
| 		parts := strings.Split(normalized, ",")
 | |
| 		for _, p := range parts {
 | |
| 			p = strings.TrimSpace(p)
 | |
| 			if p == "" {
 | |
| 				continue
 | |
| 			}
 | |
| 
 | |
| 			rangeParts := strings.Split(p, "-")
 | |
| 			if len(rangeParts) == 2 {
 | |
| 				// INFO: we have a range, most prob
 | |
| 				fromStr := strings.TrimSpace(rangeParts[0])
 | |
| 				toStr := strings.TrimSpace(rangeParts[1])
 | |
| 				if fromVal, errFrom := strconv.Atoi(fromStr); errFrom == nil {
 | |
| 					if toVal, errTo := strconv.Atoi(toStr); errTo == nil {
 | |
| 						ranges = append(ranges, Range[int]{From: fromVal, To: toVal})
 | |
| 						continue
 | |
| 					}
 | |
| 					to := reno.FindAllString(toStr, -1)
 | |
| 					if len(to) >= 1 {
 | |
| 						if val, err := strconv.Atoi(to[0]); err == nil {
 | |
| 							ranges = append(ranges, Range[int]{From: fromVal, To: val})
 | |
| 						}
 | |
| 						if len(to) > 1 {
 | |
| 							if val, err := strconv.Atoi(to[1]); err == nil {
 | |
| 								singles = append(singles, val)
 | |
| 							}
 | |
| 						}
 | |
| 					}
 | |
| 				}
 | |
| 				continue
 | |
| 			}
 | |
| 
 | |
| 			rangeParts = strings.Split(p, " u")
 | |
| 			for _, r := range rangeParts {
 | |
| 				trimmed := strings.TrimSpace(r)
 | |
| 				matches := reno.FindAllString(trimmed, -1)
 | |
| 				for _, m := range matches {
 | |
| 					if val, err := strconv.Atoi(m); err == nil {
 | |
| 						singles = append(singles, val)
 | |
| 					}
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return ranges, singles
 | |
| }
 | |
| 
 | |
| func findONrRangesSingles(matches [][]string) ([]Range[string], []string) {
 | |
| 	ranges := make([]Range[string], 0)
 | |
| 	singles := make([]string, 0)
 | |
| 
 | |
| 	for _, match := range matches {
 | |
| 		chunk := match[1]
 | |
| 		normalized := dashRegex.ReplaceAllString(chunk, "-")
 | |
| 		normalized = delims.ReplaceAllString(normalized, ",")
 | |
| 		parts := strings.Split(normalized, ",")
 | |
| 		for _, p := range parts {
 | |
| 			p = strings.TrimSpace(p)
 | |
| 			if p == "" {
 | |
| 				continue
 | |
| 			}
 | |
| 
 | |
| 			rangeParts := strings.Split(p, "-")
 | |
| 			if len(rangeParts) == 2 {
 | |
| 				// INFO: we have a range, most prob
 | |
| 				fromStr := strings.TrimSpace(rangeParts[0])
 | |
| 				toStr := strings.TrimSpace(rangeParts[1])
 | |
| 				ranges = append(ranges, Range[string]{From: fromStr, To: toStr})
 | |
| 				continue
 | |
| 			}
 | |
| 
 | |
| 			rangeParts = strings.Split(p, " u")
 | |
| 			for _, r := range rangeParts {
 | |
| 				trimmed := strings.TrimSpace(r)
 | |
| 				matches := reno.FindAllString(trimmed, -1)
 | |
| 				for _, m := range matches {
 | |
| 					singles = append(singles, m)
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return ranges, singles
 | |
| }
 | 
