Search Index Build

This commit is contained in:
Simon Martens
2025-02-17 21:42:20 +01:00
parent d109f7a040
commit fd2fa157b2
18 changed files with 611 additions and 16 deletions

View File

@@ -41,6 +41,7 @@ type Config struct {
GITPath string `json:"git_path" envconfig:"GIT_PATH"`
GNDPath string `json:"gnd_path" envconfig:"GND_PATH"`
GeoPath string `json:"geo_path" envconfig:"GEO_PATH"`
SearchPath string `json:"search_path" envconfig:"SEARCH_PATH"`
ImgPath string `json:"img_path" envconfig:"IMG_PATH"`
WebHookEndpoint string `json:"webhook_endpoint" envconfig:"WEBHOOK_ENDPOINT"`
WebHookSecret string `json:"webhook_secret" envconfig:"WEBHOOK_SECRET"`
@@ -123,6 +124,10 @@ func readDefaults(cfg *Config) *Config {
cfg.ImgPath = DEFAULT_IMG_DIR
}
if strings.TrimSpace(cfg.SearchPath) == "" {
cfg.SearchPath = DEFAULT_SEARCH_CACHE_DIR
}
return cfg
}

View File

@@ -0,0 +1,129 @@
package searchprovider
import (
"errors"
"path/filepath"
"sync"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/xmlmodels"
"github.com/blevesearch/bleve/v2"
"github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
"github.com/blevesearch/bleve/v2/analysis/char/html"
"github.com/blevesearch/bleve/v2/analysis/char/regexp"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/token/ngram"
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/mapping"
)
// Sentinel errors reported by SearchProvider.Index before any index is touched.
var (
	// NoKeyError is returned when the item to be indexed exposes no keys,
	// so there is nothing that could serve as the document ID.
	NoKeyError = errors.New("Missing ID key.")

	// NoLibError is returned when no library is passed along with the item.
	NoLibError = errors.New("Missing library.")
)
// ISearchable is implemented by every item that can be put into a search index.
type ISearchable interface {
	// Keys returns the identifiers of the item. SearchProvider.Index uses
	// Keys()[0] as the document ID and rejects items without any key.
	Keys() []string

	// Readable returns the field map that is handed to the index as the
	// document body. The library is passed in by the caller of
	// SearchProvider.Index.
	Readable(lib *xmlmodels.Library) map[string]any

	// Type names the kind of item; SearchProvider keeps one index per type.
	Type() string
}
// SearchProvider manages one bleve index per item type, all stored below a
// common base directory.
type SearchProvider struct {
	// indeces maps an item type name to its already opened bleve.Index.
	indeces sync.Map

	// basepath is the directory that holds the "<type>.bleve" index folders.
	basepath string
}
// NewSearchProvider returns a provider whose indexes live below basepath.
// No index is opened or created here; the returned error is currently
// always nil.
func NewSearchProvider(basepath string) (*SearchProvider, error) {
	return &SearchProvider{basepath: basepath}, nil
}
// Index stores item in the index that belongs to item.Type(), using the
// item's first key as document ID and item.Readable(lib) as document body.
//
// It returns NoKeyError if the item has no keys, NoLibError if lib is nil,
// and otherwise any error raised while obtaining the index or indexing.
func (sp *SearchProvider) Index(item ISearchable, lib *xmlmodels.Library) error {
	ids := item.Keys()
	switch {
	case len(ids) == 0:
		return NoKeyError
	case lib == nil:
		return NoLibError
	}

	idx, err := sp.FindCreateIndex(item.Type())
	if err != nil {
		return err
	}

	doc := item.Readable(lib)
	return idx.Index(ids[0], doc)
}
// FindCreateIndex returns the bleve index for the given item type.
//
// An index that was handed out before is served from the in-memory cache.
// Otherwise the index at "<basepath>/<typ>.bleve" is opened; if that path does
// not exist yet, a new index is created there with the default mapping.
// Successfully opened or created indexes are cached for later calls.
//
// Any failure to open or create the index is returned to the caller and
// nothing is cached, so a nil index is never handed out with a nil error.
func (sp *SearchProvider) FindCreateIndex(typ string) (bleve.Index, error) {
	if cached, ok := sp.indeces.Load(typ); ok {
		return cached.(bleve.Index), nil
	}

	fp := filepath.Join(sp.basepath, typ+".bleve")
	ind, err := bleve.Open(fp)
	if errors.Is(err, bleve.ErrorIndexPathDoesNotExist) {
		// First use of this type: build the index from scratch.
		indexMapping, mapErr := default_mapping()
		if mapErr != nil {
			return nil, mapErr
		}
		ind, err = bleve.New(fp, indexMapping)
	}
	// Covers both a failed Open (e.g. corrupt or locked index) and a failed New.
	if err != nil {
		return nil, err
	}

	sp.indeces.Store(typ, ind)
	return ind, nil
}
// default_mapping builds the index mapping used for newly created indexes.
//
// It registers a custom analyzer, "customNgramAnalyzer", and makes it the
// default analyzer of the mapping. The analyzer works as follows:
//
//  1. char filters: HTML markup is stripped first, then all ASCII punctuation
//     is removed. The HTML filter has to run before the punctuation filter:
//     it recognises tags by their angle brackets, which the punctuation
//     filter would otherwise have deleted already.
//  2. tokenizer: unicode word segmentation.
//  3. token filters: lowercase, n-grams of length 1 to 20, NFKD normalization.
//
// An error is returned if any of the components cannot be registered.
func default_mapping() (*mapping.IndexMappingImpl, error) {
	indexMapping := bleve.NewIndexMapping()

	ngramFilter := map[string]any{
		"type": ngram.Name,
		"min":  1,  // minimum n-gram size
		"max":  20, // maximum n-gram size
	}
	if err := indexMapping.AddCustomTokenFilter("customNgramFilter", ngramFilter); err != nil {
		return nil, err
	}

	punctuationFilter := map[string]any{
		"type":    regexp.Name,
		"regexp":  `[[:punct:]]+`, // Removes all punctuation characters
		"replace": "",
	}
	if err := indexMapping.AddCustomCharFilter("removePunctuation", punctuationFilter); err != nil {
		return nil, err
	}

	unicodeFilter := map[string]any{
		"type": unicodenorm.Name,
		"form": unicodenorm.NFKD,
	}
	if err := indexMapping.AddCustomTokenFilter("customUnicodeCharFilter", unicodeFilter); err != nil {
		return nil, err
	}

	analyzer := map[string]any{
		"type":      custom.Name,
		"tokenizer": unicode.Name,
		// Order matters: strip HTML while the tag delimiters are still present.
		"char_filters":  []string{html.Name, "removePunctuation"},
		"token_filters": []string{lowercase.Name, "customNgramFilter", "customUnicodeCharFilter"},
	}
	if err := indexMapping.AddCustomAnalyzer("customNgramAnalyzer", analyzer); err != nil {
		return nil, err
	}

	indexMapping.DefaultAnalyzer = "customNgramAnalyzer"
	return indexMapping, nil
}

View File

@@ -2,8 +2,11 @@ package xmlprovider
import "fmt"
type XMLItem interface {
// IXMLItem is the constraint for items handled by the generic provider and
// resolver types of this package.
type IXMLItem interface {
	fmt.Stringer

	// Keys returns the identifiers of the item.
	// INFO:
	// - Keys should be unique
	// - Keys[0] has the special meaning of the primary key (for FTS etc.)
	Keys() []string

	// Name returns the name of the item.
	Name() string
}
@@ -12,13 +15,13 @@ type ILibrary interface {
Parse(meta ParseMeta) error
}
type ResolvingMap[T XMLItem] map[string][]Resolved[T]
type ResolvingMap[T IXMLItem] map[string][]Resolved[T]
type ReferenceResolver[T XMLItem] interface {
type ReferenceResolver[T IXMLItem] interface {
References() ResolvingMap[T]
}
type Resolved[T XMLItem] struct {
type Resolved[T IXMLItem] struct {
Item *T
Reference string
Category string

View File

@@ -7,13 +7,13 @@ import (
"sync"
)
type Resolver[T XMLItem] struct {
// Resolver stores resolved references to items of type T, grouped first by
// type and then by ID.
type Resolver[T IXMLItem] struct {
	// INFO: map[type][ID]
	index map[string]map[string][]Resolved[T]

	// NOTE(review): presumably guards index against concurrent access;
	// confirm against the methods of Resolver.
	mu sync.RWMutex
}
func NewResolver[T XMLItem]() *Resolver[T] {
// NewResolver returns an empty Resolver whose index map is initialised and
// ready to be written to.
func NewResolver[T IXMLItem]() *Resolver[T] {
	r := new(Resolver[T])
	r.index = make(map[string]map[string][]Resolved[T])
	return r
}

View File

@@ -34,7 +34,7 @@ func (p ParseMeta) Failed(path string) bool {
}
// An XMLProvider is a struct that holds serialized XML data of a specific type. It combines multiple parses if a successful parse cannot serialize the data from a path.
type XMLProvider[T XMLItem] struct {
type XMLProvider[T IXMLItem] struct {
// INFO: map is type map[string]*T
Items sync.Map
// INFO: map is type [string]ItemInfo
@@ -50,7 +50,7 @@ type XMLProvider[T XMLItem] struct {
Array []T
}
func NewXMLProvider[T XMLItem]() *XMLProvider[T] {
// NewXMLProvider returns an XMLProvider for items of type T whose embedded
// Resolver is initialised; all other fields keep their zero value.
func NewXMLProvider[T IXMLItem]() *XMLProvider[T] {
	provider := &XMLProvider[T]{Resolver: *NewResolver[T]()}
	return provider
}
@@ -141,7 +141,7 @@ func (p *XMLProvider[T]) addResolvable(item T) {
}
}
func (p *XMLProvider[T]) ReverseLookup(item XMLItem) []Resolved[T] {
func (p *XMLProvider[T]) ReverseLookup(item IXMLItem) []Resolved[T] {
// INFO: this runs just once for the first key
ret := make([]Resolved[T], 0)
keys := item.Keys()

View File

@@ -5,7 +5,7 @@ import (
"strings"
)
func Sort[T XMLItem](i, j T) int {
func Sort[T IXMLItem](i, j T) int {
keys_a := i.Keys()
keys_b := j.Keys()