From fd2fa157b254a74db814528895348a3d9af4324d Mon Sep 17 00:00:00 2001 From: Simon Martens Date: Mon, 17 Feb 2025 21:42:20 +0100 Subject: [PATCH] Search Index Build --- app/kgpz.go | 92 ++++++++++++++++++- go.mod | 26 ++++++ go.sum | 56 ++++++++++++ providers/config.go | 5 ++ providers/search/searchprovider.go | 129 +++++++++++++++++++++++++++ providers/xmlprovider/models.go | 11 ++- providers/xmlprovider/resolver.go | 4 +- providers/xmlprovider/xmlprovider.go | 6 +- providers/xmlprovider/xmlsort.go | 2 +- scratchpad.md | 6 ++ xmlmodels/agents.go | 23 +++++ xmlmodels/categories.go | 22 +++++ xmlmodels/common.go | 34 ++++++- xmlmodels/issues.go | 23 +++++ xmlmodels/pieces.go | 53 ++++++++++- xmlmodels/places.go | 21 +++++ xmlmodels/references.go | 83 ++++++++++++++++- xmlmodels/works.go | 31 +++++++ 18 files changed, 611 insertions(+), 16 deletions(-) create mode 100644 providers/search/searchprovider.go diff --git a/app/kgpz.go b/app/kgpz.go index 5681160..9ee537d 100644 --- a/app/kgpz.go +++ b/app/kgpz.go @@ -10,6 +10,7 @@ import ( "github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging" "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers" "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/gnd" + searchprovider "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/search" "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider" "github.com/Theodor-Springmann-Stiftung/kgpz_web/xmlmodels" "github.com/gofiber/fiber/v2" @@ -50,6 +51,7 @@ type KGPZ struct { Repo *providers.GitProvider GND *gnd.GNDProvider Library *xmlmodels.Library + Search *searchprovider.SearchProvider } func NewKGPZ(config *providers.ConfigProvider) (*KGPZ, error) { @@ -68,7 +70,6 @@ func NewKGPZ(config *providers.ConfigProvider) (*KGPZ, error) { } func (k *KGPZ) Pre(srv *fiber.App) error { - // Check if folder exists and if yes, serve image files from i if _, err := os.Stat(k.Config.Config.ImgPath); err == nil { fs := os.DirFS(k.Config.Config.ImgPath) @@ -100,8 +101,15 @@ func (k *KGPZ) Init() error { logging.Error(err, "Error reading GND-Cache. Continuing.") } - go k.Enrich() + if sp, err := searchprovider.NewSearchProvider(filepath.Join(k.Config.Config.BaseDIR, k.Config.SearchPath)); err != nil { + logging.Error(err, "Error initializing SearchProvider. Continuing without Search.") + } else { + k.Search = sp + } + + k.Enrich() go k.Pull() + k.BuildSearchIndex() return nil } @@ -179,12 +187,89 @@ func (k *KGPZ) Enrich() error { defer k.fsmu.Unlock() data := xmlmodels.AgentsIntoDataset(k.Library.Agents) k.GND.FetchPersons(data) - k.GND.WriteCache(k.Config.GNDPath) + k.GND.WriteCache(filepath.Join(k.Config.BaseDIR, k.Config.GNDPath)) }() return nil } +func (k *KGPZ) BuildSearchIndex() error { + if k.Library == nil || k.Library.Agents == nil || k.Search == nil { + return nil + } + + go func() { + k.fsmu.Lock() + defer k.fsmu.Unlock() + wg := new(sync.WaitGroup) + wg.Add(6) + go func() { + for _, agent := range k.Library.Agents.Array { + err := k.Search.Index(agent, k.Library) + if err != nil { + logging.Error(err, "Error indexing agent") + } + } + wg.Done() + }() + + go func() { + for _, place := range k.Library.Places.Array { + err := k.Search.Index(place, k.Library) + if err != nil { + logging.Error(err, "Error indexing place") + } + } + wg.Done() + }() + + go func() { + for _, cat := range k.Library.Categories.Array { + err := k.Search.Index(cat, k.Library) + if err != nil { + logging.Error(err, "Error indexing category") + } + } + wg.Done() + }() + + go func() { + for _, work := range k.Library.Works.Array { + err := k.Search.Index(work, k.Library) + if err != nil { + logging.Error(err, "Error indexing work") + } + } + wg.Done() + }() + + go func() { + for _, issue := range k.Library.Issues.Array { + err := k.Search.Index(issue, k.Library) + if err != nil { + logging.Error(err, "Error indexing issue") + } + } + wg.Done() + }() + + go func() { + for _, piece := range k.Library.Pieces.Array { + err := k.Search.Index(piece, k.Library) + if err != nil { + logging.Error(err, "Error indexing piece") + } + } + wg.Done() + }() + + wg.Wait() + logging.Info("Search index built.") + + }() + return nil +} + func (k *KGPZ) Serialize() error { // TODO: this is error handling from hell // Preventing pulling and serializing at the same time @@ -226,6 +311,7 @@ func (k *KGPZ) Pull() { logging.ObjDebug(&k.Repo, "Remote changed. Reparsing") k.Serialize() k.Enrich() + k.BuildSearchIndex() } } diff --git a/go.mod b/go.mod index 9d0e8fa..7515e48 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/Theodor-Springmann-Stiftung/kgpz_web go 1.23.2 require ( + github.com/blevesearch/bleve/v2 v2.4.4 github.com/fsnotify/fsnotify v1.8.0 github.com/go-git/go-git/v5 v5.12.0 github.com/gofiber/fiber/v2 v2.52.5 @@ -17,19 +18,43 @@ require ( dario.cat/mergo v1.0.0 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/ProtonMail/go-crypto v1.0.0 // indirect + github.com/RoaringBitmap/roaring v1.9.3 // indirect github.com/andybalholm/brotli v1.0.5 // indirect + github.com/bits-and-blooms/bitset v1.12.0 // indirect + github.com/blevesearch/bleve_index_api v1.1.12 // indirect + github.com/blevesearch/geo v0.1.20 // indirect + github.com/blevesearch/go-faiss v1.0.24 // indirect + github.com/blevesearch/go-porterstemmer v1.0.3 // indirect + github.com/blevesearch/gtreap v0.1.1 // indirect + github.com/blevesearch/mmap-go v1.0.4 // indirect + github.com/blevesearch/scorch_segment_api/v2 v2.2.16 // indirect + github.com/blevesearch/segment v0.9.1 // indirect + github.com/blevesearch/snowballstem v0.9.0 // indirect + github.com/blevesearch/upsidedown_store_api v1.0.2 // indirect + github.com/blevesearch/vellum v1.0.10 // indirect + github.com/blevesearch/zapx/v11 v11.3.10 // indirect + github.com/blevesearch/zapx/v12 v12.3.10 // indirect + github.com/blevesearch/zapx/v13 v13.3.10 // indirect + github.com/blevesearch/zapx/v14 v14.3.10 // indirect + github.com/blevesearch/zapx/v15 v15.3.16 // indirect + github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect github.com/cloudflare/circl v1.3.7 // indirect github.com/cyphar/filepath-securejoin v0.2.4 // indirect github.com/emirpasic/gods v1.18.1 // indirect github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect github.com/go-git/go-billy/v5 v5.5.0 // indirect + github.com/golang/geo v0.0.0-20210211234256-740aa86cb551 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/golang/protobuf v1.3.2 // indirect + github.com/golang/snappy v0.0.1 // indirect github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect + github.com/json-iterator/go v0.0.0-20171115153421-f7279a603ede // indirect github.com/kevinburke/ssh_config v1.2.0 // indirect github.com/klauspost/compress v1.17.0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect github.com/mattn/go-runewidth v0.0.15 // indirect + github.com/mschoch/smat v0.2.0 // indirect github.com/philhofer/fwd v1.1.2 // indirect github.com/pjbgf/sha1cd v0.3.0 // indirect github.com/rivo/uniseg v0.2.0 // indirect @@ -40,6 +65,7 @@ require ( github.com/valyala/fasthttp v1.51.0 // indirect github.com/valyala/tcplisten v1.0.0 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect + go.etcd.io/bbolt v1.3.7 // indirect golang.org/x/crypto v0.21.0 // indirect golang.org/x/mod v0.12.0 // indirect golang.org/x/net v0.22.0 // indirect diff --git a/go.sum b/go.sum index 573e1b1..2fa4f5a 100644 --- a/go.sum +++ b/go.sum @@ -5,12 +5,52 @@ github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migc github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= github.com/ProtonMail/go-crypto v1.0.0 h1:LRuvITjQWX+WIfr930YHG2HNfjR1uOfyf5vE0kC2U78= github.com/ProtonMail/go-crypto v1.0.0/go.mod h1:EjAoLdwvbIOoOQr3ihjnSoLZRtE8azugULFRteWMNc0= +github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4S2OByM= +github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90= github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs= github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be/go.mod h1:ySMOLuWl6zY27l47sB3qLNK6tF2fkHG55UZxx8oIVo4= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= +github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA= +github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/blevesearch/bleve/v2 v2.4.4 h1:RwwLGjUm54SwyyykbrZs4vc1qjzYic4ZnAnY9TwNl60= +github.com/blevesearch/bleve/v2 v2.4.4/go.mod h1:fa2Eo6DP7JR+dMFpQe+WiZXINKSunh7WBtlDGbolKXk= +github.com/blevesearch/bleve_index_api v1.1.12 h1:P4bw9/G/5rulOF7SJ9l4FsDoo7UFJ+5kexNy1RXfegY= +github.com/blevesearch/bleve_index_api v1.1.12/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8= +github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM= +github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w= +github.com/blevesearch/go-faiss v1.0.24 h1:K79IvKjoKHdi7FdiXEsAhxpMuns0x4fM0BO93bW5jLI= +github.com/blevesearch/go-faiss v1.0.24/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= +github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo= +github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M= +github.com/blevesearch/gtreap v0.1.1 h1:2JWigFrzDMR+42WGIN/V2p0cUvn4UP3C4Q5nmaZGW8Y= +github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgYICSZ3w0tYk= +github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= +github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= +github.com/blevesearch/scorch_segment_api/v2 v2.2.16 h1:uGvKVvG7zvSxCwcm4/ehBa9cCEuZVE+/zvrSl57QUVY= +github.com/blevesearch/scorch_segment_api/v2 v2.2.16/go.mod h1:VF5oHVbIFTu+znY1v30GjSpT5+9YFs9dV2hjvuh34F0= +github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU= +github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= +github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s= +github.com/blevesearch/snowballstem v0.9.0/go.mod h1:PivSj3JMc8WuaFkTSRDW2SlrulNWPl4ABg1tC/hlgLs= +github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A= +github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ= +github.com/blevesearch/vellum v1.0.10 h1:HGPJDT2bTva12hrHepVT3rOyIKFFF4t7Gf6yMxyMIPI= +github.com/blevesearch/vellum v1.0.10/go.mod h1:ul1oT0FhSMDIExNjIxHqJoGpVrBpKCdgDQNxfqgJt7k= +github.com/blevesearch/zapx/v11 v11.3.10 h1:hvjgj9tZ9DeIqBCxKhi70TtSZYMdcFn7gDb71Xo/fvk= +github.com/blevesearch/zapx/v11 v11.3.10/go.mod h1:0+gW+FaE48fNxoVtMY5ugtNHHof/PxCqh7CnhYdnMzQ= +github.com/blevesearch/zapx/v12 v12.3.10 h1:yHfj3vXLSYmmsBleJFROXuO08mS3L1qDCdDK81jDl8s= +github.com/blevesearch/zapx/v12 v12.3.10/go.mod h1:0yeZg6JhaGxITlsS5co73aqPtM04+ycnI6D1v0mhbCs= +github.com/blevesearch/zapx/v13 v13.3.10 h1:0KY9tuxg06rXxOZHg3DwPJBjniSlqEgVpxIqMGahDE8= +github.com/blevesearch/zapx/v13 v13.3.10/go.mod h1:w2wjSDQ/WBVeEIvP0fvMJZAzDwqwIEzVPnCPrz93yAk= +github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz77pSwwKU= +github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns= +github.com/blevesearch/zapx/v15 v15.3.16 h1:Ct3rv7FUJPfPk99TI/OofdC+Kpb4IdyfdMH48sb+FmE= +github.com/blevesearch/zapx/v15 v15.3.16/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg= +github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b h1:ju9Az5YgrzCeK3M1QwvZIpxYhChkXp7/L0RhDYsxXoE= +github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b/go.mod h1:BlrYNpOu4BvVRslmIG+rLtKhmjIaRhIbG8sb9scGTwI= github.com/bwesterb/go-ristretto v1.2.3/go.mod h1:fUIoIZaG73pV5biE2Blr2xEzDoMj7NFEuV9ekS419A0= github.com/cloudflare/circl v1.3.3/go.mod h1:5XYMA4rFBvNIrhs50XuiBJ15vF2pZn4nnUKZrLbUZFA= github.com/cloudflare/circl v1.3.7 h1:qlCDlTPz2n9fu58M0Nh1J/JzcFpfgkFHHX3O35r5vcU= @@ -40,14 +80,24 @@ github.com/gofiber/fiber/v2 v2.52.5 h1:tWoP1MJQjGEe4GB5TUGOi7P2E0ZMMRx5ZTG4rT+yG github.com/gofiber/fiber/v2 v2.52.5/go.mod h1:KEOE+cXMhXG0zHc9d8+E38hoX+ZN7bhOtgeF2oT6jrQ= github.com/gofiber/storage/memory/v2 v2.0.1 h1:tAETnom9uvEB9B3I2LkgewiuqYDAH0ItrIsmT8MUEwk= github.com/gofiber/storage/memory/v2 v2.0.1/go.mod h1:RRo3RfX6nTD/UhERyE/u5LcSfqtMo9dA4ltmieSe+QM= +github.com/golang/geo v0.0.0-20210211234256-740aa86cb551 h1:gtexQ/VGyN+VVFRXSFiguSNcXmS6rkKT+X7FdIrTtfo= +github.com/golang/geo v0.0.0-20210211234256-740aa86cb551/go.mod h1:QZ0nwyI2jOfgRAoBvP+ab5aRr7c9x7lhGEJrKvBwjWI= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/protobuf v1.3.2 h1:6nsPYzhq5kReh6QImI3k5qWzO4PEbvbIW2cwSfR/6xs= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4= +github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOlocH6Fxy8MmwDt+yVQYULKfN0RoTN8A= github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99/go.mod h1:1lJo3i6rXxKeerYnT8Nvf0QmHCRC1n8sfWVwXF2Frvo= +github.com/json-iterator/go v0.0.0-20171115153421-f7279a603ede h1:YrgBGwxMRK0Vq0WSCWFaZUnTsrA/PZE/xs1QZh+/edg= +github.com/json-iterator/go v0.0.0-20171115153421-f7279a603ede/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/kelseyhightower/envconfig v1.4.0 h1:Im6hONhd3pLkfDFsbRgu68RDNkGF1r3dvMUtDTo2cv8= github.com/kelseyhightower/envconfig v1.4.0/go.mod h1:cccZRl6mQpaq41TPp5QxidR+Sa3axMbJDNb//FQX6Gg= github.com/kevinburke/ssh_config v1.2.0 h1:x584FjTGwHzMwvHx18PXxbBVzfnxogHaAReU4gf13a4= @@ -68,6 +118,8 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U= github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= +github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI= github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M= github.com/philhofer/fwd v1.1.2 h1:bnDivRJ1EWPjUIRXV5KfORO897HTbpFAQddBdE8t7Gw= @@ -90,6 +142,7 @@ github.com/skeema/knownhosts v1.2.2/go.mod h1:xYbVRSPxqBZFrdmDyMmsOs+uX1UZC3nTN3 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/tinylib/msgp v1.1.8 h1:FCXC1xanKO4I8plpHGH2P7koL/RzZs12l/+r7vakfm0= @@ -105,6 +158,8 @@ github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI github.com/yalue/merged_fs v1.3.0 h1:qCeh9tMPNy/i8cwDsQTJ5bLr6IRxbs6meakNE5O+wyY= github.com/yalue/merged_fs v1.3.0/go.mod h1:WqqchfVYQyclV2tnR7wtRhBddzBvLVR83Cjw9BKQw0M= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.etcd.io/bbolt v1.3.7 h1:j+zJOnnEjF/kyHlDDgGnVL/AIqIJPq8UoB2GSNfkUfQ= +go.etcd.io/bbolt v1.3.7/go.mod h1:N9Mkw9X8x5fupy0IKsmuqVtoGDyxsaDlbk4Rd05IAQw= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= @@ -182,5 +237,6 @@ gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME= gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/providers/config.go b/providers/config.go index be8901b..33213ba 100644 --- a/providers/config.go +++ b/providers/config.go @@ -41,6 +41,7 @@ type Config struct { GITPath string `json:"git_path" envconfig:"GIT_PATH"` GNDPath string `json:"gnd_path" envconfig:"GND_PATH"` GeoPath string `json:"geo_path" envconfig:"GEO_PATH"` + SearchPath string `json:"search_path" envconfig:"SEARCH_PATH"` ImgPath string `json:"img_path" envconfig:"IMG_PATH"` WebHookEndpoint string `json:"webhook_endpoint" envconfig:"WEBHOOK_ENDPOINT"` WebHookSecret string `json:"webhook_secret" envconfig:"WEBHOOK_SECRET"` @@ -123,6 +124,10 @@ func readDefaults(cfg *Config) *Config { cfg.ImgPath = DEFAULT_IMG_DIR } + if strings.TrimSpace(cfg.SearchPath) == "" { + cfg.SearchPath = DEFAULT_SEARCH_CACHE_DIR + } + return cfg } diff --git a/providers/search/searchprovider.go b/providers/search/searchprovider.go new file mode 100644 index 0000000..61e21c8 --- /dev/null +++ b/providers/search/searchprovider.go @@ -0,0 +1,129 @@ +package searchprovider + +import ( + "errors" + "path/filepath" + "sync" + + "github.com/Theodor-Springmann-Stiftung/kgpz_web/xmlmodels" + "github.com/blevesearch/bleve/v2" + "github.com/blevesearch/bleve/v2/analysis/analyzer/custom" + "github.com/blevesearch/bleve/v2/analysis/char/html" + "github.com/blevesearch/bleve/v2/analysis/char/regexp" + "github.com/blevesearch/bleve/v2/analysis/token/lowercase" + "github.com/blevesearch/bleve/v2/analysis/token/ngram" + "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm" + "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" + "github.com/blevesearch/bleve/v2/mapping" +) + +var NoKeyError = errors.New("Missing ID key.") +var NoLibError = errors.New("Missing library.") + +type ISearchable interface { + Keys() []string + Readable(lib *xmlmodels.Library) map[string]interface{} + Type() string +} + +type SearchProvider struct { + indeces sync.Map + basepath string +} + +func NewSearchProvider(basepath string) (*SearchProvider, error) { + sp := &SearchProvider{basepath: basepath} + return sp, nil +} + +func (sp *SearchProvider) Index(item ISearchable, lib *xmlmodels.Library) error { + keys := item.Keys() + if len(keys) == 0 { + return NoKeyError + } + if lib == nil { + return NoLibError + } + + i, err := sp.FindCreateIndex(item.Type()) + if err != nil { + return err + } + + return i.Index(keys[0], item.Readable(lib)) +} + +func (sp *SearchProvider) FindCreateIndex(typ string) (bleve.Index, error) { + index, ok := sp.indeces.Load(typ) + if ok { + i := index.(bleve.Index) + return i, nil + } + + fp := filepath.Join(sp.basepath, typ+".bleve") + ind, err := bleve.Open(fp) + if err == bleve.ErrorIndexPathDoesNotExist { + mapping, err := default_mapping() + if err != nil { + return nil, err + } + ind, err = bleve.New(filepath.Join(fp), mapping) + if err != nil { + return nil, err + } + } + sp.indeces.Store(typ, ind) + + return ind, nil +} + +func default_mapping() (*mapping.IndexMappingImpl, error) { + indexMapping := bleve.NewIndexMapping() + + customunicodeFilter := map[string]interface{}{ + "type": unicodenorm.Name, + "form": unicodenorm.NFKD, + } + + customCharFilterConfig := map[string]interface{}{ + "type": regexp.Name, + "regexp": `[[:punct:]]+`, // Removes all punctuation characters + "replace": "", + } + + customNgramFilterConfig := map[string]interface{}{ + "type": ngram.Name, + "min": 1, // minimum n-gram size + "max": 20, // maximum n-gram size + } + + customNgramAnalyzer := map[string]interface{}{ + "type": custom.Name, + "tokenizer": unicode.Name, + "char_filters": []string{"removePunctuation", html.Name}, + "token_filters": []string{lowercase.Name, "customNgramFilter", "customUnicodeCharFilter"}, + } + + err := indexMapping.AddCustomTokenFilter("customNgramFilter", customNgramFilterConfig) + if err != nil { + return nil, err + } + + err = indexMapping.AddCustomCharFilter("removePunctuation", customCharFilterConfig) + if err != nil { + return nil, err + } + + err = indexMapping.AddCustomTokenFilter("customUnicodeCharFilter", customunicodeFilter) + if err != nil { + return nil, err + } + + err = indexMapping.AddCustomAnalyzer("customNgramAnalyzer", customNgramAnalyzer) + if err != nil { + return nil, err + } + + indexMapping.DefaultAnalyzer = "customNgramAnalyzer" + return indexMapping, nil +} diff --git a/providers/xmlprovider/models.go b/providers/xmlprovider/models.go index e849c6c..0afb9f9 100644 --- a/providers/xmlprovider/models.go +++ b/providers/xmlprovider/models.go @@ -2,8 +2,11 @@ package xmlprovider import "fmt" -type XMLItem interface { +type IXMLItem interface { fmt.Stringer + // INFO: + // - Keys should be unique + // - Keys[0] has the special meaning of the primary key (for FTS etc.) Keys() []string Name() string } @@ -12,13 +15,13 @@ type ILibrary interface { Parse(meta ParseMeta) error } -type ResolvingMap[T XMLItem] map[string][]Resolved[T] +type ResolvingMap[T IXMLItem] map[string][]Resolved[T] -type ReferenceResolver[T XMLItem] interface { +type ReferenceResolver[T IXMLItem] interface { References() ResolvingMap[T] } -type Resolved[T XMLItem] struct { +type Resolved[T IXMLItem] struct { Item *T Reference string Category string diff --git a/providers/xmlprovider/resolver.go b/providers/xmlprovider/resolver.go index 30bed65..88b52a3 100644 --- a/providers/xmlprovider/resolver.go +++ b/providers/xmlprovider/resolver.go @@ -7,13 +7,13 @@ import ( "sync" ) -type Resolver[T XMLItem] struct { +type Resolver[T IXMLItem] struct { // INFO: map[type][ID] index map[string]map[string][]Resolved[T] mu sync.RWMutex } -func NewResolver[T XMLItem]() *Resolver[T] { +func NewResolver[T IXMLItem]() *Resolver[T] { return &Resolver[T]{index: make(map[string]map[string][]Resolved[T])} } diff --git a/providers/xmlprovider/xmlprovider.go b/providers/xmlprovider/xmlprovider.go index 878b608..b4c4a67 100644 --- a/providers/xmlprovider/xmlprovider.go +++ b/providers/xmlprovider/xmlprovider.go @@ -34,7 +34,7 @@ func (p ParseMeta) Failed(path string) bool { } // An XMLProvider is a struct that holds holds serialized XML data of a specific type. It combines multiple parses IF a succeeded parse can not serialize the data from a path. -type XMLProvider[T XMLItem] struct { +type XMLProvider[T IXMLItem] struct { // INFO: map is type map[string]*T Items sync.Map // INFO: map is type [string]ItemInfo @@ -50,7 +50,7 @@ type XMLProvider[T XMLItem] struct { Array []T } -func NewXMLProvider[T XMLItem]() *XMLProvider[T] { +func NewXMLProvider[T IXMLItem]() *XMLProvider[T] { return &XMLProvider[T]{Resolver: *NewResolver[T]()} } @@ -141,7 +141,7 @@ func (p *XMLProvider[T]) addResolvable(item T) { } } -func (p *XMLProvider[T]) ReverseLookup(item XMLItem) []Resolved[T] { +func (p *XMLProvider[T]) ReverseLookup(item IXMLItem) []Resolved[T] { // INFO: this runs just once for the first key ret := make([]Resolved[T], 0) keys := item.Keys() diff --git a/providers/xmlprovider/xmlsort.go b/providers/xmlprovider/xmlsort.go index 00de5c6..f867bd9 100644 --- a/providers/xmlprovider/xmlsort.go +++ b/providers/xmlprovider/xmlsort.go @@ -5,7 +5,7 @@ import ( "strings" ) -func Sort[T XMLItem](i, j T) int { +func Sort[T IXMLItem](i, j T) int { keys_a := i.Keys() keys_b := j.Keys() diff --git a/scratchpad.md b/scratchpad.md index ac3d6c3..7b0b30e 100644 --- a/scratchpad.md +++ b/scratchpad.md @@ -54,3 +54,9 @@ In Kombination: Anderes: - Provinienz in Werk + + +Suche: +- Phrasensuche +- AND, OR +- Groß- und Kleinschreibung ignorieren diff --git a/xmlmodels/agents.go b/xmlmodels/agents.go index 6447489..91d7c59 100644 --- a/xmlmodels/agents.go +++ b/xmlmodels/agents.go @@ -3,6 +3,11 @@ package xmlmodels import ( "encoding/json" "encoding/xml" + "strings" +) + +const ( + AGENT_TYPE = "agent" ) type Agent struct { @@ -24,3 +29,21 @@ func (a Agent) String() string { data, _ := json.MarshalIndent(a, "", " ") return string(data) } + +func (a Agent) Readable(_ *Library) map[string]interface{} { + ret := map[string]interface{}{ + "ID": a.ID, + "Names": strings.Join(a.Names, "; "), + "Life": a.Life, + } + + for k, v := range a.AnnotationNote.Readable() { + ret[k] = v + } + + return ret +} + +func (a Agent) Type() string { + return AGENT_TYPE +} diff --git a/xmlmodels/categories.go b/xmlmodels/categories.go index 98a3f66..915b3be 100644 --- a/xmlmodels/categories.go +++ b/xmlmodels/categories.go @@ -3,6 +3,11 @@ package xmlmodels import ( "encoding/json" "encoding/xml" + "strings" +) + +const ( + CATEGORY_TYPE = "category" ) type Category struct { @@ -21,3 +26,20 @@ func (c Category) String() string { data, _ := json.MarshalIndent(c, "", " ") return string(data) } + +func (c Category) Readable(_ *Library) map[string]interface{} { + ret := map[string]interface{}{ + "ID": c.ID, + "Names": strings.Join(c.Names, "; "), + } + + for k, v := range c.AnnotationNote.Readable() { + ret[k] = v + } + + return ret +} + +func (c Category) Type() string { + return CATEGORY_TYPE +} diff --git a/xmlmodels/common.go b/xmlmodels/common.go index daa15a9..6bbcc61 100644 --- a/xmlmodels/common.go +++ b/xmlmodels/common.go @@ -3,6 +3,7 @@ package xmlmodels import ( "encoding/xml" "errors" + "strings" "github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/xsdtime" ) @@ -38,6 +39,24 @@ type AnnotationNote struct { Notes []Note `xml:"vermerk"` } +func (an AnnotationNote) Readable() map[string]interface{} { + ret := make(map[string]interface{}) + annnotations := make([]string, len(an.Annotations)) + for _, a := range an.Annotations { + annnotations = append(annnotations, a.Chardata) + } + + ret["Annotations"] = strings.Join(annnotations, "; ") + + nots := make([]string, len(an.Notes)) + for _, n := range an.Notes { + nots = append(nots, n.Chardata) + } + + ret["Notes"] = strings.Join(nots, "; ") + return ret +} + type Annotation struct { XMLName xml.Name `xml:"anmerkung"` Value @@ -57,7 +76,7 @@ type Identifier struct { func (i Identifier) Keys() []string { if len(i.keys) == 0 { - i.keys = append(i.keys, i.ID) + i.keys = []string{i.ID} } return i.keys } @@ -69,6 +88,19 @@ type Reference struct { Inner Inner } +func (r Reference) Readable(lib *Library) map[string]interface{} { + data := make(map[string]interface{}) + if r.Category != "" { + cat := lib.Categories.Item(r.Category) + if cat != nil { + data["ReferenceCategory"] = cat.Names + } + } + + data["ReferenceComment"] = r.Inner.InnerXML + return data +} + type Value struct { Chardata string `xml:",chardata"` } diff --git a/xmlmodels/issues.go b/xmlmodels/issues.go index 8b71a94..5fcb8bb 100644 --- a/xmlmodels/issues.go +++ b/xmlmodels/issues.go @@ -6,6 +6,10 @@ import ( "strconv" ) +const ( + ISSUE_TYPE = "issue" +) + type Issue struct { XMLName xml.Name `xml:"stueck"` Number Nummer `xml:"nummer"` @@ -57,3 +61,22 @@ func (i Issue) String() string { data, _ := json.MarshalIndent(i, "", " ") return string(data) } + +func (i Issue) Readable(_ *Library) map[string]interface{} { + ret := map[string]interface{}{ + "ID": i.ID, + "Number": i.Number.No, + "Year": i.Datum.When.Year, + "Date": i.Datum.When.String(), + } + + for k, v := range i.AnnotationNote.Readable() { + ret[k] = v + } + + return ret +} + +func (i Issue) Type() string { + return ISSUE_TYPE +} diff --git a/xmlmodels/pieces.go b/xmlmodels/pieces.go index f09a977..86d346a 100644 --- a/xmlmodels/pieces.go +++ b/xmlmodels/pieces.go @@ -10,6 +10,10 @@ import ( "github.com/google/uuid" ) +const ( + PIECES_CATEGORY = "piece" +) + type Piece struct { XMLName xml.Name `xml:"beitrag"` IssueRefs []IssueRef `xml:"stueck"` @@ -39,13 +43,14 @@ func (p Piece) Keys() []string { return p.keys } - ret := make([]string, 2) + ret := make([]string, 0, 3) if p.ID != "" { ret = append(ret, p.ID) } // TODO: sensible IDs uid := uuid.New() + ret = append(ret, uid.String()) for _, i := range p.IssueRefs { ret = append(ret, strconv.Itoa(i.When.Year)+"-"+strconv.Itoa(i.Nr)+"-"+uid.String()) @@ -212,3 +217,49 @@ func (p Piece) ReferencesWork(id string) (*WorkRef, bool) { } return nil, false } + +func (p Piece) Readable(lib *Library) map[string]interface{} { + data := make(map[string]interface{}) + data["Title"] = p.Title + data["Incipit"] = p.Incipit + + for k, v := range p.AnnotationNote.Readable() { + data[k] = v + } + + agents := make([]map[string]interface{}, len(p.AgentRefs)) + for k, v := range p.AgentRefs { + agents[k] = v.Readable(lib) + } + data["Agents"] = agents + + works := make([]map[string]interface{}, len(p.WorkRefs)) + for k, v := range p.WorkRefs { + works[k] = v.Readable(lib) + } + data["Works"] = works + + places := make([]map[string]interface{}, len(p.PlaceRefs)) + for k, v := range p.PlaceRefs { + places[k] = v.Readable(lib) + } + data["Places"] = places + + categories := make([]map[string]interface{}, len(p.CategoryRefs)) + for k, v := range p.CategoryRefs { + categories[k] = v.Readable(lib) + } + data["Categories"] = categories + + issuerefs := make([]map[string]interface{}, len(p.IssueRefs)) + for k, v := range p.IssueRefs { + issuerefs[k] = v.Readable(lib) + } + data["Issues"] = issuerefs + + return data +} + +func (p Piece) Type() string { + return PIECES_CATEGORY +} diff --git a/xmlmodels/places.go b/xmlmodels/places.go index 7f95b90..d792809 100644 --- a/xmlmodels/places.go +++ b/xmlmodels/places.go @@ -5,6 +5,10 @@ import ( "encoding/xml" ) +const ( + PLACE_TYPE = "place" +) + type Place struct { XMLName xml.Name `xml:"ort"` Names []string `xml:"name"` @@ -22,3 +26,20 @@ func (p Place) String() string { data, _ := json.MarshalIndent(p, "", " ") return string(data) } + +func (p Place) Readable(_ *Library) map[string]interface{} { + ret := map[string]interface{}{ + "ID": p.ID, + "Names": p.Names, + } + + for k, v := range p.AnnotationNote.Readable() { + ret[k] = v + } + + return ret +} + +func (p Place) Type() string { + return PLACE_TYPE +} diff --git a/xmlmodels/references.go b/xmlmodels/references.go index cd1c582..159b3d3 100644 --- a/xmlmodels/references.go +++ b/xmlmodels/references.go @@ -1,6 +1,9 @@ package xmlmodels -import "encoding/xml" +import ( + "encoding/xml" + "strconv" +) type AgentRef struct { XMLName xml.Name `xml:"akteur"` @@ -12,6 +15,19 @@ func (ar AgentRef) Name() string { return x.Name() } +func (ar AgentRef) Readable(lib *Library) map[string]interface{} { + data := make(map[string]interface{}) + agent := lib.Agents.Item(ar.Ref) + if agent != nil { + data["AgentNames"] = agent.Names + } + + for k, v := range ar.Reference.Readable(lib) { + data[k] = v + } + return data +} + type IssueRef struct { XMLName xml.Name `xml:"stueck"` Nr int `xml:"nr,attr"` @@ -22,6 +38,25 @@ type IssueRef struct { Reference // Nicht im Schema } +func (ir IssueRef) Readable(lib *Library) map[string]interface{} { + data := make(map[string]interface{}) + if ir.When.Year != 0 { + data["IssueYear"] = ir.When.Year + } else { + return data + } + + issuekey := strconv.Itoa(ir.When.Year) + "-" + strconv.Itoa(ir.Nr) + issue := lib.Issues.Item(issuekey) + if issue != nil { + data["IssueDate"] = issue.Datum.When.String() + } + + data["IssueNumber"] = ir.Nr + + return data +} + func (ir IssueRef) Name() string { var x Issue return x.Name() @@ -32,6 +67,19 @@ type PlaceRef struct { Reference } +func (pr *PlaceRef) Readable(lib *Library) map[string]interface{} { + data := make(map[string]interface{}) + place := lib.Places.Item(pr.Ref) + if place != nil { + data["PlaceNames"] = place.Names + } + + for k, v := range pr.Reference.Readable(lib) { + data[k] = v + } + return data +} + func (pr PlaceRef) Name() string { var x Place return x.Name() @@ -47,12 +95,45 @@ func (cr CategoryRef) Name() string { return x.Name() } +func (cr CategoryRef) Readable(lib *Library) map[string]interface{} { + data := make(map[string]interface{}) + cat := lib.Categories.Item(cr.Ref) + if cat != nil { + data["CategoryNames"] = cat.Names + } + + for k, v := range cr.Reference.Readable(lib) { + data[k] = v + } + return data +} + type WorkRef struct { XMLName xml.Name `xml:"werk"` Page string `xml:"s,attr"` Reference } +func (wr WorkRef) Readable(lib *Library) map[string]interface{} { + data := make(map[string]interface{}) + work := lib.Works.Item(wr.Ref) + if work != nil { + data["WorkTitle"] = work.Citation.Title + data["WorkYear"] = work.Citation.Year + data["WorkPreferredTitle"] = work.PreferredTitle + prefs := make([]map[string]interface{}, len(work.AgentRefs)) + for k, v := range work.AgentRefs { + prefs[k] = v.Readable(lib) + } + data["WorkAgents"] = prefs + } + + for k, v := range wr.Reference.Readable(lib) { + data[k] = v + } + return data +} + func (wr WorkRef) Name() string { var x Work return x.Name() diff --git a/xmlmodels/works.go b/xmlmodels/works.go index 800110f..3c8efae 100644 --- a/xmlmodels/works.go +++ b/xmlmodels/works.go @@ -7,6 +7,10 @@ import ( "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider" ) +const ( + WORKS_CATEGORY = "work" +) + type Work struct { XMLName xml.Name `xml:"werk"` URLs []URL `xml:"url"` @@ -49,3 +53,30 @@ func (w Work) String() string { data, _ := json.MarshalIndent(w, "", " ") return string(data) } + +func (w Work) Readable(lib *Library) map[string]interface{} { + ret := map[string]interface{}{ + "ID": w.ID, + "PreferredTitle": w.PreferredTitle, + "Title": w.Citation.Title, + "Year": w.Citation.Year, + "CitationTitle": w.Citation.Title, + } + + for k, v := range w.AnnotationNote.Readable() { + ret[k] = v + } + + agents := make([]map[string]interface{}, len(w.AgentRefs)) + for k, v := range w.AgentRefs { + agents[k] = v.Readable(lib) + } + + ret["Agents"] = agents + + return ret +} + +func (w Work) Type() string { + return WORKS_CATEGORY +}