Added arrays to XMLProvider types for faster iteration

Simon Martens
2024-12-20 19:54:25 +01:00
parent 3ef30ef7c7
commit 977d7331d5
11 changed files with 188 additions and 122 deletions
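For context on the commit message: iterating a sync.Map costs a callback invocation and a type assertion per entry, while a plain slice ranges over contiguous memory; that is what the new Array fields buy (see the TODO comments in the XMLProvider diff at the bottom of this commit). A minimal, self-contained sketch of the two iteration styles (illustrative types, not the repo's code):

package main

import (
	"fmt"
	"sync"
)

type Agent struct{ ID, GND string }

func main() {
	agents := []Agent{{"a1", "g1"}, {"a2", "g2"}}

	var m sync.Map
	arr := make([]Agent, 0, len(agents))
	for _, a := range agents {
		m.Store(a.ID, a) // values are boxed as interface{}
		arr = append(arr, a)
	}

	// sync.Map: one callback and one type assertion per entry.
	m.Range(func(_, v any) bool {
		fmt.Println(v.(Agent).ID)
		return true
	})

	// Slice: contiguous memory, no boxing, plain range loop.
	for _, a := range arr {
		fmt.Println(a.ID)
	}
}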

View File

@@ -23,9 +23,8 @@ const (
)
type KGPZ struct {
// LMU is here for file system access
lmu sync.Mutex
// GMU is only here to prevent concurrent pulls
// or file system operations while parsing
gmu sync.Mutex
Config *providers.ConfigProvider
Repo *providers.GitProvider
@@ -78,28 +77,23 @@ func (k *KGPZ) Enrich() error {
k.InitGND()
}
k.lmu.Lock()
defer k.lmu.Unlock()
if k.Library == nil || k.Library.Agents == nil {
return nil
}
// INFO: We pass agents by value since we don't want to block the library
agents := k.Library.Agents.Everything()
go func(agents []*xmlprovider.Agent) {
k.GND.FetchPersons(agents)
// TODO: Library locking is never needed, since the library items, once set, are never changed
// We only need to check if set
go func() {
data := gnd.ProviderIntoDataset(k.Library.Agents)
k.GND.FetchPersons(data)
k.GND.WriteCache(k.Config.GNDPath)
}(agents)
}()
return nil
}
func (k *KGPZ) Serialize() {
// TODO: this is error handling from hell
// There is no need to recreate the whole library if the paths haven't changed
// We do it to keep the old data if the new data is missing
// Preventing pulling and serializing at the same time
k.gmu.Lock()
defer k.gmu.Unlock()
@@ -115,23 +109,15 @@ func (k *KGPZ) Serialize() {
pieces, err := getXMLFiles(filepath.Join(k.Config.FolderPath, PIECES_DIR))
helpers.Assert(err, "Error getting pieces")
k.lmu.Lock()
defer k.lmu.Unlock()
if k.Library == nil {
lib := xmlprovider.NewLibrary(
k.Library = xmlprovider.NewLibrary(
[]string{filepath.Join(k.Config.FolderPath, AGENTS_PATH)},
[]string{filepath.Join(k.Config.FolderPath, PLACES_PATH)},
[]string{filepath.Join(k.Config.FolderPath, WORKS_PATH)},
[]string{filepath.Join(k.Config.FolderPath, CATEGORIES_PATH)},
*issues,
*pieces)
lib.Serialize(commit)
k.Library = lib
} else {
// TODO: where to clear the old data?
// How to differentiate between deleted data points and stale data points bc of parse errors?
k.Library.SetPaths(
[]string{filepath.Join(k.Config.FolderPath, AGENTS_PATH)},
[]string{filepath.Join(k.Config.FolderPath, PLACES_PATH)},
@@ -139,8 +125,8 @@ func (k *KGPZ) Serialize() {
[]string{filepath.Join(k.Config.FolderPath, CATEGORIES_PATH)},
*issues,
*pieces)
k.Library.Serialize(commit)
}
k.Library.Serialize(commit)
}
func (k *KGPZ) IsDebug() bool {
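The rewritten Enrich takes its snapshot inside the goroutine via gnd.ProviderIntoDataset, copying plain values out under a short lock instead of handing the goroutine live *Agent pointers from Everything(). The shape of that pattern as a runnable sketch (Provider, snapshot, and process are illustrative stand-ins, not this repo's types):

package main

import "sync"

type Provider struct {
	mu    sync.Mutex
	items []string
}

// snapshot copies the data out under a short lock, so the slow work
// below runs on plain values and never touches the locked provider.
func (p *Provider) snapshot() []string {
	p.mu.Lock()
	defer p.mu.Unlock()
	return append([]string(nil), p.items...)
}

func enrich(p *Provider, done chan<- struct{}) {
	go func() {
		data := p.snapshot() // brief lock inside the goroutine, as in Enrich
		process(data)        // e.g. network fetches and cache writes
		close(done)
	}()
}

func process([]string) {}

func main() {
	done := make(chan struct{})
	enrich(&Provider{items: []string{"a", "b"}}, done)
	<-done
}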

View File

@@ -13,7 +13,6 @@ import (
"time"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
)
const (
@@ -77,7 +76,6 @@ func (p *GNDProvider) readPersons(folder string) error {
func (p *GNDProvider) readPerson(file string) {
person := Person{}
// JSON unmarshalling of the file and sanity check:
f, err := os.Open(file)
if err != nil {
logging.Error(err, "Error opening file for reading: "+file)
@@ -96,8 +94,8 @@ func (p *GNDProvider) readPerson(file string) {
return
}
if person.Agent.GND != "" {
p.Persons.Store(person.Agent.GND, person)
if person.KGPZURL != "" {
p.Persons.Store(person.KGPZURL, person)
return
}
}
@@ -168,14 +166,14 @@ func (p *GNDProvider) Person(id string) *Person {
return &pers
}
func (p *GNDProvider) FetchPersons(persons []*xmlprovider.Agent) {
func (p *GNDProvider) FetchPersons(persons []GNDData) {
wg := sync.WaitGroup{}
for _, person := range persons {
if person.ID == "" || person.GND == "" {
continue
}
// INFO: person already fetched; check for updates??
// TODO: person already fetched; check for updates??
if _, ok := p.Persons.Load(person.GND); ok {
continue
}
@@ -187,27 +185,27 @@ func (p *GNDProvider) FetchPersons(persons []*xmlprovider.Agent) {
p.errmu.Unlock()
wg.Add(1)
go func(person *xmlprovider.Agent) {
go func(person GNDData) {
defer wg.Done()
p.fetchPerson(*person)
}(person)
p.fetchPerson(person.ID, person.GND)
}(person)
}
wg.Wait()
}
func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
SPLITURL := strings.Split(person.GND, "/")
func (p *GNDProvider) fetchPerson(ID, GND string) {
SPLITURL := strings.Split(GND, "/")
if len(SPLITURL) < 2 {
logging.Error(nil, "Error parsing GND ID from: "+person.GND)
logging.Error(nil, "Error parsing GND ID from: "+GND)
return
}
GNDID := SPLITURL[len(SPLITURL)-1]
logging.Debug("Fetching person: " + person.ID + " with URL: " + LOBID_URL + GNDID)
logging.Debug("Fetching person: " + ID + " with URL: " + LOBID_URL + GNDID)
request, err := http.NewRequest("GET", LOBID_URL+GNDID, nil)
if err != nil {
logging.Error(err, "Error creating request: "+person.ID)
logging.Error(err, "Error creating request: "+ID)
return
}
@@ -218,17 +216,17 @@ func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
response, err = http.DefaultClient.Do(request)
if err == nil && response.StatusCode < 400 {
if i > 0 {
logging.Info("Successfully fetched person: " + person.ID + " after " + strconv.Itoa(i) + " retries")
logging.Info("Successfully fetched person: " + ID + " after " + strconv.Itoa(i) + " retries")
}
break
}
time.Sleep(time.Duration(i+1) * time.Second)
logging.Error(err, "Retry fetching person: "+person.ID)
logging.Error(err, "Retry fetching person: "+ID)
}
if err != nil {
logging.Error(err, "Error fetching person: "+person.ID)
logging.Error(err, "Error fetching person: "+ID)
return
}
@@ -237,29 +235,29 @@ func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
if response.StatusCode != http.StatusOK {
if response.StatusCode < 500 {
p.errmu.Lock()
p.errs[person.GND] = response.StatusCode
p.errs[GND] = response.StatusCode
p.errmu.Unlock()
}
logging.Error(errors.New("Error fetching person: " + person.ID + " with status code: " + http.StatusText(response.StatusCode)))
logging.Error(errors.New("Error fetching person: " + ID + " with status code: " + http.StatusText(response.StatusCode)))
return
}
body, err := io.ReadAll(response.Body)
if err != nil {
logging.Error(err, "Error reading response body: "+person.ID)
logging.Error(err, "Error reading response body: "+ID)
return
}
// For debug purposes: Write response body to file:
// os.WriteFile("gnd_responses/"+person.ID+".json", body, 0644)
// os.WriteFile("gnd_responses/"+ID+".json", body, 0644)
gndPerson := Person{}
if err := json.Unmarshal(body, &gndPerson); err != nil {
logging.Error(err, "Error unmarshalling response body: "+person.ID)
logging.Error(err, "Error unmarshalling response body: "+ID)
return
}
gndPerson.KGPZID = person.ID
gndPerson.Agent = person
p.Persons.Store(person.GND, gndPerson)
gndPerson.KGPZID = ID
gndPerson.KGPZURL = GND
p.Persons.Store(GND, gndPerson)
}
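fetchPerson retries with a linearly growing sleep before giving up. That loop, reduced to a standalone sketch (function name, attempt count, and error wrapping are illustrative; the real code also logs each retry and records 4xx statuses in p.errs):

package main

import (
	"fmt"
	"net/http"
	"time"
)

// getWithRetry re-issues a GET until it succeeds or runs out of attempts,
// sleeping 1s, 2s, 3s, ... between tries (the same linear backoff as above).
func getWithRetry(url string, attempts int) (*http.Response, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	var resp *http.Response
	for i := 0; i < attempts; i++ {
		resp, err = http.DefaultClient.Do(req)
		if err == nil {
			if resp.StatusCode < 400 {
				return resp, nil // caller closes the body
			}
			err = fmt.Errorf("status %s", http.StatusText(resp.StatusCode))
			resp.Body.Close() // drain the failed attempt before retrying
		}
		time.Sleep(time.Duration(i+1) * time.Second)
	}
	return nil, fmt.Errorf("GET %s failed after %d attempts: %w", url, attempts, err)
}

func main() {
	if resp, err := getWithRetry("https://example.com/resource", 3); err == nil {
		defer resp.Body.Close()
		fmt.Println(resp.Status)
	}
}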

providers/gnd/helpers.go Normal file
View File

@@ -0,0 +1,17 @@
package gnd
import "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
type GNDData struct {
ID, GND string
}
func ProviderIntoDataset(provider *xmlprovider.XMLProvider[xmlprovider.Agent]) []GNDData {
provider.Lock()
defer provider.Unlock()
var data []GNDData
for _, agent := range provider.Array {
data = append(data, GNDData{ID: agent.ID, GND: agent.GND})
}
return data
}
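ProviderIntoDataset is what lets the GND fetcher shed its dependency on live xmlprovider items (see the Person diff below): the provider is locked only long enough to copy two strings per agent, and the returned []GNDData shares nothing mutable with it. A hypothetical call site, with k.Library.Agents as in kgpz.go above:

data := gnd.ProviderIntoDataset(k.Library.Agents) // provider locked only for the copy
for _, d := range data {
	fmt.Println(d.ID, d.GND) // plain strings, safe to use without any provider lock
}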

View File

@@ -2,13 +2,11 @@ package gnd
import (
"fmt"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
)
type Person struct {
KGPZID string `json:"kgpzid"`
Agent xmlprovider.Agent `json:"agent"`
KGPZURL string `json:"kgpzurl"`
URL string `json:"id,omitempty"`
DateOfBirth []string `json:"dateOfBirth,omitempty"`
PlaceOfBirth []Entity `json:"placeOfBirth,omitempty"`

View File

@@ -0,0 +1,32 @@
package xmlprovider
import (
"encoding/xml"
"io"
"os"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging"
)
func UnmarshalFile[T any](filename string, data T) error {
xmlFile, err := os.Open(filename)
if err != nil {
logging.Error(err, "Could not open file: "+filename)
return err
}
defer xmlFile.Close()
logging.Info("Deserialization: " + filename)
byteValue, err := io.ReadAll(xmlFile)
if err != nil {
logging.Error(err, "Could not read file: "+filename)
return err
}
err = xml.Unmarshal(byteValue, &data)
if err != nil {
logging.Error(err, "Could not unmarshal file: "+filename)
return err
}
return nil
}
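Since UnmarshalFile takes data by value and unmarshals into &data internally, callers must pass a pointer for the result to be visible to them; encoding/xml follows the extra level of indirection. A hypothetical caller (root type and file name are made up; the repo's real roots implement XMLRootElement):

package main

import (
	"encoding/xml"
	"log"

	"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
)

// agentsFile is an illustrative root element, not a type from the repo.
type agentsFile struct {
	XMLName xml.Name            `xml:"agents"`
	Agents  []xmlprovider.Agent `xml:"agent"`
}

func main() {
	var af agentsFile
	// Pass &af: with a non-pointer argument the decoded result would land
	// in UnmarshalFile's own copy and be lost.
	if err := xmlprovider.UnmarshalFile("agents.xml", &af); err != nil {
		log.Fatal(err)
	}
	log.Printf("parsed %d agents", len(af.Agents))
}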

View File

@@ -29,7 +29,11 @@ type Additional struct {
Bis int `xml:"bis"`
}
func (i Issue) GetIDs() []string {
func (i Issue) Keys() []string {
if len(i.keys) > 0 {
return i.keys
}
res := make([]string, 0, 2)
date := i.Datum.When
if date != "" {
@@ -40,6 +44,8 @@ func (i Issue) GetIDs() []string {
res = append(res, i.Datum.When[0:4]+"-"+strconv.Itoa(i.Number.No))
}
i.keys = res
return res
}
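One caveat on the memoization above, which also applies to Piece.Keys and Identifier.Keys below: Keys has a value receiver, so i.keys = res writes to a copy and the cache never survives the call. For the cache to stick, the receiver must be a pointer, sketched here on an illustrative type (which in turn means the XMLItem interface would be satisfied by the pointer type):

type keyed struct {
	keys []string
	id   string
}

// Keys computes the key set once and caches it. The pointer receiver is
// what makes the assignment to k.keys visible to later calls.
func (k *keyed) Keys() []string {
	if len(k.keys) > 0 {
		return k.keys
	}
	k.keys = []string{k.id}
	return k.keys
}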

View File

@@ -4,3 +4,7 @@ type ItemInfo struct {
Source string
Parse *ParseMeta
}
type KeyedItem struct {
keys []string
}

View File

@@ -6,6 +6,7 @@ import (
)
type Library struct {
amu sync.Mutex
Agents *XMLProvider[Agent]
Places *XMLProvider[Place]
Works *XMLProvider[Work]
@@ -19,6 +20,7 @@ func (l *Library) String() string {
l.Agents.String(), l.Places.String(), l.Works.String(), l.Categories.String(), l.Issues.String(), l.Pieces.String())
}
// INFO: this is the only place where the providers are created. There is no need for locking on access.
func NewLibrary(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) *Library {
return &Library{
Agents: &XMLProvider[Agent]{Paths: agentpaths},
@@ -31,6 +33,8 @@ func NewLibrary(agentpaths, placepaths, workpaths, categorypaths, issuepaths, pi
}
func (l *Library) SetPaths(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) {
l.amu.Lock()
defer l.amu.Unlock()
l.Agents.Paths = agentpaths
l.Places.Paths = placepaths
l.Works.Paths = workpaths
@@ -93,14 +97,9 @@ func (l *Library) Serialize(commit string) {
}
wg.Wait()
go func() {
l.Cleanup()
}()
}
// TODO: Prepare resets the list of failed parses for a new parse.
// We need to set the logs accordingly.
func (l *Library) Prepare(commit string) {
l.Agents.Prepare(commit)
l.Places.Prepare(commit)
@@ -111,10 +110,38 @@ func (l *Library) Prepare(commit string) {
}
func (l *Library) Cleanup() {
wg := sync.WaitGroup{}
wg.Add(6)
go func() {
l.Agents.Cleanup()
wg.Done()
}()
go func() {
l.Places.Cleanup()
wg.Done()
}()
go func() {
l.Works.Cleanup()
wg.Done()
}()
go func() {
l.Categories.Cleanup()
wg.Done()
}()
go func() {
l.Issues.Cleanup()
wg.Done()
}()
go func() {
l.Pieces.Cleanup()
wg.Done()
}()
wg.Wait()
}
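The six hand-rolled goroutine blocks in Cleanup could be collapsed into a single helper; a possible refactor sketch (parallel is hypothetical, not a function in this repo):

package xmlprovider

import "sync"

// parallel runs every fn in its own goroutine and blocks until all return.
func parallel(fns ...func()) {
	var wg sync.WaitGroup
	wg.Add(len(fns))
	for _, fn := range fns {
		go func(f func()) {
			defer wg.Done()
			f()
		}(fn)
	}
	wg.Wait()
}

// Cleanup would then shrink to:
//
//	parallel(l.Agents.Cleanup, l.Places.Cleanup, l.Works.Cleanup,
//		l.Categories.Cleanup, l.Issues.Cleanup, l.Pieces.Cleanup)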

View File

@@ -28,7 +28,11 @@ func (p Piece) String() string {
return fmt.Sprintf("ID: %s\nIssueRefs: %v\nPlaceRefs: %v\nCategoryRefs: %v\nAgentRefs: %v\nWorkRefs: %v\nPieceRefs: %v\nAdditionalRef: %v\nIncipit: %v\nTitle: %v\nAnnotations: %v\nNotes: %v\n", p.ID, p.IssueRefs, p.PlaceRefs, p.CategoryRefs, p.AgentRefs, p.WorkRefs, p.PieceRefs, p.AdditionalRef, p.Incipit, p.Title, p.Annotations, p.Notes)
}
func (p Piece) GetIDs() []string {
func (p Piece) Keys() []string {
if len(p.keys) > 0 {
return p.keys
}
ret := make([]string, 0, 2)
if p.ID != "" {
ret = append(ret, p.ID)
@@ -44,6 +48,9 @@ func (p Piece) GetIDs() []string {
for _, i := range p.AdditionalRef {
ret = append(ret, i.Datum+"-"+strconv.Itoa(i.Nr)+"-b-"+uid.String())
}
p.keys = ret
return ret
}

View File

@@ -37,10 +37,15 @@ type Note struct {
type Identifier struct {
ID string `xml:"id,attr"`
KeyedItem
}
func (i Identifier) GetIDs() []string {
return []string{i.ID}
func (i Identifier) Keys() []string {
if len(i.keys) > 0 {
return i.keys
}
i.keys = []string{i.ID}
return i.keys
}
type Reference struct {

View File

@@ -1,10 +1,7 @@
package xmlprovider
import (
"encoding/xml"
"fmt"
"io"
"os"
"slices"
"sync"
"time"
@@ -19,45 +16,40 @@ type ParseMeta struct {
type XMLItem interface {
fmt.Stringer
GetIDs() []string
Keys() []string
}
// An XMLProvider is a struct that holds serialized XML data of a specific type. It combines data from multiple parses if an otherwise successful parse cannot serialize the data from a path.
type XMLProvider[T XMLItem] struct {
Paths []string
// INFO: map is type [string]*T
// INFO: map is type map[string]*T
Items sync.Map
// INFO: map is type map[string]ItemInfo
// It keeps information about parsing status of the items.
Infos sync.Map
mu sync.Mutex
// TODO: This is not populated yet
// TODO: This array is meant for iteration, since iterating over the sync.Map is slow.
// It is best for this array to be sorted by key of the corresponding item.
Array []T
Previous []T
failed []string
parses []ParseMeta
}
// INFO: To parse something, we call Prepare, then Serialize, then Cleanup.
// Serialize can be called concurrently.
// Prepare & Cleanup are called once per parse. Serialize is called for every path
// and can be called concurrently.
func (p *XMLProvider[T]) Prepare(commit string) {
p.mu.Lock()
defer p.mu.Unlock()
p.Previous = p.Array
p.Array = make([]T, 0, len(p.Previous))
p.failed = make([]string, 0)
p.parses = append(p.parses, ParseMeta{Commit: commit, Date: time.Now()})
}
func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path string) error {
if len(p.parses) == 0 {
logging.Error(fmt.Errorf("No commit set"), "No commit set")
return fmt.Errorf("No commit set")
}
p.mu.Lock()
commit := &p.parses[len(p.parses)-1]
p.mu.Unlock()
// Introduce goroutine for every path, locking on append:
if err := UnmarshalFile(path, dataholder); err != nil {
logging.Error(err, "Could not unmarshal file: "+path)
logging.ParseMessages.LogError(logging.Unknown, path, "", "Could not unmarshal file.")
@@ -67,9 +59,18 @@ func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path string) er
return err
}
p.mu.Lock()
if len(p.parses) == 0 {
logging.Error(fmt.Errorf("No commit set"), "No commit set")
return fmt.Errorf("No commit set")
}
commit := &p.parses[len(p.parses)-1]
p.Array = append(p.Array, dataholder.Children()...)
p.mu.Unlock()
for _, item := range dataholder.Children() {
// INFO: Mostly it's just one ID, so the double loop is not that bad.
for _, id := range item.GetIDs() {
for _, id := range item.Keys() {
p.Infos.Store(id, ItemInfo{Source: path, Parse: commit})
p.Items.Store(id, &item)
}
@@ -92,11 +93,20 @@ func (p *XMLProvider[T]) Cleanup() {
lastcommit := &p.parses[len(p.parses)-1]
todelete := make([]string, 0)
toappend := make([]*T, 0)
p.Infos.Range(func(key, value interface{}) bool {
info := value.(ItemInfo)
if info.Parse != lastcommit {
if !slices.Contains(p.failed, info.Source) {
todelete = append(todelete, key.(string))
} else {
item, ok := p.Items.Load(key)
if ok {
i := item.(*T)
if !slices.Contains(toappend, i) {
toappend = append(toappend, i)
}
}
}
}
return true
@@ -106,6 +116,10 @@ func (p *XMLProvider[T]) Cleanup() {
p.Infos.Delete(key)
p.Items.Delete(key)
}
for _, item := range toappend {
p.Array = append(p.Array, *item)
}
}
func (a *XMLProvider[T]) String() string {
@@ -118,29 +132,6 @@ func (a *XMLProvider[T]) String() string {
return s
}
func UnmarshalFile[T any](filename string, data T) error {
xmlFile, err := os.Open(filename)
if err != nil {
logging.Error(err, "Could not open file: "+filename)
return err
}
defer xmlFile.Close()
logging.Info("Deserialization: " + filename)
byteValue, err := io.ReadAll(xmlFile)
if err != nil {
logging.Error(err, "Could not read file: "+filename)
return err
}
err = xml.Unmarshal(byteValue, &data)
if err != nil {
logging.Error(err, "Could not unmarshal file: "+filename)
return err
}
return nil
}
func (p *XMLProvider[T]) Item(id string) *T {
item, ok := p.Items.Load(id)
if !ok {
@@ -173,15 +164,10 @@ func (p *XMLProvider[T]) FindKey(fn func(string) bool) []*T {
return items
}
// INFO: Do not use this, except when iterating over a collection multiple times (three times or more).
// Maps are slow to iterate, but many of the iterations can only be done once, so it doesn't matter for a
// few thousand objects. We prefer to look up objects by key and have multiple meaningful keys, along with
// sensible caching rules to keep the application responsive.
func (p *XMLProvider[T]) Everything() []*T {
var items []*T
p.Items.Range(func(key, value interface{}) bool {
items = append(items, value.(*T))
return true
})
return items
func (p *XMLProvider[T]) Lock() {
p.mu.Lock()
}
func (p *XMLProvider[T]) Unlock() {
p.mu.Unlock()
}
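Putting the lifecycle together as the INFO comment above describes it: Prepare once, Serialize per path (possibly concurrently), Cleanup once, then iterate the new Array under the Lock/Unlock accessors. A sketch under the assumption that some AgentRoot type implements XMLRootElement[Agent] (nothing here is verbatim from the repo):

func parseAgents(p *XMLProvider[Agent]) {
	p.Prepare("3ef30ef") // once per parse: rotates Array into Previous

	var wg sync.WaitGroup
	for _, path := range p.Paths {
		wg.Add(1)
		go func(path string) {
			defer wg.Done()
			// one fresh root per goroutine; Serialize locks internally
			_ = p.Serialize(&AgentRoot{}, path)
		}(path)
	}
	wg.Wait()

	p.Cleanup() // once per parse: drops stale items, re-adds failed sources

	p.Lock() // the accessors added at the end of this diff
	for _, a := range p.Array {
		_ = a.ID // fast slice iteration instead of Items.Range
	}
	p.Unlock()
}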