diff --git a/app/kgpz.go b/app/kgpz.go index 08b5c6e..42e9b5d 100644 --- a/app/kgpz.go +++ b/app/kgpz.go @@ -23,9 +23,8 @@ const ( ) type KGPZ struct { - // LMU is here for file system access - lmu sync.Mutex // GMU is only here to prevent concurrent pulls + // or file system operations while parsing gmu sync.Mutex Config *providers.ConfigProvider Repo *providers.GitProvider @@ -78,28 +77,23 @@ func (k *KGPZ) Enrich() error { k.InitGND() } - k.lmu.Lock() - defer k.lmu.Unlock() - if k.Library == nil || k.Library.Agents == nil { return nil } - // INFO: We pass agents by value since we don't want to block the library - agents := k.Library.Agents.Everything() - go func(agents []*xmlprovider.Agent) { - k.GND.FetchPersons(agents) + // TODO: Library locking is never needed, since the library items, once set, are never changed + // We only need to check if set + go func() { + data := gnd.ProviderIntoDataset(k.Library.Agents) + k.GND.FetchPersons(data) k.GND.WriteCache(k.Config.GNDPath) - }(agents) + }() return nil } func (k *KGPZ) Serialize() { // TODO: this is error handling from hell - // There is no need to recreate the whole library if the paths haven't changed - // We do it to keep the old data if the new data is missing - // Preventing pulling and serializing at the same time k.gmu.Lock() defer k.gmu.Unlock() @@ -115,23 +109,15 @@ func (k *KGPZ) Serialize() { pieces, err := getXMLFiles(filepath.Join(k.Config.FolderPath, PIECES_DIR)) helpers.Assert(err, "Error getting pieces") - k.lmu.Lock() - defer k.lmu.Unlock() if k.Library == nil { - lib := xmlprovider.NewLibrary( + k.Library = xmlprovider.NewLibrary( []string{filepath.Join(k.Config.FolderPath, AGENTS_PATH)}, []string{filepath.Join(k.Config.FolderPath, PLACES_PATH)}, []string{filepath.Join(k.Config.FolderPath, WORKS_PATH)}, []string{filepath.Join(k.Config.FolderPath, CATEGORIES_PATH)}, *issues, *pieces) - - lib.Serialize(commit) - - k.Library = lib } else { - // TODO: where to clear the old data? - // How to differentiate between deleted data points and stale data points bc of parse errors? k.Library.SetPaths( []string{filepath.Join(k.Config.FolderPath, AGENTS_PATH)}, []string{filepath.Join(k.Config.FolderPath, PLACES_PATH)}, @@ -139,8 +125,8 @@ func (k *KGPZ) Serialize() { []string{filepath.Join(k.Config.FolderPath, CATEGORIES_PATH)}, *issues, *pieces) - k.Library.Serialize(commit) } + k.Library.Serialize(commit) } func (k *KGPZ) IsDebug() bool { diff --git a/providers/gnd/gnd.go b/providers/gnd/gnd.go index 9299025..03b2864 100644 --- a/providers/gnd/gnd.go +++ b/providers/gnd/gnd.go @@ -13,7 +13,6 @@ import ( "time" "github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging" - "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider" ) const ( @@ -77,7 +76,6 @@ func (p *GNDProvider) readPersons(folder string) error { func (p *GNDProvider) readPerson(file string) { person := Person{} - // JSON unmarshalling of the file and sanity check: f, err := os.Open(file) if err != nil { logging.Error(err, "Error opening file for reading: "+file) @@ -96,8 +94,8 @@ func (p *GNDProvider) readPerson(file string) { return } - if person.Agent.GND != "" { - p.Persons.Store(person.Agent.GND, person) + if person.KGPZURL != "" { + p.Persons.Store(person.KGPZURL, person) return } } @@ -168,14 +166,14 @@ func (p *GNDProvider) Person(id string) *Person { return &pers } -func (p *GNDProvider) FetchPersons(persons []*xmlprovider.Agent) { +func (p *GNDProvider) FetchPersons(persons []GNDData) { wg := sync.WaitGroup{} for _, person := range persons { if person.ID == "" || person.GND == "" { continue } - // INFO: person already fetched; check for updates?? + // TODO: person already fetched; check for updates?? if _, ok := p.Persons.Load(person.GND); ok { continue } @@ -187,27 +185,27 @@ func (p *GNDProvider) FetchPersons(persons []*xmlprovider.Agent) { p.errmu.Unlock() wg.Add(1) - go func(person *xmlprovider.Agent) { + go func(person *GNDData) { defer wg.Done() - p.fetchPerson(*person) - }(person) + p.fetchPerson(person.ID, person.GND) + }(&person) } wg.Wait() } -func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) { - SPLITURL := strings.Split(person.GND, "/") +func (p *GNDProvider) fetchPerson(ID, GND string) { + SPLITURL := strings.Split(GND, "/") if len(SPLITURL) < 2 { - logging.Error(nil, "Error parsing GND ID from: "+person.GND) + logging.Error(nil, "Error parsing GND ID from: "+GND) return } GNDID := SPLITURL[len(SPLITURL)-1] - logging.Debug("Fetching person: " + person.ID + " with URL: " + LOBID_URL + GNDID) + logging.Debug("Fetching person: " + ID + " with URL: " + LOBID_URL + GNDID) request, err := http.NewRequest("GET", LOBID_URL+GNDID, nil) if err != nil { - logging.Error(err, "Error creating request: "+person.ID) + logging.Error(err, "Error creating request: "+ID) return } @@ -218,17 +216,17 @@ func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) { response, err = http.DefaultClient.Do(request) if err == nil && response.StatusCode < 400 { if i > 0 { - logging.Info("Successfully fetched person: " + person.ID + " after " + strconv.Itoa(i) + " retries") + logging.Info("Successfully fetched person: " + ID + " after " + strconv.Itoa(i) + " retries") } break } time.Sleep(time.Duration(i+1) * time.Second) - logging.Error(err, "Retry fetching person: "+person.ID) + logging.Error(err, "Retry fetching person: "+ID) } if err != nil { - logging.Error(err, "Error fetching person: "+person.ID) + logging.Error(err, "Error fetching person: "+ID) return } @@ -237,29 +235,29 @@ func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) { if response.StatusCode != http.StatusOK { if response.StatusCode < 500 { p.errmu.Lock() - p.errs[person.GND] = response.StatusCode + p.errs[GND] = response.StatusCode p.errmu.Unlock() } - logging.Error(errors.New("Error fetching person: " + person.ID + " with status code: " + http.StatusText(response.StatusCode))) + logging.Error(errors.New("Error fetching person: " + ID + " with status code: " + http.StatusText(response.StatusCode))) return } body, err := io.ReadAll(response.Body) if err != nil { - logging.Error(err, "Error reading response body: "+person.ID) + logging.Error(err, "Error reading response body: "+ID) return } // For debug purposes: Write response body to file: - // os.WriteFile("gnd_responses/"+person.ID+".json", body, 0644) + // os.WriteFile("gnd_responses/"+ID+".json", body, 0644) gndPerson := Person{} if err := json.Unmarshal(body, &gndPerson); err != nil { - logging.Error(err, "Error unmarshalling response body: "+person.ID) + logging.Error(err, "Error unmarshalling response body: "+ID) return } - gndPerson.KGPZID = person.ID - gndPerson.Agent = person - p.Persons.Store(person.GND, gndPerson) + gndPerson.KGPZID = ID + gndPerson.KGPZURL = GND + p.Persons.Store(GND, gndPerson) } diff --git a/providers/gnd/helpers.go b/providers/gnd/helpers.go new file mode 100644 index 0000000..038500e --- /dev/null +++ b/providers/gnd/helpers.go @@ -0,0 +1,17 @@ +package gnd + +import "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider" + +type GNDData struct { + ID, GND string +} + +func ProviderIntoDataset(provider *xmlprovider.XMLProvider[xmlprovider.Agent]) []GNDData { + provider.Lock() + defer provider.Unlock() + var data []GNDData + for _, agent := range provider.Array { + data = append(data, GNDData{ID: agent.ID, GND: agent.GND}) + } + return data +} diff --git a/providers/gnd/model.go b/providers/gnd/model.go index 62a5698..a758eb4 100644 --- a/providers/gnd/model.go +++ b/providers/gnd/model.go @@ -2,13 +2,11 @@ package gnd import ( "fmt" - - "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider" ) type Person struct { KGPZID string `json:"kgpzid"` - Agent xmlprovider.Agent `json:"agent"` + KGPZURL string `json:"kgpzurl"` URL string `json:"id,omitempty"` DateOfBirth []string `json:"dateOfBirth,omitempty"` PlaceOfBirth []Entity `json:"placeOfBirth,omitempty"` diff --git a/providers/xmlprovider/helpers.go b/providers/xmlprovider/helpers.go new file mode 100644 index 0000000..8d653b8 --- /dev/null +++ b/providers/xmlprovider/helpers.go @@ -0,0 +1,32 @@ +package xmlprovider + +import ( + "encoding/xml" + "io" + "os" + + "github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging" +) + +func UnmarshalFile[T any](filename string, data T) error { + xmlFile, err := os.Open(filename) + if err != nil { + logging.Error(err, "Could not open file: "+filename) + return err + } + defer xmlFile.Close() + + logging.Info("Deserialization: " + filename) + byteValue, err := io.ReadAll(xmlFile) + if err != nil { + logging.Error(err, "Could not read file: "+filename) + return err + } + err = xml.Unmarshal(byteValue, &data) + + if err != nil { + logging.Error(err, "Could not unmarshal file: "+filename) + return err + } + return nil +} diff --git a/providers/xmlprovider/issues.go b/providers/xmlprovider/issues.go index b4d6bc1..0d37319 100644 --- a/providers/xmlprovider/issues.go +++ b/providers/xmlprovider/issues.go @@ -29,7 +29,11 @@ type Additional struct { Bis int `xml:"bis"` } -func (i Issue) GetIDs() []string { +func (i Issue) Keys() []string { + if len(i.keys) > 0 { + return i.keys + } + res := make([]string, 2) date := i.Datum.When if date != "" { @@ -40,6 +44,8 @@ func (i Issue) GetIDs() []string { res = append(res, i.Datum.When[0:4]+"-"+strconv.Itoa(i.Number.No)) } + i.keys = res + return res } diff --git a/providers/xmlprovider/item.go b/providers/xmlprovider/item.go index 1fef1ef..ef8ae15 100644 --- a/providers/xmlprovider/item.go +++ b/providers/xmlprovider/item.go @@ -4,3 +4,7 @@ type ItemInfo struct { Source string Parse *ParseMeta } + +type KeyedItem struct { + keys []string +} diff --git a/providers/xmlprovider/library.go b/providers/xmlprovider/library.go index 3018c9b..279aa4b 100644 --- a/providers/xmlprovider/library.go +++ b/providers/xmlprovider/library.go @@ -6,6 +6,7 @@ import ( ) type Library struct { + amu sync.Mutex Agents *XMLProvider[Agent] Places *XMLProvider[Place] Works *XMLProvider[Work] @@ -19,6 +20,7 @@ func (l *Library) String() string { l.Agents.String(), l.Places.String(), l.Works.String(), l.Categories.String(), l.Issues.String(), l.Pieces.String()) } +// INFO: this is the only place where the providers are created. There is no need for locking on access. func NewLibrary(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) *Library { return &Library{ Agents: &XMLProvider[Agent]{Paths: agentpaths}, @@ -31,6 +33,8 @@ func NewLibrary(agentpaths, placepaths, workpaths, categorypaths, issuepaths, pi } func (l *Library) SetPaths(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) { + l.amu.Lock() + defer l.amu.Unlock() l.Agents.Paths = agentpaths l.Places.Paths = placepaths l.Works.Paths = workpaths @@ -93,14 +97,9 @@ func (l *Library) Serialize(commit string) { } wg.Wait() - - go func() { - l.Cleanup() - }() + l.Cleanup() } -// TODO: Prepare resets the list of failed parses for a new parse. -// We need to set the logs accordingly. func (l *Library) Prepare(commit string) { l.Agents.Prepare(commit) l.Places.Prepare(commit) @@ -111,10 +110,38 @@ func (l *Library) Prepare(commit string) { } func (l *Library) Cleanup() { - l.Agents.Cleanup() - l.Places.Cleanup() - l.Works.Cleanup() - l.Categories.Cleanup() - l.Issues.Cleanup() - l.Pieces.Cleanup() + wg := sync.WaitGroup{} + wg.Add(6) + + go func() { + l.Agents.Cleanup() + wg.Done() + }() + + go func() { + l.Places.Cleanup() + wg.Done() + }() + + go func() { + l.Works.Cleanup() + wg.Done() + }() + + go func() { + l.Categories.Cleanup() + wg.Done() + }() + + go func() { + l.Issues.Cleanup() + wg.Done() + }() + + go func() { + l.Pieces.Cleanup() + wg.Done() + }() + + wg.Wait() } diff --git a/providers/xmlprovider/pieces.go b/providers/xmlprovider/pieces.go index 873369a..71d92d9 100644 --- a/providers/xmlprovider/pieces.go +++ b/providers/xmlprovider/pieces.go @@ -28,7 +28,11 @@ func (p Piece) String() string { return fmt.Sprintf("ID: %s\nIssueRefs: %v\nPlaceRefs: %v\nCategoryRefs: %v\nAgentRefs: %v\nWorkRefs: %v\nPieceRefs: %v\nAdditionalRef: %v\nIncipit: %v\nTitle: %v\nAnnotations: %v\nNotes: %v\n", p.ID, p.IssueRefs, p.PlaceRefs, p.CategoryRefs, p.AgentRefs, p.WorkRefs, p.PieceRefs, p.AdditionalRef, p.Incipit, p.Title, p.Annotations, p.Notes) } -func (p Piece) GetIDs() []string { +func (p Piece) Keys() []string { + if len(p.keys) > 0 { + return p.keys + } + ret := make([]string, 2) if p.ID != "" { ret = append(ret, p.ID) @@ -44,6 +48,9 @@ func (p Piece) GetIDs() []string { for _, i := range p.AdditionalRef { ret = append(ret, i.Datum+"-"+strconv.Itoa(i.Nr)+"-b-"+uid.String()) } + + p.keys = ret + return ret } diff --git a/providers/xmlprovider/xmlcommon.go b/providers/xmlprovider/xmlcommon.go index 834e9ab..392411a 100644 --- a/providers/xmlprovider/xmlcommon.go +++ b/providers/xmlprovider/xmlcommon.go @@ -37,10 +37,15 @@ type Note struct { type Identifier struct { ID string `xml:"id,attr"` + KeyedItem } -func (i Identifier) GetIDs() []string { - return []string{i.ID} +func (i Identifier) Keys() []string { + if len(i.keys) > 0 { + return i.keys + } + i.keys = []string{i.ID} + return i.keys } type Reference struct { diff --git a/providers/xmlprovider/xmlprovider.go b/providers/xmlprovider/xmlprovider.go index da30bec..76924d2 100644 --- a/providers/xmlprovider/xmlprovider.go +++ b/providers/xmlprovider/xmlprovider.go @@ -1,10 +1,7 @@ package xmlprovider import ( - "encoding/xml" "fmt" - "io" - "os" "slices" "sync" "time" @@ -19,45 +16,40 @@ type ParseMeta struct { type XMLItem interface { fmt.Stringer - GetIDs() []string + Keys() []string } // An XMLProvider is a struct that holds holds serialized XML data of a specific type. It combines multiple parses IF a succeeded parse can not serialize the data from a path. type XMLProvider[T XMLItem] struct { Paths []string - // INFO: map is type [string]*T + // INFO: map is type map[string]*T Items sync.Map // INFO: map is type [string]ItemInfo // It keeps information about parsing status of the items. Infos sync.Map mu sync.Mutex - // TODO: This is not populated yet - Array []T - failed []string - parses []ParseMeta + // TODO: This array is meant to be for iteration purposes, since iteration over the sync.Map is slow. + // It is best for this array to be sorted by key of the corresponding item. + Array []T + Previous []T + failed []string + parses []ParseMeta } // INFO: To parse sth, we call Prepare, then Serialize, then Cleanup. -// Serialize can be called concurretly. +// Prepare & Cleanup are called once per parse. Serialize is called for every path. +// and can be called concurretly. func (p *XMLProvider[T]) Prepare(commit string) { p.mu.Lock() defer p.mu.Unlock() + p.Previous = p.Array + p.Array = make([]T, len(p.Previous)) p.failed = make([]string, 0) p.parses = append(p.parses, ParseMeta{Commit: commit, Date: time.Now()}) } func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path string) error { - if len(p.parses) == 0 { - logging.Error(fmt.Errorf("No commit set"), "No commit set") - return fmt.Errorf("No commit set") - } - - p.mu.Lock() - commit := &p.parses[len(p.parses)-1] - p.mu.Unlock() - - // Introduce goroutine for every path, locking on append: if err := UnmarshalFile(path, dataholder); err != nil { logging.Error(err, "Could not unmarshal file: "+path) logging.ParseMessages.LogError(logging.Unknown, path, "", "Could not unmarshal file.") @@ -67,9 +59,18 @@ func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path string) er return err } + p.mu.Lock() + if len(p.parses) == 0 { + logging.Error(fmt.Errorf("No commit set"), "No commit set") + return fmt.Errorf("No commit set") + } + commit := &p.parses[len(p.parses)-1] + p.Array = append(p.Array, dataholder.Children()...) + p.mu.Unlock() + for _, item := range dataholder.Children() { // INFO: Mostly it's just one ID, so the double loop is not that bad. - for _, id := range item.GetIDs() { + for _, id := range item.Keys() { p.Infos.Store(id, ItemInfo{Source: path, Parse: commit}) p.Items.Store(id, &item) } @@ -92,11 +93,20 @@ func (p *XMLProvider[T]) Cleanup() { lastcommit := &p.parses[len(p.parses)-1] todelete := make([]string, 0) + toappend := make([]*T, 0) p.Infos.Range(func(key, value interface{}) bool { info := value.(ItemInfo) if info.Parse != lastcommit { if !slices.Contains(p.failed, info.Source) { todelete = append(todelete, key.(string)) + } else { + item, ok := p.Items.Load(key) + if ok { + i := item.(*T) + if !slices.Contains(toappend, i) { + toappend = append(toappend, i) + } + } } } return true @@ -106,6 +116,10 @@ func (p *XMLProvider[T]) Cleanup() { p.Infos.Delete(key) p.Items.Delete(key) } + + for _, item := range toappend { + p.Array = append(p.Array, *item) + } } func (a *XMLProvider[T]) String() string { @@ -118,29 +132,6 @@ func (a *XMLProvider[T]) String() string { return s } -func UnmarshalFile[T any](filename string, data T) error { - xmlFile, err := os.Open(filename) - if err != nil { - logging.Error(err, "Could not open file: "+filename) - return err - } - defer xmlFile.Close() - - logging.Info("Deserialization: " + filename) - byteValue, err := io.ReadAll(xmlFile) - if err != nil { - logging.Error(err, "Could not read file: "+filename) - return err - } - err = xml.Unmarshal(byteValue, &data) - - if err != nil { - logging.Error(err, "Could not unmarshal file: "+filename) - return err - } - return nil -} - func (p *XMLProvider[T]) Item(id string) *T { item, ok := p.Items.Load(id) if !ok { @@ -173,15 +164,10 @@ func (p *XMLProvider[T]) FindKey(fn func(string) bool) []*T { return items } -// INFO: Do not use this, except when iterating over a collection multiple times (three times or more). -// Maps are slow to iterate, but many of the Iterations can only be done once, so it doesn´t matter for a -// few thousand objects. We prefer to lookup objects by key and have multiple meaningful keys; along with -// sensible caching rules to keep the application responsive. -func (p *XMLProvider[T]) Everything() []*T { - var items []*T - p.Items.Range(func(key, value interface{}) bool { - items = append(items, value.(*T)) - return true - }) - return items +func (p *XMLProvider[T]) Lock() { + p.mu.Lock() +} + +func (p *XMLProvider[T]) Unlock() { + p.mu.Unlock() }