Better GND data fetching

This commit is contained in:
Simon Martens
2024-11-20 03:54:49 +01:00
parent c86fed3cbe
commit 75f762a042
4 changed files with 103 additions and 40 deletions

View File

@@ -23,7 +23,9 @@ const (
) )
type KGPZ struct { type KGPZ struct {
lmu sync.Mutex // LMU is here for file system access
lmu sync.Mutex
// GMU is only here to prevent concurrent pulls
gmu sync.Mutex gmu sync.Mutex
Config *providers.ConfigProvider Config *providers.ConfigProvider
Repo *providers.GitProvider Repo *providers.GitProvider
@@ -61,10 +63,6 @@ func NewKGPZ(config *providers.ConfigProvider) *KGPZ {
} }
func (k *KGPZ) InitGND() { func (k *KGPZ) InitGND() {
k.gmu.Lock()
defer k.gmu.Unlock()
k.lmu.Lock()
defer k.lmu.Unlock()
if k.GND == nil { if k.GND == nil {
k.GND = gnd.NewGNDProvider() k.GND = gnd.NewGNDProvider()
} }
@@ -81,14 +79,14 @@ func (k *KGPZ) Enrich() error {
k.lmu.Lock() k.lmu.Lock()
defer k.lmu.Unlock() defer k.lmu.Unlock()
k.gmu.Lock()
defer k.gmu.Unlock()
if k.Library == nil || k.Library.Agents == nil { if k.Library == nil || k.Library.Agents == nil {
return nil return nil
} }
agents := k.Library.Agents.Items.Agents // INFO: We pass agents by value since we don't want to block the library
agents := make([]xmlprovider.Agent, len(k.Library.Agents.Items.Agents))
_ = copy(agents, k.Library.Agents.Items.Agents)
go func(agents []xmlprovider.Agent) { go func(agents []xmlprovider.Agent) {
k.GND.FetchPersons(agents) k.GND.FetchPersons(agents)
k.GND.WriteCache(k.Config.GNDPath) k.GND.WriteCache(k.Config.GNDPath)

View File

@@ -1,4 +1,3 @@
> curl --header "Accept: application/json" https://lobid.org/gnd/11854523X > lobid_example.json
{ {
"gender" : [ { "gender" : [ {
"id" : "https://d-nb.info/standards/vocab/gnd/gender#male", "id" : "https://d-nb.info/standards/vocab/gnd/gender#male",

View File

@@ -2,12 +2,15 @@ package gnd
import ( import (
"encoding/json" "encoding/json"
"errors"
"io" "io"
"net/http" "net/http"
"os" "os"
"path/filepath" "path/filepath"
"strconv"
"strings" "strings"
"sync" "sync"
"time"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging" "github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider" "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
@@ -18,13 +21,18 @@ const (
) )
type GNDProvider struct { type GNDProvider struct {
// Mutex is for file reading & writing // Mutex is for file reading & writing, not person map access
mu sync.Mutex mu sync.Mutex
Persons sync.Map Persons sync.Map
errmu sync.Mutex
errs map[string]int
} }
func NewGNDProvider() *GNDProvider { func NewGNDProvider() *GNDProvider {
return &GNDProvider{} return &GNDProvider{
errs: make(map[string]int),
}
} }
func (p *GNDProvider) ReadCache(folder string) error { func (p *GNDProvider) ReadCache(folder string) error {
@@ -86,10 +94,11 @@ func (p *GNDProvider) readPerson(file string) {
return return
} }
if person.KGPZID != "" { if person.Agent.GND != "" {
p.Persons.Store(person.KGPZID, person) p.Persons.Store(person.Agent.GND, person)
return return
} }
} }
func (p *GNDProvider) WriteCache(folder string) error { func (p *GNDProvider) WriteCache(folder string) error {
@@ -101,6 +110,7 @@ func (p *GNDProvider) WriteCache(folder string) error {
return nil return nil
} }
// TODO: Don't write persons that have already been written
func (p *GNDProvider) writePersons(folder string) error { func (p *GNDProvider) writePersons(folder string) error {
info, err := os.Stat(folder) info, err := os.Stat(folder)
if err == os.ErrNotExist { if err == os.ErrNotExist {
@@ -126,7 +136,7 @@ func (p *GNDProvider) writePersons(folder string) error {
func (p *GNDProvider) writePerson(folder, id string, person Person) { func (p *GNDProvider) writePerson(folder, id string, person Person) {
// JSON marshalling of the person and sanity check: // JSON marshalling of the person and sanity check:
filepath := filepath.Join(folder, id+".json") filepath := filepath.Join(folder, person.KGPZID+".json")
f, err := os.Create(filepath) f, err := os.Create(filepath)
if err != nil { if err != nil {
logging.Error(err, "Error creating file for writing: "+id) logging.Error(err, "Error creating file for writing: "+id)
@@ -157,18 +167,25 @@ func (p *GNDProvider) GetPerson(id string) (Person, error) {
func (p *GNDProvider) FetchPersons(persons []xmlprovider.Agent) { func (p *GNDProvider) FetchPersons(persons []xmlprovider.Agent) {
wg := sync.WaitGroup{} wg := sync.WaitGroup{}
for _, person := range persons { for _, person := range persons {
if person.ID == "" { if person.ID == "" || person.GND == "" {
continue continue
} }
if _, ok := p.Persons.Load(person.ID); ok {
// INFO: person already fetched; check for updates??
if _, ok := p.Persons.Load(person.GND); ok {
continue continue
} }
// Skip agents whose GND lookup already ended with a recorded error status.
// The lock is released before branching: taking `continue` while errmu is
// still held would leave it locked, so the next iteration's Lock() (and any
// fetchPerson call recording an error) would block forever.
p.errmu.Lock()
_, failed := p.errs[person.GND]
p.errmu.Unlock()
if failed {
	continue
}
wg.Add(1) wg.Add(1)
go func(person xmlprovider.Agent) { go func(person xmlprovider.Agent) {
defer wg.Done() defer wg.Done()
if person.GND != "" { p.fetchPerson(person)
p.fetchPerson(person)
}
}(person) }(person)
} }
wg.Wait() wg.Wait()
@@ -177,23 +194,48 @@ func (p *GNDProvider) FetchPersons(persons []xmlprovider.Agent) {
func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) { func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
SPLITURL := strings.Split(person.GND, "/") SPLITURL := strings.Split(person.GND, "/")
if len(SPLITURL) < 2 { if len(SPLITURL) < 2 {
logging.Error(nil, "Error parsing GND ID: "+person.GND) logging.Error(nil, "Error parsing GND ID from: "+person.GND)
return return
} }
GNDID := SPLITURL[len(SPLITURL)-1] GNDID := SPLITURL[len(SPLITURL)-1]
logging.Debug("Fetching person: " + person.ID + " with URL: " + LOBID_URL + GNDID) logging.Debug("Fetching person: " + person.ID + " with URL: " + LOBID_URL + GNDID)
request, _ := http.NewRequest("GET", LOBID_URL+GNDID, nil) request, err := http.NewRequest("GET", LOBID_URL+GNDID, nil)
response, err := http.DefaultClient.Do(request) if err != nil {
logging.Error(err, "Error creating request: "+person.ID)
return
}
// Try the request up to maxAttempts times. Transport errors and 5xx
// responses are retried with a linearly growing pause; any response with a
// status below 500 ends the loop immediately.
const maxAttempts = 3
var response *http.Response
for i := 0; i < maxAttempts; i++ {
	response, err = http.DefaultClient.Do(request)
	if err == nil && response.StatusCode < 500 {
		if i > 0 {
			logging.Info("Successfully fetched person: " + person.ID + " after " + strconv.Itoa(i) + " retries")
		}
		break
	}
	if i == maxAttempts-1 {
		// Out of attempts: keep err / the final response for the checks
		// that follow instead of sleeping and logging another retry.
		break
	}
	if err == nil {
		// This 5xx response is about to be replaced by the next attempt;
		// drain and close its body so the connection is released rather
		// than leaked. The drain result is irrelevant, hence ignored.
		_, _ = io.Copy(io.Discard, response.Body)
		response.Body.Close()
	}
	time.Sleep(time.Duration(i+1) * time.Second)
	logging.Error(err, "Retry fetching person: "+person.ID)
}
if err != nil { if err != nil {
logging.Error(err, "Error fetching person: "+person.ID) logging.Error(err, "Error fetching person: "+person.ID)
return return
} }
defer response.Body.Close() defer response.Body.Close()
if response.StatusCode != http.StatusOK { if response.StatusCode != http.StatusOK {
logging.Error(nil, "Error fetching person: "+person.ID+" with status code: "+response.Status) if response.StatusCode < 500 {
p.errmu.Lock()
p.errs[person.GND] = response.StatusCode
p.errmu.Unlock()
}
logging.Error(errors.New("Error fetching person: " + person.ID + " with status code: " + http.StatusText(response.StatusCode)))
return return
} }
@@ -203,6 +245,9 @@ func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
return return
} }
// Write response body to file:
// os.WriteFile("gnd_responses/"+person.ID+".json", body, 0644)
gndPerson := Person{} gndPerson := Person{}
if err := json.Unmarshal(body, &gndPerson); err != nil { if err := json.Unmarshal(body, &gndPerson); err != nil {
logging.Error(err, "Error unmarshalling response body: "+person.ID) logging.Error(err, "Error unmarshalling response body: "+person.ID)
@@ -210,5 +255,6 @@ func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
} }
gndPerson.KGPZID = person.ID gndPerson.KGPZID = person.ID
p.Persons.Store(person.ID, gndPerson) gndPerson.Agent = person
p.Persons.Store(person.GND, gndPerson)
} }

View File

@@ -1,22 +1,35 @@
package gnd package gnd
import (
"fmt"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
)
type Person struct { type Person struct {
KGPZID string `json:"kgpzid"` KGPZID string `json:"kgpzid"`
URL string `json:"id"` Agent xmlprovider.Agent `json:"agent"`
DateOfDeath []string `json:"dateOfDeath"` URL string `json:"id"`
PlaceOfDeath []Entity `json:"placeOfDeath"` DateOfBirth []string `json:"dateOfBirth"`
BibliographicalOrHistoricalInformation []string `json:"bibliographicalOrHistoricalInformation"` PlaceOfBirth []Entity `json:"placeOfBirth"`
PreferredName string `json:"preferredName"` DateOfDeath []string `json:"dateOfDeath"`
GndIdentifier string `json:"gndIdentifier"` PlaceOfDeath []Entity `json:"placeOfDeath"`
Wikipedia []Entity `json:"wikipedia"` PlaceOfBirthAsLiteral []string `json:"placeOfBirthAsLiteral"`
Depiction []Picture `json:"depiction"` PlaceOfDeathAsLiteral []string `json:"placeOfDeathAsLiteral"`
ProfessionOrOccupation []Entity `json:"professionOrOccupation"` BiographicalOrHistoricalInformation []string `json:"biographicalOrHistoricalInformation"`
PreferredEntityForThePerson []PersonNameEntity `json:"preferredEntityForThePerson"` PreferredName string `json:"preferredName"`
DateOfBirth []string `json:"dateOfBirth"` GndIdentifier string `json:"gndIdentifier"`
PlaceOfBirth []Entity `json:"placeOfBirth"` Wikipedia []Entity `json:"wikipedia"`
VariantNameEntityForThePerson []PersonNameEntity `json:"variantNameEntityForThePerson"` Depiction []Picture `json:"depiction"`
VariantName []string `json:"variantName"` ProfessionOrOccupation []Entity `json:"professionOrOccupation"`
SameAs []CrossReferences `json:"sameAs"` PreferredNameEntityForThePerson PersonNameEntity `json:"preferredNameEntityForThePerson"`
VariantNameEntityForThePerson []PersonNameEntity `json:"variantNameEntityForThePerson"`
VariantName []string `json:"variantName"`
SameAs []CrossReferences `json:"sameAs"`
Pseudonym []Entity `json:"pseudonym"`
GNDSubjectCategory []Entity `json:"gndSubjectCategory"`
Type []string `json:"type"`
PlaceOfActivity []Entity `json:"placeOfActivity"`
} }
type CrossReferences struct { type CrossReferences struct {
@@ -49,8 +62,15 @@ type Entity struct {
} }
type PersonNameEntity struct { type PersonNameEntity struct {
Prefix []string `json:"prefix"`
Counting []string `json:"counting"`
Forename []string `json:"forename"` Forename []string `json:"forename"`
Surname []string `json:"surname"` Surname []string `json:"surname"`
PersonalName []string `json:"personalName"` PersonalName []string `json:"personalName"`
NameAddition []string `json:"nameAddition"` NameAddition []string `json:"nameAddition"`
} }
// String returns a single-line text representation of the person in the
// form "Person{Field: value, ...}". Every field of Person is included,
// labelled with its Go field name and formatted with %v. The fields that
// were previously missing from the output are appended after SameAs, so the
// leading part of the string keeps its earlier order.
func (p Person) String() string {
	return fmt.Sprintf(
		"Person{KGPZID: %v, URL: %v, DateOfDeath: %v, PlaceOfDeath: %v, "+
			"BiographicalOrHistoricalInformation: %v, PreferredName: %v, GndIdentifier: %v, "+
			"Wikipedia: %v, Depiction: %v, ProfessionOrOccupation: %v, "+
			"PreferredNameEntityForThePerson: %v, DateOfBirth: %v, PlaceOfBirth: %v, "+
			"VariantNameEntityForThePerson: %v, VariantName: %v, SameAs: %v, "+
			"Agent: %v, PlaceOfBirthAsLiteral: %v, PlaceOfDeathAsLiteral: %v, "+
			"Pseudonym: %v, GNDSubjectCategory: %v, Type: %v, PlaceOfActivity: %v}",
		p.KGPZID, p.URL, p.DateOfDeath, p.PlaceOfDeath,
		p.BiographicalOrHistoricalInformation, p.PreferredName, p.GndIdentifier,
		p.Wikipedia, p.Depiction, p.ProfessionOrOccupation,
		p.PreferredNameEntityForThePerson, p.DateOfBirth, p.PlaceOfBirth,
		p.VariantNameEntityForThePerson, p.VariantName, p.SameAs,
		p.Agent, p.PlaceOfBirthAsLiteral, p.PlaceOfDeathAsLiteral,
		p.Pseudonym, p.GNDSubjectCategory, p.Type, p.PlaceOfActivity,
	)
}