diff --git a/app/kgpz.go b/app/kgpz.go
index acd7d60..68f5c35 100644
--- a/app/kgpz.go
+++ b/app/kgpz.go
@@ -23,7 +23,9 @@ const (
 )
 
 type KGPZ struct {
-	lmu     sync.Mutex
+	// LMU is here for file system access
+	lmu sync.Mutex
+	// GMU is only here to prevent concurrent pulls
 	gmu     sync.Mutex
 	Config  *providers.ConfigProvider
 	Repo    *providers.GitProvider
@@ -61,10 +63,6 @@ func NewKGPZ(config *providers.ConfigProvider) *KGPZ {
 }
 
 func (k *KGPZ) InitGND() {
-	k.gmu.Lock()
-	defer k.gmu.Unlock()
-	k.lmu.Lock()
-	defer k.lmu.Unlock()
 	if k.GND == nil {
 		k.GND = gnd.NewGNDProvider()
 	}
@@ -81,14 +79,14 @@ func (k *KGPZ) Enrich() error {
 	k.lmu.Lock()
 	defer k.lmu.Unlock()
 
-	k.gmu.Lock()
-	defer k.gmu.Unlock()
 
 	if k.Library == nil || k.Library.Agents == nil {
 		return nil
 	}
 
-	agents := k.Library.Agents.Items.Agents
+	// INFO: We pass agents by value since we don't want to block the library
+	agents := make([]xmlprovider.Agent, len(k.Library.Agents.Items.Agents))
+	_ = copy(agents, k.Library.Agents.Items.Agents)
 	go func(agents []xmlprovider.Agent) {
 		k.GND.FetchPersons(agents)
 		k.GND.WriteCache(k.Config.GNDPath)
diff --git a/lobid_example.json b/lobid_example.json
index dcd9f96..94e1d45 100644
--- a/lobid_example.json
+++ b/lobid_example.json
@@ -1,4 +1,3 @@
-> curl --header "Accept: application/json" https://lobid.org/gnd/11854523X > lobid_example.json
 {
   "gender" : [ {
     "id" : "https://d-nb.info/standards/vocab/gnd/gender#male",
diff --git a/providers/gnd/gnd.go b/providers/gnd/gnd.go
index bc783cc..c7cc778 100644
--- a/providers/gnd/gnd.go
+++ b/providers/gnd/gnd.go
@@ -2,12 +2,15 @@ package gnd
 
 import (
 	"encoding/json"
+	"errors"
 	"io"
 	"net/http"
 	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"sync"
+	"time"
 
 	"github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging"
 	"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
@@ -18,13 +21,18 @@ const (
 )
 
 type GNDProvider struct {
-	// Mutex is for file reading & writing
+	// Mutex is for file reading & writing, not person map access
 	mu      sync.Mutex
 	Persons sync.Map
+
+	errmu sync.Mutex
+	errs  map[string]int
 }
 
 func NewGNDProvider() *GNDProvider {
-	return &GNDProvider{}
+	return &GNDProvider{
+		errs: make(map[string]int),
+	}
 }
 
 func (p *GNDProvider) ReadCache(folder string) error {
@@ -86,10 +94,11 @@ func (p *GNDProvider) readPerson(file string) {
 		return
 	}
 
-	if person.KGPZID != "" {
-		p.Persons.Store(person.KGPZID, person)
+	if person.Agent.GND != "" {
+		p.Persons.Store(person.Agent.GND, person)
 		return
 	}
+
 }
 
 func (p *GNDProvider) WriteCache(folder string) error {
@@ -101,6 +110,7 @@ func (p *GNDProvider) WriteCache(folder string) error {
 	return nil
 }
 
+// TODO: Don't write persons already written
 func (p *GNDProvider) writePersons(folder string) error {
 	info, err := os.Stat(folder)
 	if err == os.ErrNotExist {
@@ -126,7 +136,7 @@ func (p *GNDProvider) writePersons(folder string) error {
 
 func (p *GNDProvider) writePerson(folder, id string, person Person) {
 	// JSON marshalling of the person and sanity check:
-	filepath := filepath.Join(folder, id+".json")
+	filepath := filepath.Join(folder, person.KGPZID+".json")
 	f, err := os.Create(filepath)
 	if err != nil {
 		logging.Error(err, "Error creating file for writing: "+id)
@@ -157,18 +167,28 @@ func (p *GNDProvider) GetPerson(id string) (Person, error) {
 func (p *GNDProvider) FetchPersons(persons []xmlprovider.Agent) {
 	wg := sync.WaitGroup{}
 	for _, person := range persons {
-		if person.ID == "" {
+		if person.ID == "" || person.GND == "" {
 			continue
 		}
-		if _, ok := p.Persons.Load(person.ID); ok {
+
+		// INFO: person already fetched; check for updates??
+		if _, ok := p.Persons.Load(person.GND); ok {
 			continue
 		}
+
+		// Skip GND IDs that already failed with a non-retryable status. errmu is
+		// released before `continue`; holding it would block the next Lock forever.
+		p.errmu.Lock()
+		_, failed := p.errs[person.GND]
+		p.errmu.Unlock()
+		if failed {
+			continue
+		}
+
 		wg.Add(1)
 		go func(person xmlprovider.Agent) {
 			defer wg.Done()
-			if person.GND != "" {
-				p.fetchPerson(person)
-			}
+			p.fetchPerson(person)
 		}(person)
 	}
 	wg.Wait()
@@ -177,23 +197,54 @@ func (p *GNDProvider) FetchPersons(persons []xmlprovider.Agent) {
 func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
 	SPLITURL := strings.Split(person.GND, "/")
 	if len(SPLITURL) < 2 {
-		logging.Error(nil, "Error parsing GND ID: "+person.GND)
+		logging.Error(nil, "Error parsing GND ID from: "+person.GND)
 		return
 	}
 
 	GNDID := SPLITURL[len(SPLITURL)-1]
 
 	logging.Debug("Fetching person: " + person.ID + " with URL: " + LOBID_URL + GNDID)
-	request, _ := http.NewRequest("GET", LOBID_URL+GNDID, nil)
-	response, err := http.DefaultClient.Do(request)
+	request, err := http.NewRequest("GET", LOBID_URL+GNDID, nil)
+	if err != nil {
+		logging.Error(err, "Error creating request: "+person.ID)
+		return
+	}
+
+	var response *http.Response
+
+	for i := 0; i < 3; i++ {
+		response, err = http.DefaultClient.Do(request)
+		if err == nil && 500 > response.StatusCode {
+			if i > 0 {
+				logging.Info("Successfully fetched person: " + person.ID + " after " + strconv.Itoa(i) + " retries")
+			}
+			break
+		}
+
+		// A 5xx response that is about to be retried still owns an open body;
+		// close it here. The last attempt's body is closed by the defer below.
+		if err == nil && i < 2 {
+			response.Body.Close()
+		}
+
+		time.Sleep(time.Duration(i+1) * time.Second)
+		logging.Error(err, "Retry fetching person: "+person.ID)
+	}
+
 	if err != nil {
 		logging.Error(err, "Error fetching person: "+person.ID)
 		return
 	}
+
 	defer response.Body.Close()
 
 	if response.StatusCode != http.StatusOK {
-		logging.Error(nil, "Error fetching person: "+person.ID+" with status code: "+response.Status)
+		if response.StatusCode < 500 {
+			p.errmu.Lock()
+			p.errs[person.GND] = response.StatusCode
+			p.errmu.Unlock()
+		}
+		logging.Error(errors.New("Error fetching person: " + person.ID + " with status code: " + http.StatusText(response.StatusCode)))
 		return
 	}
 
@@ -203,6 +254,9 @@ func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
 		return
 	}
 
+	// Write response body to file:
+	// os.WriteFile("gnd_responses/"+person.ID+".json", body, 0644)
+
 	gndPerson := Person{}
 	if err := json.Unmarshal(body, &gndPerson); err != nil {
 		logging.Error(err, "Error unmarshalling response body: "+person.ID)
@@ -210,5 +264,6 @@ func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
 	}
 
 	gndPerson.KGPZID = person.ID
-	p.Persons.Store(person.ID, gndPerson)
+	gndPerson.Agent = person
+	p.Persons.Store(person.GND, gndPerson)
 }
diff --git a/providers/gnd/model.go b/providers/gnd/model.go
index 08818f3..6e89bda 100644
--- a/providers/gnd/model.go
+++ b/providers/gnd/model.go
@@ -1,22 +1,35 @@
 package gnd
 
+import (
+	"fmt"
+
+	"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
+)
+
 type Person struct {
-	KGPZID                                 string             `json:"kgpzid"`
-	URL                                    string             `json:"id"`
-	DateOfDeath                            []string           `json:"dateOfDeath"`
-	PlaceOfDeath                           []Entity           `json:"placeOfDeath"`
-	BibliographicalOrHistoricalInformation []string           `json:"bibliographicalOrHistoricalInformation"`
-	PreferredName                          string             `json:"preferredName"`
-	GndIdentifier                          string             `json:"gndIdentifier"`
-	Wikipedia                              []Entity           `json:"wikipedia"`
-	Depiction                              []Picture          `json:"depiction"`
-	ProfessionOrOccupation                 []Entity           `json:"professionOrOccupation"`
-	PreferredEntityForThePerson            []PersonNameEntity `json:"preferredEntityForThePerson"`
-	DateOfBirth                            []string           `json:"dateOfBirth"`
-	PlaceOfBirth                           []Entity           `json:"placeOfBirth"`
-	VariantNameEntityForThePerson          []PersonNameEntity `json:"variantNameEntityForThePerson"`
-	VariantName                            []string           `json:"variantName"`
-	SameAs                                 []CrossReferences  `json:"sameAs"`
+	KGPZID                              string             `json:"kgpzid"`
+	Agent                               xmlprovider.Agent  `json:"agent"`
+	URL                                 string             `json:"id"`
+	DateOfBirth                         []string           `json:"dateOfBirth"`
+	PlaceOfBirth                        []Entity           `json:"placeOfBirth"`
+	DateOfDeath                         []string           `json:"dateOfDeath"`
+	PlaceOfDeath                        []Entity           `json:"placeOfDeath"`
+	PlaceOfBirthAsLiteral               []string           `json:"placeOfBirthAsLiteral"`
+	PlaceOfDeathAsLiteral               []string           `json:"placeOfDeathAsLiteral"`
+	BiographicalOrHistoricalInformation []string           `json:"biographicalOrHistoricalInformation"`
+	PreferredName                       string             `json:"preferredName"`
+	GndIdentifier                       string             `json:"gndIdentifier"`
+	Wikipedia                           []Entity           `json:"wikipedia"`
+	Depiction                           []Picture          `json:"depiction"`
+	ProfessionOrOccupation              []Entity           `json:"professionOrOccupation"`
+	PreferredNameEntityForThePerson     PersonNameEntity   `json:"preferredNameEntityForThePerson"`
+	VariantNameEntityForThePerson       []PersonNameEntity `json:"variantNameEntityForThePerson"`
+	VariantName                         []string           `json:"variantName"`
+	SameAs                              []CrossReferences  `json:"sameAs"`
+	Pseudonym                           []Entity           `json:"pseudonym"`
+	GNDSubjectCategory                  []Entity           `json:"gndSubjectCategory"`
+	Type                                []string           `json:"type"`
+	PlaceOfActivity                     []Entity           `json:"placeOfActivity"`
 }
 
 type CrossReferences struct {
@@ -49,8 +62,16 @@ type Entity struct {
 }
 
 type PersonNameEntity struct {
+	Prefix       []string `json:"prefix"`
+	Counting     []string `json:"counting"`
 	Forename     []string `json:"forename"`
 	Surname      []string `json:"surname"`
 	PersonalName []string `json:"personalName"`
 	NameAddition []string `json:"nameAddition"`
 }
+
+// String renders the person as a single line. Agent, the *AsLiteral places, Pseudonym,
+// GNDSubjectCategory, Type and PlaceOfActivity are not part of the output.
+func (p Person) String() string {
+	return fmt.Sprintf("Person{KGPZID: %v, URL: %v, DateOfDeath: %v, PlaceOfDeath: %v, BiographicalOrHistoricalInformation: %v, PreferredName: %v, GndIdentifier: %v, Wikipedia: %v, Depiction: %v, ProfessionOrOccupation: %v, PreferredNameEntityForThePerson: %v, DateOfBirth: %v, PlaceOfBirth: %v, VariantNameEntityForThePerson: %v, VariantName: %v, SameAs: %v}", p.KGPZID, p.URL, p.DateOfDeath, p.PlaceOfDeath, p.BiographicalOrHistoricalInformation, p.PreferredName, p.GndIdentifier, p.Wikipedia, p.Depiction, p.ProfessionOrOccupation, p.PreferredNameEntityForThePerson, p.DateOfBirth, p.PlaceOfBirth, p.VariantNameEntityForThePerson, p.VariantName, p.SameAs)
+}