mirror of
https://github.com/Theodor-Springmann-Stiftung/kgpz_web.git
synced 2025-10-28 16:45:32 +00:00
Better GND data fetching
This commit is contained in:
14
app/kgpz.go
14
app/kgpz.go
@@ -23,7 +23,9 @@ const (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type KGPZ struct {
|
type KGPZ struct {
|
||||||
lmu sync.Mutex
|
// LMU is here for file system access
|
||||||
|
lmu sync.Mutex
|
||||||
|
// GMU is only here to prevent concurrent pulls
|
||||||
gmu sync.Mutex
|
gmu sync.Mutex
|
||||||
Config *providers.ConfigProvider
|
Config *providers.ConfigProvider
|
||||||
Repo *providers.GitProvider
|
Repo *providers.GitProvider
|
||||||
@@ -61,10 +63,6 @@ func NewKGPZ(config *providers.ConfigProvider) *KGPZ {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (k *KGPZ) InitGND() {
|
func (k *KGPZ) InitGND() {
|
||||||
k.gmu.Lock()
|
|
||||||
defer k.gmu.Unlock()
|
|
||||||
k.lmu.Lock()
|
|
||||||
defer k.lmu.Unlock()
|
|
||||||
if k.GND == nil {
|
if k.GND == nil {
|
||||||
k.GND = gnd.NewGNDProvider()
|
k.GND = gnd.NewGNDProvider()
|
||||||
}
|
}
|
||||||
@@ -81,14 +79,14 @@ func (k *KGPZ) Enrich() error {
|
|||||||
|
|
||||||
k.lmu.Lock()
|
k.lmu.Lock()
|
||||||
defer k.lmu.Unlock()
|
defer k.lmu.Unlock()
|
||||||
k.gmu.Lock()
|
|
||||||
defer k.gmu.Unlock()
|
|
||||||
|
|
||||||
if k.Library == nil || k.Library.Agents == nil {
|
if k.Library == nil || k.Library.Agents == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
agents := k.Library.Agents.Items.Agents
|
// INFO: We pass agents by value since we don't want to block the library
|
||||||
|
agents := make([]xmlprovider.Agent, len(k.Library.Agents.Items.Agents))
|
||||||
|
_ = copy(agents, k.Library.Agents.Items.Agents)
|
||||||
go func(agents []xmlprovider.Agent) {
|
go func(agents []xmlprovider.Agent) {
|
||||||
k.GND.FetchPersons(agents)
|
k.GND.FetchPersons(agents)
|
||||||
k.GND.WriteCache(k.Config.GNDPath)
|
k.GND.WriteCache(k.Config.GNDPath)
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
> curl --header "Accept: application/json" https://lobid.org/gnd/11854523X > lobid_example.json
|
|
||||||
{
|
{
|
||||||
"gender" : [ {
|
"gender" : [ {
|
||||||
"id" : "https://d-nb.info/standards/vocab/gnd/gender#male",
|
"id" : "https://d-nb.info/standards/vocab/gnd/gender#male",
|
||||||
|
|||||||
@@ -2,12 +2,15 @@ package gnd
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging"
|
"github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging"
|
||||||
"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
|
"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
|
||||||
@@ -18,13 +21,18 @@ const (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type GNDProvider struct {
|
type GNDProvider struct {
|
||||||
// Mutex is for file reading & writing
|
// Mutex is for file reading & writing, not person map access
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
Persons sync.Map
|
Persons sync.Map
|
||||||
|
|
||||||
|
errmu sync.Mutex
|
||||||
|
errs map[string]int
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewGNDProvider() *GNDProvider {
|
func NewGNDProvider() *GNDProvider {
|
||||||
return &GNDProvider{}
|
return &GNDProvider{
|
||||||
|
errs: make(map[string]int),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *GNDProvider) ReadCache(folder string) error {
|
func (p *GNDProvider) ReadCache(folder string) error {
|
||||||
@@ -86,10 +94,11 @@ func (p *GNDProvider) readPerson(file string) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if person.KGPZID != "" {
|
if person.Agent.GND != "" {
|
||||||
p.Persons.Store(person.KGPZID, person)
|
p.Persons.Store(person.Agent.GND, person)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *GNDProvider) WriteCache(folder string) error {
|
func (p *GNDProvider) WriteCache(folder string) error {
|
||||||
@@ -101,6 +110,7 @@ func (p *GNDProvider) WriteCache(folder string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: Don't write persons already written
|
||||||
func (p *GNDProvider) writePersons(folder string) error {
|
func (p *GNDProvider) writePersons(folder string) error {
|
||||||
info, err := os.Stat(folder)
|
info, err := os.Stat(folder)
|
||||||
if err == os.ErrNotExist {
|
if err == os.ErrNotExist {
|
||||||
@@ -126,7 +136,7 @@ func (p *GNDProvider) writePersons(folder string) error {
|
|||||||
|
|
||||||
func (p *GNDProvider) writePerson(folder, id string, person Person) {
|
func (p *GNDProvider) writePerson(folder, id string, person Person) {
|
||||||
// JSON marshalling of the person and sanity check:
|
// JSON marshalling of the person and sanity check:
|
||||||
filepath := filepath.Join(folder, id+".json")
|
filepath := filepath.Join(folder, person.KGPZID+".json")
|
||||||
f, err := os.Create(filepath)
|
f, err := os.Create(filepath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.Error(err, "Error creating file for writing: "+id)
|
logging.Error(err, "Error creating file for writing: "+id)
|
||||||
@@ -157,18 +167,25 @@ func (p *GNDProvider) GetPerson(id string) (Person, error) {
|
|||||||
func (p *GNDProvider) FetchPersons(persons []xmlprovider.Agent) {
|
func (p *GNDProvider) FetchPersons(persons []xmlprovider.Agent) {
|
||||||
wg := sync.WaitGroup{}
|
wg := sync.WaitGroup{}
|
||||||
for _, person := range persons {
|
for _, person := range persons {
|
||||||
if person.ID == "" {
|
if person.ID == "" || person.GND == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if _, ok := p.Persons.Load(person.ID); ok {
|
|
||||||
|
// INFO: person already fetched; check for updates??
|
||||||
|
if _, ok := p.Persons.Load(person.GND); ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
p.errmu.Lock()
|
||||||
|
if _, ok := p.errs[person.GND]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
p.errmu.Unlock()
|
||||||
|
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func(person xmlprovider.Agent) {
|
go func(person xmlprovider.Agent) {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
if person.GND != "" {
|
p.fetchPerson(person)
|
||||||
p.fetchPerson(person)
|
|
||||||
}
|
|
||||||
}(person)
|
}(person)
|
||||||
}
|
}
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
@@ -177,23 +194,48 @@ func (p *GNDProvider) FetchPersons(persons []xmlprovider.Agent) {
|
|||||||
func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
|
func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
|
||||||
SPLITURL := strings.Split(person.GND, "/")
|
SPLITURL := strings.Split(person.GND, "/")
|
||||||
if len(SPLITURL) < 2 {
|
if len(SPLITURL) < 2 {
|
||||||
logging.Error(nil, "Error parsing GND ID: "+person.GND)
|
logging.Error(nil, "Error parsing GND ID from: "+person.GND)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
GNDID := SPLITURL[len(SPLITURL)-1]
|
GNDID := SPLITURL[len(SPLITURL)-1]
|
||||||
|
|
||||||
logging.Debug("Fetching person: " + person.ID + " with URL: " + LOBID_URL + GNDID)
|
logging.Debug("Fetching person: " + person.ID + " with URL: " + LOBID_URL + GNDID)
|
||||||
request, _ := http.NewRequest("GET", LOBID_URL+GNDID, nil)
|
request, err := http.NewRequest("GET", LOBID_URL+GNDID, nil)
|
||||||
response, err := http.DefaultClient.Do(request)
|
if err != nil {
|
||||||
|
logging.Error(err, "Error creating request: "+person.ID)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var response *http.Response
|
||||||
|
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
response, err = http.DefaultClient.Do(request)
|
||||||
|
if err == nil && 500 > response.StatusCode {
|
||||||
|
if i > 0 {
|
||||||
|
logging.Info("Successfully fetched person: " + person.ID + " after " + strconv.Itoa(i) + " retries")
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
time.Sleep(time.Duration(i+1) * time.Second)
|
||||||
|
logging.Error(err, "Retry fetching person: "+person.ID)
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.Error(err, "Error fetching person: "+person.ID)
|
logging.Error(err, "Error fetching person: "+person.ID)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
defer response.Body.Close()
|
defer response.Body.Close()
|
||||||
|
|
||||||
if response.StatusCode != http.StatusOK {
|
if response.StatusCode != http.StatusOK {
|
||||||
logging.Error(nil, "Error fetching person: "+person.ID+" with status code: "+response.Status)
|
if response.StatusCode < 500 {
|
||||||
|
p.errmu.Lock()
|
||||||
|
p.errs[person.GND] = response.StatusCode
|
||||||
|
p.errmu.Unlock()
|
||||||
|
}
|
||||||
|
logging.Error(errors.New("Error fetching person: " + person.ID + " with status code: " + http.StatusText(response.StatusCode)))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -203,6 +245,9 @@ func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Write response body to file:
|
||||||
|
// os.WriteFile("gnd_responses/"+person.ID+".json", body, 0644)
|
||||||
|
|
||||||
gndPerson := Person{}
|
gndPerson := Person{}
|
||||||
if err := json.Unmarshal(body, &gndPerson); err != nil {
|
if err := json.Unmarshal(body, &gndPerson); err != nil {
|
||||||
logging.Error(err, "Error unmarshalling response body: "+person.ID)
|
logging.Error(err, "Error unmarshalling response body: "+person.ID)
|
||||||
@@ -210,5 +255,6 @@ func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
gndPerson.KGPZID = person.ID
|
gndPerson.KGPZID = person.ID
|
||||||
p.Persons.Store(person.ID, gndPerson)
|
gndPerson.Agent = person
|
||||||
|
p.Persons.Store(person.GND, gndPerson)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,22 +1,35 @@
|
|||||||
package gnd
|
package gnd
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
|
||||||
|
)
|
||||||
|
|
||||||
type Person struct {
|
type Person struct {
|
||||||
KGPZID string `json:"kgpzid"`
|
KGPZID string `json:"kgpzid"`
|
||||||
URL string `json:"id"`
|
Agent xmlprovider.Agent `json:"agent"`
|
||||||
DateOfDeath []string `json:"dateOfDeath"`
|
URL string `json:"id"`
|
||||||
PlaceOfDeath []Entity `json:"placeOfDeath"`
|
DateOfBirth []string `json:"dateOfBirth"`
|
||||||
BibliographicalOrHistoricalInformation []string `json:"bibliographicalOrHistoricalInformation"`
|
PlaceOfBirth []Entity `json:"placeOfBirth"`
|
||||||
PreferredName string `json:"preferredName"`
|
DateOfDeath []string `json:"dateOfDeath"`
|
||||||
GndIdentifier string `json:"gndIdentifier"`
|
PlaceOfDeath []Entity `json:"placeOfDeath"`
|
||||||
Wikipedia []Entity `json:"wikipedia"`
|
PlaceOfBirthAsLiteral []string `json:"placeOfBirthAsLiteral"`
|
||||||
Depiction []Picture `json:"depiction"`
|
PlaceOfDeathAsLiteral []string `json:"placeOfDeathAsLiteral"`
|
||||||
ProfessionOrOccupation []Entity `json:"professionOrOccupation"`
|
BiographicalOrHistoricalInformation []string `json:"biographicalOrHistoricalInformation"`
|
||||||
PreferredEntityForThePerson []PersonNameEntity `json:"preferredEntityForThePerson"`
|
PreferredName string `json:"preferredName"`
|
||||||
DateOfBirth []string `json:"dateOfBirth"`
|
GndIdentifier string `json:"gndIdentifier"`
|
||||||
PlaceOfBirth []Entity `json:"placeOfBirth"`
|
Wikipedia []Entity `json:"wikipedia"`
|
||||||
VariantNameEntityForThePerson []PersonNameEntity `json:"variantNameEntityForThePerson"`
|
Depiction []Picture `json:"depiction"`
|
||||||
VariantName []string `json:"variantName"`
|
ProfessionOrOccupation []Entity `json:"professionOrOccupation"`
|
||||||
SameAs []CrossReferences `json:"sameAs"`
|
PreferredNameEntityForThePerson PersonNameEntity `json:"preferredNameEntityForThePerson"`
|
||||||
|
VariantNameEntityForThePerson []PersonNameEntity `json:"variantNameEntityForThePerson"`
|
||||||
|
VariantName []string `json:"variantName"`
|
||||||
|
SameAs []CrossReferences `json:"sameAs"`
|
||||||
|
Pseudonym []Entity `json:"pseudonym"`
|
||||||
|
GNDSubjectCategory []Entity `json:"gndSubjectCategory"`
|
||||||
|
Type []string `json:"type"`
|
||||||
|
PlaceOfActivity []Entity `json:"placeOfActivity"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type CrossReferences struct {
|
type CrossReferences struct {
|
||||||
@@ -49,8 +62,15 @@ type Entity struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type PersonNameEntity struct {
|
type PersonNameEntity struct {
|
||||||
|
Prefix []string `json:"prefix"`
|
||||||
|
Counting []string `json:"counting"`
|
||||||
Forename []string `json:"forename"`
|
Forename []string `json:"forename"`
|
||||||
Surname []string `json:"surname"`
|
Surname []string `json:"surname"`
|
||||||
PersonalName []string `json:"personalName"`
|
PersonalName []string `json:"personalName"`
|
||||||
NameAddition []string `json:"nameAddition"`
|
NameAddition []string `json:"nameAddition"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (p Person) String() string {
|
||||||
|
// Copilot: Please format and return all fields of the struct
|
||||||
|
return fmt.Sprintf("Person{KGPZID: %v, URL: %v, DateOfDeath: %v, PlaceOfDeath: %v, BiographicalOrHistoricalInformation: %v, PreferredName: %v, GndIdentifier: %v, Wikipedia: %v, Depiction: %v, ProfessionOrOccupation: %v, PreferredNameEntityForThePerson: %v, DateOfBirth: %v, PlaceOfBirth: %v, VariantNameEntityForThePerson: %v, VariantName: %v, SameAs: %v}", p.KGPZID, p.URL, p.DateOfDeath, p.PlaceOfDeath, p.BiographicalOrHistoricalInformation, p.PreferredName, p.GndIdentifier, p.Wikipedia, p.Depiction, p.ProfessionOrOccupation, p.PreferredNameEntityForThePerson, p.DateOfBirth, p.PlaceOfBirth, p.VariantNameEntityForThePerson, p.VariantName, p.SameAs)
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user