Very basic data enrichment via LOBID/GND

This commit is contained in:
Simon Martens
2024-11-19 18:51:42 +01:00
parent 7dc603df2c
commit c86fed3cbe
12 changed files with 368 additions and 50827 deletions

View File

@@ -7,13 +7,13 @@ tmp_dir = "tmp"
bin = "./tmp/main"
cmd = "go build -tags=\"dev\" -o ./tmp/main ."
delay = 1000
exclude_dir = ["assets", "views", "tmp", "vendor", "testdata"]
exclude_dir = ["assets", "views", "tmp", "vendor", "testdata", "data_git", "cache_gnd"]
exclude_file = []
exclude_regex = ["_test.go"]
exclude_unchanged = false
follow_symlink = false
full_bin = ""
include_dir = [ "views/assets" ]
include_dir = []
include_ext = ["go", "tpl", "tmpl", "html"]
include_file = []
kill_delay = "0s"
@@ -24,7 +24,7 @@ tmp_dir = "tmp"
pre_cmd = []
rerun = false
rerun_delay = 500
send_interrupt = false
send_interrupt = true
stop_on_error = false
[color]
@@ -43,7 +43,7 @@ tmp_dir = "tmp"
[proxy]
app_port = 8080
enabled = false
enabled = true
proxy_port = 8081
[screen]

2
.gitignore vendored
View File

@@ -6,3 +6,5 @@ cache_gnd/
config.json
out.log
kgpz_web
*.log
*.out

View File

@@ -8,6 +8,7 @@ import (
"github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/gnd"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
)
@@ -26,6 +27,7 @@ type KGPZ struct {
gmu sync.Mutex
Config *providers.ConfigProvider
Repo *providers.GitProvider
GND *gnd.GNDProvider
Library *xmlprovider.Library
}
@@ -38,11 +40,15 @@ func (k *KGPZ) Init() {
go k.initRepo()
}
k.Serialize()
k.InitGND()
k.Enrich()
return
}
k.initRepo()
k.Serialize()
k.InitGND()
k.Enrich()
}
func NewKGPZ(config *providers.ConfigProvider) *KGPZ {
@@ -54,6 +60,43 @@ func NewKGPZ(config *providers.ConfigProvider) *KGPZ {
return &KGPZ{Config: config}
}
// InitGND makes sure a GND provider exists and loads its on-disk cache.
//
// A provider is created only when k.GND is still nil. The cache is then read
// from k.Config.GNDPath; a read failure is logged and otherwise ignored.
// Both gmu and lmu are held (in that order) until the method returns.
func (k *KGPZ) InitGND() {
	k.gmu.Lock()
	defer k.gmu.Unlock()
	k.lmu.Lock()
	defer k.lmu.Unlock()

	if k.GND == nil {
		k.GND = gnd.NewGNDProvider()
	}

	err := k.GND.ReadCache(k.Config.GNDPath)
	if err != nil {
		logging.Error(err, "Error reading GND cache")
	}
}
// Enrich starts a background enrichment of the library's agents with GND data.
//
// It first makes sure the GND provider exists (calling InitGND when k.GND is
// nil). If a library with agents is loaded, it spawns a goroutine that fetches
// the agents via the provider and afterwards writes the provider's cache to
// k.Config.GNDPath. Enrich returns as soon as that goroutine is started; it
// does not wait for the fetch to finish.
//
// The returned error is always nil. A failure to write the cache inside the
// goroutine is logged.
func (k *KGPZ) Enrich() error {
	// InitGND assigns k.GND while holding gmu, so read it under gmu as well.
	// The lock has to be released before calling InitGND because InitGND
	// acquires the same, non-reentrant mutexes itself.
	k.gmu.Lock()
	missing := k.GND == nil
	k.gmu.Unlock()
	if missing {
		k.InitGND()
	}

	// Acquire the mutexes in the same order as InitGND (gmu, then lmu).
	// Taking them in opposite orders in the two methods could deadlock when
	// both run concurrently.
	k.gmu.Lock()
	defer k.gmu.Unlock()
	k.lmu.Lock()
	defer k.lmu.Unlock()

	if k.Library == nil || k.Library.Agents == nil {
		return nil
	}

	// Snapshot everything the goroutine needs while the locks are held, so it
	// does not touch the fields of k after they have been released.
	provider := k.GND
	path := k.Config.GNDPath
	agents := k.Library.Agents.Items.Agents

	go func(agents []xmlprovider.Agent) {
		provider.FetchPersons(agents)
		if err := provider.WriteCache(path); err != nil {
			logging.Error(err, "Error writing GND cache")
		}
	}(agents)

	return nil
}
func (k *KGPZ) Serialize() {
// TODO: this is error handling from hell
// There is no need to recreate the whole library if the paths haven't changed

View File

@@ -57,6 +57,14 @@ func Info(msg ...string) {
}
}
// Debug emits every given message as its own debug-level record through the
// default slog logger. Calling it without arguments logs nothing.
func Debug(msg ...string) {
	for i := range msg {
		slog.Debug(msg[i])
	}
}
func SetDebug() {
slog.SetLogLoggerLevel(slog.LevelDebug)
}

50783
log.out

File diff suppressed because it is too large Load Diff

214
providers/gnd/gnd.go Normal file
View File

@@ -0,0 +1,214 @@
package gnd
import (
"encoding/json"
"io"
"net/http"
"os"
"path/filepath"
"strings"
"sync"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
)
const (
// LOBID_URL is the base URL of the lobid GND endpoint. fetchPerson appends a
// GND identifier to it to build the request URL for a single record.
LOBID_URL = "https://lobid.org/gnd/"
)
// GNDProvider holds GND person records fetched from lobid and mirrors them to
// a folder of JSON files (one file per person) that serves as a cache.
type GNDProvider struct {
// Mutex is for file reading & writing
// (it is held by ReadCache and WriteCache for their whole duration).
mu sync.Mutex
// Persons maps a KGPZ agent ID (string) to its Person value. Entries are
// stored by readPerson (from cache files) and fetchPerson (from lobid).
Persons sync.Map
}
// NewGNDProvider returns a pointer to a zero-valued, ready-to-use GNDProvider.
func NewGNDProvider() *GNDProvider {
	return new(GNDProvider)
}
// ReadCache loads all cached persons from folder into p.Persons.
//
// The provider's file mutex is held for the duration of the call. The error,
// if any, is the one reported by readPersons.
func (p *GNDProvider) ReadCache(folder string) error {
	p.mu.Lock()
	defer p.mu.Unlock()

	return p.readPersons(folder)
}
// readPersons loads every "*.json" file found directly in folder into
// p.Persons, reading the files concurrently (one goroutine per file).
//
// If folder does not exist it is created and there is nothing to read.
// If folder exists but is not a directory an *os.PathError wrapping
// os.ErrInvalid is returned; previously this case silently reported success.
// Problems with individual files are logged by readPerson and do not make
// readPersons fail.
func (p *GNDProvider) readPersons(folder string) error {
	info, err := os.Stat(folder)
	if os.IsNotExist(err) {
		return os.MkdirAll(folder, 0755)
	}
	if err != nil {
		return err
	}
	if !info.IsDir() {
		return &os.PathError{Op: "read cache directory", Path: folder, Err: os.ErrInvalid}
	}

	files, err := filepath.Glob(filepath.Join(folder, "*.json"))
	// TODO: try to recover by recreating the folder
	if err != nil {
		return err
	}

	var wg sync.WaitGroup
	wg.Add(len(files))
	for _, file := range files {
		go func(file string) {
			// Deferred so the counter is released even if readPerson panics.
			defer wg.Done()
			p.readPerson(file)
		}(file)
	}
	wg.Wait()

	return nil
}
// readPerson reads one cache file, decodes it as a JSON Person and stores it
// in p.Persons under its KGPZID.
//
// Any failure to open, read or decode the file is logged and the file is
// skipped. A decoded person with an empty KGPZID is dropped without a message.
func (p *GNDProvider) readPerson(file string) {
	handle, err := os.Open(file)
	if err != nil {
		logging.Error(err, "Error opening file for reading: "+file)
		return
	}
	defer handle.Close()

	raw, err := io.ReadAll(handle)
	if err != nil {
		logging.Error(err, "Error reading file: "+file)
		return
	}

	var entry Person
	if err = json.Unmarshal(raw, &entry); err != nil {
		logging.Error(err, "Error unmarshalling file:"+file)
		return
	}

	// Without an ID there is no key to store the record under.
	if entry.KGPZID == "" {
		return
	}
	p.Persons.Store(entry.KGPZID, entry)
}
// WriteCache writes all persons held in p.Persons to folder.
//
// The provider's file mutex is held for the duration of the call. The error,
// if any, is the one reported by writePersons.
func (p *GNDProvider) WriteCache(folder string) error {
	p.mu.Lock()
	defer p.mu.Unlock()

	return p.writePersons(folder)
}
// writePersons writes every entry of p.Persons to its own JSON file inside
// folder, concurrently (one goroutine per entry).
//
// If folder does not exist it is created first and the persons are then
// written into it. If folder exists but is not a directory an *os.PathError
// wrapping os.ErrInvalid is returned. Failures for individual persons are
// logged by writePerson and do not make writePersons fail.
func (p *GNDProvider) writePersons(folder string) error {
	info, err := os.Stat(folder)
	switch {
	case os.IsNotExist(err):
		// os.Stat reports a missing path as a *PathError, so it has to be
		// classified with os.IsNotExist rather than compared with ==.
		// After creating the folder we continue: the persons still have to
		// be written.
		if err := os.MkdirAll(folder, 0755); err != nil {
			return err
		}
	case err != nil:
		return err
	case !info.IsDir():
		return &os.PathError{Op: "write cache directory", Path: folder, Err: os.ErrInvalid}
	}

	var wg sync.WaitGroup
	p.Persons.Range(func(key, value any) bool {
		// Persons is an exported sync.Map; skip entries of an unexpected type
		// instead of panicking on the type assertion.
		id, okKey := key.(string)
		person, okValue := value.(Person)
		if !okKey || !okValue {
			return true
		}

		wg.Add(1)
		go func(id string, person Person) {
			defer wg.Done()
			p.writePerson(folder, id, person)
		}(id, person)
		return true
	})
	wg.Wait()

	return nil
}
// writePerson serialises person as JSON and writes it to "<folder>/<id>.json",
// replacing an existing file of that name. Errors are logged, not returned.
func (p *GNDProvider) writePerson(folder, id string, person Person) {
	// Marshal before touching the file system: if marshalling fails, an
	// already existing cache file for this id is left untouched instead of
	// being truncated to zero bytes.
	data, err := json.Marshal(person)
	if err != nil {
		logging.Error(err, "Error marshalling person: "+id)
		return
	}

	// os.WriteFile creates or truncates the file, writes the data and also
	// reports an error from closing it. 0666 (before umask) matches the
	// permissions os.Create would use.
	target := filepath.Join(folder, id+".json")
	if err := os.WriteFile(target, data, 0666); err != nil {
		logging.Error(err, "Error writing file: "+id)
	}
}
// GetPerson returns the person stored under id.
//
// When no entry exists for id, the zero Person is returned. The error result
// is nil in both cases.
func (p *GNDProvider) GetPerson(id string) (Person, error) {
	if cached, found := p.Persons.Load(id); found {
		return cached.(Person), nil
	}
	return Person{}, nil
}
// FetchPersons fetches the GND record of every agent in persons that has both
// an ID and a GND reference and is not yet present in p.Persons.
//
// Fetches run concurrently, but at most maxConcurrentFetches at a time, so a
// large agent list does not open one goroutine and one HTTP request per agent
// all at once. The call blocks until every started fetch has finished.
func (p *GNDProvider) FetchPersons(persons []xmlprovider.Agent) {
	const maxConcurrentFetches = 10

	// Buffered channel used as a counting semaphore.
	slots := make(chan struct{}, maxConcurrentFetches)

	var wg sync.WaitGroup
	for _, person := range persons {
		// Nothing to key the result by, or nothing to look up.
		if person.ID == "" || person.GND == "" {
			continue
		}
		if _, ok := p.Persons.Load(person.ID); ok {
			continue
		}

		wg.Add(1)
		slots <- struct{}{} // blocks while the limit is reached
		go func(person xmlprovider.Agent) {
			defer wg.Done()
			defer func() { <-slots }()
			p.fetchPerson(person)
		}(person)
	}
	wg.Wait()
}
// fetchPerson requests the lobid record for one agent and stores the decoded
// Person in p.Persons under the agent's ID.
//
// The GND identifier is taken to be everything after the last "/" of
// person.GND. Every failure (unparsable reference, request or transport error,
// non-200 status, unreadable or undecodable body) is logged and the agent is
// skipped; nothing is returned to the caller.
//
// NOTE(review): the request goes through http.DefaultClient, which has no
// timeout configured, so a stalled connection blocks this call indefinitely.
func (p *GNDProvider) fetchPerson(person xmlprovider.Agent) {
	// Reject references without a "/" (as before) and references ending in
	// "/", which would yield an empty identifier and request the API root.
	slash := strings.LastIndex(person.GND, "/")
	if slash < 0 || slash == len(person.GND)-1 {
		logging.Error(nil, "Error parsing GND ID: "+person.GND)
		return
	}
	gndID := person.GND[slash+1:]
	target := LOBID_URL + gndID

	logging.Debug("Fetching person: " + person.ID + " with URL: " + target)

	request, err := http.NewRequest(http.MethodGet, target, nil)
	if err != nil {
		// Without this check a nil request would be handed to Do.
		logging.Error(err, "Error creating request for person: "+person.ID)
		return
	}
	// Ask explicitly for JSON, since the body is decoded as JSON below.
	request.Header.Set("Accept", "application/json")

	response, err := http.DefaultClient.Do(request)
	if err != nil {
		logging.Error(err, "Error fetching person: "+person.ID)
		return
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		logging.Error(nil, "Error fetching person: "+person.ID+" with status code: "+response.Status)
		return
	}

	body, err := io.ReadAll(response.Body)
	if err != nil {
		logging.Error(err, "Error reading response body: "+person.ID)
		return
	}

	var gndPerson Person
	if err := json.Unmarshal(body, &gndPerson); err != nil {
		logging.Error(err, "Error unmarshalling response body: "+person.ID)
		return
	}

	// The KGPZ ID is not part of the response; it is set here so the record
	// can be keyed by it when it is read back from the cache.
	gndPerson.KGPZID = person.ID
	p.Persons.Store(person.ID, gndPerson)
}

View File

@@ -1,4 +1,56 @@
package gnd
// Person is the record decoded from a lobid GND JSON response and from the
// JSON cache files; it is also what gets marshalled into those cache files.
// Each field is bound to the JSON key named in its tag.
type Person struct {
// KGPZID is the ID of the KGPZ agent this record belongs to. It is set by
// the fetching code rather than taken from the response, and is used as the
// key in GNDProvider.Persons.
KGPZID string `json:"kgpzid"`
// URL is read from the JSON key "id".
URL string `json:"id"`
DateOfDeath []string `json:"dateOfDeath"`
PlaceOfDeath []Entity `json:"placeOfDeath"`
BibliographicalOrHistoricalInformation []string `json:"bibliographicalOrHistoricalInformation"`
PreferredName string `json:"preferredName"`
GndIdentifier string `json:"gndIdentifier"`
Wikipedia []Entity `json:"wikipedia"`
Depiction []Picture `json:"depiction"`
ProfessionOrOccupation []Entity `json:"professionOrOccupation"`
PreferredEntityForThePerson []PersonNameEntity `json:"preferredEntityForThePerson"`
DateOfBirth []string `json:"dateOfBirth"`
PlaceOfBirth []Entity `json:"placeOfBirth"`
VariantNameEntityForThePerson []PersonNameEntity `json:"variantNameEntityForThePerson"`
VariantName []string `json:"variantName"`
SameAs []CrossReferences `json:"sameAs"`
}
// CrossReferences is one element of Person.SameAs: an identifier together
// with the collection it belongs to.
type CrossReferences struct {
// Items is read from the JSON key "collection".
Items Collection `json:"collection"`
ID string `json:"id"`
}
// Collection describes the collection a cross reference points into
// (see CrossReferences.Items).
type Collection struct {
Abbr string `json:"abbr"`
Name string `json:"name"`
Publisher string `json:"publisher"`
Icon string `json:"icon"`
ID string `json:"id"`
}
// Link pairs an identifier with a label. It has the same fields and JSON
// keys as Entity.
type Link struct {
ID string `json:"id"`
Label string `json:"label"`
}
// Picture is one element of Person.Depiction.
type Picture struct {
ID string `json:"id"`
URL string `json:"url"`
Thumbnail string `json:"thumbnail"`
}
// Entity is a labelled reference (identifier plus label). Person uses it for
// places of birth and death, professions and Wikipedia entries.
type Entity struct {
ID string `json:"id"`
Label string `json:"label"`
}
// PersonNameEntity holds the parts of a person's name. Person uses it for the
// preferred and the variant name entities. Every part is a list of strings.
type PersonNameEntity struct {
Forename []string `json:"forename"`
Surname []string `json:"surname"`
PersonalName []string `json:"personalName"`
NameAddition []string `json:"nameAddition"`
}

View File

@@ -96,6 +96,7 @@ func (l *Library) Serialize() {
wg.Wait()
}
// TODO: make Items into a sync.Map
func (p *XMLProvider[T]) Serialize() error {
// Introduce goroutine for every path, locking on append:
var wg sync.WaitGroup
@@ -114,7 +115,6 @@ func (p *XMLProvider[T]) Serialize() error {
}
wg.Wait()
fmt.Println(p.Items)
return nil
}
@@ -127,13 +127,22 @@ func (a *XMLProvider[T]) String() string {
func UnmarshalFile[T any](filename string, data *T) error {
xmlFile, err := os.Open(filename)
if err != nil {
logging.Error(err, "Could not deserialize file: "+filename)
logging.Error(err, "Could not open file: "+filename)
return err
}
defer xmlFile.Close()
logging.Info("Deserialization: " + filename)
byteValue, _ := io.ReadAll(xmlFile)
xml.Unmarshal(byteValue, data)
logging.Info("Deserialization: " + filename)
byteValue, err := io.ReadAll(xmlFile)
if err != nil {
logging.Error(err, "Could not read file: "+filename)
return err
}
err = xml.Unmarshal(byteValue, data)
if err != nil {
logging.Error(err, "Could not unmarshal file: "+filename)
return err
}
return nil
}

View File

@@ -1 +0,0 @@
exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1exit status 1

BIN
tmp/main

Binary file not shown.

View File

@@ -6,7 +6,6 @@ import (
"sort"
"strconv"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider"
)
@@ -18,6 +17,35 @@ type YearViewModel struct {
Issues IssuesByMonth
}
// YearView builds the view model for one year from the issues in lib.
//
// Every issue whose date (Datum.When) has at least four characters is
// considered; the first four characters are taken as the issue's year. Each
// time that year differs from the year of the previously considered issue it
// is pushed as an available year. Issues whose year equals the requested year
// are pushed into the model.
//
// An error is returned when the model's Issues collection has length zero.
// NOTE(review): Issues is created with make(IssuesByMonth, 12); whether its
// length can be zero at that point depends on the underlying type of
// IssuesByMonth, which is not visible here — confirm.
func YearView(year string, lib *xmlprovider.Library) (*YearViewModel, error) {
	model := YearViewModel{Year: year}
	model.Issues = make(IssuesByMonth, 12)

	previous := ""
	for _, issue := range lib.Issues.Items.Issues {
		if len(issue.Datum.When) < 4 {
			continue
		}

		issueYear := issue.Datum.When[0:4]
		if issueYear != previous {
			model.PushAvailable(issueYear)
			previous = issueYear
		}

		if issueYear == year {
			model.PushIssue(issue)
		}
	}

	if len(model.Issues) == 0 {
		return nil, errors.New("No issues found for year " + year)
	}

	model.SortAvailableYears()
	return &model, nil
}
func (y *YearViewModel) PushIssue(i xmlprovider.Issue) {
iv, err := FromIssue(i)
if err != nil {
@@ -54,34 +82,3 @@ func (y *YearViewModel) SortAvailableYears() {
return iint < jint
})
}
// YearView builds the view model for one year from the issues in lib.
// It returns an error when the resulting Issues collection has length zero.
func YearView(year string, lib *xmlprovider.Library) (*YearViewModel, error) {
res := YearViewModel{Year: year}
res.Issues = make(IssuesByMonth, 12)
// last remembers the year of the previously considered issue, so that a year
// is pushed as available only when it changes between consecutive issues.
last := ""
for _, issue := range lib.Issues.Items.Issues {
logging.ObjDebug(&issue, "Issue")
// Dates shorter than four characters cannot provide a year; skip them.
if len(issue.Datum.When) < 4 {
continue
}
// The first four characters of the date are taken as the year.
date := issue.Datum.When[0:4]
if date != last {
res.PushAvailable(date)
last = date
}
if date == year {
res.PushIssue(issue)
}
}
if len(res.Issues) == 0 {
return nil, errors.New("No issues found")
}
res.SortAvailableYears()
return &res, nil
}

View File

@@ -12,9 +12,9 @@
<!-- Issues -->
{{ range $issue := $month }}
<a href="/{{ $y }}/{{ $issue.Number.Chardata }}">
<a href="/{{ $y }}/{{ $issue.Number.No }}">
<div>
{{ $issue.Number.Chardata }}
{{ $issue.Number.No }}
</div>
<div>
{{ index $issue.Weekday 1 }}