Better structure of files; introduced XML models

This commit is contained in:
Simon Martens
2025-01-01 17:00:26 +01:00
parent e46d540c01
commit 7539a2dca7
25 changed files with 297 additions and 348 deletions

View File

@@ -1,21 +0,0 @@
package xmlprovider
import (
"encoding/xml"
"fmt"
)
// Agent is the XML model for a single <akteur> element.
type Agent struct {
XMLName xml.Name `xml:"akteur"`
// Names collects every <name> child element.
Names []string `xml:"name"`
// SortName is read from the <sortiername> child element.
SortName string `xml:"sortiername"`
// Life is read from the <lebensdaten> child element.
Life string `xml:"lebensdaten"`
// GND is read from the <gnd> child element.
GND string `xml:"gnd"`
// Org is read from the "org" attribute of the element.
Org bool `xml:"org,attr"`
// Identifier and AnnotationNote are embedded; their promoted fields
// (ID, Annotations, Notes) are used directly on Agent.
Identifier
AnnotationNote
}
// String renders the agent as a multi-line summary with one "Label: value"
// line each for ID, Names, SortName, Life, GND, Annotations and Notes.
// The Org flag is not part of the output.
func (a Agent) String() string {
	const layout = "ID: %s\nNames: %v\nSortName: %s\nLife: %s\nGND: %s\nAnnotations: %v\nNotes: %v\n"
	return fmt.Sprintf(
		layout,
		a.ID,
		a.Names,
		a.SortName,
		a.Life,
		a.GND,
		a.Annotations,
		a.Notes,
	)
}

View File

@@ -1,18 +0,0 @@
package xmlprovider
import (
"encoding/xml"
"fmt"
)
// Category is the XML model for a single <kategorie> element.
type Category struct {
XMLName xml.Name `xml:"kategorie"`
// Names collects every <name> child element.
Names []string `xml:"name"`
// SortName is read from the <sortiername> child element.
SortName string `xml:"sortiername"`
// Identifier and AnnotationNote are embedded; their promoted fields
// (ID, Annotations, Notes) are used directly on Category.
Identifier
AnnotationNote
}
// String renders the category as a multi-line summary with one
// "Label: value" line each for ID, Names, SortName, Annotations and Notes.
func (c Category) String() string {
	const layout = "ID: %s\nNames: %v\nSortName: %s\nAnnotations: %v\nNotes: %v\n"
	return fmt.Sprintf(
		layout,
		c.ID,
		c.Names,
		c.SortName,
		c.Annotations,
		c.Notes,
	)
}

View File

@@ -4,6 +4,7 @@ import (
"encoding/xml"
"io"
"os"
"path/filepath"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging"
)
@@ -30,3 +31,13 @@ func UnmarshalFile[T any](filename string, data T) error {
}
return nil
}
// XMLFilesForPath returns the entries directly inside path whose names match
// "*.xml", as produced by filepath.Glob on path joined with that pattern.
//
// If path cannot be stat'ed, the error from os.Stat is returned unchanged, so
// callers can still test it with os.IsNotExist. Otherwise the result and error
// of filepath.Glob are returned as-is.
func XMLFilesForPath(path string) ([]string, error) {
	// Report every Stat failure, not only "does not exist". filepath.Glob
	// ignores file system errors, so a permission or "not a directory" error
	// would otherwise silently become an empty result with a nil error.
	if _, err := os.Stat(path); err != nil {
		return nil, err
	}
	return filepath.Glob(filepath.Join(path, "*.xml"))
}

View File

@@ -1,59 +0,0 @@
package xmlprovider
import (
"encoding/xml"
"fmt"
"strconv"
)
// Issue is the XML model for a single <stueck> element.
type Issue struct {
XMLName xml.Name `xml:"stueck"`
// Number is read from the <nummer> child element.
Number Nummer `xml:"nummer"`
// Datum is read from the <datum> child element.
Datum KGPZDate `xml:"datum"`
// Von and Bis are read from the <von> and <bis> child elements
// (presumably the first and last page of the issue — TODO confirm).
Von int `xml:"von"`
Bis int `xml:"bis"`
// Additionals collects every <beilage> child element.
Additionals []Additional `xml:"beilage"`
// Identifier and AnnotationNote are embedded.
Identifier
AnnotationNote
}
// Nummer holds the content of a <nummer> element.
type Nummer struct {
// No is parsed from the element's character data.
No int `xml:",chardata"`
// Corrected is read from the "korrigiert" attribute.
Corrected string `xml:"korrigiert,attr"`
}
// Additional is the XML model for a <beilage> element.
type Additional struct {
XMLName xml.Name `xml:"beilage"`
// Nummer is read from the "nummer" attribute.
Nummer int `xml:"nummer,attr"`
// Von and Bis are read from the <von> and <bis> child elements.
Von int `xml:"von"`
Bis int `xml:"bis"`
}
// Keys returns the lookup keys of the issue.
//
// If keys are already stored on the issue they are returned unchanged.
// Otherwise the result consists of the string form of Datum.When (only when
// it is non-empty) followed by the value of Reference().
func (i Issue) Keys() []string {
	if len(i.keys) > 0 {
		return i.keys
	}

	// Length 0, capacity 2: make([]string, 2) would create two empty strings
	// that the appends below would leave in front of the real keys.
	res := make([]string, 0, 2)
	if date := i.Datum.When.String(); date != "" {
		res = append(res, date)
	}
	res = append(res, i.Reference())

	// The result is not stored in i.keys: the receiver is a copy, so the
	// assignment would be lost as soon as this method returns.
	return res
}
// Year returns the Year component of the issue's Datum.When date.
func (i Issue) Year() int {
	when := i.Datum.When
	return when.Year
}
// Reference returns the issue number and the year of Datum.When, both in
// decimal, joined by a hyphen: "<number>-<year>".
func (i Issue) Reference() string {
	number := strconv.Itoa(i.Number.No)
	year := strconv.Itoa(i.Datum.When.Year)
	return number + "-" + year
}
// String renders the issue on a single line as comma-separated
// "Label: value" pairs, terminated by a newline.
func (i Issue) String() string {
	const layout = "Number: %v, Datum: %v, Von: %d, Bis: %d, Additionals: %v, Identifier: %v, AnnotationNote: %v\n"
	return fmt.Sprintf(
		layout,
		i.Number,
		i.Datum,
		i.Von,
		i.Bis,
		i.Additionals,
		i.Identifier,
		i.AnnotationNote,
	)
}

View File

@@ -5,6 +5,8 @@ type ItemInfo struct {
Parse *ParseMeta
}
type KeyedItem struct {
keys []string
// XMLRootElement is implemented by the root elements that hold the data of
// the XML files.
//
// INFO: Root elements are only containers; they get discarded after a parse.
type XMLRootElement[T any] interface {
// Children returns the items of type T contained in the root element.
Children() []T
}

View File

@@ -1,147 +0,0 @@
package xmlprovider
import (
"fmt"
"sync"
)
// Library bundles one XMLProvider per model type.
type Library struct {
// amu is locked by SetPaths while the providers' Paths are replaced.
amu sync.Mutex
Agents *XMLProvider[Agent]
Places *XMLProvider[Place]
Works *XMLProvider[Work]
Categories *XMLProvider[Category]
Issues *XMLProvider[Issue]
Pieces *XMLProvider[Piece]
}
// String renders the library as one "Label: value" line per provider, using
// each provider's own String method.
func (l *Library) String() string {
	const layout = "Agents: %s\nPlaces: %s\nWorks: %s\nCategories: %s\nIssues: %s\nPieces: %s\n"
	return fmt.Sprintf(
		layout,
		l.Agents.String(),
		l.Places.String(),
		l.Works.String(),
		l.Categories.String(),
		l.Issues.String(),
		l.Pieces.String(),
	)
}
// NewLibrary creates a Library whose providers are initialised with the given
// path lists, one list per model type.
//
// INFO: this is the only place where the providers are created. There is no
// need for locking on access.
func NewLibrary(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) *Library {
	lib := new(Library)
	lib.Agents = &XMLProvider[Agent]{Paths: agentpaths}
	lib.Places = &XMLProvider[Place]{Paths: placepaths}
	lib.Works = &XMLProvider[Work]{Paths: workpaths}
	lib.Categories = &XMLProvider[Category]{Paths: categorypaths}
	lib.Issues = &XMLProvider[Issue]{Paths: issuepaths}
	lib.Pieces = &XMLProvider[Piece]{Paths: piecepaths}
	return lib
}
// SetPaths replaces the Paths of all six providers with the given slices.
// The slices are stored as passed in; they are not copied.
func (l *Library) SetPaths(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) {
// Hold amu for the whole update so concurrent SetPaths calls do not interleave.
l.amu.Lock()
defer l.amu.Unlock()
l.Agents.Paths = agentpaths
l.Places.Paths = placepaths
l.Works.Paths = workpaths
l.Categories.Paths = categorypaths
l.Issues.Paths = issuepaths
l.Pieces.Paths = piecepaths
}
// Serialize runs one parse over every configured path of every provider.
//
// It first calls Prepare(commit), then starts one goroutine per path that
// calls the matching provider's Serialize with a fresh root element, waits
// for all of them to finish and finally calls Cleanup.
//
// Errors returned by the providers' Serialize calls are discarded here, as
// before. NOTE(review): confirm they are reported inside XMLProvider.Serialize.
func (l *Library) Serialize(commit string) {
	var wg sync.WaitGroup

	l.Prepare(commit)

	// path is handed to every goroutine as an argument instead of being
	// captured by the closure. With pre-1.22 loop semantics all goroutines of
	// a loop would otherwise share one variable and could see the wrong path.
	for _, path := range l.Places.Paths {
		wg.Add(1)
		go func(path string) {
			defer wg.Done()
			_ = l.Places.Serialize(NewPlaceRoot(), path)
		}(path)
	}

	for _, path := range l.Agents.Paths {
		wg.Add(1)
		go func(path string) {
			defer wg.Done()
			_ = l.Agents.Serialize(NewAgentRoot(), path)
		}(path)
	}

	for _, path := range l.Categories.Paths {
		wg.Add(1)
		go func(path string) {
			defer wg.Done()
			_ = l.Categories.Serialize(NewCategoryRoot(), path)
		}(path)
	}

	for _, path := range l.Works.Paths {
		wg.Add(1)
		go func(path string) {
			defer wg.Done()
			_ = l.Works.Serialize(NewWorkRoot(), path)
		}(path)
	}

	for _, path := range l.Issues.Paths {
		wg.Add(1)
		go func(path string) {
			defer wg.Done()
			_ = l.Issues.Serialize(NewIssueRoot(), path)
		}(path)
	}

	for _, path := range l.Pieces.Paths {
		wg.Add(1)
		go func(path string) {
			defer wg.Done()
			_ = l.Pieces.Serialize(NewPieceRoot(), path)
		}(path)
	}

	wg.Wait()
	l.Cleanup()
}
// Prepare calls Prepare(commit) on each of the six providers, one after the
// other, in the order listed below.
func (l *Library) Prepare(commit string) {
l.Agents.Prepare(commit)
l.Places.Prepare(commit)
l.Works.Prepare(commit)
l.Categories.Prepare(commit)
l.Issues.Prepare(commit)
l.Pieces.Prepare(commit)
}
// Cleanup calls Cleanup on all six providers, each in its own goroutine, and
// returns once every call has finished.
func (l *Library) Cleanup() {
	var wg sync.WaitGroup

	// spawn registers one unit of work and runs it in a new goroutine.
	spawn := func(cleanup func()) {
		wg.Add(1)
		go func() {
			cleanup()
			wg.Done()
		}()
	}

	spawn(func() { l.Agents.Cleanup() })
	spawn(func() { l.Places.Cleanup() })
	spawn(func() { l.Works.Cleanup() })
	spawn(func() { l.Categories.Cleanup() })
	spawn(func() { l.Issues.Cleanup() })
	spawn(func() { l.Pieces.Cleanup() })

	wg.Wait()
}

View File

@@ -1,101 +0,0 @@
package xmlprovider
import (
"encoding/xml"
"fmt"
"strconv"
"strings"
"github.com/google/uuid"
)
// Piece is the XML model for a single <beitrag> element.
type Piece struct {
XMLName xml.Name `xml:"beitrag"`
// The *Refs fields collect the child elements that reference other items:
// <stueck>, <ort>, <kategorie>, <akteur>, <werk> and nested <beitrag>.
IssueRefs []IssueRef `xml:"stueck"`
PlaceRefs []PlaceRef `xml:"ort"`
CategoryRefs []CategoryRef `xml:"kategorie"`
AgentRefs []AgentRef `xml:"akteur"`
WorkRefs []WorkRef `xml:"werk"`
PieceRefs []PieceRef `xml:"beitrag"`
// Datum collects every <datum> child element.
Datum []KGPZDate `xml:"datum"`
// Incipit collects every <incipit> child element.
Incipit []string `xml:"incipit"`
// Title collects every <titel> child element.
Title []string `xml:"titel"`
// Identifier and AnnotationNote are embedded; their promoted fields
// (ID, Annotations, Notes) are used directly on Piece.
Identifier
AnnotationNote
}
// String renders the piece as a multi-line summary with one "Label: value"
// line per field. The Datum field is not part of the output.
func (p Piece) String() string {
	const layout = "ID: %s\nIssueRefs: %v\nPlaceRefs: %v\nCategoryRefs: %v\nAgentRefs: %v\nWorkRefs: %v\nPieceRefs: %v\nIncipit: %v\nTitle: %v\nAnnotations: %v\nNotes: %v\n"
	return fmt.Sprintf(
		layout,
		p.ID,
		p.IssueRefs,
		p.PlaceRefs,
		p.CategoryRefs,
		p.AgentRefs,
		p.WorkRefs,
		p.PieceRefs,
		p.Incipit,
		p.Title,
		p.Annotations,
		p.Notes,
	)
}
// Keys returns the lookup keys of the piece.
//
// If keys are already stored on the piece they are returned unchanged.
// Otherwise the result consists of the ID (only when it is non-empty)
// followed by one key per issue reference of the form "<year>-<nr>-<uuid>",
// where all references share one UUID obtained from uuid.New() for this call.
//
// Because that UUID is created on every call and the receiver is a copy,
// the issue-derived keys differ from call to call.
func (p Piece) Keys() []string {
	if len(p.keys) > 0 {
		return p.keys
	}

	// Length 0 with room for the ID and one key per issue reference:
	// make([]string, 2) would create two empty strings that the appends
	// below would leave in front of the real keys.
	ret := make([]string, 0, len(p.IssueRefs)+1)
	if p.ID != "" {
		ret = append(ret, p.ID)
	}

	// TODO: sensible IDs
	uid := uuid.New().String()
	for _, i := range p.IssueRefs {
		ret = append(ret, strconv.Itoa(i.When.Year)+"-"+strconv.Itoa(i.Nr)+"-"+uid)
	}

	// The result is not stored in p.keys: the receiver is a copy, so the
	// assignment would be lost as soon as this method returns.
	return ret
}
// ReferencesIssue looks for the first issue reference whose Nr equals no and
// whose When.Year equals y. It returns a pointer to a copy of that reference
// and true, or nil and false when no reference matches.
func (p Piece) ReferencesIssue(y, no int) (*IssueRef, bool) {
	for idx := range p.IssueRefs {
		ref := p.IssueRefs[idx]
		if ref.Nr != no || ref.When.Year != y {
			continue
		}
		return &ref, true
	}
	return nil, false
}
// ReferencesAgent looks for the first agent reference whose Ref starts with
// a. It returns a pointer to a copy of that reference and true, or nil and
// false when no reference matches.
func (p Piece) ReferencesAgent(a string) (*AgentRef, bool) {
	for idx := range p.AgentRefs {
		ref := p.AgentRefs[idx]
		if !strings.HasPrefix(ref.Ref, a) {
			continue
		}
		return &ref, true
	}
	return nil, false
}
// IsCat reports whether any category, work, agent or piece reference of the
// piece carries the category k.
//
// TODO: We can make this fast depending on which category to look for
// but we'll have to define rules for every single category (~35 of them)
func (p Piece) IsCat(k string) bool {
	for idx := range p.CategoryRefs {
		if p.CategoryRefs[idx].Category == k {
			return true
		}
	}

	for idx := range p.WorkRefs {
		if p.WorkRefs[idx].Category == k {
			return true
		}
	}

	for idx := range p.AgentRefs {
		if p.AgentRefs[idx].Category == k {
			return true
		}
	}

	for idx := range p.PieceRefs {
		if p.PieceRefs[idx].Category == k {
			return true
		}
	}

	return false
}

View File

@@ -1,19 +0,0 @@
package xmlprovider
import (
"encoding/xml"
"fmt"
)
// Place is the XML model for a single <ort> element.
type Place struct {
XMLName xml.Name `xml:"ort"`
// Names collects every <name> child element.
Names []string `xml:"name"`
// SortName is read from the <sortiername> child element.
SortName string `xml:"sortiername"`
// Geo is read from the <geonames> child element.
Geo string `xml:"geonames"`
// Identifier and AnnotationNote are embedded; their promoted fields
// (ID, Annotations, Notes) are used directly on Place.
Identifier
AnnotationNote
}
// String renders the place as a multi-line summary with one "Label: value"
// line each for ID, Names, SortName, Geo, Annotations and Notes.
func (p Place) String() string {
	const layout = "ID: %s\nNames: %v\nSortName: %s\nGeo: %s\nAnnotations: %v\nNotes: %v\n"
	return fmt.Sprintf(
		layout,
		p.ID,
		p.Names,
		p.SortName,
		p.Geo,
		p.Annotations,
		p.Notes,
	)
}

View File

@@ -1,111 +0,0 @@
package xmlprovider
import "encoding/xml"
// XMLRootElement is implemented by the root elements that hold the data of
// the XML files.
//
// INFO: Root elements are only containers; they get discarded after a parse.
type XMLRootElement[T any] interface {
// Children returns the items of type T contained in the root element.
Children() []T
}
// AgentRoot is the XML model for the <akteure> root element; it collects all
// <akteur> children.
type AgentRoot struct {
	XMLName xml.Name `xml:"akteure"`
	Agents  []Agent  `xml:"akteur"`
}

// NewAgentRoot returns a pointer to an empty AgentRoot.
func NewAgentRoot() *AgentRoot {
	return new(AgentRoot)
}

// New returns a pointer to a fresh, empty AgentRoot; the receiver's contents
// are not used.
func (r AgentRoot) New() *AgentRoot {
	return new(AgentRoot)
}

// Children returns the agents held by the root element.
func (r AgentRoot) Children() []Agent {
	return r.Agents
}
// PlaceRoot is the XML model for the <orte> root element; it collects all
// <ort> children.
type PlaceRoot struct {
	XMLName xml.Name `xml:"orte"`
	Place   []Place  `xml:"ort"`
}

// NewPlaceRoot returns a pointer to an empty PlaceRoot.
func NewPlaceRoot() *PlaceRoot {
	return new(PlaceRoot)
}

// New returns a pointer to a fresh, empty PlaceRoot; the receiver's contents
// are not used.
func (r PlaceRoot) New() *PlaceRoot {
	return new(PlaceRoot)
}

// Children returns the places held by the root element.
func (r PlaceRoot) Children() []Place {
	return r.Place
}
// CategoryRoot is the XML model for the <kategorien> root element; it
// collects all <kategorie> children.
type CategoryRoot struct {
	XMLName  xml.Name   `xml:"kategorien"`
	Category []Category `xml:"kategorie"`
}

// NewCategoryRoot returns a pointer to an empty CategoryRoot.
func NewCategoryRoot() *CategoryRoot {
	return new(CategoryRoot)
}

// New returns a fresh, empty *CategoryRoot as an XMLRootElement; the
// receiver's contents are not used.
func (r CategoryRoot) New() XMLRootElement[Category] {
	return new(CategoryRoot)
}

// Children returns the categories held by the root element.
func (r CategoryRoot) Children() []Category {
	return r.Category
}
// PieceRoot is the XML model for the <beitraege> root element; it collects
// all <beitrag> children.
type PieceRoot struct {
	XMLName xml.Name `xml:"beitraege"`
	Piece   []Piece  `xml:"beitrag"`
}

// NewPieceRoot returns a pointer to an empty PieceRoot.
func NewPieceRoot() *PieceRoot {
	return new(PieceRoot)
}

// New returns a fresh, empty *PieceRoot as an XMLRootElement; the receiver's
// contents are not used.
func (r PieceRoot) New() XMLRootElement[Piece] {
	return new(PieceRoot)
}

// Children returns the pieces held by the root element.
func (r PieceRoot) Children() []Piece {
	return r.Piece
}
// IssueRoot is the XML model for the <stuecke> root element; it collects all
// <stueck> children.
type IssueRoot struct {
	XMLName xml.Name `xml:"stuecke"`
	Issues  []Issue  `xml:"stueck"`
}

// NewIssueRoot returns a pointer to an empty IssueRoot.
func NewIssueRoot() *IssueRoot {
	return new(IssueRoot)
}

// New returns a fresh, empty *IssueRoot as an XMLRootElement; the receiver's
// contents are not used.
func (r IssueRoot) New() XMLRootElement[Issue] {
	return new(IssueRoot)
}

// Children returns the issues held by the root element.
func (r IssueRoot) Children() []Issue {
	return r.Issues
}
// WorkRoot is the XML model for the <werke> root element; it collects all
// <werk> children.
type WorkRoot struct {
	XMLName xml.Name `xml:"werke"`
	Work    []Work   `xml:"werk"`
}

// NewWorkRoot returns a pointer to an empty WorkRoot.
func NewWorkRoot() *WorkRoot {
	return new(WorkRoot)
}

// New returns a fresh, empty *WorkRoot as an XMLRootElement; the receiver's
// contents are not used.
func (r WorkRoot) New() XMLRootElement[Work] {
	return new(WorkRoot)
}

// Children returns the works held by the root element.
func (r WorkRoot) Children() []Work {
	return r.Work
}

View File

@@ -1,38 +0,0 @@
package xmlprovider
import (
"encoding/xml"
"fmt"
"strings"
)
// Work is the XML model for a single <werk> element.
type Work struct {
XMLName xml.Name `xml:"werk"`
// URLs collects every <url> child element.
URLs []URL `xml:"url"`
// Citation is read from the <zitation> child element.
Citation Citation `xml:"zitation"`
// PreferredTitle is read from the <preferred> child element.
PreferredTitle string `xml:"preferred"`
// AgentRefs collects every <akteur> child element.
AgentRefs []AgentRef `xml:"akteur"`
// Identifier and AnnotationNote are embedded.
Identifier
AnnotationNote
}
// ReferencesAgent looks for the first agent reference whose Ref starts with
// a. It returns a pointer to a copy of that reference and true, or nil and
// false when no reference matches.
func (w Work) ReferencesAgent(a string) (*AgentRef, bool) {
	for idx := range w.AgentRefs {
		ref := w.AgentRefs[idx]
		if !strings.HasPrefix(ref.Ref, a) {
			continue
		}
		return &ref, true
	}
	return nil, false
}
// Citation is the XML model for a <zitation> element.
type Citation struct {
XMLName xml.Name `xml:"zitation"`
// Title is read from the <title> child element.
Title string `xml:"title"`
// Year collects every <year> child element.
Year []string `xml:"year"`
// Value and Inner are embedded types declared elsewhere in the package.
Value
Inner
}
// String renders the work on a single line as comma-separated
// "Label: value" pairs, terminated by a newline.
func (w Work) String() string {
	const layout = "URLs: %v, Citation: %v, PreferredTitle: %s, Akteur: %v, Identifier: %v, AnnotationNote: %v\n"
	return fmt.Sprintf(
		layout,
		w.URLs,
		w.Citation,
		w.PreferredTitle,
		w.AgentRefs,
		w.Identifier,
		w.AnnotationNote,
	)
}

View File

@@ -1,78 +0,0 @@
package xmlprovider
import (
"encoding/xml"
"errors"
"github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/xsdtime"
)
// InvalidDateError is a package-level error value with the message "Invalid date".
var InvalidDateError = errors.New("Invalid date")
// DateLayout is a year-month-day layout string in Go's reference-time notation
// (presumably for time.Parse/Format — its use is not visible here).
const DateLayout = "2006-01-02"
// KGPZDate is the XML model for a <datum> element. It embeds the date
// attributes of DateAttributes and the character data of Value.
type KGPZDate struct {
XMLName xml.Name `xml:"datum"`
DateAttributes
Value
}
// DateAttributes groups the date-related XML attributes of an element.
// Each field is read from the attribute named in its struct tag.
type DateAttributes struct {
When xsdtime.XSDDate `xml:"when,attr"`
NotBefore xsdtime.XSDDate `xml:"notBefore,attr"`
NotAfter xsdtime.XSDDate `xml:"notAfter,attr"`
From xsdtime.XSDDate `xml:"from,attr"`
To xsdtime.XSDDate `xml:"to,attr"`
// Cert is read from the "cert" attribute as a plain string.
Cert string `xml:"cert,attr"`
}
// URL is the XML model for a <url> element.
type URL struct {
XMLName xml.Name `xml:"url"`
// Address is read from the "address" attribute.
Address string `xml:"address,attr"`
// Value holds the element's character data.
Value
}
// AnnotationNote groups the <anmerkung> and <vermerk> child elements of an item.
type AnnotationNote struct {
// Annotations collects every <anmerkung> child element.
Annotations []Annotation `xml:"anmerkung"`
// Notes collects every <vermerk> child element.
Notes []Note `xml:"vermerk"`
}
// Annotation is the XML model for an <anmerkung> element. It keeps both the
// character data (Value) and the raw inner XML (Inner) of the element.
type Annotation struct {
XMLName xml.Name `xml:"anmerkung"`
Value
Inner
}
// Note is the XML model for a <vermerk> element. It keeps both the character
// data (Value) and the raw inner XML (Inner) of the element.
type Note struct {
XMLName xml.Name `xml:"vermerk"`
Value
Inner
}
// Identifier carries the "id" attribute of an element and is embedded in the
// item models.
type Identifier struct {
// ID is read from the "id" attribute.
ID string `xml:"id,attr"`
// KeyedItem is embedded; its keys field is read by Keys.
KeyedItem
}
// Keys returns the keys already stored on the identifier if there are any,
// and otherwise a one-element slice containing only the ID.
//
// The receiver is a copy, so nothing is stored on the caller's value.
func (i Identifier) Keys() []string {
	if cached := i.keys; len(cached) > 0 {
		return cached
	}
	return []string{i.ID}
}
// Reference holds the attributes shared by the reference types that embed it.
type Reference struct {
// Ref is read from the "ref" attribute.
Ref string `xml:"ref,attr"`
// Category is read from the "kat" attribute.
Category string `xml:"kat,attr"`
// Unsicher is read from the "unsicher" attribute.
Unsicher bool `xml:"unsicher,attr"`
// Value holds the element's character data.
Value
}
// Value captures the character data of an element.
type Value struct {
Chardata string `xml:",chardata"`
}
// Inner captures the raw XML nested inside an element.
type Inner struct {
InnerXML string `xml:",innerxml"`
}

View File

@@ -21,7 +21,6 @@ type XMLItem interface {
// An XMLProvider is a struct that holds serialized XML data of a specific type. It combines multiple parses IF a successful parse cannot serialize the data from a path.
type XMLProvider[T XMLItem] struct {
Paths []string
// INFO: map is type map[string]*T
Items sync.Map
// INFO: map is type [string]ItemInfo
@@ -60,15 +59,17 @@ func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path string) er
}
p.mu.Lock()
defer p.mu.Unlock()
if len(p.parses) == 0 {
logging.Error(fmt.Errorf("No commit set"), "No commit set")
return fmt.Errorf("No commit set")
}
commit := &p.parses[len(p.parses)-1]
p.Array = append(p.Array, dataholder.Children()...)
p.mu.Unlock()
for _, item := range dataholder.Children() {
commit := &p.parses[len(p.parses)-1]
newItems := dataholder.Children()
for _, item := range newItems {
// INFO: Mostly it's just one ID, so the double loop is not that bad.
for _, id := range item.Keys() {
p.Infos.Store(id, ItemInfo{Source: path, Parse: commit})
@@ -76,6 +77,7 @@ func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path string) er
}
}
p.Array = append(p.Array, newItems...)
return nil
}

View File

@@ -1,50 +0,0 @@
package xmlprovider
import "encoding/xml"
// AgentRef is the XML model for an <akteur> reference element.
type AgentRef struct {
XMLName xml.Name `xml:"akteur"`
Reference
}
// AdditionalRef is the XML model for a <beilage> reference element.
type AdditionalRef struct {
XMLName xml.Name `xml:"beilage"`
Reference // Not part of the schema
// The remaining fields are read from the attributes named in their tags.
Datum string `xml:"datum,attr"`
Nr int `xml:"nr,attr"`
AdditionalNo int `xml:"beilage,attr"`
Von int `xml:"von,attr"`
Bis int `xml:"bis,attr"`
}
// IssueRef is the XML model for a <stueck> reference element.
type IssueRef struct {
XMLName xml.Name `xml:"stueck"`
Reference // Not part of the schema
// DateAttributes is embedded, so its date attributes are promoted onto IssueRef.
DateAttributes
// The remaining fields are read from the attributes named in their tags.
Nr int `xml:"nr,attr"`
Von int `xml:"von,attr"`
Bis int `xml:"bis,attr"`
Beilage int `xml:"beilage,attr"`
}
// PlaceRef is the XML model for an <ort> reference element.
type PlaceRef struct {
XMLName xml.Name `xml:"ort"`
Reference
}
// CategoryRef is the XML model for a <kategorie> reference element.
type CategoryRef struct {
XMLName xml.Name `xml:"kategorie"`
Reference
}
// WorkRef is the XML model for a <werk> reference element.
type WorkRef struct {
XMLName xml.Name `xml:"werk"`
Reference
// Page is read from the "s" attribute.
Page string `xml:"s,attr"`
}
// PieceRef is the XML model for a <beitrag> reference element.
type PieceRef struct {
XMLName xml.Name `xml:"beitrag"`
// Page is read from the "s" attribute.
Page string `xml:"s,attr"`
Reference
}