XML parsing overhaul

This commit is contained in:
Simon Martens
2024-11-22 00:35:27 +01:00
parent b93256c522
commit bc244fbad4
26 changed files with 507 additions and 352 deletions

View File

@@ -5,10 +5,6 @@ import (
"fmt"
)
type AgentProvider struct {
XMLProvider[Agents]
}
type Agent struct {
XMLName xml.Name `xml:"akteur"`
Names []string `xml:"name"`
@@ -20,29 +16,6 @@ type Agent struct {
AnnotationNote
}
type Agents struct {
XMLName xml.Name `xml:"akteure"`
Agents []Agent `xml:"akteur"`
}
func (a Agents) String() string {
var res []string
for _, agent := range a.Agents {
res = append(res, agent.String())
}
return fmt.Sprintf("Agents: %v", res)
}
func (a Agents) Append(data Agents) Agents {
a.Agents = append(a.Agents, data.Agents...)
return a
}
func (a *Agent) String() string {
func (a Agent) String() string {
return fmt.Sprintf("ID: %s\nNames: %v\nSortName: %s\nLife: %s\nGND: %s\nAnnotations: %v\nNotes: %v\n", a.ID, a.Names, a.SortName, a.Life, a.GND, a.Annotations, a.Notes)
}
func NewAgentProvider(paths []string) *AgentProvider {
return &AgentProvider{XMLProvider: XMLProvider[Agents]{paths: paths}}
}

View File

@@ -5,15 +5,6 @@ import (
"fmt"
)
type CategoryProvider struct {
XMLProvider[Categories]
}
type Categories struct {
XMLName xml.Name `xml:"kategorien"`
Category []Category `xml:"kategorie"`
}
type Category struct {
XMLName xml.Name `xml:"kategorie"`
Names []string `xml:"name"`
@@ -22,24 +13,6 @@ type Category struct {
AnnotationNote
}
func (c Categories) Append(data Categories) Categories {
c.Category = append(c.Category, data.Category...)
return c
}
func (c Categories) String() string {
var res []string
for _, category := range c.Category {
res = append(res, category.String())
}
return fmt.Sprintf("Categories: %v", res)
}
func (c *Category) String() string {
func (c Category) String() string {
return fmt.Sprintf("ID: %s\nNames: %v\nSortName: %s\nAnnotations: %v\nNotes: %v\n", c.ID, c.Names, c.SortName, c.Annotations, c.Notes)
}
func NewCategoryProvider(paths []string) *CategoryProvider {
return &CategoryProvider{XMLProvider: XMLProvider[Categories]{paths: paths}}
}

View File

@@ -3,17 +3,9 @@ package xmlprovider
import (
"encoding/xml"
"fmt"
"strconv"
)
type IssueProvider struct {
XMLProvider[Issues]
}
type Issues struct {
XMLName xml.Name `xml:"stuecke"`
Issues []Issue `xml:"stueck"`
}
type Issue struct {
XMLName xml.Name `xml:"stueck"`
Number Nummer `xml:"nummer"`
@@ -37,24 +29,20 @@ type Additional struct {
Bis string `xml:"bis"`
}
func (i Issues) Append(data Issues) Issues {
i.Issues = append(i.Issues, data.Issues...)
return i
}
func (i Issues) String() string {
var res []string
for _, issue := range i.Issues {
res = append(res, issue.String())
// GetIDs returns the keys under which this issue is stored.
//
// When the issue has a date, the full date string is the first ID. When the
// date is longer than four bytes, a second ID of the form "<first four bytes
// of the date>-<issue number>" is added. An issue without a date yields no
// IDs at all.
func (i Issue) GetIDs() []string {
	// Length 0, capacity 2: make([]string, 2) would pre-fill two empty
	// strings that append then leaves in front of the real IDs.
	ids := make([]string, 0, 2)
	date := i.Datum.When
	if date != "" {
		ids = append(ids, date)
	}
	if len(date) > 4 {
		ids = append(ids, date[:4]+"-"+strconv.Itoa(i.Number.No))
	}
	return ids
}
// String renders the issue's number, date, page range, additionals,
// identifier and annotation/note data on a single line ending in a newline.
func (iss Issue) String() string {
	const layout = "Number: %v, Datum: %v, Von: %d, Bis: %d, Additionals: %v, Identifier: %v, AnnotationNote: %v\n"
	return fmt.Sprintf(
		layout,
		iss.Number,
		iss.Datum,
		iss.Von,
		iss.Bis,
		iss.Additionals,
		iss.Identifier,
		iss.AnnotationNote,
	)
}
func NewIssueProvider(paths []string) *IssueProvider {
return &IssueProvider{XMLProvider: XMLProvider[Issues]{paths: paths}}
}

View File

@@ -3,17 +3,10 @@ package xmlprovider
import (
"encoding/xml"
"fmt"
"github.com/google/uuid"
)
type PieceProvider struct {
XMLProvider[Pieces]
}
type Pieces struct {
XMLName xml.Name `xml:"beitraege"`
Piece []Piece `xml:"beitrag"`
}
type Piece struct {
XMLName xml.Name `xml:"beitrag"`
IssueRefs []IssueRef `xml:"stueck"`
@@ -30,24 +23,18 @@ type Piece struct {
AnnotationNote
}
func (p Pieces) Append(data Pieces) Pieces {
p.Piece = append(p.Piece, data.Piece...)
return p
}
func (p Pieces) String() string {
var res []string
for _, piece := range p.Piece {
res = append(res, piece.String())
}
return fmt.Sprintf("Pieces: %v", res)
}
// String renders the piece as a multi-line listing: its ID, every reference
// list, incipit, title, annotations and notes, one "Label: value" per line.
func (piece Piece) String() string {
	const layout = "ID: %s\nIssueRefs: %v\nPlaceRefs: %v\nCategoryRefs: %v\nAgentRefs: %v\nWorkRefs: %v\nPieceRefs: %v\nAdditionalRef: %v\nIncipit: %v\nTitle: %v\nAnnotations: %v\nNotes: %v\n"
	return fmt.Sprintf(
		layout,
		piece.ID,
		piece.IssueRefs,
		piece.PlaceRefs,
		piece.CategoryRefs,
		piece.AgentRefs,
		piece.WorkRefs,
		piece.PieceRefs,
		piece.AdditionalRef,
		piece.Incipit,
		piece.Title,
		piece.Annotations,
		piece.Notes,
	)
}
func NewPieceProvider(paths []string) *PieceProvider {
return &PieceProvider{XMLProvider: XMLProvider[Pieces]{paths: paths}}
// GetIDs returns the keys under which this piece is stored: the piece's own
// ID when it is non-empty, followed by a newly generated UUID string.
func (p Piece) GetIDs() []string {
	// Length 0, capacity 2: make([]string, 2) would pre-fill two empty
	// strings that append then leaves in front of the real IDs.
	ids := make([]string, 0, 2)
	if p.ID != "" {
		ids = append(ids, p.ID)
	}
	// TODO: sensible IDs
	ids = append(ids, uuid.New().String())
	return ids
}

View File

@@ -5,15 +5,6 @@ import (
"fmt"
)
type PlaceProvider struct {
XMLProvider[Places]
}
type Places struct {
XMLName xml.Name `xml:"orte"`
Place []Place `xml:"ort"`
}
type Place struct {
XMLName xml.Name `xml:"ort"`
Names []string `xml:"name"`
@@ -23,24 +14,6 @@ type Place struct {
AnnotationNote
}
func (p Places) Append(data Places) Places {
p.Place = append(p.Place, data.Place...)
return p
}
func (p Places) String() string {
var res []string
for _, place := range p.Place {
res = append(res, place.String())
}
return fmt.Sprintf("Places: %v", res)
}
func (p *Place) String() string {
func (p Place) String() string {
return fmt.Sprintf("ID: %s\nNames: %v\nSortName: %s\nGeo: %s\nAnnotations: %v\nNotes: %v\n", p.ID, p.Names, p.SortName, p.Geo, p.Annotations, p.Notes)
}
func NewPlaceProvider(paths []string) *PlaceProvider {
return &PlaceProvider{XMLProvider: XMLProvider[Places]{paths: paths}}
}

View File

@@ -0,0 +1,111 @@
package xmlprovider
import "encoding/xml"
// INFO: These are just root elements that hold the data of the XML files.
// They get discarded after a parse.
//
// XMLRootElement is the interface a root element exposes: Children returns
// the items of type T that the root holds.
type XMLRootElement[T any] interface {
Children() []T
}
// AgentRoot is the root element <akteure>; it collects the <akteur> children.
type AgentRoot struct {
	XMLName xml.Name `xml:"akteure"`
	Agents  []Agent  `xml:"akteur"`
}

// NewAgentRoot returns a pointer to an empty AgentRoot.
func NewAgentRoot() *AgentRoot {
	return new(AgentRoot)
}

// New returns a fresh, empty AgentRoot; the receiver's contents are not used.
func (root AgentRoot) New() *AgentRoot {
	fresh := NewAgentRoot()
	return fresh
}

// Children returns the agents held by this root.
func (root AgentRoot) Children() []Agent {
	agents := root.Agents
	return agents
}
// PlaceRoot is the root element <orte>; it collects the <ort> children.
type PlaceRoot struct {
	XMLName xml.Name `xml:"orte"`
	Place   []Place  `xml:"ort"`
}

// NewPlaceRoot returns a pointer to an empty PlaceRoot.
func NewPlaceRoot() *PlaceRoot {
	return new(PlaceRoot)
}

// New returns a fresh, empty PlaceRoot; the receiver's contents are not used.
func (root PlaceRoot) New() *PlaceRoot {
	fresh := NewPlaceRoot()
	return fresh
}

// Children returns the places held by this root.
func (root PlaceRoot) Children() []Place {
	places := root.Place
	return places
}
// CategoryRoot is the root element <kategorien>; it collects the <kategorie>
// children.
type CategoryRoot struct {
	XMLName  xml.Name   `xml:"kategorien"`
	Category []Category `xml:"kategorie"`
}

// NewCategoryRoot returns a pointer to an empty CategoryRoot.
func NewCategoryRoot() *CategoryRoot {
	return new(CategoryRoot)
}

// New returns a fresh, empty CategoryRoot behind the XMLRootElement
// interface; the receiver's contents are not used.
func (root CategoryRoot) New() XMLRootElement[Category] {
	var fresh XMLRootElement[Category] = NewCategoryRoot()
	return fresh
}

// Children returns the categories held by this root.
func (root CategoryRoot) Children() []Category {
	categories := root.Category
	return categories
}
// PieceRoot is the root element <beitraege>; it collects the <beitrag>
// children.
type PieceRoot struct {
	XMLName xml.Name `xml:"beitraege"`
	Piece   []Piece  `xml:"beitrag"`
}

// NewPieceRoot returns a pointer to an empty PieceRoot.
func NewPieceRoot() *PieceRoot {
	return new(PieceRoot)
}

// New returns a fresh, empty PieceRoot behind the XMLRootElement interface;
// the receiver's contents are not used.
func (root PieceRoot) New() XMLRootElement[Piece] {
	var fresh XMLRootElement[Piece] = NewPieceRoot()
	return fresh
}

// Children returns the pieces held by this root.
func (root PieceRoot) Children() []Piece {
	pieces := root.Piece
	return pieces
}
// IssueRoot is the root element <stuecke>; it collects the <stueck> children.
type IssueRoot struct {
	XMLName xml.Name `xml:"stuecke"`
	Issues  []Issue  `xml:"stueck"`
}

// NewIssueRoot returns a pointer to an empty IssueRoot.
func NewIssueRoot() *IssueRoot {
	return new(IssueRoot)
}

// New returns a fresh, empty IssueRoot behind the XMLRootElement interface;
// the receiver's contents are not used.
func (root IssueRoot) New() XMLRootElement[Issue] {
	var fresh XMLRootElement[Issue] = NewIssueRoot()
	return fresh
}

// Children returns the issues held by this root.
func (root IssueRoot) Children() []Issue {
	issues := root.Issues
	return issues
}
// WorkRoot is the root element <werke>; it collects the <werk> children.
type WorkRoot struct {
	XMLName xml.Name `xml:"werke"`
	Work    []Work   `xml:"werk"`
}

// NewWorkRoot returns a pointer to an empty WorkRoot.
func NewWorkRoot() *WorkRoot {
	return new(WorkRoot)
}

// New returns a fresh, empty WorkRoot behind the XMLRootElement interface;
// the receiver's contents are not used.
func (root WorkRoot) New() XMLRootElement[Work] {
	var fresh XMLRootElement[Work] = NewWorkRoot()
	return fresh
}

// Children returns the works held by this root.
func (root WorkRoot) Children() []Work {
	works := root.Work
	return works
}

View File

@@ -5,15 +5,6 @@ import (
"fmt"
)
type WorkProvider struct {
XMLProvider[Works]
}
type Works struct {
XMLName xml.Name `xml:"werke"`
Work []Work `xml:"werk"`
}
type Work struct {
XMLName xml.Name `xml:"werk"`
URLs []URL `xml:"url"`
@@ -32,24 +23,6 @@ type Citation struct {
Inner
}
func (w Works) Append(data Works) Works {
w.Work = append(w.Work, data.Work...)
return w
}
func (w Works) String() string {
var res []string
for _, work := range w.Work {
res = append(res, work.String())
}
return fmt.Sprintf("Works: %v", res)
}
func (w *Work) String() string {
func (w Work) String() string {
return fmt.Sprintf("URLs: %v, Citation: %v, PreferredTitle: %s, Akteur: %v, Identifier: %v, AnnotationNote: %v\n", w.URLs, w.Citation, w.PreferredTitle, w.Akteur, w.Identifier, w.AnnotationNote)
}
func NewWorkProvider(paths []string) *WorkProvider {
return &WorkProvider{XMLProvider: XMLProvider[Works]{paths: paths}}
}

View File

@@ -39,6 +39,10 @@ type Identifier struct {
ID string `xml:"id,attr"`
}
// GetIDs returns a one-element slice holding the identifier's ID. The ID is
// returned as-is, so an empty ID yields a slice containing the empty string.
func (ident Identifier) GetIDs() []string {
	ids := make([]string, 1)
	ids[0] = ident.ID
	return ids
}
type Reference struct {
Ref string `xml:"ref,attr"`
Category string `xml:"kat,attr"`

View File

@@ -10,34 +10,40 @@ import (
"github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging"
)
type KGPZXML[T any] interface {
Append(data T) T
// XMLItem is the constraint on the element type of XMLProvider: an item can
// render itself as a string and report the IDs it is stored under.
type XMLItem interface {
fmt.Stringer
// GetIDs returns the keys for this item; Serialize stores the item once
// under each returned ID.
GetIDs() []string
}
type XMLProvider[T KGPZXML[T]] struct {
mu sync.Mutex
paths []string
Items T
// XMLProvider holds the items of type T parsed from a set of XML files.
type XMLProvider[T XMLItem] struct {
// Paths lists the XML files to parse; Library.Serialize starts one
// Serialize call per path.
Paths []string
// Items maps an ID string to an item. Serialize stores each parsed item
// under every ID returned by its GetIDs method.
Items sync.Map
// NOTE(review): none of the post-commit methods visible here lock mu —
// confirm whether it is still needed.
mu sync.Mutex
}
type Library struct {
Agents *AgentProvider
Places *PlaceProvider
Works *WorkProvider
Categories *CategoryProvider
Issues *IssueProvider
Pieces *PieceProvider
Agents *XMLProvider[Agent]
Places *XMLProvider[Place]
Works *XMLProvider[Work]
Categories *XMLProvider[Category]
Issues *XMLProvider[Issue]
Pieces *XMLProvider[Piece]
}
// String renders all six providers, one labelled line each, in the order
// agents, places, works, categories, issues, pieces.
func (l *Library) String() string {
	const layout = "Agents: %s\nPlaces: %s\nWorks: %s\nCategories: %s\nIssues: %s\nPieces: %s\n"

	agents := l.Agents.String()
	places := l.Places.String()
	works := l.Works.String()
	categories := l.Categories.String()
	issues := l.Issues.String()
	pieces := l.Pieces.String()

	return fmt.Sprintf(layout, agents, places, works, categories, issues, pieces)
}
func NewLibrary(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) *Library {
return &Library{
Agents: NewAgentProvider(agentpaths),
Places: NewPlaceProvider(placepaths),
Works: NewWorkProvider(workpaths),
Categories: NewCategoryProvider(categorypaths),
Issues: NewIssueProvider(issuepaths),
Pieces: NewPieceProvider(piecepaths),
Agents: &XMLProvider[Agent]{Paths: agentpaths},
Places: &XMLProvider[Place]{Paths: placepaths},
Works: &XMLProvider[Work]{Paths: workpaths},
Categories: &XMLProvider[Category]{Paths: categorypaths},
Issues: &XMLProvider[Issue]{Paths: issuepaths},
Pieces: &XMLProvider[Piece]{Paths: piecepaths},
}
}
@@ -47,84 +53,98 @@ func (l *Library) Serialize() {
go func() {
defer wg.Done()
err := l.Agents.Serialize()
if err != nil {
l.Agents = nil
lwg := sync.WaitGroup{}
for _, path := range l.Places.Paths {
lwg.Add(1)
go l.Places.Serialize(NewPlaceRoot(), path, &lwg)
}
lwg.Wait()
}()
go func() {
defer wg.Done()
err := l.Places.Serialize()
if err != nil {
l.Places = nil
lwg := sync.WaitGroup{}
for _, path := range l.Agents.Paths {
lwg.Add(1)
go l.Agents.Serialize(NewAgentRoot(), path, &lwg)
}
lwg.Wait()
}()
go func() {
defer wg.Done()
err := l.Works.Serialize()
if err != nil {
l.Works = nil
lwg := sync.WaitGroup{}
for _, path := range l.Categories.Paths {
lwg.Add(1)
go l.Categories.Serialize(NewCategoryRoot(), path, &lwg)
}
lwg.Wait()
}()
go func() {
defer wg.Done()
err := l.Categories.Serialize()
if err != nil {
l.Categories = nil
lwg := sync.WaitGroup{}
for _, path := range l.Works.Paths {
lwg.Add(1)
go l.Works.Serialize(NewWorkRoot(), path, &lwg)
}
lwg.Wait()
}()
go func() {
defer wg.Done()
err := l.Issues.Serialize()
if err != nil {
l.Issues = nil
lwg := sync.WaitGroup{}
for _, path := range l.Issues.Paths {
lwg.Add(1)
go l.Issues.Serialize(NewIssueRoot(), path, &lwg)
}
lwg.Wait()
}()
go func() {
defer wg.Done()
err := l.Pieces.Serialize()
if err != nil {
l.Pieces = nil
lwg := sync.WaitGroup{}
for _, path := range l.Pieces.Paths {
lwg.Add(1)
go l.Pieces.Serialize(NewPieceRoot(), path, &lwg)
}
lwg.Wait()
}()
wg.Wait()
}
// TODO: make Items into a sync.Map
func (p *XMLProvider[T]) Serialize() error {
func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path string, wg *sync.WaitGroup) error {
// Introduce goroutine for every path, locking on append:
var wg sync.WaitGroup
for _, path := range p.paths {
wg.Add(1)
go func(path string) {
defer wg.Done()
var data T
if err := UnmarshalFile(path, &data); err != nil {
return
}
p.mu.Lock()
defer p.mu.Unlock()
p.Items = p.Items.Append(data)
}(path)
if err := UnmarshalFile(path, dataholder); err != nil {
logging.Error(err, "Could not unmarshal file: "+path)
return err
}
for _, item := range dataholder.Children() {
// INFO: Mostly it's just one ID, so the double loop is not that bad.
for _, id := range item.GetIDs() {
p.Items.Store(id, item)
}
}
if wg != nil {
wg.Done()
}
wg.Wait()
return nil
}
func (a *XMLProvider[T]) String() string {
a.mu.Lock()
defer a.mu.Unlock()
return fmt.Sprintf("Items: %s", a.Items)
var s string
a.Items.Range(func(key, value interface{}) bool {
v := value.(T)
s += v.String()
return true
})
return s
}
func UnmarshalFile[T any](filename string, data *T) error {
func UnmarshalFile[T any](filename string, data T) error {
xmlFile, err := os.Open(filename)
if err != nil {
logging.Error(err, "Could not open file: "+filename)
@@ -138,7 +158,7 @@ func UnmarshalFile[T any](filename string, data *T) error {
logging.Error(err, "Could not read file: "+filename)
return err
}
err = xml.Unmarshal(byteValue, data)
err = xml.Unmarshal(byteValue, &data)
if err != nil {
logging.Error(err, "Could not unmarshal file: "+filename)
@@ -146,3 +166,44 @@ func UnmarshalFile[T any](filename string, data *T) error {
}
return nil
}
// Item looks up the item stored under id. It returns nil when no such key
// exists; otherwise it returns a pointer to a copy of the stored value, so
// changes made through the pointer do not affect the provider.
func (p *XMLProvider[T]) Item(id string) *T {
	raw, found := p.Items.Load(id)
	if found {
		found := raw.(T)
		return &found
	}
	return nil
}
// Find returns every stored value for which fn reports true. The provider is
// walked entry by entry, so a value stored under several keys is tested, and
// possibly returned, once per key. The result is nil when nothing matches.
func (p *XMLProvider[T]) Find(fn func(T) bool) []T {
	var matches []T
	p.Items.Range(func(_, raw any) bool {
		if candidate := raw.(T); fn(candidate) {
			matches = append(matches, candidate)
		}
		return true
	})
	return matches
}
// FindKey returns the values whose key satisfies fn. Keys are passed to fn
// as strings. One value is returned per matching key, so a value stored
// under several matching keys appears several times. The result is nil when
// no key matches.
func (p *XMLProvider[T]) FindKey(fn func(string) bool) []T {
	var hits []T
	p.Items.Range(func(rawKey, raw any) bool {
		if !fn(rawKey.(string)) {
			return true
		}
		hits = append(hits, raw.(T))
		return true
	})
	return hits
}
// All returns the value of every entry in the provider. There is one result
// per stored key, so a value stored under several keys appears several
// times. The result is nil when the provider is empty.
func (p *XMLProvider[T]) All() []T {
	var everything []T
	collect := func(_, raw any) bool {
		everything = append(everything, raw.(T))
		return true
	}
	p.Items.Range(collect)
	return everything
}