diff --git a/app/kgpz.go b/app/kgpz.go index 097aef6..cb76b38 100644 --- a/app/kgpz.go +++ b/app/kgpz.go @@ -2,13 +2,13 @@ package app import ( "os" - "path/filepath" "sync" "github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers" "github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging" "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers" "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/gnd" + "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider" "github.com/Theodor-Springmann-Stiftung/kgpz_web/xmlmodels" ) @@ -86,22 +86,18 @@ func (k *KGPZ) Serialize() { k.gmu.Lock() defer k.gmu.Unlock() - commit := "staticfile" + commit := "" + source := xmlprovider.Path if k.Repo != nil { commit = k.Repo.Commit + source = xmlprovider.Commit } - issues, err := getXMLFiles(filepath.Join(k.Config.FolderPath, ISSUES_DIR)) - helpers.Assert(err, "Error getting issues") - - pieces, err := getXMLFiles(filepath.Join(k.Config.FolderPath, PIECES_DIR)) - helpers.Assert(err, "Error getting pieces") - if k.Library == nil { - k.Library = xmlmodels.NewLibrary(k.Config.FolderPath) + k.Library = xmlmodels.NewLibrary() } - k.Library.Serialize(commit) + k.Library.Parse(source, k.Config.FolderPath, commit) } func (k *KGPZ) IsDebug() bool { diff --git a/providers/xmlprovider/item.go b/providers/xmlprovider/item.go index cd1b5af..f2eb3b0 100644 --- a/providers/xmlprovider/item.go +++ b/providers/xmlprovider/item.go @@ -2,7 +2,7 @@ package xmlprovider type ItemInfo struct { Source string - Parse *ParseMeta + Parse ParseMeta } // INFO: These are just root elements that hold the data of the XML files. diff --git a/providers/xmlprovider/resolver.go b/providers/xmlprovider/resolver.go new file mode 100644 index 0000000..9106585 --- /dev/null +++ b/providers/xmlprovider/resolver.go @@ -0,0 +1,44 @@ +package xmlprovider + +// INFO: This is used to resolve references (back-links) between XML items. + +import ( + "fmt" + "sync" +) + +type ReferenceResolver interface { + GetReferences() map[string][]string +} + +type Resolver[T XMLItem] struct { + index map[string]map[string][]*T // Map[typeName][refID] -> []*T + mu sync.Mutex // Synchronization for thread safety +} + +func NewResolver[T XMLItem]() *Resolver[T] { + return &Resolver[T]{index: make(map[string]map[string][]*T)} +} + +func (r *Resolver[T]) Add(typeName, refID string, item *T) { + r.mu.Lock() + defer r.mu.Unlock() + + if _, exists := r.index[typeName]; !exists { + r.index[typeName] = make(map[string][]*T) + } + r.index[typeName][refID] = append(r.index[typeName][refID], item) +} + +func (r *Resolver[T]) Get(typeName, refID string) ([]*T, error) { + r.mu.Lock() + defer r.mu.Unlock() + + if typeIndex, exists := r.index[typeName]; exists { + if items, ok := typeIndex[refID]; ok { + return items, nil + } + return nil, fmt.Errorf("no references found for refID '%s' of type '%s'", refID, typeName) + } + return nil, fmt.Errorf("no index exists for type '%s'", typeName) +} diff --git a/providers/xmlprovider/xmlprovider.go b/providers/xmlprovider/xmlprovider.go index c04e0fb..935ac78 100644 --- a/providers/xmlprovider/xmlprovider.go +++ b/providers/xmlprovider/xmlprovider.go @@ -9,14 +9,40 @@ import ( "github.com/Theodor-Springmann-Stiftung/kgpz_web/helpers/logging" ) +type ParseSource int + +const ( + Unknown ParseSource = iota + Path + Commit +) + type ParseMeta struct { - Commit string - Date time.Time + Source ParseSource + BaseDir string + Commit string + Date time.Time + + FailedPaths []string +} + +func (p ParseMeta) Equals(other ParseMeta) bool { + return p.Source == other.Source && p.BaseDir == other.BaseDir && p.Commit == other.Commit && p.Date == other.Date +} + +func (p ParseMeta) Failed(path string) bool { + return slices.Contains(p.FailedPaths, path) } type XMLItem interface { fmt.Stringer Keys() []string + Name() string +} + +type ILibrary interface { + Parse(meta ParseMeta) error + Latest() *ParseMeta } // An XMLProvider is a struct that holds holds serialized XML data of a specific type. It combines multiple parses IF a succeeded parse can not serialize the data from a path. @@ -27,54 +53,54 @@ type XMLProvider[T XMLItem] struct { // It keeps information about parsing status of the items. Infos sync.Map + // INFO: Resolver is used to resolve references (back-links) between XML items. + Resolver Resolver[T] + mu sync.Mutex // TODO: This array is meant to be for iteration purposes, since iteration over the sync.Map is slow. // It is best for this array to be sorted by key of the corresponding item. - Array []T - Previous []T - failed []string - parses []ParseMeta + Array []T +} + +func NewXMLProvider[T XMLItem]() *XMLProvider[T] { + return &XMLProvider[T]{Resolver: *NewResolver[T]()} } // INFO: To parse sth, we call Prepare, then Serialize, then Cleanup. // Prepare & Cleanup are called once per parse. Serialize is called for every path. // and can be called concurretly. -func (p *XMLProvider[T]) Prepare(commit string) { +func (p *XMLProvider[T]) Prepare() { p.mu.Lock() defer p.mu.Unlock() - p.Previous = p.Array - p.Array = make([]T, len(p.Previous)) - p.failed = make([]string, 0) - p.parses = append(p.parses, ParseMeta{Commit: commit, Date: time.Now()}) + p.Array = make([]T, 1000) } -func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path string) error { +func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path string, latest ParseMeta) error { if err := UnmarshalFile(path, dataholder); err != nil { logging.Error(err, "Could not unmarshal file: "+path) logging.ParseMessages.LogError(logging.Unknown, path, "", "Could not unmarshal file.") - p.mu.Lock() - defer p.mu.Unlock() - p.failed = append(p.failed, path) return err } p.mu.Lock() defer p.mu.Unlock() - - if len(p.parses) == 0 { - logging.Error(fmt.Errorf("No commit set"), "No commit set") - return fmt.Errorf("No commit set") - } - - commit := &p.parses[len(p.parses)-1] newItems := dataholder.Children() for _, item := range newItems { // INFO: Mostly it's just one ID, so the double loop is not that bad. for _, id := range item.Keys() { - p.Infos.Store(id, ItemInfo{Source: path, Parse: commit}) + p.Infos.Store(id, ItemInfo{Source: path, Parse: latest}) p.Items.Store(id, &item) } + + // INFO: If the item has a GetReferences method, we add the references to the resolver. + if refResolver, ok := any(item).(ReferenceResolver); ok { + for refType, ids := range refResolver.GetReferences() { + for _, refID := range ids { + p.Resolver.Add(refType, refID, &item) + } + } + } } p.Array = append(p.Array, newItems...) @@ -84,22 +110,16 @@ func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path string) er // INFO: Cleanup is called after all paths have been serialized. // It deletes all items that have not been parsed in the last commit, // and whose filepath has not been marked as failed. -func (p *XMLProvider[T]) Cleanup() { +func (p *XMLProvider[T]) Cleanup(latest ParseMeta) { p.mu.Lock() defer p.mu.Unlock() - if len(p.parses) == 0 { - logging.Error(fmt.Errorf("Trying to cleanup an empty XMLProvider.")) - return - } - - lastcommit := &p.parses[len(p.parses)-1] todelete := make([]string, 0) toappend := make([]*T, 0) p.Infos.Range(func(key, value interface{}) bool { info := value.(ItemInfo) - if info.Parse != lastcommit { - if !slices.Contains(p.failed, info.Source) { + if !info.Parse.Equals(latest) { + if !latest.Failed(info.Source) { todelete = append(todelete, key.(string)) } else { item, ok := p.Items.Load(key) @@ -124,6 +144,23 @@ func (p *XMLProvider[T]) Cleanup() { } } +func (p *XMLProvider[T]) ReverseLookup(item XMLItem) ([]*T, error) { + keys := item.Keys() + + if len(keys) == 0 { + return nil, fmt.Errorf("Item has no keys") + } + + for _, key := range keys { + ret, err := p.Resolver.Get(item.Name(), key) + if err != nil { + return ret, nil + } + } + + return []*T{}, nil +} + func (a *XMLProvider[T]) String() string { var s string a.Items.Range(func(key, value interface{}) bool { diff --git a/xmlmodels/agents.go b/xmlmodels/agents.go index c8b3613..6447489 100644 --- a/xmlmodels/agents.go +++ b/xmlmodels/agents.go @@ -1,8 +1,8 @@ package xmlmodels import ( + "encoding/json" "encoding/xml" - "fmt" ) type Agent struct { @@ -16,6 +16,11 @@ type Agent struct { AnnotationNote } -func (a Agent) String() string { - return fmt.Sprintf("ID: %s\nNames: %v\nSortName: %s\nLife: %s\nGND: %s\nAnnotations: %v\nNotes: %v\n", a.ID, a.Names, a.SortName, a.Life, a.GND, a.Annotations, a.Notes) +func (a Agent) Name() string { + return "agent" +} + +func (a Agent) String() string { + data, _ := json.MarshalIndent(a, "", " ") + return string(data) } diff --git a/xmlmodels/categories.go b/xmlmodels/categories.go index 9dcef8c..98a3f66 100644 --- a/xmlmodels/categories.go +++ b/xmlmodels/categories.go @@ -1,8 +1,8 @@ package xmlmodels import ( + "encoding/json" "encoding/xml" - "fmt" ) type Category struct { @@ -13,6 +13,11 @@ type Category struct { AnnotationNote } -func (c Category) String() string { - return fmt.Sprintf("ID: %s\nNames: %v\nSortName: %s\nAnnotations: %v\nNotes: %v\n", c.ID, c.Names, c.SortName, c.Annotations, c.Notes) +func (c Category) Name() string { + return "category" +} + +func (c Category) String() string { + data, _ := json.MarshalIndent(c, "", " ") + return string(data) } diff --git a/xmlmodels/common.go b/xmlmodels/common.go index ea393ab..bc49551 100644 --- a/xmlmodels/common.go +++ b/xmlmodels/common.go @@ -55,10 +55,9 @@ type Identifier struct { } func (i Identifier) Keys() []string { - if len(i.keys) > 0 { - return i.keys + if len(i.keys) == 0 { + i.keys = append(i.keys, i.ID) } - i.keys = []string{i.ID} return i.keys } diff --git a/xmlmodels/helpers.go b/xmlmodels/helpers.go index 11f8699..998bc67 100644 --- a/xmlmodels/helpers.go +++ b/xmlmodels/helpers.go @@ -5,16 +5,6 @@ import ( "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider" ) -const ( - AGENTS_PATH = "XML/akteure.xml" - PLACES_PATH = "XML/orte.xml" - WORKS_PATH = "XML/werke.xml" - CATEGORIES_PATH = "XML/kategorien.xml" - - ISSUES_DIR = "XML/stuecke/" - PIECES_DIR = "XML/beitraege/" -) - func AgentsIntoDataset(provider *xmlprovider.XMLProvider[Agent]) []gnd.GNDData { provider.Lock() defer provider.Unlock() diff --git a/xmlmodels/issues.go b/xmlmodels/issues.go index 11a38f2..6ad2f45 100644 --- a/xmlmodels/issues.go +++ b/xmlmodels/issues.go @@ -1,8 +1,8 @@ package xmlmodels import ( + "encoding/json" "encoding/xml" - "fmt" "strconv" ) @@ -29,6 +29,10 @@ type Additional struct { Bis int `xml:"bis"` } +func (i Issue) Name() string { + return "issue" +} + func (i Issue) Keys() []string { if len(i.keys) > 0 { return i.keys @@ -55,5 +59,6 @@ func (i Issue) Reference() string { } func (i Issue) String() string { - return fmt.Sprintf("Number: %v, Datum: %v, Von: %d, Bis: %d, Additionals: %v, Identifier: %v, AnnotationNote: %v\n", i.Number, i.Datum, i.Von, i.Bis, i.Additionals, i.Identifier, i.AnnotationNote) + data, _ := json.MarshalIndent(i, "", " ") + return string(data) } diff --git a/xmlmodels/library.go b/xmlmodels/library.go index 9dd2256..9a52976 100644 --- a/xmlmodels/library.go +++ b/xmlmodels/library.go @@ -3,13 +3,26 @@ package xmlmodels import ( "fmt" "path/filepath" + "strings" "sync" + "time" "github.com/Theodor-Springmann-Stiftung/kgpz_web/providers/xmlprovider" ) +const ( + AGENTS_PATH = "XML/akteure.xml" + PLACES_PATH = "XML/orte.xml" + WORKS_PATH = "XML/werke.xml" + CATEGORIES_PATH = "XML/kategorien.xml" + + ISSUES_DIR = "XML/stuecke/" + PIECES_DIR = "XML/beitraege/" +) + type Library struct { - baseDir string + mu sync.Mutex + Parses []xmlprovider.ParseMeta Agents *xmlprovider.XMLProvider[Agent] Places *xmlprovider.XMLProvider[Place] @@ -25,9 +38,8 @@ func (l *Library) String() string { } // INFO: this is the only place where the providers are created. There is no need for locking on access. -func NewLibrary(basedir string) *Library { +func NewLibrary() *Library { return &Library{ - baseDir: basedir, Agents: &xmlprovider.XMLProvider[Agent]{}, Places: &xmlprovider.XMLProvider[Place]{}, Works: &xmlprovider.XMLProvider[Work]{}, @@ -37,97 +49,149 @@ func NewLibrary(basedir string) *Library { } } -func (l *Library) Serialize(commit string) { +func (l *Library) Parse(source xmlprovider.ParseSource, baseDir, commit string) error { + // INFO: this lock prevents multiple parses from happening at the same time. + l.mu.Lock() + defer l.mu.Unlock() + wg := sync.WaitGroup{} + meta := xmlprovider.ParseMeta{ + Source: source, + BaseDir: baseDir, + Commit: commit, + Date: time.Now(), + } + metamu := sync.Mutex{} - l.Prepare(commit) + l.prepare() wg.Add(1) go func() { - l.Places.Serialize(&PlaceRoot{}, filepath.Join(l.baseDir, PLACES_PATH)) + err := l.Places.Serialize(&PlaceRoot{}, filepath.Join(meta.BaseDir, PLACES_PATH), meta) + if err != nil { + metamu.Lock() + meta.FailedPaths = append(meta.FailedPaths, filepath.Join(meta.BaseDir, PLACES_PATH)) + metamu.Unlock() + } wg.Done() }() wg.Add(1) go func() { - l.Agents.Serialize(&AgentRoot{}, filepath.Join(l.baseDir, AGENTS_PATH)) + err := l.Agents.Serialize(&AgentRoot{}, filepath.Join(meta.BaseDir, AGENTS_PATH), meta) + if err != nil { + metamu.Lock() + meta.FailedPaths = append(meta.FailedPaths, filepath.Join(meta.BaseDir, AGENTS_PATH)) + metamu.Unlock() + } wg.Done() }() wg.Add(1) go func() { - l.Categories.Serialize(&CategoryRoot{}, filepath.Join(l.baseDir, CATEGORIES_PATH)) + err := l.Categories.Serialize(&CategoryRoot{}, filepath.Join(meta.BaseDir, CATEGORIES_PATH), meta) + if err != nil { + metamu.Lock() + meta.FailedPaths = append(meta.FailedPaths, filepath.Join(meta.BaseDir, CATEGORIES_PATH)) + metamu.Unlock() + } wg.Done() }() wg.Add(1) go func() { - l.Works.Serialize(&WorkRoot{}, filepath.Join(l.baseDir, WORKS_PATH)) + err := l.Works.Serialize(&WorkRoot{}, filepath.Join(meta.BaseDir, WORKS_PATH), meta) + if err != nil { + metamu.Lock() + meta.FailedPaths = append(meta.FailedPaths, filepath.Join(meta.BaseDir, WORKS_PATH)) + metamu.Unlock() + } wg.Done() }() - issuepaths, _ := xmlprovider.XMLFilesForPath(filepath.Join(l.baseDir, ISSUES_DIR)) + issuepaths, _ := xmlprovider.XMLFilesForPath(filepath.Join(meta.BaseDir, ISSUES_DIR)) for _, path := range issuepaths { wg.Add(1) go func() { - l.Issues.Serialize(&IssueRoot{}, path) + err := l.Issues.Serialize(&IssueRoot{}, path, meta) + if err != nil { + metamu.Lock() + meta.FailedPaths = append(meta.FailedPaths, path) + metamu.Unlock() + } wg.Done() }() } - piecepaths, _ := xmlprovider.XMLFilesForPath(filepath.Join(l.baseDir, PIECES_DIR)) + piecepaths, _ := xmlprovider.XMLFilesForPath(filepath.Join(meta.BaseDir, PIECES_DIR)) for _, path := range piecepaths { wg.Add(1) go func() { - l.Pieces.Serialize(&PieceRoot{}, path) + err := l.Pieces.Serialize(&PieceRoot{}, path, meta) + if err != nil { + metamu.Lock() + meta.FailedPaths = append(meta.FailedPaths, path) + metamu.Unlock() + } wg.Done() }() } wg.Wait() - l.Cleanup() + + l.cleanup(meta) + l.Parses = append(l.Parses, meta) + + var errors []string + if len(meta.FailedPaths) > 0 { + errors = append(errors, fmt.Sprintf("Failed paths: %v", meta.FailedPaths)) + } + if len(errors) > 0 { + return fmt.Errorf("Parsing encountered errors: %v", strings.Join(errors, "; ")) + } + return nil } -func (l *Library) Prepare(commit string) { - l.Agents.Prepare(commit) - l.Places.Prepare(commit) - l.Works.Prepare(commit) - l.Categories.Prepare(commit) - l.Issues.Prepare(commit) - l.Pieces.Prepare(commit) +func (l *Library) prepare() { + l.Agents.Prepare() + l.Places.Prepare() + l.Works.Prepare() + l.Categories.Prepare() + l.Issues.Prepare() + l.Pieces.Prepare() } -func (l *Library) Cleanup() { +func (l *Library) cleanup(meta xmlprovider.ParseMeta) { wg := sync.WaitGroup{} wg.Add(6) go func() { - l.Agents.Cleanup() + l.Agents.Cleanup(meta) wg.Done() }() go func() { - l.Places.Cleanup() + l.Places.Cleanup(meta) wg.Done() }() go func() { - l.Works.Cleanup() + l.Works.Cleanup(meta) wg.Done() }() go func() { - l.Categories.Cleanup() + l.Categories.Cleanup(meta) wg.Done() }() go func() { - l.Issues.Cleanup() + l.Issues.Cleanup(meta) wg.Done() }() go func() { - l.Pieces.Cleanup() + l.Pieces.Cleanup(meta) wg.Done() }() diff --git a/xmlmodels/pieces.go b/xmlmodels/pieces.go index 6de5b98..662cdcf 100644 --- a/xmlmodels/pieces.go +++ b/xmlmodels/pieces.go @@ -1,8 +1,8 @@ package xmlmodels import ( + "encoding/json" "encoding/xml" - "fmt" "strconv" "strings" @@ -24,8 +24,13 @@ type Piece struct { AnnotationNote } +func (p Piece) Name() string { + return "piece" +} + func (p Piece) String() string { - return fmt.Sprintf("ID: %s\nIssueRefs: %v\nPlaceRefs: %v\nCategoryRefs: %v\nAgentRefs: %v\nWorkRefs: %v\nPieceRefs: %v\nIncipit: %v\nTitle: %v\nAnnotations: %v\nNotes: %v\n", p.ID, p.IssueRefs, p.PlaceRefs, p.CategoryRefs, p.AgentRefs, p.WorkRefs, p.PieceRefs, p.Incipit, p.Title, p.Annotations, p.Notes) + data, _ := json.MarshalIndent(p, "", " ") + return string(data) } func (p Piece) Keys() []string { diff --git a/xmlmodels/places.go b/xmlmodels/places.go index 68dcb52..7f95b90 100644 --- a/xmlmodels/places.go +++ b/xmlmodels/places.go @@ -1,8 +1,8 @@ package xmlmodels import ( + "encoding/json" "encoding/xml" - "fmt" ) type Place struct { @@ -14,6 +14,11 @@ type Place struct { AnnotationNote } -func (p Place) String() string { - return fmt.Sprintf("ID: %s\nNames: %v\nSortName: %s\nGeo: %s\nAnnotations: %v\nNotes: %v\n", p.ID, p.Names, p.SortName, p.Geo, p.Annotations, p.Notes) +func (p Place) Name() string { + return "place" +} + +func (p Place) String() string { + data, _ := json.MarshalIndent(p, "", " ") + return string(data) } diff --git a/xmlmodels/works.go b/xmlmodels/works.go index 5bbe6be..68199f9 100644 --- a/xmlmodels/works.go +++ b/xmlmodels/works.go @@ -1,8 +1,8 @@ package xmlmodels import ( + "encoding/json" "encoding/xml" - "fmt" "strings" ) @@ -16,6 +16,10 @@ type Work struct { AnnotationNote } +func (w Work) Name() string { + return "work" +} + func (p Work) ReferencesAgent(a string) (*AgentRef, bool) { for _, i := range p.AgentRefs { if strings.HasPrefix(i.Ref, a) { @@ -34,5 +38,6 @@ type Citation struct { } func (w Work) String() string { - return fmt.Sprintf("URLs: %v, Citation: %v, PreferredTitle: %s, Akteur: %v, Identifier: %v, AnnotationNote: %v\n", w.URLs, w.Citation, w.PreferredTitle, w.AgentRefs, w.Identifier, w.AnnotationNote) + data, _ := json.MarshalIndent(w, "", " ") + return string(data) }