Added functionality to clean up cached XML collections

This commit is contained in:
Simon Martens
2024-12-02 10:19:18 +01:00
parent 7a6edbf668
commit e6b844cae1
12 changed files with 168 additions and 112 deletions

View File

@@ -1,5 +1,7 @@
package logging package logging
// BUG: logging happens without manual flush, so the messages come from all threads at the same time.
import ( import (
"fmt" "fmt"
"log/slog" "log/slog"

View File

@@ -14,7 +14,6 @@ type Agent struct {
Org bool `xml:"org,attr"` Org bool `xml:"org,attr"`
Identifier Identifier
AnnotationNote AnnotationNote
SerializedItem
} }
func (a Agent) String() string { func (a Agent) String() string {

View File

@@ -11,7 +11,6 @@ type Category struct {
SortName string `xml:"sortiername"` SortName string `xml:"sortiername"`
Identifier Identifier
AnnotationNote AnnotationNote
SerializedItem
} }
func (c Category) String() string { func (c Category) String() string {

View File

@@ -15,7 +15,6 @@ type Issue struct {
Additionals []Additional `xml:"beilage"` Additionals []Additional `xml:"beilage"`
Identifier Identifier
AnnotationNote AnnotationNote
SerializedItem
} }
type Nummer struct { type Nummer struct {

View File

@@ -1,19 +1,7 @@
package xmlprovider package xmlprovider
type SerializedItem struct { type ItemInfo struct {
Source string Source string
Date string Date string
Commit string Commit string
} }
func (si SerializedItem) SetSource(s string) {
si.Source = s
}
func (si SerializedItem) SetDate(d string) {
si.Date = d
}
func (si SerializedItem) SetCommit(c string) {
si.Commit = c
}

View File

@@ -0,0 +1,120 @@
package xmlprovider
import (
"fmt"
"sync"
)
// Library bundles one XMLProvider per entity type of the data set and is
// the single entry point for parsing (Serialize), resetting (Prepare) and
// evicting stale items (Cleanup) across all of them.
type Library struct {
Agents *XMLProvider[Agent]
Places *XMLProvider[Place]
Works *XMLProvider[Work]
Categories *XMLProvider[Category]
Issues *XMLProvider[Issue]
Pieces *XMLProvider[Piece]
}
// String returns a multi-line, human-readable summary of the library,
// one labelled line per provider.
func (l *Library) String() string {
out := "Agents: " + l.Agents.String() + "\n"
out += "Places: " + l.Places.String() + "\n"
out += "Works: " + l.Works.String() + "\n"
out += "Categories: " + l.Categories.String() + "\n"
out += "Issues: " + l.Issues.String() + "\n"
out += "Pieces: " + l.Pieces.String() + "\n"
return out
}
// NewLibrary builds a Library whose providers are each initialized with
// their own list of XML source file paths.
func NewLibrary(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) *Library {
lib := &Library{}
lib.Agents = &XMLProvider[Agent]{Paths: agentpaths}
lib.Places = &XMLProvider[Place]{Paths: placepaths}
lib.Works = &XMLProvider[Work]{Paths: workpaths}
lib.Categories = &XMLProvider[Category]{Paths: categorypaths}
lib.Issues = &XMLProvider[Issue]{Paths: issuepaths}
lib.Pieces = &XMLProvider[Piece]{Paths: piecepaths}
return lib
}
// SetPaths replaces the XML source path lists of all six providers in one
// call. Previously parsed items are left untouched.
func (l *Library) SetPaths(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) {
l.Pieces.Paths = piecepaths
l.Issues.Paths = issuepaths
l.Categories.Paths = categorypaths
l.Works.Paths = workpaths
l.Places.Paths = placepaths
l.Agents.Paths = agentpaths
}
// Serialize parses every configured XML file of every provider, one
// goroutine per file, and blocks until all parses have finished. It then
// starts a background pass that evicts items not re-parsed under the given
// commit.
//
// The loop variable is passed to each goroutine as an explicit argument so
// every goroutine gets its own copy; with pre-Go-1.22 loop semantics the
// captured variable would otherwise be shared by all iterations. Done is
// deferred so a panicking Serialize cannot deadlock wg.Wait.
func (l *Library) Serialize(commit string) {
wg := sync.WaitGroup{}
l.Prepare()
for _, path := range l.Places.Paths {
wg.Add(1)
go func(path string) {
defer wg.Done()
l.Places.Serialize(NewPlaceRoot(), path, commit)
}(path)
}
for _, path := range l.Agents.Paths {
wg.Add(1)
go func(path string) {
defer wg.Done()
l.Agents.Serialize(NewAgentRoot(), path, commit)
}(path)
}
for _, path := range l.Categories.Paths {
wg.Add(1)
go func(path string) {
defer wg.Done()
l.Categories.Serialize(NewCategoryRoot(), path, commit)
}(path)
}
for _, path := range l.Works.Paths {
wg.Add(1)
go func(path string) {
defer wg.Done()
l.Works.Serialize(NewWorkRoot(), path, commit)
}(path)
}
for _, path := range l.Issues.Paths {
wg.Add(1)
go func(path string) {
defer wg.Done()
l.Issues.Serialize(NewIssueRoot(), path, commit)
}(path)
}
for _, path := range l.Pieces.Paths {
wg.Add(1)
go func(path string) {
defer wg.Done()
l.Pieces.Serialize(NewPieceRoot(), path, commit)
}(path)
}
wg.Wait()
// NOTE(review): fire-and-forget — nothing waits for or observes this
// goroutine, so a Serialize call racing with it could overlap with a
// half-finished cleanup. Consider running it synchronously or exposing
// a done signal.
go func() {
l.Cleanup(commit)
}()
}
// TODO: Prepare resets the list of failed parses for a new parse.
// We need to set the logs accordingly.
//
// Prepare must run before each Serialize pass so that stale failure
// records from a previous run do not influence the next Cleanup.
func (l *Library) Prepare() {
l.Agents.Prepare()
l.Places.Prepare()
l.Works.Prepare()
l.Categories.Prepare()
l.Issues.Prepare()
l.Pieces.Prepare()
}
// Cleanup evicts, on every provider, cached items that were not
// re-serialized under the given commit (see XMLProvider.Cleanup for the
// exact deletion rules).
func (l *Library) Cleanup(commit string) {
l.Agents.Cleanup(commit)
l.Places.Cleanup(commit)
l.Works.Cleanup(commit)
l.Categories.Cleanup(commit)
l.Issues.Cleanup(commit)
l.Pieces.Cleanup(commit)
}

View File

@@ -22,7 +22,6 @@ type Piece struct {
Title []string `xml:"titel"` Title []string `xml:"titel"`
Identifier Identifier
AnnotationNote AnnotationNote
SerializedItem
} }
func (p Piece) String() string { func (p Piece) String() string {

View File

@@ -12,7 +12,6 @@ type Place struct {
Geo string `xml:"geonames"` Geo string `xml:"geonames"`
Identifier Identifier
AnnotationNote AnnotationNote
SerializedItem
} }
func (p Place) String() string { func (p Place) String() string {

View File

@@ -13,7 +13,6 @@ type Work struct {
Akteur []AgentRef `xml:"akteur"` Akteur []AgentRef `xml:"akteur"`
Identifier Identifier
AnnotationNote AnnotationNote
SerializedItem
} }
type Citation struct { type Citation struct {

View File

@@ -5,6 +5,7 @@ import (
"fmt" "fmt"
"io" "io"
"os" "os"
"slices"
"sync" "sync"
"time" "time"
@@ -14,9 +15,6 @@ import (
type XMLItem interface { type XMLItem interface {
fmt.Stringer fmt.Stringer
GetIDs() []string GetIDs() []string
SetSource(string)
SetDate(string)
SetCommit(string)
} }
type Collection[T XMLItem] struct { type Collection[T XMLItem] struct {
@@ -28,94 +26,18 @@ type XMLProvider[T XMLItem] struct {
Paths []string Paths []string
// INFO: map is type [string]T // INFO: map is type [string]T
Items sync.Map Items sync.Map
// INFO: map is type [string]ItemInfo
// It keeps information about parsing status of the items.
Infos sync.Map
mu sync.Mutex
failed []string
} }
type Library struct { func (p *XMLProvider[T]) Prepare() {
Agents *XMLProvider[Agent] p.mu.Lock()
Places *XMLProvider[Place] defer p.mu.Unlock()
Works *XMLProvider[Work] p.failed = make([]string, 0)
Categories *XMLProvider[Category]
Issues *XMLProvider[Issue]
Pieces *XMLProvider[Piece]
}
func (l *Library) String() string {
return fmt.Sprintf("Agents: %s\nPlaces: %s\nWorks: %s\nCategories: %s\nIssues: %s\nPieces: %s\n",
l.Agents.String(), l.Places.String(), l.Works.String(), l.Categories.String(), l.Issues.String(), l.Pieces.String())
}
func NewLibrary(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) *Library {
return &Library{
Agents: &XMLProvider[Agent]{Paths: agentpaths},
Places: &XMLProvider[Place]{Paths: placepaths},
Works: &XMLProvider[Work]{Paths: workpaths},
Categories: &XMLProvider[Category]{Paths: categorypaths},
Issues: &XMLProvider[Issue]{Paths: issuepaths},
Pieces: &XMLProvider[Piece]{Paths: piecepaths},
}
}
func (l *Library) SetPaths(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) {
l.Agents.Paths = agentpaths
l.Places.Paths = placepaths
l.Works.Paths = workpaths
l.Categories.Paths = categorypaths
l.Issues.Paths = issuepaths
l.Pieces.Paths = piecepaths
}
func (l *Library) Serialize(commit string) {
wg := sync.WaitGroup{}
for _, path := range l.Places.Paths {
wg.Add(1)
go func() {
l.Places.Serialize(NewPlaceRoot(), path, commit)
wg.Done()
}()
}
for _, path := range l.Agents.Paths {
wg.Add(1)
go func() {
l.Agents.Serialize(NewAgentRoot(), path, commit)
wg.Done()
}()
}
for _, path := range l.Categories.Paths {
wg.Add(1)
go func() {
l.Categories.Serialize(NewCategoryRoot(), path, commit)
wg.Done()
}()
}
for _, path := range l.Works.Paths {
wg.Add(1)
go func() {
l.Works.Serialize(NewWorkRoot(), path, commit)
wg.Done()
}()
}
for _, path := range l.Issues.Paths {
wg.Add(1)
go func() {
l.Issues.Serialize(NewIssueRoot(), path, commit)
wg.Done()
}()
}
for _, path := range l.Pieces.Paths {
wg.Add(1)
go func() {
l.Pieces.Serialize(NewPieceRoot(), path, commit)
wg.Done()
}()
}
wg.Wait()
} }
func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path, commit string) error { func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path, commit string) error {
@@ -124,14 +46,16 @@ func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path, commit st
if err := UnmarshalFile(path, dataholder); err != nil { if err := UnmarshalFile(path, dataholder); err != nil {
logging.Error(err, "Could not unmarshal file: "+path) logging.Error(err, "Could not unmarshal file: "+path)
logging.ParseMessages.ParseErrors <- logging.ParseMessage{MessageType: logging.ErrorMessage, Message: "Could not unmarshal file: " + path} logging.ParseMessages.ParseErrors <- logging.ParseMessage{MessageType: logging.ErrorMessage, Message: "Could not unmarshal file: " + path}
p.mu.Lock()
defer p.mu.Unlock()
p.failed = append(p.failed, path)
return err return err
} }
for _, item := range dataholder.Children() { for _, item := range dataholder.Children() {
item.SetSource(path)
item.SetDate(date)
item.SetCommit(commit)
// INFO: Mostly it's just one ID, so the double loop is not that bad. // INFO: Mostly it's just one ID, so the double loop is not that bad.
for _, id := range item.GetIDs() { for _, id := range item.GetIDs() {
p.Infos.Store(id, ItemInfo{Source: path, Date: date, Commit: commit})
p.Items.Store(id, item) p.Items.Store(id, item)
} }
} }
@@ -214,3 +138,27 @@ func (p *XMLProvider[T]) Everything() []T {
}) })
return items return items
} }
// Cleanup removes cached items whose Info does not carry the current
// commit, i.e. items that were not produced by the latest Serialize run.
//
// How do we tell a deleted item from one whose file merely failed to parse?
// We compare the item's source path against the paths of failed
// serializations:
//   - source not in the failed list: the file parsed cleanly and the item
//     is gone from the data, so it was deleted — evict it.
//   - source in the failed list: unknown whether it still exists — keep it.
//
// Consequence: if all serializations completed, everything stale is evicted.
func (p *XMLProvider[T]) Cleanup(commit string) {
// Snapshot the failed-path list under the mutex: Serialize appends to
// p.failed while holding p.mu (possibly concurrently with this call),
// so an unguarded read here would be a data race.
p.mu.Lock()
failed := slices.Clone(p.failed)
p.mu.Unlock()

todelete := make([]string, 0)
p.Infos.Range(func(key, value interface{}) bool {
info := value.(ItemInfo)
if info.Commit != commit {
if !slices.Contains(failed, info.Source) {
todelete = append(todelete, key.(string))
}
}
return true
})
// Deletion happens outside Range; mutating a sync.Map during Range is
// permitted but collecting keys first keeps the pass predictable.
for _, key := range todelete {
p.Infos.Delete(key)
p.Items.Delete(key)
}
}

3
reset.sh Executable file
View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Wipe the cached GND data and the data git checkout for a fresh start.
rm -rf ./cache_gnd ./data_git

View File

@@ -112,6 +112,7 @@ func (s *Server) Start() {
// TODO: Error handler, which sadly, is global: // TODO: Error handler, which sadly, is global:
ErrorHandler: fiber.DefaultErrorHandler, ErrorHandler: fiber.DefaultErrorHandler,
// WARNING: The app must be run in a console, since this uses environment variables: // WARNING: The app must be run in a console, since this uses environment variables:
// It is not trivial to turn this on, since we need to mark goroutines that can be started only once.
// Prefork: true, // Prefork: true,
StreamRequestBody: false, StreamRequestBody: false,
WriteTimeout: REQUEST_TIMEOUT, WriteTimeout: REQUEST_TIMEOUT,
@@ -156,7 +157,7 @@ func (s *Server) Start() {
srv.Get("/:year?", controllers.GetYear(s.kgpz)) srv.Get("/:year?", controllers.GetYear(s.kgpz))
srv.Get("/:year/:issue/:page?", controllers.GetIssue(s.kgpz)) srv.Get("/:year/:issue/:page?", controllers.GetIssue(s.kgpz))
srv.Get("/:year/:issue/beilage/:subpage?", controllers.GetIssue(s.kgpz)) srv.Get("/:year/:issue/beilage/:page?", controllers.GetIssue(s.kgpz))
s.runner(srv) s.runner(srv)