diff --git a/helpers/logging/logging.go b/helpers/logging/logging.go index bd55866..eb8e977 100644 --- a/helpers/logging/logging.go +++ b/helpers/logging/logging.go @@ -1,5 +1,7 @@ package logging +// BUG: logging happens without a manual flush, so the messages come from all threads at the same time. + import ( "fmt" "log/slog" diff --git a/providers/xmlprovider/agents.go b/providers/xmlprovider/agents.go index 31da437..2e30119 100644 --- a/providers/xmlprovider/agents.go +++ b/providers/xmlprovider/agents.go @@ -14,7 +14,6 @@ type Agent struct { Org bool `xml:"org,attr"` Identifier AnnotationNote - SerializedItem } func (a Agent) String() string { diff --git a/providers/xmlprovider/categories.go b/providers/xmlprovider/categories.go index 90a9d10..c7611f5 100644 --- a/providers/xmlprovider/categories.go +++ b/providers/xmlprovider/categories.go @@ -11,7 +11,6 @@ type Category struct { SortName string `xml:"sortiername"` Identifier AnnotationNote - SerializedItem } func (c Category) String() string { diff --git a/providers/xmlprovider/issues.go b/providers/xmlprovider/issues.go index 1f4ede9..5d76140 100644 --- a/providers/xmlprovider/issues.go +++ b/providers/xmlprovider/issues.go @@ -15,7 +15,6 @@ type Issue struct { Additionals []Additional `xml:"beilage"` Identifier AnnotationNote - SerializedItem } type Nummer struct { diff --git a/providers/xmlprovider/item.go b/providers/xmlprovider/item.go index 02f49c5..7a0a472 100644 --- a/providers/xmlprovider/item.go +++ b/providers/xmlprovider/item.go @@ -1,19 +1,7 @@ package xmlprovider -type SerializedItem struct { +type ItemInfo struct { Source string Date string Commit string } - -func (si SerializedItem) SetSource(s string) { - si.Source = s -} - -func (si SerializedItem) SetDate(d string) { - si.Date = d -} - -func (si SerializedItem) SetCommit(c string) { - si.Commit = c -} diff --git a/providers/xmlprovider/library.go b/providers/xmlprovider/library.go new file mode 100644 index 0000000..957e0ef --- /dev/null 
+++ b/providers/xmlprovider/library.go @@ -0,0 +1,120 @@ +package xmlprovider + +import ( + "fmt" + "sync" +) + +type Library struct { + Agents *XMLProvider[Agent] + Places *XMLProvider[Place] + Works *XMLProvider[Work] + Categories *XMLProvider[Category] + Issues *XMLProvider[Issue] + Pieces *XMLProvider[Piece] +} + +func (l *Library) String() string { + return fmt.Sprintf("Agents: %s\nPlaces: %s\nWorks: %s\nCategories: %s\nIssues: %s\nPieces: %s\n", + l.Agents.String(), l.Places.String(), l.Works.String(), l.Categories.String(), l.Issues.String(), l.Pieces.String()) +} + +func NewLibrary(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) *Library { + return &Library{ + Agents: &XMLProvider[Agent]{Paths: agentpaths}, + Places: &XMLProvider[Place]{Paths: placepaths}, + Works: &XMLProvider[Work]{Paths: workpaths}, + Categories: &XMLProvider[Category]{Paths: categorypaths}, + Issues: &XMLProvider[Issue]{Paths: issuepaths}, + Pieces: &XMLProvider[Piece]{Paths: piecepaths}, + } +} + +func (l *Library) SetPaths(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) { + l.Agents.Paths = agentpaths + l.Places.Paths = placepaths + l.Works.Paths = workpaths + l.Categories.Paths = categorypaths + l.Issues.Paths = issuepaths + l.Pieces.Paths = piecepaths +} + +func (l *Library) Serialize(commit string) { + wg := sync.WaitGroup{} + + l.Prepare() + + for _, path := range l.Places.Paths { + wg.Add(1) + go func() { + l.Places.Serialize(NewPlaceRoot(), path, commit) + wg.Done() + }() + } + + for _, path := range l.Agents.Paths { + wg.Add(1) + go func() { + l.Agents.Serialize(NewAgentRoot(), path, commit) + wg.Done() + }() + } + + for _, path := range l.Categories.Paths { + wg.Add(1) + go func() { + l.Categories.Serialize(NewCategoryRoot(), path, commit) + wg.Done() + }() + } + + for _, path := range l.Works.Paths { + wg.Add(1) + go func() { + l.Works.Serialize(NewWorkRoot(), path, commit) + wg.Done() + }() + } + + for _, 
path := range l.Issues.Paths { + wg.Add(1) + go func() { + l.Issues.Serialize(NewIssueRoot(), path, commit) + wg.Done() + }() + } + + for _, path := range l.Pieces.Paths { + wg.Add(1) + go func() { + l.Pieces.Serialize(NewPieceRoot(), path, commit) + wg.Done() + }() + } + + wg.Wait() + + go func() { + l.Cleanup(commit) + }() +} + +// TODO: Prepare resets the list of failed parses for a new parse. +// We need to set the logs accordingly. +func (l *Library) Prepare() { + l.Agents.Prepare() + l.Places.Prepare() + l.Works.Prepare() + l.Categories.Prepare() + l.Issues.Prepare() + l.Pieces.Prepare() +} + +func (l *Library) Cleanup(commit string) { + l.Agents.Cleanup(commit) + l.Places.Cleanup(commit) + l.Works.Cleanup(commit) + l.Categories.Cleanup(commit) + l.Issues.Cleanup(commit) + l.Pieces.Cleanup(commit) +} diff --git a/providers/xmlprovider/pieces.go b/providers/xmlprovider/pieces.go index 15b27e5..873369a 100644 --- a/providers/xmlprovider/pieces.go +++ b/providers/xmlprovider/pieces.go @@ -22,7 +22,6 @@ type Piece struct { Title []string `xml:"titel"` Identifier AnnotationNote - SerializedItem } func (p Piece) String() string { diff --git a/providers/xmlprovider/places.go b/providers/xmlprovider/places.go index 7288f91..0d86488 100644 --- a/providers/xmlprovider/places.go +++ b/providers/xmlprovider/places.go @@ -12,7 +12,6 @@ type Place struct { Geo string `xml:"geonames"` Identifier AnnotationNote - SerializedItem } func (p Place) String() string { diff --git a/providers/xmlprovider/works.go b/providers/xmlprovider/works.go index 4abcaff..41ea442 100644 --- a/providers/xmlprovider/works.go +++ b/providers/xmlprovider/works.go @@ -13,7 +13,6 @@ type Work struct { Akteur []AgentRef `xml:"akteur"` Identifier AnnotationNote - SerializedItem } type Citation struct { diff --git a/providers/xmlprovider/xmlprovider.go b/providers/xmlprovider/xmlprovider.go index 565d331..75b90dc 100644 --- a/providers/xmlprovider/xmlprovider.go +++ 
b/providers/xmlprovider/xmlprovider.go @@ -5,6 +5,7 @@ import ( "fmt" "io" "os" + "slices" "sync" "time" @@ -14,9 +15,6 @@ import ( type XMLItem interface { fmt.Stringer GetIDs() []string - SetSource(string) - SetDate(string) - SetCommit(string) } type Collection[T XMLItem] struct { @@ -28,94 +26,18 @@ type XMLProvider[T XMLItem] struct { Paths []string // INFO: map is type [string]T Items sync.Map + // INFO: map is type [string]ItemInfo + // It keeps information about parsing status of the items. + Infos sync.Map + + mu sync.Mutex + failed []string } -type Library struct { - Agents *XMLProvider[Agent] - Places *XMLProvider[Place] - Works *XMLProvider[Work] - Categories *XMLProvider[Category] - Issues *XMLProvider[Issue] - Pieces *XMLProvider[Piece] -} - -func (l *Library) String() string { - return fmt.Sprintf("Agents: %s\nPlaces: %s\nWorks: %s\nCategories: %s\nIssues: %s\nPieces: %s\n", - l.Agents.String(), l.Places.String(), l.Works.String(), l.Categories.String(), l.Issues.String(), l.Pieces.String()) -} - -func NewLibrary(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) *Library { - return &Library{ - Agents: &XMLProvider[Agent]{Paths: agentpaths}, - Places: &XMLProvider[Place]{Paths: placepaths}, - Works: &XMLProvider[Work]{Paths: workpaths}, - Categories: &XMLProvider[Category]{Paths: categorypaths}, - Issues: &XMLProvider[Issue]{Paths: issuepaths}, - Pieces: &XMLProvider[Piece]{Paths: piecepaths}, - } -} - -func (l *Library) SetPaths(agentpaths, placepaths, workpaths, categorypaths, issuepaths, piecepaths []string) { - l.Agents.Paths = agentpaths - l.Places.Paths = placepaths - l.Works.Paths = workpaths - l.Categories.Paths = categorypaths - l.Issues.Paths = issuepaths - l.Pieces.Paths = piecepaths -} - -func (l *Library) Serialize(commit string) { - wg := sync.WaitGroup{} - - for _, path := range l.Places.Paths { - wg.Add(1) - go func() { - l.Places.Serialize(NewPlaceRoot(), path, commit) - wg.Done() - }() - } - - for _, 
path := range l.Agents.Paths { - wg.Add(1) - go func() { - l.Agents.Serialize(NewAgentRoot(), path, commit) - wg.Done() - }() - } - - for _, path := range l.Categories.Paths { - wg.Add(1) - go func() { - l.Categories.Serialize(NewCategoryRoot(), path, commit) - wg.Done() - }() - } - - for _, path := range l.Works.Paths { - wg.Add(1) - go func() { - l.Works.Serialize(NewWorkRoot(), path, commit) - wg.Done() - }() - } - - for _, path := range l.Issues.Paths { - wg.Add(1) - go func() { - l.Issues.Serialize(NewIssueRoot(), path, commit) - wg.Done() - }() - } - - for _, path := range l.Pieces.Paths { - wg.Add(1) - go func() { - l.Pieces.Serialize(NewPieceRoot(), path, commit) - wg.Done() - }() - } - - wg.Wait() +func (p *XMLProvider[T]) Prepare() { + p.mu.Lock() + defer p.mu.Unlock() + p.failed = make([]string, 0) } func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path, commit string) error { @@ -124,14 +46,16 @@ func (p *XMLProvider[T]) Serialize(dataholder XMLRootElement[T], path, commit st if err := UnmarshalFile(path, dataholder); err != nil { logging.Error(err, "Could not unmarshal file: "+path) logging.ParseMessages.ParseErrors <- logging.ParseMessage{MessageType: logging.ErrorMessage, Message: "Could not unmarshal file: " + path} + p.mu.Lock() + defer p.mu.Unlock() + p.failed = append(p.failed, path) return err } + for _, item := range dataholder.Children() { - item.SetSource(path) - item.SetDate(date) - item.SetCommit(commit) // INFO: Mostly it's just one ID, so the double loop is not that bad. for _, id := range item.GetIDs() { + p.Infos.Store(id, ItemInfo{Source: path, Date: date, Commit: commit}) p.Items.Store(id, item) } } @@ -214,3 +138,27 @@ func (p *XMLProvider[T]) Everything() []T { }) return items } + +// TODO: how to find that the item was deleted, and couldn't just be serialized? +// -> We compare filepaths of failed serializations with filepaths of the items. +// - If the item is not in the failed serializations, it was deleted. 
+// - If the item is in the failed serializations, we don't know if it was deleted or not, and we keep it. +// +// Consequence: If all serializations completed, we clean up everything. +func (p *XMLProvider[T]) Cleanup(commit string) { + todelete := make([]string, 0) + p.Infos.Range(func(key, value interface{}) bool { + info := value.(ItemInfo) + if info.Commit != commit { + if !slices.Contains(p.failed, info.Source) { + todelete = append(todelete, key.(string)) + } + } + return true + }) + + for _, key := range todelete { + p.Infos.Delete(key) + p.Items.Delete(key) + } +} diff --git a/reset.sh b/reset.sh new file mode 100755 index 0000000..37ec9d9 --- /dev/null +++ b/reset.sh @@ -0,0 +1,3 @@ +#!/bin/bash +rm -rf ./cache_gnd +rm -rf ./data_git diff --git a/server/server.go b/server/server.go index cefc348..bebc760 100644 --- a/server/server.go +++ b/server/server.go @@ -112,6 +112,7 @@ func (s *Server) Start() { // TODO: Error handler, which sadly, is global: ErrorHandler: fiber.DefaultErrorHandler, // WARNING: The app must be run in a console, since this uses environment variables: + // It is not trivial to turn this on, since we need to mark goroutines that can be started only once. // Prefork: true, StreamRequestBody: false, WriteTimeout: REQUEST_TIMEOUT, @@ -156,7 +157,7 @@ func (s *Server) Start() { srv.Get("/:year?", controllers.GetYear(s.kgpz)) srv.Get("/:year/:issue/:page?", controllers.GetIssue(s.kgpz)) - srv.Get("/:year/:issue/beilage/:subpage?", controllers.GetIssue(s.kgpz)) + srv.Get("/:year/:issue/beilage/:page?", controllers.GetIssue(s.kgpz)) s.runner(srv)