This commit is contained in:
Simon Martens
2025-03-05 16:41:39 +01:00
commit e19fd47c17
88 changed files with 9765 additions and 0 deletions

37
xml/helpers.go Normal file
View File

@@ -0,0 +1,37 @@
package xmlparsing
import (
"encoding/xml"
"io"
"os"
"path/filepath"
)
// UnmarshalFile reads the whole file at filename and decodes its XML content
// into data.
//
// The decoder receives the address of the local data parameter, so data must
// be a pointer (or an interface holding a pointer) for the decoded values to
// be visible to the caller. Errors from opening, reading or decoding are
// returned unchanged.
func UnmarshalFile[T any](filename string, data T) error {
	file, openErr := os.Open(filename)
	if openErr != nil {
		return openErr
	}
	defer file.Close()

	content, readErr := io.ReadAll(file)
	if readErr != nil {
		return readErr
	}
	return xml.Unmarshal(content, &data)
}
// XMLFilesForPath returns the paths of all entries directly inside path whose
// name matches "*.xml".
//
// If path does not exist, the stat error is returned. Any other stat error is
// ignored and the glob is attempted anyway.
func XMLFilesForPath(path string) ([]string, error) {
	_, statErr := os.Stat(path)
	if os.IsNotExist(statErr) {
		return nil, statErr
	}
	pattern := filepath.Join(path, "*.xml")
	return filepath.Glob(pattern)
}

12
xml/item.go Normal file
View File

@@ -0,0 +1,12 @@
package xmlparsing
// ItemInfo records the provenance of a stored item.
type ItemInfo struct {
// Source is the path of the XML file the item was read from.
Source string
// Parse is the metadata of the parse run that produced the item.
Parse ParseMeta
}
// XMLRootElement is implemented by the root elements that XML files are
// decoded into.
//
// INFO: These are just root elements that hold the data of the XML files.
// They get discarded after a parse.
type XMLRootElement[T any] interface {
// Children returns the items held by the root element.
Children() []T
}

15
xml/library.go Normal file
View File

@@ -0,0 +1,15 @@
package xmlparsing
import "sync"
// Library keeps the history of parse runs.
type Library struct {
// NOTE(review): pmux presumably guards Parses, but no code visible here
// locks it - confirm against the callers.
pmux sync.Mutex
// Parses holds the metadata of the parse runs; Latest treats the last
// element as the most recent one.
Parses []ParseMeta
}
// Latest returns the last entry of Parses, or the zero ParseMeta when no
// parse has been recorded. It does not take pmux.
func (l *Library) Latest() ParseMeta {
	if n := len(l.Parses); n > 0 {
		return l.Parses[n-1]
	}
	return ParseMeta{}
}

32
xml/models.go Normal file
View File

@@ -0,0 +1,32 @@
package xmlparsing
import "fmt"
// IXMLItem is the contract every parsed XML item has to fulfil.
type IXMLItem interface {
fmt.Stringer
// INFO:
// - Keys should be unique
// - Keys[0] has the special meaning of the primary key (for FTS etc.)
Keys() []string
// Type returns the type name of the item; it is used as the first-level
// key when looking up references to the item.
Type() string
}
// ILibrary is implemented by anything that can run a parse described by a
// ParseMeta.
type ILibrary interface {
// Parse runs a parse for meta and reports any failure.
Parse(meta ParseMeta) error
}
// ResolvingMap groups the references an item makes by the type name of the
// referenced items.
type ResolvingMap[T IXMLItem] map[string][]Resolved[T]
// ReferenceResolver is an optional interface for items that reference other
// items; items implementing it get their references indexed for reverse
// lookup.
type ReferenceResolver[T IXMLItem] interface {
// References returns all references the item makes, keyed by the type name
// of the referenced items.
References() ResolvingMap[T]
}
// Resolved describes a single reference from one item to another.
type Resolved[T IXMLItem] struct {
// Item points to the item that makes the reference; it is filled in when
// the reference is added to the resolver.
Item *T
// Reference is the ID of the referenced item.
Reference string
// NOTE(review): the remaining fields are annotations on the reference whose
// exact meaning is not visible in this package - confirm with the data model.
Category string
Cert bool
Conjecture bool
Comment string
MetaData map[string]string
}

49
xml/optionalbool.go Normal file
View File

@@ -0,0 +1,49 @@
package xmlparsing
import (
"encoding/xml"
"strings"
)
// OptionalBool is a tri-state boolean that is (un)marshalled as an XML
// element carrying a "value" attribute.
type OptionalBool int

const (
	// Unspecified is the zero value: no (recognised) value was given.
	Unspecified OptionalBool = iota
	// True corresponds to value="true".
	True
	// False corresponds to value="false".
	False
)

// UnmarshalXML decodes the element and maps its "value" attribute to b.
// The comparison is case-insensitive; anything other than "true" or "false"
// (including a missing attribute) yields Unspecified.
func (b *OptionalBool) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	var elem struct {
		Value string `xml:"value,attr"`
	}
	if err := d.DecodeElement(&elem, &start); err != nil {
		return err
	}

	parsed := Unspecified
	switch strings.ToLower(elem.Value) {
	case "true":
		parsed = True
	case "false":
		parsed = False
	}
	*b = parsed
	return nil
}

// MarshalXML writes b as an element with a "value" attribute of "true" or
// "false". Unspecified writes nothing at all; every value other than True
// and Unspecified is written as "false".
func (b OptionalBool) MarshalXML(e *xml.Encoder, start xml.StartElement) error {
	var text string
	switch b {
	case Unspecified:
		return nil
	case True:
		text = "true"
	default:
		text = "false"
	}

	type valueAttr struct {
		Value string `xml:"value,attr"`
	}
	return e.EncodeElement(valueAttr{Value: text}, start)
}

48
xml/resolver.go Normal file
View File

@@ -0,0 +1,48 @@
package xmlparsing
// INFO: This is used to resolve references (back-links) between XML items.
import (
"fmt"
"sync"
)
// Resolver is an index of references (back-links) between XML items.
// Its methods guard the index with mu.
type Resolver[T IXMLItem] struct {
// INFO: map[type][ID]
// The first key is the type name, the second the ID of the referenced item;
// the value lists every reference made to that item.
index map[string]map[string][]Resolved[T]
mu sync.RWMutex
}
// NewResolver returns a Resolver with an empty, ready-to-use index.
func NewResolver[T IXMLItem]() *Resolver[T] {
	r := new(Resolver[T])
	r.index = make(map[string]map[string][]Resolved[T])
	return r
}
// Add records item as a reference to the entry refID of type typeName.
// The per-type map is created on first use. Add takes the write lock.
func (r *Resolver[T]) Add(typeName, refID string, item Resolved[T]) {
	r.mu.Lock()
	defer r.mu.Unlock()

	byID, known := r.index[typeName]
	if !known {
		byID = make(map[string][]Resolved[T])
		r.index[typeName] = byID
	}
	byID[refID] = append(byID[refID], item)
}
// Get returns all references recorded for refID of type typeName.
// It returns an error when nothing is indexed for the type, or when the type
// is known but no reference to refID exists. Get takes the read lock.
func (r *Resolver[T]) Get(typeName, refID string) ([]Resolved[T], error) {
	r.mu.RLock()
	defer r.mu.RUnlock()

	byID, known := r.index[typeName]
	if !known {
		return nil, fmt.Errorf("no index exists for type '%s'", typeName)
	}
	refs, found := byID[refID]
	if !found {
		return nil, fmt.Errorf("no references found for refID '%s' of type '%s'", refID, typeName)
	}
	return refs, nil
}
// Clear drops every recorded reference by replacing the index with a fresh,
// empty map. Clear takes the write lock.
func (r *Resolver[T]) Clear() {
	r.mu.Lock()
	defer r.mu.Unlock()

	r.index = map[string]map[string][]Resolved[T]{}
}

200
xml/xmlprovider.go Normal file
View File

@@ -0,0 +1,200 @@
package xmlparsing
import (
"slices"
"sync"
"time"
)
// ParseSource identifies what kind of source a parse was run against.
type ParseSource int

const (
	// SourceUnknown is the zero value.
	SourceUnknown ParseSource = iota
	// Path presumably marks a parse of a plain directory - confirm with callers.
	Path
	// Commit presumably marks a parse of a specific commit - confirm with callers.
	Commit
)

// ParseMeta describes a single parse run.
type ParseMeta struct {
	Source  ParseSource
	BaseDir string
	Commit  string
	Date    time.Time
	// FailedPaths lists the file paths that could not be parsed in this run.
	FailedPaths []string
}

// Equals reports whether p and other describe the same parse run.
// FailedPaths is not part of the comparison.
//
// Dates are compared with time.Time.Equal rather than ==, so two values that
// denote the same instant are equal even when they differ in location or
// monotonic clock reading (for example after a round trip through
// serialization).
func (p ParseMeta) Equals(other ParseMeta) bool {
	return p.Source == other.Source &&
		p.BaseDir == other.BaseDir &&
		p.Commit == other.Commit &&
		p.Date.Equal(other.Date)
}

// Failed reports whether path is listed in FailedPaths.
func (p ParseMeta) Failed(path string) bool {
	return slices.Contains(p.FailedPaths, path)
}
// An XMLParser holds the items of one type that were read from XML files.
// Items from an earlier parse are retained when the latest parse failed for
// the path they came from (see Cleanup).
type XMLParser[T IXMLItem] struct {
// INFO: map is type map[string]*T
// Every key of an item maps to a pointer to that item.
Items sync.Map
// INFO: map is type map[string]ItemInfo
// It keeps information about parsing status of the items.
Infos sync.Map
// INFO: Resolver is used to resolve references (back-links) between XML items.
Resolver Resolver[T]
// mu guards Array.
mu sync.RWMutex
// TODO: This array is meant for iteration purposes, since iteration over the sync.Map is slow.
// It is best for this array to be sorted by key of the corresponding item.
Array []T
}
// NewXMLParser returns an empty parser whose resolver is ready for use.
func NewXMLParser[T IXMLItem]() *XMLParser[T] {
	p := &XMLParser[T]{}
	// Set up the resolver in place: Resolver contains a mutex, so building it
	// where it lives avoids copying a lock value.
	p.Resolver.Clear()
	return p
}
// INFO: To parse something, we call Prepare, then Serialize, then Cleanup.
// Prepare & Cleanup are called once per parse. Serialize is called for every
// path and can be called concurrently.
//
// Prepare empties Array (keeping room for as many items as it held before)
// and clears the resolver.
func (p *XMLParser[T]) Prepare() {
	p.mu.Lock()
	defer p.mu.Unlock()

	previous := len(p.Array)
	p.Array = make([]T, 0, previous)
	p.Resolver.Clear()
}
// Serialize decodes the XML file at path into dataholder and registers every
// child item: each of its keys is stored in Infos (with path and latest as
// provenance) and in Items, its references are added to the resolver, and the
// items are appended to Array.
//
// It returns the error of reading or decoding the file; in that case nothing
// is registered.
func (p *XMLParser[T]) Serialize(dataholder XMLRootElement[T], path string, latest ParseMeta) error {
	if err := UnmarshalFile(path, dataholder); err != nil {
		return err
	}

	newItems := dataholder.Children()
	for _, item := range newItems {
		// Take an explicit per-iteration copy before storing its address.
		// Before Go 1.22 the range variable is shared by all iterations, so
		// storing &item would leave every key pointing at the last item.
		stored := item
		// INFO: Mostly it's just one ID, so the double loop is not that bad.
		for _, id := range stored.Keys() {
			p.Infos.Store(id, ItemInfo{Source: path, Parse: latest})
			p.Items.Store(id, &stored)
		}
		p.addResolvable(stored)
	}

	p.mu.Lock()
	defer p.mu.Unlock()
	p.Array = append(p.Array, newItems...)
	return nil
}
// INFO: Cleanup is called after all paths have been serialized.
//
// Items that were not produced by the latest parse are handled in one of two
// ways: if their source file is not marked as failed in latest, they are
// deleted from Infos and Items; otherwise they are kept and re-added to Array
// and to the resolver. Finally Array is sorted with Sort.
func (p *XMLParser[T]) Cleanup(latest ParseMeta) {
	var stale []string
	var retained []*T

	p.Infos.Range(func(key, value any) bool {
		info := value.(ItemInfo)
		switch {
		case info.Parse.Equals(latest):
			// Produced by the latest parse: nothing to do.
		case !latest.Failed(info.Source):
			stale = append(stale, key.(string))
		default:
			// An item can be stored under several keys; keep it only once.
			if loaded, found := p.Items.Load(key); found {
				if ptr := loaded.(*T); !slices.Contains(retained, ptr) {
					retained = append(retained, ptr)
				}
			}
		}
		return true
	})

	for _, key := range stale {
		p.Infos.Delete(key)
		p.Items.Delete(key)
	}

	p.mu.Lock()
	defer p.mu.Unlock()
	for _, ptr := range retained {
		p.Array = append(p.Array, *ptr)
		p.addResolvable(*ptr)
	}
	slices.SortFunc(p.Array, Sort)
}
// addResolvable indexes the references of item in the resolver.
// Items that do not implement ReferenceResolver are ignored. Every indexed
// reference gets its Item field pointed at this function's copy of item.
func (p *XMLParser[T]) addResolvable(item T) {
	source, ok := any(item).(ReferenceResolver[T])
	if !ok {
		return
	}
	for typeName, refs := range source.References() {
		for i := range refs {
			ref := refs[i]
			ref.Item = &item
			p.Resolver.Add(typeName, ref.Reference, ref)
		}
	}
}
// ReverseLookup collects the references made to item, looking up every one of
// its keys under item.Type(). Keys without references are skipped. The result
// is never nil.
func (p *XMLParser[T]) ReverseLookup(item IXMLItem) []Resolved[T] {
	found := []Resolved[T]{}
	for _, key := range item.Keys() {
		if refs, err := p.Resolver.Get(item.Type(), key); err == nil {
			found = append(found, refs...)
		}
	}
	return found
}
// String concatenates the String output of every item in Array, in order.
// It reads Array without taking the lock.
func (p *XMLParser[T]) String() string {
	// Accumulate into one growing buffer instead of re-copying the whole
	// string on every iteration.
	var buf []byte
	for _, item := range p.Array {
		buf = append(buf, item.String()...)
	}
	return string(buf)
}
// Info returns the ItemInfo stored for id, or the zero ItemInfo when id is
// unknown.
func (p *XMLParser[T]) Info(id string) ItemInfo {
	if stored, found := p.Infos.Load(id); found {
		return stored.(ItemInfo)
	}
	return ItemInfo{}
}
// Item returns the item stored under id, or nil when id is unknown.
func (p *XMLParser[T]) Item(id string) *T {
	if stored, found := p.Items.Load(id); found {
		return stored.(*T)
	}
	return nil
}
// Find returns copies of all items in Array for which fn reports true, in
// Array order. fn receives a pointer to a copy of the item, not to the
// element inside Array. The result is nil when nothing matches. Find holds
// the read lock while it runs.
func (p *XMLParser[T]) Find(fn func(*T) bool) []T {
	p.mu.RLock()
	defer p.mu.RUnlock()

	var matches []T
	items := p.Array
	for i := range items {
		candidate := items[i]
		if fn(&candidate) {
			matches = append(matches, candidate)
		}
	}
	return matches
}
// INFO: These are only reading locks.
//
// Lock acquires the read lock of the parser (not the write lock, despite the
// name). Every call must be paired with a call to Unlock.
func (p *XMLParser[T]) Lock() {
p.mu.RLock()
}
// Unlock releases the read lock acquired by Lock.
func (p *XMLParser[T]) Unlock() {
p.mu.RUnlock()
}

77
xml/xmlsort.go Normal file
View File

@@ -0,0 +1,77 @@
package xmlparsing
import (
"strconv"
"strings"
)
// Sort is a comparison function (negative, zero, positive) ordering items by
// their primary key, Keys()[0].
//
// Items without keys sort before items with keys. The primary keys are split
// at "-" and compared segment by segment: if both segments are integers they
// are compared numerically, otherwise as plain strings. When one key is a
// prefix of the other, the shorter key sorts first.
//
// INFO: the string comparison is a bit lazy since
//   - it compares byte values, not unicode code points
//   - it is case sensitive
func Sort[T IXMLItem](i, j T) int {
	keysA, keysB := i.Keys(), j.Keys()
	switch {
	case len(keysA) == 0 && len(keysB) == 0:
		return 0
	case len(keysA) == 0:
		return -1
	case len(keysB) == 0:
		return 1
	}

	partsA := strings.Split(keysA[0], "-")
	partsB := strings.Split(keysB[0], "-")
	for idx, partA := range partsA {
		if idx >= len(partsB) {
			return 1
		}
		partB := partsB[idx]

		numA, errA := strconv.Atoi(partA)
		numB, errB := strconv.Atoi(partB)
		if errA != nil || errB != nil {
			// At least one segment is not a number: compare as strings.
			if c := strings.Compare(partA, partB); c != 0 {
				return c
			}
			continue
		}

		switch {
		case numA < numB:
			return -1
		case numA > numB:
			return 1
		}
	}

	if len(partsB) > len(partsA) {
		return -1
	}
	return 0
}

371
xml/xsdtime.go Normal file
View File

@@ -0,0 +1,371 @@
package xmlparsing
import (
"errors"
"fmt"
"strconv"
"strings"
)
// An implementation of the xsd 1.1 datatypes:
// date, gDay, gMonth, gMonthDay, gYear, gYearMonth.

// XSDDatetype names the XSD datatype a parsed value was classified as.
type XSDDatetype int
// Seperator is a single separator byte.
// NOTE(review): not used by any code visible in this file.
type Seperator byte
// Default component values and the ASCII byte values the parser compares
// input bytes against.
const (
DEFAULT_YEAR = 0
DEFAULT_DAY = 1
DEFAULT_MONTH = 1
MIN_ALLOWED_NUMBER = 0x30 // '0', lowest digit
MAX_ALLOWED_NUMBER = 0x39 // '9', highest digit
SIGN = 0x2D // '-', sign of a year or timezone offset
SEPERATOR = 0x2D // '-', separator between date components
PLUS = 0x2B // '+', sign of a timezone offset
COLON = 0x3A // ':', separator inside a timezone offset
TIMEZONE = 0x5A // 'Z', the UTC timezone marker
NONE = 0x00 // NUL byte
)
// The classifications a parsed XSDDate can have.
const (
// Unknown is the zero value: the value has not been classified yet.
Unknown XSDDatetype = iota
// Invalid marks a value that failed parsing or validation.
Invalid
// Date has year, month and day.
Date
// GDay has only a day.
GDay
// GMonth has only a month.
GMonth
// GYear has only a year.
GYear
// GMonthDay has month and day.
GMonthDay
// GYearMonth has year and month.
GYearMonth
)
// XSDDate is a parsed XSD date-like value (date, gYear, gYearMonth, gMonth,
// gMonthDay or gDay) with an optional timezone.
type XSDDate struct {
// base is the trimmed input string the value was parsed from.
base string
Year int
Month int
Day int
// hasTimezone, hasYear, hasMonth and hasDay record which components were
// present in the parsed input.
hasTimezone bool
hasYear bool
hasMonth bool
hasDay bool
// TZH and TZM are the hours (signed) and minutes of the timezone offset.
TZH int
TZM int
// state caches the classification of the value.
state XSDDatetype
// error is set when parsing failed.
error bool
// INFO: XSD Date Datatypes typically describe a duration in the value space.
// TimeError bool
// BaseTime time.Time
// BaseDuration time.Duration
}
// Sanity checks (design notes; NOTE(review): not every item is enforced by
// the code visible in this file, e.g. the zero-padding check):
//
// MONTH DAY + Date: Sanity check Month and Day. Additional checks:
//   - Month: 2 - Day < 30
//   - Month: 4, 6, 9, 11 - Day < 31
//   - Month: 1, 3, 5, 7, 8, 10, 12 - Day < 32
//
// YEAR + Date: Sanity check Year + February 29. Check zero padding.
// Additional checks:
//   - Feb 29 on leap years: y % 4 == 0 && (y % 100 != 0 || y % 400 == 0)
//     -> Check last 2 digits: if both are zero, check first two digits.
//     Else if last digit is n % 4 == 0, the second to last digit m % 2 == 0
//     Else if last digit is n % 4 == 2, the second to last digit m % 2 == 1
//     Else its not a leap year.
//   - no 0000 Year

// New parses s into an XSDDate. The (possibly partially filled) value is
// returned together with the parse error, if any.
func New(s string) (XSDDate, error) {
	date := XSDDate{base: s}
	if err := date.Parse(s); err != nil {
		return date, err
	}
	return date, nil
}
// String renders the value in its XSD lexical form.
//
// Components that are zero are left out and replaced by the leading dashes
// XSD requires ("--MM", "---DD", "--MM-DD"). The year is written with at
// least four digits (zero-padded, sign first), as the XSD year format
// demands, so that the output can be parsed again. A timezone of +00:00 is
// written as "Z".
func (d XSDDate) String() string {
	var b strings.Builder

	if d.Year != 0 {
		if d.Year < 0 {
			fmt.Fprintf(&b, "-%04d", -d.Year)
		} else {
			fmt.Fprintf(&b, "%04d", d.Year)
		}
	}

	if d.Month != 0 {
		if d.Year == 0 {
			b.WriteByte('-')
		}
		fmt.Fprintf(&b, "-%02d", d.Month)
	}

	if d.Day != 0 {
		if d.Year == 0 && d.Month == 0 {
			b.WriteString("--")
		}
		fmt.Fprintf(&b, "-%02d", d.Day)
	}

	if d.hasTimezone {
		if d.TZH == 0 && d.TZM == 0 {
			b.WriteByte('Z')
		} else {
			sign, hours := "+", d.TZH
			if hours < 0 {
				sign, hours = "-", -hours
			}
			fmt.Fprintf(&b, "%s%02d:%02d", sign, hours, d.TZM)
		}
	}

	return b.String()
}
// UnmarshalText parses text as an XSD date value into d.
func (d *XSDDate) UnmarshalText(text []byte) error {
	raw := string(text)
	return d.Parse(raw)
}
// MarshalText returns the String form of d as bytes. It never fails.
func (d XSDDate) MarshalText() ([]byte, error) {
	lexical := d.String()
	return []byte(lexical), nil
}
// Parse reads s (surrounding whitespace is ignored) as one of the XSD forms
// date, gYear, gYearMonth, gMonth, gMonthDay or gDay, each with an optional
// timezone ("Z" or "+hh:mm"/"-hh:mm"), and stores the components in xsdd.
//
// Parse only splits the input into components; range checks (month 1-12,
// leap years, ...) are done by Validate. On malformed input it marks the
// value as invalid and returns an error; it never panics.
func (xsdd *XSDDate) Parse(s string) error {
	s = strings.TrimSpace(s)
	xsdd.base = s

	// The smallest possible date is 4 chars long
	if len(s) < 4 {
		return xsdd.parseError("Date too short")
	}

	// Check for Z, then check for timezone
	if len(s) >= 5 && s[len(s)-1] == TIMEZONE {
		xsdd.hasTimezone = true
		s = s[:len(s)-1]
	} else if len(s) >= 10 {
		// A tail that is no timezone offset is not an error, it just means
		// that the value has no timezone.
		if err := xsdd.parseTimezone(s[len(s)-6:]); err == nil {
			s = s[:len(s)-6]
		}
	}

	// Year
	if s[1] != SEPERATOR {
		// The first three bytes belong to the year in any case (sign and
		// digits); extend over all following digits.
		i := 3
		for ; i < len(s); i++ {
			if s[i] < MIN_ALLOWED_NUMBER || s[i] > MAX_ALLOWED_NUMBER {
				break
			}
		}

		yint, err := strconv.Atoi(s[:i])
		if err != nil {
			return xsdd.parseError(fmt.Sprintf("Invalid year: %v", s[:i]))
		}
		xsdd.Year = yint
		xsdd.hasYear = true

		if i == len(s) {
			return nil
		}

		// The year must be followed by a separator and at least one more
		// byte; anything else would be indexed out of range below.
		if s[i] != SEPERATOR || i+1 == len(s) {
			return xsdd.parseError(fmt.Sprintf("Invalid date ending: %v", s[i:]))
		}
		s = s[i+1:]
	} else {
		s = s[2:]
	}

	// Left are 02 (Month), -02 (Day), 02-02 (Date)
	if s[0] != SEPERATOR {
		if len(s) < 2 {
			return xsdd.parseError(fmt.Sprintf("Invalid month: %v", s))
		}
		mstr := s[:2]
		mint, err := strconv.Atoi(mstr)
		if err != nil {
			return xsdd.parseError(fmt.Sprintf("Invalid month: %v", mstr))
		}
		xsdd.Month = mint
		xsdd.hasMonth = true

		s = s[2:]
		if len(s) == 0 {
			return nil
		} else if len(s) != 3 || s[0] != SEPERATOR {
			return xsdd.parseError(fmt.Sprintf("Invalid date ending: %v", s))
		}
	}

	// s starts with a separator here; skip it.
	s = s[1:]

	// Left is 02 Day
	dint, err := strconv.Atoi(s)
	if err != nil {
		return xsdd.parseError(fmt.Sprintf("Invalid day: %v", s))
	}

	// INFO: We do not check len here, it is handled above
	xsdd.Day = dint
	xsdd.hasDay = true

	return nil
}
// WD_CALC_MATRIX holds the per-month offsets (January first) of Sakamoto's
// day-of-week method used by Weekday.
var WD_CALC_MATRIX = []int{0, 3, 2, 5, 0, 3, 5, 1, 4, 6, 2, 4}

// Weekday returns the day of the week of the date, with 0 = Sunday through
// 6 = Saturday. It returns -1 when Month is not in 1..12, as is the case for
// values without a month (e.g. a gYear), instead of indexing out of range.
//
// NOTE(review): for years below 1 the integer divisions truncate toward zero,
// so the result may be negative or off - confirm whether such dates matter.
func (xsdd XSDDate) Weekday() int {
	if xsdd.Month < 1 || xsdd.Month > len(WD_CALC_MATRIX) {
		return -1
	}

	y := xsdd.Year
	if xsdd.Month < 3 {
		// January and February count towards the previous year.
		y--
	}
	return (y + y/4 - y/100 + y/400 + WD_CALC_MATRIX[xsdd.Month-1] + xsdd.Day) % 7
}
// Base returns the (trimmed) input string the value was created or last
// parsed from.
func (xsdd XSDDate) Base() string {
return xsdd.base
}
// Type returns the classification of the value, validating it first when it
// has not been classified yet. Because the receiver is a copy, the result of
// that validation is not cached in the caller's value.
func (xsdd XSDDate) Type() XSDDatetype {
	if xsdd.state != Unknown {
		return xsdd.state
	}
	xsdd.Validate()
	return xsdd.state
}
// Validate classifies the value, stores the result in its state and reports
// whether the value is valid. A value whose parse failed is always Invalid.
func (xsdd *XSDDate) Validate() bool {
	inferred := Invalid
	if !xsdd.error {
		inferred = xsdd.inferState()
	}
	xsdd.state = inferred
	return inferred != Invalid
}
// parseError marks the value as failed and Invalid and returns a new error
// with the message s.
func (xsdd *XSDDate) parseError(s string) error {
	xsdd.state, xsdd.error = Invalid, true
	return errors.New(s)
}
// parseTimezone reads s as a timezone offset of the exact form "+hh:mm" or
// "-hh:mm" and stores it in xsdd. The offset must lie within the XSD range:
// hours 00-13 with minutes 00-59, or exactly 14:00.
//
// xsdd is only modified when s is a valid offset; otherwise an error is
// returned and the value is left untouched.
func (xsdd *XSDDate) parseTimezone(s string) error {
	// INFO: We assume the check for 'Z' has already been done
	if len(s) != 6 || s[3] != COLON || (s[0] != PLUS && s[0] != SIGN) {
		return fmt.Errorf("Invalid timezone")
	}

	// Hours and minutes must be plain digits; strconv.Atoi alone would also
	// accept a sign in the minutes ("+05:-5").
	for _, pos := range [...]int{1, 2, 4, 5} {
		if s[pos] < MIN_ALLOWED_NUMBER || s[pos] > MAX_ALLOWED_NUMBER {
			return fmt.Errorf("Invalid timezone")
		}
	}

	h, err := strconv.Atoi(s[:3])
	if err != nil {
		return fmt.Errorf("Invalid hour: %v", s[:3])
	}

	m, err := strconv.Atoi(s[4:])
	if err != nil {
		return fmt.Errorf("Invalid minute: %v", s[4:])
	}

	hours := h
	if hours < 0 {
		hours = -hours
	}
	if m > 59 || hours > 14 || (hours == 14 && m != 0) {
		return fmt.Errorf("Invalid timezone")
	}

	xsdd.hasTimezone = true
	xsdd.TZH = h
	xsdd.TZM = m
	return nil
}
// inferState derives the datatype from the components that are present and
// checks the component values for that datatype. It returns Invalid when the
// values are out of range or when no component is present at all.
func (xsdd XSDDate) inferState() XSDDatetype {
	var (
		kind  XSDDatetype
		valid bool
	)

	switch {
	case xsdd.hasYear && xsdd.hasMonth && xsdd.hasDay:
		kind, valid = Date, validDayMonthYear(xsdd.Year, xsdd.Month, xsdd.Day)
	case xsdd.hasYear && xsdd.hasMonth:
		kind, valid = GYearMonth, validMonth(xsdd.Month) && validYear(xsdd.Year)
	case xsdd.hasMonth && xsdd.hasDay:
		kind, valid = GMonthDay, validDayMonth(xsdd.Day, xsdd.Month)
	case xsdd.hasYear:
		kind, valid = GYear, validYear(xsdd.Year)
	case xsdd.hasMonth:
		kind, valid = GMonth, validMonth(xsdd.Month)
	case xsdd.hasDay:
		kind, valid = GDay, validDay(xsdd.Day)
	default:
		return Invalid
	}

	if !valid {
		return Invalid
	}
	return kind
}
// validDay reports whether i is a day of month, 1 through 31.
func validDay(i int) bool {
	return i >= 1 && i <= 31
}
// validMonth reports whether i is a month number, 1 through 12.
func validMonth(i int) bool {
	return i >= 1 && i <= 12
}
// validYear reports whether i is an acceptable year: everything except 0.
func validYear(i int) bool {
	return i != 0
}
// validDayMonth reports whether day d can occur in month m in some year:
// February allows up to 29 days, April, June, September and November up to
// 30, all other months up to 31.
func validDayMonth(d int, m int) bool {
	if !validDay(d) || !validMonth(m) {
		return false
	}
	switch m {
	case 2:
		return d <= 29
	case 4, 6, 9, 11:
		return d <= 30
	default:
		return true
	}
}
func validDayMonthYear(y int, m int, d int) bool {
if !validDay(d) || !validMonth(m) || !validYear(y) {
return false
}
if m == 2 {
if d == 29 {
if y%4 == 0 && (y%100 != 0 || y%400 == 0) {
return true
}
return false
}
}
return true
}

69
xml/xsdtime_test.go Normal file
View File

@@ -0,0 +1,69 @@
package xmlparsing
import "testing"
// Test is one table entry for the XSDDate tests.
type Test struct {
// Input is the lexical form handed to New.
Input string
// Output holds the component values expected after parsing.
Output XSDDate
// Type is the datatype the input is expected to be classified as.
Type XSDDatetype
}
// tests lists valid inputs with their expected components and datatype.
// Every input is in canonical form, so it must also survive a round trip
// through String.
var tests = []Test{
	{"2006-01-02", XSDDate{Year: 2006, Month: 1, Day: 2}, Date},
	{"-1222-01-02", XSDDate{Year: -1222, Month: 1, Day: 2}, Date},
	{"-2777", XSDDate{Year: -2777}, GYear},
	{"1988-12:30", XSDDate{Year: 1988, hasTimezone: true, TZH: -12, TZM: 30}, GYear},
	{"--03+05:00", XSDDate{Month: 3, hasTimezone: true, TZH: 5, TZM: 0}, GMonth},
	{"---29", XSDDate{Day: 29}, GDay},
	{"-1234567-12Z", XSDDate{Year: -1234567, Month: 12, hasTimezone: true, TZH: 0, TZM: 0}, GYearMonth},
	{"-1234567-12+05:00", XSDDate{Year: -1234567, Month: 12, hasTimezone: true, TZH: 5, TZM: 0}, GYearMonth},
	{"--12-31", XSDDate{Month: 12, Day: 31}, GMonthDay},
}

// TestParse checks that every table entry parses into the expected
// components and is classified as the expected datatype.
func TestParse(t *testing.T) {
	for _, test := range tests {
		dt, err := New(test.Input)
		if err != nil {
			t.Errorf("Error parsing %v: %v", test.Input, err)
			continue
		}

		if dt.Year != test.Output.Year {
			t.Errorf("Year mismatch for %v: expected %v, got %v", test.Input, test.Output.Year, dt.Year)
		}
		if dt.Month != test.Output.Month {
			t.Errorf("Month mismatch for %v: expected %v, got %v", test.Input, test.Output.Month, dt.Month)
		}
		if dt.Day != test.Output.Day {
			t.Errorf("Day mismatch for %v: expected %v, got %v", test.Input, test.Output.Day, dt.Day)
		}
		if dt.hasTimezone != test.Output.hasTimezone {
			t.Errorf("Timezone mismatch for %v: expected %v, got %v", test.Input, test.Output.hasTimezone, dt.hasTimezone)
		}
		if dt.TZH != test.Output.TZH {
			t.Errorf("Timezone mismatch for %v: expected %v, got %v", test.Input, test.Output.TZH, dt.TZH)
		}
		if dt.TZM != test.Output.TZM {
			t.Errorf("Timezone mismatch for %v: expected %v, got %v", test.Input, test.Output.TZM, dt.TZM)
		}
		if got := dt.Type(); got != test.Type {
			t.Errorf("Type mismatch for %v: expected %v, got %v", test.Input, test.Type, got)
		}
	}
}
// TestString checks that every table entry is rendered back to exactly the
// input it was parsed from.
func TestString(t *testing.T) {
	for _, tc := range tests {
		parsed, err := New(tc.Input)
		if err != nil {
			t.Errorf("Error parsing %v: %v", tc.Input, err)
			continue
		}

		if got := parsed.String(); got != tc.Input {
			t.Errorf("String mismatch for %v: expected %v, got %v", tc.Input, tc.Input, got)
		}
	}
}