Commit 8a9e9567 authored by Andreas Wagner
Browse files

Parse XML files.

parent e8555a34
config.json
......@@ -81,8 +81,11 @@ func SetupRoutes(conf tei2zenodo.Config) *gin.Engine {
}
*/
default:
log.Printf("--- TEI successfully parsed ---")
log.Printf("DOI present: %v", doi)
log.Printf("Title: %v", md.Title)
log.Printf("Contributor 1: %v", md.Contributors[0])
log.Printf("Contributor 2: %v", md.Contributors[1])
}
})
APIv1.GET("/webhook", func(c *gin.Context) {
......
......@@ -2,68 +2,236 @@ package xml
import (
"bytes"
"fmt"
"log"
"reflect"
"regexp"
"github.com/antchfx/xmlquery"
"github.com/antchfx/xpath"
"gitlab.gwdg.de/rg-mpg-de/tei2zenodo"
)
// ParseTEI reads a TEI document from buf and copies metadata from it into md,
// driven by the field descriptions in conf.
//
// The fields of *md are visited via reflection. For every field whose `json`
// struct tag equals the Field of an entry in conf.Fields, the entry's XPath
// (evaluated with xmlquery) or XExpression (compiled with xpath.Compile)
// selects the value to store. Fields whose type is one of the slice-of-struct
// types listed in complexTypes are filled element by element: the entry's XPath
// selects one node per element and the entry's Subfields describe how each
// member of the element is read relative to that node.
//
// It returns the value of md.DOI after processing (maybe empty) and an error
// value. An error is returned when the XML cannot be parsed, when an
// XExpression does not compile, when a matching config entry has neither XPath
// nor XExpression (for complex types: no XPath; for subfields only if the
// subfield is "name" or "type"), or when a configured field has a type other
// than string, []string or one of the complex types.
//
// NOTE(review): this listing appears to interleave the removed and the added
// lines of a diff hunk (see the two lookups inside the conf.Fields loop and the
// two declarations of doi), so it does not compile as shown — confirm against
// the committed file.
func ParseTEI(buf *bytes.Buffer, md *tei2zenodo.ZMetadata, conf *tei2zenodo.MetadataConfig) (string, error) {
// Parse document (in buf) with antchfx/xmlquery...
doc, err := xmlquery.Parse(buf)
if err != nil {
log.Printf("Could not parse xml.\n")
return "", err
}
// Look up a DOI in the publication statement and store it (or "") in md.DOI.
idno := xmlquery.FindOne(doc, "//publicationStmt//idno[@type='doi']")
var doi string
if n := idno; n != nil {
doi = n.InnerText()
log.Printf("doi found: %s\n", n.InnerText())
} else {
doi = ""
log.Printf("no doi found.\n")
}
md.DOI = doi
// re matches runs of whitespace; it is used below to collapse each run in
// extracted text to a single space.
re := regexp.MustCompile(`\s+`)
// complexTypes lists, in the form printed by reflect.Type.String, the
// slice-of-struct field types that are filled element-wise via Subfields.
var complexTypes []string
complexTypes = append(complexTypes, "[]tei2zenodo.ZCreator")
complexTypes = append(complexTypes, "[]tei2zenodo.ZContributor")
complexTypes = append(complexTypes, "[]tei2zenodo.ZIdentifier")
complexTypes = append(complexTypes, "[]tei2zenodo.ZCommunity")
complexTypes = append(complexTypes, "[]tei2zenodo.ZGrant")
complexTypes = append(complexTypes, "[]tei2zenodo.ZSubject")
complexTypes = append(complexTypes, "[]tei2zenodo.ZLocation")
complexTypes = append(complexTypes, "[]tei2zenodo.ZDate")
// Iterate through the fields of the metadata struct (md)
// and see if we have a config (conf) for them
s := reflect.ValueOf(md).Elem()
typeOfT := s.Type()
for i := 0; i < s.NumField(); i++ {
f := s.Field(i)
structFieldname := typeOfT.Field(i).Name
// structFieldtype := f.Type()
structFieldtype := f.Type()
// structFieldvalue := f.Interface()
// Config entries are matched against the field's `json` struct tag.
jsonFieldname := typeOfT.Field(i).Tag.Get("json")
for j := range conf.Fields {
// NOTE(review): this first lookup (through f.SetString(u)) has no closing
// braces and is superseded by the lookup that follows — presumably the
// pre-change side of the diff; confirm.
if conf.Fields[j].Field == jsonFieldname && conf.Fields[j].XPath != "" {
// log.Printf("Found config for %s: %s", jsonFieldname, conf.Fields[j].XPath)
xpath := conf.Fields[j].XPath
t := xmlquery.FindOne(doc, xpath)
var u string
if n := t; t != nil {
u = n.InnerText()
log.Printf("%s found. Set to %v ...\n", structFieldname, n.InnerText())
f.SetString(u)
if conf.Fields[j].Field == jsonFieldname {
// log.Printf("Found config for %s.", jsonFieldname)
// log.Printf("It's a %s value...", structFieldtype)
// Decide whether this field is one of the slice-of-struct types.
cplx := false
for _, a := range complexTypes {
if a == structFieldtype.String() {
cplx = true
break
}
}
if cplx {
confSubfields := conf.Fields[j].Subfields
if conf.Fields[j].XPath != "" {
xp := conf.Fields[j].XPath
// Each node matched by the entry's XPath yields one slice element.
for _, l := range xmlquery.Find(doc, xp) {
// varType is the slice's element type; zc is a fresh, settable zero
// value of that type which is filled member by member below.
varType := structFieldtype.Elem()
zc := reflect.Indirect(reflect.New(varType))
// u := tei2zenodo.ZContributor{Name: string(i), Type: "Test Role"}
// v := &u
// zc := reflect.ValueOf(v).Elem()
// typeOfZcT := zc.Type()
for m := 0; m < zc.NumField(); m++ {
zcf := zc.Field(m)
zStructFieldname := varType.Field(m).Name
zStructFieldtype := zcf.Type()
// ztructRFieldvalue := rf.Interface()
zJSONFieldname := varType.Field(m).Tag.Get("json")
// NOTE: the m declared here shadows the member index m of the enclosing
// loop; zcf and the z* names above were taken before the shadowing and
// still refer to the current member.
for m := range confSubfields {
if confSubfields[m].Field == zJSONFieldname {
// log.Printf("Found config for %s.", zJSONFieldname)
// log.Printf("It's a %s value...", zStructFieldtype)
if confSubfields[m].XPath != "" {
xrpath := confSubfields[m].XPath
var u string
// The subfield XPath is evaluated relative to the element node l.
t := xmlquery.FindOne(l, xrpath)
if n := t; t != nil {
// u = strings.Replace(strings.Replace(strings.TrimSpace(n.InnerText()), "\n", " ", -1), " ", " ", -1)
u = re.ReplaceAllString(n.InnerText(), " ")
log.Printf("%s.%s found. Set to '%v' ...\n", structFieldname, zStructFieldname, u)
zcf.SetString(u)
}
} else if confSubfields[m].XExpression != "" {
xexpr, err := xpath.Compile(confSubfields[m].XExpression)
if err != nil {
log.Printf("Erroneous XPath expression: %s ...", confSubfields[m].XExpression)
// NOTE(review): the returned message speaks of an unknown metadata type
// although the failure here is a compile error of the XExpression (err is
// not included) — confirm the intended wording.
return "", fmt.Errorf("unknown (hardcoded?) metadata type: %s.%s", structFieldtype, zStructFieldtype)
}
switch zStructFieldtype.String() {
case "string":
var u string
// TODO: Check for wrong types (e.g. xepr returning int instead of string)
// The unchecked type assertion panics if the result is not a string.
u = xexpr.Evaluate(xmlquery.CreateXPathNavigator(l)).(string)
if u != "" {
log.Printf("%s.%s found. Set to '%v' ...\n", structFieldname, zStructFieldname, u)
zcf.SetString(u)
}
case "[]string":
var u []string
// TODO: Check for wrong types (e.g. xepr returning int instead of string)
// NOTE(review): verify against the antchfx/xpath documentation that
// Evaluate can actually yield a []string; if not, this assertion panics.
v := xexpr.Evaluate(xmlquery.CreateXPathNavigator(l)).([]string)
for _, n := range v {
u = append(u, n)
// NOTE(review): this appends to f (the outer md field) but stores the
// result in zcf (the element's member) — presumably zcf was meant in both
// places; confirm.
newSlice := reflect.Append(f, reflect.ValueOf(n))
zcf.Set(newSlice)
}
log.Printf("%s.%s found. Set to '%v' ...\n", structFieldname, zStructFieldname, u)
default:
log.Printf("Unknown (hardcoded?) metadata type: %s.%s ...", structFieldtype, zStructFieldtype)
return "", fmt.Errorf("xml: unknown (hardcoded?) metadata type: %s.%s", structFieldtype, zStructFieldtype)
}
} else if confSubfields[m].Field == "name" || confSubfields[m].Field == "type" {
// Only the "name" and "type" subfields are treated as mandatory; other
// subfields without XPath/XExpression are silently skipped.
log.Printf("Problem with config: XPath or XExpression missing in %v ...", conf.Fields[j])
return "", fmt.Errorf("xml: malformed config (xpath/xexpression missing): %v", conf.Fields[j])
}
}
}
}
// Append the populated element to the md field, converting it through its
// concrete type selected by the element type's name.
switch varType.Name() {
case "ZCreator":
newSlice := reflect.Append(f, reflect.ValueOf(zc.Interface().(tei2zenodo.ZCreator)))
f.Set(newSlice)
case "ZContributor":
newSlice := reflect.Append(f, reflect.ValueOf(zc.Interface().(tei2zenodo.ZContributor)))
f.Set(newSlice)
case "ZIdentifier":
newSlice := reflect.Append(f, reflect.ValueOf(zc.Interface().(tei2zenodo.ZIdentifier)))
f.Set(newSlice)
case "ZCommunity":
newSlice := reflect.Append(f, reflect.ValueOf(zc.Interface().(tei2zenodo.ZCommunity)))
f.Set(newSlice)
case "ZGrant":
newSlice := reflect.Append(f, reflect.ValueOf(zc.Interface().(tei2zenodo.ZGrant)))
f.Set(newSlice)
case "ZSubject":
newSlice := reflect.Append(f, reflect.ValueOf(zc.Interface().(tei2zenodo.ZSubject)))
f.Set(newSlice)
case "ZLocation":
newSlice := reflect.Append(f, reflect.ValueOf(zc.Interface().(tei2zenodo.ZLocation)))
f.Set(newSlice)
case "ZDate":
newSlice := reflect.Append(f, reflect.ValueOf(zc.Interface().(tei2zenodo.ZDate)))
f.Set(newSlice)
default:
log.Printf("Problem with type conversion of %s (%s)", structFieldname, varType.Name())
return "", fmt.Errorf("xml: malformed config (type problem in %s [%s])", structFieldname, varType.Name())
}
// newSlice := reflect.Append(f, reflect.ValueOf(zc))
// f.Set(newSlice)
}
} else {
// A complex field can only be located via XPath.
log.Printf("Problem with config: XPath missing in %v ...", conf.Fields[j])
return "", fmt.Errorf("xml: malformed config (xpath missing): %v", conf.Fields[j])
}
} else if conf.Fields[j].XPath != "" {
// Simple field selected by XPath: first match for string, all matches
// for []string, each with whitespace runs collapsed to one space.
xpath := conf.Fields[j].XPath
switch structFieldtype.String() {
case "string":
var u string
t := xmlquery.FindOne(doc, xpath)
if n := t; t != nil {
// u = strings.Replace(strings.Replace(strings.TrimSpace(n.InnerText()), "\n", " ", -1), " ", " ", -1)
u = re.ReplaceAllString(n.InnerText(), " ")
log.Printf("%s found. Set to '%v' ...\n", structFieldname, u)
f.SetString(u)
}
case "[]string":
// u only collects the values for the log line; f is grown in place.
var u []string
for _, n := range xmlquery.Find(doc, xpath) {
// v = append(u, strings.Replace(strings.Replace(strings.TrimSpace(n.InnerText()), "\n", " ", -1), " ", " ", -1))
v := re.ReplaceAllString(n.InnerText(), " ")
u = append(u, v)
newSlice := reflect.Append(f, reflect.ValueOf(v))
f.Set(newSlice)
}
log.Printf("%s found. Set to '%v' ...\n", structFieldname, u)
default:
log.Printf("Unknown (hardcoded?) metadata type: %s ...", structFieldtype)
return "", fmt.Errorf("xml: unknown (hardcoded?) metadata type: %s", structFieldtype)
}
} else if conf.Fields[j].XExpression != "" {
// Simple field computed by an XPath expression evaluated on the document.
xexpr, err := xpath.Compile(conf.Fields[j].XExpression)
if err != nil {
log.Printf("Erroneous XPath expression: %s ...", conf.Fields[j].XExpression)
// NOTE(review): the returned message speaks of an unknown metadata type
// although the failure here is a compile error of the XExpression (err is
// not included) — confirm the intended wording.
return "", fmt.Errorf("unknown (hardcoded?) metadata type: %s", structFieldtype)
}
switch structFieldtype.String() {
case "string":
var u string
// TODO: Check for wrong types (e.g. xepr returning int instead of string)
// The unchecked type assertion panics if the result is not a string.
u = xexpr.Evaluate(xmlquery.CreateXPathNavigator(doc)).(string)
if u != "" {
log.Printf("%s found. Set to '%v' ...\n", structFieldname, u)
f.SetString(u)
}
case "[]string":
var u []string
// TODO: Check for wrong types (e.g. xepr returning int instead of string)
// NOTE(review): verify against the antchfx/xpath documentation that
// Evaluate can actually yield a []string; if not, this assertion panics.
v := xexpr.Evaluate(xmlquery.CreateXPathNavigator(doc)).([]string)
for _, n := range v {
u = append(u, n)
newSlice := reflect.Append(f, reflect.ValueOf(n))
f.Set(newSlice)
}
log.Printf("%s found. Set to '%v' ...\n", structFieldname, u)
default:
log.Printf("Unknown (hardcoded?) metadata type: %s ...", structFieldtype)
return "", fmt.Errorf("xml: unknown (hardcoded?) metadata type: %s", structFieldtype)
}
} else {
// The entry names this field but gives neither XPath nor XExpression.
log.Printf("Malformed config entry: %v ...", conf.Fields[j])
return "", fmt.Errorf("xml: malformed config entry: %v", conf.Fields[j])
}
}
}
}
// for i, n := range xmlquery.Find(doc, "//item/title") {
// log.Printf("#%d %s\n", i, n.InnerText())
// }
/*
expr, err := xpath.Compile("sum(//book/price)")
price := expr.Evaluate(xmlquery.CreateXPathNavigator(doc)).(float64)
*/
// Return whatever DOI ended up in md.
// NOTE(review): doi is already declared near the top of this listing, so
// this short declaration would not compile alongside it — presumably one of
// the two belongs to the pre-change side of the diff; confirm.
doi := md.DOI
return doi, nil
}
No preview for this file type
......@@ -39,9 +39,10 @@ type MetadataConfig struct {
}
// metadataField describes how one metadata field is located in the XML
// document. ParseTEI matches Field against the `json` struct tag of the target
// field, evaluates XPath with xmlquery or compiles XExpression with
// xpath.Compile, and uses Subfields to fill the members of each element of a
// slice-of-struct field.
//
// NOTE(review): this listing appears to show both sides of a diff hunk, so
// Field, XPath and Subfields occur twice (which is not valid Go) — presumably
// the first three lines are the pre-change version; confirm against the
// committed file.
type metadataField struct {
Field string
XPath string
Subfields []metadataField
// Field is the JSON name of the metadata field this entry configures.
Field string
// XPath selects the node(s) whose inner text becomes the field value.
XPath string
// XExpression is an XPath expression that is compiled and evaluated to
// produce the value; ParseTEI consults it only when XPath is empty.
XExpression string
// Subfields configures the members of each element of a complex field.
Subfields []metadataField
}
// Link HATEOAS-links our resources
......
......@@ -65,6 +65,9 @@
refer to our website.</ref></publisher>
</xi:fallback>
</xi:include>
<availability>
<license target="http://creativecommons.org/cc-by/4.0">CC-BY</license>
</availability>
<date type="digitizedEd" when="2018-07-10">2018-07-10</date>
<idno>
<idno type="doi">10.1234/zenodo.567890</idno>
......@@ -134,6 +137,13 @@
<language ident="es" usage="75" n="main" xml:lang="en">Spanish</language>
<language ident="la" usage="25" n="marginal" xml:lang="en">Latin</language>
</langUsage>
<textClass>
<keywords>
<term>Keyword 1</term>
<term>Keyword 2</term>
<term>Keyword 3</term>
</keywords>
</textClass>
</profileDesc>
<encodingDesc>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment