Commit e34aea9d authored by Moritz Schepp's avatar Moritz Schepp
Browse files

implement elasticsearch indexing and triggers

parent 621043f3
<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://exist-db.org/collection-config/1.0">
<index xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xhtml="http://www.w3.org/1999/xhtml">
<lucene>
    <!-- Lucene full-text index definitions for this collection.
         Prefixes used in qname/match/expression values (tei, xhtml, xml) are
         the ones declared on the enclosing index element.
         NOTE(review): already stored data presumably has to be reindexed
         before a change made here becomes visible to queries - confirm. -->
    <analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>

    <!-- Rendered pages: every XHTML div is indexed as a whole. -->
    <text match="//xhtml:div"/>

    <!-- Persons: language-specific fields get a _de / _fr suffix; the French
         source data is tagged xml:lang="fra". -->
    <text qname="tei:person">
        <field name="name_de" expression="tei:persName[@xml:lang='de']"/>
        <field name="name_fr" expression="tei:persName[@xml:lang='fra']"/>
        <field name="nationality_de" expression="tei:nationality[@xml:lang='de']"/>
        <field name="nationality_fr" expression="tei:nationality[@xml:lang='fra']"/>
        <field name="occupation_de" expression="tei:occupation[@xml:lang='de']"/>
        <field name="occupation_fr" expression="tei:occupation[@xml:lang='fra']"/>
        <field name="note_de" expression="tei:note[@xml:lang='de']"/>
        <field name="note_fr" expression="tei:note[@xml:lang='fra']"/>
        <field name="birth" expression="tei:birth"/>
        <field name="death" expression="tei:death"/>
    </text>

    <!-- Places. -->
    <text qname="tei:place">
        <field name="name_de" expression="tei:placeName[@xml:lang='de']"/>
        <field name="name_fr" expression="tei:placeName[@xml:lang='fra']"/>
        <field name="note_de" expression="tei:note[@xml:lang='de']"/>
        <field name="note_fr" expression="tei:note[@xml:lang='fra']"/>
        <!-- The type attribute carries no namespace (the queries elsewhere
             test it as @type), so it must not be written as @tei:type. -->
        <field name="artist" expression="tei:note[@type='artist']"/>
    </text>

    <!-- Works (register items). -->
    <text qname="tei:item">
        <field name="name_de" expression="tei:name[@xml:lang='de']"/>
        <field name="name_fr" expression="tei:name[@xml:lang='fra']"/>
        <field name="date_de" expression="tei:date[@xml:lang='de']"/>
        <field name="date_fr" expression="tei:date[@xml:lang='fra']"/>
        <field name="note_de" expression="tei:note[@xml:lang='de']"/>
        <field name="note_fr" expression="tei:note[@xml:lang='fra']"/>
        <!-- Child elements are addressed with the tei: prefix like every
             other expression; an unprefixed name would look for an element
             outside the TEI namespace. -->
        <field name="artist" expression="tei:note[@type='artist']/tei:persName"/>
        <field name="location" expression="tei:location/tei:placeName"/>
    </text>
</lucene>
<index xmlns:xs="http://www.w3.org/2001/XMLSchema">
<fulltext default="none" attributes="false"/>
</index>
<triggers>
<trigger class="org.exist.extensions.exquery.restxq.impl.RestXqTrigger"/>
......
xquery version "3.1";
xmldb:reindex('/db/apps/sade-architrave')
\ No newline at end of file
(:xmldb:reindex('/db/apps/sade-architrave'):)
import module namespace elastic="http://elastic.io" at "elastic.xqm";
elastic:setup()
xquery version "3.1";
import module namespace config="https://sade.textgrid.de/ns/config" at "config.xqm";
import module namespace kwic="http://exist-db.org/xquery/kwic";
(:import module namespace config="https://sade.textgrid.de/ns/config" at "config.xqm";:)
(:import module namespace kwic="http://exist-db.org/xquery/kwic";:)
import module namespace elastic="http://elastic.io" at "elastic.xqm";
declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization";
declare namespace tei="http://www.tei-c.org/ns/1.0";
declare namespace xhtml="http://www.w3.org/1999/xhtml";
(:declare namespace tei="http://www.tei-c.org/ns/1.0";:)
(:declare namespace xhtml="http://www.w3.org/1999/xhtml";:)
declare option output:method "json";
declare option output:media-type "application/json";
(: Request parameters shared by all search functions in this module. :)

(: Name of the search to run, read from the "action" parameter. :)
declare variable $action := request:get-parameter('action', 'search');

(: Search terms as entered by the client. A previous revision defaulted to
 : '*:*' instead of the empty string. :)
declare variable $terms := request:get-parameter('terms', '');

(: 1-based number of the requested result page. Only the first value of a
 : repeated parameter is used; a value that is not an integer falls back to 1
 : and anything below 1 is raised to 1, so the offsets computed from it
 : (($page - 1) * $per_page) can never become negative. :)
declare variable $page :=
    let $raw := request:get-parameter('page', 1)[1]
    return
        if ($raw castable as xs:integer) then
            fn:max((1, xs:integer($raw)))
        else
            1;

(: Number of results per page. Same parsing rules as $page: falls back to 10
 : when the value is not an integer and is never smaller than 1, which also
 : keeps the page-count division free of a zero divisor. :)
declare variable $per_page :=
    let $raw := request:get-parameter('per_page', 10)[1]
    return
        if ($raw castable as xs:integer) then
            fn:max((1, xs:integer($raw)))
        else
            10;

(: Requested language: 'de', 'fr', or 'any' (the default) for no restriction. :)
declare variable $locale := request:get-parameter('locale', 'any');

(: Database path of the application.
 : NOTE(review): the functions visible in this module read $config:app-root
 : instead - confirm whether this variable is still needed. :)
declare variable $app-root := '/db/apps/sade-architrave';
(: Lucene full-text search over the rendered edition pages.
 :
 : Selects the German, the French or the combined set of edition template
 : collections depending on $locale, runs $terms through ft:query against the
 : root xhtml:div of each document and builds one map per hit. Hits are
 : ordered by title and page number and handed to local:paginate, which is
 : asked to add KWIC summaries. :)
declare function local:search-editions-lucene() {
    let $german := collection(
        $config:app-root || '/templates/34zmq', (: harrach :)
        $config:app-root || '/templates/34zs7', (: sturm :)
        $config:app-root || '/templates/3ptwg', (: corfey :)
        $config:app-root || '/templates/3qr4f', (: neumann :)
        $config:app-root || '/templates/34znb', (: pitzler :)
        $config:app-root || '/templates/3c0m2'  (: knesebeck :)
    )
    let $french := collection(
        $config:app-root || '/templates/3czfj', (: harrach :)
        $config:app-root || '/templates/3q4rq', (: sturm :)
        $config:app-root || '/templates/3r0fv', (: corfey :)
        $config:app-root || '/templates/3r3nn', (: neumann :)
        $config:app-root || '/templates/350mg', (: pitzler :)
        $config:app-root || '/templates/3czn9'  (: knesebeck :)
    )
    (: Every path is spelled out a second time on purpose: the original author
     : found no way to hand the two lists above to collection() as one
     : combined argument list. :)
    let $all := collection(
        (: German editions :)
        $config:app-root || '/templates/34zmq', (: harrach :)
        $config:app-root || '/templates/34zs7', (: sturm :)
        $config:app-root || '/templates/3ptwg', (: corfey :)
        $config:app-root || '/templates/3qr4f', (: neumann :)
        $config:app-root || '/templates/34znb', (: pitzler :)
        $config:app-root || '/templates/3c0m2', (: knesebeck :)
        (: French editions :)
        $config:app-root || '/templates/3czfj', (: harrach :)
        $config:app-root || '/templates/3q4rq', (: sturm :)
        $config:app-root || '/templates/3r0fv', (: corfey :)
        $config:app-root || '/templates/3r3nn', (: neumann :)
        $config:app-root || '/templates/350mg', (: pitzler :)
        $config:app-root || '/templates/3czn9'  (: knesebeck :)
    )
    let $selected :=
        if ($locale eq 'de') then $german
        else if ($locale eq 'fr') then $french
        else $all
    let $roots := $selected/xhtml:div
    let $found :=
        for $match in $roots[ft:query(., $terms)]
        (: the TextGrid URI is split on ":" and "."; its second part names the edition :)
        let $uri := string($match//xhtml:span[@id='tei-meta-textGridURI']/text())
        let $edition_id := fn:tokenize($uri, '[:\.]')[2]
        let $number := xs:integer($match//xhtml:span[@class='pb']/text())
        let $plain := fn:normalize-space(data($match))
        let $title := string($match//xhtml:h4[starts-with(@id, 'tei-title-main-')][1])
        let $relevance as xs:float := ft:score($match)
        order by $title, $number
        return
            map {
                'type': 'edition',
                'id': concat('edition-', $edition_id, '-', $number),
                'name': $title,
                'edition': $edition_id,
                'page': $number,
                'text': $plain,
                'score': $relevance,
                (: kept so the paginator can build the KWIC summary from it :)
                'hit': $match
            }
    return local:paginate($found, fn:true())
};
(: Wraps every wiki page (div with id "wiki") found below <app-root>/docs in a
 : synthetic arch-wiki-page element. The wrapper carries the page id, the
 : locale, a combined "<id>-<locale>" signature and the revision number as
 : attributes, which makes it easy to group and filter pages by these values
 : later on.
 :
 : All four values are parsed from the document name, which is split on "_"
 : and "." into id, locale and a "rev"-prefixed revision.
 : NOTE(review): the document name is taken as the 6th "/"-separated segment
 : of the base URI, which presumably only fits documents stored directly in
 : /db/apps/<app>/docs - confirm before moving the collection. :)
declare function local:wrap-wiki-pages() {
    for $content in collection(concat($config:app-root, '/docs'))//div[@id='wiki']
    let $segments := fn:tokenize(string(base-uri($content)), '/')
    let $fields := fn:tokenize($segments[6], '[_\.]')
    let $page_id := $fields[1]
    let $lang := $fields[2]
    let $rev := xs:integer(replace($fields[3], 'rev', ''))
    return
        <arch-wiki-page wiki-id="{$page_id}" locale="{$lang}" signature="{$page_id}-{$lang}" revision="{$rev}">{$content}</arch-wiki-page>
};
(: Returns the wrapped wiki pages with all outdated revisions dropped: for
 : every distinct signature (page id plus locale) only the wrapper(s) with
 : the numerically highest revision are kept. :)
declare function local:recent-wiki-content() {
    let $wrapped := local:wrap-wiki-pages()
    for $signature in distinct-values($wrapped/@signature)
    let $newest := (
        for $candidate in $wrapped[@signature=$signature]/@revision
        (: compare as integers so that e.g. 10 ranks above 9 :)
        order by xs:integer($candidate) descending
        return $candidate
    )[1]
    return $wrapped[@signature=$signature][@revision=$newest]
};
(: Doesn't work: the search base consists of the wrapper elements built by
 : local:wrap-wiki-pages rather than of stored documents, so no index applies
 : to them and ft:query simply returns the empty sequence. :)
(: :declare function local:search-pages-lucene() {
let $wikis := local:recent-wiki-content()
let $hits := $wikis[ft:query(., $terms)]
let $results :=
for $hit in $hits
let $score as xs:float := ft:score($hit)
let $id := string($hit/@id)
let $l := string($hit/@locale)
let $revision := xs:integer($hit/@revision)
let $text := fn:normalize-space(data($hit))
let $title := string(($hit//h1)[1])
let $localeMatch :=
local:locale(fn:false()) eq 'any' or
$l eq local:locale(fn:false())
where $localeMatch
return
map {
'type': 'page',
'id': concat('page-', $id, '-', $l, '-', $revision),
'name': $title,
'confluence_id': $id,
'locale': $l,
'revision': $revision,
'text': $text,
'title': $title,
'hit': $hit,
'score': $score
}
return local:paginate($results, fn:true())
};
:)
declare function local:search-pages() {
let $results :=
for $page in local:recent-wiki-content()
let $id := $page/@id
let $l := string($page/@locale)
let $revision := xs:integer($page/@revision)
let $text := fn:normalize-space(data($page))
let $title := string(($page//h1)[1])
let $localeMatch :=
local:locale(fn:false()) eq 'any' or
$l eq local:locale(fn:false())
where fn:matches($text, $terms, 'i') and $localeMatch
return
map {
'type': 'page',
'id': concat('page-', $id, '-', $l, '-', $revision),
'name': $title,
'confluence_id': $id,
'locale': $l,
'revision': $revision,
'text': $text,
'title': $title
declare function local:search-editions-elastic() {
let $filters := if ($locale eq 'any') then [] else [
map {"term": map {"locale": $locale}}
]
let $response := elastic:search("edition_pages", map {
"from": ($page - 1) * $per_page,
"size": $per_page,
"query": map {
"bool": map {
"must": [
map {
"query_string": map {
"query": $terms
}
}
],
"filter": $filters
}
return local:paginate($results, fn:false())
};
declare function local:search-people-lucene() {
let $people := collection(concat($config:app-root, '/templates/register'))//tei:person
let $query := if ($locale eq 'any') then $terms
else
fn:string-join(
(
'name_', $locale,':(', $terms,') OR ',
'nationality_', $locale,':(', $terms,') OR ',
'occupation_', $locale,':(', $terms,') OR ',
'note_', $locale,':(', $terms,') OR ',
'birth:(', $terms,') OR ',
'death:(', $terms,')'
),
''
)
let $hits := $people[ft:query(., $query)]
let $l := if ($locale eq 'any') then 'de' else local:locale()
let $results :=
for $hit in $hits
let $id := string($hit/@xml:id)
let $name := string($hit/tei:persName[@xml:lang=$l])
return
map {
'type': 'person',
'id': concat('person-', $id),
'name': $name,
'textgrid_id': $id,
'de': ft:get-field($hit, 'de')
},
"highlight": map {
"fragment_size": 100,
"fields": map {
"search_data": map {}
}
return
local:paginate($results, fn:false())
};
(:
declare function local:search-people() {
let $persons := doc(concat($config:app-root, '/templates/register/persons.xml'))
let $l :=
if (local:locale() eq 'any') then 'de'
else local:locale()
let $results :=
for $person in $persons//tei:listPerson/tei:person
let $id := string($person/@xml:id)
let $name := string($person/tei:persName[@xml:lang=$l])
where fn:matches($name, $terms, 'i')
order by $name
return
map {
'type': 'person',
'id': concat('person-', $id),
'name': $name,
'textgrid_id': $id
}
return
local:paginate($results, fn:false())
};
:)
declare function local:search-works-lucene() {
let $people := collection(concat($config:app-root, '/templates/register'))//tei:item
let $query := if ($locale eq 'any') then $terms
else
fn:string-join(
(
'name_', $locale,':(', $terms,') OR ',
'date_', $locale,':(', $terms,') OR ',
'note_', $locale,':(', $terms,') OR ',
'artist:(', $terms,') OR ',
'location:(', $terms,')'
),
''
)
let $hits := $people[ft:query(., $query)]
let $l := if ($locale eq 'any') then 'de' else local:locale()
let $results :=
for $hit in $hits
let $id := string($hit/@xml:id)
let $name := string($hit/tei:name[@xml:lang=$l])
return
map {
'type': 'work',
'id': concat('work-', $id),
'name': $name,
'textgrid_id': $id
}
})
return $response
};
declare function local:search-pages-elastic() {
let $filters := if ($locale eq 'any') then [] else [
map {"term": map {"locale": $locale}}
]
let $response := elastic:search("wiki_pages", map {
"from": ($page - 1) * $per_page,
"size": $per_page,
"query": map {
"bool": map {
"must": [
map {
"query_string": map {
"query": $terms
}
}
],
"filter": $filters
}
return
local:paginate($results, fn:false())
};
(:
declare function local:search-works() {
let $works := doc(concat($config:app-root, '/templates/register/works.xml'))
let $l :=
if (local:locale() eq 'any') then 'de'
else local:locale()
let $results :=
for $work in $works//tei:list[@type='artworks']/tei:item
let $id := string($work/@xml:id)
let $name := string($work/tei:name[@xml:lang=$l])
where fn:matches($name, $terms, 'i')
order by $name
return
map {
'type': 'work',
'id': concat('work-', $id),
'name': $name,
'textgrid_id': $id
},
"highlight": map {
"fragment_size": 100,
"fields": map {
"search_data": map {}
}
return
local:paginate($results, fn:false())
}
})
return $response
};
:)
declare function local:search-places-lucene() {
let $people := collection(concat($config:app-root, '/templates/register'))//tei:place
let $query := if ($locale eq 'any') then $terms
declare function local:search-register-elastic($type) {
let $fields := if ($locale eq 'de') then
['search_data.de']
else if ($locale eq 'fr') then
['search_data.fr']
else
fn:string-join(
(
'name_', $locale,':(', $terms,') OR ',
'note_', $locale,':(', $terms,') OR ',
'artist:(', $terms,')'
),
''
)
let $hits := $people[ft:query(., $query)]
let $l := if ($locale eq 'any') then 'de' else local:locale()
let $results :=
for $hit in $hits
let $id := string($hit/@xml:id)
let $name := string($hit/tei:placeName[@type='current' and @xml:lang=$l])
let $description := string($hit/tei:note[@type='description' and @xml:lang=$l])
return
map {
'type': 'place',
'id': concat('place-', $id),
'name': $name,
'description': $description,
'textgrid_id': $id
['search_data.de', 'search_data.fr']
let $response := elastic:search($type, map {
"from": ($page - 1) * $per_page,
"size": $per_page,
"query": map {
"query_string": map {
"query": $terms,
"fields": $fields
}
return
local:paginate($results, fn:false())
};
(:
declare function local:search-places() {
let $places := doc(concat($config:app-root, '/templates/register/places.xml'))
let $l :=
if (local:locale() eq 'any') then 'de'
else local:locale()
let $results :=
for $place in $places//tei:listPlace/tei:place
let $id := string($place/@xml:id)
let $name := string($place/tei:placeName[@type='current' and @xml:lang=$l])
let $description := string($place/tei:note[@type='description'])
where
fn:matches($name, $terms, 'i') or
fn:matches($description, $terms, 'i')
order by $name
return
map {
'type': 'place',
'id': concat('place-', $id),
'name': $name,
'description': $description,
'textgrid_id': $id
},
"highlight": map {
"fragment_size": 100,
"fields": map {
"search_data.de": map {},
"search_data.fr": map {}
}
return
local:paginate($results, fn:false())
};
:)
(: Cuts the requested page out of a result sequence and wraps it together
 : with the paging metadata.
 :
 : $results        sequence of result maps; when $add_summaries is true each
 :                 map has to carry the matched node under the key 'hit'
 : $add_summaries  when true, every returned map gets a 'summary' entry built
 :                 by kwic:summarize (width 80) and its 'hit' entry is emptied
 :
 : Returns a map with the keys 'results' (always an array), 'total', 'page',
 : 'per_page' and 'pages'. The page actually served is $page clamped into
 : 1..$pages, or 0 when there are no results at all. :)
declare function local:paginate($results, $add_summaries) {
    let $total := fn:count($results)
    let $pages := fn:ceiling(xs:double($total) div $per_page)
    (: lowest page that may be served: 1 as soon as there is a page, else 0 :)
    let $first_page := fn:min((1, $pages))
    (: clamp on both sides - a $page below 1 used to produce an empty slice :)
    let $new_page := fn:max(($first_page, fn:min(($pages, $page))))
    let $limited :=
        for $result in subsequence($results, ($new_page - 1) * $per_page + 1, $per_page)
        return
            if ($add_summaries eq fn:true()) then
                (: map:put replaces entries unconditionally, so emptying 'hit'
                 : does not depend on how map:merge would resolve the
                 : duplicate key :)
                map:put(
                    map:put(
                        $result,
                        'summary',
                        kwic:summarize($result('hit'), <config width="80" />)
                    ),
                    'hit',
                    ()
                )
            else
                $result
    return
        map {
            (: always an array - also for a page that holds a single item of
             : a larger result set, which used to be emitted as a bare object :)
            'results': array { $limited },
            'total': $total,
            'page': $new_page,
            'per_page': $per_page,
            'pages': $pages
        }
};
(: Returns the requested locale in the form the xml:lang comparisons of this
 : module use: 'fra' when $locale is 'fr', otherwise $locale unchanged. :)
declare function local:locale() {
    if (not($locale eq 'fr')) then $locale else 'fra'
};
declare function local:locale($fix_fra as xs:boolean) {
if ($fix_fra eq fn:true()) then
local:locale()
else
$locale
})
return $response
};
declare function local:error($message as xs:string, $status as xs:integer) {
......@@ -410,15 +111,15 @@ declare function local:error($message as xs:string, $status as xs:integer) {
declare function local:route() {
if ($action eq 'search-people') then
local:search-people-lucene()
local:search-register-elastic('people')
else if ($action eq 'search-works') then
local:search-works-lucene()
local:search-register-elastic('works')
else if ($action eq 'search-places') then
local:search-places-lucene()
local:search-register-elastic('places')
else if ($action eq 'search-editions') then
local:search-editions-lucene()
local:search-editions-elastic()
else if ($action eq 'search-pages') then
local:search-pages()
local:search-pages-elastic()
else
local:error('unknown action', 400)
};
......
xquery version "3.1";
module namespace elastic="http://elastic.io";
import module namespace config="https://sade.textgrid.de/ns/config" at "config.xqm";
declare namespace tei="http://www.tei-c.org/ns/1.0";
declare namespace xhtml="http://www.w3.org/1999/xhtml";
declare namespace http="http://expath.org/ns/http-client";
declare namespace elastic="http://elastic.io";
declare variable $elastic:uri := 'http://127.0.0.1:9200';
declare variable $elastic:prefix := 'at_';
(: Runs a search against one index and returns the parsed reply.
 :
 : $index  index name without prefix; $elastic:prefix is put in front of it
 : $query  request body, turned into JSON text by elastic:to_json
 :
 : POSTs the JSON to <uri>/<prefix><index>/_search, takes the second item of
 : the http:send-request result, base64-decodes it and parses it with
 : parse-json.
 : NOTE(review): this assumes the reply body always arrives base64-encoded
 : and never inspects the first item of the result - confirm how error
 : replies should be handled. :)
declare function elastic:search($index, $query) {
    let $request :=
        <http:request method="POST" href="{$elastic:uri}/{$elastic:prefix}{$index}/_search">
            <http:body media-type="application/json" method="text">{elastic:to_json($query)}</http:body>
        </http:request>
    let $reply := http:send-request($request)
    return parse-json(util:base64-decode($reply[2]))
};
(: Sends a DELETE request for the prefixed index <uri>/<prefix><index> and
 : returns whatever http:send-request yields for it, unprocessed.
 :
 : $index  index name without prefix; $elastic:prefix is put in front of it :)
declare function elastic:drop-index($index) {
    http:send-request(
        <http:request method="DELETE" href="{$elastic:uri}/{$elastic:prefix}{$index}"/>
    )
};
declare function elastic:create-index($index, $data) {
let $json := elastic:to_json($data)
let $response := http:send-request(
<http:request method="PUT" href="{$elastic:uri}/{$elastic:prefix}{$index}">
<http:body media-type="application/json" method="text">{$json}</http:body>
</http:request>
)
return $response