xquery version "3.1";

(:~
 : This module handles the conversion of the Fontane-TEI/XML into a TEI subset
 : for the edited text. The resulting TEI is the basis for the "Edierter
 : Text" (edited text) view on the website and the book. It represents the latest
 : layer of text.
 : 
 : Its main purpose is to tidy up the intermediate TEI that has been created by
 : tei2teisimple.
 :
 : @author Michelle Weidling
 : @version 0.1
 : @since 0.0.0
 :)

module namespace tidy ="http://fontane-nb.dariah.eu/tidy";


declare namespace tei="http://www.tei-c.org/ns/1.0";
declare namespace test="http://exist-db.org/xquery/xqsuite";

import module namespace config="http://textgrid.de/ns/SADE/config" at "../../config/config.xqm";
import module namespace functx="http://www.functx.com";
import module namespace index-info="http://fontane-nb.dariah.eu/index-info" at "index-info.xqm";
import module namespace simpleHelpers="http://fontane-nb.dariah.eu/teisimplehelpers" at "teisimplehelpers.xqm";


(: Only contemporary hands (and selected posthumous hands) are considered for
 : the edited text. This variable collects the @xml:id of every tei:handNote
 : with @script = "contemporary" found in the resources of the data collection. :)
declare variable $tidy:valid-hands :=
    collection($config:data-root || "/data")
    ! .//tei:handNote[@script = "contemporary"]/@xml:id/string();


(:~
 : Entry point of the module. Sends the given TEI through all tidying steps,
 : prepares the header, wraps both in a new tei:TEI element and stores the
 : result as "<$uri>.xml" in the "/print/xml/" collection below
 : $config:data-root.
 :
 : @param $tei the intermediate TEI to be tidied up
 : @param $uri the base name (without ".xml") of the resource to be stored
 : @return the newly created tei:TEI element
 :)
declare function tidy:main($tei as node()*, $uri as xs:string) {
    (: tei:TEI/@id is always something like 'Notizbuch A1'.
    for sorting we use the shelf number, i.e. the second token :)
    let $notebook-id := $tei//tei:TEI/@id
    let $shelf-number := tokenize($notebook-id, " ")[2]
    let $edited :=
        $tei
        => tidy:enhance-handshifts()
        => tidy:sort-out-surplus-elements()
        => tidy:sort-out-invalid-hands()
        => tidy:split-headings()
        => tidy:summarize()
        => tidy:summarize-headings()
        => tidy:summarize-notes()
        => tidy:summarize-hi()
        => tidy:sort-double-imgs()
        => tidy:tidy()
    let $prepared-header :=
        $tei//tei:teiHeader[parent::tei:TEI]
        => tidy:get-Fontanes-sources()
        => tidy:get-references-in-abstract()
    let $result :=
        <TEI xmlns="http://www.tei-c.org/ns/1.0"
            id="{$notebook-id}"
            key1="{substring($shelf-number, 1, 1)}"
            key2="{substring($shelf-number, 2)}">{$prepared-header}{$edited//tei:text}</TEI>
    (: the binding only exists to trigger the storing of the result :)
    let $stored := xmldb:store($config:data-root || "/print/xml/", $uri || ".xml", $result)
    return
        $result
};

(:~
 : Returns the text that has been written by contemporary (or certain posthumous)
 : hands. Up until this point, all encoded hands and their texts are still in
 : place.
 :
 : Each node is judged by the closest preceding
 : tei:milestone[@unit = "handshift"]:
 :
 : * text nodes are dropped if that handshift exists and is not valid
 : * handshift milestones are kept only if they are valid themselves
 : * line milestones are always kept
 : * all other milestones are dropped if the preceding handshift is not valid
 : * tei:div[@type = ("Kalenderblatt", "clipping")] is always kept
 : * everything else is decided by tidy:invalid-hands-default-return
 :
 : Kept elements are rebuilt via tidy:construct-element with the flag "post",
 : which applies this function to their child nodes.
 :
 : @param $nodes the nodes to be filtered
 : @return the nodes that remain after removing the text of invalid hands
 :)
declare function tidy:sort-out-invalid-hands($nodes as node()*)
as node()* {
    for $node in $nodes
    let $prev-handshift := $node/preceding::tei:milestone[@unit = "handshift"][1]
    let $is-hand-not-valid := not(simpleHelpers:is-hand-valid($tidy:valid-hands, $prev-handshift))
    return
        typeswitch ($node)
        case text() return
            if($prev-handshift
            and $is-hand-not-valid) then
                ()
            else
                $node

        (: All lines have to be preserved because of the editorial commentary
        which references the lines in the notebooks. If we omitted @unit = "line",
        referencing wouldn't work any longer :)
        case element(tei:milestone) return
            if($node/@unit = "handshift" and
            simpleHelpers:is-hand-valid($tidy:valid-hands, $node)) then
                tidy:construct-element($node, "post")

            else if($node/@unit = "handshift") then
                ()

            else if($prev-handshift
            and $node/@unit = "line"
            and $is-hand-not-valid) then
                tidy:construct-element($node, "post")

            else if($prev-handshift
            and $is-hand-not-valid) then
                ()

            else
                tidy:construct-element($node, "post")

        case element(tei:div) return
            (: even though it's posthumous we want to keep the text written on
            calendar pages by Friedrich Fontane. Unfortunately, Friedrich's
            handshift is oftentimes not the first hand appearing on the page
            but we want to keep the page nevertheless. :)
            if($node/@type = "Kalenderblatt"
            or $node/@type = "clipping") then
                tidy:construct-element($node, "post")
            else
                tidy:invalid-hands-default-return($node)

        default return
            tidy:invalid-hands-default-return($node)
};

(:~
 : Decides whether a node is kept while sorting out invalid hands.
 :
 : The node is kept (and rebuilt with the flag "post") if
 :
 : * its first child element is a handshift milestone of a valid hand and the
 :   first child node is either that milestone or whitespace only, or
 : * there is no preceding handshift, or the preceding handshift is valid.
 :
 : Otherwise the empty sequence is returned.
 :
 : The checks for "is the first child" use the node identity operator `is`.
 : A general comparison (`=`) would compare the atomized string values, and
 : since milestones are empty elements every empty first child element would
 : be taken for the handshift.
 :
 : @param $node the current node
 : @return the rebuilt node or the empty sequence
 :)
declare function tidy:invalid-hands-default-return($node as node()*)
as node()* {
    let $prev-handshift := $node/preceding::tei:milestone[@unit = "handshift"][1]
    let $first-child-handshift := $node/child::tei:milestone[@unit = "handshift"][1]
    let $first-child-element := $node/child::*[1]
    let $first-child-node := $node/child::node()[1]

    return
        (: in some cases the valid handshift is the first child node
        instead of a previous node. of course we want to keep the element
        then. `is` yields the empty sequence (i.e. false) if one of the
        operands is missing. :)
        if($first-child-element is $first-child-handshift
        (: ensure there's no text before the handshift :)
        and (normalize-space($first-child-node) = ""
            or $first-child-element is $first-child-node)
        and simpleHelpers:is-hand-valid($tidy:valid-hands, $first-child-handshift)) then
            tidy:construct-element($node, "post")

        else if($prev-handshift
        and not(simpleHelpers:is-hand-valid($tidy:valid-hands, $prev-handshift))) then
            ()

        else
            tidy:construct-element($node, "post")
};

(:~
 : Removes elements that aren't considered in the edited text:
 :
 : * subsequent handshifts of the same type
 : * line markers within or directly after tei:seg[@type = "missing-hyphen"]
 : * empty elements that have lost their text nodes during the sorting process
 :
 : Text nodes are returned unchanged. All remaining elements are rebuilt via
 : tidy:construct-element with the flag "surplus", which applies this function
 : to their child nodes.
 :
 : @param $nodes the nodes to be checked
 : @return the nodes without the surplus elements
 :)
declare function tidy:sort-out-surplus-elements($nodes as node()*)
as node()* {
    for $node in $nodes
    return
        typeswitch ($node)
        case text() return
            $node

        case element(tei:milestone) return
            let $unit := $node/@unit
            let $belongs-to-missing-hyphen :=
                exists($node/ancestor::tei:seg[@type = "missing-hyphen"])
                or exists($node/preceding-sibling::*[1][self::tei:seg[@type = "missing-hyphen"]])
            return
                if ($unit = "handshift") then
                    (: a handshift repeating the previous hand is redundant :)
                    if (simpleHelpers:is-prev-hand-same($node)) then
                        ()
                    else
                        tidy:construct-element($node, "surplus")
                else if ($unit = "line" and $belongs-to-missing-hyphen) then
                    ()
                else
                    tidy:construct-element($node, "surplus")

        (: these elements are dropped as soon as they don't have any child
        nodes left :)
        case element(tei:head)
            | element(tei:date)
            | element(tei:rs)
            | element(tei:note)
            | element(tei:list)
            | element(tei:item) return
            tidy:surplus-elements-default-return($node)

        case element(tei:abbr) return
            if (empty($node/node())) then
                ()
            else
                tidy:construct-element($node, "surplus")

        (: only labels are removed when empty, other divisions are kept :)
        case element(tei:div) return
            if ($node/@type = "label" and empty($node/node())) then
                ()
            else
                tidy:construct-element($node, "surplus")

        default return
            tidy:construct-element($node, "surplus")
};

(:~
 : As a result of the previous sorting process, certain elements may be empty
 : at this process stage. They don't have any information value anymore and are
 : therefore removed.
 :
 : The return type is optional because the empty sequence is returned for
 : nodes without child nodes.
 :
 : @param $node the current node
 : @return the empty sequence if $node has no child nodes, otherwise the
 : element rebuilt by tidy:construct-element with the flag "surplus"
 :)
declare function tidy:surplus-elements-default-return($node as node())
as element()? {
    if (empty($node/node())) then
        ()
    else
        tidy:construct-element($node, "surplus")
};


(:~
 : Checks if text follows the given milestone before the next handshift.
 :
 : @param $node the current tei:milestone
 : @return true() if no tei:milestone[@unit = "handshift"] follows $node at
 : all, or if at least one text node that doesn't consist of whitespace only
 : lies between $node and the next handshift. false() otherwise.
 :)
declare function tidy:has-hand-text($node as element(tei:milestone))
as xs:boolean {
    let $next-handshift := $node/following::tei:milestone[@unit = "handshift"][1]
    return
        empty($next-handshift)
        or exists(
            $node/following::text()
                [. << $next-handshift]
                [normalize-space(.) != ""]
        )
};

(:~
 : A constructor. Creates an element in the TEI namespace with the same name
 : and attributes as the current node and jumps back into the processing step
 : selected by $flag, which is applied to the child nodes.
 :
 : Comments and processing instructions are returned unchanged. They reach
 : this function via the default branches of the callers, and a TEI element
 : can't be built from them: fn:name() returns "" for a comment (not a valid
 : QName) and the target for a processing instruction.
 :
 : @param $node The current node
 : @param $flag Indicates the function to be called from within the constructor
 : @return the rebuilt element. In case of an unknown $flag the element
 : contains an error message instead of the processed child nodes.
 :)
declare function tidy:construct-element($node as node(), $flag as xs:string)
{
    if ($node instance of comment()
        or $node instance of processing-instruction()) then
        $node
    else
        element {QName("http://www.tei-c.org/ns/1.0", $node/name())}{
            $node/@*,
            let $children := $node/node()
            return
                switch ($flag)
                case "post" return
                    tidy:sort-out-invalid-hands($children)
                case "surplus" return
                    tidy:sort-out-surplus-elements($children)
                case "hs-enhance" return
                    tidy:enhance-handshifts($children)
                case "sources" return
                    tidy:get-Fontanes-sources($children)
                case "summarize" return
                    tidy:summarize($children)
                case "summarize-headings" return
                    tidy:summarize-headings($children)
                case "summarize-notes" return
                    tidy:summarize-notes($children)
                case "summarize-hi" return
                    tidy:summarize-hi($children)
                case "ref" return
                    tidy:get-references-in-abstract($children)
                case "double-imgs" return
                    tidy:sort-double-imgs($children)
                case "tidy" return
                    tidy:tidy($children)
                default return
                    text{"!!!Kopieren des Elements fehlgeschlagen!!!"}
        }
};

(:~
 : Purges surplus attributes from tei:milestone[@unit = "handshift"], i.e.
 : @subtype and @rend are removed if their value is the empty string.
 :
 : @unit is always set to "handshift". It is therefore excluded from the
 : copied attributes: two attributes with the same name in one element
 : constructor raise err:XQDY0025.
 :
 : @author Michelle Weidling
 : @param $node the current tei:milestone[@unit = "handshift"]
 : @return the purged tei:milestone[@unit = "handshift"]
 :)
declare function tidy:clear-handshift($node as element(tei:milestone))
as element(tei:milestone) {
    element {QName("http://www.tei-c.org/ns/1.0", $node/name())} {
        attribute unit {"handshift"},
        $node/(@* except (@unit, @subtype, @rend)),
        (: keep @subtype and @rend only if they carry a value :)
        $node/(@subtype, @rend)[. != ""]
    }
};