<?xml version="1.0" encoding="utf-8"?>

Dennis Neumann's avatar
Dennis Neumann committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
<!-- 

This script produces Solr XML documents.


Field 'fulltext_html'

This field contains the HTML representation of the text of a TEI document (e.g. a Goethe letter).
The Goethe letters are composed of different parts, for example 'opener', 'closer', 'salute'.
All those parts are represented here as <div>'s with the corresponding CSS classes.
The frontend viewer must decide how to format those parts and present them to the user.

Also, the original TEI files contain mark-up for many in-text parts, like dates, names, underlined words, etc.
Most of these are also transformed to <div>'s with their own CSS classes.
Although the in-text parts are by nature inline elements, we use here <div>'s and not <span>'s.
The reason is that Solr seems to have problems when highlighting fields that contain <span>'s
by sometimes producing corrupt HTML.
By using <div>'s, we avoid this problem.
In the frontend, these <div>'s must be set to 'display: inline'.

Some other in-text parts are transformed to special HTML elements.
For example, superscripted text is marked as <sup>, because HTML offers the appropriate element.

The project is still continuing and new TEI files are being produced.
That's why there might be new elements in the future that cannot be handled yet in this script.
The text of such TEI elements is enclosed in HTML elements of class 'unknown-element'.
Furthermore, a warning message is generated that contains data of the first occurrence of such a new element.






 -->

Dennis Neumann's avatar
Dennis Neumann committed
38
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
   xpath-default-namespace="http://www.tei-c.org/ns/1.0" xmlns:gfl="http://sub.gfl.de"
   xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:saxon="http://saxon.sf.net/" exclude-result-prefixes="gfl saxon xs">

Dennis Neumann's avatar
Dennis Neumann committed
42
   <!-- Result is serialised as indented XML; the Saxon extension attribute names <div>
        as an element for which indentation is suppressed. -->
   <xsl:output
      method="xml"
      indent="yes"
      saxon:suppress-indentation="div" />

   <!-- Whitespace-only text nodes of the source are stripped everywhere
        except inside the elements listed in xsl:preserve-space. -->
   <xsl:strip-space elements="*" />
   <xsl:preserve-space elements="msIdentifier bibl p" />
45
46

   <!-- Entry point: hand the TEI root element over to its own template. -->
   <xsl:template match="/">
      <xsl:apply-templates select="child::TEI" />
   </xsl:template>

   <!-- Builds the Solr <add> envelope.
        The first <doc> holds the fields produced from the header and the text;
        the page_splitting mode then contributes further <doc> elements as siblings. -->
   <xsl:template match="TEI">
      <add>
         <doc>
            <xsl:apply-templates select="*[self::teiHeader or self::text]" />
         </doc>
         <xsl:apply-templates select="child::text/child::body/child::div" mode="page_splitting" />
      </add>
   </xsl:template>
Dennis Neumann's avatar
Dennis Neumann committed
58

59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
   <!-- Text node in the HTML output: whitespace runs are collapsed to a single space.
        If the original node ends with a soft hyphen (U+00AD), every soft hyphen in the
        collapsed text is turned into a minus sign; otherwise the collapsed text is emitted as is. -->
   <xsl:template match="text()" mode="html_for_whole_article">
      <xsl:variable name="collapsed" select="replace(., '\s+', ' ')" />
      <xsl:value-of select="
         if (ends-with(., '&#0173;'))
         then replace($collapsed, '&#0173;', '-')
         else $collapsed" />
   </xsl:template>
   
   <!-- Text node in the plain-text output: whitespace runs are collapsed to a single space,
        then a trailing hyphen is resolved.
        1. Node ends with '-': one word is divided between two lines. The minus sign is cut off
           here; the line break (<lb/>) is dealt with in its own template.
        2. Node ends with a soft hyphen (U+00AD): by convention this marks a hyphen that belongs
           to the word or stands on its own. For now soft hyphens are simply replaced by a minus
           sign; later it might be useful to tell word divisions and real hyphens apart.
        3. Anything else: the collapsed text is emitted unchanged. -->
   <xsl:template match="text()" mode="text_only">
      <xsl:variable name="collapsed" select="replace(., '\s+', ' ')" />
      <xsl:value-of select="
         if (ends-with(., '-'))
         then substring($collapsed, 1, string-length($collapsed) - 1)
         else if (ends-with(., '&#0173;'))
         then replace($collapsed, '&#0173;', '-')
         else $collapsed" />
   </xsl:template>
   
   <!--###########   Header   #######################-->
   
   <!-- Header: only the file description and the text classification are processed. -->
   <xsl:template match="teiHeader">
      <xsl:apply-templates select="(fileDesc | profileDesc/textClass)" />
   </xsl:template>
   
   <!-- File description: dispatches the titles, the names and the origin date nested in the
        descriptive title, the author names, and finally the source description. -->
   <xsl:template match="fileDesc">
      <xsl:variable name="stmt" select="titleStmt" />
      <xsl:variable name="descTitle" select="$stmt/title[@type='desc']" />

      <xsl:apply-templates select="$stmt/title" />
      <xsl:apply-templates select="$descTitle/name" />
      <xsl:apply-templates select="$descTitle/date[@type='orn']" />
      <xsl:apply-templates select="$stmt/author/name" />
      <xsl:apply-templates select="sourceDesc" />
   </xsl:template>
   
Dennis Neumann's avatar
Dennis Neumann committed
105
106
107
108
   <!-- Text classification: each keyword term is passed on to the keyword templates. -->
   <xsl:template match="profileDesc/textClass">
      <xsl:apply-templates select="child::keywords/child::term" />
   </xsl:template>
   
Dennis Neumann's avatar
Dennis Neumann committed
109
   <!-- Short title: its content, rendered in text_only mode, becomes the 'short_title' field. -->
   <xsl:template match="title[@type='short']">
      <field name="short_title">
         <xsl:apply-templates select="node()" mode="text_only" />
      </field>
   </xsl:template>

   <!-- Descriptive title: its content, rendered in text_only mode, becomes the 'title' field. -->
   <xsl:template match="title[@type='desc']">
      <field name="title">
         <xsl:apply-templates select="node()" mode="text_only" />
      </field>
   </xsl:template>

Dennis Neumann's avatar
Dennis Neumann committed
121
   <!-- Place name with subtype 'orn' inside a title: written to 'origin_place'. -->
   <xsl:template match="title/name[@type='place' and @subtype='orn']">
      <field name="origin_place">
         <xsl:apply-templates select="node()" mode="text_only" />
      </field>
   </xsl:template>

   <!-- Place name with subtype 'dtn' inside a title: written to 'destination_place'. -->
   <xsl:template match="title/name[@type='place' and @subtype='dtn']">
      <field name="destination_place">
         <xsl:apply-templates select="node()" mode="text_only" />
      </field>
   </xsl:template>

   <!-- Person name with subtype 'rcp' inside a title: written to 'recipient'. -->
   <xsl:template match="title/name[@type='person' and @subtype='rcp']">
      <field name="recipient">
         <xsl:apply-templates select="node()" mode="text_only" />
      </field>
   </xsl:template>

Dennis Neumann's avatar
Dennis Neumann committed
139
   <!-- Date of type 'orn' inside a title: the value of its @when attribute
        (not the element text) is written to 'origin_date'. -->
   <xsl:template match="title/date[@type='orn']">
      <field name="origin_date">
         <xsl:value-of select="string(@when)" />
      </field>
   </xsl:template>

Dennis Neumann's avatar
Dennis Neumann committed
145
   <!-- Person name with subtype 'aut' inside <author>: written to 'author'. -->
   <xsl:template match="author/name[@type='person' and @subtype='aut']">
      <field name="author">
         <xsl:apply-templates select="node()" mode="text_only" />
      </field>
   </xsl:template>
150
151
152
153
154
155
   
   <!-- Source description: its whole content, as plain text, goes into one field. -->
   <xsl:template match="sourceDesc">
      <field name="source_description">
         <xsl:apply-templates select="node()" mode="text_only" />
      </field>
   </xsl:template>
Dennis Neumann's avatar
Dennis Neumann committed
156

Dennis Neumann's avatar
Dennis Neumann committed
157
158
159
160
161
162
163
164
165
166
167
168
169
   <!-- Term of a keyword list whose @scheme is '#gnd': written to 'gnd_keyword'. -->
   <xsl:template match="textClass/keywords[@scheme='#gnd']/term">
      <field name="gnd_keyword">
         <xsl:apply-templates select="node()" mode="text_only" />
      </field>
   </xsl:template>

   <!-- Term of a keyword list whose @scheme is 'free': written to 'free_keyword'. -->
   <xsl:template match="textClass/keywords[@scheme='free']/term">
      <field name="free_keyword">
         <xsl:apply-templates select="node()" mode="text_only" />
      </field>
   </xsl:template>


170

Dennis Neumann's avatar
Dennis Neumann committed
171
172
173
174
175
176
   <!--###################   text/body   ##########################-->

   <!-- Produces the fields of the whole-article document:
        'id' (taken from @xml:id), the constant 'doctype', the plain-text 'fulltext',
        the HTML rendering 'fulltext_html', and one field per commentary note. -->
   <xsl:template match="text">
      <field name="id">
         <xsl:value-of select="string(@xml:id)" />
      </field>
      <field name="doctype">whole_article</field>
      <field name="fulltext">
         <xsl:apply-templates select="body" mode="text_only" />
      </field>
      <field name="fulltext_html">
         <!-- The CDATA delimiters are written unescaped around the generated markup,
              so the HTML ends up as character data inside the field. -->
         <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
         <div class="article">
            <xsl:apply-templates select="node()" mode="html_for_whole_article" />
         </div>
         <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
      </field>
      <!-- Commentary notes anywhere below <text>, processed in the default mode. -->
      <xsl:apply-templates select="descendant::note[@type='com']" />
   </xsl:template>
Dennis Neumann's avatar
Dennis Neumann committed
190
191
   
   <!-- Structural containers add nothing themselves in the plain-text output. -->
   <xsl:template match="body | div" mode="text_only">
      <xsl:apply-templates select="node()" mode="text_only" />
   </xsl:template>

195
   <!-- Block-like parts: their text is followed by one space so that
        the content of consecutive parts does not run together. -->
   <xsl:template match="p | salute | signed | addrLine | dateline" mode="text_only">
      <xsl:apply-templates select="node()" mode="text_only" />
      <xsl:value-of select="' '" />
   </xsl:template>
199

200
201
202
203
204
   <!-- Notes placed at the end: a separating space is written before their text. -->
   <xsl:template match="note[@place='end']" mode="text_only">
      <xsl:value-of select="' '" />
      <xsl:apply-templates select="node()" mode="text_only" />
   </xsl:template>

205
   <!-- Line break in the plain-text output. A space is written unless the nearest preceding
        sibling text node shows that the break falls inside a word. -->
   <xsl:template match="lb" mode="text_only">
      <xsl:variable name="before" select="string(preceding-sibling::text()[1])" />
      <!-- A word is divided between two lines. -->
      <xsl:variable name="dividedWord" select="ends-with($before, '-')" />
      <!-- The hyphen belongs to the word (Anna-<lb/>Lena): a soft hyphen (U+00AD)
           that is not itself preceded by a space. -->
      <xsl:variable name="hyphenatedWord"
         select="ends-with($before, '&#0173;') and not(ends-with($before, ' &#0173;'))" />
      <xsl:if test="not($dividedWord or $hyphenatedWord)">
         <xsl:text> </xsl:text>
      </xsl:if>
   </xsl:template>
Dennis Neumann's avatar
Dennis Neumann committed
221

222
223
224
225
226
227
228
229
   <!-- Page breaks and <space> elements are each represented by a single space in the plain text. -->
   <xsl:template match="pb | space" mode="text_only">
      <xsl:text> </xsl:text>
   </xsl:template>

230
   <!-- Commentary notes contribute nothing in text_only mode. -->
   <xsl:template match="note[@type='com']" mode="text_only" />
Dennis Neumann's avatar
Dennis Neumann committed
232
   
Dennis Neumann's avatar
Dennis Neumann committed
233
234
235
236
237
238
   <!-- Default mode: the content of a commentary note, as plain text, becomes a field of its own. -->
   <xsl:template match="note[@type='com']">
      <field name="note_comment">
         <xsl:apply-templates select="node()" mode="text_only" />
      </field>
   </xsl:template>
   
Dennis Neumann's avatar
Dennis Neumann committed
239
240
   
   <!-- ++++++++++++ HTML +++++++++++++++++++ -->
241
242

   <!-- Fallback for every element that has no dedicated template in this mode.
        A message naming the element (plus its @rendition and @type, when present) is emitted,
        and the element's content is wrapped in an HTML element of class 'unknown-element'.
        The wrapper is a <div>, not a <span>: as explained in the header of this file, Solr
        highlighting can produce corrupt HTML for fields that contain <span> elements, which is
        why inline parts are rendered as <div> throughout this stylesheet.
        NOTE(review): the message ends with 'first occurrence: ' and nothing after it;
        presumably the caller appends the details. Confirm before changing the text. -->
   <xsl:template match="*" mode="html_for_whole_article">
      <xsl:message>
         <xsl:text>Unknown element &lt;</xsl:text>
         <xsl:value-of select="local-name()" />
         <xsl:if test="@rendition">
            <xsl:text> rendition="</xsl:text>
            <xsl:value-of select="@rendition" />
            <xsl:text>"</xsl:text>
         </xsl:if>
         <xsl:if test="@type">
            <xsl:text> type="</xsl:text>
            <xsl:value-of select="@type" />
            <xsl:text>"</xsl:text>
         </xsl:if>
         <xsl:text>&gt; - first occurrence: </xsl:text>
      </xsl:message>
      <div class="unknown-element">
         <xsl:apply-templates mode="html_for_whole_article" />
      </div>
   </xsl:template>
   
   <!-- Structural containers produce no markup of their own in the HTML output. -->
   <xsl:template match="body | div" mode="html_for_whole_article">
      <xsl:apply-templates select="node()" mode="html_for_whole_article" />
   </xsl:template>
Dennis Neumann's avatar
Dennis Neumann committed
268
   
269
   <!-- Parts that map one-to-one to a <div> whose CSS class is the element's local name. -->
   <xsl:template mode="html_for_whole_article"
      match="p | opener | salute | seg | bibl | closer | signed | dateline | date
      | label[not(@rendition)] | choice | abbr | expan | postscript">
      <div class="{local-name(.)}">
         <xsl:apply-templates select="node()" mode="html_for_whole_article" />
      </div>
   </xsl:template>

   <!-- A TEI line break becomes an HTML line break. -->
   <xsl:template match="lb" mode="html_for_whole_article">
      <br />
   </xsl:template>
279

280
281
   <!-- Vertical space measured in lines: one <br/> per line when @quantity can be read as an
        integer, otherwise exactly one <br/>. -->
   <xsl:template match="space[@unit='lines']" mode="html_for_whole_article">
      <xsl:variable name="lineCount" as="xs:integer"
         select="if (@quantity castable as xs:integer) then xs:integer(@quantity) else 1" />
      <xsl:for-each select="1 to $lineCount">
         <br />
      </xsl:for-each>
   </xsl:template>

   <!-- Names of type 'place', 'org', 'person' or 'object' become a <div>
        whose CSS class is the value of @type. -->
   <xsl:template mode="html_for_whole_article"
      match="name[@type = ('place', 'org', 'person', 'object')]">
      <div class="{@type}">
         <xsl:apply-templates select="node()" mode="html_for_whole_article" />
      </div>
   </xsl:template>
   
318
319
   <!-- Page break in the HTML output, rendered as <div class="page-break">.
        Content of the div:
          - page label (@n) linked to the facsimile, when both a label and a graphic URL exist;
          - the bare page label, when there is a label but no graphic URL;
          - a single space otherwise.
        The graphic URL is the @url of the element whose ID is referenced by @facs; a trailing
        '.jpg' is removed and a leading '/' is added for the link target. -->
   <xsl:template match="pb" mode="html_for_whole_article">
      <div class="page-break">
         <!-- Drop the first character of @facs (presumably the '#' of a fragment reference). -->
         <xsl:variable name="facsId" select="substring(@facs, 2)" />
         <xsl:variable name="graphicUrl" select="id($facsId)/@url" />
         <!-- Bound with select/as so that the value is a plain string. A variable filled through
              a sequence constructor would be a document node, whose effective boolean value is
              always true, which would make the 'label without link' branch unreachable. -->
         <xsl:variable name="graphicUrlWithoutJpg" as="xs:string"
            select="if (ends-with($graphicUrl, '.jpg'))
                    then substring($graphicUrl, 1, string-length($graphicUrl) - 4)
                    else string($graphicUrl)" />
         <xsl:variable name="pageLabel" as="xs:string" select="string(@n)" />

         <xsl:choose>
            <xsl:when test="$pageLabel ne '' and $graphicUrlWithoutJpg ne ''">
               <a href="{concat('/', $graphicUrlWithoutJpg)}" target="_blank">
                  <xsl:value-of select="$pageLabel" />
               </a>
            </xsl:when>
            <xsl:when test="$pageLabel ne ''">
               <xsl:value-of select="$pageLabel" />
            </xsl:when>
            <xsl:otherwise>
               <xsl:text> </xsl:text>
            </xsl:otherwise>
         </xsl:choose>
      </div>
   </xsl:template>
   
342
343
344
345
346
347
   <!-- Referencing strings of type 'person' or 'place' become a <div>
        with class 'rs-person' or 'rs-place' respectively. -->
   <xsl:template match="rs[@type = ('person', 'place')]" mode="html_for_whole_article">
      <div class="rs-{@type}">
         <xsl:apply-templates select="node()" mode="html_for_whole_article" />
      </div>
   </xsl:template>
      
354
355
356
357
358
   <!-- Commentary note: <div class="note-comment">. -->
   <xsl:template match="note[@type='com']" mode="html_for_whole_article">
      <div class="note-comment">
         <xsl:apply-templates select="node()" mode="html_for_whole_article" />
      </div>
   </xsl:template>

   <!-- Footnote: <div class="note-footnote">. -->
   <xsl:template match="note[@type='footnote']" mode="html_for_whole_article">
      <div class="note-footnote">
         <xsl:apply-templates select="node()" mode="html_for_whole_article" />
      </div>
   </xsl:template>

   <!-- Note without a @type attribute: <div class="note">. -->
   <xsl:template match="note[not(@type)]" mode="html_for_whole_article">
      <div class="note">
         <xsl:apply-templates select="node()" mode="html_for_whole_article" />
      </div>
   </xsl:template>
371
372
373
374
375
376
377
   
   <!-- Highlighting expressed through a CSS class: the class name is the part of @rendition
        after 'simple:' (underline, doubleunderline, italic, letterspace, right). -->
   <xsl:template mode="html_for_whole_article"
      match="hi[@rendition = ('simple:underline', 'simple:doubleunderline', 'simple:italic',
                              'simple:letterspace', 'simple:right')]">
      <div class="{substring-after(@rendition, 'simple:')}">
         <xsl:apply-templates select="node()" mode="html_for_whole_article" />
      </div>
   </xsl:template>

   <!-- Superscript has a dedicated HTML element. -->
   <xsl:template match="hi[@rendition='simple:superscript']" mode="html_for_whole_article">
      <sup>
         <xsl:apply-templates select="node()" mode="html_for_whole_article" />
      </sup>
   </xsl:template>

   <!-- Subscript has a dedicated HTML element. -->
   <xsl:template match="hi[@rendition='simple:subscript']" mode="html_for_whole_article">
      <sub>
         <xsl:apply-templates select="node()" mode="html_for_whole_article" />
      </sub>
   </xsl:template>
   
414
415
416
417
418
419
   <!-- Reference with a target: an HTML link whose href is the value of @target. -->
   <xsl:template match="ref[@target]" mode="html_for_whole_article">
      <a href="{string(@target)}">
         <xsl:apply-templates select="node()" mode="html_for_whole_article" />
      </a>
   </xsl:template>
   
Dennis Neumann's avatar
Dennis Neumann committed
420
421
   <!-- Heads and labels that carry @rendition: a <div> with two space-separated CSS classes,
        the element's local name followed by the part of @rendition after 'simple:'. -->
   <xsl:template match="head[@rendition] | label[@rendition]" mode="html_for_whole_article">
      <div class="{local-name()} {substring-after(@rendition, 'simple:')}">
         <xsl:apply-templates select="node()" mode="html_for_whole_article" />
      </div>
   </xsl:template>
   
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
   
   <!-- %%%%%%%%%%%%% page splitting %%%%%%%%%%%%%%%%%%%%%%% -->

   <!-- Splits a <div> into one Solr <doc> per page.
        All leaf nodes below the div are grouped so that every <pb> opens a new group.
        A group that does not begin with a <pb> (leaf nodes in front of the first page break)
        yields no document. For each remaining group the div's child elements are processed
        again in this mode, with the tunnel parameter 'restricted-to' holding the group's nodes
        and all their ancestors; the other templates of this mode use it to decide what to output. -->
   <xsl:template match="div" mode="page_splitting">
      <xsl:variable name="splitRoot" select="." />
      <!-- @xml:id of the element two levels above the div. -->
      <xsl:variable name="articleId" select="$splitRoot/../../@xml:id" />

      <xsl:for-each-group select="descendant::node()[not(node())]" group-starting-with="pb">
         <xsl:if test="self::pb">
            <!-- Position of this page break among all page breaks of the document. -->
            <xsl:variable name="pageNumber" select="count(preceding::pb) + 1" />
            <doc>
               <field name="id">
                  <xsl:value-of select="concat($articleId, '_page', $pageNumber)" />
               </field>
               <field name="article_id">
                  <xsl:value-of select="$articleId" />
               </field>
               <field name="doctype">one_page</field>
               <field name="page_number">
                  <xsl:value-of select="$pageNumber" />
               </field>
               <field name="html_page">
                  <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
                  <div class="page">
                     <div class="page-beginning">
                        <xsl:value-of select="@n" />
                     </div>
                     <xsl:apply-templates select="$splitRoot/*" mode="page_splitting">
                        <xsl:with-param name="restricted-to" tunnel="yes"
                           select="current-group()/ancestor-or-self::node()" />
                     </xsl:apply-templates>
                  </div>
                  <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
               </field>
            </doc>
         </xsl:if>
      </xsl:for-each-group>
   </xsl:template>

   <!-- Paragraphs and names are written (as a <div> classed with the element's local name)
        only when the element itself is among the nodes in the tunnel parameter 'restricted-to'. -->
   <xsl:template match="p | name" mode="page_splitting">
      <xsl:param name="restricted-to" tunnel="yes" />
      <xsl:variable name="onCurrentPage" select="exists(. intersect $restricted-to)" />
      <xsl:if test="$onCurrentPage">
         <div class="{local-name()}">
            <xsl:apply-templates select="node()" mode="page_splitting" />
         </div>
      </xsl:if>
   </xsl:template>

   <!-- A text node is copied to the output only when it is among the nodes
        in the tunnel parameter 'restricted-to'. -->
   <xsl:template match="text()" mode="page_splitting">
      <xsl:param name="restricted-to" tunnel="yes" />
      <xsl:variable name="onCurrentPage" select="exists(. intersect $restricted-to)" />
      <xsl:if test="$onCurrentPage">
         <xsl:copy-of select="." />
      </xsl:if>
   </xsl:template>

   <!-- Page breaks themselves produce no output in this mode. -->
   <xsl:template match="pb" mode="page_splitting">
   </xsl:template>

Dennis Neumann's avatar
Dennis Neumann committed
483
</xsl:stylesheet>