Commit 7a1dce0b authored by Dennis Neumann
Browse files

Read languages from TEI

parent 996cdd5f
......@@ -44,7 +44,7 @@ The second kind of documents that are produced are page documents.
The resulting pages are in the HTML format.
As the TEI file is processed, the TEI XML structure is split into pages using
the page beginning elements (<pb/>).
Refer to comment in the code to understand the used algorithm.
Refer to comments in the code to understand the used algorithm.
-->
......@@ -65,6 +65,11 @@ Refer to comment in the code to understand the used algorithm.
<add>
<doc>
<xsl:apply-templates select="teiHeader | text" />
<xsl:for-each select="distinct-values(//@xml:lang)">
<field name="language">
<xsl:value-of select="." />
</field>
</xsl:for-each>
</doc>
<xsl:apply-templates select="text" mode="page_splitting" />
</add>
......
package sub.gfl;
import static org.junit.Assert.*;
import java.io.ByteArrayOutputStream;
import java.io.OutputStream;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import net.sf.saxon.s9api.SaxonApiException;
import sub.ent.backend.Xslt;
/**
 * Exploratory test for the page-splitting stylesheet
 * {@code src/test/resources/split-foreach-group.xslt}.
 *
 * <p>The stylesheet is configured once for all tests. Each test writes its
 * transformation result into a fresh in-memory buffer, and the buffer content
 * is printed to standard output after every test so that it can be inspected
 * manually.
 */
public class SplitTest {

    private static Xslt xslt;

    // Declared with the concrete type: the tests rely on
    // ByteArrayOutputStream.toString() to read back the captured output.
    private ByteArrayOutputStream outputBaos;

    /** Creates the shared transformer and points it at the stylesheet under test. */
    @BeforeClass
    public static void beforeAllTests() throws Exception {
        xslt = new Xslt();
        xslt.setXsltScript("src/test/resources/split-foreach-group.xslt");
    }

    /** Starts every test with an empty output buffer. */
    @Before
    public void beforeEachTest() throws Exception {
        outputBaos = new ByteArrayOutputStream();
    }

    /** Prints whatever the test wrote into the buffer. */
    @After
    public void afterEachTest() {
        System.out.println(outputBaos.toString());
    }

    /**
     * Transforms a snippet and checks that the transformation produced output
     * at all (previously the result was discarded without any assertion).
     */
    @Test
    public void testGroup() throws Exception {
        String result = transform("two-page-beginnings.xml");
        assertFalse("transformation produced no output", result.isEmpty());
    }

    /**
     * Runs the stylesheet on a file from {@code src/test/resources/tei-snippets-split/}.
     *
     * @param fileName name of the snippet file inside that directory
     * @return everything written to the output buffer so far, as a string
     * @throws SaxonApiException if the transformation fails
     */
    private String transform(String fileName) throws SaxonApiException {
        xslt.transform("src/test/resources/tei-snippets-split/" + fileName, outputBaos);
        return outputBaos.toString();
    }
}
......@@ -35,6 +35,22 @@ public class XsltTest {
System.out.println(outputBaos.toString());
}
/**
 * A document that uses two different language codes must yield exactly two
 * language fields, one per code.
 *
 * <p>The values are checked by content rather than by position, so the test
 * does not depend on the order in which the stylesheet emits the fields
 * (the order of a distinct-values() result is implementation-dependent).
 */
@Test
public void twoDifferentLanguages() throws Exception {
    String result = transform("language_twoDifferentEntries.xml");
    assertXpathEvaluatesTo("2", "count(//field[@name='language'])", result);
    assertXpathEvaluatesTo("1", "count(//field[@name='language'][. = 'ger'])", result);
    assertXpathEvaluatesTo("1", "count(//field[@name='language'][. = 'eng'])", result);
}
/**
 * A document with a single language code must yield exactly one language
 * field holding that code.
 *
 * <p>The count is asserted explicitly because the string value of an XPath
 * node set only reflects its first node; additional language fields would
 * otherwise go unnoticed.
 */
@Test
public void oneLanguage() throws Exception {
    String result = transform("language.xml");
    assertXpathEvaluatesTo("1", "count(//field[@name='language'])", result);
    assertXpathEvaluatesTo("ger", "//field[@name='language']", result);
}
@Test
public void spaceAfterAddressLine() throws Exception {
String result = transform("address-in-opener.xml");
......
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xpath-default-namespace="http://www.tei-c.org/ns/1.0" version="2.0">
<xsl:output method="xml" indent="yes" />
<!-- Entry point: wraps the complete result in add/doc elements and continues with the TEI root element. -->
<xsl:template match="/">
<add>
<doc>
<xsl:apply-templates select="TEI" />
</doc>
</add>
</xsl:template>
<!-- Only the body of the text is processed; other children of TEI (e.g. teiHeader) are not selected here. -->
<xsl:template match="TEI">
<xsl:apply-templates select="text/body" />
</xsl:template>
<!-- Passes processing on to all child nodes of body. -->
<xsl:template match="body">
<xsl:apply-templates />
</xsl:template>
<!--
Splits the content of a div into pages.

All leaf nodes below the div (nodes that have no child nodes) are grouped so
that every pb element starts a new group. For each group that starts with a
pb, one "html_page" field is written. Inside that field the element children
of the div are processed in mode "split"; the tunnel parameter "restricted-to"
carries the nodes of the current group together with all of their ancestors,
so that the "split" templates can leave out everything that does not belong
to the current page.

A leading group that does not start with a pb (content in front of the first
pb) produces no output.
-->
<xsl:template match="div">
<!-- Remember the div itself: inside for-each-group the context item is the first node of the group. -->
<xsl:variable name="context" select="." />
<xsl:for-each-group select="descendant::node()[not(node())]" group-starting-with="pb">
<!-- Skip the group in front of the first pb. -->
<xsl:if test="self::pb">
<field name="html_page">
<div class="page">
<div class="page-beginning">
<!-- Running number of this pb (count of preceding pb elements plus one), followed by the value of its n attribute. -->
<!-- NOTE(review): both values are written back to back without a separator; confirm that this is intended. -->
<xsl:value-of select="count(self::pb/preceding::pb) + 1" />
<xsl:value-of select="self::pb/@n" />
</div>
<!-- Walk the div's element children again, restricted to the nodes of this page and their ancestors. -->
<xsl:apply-templates select="$context/*" mode="split">
<xsl:with-param name="restricted-to" select="current-group()/ancestor-or-self::node()" tunnel="yes" />
</xsl:apply-templates>
</div>
</field>
</xsl:if>
</xsl:for-each-group>
</xsl:template>
<!--
Mode "split": writes a p or name element as an HTML div whose class is the
element's local name, but only if the element is contained in the tunnelled
"restricted-to" node set. Its children are processed in the same mode.
-->
<xsl:template match="p | name" mode="split">
<xsl:param name="restricted-to" tunnel="yes" />
<xsl:if test="exists(. intersect $restricted-to)">
<div class="{local-name(.)}">
<xsl:apply-templates mode="split" />
</div>
</xsl:if>
</xsl:template>
<!-- Mode "split": copies a text node only if it is contained in the tunnelled "restricted-to" node set. -->
<xsl:template match="text()" mode="split">
<xsl:param name="restricted-to" tunnel="yes" />
<xsl:if test="exists(. intersect $restricted-to)">
<xsl:copy />
</xsl:if>
</xsl:template>
<!-- Mode "split": pb elements produce no output of their own here. -->
<xsl:template match="pb" mode="split" />
</xsl:stylesheet>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
</teiHeader>
<text xml:id="my_id" xml:lang="ger">
<body>
<div>
<p>Test text.</p>
</div>
</body>
</text>
</TEI>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
</teiHeader>
<text xml:id="my_id" xml:lang="ger">
<body>
<div xml:lang="ger">
<p xml:lang="eng">Test text.</p>
</div>
</body>
</text>
</TEI>
\ No newline at end of file
......@@ -56,6 +56,7 @@
<!-- for doctype 'article' -->
<field name="number_of_pages" type="int" />
<field name="language" type="string" multiValued="true" />
<field name="fulltext" type="text_de" multiValued="false" />
<field name="fulltext_html" type="text_de" multiValued="false" />
<field name="short_title" type="text_de" multiValued="false" />
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment