Commit 6a8dfc7d authored by Dennis Neumann's avatar Dennis Neumann
Browse files

Start generating split HTML snippets for each page

parent 75d5ac45
......@@ -44,17 +44,18 @@ Furthermore, a warning message is generated that contains data of the first occu
<xsl:preserve-space elements="msIdentifier bibl p" />
<xsl:template match="/">
<xsl:apply-templates select="TEI" />
</xsl:template>
<xsl:template match="TEI">
<add>
<doc>
<xsl:apply-templates select="TEI" />
<xsl:apply-templates select="teiHeader | text" />
</doc>
<xsl:apply-templates select="text/body/div" mode="page_splitting" />
</add>
</xsl:template>
<xsl:template match="TEI">
<xsl:apply-templates select="teiHeader | text" />
</xsl:template>
<xsl:template match="text()" mode="html_for_whole_article">
<xsl:variable name="currentText" select="replace(., '\s+', ' ')" />
<xsl:choose>
......@@ -173,6 +174,7 @@ Furthermore, a warning message is generated that contains data of the first occu
<field name="id">
<xsl:value-of select="@xml:id" />
</field>
<field name="doctype">whole_article</field>
<field name="fulltext">
<xsl:apply-templates select="body" mode="text_only" />
</field>
......@@ -422,4 +424,60 @@ Furthermore, a warning message is generated that contains data of the first occu
</div>
</xsl:template>
<!-- %%%%%%%%%%%%% page splitting %%%%%%%%%%%%%%%%%%%%%%% -->
<!--
Splits the content of one div into pages and emits one <doc> per page.
The leaf nodes of the div (descendants that have no child nodes) are grouped so that
every pb element starts a new group. A group that does not start with a pb
(content in front of the first pb of this div) produces no output.
-->
<xsl:template match="div" mode="page_splitting">
<!-- Keep the div itself reachable; inside for-each-group the context item is a leaf node. -->
<xsl:variable name="context" select="." />
<xsl:for-each-group select="descendant::node()[not(node())]" group-starting-with="pb">
<!-- The context item is the first node of the current group; only groups headed by a pb become a page. -->
<xsl:if test="self::pb">
<doc>
<!-- 1-based page number: number of pb elements preceding this one in the document, plus one. -->
<xsl:variable name="pageNumber" select="count(self::pb/preceding::pb) + 1" />
<!-- Page id: xml:id of the element two levels above the div, followed by "_page" and the page number. -->
<field name="id">
<xsl:value-of select="$context/../../@xml:id" />
<xsl:text>_page</xsl:text>
<xsl:value-of select="$pageNumber" />
</field>
<!-- Same xml:id without the page suffix. -->
<field name="article_id">
<xsl:value-of select="$context/../../@xml:id" />
</field>
<field name="doctype">one_page</field>
<field name="page_number">
<xsl:value-of select="$pageNumber" />
</field>
<field name="html_page">
<!-- The CDATA delimiters are written unescaped so that the generated markup ends up inside a CDATA section. -->
<xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
<div class="page">
<!-- Page label taken from the n attribute of the pb that starts this group. -->
<div class="page-beginning">
<xsl:value-of select="self::pb/@n" />
</div>
<!--
Walk the children of the div again, passing the nodes of the current group together
with all their ancestors as tunnel parameter; the page_splitting templates only
produce output for nodes contained in that set.
-->
<xsl:apply-templates select="$context/*" mode="page_splitting">
<xsl:with-param name="restricted-to" select="current-group()/ancestor-or-self::node()" tunnel="yes" />
</xsl:apply-templates>
</div>
<xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
</field>
</doc>
</xsl:if>
</xsl:for-each-group>
</xsl:template>
<!--
Renders a p or name element as an HTML div whose class is the element's local name,
but only if the element is contained in the tunnelled restricted-to node set.
Elements outside that set produce no output and their children are not processed.
-->
<xsl:template match="p | name" mode="page_splitting">
<!-- Node set received as tunnel parameter; no default is declared here. -->
<xsl:param name="restricted-to" tunnel="yes" />
<xsl:if test="exists(. intersect $restricted-to)">
<div class="{local-name(.)}">
<xsl:apply-templates mode="page_splitting" />
</div>
</xsl:if>
</xsl:template>
<!--
Copies a text node to the output only if it is contained in the tunnelled
restricted-to node set; all other text nodes are dropped.
-->
<xsl:template match="text()" mode="page_splitting">
<xsl:param name="restricted-to" tunnel="yes" />
<xsl:if test="exists(. intersect $restricted-to)">
<xsl:copy />
</xsl:if>
</xsl:template>
<!-- In page_splitting mode a pb element itself produces no output (empty template). -->
<xsl:template match="pb" mode="page_splitting" />
</xsl:stylesheet>
\ No newline at end of file
......@@ -21,7 +21,7 @@ public class SplitTest {
@BeforeClass
public static void beforeAllTests() throws Exception {
xslt = new Xslt();
xslt.setXsltScript("src/test/resources/split.xslt");
xslt.setXsltScript("src/test/resources/split-foreach-group.xslt");
}
@Before
......@@ -34,22 +34,8 @@ public class SplitTest {
System.out.println(outputBaos.toString());
}
@Test
public void test() throws Exception {
String result = transform("two-page-beginnings.xml");
}
@Test
public void test2() throws Exception {
xslt.setXsltScript("src/test/resources/split2.xslt");
String result = transform("two-page-beginnings.xml");
}
@Test
public void testGroup() throws Exception {
xslt.setXsltScript("src/test/resources/split-foreach-group.xslt");
String result = transform("two-page-beginnings.xml");
}
......
package sub.gfl;
import static org.custommonkey.xmlunit.XMLAssert.assertXpathEvaluatesTo;
import static org.junit.Assert.*;
import java.io.ByteArrayOutputStream;
import java.io.OutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import sub.ent.backend.Xslt;
public class XsltSplitTest {
private OutputStream outputBaos;
private static Xslt xslt;
@BeforeClass
public static void beforeAllTests() throws Exception {
xslt = new Xslt();
xslt.setXsltScript("src/main/resources/gfl-indexer.xslt");
}
@Before
public void beforeEachTest() throws Exception {
outputBaos = new ByteArrayOutputStream();
}
@After
public void afterEachTest() {
System.out.println(outputBaos.toString());
}
@Test
public void twoPages() throws Exception {
String htmlPage1 = transform("two-page-beginnings.xml", 1);
assertXpathEvaluatesTo("Page 1", "//div[@class='p'][1]", htmlPage1);
String htmlPage2 = transform("two-page-beginnings.xml", 2);
assertXpathEvaluatesTo("Second page", "//div[@class='p'][1]", htmlPage2);
}
private String transform(String fileName, int pageNumber) throws Exception {
xslt.transform("src/test/resources/tei-snippets-split/" + fileName, outputBaos);
return extractHtmlField(outputBaos.toString(), pageNumber);
}
private String extractHtmlField(String s, int pageNumber) {
Pattern pattern = Pattern.compile("html_page\"><!\\[CDATA\\[(.*?)]]");
Matcher matcher = pattern.matcher(s.replaceAll("\\n", " "));
String html = "";
for (int i = 0; i < pageNumber; i++) {
matcher.find();
}
html = matcher.group(1);
return html;
}
}
......@@ -22,12 +22,13 @@
<xsl:template match="div">
<xsl:variable name="context" select="." as="element(div)" />
<xsl:variable name="context" select="." />
<xsl:for-each-group select="descendant::node()[not(node())]" group-starting-with="pb">
<xsl:if test="self::pb">
<field name="html_page">
<div class="page">
<div class="page-beginning">
<xsl:value-of select="count(self::pb/preceding::pb) + 1" />
<xsl:value-of select="self::pb/@n" />
</div>
<xsl:apply-templates select="$context/*" mode="split">
......@@ -40,7 +41,7 @@
</xsl:template>
<xsl:template match="p | name" mode="split">
<xsl:param name="restricted-to" as="node()+" tunnel="yes" />
<xsl:param name="restricted-to" tunnel="yes" />
<xsl:if test="exists(. intersect $restricted-to)">
<div class="{local-name(.)}">
<xsl:apply-templates mode="split" />
......@@ -49,7 +50,7 @@
</xsl:template>
<xsl:template match="text()" mode="split">
<xsl:param name="restricted-to" as="node()+" tunnel="yes" />
<xsl:param name="restricted-to" tunnel="yes" />
<xsl:if test="exists(. intersect $restricted-to)">
<xsl:copy />
</xsl:if>
......
<?xml version="1.0"?>
<!-- Delivers the fragment between two milestones. Takes no care about namespaces. -->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:fn="local-function"
xpath-default-namespace="http://www.tei-c.org/ns/1.0" exclude-result-prefixes="fn">
<!-- Returns true if $node is identical (same node, not just equal content) to one of the nodes in $sequence. -->
<xsl:function name="fn:contains">
<xsl:param name="sequence" as="node()*"/>
<xsl:param name="node" as="node()?"/>
<xsl:sequence select="some $nodeInSequence in $sequence satisfies $nodeInSequence is $node"/>
</xsl:function>
<xsl:output method="xml" indent="yes" />
<!-- The two milestones: the first and the second pb element of the document. -->
<xsl:variable name="ms1" select="subsequence(//pb, 1, 1)" />
<xsl:variable name="ms2" select="subsequence(//pb, 2, 1)" />
<!-- Element ancestors of each milestone; used to detect elements that enclose a milestone. -->
<xsl:variable name="ms1Ancestors" select="$ms1/ancestor::*"/>
<xsl:variable name="ms2Ancestors" select="$ms2/ancestor::*"/>
<!-- Wraps the whole result in add/doc. -->
<xsl:template match="/">
<add>
<doc>
<xsl:apply-templates select="TEI" />
</doc>
</add>
</xsl:template>
<!-- Only text/body is processed. -->
<xsl:template match="TEI">
<xsl:apply-templates select="text/body" />
</xsl:template>
<!-- body and div produce no markup of their own; their children are processed. -->
<xsl:template match="body | div">
<xsl:apply-templates />
</xsl:template>
<!--
A p is wrapped in a div (class = local name) when it follows the first milestone or encloses it,
and at the same time precedes the second milestone or encloses it.
Otherwise only its children are processed, without a wrapper.
-->
<xsl:template match="p">
<xsl:choose>
<xsl:when test="(. >> $ms1 or fn:contains($ms1Ancestors, .)) and ($ms2 >> . or fn:contains($ms2Ancestors, .))">
<div class="{local-name(.)}"><xsl:apply-templates/></div>
</xsl:when>
<xsl:otherwise><xsl:apply-templates/></xsl:otherwise>
</xsl:choose>
</xsl:template>
<!-- Non-element nodes are copied only if they lie after the first and before the second milestone in document order. -->
<xsl:template match="attribute()|text()|comment()|processing-instruction()">
<xsl:choose>
<xsl:when test=". >> $ms1 and $ms2 >> .">
<xsl:copy><xsl:apply-templates/></xsl:copy>
</xsl:when>
<xsl:otherwise><xsl:apply-templates/></xsl:otherwise>
</xsl:choose>
</xsl:template>
<!-- Only the first milestone is rendered, as a page-beginning div holding its n attribute; every other pb yields nothing. -->
<xsl:template match="pb">
<xsl:choose>
<xsl:when test=". is $ms1">
<div class="page-beginning"><xsl:value-of select="@n" /></div>
</xsl:when>
<xsl:otherwise></xsl:otherwise>
</xsl:choose>
</xsl:template>
</xsl:stylesheet>
\ No newline at end of file
<?xml version="1.0"?>
<xsl:stylesheet version="2.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:fn="local-function"
xpath-default-namespace="http://www.tei-c.org/ns/1.0"
exclude-result-prefixes="fn">
<!-- Note: the fn prefix is declared above, but no function is defined in this stylesheet. -->
<xsl:output method="xml" indent="yes" />
<!-- Wraps the whole result in add/doc. -->
<xsl:template match="/">
<add>
<doc>
<xsl:apply-templates select="TEI" />
</doc>
</add>
</xsl:template>
<!-- Only text/body is processed. -->
<xsl:template match="TEI">
<xsl:apply-templates select="text/body" />
</xsl:template>
<!-- body and div produce no markup of their own; their children are processed. -->
<xsl:template match="body | div">
<xsl:apply-templates />
</xsl:template>
<!-- Empty template: text nodes reached in the default mode are dropped. -->
<xsl:template match="text()">
</xsl:template>
<!-- Each pb becomes an html_page field containing a page div with only the page label (the n attribute). -->
<xsl:template match="pb">
<field name="html_page">
<div class="page">
<div class="page-beginning">
<xsl:value-of select="@n" />
</div>
</div>
</field>
</xsl:template>
</xsl:stylesheet>
\ No newline at end of file
......@@ -6,7 +6,7 @@
<body>
<div>
<pb n="1"/>
<p>Page 1<pb n="2"/>Second <name type="person">Boisserée</name> page</p>
<p>Page 1<pb n="2"/>Second page</p>
</div>
</body>
</text>
......
......@@ -46,8 +46,15 @@
<field name="id" type="string" required="true" />
<field name="type" type="string" required="false" />
<!-- 'whole_article' or 'one_page' -->
<field name="doctype" type="string" required="true" />
<!-- for type 'one_page' -->
<field name="article_id" type="string" />
<field name="page_number" type="int" />
<field name="html_page" type="text_de" multiValued="false" />
<!-- for type 'whole_article' -->
<field name="fulltext" type="text_de" multiValued="false" />
<field name="fulltext_html" type="text_de" multiValued="false" />
<field name="short_title" type="text_de" multiValued="false" />
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment