Commit 86aaee82 authored by asajedi
Browse files

Validate TEI file before loading

parent f2eb5186
......@@ -3,7 +3,9 @@
<component name="ChangeListManager">
<list default="true" id="9299d56e-0c70-4b88-b2b1-2eb4d0694c1d" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/package.json" beforeDir="false" afterPath="$PROJECT_DIR$/package.json" afterDir="false" />
<change beforePath="$PROJECT_DIR$/config/services.yaml" beforeDir="false" afterPath="$PROJECT_DIR$/config/services.yaml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/src/Command/SolrIndexing.php" beforeDir="false" afterPath="$PROJECT_DIR$/src/Command/SolrIndexing.php" afterDir="false" />
<change beforePath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" beforeDir="false" afterPath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
......@@ -23,14 +25,17 @@
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Annotation/PartOf.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/.env.dist" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Annotation/AnnotationCollection.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/composer.json" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Annotation/Target.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/templates/item/detail.html.twig" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Presentation/Title.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Presentation/Item.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/docker-compose.yaml" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Document.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Presentation/Image.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/src/Controller/SimplexmlController.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/composer.json" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/config/services.yaml" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/src/Command/SolrIndexing.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Document.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Annotation/AnnotationPage.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
</component>
<component name="PhpWorkspaceProjectConfiguration">
......@@ -196,7 +201,7 @@
<property name="WebServerToolWindowPanel.toolwindow.show.date" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.permissions" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.size" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="last_opened_file_path" value="$PROJECT_DIR$/../digizeit-app" />
<property name="nodejs_package_manager_path" value="yarn" />
<property name="settings.editor.selected.configurable" value="preferences.editor" />
<property name="vue.rearranger.settings.migration" value="true" />
......@@ -248,6 +253,10 @@
<workItem from="1630014126785" duration="612000" />
<workItem from="1630441686460" duration="2123000" />
<workItem from="1631180525162" duration="176000" />
<workItem from="1631236749149" duration="8000" />
<workItem from="1631272334167" duration="2663000" />
<workItem from="1631618672072" duration="9455000" />
<workItem from="1631699750162" duration="4175000" />
</task>
<servers />
</component>
......
......@@ -22,7 +22,8 @@ parameters:
SBB_SPK: 'Staatsbibliothek Preußischer Kulturbesitz, Berlin'
SMB_ZA: 'Zentralarchiv Staatliche Museen zu Berlin Preußischer Kulturbesitz'
UAHW: 'Archiv der Martin-Luther-Universität Halle-Wittenberg, Halle / S.'
tei_dir: '%kernel.project_dir%/teis/sampletei/'
tei_dir: '%kernel.project_dir%/data/gitlab/'
# tei_dir: '%kernel.project_dir%/teis/sampletei/'
services:
# default configuration for services in *this* file
......
......@@ -41,7 +41,7 @@ class SolrIndexing extends Command
{
$output->writeln('Start solr indexing.');
// $this->simplexmlController->fetchTeis();
$this->simplexmlController->fetchTeis();
$this->simplexmlController->deleteSolrIndex();
$this->simplexmlController->tei2solr();
......
......@@ -505,8 +505,11 @@ class SimplexmlController extends AbstractController
$finder->files()->in($this->teiDir);
foreach ($finder as $file) {
libxml_use_internal_errors(TRUE);
$doc = new DOMDocument();
$doc->load($file->getRealPath());
if (!libxml_get_errors()) {
$xpath = new DOMXPath($doc);
$xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
......@@ -721,7 +724,6 @@ class SimplexmlController extends AbstractController
}
}
}
}
if (isset($entities) && is_iterable($entities)) {
......@@ -735,11 +737,10 @@ class SimplexmlController extends AbstractController
try {
$fileContent = @file_get_contents($remoteFilePath, true);
if($fileContent==false) {
if ($fileContent == false) {
throw new Exception($localFilePath);
}
} catch (Exception $e)
{
} catch (Exception $e) {
echo $e->getMessage();
}
......@@ -783,6 +784,7 @@ class SimplexmlController extends AbstractController
}
}
}
}
public function getUuid()
{
......@@ -838,8 +840,12 @@ class SimplexmlController extends AbstractController
public function getTextVersions(string $filePath, array $graphics): SolrDocument
{
libxml_use_internal_errors(TRUE);
$solrDocument = new SolrDocument();
$doc = new DOMDocument();
$doc->load($filePath);
if (!libxml_get_errors()) {
$xpath = new DOMXPath($doc);
$xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
$pagesNodes = $xpath->query('//tei:body');
......@@ -883,13 +889,13 @@ class SimplexmlController extends AbstractController
) {
if (isset($add) && !empty($add)) {
$tText .= ' &lang;'.$e->data.' '.$add;
$tText .= ' &lang;' . $e->data . ' ' . $add;
$eText .= ' '.$e->data;
$eText .= ' ' . $e->data;
$add = '';
} elseif (isset($del) && !empty($del)) {
$tText .= ' &#x5B;'.$e->data.' '.$del;
$tText .= ' &#x5B;' . $e->data . ' ' . $del;
$del = '';
} elseif (isset($li) && !empty($li) && !empty($e->data) && 'item' === $e->parentNode->nodeName) {
......@@ -909,21 +915,18 @@ class SimplexmlController extends AbstractController
$hi = explode(':', $e->parentNode->attributes[0]->value);
if (isset($hi[1]) && !empty($hi[1]) && 'superscript' === $hi[1]) {
$tText .= '<sup>'.$e->data.'</sup>';
}
elseif (isset($hi[1]) && !empty($hi[1]) && 'italic' === $hi[1]) {
$tText .= '<sup>' . $e->data . '</sup>';
} elseif (isset($hi[1]) && !empty($hi[1]) && 'italic' === $hi[1]) {
$tText .= '<i>' . $e->data . '</i>';
$italic = true;
}
} elseif (isset($gnd) && true === $gnd) {
$eText .= $e->data.'</span>';
$eText .= $e->data . '</span>';
$tText .= $e->data;
$gnd = false;
}
else {
} else {
$eText .= $e->data;
if (isset($renditions) && !empty($renditions)) {
......@@ -939,7 +942,7 @@ class SimplexmlController extends AbstractController
}
if (isset($class) && !empty($class)) {
$classOpeningTag .= '<' . $class . '>';
$classEndTag .= '</'.$class.'>';
$classEndTag .= '</' . $class . '>';
}
$renditions = [];
......@@ -1048,7 +1051,7 @@ class SimplexmlController extends AbstractController
$uuid = $this->getUuid();
$gndsUuids[$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
$pagesGndsUuids[$k][$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
$eText .= '<span class="'.$e->attributes[0]->value.'" id="'.$uuid.'">';
$eText .= '<span class="' . $e->attributes[0]->value . '" id="' . $uuid . '">';
$gnd = true;
}
}
......@@ -1064,8 +1067,8 @@ class SimplexmlController extends AbstractController
$eText
);
$myeditedText .= '<p>'.$eText.'</p>';
$mytranscriptedText .= '<p>'.$tText.'</p>';
$myeditedText .= '<p>' . $eText . '</p>';
$mytranscriptedText .= '<p>' . $tText . '</p>';
} elseif ('pb' === $element->nodeName) {
foreach ($element->attributes as $key => $attribute) {
......@@ -1092,8 +1095,8 @@ class SimplexmlController extends AbstractController
$tText = $href;
$eText = $href;
$myeditedText .= '<p>'.$eText.'</p>';
$mytranscriptedText .= '<p>'.$tText.'</p>';
$myeditedText .= '<p>' . $eText . '</p>';
$mytranscriptedText .= '<p>' . $tText . '</p>';
}
}
......@@ -1111,13 +1114,13 @@ class SimplexmlController extends AbstractController
$editedText .= $myeditedText;
}
$solrDocument = new SolrDocument();
$solrDocument->setTranscriptedText($transcriptedText);
$solrDocument->setTranscriptedTextArr($transcriptedTextArr);
$solrDocument->setEditedText($editedText);
$solrDocument->setEditedTextArr($editedTextArr);
$solrDocument->setGndsUuids($gndsUuids);
$solrDocument->setPagesGndsUuids($pagesGndsUuids);
}
return $solrDocument;
}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment