Commit 308df675 authored by asajedi
Browse files

Log invalid TEI files including error messages

parent 86aaee82
...@@ -3,8 +3,6 @@ ...@@ -3,8 +3,6 @@
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="9299d56e-0c70-4b88-b2b1-2eb4d0694c1d" name="Default Changelist" comment=""> <list default="true" id="9299d56e-0c70-4b88-b2b1-2eb4d0694c1d" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/config/services.yaml" beforeDir="false" afterPath="$PROJECT_DIR$/config/services.yaml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/src/Command/SolrIndexing.php" beforeDir="false" afterPath="$PROJECT_DIR$/src/Command/SolrIndexing.php" afterDir="false" />
<change beforePath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" beforeDir="false" afterPath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" afterDir="false" /> <change beforePath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" beforeDir="false" afterPath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" afterDir="false" />
</list> </list>
<option name="SHOW_DIALOG" value="false" /> <option name="SHOW_DIALOG" value="false" />
...@@ -201,7 +199,7 @@ ...@@ -201,7 +199,7 @@
<property name="WebServerToolWindowPanel.toolwindow.show.date" value="false" /> <property name="WebServerToolWindowPanel.toolwindow.show.date" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.permissions" value="false" /> <property name="WebServerToolWindowPanel.toolwindow.show.permissions" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.size" value="false" /> <property name="WebServerToolWindowPanel.toolwindow.show.size" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$/../digizeit-app" /> <property name="last_opened_file_path" value="$PROJECT_DIR$/../nlh-app" />
<property name="nodejs_package_manager_path" value="yarn" /> <property name="nodejs_package_manager_path" value="yarn" />
<property name="settings.editor.selected.configurable" value="preferences.editor" /> <property name="settings.editor.selected.configurable" value="preferences.editor" />
<property name="vue.rearranger.settings.migration" value="true" /> <property name="vue.rearranger.settings.migration" value="true" />
...@@ -256,7 +254,9 @@ ...@@ -256,7 +254,9 @@
<workItem from="1631236749149" duration="8000" /> <workItem from="1631236749149" duration="8000" />
<workItem from="1631272334167" duration="2663000" /> <workItem from="1631272334167" duration="2663000" />
<workItem from="1631618672072" duration="9455000" /> <workItem from="1631618672072" duration="9455000" />
<workItem from="1631699750162" duration="4175000" /> <workItem from="1631699750162" duration="5357000" />
<workItem from="1631791563265" duration="2112000" />
<workItem from="1631795325839" duration="20066000" />
</task> </task>
<servers /> <servers />
</component> </component>
......
<?php <?php
namespace App\Controller; declare(strict_types=1);
namespace App\Controller;
use App\Model\SolrDocument; use App\Model\SolrDocument;
use DOMDocument; use DOMDocument;
...@@ -501,6 +502,7 @@ class SimplexmlController extends AbstractController ...@@ -501,6 +502,7 @@ class SimplexmlController extends AbstractController
*/ */
public function tei2solr(): void public function tei2solr(): void
{ {
$this->client->getEndpoint()->setOptions(['timeout' => 60]);
$finder = new Finder(); $finder = new Finder();
$finder->files()->in($this->teiDir); $finder->files()->in($this->teiDir);
...@@ -782,6 +784,28 @@ class SimplexmlController extends AbstractController ...@@ -782,6 +784,28 @@ class SimplexmlController extends AbstractController
} }
} }
} }
} else {
$filesystem = new Filesystem();
$teiImportLogFile = './data/log/teiImportLogs.txt';
if (!$filesystem->exists($teiImportLogFile)) {
$filesystem->mkdir('./data/log');
$filesystem->touch($teiImportLogFile);
}
$errors = [];
foreach (libxml_get_errors() as $key => $error) {
if (0 === $key) {
$errors[] = explode('/', $error->file)[4].PHP_EOL;
$errors[] = '--------------------'.PHP_EOL;
}
$errors[] = $error->message;
}
$filesystem->appendToFile($teiImportLogFile, implode('', $errors));
libxml_clear_errors();
} }
} }
} }
...@@ -845,283 +869,281 @@ class SimplexmlController extends AbstractController ...@@ -845,283 +869,281 @@ class SimplexmlController extends AbstractController
$doc = new DOMDocument(); $doc = new DOMDocument();
$doc->load($filePath); $doc->load($filePath);
if (!libxml_get_errors()) { $xpath = new DOMXPath($doc);
$xpath = new DOMXPath($doc); $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
$xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0'); $pagesNodes = $xpath->query('//tei:body');
$pagesNodes = $xpath->query('//tei:body');
$elements = [];
foreach ($pagesNodes as $pagesNode) {
$elements = $this->getPagesNodes($pagesNode, $elements);
}
$editedTextArr = []; $elements = [];
$transcriptedTextArr = []; foreach ($pagesNodes as $pagesNode) {
$gndsUuids = []; $elements = $this->getPagesNodes($pagesNode, $elements);
$pagesGndsUuids = []; }
foreach ($elements as $k => $myelement) {
$renditions = [];
$myeditedText = '';
$mytranscriptedText = '';
foreach ($myelement as $element) {
if (isset($element->nodeName) && (
$element->nodeName === 'p' ||
$element->nodeName === 'dateline' ||
$element->nodeName === 'address' ||
$element->nodeName === 'closer' ||
$element->nodeName === 'list'
)) {
$ele = [];
$el = $this->getNodeChilds($element, $ele);
$tText = '';
$eText = '';
$n = 0;
$liNumber = 1;
foreach ($el as $e) {
if ($e->nodeName === '#text'
&& ($e->parentNode->nodeName !== 'abbr' && ($e->nodeName !== 'del' || $e->parentNode->nodeName !== 'add'))
&& $e->parentNode->nodeName !== 'note'
) {
if (isset($add) && !empty($add)) {
$tText .= ' &lang;' . $e->data . ' ' . $add;
$eText .= ' ' . $e->data;
$add = '';
} elseif (isset($del) && !empty($del)) {
$tText .= ' &#x5B;' . $e->data . ' ' . $del;
$del = '';
} elseif (isset($li) && !empty($li) && !empty($e->data) && 'item' === $e->parentNode->nodeName) {
if (isset($italic) && true === $italic) {
$tText .= $li . $e->data . '</li>';
$eText .= $li . $e->data . '</li>';
$italic = false;
} else {
$tText .= $li . $e->data;
$eText .= $li . $e->data;
}
$li = ''; $editedTextArr = [];
} elseif ('hi' === $e->parentNode->nodeName && isset($e->parentNode->attributes[0])) { $transcriptedTextArr = [];
$gndsUuids = [];
$pagesGndsUuids = [];
foreach ($elements as $k => $myelement) {
$renditions = [];
$myeditedText = '';
$mytranscriptedText = '';
foreach ($myelement as $element) {
if (isset($element->nodeName) && (
$element->nodeName === 'p' ||
$element->nodeName === 'dateline' ||
$element->nodeName === 'address' ||
$element->nodeName === 'closer' ||
$element->nodeName === 'list'
)) {
$ele = [];
$el = $this->getNodeChilds($element, $ele);
$tText = '';
$eText = '';
$n = 0;
$liNumber = 1;
foreach ($el as $e) {
if ($e->nodeName === '#text'
&& ($e->parentNode->nodeName !== 'abbr' && ($e->nodeName !== 'del' || $e->parentNode->nodeName !== 'add'))
&& $e->parentNode->nodeName !== 'note'
) {
if (isset($add) && !empty($add)) {
$tText .= ' &lang;' . $e->data . ' ' . $add;
$eText .= ' ' . $e->data;
$add = '';
} elseif (isset($del) && !empty($del)) {
$tText .= ' &#x5B;' . $e->data . ' ' . $del;
$del = '';
} elseif (isset($li) && !empty($li) && !empty($e->data) && 'item' === $e->parentNode->nodeName) {
if (isset($italic) && true === $italic) {
$tText .= $li . $e->data . '</li>';
$eText .= $li . $e->data . '</li>';
$italic = false;
} else {
$tText .= $li . $e->data;
$eText .= $li . $e->data;
}
$hi = explode(':', $e->parentNode->attributes[0]->value); $li = '';
} elseif ('hi' === $e->parentNode->nodeName && isset($e->parentNode->attributes[0])) {
if (isset($hi[1]) && !empty($hi[1]) && 'superscript' === $hi[1]) { $hi = explode(':', $e->parentNode->attributes[0]->value);
$tText .= '<sup>' . $e->data . '</sup>';
} elseif (isset($hi[1]) && !empty($hi[1]) && 'italic' === $hi[1]) {
$tText .= '<i>' . $e->data . '</i>'; if (isset($hi[1]) && !empty($hi[1]) && 'superscript' === $hi[1]) {
$tText .= '<sup>' . $e->data . '</sup>';
} elseif (isset($hi[1]) && !empty($hi[1]) && 'italic' === $hi[1]) {
$italic = true; $tText .= '<i>' . $e->data . '</i>';
}
} elseif (isset($gnd) && true === $gnd) {
$eText .= $e->data . '</span>';
$tText .= $e->data;
$gnd = false;
} else {
$eText .= $e->data;
if (isset($renditions) && !empty($renditions)) { $italic = true;
$classOpeningTag = ''; }
$classEndTag = ''; } elseif (isset($gnd) && true === $gnd) {
$eText .= $e->data . '</span>';
$tText .= $e->data;
$gnd = false;
} else {
$eText .= $e->data;
foreach ($renditions as $rendition) { if (isset($renditions) && !empty($renditions)) {
if ('italic' === $rendition) { $classOpeningTag = '';
$class = 'i'; $classEndTag = '';
} elseif ('underline' === $rendition) { foreach ($renditions as $rendition) {
$class = 'u'; if ('italic' === $rendition) {
} $class = 'i';
if (isset($class) && !empty($class)) {
$classOpeningTag .= '<' . $class . '>';
$classEndTag .= '</' . $class . '>';
}
$renditions = []; } elseif ('underline' === $rendition) {
$class = 'u';
}
if (isset($class) && !empty($class)) {
$classOpeningTag .= '<' . $class . '>';
$classEndTag .= '</' . $class . '>';
} }
}
if (isset($classOpeningTag) && !empty($classOpeningTag)) { $renditions = [];
$tText .= $classOpeningTag;
$classOpeningTag = '';
} }
}
$tText .= $e->data; if (isset($classOpeningTag) && !empty($classOpeningTag)) {
$tText .= $classOpeningTag;
$classOpeningTag = '';
}
if (isset($classEndTag) && !empty($classEndTag)) { $tText .= $e->data;
$tText .= $classEndTag;
$classEndTag = '';
}
} if (isset($classEndTag) && !empty($classEndTag)) {
} elseif ('item' === $e->nodeName) { $tText .= $classEndTag;
if ($liNumber++ === 1) { $classEndTag = '';
$li = '<ul><li>';
} else {
$li = '<li>';
} }
} elseif ('add' === $e->nodeName) { }
if ('rdg' === $e->parentNode->nodeName) { } elseif ('item' === $e->nodeName) {
if ($liNumber++ === 1) {
$li = '<ul><li>';
} else {
$li = '<li>';
}
if (!empty($e->attributes)) { } elseif ('add' === $e->nodeName) {
if ('rdg' === $e->parentNode->nodeName) {
if (!empty($e->attributes)) {
$pattern = '/^#[a-z_]*$/i'; $pattern = '/^#[a-z_]*$/i';
foreach ($e->attributes as $attribute) { foreach ($e->attributes as $attribute) {
if ('hand' === $attribute->nodeName) { if ('hand' === $attribute->nodeName) {
$match = preg_match($pattern, $attribute->nodeValue, $matches); $match = preg_match($pattern, $attribute->nodeValue, $matches);
if ($match) { if ($match) {
$add = '<i>erg. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '</i>&rang; '; $add = '<i>erg. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '</i>&rang; ';
}
} }
} }
} else {
$add = '<i>erg.</i>&rang; ';
} }
} else {
$add = '<i>erg.</i>&rang; ';
} }
} elseif ('del' === $e->nodeName) { }
if ('rdg' === $e->parentNode->nodeName) { } elseif ('del' === $e->nodeName) {
if (!empty($e->attributes)) { if ('rdg' === $e->parentNode->nodeName) {
if (!empty($e->attributes)) {
foreach ($e->attributes as $attribute) { foreach ($e->attributes as $attribute) {
if ('hand' === $attribute->nodeName) { if ('hand' === $attribute->nodeName) {
$del = '<i>str. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '</i>&#x5D;'; $del = '<i>str. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '</i>&#x5D;';
}
} }
} else {
$del = '<i>str.</i>&#x5D;';
} }
} else {
$del = '<i>str.</i>&#x5D;';
} }
} elseif ('handShift' === $e->nodeName && 'signed' === $e->parentNode->nodeName) { }
} elseif ('handShift' === $e->nodeName && 'signed' === $e->parentNode->nodeName) {
if (!empty($e->attributes)) { if (!empty($e->attributes)) {
$pattern = '/^#[a-z_]*$/i'; $pattern = '/^#[a-z_]*$/i';
foreach ($e->attributes as $attribute) { foreach ($e->attributes as $attribute) {
if ('scribeRef' === $attribute->nodeName) { if ('scribeRef' === $attribute->nodeName) {
$match = preg_match($pattern, $attribute->nodeValue, $matches); $match = preg_match($pattern, $attribute->nodeValue, $matches);
if ($match) { if ($match) {
$add = '<i>sign. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '</i>&rang; '; $add = '<i>sign. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '</i>&rang; ';
}
} }
} }
} }
} elseif ('lb' === $e->nodeName) { }
$tText .= '<br>'; } elseif ('lb' === $e->nodeName) {
$eText .= ' '; $tText .= '<br>';
} elseif ('addrLine' === $e->nodeName) { $eText .= ' ';
} elseif ('addrLine' === $e->nodeName) {
if ($n++ > 0) { if ($n++ > 0) {
$eText .= '<br>'; $eText .= '<br>';
$tText .= '<br>'; $tText .= '<br>';
} }
} elseif ('signed' === $e->nodeName) { } elseif ('signed' === $e->nodeName) {
$tText .= '<br><br>'; $tText .= '<br><br>';
$eText .= '<br><br>'; $eText .= '<br><br>';
} elseif ('hi' === $e->nodeName) { } elseif ('hi' === $e->nodeName) {
foreach ($e->attributes as $attribute) { foreach ($e->attributes as $attribute) {
if (isset(explode(':', $attribute->value)[1])) { if (isset(explode(':', $attribute->value)[1])) {
$rend = explode(':', $attribute->value)[1]; $rend = explode(':', $attribute->value)[1];
if ('superscript' !== $rend && 'italic' !== $rend) { if ('superscript' !== $rend && 'italic' !== $rend) {
$renditions[] = explode(':', $attribute->value)[1]; $renditions[] = explode(':', $attribute->value)[1];
}
} }
} }
} elseif ('name' === $e->nodeName && (isset($e->attributes[1]->value) && !empty($e->attributes[1]->value)) && (isset($e->attributes[0]->value) && !empty($e->attributes[0]->value))) {
$uuid = $this->getUuid();
$gndsUuids[$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
$pagesGndsUuids[$k][$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
$eText .= '<span class="' . $e->attributes[0]->value . '" id="' . $uuid . '">';
$gnd = true;
} }
} elseif ('name' === $e->nodeName && (isset($e->attributes[1]->value) && !empty($e->attributes[1]->value)) && (isset($e->attributes[0]->value) && !empty($e->attributes[0]->value))) {
$uuid = $this->getUuid();
$gndsUuids[$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
$pagesGndsUuids[$k][$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
$eText .= '<span class="' . $e->attributes[0]->value . '" id="' . $uuid . '">';
$gnd = true;
} }
}
$eText = preg_replace('~\x{00AD}~u', '-', $eText); $eText = preg_replace('~\x{00AD}~u', '-', $eText);
$pattern = '/(\w+)-\s(\w)/i'; $pattern = '/(\w+)-\s(\w)/i';
$eText = preg_replace_callback( $eText = preg_replace_callback(
$pattern, $pattern,
function ($match) { function ($match) {
return $match[1] . $match[2]; return $match[1] . $match[2];
}, },
$eText $eText
); );
$myeditedText .= '<p>' . $eText . '</p>'; $myeditedText .= '<p>' . $eText . '</p>';
$mytranscriptedText .= '<p>' . $tText . '</p>'; $mytranscriptedText .= '<p>' . $tText . '</p>';
} elseif ('pb' === $element->nodeName) { } elseif ('pb' === $element->nodeName) {
foreach ($element->attributes as $key => $attribute) { foreach ($element->attributes as $key => $attribute) {
if ('facs' === $attribute->name) { if ('facs' === $attribute->name) {
if (isset($graphics[trim($attribute->value, '#')])) { if (isset($graphics[trim($attribute->value, '#')])) {
$graphic = $graphics[trim($attribute->value, '#')]; $graphic = $graphics[trim($attribute->value, '#')];
if (str_ends_with($graphic, '.jpg')) { if (str_ends_with($graphic, '.jpg')) {
$graphic = substr($graphic, 0, strlen($graphic) - 4); $graphic = substr($graphic, 0, strlen($graphic) - 4);
}
} }
} elseif ('n' === $attribute->name) {
$pageNumber = $attribute->value;
}
if (!empty($pageNumber) && !empty($graphic)) {
$href = '<a href="/' . $graphic . '" target="_blank"/>' . $pageNumber . '</a>';
} elseif (!empty($pageNumber) && empty($graphic)) {
$href = $pageNumber;
} else {
$href = '';
} }
} elseif ('n' === $attribute->name) {
$pageNumber = $attribute->value;
} }
$tText = $href; if (!empty($pageNumber) && !empty($graphic)) {
$eText = $href; $href = '<a href="/' . $graphic . '" target="_blank"/>' . $pageNumber . '</a>';
$myeditedText .= '<p>' . $eText . '</p>'; } elseif (!empty($pageNumber) && empty($graphic)) {
$mytranscriptedText .= '<p>' . $tText . '</p>'; $href = $pageNumber;
} else {
$href = '';
}
} }
}
$editedTextArr[] = $myeditedText; $tText = $href;