Commit 308df675 authored by asajedi's avatar asajedi
Browse files

Log invalid TEI files including error messages

parent 86aaee82
......@@ -3,8 +3,6 @@
<component name="ChangeListManager">
<list default="true" id="9299d56e-0c70-4b88-b2b1-2eb4d0694c1d" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/config/services.yaml" beforeDir="false" afterPath="$PROJECT_DIR$/config/services.yaml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/src/Command/SolrIndexing.php" beforeDir="false" afterPath="$PROJECT_DIR$/src/Command/SolrIndexing.php" afterDir="false" />
<change beforePath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" beforeDir="false" afterPath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
......@@ -201,7 +199,7 @@
<property name="WebServerToolWindowPanel.toolwindow.show.date" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.permissions" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.size" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$/../digizeit-app" />
<property name="last_opened_file_path" value="$PROJECT_DIR$/../nlh-app" />
<property name="nodejs_package_manager_path" value="yarn" />
<property name="settings.editor.selected.configurable" value="preferences.editor" />
<property name="vue.rearranger.settings.migration" value="true" />
......@@ -256,7 +254,9 @@
<workItem from="1631236749149" duration="8000" />
<workItem from="1631272334167" duration="2663000" />
<workItem from="1631618672072" duration="9455000" />
<workItem from="1631699750162" duration="4175000" />
<workItem from="1631699750162" duration="5357000" />
<workItem from="1631791563265" duration="2112000" />
<workItem from="1631795325839" duration="20066000" />
</task>
<servers />
</component>
......
<?php
namespace App\Controller;
declare(strict_types=1);
namespace App\Controller;
use App\Model\SolrDocument;
use DOMDocument;
......@@ -501,6 +502,7 @@ class SimplexmlController extends AbstractController
*/
public function tei2solr(): void
{
$this->client->getEndpoint()->setOptions(['timeout' => 60]);
$finder = new Finder();
$finder->files()->in($this->teiDir);
......@@ -782,6 +784,28 @@ class SimplexmlController extends AbstractController
}
}
}
} else {
$filesystem = new Filesystem();
$teiImportLogFile = './data/log/teiImportLogs.txt';
if (!$filesystem->exists($teiImportLogFile)) {
$filesystem->mkdir('./data/log');
$filesystem->touch($teiImportLogFile);
}
$errors = [];
foreach (libxml_get_errors() as $key => $error) {
if (0 === $key) {
$errors[] = explode('/', $error->file)[4].PHP_EOL;
$errors[] = '--------------------'.PHP_EOL;
}
$errors[] = $error->message;
}
$filesystem->appendToFile($teiImportLogFile, implode('', $errors));
libxml_clear_errors();
}
}
}
......@@ -845,283 +869,281 @@ class SimplexmlController extends AbstractController
$doc = new DOMDocument();
$doc->load($filePath);
if (!libxml_get_errors()) {
$xpath = new DOMXPath($doc);
$xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
$pagesNodes = $xpath->query('//tei:body');
$elements = [];
foreach ($pagesNodes as $pagesNode) {
$elements = $this->getPagesNodes($pagesNode, $elements);
}
$xpath = new DOMXPath($doc);
$xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
$pagesNodes = $xpath->query('//tei:body');
$editedTextArr = [];
$transcriptedTextArr = [];
$gndsUuids = [];
$pagesGndsUuids = [];
foreach ($elements as $k => $myelement) {
$renditions = [];
$myeditedText = '';
$mytranscriptedText = '';
foreach ($myelement as $element) {
if (isset($element->nodeName) && (
$element->nodeName === 'p' ||
$element->nodeName === 'dateline' ||
$element->nodeName === 'address' ||
$element->nodeName === 'closer' ||
$element->nodeName === 'list'
)) {
$ele = [];
$el = $this->getNodeChilds($element, $ele);
$tText = '';
$eText = '';
$n = 0;
$liNumber = 1;
foreach ($el as $e) {
if ($e->nodeName === '#text'
&& ($e->parentNode->nodeName !== 'abbr' && ($e->nodeName !== 'del' || $e->parentNode->nodeName !== 'add'))
&& $e->parentNode->nodeName !== 'note'
) {
if (isset($add) && !empty($add)) {
$tText .= ' &lang;' . $e->data . ' ' . $add;
$eText .= ' ' . $e->data;
$add = '';
} elseif (isset($del) && !empty($del)) {
$tText .= ' &#x5B;' . $e->data . ' ' . $del;
$del = '';
} elseif (isset($li) && !empty($li) && !empty($e->data) && 'item' === $e->parentNode->nodeName) {
if (isset($italic) && true === $italic) {
$tText .= $li . $e->data . '</li>';
$eText .= $li . $e->data . '</li>';
$italic = false;
} else {
$tText .= $li . $e->data;
$eText .= $li . $e->data;
}
$elements = [];
foreach ($pagesNodes as $pagesNode) {
$elements = $this->getPagesNodes($pagesNode, $elements);
}
$li = '';
} elseif ('hi' === $e->parentNode->nodeName && isset($e->parentNode->attributes[0])) {
$editedTextArr = [];
$transcriptedTextArr = [];
$gndsUuids = [];
$pagesGndsUuids = [];
foreach ($elements as $k => $myelement) {
$renditions = [];
$myeditedText = '';
$mytranscriptedText = '';
foreach ($myelement as $element) {
if (isset($element->nodeName) && (
$element->nodeName === 'p' ||
$element->nodeName === 'dateline' ||
$element->nodeName === 'address' ||
$element->nodeName === 'closer' ||
$element->nodeName === 'list'
)) {
$ele = [];
$el = $this->getNodeChilds($element, $ele);
$tText = '';
$eText = '';
$n = 0;
$liNumber = 1;
foreach ($el as $e) {
if ($e->nodeName === '#text'
&& ($e->parentNode->nodeName !== 'abbr' && ($e->nodeName !== 'del' || $e->parentNode->nodeName !== 'add'))
&& $e->parentNode->nodeName !== 'note'
) {
if (isset($add) && !empty($add)) {
$tText .= ' &lang;' . $e->data . ' ' . $add;
$eText .= ' ' . $e->data;
$add = '';
} elseif (isset($del) && !empty($del)) {
$tText .= ' &#x5B;' . $e->data . ' ' . $del;
$del = '';
} elseif (isset($li) && !empty($li) && !empty($e->data) && 'item' === $e->parentNode->nodeName) {
if (isset($italic) && true === $italic) {
$tText .= $li . $e->data . '</li>';
$eText .= $li . $e->data . '</li>';
$italic = false;
} else {
$tText .= $li . $e->data;
$eText .= $li . $e->data;
}
$hi = explode(':', $e->parentNode->attributes[0]->value);
$li = '';
} elseif ('hi' === $e->parentNode->nodeName && isset($e->parentNode->attributes[0])) {
if (isset($hi[1]) && !empty($hi[1]) && 'superscript' === $hi[1]) {
$tText .= '<sup>' . $e->data . '</sup>';
} elseif (isset($hi[1]) && !empty($hi[1]) && 'italic' === $hi[1]) {
$hi = explode(':', $e->parentNode->attributes[0]->value);
$tText .= '<i>' . $e->data . '</i>';
if (isset($hi[1]) && !empty($hi[1]) && 'superscript' === $hi[1]) {
$tText .= '<sup>' . $e->data . '</sup>';
} elseif (isset($hi[1]) && !empty($hi[1]) && 'italic' === $hi[1]) {
$italic = true;
}
} elseif (isset($gnd) && true === $gnd) {
$eText .= $e->data . '</span>';
$tText .= $e->data;
$gnd = false;
} else {
$eText .= $e->data;
$tText .= '<i>' . $e->data . '</i>';
if (isset($renditions) && !empty($renditions)) {
$classOpeningTag = '';
$classEndTag = '';
$italic = true;
}
} elseif (isset($gnd) && true === $gnd) {
$eText .= $e->data . '</span>';
$tText .= $e->data;
$gnd = false;
} else {
$eText .= $e->data;
foreach ($renditions as $rendition) {
if ('italic' === $rendition) {
$class = 'i';
if (isset($renditions) && !empty($renditions)) {
$classOpeningTag = '';
$classEndTag = '';
} elseif ('underline' === $rendition) {
$class = 'u';
}
if (isset($class) && !empty($class)) {
$classOpeningTag .= '<' . $class . '>';
$classEndTag .= '</' . $class . '>';
}
foreach ($renditions as $rendition) {
if ('italic' === $rendition) {
$class = 'i';
$renditions = [];
} elseif ('underline' === $rendition) {
$class = 'u';
}
if (isset($class) && !empty($class)) {
$classOpeningTag .= '<' . $class . '>';
$classEndTag .= '</' . $class . '>';
}
}
if (isset($classOpeningTag) && !empty($classOpeningTag)) {
$tText .= $classOpeningTag;
$classOpeningTag = '';
$renditions = [];
}
}
$tText .= $e->data;
if (isset($classOpeningTag) && !empty($classOpeningTag)) {
$tText .= $classOpeningTag;
$classOpeningTag = '';
}
if (isset($classEndTag) && !empty($classEndTag)) {
$tText .= $classEndTag;
$classEndTag = '';
}
$tText .= $e->data;
}
} elseif ('item' === $e->nodeName) {
if ($liNumber++ === 1) {
$li = '<ul><li>';
} else {
$li = '<li>';
if (isset($classEndTag) && !empty($classEndTag)) {
$tText .= $classEndTag;
$classEndTag = '';
}
} elseif ('add' === $e->nodeName) {
if ('rdg' === $e->parentNode->nodeName) {
}
} elseif ('item' === $e->nodeName) {
if ($liNumber++ === 1) {
$li = '<ul><li>';
} else {
$li = '<li>';
}
if (!empty($e->attributes)) {
} elseif ('add' === $e->nodeName) {
if ('rdg' === $e->parentNode->nodeName) {
if (!empty($e->attributes)) {
$pattern = '/^#[a-z_]*$/i';
$pattern = '/^#[a-z_]*$/i';
foreach ($e->attributes as $attribute) {
foreach ($e->attributes as $attribute) {
if ('hand' === $attribute->nodeName) {
$match = preg_match($pattern, $attribute->nodeValue, $matches);
if ('hand' === $attribute->nodeName) {
$match = preg_match($pattern, $attribute->nodeValue, $matches);
if ($match) {
$add = '<i>erg. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '</i>&rang; ';
}
if ($match) {
$add = '<i>erg. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '</i>&rang; ';
}
}
} else {
$add = '<i>erg.</i>&rang; ';
}
} else {
$add = '<i>erg.</i>&rang; ';
}
} elseif ('del' === $e->nodeName) {
if ('rdg' === $e->parentNode->nodeName) {
if (!empty($e->attributes)) {
}
} elseif ('del' === $e->nodeName) {
if ('rdg' === $e->parentNode->nodeName) {
if (!empty($e->attributes)) {
foreach ($e->attributes as $attribute) {
foreach ($e->attributes as $attribute) {
if ('hand' === $attribute->nodeName) {
$del = '<i>str. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '</i>&#x5D;';
}
if ('hand' === $attribute->nodeName) {
$del = '<i>str. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '</i>&#x5D;';
}
} else {
$del = '<i>str.</i>&#x5D;';
}
} else {
$del = '<i>str.</i>&#x5D;';
}
} elseif ('handShift' === $e->nodeName && 'signed' === $e->parentNode->nodeName) {
}
} elseif ('handShift' === $e->nodeName && 'signed' === $e->parentNode->nodeName) {
if (!empty($e->attributes)) {
if (!empty($e->attributes)) {
$pattern = '/^#[a-z_]*$/i';
$pattern = '/^#[a-z_]*$/i';
foreach ($e->attributes as $attribute) {
foreach ($e->attributes as $attribute) {
if ('scribeRef' === $attribute->nodeName) {
$match = preg_match($pattern, $attribute->nodeValue, $matches);
if ('scribeRef' === $attribute->nodeName) {
$match = preg_match($pattern, $attribute->nodeValue, $matches);
if ($match) {
$add = '<i>sign. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '</i>&rang; ';
}
if ($match) {
$add = '<i>sign. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '</i>&rang; ';
}
}
}
} elseif ('lb' === $e->nodeName) {
$tText .= '<br>';
$eText .= ' ';
} elseif ('addrLine' === $e->nodeName) {
}
} elseif ('lb' === $e->nodeName) {
$tText .= '<br>';
$eText .= ' ';
} elseif ('addrLine' === $e->nodeName) {
if ($n++ > 0) {
$eText .= '<br>';
$tText .= '<br>';
}
if ($n++ > 0) {
$eText .= '<br>';
$tText .= '<br>';
}
} elseif ('signed' === $e->nodeName) {
$tText .= '<br><br>';
$eText .= '<br><br>';
} elseif ('hi' === $e->nodeName) {
foreach ($e->attributes as $attribute) {
} elseif ('signed' === $e->nodeName) {
$tText .= '<br><br>';
$eText .= '<br><br>';
} elseif ('hi' === $e->nodeName) {
foreach ($e->attributes as $attribute) {
if (isset(explode(':', $attribute->value)[1])) {
if (isset(explode(':', $attribute->value)[1])) {
$rend = explode(':', $attribute->value)[1];
if ('superscript' !== $rend && 'italic' !== $rend) {
$renditions[] = explode(':', $attribute->value)[1];
}
$rend = explode(':', $attribute->value)[1];
if ('superscript' !== $rend && 'italic' !== $rend) {
$renditions[] = explode(':', $attribute->value)[1];
}
}
} elseif ('name' === $e->nodeName && (isset($e->attributes[1]->value) && !empty($e->attributes[1]->value)) && (isset($e->attributes[0]->value) && !empty($e->attributes[0]->value))) {
$uuid = $this->getUuid();
$gndsUuids[$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
$pagesGndsUuids[$k][$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
$eText .= '<span class="' . $e->attributes[0]->value . '" id="' . $uuid . '">';
$gnd = true;
}
} elseif ('name' === $e->nodeName && (isset($e->attributes[1]->value) && !empty($e->attributes[1]->value)) && (isset($e->attributes[0]->value) && !empty($e->attributes[0]->value))) {
$uuid = $this->getUuid();
$gndsUuids[$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
$pagesGndsUuids[$k][$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
$eText .= '<span class="' . $e->attributes[0]->value . '" id="' . $uuid . '">';
$gnd = true;
}
}
$eText = preg_replace('~\x{00AD}~u', '-', $eText);
$pattern = '/(\w+)-\s(\w)/i';
$eText = preg_replace('~\x{00AD}~u', '-', $eText);
$pattern = '/(\w+)-\s(\w)/i';
$eText = preg_replace_callback(
$pattern,
function ($match) {
return $match[1] . $match[2];
},
$eText
);
$eText = preg_replace_callback(
$pattern,
function ($match) {
return $match[1] . $match[2];
},
$eText
);
$myeditedText .= '<p>' . $eText . '</p>';
$mytranscriptedText .= '<p>' . $tText . '</p>';
$myeditedText .= '<p>' . $eText . '</p>';
$mytranscriptedText .= '<p>' . $tText . '</p>';
} elseif ('pb' === $element->nodeName) {
foreach ($element->attributes as $key => $attribute) {
if ('facs' === $attribute->name) {
if (isset($graphics[trim($attribute->value, '#')])) {
$graphic = $graphics[trim($attribute->value, '#')];
} elseif ('pb' === $element->nodeName) {
foreach ($element->attributes as $key => $attribute) {
if ('facs' === $attribute->name) {
if (isset($graphics[trim($attribute->value, '#')])) {
$graphic = $graphics[trim($attribute->value, '#')];
if (str_ends_with($graphic, '.jpg')) {
$graphic = substr($graphic, 0, strlen($graphic) - 4);
}
if (str_ends_with($graphic, '.jpg')) {
$graphic = substr($graphic, 0, strlen($graphic) - 4);
}
} elseif ('n' === $attribute->name) {
$pageNumber = $attribute->value;
}
if (!empty($pageNumber) && !empty($graphic)) {
$href = '<a href="/' . $graphic . '" target="_blank"/>' . $pageNumber . '</a>';
} elseif (!empty($pageNumber) && empty($graphic)) {
$href = $pageNumber;
} else {
$href = '';
}
} elseif ('n' === $attribute->name) {
$pageNumber = $attribute->value;
}
$tText = $href;
$eText = $href;
$myeditedText .= '<p>' . $eText . '</p>';
$mytranscriptedText .= '<p>' . $tText . '</p>';
if (!empty($pageNumber) && !empty($graphic)) {
$href = '<a href="/' . $graphic . '" target="_blank"/>' . $pageNumber . '</a>';
} elseif (!empty($pageNumber) && empty($graphic)) {
$href = $pageNumber;
} else {
$href = '';
}
}
}
$editedTextArr[] = $myeditedText;
$transcriptedTextArr[] = $mytranscriptedText;
$tText = $href;
$eText = $href;
$myeditedText .= '<p>' . $eText . '</p>';
$mytranscriptedText .= '<p>' . $tText . '</p>';
}
}
$transcriptedText = '';
foreach ($transcriptedTextArr as $mytranscriptedText) {
$transcriptedText .= $mytranscriptedText;
}
$editedTextArr[] = $myeditedText;
$transcriptedTextArr[] = $mytranscriptedText;
}
$editedText = '';
foreach ($editedTextArr as $myeditedText) {
$editedText .= $myeditedText;
}
$transcriptedText = '';
foreach ($transcriptedTextArr as $mytranscriptedText) {
$transcriptedText .= $mytranscriptedText;
}
$solrDocument->setTranscriptedText($transcriptedText);
$solrDocument->setTranscriptedTextArr($transcriptedTextArr);
$solrDocument->setEditedText($editedText);
$solrDocument->setEditedTextArr($editedTextArr);
$solrDocument->setGndsUuids($gndsUuids);
$solrDocument->setPagesGndsUuids($pagesGndsUuids);
$editedText = '';
foreach ($editedTextArr as $myeditedText) {
$editedText .= $myeditedText;
}
$solrDocument->setTranscriptedText($transcriptedText);
$solrDocument->setTranscriptedTextArr($transcriptedTextArr);
$solrDocument->setEditedText($editedText);
$solrDocument->setEditedTextArr($editedTextArr);
$solrDocument->setGndsUuids($gndsUuids);
$solrDocument->setPagesGndsUuids($pagesGndsUuids);
return $solrDocument;
}
......@@ -1130,7 +1152,12 @@ class SimplexmlController extends AbstractController
*/
public function fetchTeis(): void
{
for ($i = 1; $i <= 100; $i++) {
// Make sure the TEI target directory exists before anything is written into it.
// The previous guard was inverted: it only called mkdir() when the directory
// was already present, so a missing directory was never created.
$filesystem = new Filesystem();
if (!$filesystem->exists($this->teiDir)) {
$filesystem->mkdir($this->teiDir);
}
for ($i = 1; $i <= 150; $i