Commit 308df675 authored by asajedi
Browse files

Log invalid TEI files including error messages

parent 86aaee82
...@@ -3,8 +3,6 @@ ...@@ -3,8 +3,6 @@
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="9299d56e-0c70-4b88-b2b1-2eb4d0694c1d" name="Default Changelist" comment=""> <list default="true" id="9299d56e-0c70-4b88-b2b1-2eb4d0694c1d" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/config/services.yaml" beforeDir="false" afterPath="$PROJECT_DIR$/config/services.yaml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/src/Command/SolrIndexing.php" beforeDir="false" afterPath="$PROJECT_DIR$/src/Command/SolrIndexing.php" afterDir="false" />
<change beforePath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" beforeDir="false" afterPath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" afterDir="false" /> <change beforePath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" beforeDir="false" afterPath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" afterDir="false" />
</list> </list>
<option name="SHOW_DIALOG" value="false" /> <option name="SHOW_DIALOG" value="false" />
...@@ -201,7 +199,7 @@ ...@@ -201,7 +199,7 @@
<property name="WebServerToolWindowPanel.toolwindow.show.date" value="false" /> <property name="WebServerToolWindowPanel.toolwindow.show.date" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.permissions" value="false" /> <property name="WebServerToolWindowPanel.toolwindow.show.permissions" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.size" value="false" /> <property name="WebServerToolWindowPanel.toolwindow.show.size" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$/../digizeit-app" /> <property name="last_opened_file_path" value="$PROJECT_DIR$/../nlh-app" />
<property name="nodejs_package_manager_path" value="yarn" /> <property name="nodejs_package_manager_path" value="yarn" />
<property name="settings.editor.selected.configurable" value="preferences.editor" /> <property name="settings.editor.selected.configurable" value="preferences.editor" />
<property name="vue.rearranger.settings.migration" value="true" /> <property name="vue.rearranger.settings.migration" value="true" />
...@@ -256,7 +254,9 @@ ...@@ -256,7 +254,9 @@
<workItem from="1631236749149" duration="8000" /> <workItem from="1631236749149" duration="8000" />
<workItem from="1631272334167" duration="2663000" /> <workItem from="1631272334167" duration="2663000" />
<workItem from="1631618672072" duration="9455000" /> <workItem from="1631618672072" duration="9455000" />
<workItem from="1631699750162" duration="4175000" /> <workItem from="1631699750162" duration="5357000" />
<workItem from="1631791563265" duration="2112000" />
<workItem from="1631795325839" duration="20066000" />
</task> </task>
<servers /> <servers />
</component> </component>
......
<?php <?php
namespace App\Controller; declare(strict_types=1);
namespace App\Controller;
use App\Model\SolrDocument; use App\Model\SolrDocument;
use DOMDocument; use DOMDocument;
...@@ -501,6 +502,7 @@ class SimplexmlController extends AbstractController ...@@ -501,6 +502,7 @@ class SimplexmlController extends AbstractController
*/ */
public function tei2solr(): void public function tei2solr(): void
{ {
$this->client->getEndpoint()->setOptions(['timeout' => 60]);
$finder = new Finder(); $finder = new Finder();
$finder->files()->in($this->teiDir); $finder->files()->in($this->teiDir);
...@@ -782,6 +784,28 @@ class SimplexmlController extends AbstractController ...@@ -782,6 +784,28 @@ class SimplexmlController extends AbstractController
} }
} }
} }
} else {
$filesystem = new Filesystem();
$teiImportLogFile = './data/log/teiImportLogs.txt';
if (!$filesystem->exists($teiImportLogFile)) {
$filesystem->mkdir('./data/log');
$filesystem->touch($teiImportLogFile);
}
$errors = [];
foreach (libxml_get_errors() as $key => $error) {
if (0 === $key) {
$errors[] = explode('/', $error->file)[4].PHP_EOL;
$errors[] = '--------------------'.PHP_EOL;
}
$errors[] = $error->message;
}
$filesystem->appendToFile($teiImportLogFile, implode('', $errors));
libxml_clear_errors();
} }
} }
} }
...@@ -845,7 +869,6 @@ class SimplexmlController extends AbstractController ...@@ -845,7 +869,6 @@ class SimplexmlController extends AbstractController
$doc = new DOMDocument(); $doc = new DOMDocument();
$doc->load($filePath); $doc->load($filePath);
if (!libxml_get_errors()) {
$xpath = new DOMXPath($doc); $xpath = new DOMXPath($doc);
$xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0'); $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
$pagesNodes = $xpath->query('//tei:body'); $pagesNodes = $xpath->query('//tei:body');
...@@ -1120,7 +1143,6 @@ class SimplexmlController extends AbstractController ...@@ -1120,7 +1143,6 @@ class SimplexmlController extends AbstractController
$solrDocument->setEditedTextArr($editedTextArr); $solrDocument->setEditedTextArr($editedTextArr);
$solrDocument->setGndsUuids($gndsUuids); $solrDocument->setGndsUuids($gndsUuids);
$solrDocument->setPagesGndsUuids($pagesGndsUuids); $solrDocument->setPagesGndsUuids($pagesGndsUuids);
}
return $solrDocument; return $solrDocument;
} }
...@@ -1130,7 +1152,12 @@ class SimplexmlController extends AbstractController ...@@ -1130,7 +1152,12 @@ class SimplexmlController extends AbstractController
*/ */
public function fetchTeis(): void public function fetchTeis(): void
{ {
for ($i = 1; $i <= 100; $i++) { $filesystem = new Filesystem();
if ($filesystem->exists($this->teiDir)) {
$filesystem->mkdir($this->teiDir);
}
for ($i = 1; $i <= 150; $i++) {
try { try {
$files = file_get_contents('https://gitlab.gwdg.de/api/v4/projects/3451/repository/tree?access_token=ordUT8XGFzf-HMsXvCXU&path=TEI_doc_bearb&per_page=10&page=' . $i); $files = file_get_contents('https://gitlab.gwdg.de/api/v4/projects/3451/repository/tree?access_token=ordUT8XGFzf-HMsXvCXU&path=TEI_doc_bearb&per_page=10&page=' . $i);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment