Commit 86aaee82 authored by asajedi's avatar asajedi
Browse files

Validate TEI file before loading

parent f2eb5186
......@@ -3,7 +3,9 @@
<component name="ChangeListManager">
<list default="true" id="9299d56e-0c70-4b88-b2b1-2eb4d0694c1d" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/package.json" beforeDir="false" afterPath="$PROJECT_DIR$/package.json" afterDir="false" />
<change beforePath="$PROJECT_DIR$/config/services.yaml" beforeDir="false" afterPath="$PROJECT_DIR$/config/services.yaml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/src/Command/SolrIndexing.php" beforeDir="false" afterPath="$PROJECT_DIR$/src/Command/SolrIndexing.php" afterDir="false" />
<change beforePath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" beforeDir="false" afterPath="$PROJECT_DIR$/src/Controller/SimplexmlController.php" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
......@@ -23,14 +25,17 @@
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Annotation/PartOf.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/.env.dist" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Annotation/AnnotationCollection.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/composer.json" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Annotation/Target.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/templates/item/detail.html.twig" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Presentation/Title.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Presentation/Item.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/docker-compose.yaml" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Document.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Presentation/Image.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/src/Controller/SimplexmlController.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/composer.json" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/config/services.yaml" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/src/Command/SolrIndexing.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Document.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/var/emo-bundle/Model/Annotation/AnnotationPage.php" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
</component>
<component name="PhpWorkspaceProjectConfiguration">
......@@ -196,7 +201,7 @@
<property name="WebServerToolWindowPanel.toolwindow.show.date" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.permissions" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.size" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="last_opened_file_path" value="$PROJECT_DIR$/../digizeit-app" />
<property name="nodejs_package_manager_path" value="yarn" />
<property name="settings.editor.selected.configurable" value="preferences.editor" />
<property name="vue.rearranger.settings.migration" value="true" />
......@@ -248,6 +253,10 @@
<workItem from="1630014126785" duration="612000" />
<workItem from="1630441686460" duration="2123000" />
<workItem from="1631180525162" duration="176000" />
<workItem from="1631236749149" duration="8000" />
<workItem from="1631272334167" duration="2663000" />
<workItem from="1631618672072" duration="9455000" />
<workItem from="1631699750162" duration="4175000" />
</task>
<servers />
</component>
......
......@@ -22,7 +22,8 @@ parameters:
SBB_SPK: 'Staatsbibliothek Preußischer Kulturbesitz, Berlin'
SMB_ZA: 'Zentralarchiv Staatliche Museen zu Berlin Preußischer Kulturbesitz'
UAHW: 'Archiv der Martin-Luther-Universität Halle-Wittenberg, Halle / S.'
tei_dir: '%kernel.project_dir%/teis/sampletei/'
tei_dir: '%kernel.project_dir%/data/gitlab/'
# tei_dir: '%kernel.project_dir%/teis/sampletei/'
services:
# default configuration for services in *this* file
......
......@@ -41,7 +41,7 @@ class SolrIndexing extends Command
{
$output->writeln('Start solr indexing.');
// $this->simplexmlController->fetchTeis();
$this->simplexmlController->fetchTeis();
$this->simplexmlController->deleteSolrIndex();
$this->simplexmlController->tei2solr();
......
......@@ -505,279 +505,281 @@ class SimplexmlController extends AbstractController
$finder->files()->in($this->teiDir);
foreach ($finder as $file) {
libxml_use_internal_errors(TRUE);
$doc = new DOMDocument();
$doc->load($file->getRealPath());
$xpath = new DOMXPath($doc);
$xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
$id = $this->getId($xpath);
$docType = 'article';
$shortTitle = $this->getShortTitle($xpath);
$title = $this->getTitle($xpath);
if (!libxml_get_errors()) {
$xpath = new DOMXPath($doc);
$xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
$originPlaceGNDNode = $xpath->query('//tei:name[@type="place" and @subtype="orn"]/@ref');
$id = $this->getId($xpath);
$docType = 'article';
$shortTitle = $this->getShortTitle($xpath);
$title = $this->getTitle($xpath);
if ($originPlaceGNDNode->item(0)) {
$originPlaceGND = $originPlaceGNDNode->item(0)->nodeValue;
}
$originPlaceGNDNode = $xpath->query('//tei:name[@type="place" and @subtype="orn"]/@ref');
$originPlace = $this->getOriginPlace($xpath);
$author = $this->getAuthor($xpath);
$recipient = $this->getRecipient($xpath);
$destinationPlace = $this->getDestinationPlace($xpath);
$originDate = $this->getOriginDate($xpath);
$license = $this->getLicense($xpath);
$language = $this->getLanguage($xpath);
$reference = $this->getReference($xpath);
$response = $this->getResponse($xpath);
$relatedItems = $this->getRelatedItems($xpath);
$repository = $this->getRepository($xpath);
$institution = $this->getInstitution($xpath);
$settlement = $this->getSettlement($xpath);
$country = $this->getCountry($xpath);
if (isset($repository) && isset($institution) && isset($settlement) && isset($country)) {
$institution = $repository . ', ' . $institution . ', ' . $settlement . ' (' . $country . ')';
$institution = trim(preg_replace('/\s+/', ' ', $institution));
}
if ($originPlaceGNDNode->item(0)) {
$originPlaceGND = $originPlaceGNDNode->item(0)->nodeValue;
}
$sourceDescription = $this->getSourceDescription($xpath);
$publicationDate = $this->getPublicationDate($xpath);
$fulltext = $this->getFulltext($xpath);
$numberOfPages = $this->getNumberOfPages($xpath);
$gndKeywords = $this->getGndKeywords($xpath);
$freeKeywords = $this->getFreeKeywords($xpath);
$shelfmark = $this->getShelfmark($xpath);
$scriptSource = $this->getScriptSource($xpath);
$writers = $this->getWriters($xpath);
$imageIds = $this->getImageIds($xpath);
$imageUrls = $this->getImageUrls($xpath);
$graphics = $this->getGraphics($imageIds, $imageUrls);
$entities = $this->getEntities($xpath);
$doctypeNotes = $this->getDoctypeNotes($xpath, $id);
$solrDocument = $this->getTextVersions($file->getRealPath(), $graphics);
$transcription = $solrDocument->getTranscriptedText();
$pagesTranscription = $solrDocument->getTranscriptedTextArr();
$editedText = $solrDocument->getEditedText();
$pagesEdited = $solrDocument->getEditedTextArr();
$gndsUuids = $solrDocument->getGndsUuids();
$pagesGndsUuids = $solrDocument->getPagesGndsUuids();
$update = $this->client->createUpdate();
$doc = $update->createDocument();
if (!empty($id)) {
$doc->id = $id;
$doc->doctype = $docType;
$doc->short_title = $shortTitle;
$doc->title = $title;
if (isset($originPlace) && !empty($originPlace)) {
$doc->origin_place = $originPlace;
$originPlace = $this->getOriginPlace($xpath);
$author = $this->getAuthor($xpath);
$recipient = $this->getRecipient($xpath);
$destinationPlace = $this->getDestinationPlace($xpath);
$originDate = $this->getOriginDate($xpath);
$license = $this->getLicense($xpath);
$language = $this->getLanguage($xpath);
$reference = $this->getReference($xpath);
$response = $this->getResponse($xpath);
$relatedItems = $this->getRelatedItems($xpath);
$repository = $this->getRepository($xpath);
$institution = $this->getInstitution($xpath);
$settlement = $this->getSettlement($xpath);
$country = $this->getCountry($xpath);
if (isset($repository) && isset($institution) && isset($settlement) && isset($country)) {
$institution = $repository . ', ' . $institution . ', ' . $settlement . ' (' . $country . ')';
$institution = trim(preg_replace('/\s+/', ' ', $institution));
}
$doc->author = $author;
$sourceDescription = $this->getSourceDescription($xpath);
$publicationDate = $this->getPublicationDate($xpath);
$fulltext = $this->getFulltext($xpath);
$numberOfPages = $this->getNumberOfPages($xpath);
$gndKeywords = $this->getGndKeywords($xpath);
$freeKeywords = $this->getFreeKeywords($xpath);
$shelfmark = $this->getShelfmark($xpath);
$scriptSource = $this->getScriptSource($xpath);
$writers = $this->getWriters($xpath);
$imageIds = $this->getImageIds($xpath);
$imageUrls = $this->getImageUrls($xpath);
$graphics = $this->getGraphics($imageIds, $imageUrls);
$entities = $this->getEntities($xpath);
$doctypeNotes = $this->getDoctypeNotes($xpath, $id);
$solrDocument = $this->getTextVersions($file->getRealPath(), $graphics);
$transcription = $solrDocument->getTranscriptedText();
$pagesTranscription = $solrDocument->getTranscriptedTextArr();
$editedText = $solrDocument->getEditedText();
$pagesEdited = $solrDocument->getEditedTextArr();
$gndsUuids = $solrDocument->getGndsUuids();
$pagesGndsUuids = $solrDocument->getPagesGndsUuids();
$update = $this->client->createUpdate();
$doc = $update->createDocument();
if (!empty($id)) {
$doc->id = $id;
$doc->doctype = $docType;
$doc->short_title = $shortTitle;
$doc->title = $title;
if (isset($originPlace) && !empty($originPlace)) {
$doc->origin_place = $originPlace;
}
if (isset($recipient) && !empty($recipient)) {
$doc->recipient = $recipient;
}
$doc->author = $author;
if (isset($originDate) && !empty($originDate)) {
$doc->origin_date = $originDate;
}
if (isset($recipient) && !empty($recipient)) {
$doc->recipient = $recipient;
}
if (isset($destinationPlace) && !empty($destinationPlace)) {
$doc->destination_place = $destinationPlace;
}
if (isset($originDate) && !empty($originDate)) {
$doc->origin_date = $originDate;
}
$doc->license = $license;
$doc->language = $language;
if (isset($destinationPlace) && !empty($destinationPlace)) {
$doc->destination_place = $destinationPlace;
}
if (isset($reference) && !empty($reference)) {
$doc->reference = $reference;
}
$doc->license = $license;
$doc->language = $language;
if (isset($response) && !empty($response)) {
$doc->response = $response;
}
if (isset($reference) && !empty($reference)) {
$doc->reference = $reference;
}
if (isset($relatedItem) && !empty($relatedItem)) {
$doc->related_items = $relatedItems;
}
if (isset($response) && !empty($response)) {
$doc->response = $response;
}
if (isset($institution) && !empty($institution)) {
$doc->institution = $institution;
}
if (isset($relatedItem) && !empty($relatedItem)) {
$doc->related_items = $relatedItems;
}
if (isset($sourceDescription) && !empty($sourceDescription)) {
$doc->source_description = $sourceDescription;
}
if (isset($institution) && !empty($institution)) {
$doc->institution = $institution;
}
if (isset($publicationDate) && !empty($publicationDate)) {
$doc->article_pub_date = $publicationDate;
}
if (isset($sourceDescription) && !empty($sourceDescription)) {
$doc->source_description = $sourceDescription;
}
if (isset($fulltext) && !empty($fulltext)) {
$doc->fulltext = $fulltext;
}
if (isset($publicationDate) && !empty($publicationDate)) {
$doc->article_pub_date = $publicationDate;
}
if (isset($numberOfPages) && !empty($numberOfPages)) {
$doc->number_of_pages = $numberOfPages;
}
if (isset($fulltext) && !empty($fulltext)) {
$doc->fulltext = $fulltext;
}
if (isset($gndKeywords) && !empty($gndKeywords)) {
$doc->gnd_keyword = $gndKeywords;
}
if (isset($numberOfPages) && !empty($numberOfPages)) {
$doc->number_of_pages = $numberOfPages;
}
if (isset($freeKeywords) && !empty($freeKeywords)) {
$doc->free_keyword = $freeKeywords;
}
if (isset($gndKeywords) && !empty($gndKeywords)) {
$doc->gnd_keyword = $gndKeywords;
}
if (isset($shelfmark) && !empty($shelfmark)) {
$doc->shelfmark = $shelfmark;
}
if (isset($freeKeywords) && !empty($freeKeywords)) {
$doc->free_keyword = $freeKeywords;
}
if (isset($scriptSource) && !empty($scriptSource)) {
$doc->script_source = $scriptSource;
}
if (isset($shelfmark) && !empty($shelfmark)) {
$doc->shelfmark = $shelfmark;
}
if (isset($writers) && !empty($writers)) {
$doc->writer = $writers;
}
if (isset($scriptSource) && !empty($scriptSource)) {
$doc->script_source = $scriptSource;
}
if (isset($imageIds) && !empty($imageIds)) {
$doc->image_ids = $imageIds;
}
if (isset($writers) && !empty($writers)) {
$doc->writer = $writers;
}
if (isset($imageUrls) && !empty($imageUrls)) {
$doc->image_urls = $imageUrls;
}
if (isset($imageIds) && !empty($imageIds)) {
$doc->image_ids = $imageIds;
}
if (isset($documentEntities) && !empty($documentEntities)) {
$doc->entities = $documentEntities;
}
if (isset($imageUrls) && !empty($imageUrls)) {
$doc->image_urls = $imageUrls;
}
if (isset($notes) && !empty($notes)) {
$doc->notes = $notes;
}
if (isset($documentEntities) && !empty($documentEntities)) {
$doc->entities = $documentEntities;
}
if (isset($transcription) && !empty($transcription)) {
$doc->transcripted_text = $transcription;
$doc->edited_text = $editedText;
}
if (isset($notes) && !empty($notes)) {
$doc->notes = $notes;
}
if (!empty($numberOfPages) && intval($numberOfPages)) {
for ($i = 1; $i <= $numberOfPages; $i++) {
$update1 = $this->client->createUpdate();
$childDoc = $update1->createDocument();
$childDoc->id = $id . '_page' . $i;
$childDoc->article_id = $id;
$childDoc->article_title = $title;
$childDoc->doctype = 'page';
$childDoc->page_number = $i;
$childDoc->language = $language;
if (isset($imageUrls[$i - 1]) && !empty($imageUrls[$i - 1])) {
$childDoc->image_url = $imageUrls[$i - 1];
}
if (isset($transcription) && !empty($transcription)) {
$doc->transcripted_text = $transcription;
$doc->edited_text = $editedText;
}
if (isset($pagesTranscription[$i - 1]) && !empty($pagesTranscription[$i - 1])) {
$childDoc->transcripted_text = $pagesTranscription[$i - 1];
}
if (!empty($numberOfPages) && intval($numberOfPages)) {
for ($i = 1; $i <= $numberOfPages; $i++) {
$update1 = $this->client->createUpdate();
$childDoc = $update1->createDocument();
$childDoc->id = $id . '_page' . $i;
$childDoc->article_id = $id;
$childDoc->article_title = $title;
$childDoc->doctype = 'page';
$childDoc->page_number = $i;
$childDoc->language = $language;
if (isset($imageUrls[$i - 1]) && !empty($imageUrls[$i - 1])) {
$childDoc->image_url = $imageUrls[$i - 1];
}
if (isset($pagesEdited[$i - 1]) && !empty($pagesEdited[$i - 1])) {
$childDoc->edited_text = $pagesEdited[$i - 1];
}
if (isset($pagesTranscription[$i - 1]) && !empty($pagesTranscription[$i - 1])) {
$childDoc->transcripted_text = $pagesTranscription[$i - 1];
}
if (isset($pagesGndsUuids[$i - 1]) && !empty(($pagesGndsUuids[$i - 1]))) {
$childDoc->entities = array_values($pagesGndsUuids[$i - 1]);
$childDoc->annotation_ids = array_keys($pagesGndsUuids[$i - 1]);
}
if (isset($pagesEdited[$i - 1]) && !empty($pagesEdited[$i - 1])) {
$childDoc->edited_text = $pagesEdited[$i - 1];
}
$update->addDocument($childDoc);
if (isset($pagesGndsUuids[$i - 1]) && !empty(($pagesGndsUuids[$i - 1]))) {
$childDoc->entities = array_values($pagesGndsUuids[$i - 1]);
$childDoc->annotation_ids = array_keys($pagesGndsUuids[$i - 1]);
}
$update->addDocument($childDoc);
}
}
}
$update->addDocument($doc);
$update->addCommit();
$this->client->execute($update);
if (isset($doctypeNotes) && is_iterable($doctypeNotes)) {
foreach ($doctypeNotes as $doctypeNoteArr) {
foreach ($doctypeNoteArr as $doctypeNote) {
if (!empty($doctypeNote['id'])) {
$update = $this->client->createUpdate();
$doc = $update->createDocument();
$doc->id = $doctypeNote['id'];
$doc->article_id = $doctypeNote['article_id'];
$doc->doctype = $doctypeNote['doctype'];
$doc->note = $doctypeNote['note'];
$update->addDocument($doc);
$update->addCommit();
$this->client->execute($update);
$update->addDocument($doc);
$update->addCommit();
$this->client->execute($update);
if (isset($doctypeNotes) && is_iterable($doctypeNotes)) {
foreach ($doctypeNotes as $doctypeNoteArr) {
foreach ($doctypeNoteArr as $doctypeNote) {
if (!empty($doctypeNote['id'])) {
$update = $this->client->createUpdate();
$doc = $update->createDocument();
$doc->id = $doctypeNote['id'];
$doc->article_id = $doctypeNote['article_id'];
$doc->doctype = $doctypeNote['doctype'];
$doc->note = $doctypeNote['note'];
$update->addDocument($doc);
$update->addCommit();
$this->client->execute($update);
}
}
}
}
}
}
if (isset($entities) && is_iterable($entities)) {
foreach ($entities as $entity) {
if (!empty($entity['gnd'])) {
$localFilePath = './../data/gnd-files/' . $entity['gnd'] . '.json';
if (!file_exists($localFilePath)) {
if (isset($entities) && is_iterable($entities)) {
foreach ($entities as $entity) {
if (!empty($entity['gnd'])) {
$localFilePath = './../data/gnd-files/' . $entity['gnd'] . '.json';
if (!file_exists($localFilePath)) {
$remoteFilePath = 'https://lobid.org/gnd/' . $entity['gnd'] . '.json';
$remoteFilePath = 'https://lobid.org/gnd/' . $entity['gnd'] . '.json';
try {
$fileContent = @file_get_contents($remoteFilePath, true);
try {
$fileContent = @file_get_contents($remoteFilePath, true);
if($fileContent==false) {
throw new Exception($localFilePath);
if ($fileContent == false) {
throw new Exception($localFilePath);
}
} catch (Exception $e) {
echo $e->getMessage();
}
} catch (Exception $e)
{
echo $e->getMessage();
}
$filesystem = new Filesystem();
$filesystem->dumpFile($localFilePath, $fileContent);
$filesystem = new Filesystem();
$filesystem->dumpFile($localFilePath, $fileContent);
$gndArr = json_decode($fileContent);
} else {
$gndArr = json_decode(file_get_contents($localFilePath));
}
$gndArr = json_decode($fileContent);
} else {
$gndArr = json_decode(file_get_contents($localFilePath));
}
if (isset($gndArr->preferredName) && !empty($gndArr->preferredName)) {
$preferredName = $gndArr->preferredName;
}
if (isset($gndArr->preferredName) && !empty($gndArr->preferredName)) {
$preferredName = $gndArr->preferredName;
}
if (isset($gndArr->variantName) && !empty($gndArr->variantName)) {
$variantNames = $gndArr->variantName;
}
if (isset($gndArr->variantName) && !empty($gndArr->variantName)) {
$variantNames = $gndArr->variantName;
}
$update = $this->client->createUpdate();
$doc = $update->createDocument();
$doc->id = $entity['gnd'];
$doc->entity_name = $entity['name'];
$doc->doctype = $entity['doctype'];
$doc->entitytype = $entity['entity_type'];
$update = $this->client->createUpdate();
$doc = $update->createDocument();
$doc->id = $entity['gnd'];
$doc->entity_name = $entity['name'];
$doc->doctype = $entity['doctype'];
$doc->entitytype = $entity['entity_type'];
if (isset($preferredName) && !empty($preferredName)) {
$doc->mostly_use_name = $preferredName;
}
if (isset($preferredName) && !empty($preferredName)) {
$doc->mostly_use_name = $preferredName;
}
if (is_iterable($variantNames) && !empty($variantNames)) {
$doc->alternatively_name = $variantNames;
}
if (is_iterable($variantNames) && !empty($variantNames)) {
$doc->alternatively_name = $variantNames;
}
$update->addDocument($doc);
$update->addCommit();
$update->addDocument($doc);
$update->addCommit();
$this->client->execute($update);
$this->client->execute($update);
}
}
}