diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 259a947fa0e83cf65deafc503c07f67b89edf71b..b9eb5551cbc4a6bd68583830b370ded34d4f5891 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -3,7 +3,9 @@
-
+
+
+
@@ -23,14 +25,17 @@
-
-
+
+
+
+
+
@@ -196,7 +201,7 @@
-
+
@@ -248,6 +253,10 @@
+
+
+
+
diff --git a/config/services.yaml b/config/services.yaml
index 9f9f92069d9b4b3edbb5b20889169a08db46ec79..d4a8b45e59d0fa52a9186770a67c90da4fc4cdc4 100755
--- a/config/services.yaml
+++ b/config/services.yaml
@@ -22,7 +22,8 @@ parameters:
SBB_SPK: 'Staatsbibliothek Preußischer Kulturbesitz, Berlin'
SMB_ZA: 'Zentralarchiv Staatliche Museen zu Berlin – Preußischer Kulturbesitz'
UAHW: 'Archiv der Martin-Luther-Universität Halle-Wittenberg, Halle / S.'
- tei_dir: '%kernel.project_dir%/teis/sampletei/'
+ tei_dir: '%kernel.project_dir%/data/gitlab/'
+# tei_dir: '%kernel.project_dir%/teis/sampletei/'
services:
# default configuration for services in *this* file
diff --git a/src/Command/SolrIndexing.php b/src/Command/SolrIndexing.php
index a5c56fd621bf590c20fa6408d4e8e7f4c383fdd3..f6fe37baf66a8ccab561456e9272457bddea249b 100644
--- a/src/Command/SolrIndexing.php
+++ b/src/Command/SolrIndexing.php
@@ -41,7 +41,7 @@ class SolrIndexing extends Command
{
$output->writeln('Start solr indexing.');
-// $this->simplexmlController->fetchTeis();
+ $this->simplexmlController->fetchTeis();
$this->simplexmlController->deleteSolrIndex();
$this->simplexmlController->tei2solr();
diff --git a/src/Controller/SimplexmlController.php b/src/Controller/SimplexmlController.php
index d9f263803614523599113a2cc42eceb8e66d32fd..10eeb6ca6d2c9dc3aaef4d6ebeba5b7346788936 100755
--- a/src/Controller/SimplexmlController.php
+++ b/src/Controller/SimplexmlController.php
@@ -505,279 +505,281 @@ class SimplexmlController extends AbstractController
$finder->files()->in($this->teiDir);
foreach ($finder as $file) {
+ libxml_use_internal_errors(TRUE);
$doc = new DOMDocument();
$doc->load($file->getRealPath());
- $xpath = new DOMXPath($doc);
- $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
- $id = $this->getId($xpath);
- $docType = 'article';
- $shortTitle = $this->getShortTitle($xpath);
- $title = $this->getTitle($xpath);
+ if (!libxml_get_errors()) {
+ $xpath = new DOMXPath($doc);
+ $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
- $originPlaceGNDNode = $xpath->query('//tei:name[@type="place" and @subtype="orn"]/@ref');
+ $id = $this->getId($xpath);
+ $docType = 'article';
+ $shortTitle = $this->getShortTitle($xpath);
+ $title = $this->getTitle($xpath);
- if ($originPlaceGNDNode->item(0)) {
- $originPlaceGND = $originPlaceGNDNode->item(0)->nodeValue;
- }
+ $originPlaceGNDNode = $xpath->query('//tei:name[@type="place" and @subtype="orn"]/@ref');
- $originPlace = $this->getOriginPlace($xpath);
- $author = $this->getAuthor($xpath);
- $recipient = $this->getRecipient($xpath);
- $destinationPlace = $this->getDestinationPlace($xpath);
- $originDate = $this->getOriginDate($xpath);
- $license = $this->getLicense($xpath);
- $language = $this->getLanguage($xpath);
- $reference = $this->getReference($xpath);
- $response = $this->getResponse($xpath);
- $relatedItems = $this->getRelatedItems($xpath);
- $repository = $this->getRepository($xpath);
- $institution = $this->getInstitution($xpath);
- $settlement = $this->getSettlement($xpath);
- $country = $this->getCountry($xpath);
- if (isset($repository) && isset($institution) && isset($settlement) && isset($country)) {
- $institution = $repository . ', ' . $institution . ', ' . $settlement . ' (' . $country . ')';
- $institution = trim(preg_replace('/\s+/', ' ', $institution));
- }
+ if ($originPlaceGNDNode->item(0)) {
+ $originPlaceGND = $originPlaceGNDNode->item(0)->nodeValue;
+ }
- $sourceDescription = $this->getSourceDescription($xpath);
- $publicationDate = $this->getPublicationDate($xpath);
- $fulltext = $this->getFulltext($xpath);
- $numberOfPages = $this->getNumberOfPages($xpath);
- $gndKeywords = $this->getGndKeywords($xpath);
- $freeKeywords = $this->getFreeKeywords($xpath);
- $shelfmark = $this->getShelfmark($xpath);
- $scriptSource = $this->getScriptSource($xpath);
- $writers = $this->getWriters($xpath);
- $imageIds = $this->getImageIds($xpath);
- $imageUrls = $this->getImageUrls($xpath);
- $graphics = $this->getGraphics($imageIds, $imageUrls);
- $entities = $this->getEntities($xpath);
- $doctypeNotes = $this->getDoctypeNotes($xpath, $id);
-
- $solrDocument = $this->getTextVersions($file->getRealPath(), $graphics);
- $transcription = $solrDocument->getTranscriptedText();
- $pagesTranscription = $solrDocument->getTranscriptedTextArr();
- $editedText = $solrDocument->getEditedText();
- $pagesEdited = $solrDocument->getEditedTextArr();
- $gndsUuids = $solrDocument->getGndsUuids();
- $pagesGndsUuids = $solrDocument->getPagesGndsUuids();
-
- $update = $this->client->createUpdate();
- $doc = $update->createDocument();
-
- if (!empty($id)) {
- $doc->id = $id;
- $doc->doctype = $docType;
- $doc->short_title = $shortTitle;
- $doc->title = $title;
-
- if (isset($originPlace) && !empty($originPlace)) {
- $doc->origin_place = $originPlace;
+ $originPlace = $this->getOriginPlace($xpath);
+ $author = $this->getAuthor($xpath);
+ $recipient = $this->getRecipient($xpath);
+ $destinationPlace = $this->getDestinationPlace($xpath);
+ $originDate = $this->getOriginDate($xpath);
+ $license = $this->getLicense($xpath);
+ $language = $this->getLanguage($xpath);
+ $reference = $this->getReference($xpath);
+ $response = $this->getResponse($xpath);
+ $relatedItems = $this->getRelatedItems($xpath);
+ $repository = $this->getRepository($xpath);
+ $institution = $this->getInstitution($xpath);
+ $settlement = $this->getSettlement($xpath);
+ $country = $this->getCountry($xpath);
+ if (isset($repository) && isset($institution) && isset($settlement) && isset($country)) {
+ $institution = $repository . ', ' . $institution . ', ' . $settlement . ' (' . $country . ')';
+ $institution = trim(preg_replace('/\s+/', ' ', $institution));
}
- $doc->author = $author;
+ $sourceDescription = $this->getSourceDescription($xpath);
+ $publicationDate = $this->getPublicationDate($xpath);
+ $fulltext = $this->getFulltext($xpath);
+ $numberOfPages = $this->getNumberOfPages($xpath);
+ $gndKeywords = $this->getGndKeywords($xpath);
+ $freeKeywords = $this->getFreeKeywords($xpath);
+ $shelfmark = $this->getShelfmark($xpath);
+ $scriptSource = $this->getScriptSource($xpath);
+ $writers = $this->getWriters($xpath);
+ $imageIds = $this->getImageIds($xpath);
+ $imageUrls = $this->getImageUrls($xpath);
+ $graphics = $this->getGraphics($imageIds, $imageUrls);
+ $entities = $this->getEntities($xpath);
+ $doctypeNotes = $this->getDoctypeNotes($xpath, $id);
+
+ $solrDocument = $this->getTextVersions($file->getRealPath(), $graphics);
+ $transcription = $solrDocument->getTranscriptedText();
+ $pagesTranscription = $solrDocument->getTranscriptedTextArr();
+ $editedText = $solrDocument->getEditedText();
+ $pagesEdited = $solrDocument->getEditedTextArr();
+ $gndsUuids = $solrDocument->getGndsUuids();
+ $pagesGndsUuids = $solrDocument->getPagesGndsUuids();
+
+ $update = $this->client->createUpdate();
+ $doc = $update->createDocument();
+
+ if (!empty($id)) {
+ $doc->id = $id;
+ $doc->doctype = $docType;
+ $doc->short_title = $shortTitle;
+ $doc->title = $title;
+
+ if (isset($originPlace) && !empty($originPlace)) {
+ $doc->origin_place = $originPlace;
+ }
- if (isset($recipient) && !empty($recipient)) {
- $doc->recipient = $recipient;
- }
+ $doc->author = $author;
- if (isset($originDate) && !empty($originDate)) {
- $doc->origin_date = $originDate;
- }
+ if (isset($recipient) && !empty($recipient)) {
+ $doc->recipient = $recipient;
+ }
- if (isset($destinationPlace) && !empty($destinationPlace)) {
- $doc->destination_place = $destinationPlace;
- }
+ if (isset($originDate) && !empty($originDate)) {
+ $doc->origin_date = $originDate;
+ }
- $doc->license = $license;
- $doc->language = $language;
+ if (isset($destinationPlace) && !empty($destinationPlace)) {
+ $doc->destination_place = $destinationPlace;
+ }
- if (isset($reference) && !empty($reference)) {
- $doc->reference = $reference;
- }
+ $doc->license = $license;
+ $doc->language = $language;
- if (isset($response) && !empty($response)) {
- $doc->response = $response;
- }
+ if (isset($reference) && !empty($reference)) {
+ $doc->reference = $reference;
+ }
- if (isset($relatedItem) && !empty($relatedItem)) {
- $doc->related_items = $relatedItems;
- }
+ if (isset($response) && !empty($response)) {
+ $doc->response = $response;
+ }
- if (isset($institution) && !empty($institution)) {
- $doc->institution = $institution;
- }
+ if (isset($relatedItem) && !empty($relatedItem)) {
+ $doc->related_items = $relatedItems;
+ }
- if (isset($sourceDescription) && !empty($sourceDescription)) {
- $doc->source_description = $sourceDescription;
- }
+ if (isset($institution) && !empty($institution)) {
+ $doc->institution = $institution;
+ }
- if (isset($publicationDate) && !empty($publicationDate)) {
- $doc->article_pub_date = $publicationDate;
- }
+ if (isset($sourceDescription) && !empty($sourceDescription)) {
+ $doc->source_description = $sourceDescription;
+ }
- if (isset($fulltext) && !empty($fulltext)) {
- $doc->fulltext = $fulltext;
- }
+ if (isset($publicationDate) && !empty($publicationDate)) {
+ $doc->article_pub_date = $publicationDate;
+ }
- if (isset($numberOfPages) && !empty($numberOfPages)) {
- $doc->number_of_pages = $numberOfPages;
- }
+ if (isset($fulltext) && !empty($fulltext)) {
+ $doc->fulltext = $fulltext;
+ }
- if (isset($gndKeywords) && !empty($gndKeywords)) {
- $doc->gnd_keyword = $gndKeywords;
- }
+ if (isset($numberOfPages) && !empty($numberOfPages)) {
+ $doc->number_of_pages = $numberOfPages;
+ }
- if (isset($freeKeywords) && !empty($freeKeywords)) {
- $doc->free_keyword = $freeKeywords;
- }
+ if (isset($gndKeywords) && !empty($gndKeywords)) {
+ $doc->gnd_keyword = $gndKeywords;
+ }
- if (isset($shelfmark) && !empty($shelfmark)) {
- $doc->shelfmark = $shelfmark;
- }
+ if (isset($freeKeywords) && !empty($freeKeywords)) {
+ $doc->free_keyword = $freeKeywords;
+ }
- if (isset($scriptSource) && !empty($scriptSource)) {
- $doc->script_source = $scriptSource;
- }
+ if (isset($shelfmark) && !empty($shelfmark)) {
+ $doc->shelfmark = $shelfmark;
+ }
- if (isset($writers) && !empty($writers)) {
- $doc->writer = $writers;
- }
+ if (isset($scriptSource) && !empty($scriptSource)) {
+ $doc->script_source = $scriptSource;
+ }
- if (isset($imageIds) && !empty($imageIds)) {
- $doc->image_ids = $imageIds;
- }
+ if (isset($writers) && !empty($writers)) {
+ $doc->writer = $writers;
+ }
- if (isset($imageUrls) && !empty($imageUrls)) {
- $doc->image_urls = $imageUrls;
- }
+ if (isset($imageIds) && !empty($imageIds)) {
+ $doc->image_ids = $imageIds;
+ }
- if (isset($documentEntities) && !empty($documentEntities)) {
- $doc->entities = $documentEntities;
- }
+ if (isset($imageUrls) && !empty($imageUrls)) {
+ $doc->image_urls = $imageUrls;
+ }
- if (isset($notes) && !empty($notes)) {
- $doc->notes = $notes;
- }
+ if (isset($documentEntities) && !empty($documentEntities)) {
+ $doc->entities = $documentEntities;
+ }
- if (isset($transcription) && !empty($transcription)) {
- $doc->transcripted_text = $transcription;
- $doc->edited_text = $editedText;
- }
+ if (isset($notes) && !empty($notes)) {
+ $doc->notes = $notes;
+ }
- if (!empty($numberOfPages) && intval($numberOfPages)) {
- for ($i = 1; $i <= $numberOfPages; $i++) {
- $update1 = $this->client->createUpdate();
- $childDoc = $update1->createDocument();
- $childDoc->id = $id . '_page' . $i;
- $childDoc->article_id = $id;
- $childDoc->article_title = $title;
- $childDoc->doctype = 'page';
- $childDoc->page_number = $i;
- $childDoc->language = $language;
-
- if (isset($imageUrls[$i - 1]) && !empty($imageUrls[$i - 1])) {
- $childDoc->image_url = $imageUrls[$i - 1];
- }
+ if (isset($transcription) && !empty($transcription)) {
+ $doc->transcripted_text = $transcription;
+ $doc->edited_text = $editedText;
+ }
- if (isset($pagesTranscription[$i - 1]) && !empty($pagesTranscription[$i - 1])) {
- $childDoc->transcripted_text = $pagesTranscription[$i - 1];
- }
+ if (!empty($numberOfPages) && intval($numberOfPages)) {
+ for ($i = 1; $i <= $numberOfPages; $i++) {
+ $update1 = $this->client->createUpdate();
+ $childDoc = $update1->createDocument();
+ $childDoc->id = $id . '_page' . $i;
+ $childDoc->article_id = $id;
+ $childDoc->article_title = $title;
+ $childDoc->doctype = 'page';
+ $childDoc->page_number = $i;
+ $childDoc->language = $language;
+
+ if (isset($imageUrls[$i - 1]) && !empty($imageUrls[$i - 1])) {
+ $childDoc->image_url = $imageUrls[$i - 1];
+ }
- if (isset($pagesEdited[$i - 1]) && !empty($pagesEdited[$i - 1])) {
- $childDoc->edited_text = $pagesEdited[$i - 1];
- }
+ if (isset($pagesTranscription[$i - 1]) && !empty($pagesTranscription[$i - 1])) {
+ $childDoc->transcripted_text = $pagesTranscription[$i - 1];
+ }
- if (isset($pagesGndsUuids[$i - 1]) && !empty(($pagesGndsUuids[$i - 1]))) {
- $childDoc->entities = array_values($pagesGndsUuids[$i - 1]);
- $childDoc->annotation_ids = array_keys($pagesGndsUuids[$i - 1]);
- }
+ if (isset($pagesEdited[$i - 1]) && !empty($pagesEdited[$i - 1])) {
+ $childDoc->edited_text = $pagesEdited[$i - 1];
+ }
- $update->addDocument($childDoc);
+ if (isset($pagesGndsUuids[$i - 1]) && !empty(($pagesGndsUuids[$i - 1]))) {
+ $childDoc->entities = array_values($pagesGndsUuids[$i - 1]);
+ $childDoc->annotation_ids = array_keys($pagesGndsUuids[$i - 1]);
+ }
+
+ $update->addDocument($childDoc);
+ }
}
- }
- $update->addDocument($doc);
- $update->addCommit();
- $this->client->execute($update);
-
- if (isset($doctypeNotes) && is_iterable($doctypeNotes)) {
- foreach ($doctypeNotes as $doctypeNoteArr) {
- foreach ($doctypeNoteArr as $doctypeNote) {
- if (!empty($doctypeNote['id'])) {
- $update = $this->client->createUpdate();
- $doc = $update->createDocument();
- $doc->id = $doctypeNote['id'];
- $doc->article_id = $doctypeNote['article_id'];
- $doc->doctype = $doctypeNote['doctype'];
- $doc->note = $doctypeNote['note'];
-
- $update->addDocument($doc);
- $update->addCommit();
-
- $this->client->execute($update);
+ $update->addDocument($doc);
+ $update->addCommit();
+ $this->client->execute($update);
+
+ if (isset($doctypeNotes) && is_iterable($doctypeNotes)) {
+ foreach ($doctypeNotes as $doctypeNoteArr) {
+ foreach ($doctypeNoteArr as $doctypeNote) {
+ if (!empty($doctypeNote['id'])) {
+ $update = $this->client->createUpdate();
+ $doc = $update->createDocument();
+ $doc->id = $doctypeNote['id'];
+ $doc->article_id = $doctypeNote['article_id'];
+ $doc->doctype = $doctypeNote['doctype'];
+ $doc->note = $doctypeNote['note'];
+
+ $update->addDocument($doc);
+ $update->addCommit();
+
+ $this->client->execute($update);
+ }
}
}
}
}
- }
+ if (isset($entities) && is_iterable($entities)) {
+ foreach ($entities as $entity) {
+ if (!empty($entity['gnd'])) {
+ $localFilePath = './../data/gnd-files/' . $entity['gnd'] . '.json';
+ if (!file_exists($localFilePath)) {
- if (isset($entities) && is_iterable($entities)) {
- foreach ($entities as $entity) {
- if (!empty($entity['gnd'])) {
- $localFilePath = './../data/gnd-files/' . $entity['gnd'] . '.json';
- if (!file_exists($localFilePath)) {
+ $remoteFilePath = 'https://lobid.org/gnd/' . $entity['gnd'] . '.json';
- $remoteFilePath = 'https://lobid.org/gnd/' . $entity['gnd'] . '.json';
+ try {
+ $fileContent = @file_get_contents($remoteFilePath, true);
- try {
- $fileContent = @file_get_contents($remoteFilePath, true);
-
- if($fileContent==false) {
- throw new Exception($localFilePath);
+ if ($fileContent == false) {
+ throw new Exception($localFilePath);
+ }
+ } catch (Exception $e) {
+ echo $e->getMessage();
}
- } catch (Exception $e)
- {
- echo $e->getMessage();
- }
- $filesystem = new Filesystem();
- $filesystem->dumpFile($localFilePath, $fileContent);
+ $filesystem = new Filesystem();
+ $filesystem->dumpFile($localFilePath, $fileContent);
- $gndArr = json_decode($fileContent);
- } else {
- $gndArr = json_decode(file_get_contents($localFilePath));
- }
+ $gndArr = json_decode($fileContent);
+ } else {
+ $gndArr = json_decode(file_get_contents($localFilePath));
+ }
- if (isset($gndArr->preferredName) && !empty($gndArr->preferredName)) {
- $preferredName = $gndArr->preferredName;
- }
+ if (isset($gndArr->preferredName) && !empty($gndArr->preferredName)) {
+ $preferredName = $gndArr->preferredName;
+ }
- if (isset($gndArr->variantName) && !empty($gndArr->variantName)) {
- $variantNames = $gndArr->variantName;
- }
+ if (isset($gndArr->variantName) && !empty($gndArr->variantName)) {
+ $variantNames = $gndArr->variantName;
+ }
- $update = $this->client->createUpdate();
- $doc = $update->createDocument();
- $doc->id = $entity['gnd'];
- $doc->entity_name = $entity['name'];
- $doc->doctype = $entity['doctype'];
- $doc->entitytype = $entity['entity_type'];
+ $update = $this->client->createUpdate();
+ $doc = $update->createDocument();
+ $doc->id = $entity['gnd'];
+ $doc->entity_name = $entity['name'];
+ $doc->doctype = $entity['doctype'];
+ $doc->entitytype = $entity['entity_type'];
- if (isset($preferredName) && !empty($preferredName)) {
- $doc->mostly_use_name = $preferredName;
- }
+ if (isset($preferredName) && !empty($preferredName)) {
+ $doc->mostly_use_name = $preferredName;
+ }
- if (is_iterable($variantNames) && !empty($variantNames)) {
- $doc->alternatively_name = $variantNames;
- }
+ if (is_iterable($variantNames) && !empty($variantNames)) {
+ $doc->alternatively_name = $variantNames;
+ }
- $update->addDocument($doc);
- $update->addCommit();
+ $update->addDocument($doc);
+ $update->addCommit();
- $this->client->execute($update);
+ $this->client->execute($update);
+ }
}
}
}
@@ -838,287 +840,288 @@ class SimplexmlController extends AbstractController
public function getTextVersions(string $filePath, array $graphics): SolrDocument
{
+ libxml_use_internal_errors(TRUE);
+ $solrDocument = new SolrDocument();
$doc = new DOMDocument();
$doc->load($filePath);
- $xpath = new DOMXPath($doc);
- $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
- $pagesNodes = $xpath->query('//tei:body');
- $elements = [];
- foreach ($pagesNodes as $pagesNode) {
- $elements = $this->getPagesNodes($pagesNode, $elements);
- }
+ if (!libxml_get_errors()) {
+ $xpath = new DOMXPath($doc);
+ $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
+ $pagesNodes = $xpath->query('//tei:body');
- $editedTextArr = [];
- $transcriptedTextArr = [];
- $gndsUuids = [];
- $pagesGndsUuids = [];
-
- foreach ($elements as $k => $myelement) {
- $renditions = [];
- $myeditedText = '';
- $mytranscriptedText = '';
-
- foreach ($myelement as $element) {
- if (isset($element->nodeName) && (
- $element->nodeName === 'p' ||
- $element->nodeName === 'dateline' ||
- $element->nodeName === 'address' ||
- $element->nodeName === 'closer' ||
- $element->nodeName === 'list'
- )) {
-
- $ele = [];
- $el = $this->getNodeChilds($element, $ele);
- $tText = '';
- $eText = '';
- $n = 0;
- $liNumber = 1;
-
- foreach ($el as $e) {
- if ($e->nodeName === '#text'
- && ($e->parentNode->nodeName !== 'abbr' && ($e->nodeName !== 'del' || $e->parentNode->nodeName !== 'add'))
- && $e->parentNode->nodeName !== 'note'
-
- ) {
-
- if (isset($add) && !empty($add)) {
- $tText .= ' 〈'.$e->data.' '.$add;
-
- $eText .= ' '.$e->data;
-
- $add = '';
- } elseif (isset($del) && !empty($del)) {
- $tText .= ' ['.$e->data.' '.$del;
-
- $del = '';
- } elseif (isset($li) && !empty($li) && !empty($e->data) && 'item' === $e->parentNode->nodeName) {
-
- if (isset($italic) && true === $italic) {
- $tText .= $li . $e->data . '';
- $eText .= $li . $e->data . '';
- $italic = false;
- } else {
- $tText .= $li . $e->data;
- $eText .= $li . $e->data;
- }
+ $elements = [];
+ foreach ($pagesNodes as $pagesNode) {
+ $elements = $this->getPagesNodes($pagesNode, $elements);
+ }
- $li = '';
- } elseif ('hi' === $e->parentNode->nodeName && isset($e->parentNode->attributes[0])) {
+ $editedTextArr = [];
+ $transcriptedTextArr = [];
+ $gndsUuids = [];
+ $pagesGndsUuids = [];
+
+ foreach ($elements as $k => $myelement) {
+ $renditions = [];
+ $myeditedText = '';
+ $mytranscriptedText = '';
+
+ foreach ($myelement as $element) {
+ if (isset($element->nodeName) && (
+ $element->nodeName === 'p' ||
+ $element->nodeName === 'dateline' ||
+ $element->nodeName === 'address' ||
+ $element->nodeName === 'closer' ||
+ $element->nodeName === 'list'
+ )) {
+
+ $ele = [];
+ $el = $this->getNodeChilds($element, $ele);
+ $tText = '';
+ $eText = '';
+ $n = 0;
+ $liNumber = 1;
+
+ foreach ($el as $e) {
+ if ($e->nodeName === '#text'
+ && ($e->parentNode->nodeName !== 'abbr' && ($e->nodeName !== 'del' || $e->parentNode->nodeName !== 'add'))
+ && $e->parentNode->nodeName !== 'note'
+
+ ) {
+
+ if (isset($add) && !empty($add)) {
+ $tText .= ' 〈' . $e->data . ' ' . $add;
+
+ $eText .= ' ' . $e->data;
+
+ $add = '';
+ } elseif (isset($del) && !empty($del)) {
+ $tText .= ' [' . $e->data . ' ' . $del;
+
+ $del = '';
+ } elseif (isset($li) && !empty($li) && !empty($e->data) && 'item' === $e->parentNode->nodeName) {
+
+ if (isset($italic) && true === $italic) {
+ $tText .= $li . $e->data . '';
+ $eText .= $li . $e->data . '';
+ $italic = false;
+ } else {
+ $tText .= $li . $e->data;
+ $eText .= $li . $e->data;
+ }
- $hi = explode(':', $e->parentNode->attributes[0]->value);
+ $li = '';
+ } elseif ('hi' === $e->parentNode->nodeName && isset($e->parentNode->attributes[0])) {
- if (isset($hi[1]) && !empty($hi[1]) && 'superscript' === $hi[1]) {
- $tText .= ''.$e->data.'';
- }
+ $hi = explode(':', $e->parentNode->attributes[0]->value);
- elseif (isset($hi[1]) && !empty($hi[1]) && 'italic' === $hi[1]) {
+ if (isset($hi[1]) && !empty($hi[1]) && 'superscript' === $hi[1]) {
+ $tText .= '' . $e->data . '';
+ } elseif (isset($hi[1]) && !empty($hi[1]) && 'italic' === $hi[1]) {
- $tText .= '' . $e->data . '';
+ $tText .= '' . $e->data . '';
- $italic = true;
- }
- } elseif (isset($gnd) && true === $gnd) {
- $eText .= $e->data.'';
- $tText .= $e->data;
- $gnd = false;
- }
- else {
- $eText .= $e->data;
+ $italic = true;
+ }
+ } elseif (isset($gnd) && true === $gnd) {
+ $eText .= $e->data . '';
+ $tText .= $e->data;
+ $gnd = false;
+ } else {
+ $eText .= $e->data;
- if (isset($renditions) && !empty($renditions)) {
- $classOpeningTag = '';
- $classEndTag = '';
+ if (isset($renditions) && !empty($renditions)) {
+ $classOpeningTag = '';
+ $classEndTag = '';
- foreach ($renditions as $rendition) {
- if ('italic' === $rendition) {
- $class = 'i';
+ foreach ($renditions as $rendition) {
+ if ('italic' === $rendition) {
+ $class = 'i';
- } elseif ('underline' === $rendition) {
- $class = 'u';
- }
- if (isset($class) && !empty($class)) {
- $classOpeningTag .= '<' . $class . '>';
- $classEndTag .= ''.$class.'>';
+ } elseif ('underline' === $rendition) {
+ $class = 'u';
+ }
+ if (isset($class) && !empty($class)) {
+ $classOpeningTag .= '<' . $class . '>';
+ $classEndTag .= '' . $class . '>';
+ }
+
+ $renditions = [];
}
+ }
- $renditions = [];
+ if (isset($classOpeningTag) && !empty($classOpeningTag)) {
+ $tText .= $classOpeningTag;
+ $classOpeningTag = '';
}
- }
- if (isset($classOpeningTag) && !empty($classOpeningTag)) {
- $tText .= $classOpeningTag;
- $classOpeningTag = '';
- }
+ $tText .= $e->data;
- $tText .= $e->data;
+ if (isset($classEndTag) && !empty($classEndTag)) {
+ $tText .= $classEndTag;
+ $classEndTag = '';
+ }
- if (isset($classEndTag) && !empty($classEndTag)) {
- $tText .= $classEndTag;
- $classEndTag = '';
+ }
+ } elseif ('item' === $e->nodeName) {
+ if ($liNumber++ === 1) {
+ $li = '
- ';
+ } else {
+ $li = '
- ';
}
- }
- } elseif ('item' === $e->nodeName) {
- if ($liNumber++ === 1) {
- $li = '
- ';
- } else {
- $li = '
- ';
- }
-
- } elseif ('add' === $e->nodeName) {
- if ('rdg' === $e->parentNode->nodeName) {
+ } elseif ('add' === $e->nodeName) {
+ if ('rdg' === $e->parentNode->nodeName) {
- if (!empty($e->attributes)) {
+ if (!empty($e->attributes)) {
- $pattern = '/^#[a-z_]*$/i';
+ $pattern = '/^#[a-z_]*$/i';
- foreach ($e->attributes as $attribute) {
+ foreach ($e->attributes as $attribute) {
- if ('hand' === $attribute->nodeName) {
- $match = preg_match($pattern, $attribute->nodeValue, $matches);
+ if ('hand' === $attribute->nodeName) {
+ $match = preg_match($pattern, $attribute->nodeValue, $matches);
- if ($match) {
- $add = 'erg. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '〉 ';
+ if ($match) {
+ $add = 'erg. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '〉 ';
+ }
}
}
- }
- } else {
- $add = 'erg.〉 ';
+ } else {
+ $add = 'erg.〉 ';
+ }
}
- }
- } elseif ('del' === $e->nodeName) {
- if ('rdg' === $e->parentNode->nodeName) {
- if (!empty($e->attributes)) {
+ } elseif ('del' === $e->nodeName) {
+ if ('rdg' === $e->parentNode->nodeName) {
+ if (!empty($e->attributes)) {
- foreach ($e->attributes as $attribute) {
+ foreach ($e->attributes as $attribute) {
- if ('hand' === $attribute->nodeName) {
- $del = 'str. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . ']';
+ if ('hand' === $attribute->nodeName) {
+ $del = 'str. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . ']';
+ }
}
- }
- } else {
- $del = 'str.]';
+ } else {
+ $del = 'str.]';
+ }
}
- }
- } elseif ('handShift' === $e->nodeName && 'signed' === $e->parentNode->nodeName) {
+ } elseif ('handShift' === $e->nodeName && 'signed' === $e->parentNode->nodeName) {
- if (!empty($e->attributes)) {
+ if (!empty($e->attributes)) {
- $pattern = '/^#[a-z_]*$/i';
+ $pattern = '/^#[a-z_]*$/i';
- foreach ($e->attributes as $attribute) {
+ foreach ($e->attributes as $attribute) {
- if ('scribeRef' === $attribute->nodeName) {
- $match = preg_match($pattern, $attribute->nodeValue, $matches);
+ if ('scribeRef' === $attribute->nodeName) {
+ $match = preg_match($pattern, $attribute->nodeValue, $matches);
- if ($match) {
- $add = 'sign. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '〉 ';
+ if ($match) {
+ $add = 'sign. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '〉 ';
+ }
}
}
}
- }
- } elseif ('lb' === $e->nodeName) {
- $tText .= '
';
- $eText .= ' ';
- } elseif ('addrLine' === $e->nodeName) {
-
- if ($n++ > 0) {
- $eText .= '
';
+ } elseif ('lb' === $e->nodeName) {
$tText .= '
';
- }
+ $eText .= ' ';
+ } elseif ('addrLine' === $e->nodeName) {
- } elseif ('signed' === $e->nodeName) {
- $tText .= '
';
- $eText .= '
';
- } elseif ('hi' === $e->nodeName) {
- foreach ($e->attributes as $attribute) {
+ if ($n++ > 0) {
+ $eText .= '
';
+ $tText .= '
';
+ }
+
+ } elseif ('signed' === $e->nodeName) {
+ $tText .= '
';
+ $eText .= '
';
+ } elseif ('hi' === $e->nodeName) {
+ foreach ($e->attributes as $attribute) {
- if (isset(explode(':', $attribute->value)[1])) {
+ if (isset(explode(':', $attribute->value)[1])) {
- $rend = explode(':', $attribute->value)[1];
- if ('superscript' !== $rend && 'italic' !== $rend) {
- $renditions[] = explode(':', $attribute->value)[1];
+ $rend = explode(':', $attribute->value)[1];
+ if ('superscript' !== $rend && 'italic' !== $rend) {
+ $renditions[] = explode(':', $attribute->value)[1];
+ }
}
}
+ } elseif ('name' === $e->nodeName && (isset($e->attributes[1]->value) && !empty($e->attributes[1]->value)) && (isset($e->attributes[0]->value) && !empty($e->attributes[0]->value))) {
+ $uuid = $this->getUuid();
+ $gndsUuids[$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
+ $pagesGndsUuids[$k][$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
+ $eText .= '';
+ $gnd = true;
}
- } elseif ('name' === $e->nodeName && (isset($e->attributes[1]->value) && !empty($e->attributes[1]->value)) && (isset($e->attributes[0]->value) && !empty($e->attributes[0]->value))) {
- $uuid = $this->getUuid();
- $gndsUuids[$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
- $pagesGndsUuids[$k][$uuid] = str_replace('gnd:', '', $e->attributes[1]->value);
- $eText .= '';
- $gnd = true;
}
- }
- $eText = preg_replace('~\x{00AD}~u', '-', $eText);
- $pattern = '/(\w+)-\s(\w)/i';
+ $eText = preg_replace('~\x{00AD}~u', '-', $eText);
+ $pattern = '/(\w+)-\s(\w)/i';
- $eText = preg_replace_callback(
- $pattern,
- function ($match) {
- return $match[1] . $match[2];
- },
- $eText
- );
+ $eText = preg_replace_callback(
+ $pattern,
+ function ($match) {
+ return $match[1] . $match[2];
+ },
+ $eText
+ );
- $myeditedText .= ''.$eText.'
';
- $mytranscriptedText .= ''.$tText.'
';
+ $myeditedText .= '' . $eText . '
';
+ $mytranscriptedText .= '' . $tText . '
';
- } elseif ('pb' === $element->nodeName) {
- foreach ($element->attributes as $key => $attribute) {
- if ('facs' === $attribute->name) {
- if (isset($graphics[trim($attribute->value, '#')])) {
- $graphic = $graphics[trim($attribute->value, '#')];
+ } elseif ('pb' === $element->nodeName) {
+ foreach ($element->attributes as $key => $attribute) {
+ if ('facs' === $attribute->name) {
+ if (isset($graphics[trim($attribute->value, '#')])) {
+ $graphic = $graphics[trim($attribute->value, '#')];
- if (str_ends_with($graphic, '.jpg')) {
- $graphic = substr($graphic, 0, strlen($graphic) - 4);
+ if (str_ends_with($graphic, '.jpg')) {
+ $graphic = substr($graphic, 0, strlen($graphic) - 4);
+ }
}
+ } elseif ('n' === $attribute->name) {
+ $pageNumber = $attribute->value;
}
- } elseif ('n' === $attribute->name) {
- $pageNumber = $attribute->value;
- }
- if (!empty($pageNumber) && !empty($graphic)) {
- $href = '' . $pageNumber . '';
- } elseif (!empty($pageNumber) && empty($graphic)) {
- $href = $pageNumber;
- } else {
- $href = '';
+ if (!empty($pageNumber) && !empty($graphic)) {
+ $href = '' . $pageNumber . '';
+ } elseif (!empty($pageNumber) && empty($graphic)) {
+ $href = $pageNumber;
+ } else {
+ $href = '';
+ }
}
- }
- $tText = $href;
- $eText = $href;
- $myeditedText .= ''.$eText.'
';
- $mytranscriptedText .= ''.$tText.'
';
+ $tText = $href;
+ $eText = $href;
+ $myeditedText .= '' . $eText . '
';
+ $mytranscriptedText .= '' . $tText . '
';
+ }
}
+
+ $editedTextArr[] = $myeditedText;
+ $transcriptedTextArr[] = $mytranscriptedText;
}
- $editedTextArr[] = $myeditedText;
- $transcriptedTextArr[] = $mytranscriptedText;
- }
+ $transcriptedText = '';
+ foreach ($transcriptedTextArr as $mytranscriptedText) {
+ $transcriptedText .= $mytranscriptedText;
+ }
- $transcriptedText = '';
- foreach ($transcriptedTextArr as $mytranscriptedText) {
- $transcriptedText .= $mytranscriptedText;
- }
+ $editedText = '';
+ foreach ($editedTextArr as $myeditedText) {
+ $editedText .= $myeditedText;
+ }
- $editedText = '';
- foreach ($editedTextArr as $myeditedText) {
- $editedText .= $myeditedText;
+ $solrDocument->setTranscriptedText($transcriptedText);
+ $solrDocument->setTranscriptedTextArr($transcriptedTextArr);
+ $solrDocument->setEditedText($editedText);
+ $solrDocument->setEditedTextArr($editedTextArr);
+ $solrDocument->setGndsUuids($gndsUuids);
+ $solrDocument->setPagesGndsUuids($pagesGndsUuids);
}
- $solrDocument = new SolrDocument();
- $solrDocument->setTranscriptedText($transcriptedText);
- $solrDocument->setTranscriptedTextArr($transcriptedTextArr);
- $solrDocument->setEditedText($editedText);
- $solrDocument->setEditedTextArr($editedTextArr);
- $solrDocument->setGndsUuids($gndsUuids);
- $solrDocument->setPagesGndsUuids($pagesGndsUuids);
-
return $solrDocument;
}