diff --git a/src/Controller/Tei2SolrController.php b/src/Controller/Tei2SolrController.php index c3cb1c5c74943b181d869e9892531a74cf7c1788..c8ee39cbdc3f328c39ee10fcafaa7eb636b15cb3 100755 --- a/src/Controller/Tei2SolrController.php +++ b/src/Controller/Tei2SolrController.php @@ -2,8 +2,12 @@ namespace App\Controller; +use App\Service\PreProcessingService; use App\Model\SolrDocument; +use App\Service\EditedTextService; +use App\Service\TranscriptionService; use DOMDocument; +use DOMElement; use DOMXPath; use League\Flysystem\Exception; use Symfony\Bundle\FrameworkBundle\Controller\AbstractController; @@ -18,10 +22,21 @@ class Tei2SolrController extends AbstractController private Client $client; private ?string $teiDir = null; - - public function __construct(Client $client) + private TranscriptionService $transcriptionService; + private EditedTextService $editedTextService; + private PreProcessingService $preProcessingService; + + public function __construct( + Client $client, + PreProcessingService $preProcessingService, + TranscriptionService $transcriptionService, + EditedTextService $editedTextService + ) { $this->client = $client; + $this->transcriptionService = $transcriptionService; + $this->editedTextService = $editedTextService; + $this->preProcessingService = $preProcessingService; } public function setConfigs(string $teiDir) { @@ -628,9 +643,9 @@ class Tei2SolrController extends AbstractController $graphics = $this->getGraphics($imageIds, $imageUrls); $solrDocument = $this->getTextVersions($file, $graphics); $transcription = $solrDocument->getTranscriptedText(); - $pagesTranscription = $solrDocument->getTranscriptedTextArr(); + $pagesTranscription = $solrDocument->getPageLevelTranscriptedText(); $editedText = $solrDocument->getEditedText(); - $pagesEdited = $solrDocument->getEditedTextArr(); + $pagesEdited = $solrDocument->getPageLevelEditedText(); $pagesGndsUuids = $solrDocument->getPagesGndsUuids(); $pagesNotesUuids = $solrDocument->getPagesNotesUuids(); $pagesSegs = $solrDocument->getPagesSegs(); @@ -763,9 +778,10 @@ class Tei2SolrController extends AbstractController $childDoc->edited_text = $pagesEdited[$i - 1]; } - if (isset($pagesGndsUuids[$i - 1]) && !empty(($pagesGndsUuids[$i - 1]))) { - $childDoc->entities = array_values($pagesGndsUuids[$i - 1]); - $childDoc->annotation_ids = array_keys($pagesGndsUuids[$i - 1]); + if (isset($pagesGndsUuids[$i]) && !empty(($pagesGndsUuids[$i]))) { +// if (isset($pagesGndsUuids[$i - 1]) && !empty(($pagesGndsUuids[$i - 1]))) { + $childDoc->entities = array_values($pagesGndsUuids[$i]); + $childDoc->annotation_ids = array_keys($pagesGndsUuids[$i]); } if (isset($pagesNotesUuids[$i - 1]) && !empty(($pagesNotesUuids[$i - 1]))) { @@ -920,355 +936,61 @@ class Tei2SolrController extends AbstractController public function getTextVersions(string $filePath = './../data/gitlab/Z_1822-02-20_k.xml', array $graphics = []): SolrDocument { $doc = new DOMDocument(); - $doc->load($filePath); + $doc->load($filePath, LIBXML_NOBLANKS); $xpath = new DOMXPath($doc); $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0'); $pagesNodes = $xpath->query('//tei:body'); + /** @var DOMElement $body */ + $body = $pagesNodes[0]; + $pages = $this->preProcessingService->splitByPages($body); - $allElements = []; - foreach ($pagesNodes as $pagesNode) { - $allElements = $this->getPagesNodes($pagesNode, $allElements); - } + $pageLevelEditedText = []; + $pageLevelTranscriptedText = []; - $editedTextArr = []; - $transcriptedTextArr = []; - $gndsUuids = []; $pagesGndsUuids = []; $pagesSegs = []; $pagesNotesUuids = []; $notesUuids = []; $pagesSics = []; - foreach ($allElements as $k => $elements) { - $renditions = []; - $myeditedText = ''; - $mytranscriptedText = ''; - - foreach ($elements as $element) { - if (isset($element->nodeName) && ( - ($element->nodeName === 'p' && $element->firstChild->nodeName !== 'address') || - ($element->nodeName === 'dateline' && 'closer' !== $element->parentNode->nodeName) || - $element->nodeName === 'address' || - $element->nodeName === 'closer' || - $element->nodeName === 'list' || - ($element->nodeName === 'signed' && 'closer' !== $element->parentNode->nodeName) - )) { - - $n = 0; - $liNumber = 1; - $tText = ''; - $eText = ''; - $childElementsArr = []; - $childElements = $this->getNodeChilds($element, $childElementsArr); - - foreach ($childElements as $childElement) { - if ($childElement->nodeName === '#text' - && ($childElement->parentNode->nodeName !== 'abbr' && ($childElement->nodeName !== 'del' || $childElement->parentNode->nodeName !== 'add')) - && $childElement->parentNode->nodeName !== 'note' && $childElement->parentNode->nodeName !== 'seg' - && $childElement->parentNode->nodeName !== 'ref' - ) { - if (isset($add) && !empty($add)) { - $tText .= ' ⟨' . $childElement->data . ' ' . $add; - $eText .= ' ' . $childElement->data; - $add = ''; - } elseif (isset($del) && !empty($del)) { - $tText .= ' [' . $childElement->data . ' ' . $del; - $del = ''; - } elseif (isset($li) && !empty($li) && !empty($childElement->data) && 'item' === $childElement->parentNode->nodeName) { - if (isset($italic) && true === $italic) { - $tText .= $li . $childElement->data . ''; - $eText .= $li . $childElement->data . ''; - $italic = false; - } else { - $tText .= $li . $childElement->data; - $eText .= $li . $childElement->data; - } - - $li = ''; - } elseif ('supplied' === $childElement->parentNode->nodeName) { - $tText .= $childElement->data; - $eText .= '' . $childElement->data . ''; - } elseif ('hi' === $childElement->parentNode->nodeName && isset($childElement->parentNode->attributes[0])) { - - $hi = explode(':', $childElement->parentNode->attributes[0]->value); - - if (isset($hi[1]) && !empty($hi[1]) && 'superscript' === $hi[1]) { - $tText .= '' . $childElement->data . ''; - $eText .= $childElement->data; - } elseif (isset($hi[1]) && !empty($hi[1]) && 'italic' === $hi[1]) { - $tText .= '' . $childElement->data . ''; - $eText .= $childElement->data; - $italic = true; - } - } elseif (isset($gnd) && true === $gnd) { - $eText .= ''; - $tText .= ''; - $gnd = false; - } elseif (isset($note) && true === $note) { - $eText .= ' ' . $childElement->data; - $tText .= ' ' . $childElement->data; - $note = false; - } elseif (isset($sic) && true === $sic) { - $eText .= ' '; - $tText .= ' '; - $sic = false; - } elseif (isset($abbr) && !empty($abbr)) { - $tText .= $abbr; - $abbr = ''; - } - elseif (isset($expan) && !empty($expan)) { - $eText .= $expan; - $expan = ''; - } else { - $eText .= $childElement->data; - if (isset($renditions) && !empty($renditions)) { - $classOpeningTag = ''; - $classEndTag = ''; - - foreach ($renditions as $rendition) { - if ('italic' === $rendition) { - $class = 'i'; - } elseif ('underline' === $rendition) { - $class = 'u'; - } - if (isset($class) && !empty($class)) { - $classOpeningTag .= '<' . $class . '>'; - $classEndTag .= ''; - } - - $renditions = []; - } - } - - if (isset($classOpeningTag) && !empty($classOpeningTag)) { - $tText .= $classOpeningTag; - $classOpeningTag = ''; - } - - $tText .= $childElement->data; - - if (isset($classEndTag) && !empty($classEndTag)) { - $tText .= $classEndTag; - $classEndTag = ''; - } - } - } elseif ($childElement->nodeName === "seg") { - $noteBibl = false; - foreach ($childElement->childNodes as $childElementChildNode) { - if ($childElementChildNode->nodeName === 'bibl') { - $noteBibl = true; - break; - } - } - - if (true === $noteBibl) { - $segBiblTarget = $this->getSegBiblTarget($childElement); - if (!empty($segBiblTarget)) { - $segBiblTargetArr = explode('#', $segBiblTarget); - $segTextBibl = str_replace('_', ' ', $segBiblTargetArr[1]); - $segBiblTargetlink = $segBiblTargetArr[0]; - $segTextBibl = $segTextBibl.' ('.$segBiblTargetArr[0].')'; - $segText = $this->getElementText($childElement); - } - } else { - $segText = $this->getElementText($childElement); - } - - $noteUuid = $this->getUuid(); - $eText .= ''.$segText; - $tText .= $segText; - $note = true; + foreach ($pages as $key => $page) { + if ($key > 0) { + $transcriptedDoc = $this->transcriptionService->transformPage($page); + $pageLevelTranscriptedText[] = $transcriptedDoc->saveHTML(); - if (true === $noteBibl && !empty($segTextBibl)) { - $pagesSegs[$k][$noteUuid] = trim($segTextBibl); - } elseif ($noteBibl === false && !empty($segText)) { - $pagesSegs[$k][$noteUuid] = trim($segText); - } - } elseif ($childElement->nodeName === "note") { - $noteText = $this->getElementText($childElement); - if ((isset($noteBibl) && false === $noteBibl) && (isset($noteUuid) && !empty($noteUuid)) && !empty($noteText)) { - $notesUuids[$noteUuid] = trim($noteText); - $pagesNotesUuids[$k][$noteUuid] = trim($noteText); - } elseif ((isset($noteBibl) && true === $noteBibl) && (isset($noteUuid) && !empty($noteUuid)) && !empty($segBiblTargetlink)) { - $notesUuids[$noteUuid] = trim($segBiblTargetlink); - $pagesNotesUuids[$k][$noteUuid] = trim(' '); - } - unset($noteUuid); - } elseif ('item' === $childElement->nodeName) { - if ($liNumber++ === 1) { - $li = '