From 006a3fe2a509067680c6eab649ad12bbea4441cf Mon Sep 17 00:00:00 2001 From: Paul Pestov Date: Sun, 10 Oct 2021 23:17:08 +0200 Subject: [PATCH 1/6] Add new TEI transform services --- src/Controller/Tei2SolrController.php | 38 ++++++++++++++- src/Service/EditedTextService.php | 67 +++++++++++++++++++++++++++ src/Service/HtmlService.php | 41 ++++++++++++++++ src/Service/TranscriptionService.php | 64 +++++++++++++++++++++++++ 4 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 src/Service/EditedTextService.php create mode 100644 src/Service/HtmlService.php create mode 100644 src/Service/TranscriptionService.php diff --git a/src/Controller/Tei2SolrController.php b/src/Controller/Tei2SolrController.php index 55d7fe3..633e56b 100755 --- a/src/Controller/Tei2SolrController.php +++ b/src/Controller/Tei2SolrController.php @@ -3,7 +3,10 @@ namespace App\Controller; use App\Model\SolrDocument; +use App\Service\EditedTextService; +use App\Service\TranscriptionService; use DOMDocument; +use DOMElement; use DOMXPath; use League\Flysystem\Exception; use Symfony\Bundle\FrameworkBundle\Controller\AbstractController; @@ -18,10 +21,18 @@ class Tei2SolrController extends AbstractController private Client $client; private ?string $teiDir = null; - - public function __construct(Client $client) + private TranscriptionService $transcriptionService; + private EditedTextService $editedTextService; + + public function __construct( + Client $client, + TranscriptionService $transcriptionService, + EditedTextService $editedTextService + ) { $this->client = $client; + $this->transcriptionService = $transcriptionService; + $this->editedTextService = $editedTextService; } public function setConfigs(string $teiDir) { @@ -926,6 +937,28 @@ class Tei2SolrController extends AbstractController $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0'); $pagesNodes = $xpath->query('//tei:body'); + // ===================================================================== + // WORK IN PROGRESS + // ===================================================================== + + $transcriptedText = ''; + $editedText = ''; + + /** @var DOMElement[] $node */ + foreach ($pagesNodes as $node) { + $transcriptedDoc = $this->transcriptionService->transformDoc($node); + $transcriptedText .= $transcriptedDoc->saveHTML(); + + $editedDoc = $this->editedTextService->transformDoc($node); + $editedText .= $editedDoc->saveHTML(); + } + + var_dump($transcriptedText); + + // ===================================================================== + // WORK IN PROGRESS + // ===================================================================== + $allElements = []; foreach ($pagesNodes as $pagesNode) { $allElements = $this->getPagesNodes($pagesNode, $allElements); @@ -945,6 +978,7 @@ class Tei2SolrController extends AbstractController $myeditedText = ''; $mytranscriptedText = ''; + /** @var DOMElement $element */ foreach ($elements as $element) { if (isset($element->nodeName) && ( ($element->nodeName === 'p' && $element->firstChild->nodeName !== 'address') || diff --git a/src/Service/EditedTextService.php b/src/Service/EditedTextService.php new file mode 100644 index 0000000..77facf5 --- /dev/null +++ b/src/Service/EditedTextService.php @@ -0,0 +1,67 @@ +setDoc(new DOMDocument()); + + /** @var DOMElement $element */ + foreach ($page->childNodes as $element) { + $this->appendChild($this->transformElement($element)); + } + + return $this->getDoc(); + } + + private function transformElement(DOMNode $el): DOMNode + { + $htmlEl = $this->span(); + + if ($el->hasChildNodes()) { + foreach ($el->childNodes as $child) { + $htmlEl->appendChild($this->transformElement($child)); + } + } + + return $htmlEl; + } + + private function handleName(DOMNode $el): DOMNode + { + $uuid = $this->createUuid(); + + if (str_contains($el->attributes[1]->value, 'gnd.')) { + $this->gndsUuids[$uuid] = str_replace('gnd.', '', $el->attributes[1]->value); + // $pagesGndsUuids[$k][$uuid] = str_replace('gnd.', '', $el->attributes[1]->value); + } elseif (str_contains($el->attributes[1]->value, 'gnd:')) { + $this->gndsUuids[$uuid] = str_replace('gnd:', '', $el->attributes[1]->value); + // $pagesGndsUuids[$k][$uuid] = str_replace('gnd:', '', $el->attributes[1]->value); + } + + $htmlEl = $this->span(); + + $htmlEl->setAttribute('id', $uuid); + $htmlEl->setAttribute('class', $el->attributes[0]->value); + + return $htmlEl; + } + + private function handleNote() { + + } + + public function createUuid() + { + return uuid_create(UUID_TYPE_RANDOM); + } +} diff --git a/src/Service/HtmlService.php b/src/Service/HtmlService.php new file mode 100644 index 0000000..58b34a4 --- /dev/null +++ b/src/Service/HtmlService.php @@ -0,0 +1,41 @@ +doc; + } + + public function setDoc(DOMDocument $doc): void + { + $this->doc = $doc; + } + + public function appendChild(DOMNode $el): DOMNode + { + return $this->doc->appendChild($el); + } + + public function p(): DOMElement + { + return $this->doc->createElement('p'); + } + + public function div():DOMElement + { + return $this->doc->createElement('div'); + } + + public function span(): DOMElement { + return $this->doc->createElement('span'); + } +} diff --git a/src/Service/TranscriptionService.php b/src/Service/TranscriptionService.php new file mode 100644 index 0000000..df302bd --- /dev/null +++ b/src/Service/TranscriptionService.php @@ -0,0 +1,64 @@ +setDoc(new DOMDocument()); + + /** @var DOMElement $element */ + foreach ($page->childNodes as $element) { + $this->appendChild($this->transformElement($element)); + } + + return $this->getDoc(); + } + + private function transformElement(DOMNode $el): DOMNode { + $methodName = 'handle' . trim(ucfirst($el->nodeName), '#'); + if (method_exists($this, $methodName)) { + $htmlEl = $this->{$methodName}($el); + } else { + $htmlEl = $this->p(); + } + + if ($el->hasChildNodes()) { + foreach ($el->childNodes as $child) { + $htmlEl->appendChild($this->transformElement($child)); + } + } + + return $htmlEl; + } + + private function handlePb(DOMElement $el): DOMNode { + return $this->div(); + } + + private function handleP(DOMElement $el): DOMNode { + return $this->p(); + } + + private function handleDiv(DOMElement $el): DOMNode { + return $this->div(); + } + + private function handleSpan(DOMElement $el): DOMNode { + return $this->span(); + } + + private function handleText(DOMNode $el): DOMNode { + $text = new DOMText(); + if ($el->nodeName === '#text') { + $text->data = $el->textContent; + } + return $text; + } +} -- GitLab From 583e93018709b2873e1aae6779080d3dab41f92b Mon Sep 17 00:00:00 2001 From: Paul Pestov Date: Mon, 11 Oct 2021 16:56:25 +0200 Subject: [PATCH 2/6] Add PreProcessingService to split TEI by pages --- src/Controller/Tei2SolrController.php | 26 ++++-- src/Service/EditedTextService.php | 2 +- src/Service/PreProcessingService.php | 123 ++++++++++++++++++++++++++ src/Service/TranscriptionService.php | 13 ++- 4 files changed, 154 insertions(+), 10 deletions(-) create mode 100644 src/Service/PreProcessingService.php diff --git a/src/Controller/Tei2SolrController.php b/src/Controller/Tei2SolrController.php index 633e56b..e329981 100755 --- a/src/Controller/Tei2SolrController.php +++ b/src/Controller/Tei2SolrController.php @@ -2,6 +2,7 @@ namespace App\Controller; +use App\Service\PreProcessingService; use App\Model\SolrDocument; use App\Service\EditedTextService; use App\Service\TranscriptionService; @@ -23,9 +24,11 @@ class Tei2SolrController extends AbstractController private ?string $teiDir = null; private TranscriptionService $transcriptionService; private EditedTextService $editedTextService; + private PreProcessingService $preProcessingService; public function __construct( Client $client, + PreProcessingService $preProcessingService, TranscriptionService $transcriptionService, EditedTextService $editedTextService ) @@ -33,6 +36,7 @@ class Tei2SolrController extends AbstractController $this->client = $client; $this->transcriptionService = $transcriptionService; $this->editedTextService = $editedTextService; + $this->preProcessingService = $preProcessingService; } public function setConfigs(string $teiDir) { @@ -931,7 +935,7 @@ class Tei2SolrController extends AbstractController public function getTextVersions(string $filePath = './../data/gitlab/Z_1822-02-20_k.xml', array $graphics = []): SolrDocument { $doc = new DOMDocument(); - $doc->load($filePath); + $doc->load($filePath, LIBXML_NOBLANKS); $xpath = new DOMXPath($doc); $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0'); @@ -944,16 +948,22 @@ class Tei2SolrController extends AbstractController $transcriptedText = ''; $editedText = ''; - /** @var DOMElement[] $node */ - foreach ($pagesNodes as $node) { - $transcriptedDoc = $this->transcriptionService->transformDoc($node); - $transcriptedText .= $transcriptedDoc->saveHTML(); + /** @var DOMElement[] $body */ + foreach ($pagesNodes as $body) { + $pages = $this->preProcessingService->splitByPages($body); + + foreach ($pages as $page) { + $transcriptedDoc = $this->transcriptionService->transformPage($page); + $transcriptedText .= $transcriptedDoc->saveHTML(); + + $editedDoc = $this->editedTextService->transformPage($page); + $editedText .= $editedDoc->saveHTML(); + var_dump($transcriptedText); + } - $editedDoc = $this->editedTextService->transformDoc($node); - $editedText .= $editedDoc->saveHTML(); } - var_dump($transcriptedText); + die(); // ===================================================================== // WORK IN PROGRESS diff --git a/src/Service/EditedTextService.php b/src/Service/EditedTextService.php index 77facf5..bd826f1 100644 --- a/src/Service/EditedTextService.php +++ b/src/Service/EditedTextService.php @@ -11,7 +11,7 @@ class EditedTextService extends HtmlService private $gndsUuids = []; private $notesUuids = []; - public function transformDoc(DOMElement $page): DOMDocument + public function transformPage(DOMDocument $page): DOMDocument { $this->setDoc(new DOMDocument()); diff --git a/src/Service/PreProcessingService.php b/src/Service/PreProcessingService.php new file mode 100644 index 0000000..2a9a36a --- /dev/null +++ b/src/Service/PreProcessingService.php @@ -0,0 +1,123 @@ +pages; + } + + public function splitByPages(DOMELement $body) + { + $this->pages[] = new DOMDocument(); + + // To start out we set our empty page as last parent to append other elements to + $this->lastParent = $this->getLastPage(); + + // Start recursion + $this->checkPb($body); + + return $this->pages; + } + + private function checkPb(DOMNode $el) { + if ($el->nodeName === '#comment') { + return; + } + + $lastPage = $this->getLastPage(); + + if ($el->nodeName === '#text') { + $clone = $lastPage->createTextNode($el->textContent); + } else { + $clone = $lastPage->createElement($el->nodeName); + $clone = $this->cloneAttributes($el->attributes, $clone); + } + + // Always append the current element clone to last parent + $this->lastParent->appendChild($clone); + + if ($el->hasChildNodes()) { + + // If there are children then we want to append their clones to the current element + // so we have to move lastParent + $this->lastParent = $clone; + + /** @var DOMElement $child */ + foreach ($el->childNodes as $child) { + if ($child->nodeName === 'pb') { + $this->pages[] = $this->createNewPage($child); + } else { + $this->checkPb($child); + } + } + + // After we finished iterating (recursively) over all children + // we are done here and want move on with our next sibling + // so we have to set lastParent to it's parent + if ($this->lastParent->parentNode) { + $this->lastParent = $this->lastParent->parentNode; + } + } + } + + private function createNewPage(DOMElement $pbEl): DOMDocument + { + // Creates a new DOMDocument and replicates every parent node of pb element + // up to the + $newPage = new DOMDocument(); + $parent = $pbEl->parentNode; + + $parents = []; + while ($parent->parentNode) { + // Collect all parents until + + $parents[] = $parent; + + if ($parent->nodeName === 'body') break; + $parent = $parent->parentNode; + } + + // Reverse them to append them from document root + $parentsReversed = array_reverse($parents); + + $lastNode = $newPage; + foreach ($parentsReversed as $parent) { + $node = $newPage->createElement($parent->nodeName); + $node = $this->cloneAttributes($parent->attributes, $node); + $node = $lastNode->appendChild($node); + + // Result will be the deepest parent node so we can continue cloning the TEI + // in further checkPb calls + $this->lastParent = $node; + $lastNode = $node; + } + + return $newPage; + } + + private function getLastPage(): ?DOMDocument + { + return (!empty($this->pages)) ? $this->pages[count($this->pages) - 1] : null; + } + + private function cloneAttributes($attributes, $clone) + { + foreach ($attributes as $attr) { + $clone->setAttribute($attr->name, $attr->value); + } + + return $clone; + } +} diff --git a/src/Service/TranscriptionService.php b/src/Service/TranscriptionService.php index df302bd..f49059f 100644 --- a/src/Service/TranscriptionService.php +++ b/src/Service/TranscriptionService.php @@ -9,7 +9,7 @@ use DOMText; class TranscriptionService extends HtmlService { - public function transformDoc(DOMElement $page): DOMDocument + public function transformPage(DOMDocument $page): DOMDocument { $this->setDoc(new DOMDocument()); @@ -21,6 +21,17 @@ class TranscriptionService extends HtmlService return $this->getDoc(); } + public function splitByPb(DOMElement $page): array + { + if (true) { + + } + /** @var DOMElement $element */ + foreach ($page->childNodes as $element) { + $this->appendChild($this->transformElement($element)); + } + } + private function transformElement(DOMNode $el): DOMNode { $methodName = 'handle' . trim(ucfirst($el->nodeName), '#'); if (method_exists($this, $methodName)) { -- GitLab From b2946b7ec7c37847feeb3e2fdc11c184fa6f2947 Mon Sep 17 00:00:00 2001 From: Paul Pestov Date: Mon, 11 Oct 2021 23:38:37 +0200 Subject: [PATCH 3/6] Refactor to execute all TEI transformations --- src/Controller/Tei2SolrController.php | 564 +++++++++++++------------- src/Service/EditedTextService.php | 29 +- src/Service/HtmlService.php | 16 +- src/Service/PreProcessingService.php | 5 + src/Service/TranscriptionService.php | 19 - 5 files changed, 328 insertions(+), 305 deletions(-) diff --git a/src/Controller/Tei2SolrController.php b/src/Controller/Tei2SolrController.php index e329981..e73019c 100755 --- a/src/Controller/Tei2SolrController.php +++ b/src/Controller/Tei2SolrController.php @@ -945,31 +945,37 @@ class Tei2SolrController extends AbstractController // WORK IN PROGRESS // ===================================================================== - $transcriptedText = ''; - $editedText = ''; - - /** @var DOMElement[] $body */ - foreach ($pagesNodes as $body) { - $pages = $this->preProcessingService->splitByPages($body); + $editedTextArr = []; + $transcriptedTextArr = []; + $gndsUuids = []; + $pagesGndsUuids = []; + $pagesSegs = []; + $pagesNotesUuids = []; + $notesUuids = []; + $pagesSics = []; - foreach ($pages as $page) { - $transcriptedDoc = $this->transcriptionService->transformPage($page); - $transcriptedText .= $transcriptedDoc->saveHTML(); + /** @var DOMElement $body */ + $body = $pagesNodes[0]; + $pages = $this->preProcessingService->splitByPages($body); - $editedDoc = $this->editedTextService->transformPage($page); - $editedText .= $editedDoc->saveHTML(); - var_dump($transcriptedText); - } + foreach ($pages as $page) { + $transcriptedDoc = $this->transcriptionService->transformPage($page); + $transcriptedTextArr[] = $transcriptedDoc->saveHTML(); + $editedDoc = $this->editedTextService->transformPage($page); + $editedTextArr[] = $editedDoc->saveHTML(); + $this->transcriptionService->clear(); + $this->editedTextService->clear(); } - die(); + $this->preProcessingService->clear(); // ===================================================================== // WORK IN PROGRESS // ===================================================================== - $allElements = []; + /* + $allElements = []; foreach ($pagesNodes as $pagesNode) { $allElements = $this->getPagesNodes($pagesNode, $allElements); } @@ -988,311 +994,311 @@ class Tei2SolrController extends AbstractController $myeditedText = ''; $mytranscriptedText = ''; - /** @var DOMElement $element */ - foreach ($elements as $element) { - if (isset($element->nodeName) && ( - ($element->nodeName === 'p' && $element->firstChild->nodeName !== 'address') || - ($element->nodeName === 'dateline' && 'closer' !== $element->parentNode->nodeName) || - $element->nodeName === 'address' || - $element->nodeName === 'closer' || - $element->nodeName === 'list' || - ($element->nodeName === 'signed' && 'closer' !== $element->parentNode->nodeName) - )) { - - $n = 0; - $liNumber = 1; - $tText = ''; - $eText = ''; - $childElementsArr = []; - $childElements = $this->getNodeChilds($element, $childElementsArr); - - foreach ($childElements as $childElement) { - if ($childElement->nodeName === '#text' - && ($childElement->parentNode->nodeName !== 'abbr' && ($childElement->nodeName !== 'del' || $childElement->parentNode->nodeName !== 'add')) - && $childElement->parentNode->nodeName !== 'note' && $childElement->parentNode->nodeName !== 'seg' - && $childElement->parentNode->nodeName !== 'ref' - ) { - if (isset($add) && !empty($add)) { - $tText .= ' ⟨' . $childElement->data . ' ' . $add; - $eText .= ' ' . $childElement->data; - $add = ''; - } elseif (isset($del) && !empty($del)) { - $tText .= ' [' . $childElement->data . ' ' . $del; - $del = ''; - } elseif (isset($li) && !empty($li) && !empty($childElement->data) && 'item' === $childElement->parentNode->nodeName) { - if (isset($italic) && true === $italic) { - $tText .= $li . $childElement->data . ''; - $eText .= $li . $childElement->data . ''; - $italic = false; - } else { - $tText .= $li . $childElement->data; - $eText .= $li . $childElement->data; - } - - $li = ''; - } elseif ('supplied' === $childElement->parentNode->nodeName) { - $tText .= $childElement->data; - $eText .= '' . $childElement->data . ''; - } elseif ('hi' === $childElement->parentNode->nodeName && isset($childElement->parentNode->attributes[0])) { - - $hi = explode(':', $childElement->parentNode->attributes[0]->value); - - if (isset($hi[1]) && !empty($hi[1]) && 'superscript' === $hi[1]) { - $tText .= '' . $childElement->data . ''; - $eText .= $childElement->data; - } elseif (isset($hi[1]) && !empty($hi[1]) && 'italic' === $hi[1]) { - $tText .= '' . $childElement->data . ''; - $eText .= $childElement->data; - $italic = true; - } - } elseif (isset($gnd) && true === $gnd) { - $eText .= ''; - $tText .= ''; - $gnd = false; - } elseif (isset($note) && true === $note) { - $eText .= ' ' . $childElement->data; - $tText .= ' ' . $childElement->data; - $note = false; - } elseif (isset($sic) && true === $sic) { - $eText .= ' '; - $tText .= ' '; - $sic = false; - } elseif (isset($abbr) && !empty($abbr)) { - $tText .= $abbr; - $abbr = ''; - } - elseif (isset($expan) && !empty($expan)) { - $eText .= $expan; - $expan = ''; + foreach ($elements as $element) { + if (isset($element->nodeName) && ( + ($element->nodeName === 'p' && $element->firstChild->nodeName !== 'address') || + ($element->nodeName === 'dateline' && 'closer' !== $element->parentNode->nodeName) || + $element->nodeName === 'address' || + $element->nodeName === 'closer' || + $element->nodeName === 'list' || + ($element->nodeName === 'signed' && 'closer' !== $element->parentNode->nodeName) + )) { + + $n = 0; + $liNumber = 1; + $tText = ''; + $eText = ''; + $childElementsArr = []; + $childElements = $this->getNodeChilds($element, $childElementsArr); + + foreach ($childElements as $childElement) { + if ($childElement->nodeName === '#text' + && ($childElement->parentNode->nodeName !== 'abbr' && ($childElement->nodeName !== 'del' || $childElement->parentNode->nodeName !== 'add')) + && $childElement->parentNode->nodeName !== 'note' && $childElement->parentNode->nodeName !== 'seg' + && $childElement->parentNode->nodeName !== 'ref' + ) { + if (isset($add) && !empty($add)) { + $tText .= ' ⟨' . $childElement->data . ' ' . $add; + $eText .= ' ' . $childElement->data; + $add = ''; + } elseif (isset($del) && !empty($del)) { + $tText .= ' [' . $childElement->data . ' ' . $del; + $del = ''; + } elseif (isset($li) && !empty($li) && !empty($childElement->data) && 'item' === $childElement->parentNode->nodeName) { + if (isset($italic) && true === $italic) { + $tText .= $li . $childElement->data . ''; + $eText .= $li . $childElement->data . ''; + $italic = false; } else { - $eText .= $childElement->data; - if (isset($renditions) && !empty($renditions)) { - $classOpeningTag = ''; - $classEndTag = ''; - - foreach ($renditions as $rendition) { - if ('italic' === $rendition) { - $class = 'i'; - } elseif ('underline' === $rendition) { - $class = 'u'; - } - if (isset($class) && !empty($class)) { - $classOpeningTag .= '<' . $class . '>'; - $classEndTag .= ''; - } - - $renditions = []; - } - } + $tText .= $li . $childElement->data; + $eText .= $li . $childElement->data; + } - if (isset($classOpeningTag) && !empty($classOpeningTag)) { - $tText .= $classOpeningTag; - $classOpeningTag = ''; - } + $li = ''; + } elseif ('supplied' === $childElement->parentNode->nodeName) { + $tText .= $childElement->data; + $eText .= '' . $childElement->data . ''; + } elseif ('hi' === $childElement->parentNode->nodeName && isset($childElement->parentNode->attributes[0])) { - $tText .= $childElement->data; + $hi = explode(':', $childElement->parentNode->attributes[0]->value); - if (isset($classEndTag) && !empty($classEndTag)) { - $tText .= $classEndTag; - $classEndTag = ''; - } + if (isset($hi[1]) && !empty($hi[1]) && 'superscript' === $hi[1]) { + $tText .= '' . $childElement->data . ''; + $eText .= $childElement->data; + } elseif (isset($hi[1]) && !empty($hi[1]) && 'italic' === $hi[1]) { + $tText .= '' . $childElement->data . ''; + $eText .= $childElement->data; + $italic = true; } - } elseif ($childElement->nodeName === "seg") { - $noteBibl = false; - foreach ($childElement->childNodes as $childElementChildNode) { - if ($childElementChildNode->nodeName === 'bibl') { - $noteBibl = true; - break; + } elseif (isset($gnd) && true === $gnd) { + $eText .= ''; + $tText .= ''; + $gnd = false; + } elseif (isset($note) && true === $note) { + $eText .= ' ' . $childElement->data; + $tText .= ' ' . $childElement->data; + $note = false; + } elseif (isset($sic) && true === $sic) { + $eText .= ' '; + $tText .= ' '; + $sic = false; + } elseif (isset($abbr) && !empty($abbr)) { + $tText .= $abbr; + $abbr = ''; + } + elseif (isset($expan) && !empty($expan)) { + $eText .= $expan; + $expan = ''; + } else { + $eText .= $childElement->data; + if (isset($renditions) && !empty($renditions)) { + $classOpeningTag = ''; + $classEndTag = ''; + + foreach ($renditions as $rendition) { + if ('italic' === $rendition) { + $class = 'i'; + } elseif ('underline' === $rendition) { + $class = 'u'; + } + if (isset($class) && !empty($class)) { + $classOpeningTag .= '<' . $class . '>'; + $classEndTag .= ''; + } + + $renditions = []; } } - if (true === $noteBibl) { - $segBiblTarget = $this->getSegBiblTarget($childElement); - if (!empty($segBiblTarget)) { - $segBiblTargetArr = explode('#', $segBiblTarget); - $segTextBibl = str_replace('_', ' ', $segBiblTargetArr[1]); - $segBiblTargetlink = $segBiblTargetArr[0]; - $segTextBibl = $segTextBibl.' ('.$segBiblTargetArr[0].')'; - $segText = $this->getElementText($childElement); - } - } else { - $segText = $this->getElementText($childElement); + if (isset($classOpeningTag) && !empty($classOpeningTag)) { + $tText .= $classOpeningTag; + $classOpeningTag = ''; } - $noteUuid = $this->getUuid(); - $eText .= ''.$segText; - $tText .= $segText; - $note = true; + $tText .= $childElement->data; - if (true === $noteBibl && !empty($segTextBibl)) { - $pagesSegs[$k][$noteUuid] = trim($segTextBibl); - } elseif ($noteBibl === false && !empty($segText)) { - $pagesSegs[$k][$noteUuid] = trim($segText); + if (isset($classEndTag) && !empty($classEndTag)) { + $tText .= $classEndTag; + $classEndTag = ''; } - } elseif ($childElement->nodeName === "note") { - $noteText = $this->getElementText($childElement); - if ((isset($noteBibl) && false === $noteBibl) && (isset($noteUuid) && !empty($noteUuid)) && !empty($noteText)) { - $notesUuids[$noteUuid] = trim($noteText); - $pagesNotesUuids[$k][$noteUuid] = trim($noteText); - } elseif ((isset($noteBibl) && true === $noteBibl) && (isset($noteUuid) && !empty($noteUuid)) && !empty($segBiblTargetlink)) { - $notesUuids[$noteUuid] = trim($segBiblTargetlink); - $pagesNotesUuids[$k][$noteUuid] = trim(' '); + } + } elseif ($childElement->nodeName === "seg") { + $noteBibl = false; + foreach ($childElement->childNodes as $childElementChildNode) { + if ($childElementChildNode->nodeName === 'bibl') { + $noteBibl = true; + break; } - unset($noteUuid); - } elseif ('item' === $childElement->nodeName) { - if ($liNumber++ === 1) { - $li = '
  • '; - } else { - $li = '
  • '; + } + + if (true === $noteBibl) { + $segBiblTarget = $this->getSegBiblTarget($childElement); + if (!empty($segBiblTarget)) { + $segBiblTargetArr = explode('#', $segBiblTarget); + $segTextBibl = str_replace('_', ' ', $segBiblTargetArr[1]); + $segBiblTargetlink = $segBiblTargetArr[0]; + $segTextBibl = $segTextBibl.' ('.$segBiblTargetArr[0].')'; + $segText = $this->getElementText($childElement); } - } elseif ('add' === $childElement->nodeName) { - if ('rdg' === $childElement->parentNode->nodeName) { - if (!empty($childElement->attributes)) { - $pattern = '/^#[a-z_]*$/i'; - foreach ($childElement->attributes as $attribute) { - if ('hand' === $attribute->nodeName) { - if (strpos($attribute->nodeValue, 'scrb') !== false) { - $add = $this->transformAddScrb($attribute->nodeValue); - } else { - $match = preg_match($pattern, $attribute->nodeValue, $matches); - if ($match) { - $add = 'erg. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . ''; - } + } else { + $segText = $this->getElementText($childElement); + } + + $noteUuid = $this->getUuid(); + $eText .= ''.$segText; + $tText .= $segText; + $note = true; + + if (true === $noteBibl && !empty($segTextBibl)) { + $pagesSegs[$k][$noteUuid] = trim($segTextBibl); + } elseif ($noteBibl === false && !empty($segText)) { + $pagesSegs[$k][$noteUuid] = trim($segText); + } + } elseif ($childElement->nodeName === "note") { + $noteText = $this->getElementText($childElement); + if ((isset($noteBibl) && false === $noteBibl) && (isset($noteUuid) && !empty($noteUuid)) && !empty($noteText)) { + $notesUuids[$noteUuid] = trim($noteText); + $pagesNotesUuids[$k][$noteUuid] = trim($noteText); + } elseif ((isset($noteBibl) && true === $noteBibl) && (isset($noteUuid) && !empty($noteUuid)) && !empty($segBiblTargetlink)) { + $notesUuids[$noteUuid] = trim($segBiblTargetlink); + $pagesNotesUuids[$k][$noteUuid] = trim(' '); + } + unset($noteUuid); + } elseif ('item' === $childElement->nodeName) { + if ($liNumber++ === 1) { + $li = '
    • '; + } else { + $li = '
    • '; + } + } elseif ('add' === $childElement->nodeName) { + if ('rdg' === $childElement->parentNode->nodeName) { + if (!empty($childElement->attributes)) { + $pattern = '/^#[a-z_]*$/i'; + foreach ($childElement->attributes as $attribute) { + if ('hand' === $attribute->nodeName) { + if (strpos($attribute->nodeValue, 'scrb') !== false) { + $add = $this->transformAddScrb($attribute->nodeValue); + } else { + $match = preg_match($pattern, $attribute->nodeValue, $matches); + if ($match) { + $add = 'erg. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . ''; } } } - } else { - $add = 'erg.'; } + } else { + $add = 'erg.'; + } - if (!empty($add)) { - $add .= '⟩ '; - } + if (!empty($add)) { + $add .= '⟩ '; } - } elseif ('del' === $childElement->nodeName) { - if ('rdg' === $childElement->parentNode->nodeName) { - if (!empty($childElement->attributes)) { - foreach ($childElement->attributes as $attribute) { - if ('hand' === $attribute->nodeName) { - if (strpos($attribute->nodeValue, 'scrb') !== false) { - $valueArr = explode('scrb', $attribute->nodeValue); - if (isset($valueArr[1])) { - $valueArr = explode('_', ltrim($valueArr[1], '_')); - if (count($valueArr) === 2) { - $del = 'str Schrhd.' . $valueArr[0] . ' ' . $valueArr[1] . ''; - } + } + } elseif ('del' === $childElement->nodeName) { + if ('rdg' === $childElement->parentNode->nodeName) { + if (!empty($childElement->attributes)) { + foreach ($childElement->attributes as $attribute) { + if ('hand' === $attribute->nodeName) { + if (strpos($attribute->nodeValue, 'scrb') !== false) { + $valueArr = explode('scrb', $attribute->nodeValue); + if (isset($valueArr[1])) { + $valueArr = explode('_', ltrim($valueArr[1], '_')); + if (count($valueArr) === 2) { + $del = 'str Schrhd.' . $valueArr[0] . ' ' . $valueArr[1] . ''; } - } else { - $del = 'str. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . ''; } + } else { + $del = 'str. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . ''; } } - } else { - $del = 'str.'; - } - - if (!empty($del)) { - $del .= ']'; } + } else { + $del = 'str.'; } - } elseif ('abbr' === $childElement->nodeName && 'choice' === $childElement->parentNode->nodeName) { - if (isset($childElement->firstChild) && $childElement->firstChild->nodeName === '#text') { - $abbr = $childElement->firstChild->data;; - } - } elseif ('expan' === $childElement->nodeName && 'choice' === $childElement->parentNode->nodeName) { - if (isset($childElement->firstChild) && $childElement->firstChild->nodeName === '#text') { - $expan = $childElement->firstChild->data; + + if (!empty($del)) { + $del .= ']'; } - } elseif ('handShift' === $childElement->nodeName && 'signed' === $childElement->parentNode->nodeName) { - if (!empty($childElement->attributes)) { - $pattern = '/^#[a-z_]*$/i'; - foreach ($childElement->attributes as $attribute) { - if ('scribeRef' === $attribute->nodeName) { - $match = preg_match($pattern, $attribute->nodeValue, $matches); - if ($match) { - $add = 'sign. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '⟩ '; - } + } + } elseif ('abbr' === $childElement->nodeName && 'choice' === $childElement->parentNode->nodeName) { + if (isset($childElement->firstChild) && $childElement->firstChild->nodeName === '#text') { + $abbr = $childElement->firstChild->data;; + } + } elseif ('expan' === $childElement->nodeName && 'choice' === $childElement->parentNode->nodeName) { + if (isset($childElement->firstChild) && $childElement->firstChild->nodeName === '#text') { + $expan = $childElement->firstChild->data; + } + } elseif ('handShift' === $childElement->nodeName && 'signed' === $childElement->parentNode->nodeName) { + if (!empty($childElement->attributes)) { + $pattern = '/^#[a-z_]*$/i'; + foreach ($childElement->attributes as $attribute) { + if ('scribeRef' === $attribute->nodeName) { + $match = preg_match($pattern, $attribute->nodeValue, $matches); + if ($match) { + $add = 'sign. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '⟩ '; } } } - } elseif ('lb' === $childElement->nodeName) { + } + } elseif ('lb' === $childElement->nodeName) { + $tText .= '
      '; + $eText .= ' '; + } elseif ('addrLine' === $childElement->nodeName) { + if ($n++ > 0) { + $eText .= '
      '; $tText .= '
      '; - $eText .= ' '; - } elseif ('addrLine' === $childElement->nodeName) { - if ($n++ > 0) { - $eText .= '
      '; - $tText .= '
      '; - } - } elseif ('signed' === $childElement->nodeName) { - $tText .= '

      '; - $eText .= '

      '; - $signedText = $this->getElementText($childElement); - $tText .= $signedText; - $eText .= $signedText; - } elseif ('hi' === $childElement->nodeName) { - foreach ($childElement->attributes as $attribute) { - if (isset(explode(':', $attribute->value)[1])) { - $rend = explode(':', $attribute->value)[1]; - if ('superscript' !== $rend && 'italic' !== $rend) { - $renditions[] = explode(':', $attribute->value)[1]; - } + } + } elseif ('signed' === $childElement->nodeName) { + $tText .= '

      '; + $eText .= '

      '; + $signedText = $this->getElementText($childElement); + $tText .= $signedText; + $eText .= $signedText; + } elseif ('hi' === $childElement->nodeName) { + foreach ($childElement->attributes as $attribute) { + if (isset(explode(':', $attribute->value)[1])) { + $rend = explode(':', $attribute->value)[1]; + if ('superscript' !== $rend && 'italic' !== $rend) { + $renditions[] = explode(':', $attribute->value)[1]; } } - } elseif ('name' === $childElement->nodeName && (isset($childElement->attributes[1]->value) && !empty($childElement->attributes[1]->value)) && (isset($childElement->attributes[0]->value) && !empty($childElement->attributes[0]->value))) { - $entityName = $this->getEntityName($childElement); - $uuid = $this->getUuid(); - if (str_contains($childElement->attributes[1]->value, 'gnd.')) { - $gndsUuids[$uuid] = str_replace('gnd.', '', $childElement->attributes[1]->value); - $pagesGndsUuids[$k][$uuid] = str_replace('gnd.', '', $childElement->attributes[1]->value); - } elseif (str_contains($childElement->attributes[1]->value, 'gnd:')) { - $gndsUuids[$uuid] = str_replace('gnd:', '', $childElement->attributes[1]->value); - $pagesGndsUuids[$k][$uuid] = str_replace('gnd:', '', $childElement->attributes[1]->value); + } + } elseif ('name' === $childElement->nodeName && (isset($childElement->attributes[1]->value) && !empty($childElement->attributes[1]->value)) && (isset($childElement->attributes[0]->value) && !empty($childElement->attributes[0]->value))) { + $entityName = $this->getEntityName($childElement); + $uuid = $this->getUuid(); + if (str_contains($childElement->attributes[1]->value, 'gnd.')) { + $gndsUuids[$uuid] = str_replace('gnd.', '', $childElement->attributes[1]->value); + $pagesGndsUuids[$k][$uuid] = str_replace('gnd.', '', $childElement->attributes[1]->value); + } elseif (str_contains($childElement->attributes[1]->value, 'gnd:')) { + $gndsUuids[$uuid] = str_replace('gnd:', '', $childElement->attributes[1]->value); + $pagesGndsUuids[$k][$uuid] = str_replace('gnd:', '', $childElement->attributes[1]->value); + } + if ('signed' === $childElement->parentNode->nodeName) { + if (isset($childElement->parentNode->attributes[0])) { + $hand = explode('_', trim($childElement->parentNode->attributes[0]->value, '#')); } - if ('signed' === $childElement->parentNode->nodeName) { - if (isset($childElement->parentNode->attributes[0])) { - $hand = explode('_', trim($childElement->parentNode->attributes[0]->value, '#')); - } - $eText .= ''.$entityName; - $tText .= '⟨'.$entityName.' sign.'; - if (isset($hand[0]) && !empty($hand[0])) { - $tText .= ' '.$hand[0]; - } - if (isset($hand[1]) && !empty($hand[1])) { - $tText .= ' '.$hand[1]; - } - $tText .= ' ⟩'; - } else { - $eText .= ''.$entityName; - $tText .= $entityName; + $eText .= ''.$entityName; + $tText .= '⟨'.$entityName.' sign.'; + if (isset($hand[0]) && !empty($hand[0])) { + $tText .= ' '.$hand[0]; + } + if (isset($hand[1]) && !empty($hand[1])) { + $tText .= ' '.$hand[1]; } - $gnd = true; - } elseif ($childElement->nodeName === "sic") { - $sicText = $this->getElementText($childElement); - $sicUuid = $this->getUuid(); - $eText .= $this->createSpan($sicUuid, $sicText, $childElement->nodeName); - $tText .= $sicText; - $pagesSics[$k][$sicUuid] = trim($sicText); - $sic = true; + $tText .= ' ⟩'; + } else { + $eText .= ''.$entityName; + $tText .= $entityName; } + $gnd = true; + } elseif ($childElement->nodeName === "sic") { + $sicText = $this->getElementText($childElement); + $sicUuid = $this->getUuid(); + $eText .= $this->createSpan($sicUuid, $sicText, $childElement->nodeName); + $tText .= $sicText; + $pagesSics[$k][$sicUuid] = trim($sicText); + $sic = true; } - - $eText = $this->convertSoftHyphenToHyphen($eText); - $eText = $this->removeHyphen($eText); - $myeditedText .= '

      ' . $eText . '

      '; - $mytranscriptedText .= '

      ' . $tText . '

      '; - } elseif ('pb' === $element->nodeName) { - $imageHref = $this->getImageHref($element, $graphics); - $tText = $imageHref; - $eText = $imageHref; - $myeditedText .= '

      ' . $eText . '

      '; - $mytranscriptedText .= '

      ' . $tText . '

      '; } - } - $editedTextArr[] = $myeditedText; - $transcriptedTextArr[] = $mytranscriptedText; + $eText = $this->convertSoftHyphenToHyphen($eText); + $eText = $this->removeHyphen($eText); + $myeditedText .= '

      ' . $eText . '

      '; + $mytranscriptedText .= '

      ' . $tText . '

      '; + } elseif ('pb' === $element->nodeName) { + $imageHref = $this->getImageHref($element, $graphics); + $tText = $imageHref; + $eText = $imageHref; + $myeditedText .= '

      ' . $eText . '

      '; + $mytranscriptedText .= '

      ' . $tText . '

      '; + } } + $editedTextArr[] = $myeditedText; + $transcriptedTextArr[] = $mytranscriptedText; + } + */ + $transcriptedText = ''; foreach ($transcriptedTextArr as $mytranscriptedText) { $transcriptedText .= $mytranscriptedText; diff --git a/src/Service/EditedTextService.php b/src/Service/EditedTextService.php index bd826f1..21befff 100644 --- a/src/Service/EditedTextService.php +++ b/src/Service/EditedTextService.php @@ -8,8 +8,8 @@ use DOMNode; class EditedTextService extends HtmlService { - private $gndsUuids = []; - private $notesUuids = []; + private array $gndsUuids = []; + private array $notesUuids = []; public function transformPage(DOMDocument $page): DOMDocument { @@ -17,19 +17,29 @@ class EditedTextService extends HtmlService /** @var DOMElement $element */ foreach ($page->childNodes as $element) { - $this->appendChild($this->transformElement($element)); + $transformed = $this->transformElement($element); + if ($transformed) { + $this->appendChild($transformed); + } } return $this->getDoc(); } - private function transformElement(DOMNode $el): DOMNode + private function transformElement(DOMNode $el): ?DOMNode { - $htmlEl = $this->span(); + $htmlEl = null; + if ($el->nodeName === '#text') { + $htmlEl = $this->handleText($el); + } if ($el->hasChildNodes()) { + $htmlEl = $this->span(); foreach ($el->childNodes as $child) { - $htmlEl->appendChild($this->transformElement($child)); + $transformed = $this->transformElement($child); + if ($transformed) { + $htmlEl->appendChild($transformed); + } } } @@ -64,4 +74,11 @@ class EditedTextService extends HtmlService { return uuid_create(UUID_TYPE_RANDOM); } + + public function clear() + { + parent::clear(); + $this->gndsUuids = []; + $this->notesUuids = []; + } } diff --git a/src/Service/HtmlService.php b/src/Service/HtmlService.php index 58b34a4..89681b4 100644 --- a/src/Service/HtmlService.php +++ b/src/Service/HtmlService.php @@ -5,10 +5,11 @@ namespace App\Service; use DOMDocument; use DOMElement; use DOMNode; +use DOMText; class HtmlService { - private DOMDocument $doc; + private ?DOMDocument $doc = null; public function getDoc(): DOMDocument { @@ -38,4 +39,17 @@ class HtmlService public function span(): DOMElement { return $this->doc->createElement('span'); } + + public function clear() + { + $this->doc = null; + } + + public function handleText(DOMNode $el): DOMNode { + $text = new DOMText(); + if ($el->nodeName === '#text') { + $text->data = $el->textContent; + } + return $text; + } } diff --git a/src/Service/PreProcessingService.php b/src/Service/PreProcessingService.php index 2a9a36a..9ed4551 100644 --- a/src/Service/PreProcessingService.php +++ b/src/Service/PreProcessingService.php @@ -112,6 +112,11 @@ class PreProcessingService return (!empty($this->pages)) ? $this->pages[count($this->pages) - 1] : null; } + public function clear() + { + $this->pages = []; + } + private function cloneAttributes($attributes, $clone) { foreach ($attributes as $attr) { diff --git a/src/Service/TranscriptionService.php b/src/Service/TranscriptionService.php index f49059f..878b3c0 100644 --- a/src/Service/TranscriptionService.php +++ b/src/Service/TranscriptionService.php @@ -21,17 +21,6 @@ class TranscriptionService extends HtmlService return $this->getDoc(); } - public function splitByPb(DOMElement $page): array - { - if (true) { - - } - /** @var DOMElement $element */ - foreach ($page->childNodes as $element) { - $this->appendChild($this->transformElement($element)); - } - } - private function transformElement(DOMNode $el): DOMNode { $methodName = 'handle' . trim(ucfirst($el->nodeName), '#'); if (method_exists($this, $methodName)) { @@ -64,12 +53,4 @@ class TranscriptionService extends HtmlService private function handleSpan(DOMElement $el): DOMNode { return $this->span(); } - - private function handleText(DOMNode $el): DOMNode { - $text = new DOMText(); - if ($el->nodeName === '#text') { - $text->data = $el->textContent; - } - return $text; - } } -- GitLab From 5a492559056dec8c14356430ea1e8e4d7ceae587 Mon Sep 17 00:00:00 2001 From: asajedi Date: Thu, 14 Oct 2021 02:58:22 +0200 Subject: [PATCH 4/6] Start implementing entity annotations --- src/Controller/Tei2SolrController.php | 388 ++------------------------ src/Model/SolrDocument.php | 20 +- src/Service/EditedTextService.php | 45 +-- src/Service/HtmlService.php | 13 +- src/Service/PreProcessingService.php | 2 - src/Service/TranscriptionService.php | 66 +++-- 6 files changed, 111 insertions(+), 423 deletions(-) diff --git a/src/Controller/Tei2SolrController.php b/src/Controller/Tei2SolrController.php index e73019c..081690b 100755 --- a/src/Controller/Tei2SolrController.php +++ b/src/Controller/Tei2SolrController.php @@ -643,9 +643,9 @@ class Tei2SolrController extends AbstractController $graphics = $this->getGraphics($imageIds, $imageUrls); $solrDocument = $this->getTextVersions($file, $graphics); $transcription = $solrDocument->getTranscriptedText(); - $pagesTranscription = $solrDocument->getTranscriptedTextArr(); + $pagesTranscription = $solrDocument->getPageLevelTranscriptedText(); $editedText = $solrDocument->getEditedText(); - $pagesEdited = $solrDocument->getEditedTextArr(); + $pagesEdited = $solrDocument->getPageLevelEditedText(); $pagesGndsUuids = $solrDocument->getPagesGndsUuids(); $pagesNotesUuids = $solrDocument->getPagesNotesUuids(); $pagesSegs = $solrDocument->getPagesSegs(); @@ -778,9 +778,10 @@ class Tei2SolrController extends AbstractController $childDoc->edited_text = $pagesEdited[$i - 1]; } - if (isset($pagesGndsUuids[$i - 1]) && !empty(($pagesGndsUuids[$i - 1]))) { - $childDoc->entities = array_values($pagesGndsUuids[$i - 1]); - $childDoc->annotation_ids = array_keys($pagesGndsUuids[$i - 1]); + if (isset($pagesGndsUuids[$i]) && !empty(($pagesGndsUuids[$i]))) { +// if (isset($pagesGndsUuids[$i - 1]) && !empty(($pagesGndsUuids[$i - 1]))) { + $childDoc->entities = array_values($pagesGndsUuids[$i]); + $childDoc->annotation_ids = array_keys($pagesGndsUuids[$i]); } if (isset($pagesNotesUuids[$i - 1]) && !empty(($pagesNotesUuids[$i - 1]))) { @@ -940,48 +941,12 @@ class Tei2SolrController extends AbstractController $xpath = new DOMXPath($doc); $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0'); $pagesNodes = $xpath->query('//tei:body'); - - // ===================================================================== - // WORK IN PROGRESS - // ===================================================================== - - $editedTextArr = []; - $transcriptedTextArr = []; - $gndsUuids = []; - $pagesGndsUuids = []; - $pagesSegs = []; - $pagesNotesUuids = []; - $notesUuids = []; - $pagesSics = []; - /** @var DOMElement $body */ $body = $pagesNodes[0]; $pages = $this->preProcessingService->splitByPages($body); - foreach ($pages as $page) { - $transcriptedDoc = $this->transcriptionService->transformPage($page); - $transcriptedTextArr[] = $transcriptedDoc->saveHTML(); - - $editedDoc = $this->editedTextService->transformPage($page); - $editedTextArr[] = $editedDoc->saveHTML(); - $this->transcriptionService->clear(); - $this->editedTextService->clear(); - } - - $this->preProcessingService->clear(); - - // ===================================================================== - // WORK IN PROGRESS - // ===================================================================== - - /* - $allElements = []; - foreach ($pagesNodes as $pagesNode) { - $allElements = $this->getPagesNodes($pagesNode, $allElements); - } - - $editedTextArr = []; - $transcriptedTextArr = []; + $pageLevelEditedText = []; + $pageLevelTranscriptedText = []; $gndsUuids = []; $pagesGndsUuids = []; $pagesSegs = []; @@ -989,336 +954,38 @@ class Tei2SolrController extends AbstractController $notesUuids = []; $pagesSics = []; - foreach ($allElements as $k => $elements) { - $renditions = []; - $myeditedText = ''; - $mytranscriptedText = ''; - - foreach ($elements as $element) { - if (isset($element->nodeName) && ( - ($element->nodeName === 'p' && $element->firstChild->nodeName !== 'address') || - ($element->nodeName === 'dateline' && 'closer' !== $element->parentNode->nodeName) || - $element->nodeName === 'address' || - $element->nodeName === 'closer' || - $element->nodeName === 'list' || - ($element->nodeName === 'signed' && 'closer' !== $element->parentNode->nodeName) - )) { - - $n = 0; - $liNumber = 1; - $tText = ''; - $eText = ''; - $childElementsArr = []; - $childElements = $this->getNodeChilds($element, $childElementsArr); - - foreach ($childElements as $childElement) { - if ($childElement->nodeName === '#text' - && ($childElement->parentNode->nodeName !== 'abbr' && ($childElement->nodeName !== 'del' || $childElement->parentNode->nodeName !== 'add')) - && $childElement->parentNode->nodeName !== 'note' && $childElement->parentNode->nodeName !== 'seg' - && $childElement->parentNode->nodeName !== 'ref' - ) { - if (isset($add) && !empty($add)) { - $tText .= ' ⟨' . $childElement->data . ' ' . $add; - $eText .= ' ' . $childElement->data; - $add = ''; - } elseif (isset($del) && !empty($del)) { - $tText .= ' [' . $childElement->data . ' ' . $del; - $del = ''; - } elseif (isset($li) && !empty($li) && !empty($childElement->data) && 'item' === $childElement->parentNode->nodeName) { - if (isset($italic) && true === $italic) { - $tText .= $li . $childElement->data . '
    • '; - $eText .= $li . $childElement->data . ''; - $italic = false; - } else { - $tText .= $li . $childElement->data; - $eText .= $li . $childElement->data; - } - - $li = ''; - } elseif ('supplied' === $childElement->parentNode->nodeName) { - $tText .= $childElement->data; - $eText .= '' . $childElement->data . ''; - } elseif ('hi' === $childElement->parentNode->nodeName && isset($childElement->parentNode->attributes[0])) { - - $hi = explode(':', $childElement->parentNode->attributes[0]->value); - - if (isset($hi[1]) && !empty($hi[1]) && 'superscript' === $hi[1]) { - $tText .= '' . $childElement->data . ''; - $eText .= $childElement->data; - } elseif (isset($hi[1]) && !empty($hi[1]) && 'italic' === $hi[1]) { - $tText .= '' . $childElement->data . ''; - $eText .= $childElement->data; - $italic = true; - } - } elseif (isset($gnd) && true === $gnd) { - $eText .= ''; - $tText .= ''; - $gnd = false; - } elseif (isset($note) && true === $note) { - $eText .= ' ' . $childElement->data; - $tText .= ' ' . $childElement->data; - $note = false; - } elseif (isset($sic) && true === $sic) { - $eText .= ' '; - $tText .= ' '; - $sic = false; - } elseif (isset($abbr) && !empty($abbr)) { - $tText .= $abbr; - $abbr = ''; - } - elseif (isset($expan) && !empty($expan)) { - $eText .= $expan; - $expan = ''; - } else { - $eText .= $childElement->data; - if (isset($renditions) && !empty($renditions)) { - $classOpeningTag = ''; - $classEndTag = ''; - - foreach ($renditions as $rendition) { - if ('italic' === $rendition) { - $class = 'i'; - } elseif ('underline' === $rendition) { - $class = 'u'; - } - if (isset($class) && !empty($class)) { - $classOpeningTag .= '<' . $class . '>'; - $classEndTag .= ''; - } - - $renditions = []; - } - } - - if (isset($classOpeningTag) && !empty($classOpeningTag)) { - $tText .= $classOpeningTag; - $classOpeningTag = ''; - } - - $tText .= $childElement->data; - - if (isset($classEndTag) && !empty($classEndTag)) { - $tText .= $classEndTag; - $classEndTag = ''; - } - } - } elseif ($childElement->nodeName === "seg") { - $noteBibl = false; - foreach ($childElement->childNodes as $childElementChildNode) { - if ($childElementChildNode->nodeName === 'bibl') { - $noteBibl = true; - break; - } - } - - if (true === $noteBibl) { - $segBiblTarget = $this->getSegBiblTarget($childElement); - if (!empty($segBiblTarget)) { - $segBiblTargetArr = explode('#', $segBiblTarget); - $segTextBibl = str_replace('_', ' ', $segBiblTargetArr[1]); - $segBiblTargetlink = $segBiblTargetArr[0]; - $segTextBibl = $segTextBibl.' ('.$segBiblTargetArr[0].')'; - $segText = $this->getElementText($childElement); - } - } else { - $segText = $this->getElementText($childElement); - } - - $noteUuid = $this->getUuid(); - $eText .= ''.$segText; - $tText .= $segText; - $note = true; - - if (true === $noteBibl && !empty($segTextBibl)) { - $pagesSegs[$k][$noteUuid] = trim($segTextBibl); - } elseif ($noteBibl === false && !empty($segText)) { - $pagesSegs[$k][$noteUuid] = trim($segText); - } - } elseif ($childElement->nodeName === "note") { - $noteText = $this->getElementText($childElement); - if ((isset($noteBibl) && false === $noteBibl) && (isset($noteUuid) && !empty($noteUuid)) && !empty($noteText)) { - $notesUuids[$noteUuid] = trim($noteText); - $pagesNotesUuids[$k][$noteUuid] = trim($noteText); - } elseif ((isset($noteBibl) && true === $noteBibl) && (isset($noteUuid) && !empty($noteUuid)) && !empty($segBiblTargetlink)) { - $notesUuids[$noteUuid] = trim($segBiblTargetlink); - $pagesNotesUuids[$k][$noteUuid] = trim(' '); - } - unset($noteUuid); - } elseif ('item' === $childElement->nodeName) { - if ($liNumber++ === 1) { - $li = '
      • '; - } else { - $li = '
      • '; - } - } elseif ('add' === $childElement->nodeName) { - if ('rdg' === $childElement->parentNode->nodeName) { - if (!empty($childElement->attributes)) { - $pattern = '/^#[a-z_]*$/i'; - foreach ($childElement->attributes as $attribute) { - if ('hand' === $attribute->nodeName) { - if (strpos($attribute->nodeValue, 'scrb') !== false) { - $add = $this->transformAddScrb($attribute->nodeValue); - } else { - $match = preg_match($pattern, $attribute->nodeValue, $matches); - if ($match) { - $add = 'erg. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . ''; - } - } - } - } - } else { - $add = 'erg.'; - } - - if (!empty($add)) { - $add .= '⟩ '; - } - } - } elseif ('del' === $childElement->nodeName) { - if ('rdg' === $childElement->parentNode->nodeName) { - if (!empty($childElement->attributes)) { - foreach ($childElement->attributes as $attribute) { - if ('hand' === $attribute->nodeName) { - if (strpos($attribute->nodeValue, 'scrb') !== false) { - $valueArr = explode('scrb', $attribute->nodeValue); - if (isset($valueArr[1])) { - $valueArr = explode('_', ltrim($valueArr[1], '_')); - if (count($valueArr) === 2) { - $del = 'str Schrhd.' . $valueArr[0] . ' ' . $valueArr[1] . ''; - } - } - } else { - $del = 'str. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . ''; - } - } - } - } else { - $del = 'str.'; - } - - if (!empty($del)) { - $del .= ']'; - } - } - } elseif ('abbr' === $childElement->nodeName && 'choice' === $childElement->parentNode->nodeName) { - if (isset($childElement->firstChild) && $childElement->firstChild->nodeName === '#text') { - $abbr = $childElement->firstChild->data;; - } - } elseif ('expan' === $childElement->nodeName && 'choice' === $childElement->parentNode->nodeName) { - if (isset($childElement->firstChild) && $childElement->firstChild->nodeName === '#text') { - $expan = $childElement->firstChild->data; - } - } elseif ('handShift' === $childElement->nodeName && 'signed' === $childElement->parentNode->nodeName) { - if (!empty($childElement->attributes)) { - $pattern = '/^#[a-z_]*$/i'; - foreach ($childElement->attributes as $attribute) { - if ('scribeRef' === $attribute->nodeName) { - $match = preg_match($pattern, $attribute->nodeValue, $matches); - if ($match) { - $add = 'sign. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '⟩ '; - } - } - } - } - } elseif ('lb' === $childElement->nodeName) { - $tText .= '
        '; - $eText .= ' '; - } elseif ('addrLine' === $childElement->nodeName) { - if ($n++ > 0) { - $eText .= '
        '; - $tText .= '
        '; - } - } elseif ('signed' === $childElement->nodeName) { - $tText .= '

        '; - $eText .= '

        '; - $signedText = $this->getElementText($childElement); - $tText .= $signedText; - $eText .= $signedText; - } elseif ('hi' === $childElement->nodeName) { - foreach ($childElement->attributes as $attribute) { - if (isset(explode(':', $attribute->value)[1])) { - $rend = explode(':', $attribute->value)[1]; - if ('superscript' !== $rend && 'italic' !== $rend) { - $renditions[] = explode(':', $attribute->value)[1]; - } - } - } - } elseif ('name' === $childElement->nodeName && (isset($childElement->attributes[1]->value) && !empty($childElement->attributes[1]->value)) && (isset($childElement->attributes[0]->value) && !empty($childElement->attributes[0]->value))) { - $entityName = $this->getEntityName($childElement); - $uuid = $this->getUuid(); - if (str_contains($childElement->attributes[1]->value, 'gnd.')) { - $gndsUuids[$uuid] = str_replace('gnd.', '', $childElement->attributes[1]->value); - $pagesGndsUuids[$k][$uuid] = str_replace('gnd.', '', $childElement->attributes[1]->value); - } elseif (str_contains($childElement->attributes[1]->value, 'gnd:')) { - $gndsUuids[$uuid] = str_replace('gnd:', '', $childElement->attributes[1]->value); - $pagesGndsUuids[$k][$uuid] = str_replace('gnd:', '', $childElement->attributes[1]->value); - } - if ('signed' === $childElement->parentNode->nodeName) { - if (isset($childElement->parentNode->attributes[0])) { - $hand = explode('_', trim($childElement->parentNode->attributes[0]->value, '#')); - } - $eText .= ''.$entityName; - $tText .= '⟨'.$entityName.' sign.'; - if (isset($hand[0]) && !empty($hand[0])) { - $tText .= ' '.$hand[0]; - } - if (isset($hand[1]) && !empty($hand[1])) { - $tText .= ' '.$hand[1]; - } - $tText .= ' ⟩'; - } else { - $eText .= ''.$entityName; - $tText .= $entityName; - } - $gnd = true; - } elseif ($childElement->nodeName === "sic") { - $sicText = $this->getElementText($childElement); - $sicUuid = $this->getUuid(); - $eText .= $this->createSpan($sicUuid, $sicText, $childElement->nodeName); - $tText .= $sicText; - $pagesSics[$k][$sicUuid] = trim($sicText); - $sic = true; - } - } - - $eText = $this->convertSoftHyphenToHyphen($eText); - $eText = $this->removeHyphen($eText); - $myeditedText .= '

        ' . $eText . '

        '; - $mytranscriptedText .= '

        ' . $tText . '

        '; - } elseif ('pb' === $element->nodeName) { - $imageHref = $this->getImageHref($element, $graphics); - $tText = $imageHref; - $eText = $imageHref; - $myeditedText .= '

        ' . $eText . '

        '; - $mytranscriptedText .= '

        ' . $tText . '

        '; + foreach ($pages as $key => $page) { + if ($key > 0) { + $transcriptedDoc = $this->transcriptionService->transformPage($page); + $pageLevelTranscriptedText[] = $transcriptedDoc->saveHTML(); + [$editedDoc, $pagesGndsUuids] = $this->editedTextService->transformPage($page, $key); + $pageLevelEditedText[] = $editedDoc->saveHTML(); +// $this->transcriptionService->clear(); + $this->editedTextService->clear(); } } - $editedTextArr[] = $myeditedText; - $transcriptedTextArr[] = $mytranscriptedText; - } - */ + $this->preProcessingService->clear(); - $transcriptedText = ''; - foreach ($transcriptedTextArr as $mytranscriptedText) { - $transcriptedText .= $mytranscriptedText; + $documentLevelTranscriptedText = ''; + foreach ($pageLevelTranscriptedText as $singlePageTranscriptedText) { + $documentLevelTranscriptedText .= $singlePageTranscriptedText; } - $editedText = ''; - foreach ($editedTextArr as $myeditedText) { - $editedText .= $myeditedText; + $documentLevelEditedText = ''; + foreach ($pageLevelEditedText as $singlePageEditedText) { + $documentLevelEditedText .= $singlePageEditedText; } $solrDocument = new SolrDocument(); - $solrDocument->setTranscriptedText($transcriptedText); - $solrDocument->setTranscriptedTextArr($transcriptedTextArr); - $solrDocument->setEditedText($editedText); - $solrDocument->setEditedTextArr($editedTextArr); + $solrDocument->setTranscriptedText($documentLevelTranscriptedText); + $solrDocument->setPageLevelTranscriptedText($pageLevelTranscriptedText); + $solrDocument->setEditedText($documentLevelEditedText); + $solrDocument->setPageLevelEditedText($pageLevelEditedText); $solrDocument->setGndsUuids($gndsUuids); $solrDocument->setPagesGndsUuids($pagesGndsUuids); $solrDocument->setPagesNotesUuids($pagesNotesUuids); $solrDocument->setPagesSegs($pagesSegs); - $solrDocument->setPagesSics($pagesSics); return $solrDocument; @@ -1331,7 +998,6 @@ class Tei2SolrController extends AbstractController return $span; } - private function getSegBiblTarget(\DOMElement $childElement): ?string { $elementChildsArr = []; diff --git a/src/Model/SolrDocument.php b/src/Model/SolrDocument.php index 98fd842..b5150d6 100755 --- a/src/Model/SolrDocument.php +++ b/src/Model/SolrDocument.php @@ -5,9 +5,9 @@ namespace App\Model; class SolrDocument { private string $transcriptedText; - private array $transcriptedTextArr; + private array $pageLevelTranscriptedText; private string $editedText; - private array $editedTextArr; + private array $pageLevelEditedText; private array $gndsUuids; private array $pagesGndsUuids; private array $pagesNotesUuids; @@ -26,14 +26,14 @@ class SolrDocument return $this; } - public function getTranscriptedTextArr(): array + public function getPageLevelTranscriptedText(): array { - return $this->transcriptedTextArr; + return $this->pageLevelTranscriptedText; } - public function setTranscriptedTextArr(array $transcriptedTextArr): SolrDocument + public function setPageLevelTranscriptedText(array $pageLevelTranscriptedText): SolrDocument { - $this->transcriptedTextArr = $transcriptedTextArr; + $this->pageLevelTranscriptedText = $pageLevelTranscriptedText; return $this; } @@ -50,14 +50,14 @@ class SolrDocument return $this; } - public function getEditedTextArr(): array + public function getPageLevelEditedText(): array { - return $this->editedTextArr; + return $this->pageLevelEditedText; } - public function setEditedTextArr(array $editedTextArr): SolrDocument + public function setPageLevelEditedText(array $pageLevelEditedText): SolrDocument { - $this->editedTextArr = $editedTextArr; + $this->pageLevelEditedText = $pageLevelEditedText; return $this; } diff --git a/src/Service/EditedTextService.php b/src/Service/EditedTextService.php index 21befff..1fc1020 100644 --- a/src/Service/EditedTextService.php +++ b/src/Service/EditedTextService.php @@ -10,23 +10,25 @@ class EditedTextService extends HtmlService { private array $gndsUuids = []; private array $notesUuids = []; + private array $pagesGndsUuids = []; - public function transformPage(DOMDocument $page): DOMDocument + public function transformPage(DOMDocument $page, int $key): array { $this->setDoc(new DOMDocument()); /** @var DOMElement $element */ foreach ($page->childNodes as $element) { - $transformed = $this->transformElement($element); + $transformed = $this->transformElement($element, $key); if ($transformed) { $this->appendChild($transformed); } } - return $this->getDoc(); + // Later this should be packed in an object + return [$this->getDoc(), $this->pagesGndsUuids]; } - private function transformElement(DOMNode $el): ?DOMNode + private function transformElement(DOMNode $el, int $key): ?DOMNode { $htmlEl = null; if ($el->nodeName === '#text') { @@ -34,10 +36,15 @@ class EditedTextService extends HtmlService } if ($el->hasChildNodes()) { - $htmlEl = $this->span(); + $htmlEl = $this->handleSpan(); foreach ($el->childNodes as $child) { - $transformed = $this->transformElement($child); + $transformed = $this->transformElement($child, $key); if ($transformed) { + $methodName = 'handleET' . trim(ucfirst($child->nodeName), '#'); + if (method_exists($this, $methodName)) { + $htmlEl = $this->{$methodName}($child, $htmlEl, $key); + } + $htmlEl->appendChild($transformed); } } @@ -46,30 +53,24 @@ class EditedTextService extends HtmlService return $htmlEl; } - private function handleName(DOMNode $el): DOMNode + private function handleETName(DOMNode $el, $htmlEl, int $key): DOMNode { - $uuid = $this->createUuid(); + $gndsUuids = []; - if (str_contains($el->attributes[1]->value, 'gnd.')) { - $this->gndsUuids[$uuid] = str_replace('gnd.', '', $el->attributes[1]->value); - // $pagesGndsUuids[$k][$uuid] = str_replace('gnd.', '', $el->attributes[1]->value); - } elseif (str_contains($el->attributes[1]->value, 'gnd:')) { + if (isset($el->attributes[1]->value) && str_contains($el->attributes[1]->value, 'gnd:')) { + $uuid = $this->createUuid(); + $gndsUuids[$uuid] = str_replace('gnd:', '', $el->attributes[1]->value); $this->gndsUuids[$uuid] = str_replace('gnd:', '', $el->attributes[1]->value); - // $pagesGndsUuids[$k][$uuid] = str_replace('gnd:', '', $el->attributes[1]->value); + $this->pagesGndsUuids[$key][$uuid] = str_replace('gnd:', '', $el->attributes[1]->value); + $htmlEl->setAttribute('id', $uuid); + if (isset($el->attributes[0]->value)) { + $htmlEl->setAttribute('class', $el->attributes[0]->value); + } } - $htmlEl = $this->span(); - - $htmlEl->setAttribute('id', $uuid); - $htmlEl->setAttribute('class', $el->attributes[0]->value); - return $htmlEl; } - private function handleNote() { - - } - public function createUuid() { return uuid_create(UUID_TYPE_RANDOM); diff --git a/src/Service/HtmlService.php b/src/Service/HtmlService.php index 89681b4..cf980d1 100644 --- a/src/Service/HtmlService.php +++ b/src/Service/HtmlService.php @@ -26,26 +26,31 @@ class HtmlService return $this->doc->appendChild($el); } - public function p(): DOMElement + public function handleP(): DOMElement { return $this->doc->createElement('p'); } - public function div():DOMElement + public function handleDiv():DOMElement { return $this->doc->createElement('div'); } - public function span(): DOMElement { + public function handleSpan(): DOMElement { return $this->doc->createElement('span'); } + public function handleLb(): DOMElement { + return $this->doc->createElement('br'); + } + public function clear() { $this->doc = null; } - public function handleText(DOMNode $el): DOMNode { + public function handleText(DOMNode $el): DOMNode + { $text = new DOMText(); if ($el->nodeName === '#text') { $text->data = $el->textContent; diff --git a/src/Service/PreProcessingService.php b/src/Service/PreProcessingService.php index 9ed4551..ffa82f1 100644 --- a/src/Service/PreProcessingService.php +++ b/src/Service/PreProcessingService.php @@ -2,14 +2,12 @@ namespace App\Service; - use DOMDocument; use DOMElement; use DOMNode; class PreProcessingService { - private array $pages = []; private DOMNode $lastParent; diff --git a/src/Service/TranscriptionService.php b/src/Service/TranscriptionService.php index 878b3c0..44e015b 100644 --- a/src/Service/TranscriptionService.php +++ b/src/Service/TranscriptionService.php @@ -5,28 +5,37 @@ namespace App\Service; use DOMDocument; use DOMElement; use DOMNode; -use DOMText; +use App\Service\HtmlService; -class TranscriptionService extends HtmlService +class TranscriptionService +//class TranscriptionService extends HtmlService { + private HtmlService $htmlService; + + public function __construct(HtmlService $htmlService) + { + $this->htmlService = $htmlService; + } + public function transformPage(DOMDocument $page): DOMDocument { - $this->setDoc(new DOMDocument()); + $this->htmlService->setDoc(new DOMDocument()); /** @var DOMElement $element */ foreach ($page->childNodes as $element) { - $this->appendChild($this->transformElement($element)); + $this->htmlService->appendChild($this->transformElement($element)); } - return $this->getDoc(); + return $this->htmlService->getDoc(); } - private function transformElement(DOMNode $el): DOMNode { + private function transformElement(DOMNode $el): DOMNode + { $methodName = 'handle' . trim(ucfirst($el->nodeName), '#'); - if (method_exists($this, $methodName)) { - $htmlEl = $this->{$methodName}($el); + if (method_exists($this->htmlService, $methodName)) { + $htmlEl = $this->htmlService->{$methodName}($el); } else { - $htmlEl = $this->p(); + $htmlEl = $this->htmlService->handleP(); } if ($el->hasChildNodes()) { @@ -38,19 +47,28 @@ class TranscriptionService extends HtmlService return $htmlEl; } - private function handlePb(DOMElement $el): DOMNode { - return $this->div(); - } - - private function handleP(DOMElement $el): DOMNode { - return $this->p(); - } - - private function handleDiv(DOMElement $el): DOMNode { - return $this->div(); - } - - private function handleSpan(DOMElement $el): DOMNode { - return $this->span(); - } +// private function handleLb(DOMElement $el): DOMNode +// { +// return $this->htmlService->br(); +// } +// +// private function handlePb(DOMElement $el): DOMNode +// { +// return $this->htmlService->div(); +// } +// +// private function handleP(DOMElement $el): DOMNode +// { +// return $this->htmlService->htmlService->p(); +// } +// +// private function handleDiv(DOMElement $el): DOMNode +// { +// return $this->htmlService->div(); +// } +// +// private function handleSpan(DOMElement $el): DOMNode +// { +// return $this->htmlService->span(); +// } } -- GitLab From 0cce09c858753ee92a6d303d84ce0f17833b57d8 Mon Sep 17 00:00:00 2001 From: asajedi Date: Thu, 14 Oct 2021 03:01:29 +0200 Subject: [PATCH 5/6] Remove htmlService extension --- src/Service/TranscriptionService.php | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Service/TranscriptionService.php b/src/Service/TranscriptionService.php index 44e015b..005ac0c 100644 --- a/src/Service/TranscriptionService.php +++ b/src/Service/TranscriptionService.php @@ -8,7 +8,6 @@ use DOMNode; use App\Service\HtmlService; class TranscriptionService -//class TranscriptionService extends HtmlService { private HtmlService $htmlService; -- GitLab From 6b3c5b7b7308c1147289fbca3805fd76cde943ff Mon Sep 17 00:00:00 2001 From: Paul Pestov Date: Thu, 14 Oct 2021 22:06:04 +0200 Subject: [PATCH 6/6] Refactor services --- src/Controller/Tei2SolrController.php | 11 +++-- src/Service/EditedTextService.php | 41 +++++++++------- src/Service/HtmlService.php | 13 ++--- src/Service/TranscriptionService.php | 69 ++++++++++++--------------- 4 files changed, 69 insertions(+), 65 deletions(-) diff --git a/src/Controller/Tei2SolrController.php b/src/Controller/Tei2SolrController.php index 081690b..82c650e 100755 --- a/src/Controller/Tei2SolrController.php +++ b/src/Controller/Tei2SolrController.php @@ -947,7 +947,7 @@ class Tei2SolrController extends AbstractController $pageLevelEditedText = []; $pageLevelTranscriptedText = []; - $gndsUuids = []; + $pagesGndsUuids = []; $pagesSegs = []; $pagesNotesUuids = []; @@ -958,15 +958,20 @@ class Tei2SolrController extends AbstractController if ($key > 0) { $transcriptedDoc = $this->transcriptionService->transformPage($page); $pageLevelTranscriptedText[] = $transcriptedDoc->saveHTML(); - [$editedDoc, $pagesGndsUuids] = $this->editedTextService->transformPage($page, $key); + + $editedDoc = $this->editedTextService->transformPage($page); + $pagesGndsUuids[$key] = $this->editedTextService->getGndsUuids(); $pageLevelEditedText[] = $editedDoc->saveHTML(); -// $this->transcriptionService->clear(); + + $this->transcriptionService->clear(); $this->editedTextService->clear(); } } $this->preProcessingService->clear(); + $gndsUuids = array_merge($pagesGndsUuids); + $documentLevelTranscriptedText = ''; foreach ($pageLevelTranscriptedText as $singlePageTranscriptedText) { $documentLevelTranscriptedText .= $singlePageTranscriptedText; diff --git a/src/Service/EditedTextService.php b/src/Service/EditedTextService.php index 1fc1020..b6c0fb8 100644 --- a/src/Service/EditedTextService.php +++ b/src/Service/EditedTextService.php @@ -10,41 +10,48 @@ class EditedTextService extends HtmlService { private array $gndsUuids = []; private array $notesUuids = []; - private array $pagesGndsUuids = []; - public function transformPage(DOMDocument $page, int $key): array + public function transformPage(DOMDocument $page): DOMDocument { $this->setDoc(new DOMDocument()); /** @var DOMElement $element */ foreach ($page->childNodes as $element) { - $transformed = $this->transformElement($element, $key); + $transformed = $this->transformElement($element); if ($transformed) { $this->appendChild($transformed); } } // Later this should be packed in an object - return [$this->getDoc(), $this->pagesGndsUuids]; + return $this->getDoc(); } - private function transformElement(DOMNode $el, int $key): ?DOMNode + public function getGndsUuids(): array + { + return $this->gndsUuids; + } + + public function getNotesUuids(): array + { + return $this->notesUuids; + } + + private function transformElement(DOMNode $el): ?DOMNode { $htmlEl = null; - if ($el->nodeName === '#text') { - $htmlEl = $this->handleText($el); + $methodName = 'handle' . trim(ucfirst($el->nodeName), '#'); + if (method_exists($this, $methodName)) { + $htmlEl = $this->{$methodName}($el); + } else { + $htmlEl = $this->span(); } if ($el->hasChildNodes()) { - $htmlEl = $this->handleSpan(); + var_dump($el->nodeName); foreach ($el->childNodes as $child) { - $transformed = $this->transformElement($child, $key); + $transformed = $this->transformElement($child); if ($transformed) { - $methodName = 'handleET' . trim(ucfirst($child->nodeName), '#'); - if (method_exists($this, $methodName)) { - $htmlEl = $this->{$methodName}($child, $htmlEl, $key); - } - $htmlEl->appendChild($transformed); } } @@ -53,15 +60,13 @@ class EditedTextService extends HtmlService return $htmlEl; } - private function handleETName(DOMNode $el, $htmlEl, int $key): DOMNode + private function handleName(DOMNode $el): DOMNode { - $gndsUuids = []; + $htmlEl = $this->span(); if (isset($el->attributes[1]->value) && str_contains($el->attributes[1]->value, 'gnd:')) { $uuid = $this->createUuid(); - $gndsUuids[$uuid] = str_replace('gnd:', '', $el->attributes[1]->value); $this->gndsUuids[$uuid] = str_replace('gnd:', '', $el->attributes[1]->value); - $this->pagesGndsUuids[$key][$uuid] = str_replace('gnd:', '', $el->attributes[1]->value); $htmlEl->setAttribute('id', $uuid); if (isset($el->attributes[0]->value)) { $htmlEl->setAttribute('class', $el->attributes[0]->value); diff --git a/src/Service/HtmlService.php b/src/Service/HtmlService.php index cf980d1..f07b492 100644 --- a/src/Service/HtmlService.php +++ b/src/Service/HtmlService.php @@ -26,21 +26,23 @@ class HtmlService return $this->doc->appendChild($el); } - public function handleP(): DOMElement + public function p(): DOMElement { return $this->doc->createElement('p'); } - public function handleDiv():DOMElement + public function div():DOMElement { return $this->doc->createElement('div'); } - public function handleSpan(): DOMElement { + public function span(): DOMElement + { return $this->doc->createElement('span'); } - public function handleLb(): DOMElement { + public function br(): DOMElement + { return $this->doc->createElement('br'); } @@ -49,8 +51,7 @@ class HtmlService $this->doc = null; } - public function handleText(DOMNode $el): DOMNode - { + public function handleText(DOMNode $el): DOMNode { $text = new DOMText(); if ($el->nodeName === '#text') { $text->data = $el->textContent; diff --git a/src/Service/TranscriptionService.php b/src/Service/TranscriptionService.php index 005ac0c..a4aa4fb 100644 --- a/src/Service/TranscriptionService.php +++ b/src/Service/TranscriptionService.php @@ -7,34 +7,27 @@ use DOMElement; use DOMNode; use App\Service\HtmlService; -class TranscriptionService +class TranscriptionService extends HtmlService { - private HtmlService $htmlService; - - public function __construct(HtmlService $htmlService) - { - $this->htmlService = $htmlService; - } - public function transformPage(DOMDocument $page): DOMDocument { - $this->htmlService->setDoc(new DOMDocument()); + $this->setDoc(new DOMDocument()); /** @var DOMElement $element */ foreach ($page->childNodes as $element) { - $this->htmlService->appendChild($this->transformElement($element)); + $this->appendChild($this->transformElement($element)); } - return $this->htmlService->getDoc(); + return $this->getDoc(); } private function transformElement(DOMNode $el): DOMNode { $methodName = 'handle' . trim(ucfirst($el->nodeName), '#'); - if (method_exists($this->htmlService, $methodName)) { - $htmlEl = $this->htmlService->{$methodName}($el); + if (method_exists($this, $methodName)) { + $htmlEl = $this->{$methodName}($el); } else { - $htmlEl = $this->htmlService->handleP(); + $htmlEl = $this->p(); } if ($el->hasChildNodes()) { @@ -46,28 +39,28 @@ class TranscriptionService return $htmlEl; } -// private function handleLb(DOMElement $el): DOMNode -// { -// return $this->htmlService->br(); -// } -// -// private function handlePb(DOMElement $el): DOMNode -// { -// return $this->htmlService->div(); -// } -// -// private function handleP(DOMElement $el): DOMNode -// { -// return $this->htmlService->htmlService->p(); -// } -// -// private function handleDiv(DOMElement $el): DOMNode -// { -// return $this->htmlService->div(); -// } -// -// private function handleSpan(DOMElement $el): DOMNode -// { -// return $this->htmlService->span(); -// } + private function handleLb(DOMElement $el): DOMNode + { + return $this->br(); + } + + private function handlePb(DOMElement $el): DOMNode + { + return $this->div(); + } + + private function handleP(DOMElement $el): DOMNode + { + return $this->p(); + } + + private function handleDiv(DOMElement $el): DOMNode + { + return $this->div(); + } + + private function handleSpan(DOMElement $el): DOMNode + { + return $this->span(); + } } -- GitLab