From bc80fe3e22a7b1ca7f01a59e1b94c7f3f2e17ccd Mon Sep 17 00:00:00 2001 From: asajedi Date: Wed, 24 Nov 2021 00:48:25 +0100 Subject: [PATCH 1/2] Transform list/item to html counterpart ul/li --- src/Import/HTMLDocument.php | 10 ++++++++++ src/Import/Indexer.php | 1 - src/Service/CommonTransformService.php | 8 +++++++- src/Service/EditedTextService.php | 10 ++++++++++ src/Service/PreProcessingService.php | 5 +++-- src/Service/TranscriptionService.php | 10 ++++++++++ 6 files changed, 40 insertions(+), 4 deletions(-) diff --git a/src/Import/HTMLDocument.php b/src/Import/HTMLDocument.php index 849949a..0fd732f 100644 --- a/src/Import/HTMLDocument.php +++ b/src/Import/HTMLDocument.php @@ -44,6 +44,16 @@ class HTMLDocument extends DOMDocument return $element; } + public function ul(string $classes = ''): DOMElement + { + return $this->createCustomElement('ul', $classes); + } + + public function li(string $classes = ''): DOMElement + { + return $this->createCustomElement('li', $classes); + } + public function div(string $classes = ''): DOMElement { return $this->createCustomElement('div', $classes); diff --git a/src/Import/Indexer.php b/src/Import/Indexer.php index a180ecf..9288f5f 100644 --- a/src/Import/Indexer.php +++ b/src/Import/Indexer.php @@ -65,7 +65,6 @@ class Indexer implements IndexerInterface $this->transcriptionService->setGraphics($graphics); $this->editedTextService->setGraphics($graphics); $pages = $this->preProcessingService->splitByPages($body); - $pageLevelEditedText = []; $pageLevelTranscriptedText = []; $pagesGndsUuids = []; diff --git a/src/Service/CommonTransformService.php b/src/Service/CommonTransformService.php index 1f282f4..375f3db 100644 --- a/src/Service/CommonTransformService.php +++ b/src/Service/CommonTransformService.php @@ -138,8 +138,14 @@ class CommonTransformService return $doc->div(); } - protected function handleLabel(DOMElement $teiEl, HTMLDocument $doc): DOMNode + protected function handleLabel(DOMElement $teiEl, HTMLDocument 
$doc): ?DOMNode { + // This is temporarily implemented till styling requirements + // for label are specified. + if ('item' === $teiEl->parentNode->nodeName) { + return $doc->text($teiEl->textContent); + } + $classes = ''; if ( 'div' === $teiEl->parentNode->nodeName && diff --git a/src/Service/EditedTextService.php b/src/Service/EditedTextService.php index 3933409..567639f 100644 --- a/src/Service/EditedTextService.php +++ b/src/Service/EditedTextService.php @@ -32,6 +32,16 @@ class EditedTextService extends CommonTransformService private array $notes = []; private array $works = []; + protected function handleItem(DOMElement $teiEl, HTMLDocument $doc): ?DOMNode + { + return $doc->li(); + } + + protected function handleList(DOMElement $teiEl, HTMLDocument $doc): ?DOMNode + { + return $doc->ul(); + } + public function clear() { $this->gndsUuids = []; diff --git a/src/Service/PreProcessingService.php b/src/Service/PreProcessingService.php index 96dce41..e4b4698 100644 --- a/src/Service/PreProcessingService.php +++ b/src/Service/PreProcessingService.php @@ -118,10 +118,11 @@ class PreProcessingService $lastNode = $node; } - // Lastly we append the element itself to the new page to maintain it for further processing + // Lastly we insert the element itself as the first element + // to the new page to maintain it for further processing $pbClone = $newPage->createElement('pb'); $pbClone = $this->cloneAttributes($pbEl->attributes, $pbClone); - $this->lastParent->appendChild($pbClone); + $newPage->insertBefore($pbClone, $newPage->firstChild); return $newPage; } diff --git a/src/Service/TranscriptionService.php b/src/Service/TranscriptionService.php index 3f6443a..f3db7b0 100644 --- a/src/Service/TranscriptionService.php +++ b/src/Service/TranscriptionService.php @@ -36,6 +36,16 @@ class TranscriptionService extends CommonTransformService 'wavyunderline' => 'underline', ]; + protected function handleItem(DOMElement $teiEl, HTMLDocument $doc): ?DOMNode + { + return 
$doc->li(); + } + + protected function handleList(DOMElement $teiEl, HTMLDocument $doc): ?DOMNode + { + return $doc->ul(); + } + protected function handleAdd(DOMElement $teiEl, HTMLDocument $doc): ?DOMNode { $type = $teiEl->getAttribute('type'); -- GitLab From 1d8dab7c16fe3685997bc98d67fc373b0ee6c9c6 Mon Sep 17 00:00:00 2001 From: asajedi Date: Wed, 24 Nov 2021 00:50:33 +0100 Subject: [PATCH 2/2] Modify checking for invalid TEI file list --- src/Import/Importer.php | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/Import/Importer.php b/src/Import/Importer.php index e4e1220..9188341 100644 --- a/src/Import/Importer.php +++ b/src/Import/Importer.php @@ -17,19 +17,23 @@ class Importer implements ImporterInterface public function import(): void { + $filesystem = new Filesystem(); + if (!$filesystem->exists($this->teiDir)) { + $filesystem->mkdir($this->teiDir); + } + $invalidTeiList = $this->getInvalidTeiList(); + for ($i = 1; $i <= 100; ++$i) { try { $files = file_get_contents($this->gitlabRepoTreeUrl.'&access_token='.$this->gitlabRepoToken.'&page='.$i); $files = json_decode($files, true); - foreach ($files as $file) { - $invalidTeiList = $this->getInvalidTeiList(); + foreach ($files as $file) { if ([] !== $invalidTeiList && !in_array(trim($file['name']), $invalidTeiList)) { $fileData = file_get_contents( $this->gitlabProcessedTeiRepoUrl.$file['name'].'?access_token='.$this->gitlabRepoToken.'&ref=master' ); $fileData = json_decode($fileData, true); - $filesystem = new Filesystem(); try { $filesystem->dumpFile($this->teiDir.$file['name'], base64_decode($fileData['content'])); @@ -56,8 +60,8 @@ class Importer implements ImporterInterface private function getInvalidTeiList(): array { $invalidTeiList = []; - - if (file_exists($this->invalidTeiListFile)) { + $file_headers = @get_headers($this->invalidTeiListFile); + if($file_headers[0] !== 'HTTP/1.1 404 Not Found') { $invalidTeiList = 
json_decode(file_get_contents($this->invalidTeiListFile), true); } -- GitLab