From 8072e27565cd6b00dc0f115ad3ee4655b671f45d Mon Sep 17 00:00:00 2001
From: asajedi
Date: Wed, 1 Dec 2021 03:17:06 +0100
Subject: [PATCH] Import and index sample TEI document

Download the sample TEI document at the start of Importer::import() and
store it in tei_sample_dir. The download is skipped without writing
anything when the request fails or when the JSON payload lacks a string
"content" or "file_name". The stored file name is reduced to its
basename so that a remotely supplied name cannot leave the sample
directory. Indexer::tei2solr() removes the TEI directory after the
indexing loop.

Resolves #28
---
 config/services.yaml    |  6 ++++--
 src/Import/Importer.php | 51 ++++++++++++++++++++++++++++++++++++++++++++++++---
 src/Import/Indexer.php  | 10 +++++++++-
 3 files changed, 61 insertions(+), 6 deletions(-)

diff --git a/config/services.yaml b/config/services.yaml
index 51723be..878c285 100755
--- a/config/services.yaml
+++ b/config/services.yaml
@@ -23,8 +23,8 @@ parameters:
         SMB_ZA: 'Zentralarchiv Staatliche Museen zu Berlin – Preußischer Kulturbesitz'
         UAHW: 'Archiv der Martin-Luther-Universität Halle-Wittenberg, Halle / S.'
         SUB_HSD: 'SUB HSD Niedersächsische Staats- und Universitätsbibliothek Göttingen, Abteilung Handschriften und Seltene Drucke'
-    tei_dir: '%kernel.project_dir%/data/gitlab/'
-    tei_sample_dir: '%kernel.project_dir%/teis/sampletei/'
+    tei_dir: '%kernel.project_dir%/data/teis/'
+    tei_sample_dir: '%kernel.project_dir%/data/sampletei/'
 
 services:
     # default configuration for services in *this* file
@@ -93,10 +93,12 @@ services:
             - method: 'setConfigs'
               arguments:
                   - '%tei_dir%'
+                  - '%tei_sample_dir%'
                   - '%env(GITLAB_REPO_TOKEN)%'
                   - '%env(GITLAB_REPO_TREE_URL)%'
                   - '%env(GITLAB_PROCESSED_TEI_REPO_URL)%'
                   - '%env(INVALIDE_TEI_LIST_FILE)%'
+                  - '%env(SAMPLE_TEI_DOCUMENT_URL)%'
 
     App\Import\Indexer:
         calls:
diff --git a/src/Import/Importer.php b/src/Import/Importer.php
index 4ab6889..433c4d1 100644
--- a/src/Import/Importer.php
+++ b/src/Import/Importer.php
@@ -4,19 +4,23 @@ declare(strict_types=1);
 
 namespace App\Import;
 
+use App\Import\ImporterInterface;
 use Symfony\Component\Filesystem\Filesystem;
 use Symfony\Component\HttpFoundation\File\Exception\FileException;
 
 class Importer implements ImporterInterface
 {
+    private ?string $teiDir = null;
+    private ?string $teiSampleDir = null;
     private ?string $gitlabProcessedTeiRepoUrl;
     private ?string $gitlabRepoToken;
     private ?string $gitlabRepoTreeUrl;
     private ?string $invalidTeiListFile;
-    private ?string $teiDir = null;
+    private ?string $sampleTEIDocumentUrl;
 
     public function import(): void
     {
+        $this->importSampleTEIDocument();
         $filesystem = new Filesystem();
         if (!$filesystem->exists($this->teiDir)) {
             $filesystem->mkdir($this->teiDir);
@@ -58,23 +62,64 @@ class Importer implements ImporterInterface
         }
     }
 
-    public function setConfigs(string $teiDir, string $gitlabRepoToken, string $gitlabRepoTreeUrl, string $gitlabProcessedTeiRepoUrl, string $invalidTeiListFile): void
+    public function setConfigs(string $teiDir, string $teiSampleDir, string $gitlabRepoToken, string $gitlabRepoTreeUrl, string $gitlabProcessedTeiRepoUrl, string $invalidTeiListFile, string $sampleTEIDocumentUrl): void
     {
         $this->teiDir = $teiDir;
+        $this->teiSampleDir = $teiSampleDir;
         $this->gitlabRepoToken = $gitlabRepoToken;
         $this->gitlabRepoTreeUrl = $gitlabRepoTreeUrl;
         $this->gitlabProcessedTeiRepoUrl = $gitlabProcessedTeiRepoUrl;
         $this->invalidTeiListFile = $invalidTeiListFile;
+        $this->sampleTEIDocumentUrl = $sampleTEIDocumentUrl;
     }
 
     private function getInvalidTeiList(): array
     {
         $invalidTeiList = [];
         $file_headers = @get_headers($this->invalidTeiListFile);
-        if($file_headers[0] !== 'HTTP/1.1 404 Not Found') {
+        // "?? null" also covers get_headers() returning false on a failed request.
+        if (($file_headers[0] ?? null) === 'HTTP/1.1 200 OK') {
             $invalidTeiList = json_decode(file_get_contents($this->invalidTeiListFile), true);
         }
 
         return $invalidTeiList;
     }
+
+    /**
+     * Downloads the sample TEI document and stores it in the sample TEI directory.
+     *
+     * The response is expected to be JSON holding the base64-encoded document
+     * in "content" and its file name in "file_name". Nothing is written when
+     * the request fails or the payload is incomplete.
+     */
+    private function importSampleTEIDocument(): void
+    {
+        // The token is appended with "&", so the configured URL must already carry a query string.
+        $sampleTEIDocumentUrl = $this->sampleTEIDocumentUrl.'&access_token='.$this->gitlabRepoToken;
+        $file_headers = @get_headers($sampleTEIDocumentUrl);
+        // "?? null" also covers get_headers() returning false on a failed request.
+        if (($file_headers[0] ?? null) !== 'HTTP/1.1 200 OK') {
+            return;
+        }
+
+        $sampleDocumentData = json_decode((string) file_get_contents($sampleTEIDocumentUrl), true);
+        $content = $sampleDocumentData['content'] ?? null;
+        $fileName = $sampleDocumentData['file_name'] ?? null;
+        if (!is_string($content) || !is_string($fileName)) {
+            return;
+        }
+
+        // The name comes from the remote payload; basename() keeps it inside the sample directory.
+        $fileName = basename($fileName);
+        if (in_array($fileName, ['', '.', '..'], true)) {
+            return;
+        }
+
+        $filesystem = new Filesystem();
+        if (!$filesystem->exists($this->teiSampleDir)) {
+            $filesystem->mkdir($this->teiSampleDir);
+        }
+
+        $filesystem->dumpFile($this->teiSampleDir.$fileName, base64_decode($content));
+    }
 }
diff --git a/src/Import/Indexer.php b/src/Import/Indexer.php
index ce420ac..a24dddb 100644
--- a/src/Import/Indexer.php
+++ b/src/Import/Indexer.php
@@ -14,6 +14,7 @@ use DOMElement;
 use DOMXPath;
 use League\Flysystem\Exception;
 use Solarium\Client;
+use Symfony\Component\Filesystem\Exception\IOExceptionInterface;
 use Symfony\Component\Filesystem\Filesystem;
 use Symfony\Component\Finder\Finder;
 
@@ -126,7 +127,7 @@ class Indexer implements IndexerInterface
 
     public function tei2solr(): void
     {
-        $this->client->getEndpoint()->setOptions(['timeout' => 60]);
+        $this->client->getEndpoint()->setOptions(['timeout' => 60, 'index_timeout' => 60]);
         $finder = new Finder();
         $finder->files()->in($this->teiDir);
 
@@ -183,6 +184,13 @@ class Indexer implements IndexerInterface
                 libxml_clear_errors();
             }
         }
+
+        try {
+            $filesystem = new Filesystem();
+            $filesystem->remove($this->teiDir);
+        } catch (IOExceptionInterface $exception) {
+            echo "Error deleting directory at ". $exception->getPath();
+        }
     }
 
     private function convertSoftHyphenToHyphen(string $text): string
-- 
GitLab