From eb60e95746dcb32880e1915314c55da2048754a7 Mon Sep 17 00:00:00 2001 From: asajedi Date: Thu, 18 Nov 2021 02:33:14 +0100 Subject: [PATCH] Exclude invalid TEI files from processing Resolves #79 --- config/services.yaml | 1 + src/Command/SolrIndexing.php | 2 +- src/Import/Importer.php | 31 +++++++++++++++++++++---------- src/Import/Indexer.php | 9 ++++----- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/config/services.yaml b/config/services.yaml index e0844a3..38faa4b 100755 --- a/config/services.yaml +++ b/config/services.yaml @@ -95,6 +95,7 @@ services: - '%env(GITLAB_REPO_TOKEN)%' - '%env(GITLAB_REPO_TREE_URL)%' - '%env(GITLAB_PROCESSED_TEI_REPO_URL)%' + - '%env(INVALIDE_TEI_LIST_FILE)%' App\Import\Indexer: calls: diff --git a/src/Command/SolrIndexing.php b/src/Command/SolrIndexing.php index 430ff8f..680331c 100644 --- a/src/Command/SolrIndexing.php +++ b/src/Command/SolrIndexing.php @@ -38,7 +38,7 @@ class SolrIndexing extends Command { $output->writeln('Start solr indexing.'); -// $this->importer->import(); + $this->importer->import(); $this->indexer->deleteSolrIndex(); $this->indexer->tei2solr(); diff --git a/src/Import/Importer.php b/src/Import/Importer.php index 0fc40b0..ca76d3f 100644 --- a/src/Import/Importer.php +++ b/src/Import/Importer.php @@ -13,9 +13,13 @@ class Importer implements ImporterInterface private ?string $gitlabRepoToken; private ?string $gitlabRepoTreeUrl; private ?string $teiDir = null; + private ?string $invalidTeiListFile; - public function __construct() + private function getInvalidTeiList(): array { + $invalidTeiList = json_decode(file_get_contents($this->invalidTeiListFile), true); + + return $invalidTeiList; } public function import(): void @@ -25,14 +29,20 @@ class Importer implements ImporterInterface $files = file_get_contents($this->gitlabRepoTreeUrl.'&access_token='.$this->gitlabRepoToken.'&page='.$i); $files = json_decode($files, true); foreach ($files as $file) { - $fileData = file_get_contents($this->gitlabProcessedTeiRepoUrl.$file['name'].'?access_token='.$this->gitlabRepoToken.'&ref=master'); - $fileData = json_decode($fileData, true); - $filesystem = new Filesystem(); - - try { - $filesystem->dumpFile($this->teiDir.$file['name'], base64_decode($fileData['content'])); - } catch (FileException $exception) { - echo $file['name'].' could not be imported.'; + $invalidTeiList = $this->getInvalidTeiList(); + + if ([] !== $invalidTeiList && !in_array(trim($file['name']), $invalidTeiList)) { + $fileData = file_get_contents( + $this->gitlabProcessedTeiRepoUrl.$file['name'].'?access_token='.$this->gitlabRepoToken.'&ref=master' + ); + $fileData = json_decode($fileData, true); + $filesystem = new Filesystem(); + + try { + $filesystem->dumpFile($this->teiDir.$file['name'], base64_decode($fileData['content'])); + } catch (FileException $exception) { + echo $file['name'].' could not be imported.'; + } } } } catch (FileException $exception) { @@ -41,11 +51,12 @@ class Importer implements ImporterInterface } } - public function setConfigs(string $teiDir, string $gitlabRepoToken, string $gitlabRepoTreeUrl, string $gitlabProcessedTeiRepoUrl): void + public function setConfigs(string $teiDir, string $gitlabRepoToken, string $gitlabRepoTreeUrl, string $gitlabProcessedTeiRepoUrl, string $invalidTeiListFile): void { $this->teiDir = $teiDir; $this->gitlabRepoToken = $gitlabRepoToken; $this->gitlabRepoTreeUrl = $gitlabRepoTreeUrl; $this->gitlabProcessedTeiRepoUrl = $gitlabProcessedTeiRepoUrl; + $this->invalidTeiListFile = $invalidTeiListFile; } } diff --git a/src/Import/Indexer.php b/src/Import/Indexer.php index 04cf240..227d422 100644 --- a/src/Import/Indexer.php +++ b/src/Import/Indexer.php @@ -19,11 +19,10 @@ use Symfony\Component\Finder\Finder; class Indexer implements IndexerInterface { - private const ARTICLE_DOC_TYPE = 'article;'; - private const PAGE_DOC_TYPE = 'page;'; - private const NOTE_DOC_TYPE = 'note;'; - private const ENTITY_DOC_TYPE = 'entity;'; - + private const ARTICLE_DOC_TYPE = 'article'; + private const PAGE_DOC_TYPE = 'page'; + private const NOTE_DOC_TYPE = 'note'; + private const ENTITY_DOC_TYPE = 'entity'; private Client $client; private EditedTextService $editedTextService; private PreProcessingService $preProcessingService; -- GitLab