From 532385f7bfe11244dc1a2ec8ffd83cdae4cb2b39 Mon Sep 17 00:00:00 2001 From: asajedi Date: Wed, 1 Dec 2021 13:06:54 +0100 Subject: [PATCH 1/3] Outsource transformation process to Transform folder --- config/services.yaml | 2 +- src/Import/Indexer.php | 2 +- src/{Import => Transform}/MetadataTransformer.php | 2 +- src/{Import => Transform}/MetadataTransformerInterface.php | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename src/{Import => Transform}/MetadataTransformer.php (99%) rename src/{Import => Transform}/MetadataTransformerInterface.php (98%) diff --git a/config/services.yaml b/config/services.yaml index 878c285..8f077a3 100755 --- a/config/services.yaml +++ b/config/services.yaml @@ -107,7 +107,7 @@ services: - '%tei_dir%' - '%tei_sample_dir%' - App\Import\MetadataTransformer: + App\Transform\MetadataTransformer: calls: - method: 'setConfigs' arguments: diff --git a/src/Import/Indexer.php b/src/Import/Indexer.php index a24dddb..0cf586b 100644 --- a/src/Import/Indexer.php +++ b/src/Import/Indexer.php @@ -8,7 +8,7 @@ use App\Model\SolrDocument; use App\Service\EditedTextService; use App\Service\PreProcessingService; use App\Service\TranscriptionService; -use App\Import\MetadataTransformerInterface; +use App\Transform\MetadataTransformerInterface; use DOMDocument; use DOMElement; use DOMXPath; diff --git a/src/Import/MetadataTransformer.php b/src/Transform/MetadataTransformer.php similarity index 99% rename from src/Import/MetadataTransformer.php rename to src/Transform/MetadataTransformer.php index 00cfa88..c73c03d 100644 --- a/src/Import/MetadataTransformer.php +++ b/src/Transform/MetadataTransformer.php @@ -2,7 +2,7 @@ declare(strict_types=1); -namespace App\Import; +namespace App\Transform; use DOMXPath; use Symfony\Component\Routing\RouterInterface; diff --git a/src/Import/MetadataTransformerInterface.php b/src/Transform/MetadataTransformerInterface.php similarity index 98% rename from src/Import/MetadataTransformerInterface.php rename to 
src/Transform/MetadataTransformerInterface.php index 2ba6808..8101051 100644 --- a/src/Import/MetadataTransformerInterface.php +++ b/src/Transform/MetadataTransformerInterface.php @@ -1,6 +1,6 @@ Date: Wed, 1 Dec 2021 13:25:45 +0100 Subject: [PATCH 2/3] Outsource solr indexing to Index folder --- config/services.yaml | 2 +- src/Command/SolrIndexing.php | 2 +- src/Controller/Tei2SolrController.php | 2 +- src/{Import => Index}/Indexer.php | 2 +- src/{Import => Index}/IndexerInterface.php | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename src/{Import => Index}/Indexer.php (99%) rename src/{Import => Index}/IndexerInterface.php (91%) diff --git a/config/services.yaml b/config/services.yaml index 8f077a3..5da60ae 100755 --- a/config/services.yaml +++ b/config/services.yaml @@ -100,7 +100,7 @@ services: - '%env(INVALIDE_TEI_LIST_FILE)%' - '%env(SAMPLE_TEI_DOCUMENT_URL)%' - App\Import\Indexer: + App\Index\Indexer: calls: - method: 'setConfigs' arguments: diff --git a/src/Command/SolrIndexing.php b/src/Command/SolrIndexing.php index 680331c..6a0837a 100644 --- a/src/Command/SolrIndexing.php +++ b/src/Command/SolrIndexing.php @@ -5,7 +5,7 @@ declare(strict_types=1); namespace App\Command; use App\Import\ImporterInterface; -use App\Import\IndexerInterface; +use App\Index\IndexerInterface; use Symfony\Component\Console\Command\Command; use Symfony\Component\Console\Input\InputInterface; use Symfony\Component\Console\Output\OutputInterface; diff --git a/src/Controller/Tei2SolrController.php b/src/Controller/Tei2SolrController.php index 5bd6c3e..2def707 100755 --- a/src/Controller/Tei2SolrController.php +++ b/src/Controller/Tei2SolrController.php @@ -3,7 +3,7 @@ namespace App\Controller; use App\Import\ImporterInterface; -use App\Import\IndexerInterface; +use App\Index\IndexerInterface; use App\Model\SolrDocument; use Symfony\Bundle\FrameworkBundle\Controller\AbstractController; use Symfony\Component\Routing\Annotation\Route; diff --git a/src/Import/Indexer.php 
b/src/Index/Indexer.php similarity index 99% rename from src/Import/Indexer.php rename to src/Index/Indexer.php index 0cf586b..85d79a8 100644 --- a/src/Import/Indexer.php +++ b/src/Index/Indexer.php @@ -2,7 +2,7 @@ declare(strict_types=1); -namespace App\Import; +namespace App\Index; use App\Model\SolrDocument; use App\Service\EditedTextService; diff --git a/src/Import/IndexerInterface.php b/src/Index/IndexerInterface.php similarity index 91% rename from src/Import/IndexerInterface.php rename to src/Index/IndexerInterface.php index 6fbb13b..6e7147e 100644 --- a/src/Import/IndexerInterface.php +++ b/src/Index/IndexerInterface.php @@ -1,6 +1,6 @@ Date: Thu, 2 Dec 2021 04:30:43 +0100 Subject: [PATCH 3/3] Set up TEI import into S3 storage --- config/packages/oneup_flysystem.yaml | 11 +++ config/services.yaml | 2 +- src/Command/TeiToS3Storage.php | 46 ++++++++++++ src/Import/Importer.php | 107 +++++++++++++++++++++++---- src/Service/FileService.php | 13 ++-- 5 files changed, 157 insertions(+), 22 deletions(-) create mode 100644 src/Command/TeiToS3Storage.php diff --git a/config/packages/oneup_flysystem.yaml b/config/packages/oneup_flysystem.yaml index 280b353..a874930 100755 --- a/config/packages/oneup_flysystem.yaml +++ b/config/packages/oneup_flysystem.yaml @@ -3,6 +3,12 @@ oneup_flysystem: default_adapter: local: directory: '%kernel.cache_dir%/flysystem' + + main_adapter: + awss3v3: + client: gfl.s3_client + bucket: '%env(STORAGE_BUCKET)%' + cache_adapter: awss3v3: client: gfl.s3_client @@ -37,6 +43,11 @@ oneup_flysystem: default_filesystem: adapter: default_adapter alias: League\Flysystem\Filesystem + + subugoe_gfl_main: + adapter: main_adapter + alias: main_filesystem + subugoe_iiif_cache: adapter: cache_adapter alias: cache_filesystem diff --git a/config/services.yaml b/config/services.yaml index 5da60ae..e10b5bf 100755 --- a/config/services.yaml +++ b/config/services.yaml @@ -86,7 +86,7 @@ services: public: true App\Service\FileService: - arguments: 
["@cache_filesystem", "@source_filesystem", "@pdf_filesystem", "@image_filesystem", "@tei_filesystem"] + arguments: ["@main_filesystem", "@cache_filesystem", "@source_filesystem", "@pdf_filesystem", "@image_filesystem", "@tei_filesystem"] App\Import\Importer: calls: diff --git a/src/Command/TeiToS3Storage.php b/src/Command/TeiToS3Storage.php new file mode 100644 index 0000000..23b9731 --- /dev/null +++ b/src/Command/TeiToS3Storage.php @@ -0,0 +1,46 @@ +importer = $importer; + } + + /** + * {@inheritdoc} + */ + protected function configure() + { + $this->setDescription('Import TEI files from gitlab to S3 storage.'); + } + + /** + * {@inheritdoc} + */ + protected function execute(InputInterface $input, OutputInterface $output): int + { + $output->writeln('Start importing TEI files into S3 storage.'); + + $this->importer->importTeiToS3Storage(); + + $time = microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']; + $time /= 60; + $output->writeln('Indexing process completed in '.$time.' minutes.'); + + return 1; + } +} diff --git a/src/Import/Importer.php b/src/Import/Importer.php index 433c4d1..1968e0d 100644 --- a/src/Import/Importer.php +++ b/src/Import/Importer.php @@ -4,23 +4,79 @@ declare(strict_types=1); namespace App\Import; +use App\Service\FileService; use App\Import\ImporterInterface; use Symfony\Component\Filesystem\Filesystem; use Symfony\Component\HttpFoundation\File\Exception\FileException; class Importer implements ImporterInterface { + private FileService $fileService; private ?string $teiDir = null; - private ?string $teiSampleDir = null; + private ?string $teiSampleDir = null; private ?string $gitlabProcessedTeiRepoUrl; private ?string $gitlabRepoToken; private ?string $gitlabRepoTreeUrl; private ?string $invalidTeiListFile; - private ?string $sampleTEIDocumentUrl; + private ?string $sampleTeiDocumentUrl; + + public function __construct(FileService $fileService) + { + $this->fileService = $fileService; + } + + public function importTeiToS3Storage(): void + 
{ + $mainFileSystem = $this->fileService->getMainFilesystem(); + $mainFileSystem->deleteDir('tei'); + $teiFilesystem = $this->fileService->getTeiFilesystem(); + + $sampleTeiDocument = $this->getSampleTEIDocument(); + if (!empty($sampleTeiDocument)) { + $teiFilesystem->write('sample.xml', $sampleTeiDocument); + } + + $filesystem = new Filesystem(); + if (!$filesystem->exists($this->teiDir)) { + $filesystem->mkdir($this->teiDir); + } + $invalidTeiList = $this->getInvalidTeiList(); + + for ($i = 1; $i <= 100; ++$i) { + try { + $files = file_get_contents($this->gitlabRepoTreeUrl.'&access_token='.$this->gitlabRepoToken.'&page='.$i); + + if (is_string($files)) { + $files = json_decode($files, true); + + foreach ($files as $file) { + if ([] !== $invalidTeiList && !in_array(trim($file['name']), $invalidTeiList)) { + $teiFileUrl = $this->gitlabProcessedTeiRepoUrl.$file['name'].'?access_token='.$this->gitlabRepoToken.'&ref=master'; + $fileData = file_get_contents($teiFileUrl); + if (is_string($fileData)) { + $fileData = json_decode($fileData, true); + + try { + $teiFilesystem->write($file['name'], base64_decode($fileData['content'])); + } catch (FileException $exception) { + echo $file['name'].' could not be imported.'; + } + } else { + // TODO retry to download the file again + echo $file['name'].' 
could not be imported.'; + } + } + } + } + } catch (FileException $exception) { + echo 'Files list could not be imported from gitlab'; + } + } + } public function import(): void { - $this->importSampleTEIDocument(); + $this->importSampleTeiDocument(); $filesystem = new Filesystem(); if (!$filesystem->exists($this->teiDir)) { $filesystem->mkdir($this->teiDir); @@ -62,7 +118,7 @@ class Importer implements ImporterInterface } } - public function setConfigs(string $teiDir, string $teiSampleDir, string $gitlabRepoToken, string $gitlabRepoTreeUrl, string $gitlabProcessedTeiRepoUrl, string $invalidTeiListFile, string $sampleTEIDocumentUrl): void + public function setConfigs(string $teiDir, string $teiSampleDir, string $gitlabRepoToken, string $gitlabRepoTreeUrl, string $gitlabProcessedTeiRepoUrl, string $invalidTeiListFile, string $sampleTeiDocumentUrl): void { $this->teiDir = $teiDir; $this->teiSampleDir = $teiSampleDir; @@ -70,7 +126,7 @@ class Importer implements ImporterInterface $this->gitlabRepoTreeUrl = $gitlabRepoTreeUrl; $this->gitlabProcessedTeiRepoUrl = $gitlabProcessedTeiRepoUrl; $this->invalidTeiListFile = $invalidTeiListFile; - $this->sampleTEIDocumentUrl = $sampleTEIDocumentUrl; + $this->sampleTeiDocumentUrl = $sampleTeiDocumentUrl; } private function getInvalidTeiList(): array @@ -84,19 +140,38 @@ class Importer implements ImporterInterface return $invalidTeiList; } - private function importSampleTEIDocument(): void + private function importSampleTeiDocument(): void { - $sampleTEIDocumentUrl = $this->sampleTEIDocumentUrl.'&access_token='.$this->gitlabRepoToken; - $file_headers = @get_headers($sampleTEIDocumentUrl); - if($file_headers[0] === 'HTTP/1.1 200 OK') { - $sampleDocumentData = json_decode(file_get_contents($sampleTEIDocumentUrl), true); - $sampleTEIDocument = base64_decode($sampleDocumentData['content']); - $filesystem = new Filesystem(); - if (!$filesystem->exists($this->teiSampleDir)) { - $filesystem->mkdir($this->teiSampleDir); - } + 
$sampleTeiDocument = $this->getSampleTeiDocument(); + $filesystem = new Filesystem(); + if (!empty($sampleTeiDocument)) { + $filesystem->dumpFile($this->teiSampleDir.'sample.xml', $sampleTeiDocument); + } + +// $sampleTEIDocumentUrl = $this->sampleTEIDocumentUrl.'&access_token='.$this->gitlabRepoToken; +// $file_headers = @get_headers($sampleTEIDocumentUrl); +// if($file_headers[0] === 'HTTP/1.1 200 OK') { +// $sampleDocumentData = json_decode(file_get_contents($sampleTEIDocumentUrl), true); +// $sampleTEIDocument = base64_decode($sampleDocumentData['content']); +// $filesystem = new Filesystem(); +// if (!$filesystem->exists($this->teiSampleDir)) { +// $filesystem->mkdir($this->teiSampleDir); +// } +// +// $filesystem->dumpFile($this->teiSampleDir.$sampleDocumentData['file_name'], $sampleTEIDocument); +// } + } - $filesystem->dumpFile($this->teiSampleDir.$sampleDocumentData['file_name'], $sampleTEIDocument); + private function getSampleTeiDocument(): ?string + { + $sampleTeiDocument = ''; + $sampleTeiDocumentUrl = $this->sampleTeiDocumentUrl.'&access_token='.$this->gitlabRepoToken; + $file_headers = @get_headers($sampleTeiDocumentUrl); + if($file_headers[0] === 'HTTP/1.1 200 OK') { + $sampleDocumentData = json_decode(file_get_contents($sampleTeiDocumentUrl), true); + $sampleTeiDocument = base64_decode($sampleDocumentData['content']); } + + return $sampleTeiDocument; } } diff --git a/src/Service/FileService.php b/src/Service/FileService.php index 535eb20..55ddbc5 100755 --- a/src/Service/FileService.php +++ b/src/Service/FileService.php @@ -9,18 +9,16 @@ use League\Flysystem\FilesystemInterface; */ class FileService { + private FilesystemInterface $mainFilesystem; private FilesystemInterface $cacheFilesystem; - private FilesystemInterface $imageFilesystem; - private FilesystemInterface $pdfFilesystem; - private FilesystemInterface $sourceFilesystem; - private FilesystemInterface $teiFilesystem; - public function __construct(FilesystemInterface $cacheFilesystem, 
FilesystemInterface $sourceFilesystem, FilesystemInterface $pdfFilesystem, FilesystemInterface $imageFilesystem, FilesystemInterface $teiFilesystem) + public function __construct(FilesystemInterface $mainFilesystem, FilesystemInterface $cacheFilesystem, FilesystemInterface $sourceFilesystem, FilesystemInterface $pdfFilesystem, FilesystemInterface $imageFilesystem, FilesystemInterface $teiFilesystem) { + $this->mainFilesystem = $mainFilesystem; $this->cacheFilesystem = $cacheFilesystem; $this->sourceFilesystem = $sourceFilesystem; $this->pdfFilesystem = $pdfFilesystem; @@ -28,6 +26,11 @@ class FileService $this->teiFilesystem = $teiFilesystem; } + public function getMainFilesystem(): FilesystemInterface + { + return $this->mainFilesystem; + } + public function getCacheFilesystem(): FilesystemInterface { return $this->cacheFilesystem; -- GitLab