diff --git a/config/packages/oneup_flysystem.yaml b/config/packages/oneup_flysystem.yaml
index 280b353fa6a791caffb8fcc6635d8e0f43323055..a874930f86e0cdac24c7ef2e9e3166bf9f0a7ded 100755
--- a/config/packages/oneup_flysystem.yaml
+++ b/config/packages/oneup_flysystem.yaml
@@ -3,6 +3,12 @@ oneup_flysystem:
         default_adapter:
             local:
                 directory: '%kernel.cache_dir%/flysystem'
+
+        main_adapter:
+            awss3v3:
+                client: gfl.s3_client
+                bucket: '%env(STORAGE_BUCKET)%'
+
         cache_adapter:
             awss3v3:
                 client: gfl.s3_client
@@ -37,6 +43,11 @@ oneup_flysystem:
         default_filesystem:
             adapter: default_adapter
             alias: League\Flysystem\Filesystem
+
+        subugoe_gfl_main:
+            adapter: main_adapter
+            alias: main_filesystem
+
         subugoe_iiif_cache:
             adapter: cache_adapter
             alias: cache_filesystem
diff --git a/config/services.yaml b/config/services.yaml
index 878c285f1377a766d76013a519e2774611b19873..e10b5bf92d1847b3d19af8feced04632c411e247 100755
--- a/config/services.yaml
+++ b/config/services.yaml
@@ -86,7 +86,7 @@ services:
         public: true
 
     App\Service\FileService:
-        arguments: ["@cache_filesystem", "@source_filesystem", "@pdf_filesystem", "@image_filesystem", "@tei_filesystem"]
+        arguments: ["@main_filesystem", "@cache_filesystem", "@source_filesystem", "@pdf_filesystem", "@image_filesystem", "@tei_filesystem"]
 
     App\Import\Importer:
         calls:
@@ -100,14 +100,14 @@ services:
                   - '%env(INVALIDE_TEI_LIST_FILE)%'
                   - '%env(SAMPLE_TEI_DOCUMENT_URL)%'
 
-    App\Import\Indexer:
+    App\Index\Indexer:
         calls:
             - method: 'setConfigs'
               arguments:
                   - '%tei_dir%'
                   - '%tei_sample_dir%'
 
-    App\Import\MetadataTransformer:
+    App\Transform\MetadataTransformer:
         calls:
             - method: 'setConfigs'
               arguments:
diff --git a/src/Command/SolrIndexing.php b/src/Command/SolrIndexing.php
index 680331c6107f0dc4fcd673bdf357e79f588d6e2b..6a0837ad9c030d9e59a0f37c2ef9cf99a5d40800 100644
--- a/src/Command/SolrIndexing.php
+++ b/src/Command/SolrIndexing.php
@@ -5,7 +5,7 @@ declare(strict_types=1);
 namespace App\Command;
 
 use App\Import\ImporterInterface;
-use App\Import\IndexerInterface;
+use App\Index\IndexerInterface;
 use Symfony\Component\Console\Command\Command;
 use Symfony\Component\Console\Input\InputInterface;
 use Symfony\Component\Console\Output\OutputInterface;
diff --git a/src/Command/TeiToS3Storage.php b/src/Command/TeiToS3Storage.php
new file mode 100644
index 0000000000000000000000000000000000000000..23b9731506c21e670753a47d915ad041ef6dbe51
--- /dev/null
+++ b/src/Command/TeiToS3Storage.php
@@ -0,0 +1,46 @@
+importer = $importer;
+    }
+
+    /**
+     * {@inheritdoc}
+     */
+    protected function configure(): void
+    {
+        $this->setDescription('Import TEI files from gitlab to S3 storage.');
+    }
+
+    /**
+     * {@inheritdoc}
+     */
+    protected function execute(InputInterface $input, OutputInterface $output): int
+    {
+        $output->writeln('Start importing TEI files into S3 storage.');
+
+        $this->importer->importTeiToS3Storage();
+
+        $time = microtime(true) - $_SERVER['REQUEST_TIME_FLOAT'];
+        $time /= 60;
+        $output->writeln('Import process completed in '.$time.' minutes.');
+
+        return 0; // A non-zero exit code marks the command as failed.
+    }
+}
diff --git a/src/Controller/Tei2SolrController.php b/src/Controller/Tei2SolrController.php
index 5bd6c3ecf7e939f64b240f4276153c9af73ed2f7..2def7073bb25f82a21a7b8432c99d39a59a239ae 100755
--- a/src/Controller/Tei2SolrController.php
+++ b/src/Controller/Tei2SolrController.php
@@ -3,7 +3,7 @@
 namespace App\Controller;
 
 use App\Import\ImporterInterface;
-use App\Import\IndexerInterface;
+use App\Index\IndexerInterface;
 use App\Model\SolrDocument;
 use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
 use Symfony\Component\Routing\Annotation\Route;
diff --git a/src/Import/Importer.php b/src/Import/Importer.php
index 433c4d1409b7f5ce69b95a63b4bb43065cc5fca5..1968e0d1666757969fd6ed6ff9280bf1ca6d6b6a 100644
--- a/src/Import/Importer.php
+++ b/src/Import/Importer.php
@@ -4,23 +4,96 @@ declare(strict_types=1);
 
 namespace App\Import;
 
+use App\Service\FileService;
 use App\Import\ImporterInterface;
 use Symfony\Component\Filesystem\Filesystem;
 use Symfony\Component\HttpFoundation\File\Exception\FileException;
 
 class Importer implements ImporterInterface
 {
+    private FileService $fileService;
     private ?string $teiDir = null;
-    private ?string $teiSampleDir = null;
+    private ?string $teiSampleDir = null;
     private ?string $gitlabProcessedTeiRepoUrl;
     private ?string $gitlabRepoToken;
     private ?string $gitlabRepoTreeUrl;
     private ?string $invalidTeiListFile;
-    private ?string $sampleTEIDocumentUrl;
+    private ?string $sampleTeiDocumentUrl;
+
+    public function __construct(FileService $fileService)
+    {
+        $this->fileService = $fileService;
+    }
+
+    /**
+     * Imports the sample TEI document and the TEI files listed by GitLab into the TEI filesystem.
+     *
+     * The 'tei' directory of the main filesystem is deleted before anything is written. Up to 100
+     * pages of the repository tree are requested; files named in the invalid TEI list are skipped
+     * and download or write failures are reported with echo instead of aborting the import.
+     */
+    public function importTeiToS3Storage(): void
+    {
+        $mainFileSystem = $this->fileService->getMainFilesystem();
+        $mainFileSystem->deleteDir('tei');
+        $teiFilesystem = $this->fileService->getTeiFilesystem();
+
+        $sampleTeiDocument = $this->getSampleTeiDocument();
+        if (!empty($sampleTeiDocument)) {
+            $teiFilesystem->write('sample.xml', $sampleTeiDocument);
+        }
+
+        $filesystem = new Filesystem();
+        if (!$filesystem->exists($this->teiDir)) {
+            $filesystem->mkdir($this->teiDir);
+        }
+        $invalidTeiList = $this->getInvalidTeiList();
+
+        for ($page = 1; $page <= 100; ++$page) {
+            $response = file_get_contents($this->gitlabRepoTreeUrl.'&access_token='.$this->gitlabRepoToken.'&page='.$page);
+            $files = is_string($response) ? json_decode($response, true) : null;
+
+            if (!is_array($files)) {
+                // file_get_contents() reports failure by returning false rather than throwing.
+                echo 'Files list could not be imported from gitlab';
+
+                continue;
+            }
+
+            if ([] === $files) {
+                // Presumably an empty page marks the end of the listing - TODO confirm.
+                break;
+            }
+
+            foreach ($files as $file) {
+                // NOTE(review): an empty invalid list skips every file, as before - confirm this is intended.
+                if ([] === $invalidTeiList || in_array(trim($file['name']), $invalidTeiList, true)) {
+                    continue;
+                }
+
+                $teiFileUrl = $this->gitlabProcessedTeiRepoUrl.$file['name'].'?access_token='.$this->gitlabRepoToken.'&ref=master';
+                $fileResponse = file_get_contents($teiFileUrl);
+                $fileData = is_string($fileResponse) ? json_decode($fileResponse, true) : null;
+
+                if (!is_array($fileData) || !is_string($fileData['content'] ?? null)) {
+                    // TODO retry to download the file again
+                    echo $file['name'].' could not be imported.';
+
+                    continue;
+                }
+
+                try {
+                    $teiFilesystem->write($file['name'], base64_decode($fileData['content']));
+                } catch (FileException $exception) {
+                    echo $file['name'].' could not be imported.';
+                }
+            }
+        }
+    }
 
     public function import(): void
     {
-        $this->importSampleTEIDocument();
+        $this->importSampleTeiDocument();
         $filesystem = new Filesystem();
         if (!$filesystem->exists($this->teiDir)) {
             $filesystem->mkdir($this->teiDir);
@@ -62,7 +135,7 @@ class Importer implements ImporterInterface
         }
     }
 
-    public function setConfigs(string $teiDir, string $teiSampleDir, string $gitlabRepoToken, string $gitlabRepoTreeUrl, string $gitlabProcessedTeiRepoUrl, string $invalidTeiListFile, string $sampleTEIDocumentUrl): void
+    public function setConfigs(string $teiDir, string $teiSampleDir, string $gitlabRepoToken, string $gitlabRepoTreeUrl, string $gitlabProcessedTeiRepoUrl, string $invalidTeiListFile, string $sampleTeiDocumentUrl): void
     {
         $this->teiDir = $teiDir;
         $this->teiSampleDir = $teiSampleDir;
@@ -70,7 +143,7 @@ class Importer implements ImporterInterface
         $this->gitlabRepoTreeUrl = $gitlabRepoTreeUrl;
         $this->gitlabProcessedTeiRepoUrl = $gitlabProcessedTeiRepoUrl;
         $this->invalidTeiListFile = $invalidTeiListFile;
-        $this->sampleTEIDocumentUrl = $sampleTEIDocumentUrl;
+        $this->sampleTeiDocumentUrl = $sampleTeiDocumentUrl;
     }
 
     private function getInvalidTeiList(): array
@@ -84,19 +157,38 @@ class Importer implements ImporterInterface
         return $invalidTeiList;
     }
 
-    private function importSampleTEIDocument(): void
+    /**
+     * Stores the sample TEI document as sample.xml in the local sample directory, if one was fetched.
+     */
+    private function importSampleTeiDocument(): void
     {
-        $sampleTEIDocumentUrl = $this->sampleTEIDocumentUrl.'&access_token='.$this->gitlabRepoToken;
-        $file_headers = @get_headers($sampleTEIDocumentUrl);
-        if($file_headers[0] === 'HTTP/1.1 200 OK') {
-            $sampleDocumentData = json_decode(file_get_contents($sampleTEIDocumentUrl), true);
-            $sampleTEIDocument = base64_decode($sampleDocumentData['content']);
-            $filesystem = new Filesystem();
-            if (!$filesystem->exists($this->teiSampleDir)) {
-                $filesystem->mkdir($this->teiSampleDir);
-            }
+        $sampleTeiDocument = $this->getSampleTeiDocument();
+        $filesystem = new Filesystem();
+        if (!empty($sampleTeiDocument)) {
+            $filesystem->dumpFile($this->teiSampleDir.'sample.xml', $sampleTeiDocument);
+        }
+    }
 
-            $filesystem->dumpFile($this->teiSampleDir.$sampleDocumentData['file_name'], $sampleTEIDocument);
+    /**
+     * Downloads the sample TEI document from GitLab and returns its base64-decoded content.
+     *
+     * @return string|null the decoded document, or an empty string when it could not be fetched
+     */
+    private function getSampleTeiDocument(): ?string
+    {
+        $sampleTeiDocumentUrl = $this->sampleTeiDocumentUrl.'&access_token='.$this->gitlabRepoToken;
+        // get_headers() returns false on failure, and the status line is not always "HTTP/1.1 200 OK".
+        $headers = @get_headers($sampleTeiDocumentUrl);
+        if (!is_array($headers) || 1 !== preg_match('#^HTTP/\S+\s+200\b#', (string) ($headers[0] ?? ''))) {
+            return '';
+        }
+
+        $response = file_get_contents($sampleTeiDocumentUrl);
+        $sampleDocumentData = is_string($response) ? json_decode($response, true) : null;
+        if (!is_array($sampleDocumentData) || !is_string($sampleDocumentData['content'] ?? null)) {
+            return '';
         }
+
+        return base64_decode($sampleDocumentData['content']);
     }
 }
diff --git a/src/Import/Indexer.php b/src/Index/Indexer.php
similarity index 99%
rename from src/Import/Indexer.php
rename to src/Index/Indexer.php
index a24dddb5b26d44ca2c024d6d7b22598443030145..85d79a8b38d7d8e2e26df0acfc3db39ef594529a 100644
--- a/src/Import/Indexer.php
+++ b/src/Index/Indexer.php
@@ -2,13 +2,13 @@
 
 declare(strict_types=1);
 
-namespace App\Import;
+namespace App\Index;
 
 use App\Model\SolrDocument;
 use App\Service\EditedTextService;
 use App\Service\PreProcessingService;
 use App\Service\TranscriptionService;
-use App\Import\MetadataTransformerInterface;
+use App\Transform\MetadataTransformerInterface;
 use DOMDocument;
 use DOMElement;
 use DOMXPath;
diff --git a/src/Import/IndexerInterface.php b/src/Index/IndexerInterface.php
similarity index 91%
rename from src/Import/IndexerInterface.php
rename to src/Index/IndexerInterface.php
index 6fbb13b36a5b8d57fd99fac42286368f949840ca..6e7147e5cf49b89671fc294ec70af7495969230b 100644
--- a/src/Import/IndexerInterface.php
+++ b/src/Index/IndexerInterface.php
@@ -1,6 +1,6 @@
mainFilesystem = $mainFilesystem;
         $this->cacheFilesystem = $cacheFilesystem;
         $this->sourceFilesystem = $sourceFilesystem;
         $this->pdfFilesystem = $pdfFilesystem;
@@ -28,6 +26,11 @@ class FileService
         $this->teiFilesystem = $teiFilesystem;
     }
 
+    public function getMainFilesystem(): FilesystemInterface
+    {
+        return $this->mainFilesystem;
+    }
+
     public function getCacheFilesystem(): FilesystemInterface
     {
         return $this->cacheFilesystem;
diff --git a/src/Import/MetadataTransformer.php b/src/Transform/MetadataTransformer.php
similarity index 99%
rename from src/Import/MetadataTransformer.php
rename to src/Transform/MetadataTransformer.php
index 00cfa88b3aa505268c048739ac06c4e528e0238f..c73c03d705ee1051efd548db9a324c631f30928f 100644
--- a/src/Import/MetadataTransformer.php
+++ b/src/Transform/MetadataTransformer.php
@@ -2,7 +2,7 @@
 
 declare(strict_types=1);
 
-namespace App\Import;
+namespace App\Transform;
 
 use DOMXPath;
 use Symfony\Component\Routing\RouterInterface;
diff --git a/src/Import/MetadataTransformerInterface.php b/src/Transform/MetadataTransformerInterface.php
similarity index 98%
rename from src/Import/MetadataTransformerInterface.php
rename to src/Transform/MetadataTransformerInterface.php
index 2ba6808451cd098162ab3da0d26569fb02f093d7..8101051a666e9012f1a4477d6a8c9bca86cf6814 100644
--- a/src/Import/MetadataTransformerInterface.php
+++ b/src/Transform/MetadataTransformerInterface.php
@@ -1,6 +1,6 @@