diff --git a/config/services.yaml b/config/services.yaml
index 88fdfd5bc6a5376afa11f92e3eeade5aa7b61d05..19c881f13ec29af224c2f48e8d7f64f477d0ab4d 100755
--- a/config/services.yaml
+++ b/config/services.yaml
@@ -25,12 +25,35 @@ parameters:
         SUB_HSD: 'SUB HSD Niedersächsische Staats- und Universitätsbibliothek Göttingen, Abteilung Handschriften und Seltene Drucke'
     tei_dir: '%kernel.project_dir%/data/teis/'
     tei_sample_dir: '%kernel.project_dir%/data/sampletei/'
+    lit_dir: '%kernel.project_dir%/data/lit/'
     document_languages:
         eng: Englisch
         fre: Französisch
         ger: Deutsch
         ita: Italienisch
         lat: Latein
+    literatur_data_elements:
+        biblScope_volume_n: biblScope_volume_n
+        biblScope_part_n: biblScope_part_n
+        biblScope_pages: biblScope_pages
+        biblScope_part: biblScope_part
+        biblScope_volume: biblScope_volume
+        date: lit_pub_date
+        extent: extent
+        editor: editor
+        edition: edition
+        idno_isbn: ISBN
+        idno_issn: ISSN
+        title_a_main: analytic_main_title
+        title_j_main: journal_main_title
+        title_m_main: monographic_main_title
+        title_s_main: series_main_title
+        title_u_main: unpublished_main_title
+        title_a_sub: analytic_sub_title
+        title_j_sub: journal_sub_title
+        title_m_sub: monographic_sub_title
+        title_s_sub: series_sub_title
+        title_u_sub: unpublished_sub_title
 
 services:
     # default configuration for services in *this* file
@@ -100,11 +123,14 @@ services:
               arguments:
                   - '%tei_dir%'
                   - '%tei_sample_dir%'
+                  - '%lit_dir%'
                   - '%env(GITLAB_REPO_TOKEN)%'
                   - '%env(GITLAB_REPO_TREE_URL)%'
                   - '%env(GITLAB_PROCESSED_TEI_REPO_URL)%'
                   - '%env(INVALIDE_TEI_LIST_FILE)%'
                   - '%env(SAMPLE_TEI_DOCUMENT_URL)%'
+                  - '%env(GITLAB_LIT_REPO_URL)%'
+                  - '%env(GITLAB_PROCESSED_LIT_REPO_URL)%'
 
     App\Index\Indexer:
         calls:
@@ -112,6 +138,8 @@ services:
               arguments:
                   - '%tei_dir%'
                   - '%tei_sample_dir%'
+                  - '%lit_dir%'
+                  - '%literatur_data_elements%'
 
     App\Transform\MetadataTransformer:
         calls:
diff --git a/solr/gfl/conf/schema.xml b/solr/gfl/conf/schema.xml
index ced49b75d4cc62b8fea6945a94829bc0c2ee010b..dd5bc1becf9beafcffaaebbc0f3162d427594ad3 100755
--- a/solr/gfl/conf/schema.xml
+++ b/solr/gfl/conf/schema.xml
@@ -92,6 +92,8 @@
+
+
diff --git a/solr/gfloffline/conf/schema.xml b/solr/gfloffline/conf/schema.xml
index 00e8f1543cd0034badee7cb7b1995ca05feb4e45..70e6bc7d636426a75c5ea49f8cdb3adc55815df0 100755
--- a/solr/gfloffline/conf/schema.xml
+++ b/solr/gfloffline/conf/schema.xml
@@ -91,6 +91,8 @@
+
+
diff --git a/src/Command/SolrIndexing.php b/src/Command/SolrIndexing.php
index f72624130d211d08d888fe1b94b3ba5e36cc3e7b..0a6efe52961da84dc876019e50a350de6bcdd998 100644
--- a/src/Command/SolrIndexing.php
+++ b/src/Command/SolrIndexing.php
@@ -44,7 +44,8 @@ class SolrIndexing extends Command
         $output->writeln('Start solr indexing.');
         $this->importer->import($server);
         $this->indexer->deleteSolrIndex();
-        $this->indexer->tei2solr($server);
+        $this->indexer->tei2Solr($server);
+        $this->indexer->lit2Solr();
         $time = microtime(true) - $_SERVER['REQUEST_TIME_FLOAT'];
         $time /= 60;
         $output->writeln('Indexing process completed in '.$time.' minutes.');
diff --git a/src/Import/Importer.php b/src/Import/Importer.php
index cffd9d9db71b566af54dd2a610d9f29ef8730d68..5e90c1bef8a56b24bba1d0c438190a663d6f4e62 100644
--- a/src/Import/Importer.php
+++ b/src/Import/Importer.php
@@ -19,21 +19,27 @@ class Importer implements ImporterInterface
     private ?string $gitlabRepoTreeUrl;
     private ?string $invalidTeiListFile;
     private ?string $sampleTeiDocumentUrl;
+    private ?string $litDir;
+    private ?string $gitlabLitRepoUrl;
+    private ?string $gitlabProcessedLitRepoUrl;
 
     public function __construct(FileService $fileService)
     {
         $this->fileService = $fileService;
     }
 
-    public function setConfigs(string $teiDir, string $teiSampleDir, string $gitlabRepoToken, string $gitlabRepoTreeUrl, string $gitlabProcessedTeiRepoUrl, string $invalidTeiListFile, string $sampleTeiDocumentUrl): void
+    public function setConfigs(string $teiDir, string $teiSampleDir, string $litDir, string $gitlabRepoToken, string $gitlabRepoTreeUrl, string $gitlabProcessedTeiRepoUrl, string $invalidTeiListFile, string $sampleTeiDocumentUrl, string $gitlabLitRepoUrl, string $gitlabProcessedLitRepoUrl): void
     {
         $this->teiDir = $teiDir;
         $this->teiSampleDir = $teiSampleDir;
+        $this->litDir = $litDir;
         $this->gitlabRepoToken = $gitlabRepoToken;
         $this->gitlabRepoTreeUrl = $gitlabRepoTreeUrl;
         $this->gitlabProcessedTeiRepoUrl = $gitlabProcessedTeiRepoUrl;
         $this->invalidTeiListFile = $invalidTeiListFile;
         $this->sampleTeiDocumentUrl = $sampleTeiDocumentUrl;
+        $this->gitlabLitRepoUrl = $gitlabLitRepoUrl;
+        $this->gitlabProcessedLitRepoUrl = $gitlabProcessedLitRepoUrl;
     }
 
     public function importTeiToS3Storage(): void
@@ -41,16 +47,18 @@ class Importer implements ImporterInterface
         $mainFileSystem = $this->fileService->getMainFilesystem();
         $mainFileSystem->deleteDir('tei');
         $teiFilesystem = $this->fileService->getTeiFilesystem();
-
         $sampleTeiDocument = $this->getSampleTEIDocument();
+
         if (!empty($sampleTeiDocument)) {
             $teiFilesystem->write('sample.xml', $sampleTeiDocument);
         }
 
         $filesystem = new Filesystem();
+
         if (!$filesystem->exists($this->teiDir)) {
             $filesystem->mkdir($this->teiDir);
         }
+
         $invalidTeiList = $this->getInvalidTeiList();
 
         for ($i = 1; $i <= 100; ++$i) {
@@ -85,12 +93,61 @@ class Importer implements ImporterInterface
         }
     }
 
+    /**
+     * Downloads all files listed in the GitLab literature repository into the literature directory.
+     *
+     * The import is best effort: a file that cannot be downloaded or written is reported and skipped.
+     */
+    public function importLiterature(): void
+    {
+        $filesystem = new Filesystem();
+
+        if (!$filesystem->exists($this->litDir)) {
+            $filesystem->mkdir($this->litDir);
+        }
+
+        // file_get_contents() reports a failed request by returning false, it does not throw.
+        $files = file_get_contents($this->gitlabLitRepoUrl.'&page=1&access_token='.$this->gitlabRepoToken);
+        $files = is_string($files) ? json_decode($files, true) : null;
+
+        if (!is_array($files)) {
+            echo 'Literature files list could not be imported from gitlab';
+
+            return;
+        }
+
+        foreach ($files as $file) {
+            if (!is_string($file['name'] ?? null)) {
+                // Not a file entry, e.g. a field of an error response.
+                continue;
+            }
+
+            $fileUrl = $this->gitlabProcessedLitRepoUrl.rawurlencode($file['name']).'?access_token='.$this->gitlabRepoToken.'&ref=master';
+            $fileData = file_get_contents($fileUrl);
+            $fileData = is_string($fileData) ? json_decode($fileData, true) : null;
+
+            if (!is_array($fileData) || !is_string($fileData['content'] ?? null)) {
+                // TODO retry to download the file again
+                echo $file['name'].' could not be imported.';
+
+                continue;
+            }
+
+            try {
+                $filesystem->dumpFile($this->litDir.$file['name'], base64_decode($fileData['content']));
+            } catch (FileException|\Symfony\Component\Filesystem\Exception\IOExceptionInterface $exception) {
+                echo $file['name'].' could not be imported.';
+            }
+        }
+    }
+
     public function import(string $server): void
     {
         if ('dev' === $server) {
             $this->importSampleTeiDocument();
         }
 
+        $this->importLiterature();
         $filesystem = new Filesystem();
         if (!$filesystem->exists($this->teiDir)) {
             $filesystem->mkdir($this->teiDir);
diff --git a/src/Index/Indexer.php b/src/Index/Indexer.php
index 70e701f67607d4c9c047926c0f3459222b9c3aea..544c474ec39879f612f7572a981ec24e22a26075 100644
--- a/src/Index/Indexer.php
+++ b/src/Index/Indexer.php
@@ -24,13 +24,16 @@ class Indexer implements IndexerInterface
     private const PAGE_DOC_TYPE = 'page';
     private const NOTE_DOC_TYPE = 'note';
     private const ENTITY_DOC_TYPE = 'entity';
+    private const LITERATURE_DOC_TYPE = 'literature';
     private Client $client;
     private EditedTextService $editedTextService;
     private PreProcessingService $preProcessingService;
-    private ?string $teiDir = null;
-    private ?string $teiSampleDir = null;
     private TranscriptionService $transcriptionService;
     private MetadataTransformerInterface $metadataTransformer;
+    private ?string $teiDir = null;
+    private ?string $teiSampleDir = null;
+    private ?string $litDir = null;
+    private ?array $literaturDataElements = null;
 
     public function __construct(
         Client $client,
@@ -46,6 +49,14 @@ class Indexer implements IndexerInterface
         $this->metadataTransformer = $metadataTransformer;
     }
 
+    public function setConfigs(string $teiDir, string $teiSampleDir, string $litDir, array $literaturDataElements): void
+    {
+        $this->teiDir = $teiDir;
+        $this->teiSampleDir = $teiSampleDir;
+        $this->litDir = $litDir;
+        $this->literaturDataElements = $literaturDataElements;
+    }
+
     public function deleteSolrIndex(): void
     {
         $update = $this->client->createUpdate();
@@ -119,13 +130,172 @@ class Indexer implements IndexerInterface
         return $solrDocument;
     }
 
-    public function setConfigs(string $teiDir, string $teiSampleDir): void
+    /**
+     * Indexes every <bibl> entry of the TEI files in the literature directory as a Solr document.
+     *
+     * Child elements of a <bibl> are translated to Solr field names through the configured
+     * literature data elements; children without a configured field name are ignored.
+     * Files that produce libxml errors while loading are skipped.
+     */
+    public function lit2Solr(): void
     {
-        $this->teiDir = $teiDir;
-        $this->teiSampleDir = $teiSampleDir;
+        $this->client->getEndpoint()->setOptions(['timeout' => 60, 'index_timeout' => 60]);
+        $finder = new Finder();
+        $finder->files()->in($this->litDir);
+        libxml_use_internal_errors(true);
+
+        foreach ($finder as $file) {
+            // libxml keeps collected errors until they are cleared. Reset them for every file,
+            // otherwise a single malformed file causes all following files to be skipped.
+            libxml_clear_errors();
+            $doc = new \DOMDocument();
+
+            if (!$doc->load($file->getRealPath()) || libxml_get_errors()) {
+                continue;
+            }
+
+            $xpath = new \DOMXPath($doc);
+            $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
+            $literature = $xpath->query('//tei:text//tei:body//tei:listBibl//tei:bibl');
+            // All entries of a file share one update, so Solr commits once per file instead of once per entry.
+            $update = $this->client->createUpdate();
+            $documentCount = 0;
+
+            foreach ($literature as $literatureItem) {
+                $id = str_replace('_', ' ', $this->getLiteratureAttributeValue($literatureItem, 0));
+
+                if ('' === $id) {
+                    // An entry without attributes has nothing to derive a Solr id from.
+                    continue;
+                }
+
+                $sdoc = $update->createDocument();
+                $sdoc->id = $id;
+                $sdoc->doctype = self::LITERATURE_DOC_TYPE;
+                $uri = [];
+                $author = [];
+                $publisher = [];
+                $pubPlace = [];
+                $edition = [];
+
+                foreach ($literatureItem->childNodes as $childNode) {
+                    if (!$childNode instanceof \DOMElement) {
+                        // Text nodes and comments between the elements carry no data.
+                        continue;
+                    }
+
+                    $nodeName = $childNode->nodeName;
+                    $text = trim(preg_replace('/\s+/', ' ', $childNode->nodeValue));
+
+                    if ('relatedItem' === $nodeName) {
+                        foreach ($childNode->childNodes as $childChildNode) {
+                            if ('ref' === $childChildNode->nodeName) {
+                                $ref = $this->getLiteratureAttributeValue($childChildNode, 0);
+
+                                if ('' !== $ref && '_' !== $ref) {
+                                    $uri[] = $ref;
+                                }
+                            }
+                        }
+                    } elseif ('author' === $nodeName) {
+                        $author = array_merge($author, $this->getLiteratureChildTexts($childNode));
+                    } elseif ('publisher' === $nodeName) {
+                        $publisher = array_merge($publisher, $this->getLiteratureChildTexts($childNode));
+                    } elseif ('pubPlace' === $nodeName) {
+                        $pubPlace = array_merge($pubPlace, $this->getLiteratureChildTexts($childNode));
+                    } elseif ('edition' === $nodeName) {
+                        $edition = array_merge($edition, $this->getLiteratureChildTexts($childNode));
+                    } else {
+                        $element = $nodeName;
+                        $firstAttribute = $this->getLiteratureAttributeValue($childNode, 0);
+
+                        if ('title' === $nodeName) {
+                            $element = 'title_'.$firstAttribute.'_'.$this->getLiteratureAttributeValue($childNode, 1);
+                        } elseif ('idno' === $nodeName) {
+                            $element = 'idno_'.strtolower($firstAttribute);
+                        } elseif ('biblScope' === $nodeName) {
+                            $element = 'biblScope_'.$firstAttribute;
+                            $secondAttribute = $childNode->attributes->item(1);
+
+                            // NOTE(review): this appends the value of "n", while the configured keys end in
+                            // the literal "_n" - confirm which of the two is intended.
+                            if (null !== $secondAttribute && 'n' === $secondAttribute->nodeName) {
+                                $element .= '_'.$secondAttribute->nodeValue;
+                            }
+                        }
+
+                        // Elements without a configured Solr field name are not indexed.
+                        $name = $this->literaturDataElements[$element] ?? null;
+
+                        // Titles are set even when their text is empty, all other fields only with content.
+                        if (!empty($name) && ('title' === $nodeName || '' !== $text)) {
+                            $sdoc->$name = $text;
+                        }
+                    }
+                }
+
+                if ([] !== $uri) {
+                    $sdoc->uri = $uri;
+                }
+
+                if ([] !== $author) {
+                    $sdoc->literature_author = $author;
+                }
+
+                if ([] !== $publisher) {
+                    $sdoc->publisher = $publisher;
+                }
+
+                if ([] !== $pubPlace) {
+                    $sdoc->pub_place = $pubPlace;
+                }
+
+                if ([] !== $edition) {
+                    $sdoc->edition = $edition;
+                }
+
+                $update->addDocument($sdoc);
+                ++$documentCount;
+            }
+
+            if (0 !== $documentCount) {
+                $update->addCommit();
+                $this->client->execute($update);
+            }
+        }
     }
 
+    /**
+     * Returns the value of the attribute at the given position, or an empty string if there is none.
+     */
+    private function getLiteratureAttributeValue(\DOMNode $node, int $position): string
+    {
+        $attribute = null !== $node->attributes ? $node->attributes->item($position) : null;
+
+        return null !== $attribute ? (string) $attribute->nodeValue : '';
+    }
+
+    /**
+     * Returns the whitespace-normalised, non-empty texts of the direct child nodes of the given node.
+     *
+     * @return string[]
+     */
+    private function getLiteratureChildTexts(\DOMNode $node): array
+    {
+        $texts = [];
+
+        foreach ($node->childNodes as $item) {
+            $text = trim(preg_replace('/\s+/', ' ', (string) $item->nodeValue));
+
+            if ('' !== $text) {
+                $texts[] = $text;
+            }
+        }
+
+        return $texts;
+    }
+
-    public function tei2solr(string $server): void
+    public function tei2Solr(string $server): void
     {
         $this->client->getEndpoint()->setOptions(['timeout' => 60, 'index_timeout' => 60]);
         $finder = new Finder();
@@ -318,16 +488,12 @@ class Indexer implements IndexerInterface
     {
         $xpath = new DOMXPath($doc);
         $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
-
         $id = $this->getId($xpath);
         $fulltext = $this->getFulltext($xpath);
         $abstracts = $this->getAbstracts($xpath);
-
         $docType = self::ARTICLE_DOC_TYPE;
-
         $shortTitle = $this->metadataTransformer->getShortTitle($xpath);
         $title = $this->metadataTransformer->getTitle($xpath);
-
         $originPlaceGNDNode = $xpath->query('//tei:name[@type="place" and @subtype="orn"]/@ref');
 
         if ($originPlaceGNDNode->item(0)) {
diff --git a/src/Service/SolrSearchService.php b/src/Service/SolrSearchService.php
index c512e193d0ea867a017c4039870d7b440a878d72..b8212bfa2fcae5470b91506b52b27de1c0a1293c 100755
--- a/src/Service/SolrSearchService.php
+++ b/src/Service/SolrSearchService.php
@@ -219,7 +219,7 @@ class SolrSearchService implements SearchServiceInterface
 
     public function getLiterature(): array
     {
-        $select = $this->client->createSelect()->setRows(200);
+        $select = $this->client->createSelect()->setRows(1000);
         $query = vsprintf('%s:%s', ['doctype', 'literature']);
         $select->setQuery($query)->addSort('id', 'asc');
 