diff --git a/config/services.yaml b/config/services.yaml
index 88fdfd5bc6a5376afa11f92e3eeade5aa7b61d05..19c881f13ec29af224c2f48e8d7f64f477d0ab4d 100755
--- a/config/services.yaml
+++ b/config/services.yaml
@@ -25,12 +25,35 @@ parameters:
SUB_HSD: 'SUB HSD Niedersächsische Staats- und Universitätsbibliothek Göttingen, Abteilung Handschriften und Seltene Drucke'
tei_dir: '%kernel.project_dir%/data/teis/'
tei_sample_dir: '%kernel.project_dir%/data/sampletei/'
+ lit_dir: '%kernel.project_dir%/data/lit/'
document_languages:
eng: Englisch
fre: Französisch
ger: Deutsch
ita: Italienisch
lat: Latein
+ literatur_data_elements:
+ biblScope_volume_n: biblScope_volume_n
+ biblScope_part_n: biblScope_part_n
+ biblScope_pages: biblScope_pages
+ biblScope_part: biblScope_part
+ biblScope_volume: biblScope_volume
+ date: lit_pub_date
+ extent: extent
+ editor: editor
+ edition: edition
+ idno_isbn: ISBN
+ idno_issn: ISSN
+ title_a_main: analytic_main_title
+ title_j_main: journal_main_title
+ title_m_main: monographic_main_title
+ title_s_main: series_main_title
+ title_u_main: unpublished_main_title
+ title_a_sub: analytic_sub_title
+ title_j_sub: journal_sub_title
+ title_m_sub: monographic_sub_title
+ title_s_sub: series_sub_title
+ title_u_sub: unpublished_sub_title
services:
# default configuration for services in *this* file
@@ -100,11 +123,14 @@ services:
arguments:
- '%tei_dir%'
- '%tei_sample_dir%'
+ - '%lit_dir%'
- '%env(GITLAB_REPO_TOKEN)%'
- '%env(GITLAB_REPO_TREE_URL)%'
- '%env(GITLAB_PROCESSED_TEI_REPO_URL)%'
- '%env(INVALIDE_TEI_LIST_FILE)%'
- '%env(SAMPLE_TEI_DOCUMENT_URL)%'
+ - '%env(GITLAB_LIT_REPO_URL)%'
+ - '%env(GITLAB_PROCESSED_LIT_REPO_URL)%'
App\Index\Indexer:
calls:
@@ -112,6 +138,8 @@ services:
arguments:
- '%tei_dir%'
- '%tei_sample_dir%'
+ - '%lit_dir%'
+ - '%literatur_data_elements%'
App\Transform\MetadataTransformer:
calls:
diff --git a/solr/gfl/conf/schema.xml b/solr/gfl/conf/schema.xml
index ced49b75d4cc62b8fea6945a94829bc0c2ee010b..dd5bc1becf9beafcffaaebbc0f3162d427594ad3 100755
--- a/solr/gfl/conf/schema.xml
+++ b/solr/gfl/conf/schema.xml
@@ -92,6 +92,8 @@
+
+
diff --git a/solr/gfloffline/conf/schema.xml b/solr/gfloffline/conf/schema.xml
index 00e8f1543cd0034badee7cb7b1995ca05feb4e45..70e6bc7d636426a75c5ea49f8cdb3adc55815df0 100755
--- a/solr/gfloffline/conf/schema.xml
+++ b/solr/gfloffline/conf/schema.xml
@@ -91,6 +91,8 @@
+
+
diff --git a/src/Command/SolrIndexing.php b/src/Command/SolrIndexing.php
index f72624130d211d08d888fe1b94b3ba5e36cc3e7b..0a6efe52961da84dc876019e50a350de6bcdd998 100644
--- a/src/Command/SolrIndexing.php
+++ b/src/Command/SolrIndexing.php
@@ -44,7 +44,8 @@ class SolrIndexing extends Command
$output->writeln('Start solr indexing.');
$this->importer->import($server);
$this->indexer->deleteSolrIndex();
- $this->indexer->tei2solr($server);
+ $this->indexer->tei2Solr($server);
+ $this->indexer->lit2Solr();
$time = microtime(true) - $_SERVER['REQUEST_TIME_FLOAT'];
$time /= 60;
$output->writeln('Indexing process completed in '.$time.' minutes.');
diff --git a/src/Import/Importer.php b/src/Import/Importer.php
index cffd9d9db71b566af54dd2a610d9f29ef8730d68..5e90c1bef8a56b24bba1d0c438190a663d6f4e62 100644
--- a/src/Import/Importer.php
+++ b/src/Import/Importer.php
@@ -19,21 +19,27 @@ class Importer implements ImporterInterface
private ?string $gitlabRepoTreeUrl;
private ?string $invalidTeiListFile;
private ?string $sampleTeiDocumentUrl;
+ private ?string $litDir;
+ private ?string $gitlabLitRepoUrl;
+ private ?string $gitlabProcessedLitRepoUrl;
public function __construct(FileService $fileService)
{
$this->fileService = $fileService;
}
- public function setConfigs(string $teiDir, string $teiSampleDir, string $gitlabRepoToken, string $gitlabRepoTreeUrl, string $gitlabProcessedTeiRepoUrl, string $invalidTeiListFile, string $sampleTeiDocumentUrl): void
+    /**
+     * Store the import settings on the instance.
+     *
+     * The arguments are positional; their order mirrors the argument list declared for
+     * this call in config/services.yaml, so both places have to change together.
+     *
+     * @param string $litDir                    directory the literature files are written to
+     * @param string $gitlabRepoToken           access token appended to the GitLab request URLs
+     * @param string $gitlabLitRepoUrl          URL used to list the literature files
+     * @param string $gitlabProcessedLitRepoUrl URL prefix a literature file name is appended to
+     */
+    public function setConfigs(
+        string $teiDir,
+        string $teiSampleDir,
+        string $litDir,
+        string $gitlabRepoToken,
+        string $gitlabRepoTreeUrl,
+        string $gitlabProcessedTeiRepoUrl,
+        string $invalidTeiListFile,
+        string $sampleTeiDocumentUrl,
+        string $gitlabLitRepoUrl,
+        string $gitlabProcessedLitRepoUrl
+    ): void {
-    {
         $this->teiDir = $teiDir;
         $this->teiSampleDir = $teiSampleDir;
+        $this->litDir = $litDir;
         $this->gitlabRepoToken = $gitlabRepoToken;
         $this->gitlabRepoTreeUrl = $gitlabRepoTreeUrl;
         $this->gitlabProcessedTeiRepoUrl = $gitlabProcessedTeiRepoUrl;
         $this->invalidTeiListFile = $invalidTeiListFile;
         $this->sampleTeiDocumentUrl = $sampleTeiDocumentUrl;
+        $this->gitlabLitRepoUrl = $gitlabLitRepoUrl;
+        $this->gitlabProcessedLitRepoUrl = $gitlabProcessedLitRepoUrl;
     }
public function importTeiToS3Storage(): void
@@ -41,16 +47,18 @@ class Importer implements ImporterInterface
$mainFileSystem = $this->fileService->getMainFilesystem();
$mainFileSystem->deleteDir('tei');
$teiFilesystem = $this->fileService->getTeiFilesystem();
-
$sampleTeiDocument = $this->getSampleTEIDocument();
+
if (!empty($sampleTeiDocument)) {
$teiFilesystem->write('sample.xml', $sampleTeiDocument);
}
$filesystem = new Filesystem();
+
if (!$filesystem->exists($this->teiDir)) {
$filesystem->mkdir($this->teiDir);
}
+
$invalidTeiList = $this->getInvalidTeiList();
for ($i = 1; $i <= 100; ++$i) {
@@ -85,12 +93,53 @@ class Importer implements ImporterInterface
}
}
+    /**
+     * Download the literature files from GitLab into the local literature directory.
+     *
+     * The file list is requested from the configured listing URL. Every listed file is then
+     * fetched individually, its base64 encoded "content" is decoded and the result is written
+     * to $litDir under the listed name. Failures are reported on stdout and never abort the
+     * remaining downloads.
+     */
+    public function importLiterature(): void
+    {
+        $filesystem = new Filesystem();
+
+        if (!$filesystem->exists($this->litDir)) {
+            $filesystem->mkdir($this->litDir);
+        }
+
+        // NOTE(review): only page 1 of the listing is requested - confirm that the page size of
+        // the configured URL covers every literature file.
+        $listing = file_get_contents($this->gitlabLitRepoUrl.'&page=1&access_token='.$this->gitlabRepoToken);
+        $files = is_string($listing) ? json_decode($listing, true) : null;
+
+        // file_get_contents() signals failure with false (it does not throw) and json_decode()
+        // yields null for invalid JSON, so both cases have to be checked explicitly.
+        if (!is_array($files)) {
+            echo 'Literature files list could not be imported from gitlab';
+
+            return;
+        }
+
+        foreach ($files as $file) {
+            if (!is_array($file) || !isset($file['name'])) {
+                continue;
+            }
+
+            $name = (string) $file['name'];
+            // The name becomes part of the URL path, so it has to be percent-encoded.
+            $litFileUrl = $this->gitlabProcessedLitRepoUrl.rawurlencode($name).'?access_token='.$this->gitlabRepoToken.'&ref=master';
+            $response = file_get_contents($litFileUrl);
+            $fileData = is_string($response) ? json_decode($response, true) : null;
+
+            if (!is_array($fileData) || !isset($fileData['content'])) {
+                // TODO retry to download the file again
+                echo $name.' could not be imported.';
+
+                continue;
+            }
+
+            try {
+                $filesystem->dumpFile($this->litDir.$name, base64_decode((string) $fileData['content']));
+            } catch (FileException|\Symfony\Component\Filesystem\Exception\IOExceptionInterface $exception) {
+                // NOTE(review): FileException is kept from the previous version; the Symfony IO
+                // exception is caught as well because that is what dumpFile() is documented to throw.
+                echo $name.' could not be imported.';
+            }
+        }
+    }
+
public function import(string $server): void
{
if ('dev' === $server) {
$this->importSampleTeiDocument();
}
+ $this->importLiterature();
$filesystem = new Filesystem();
if (!$filesystem->exists($this->teiDir)) {
$filesystem->mkdir($this->teiDir);
diff --git a/src/Index/Indexer.php b/src/Index/Indexer.php
index 70e701f67607d4c9c047926c0f3459222b9c3aea..544c474ec39879f612f7572a981ec24e22a26075 100644
--- a/src/Index/Indexer.php
+++ b/src/Index/Indexer.php
@@ -24,13 +24,16 @@ class Indexer implements IndexerInterface
private const PAGE_DOC_TYPE = 'page';
private const NOTE_DOC_TYPE = 'note';
private const ENTITY_DOC_TYPE = 'entity';
+ private const LITERATURE_DOC_TYPE = 'literature';
private Client $client;
private EditedTextService $editedTextService;
private PreProcessingService $preProcessingService;
- private ?string $teiDir = null;
- private ?string $teiSampleDir = null;
private TranscriptionService $transcriptionService;
private MetadataTransformerInterface $metadataTransformer;
+ private ?string $teiDir = null;
+ private ?string $teiSampleDir = null;
+ private ?string $litDir;
+ private ?array $literaturDataElements;
public function __construct(
Client $client,
@@ -46,6 +49,14 @@ class Indexer implements IndexerInterface
$this->metadataTransformer = $metadataTransformer;
}
+    /**
+     * Store the directory settings and the literature field map on the indexer.
+     *
+     * @param string                $teiDir                directory of the TEI files
+     * @param string                $teiSampleDir          directory of the sample TEI files
+     * @param string                $litDir                directory lit2Solr() scans for literature files
+     * @param array<string, string> $literaturDataElements lookup key => Solr field name, read by lit2Solr()
+     */
+    public function setConfigs(string $teiDir, string $teiSampleDir, string $litDir, array $literaturDataElements): void
+    {
+        $this->teiDir = $teiDir;
+        $this->teiSampleDir = $teiSampleDir;
+        $this->litDir = $litDir;
+        $this->literaturDataElements = $literaturDataElements;
+    }
+
public function deleteSolrIndex(): void
{
$update = $this->client->createUpdate();
@@ -119,13 +130,151 @@ class Indexer implements IndexerInterface
return $solrDocument;
}
- public function setConfigs(string $teiDir, string $teiSampleDir): void
+    /**
+     * Index the bibliography entries of all literature files into Solr.
+     *
+     * Every file below $litDir is parsed as XML; a file libxml reports errors for is skipped.
+     * Each tei:bibl below tei:text//tei:body//tei:listBibl becomes one Solr document with the
+     * literature doctype, which is added and committed on its own.
+     *
+     * Single-valued field names are resolved through $literaturDataElements; child elements
+     * without a mapping are ignored. Attributes are read by position, as before.
+     */
+    public function lit2Solr(): void
     {
-        $this->teiDir = $teiDir;
-        $this->teiSampleDir = $teiSampleDir;
+        $this->client->getEndpoint()->setOptions(['timeout' => 60, 'index_timeout' => 60]);
+        $finder = new Finder();
+        $finder->files()->in($this->litDir);
+
+        // Child elements collected as multi-valued fields: element name => Solr field name.
+        $multiValuedFields = [
+            'author' => 'literature_author',
+            'publisher' => 'publisher',
+            'pubPlace' => 'pub_place',
+            'edition' => 'edition',
+        ];
+
+        libxml_use_internal_errors(true);
+
+        foreach ($finder as $file) {
+            // The libxml error buffer is process wide and only grows; without clearing it a
+            // single malformed file would make every following file look broken as well.
+            libxml_clear_errors();
+            $doc = new \DOMDocument();
+            $loaded = $doc->load($file->getRealPath());
+
+            if (false === $loaded || [] !== libxml_get_errors()) {
+                continue;
+            }
+
+            $xpath = new \DOMXPath($doc);
+            $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
+            $literature = $xpath->query('//tei:text//tei:body//tei:listBibl//tei:bibl');
+
+            foreach ($literature as $literatureItem) {
+                // The first attribute of the entry carries its identifier; an entry without
+                // one cannot be addressed in the index and is skipped.
+                $identifier = $this->literatureAttribute($literatureItem, 0);
+
+                if (null === $identifier) {
+                    continue;
+                }
+
+                $update = $this->client->createUpdate();
+                $sdoc = $update->createDocument();
+                $sdoc->id = str_replace('_', ' ', $identifier);
+                $sdoc->doctype = self::LITERATURE_DOC_TYPE;
+
+                $uri = [];
+                $multiValues = array_fill_keys(array_keys($multiValuedFields), []);
+
+                foreach ($literatureItem->childNodes as $childNode) {
+                    // Text, comment and similar nodes never map to a field.
+                    if (XML_ELEMENT_NODE !== $childNode->nodeType) {
+                        continue;
+                    }
+
+                    $nodeName = $childNode->nodeName;
+
+                    if ('relatedItem' === $nodeName) {
+                        foreach ($childNode->childNodes as $relatedNode) {
+                            if ('ref' !== $relatedNode->nodeName) {
+                                continue;
+                            }
+
+                            $ref = $this->literatureAttribute($relatedNode, 0);
+
+                            // "_" is used as a placeholder for "no reference".
+                            if (null !== $ref && '_' !== $ref) {
+                                $uri[] = $ref;
+                            }
+                        }
+
+                        continue;
+                    }
+
+                    if (isset($multiValues[$nodeName])) {
+                        $multiValues[$nodeName] = array_merge(
+                            $multiValues[$nodeName],
+                            $this->literatureChildTexts($childNode)
+                        );
+
+                        continue;
+                    }
+
+                    // Build the lookup key for the field map.
+                    if ('title' === $nodeName) {
+                        $key = 'title_'.$this->literatureAttribute($childNode, 0).'_'.$this->literatureAttribute($childNode, 1);
+                    } elseif ('idno' === $nodeName) {
+                        $key = 'idno_'.strtolower((string) $this->literatureAttribute($childNode, 0));
+                    } elseif ('biblScope' === $nodeName) {
+                        $key = 'biblScope_'.$this->literatureAttribute($childNode, 0);
+                        $second = $childNode->attributes->item(1);
+
+                        if (null !== $second && 'n' === $second->nodeName) {
+                            $key .= '_'.$second->nodeValue;
+                        }
+                    } else {
+                        $key = $nodeName;
+                    }
+
+                    $field = $this->literaturDataElements[$key] ?? null;
+
+                    if (null === $field || '' === $field) {
+                        continue;
+                    }
+
+                    $text = $this->literatureText($childNode->nodeValue);
+
+                    // Titles are stored even when empty (unchanged behaviour); every other
+                    // field needs a value. A literal "0" counts as a value.
+                    if ('' === $text && 'title' !== $nodeName) {
+                        continue;
+                    }
+
+                    $sdoc->$field = $text;
+                }
+
+                if ([] !== $uri) {
+                    $sdoc->uri = $uri;
+                }
+
+                foreach ($multiValuedFields as $elementName => $field) {
+                    if ([] !== $multiValues[$elementName]) {
+                        $sdoc->$field = $multiValues[$elementName];
+                    }
+                }
+
+                $update->addDocument($sdoc);
+                $update->addCommit();
+                $this->client->execute($update);
+            }
+        }
     }
+
+    /**
+     * Collapse every whitespace run to a single blank and trim the result.
+     */
+    private function literatureText(?string $value): string
+    {
+        return trim((string) preg_replace('/\s+/', ' ', (string) $value));
+    }
+
+    /**
+     * Return the non-empty, whitespace-normalised values of all direct children of a node.
+     *
+     * @return string[]
+     */
+    private function literatureChildTexts(\DOMNode $node): array
+    {
+        $values = [];
+
+        foreach ($node->childNodes as $child) {
+            $value = $this->literatureText($child->nodeValue);
+
+            if ('' !== $value) {
+                $values[] = $value;
+            }
+        }
+
+        return $values;
+    }
+
+    /**
+     * Return the value of the attribute at the given position, or null when it does not exist.
+     */
+    private function literatureAttribute(\DOMNode $node, int $index): ?string
+    {
+        $attribute = null !== $node->attributes ? $node->attributes->item($index) : null;
+
+        return null !== $attribute ? $attribute->nodeValue : null;
+    }
- public function tei2solr(string $server): void
+ public function tei2Solr(string $server): void
{
$this->client->getEndpoint()->setOptions(['timeout' => 60, 'index_timeout' => 60]);
$finder = new Finder();
@@ -318,16 +467,12 @@ class Indexer implements IndexerInterface
{
$xpath = new DOMXPath($doc);
$xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
-
$id = $this->getId($xpath);
$fulltext = $this->getFulltext($xpath);
$abstracts = $this->getAbstracts($xpath);
-
$docType = self::ARTICLE_DOC_TYPE;
-
$shortTitle = $this->metadataTransformer->getShortTitle($xpath);
$title = $this->metadataTransformer->getTitle($xpath);
-
$originPlaceGNDNode = $xpath->query('//tei:name[@type="place" and @subtype="orn"]/@ref');
if ($originPlaceGNDNode->item(0)) {
diff --git a/src/Service/SolrSearchService.php b/src/Service/SolrSearchService.php
index c512e193d0ea867a017c4039870d7b440a878d72..b8212bfa2fcae5470b91506b52b27de1c0a1293c 100755
--- a/src/Service/SolrSearchService.php
+++ b/src/Service/SolrSearchService.php
@@ -219,7 +219,7 @@ class SolrSearchService implements SearchServiceInterface
public function getLiterature(): array
{
- $select = $this->client->createSelect()->setRows(200);
+ $select = $this->client->createSelect()->setRows(1000);
$query = vsprintf('%s:%s', ['doctype', 'literature']);
$select->setQuery($query)->addSort('id', 'asc');