Commit cec8cfc2 authored by asajedi's avatar asajedi
Browse files

Merge branch 'indexBibliography' into 'master'

Index bibliography

Closes #99

See merge request !61
parents e018e407 1241c6b7
Pipeline #256721 failed with stages
in 10 minutes and 5 seconds
......@@ -25,12 +25,35 @@ parameters:
SUB_HSD: 'SUB HSD Niedersächsische Staats- und Universitätsbibliothek Göttingen, Abteilung Handschriften und Seltene Drucke'
tei_dir: '%kernel.project_dir%/data/teis/'
tei_sample_dir: '%kernel.project_dir%/data/sampletei/'
lit_dir: '%kernel.project_dir%/data/lit/'
document_languages:
eng: Englisch
fre: Französisch
ger: Deutsch
ita: Italienisch
lat: Latein
literatur_data_elements:
biblScope_volume_n: biblScope_volume_n
biblScope_part_n: biblScope_part_n
biblScope_pages: biblScope_pages
biblScope_part: biblScope_part
biblScope_volume: biblScope_volume
date: lit_pub_date
extent: extent
editor: editor
edition: edition
idno_isbn: ISBN
idno_issn: ISSN
title_a_main: analytic_main_title
title_j_main: journal_main_title
title_m_main: monographic_main_title
title_s_main: series_main_title
title_u_main: unpublished_main_title
title_a_sub: analytic_sub_title
title_j_sub: journal_sub_title
title_m_sub: monographic_sub_title
title_s_sub: series_sub_title
title_u_sub: unpublished_sub_title
services:
# default configuration for services in *this* file
......@@ -100,11 +123,14 @@ services:
arguments:
- '%tei_dir%'
- '%tei_sample_dir%'
- '%lit_dir%'
- '%env(GITLAB_REPO_TOKEN)%'
- '%env(GITLAB_REPO_TREE_URL)%'
- '%env(GITLAB_PROCESSED_TEI_REPO_URL)%'
- '%env(INVALIDE_TEI_LIST_FILE)%'
- '%env(SAMPLE_TEI_DOCUMENT_URL)%'
- '%env(GITLAB_LIT_REPO_URL)%'
- '%env(GITLAB_PROCESSED_LIT_REPO_URL)%'
App\Index\Indexer:
calls:
......@@ -112,6 +138,8 @@ services:
arguments:
- '%tei_dir%'
- '%tei_sample_dir%'
- '%lit_dir%'
- '%literatur_data_elements%'
App\Transform\MetadataTransformer:
calls:
......
......@@ -92,6 +92,8 @@
<field name="extent" type="string" multiValued="true"/>
<field name="biblScope" type="string" multiValued="true"/>
<field name="biblScope_volume_n" type="string" multiValued="true"/>
<field name="biblScope_part_n" type="string" multiValued="true"/>
<field name="biblScope_part" type="string" multiValued="true"/>
<field name="biblScope_volume" type="string" multiValued="true"/>
<field name="biblScope_pages" type="string" multiValued="true"/>
......
......@@ -91,6 +91,8 @@
<field name="extent" type="string" multiValued="true"/>
<field name="biblScope" type="string" multiValued="true"/>
<field name="biblScope_volume_n" type="string" multiValued="true"/>
<field name="biblScope_part_n" type="string" multiValued="true"/>
<field name="biblScope_part" type="string" multiValued="true"/>
<field name="biblScope_volume" type="string" multiValued="true"/>
<field name="biblScope_pages" type="string" multiValued="true"/>
......
......@@ -44,7 +44,8 @@ class SolrIndexing extends Command
$output->writeln('Start solr indexing.');
$this->importer->import($server);
$this->indexer->deleteSolrIndex();
$this->indexer->tei2solr($server);
$this->indexer->tei2Solr($server);
$this->indexer->lit2Solr();
$time = microtime(true) - $_SERVER['REQUEST_TIME_FLOAT'];
$time /= 60;
$output->writeln('Indexing process completed in '.$time.' minutes.');
......
......@@ -19,21 +19,27 @@ class Importer implements ImporterInterface
private ?string $gitlabRepoTreeUrl;
private ?string $invalidTeiListFile;
private ?string $sampleTeiDocumentUrl;
private ?string $litDir;
private ?string $gitlabLitRepoUrl;
private ?string $gitlabProcessedLitRepoUrl;
/**
 * Stores the file service used by the import routines to obtain their filesystems.
 *
 * All remaining settings (directories, GitLab URLs, token) are supplied
 * separately through setConfigs().
 */
public function __construct(FileService $fileService)
{
$this->fileService = $fileService;
}
public function setConfigs(string $teiDir, string $teiSampleDir, string $gitlabRepoToken, string $gitlabRepoTreeUrl, string $gitlabProcessedTeiRepoUrl, string $invalidTeiListFile, string $sampleTeiDocumentUrl): void
/**
 * Injects the importer settings: local target directories, GitLab access
 * data for the TEI documents and the GitLab endpoints for the literature files.
 *
 * The argument order is fixed by the service definition that calls this setter.
 */
public function setConfigs(string $teiDir, string $teiSampleDir, string $litDir, string $gitlabRepoToken, string $gitlabRepoTreeUrl, string $gitlabProcessedTeiRepoUrl, string $invalidTeiListFile, string $sampleTeiDocumentUrl, string $gitlabLitRepoUrl, string $gitlabProcessedLitRepoUrl): void
{
    // Local directories the downloaded files are written to.
    $this->litDir = $litDir;
    $this->teiSampleDir = $teiSampleDir;
    $this->teiDir = $teiDir;

    // Token appended to the GitLab request URLs.
    $this->gitlabRepoToken = $gitlabRepoToken;

    // TEI related endpoints and files.
    $this->sampleTeiDocumentUrl = $sampleTeiDocumentUrl;
    $this->invalidTeiListFile = $invalidTeiListFile;
    $this->gitlabProcessedTeiRepoUrl = $gitlabProcessedTeiRepoUrl;
    $this->gitlabRepoTreeUrl = $gitlabRepoTreeUrl;

    // Literature (bibliography) endpoints.
    $this->gitlabProcessedLitRepoUrl = $gitlabProcessedLitRepoUrl;
    $this->gitlabLitRepoUrl = $gitlabLitRepoUrl;
}
public function importTeiToS3Storage(): void
......@@ -41,16 +47,18 @@ class Importer implements ImporterInterface
$mainFileSystem = $this->fileService->getMainFilesystem();
$mainFileSystem->deleteDir('tei');
$teiFilesystem = $this->fileService->getTeiFilesystem();
$sampleTeiDocument = $this->getSampleTEIDocument();
if (!empty($sampleTeiDocument)) {
$teiFilesystem->write('sample.xml', $sampleTeiDocument);
}
$filesystem = new Filesystem();
if (!$filesystem->exists($this->teiDir)) {
$filesystem->mkdir($this->teiDir);
}
$invalidTeiList = $this->getInvalidTeiList();
for ($i = 1; $i <= 100; ++$i) {
......@@ -85,12 +93,53 @@ class Importer implements ImporterInterface
}
}
/**
 * Downloads the literature (bibliography) files from GitLab into the local literature directory.
 *
 * The file listing is requested from the configured literature repository URL; every listed
 * file is then fetched from the processed-literature URL, its base64 encoded "content" is
 * decoded and written to the literature directory under the listed name.
 *
 * The import is best effort: a file that cannot be downloaded, decoded or written is
 * reported on the output and the remaining files are still processed.
 */
public function importLiterature(): void
{
    $filesystem = new Filesystem();

    if (!$filesystem->exists($this->litDir)) {
        $filesystem->mkdir($this->litDir);
    }

    // NOTE(review): only page 1 of the listing is requested. If the listing endpoint paginates
    // and the configured URL does not raise the page size, later pages are never imported.
    $files = $this->fetchLiteratureJson($this->gitlabLitRepoUrl.'&page=1&access_token='.$this->gitlabRepoToken);

    if (null === $files) {
        // file_get_contents() signals failure by returning false, not by throwing, so this
        // message has to be emitted from an explicit check rather than from a catch block.
        echo 'Literature files list could not be imported from gitlab';

        return;
    }

    foreach ($files as $file) {
        // Ignore listing entries that do not carry a usable file name.
        if (!is_array($file) || !isset($file['name']) || !is_string($file['name']) || '' === $file['name']) {
            continue;
        }

        $fileName = $file['name'];

        // The name becomes a URL path segment, so it must be percent-encoded.
        $fileData = $this->fetchLiteratureJson(
            $this->gitlabProcessedLitRepoUrl.rawurlencode($fileName).'?access_token='.$this->gitlabRepoToken.'&ref=master'
        );

        if (null === $fileData || !isset($fileData['content']) || !is_string($fileData['content'])) {
            // TODO retry to download the file again
            echo $fileName.' could not be imported.';

            continue;
        }

        try {
            $filesystem->dumpFile($this->litDir.$fileName, base64_decode($fileData['content']));
        } catch (FileException|\Symfony\Component\Filesystem\Exception\IOExceptionInterface $exception) {
            // A write failure must not abort the import of the remaining files.
            echo $fileName.' could not be imported.';
        }
    }
}

/**
 * Fetches a URL and decodes its body as a JSON array.
 *
 * @return array|null the decoded array, or null when the request failed or the body is not a JSON array/object
 */
private function fetchLiteratureJson(string $url): ?array
{
    $response = file_get_contents($url);

    if (!is_string($response)) {
        return null;
    }

    $decoded = json_decode($response, true);

    return is_array($decoded) ? $decoded : null;
}
public function import(string $server): void
{
if ('dev' === $server) {
$this->importSampleTeiDocument();
}
$this->importLiterature();
$filesystem = new Filesystem();
if (!$filesystem->exists($this->teiDir)) {
$filesystem->mkdir($this->teiDir);
......
......@@ -24,13 +24,16 @@ class Indexer implements IndexerInterface
private const PAGE_DOC_TYPE = 'page';
private const NOTE_DOC_TYPE = 'note';
private const ENTITY_DOC_TYPE = 'entity';
private const LITERATURE_DOC_TYPE = 'literature';
private Client $client;
private EditedTextService $editedTextService;
private PreProcessingService $preProcessingService;
private ?string $teiDir = null;
private ?string $teiSampleDir = null;
private TranscriptionService $transcriptionService;
private MetadataTransformerInterface $metadataTransformer;
private ?string $teiDir = null;
private ?string $teiSampleDir = null;
private ?string $litDir;
private ?array $literaturDataElements;
public function __construct(
Client $client,
......@@ -46,6 +49,14 @@ class Indexer implements IndexerInterface
$this->metadataTransformer = $metadataTransformer;
}
/**
 * Injects the indexer settings.
 *
 * @param string $teiDir                directory holding the TEI documents
 * @param string $teiSampleDir          directory holding the sample TEI document
 * @param string $litDir                directory holding the literature files
 * @param array  $literaturDataElements map used to translate literature element keys into Solr field names
 */
public function setConfigs(string $teiDir, string $teiSampleDir, string $litDir, array $literaturDataElements): void
{
    // Literature related settings.
    $this->literaturDataElements = $literaturDataElements;
    $this->litDir = $litDir;

    // TEI related settings.
    $this->teiSampleDir = $teiSampleDir;
    $this->teiDir = $teiDir;
}
public function deleteSolrIndex(): void
{
$update = $this->client->createUpdate();
......@@ -119,13 +130,151 @@ class Indexer implements IndexerInterface
return $solrDocument;
}
public function setConfigs(string $teiDir, string $teiSampleDir): void
/**
 * Indexes every bibliography entry found in the literature directory into Solr.
 *
 * Each file below the configured literature directory is parsed as XML; a file that cannot
 * be loaded or that produces libxml errors is skipped. Every tei:bibl element located under
 * tei:text//tei:body//tei:listBibl is sent to Solr as one document.
 */
public function lit2Solr(): void
{
    $this->client->getEndpoint()->setOptions(['timeout' => 60, 'index_timeout' => 60]);

    $finder = new Finder();
    $finder->files()->in($this->litDir);

    libxml_use_internal_errors(true);

    foreach ($finder as $file) {
        // The libxml error buffer is cumulative. Without clearing it, a single problematic
        // file would make every following file look broken as well.
        libxml_clear_errors();

        $doc = new \DOMDocument();
        $loaded = $doc->load($file->getRealPath());

        if (!$loaded || [] !== libxml_get_errors()) {
            continue;
        }

        $xpath = new \DOMXPath($doc);
        $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
        $literature = $xpath->query('//tei:text//tei:body//tei:listBibl//tei:bibl');

        if (false === $literature) {
            continue;
        }

        foreach ($literature as $literatureItem) {
            if ($literatureItem instanceof \DOMElement) {
                $this->indexBiblNode($literatureItem);
            }
        }
    }
}

/**
 * Builds the Solr document for one bibliography element and sends it to Solr.
 *
 * The value of the element's first attribute, with underscores replaced by spaces, becomes
 * the document id; an element without any attribute is skipped because it has no id.
 */
private function indexBiblNode(\DOMElement $bibl): void
{
    $idAttribute = $bibl->attributes->item(0);

    if (null === $idAttribute) {
        return;
    }

    $update = $this->client->createUpdate();
    $sdoc = $update->createDocument();
    $sdoc->id = str_replace('_', ' ', $idAttribute->textContent);
    $sdoc->doctype = self::LITERATURE_DOC_TYPE;

    $uri = [];
    $author = [];
    $publisher = [];
    $pubPlace = [];
    $edition = [];

    foreach ($bibl->childNodes as $childNode) {
        // Only elements carry data; text, comment and similar nodes are ignored.
        if (!$childNode instanceof \DOMElement) {
            continue;
        }

        $text = $this->squashWhitespace($childNode->nodeValue);
        $field = null;
        // Titles are stored even when their text is empty; all other mapped fields are not.
        $allowEmptyText = false;

        if ('relatedItem' === $childNode->nodeName) {
            $uri = array_merge($uri, $this->collectBiblRefTargets($childNode));
        } elseif ('author' === $childNode->nodeName) {
            $author = array_merge($author, $this->collectBiblChildTexts($childNode));
        } elseif ('publisher' === $childNode->nodeName) {
            $publisher = array_merge($publisher, $this->collectBiblChildTexts($childNode));
        } elseif ('pubPlace' === $childNode->nodeName) {
            $pubPlace = array_merge($pubPlace, $this->collectBiblChildTexts($childNode));
        } elseif ('edition' === $childNode->nodeName) {
            $edition = array_merge($edition, $this->collectBiblChildTexts($childNode));
        } elseif ('title' === $childNode->nodeName) {
            // The lookup key is built from the values of the first two attributes.
            $first = $this->biblAttributeValue($childNode, 0);
            $second = $this->biblAttributeValue($childNode, 1);

            if (null !== $first && null !== $second) {
                $field = $this->mapLiteratureField('title_'.$first.'_'.$second);
                $allowEmptyText = true;
            }
        } elseif ('idno' === $childNode->nodeName) {
            $first = $this->biblAttributeValue($childNode, 0);

            if (null !== $first) {
                $field = $this->mapLiteratureField('idno_'.strtolower($first));
            }
        } elseif ('biblScope' === $childNode->nodeName) {
            $first = $this->biblAttributeValue($childNode, 0);

            if (null !== $first) {
                $key = 'biblScope_'.$first;
                // A second attribute is optional; it extends the key only when it is named "n".
                $secondAttribute = $childNode->attributes->item(1);

                if (null !== $secondAttribute && 'n' === $secondAttribute->nodeName) {
                    $key .= '_'.$secondAttribute->nodeValue;
                }

                $field = $this->mapLiteratureField($key);
            }
        } else {
            $field = $this->mapLiteratureField($childNode->nodeName);
        }

        // Compare against '' instead of using empty() so that a literal "0" is kept.
        if (null !== $field && ($allowEmptyText || '' !== $text)) {
            $sdoc->$field = $text;
        }
    }

    if ([] !== $uri) {
        $sdoc->uri = $uri;
    }

    if ([] !== $author) {
        $sdoc->literature_author = $author;
    }

    if ([] !== $publisher) {
        $sdoc->publisher = $publisher;
    }

    if ([] !== $pubPlace) {
        $sdoc->pub_place = $pubPlace;
    }

    if ([] !== $edition) {
        $sdoc->edition = $edition;
    }

    // NOTE(review): one update with its own commit is executed per entry; batching per file
    // would reduce the number of commits if indexing time becomes a concern.
    $update->addDocument($sdoc);
    $update->addCommit();
    $this->client->execute($update);
}

/**
 * Returns the first-attribute values of all "ref" child elements, leaving out the placeholder "_".
 *
 * @return string[]
 */
private function collectBiblRefTargets(\DOMElement $relatedItem): array
{
    $targets = [];

    foreach ($relatedItem->childNodes as $candidate) {
        if ($candidate instanceof \DOMElement && 'ref' === $candidate->nodeName) {
            $target = $this->biblAttributeValue($candidate, 0);

            if (null !== $target && '_' !== $target) {
                $targets[] = $target;
            }
        }
    }

    return $targets;
}

/**
 * Returns the whitespace-normalised, non-empty values of all child nodes of the given element.
 *
 * @return string[]
 */
private function collectBiblChildTexts(\DOMElement $element): array
{
    $texts = [];

    foreach ($element->childNodes as $item) {
        $value = $this->squashWhitespace($item->nodeValue);

        if ('' !== $value) {
            $texts[] = $value;
        }
    }

    return $texts;
}

/**
 * Returns the value of the attribute at the given position, or null when there is none.
 */
private function biblAttributeValue(\DOMElement $element, int $index): ?string
{
    $attribute = $element->attributes->item($index);

    return null === $attribute ? null : $attribute->nodeValue;
}

/**
 * Translates a literature element key into its configured Solr field name, or null when unmapped.
 */
private function mapLiteratureField(string $key): ?string
{
    $field = $this->literaturDataElements[$key] ?? null;

    return (is_string($field) && '' !== $field) ? $field : null;
}

/**
 * Collapses every run of whitespace into a single space and trims the result.
 */
private function squashWhitespace(?string $value): string
{
    return trim((string) preg_replace('/\s+/', ' ', (string) $value));
}
public function tei2solr(string $server): void
public function tei2Solr(string $server): void
{
$this->client->getEndpoint()->setOptions(['timeout' => 60, 'index_timeout' => 60]);
$finder = new Finder();
......@@ -318,16 +467,12 @@ class Indexer implements IndexerInterface
{
$xpath = new DOMXPath($doc);
$xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
$id = $this->getId($xpath);
$fulltext = $this->getFulltext($xpath);
$abstracts = $this->getAbstracts($xpath);
$docType = self::ARTICLE_DOC_TYPE;
$shortTitle = $this->metadataTransformer->getShortTitle($xpath);
$title = $this->metadataTransformer->getTitle($xpath);
$originPlaceGNDNode = $xpath->query('//tei:name[@type="place" and @subtype="orn"]/@ref');
if ($originPlaceGNDNode->item(0)) {
......
......@@ -219,7 +219,7 @@ class SolrSearchService implements SearchServiceInterface
public function getLiterature(): array
{
$select = $this->client->createSelect()->setRows(200);
$select = $this->client->createSelect()->setRows(1000);
$query = vsprintf('%s:%s', ['doctype', 'literature']);
$select->setQuery($query)->addSort('id', 'asc');
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment