Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Goethes Farbenlehre
gfl-app-new
Commits
cec8cfc2
Commit
cec8cfc2
authored
Dec 10, 2021
by
asajedi
Browse files
Merge branch 'indexBibliography' into 'master'
Index bibliography Closes
#99
See merge request
!61
parents
e018e407
1241c6b7
Pipeline
#256721
failed with stages
in 10 minutes and 5 seconds
Changes
7
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
config/services.yaml
View file @
cec8cfc2
...
...
@@ -25,12 +25,35 @@ parameters:
SUB_HSD
:
'
SUB
HSD
Niedersächsische
Staats-
und
Universitätsbibliothek
Göttingen,
Abteilung
Handschriften
und
Seltene
Drucke'
tei_dir
:
'
%kernel.project_dir%/data/teis/'
tei_sample_dir
:
'
%kernel.project_dir%/data/sampletei/'
lit_dir
:
'
%kernel.project_dir%/data/lit/'
document_languages
:
eng
:
Englisch
fre
:
Französisch
ger
:
Deutsch
ita
:
Italienisch
lat
:
Latein
literatur_data_elements
:
biblScope_volume_n
:
biblScope_volume_n
biblScope_part_n
:
biblScope_part_n
biblScope_pages
:
biblScope_pages
biblScope_part
:
biblScope_part
biblScope_volume
:
biblScope_volume
date
:
lit_pub_date
extent
:
extent
editor
:
editor
edition
:
edition
idno_isbn
:
ISBN
idno_issn
:
ISSN
title_a_main
:
analytic_main_title
title_j_main
:
journal_main_title
title_m_main
:
monographic_main_title
title_s_main
:
series_main_title
title_u_main
:
unpublished_main_title
title_a_sub
:
analytic_sub_title
title_j_sub
:
journal_sub_title
title_m_sub
:
monographic_sub_title
title_s_sub
:
series_sub_title
title_u_sub
:
unpublished_sub_title
services
:
# default configuration for services in *this* file
...
...
@@ -100,11 +123,14 @@ services:
arguments
:
-
'
%tei_dir%'
-
'
%tei_sample_dir%'
-
'
%lit_dir%'
-
'
%env(GITLAB_REPO_TOKEN)%'
-
'
%env(GITLAB_REPO_TREE_URL)%'
-
'
%env(GITLAB_PROCESSED_TEI_REPO_URL)%'
-
'
%env(INVALIDE_TEI_LIST_FILE)%'
-
'
%env(SAMPLE_TEI_DOCUMENT_URL)%'
-
'
%env(GITLAB_LIT_REPO_URL)%'
-
'
%env(GITLAB_PROCESSED_LIT_REPO_URL)%'
App\Index\Indexer
:
calls
:
...
...
@@ -112,6 +138,8 @@ services:
arguments
:
-
'
%tei_dir%'
-
'
%tei_sample_dir%'
-
'
%lit_dir%'
-
'
%literatur_data_elements%'
App\Transform\MetadataTransformer
:
calls
:
...
...
solr/gfl/conf/schema.xml
View file @
cec8cfc2
...
...
@@ -92,6 +92,8 @@
<field
name=
"extent"
type=
"string"
multiValued=
"true"
/>
<field
name=
"biblScope"
type=
"string"
multiValued=
"true"
/>
<field
name=
"biblScope_volume_n"
type=
"string"
multiValued=
"true"
/>
<field
name=
"biblScope_part_n"
type=
"string"
multiValued=
"true"
/>
<field
name=
"biblScope_part"
type=
"string"
multiValued=
"true"
/>
<field
name=
"biblScope_volume"
type=
"string"
multiValued=
"true"
/>
<field
name=
"biblScope_pages"
type=
"string"
multiValued=
"true"
/>
...
...
solr/gfloffline/conf/schema.xml
View file @
cec8cfc2
...
...
@@ -91,6 +91,8 @@
<field
name=
"extent"
type=
"string"
multiValued=
"true"
/>
<field
name=
"biblScope"
type=
"string"
multiValued=
"true"
/>
<field
name=
"biblScope_volume_n"
type=
"string"
multiValued=
"true"
/>
<field
name=
"biblScope_part_n"
type=
"string"
multiValued=
"true"
/>
<field
name=
"biblScope_part"
type=
"string"
multiValued=
"true"
/>
<field
name=
"biblScope_volume"
type=
"string"
multiValued=
"true"
/>
<field
name=
"biblScope_pages"
type=
"string"
multiValued=
"true"
/>
...
...
src/Command/SolrIndexing.php
View file @
cec8cfc2
...
...
@@ -44,7 +44,8 @@ class SolrIndexing extends Command
$output
->
writeln
(
'Start solr indexing.'
);
$this
->
importer
->
import
(
$server
);
$this
->
indexer
->
deleteSolrIndex
();
$this
->
indexer
->
tei2solr
(
$server
);
$this
->
indexer
->
tei2Solr
(
$server
);
$this
->
indexer
->
lit2Solr
();
$time
=
microtime
(
true
)
-
$_SERVER
[
'REQUEST_TIME_FLOAT'
];
$time
/=
60
;
$output
->
writeln
(
'Indexing process completed in '
.
$time
.
' minutes.'
);
...
...
src/Import/Importer.php
View file @
cec8cfc2
...
...
@@ -19,21 +19,27 @@ class Importer implements ImporterInterface
private
?string
$gitlabRepoTreeUrl
;
private
?string
$invalidTeiListFile
;
private
?string
$sampleTeiDocumentUrl
;
private
?string
$litDir
;
private
?string
$gitlabLitRepoUrl
;
private
?string
$gitlabProcessedLitRepoUrl
;
/**
 * Creates the importer and keeps a reference to the file service.
 *
 * @param FileService $fileService Service stored on $this->fileService for use by the import methods.
 */
public function __construct(FileService $fileService)
{
    $this->fileService = $fileService;
}
/**
 * Stores the importer's configuration values on the matching properties.
 *
 * Each argument is copied verbatim to the property of the same name; no
 * validation or normalisation is performed here.
 *
 * @param string $teiDir                    Stored on $this->teiDir.
 * @param string $teiSampleDir              Stored on $this->teiSampleDir.
 * @param string $litDir                    Stored on $this->litDir (target directory used by importLiterature()).
 * @param string $gitlabRepoToken           Stored on $this->gitlabRepoToken (appended as access_token to GitLab requests).
 * @param string $gitlabRepoTreeUrl         Stored on $this->gitlabRepoTreeUrl.
 * @param string $gitlabProcessedTeiRepoUrl Stored on $this->gitlabProcessedTeiRepoUrl.
 * @param string $invalidTeiListFile        Stored on $this->invalidTeiListFile.
 * @param string $sampleTeiDocumentUrl      Stored on $this->sampleTeiDocumentUrl.
 * @param string $gitlabLitRepoUrl          Stored on $this->gitlabLitRepoUrl (listing URL used by importLiterature()).
 * @param string $gitlabProcessedLitRepoUrl Stored on $this->gitlabProcessedLitRepoUrl (per-file URL prefix used by importLiterature()).
 */
public function setConfigs(
    string $teiDir,
    string $teiSampleDir,
    string $litDir,
    string $gitlabRepoToken,
    string $gitlabRepoTreeUrl,
    string $gitlabProcessedTeiRepoUrl,
    string $invalidTeiListFile,
    string $sampleTeiDocumentUrl,
    string $gitlabLitRepoUrl,
    string $gitlabProcessedLitRepoUrl
): void {
    $this->teiDir = $teiDir;
    $this->teiSampleDir = $teiSampleDir;
    $this->litDir = $litDir;
    $this->gitlabRepoToken = $gitlabRepoToken;
    $this->gitlabRepoTreeUrl = $gitlabRepoTreeUrl;
    $this->gitlabProcessedTeiRepoUrl = $gitlabProcessedTeiRepoUrl;
    $this->invalidTeiListFile = $invalidTeiListFile;
    $this->sampleTeiDocumentUrl = $sampleTeiDocumentUrl;
    $this->gitlabLitRepoUrl = $gitlabLitRepoUrl;
    $this->gitlabProcessedLitRepoUrl = $gitlabProcessedLitRepoUrl;
}
public
function
importTeiToS3Storage
():
void
...
...
@@ -41,16 +47,18 @@ class Importer implements ImporterInterface
$mainFileSystem
=
$this
->
fileService
->
getMainFilesystem
();
$mainFileSystem
->
deleteDir
(
'tei'
);
$teiFilesystem
=
$this
->
fileService
->
getTeiFilesystem
();
$sampleTeiDocument
=
$this
->
getSampleTEIDocument
();
if
(
!
empty
(
$sampleTeiDocument
))
{
$teiFilesystem
->
write
(
'sample.xml'
,
$sampleTeiDocument
);
}
$filesystem
=
new
Filesystem
();
if
(
!
$filesystem
->
exists
(
$this
->
teiDir
))
{
$filesystem
->
mkdir
(
$this
->
teiDir
);
}
$invalidTeiList
=
$this
->
getInvalidTeiList
();
for
(
$i
=
1
;
$i
<=
100
;
++
$i
)
{
...
...
@@ -85,12 +93,53 @@ class Importer implements ImporterInterface
}
}
/**
 * Downloads the literature files from GitLab into $this->litDir.
 *
 * Steps:
 *  1. Create $this->litDir if it does not exist yet.
 *  2. Request page 1 of the file listing from $this->gitlabLitRepoUrl.
 *  3. For every listed entry, request its JSON record from
 *     $this->gitlabProcessedLitRepoUrl, base64-decode the 'content' field
 *     and write it to $this->litDir under the entry's name.
 *
 * Failures are reported with echo and never abort the import: a listing that
 * cannot be fetched or decoded ends the method, a single file that cannot be
 * fetched, decoded or written is skipped.
 */
public function importLiterature(): void
{
    $filesystem = new Filesystem();
    if (!$filesystem->exists($this->litDir)) {
        $filesystem->mkdir($this->litDir);
    }

    // file_get_contents() signals failure by returning false (it does not
    // throw), so the result has to be checked explicitly.
    $listing = file_get_contents($this->gitlabLitRepoUrl.'&page=1&access_token='.$this->gitlabRepoToken);
    $files = is_string($listing) ? json_decode($listing, true) : null;
    if (!is_array($files)) {
        echo 'Literature files list could not be imported from gitlab';

        return;
    }

    foreach ($files as $file) {
        // Ignore anything that is not a listing entry with a name, e.g. the
        // members of a JSON error object returned instead of a list.
        if (!is_array($file) || !isset($file['name'])) {
            continue;
        }

        $name = $file['name'];
        $fileUrl = $this->gitlabProcessedLitRepoUrl.$name.'?access_token='.$this->gitlabRepoToken.'&ref=master';

        $response = file_get_contents($fileUrl);
        $fileData = is_string($response) ? json_decode($response, true) : null;
        $content = (is_array($fileData) && isset($fileData['content']) && is_string($fileData['content']))
            ? base64_decode($fileData['content'])
            : false;

        if (!is_string($content)) {
            // TODO retry to download the file again
            echo $name.' could not be imported.';

            continue;
        }

        try {
            $filesystem->dumpFile($this->litDir.$name, $content);
        } catch (FileException | \Symfony\Component\Filesystem\Exception\IOExceptionInterface $exception) {
            // dumpFile() reports write failures through the Filesystem
            // component's IO exception; FileException is kept for callers
            // that relied on it being handled here.
            echo $name.' could not be imported.';
        }
    }
}
public
function
import
(
string
$server
):
void
{
if
(
'dev'
===
$server
)
{
$this
->
importSampleTeiDocument
();
}
$this
->
importLiterature
();
$filesystem
=
new
Filesystem
();
if
(
!
$filesystem
->
exists
(
$this
->
teiDir
))
{
$filesystem
->
mkdir
(
$this
->
teiDir
);
...
...
src/Index/Indexer.php
View file @
cec8cfc2
...
...
@@ -24,13 +24,16 @@ class Indexer implements IndexerInterface
private
const
PAGE_DOC_TYPE
=
'page'
;
private
const
NOTE_DOC_TYPE
=
'note'
;
private
const
ENTITY_DOC_TYPE
=
'entity'
;
private
const
LITERATURE_DOC_TYPE
=
'literature'
;
private
Client
$client
;
private
EditedTextService
$editedTextService
;
private
PreProcessingService
$preProcessingService
;
private
?string
$teiDir
=
null
;
private
?string
$teiSampleDir
=
null
;
private
TranscriptionService
$transcriptionService
;
private
MetadataTransformerInterface
$metadataTransformer
;
private
?string
$teiDir
=
null
;
private
?string
$teiSampleDir
=
null
;
private
?string
$litDir
;
private
?array
$literaturDataElements
;
public
function
__construct
(
Client
$client
,
...
...
@@ -46,6 +49,14 @@ class Indexer implements IndexerInterface
$this
->
metadataTransformer
=
$metadataTransformer
;
}
/**
 * Stores the indexer's configuration values on the matching properties.
 *
 * @param string $teiDir                Stored on $this->teiDir.
 * @param string $teiSampleDir          Stored on $this->teiSampleDir.
 * @param string $litDir                Stored on $this->litDir (directory scanned by lit2Solr()).
 * @param array  $literaturDataElements Stored on $this->literaturDataElements; lit2Solr() uses it to look up
 *                                      the Solr field name for a derived element key.
 */
public function setConfigs(string $teiDir, string $teiSampleDir, string $litDir, array $literaturDataElements): void
{
    $this->teiDir = $teiDir;
    $this->teiSampleDir = $teiSampleDir;
    $this->litDir = $litDir;
    $this->literaturDataElements = $literaturDataElements;
}
public
function
deleteSolrIndex
():
void
{
$update
=
$this
->
client
->
createUpdate
();
...
...
@@ -119,13 +130,151 @@ class Indexer implements IndexerInterface
return
$solrDocument
;
}
/**
 * Indexes the bibliography entries found in $this->litDir into Solr.
 *
 * Every file in the directory is parsed as XML; files that fail to parse or
 * produce libxml errors are skipped. For each tei:bibl element below
 * tei:text//tei:body//tei:listBibl one Solr document is built:
 *
 *  - id:      value of the element's first attribute, underscores replaced by spaces
 *  - doctype: self::LITERATURE_DOC_TYPE
 *  - uri:     first-attribute values of the ref children of relatedItem (the placeholder '_' is ignored)
 *  - literature_author / publisher / pub_place / edition: non-empty child values of the respective elements
 *  - title, idno, biblScope and all other child elements: stored under the field
 *    name that $this->literaturDataElements maps their derived key to; unmapped
 *    keys are ignored
 *
 * Each document is sent and committed in its own update request.
 */
public function lit2Solr(): void
{
    $this->client->getEndpoint()->setOptions(['timeout' => 60, 'index_timeout' => 60]);

    // Collapses every whitespace run to one space and trims the result.
    $normalize = static function (?string $value): string {
        return trim((string) preg_replace('/\s+/', ' ', (string) $value));
    };

    // Normalised, non-empty values of the direct children of a node.
    $childTexts = static function (\DOMNode $node) use ($normalize): array {
        $values = [];
        foreach ($node->childNodes as $item) {
            $value = $normalize($item->nodeValue);
            if (!empty($value)) {
                $values[] = $value;
            }
        }

        return $values;
    };

    // Attribute node at the given position, or null when the node has none there.
    $attributeAt = static function (\DOMNode $node, int $index): ?\DOMNode {
        return null !== $node->attributes ? $node->attributes->item($index) : null;
    };

    // Value of the attribute at the given position, '' when it is missing.
    $attributeValue = static function (\DOMNode $node, int $index) use ($attributeAt): string {
        $attribute = $attributeAt($node, $index);

        return null !== $attribute ? (string) $attribute->nodeValue : '';
    };

    // Configured Solr field name for a derived key, null when the key is not mapped.
    $fieldFor = function (string $key): ?string {
        $field = $this->literaturDataElements[$key] ?? null;

        return empty($field) ? null : (string) $field;
    };

    $finder = new Finder();
    $finder->files()->in($this->litDir);

    libxml_use_internal_errors(true);

    foreach ($finder as $file) {
        // The libxml error buffer is global and cumulative: reset it so that
        // errors from an earlier document do not cause this one to be skipped.
        libxml_clear_errors();

        $doc = new \DOMDocument();
        $loaded = $doc->load($file->getRealPath());
        $errors = libxml_get_errors();
        libxml_clear_errors();

        if (false === $loaded || [] !== $errors) {
            continue;
        }

        $xpath = new \DOMXPath($doc);
        $xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
        $literature = $xpath->query('//tei:text//tei:body//tei:listBibl//tei:bibl');

        foreach ($literature as $literatureItem) {
            // The document id comes from the first attribute; without it the
            // entry cannot be indexed.
            $idAttribute = $attributeAt($literatureItem, 0);
            if (null === $idAttribute) {
                continue;
            }

            $update = $this->client->createUpdate();
            $sdoc = $update->createDocument();
            $sdoc->id = str_replace('_', ' ', $idAttribute->textContent);
            $sdoc->doctype = self::LITERATURE_DOC_TYPE;

            $uri = [];
            $author = [];
            $publisher = [];
            $pubPlace = [];
            $edition = [];

            foreach ($literatureItem->childNodes as $childNode) {
                $nodeName = (string) $childNode->nodeName;
                if ('#text' === $nodeName) {
                    continue;
                }

                $text = $normalize($childNode->nodeValue);

                if ('relatedItem' === $nodeName) {
                    foreach ($childNode->childNodes as $relatedChild) {
                        if ('ref' !== $relatedChild->nodeName) {
                            continue;
                        }

                        $refAttribute = $attributeAt($relatedChild, 0);
                        if (null !== $refAttribute && '_' !== $refAttribute->nodeValue) {
                            $uri[] = $refAttribute->nodeValue;
                        }
                    }
                } elseif ('title' === $nodeName) {
                    // Key is built from the first two attribute values, e.g. title_a_main.
                    $field = $fieldFor('title_'.$attributeValue($childNode, 0).'_'.$attributeValue($childNode, 1));
                    if (null !== $field) {
                        $sdoc->$field = $text;
                    }
                } elseif ('author' === $nodeName) {
                    $author = array_merge($author, $childTexts($childNode));
                } elseif ('publisher' === $nodeName) {
                    $publisher = array_merge($publisher, $childTexts($childNode));
                } elseif ('pubPlace' === $nodeName) {
                    $pubPlace = array_merge($pubPlace, $childTexts($childNode));
                } elseif ('edition' === $nodeName) {
                    $edition = array_merge($edition, $childTexts($childNode));
                } else {
                    if ('idno' === $nodeName) {
                        $key = 'idno_'.strtolower($attributeValue($childNode, 0));
                    } elseif ('biblScope' === $nodeName) {
                        $key = 'biblScope_'.$attributeValue($childNode, 0);
                        $second = $attributeAt($childNode, 1);
                        if (null !== $second && 'n' === $second->nodeName) {
                            // NOTE(review): the configured keys end in "_n", while this
                            // appends the attribute's value; they only match when that
                            // value is literally "n" - confirm this is intended.
                            $key .= '_'.$second->nodeValue;
                        }
                    } else {
                        $key = $nodeName;
                    }

                    $field = $fieldFor($key);
                    if (null !== $field && !empty($text)) {
                        $sdoc->$field = $text;
                    }
                }
            }

            if ([] !== $uri) {
                $sdoc->uri = $uri;
            }
            if ([] !== $author) {
                $sdoc->literature_author = $author;
            }
            if ([] !== $publisher) {
                $sdoc->publisher = $publisher;
            }
            if ([] !== $pubPlace) {
                $sdoc->pub_place = $pubPlace;
            }
            if ([] !== $edition) {
                $sdoc->edition = $edition;
            }

            $update->addDocument($sdoc);
            $update->addCommit();
            $this->client->execute($update);
        }
    }
}
public
function
tei2
s
olr
(
string
$server
):
void
public
function
tei2
S
olr
(
string
$server
):
void
{
$this
->
client
->
getEndpoint
()
->
setOptions
([
'timeout'
=>
60
,
'index_timeout'
=>
60
]);
$finder
=
new
Finder
();
...
...
@@ -318,16 +467,12 @@ class Indexer implements IndexerInterface
{
$xpath
=
new
DOMXPath
(
$doc
);
$xpath
->
registerNamespace
(
'tei'
,
'http://www.tei-c.org/ns/1.0'
);
$id
=
$this
->
getId
(
$xpath
);
$fulltext
=
$this
->
getFulltext
(
$xpath
);
$abstracts
=
$this
->
getAbstracts
(
$xpath
);
$docType
=
self
::
ARTICLE_DOC_TYPE
;
$shortTitle
=
$this
->
metadataTransformer
->
getShortTitle
(
$xpath
);
$title
=
$this
->
metadataTransformer
->
getTitle
(
$xpath
);
$originPlaceGNDNode
=
$xpath
->
query
(
'//tei:name[@type="place" and @subtype="orn"]/@ref'
);
if
(
$originPlaceGNDNode
->
item
(
0
))
{
...
...
src/Service/SolrSearchService.php
View file @
cec8cfc2
...
...
@@ -219,7 +219,7 @@ class SolrSearchService implements SearchServiceInterface
public
function
getLiterature
():
array
{
$select
=
$this
->
client
->
createSelect
()
->
setRows
(
2
00
);
$select
=
$this
->
client
->
createSelect
()
->
setRows
(
10
00
);
$query
=
vsprintf
(
'%s:%s'
,
[
'doctype'
,
'literature'
]);
$select
->
setQuery
(
$query
)
->
addSort
(
'id'
,
'asc'
);
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment