Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
SSHOC
data-ingestion
Commits
26d12cdf
Commit
26d12cdf
authored
May 04, 2022
by
Seungbin Yim
Browse files
Separate pdf download logic from extractor
parent
ae161131
Changes
4
Hide whitespace changes
Inline
Side-by-side
repositories/extraction/publication_retrieval/docker/init.sh
View file @
26d12cdf
#!/bin/sh
set
-e
set
-x
eval
$(
ssh-agent
-s
)
mkdir
-p
~/.ssh
mkdir
-p
~/.creds
chmod
700 ~/.ssh
ls
-la
~/.ssh
echo
"
${
GITLAB
}
"
>
~/.ssh/id_rsa
chmod
700 ~/.ssh/id_rsa
ssh
-T
-i
~/.ssh/id_rsa
-o
StrictHostKeyChecking
=
no git@gitlab.gwdg.de
cd
/app
||
exit
MODEL_VERSION
=
$1
cd
/app
pip
install
--upgrade
pip setuptools wheel
git clone git@gitlab.gwdg.de:sshoc/data-ingestion.git
GITLAB_TOKEN
=
$(
echo
$GITLAB
)
export
CONFIG_FILE
=
/app/data-ingestion/repositories/extraction/publication_retrieval/config/config.yaml
git clone https://sbyim1:
$GITLAB_TOKEN
@gitlab.gwdg.de/sshoc/data-ingestion.git
cd
data-ingestion
#ls -la
### INSTALL DEPENDENCIES ###
cd
/app/data-ingestion/openaire
pip
install
-U
build
python
-m
build
...
...
@@ -20,15 +22,15 @@ pip install 'dvc[gs]'
cd
/app/data-ingestion
||
exit
pip
install
-U
-r
/app/data-ingestion/repositories/extraction/publication_retrieval/requirements.txt
cd
/app/data-ingestion/repositories/extraction/
||
exit
dvc pull
-v
ner_ml/model/tools_model_with_corrections_pretrained
dvc pull
-v
ner_ml/model/tools_model_with_corrections_pretrained
_
$MODEL_VERSION
cd
/app/data-ingestion/repositories/extraction/publication_retrieval
dvc status
dvc repro
dvc push
git status
git add
.
git config
--global
user.email
"seung-bin.yim@oeaw.ac.at"
git config
--global
user.name
"Seung-bin Yim"
tagname
=
$(
date
+%y.%m.%d
)
git tag
-a
$tagname
-m
"Tool Extraction execution version"
git push origin
$tagname
\ No newline at end of file
dvc repro
-p
retrieve_pubs
#dvc push
#git status
#git add .
#git config --global user.email "seung-bin.yim@oeaw.ac.at"
#git config --global user.name "Seung-bin Yim"
#tagname=$(date +%y.%m.%d)
#git tag -a $tagname -m "Tool Extraction execution version"
#git push origin $tagname
\ No newline at end of file
repositories/extraction/publication_retrieval/dvc.yaml
View file @
26d12cdf
...
...
@@ -108,7 +108,7 @@ stages:
outs
:
-
output
extract_candidates
:
cmd
:
python src/extract_tool_candidates.py
cmd
:
python src/extract_tool_candidates.py
${model_version}
deps
:
-
output
-
publications/publications2pdf_prod.pkl
...
...
repositories/extraction/publication_retrieval/src/download_pdf.py
View file @
26d12cdf
...
...
@@ -8,8 +8,7 @@ from rest.sshoc_adapter import SshocAdapter
sys
.
path
.
insert
(
0
,
'src'
)
from
publication_tool_extractor
import
Publication
,
\
SshocToolCandidateExtractor
from
publication_tool_extractor
import
Publication
,
SshocToolRetriever
parser
=
argparse
.
ArgumentParser
(
description
=
'Download pdf publications'
)
parser
.
add_argument
(
'--testrun'
,
action
=
'store_true'
,
...
...
@@ -29,8 +28,8 @@ if __name__ == '__main__':
pdf_path
=
'pdf_test'
if
testrun
else
'pdf'
xml_path
=
'output_test'
if
testrun
else
'output'
tool_
candidate_extractor
=
SshocToolCandidateExtracto
r
(
sshoc_adapter
,
pdf_save_path
=
pdf_path
,
xml_dir
=
xml_path
)
publication2pdf
:
[
Publication
]
=
tool_
candidate_extracto
r
.
download_publications
(
pubs
,
pdf_path
)
tool_
retriever
=
SshocToolRetrieve
r
(
sshoc_adapter
,
pdf_save_path
=
pdf_path
,
xml_dir
=
xml_path
)
publication2pdf
:
[
Publication
]
=
tool_
retrieve
r
.
download_publications
(
pubs
,
pdf_path
)
with
open
(
'publications/publications2pdf_'
+
filename_postfix
+
'.pkl'
,
'wb+'
)
as
file
:
pickle
.
dump
(
publication2pdf
,
file
)
...
...
repositories/extraction/publication_retrieval/src/publication_tool_extractor.py
View file @
26d12cdf
...
...
@@ -134,9 +134,12 @@ class WikidataAdapter(object):
class
SshocToolRetriever
(
object
):
def
__init__
(
self
,
sshoc_adapter
:
SshocAdapter
,
model_version
:
float
=
None
,
pdf_save_path
=
'pdf'
,
xml_dir
=
"output"
):
def
__init__
(
self
,
sshoc_adapter
:
SshocAdapter
,
pdf_save_path
=
'pdf'
,
xml_dir
=
"output"
):
self
.
logger
=
logging
.
getLogger
(
__name__
)
self
.
_sshoc_adapter
=
sshoc_adapter
self
.
_rest_client
=
RestClient
()
self
.
_pdf_dir
=
pdf_save_path
self
.
_xml_dir
=
xml_dir
def
get_sshoc_publications_with_valid_link
(
self
,
skip_existing
=
True
,
only_pdf
=
False
,
page
=
None
,
page_limit
=
None
):
pubs
=
[]
...
...
@@ -156,25 +159,6 @@ class SshocToolRetriever(object):
else
:
return
pubs_with_accessible_at
class
SshocToolCandidateExtractor
(
object
):
_sshoc_adapter
:
SshocAdapter
def
__init__
(
self
,
sshoc_adapter
:
SshocAdapter
,
model_version
:
float
=
None
,
pdf_save_path
=
'pdf'
,
xml_dir
=
"output"
):
self
.
logger
=
logging
.
getLogger
(
__name__
)
self
.
model_version
=
model_version
self
.
config
=
load_openaire_config
(
os
.
environ
.
get
(
'CONFIG_FILE'
))
self
.
_sshoc_adapter
=
sshoc_adapter
self
.
_pdf_dir
=
pdf_save_path
self
.
_xml_dir
=
xml_dir
self
.
_nlp
=
spacy
.
load
(
"en_core_web_sm"
)
self
.
_nlp
.
remove_pipe
(
"ner"
)
self
.
logger
.
info
(
self
.
_nlp
.
pipeline
)
version_postfix
=
''
if
self
.
model_version
is
None
else
'_'
+
str
(
self
.
model_version
)
self
.
_pretrained_model
=
spacy
.
load
(
'../ner_ml/model/tools_model_with_corrections_pretrained'
+
version_postfix
)
self
.
_rest_client
=
RestClient
()
self
.
_wiki_adapter
=
WikidataAdapter
(
RestClient
())
def
download_publications
(
self
,
publications
:
[
SshocPublication
],
pdf_path
=
'pdf'
)
->
[
Publication
]:
result
=
[]
xml_count
=
0
...
...
@@ -219,6 +203,48 @@ class SshocToolCandidateExtractor(object):
return
result
def
__write_publication
(
self
,
idx
,
pub
,
response
,
link
):
filename
=
pub
.
label
+
'_'
+
str
(
idx
)
os
.
makedirs
(
os
.
path
.
dirname
(
self
.
_pdf_dir
+
'/'
+
filename
),
exist_ok
=
True
)
extension
=
None
if
link
.
endswith
(
'pdf'
):
extension
=
'.pdf'
elif
link
.
endswith
(
'xml'
):
extension
=
'.xml'
else
:
extension
=
'.html'
try
:
file_path
=
self
.
_pdf_dir
+
'/'
+
filename
+
extension
if
extension
.
endswith
(
'.pdf'
)
else
self
.
_xml_dir
+
'/'
+
filename
+
extension
with
open
(
file_path
,
'wb+'
)
as
f
:
f
.
write
(
response
.
content
)
except
FileNotFoundError
:
with
open
(
'logs/write_publication_file_not_found_error.log'
,
'a+'
)
as
fne_log
:
fne_log
.
write
(
self
.
_xml_dir
+
filename
+
extension
+
'
\n
'
)
return
filename
class
SshocToolCandidateExtractor
(
object
):
_sshoc_adapter
:
SshocAdapter
def
__init__
(
self
,
sshoc_adapter
:
SshocAdapter
,
model_version
:
float
=
None
,
pdf_save_path
=
'pdf'
,
xml_dir
=
"output"
):
self
.
logger
=
logging
.
getLogger
(
__name__
)
self
.
model_version
=
model_version
self
.
config
=
load_openaire_config
(
os
.
environ
.
get
(
'CONFIG_FILE'
))
self
.
_sshoc_adapter
=
sshoc_adapter
self
.
_pdf_dir
=
pdf_save_path
self
.
_xml_dir
=
xml_dir
self
.
_nlp
=
spacy
.
load
(
"en_core_web_sm"
)
self
.
_nlp
.
remove_pipe
(
"ner"
)
self
.
logger
.
info
(
self
.
_nlp
.
pipeline
)
version_postfix
=
''
if
self
.
model_version
is
None
else
'_'
+
str
(
self
.
model_version
)
self
.
_pretrained_model
=
spacy
.
load
(
'../ner_ml/model/tools_model_with_corrections_pretrained'
+
version_postfix
)
self
.
_rest_client
=
RestClient
()
self
.
_wiki_adapter
=
WikidataAdapter
(
RestClient
())
def
extract_tool_candidate_names
(
self
,
publication_object
:
object
):
self
.
logger
.
info
(
"Extracting tool candidate names from XML"
)
...
...
@@ -312,29 +338,6 @@ class SshocToolCandidateExtractor(object):
return
publication
def
__write_publication
(
self
,
idx
,
pub
,
response
,
link
):
filename
=
pub
.
label
+
'_'
+
str
(
idx
)
os
.
makedirs
(
os
.
path
.
dirname
(
self
.
_pdf_dir
+
'/'
+
filename
),
exist_ok
=
True
)
extension
=
None
if
link
.
endswith
(
'pdf'
):
extension
=
'.pdf'
elif
link
.
endswith
(
'xml'
):
extension
=
'.xml'
else
:
extension
=
'.html'
try
:
file_path
=
self
.
_pdf_dir
+
'/'
+
filename
+
extension
if
extension
.
endswith
(
'.pdf'
)
else
self
.
_xml_dir
+
'/'
+
filename
+
extension
with
open
(
file_path
,
'wb+'
)
as
f
:
f
.
write
(
response
.
content
)
except
FileNotFoundError
:
with
open
(
'logs/write_publication_file_not_found_error.log'
,
'a+'
)
as
fne_log
:
fne_log
.
write
(
self
.
_xml_dir
+
filename
+
extension
+
'
\n
'
)
return
filename
def
__extract_toolnames
(
self
,
sentences
)
->
{
str
:
[
str
]}:
tool_names
=
[]
sent_of_tools
=
{}
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment