Commit 26d12cdf authored by Seungbin Yim

Separate pdf download logic from extractor

parent ae161131
#!/bin/sh
set -e
set -x
eval $(ssh-agent -s)
mkdir -p ~/.ssh
mkdir -p ~/.creds
chmod 700 ~/.ssh
ls -la ~/.ssh
echo "${GITLAB}" > ~/.ssh/id_rsa
chmod 700 ~/.ssh/id_rsa
ssh -T -i ~/.ssh/id_rsa -o StrictHostKeyChecking=no git@gitlab.gwdg.de
cd /app || exit
MODEL_VERSION=$1
cd /app
pip install --upgrade pip setuptools wheel
git clone git@gitlab.gwdg.de:sshoc/data-ingestion.git
GITLAB_TOKEN=$(echo $GITLAB)
export CONFIG_FILE=/app/data-ingestion/repositories/extraction/publication_retrieval/config/config.yaml
git clone https://sbyim1:$GITLAB_TOKEN@gitlab.gwdg.de/sshoc/data-ingestion.git
cd data-ingestion
#ls -la
### INSTALL DEPENDENCIES ###
cd /app/data-ingestion/openaire
pip install -U build
python -m build
@@ -20,15 +22,15 @@ pip install 'dvc[gs]'
cd /app/data-ingestion || exit
pip install -U -r /app/data-ingestion/repositories/extraction/publication_retrieval/requirements.txt
cd /app/data-ingestion/repositories/extraction/ || exit
dvc pull -v ner_ml/model/tools_model_with_corrections_pretrained
dvc pull -v ner_ml/model/tools_model_with_corrections_pretrained_$MODEL_VERSION
cd /app/data-ingestion/repositories/extraction/publication_retrieval
dvc status
dvc repro
dvc push
git status
git add .
git config --global user.email "seung-bin.yim@oeaw.ac.at"
git config --global user.name "Seung-bin Yim"
tagname=$(date +%y.%m.%d)
git tag -a $tagname -m "Tool Extraction execution version"
git push origin $tagname
\ No newline at end of file
dvc repro -p retrieve_pubs
#dvc push
#git status
#git add .
#git config --global user.email "seung-bin.yim@oeaw.ac.at"
#git config --global user.name "Seung-bin Yim"
#tagname=$(date +%y.%m.%d)
#git tag -a $tagname -m "Tool Extraction execution version"
#git push origin $tagname
\ No newline at end of file
@@ -108,7 +108,7 @@ stages:
    outs:
    - output
  extract_candidates:
    cmd: python src/extract_tool_candidates.py
    cmd: python src/extract_tool_candidates.py ${model_version}
    deps:
    - output
    - publications/publications2pdf_prod.pkl
......
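The `extract_candidates` stage now forwards `${model_version}` (resolved by DVC's templating, typically from `params.yaml` or a `vars:` entry) as a positional argument to `src/extract_tool_candidates.py`. That script is not part of this diff, so the following is only a minimal sketch, assuming an optional positional argument wired through to the `model_version` parameter of `SshocToolCandidateExtractor`; the bare `SshocAdapter()` construction is likewise a placeholder.

```python
# Hypothetical sketch of how src/extract_tool_candidates.py could consume the
# model_version value passed by the extract_candidates stage. The real script's
# argument handling is not shown in this diff.
import argparse

from rest.sshoc_adapter import SshocAdapter
from publication_tool_extractor import SshocToolCandidateExtractor

parser = argparse.ArgumentParser(description='Extract tool candidates')
# Optional positional argument, so the stage can run with or without a version.
parser.add_argument('model_version', nargs='?', type=float, default=None)
args = parser.parse_args()

# The constructor appends '_<model_version>' to the pretrained model path,
# matching the versioned "dvc pull ..._$MODEL_VERSION" call in the shell script.
extractor = SshocToolCandidateExtractor(SshocAdapter(),  # adapter setup assumed
                                        model_version=args.model_version)
```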
@@ -8,8 +8,7 @@ from rest.sshoc_adapter import SshocAdapter
sys.path.insert(0, 'src')
from publication_tool_extractor import Publication, \
    SshocToolCandidateExtractor
from publication_tool_extractor import Publication, SshocToolRetriever
parser = argparse.ArgumentParser(description='Download pdf publications')
parser.add_argument('--testrun', action='store_true',
@@ -29,8 +28,8 @@ if __name__ == '__main__':
    pdf_path = 'pdf_test' if testrun else 'pdf'
    xml_path = 'output_test' if testrun else 'output'
    tool_candidate_extractor = SshocToolCandidateExtractor(sshoc_adapter, pdf_save_path=pdf_path, xml_dir=xml_path)
    publication2pdf: [Publication] = tool_candidate_extractor.download_publications(pubs, pdf_path)
    tool_retriever = SshocToolRetriever(sshoc_adapter, pdf_save_path=pdf_path, xml_dir=xml_path)
    publication2pdf: [Publication] = tool_retriever.download_publications(pubs, pdf_path)
    with open('publications/publications2pdf_' + filename_postfix + '.pkl', 'wb+') as file:
        pickle.dump(publication2pdf, file)
......
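For context, a minimal sketch of the download step as it reads after this change: the script builds a `SshocToolRetriever` instead of a `SshocToolCandidateExtractor`, downloads the publication files, and pickles the result for the downstream extraction stage. How `sshoc_adapter` and `pubs` are obtained is elided in the hunk above, so those parts are assumptions.

```python
# Minimal usage sketch of the refactored download step (assumptions noted inline).
import pickle
import sys

sys.path.insert(0, 'src')  # mirrors the path handling in the download script above
from rest.sshoc_adapter import SshocAdapter
from publication_tool_extractor import SshocToolRetriever

sshoc_adapter = SshocAdapter()  # construction details are not part of this diff
retriever = SshocToolRetriever(sshoc_adapter, pdf_save_path='pdf', xml_dir='output')

# The exact arguments used in the production run are not visible here; defaults assumed.
pubs = retriever.get_sshoc_publications_with_valid_link()

publication2pdf = retriever.download_publications(pubs, 'pdf')
with open('publications/publications2pdf_prod.pkl', 'wb+') as file:
    pickle.dump(publication2pdf, file)
```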
@@ -134,9 +134,12 @@ class WikidataAdapter(object):
class SshocToolRetriever(object):
    def __init__(self, sshoc_adapter: SshocAdapter, model_version: float = None, pdf_save_path='pdf', xml_dir="output"):
    def __init__(self, sshoc_adapter: SshocAdapter, pdf_save_path='pdf', xml_dir="output"):
        self.logger = logging.getLogger(__name__)
        self._sshoc_adapter = sshoc_adapter
        self._rest_client = RestClient()
        self._pdf_dir = pdf_save_path
        self._xml_dir = xml_dir

    def get_sshoc_publications_with_valid_link(self, skip_existing=True, only_pdf=False, page=None, page_limit=None):
        pubs = []
@@ -156,25 +159,6 @@ class SshocToolRetriever(object):
        else:
            return pubs_with_accessible_at
class SshocToolCandidateExtractor(object):
    _sshoc_adapter: SshocAdapter

    def __init__(self, sshoc_adapter: SshocAdapter, model_version: float = None, pdf_save_path='pdf', xml_dir="output"):
        self.logger = logging.getLogger(__name__)
        self.model_version = model_version
        self.config = load_openaire_config(os.environ.get('CONFIG_FILE'))
        self._sshoc_adapter = sshoc_adapter
        self._pdf_dir = pdf_save_path
        self._xml_dir = xml_dir
        self._nlp = spacy.load("en_core_web_sm")
        self._nlp.remove_pipe("ner")
        self.logger.info(self._nlp.pipeline)
        version_postfix = '' if self.model_version is None else '_' + str(self.model_version)
        self._pretrained_model = spacy.load('../ner_ml/model/tools_model_with_corrections_pretrained' + version_postfix)
        self._rest_client = RestClient()
        self._wiki_adapter = WikidataAdapter(RestClient())

    def download_publications(self, publications: [SshocPublication], pdf_path='pdf') -> [Publication]:
        result = []
        xml_count = 0
@@ -219,6 +203,48 @@ class SshocToolCandidateExtractor(object):
        return result

    def __write_publication(self, idx, pub, response, link):
        filename = pub.label + '_' + str(idx)
        os.makedirs(os.path.dirname(self._pdf_dir + '/' + filename), exist_ok=True)
        extension = None
        if link.endswith('pdf'):
            extension = '.pdf'
        elif link.endswith('xml'):
            extension = '.xml'
        else:
            extension = '.html'
        try:
            file_path = self._pdf_dir + '/' + filename + extension if extension.endswith(
                '.pdf') else self._xml_dir + '/' + filename + extension
            with open(file_path, 'wb+') as f:
                f.write(response.content)
        except FileNotFoundError:
            with open('logs/write_publication_file_not_found_error.log', 'a+') as fne_log:
                fne_log.write(self._xml_dir + filename + extension + '\n')
        return filename
class SshocToolCandidateExtractor(object):
    _sshoc_adapter: SshocAdapter

    def __init__(self, sshoc_adapter: SshocAdapter, model_version: float = None, pdf_save_path='pdf', xml_dir="output"):
        self.logger = logging.getLogger(__name__)
        self.model_version = model_version
        self.config = load_openaire_config(os.environ.get('CONFIG_FILE'))
        self._sshoc_adapter = sshoc_adapter
        self._pdf_dir = pdf_save_path
        self._xml_dir = xml_dir
        self._nlp = spacy.load("en_core_web_sm")
        self._nlp.remove_pipe("ner")
        self.logger.info(self._nlp.pipeline)
        version_postfix = '' if self.model_version is None else '_' + str(self.model_version)
        self._pretrained_model = spacy.load('../ner_ml/model/tools_model_with_corrections_pretrained' + version_postfix)
        self._rest_client = RestClient()
        self._wiki_adapter = WikidataAdapter(RestClient())

    def extract_tool_candidate_names(self, publication_object: object):
        self.logger.info("Extracting tool candidate names from XML")
@@ -312,29 +338,6 @@ class SshocToolCandidateExtractor(object):
        return publication

    def __write_publication(self, idx, pub, response, link):
        filename = pub.label + '_' + str(idx)
        os.makedirs(os.path.dirname(self._pdf_dir + '/' + filename), exist_ok=True)
        extension = None
        if link.endswith('pdf'):
            extension = '.pdf'
        elif link.endswith('xml'):
            extension = '.xml'
        else:
            extension = '.html'
        try:
            file_path = self._pdf_dir + '/' + filename + extension if extension.endswith(
                '.pdf') else self._xml_dir + '/' + filename + extension
            with open(file_path, 'wb+') as f:
                f.write(response.content)
        except FileNotFoundError:
            with open('logs/write_publication_file_not_found_error.log', 'a+') as fne_log:
                fne_log.write(self._xml_dir + filename + extension + '\n')
        return filename

    def __extract_toolnames(self, sentences) -> {str: [str]}:
        tool_names = []
        sent_of_tools = {}
......
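To make the file-routing behaviour of `__write_publication` (now owned by `SshocToolRetriever` rather than the extractor) easier to see in isolation, here is a small, self-contained restatement of its naming logic. It is a sketch, not the method itself: the helper name and the example values are hypothetical.

```python
# Illustrative restatement of the extension/directory routing performed by
# SshocToolRetriever.__write_publication above; helper name and values are hypothetical.
def target_path(label: str, idx: int, link: str,
                pdf_dir: str = 'pdf', xml_dir: str = 'output') -> str:
    filename = label + '_' + str(idx)
    if link.endswith('pdf'):
        extension = '.pdf'
    elif link.endswith('xml'):
        extension = '.xml'
    else:
        extension = '.html'
    # Only .pdf responses go to the pdf directory; .xml and .html both land in
    # the xml/output directory, mirroring the conditional in the method.
    directory = pdf_dir if extension == '.pdf' else xml_dir
    return directory + '/' + filename + extension

print(target_path('SomeTool', 0, 'https://example.org/paper.pdf'))  # pdf/SomeTool_0.pdf
print(target_path('SomeTool', 1, 'https://example.org/record'))     # output/SomeTool_1.html
```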