Commit 9bf75d5a authored by Seungbin Yim

Support skipping publications with processed_at set

parent 1929194a
@@ -33,6 +33,7 @@ stages:
- publications/pub2pdf_with_candidates.pkl
- candidates/none_existing.pkl
- candidates/analyse.csv
- candidates/extracted_toolnames.csv
add_relations:
cmd: python src/add_relation.py
deps:
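The new outs entry points at the CSV that the driver script further down writes. A minimal reader sketch, not part of the commit: the four-column layout is inferred from the writerows call below, and the sentences column arrives as the repr of a Python list.

import csv

# Hypothetical consumer of candidates/extracted_toolnames.csv; columns
# inferred from all_extracted_tools: link, label, source, sentences-repr.
with open('candidates/extracted_toolnames.csv', newline='') as f:
    for link, label, source, sentences_repr in csv.reader(f):
        print(link, label, source)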
@@ -196,3 +196,26 @@ http://www.surf.nl/binaries/content/assets/surf/en/knowledgebase/2011/What_researchers_want.pdf
https://dh2017.adho.org/program/abstracts/
https://dh2017.adho.org/program/abstracts/
https://dh2017.adho.org/program/abstracts/
https://dh2017.adho.org/program/abstracts/
http://dharchive.org/paper/DH2014/Paper-652.xml
http://dh2010.cch.kcl.ac.uk/academic-programme/abstracts/papers/pdf/book-final.pdf
http://dh2010.cch.kcl.ac.uk/academic-programme/abstracts/papers/pdf/book-final.pdf
https://dh2017.adho.org/abstracts/686/686.pdf
https://dh2017.adho.org/abstracts/248/248.pdf
https://dh2017.adho.org/abstracts/281/281.pdf
https://dh2017.adho.org/abstracts/144/144.pdf
https://dh2017.adho.org/abstracts/246/246.pdf
https://dh2017.adho.org/abstracts/668/668.pdf
http://new.cidoc-crm.org/sites/default/files/cidoc_crm_version_5.0.4.pdf
https://dh2017.adho.org/abstracts/538/538.pdf
https://dh2017.adho.org/abstracts/247/247.pdf
https://dh2017.adho.org/abstracts/463/463.pdf
https://www.metamorfoze.nl/sites/metamorfoze.nl/files/publicatie_documenten/Metamorfoze_Preservation_Imaging_Guidelines_1.0.pdf
https://www.britishmuseum.org/pdf/charisma-multispectral-imaging-manual-2013.pdf
https://dh2017.adho.org/abstracts/533/533.pdf
http://www.ariadne-infrastructure.eu/content/download/2106/11888/version/2/file/D3.3+Report+on+data+sharing+policies_final.pdf
http://www.ariadne-infrastructure.eu/content/download/1782/9961/version/2/file/D3.2+Report+on+project+standards.pdf
https://dh2017.adho.org/abstracts/137/137.pdf
https://dh2017.adho.org/abstracts/539/539.pdf
https://dh2017.adho.org/abstracts/036/036.pdf
http://www.surf.nl/binaries/content/assets/surf/en/knowledgebase/2011/What_researchers_want.pdf
WORDSno sentences
Githubno sentences
Basecampno sentences
FEDORAno sentences
Githubno sentences
DEVONthinkno sentences
Malletno sentences
Gephino sentences
Githubno sentences
Orangeno sentences
Githubno sentences
Githubno sentences
D3.jsno sentences
ArcGISno sentences
FEDORAno sentences
Solrno sentences
Githubno sentences
JavaScriptno sentences
Githubno sentences
Twitterno sentences
FEDORAno sentences
Matlabno sentences
WordPressno sentences
Githubno sentences
digilibno sentences
JIRAno sentences
word2vecno sentences
FEDORAno sentences
Githubno sentences
Githubno sentences
Githubno sentences
HUBzerono sentences
JavaScriptno sentences
MAXQDAno sentences
Githubno sentences
JavaScriptno sentences
Githubno sentences
JavaScriptno sentences
Solrno sentences
Githubno sentences
Githubno sentences
JavaScriptno sentences
YouTubeno sentences
stylono sentences
Githubno sentences
Githubno sentences
Githubno sentences
Githubno sentences
FEDORAno sentences
Githubno sentences
Githubno sentences
Basecampno sentences
WORDSno sentences
FEDORAno sentences
Githubno sentences
Githubno sentences
DEVONthinkno sentences
Malletno sentences
Gephino sentences
Orangeno sentences
Githubno sentences
Githubno sentences
Githubno sentences
D3.jsno sentences
ArcGISno sentences
FEDORAno sentences
Solrno sentences
JavaScriptno sentences
Githubno sentences
Githubno sentences
Twitterno sentences
FEDORAno sentences
Matlabno sentences
WordPressno sentences
Githubno sentences
digilibno sentences
JIRAno sentences
word2vecno sentences
FEDORAno sentences
Githubno sentences
Githubno sentences
Githubno sentences
HUBzerono sentences
JavaScriptno sentences
MAXQDAno sentences
Githubno sentences
JavaScriptno sentences
Githubno sentences
JavaScriptno sentences
Solrno sentences
Githubno sentences
Githubno sentences
JavaScriptno sentences
YouTubeno sentences
stylono sentences
Githubno sentences
Githubno sentences
Githubno sentences
Githubno sentences
FEDORAno sentences
Githubno sentences
Githubno sentences
Githubno sentences
WORDSno sentences
Basecampno sentences
FEDORAno sentences
Githubno sentences
DEVONthinkno sentences
Malletno sentences
Gephino sentences
Githubno sentences
Orangeno sentences
Githubno sentences
Githubno sentences
D3.jsno sentences
ArcGISno sentences
Solrno sentences
FEDORAno sentences
Githubno sentences
JavaScriptno sentences
Githubno sentences
Twitterno sentences
FEDORAno sentences
Matlabno sentences
WordPressno sentences
Githubno sentences
JIRAno sentences
digilibno sentences
word2vecno sentences
FEDORAno sentences
Githubno sentences
Githubno sentences
Githubno sentences
HUBzerono sentences
JavaScriptno sentences
MAXQDAno sentences
Githubno sentences
JavaScriptno sentences
Githubno sentences
JavaScriptno sentences
Solrno sentences
Githubno sentences
Githubno sentences
JavaScriptno sentences
YouTubeno sentences
Githubno sentences
stylono sentences
Githubno sentences
Githubno sentences
Githubno sentences
FEDORAno sentences
Githubno sentences
Githubno sentences
FEDORAno sentences
WORDSno sentences
Githubno sentences
Basecampno sentences
Githubno sentences
DEVONthinkno sentences
Malletno sentences
Gephino sentences
Orangeno sentences
Githubno sentences
Githubno sentences
Githubno sentences
D3.jsno sentences
ArcGISno sentences
FEDORAno sentences
Solrno sentences
Githubno sentences
JavaScriptno sentences
Githubno sentences
Twitterno sentences
FEDORAno sentences
Matlabno sentences
WordPressno sentences
Githubno sentences
digilibno sentences
JIRAno sentences
word2vecno sentences
FEDORAno sentences
Githubno sentences
Githubno sentences
Githubno sentences
JavaScriptno sentences
HUBzerono sentences
MAXQDAno sentences
Githubno sentences
JavaScriptno sentences
Githubno sentences
JavaScriptno sentences
Solrno sentences
Githubno sentences
Githubno sentences
JavaScriptno sentences
YouTubeno sentences
stylono sentences
Githubno sentences
Githubno sentences
Githubno sentences
Githubno sentences
FEDORAno sentences
Githubno sentences
Githubno sentences
FEDORAno sentences
WORDSno sentences
Githubno sentences
Basecampno sentences
Githubno sentences
DEVONthinkno sentences
Malletno sentences
Gephino sentences
Githubno sentences
Orangeno sentences
Githubno sentences
Githubno sentences
D3.jsno sentences
ArcGISno sentences
FEDORAno sentences
Solrno sentences
JavaScriptno sentences
Githubno sentences
Githubno sentences
Twitterno sentences
FEDORAno sentences
Matlabno sentences
WordPressno sentences
Githubno sentences
JIRAno sentences
digilibno sentences
word2vecno sentences
FEDORAno sentences
Githubno sentences
Githubno sentences
Githubno sentences
JavaScriptno sentences
HUBzerono sentences
MAXQDAno sentences
Githubno sentences
JavaScriptno sentences
Githubno sentences
JavaScriptno sentences
Solrno sentences
Githubno sentences
Githubno sentences
JavaScriptno sentences
YouTubeno sentences
Githubno sentences
stylono sentences
Githubno sentences
Githubno sentences
Githubno sentences
FEDORAno sentences
Githubno sentences
Githubno sentences
@@ -21,9 +21,11 @@ if __name__ == '__main__':
none_existing = []
analyse = []
all_extracted_tools = []
count_sentences = 0
for pub in publication2pdf:
candidates_to_sentences: {str: [str]} = tool_candidate_extractor.extract_tool_candidate_names(pub)
candidates_to_sentences, sentences = tool_candidate_extractor.extract_tool_candidate_names(pub)
count_sentences += len(sentences)
print('Tool Candidates:')
print(candidates_to_sentences.keys())
@@ -34,14 +36,34 @@ if __name__ == '__main__':
none_existing.extend(none)
for tool in pub.tool_candidates:
try:
all_extracted_tools.append([pub.link, tool.label, 'marketplace', candidates_to_sentences[tool.label]])
except KeyError:
with open('logs/sentences_key_not_exists.log', 'a+') as fne_log:
fne_log.write(tool.label + ': no sentences' + '\n')
for tool in wiki_candidates:
try:
all_extracted_tools.append([pub.link, tool.label, 'wikidata', candidates_to_sentences[tool.label]])
except KeyError:
with open('logs/sentences_key_not_exists_wiki.log', 'a+') as fne_log:
fne_log.write(tool.label + ': no sentences' + '\n')
for tool in none:
analyse.append([pub.link, tool, candidates_to_sentences[tool]])
all_extracted_tools.append([pub.link, tool, 'none', candidates_to_sentences[tool]])
with open('candidates/analyse.csv', 'w', newline='') as file:
writer = csv.writer(file)
writer.writerows(analyse)
print(analyse)
with open('candidates/extracted_toolnames.csv', 'w', newline='') as file:
writer = csv.writer(file)
writer.writerows(all_extracted_tools)
print(all_extracted_tools)
with open("publications/pub2pdf_with_candidates.pkl", 'wb+') as pub2pdf:
pickle.dump(publication2pdf, pub2pdf)
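The two try/except KeyError blocks above feed the run-together log entries shown earlier in the diff. A dict.get lookup expresses the same branching without exceptions; a minimal sketch under the same data shapes (collect_rows is a hypothetical helper, not in the commit):

def collect_rows(pub_link, tools, source, candidates_to_sentences, log_path):
    # Same behaviour as the loops above: keep tools that have sentences,
    # log the labels that do not.
    rows = []
    for tool in tools:
        sentences = candidates_to_sentences.get(tool.label)
        if sentences is None:
            with open(log_path, 'a+') as fne_log:
                fne_log.write(tool.label + ': no sentences\n')
            continue
        rows.append([pub_link, tool.label, source, sentences])
    return rows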
@@ -49,4 +71,6 @@ if __name__ == '__main__':
pickle.dump(none_existing, none_existing_file)
with open("candidates/analyse.pkl", 'wb+') as analyse_file:
pickle.dump(analyse, analyse_file)
\ No newline at end of file
pickle.dump(analyse, analyse_file)
print('Total number of sentences: ' + str(count_sentences))
\ No newline at end of file
@@ -65,11 +65,11 @@ def check_accessible_at(item: SshocItem) -> bool:
if hasattr(item, 'accessible_at'):
try:
for link in item.accessible_at:
if link is not None:
# if link is not None:
# return True
if ".pdf" in link in link:
logging.info(link)
return True
# if ".pdf" in link or ".xml" in link:
# logging.info(link)
# return True
except Exception as e:
logging.error('Error occurred while checking pdf link: ' + item.label)
return False
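With the commented-out branches gone, the check that stays active reduces to "some accessible_at link contains .pdf". A condensed sketch of that behaviour, assuming the same SshocItem shape and module-level logging; not the committed code:

import logging

def check_accessible_at(item) -> bool:
    # Accept the item as soon as one accessible_at link looks like a PDF.
    try:
        for link in getattr(item, 'accessible_at', []) or []:
            if link is not None and '.pdf' in link:
                logging.info(link)
                return True
    except Exception:
        logging.error('Error occurred while checking pdf link: ' + item.label)
    return False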
@@ -147,7 +147,7 @@ class SshocToolCandidateExtractor(object):
self._rest_client = RestClient()
self._wiki_adapter = WikidataAdapter(RestClient())
def get_sshoc_publications_with_valid_link(self, page=None):
def get_sshoc_publications_with_valid_link(self, skip_existing=True, only_pdf=False, page=None):
pubs = []
if page is not None:
retrieved = self._sshoc_adapter.get_items_of_page(self._sshoc_adapter.PUBLICATIONS_ENDPOINT,
@@ -160,7 +160,10 @@ class SshocToolCandidateExtractor(object):
pubs_with_accessible_at = list(filter(check_accessible_at, pubs))
self.logger.info('# of pubs with valid link: ' + str(len(pubs_with_accessible_at)))
return list(filterfalse(exists_processed_at, pubs_with_accessible_at))
if skip_existing:
return list(filterfalse(exists_processed_at, pubs_with_accessible_at))
else:
return pubs_with_accessible_at
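The skip_existing flag is what the commit title refers to: by default, publications that already carry processed_at are filtered out via exists_processed_at, and passing skip_existing=False reprocesses everything. Hypothetical call sites (constructor arguments assumed):

extractor = SshocToolCandidateExtractor()
fresh = extractor.get_sshoc_publications_with_valid_link()  # skips pubs with processed_at
everything = extractor.get_sshoc_publications_with_valid_link(skip_existing=False)  # reprocesses all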
def download_publications(self, publications: [SshocPublication]) -> [Publication]:
result = []
@@ -179,7 +182,7 @@ class SshocToolCandidateExtractor(object):
self.logger.info(link)
result.append(Publication(pub, filename, link))
except KeyError:
with open('logs/pub_download_keyerror.log','a+') as ke_log:
with open('logs/pub_download_keyerror.log', 'a+') as ke_log:
ke_log.write(link + '\n')
self.logger.debug('KeyError while downloading publication source:' + link)
except HTTPError:
@@ -196,15 +199,13 @@ class SshocToolCandidateExtractor(object):
return result
def extract_tool_candidate_names(self, publication_object: Publication) -> {str: [str]}:
def extract_tool_candidate_names(self, publication_object: object):
self.logger.info("Extracting tool candidate names from XML")
sentences: [str] = self.__convert_xml_to_sentences(publication_object)
tool_name_to_sentences: {str: [str]} = self.__extract_toolnames(sentences)
return tool_name_to_sentences
return tool_name_to_sentences, sentences
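The rewrite widens the parameter type to object and drops the return annotation even though the method now returns a pair. If the tuple shape should stay self-documenting, typing can spell it out; a sketch, not the committed signature:

from typing import Dict, List, Tuple

def extract_tool_candidate_names(self, publication_object) -> Tuple[Dict[str, List[str]], List[str]]:
    # Returns (tool name -> sentences containing it, all extracted sentences).
    ...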
def retrieve_tool_candidates_from_marketplace(self, tool_candidate_names: [str]) -> [ToolCandidate]:
existing = []
@@ -360,7 +361,6 @@ class SshocToolCandidateExtractor(object):
else:
raise FileNotFoundError
@staticmethod
def __extract_sentences(docs) -> list[str]:
sentences = []