Commit b63c0fe5 authored by Seungbin Yim's avatar Seungbin Yim
Browse files

Dockerize evaluation

parent a37c40df
......@@ -37,4 +37,5 @@ openaire/result/tool_map.pkl
repositories/data/
repositories/extraction/publication_retrieval/test/pdf/
__pycache__!/repositories/extraction/evaluation/.local/env
__pycache__
\ No newline at end of file
__pycache__
repositories/extraction/publication_retrieval/publications_pdf.zip
\ No newline at end of file
# Workflow Description
1. Train model
<code> ner_model </code> can be trained using the sshoc_ner_train docker image.
If a base model exists, compare the newly trained model against the base model.
periodically, once a month (automated)
Extract Entity and Ingest
Run periodically once a month after Step 1.
Manually evaluate the suggestions.
Add the manually evaluated data to the training/test set.
Back to 1.
# DATASETS
Input datasets are shared and version-controlled with DVC.
You can pull the data with the following command.
......
Dockerfile/deployment/postgres-configmap.yaml
prodigy*.whl
prodigy*.zip
\ No newline at end of file
prodigy*.zip
**/postgres-configmap.yaml
**/secrets.yaml
\ No newline at end of file
......@@ -7,7 +7,10 @@ RUN cat /etc/apt/sources.list
RUN apt-get update
RUN apt-get upgrade -y
RUN apt-get -y install gcc
RUN apt-get -y install iputils-ping
RUN apt-get -y install postgresql postgresql-contrib
RUN apt-get install -y make build-essential python-dev git
# TODO: Parameterize this wheel file
RUN pip install prodigy -f prodigy-1.10.2-cp36.cp37.cp38-cp36m.cp37m.cp38-linux_x86_64.whl
EXPOSE 8080
......
Prodigy version 1.10.2 required.
\ No newline at end of file
Prodigy version 1.10.2 required.
Requires a GitLab deployment token in the environment variable 'GITLAB'.
---
# Deployment running a single replica of the acdhtech/sshoc_tool_evaluator
# image. The container listens on port 8080, reads its GitLab credential from
# the `ssh-key-secret` Secret and its PostgreSQL connection settings from the
# `postgres-config` ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sshoc-evaluator-deployment
  labels:
    app: sshoc-evaluator
spec:
  replicas: 1
  selector:
    matchLabels:
      app: sshoc-evaluator
  template:
    metadata:
      labels:
        app: sshoc-evaluator
    spec:
      containers:
        - name: sshoc-evaluator
          image: acdhtech/sshoc_tool_evaluator
          ports:
            - containerPort: 8080
          volumeMounts:
            # name must match the volume name below
            - name: secret-volume
              mountPath: /etc/secret-volume
          env:
            # GitLab credential; the pod fails to start if the key is missing
            # because `optional` is false.
            - name: GITLAB
              valueFrom:
                secretKeyRef:
                  name: ssh-key-secret
                  key: ssh-privatekey
                  optional: false
            # Kubernetes derives service environment variables from the
            # Service name (upper-cased, dashes -> underscores). The Postgres
            # Service is named `sshoc-postgres`, so the injected variable is
            # SSHOC_POSTGRES_SERVICE_HOST; `$(POSTGRES_SERVICE_HOST)` would be
            # left unexpanded.
            - name: PGHOSTADDR
              value: "$(SSHOC_POSTGRES_SERVICE_HOST)"
            - name: PGUSER
              valueFrom:
                configMapKeyRef:
                  name: postgres-config
                  key: POSTGRES_USER
            # NOTE(review): the database password is read from a ConfigMap,
            # which is not meant for confidential data — consider moving it
            # to a Secret and switching to secretKeyRef.
            - name: PGPASSWORD
              valueFrom:
                configMapKeyRef:
                  name: postgres-config
                  key: POSTGRES_PASSWORD
      volumes:
        - name: secret-volume
          secret:
            secretName: ssh-key-secret
---
# Deployment running a single PostgreSQL 14.2 instance. Pods carry the
# `app: sshoc-postgres` label, take their environment from the
# `postgres-config` ConfigMap, and keep the data directory on the
# `postgres-pv-claim` PersistentVolumeClaim.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: postgres
spec:
  replicas: 1
  selector:
    matchLabels:
      app: sshoc-postgres
  template:
    metadata:
      labels:
        app: sshoc-postgres
    spec:
      containers:
        - name: sshoc-postgres
          image: postgres:14.2-bullseye
          imagePullPolicy: "IfNotPresent"
          ports:
            - containerPort: 5432
          # Every key of the ConfigMap becomes an environment variable of the
          # container.
          envFrom:
            - configMapRef:
                name: postgres-config
          volumeMounts:
            # name must match the volume declared under `volumes` below
            - mountPath: /var/lib/postgresql/data
              name: postgresdb
      volumes:
        - name: postgresdb
          persistentVolumeClaim:
            claimName: postgres-pv-claim
---
# PersistentVolume backing the PostgreSQL data directory: 5Gi on a host
# directory, offered under the `manual` storage class.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: postgres-pv-volume
  labels:
    type: local
    app: sshoc-postgres
spec:
  storageClassName: manual
  capacity:
    storage: 5Gi
  # NOTE(review): the Kubernetes documentation lists hostPath volumes as
  # supporting ReadWriteOnce only — confirm whether ReadWriteMany is intended.
  # The claim below must request the same mode in order to bind.
  accessModes:
    - ReadWriteMany
  hostPath:
    # NOTE(review): absolute path on one developer's machine; this volume is
    # only usable on a node where that directory exists.
    path: "/Users/seungbinyim/Development/repos/sshoc/data-ingestion/repositories/extraction/evaluation/db"
---
# Claim for the volume above; storage class, access mode and size match the
# PersistentVolume definition.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: postgres-pv-claim
  labels:
    app: sshoc-postgres
spec:
  storageClassName: manual
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 5Gi
---
# Service in front of the pods labelled `app: sshoc-postgres`, forwarding
# port 5432 to container port 5432.
apiVersion: v1
kind: Service
metadata:
  name: sshoc-postgres
  labels:
    app: sshoc-postgres
spec:
  # NOTE(review): NodePort also opens the database on a port of every cluster
  # node; ClusterIP would suffice if only in-cluster clients connect — confirm
  # whether external access is required.
  type: NodePort
  ports:
    - port: 5432
      targetPort: 5432
  selector:
    app: sshoc-postgres
{
"db": "sqlite",
"db": "postgresql",
"db_settings": {
"sqlite": {
"name": "prodigy.db",
"path": "/app/data-ingestion/repositories/extraction/evaluation"
"postgresql": {
"dbname": "sshoc_prodigy"
}
}
}
\ No newline at end of file
......@@ -11,12 +11,20 @@ cd /app
pip install --upgrade pip setuptools wheel
git clone git@gitlab.gwdg.de:sshoc/data-ingestion.git
cd data-ingestion
# TODO: remove the checkout command
git checkout 91-all-sentences
cd /app/data-ingestion
cd /app/data-ingestion/repositories/extraction/
cp /app/prodigy.json /app/data-ingestion/repositories/extraction/prodigy.json
pip install pydantic==1.7.2
pip install 'dvc[gs]'
pip install psycopg2==2.8.6
dvc pull publication_retrieval/candidates/sentences_to_analyse.jsonl
dvc pull ner_ml/model/tools_model_with_corrections_pretrained
PRODIGY_LOGGING=verbose prodigy ner.correct correct_analyze ner_ml/model/tools_model_with_corrections_pretrained publication_retrieval/candidates/sentences_to_analyse.jsonl --label TOOL
\ No newline at end of file
This is more directly seen in the tagging of videos and their categorization without additional interpretational work, such as in the Oral History Metadata Synchronizer (OHMS) tool developed at the Louie B. Nunn Center for Oral History University of Kentucky Libraries.The Graph Poem Project at University of Ottawa develops tools for poetry computational analysis and applies graph theory and network graph computational apps in structuring, analyzing, and visualizing poetic corpora.Our research, tools, publications, and future work will therefore be then presented in a comparative manner in the wider context of current trends, theories, and debates in DH in general and text analysis in particular-by considering for instance the potential relevance of the Graph Poem Project to at least some of the issues raised in the "Forum:Despite these contributions by organizations such as the Post Colonial Digital Humanities (DHPoCo), FemTech.The data were received as an Excel spreadsheet from which the parts were removed that were not related to the digitization of museum objects and the data on galleries that were for temporary display and did not store any objects.We developed the system utilizing RedHat Linux, Apache, PostgreSQL, PHP, jQuery, and Annotorious, which enabled the easy annotation of images.We developed the system utilizing RedHat Linux, Apache, PostgreSQL, PHP, jQuery, and Annotorious, which enabled the easy annotation of images.The annotations were stored in PostgreSQL including attributes such as date and responsibility.This paper describes a collaborative approach of information exchange between art history and literature via IIIF as conducted by two projects: the SAT Daizōkyō Text Database Committee (SAT) and a project to leverage an open dataset of the National Institute of Japanese Literature.SATiDB provides annotations for about 5,200 Buddhist icons (busson) and symbols (sanmayagyō and mandala) in the books and several search functions of the annotations 
with a simple translator from English to technical terms in CJK characters via the Digital Dictionary of Buddhism.SATiDB has a function to expose several objects in parallel by clicking checkboxes of cropped images by coordination of each object in the search results on the IIIF viewer, Mirador (Figure2).As one of the two transcribed woodcut printing books includes names of Buddhist saints, we added tags on the names to trigger an event to search the name and prepare a function to request queries to the SATiDB.On the other hand, in the SATiDB, a function to distribute only a list of search results including images cropped by IIIF Image API was implemented to pull search results from other Web sites by use of a form of URL such as: http://dzkimgs.l.u-tokyo.ac.jp/SATi/key:_keyword_As a result, readers-primarily researchers, but laypersons as well-can see images of related Buddhist icons on SATiDB while reading the book.We developed the system utilizing RedHat Linux, Apache, PostgreSQL, PHP, jQuery, and Annotorious, which enabled the easy annotation of images.We developed the system utilizing RedHat Linux, Apache, PostgreSQL, PHP, jQuery, and Annotorious, which enabled the easy annotation of images.We developed the system utilizing RedHat Linux, Apache, PostgreSQL, PHP, jQuery, and Annotorious, which enabled the easy annotation of images.We developed the system utilizing RedHat Linux, Apache, PostgreSQL, PHP, jQuery, and Annotorious, which enabled the easy annotation of images.We developed the system utilizing RedHat Linux, Apache, PostgreSQL, PHP, jQuery, and Annotorious, which enabled the easy annotation of images.After input, the data were converted into IIIF Presentation API and distributed with hi-resolution images converted from 60M-pixel images delivered with IIIF Image API.The other project also developed a Web collaboration system to embed transcription of Japanese texts (the issues of such transcription have been described by Nagasaki et al, 2016) 
line-by-line in the style of IIIF annotation which enables to search images as-theyare via Smart-GS.(Hashimoto et al, 2014)It adopts OpenSeaDragon and its plugins to annotate images with zooming and has a function to convert them into the format of IIIF Presentation API.After input, the data were converted into IIIF Presentation API and distributed with hi-resolution images converted from 60M-pixel images delivered with IIIF Image API.The other project also developed a Web collaboration system to embed transcription of Japanese texts (the issues of such transcription have been described by Nagasaki et al, 2016) line-by-line in the style of IIIF annotation which enables to search images as-theyare via Smart-GS.(Hashimoto et al, 2014)It adopts OpenSeaDragon and its plugins to annotate images with zooming and has a function to convert them into the format of IIIF Presentation API.
\ No newline at end of file
......@@ -12,7 +12,6 @@ cd /app || exit
pip install --upgrade pip setuptools wheel
git clone git@gitlab.gwdg.de:sshoc/data-ingestion.git
cd data-ingestion
git checkout 86-dockerize-tool-extraction
#ls -la
cd /app/data-ingestion/openaire
pip install -U build
......
......@@ -19275,3 +19275,83 @@ output/Transatlantic knowledge production and conveyance in community-engaged pu
output/Unmaking/Remaking Memory Work - Centering Community Narratives of Latinx Lived Experience_0.tei.xml
output/Using Wikipedia to Enable Entity Retrieval and Visualization Concerning the Intellectual/Cultural Heritage_0.tei.xml
output/Weaving the Word / Tramando la palabra_0.tei.xml
output/1 Million Dutch Newspaper Images available for researchers - The KBK-1M Dataset_0.tei.xml
output/3D-ICONS -- 3D Digitisation of Icons of European Architectural and Archaeological Heritage_0.tei.xml
output/3D Scanning for Preservation - Difficulties and Dissemination_0.tei.xml
output/3rd International Workshop on Computational History, HistoInformatics@DH 2016, Krakow, Poland, July 11, 2016_0.tei.xml
output/4 Default Text Structure - The TEI Guidelines_0.tei.xml
output/4Humanities - Designing Digital Advocacy_0.tei.xml
output/4 Ríos - una construcción transmedia de memoria histórica sobre el conflicto armado en Colombia_0.tei.xml
output/9 Dictionaries - The TEI Guidelines_0.tei.xml
output/A 3D Common Ground - Bringing Humanities Data Together Inside Online Game Engines_0.tei.xml
output/Abbreviations In Manuscripts - Systematization And Crowdsourcing By Ad Fontes_0.tei.xml
output/A Bilingual Digital Edition of Trinity College Cambridge MS O.1.77_0.tei.xml
output/Abundance and Access - Early Modern Political Letters in Contemporary and Digital Archives_0.tei.xml
output/Abusing the Concept of Normalization for Better Collation Results (and Profit)_0.tei.xml
output/Academic Migrants - A Digital Discussion of Transnational Teaching and Learning_0.tei.xml
output/Academic Pillow-Talk and Two Immersive Explorations of Linguistic Space_0.tei.xml
output/1 Million Dutch Newspaper Images available for researchers - The KBK-1M Dataset_0.tei.xml
output/3D-ICONS -- 3D Digitisation of Icons of European Architectural and Archaeological Heritage_0.tei.xml
output/3D Scanning for Preservation - Difficulties and Dissemination_0.tei.xml
output/3rd International Workshop on Computational History, HistoInformatics@DH 2016, Krakow, Poland, July 11, 2016_0.tei.xml
output/4 Default Text Structure - The TEI Guidelines_0.tei.xml
output/4Humanities - Designing Digital Advocacy_0.tei.xml
output/4 Ríos - una construcción transmedia de memoria histórica sobre el conflicto armado en Colombia_0.tei.xml
output/9 Dictionaries - The TEI Guidelines_0.tei.xml
output/A 3D Common Ground - Bringing Humanities Data Together Inside Online Game Engines_0.tei.xml
output/Abbreviations In Manuscripts - Systematization And Crowdsourcing By Ad Fontes_0.tei.xml
output/A Bilingual Digital Edition of Trinity College Cambridge MS O.1.77_0.tei.xml
output/Abundance and Access - Early Modern Political Letters in Contemporary and Digital Archives_0.tei.xml
output/Abusing the Concept of Normalization for Better Collation Results (and Profit)_0.tei.xml
output/Academic Migrants - A Digital Discussion of Transnational Teaching and Learning_0.tei.xml
output/Academic Pillow-Talk and Two Immersive Explorations of Linguistic Space_0.tei.xml
output/1 Million Dutch Newspaper Images available for researchers - The KBK-1M Dataset_0.tei.xml
output/3D-ICONS -- 3D Digitisation of Icons of European Architectural and Archaeological Heritage_0.tei.xml
output/3D Scanning for Preservation - Difficulties and Dissemination_0.tei.xml
output/3rd International Workshop on Computational History, HistoInformatics@DH 2016, Krakow, Poland, July 11, 2016_0.tei.xml
output/4 Default Text Structure - The TEI Guidelines_0.tei.xml
output/4Humanities - Designing Digital Advocacy_0.tei.xml
output/4 Ríos - una construcción transmedia de memoria histórica sobre el conflicto armado en Colombia_0.tei.xml
output/9 Dictionaries - The TEI Guidelines_0.tei.xml
output/A 3D Common Ground - Bringing Humanities Data Together Inside Online Game Engines_0.tei.xml
output/Abbreviations In Manuscripts - Systematization And Crowdsourcing By Ad Fontes_0.tei.xml
output/A Bilingual Digital Edition of Trinity College Cambridge MS O.1.77_0.tei.xml
output/Abundance and Access - Early Modern Political Letters in Contemporary and Digital Archives_0.tei.xml
output/Abusing the Concept of Normalization for Better Collation Results (and Profit)_0.tei.xml
output/Academic Migrants - A Digital Discussion of Transnational Teaching and Learning_0.tei.xml
output/Academic Pillow-Talk and Two Immersive Explorations of Linguistic Space_0.tei.xml
output/Academy of Finland Research Programme "Digital Humanities" (DIGIHUM)_0.tei.xml
output/A Case Study of Integration of Services and Resources on a Web Service_0.tei.xml
output/A catalogue of digital editions_0.tei.xml
output/Accessibility and Reception - Vector Semantics, Reading Publics, and the Changing Reception of Literary Works_0.tei.xml
output/Access, Ownership, Protection - The Ethics of Digital Scholarship_0.tei.xml
output/Achieving Machine-Readable Mayan Text via Unicode - Blending "Old World" script-encoding with novel digital approaches_0.tei.xml
output/A Clear Temporal GIS Viewer and Software for Discovering Irregularities in Historical GIS_0.tei.xml
output/Academy of Finland Research Programme "Digital Humanities" (DIGIHUM)_0.tei.xml
output/A Case Study of Integration of Services and Resources on a Web Service_0.tei.xml
output/A catalogue of digital editions_0.tei.xml
output/Accessibility and Reception - Vector Semantics, Reading Publics, and the Changing Reception of Literary Works_0.tei.xml
output/Access, Ownership, Protection - The Ethics of Digital Scholarship_0.tei.xml
output/Achieving Machine-Readable Mayan Text via Unicode - Blending "Old World" script-encoding with novel digital approaches_0.tei.xml
output/A Clear Temporal GIS Viewer and Software for Discovering Irregularities in Historical GIS_0.tei.xml
output/Academy of Finland Research Programme "Digital Humanities" (DIGIHUM)_0.tei.xml
output/A Case Study of Integration of Services and Resources on a Web Service_0.tei.xml
output/A catalogue of digital editions_0.tei.xml
output/Accessibility and Reception - Vector Semantics, Reading Publics, and the Changing Reception of Literary Works_0.tei.xml
output/Access, Ownership, Protection - The Ethics of Digital Scholarship_0.tei.xml
output/Achieving Machine-Readable Mayan Text via Unicode - Blending "Old World" script-encoding with novel digital approaches_0.tei.xml
output/A Clear Temporal GIS Viewer and Software for Discovering Irregularities in Historical GIS_0.tei.xml
output/Academy of Finland Research Programme "Digital Humanities" (DIGIHUM)_0.tei.xml
output/A Case Study of Integration of Services and Resources on a Web Service_0.tei.xml
output/A catalogue of digital editions_0.tei.xml
output/Accessibility and Reception - Vector Semantics, Reading Publics, and the Changing Reception of Literary Works_0.tei.xml
output/Access, Ownership, Protection - The Ethics of Digital Scholarship_0.tei.xml
output/Achieving Machine-Readable Mayan Text via Unicode - Blending "Old World" script-encoding with novel digital approaches_0.tei.xml
output/A Clear Temporal GIS Viewer and Software for Discovering Irregularities in Historical GIS_0.tei.xml
output/Academy of Finland Research Programme "Digital Humanities" (DIGIHUM)_0.tei.xml
output/A Case Study of Integration of Services and Resources on a Web Service_0.tei.xml
output/A catalogue of digital editions_0.tei.xml
output/Accessibility and Reception - Vector Semantics, Reading Publics, and the Changing Reception of Literary Works_0.tei.xml
output/Access, Ownership, Protection - The Ethics of Digital Scholarship_0.tei.xml
output/Achieving Machine-Readable Mayan Text via Unicode - Blending "Old World" script-encoding with novel digital approaches_0.tei.xml
output/A Clear Temporal GIS Viewer and Software for Discovering Irregularities in Historical GIS_0.tei.xml
......@@ -6841,3 +6841,8 @@ http://www.dh2012.uni-hamburg.de/conference/programme/abstracts/workflows-as-str
http://www.dh2012.uni-hamburg.de/conference/programme/abstracts/writing-with-sound-composing-multimodal-long-form-scholarship.1.html
http://www.dh2012.uni-hamburg.de/conference/programme/abstracts/xml-print-an-ergonomic-typesetting-system-for-complex-text-structures.1.html
http://xtf-prod.stanford.edu/xtf/view?docId=tei/ab-360.xml
http://xtf-prod.stanford.edu/xtf/view?docId=tei/ab-208.xml
http://www.dh2012.uni-hamburg.de/conference/programme/abstracts/academic-research-in-the-blogosphere-adapting-to-new-opportunities-and-risks-on-the-internet.1.html
http://xtf-prod.stanford.edu/xtf/view?docId=tei/ab-208.xml
http://www.dh2012.uni-hamburg.de/conference/programme/abstracts/academic-research-in-the-blogosphere-adapting-to-new-opportunities-and-risks-on-the-internet.1.html
http://xtf-prod.stanford.edu/xtf/view?docId=tei/ab-264.xml
......@@ -6473,3 +6473,10 @@ http://dharchive.org/paper/DH2014/Paper-205.xml
http://dharchive.org/paper/DH2014/Plenary-809.xml
http://dharchive.org/paper/DH2014/Paper-724.xml
http://dharchive.org/paper/DH2014/Paper-877.xml
http://dharchive.org/paper/DH2014/Paper-652.xml
https://doi.org/10.1093/llc/fqz025
http://dharchive.org/paper/DH2014/Paper-652.xml
https://doi.org/10.1093/llc/fqz025
https://doi.org/10.1093/llc/fqz007
http://dharchive.org/paper/DH2014/Paper-416.xml
http://dharchive.org/paper/DH2014/Workshops-802.xml
{"number_of_downloaded_pubs": 2103}
\ No newline at end of file
{"number_of_downloaded_pubs": 14}
\ No newline at end of file
{"total_number_of_pubs": 2103, "number_of_extracted_toolnames": 5060, "number_of_distinct_extracted_toolnames": 2342, "number_of_pubs_without_tool_mentions": 1501, "number_of_pubs_with_tool_mentions": 602, "total_number_of_sentences": 104821}
\ No newline at end of file
{"total_number_of_pubs": 14, "number_of_extracted_toolnames": 16, "number_of_distinct_extracted_toolnames": 16, "number_of_pubs_without_tool_mentions": 11, "number_of_pubs_with_tool_mentions": 3, "total_number_of_sentences": 206}
\ No newline at end of file
{"number_of_retreived_pubs": 18}
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment