Commit aeb55688 authored by Seungbin Yim's avatar Seungbin Yim
Browse files

Configure DVC Pipeline for ML Training

parent 8bab88a4
# Recorded state of the preprocessing stages: for every stage, the command
# that was run plus the checksum and size of each dependency and output.
# Nesting is significant: `cmd`/`deps`/`outs` belong to the stage key above
# them, and `md5`/`size`/`nfiles` belong to the preceding `path` entry.
preprocess_dh_pub:
  cmd: python preprocess/tei_to_sentences.py data/dh_pub/xml data/dh_pub_output/
  deps:
  - path: preprocess/tei_to_sentences.py
    md5: 102500cdb262dee333685baf89f46eee
    size: 2571
  outs:
  - path: data/dh_pub_output
    md5: 28987489de3a4824787d954807bdb568.dir
    size: 13457520
    nfiles: 2359
to_jsonl:
  cmd: python preprocess/corpus_to_jsonl.py data/dh_pub_output data/sentences/dh_pub_sentences.jsonl
  deps:
  # Same checksum as the `preprocess_dh_pub` output above, i.e. this stage
  # consumed exactly that directory state.
  - path: data/dh_pub_output
    md5: 28987489de3a4824787d954807bdb568.dir
    size: 13457520
    nfiles: 2359
  - path: preprocess/corpus_to_jsonl.py
    md5: b5d119d7929e1122cab6b3c367a7c3f2
    size: 1091
  outs:
  - path: data/sentences
    md5: b1880a37a96da4a422788a43146887c4.dir
    size: 14517551
    nfiles: 1
# Lock file for the pipeline: for every stage it records the command that was
# run plus the checksum and size of each dependency and output.
# Nesting is significant: stages live under `stages`, `cmd`/`deps`/`outs`
# belong to their stage, and `md5`/`size`/`nfiles` belong to the preceding
# `path` entry.
schema: '2.0'
stages:
  preprocess_dh_pub:
    cmd: python preprocess/tei_to_sentences.py data/dh_pub/xml data/dh_pub_output/
    deps:
    - path: preprocess/tei_to_sentences.py
      md5: 102500cdb262dee333685baf89f46eee
      size: 2571
    outs:
    - path: data/dh_pub_output
      md5: 28987489de3a4824787d954807bdb568.dir
      size: 13457520
      nfiles: 2359
  to_jsonl:
    cmd: python preprocess/corpus_to_jsonl.py data/dh_pub_output data/sentences/dh_pub_sentences.jsonl
    deps:
    # Same checksum as the `preprocess_dh_pub` output above, i.e. this stage
    # consumed exactly that directory state.
    - path: data/dh_pub_output
      md5: 28987489de3a4824787d954807bdb568.dir
      size: 13457520
      nfiles: 2359
    - path: preprocess/corpus_to_jsonl.py
      md5: b5d119d7929e1122cab6b3c367a7c3f2
      size: 1091
    outs:
    - path: data/sentences
      md5: b1880a37a96da4a422788a43146887c4.dir
      size: 14517551
      nfiles: 1
  ml_train:
    # NOTE(review): the recorded command uses `--n-iter 1`, whereas the
    # `ml_train` stage definition in this change uses `--n-iter 20`; this
    # entry looks stale -- presumably the pipeline must be reproduced to
    # refresh it. The recorded command is kept verbatim on purpose.
    cmd: prodigy train ner ann_manual,ann_corrected_p1 en_vectors_web_lg --init-tok2ve
      data/pretrain/model.bin --output ner_ml/model/tools_model_with_corrections_pretrained
      --eval-split 0.2 --n-iter 1
    deps:
    - path: data/annotations/ann_corrected_p1.jsonl
      md5: 51b217bbd23310ce4ce2eb5374282742
      size: 4242612
    - path: data/annotations/ann_corrected_p2.jsonl
      md5: 3d93f21e36e0df6f34fffb2ee5674f10
      size: 347989
    - path: data/annotations/ann_manual.jsonl
      md5: b74ee888672872f874e65bfd2ee85c29
      size: 5811880
    - path: data/pretrain/model.bin
      md5: 9128bb604cb4d6ae98c02abd7fb191bd
      size: 91739273
    outs:
    - path: ner_ml/model/tools_model_with_corrections_pretrained
      md5: 0b8d6deaeea712a34b0771bdbb57ff42.dir
      size: 1435077219
      nfiles: 9
......@@ -12,3 +12,12 @@ stages:
- preprocess/corpus_to_jsonl.py
outs:
- data/sentences
ml_train:
  # Trains the NER model with Prodigy on the `ann_manual` and
  # `ann_corrected_p1` datasets, initialising from data/pretrain/model.bin
  # and writing the model to the directory listed under `outs`.
  # `--init-tok2vec` is spelled out in full; the previous `--init-tok2ve`
  # was a truncated flag name.
  # The command is folded (`>-`), so it is still passed as one single line.
  cmd: >-
    prodigy train ner ann_manual,ann_corrected_p1 en_vectors_web_lg
    --init-tok2vec data/pretrain/model.bin
    --output ner_ml/model/tools_model_with_corrections_pretrained
    --eval-split 0.2 --n-iter 20
  deps:
  - data/annotations/ann_manual.jsonl
  - data/annotations/ann_corrected_p1.jsonl
  # NOTE(review): tracked as a dependency, but the command above does not
  # name an `ann_corrected_p2` dataset -- confirm whether it should be
  # added to the dataset list or dropped from `deps`.
  - data/annotations/ann_corrected_p2.jsonl
  - data/pretrain/model.bin
  outs:
  - ner_ml/model/tools_model_with_corrections_pretrained
\ No newline at end of file
# Created by .ignore support plugin (hsz.mobi)
prodigy/prodigy-1.10.2-cp36.cp37.cp38-cp36m.cp37m.cp38-macosx_10_14_x86_64.whl
# Requirements
You need Prodigy installed.
Tested with Python 3.8.4.

Train the model with:

```bash
prodigy train ner ann_manual,ann_corrected_p1 en_vectors_web_lg --init-tok2vec data/pretrain/model.bin --output ./tools_model_with_corrections_pretrained --eval-split 0.2 --n-iter 20
```
\ No newline at end of file
/tools_model_with_corrections_pretrained
https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.3.0/en_vectors_web_lg-2.3.0.tar.gz
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment