Skip to content
Snippets Groups Projects
Unverified commit 4b556f80, authored by mweidling, committed by GitHub
Browse files

Refactor and improve logging

* refactor: remove dead code, improve logging

* build: add targets for convenience

* refactor

* build: add complete cleaning

* build: remove surplus copy (dir is mounted)

* refactor: restore old check, fix bug, extend logging

* refactor: remove cleaning from bash script

* refactor: remove dead code, streamline params

* refactor: remove dead code

* update .gitignore

* refactor: remove dead code

* refactor: mkdirs, fix bug in var assignment
parent cf50b1b4
No related branches found
No related tags found
1 merge request: !1 "Merge GitHub's state"
...@@ -7,6 +7,7 @@ work/ ...@@ -7,6 +7,7 @@ work/
workflows/nf-results/* workflows/nf-results/*
workflows/results workflows/results
workflows/ocrd_workflows/*.nf workflows/ocrd_workflows/*.nf
workflows/ocrd_workflows/ocrd.log
models models
.idea .idea
gt/* gt/*
......
...@@ -26,6 +26,4 @@ RUN pip3 install -r requirements.txt ...@@ -26,6 +26,4 @@ RUN pip3 install -r requirements.txt
RUN pip3 install . RUN pip3 install .
RUN nextflow RUN nextflow
COPY workflows workflows
ENTRYPOINT [ "bash" ] ENTRYPOINT [ "bash" ]
\ No newline at end of file
...@@ -12,4 +12,13 @@ run: ...@@ -12,4 +12,13 @@ run:
docker compose exec app bash workflows/execute_workflows.sh > logs/run_$$(date +"%s").log docker compose exec app bash workflows/execute_workflows.sh > logs/run_$$(date +"%s").log
stop: stop:
CONTAINER_ID=$$(docker ps | grep quiver | cut -d' ' -f1); docker container stop $$CONTAINER_ID && docker container remove $$CONTAINER_ID CONTAINER_ID=$$(docker ps | grep quiver | cut -d' ' -f1); docker container stop $$CONTAINER_ID && docker container remove $$CONTAINER_ID
\ No newline at end of file
clean-workspaces:
docker compose exec app rm -rf workflows/workspaces
clean-results:
docker compose exec app rm -rf workflows/nf-results workflows/results
clean: clean-workspaces clean-results
@echo "Cleaning everything."
\ No newline at end of file
...@@ -4,21 +4,17 @@ ROOT=$PWD ...@@ -4,21 +4,17 @@ ROOT=$PWD
WORKFLOW_DIR="$ROOT"/workflows WORKFLOW_DIR="$ROOT"/workflows
OCRD_WORKFLOW_DIR="$WORKFLOW_DIR"/ocrd_workflows OCRD_WORKFLOW_DIR="$WORKFLOW_DIR"/ocrd_workflows
WORKSPACE_DIR="$WORKFLOW_DIR"/workspaces WORKSPACE_DIR="$WORKFLOW_DIR"/workspaces
RESULTS_DIR="$WORKFLOW_DIR"/results
set -euo pipefail set -euo pipefail
clean_up_dirs() { clean_up_dirs() {
if [[ -d workflows/workspaces ]]; then
rm -rf workflows/workspaces
fi
if [[ -d workflows/nf-results ]]; then if [[ -d workflows/nf-results ]]; then
rm -rf workflows/nf-results rm -rf workflows/nf-results
fi fi
mkdir -p "$WORKSPACE_DIR"
if [[ -d workflows/results ]]; then mkdir -p "$RESULTS_DIR"
rm -rf workflows/results mkdir workflows/nf-results
fi
} }
convert_ocrd_wfs_to_NextFlow() { convert_ocrd_wfs_to_NextFlow() {
...@@ -26,14 +22,9 @@ convert_ocrd_wfs_to_NextFlow() { ...@@ -26,14 +22,9 @@ convert_ocrd_wfs_to_NextFlow() {
echo "Convert OCR-D workflows to NextFlow …" echo "Convert OCR-D workflows to NextFlow …"
mkdir -p "$WORKFLOW_DIR/nf-results"
for FILE in *.txt for FILE in *.txt
do do
oton convert -I "$FILE" -O "$FILE".nf oton convert -I "$FILE" -O "$FILE".nf
# the venv part is not needed since we execute this in an image derived from ocrd/all:maximum
sed -i 's/source "${params.venv_path}"//g' "$FILE".nf
sed -i 's/deactivate//g' "$FILE".nf
done done
} }
...@@ -41,19 +32,16 @@ download_models() { ...@@ -41,19 +32,16 @@ download_models() {
echo "Download the necessary models if not available" echo "Download the necessary models if not available"
if [[ ! -f /usr/local/share/tessdata/Fraktur_GT4HistOCR.traineddata ]] if [[ ! -f /usr/local/share/tessdata/Fraktur_GT4HistOCR.traineddata ]]
then then
#mkdir -p /usr/local/share/ocrd-resources/
ocrd resmgr download ocrd-tesserocr-recognize '*' ocrd resmgr download ocrd-tesserocr-recognize '*'
fi fi
if [[ ! -d /usr/local/share/ocrd-resources/ocrd-calamari-recognize/qurator-gt4histocr-1.0 ]] if [[ ! -d /usr/local/share/ocrd-resources/ocrd-calamari-recognize/qurator-gt4histocr-1.0 ]]
then then
mkdir -p /usr/local/share/ocrd-resources/
ocrd resmgr download ocrd-calamari-recognize qurator-gt4histocr-1.0 ocrd resmgr download ocrd-calamari-recognize qurator-gt4histocr-1.0
fi fi
} }
create_wf_specific_workspaces() { create_wf_specific_workspaces() {
# execute this workflow on the existing data (incl. evaluation) # execute this workflow on the existing data (incl. evaluation)
mkdir -p "$WORKSPACE_DIR"/tmp
cd "$WORKSPACE_DIR" || exit cd "$WORKSPACE_DIR" || exit
# create workspace for all OCR workflows. # create workspace for all OCR workflows.
...@@ -65,67 +53,54 @@ create_wf_specific_workspaces() { ...@@ -65,67 +53,54 @@ create_wf_specific_workspaces() {
for WORKFLOW in "$OCRD_WORKFLOW_DIR"/*ocr.txt.nf for WORKFLOW in "$OCRD_WORKFLOW_DIR"/*ocr.txt.nf
do do
WF_NAME=$(basename -s .txt.nf "$WORKFLOW") WF_NAME=$(basename -s .txt.nf "$WORKFLOW")
cp -r "$ROOT"/gt/"$DIR_NAME" "$WORKSPACE_DIR"/tmp/"$DIR_NAME"_"$WF_NAME" TARGET_DIR="$WORKSPACE_DIR"/"$DIR_NAME"_"$WF_NAME"
cp "$WORKFLOW" "$WORKSPACE_DIR"/tmp/"$DIR_NAME"_"$WF_NAME"/data/*/ if [[ ! -d "$TARGET_DIR" ]]; then
cp -r "$ROOT"/gt/"$DIR_NAME" "$TARGET_DIR"
cp "$WORKFLOW" "$TARGET_DIR"/data/*/
cp "$OCRD_WORKFLOW_DIR"/*eval.txt.nf "$TARGET_DIR"/data/*/
rm -rf "$WORKSPACE_DIR"/log.log
else
echo "$TARGET_DIR already exists. Skipping."
fi
done done
fi fi
done done
} }
clean_up_tmp_dirs() {
echo "Clean up intermediate dirs …"
for DIR in "$WORKSPACE_DIR"/tmp/*
do
echo "Cleaning up $DIR."
DIR_NAME=$(basename "$DIR")
mv "$DIR" "$WORKSPACE_DIR"/"$DIR_NAME"
cp "$OCRD_WORKFLOW_DIR"/*eval.txt.nf "$WORKSPACE_DIR"/"$DIR_NAME"/data/*/
done
rm -rf "$WORKSPACE_DIR"/tmp
rm -rf "$WORKSPACE_DIR"/log.log
}
execute_wfs_and_extract_benchmarks() { execute_wfs_and_extract_benchmarks() {
mkdir -p "$ROOT"/workflows/results
# for all data sets… # for all data sets…
for WS_DIR in "$WORKSPACE_DIR"/* for WS_DIR in "$WORKSPACE_DIR"/*
do do
INNER_DIR=$(ls "$WS_DIR"/data/) DATA_DIR="$WS_DIR"/data
DIR_NAME=$(basename "$WS_DIR")
INNER_DIR=$(ls "$DATA_DIR"/)
if [ -d "$WS_DIR" ] && ! grep -q "OCR-D-OCR" "$WS_DIR/data/$INNER_DIR/mets.xml" ; then if ! grep -q "OCR-D-OCR" "$WS_DIR/data/$INNER_DIR/mets.xml" ; then
echo "Switching to $WS_DIR." echo "Switching to $WS_DIR."
DIR_NAME=$(basename "$WS_DIR") run "$DATA_DIR"/*/*ocr.txt.nf "$DIR_NAME" "$WS_DIR"
run "$DATA_DIR"/*/*eval.txt.nf "$DIR_NAME" "$WS_DIR"
run "$WS_DIR"/data/*/*ocr.txt.nf "$DIR_NAME" "$WS_DIR"
run "$WS_DIR"/data/*/*eval.txt.nf "$DIR_NAME" "$WS_DIR"
# create a result JSON according to the specs # create a result JSON according to the specs
echo "Get Benchmark JSON …" echo "Get Benchmark JSON …"
quiver benchmarks-extraction "$WS_DIR"/data/* "$WORKFLOW" WORKFLOW=$(basename -s .txt.nf "$DATA_DIR"/*/*ocr.txt.nf)
quiver benchmarks-extraction "$DATA_DIR"/* "$WORKFLOW"
echo "Done." echo "Done."
# move data to results dir # move data to results dir
mv "$WS_DIR"/data/*/*.json "$WORKFLOW_DIR"/results mv "$DATA_DIR"/*/*result.json "$RESULTS_DIR"
else
echo "$WS_DIR has already been processed."
fi fi
done done
cd "$ROOT" || exit cd "$ROOT" || exit
} }
adjust_workflow_settings() {
# $1: $WORKFLOW
# $2: $DIR_NAME
sed -i "s CURRENT app/workflows/workspaces/$2/data/*/ g" "$1"
}
rename_and_move_nextflow_result() { rename_and_move_nextflow_result() {
# rename NextFlow results in order to properly match them to the workflows # rename NextFlow results in order to properly match them to the workflows
# $1: $WORKFLOW # $1: $WORKFLOW
# $2: $DIR_NAME # $2: $DIR_NAME
WORKFLOW_NAME=$(basename -s .txt.nf "$1") WORKFLOW_NAME=$(basename -s .txt.nf "$1")
rm "$WORKFLOW_DIR"/nf-results/*process_completed.json
mv "$WORKFLOW_DIR"/nf-results/*_completed.json "$WORKFLOW_DIR"/results/"$2"_"$WORKFLOW_NAME"_completed.json
if [ "$WORKFLOW_NAME" != "dinglehopper_eval" ]; then if [ "$WORKFLOW_NAME" != "dinglehopper_eval" ]; then
for DIR in "$WORKSPACE_DIR"/work/* for DIR in "$WORKSPACE_DIR"/work/*
do do
...@@ -146,20 +121,19 @@ run() { ...@@ -146,20 +121,19 @@ run() {
# $1: $WORKFLOW # $1: $WORKFLOW
# $2: $DIR_NAME # $2: $DIR_NAME
# $3: $WS_DIR # $3: $WS_DIR
adjust_workflow_settings "$1" "$2"
nextflow run "$1" -with-weblog http://127.0.0.1:8000/nextflow/ --mets_path "/app/workflows/workspaces/$2/data/*/mets.xml" nextflow run "$1" -with-weblog http://127.0.0.1:8000/nextflow/ --mets_path "/app/workflows/workspaces/$2/data/*/mets.xml"
rename_and_move_nextflow_result "$1" "$2" rename_and_move_nextflow_result "$1" "$2"
save_workspaces "$3"/data "$2" "$1" save_workspaces "$3"/data "$2" "$1"
} }
save_workspaces() { save_workspaces() {
# $1: $WS_DIR # $1: $WORKFLOW
# $2: $DIR_NAME # $2: $DIR_NAME
# $3: $WORKFLOW # $3: $WS_DIR
echo "Zipping workspace $1" echo "Zipping workspace $3"
ocrd -l ERROR zip bag -d "$DIR_NAME"/data/* -i "$DIR_NAME"/data/* "$DIR_NAME" ocrd -l ERROR zip bag -d "$DIR_NAME"/data/* -i "$DIR_NAME"/data/* "$DIR_NAME"
WORKFLOW_NAME=$(basename -s .txt.nf "$3") WORKFLOW_NAME=$(basename -s .txt.nf "$1")
mv "$WORKSPACE_DIR"/"$2".zip "$WORKFLOW_DIR"/results/"$2"_"$WORKFLOW_NAME".zip mv "$WORKSPACE_DIR"/"$2".zip "$RESULTS_DIR"/"$2"_"$WORKFLOW_NAME".zip
} }
summarize_to_data_json() { summarize_to_data_json() {
...@@ -169,24 +143,12 @@ summarize_to_data_json() { ...@@ -169,24 +143,12 @@ summarize_to_data_json() {
echo "Done." echo "Done."
} }
final_clean_up() {
echo "Cleaning up …"
rm -rf "$WORKSPACE_DIR"
rm -rf "$ROOT"/work
rm -rf "$WORKFLOW_DIR"/nf-results
rm -rf "$WORKFLOW_DIR"/results
rm "$WORKFLOW_DIR"/ocrd_workflows/*.nf
}
clean_up_dirs clean_up_dirs
convert_ocrd_wfs_to_NextFlow convert_ocrd_wfs_to_NextFlow
download_models download_models
create_wf_specific_workspaces create_wf_specific_workspaces
clean_up_tmp_dirs
uvicorn api:app --app-dir "$ROOT"/src & # start webserver for evaluation uvicorn api:app --app-dir "$ROOT"/src & # start webserver for evaluation
echo "Process is running. See logs at ./logs for more information." sleep 2 && >&2 echo "Process is running. See logs at ./logs for more information."
execute_wfs_and_extract_benchmarks execute_wfs_and_extract_benchmarks
summarize_to_data_json summarize_to_data_json
final_clean_up
echo "All workflows have been run." echo "All workflows have been run."
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment