Skip to content
Snippets Groups Projects
Unverified Commit 4b556f80 authored by mweidling's avatar mweidling Committed by GitHub
Browse files

Refactor and improve logging

* refactor: remove dead code, improve logging

* build: add targets for convenience

* refactor

* build: add complete cleaning

* build: remove surplus copy (dir is mounted)

* refactor: restore old check, fix bug, extend logging

* refactor: remove cleaning from bash script

* refactor: remove dead code, streamline params

* refactor: remove dead code

* update .gitignore

* refactor: remove dead code

* refactor: mkdirs, fix bug in var assignment
parent cf50b1b4
No related branches found
No related tags found
1 merge request!1Merge GitHub's state
......@@ -7,6 +7,7 @@ work/
workflows/nf-results/*
workflows/results
workflows/ocrd_workflows/*.nf
workflows/ocrd_workflows/ocrd.log
models
.idea
gt/*
......
......@@ -26,6 +26,4 @@ RUN pip3 install -r requirements.txt
RUN pip3 install .
RUN nextflow
COPY workflows workflows
ENTRYPOINT [ "bash" ]
\ No newline at end of file
......@@ -12,4 +12,13 @@ run:
docker compose exec app bash workflows/execute_workflows.sh > logs/run_$$(date +"%s").log
stop:
CONTAINER_ID=$$(docker ps | grep quiver | cut -d' ' -f1); docker container stop $$CONTAINER_ID && docker container remove $$CONTAINER_ID
\ No newline at end of file
CONTAINER_ID=$$(docker ps | grep quiver | cut -d' ' -f1); docker container stop $$CONTAINER_ID && docker container remove $$CONTAINER_ID
clean-workspaces:
docker compose exec app rm -rf workflows/workspaces
clean-results:
docker compose exec app rm -rf workflows/nf-results workflows/results
clean: clean-workspaces clean-results
@echo "Cleaning everything."
\ No newline at end of file
......@@ -4,21 +4,17 @@ ROOT=$PWD
WORKFLOW_DIR="$ROOT"/workflows
OCRD_WORKFLOW_DIR="$WORKFLOW_DIR"/ocrd_workflows
WORKSPACE_DIR="$WORKFLOW_DIR"/workspaces
RESULTS_DIR="$WORKFLOW_DIR"/results
set -euo pipefail
clean_up_dirs() {
if [[ -d workflows/workspaces ]]; then
rm -rf workflows/workspaces
fi
if [[ -d workflows/nf-results ]]; then
rm -rf workflows/nf-results
fi
if [[ -d workflows/results ]]; then
rm -rf workflows/results
fi
mkdir -p "$WORKSPACE_DIR"
mkdir -p "$RESULTS_DIR"
mkdir workflows/nf-results
}
convert_ocrd_wfs_to_NextFlow() {
......@@ -26,14 +22,9 @@ convert_ocrd_wfs_to_NextFlow() {
echo "Convert OCR-D workflows to NextFlow …"
mkdir -p "$WORKFLOW_DIR/nf-results"
for FILE in *.txt
do
oton convert -I "$FILE" -O "$FILE".nf
# the venv part is not needed since we execute this in an image derived from ocrd/all:maximum
sed -i 's/source "${params.venv_path}"//g' "$FILE".nf
sed -i 's/deactivate//g' "$FILE".nf
done
}
......@@ -41,19 +32,16 @@ download_models() {
echo "Download the necessary models if not available"
if [[ ! -f /usr/local/share/tessdata/Fraktur_GT4HistOCR.traineddata ]]
then
#mkdir -p /usr/local/share/ocrd-resources/
ocrd resmgr download ocrd-tesserocr-recognize '*'
fi
if [[ ! -d /usr/local/share/ocrd-resources/ocrd-calamari-recognize/qurator-gt4histocr-1.0 ]]
then
mkdir -p /usr/local/share/ocrd-resources/
ocrd resmgr download ocrd-calamari-recognize qurator-gt4histocr-1.0
fi
}
create_wf_specific_workspaces() {
# execute this workflow on the existing data (incl. evaluation)
mkdir -p "$WORKSPACE_DIR"/tmp
cd "$WORKSPACE_DIR" || exit
# create workspace for all OCR workflows.
......@@ -65,67 +53,54 @@ create_wf_specific_workspaces() {
for WORKFLOW in "$OCRD_WORKFLOW_DIR"/*ocr.txt.nf
do
WF_NAME=$(basename -s .txt.nf "$WORKFLOW")
cp -r "$ROOT"/gt/"$DIR_NAME" "$WORKSPACE_DIR"/tmp/"$DIR_NAME"_"$WF_NAME"
cp "$WORKFLOW" "$WORKSPACE_DIR"/tmp/"$DIR_NAME"_"$WF_NAME"/data/*/
TARGET_DIR="$WORKSPACE_DIR"/"$DIR_NAME"_"$WF_NAME"
if [[ ! -d "$TARGET_DIR" ]]; then
cp -r "$ROOT"/gt/"$DIR_NAME" "$TARGET_DIR"
cp "$WORKFLOW" "$TARGET_DIR"/data/*/
cp "$OCRD_WORKFLOW_DIR"/*eval.txt.nf "$TARGET_DIR"/data/*/
rm -rf "$WORKSPACE_DIR"/log.log
else
echo "$TARGET_DIR already exists. Skipping."
fi
done
fi
done
}
clean_up_tmp_dirs() {
echo "Clean up intermediate dirs …"
for DIR in "$WORKSPACE_DIR"/tmp/*
do
echo "Cleaning up $DIR."
DIR_NAME=$(basename "$DIR")
mv "$DIR" "$WORKSPACE_DIR"/"$DIR_NAME"
cp "$OCRD_WORKFLOW_DIR"/*eval.txt.nf "$WORKSPACE_DIR"/"$DIR_NAME"/data/*/
done
rm -rf "$WORKSPACE_DIR"/tmp
rm -rf "$WORKSPACE_DIR"/log.log
}
execute_wfs_and_extract_benchmarks() {
mkdir -p "$ROOT"/workflows/results
# for all data sets…
for WS_DIR in "$WORKSPACE_DIR"/*
do
INNER_DIR=$(ls "$WS_DIR"/data/)
DATA_DIR="$WS_DIR"/data
DIR_NAME=$(basename "$WS_DIR")
INNER_DIR=$(ls "$DATA_DIR"/)
if [ -d "$WS_DIR" ] && ! grep -q "OCR-D-OCR" "$WS_DIR/data/$INNER_DIR/mets.xml" ; then
echo "Switching to $WS_DIR."
if ! grep -q "OCR-D-OCR" "$WS_DIR/data/$INNER_DIR/mets.xml" ; then
echo "Switching to $WS_DIR."
DIR_NAME=$(basename "$WS_DIR")
run "$WS_DIR"/data/*/*ocr.txt.nf "$DIR_NAME" "$WS_DIR"
run "$WS_DIR"/data/*/*eval.txt.nf "$DIR_NAME" "$WS_DIR"
run "$DATA_DIR"/*/*ocr.txt.nf "$DIR_NAME" "$WS_DIR"
run "$DATA_DIR"/*/*eval.txt.nf "$DIR_NAME" "$WS_DIR"
# create a result JSON according to the specs
echo "Get Benchmark JSON …"
quiver benchmarks-extraction "$WS_DIR"/data/* "$WORKFLOW"
WORKFLOW=$(basename -s .txt.nf "$DATA_DIR"/*/*ocr.txt.nf)
quiver benchmarks-extraction "$DATA_DIR"/* "$WORKFLOW"
echo "Done."
# move data to results dir
mv "$WS_DIR"/data/*/*.json "$WORKFLOW_DIR"/results
mv "$DATA_DIR"/*/*result.json "$RESULTS_DIR"
else
echo "$WS_DIR has already been processed."
fi
done
cd "$ROOT" || exit
}
adjust_workflow_settings() {
# $1: $WORKFLOW
# $2: $DIR_NAME
sed -i "s CURRENT app/workflows/workspaces/$2/data/*/ g" "$1"
}
rename_and_move_nextflow_result() {
# rename NextFlow results in order to properly match them to the workflows
# $1: $WORKFLOW
# $2: $DIR_NAME
WORKFLOW_NAME=$(basename -s .txt.nf "$1")
rm "$WORKFLOW_DIR"/nf-results/*process_completed.json
mv "$WORKFLOW_DIR"/nf-results/*_completed.json "$WORKFLOW_DIR"/results/"$2"_"$WORKFLOW_NAME"_completed.json
if [ "$WORKFLOW_NAME" != "dinglehopper_eval" ]; then
for DIR in "$WORKSPACE_DIR"/work/*
do
......@@ -146,20 +121,19 @@ run() {
# $1: $WORKFLOW
# $2: $DIR_NAME
# $3: $WS_DIR
adjust_workflow_settings "$1" "$2"
nextflow run "$1" -with-weblog http://127.0.0.1:8000/nextflow/ --mets_path "/app/workflows/workspaces/$2/data/*/mets.xml"
rename_and_move_nextflow_result "$1" "$2"
save_workspaces "$3"/data "$2" "$1"
}
save_workspaces() {
# $1: $WS_DIR
# $1: $WORKFLOW
# $2: $DIR_NAME
# $3: $WORKFLOW
echo "Zipping workspace $1"
# $3: $WS_DIR
echo "Zipping workspace $3"
ocrd -l ERROR zip bag -d "$DIR_NAME"/data/* -i "$DIR_NAME"/data/* "$DIR_NAME"
WORKFLOW_NAME=$(basename -s .txt.nf "$3")
mv "$WORKSPACE_DIR"/"$2".zip "$WORKFLOW_DIR"/results/"$2"_"$WORKFLOW_NAME".zip
WORKFLOW_NAME=$(basename -s .txt.nf "$1")
mv "$WORKSPACE_DIR"/"$2".zip "$RESULTS_DIR"/"$2"_"$WORKFLOW_NAME".zip
}
summarize_to_data_json() {
......@@ -169,24 +143,12 @@ summarize_to_data_json() {
echo "Done."
}
final_clean_up() {
echo "Cleaning up …"
rm -rf "$WORKSPACE_DIR"
rm -rf "$ROOT"/work
rm -rf "$WORKFLOW_DIR"/nf-results
rm -rf "$WORKFLOW_DIR"/results
rm "$WORKFLOW_DIR"/ocrd_workflows/*.nf
}
clean_up_dirs
convert_ocrd_wfs_to_NextFlow
download_models
create_wf_specific_workspaces
clean_up_tmp_dirs
uvicorn api:app --app-dir "$ROOT"/src & # start webserver for evaluation
echo "Process is running. See logs at ./logs for more information."
sleep 2 && >&2 echo "Process is running. See logs at ./logs for more information."
execute_wfs_and_extract_benchmarks
summarize_to_data_json
final_clean_up
echo "All workflows have been run."
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment