Something went wrong on our end
-
mweidling authored
* feat: add Reichsanzeiger-GT to prepare.sh * add Reichsanzeiger subsets * add download for Reichsanzeiger subsets * add first draft for reichsanzeiger gt * extract sample bundles * update .gitignore * clean up * tidy up project root * tidy up volumes * update README * update data * build: roll back to version where cis runs still runs * build: remove default model mounting
mweidling authored: * feat: add Reichsanzeiger-GT to prepare.sh * add Reichsanzeiger subsets * add download for Reichsanzeiger subsets * add first draft for reichsanzeiger gt * extract sample bundles * update .gitignore * clean up * tidy up project root * tidy up volumes * update README * update data * build: roll back to version where cis runs still runs * build: remove default model mounting
execute_workflows.sh 5.48 KiB
#!/bin/bash
# Prepares workflow-specific workspaces, runs the converted workflows on them
# and collects the benchmark results.
# All paths are derived from the current working directory, so the script has
# to be started from the project root.

# Abort on errors, on unset variables and on failures inside pipelines.
set -euo pipefail

ROOT="${PWD}"
WORKFLOW_DIR="${ROOT}/workflows"
OCRD_WORKFLOW_DIR="${WORKFLOW_DIR}/ocrd_workflows"
WORKSPACE_DIR="${WORKFLOW_DIR}/workspaces"
#######################################
# Removes the output directories of a previous run, if they exist.
# The paths are relative to the current working directory.
#######################################
clean_up_dirs() {
  local stale_dir
  for stale_dir in workflows/workspaces workflows/nf-results workflows/results; do
    if [[ -d "$stale_dir" ]]; then
      rm -rf "$stale_dir"
    fi
  done
}
convert_ocrd_wfs_to_NextFlow() {
cd "$OCRD_WORKFLOW_DIR" || exit
echo "Convert OCR-D workflows to NextFlow …"
mkdir -p "$WORKFLOW_DIR/nf-results"
for FILE in *.txt
do
oton convert -I "$FILE" -O "$FILE".nf
# the venv part is not needed since we execute this in an image derived from ocrd/all:maximum
sed -i 's/source "${params.venv_path}"//g' "$FILE".nf
sed -i 's/deactivate//g' "$FILE".nf
done
}
#######################################
# Downloads the recognition models via `ocrd resmgr` unless the corresponding
# directory already exists.
#######################################
download_models() {
  echo "Download the necessary models if not available"

  local tessdata_dir=/usr/local/share/tessdata
  local resources_dir=/usr/local/share/ocrd-resources
  local calamari_model=qurator-gt4histocr-1.0

  if ! [[ -d "$tessdata_dir" ]]; then
    ocrd resmgr download ocrd-tesserocr-recognize '*'
  fi

  if ! [[ -d "$resources_dir/ocrd-calamari-recognize/$calamari_model" ]]; then
    mkdir -p "$resources_dir"/
    ocrd resmgr download ocrd-calamari-recognize "$calamari_model"
  fi
}
#######################################
# Creates one workspace per (ground-truth set, OCR workflow) pair below
# $WORKSPACE_DIR/tmp, so that each workflow works on its own copy of the data.
# The "reichsanzeiger-gt" directory is skipped.
# Globals:
#   ROOT, WORKSPACE_DIR, OCRD_WORKFLOW_DIR (read)
#   WORKFLOW (written)
# Side effects:
#   Leaves the working directory at $WORKSPACE_DIR.
#######################################
create_wf_specific_workspaces() {
  mkdir -p "$WORKSPACE_DIR"/tmp
  cd "$WORKSPACE_DIR" || exit

  local gt_dir gt_name wf_name target
  for gt_dir in "$ROOT"/gt/*/; do
    gt_name=$(basename "$gt_dir")
    if [[ $gt_name == "reichsanzeiger-gt" ]]; then
      continue
    fi
    echo "Create workflow specific workspace for $gt_name."

    # NOTE: WORKFLOW is deliberately not declared local; other parts of the
    # script may read its last value after this function has returned.
    for WORKFLOW in "$OCRD_WORKFLOW_DIR"/*ocr.txt.nf; do
      wf_name=$(basename "$WORKFLOW" .txt.nf)
      target="$WORKSPACE_DIR/tmp/${gt_name}_${wf_name}"
      cp -r "$ROOT/gt/$gt_name" "$target"
      # place the workflow next to the data it is going to process
      cp "$WORKFLOW" "$target"/data/*/
    done
  done
}
#######################################
# Moves every prepared workspace from $WORKSPACE_DIR/tmp up into
# $WORKSPACE_DIR, adds the evaluation workflow(s) to its data directory and
# removes the temporary directory and log file afterwards.
# Globals:
#   WORKSPACE_DIR, OCRD_WORKFLOW_DIR (read)
#######################################
clean_up_tmp_dirs() {
  echo "Clean up intermediate dirs …"

  local staged final
  for staged in "$WORKSPACE_DIR"/tmp/*; do
    echo "Cleaning up $staged."
    final="$WORKSPACE_DIR/$(basename "$staged")"
    mv "$staged" "$final"
    cp "$OCRD_WORKFLOW_DIR"/*eval.txt.nf "$final"/data/*/
  done

  rm -rf "$WORKSPACE_DIR"/tmp "$WORKSPACE_DIR"/log.log
}
#######################################
# Runs the OCR and the evaluation workflow in every workspace, extracts the
# benchmark JSON for it and moves the JSON files to the results directory.
# Globals:
#   ROOT, WORKFLOW_DIR, OCRD_WORKFLOW_DIR, WORKSPACE_DIR (read)
#   DIR_NAME (written)
# Side effects:
#   Changes the working directory to $ROOT when done.
#######################################
execute_wfs_and_extract_benchmarks() {
  local ws_dir
  local -a ocr_workflows

  mkdir -p "$ROOT"/workflows/results

  # for all data sets…
  for ws_dir in "$WORKSPACE_DIR"/*; do
    [[ -d "$ws_dir" ]] || continue
    echo "Switching to $ws_dir."

    # DIR_NAME is intentionally left global: helper functions invoked through
    # run may read it.
    DIR_NAME=$(basename "$ws_dir")

    # Remember which OCR workflow belongs to this workspace. Previously the
    # extraction step used $WORKFLOW, a loop variable leaked from the
    # workspace creation, which always pointed at the last workflow.
    ocr_workflows=("$ws_dir"/data/*/*ocr.txt.nf)

    run "$ws_dir"/data/*/*ocr.txt.nf "$DIR_NAME" "$ws_dir"
    run "$ws_dir"/data/*/*eval.txt.nf "$DIR_NAME" "$ws_dir"

    # create a result JSON according to the specs
    echo "Get Benchmark JSON …"
    # Pass the workflow file from $OCRD_WORKFLOW_DIR with the same file name
    # as the copy inside the workspace.
    quiver benchmarks-extraction "$ws_dir"/data/* \
      "$OCRD_WORKFLOW_DIR/$(basename "${ocr_workflows[0]}")"
    echo "Done."

    # move data to results dir
    mv "$ws_dir"/data/*/*.json "$WORKFLOW_DIR"/results
  done

  cd "$ROOT" || exit
}
#######################################
# Replaces every occurrence of the placeholder CURRENT in a workflow file
# with the data path of the given workspace.
# Arguments:
#   $1 - path of the NextFlow workflow file (edited in place)
#   $2 - name of the workspace directory
#######################################
adjust_workflow_settings() {
  local nf_file=$1 workspace_name=$2
  # The trailing "*/" is written literally into the file; it is not expanded
  # by the shell here.
  local data_path="app/workflows/workspaces/${workspace_name}/data/*/"
  # sed uses a blank as delimiter because the replacement contains slashes.
  sed -i "s CURRENT ${data_path} g" "$nf_file"
}
#######################################
# Renames the NextFlow result so it can be matched to its workflow, collects
# the per-task command logs and removes NextFlow's leftovers.
# Globals:
#   WORKFLOW_DIR, WORKSPACE_DIR (read)
# Arguments:
#   $1 - path of the NextFlow workflow file
#   $2 - name of the workspace directory
#######################################
rename_and_move_nextflow_result() {
  local wf_name work_dir task_dir

  wf_name=$(basename "$1" .txt.nf)

  # drop the per-process result, keep the remaining *_completed.json
  rm "$WORKFLOW_DIR"/nf-results/*process_completed.json
  mv "$WORKFLOW_DIR"/nf-results/*_completed.json \
    "$WORKFLOW_DIR/results/${2}_${wf_name}_completed.json"

  # the command logs of the evaluation workflow are not kept
  if [[ "$wf_name" != "dinglehopper_eval" ]]; then
    for work_dir in "$WORKSPACE_DIR"/work/*; do
      for task_dir in "$work_dir"/*; do
        mv "$task_dir"/.command.log \
          "$WORKSPACE_DIR/$2/$(basename "$work_dir")_$(basename "$task_dir").command.log"
      done
    done
  fi

  rm -rf "$WORKSPACE_DIR"/work/*
  rm "$WORKSPACE_DIR"/.nextflow.log
}
#######################################
# Executes one NextFlow workflow for one workspace and stores its results.
# Arguments:
#   $1 - path of the NextFlow workflow file
#   $2 - name of the workspace directory
#   $3 - path of the workspace directory
#######################################
run() {
  local nf_script=$1 workspace_name=$2

  adjust_workflow_settings "$nf_script" "$workspace_name"
  # NextFlow reports its progress to the local web server
  nextflow run "$nf_script" -with-weblog http://127.0.0.1:8000/nextflow/
  rename_and_move_nextflow_result "$nf_script" "$workspace_name"
  save_workspaces "$3"/data "$workspace_name" "$nf_script"
}
#######################################
# Packs one workspace with `ocrd zip bag` and moves the resulting archive to
# the results directory, named after workspace and workflow.
# Globals:
#   WORKSPACE_DIR, WORKFLOW_DIR (read)
# Arguments:
#   $1 - path of the workspace's data directory (only used in the log message)
#   $2 - name of the workspace directory below $WORKSPACE_DIR
#   $3 - path of the NextFlow workflow that has just been executed
#######################################
save_workspaces() {
  local workspace_name=$2
  local workflow_name

  echo "Zipping workspace $1"

  # Use the workspace name passed as $2 instead of the caller's global
  # DIR_NAME. The paths given to ocrd are relative to $WORKSPACE_DIR, and the
  # mv below expects the archive there, so run ocrd from that directory in a
  # subshell; the caller's working directory stays untouched.
  (
    cd "$WORKSPACE_DIR" || exit
    ocrd zip bag -d "$workspace_name"/data/* -i "$workspace_name"/data/* "$workspace_name"
  )

  workflow_name=$(basename -s .txt.nf "$3")
  mv "$WORKSPACE_DIR/${workspace_name}.zip" \
    "$WORKFLOW_DIR/results/${workspace_name}_${workflow_name}.zip"
}
#######################################
# Lets quiver summarize the individual benchmark JSONs and reports progress
# on stdout.
#######################################
summarize_to_data_json() {
  printf '%s\n' "Summarize JSONs to one file …"
  quiver summarize-benchmarks
  printf '%s\n' "Done."
}
#######################################
# Removes all intermediate directories of this run as well as the generated
# NextFlow files.
# Globals:
#   ROOT, WORKFLOW_DIR, WORKSPACE_DIR (read)
#######################################
final_clean_up() {
  echo "Cleaning up …"

  local leftover
  for leftover in \
    "$WORKSPACE_DIR" \
    "$ROOT"/work \
    "$WORKFLOW_DIR"/nf-results \
    "$WORKFLOW_DIR"/results; do
    rm -rf "$leftover"
  done

  # the converted workflows are regenerated on every run
  rm "$WORKFLOW_DIR"/ocrd_workflows/*.nf
}
# Main sequence: prepare the workspaces, start the web server, run all
# workflows, summarize the results and clean up.
clean_up_dirs
convert_ocrd_wfs_to_NextFlow
download_models
create_wf_specific_workspaces
clean_up_tmp_dirs

uvicorn api:app --app-dir "$ROOT"/src & # start webserver for evaluation
WEBSERVER_PID=$!
# Stop the background web server whenever the script ends, including an abort
# caused by `set -e`; otherwise it would keep running after the script is gone.
# The script's exit status is not changed by this trap.
trap 'kill "$WEBSERVER_PID" 2>/dev/null || true' EXIT

execute_wfs_and_extract_benchmarks
summarize_to_data_json
final_clean_up