diff --git a/.gitignore b/.gitignore
index 87f881c403d513dbc17aeeae0fbeb524b2997ae2..6ff3b1976f4a70891f7731511b4cbc1c5753df32 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ work/
 workflows/nf-results/*
 workflows/results
 workflows/ocrd_workflows/*.nf
+workflows/ocrd_workflows/ocrd.log
 models
 .idea
 gt/*
diff --git a/Dockerfile b/Dockerfile
index ebf5b16dbf7c9787a6cd597adf0098f056fdb67c..d621633eba85202f4fa3acb192c5f8885a7962e9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -26,6 +26,4 @@
 RUN pip3 install -r requirements.txt
 RUN pip3 install .
 RUN nextflow
-COPY workflows workflows
-
 ENTRYPOINT [ "bash" ]
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 6cc9eda6e614ad65dda8914e3f1993625e0feec8..fba4b22bafd8ceb51ba357c39b60d1beee532406 100644
--- a/Makefile
+++ b/Makefile
@@ -12,4 +12,13 @@ run:
 	docker compose exec app bash workflows/execute_workflows.sh > logs/run_$$(date +"%s").log
 
 stop:
-	CONTAINER_ID=$$(docker ps | grep quiver | cut -d' ' -f1); docker container stop $$CONTAINER_ID && docker container remove $$CONTAINER_ID
\ No newline at end of file
+	CONTAINER_ID=$$(docker ps | grep quiver | cut -d' ' -f1); docker container stop $$CONTAINER_ID && docker container remove $$CONTAINER_ID
+
+clean-workspaces:
+	docker compose exec app rm -rf workflows/workspaces
+
+clean-results:
+	docker compose exec app rm -rf workflows/nf-results workflows/results
+
+clean: clean-workspaces clean-results
+	@echo "Cleaning everything."
\ No newline at end of file
diff --git a/workflows/execute_workflows.sh b/workflows/execute_workflows.sh
index bcbde19ab15936070e9a7a83fedd2e6962f68cee..3ce0d9bf5a8d5f87838c03564eb6c45ceefa7803 100755
--- a/workflows/execute_workflows.sh
+++ b/workflows/execute_workflows.sh
@@ -4,21 +4,17 @@ ROOT=$PWD
 WORKFLOW_DIR="$ROOT"/workflows
 OCRD_WORKFLOW_DIR="$WORKFLOW_DIR"/ocrd_workflows
 WORKSPACE_DIR="$WORKFLOW_DIR"/workspaces
+RESULTS_DIR="$WORKFLOW_DIR"/results
 
 set -euo pipefail
 
 clean_up_dirs() {
-    if [[ -d workflows/workspaces ]]; then
-        rm -rf workflows/workspaces
-    fi
-
     if [[ -d workflows/nf-results ]]; then
         rm -rf workflows/nf-results
     fi
-
-    if [[ -d workflows/results ]]; then
-        rm -rf workflows/results
-    fi
+    mkdir -p "$WORKSPACE_DIR"
+    mkdir -p "$RESULTS_DIR"
+    mkdir workflows/nf-results
 }
 
 convert_ocrd_wfs_to_NextFlow() {
@@ -26,14 +22,9 @@ convert_ocrd_wfs_to_NextFlow() {
 
     echo "Convert OCR-D workflows to NextFlow …"
 
-    mkdir -p "$WORKFLOW_DIR/nf-results"
-
     for FILE in *.txt
     do
         oton convert -I "$FILE" -O "$FILE".nf
-        # the venv part is not needed since we execute this in an image derived from ocrd/all:maximum
-        sed -i 's/source "${params.venv_path}"//g' "$FILE".nf
-        sed -i 's/deactivate//g' "$FILE".nf
     done
 }
 
@@ -41,19 +32,16 @@ download_models() {
     echo "Download the necessary models if not available"
     if [[ ! -f /usr/local/share/tessdata/Fraktur_GT4HistOCR.traineddata ]]
     then
-        #mkdir -p /usr/local/share/ocrd-resources/
         ocrd resmgr download ocrd-tesserocr-recognize '*'
     fi
     if [[ ! -d /usr/local/share/ocrd-resources/ocrd-calamari-recognize/qurator-gt4histocr-1.0 ]]
     then
-        mkdir -p /usr/local/share/ocrd-resources/
         ocrd resmgr download ocrd-calamari-recognize qurator-gt4histocr-1.0
     fi
 }
 
 create_wf_specific_workspaces() {
     # execute this workflow on the existing data (incl. evaluation)
-    mkdir -p "$WORKSPACE_DIR"/tmp
     cd "$WORKSPACE_DIR" || exit
 
     # create workspace for all OCR workflows.
@@ -65,67 +53,54 @@ create_wf_specific_workspaces() {
         for WORKFLOW in "$OCRD_WORKFLOW_DIR"/*ocr.txt.nf
         do
             WF_NAME=$(basename -s .txt.nf "$WORKFLOW")
-            cp -r "$ROOT"/gt/"$DIR_NAME" "$WORKSPACE_DIR"/tmp/"$DIR_NAME"_"$WF_NAME"
-            cp "$WORKFLOW" "$WORKSPACE_DIR"/tmp/"$DIR_NAME"_"$WF_NAME"/data/*/
+            TARGET_DIR="$WORKSPACE_DIR"/"$DIR_NAME"_"$WF_NAME"
+            if [[ ! -d "$TARGET_DIR" ]]; then
+                cp -r "$ROOT"/gt/"$DIR_NAME" "$TARGET_DIR"
+                cp "$WORKFLOW" "$TARGET_DIR"/data/*/
+                cp "$OCRD_WORKFLOW_DIR"/*eval.txt.nf "$TARGET_DIR"/data/*/
+                rm -rf "$WORKSPACE_DIR"/log.log
+            else
+                echo "$TARGET_DIR already exists. Skipping."
+            fi
         done
     fi
 done
 }
 
-clean_up_tmp_dirs() {
-    echo "Clean up intermediate dirs …"
-    for DIR in "$WORKSPACE_DIR"/tmp/*
-    do
-        echo "Cleaning up $DIR."
-        DIR_NAME=$(basename "$DIR")
-        mv "$DIR" "$WORKSPACE_DIR"/"$DIR_NAME"
-        cp "$OCRD_WORKFLOW_DIR"/*eval.txt.nf "$WORKSPACE_DIR"/"$DIR_NAME"/data/*/
-    done
-
-    rm -rf "$WORKSPACE_DIR"/tmp
-    rm -rf "$WORKSPACE_DIR"/log.log
-}
-
 execute_wfs_and_extract_benchmarks() {
-    mkdir -p "$ROOT"/workflows/results
     # for all data sets…
     for WS_DIR in "$WORKSPACE_DIR"/*
     do
-        INNER_DIR=$(ls "$WS_DIR"/data/)
+        DATA_DIR="$WS_DIR"/data
+        DIR_NAME=$(basename "$WS_DIR")
+        INNER_DIR=$(ls "$DATA_DIR"/)
 
-        if [ -d "$WS_DIR" ] && ! grep -q "OCR-D-OCR" "$WS_DIR/data/$INNER_DIR/mets.xml" ; then
-            echo "Switching to $WS_DIR."
+        if ! grep -q "OCR-D-OCR" "$WS_DIR/data/$INNER_DIR/mets.xml" ; then
+            echo "Switching to $WS_DIR."
 
-            DIR_NAME=$(basename "$WS_DIR")
-
-            run "$WS_DIR"/data/*/*ocr.txt.nf "$DIR_NAME" "$WS_DIR"
-            run "$WS_DIR"/data/*/*eval.txt.nf "$DIR_NAME" "$WS_DIR"
+            run "$DATA_DIR"/*/*ocr.txt.nf "$DIR_NAME" "$WS_DIR"
+            run "$DATA_DIR"/*/*eval.txt.nf "$DIR_NAME" "$WS_DIR"
 
             # create a result JSON according to the specs
             echo "Get Benchmark JSON …"
-            quiver benchmarks-extraction "$WS_DIR"/data/* "$WORKFLOW"
+            WORKFLOW=$(basename -s .txt.nf "$DATA_DIR"/*/*ocr.txt.nf)
+            quiver benchmarks-extraction "$DATA_DIR"/* "$WORKFLOW"
             echo "Done."
 
             # move data to results dir
-            mv "$WS_DIR"/data/*/*.json "$WORKFLOW_DIR"/results
+            mv "$DATA_DIR"/*/*result.json "$RESULTS_DIR"
+        else
+            echo "$WS_DIR has already been processed."
         fi
     done
 
     cd "$ROOT" || exit
 }
 
-adjust_workflow_settings() {
-    # $1: $WORKFLOW
-    # $2: $DIR_NAME
-    sed -i "s CURRENT app/workflows/workspaces/$2/data/*/ g" "$1"
-}
-
 rename_and_move_nextflow_result() {
     # rename NextFlow results in order to properly match them to the workflows
     # $1: $WORKFLOW
     # $2: $DIR_NAME
     WORKFLOW_NAME=$(basename -s .txt.nf "$1")
-    rm "$WORKFLOW_DIR"/nf-results/*process_completed.json
-    mv "$WORKFLOW_DIR"/nf-results/*_completed.json "$WORKFLOW_DIR"/results/"$2"_"$WORKFLOW_NAME"_completed.json
     if [ "$WORKFLOW_NAME" != "dinglehopper_eval" ]; then
         for DIR in "$WORKSPACE_DIR"/work/*
         do
@@ -146,20 +121,19 @@ run() {
     # $1: $WORKFLOW
     # $2: $DIR_NAME
     # $3: $WS_DIR
-    adjust_workflow_settings "$1" "$2"
     nextflow run "$1" -with-weblog http://127.0.0.1:8000/nextflow/ --mets_path "/app/workflows/workspaces/$2/data/*/mets.xml"
     rename_and_move_nextflow_result "$1" "$2"
-    save_workspaces "$3"/data "$2" "$1"
+    save_workspaces "$1" "$2" "$3"
 }
 
 save_workspaces() {
-    # $1: $WS_DIR
+    # $1: $WORKFLOW
     # $2: $DIR_NAME
-    # $3: $WORKFLOW
-    echo "Zipping workspace $1"
+    # $3: $WS_DIR
+    echo "Zipping workspace $3"
     ocrd -l ERROR zip bag -d "$DIR_NAME"/data/* -i "$DIR_NAME"/data/* "$DIR_NAME"
-    WORKFLOW_NAME=$(basename -s .txt.nf "$3")
-    mv "$WORKSPACE_DIR"/"$2".zip "$WORKFLOW_DIR"/results/"$2"_"$WORKFLOW_NAME".zip
+    WORKFLOW_NAME=$(basename -s .txt.nf "$1")
+    mv "$WORKSPACE_DIR"/"$2".zip "$RESULTS_DIR"/"$2"_"$WORKFLOW_NAME".zip
 }
 
 summarize_to_data_json() {
@@ -169,24 +143,12 @@
     echo "Done."
 }
 
-
-final_clean_up() {
-    echo "Cleaning up …"
-    rm -rf "$WORKSPACE_DIR"
-    rm -rf "$ROOT"/work
-    rm -rf "$WORKFLOW_DIR"/nf-results
-    rm -rf "$WORKFLOW_DIR"/results
-    rm "$WORKFLOW_DIR"/ocrd_workflows/*.nf
-}
-
 clean_up_dirs
 convert_ocrd_wfs_to_NextFlow
 download_models
 create_wf_specific_workspaces
-clean_up_tmp_dirs
 uvicorn api:app --app-dir "$ROOT"/src & # start webserver for evaluation
-echo "Process is running. See logs at ./logs for more information."
+sleep 2 && >&2 echo "Process is running. See logs at ./logs for more information."
 execute_wfs_and_extract_benchmarks
 summarize_to_data_json
-final_clean_up
 echo "All workflows have been run."
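
Usage sketch (not part of the patch): with the Makefile targets added above, a benchmark cycle can be driven from the host roughly as follows. The `start` target is an assumption here (it is not shown in this diff); `run`, `stop`, `clean-workspaces`, and `clean` are the targets visible above.

	make start             # assumed pre-existing target that brings up the quiver container
	make run               # runs workflows/execute_workflows.sh inside the container, logging to ./logs
	make clean-workspaces  # removes workflows/workspaces between runs
	make clean             # clean-workspaces + clean-results
	make stop              # stops and removes the quiver container

Since clean-up now happens via `docker compose exec`, the clean targets require the container to be running, and workspaces persist between runs so already-processed data sets are skipped by the new guards in execute_workflows.sh.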