From 4b556f805a27153fb843260a7a301a9419befdac Mon Sep 17 00:00:00 2001
From: mweidling <13831557+mweidling@users.noreply.github.com>
Date: Mon, 24 Apr 2023 09:52:01 +0200
Subject: [PATCH] Refactor and improve logging

* refactor: remove dead code, improve logging

* build: add targets for convenience

* refactor

* build: add complete cleaning

* build: remove surplus copy (dir is mounted)

* refactor: restore old check, fix bug, extend logging

* refactor: remove cleaning from bash script

* refactor: remove dead code, streamline params

* refactor: remove dead code

* update .gitignore

* refactor: remove dead code

* refactor: mkdirs, fix bug in var assignment
---
 .gitignore                     |   1 +
 Dockerfile                     |   2 -
 Makefile                       |  11 +++-
 workflows/execute_workflows.sh | 100 ++++++++++-----------------------
 4 files changed, 42 insertions(+), 72 deletions(-)

diff --git a/.gitignore b/.gitignore
index 87f881c..6ff3b19 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ work/
 workflows/nf-results/*
 workflows/results
 workflows/ocrd_workflows/*.nf
+workflows/ocrd_workflows/ocrd.log
 models
 .idea
 gt/*
diff --git a/Dockerfile b/Dockerfile
index ebf5b16..d621633 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -26,6 +26,4 @@ RUN pip3 install -r requirements.txt
 RUN pip3 install .
 RUN nextflow
 
-COPY workflows workflows
-
 ENTRYPOINT [ "bash" ]
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 6cc9eda..fba4b22 100644
--- a/Makefile
+++ b/Makefile
@@ -12,4 +12,13 @@ run:
 	docker compose exec app bash workflows/execute_workflows.sh > logs/run_$$(date +"%s").log
 
 stop:
-	CONTAINER_ID=$$(docker ps | grep quiver | cut -d' ' -f1); docker container stop $$CONTAINER_ID && docker container remove $$CONTAINER_ID
\ No newline at end of file
+	CONTAINER_ID=$$(docker ps | grep quiver | cut -d' ' -f1); docker container stop $$CONTAINER_ID && docker container remove $$CONTAINER_ID
+
+clean-workspaces:
+	docker compose exec app rm -rf workflows/workspaces
+
+clean-results:
+	docker compose exec app rm -rf workflows/nf-results workflows/results
+
+clean: clean-workspaces clean-results
+	@echo "Cleaning everything."
\ No newline at end of file
diff --git a/workflows/execute_workflows.sh b/workflows/execute_workflows.sh
index bcbde19..3ce0d9b 100755
--- a/workflows/execute_workflows.sh
+++ b/workflows/execute_workflows.sh
@@ -4,21 +4,17 @@ ROOT=$PWD
 WORKFLOW_DIR="$ROOT"/workflows
 OCRD_WORKFLOW_DIR="$WORKFLOW_DIR"/ocrd_workflows
 WORKSPACE_DIR="$WORKFLOW_DIR"/workspaces
+RESULTS_DIR="$WORKFLOW_DIR"/results
 
 set -euo pipefail
 
 clean_up_dirs() {
-    if [[ -d  workflows/workspaces ]]; then
-    rm -rf workflows/workspaces
-    fi
-
     if [[ -d  workflows/nf-results ]]; then
         rm -rf workflows/nf-results
     fi
-
-    if [[ -d  workflows/results ]]; then
-        rm -rf workflows/results
-    fi
+    mkdir -p "$WORKSPACE_DIR"
+    mkdir -p "$RESULTS_DIR"
+    mkdir workflows/nf-results
 }
 
 convert_ocrd_wfs_to_NextFlow() {
@@ -26,14 +22,9 @@ convert_ocrd_wfs_to_NextFlow() {
 
     echo "Convert OCR-D workflows to NextFlow …"
 
-    mkdir -p "$WORKFLOW_DIR/nf-results"
-
     for FILE in *.txt
     do
         oton convert -I "$FILE" -O "$FILE".nf
-        # the venv part is not needed since we execute this in an image derived from ocrd/all:maximum
-        sed -i 's/source "${params.venv_path}"//g' "$FILE".nf
-        sed -i 's/deactivate//g' "$FILE".nf
     done
 }
 
@@ -41,19 +32,16 @@ download_models() {
     echo "Download the necessary models if not available"
     if [[ ! -f /usr/local/share/tessdata/Fraktur_GT4HistOCR.traineddata ]]
     then
-        #mkdir -p /usr/local/share/ocrd-resources/
         ocrd resmgr download ocrd-tesserocr-recognize '*'
     fi
     if [[ ! -d /usr/local/share/ocrd-resources/ocrd-calamari-recognize/qurator-gt4histocr-1.0 ]]
     then
-        mkdir -p /usr/local/share/ocrd-resources/
         ocrd resmgr download ocrd-calamari-recognize qurator-gt4histocr-1.0
     fi
 }
 
 create_wf_specific_workspaces() {
     # execute this workflow on the existing data (incl. evaluation)
-    mkdir -p "$WORKSPACE_DIR"/tmp
     cd "$WORKSPACE_DIR" || exit
 
     # create workspace for all OCR workflows.
@@ -65,67 +53,54 @@ create_wf_specific_workspaces() {
             for WORKFLOW in "$OCRD_WORKFLOW_DIR"/*ocr.txt.nf
             do
                 WF_NAME=$(basename -s .txt.nf "$WORKFLOW")
-                cp -r "$ROOT"/gt/"$DIR_NAME" "$WORKSPACE_DIR"/tmp/"$DIR_NAME"_"$WF_NAME"
-                cp "$WORKFLOW" "$WORKSPACE_DIR"/tmp/"$DIR_NAME"_"$WF_NAME"/data/*/
+                TARGET_DIR="$WORKSPACE_DIR"/"$DIR_NAME"_"$WF_NAME"
+                if [[ ! -d "$TARGET_DIR" ]]; then
+                    cp -r "$ROOT"/gt/"$DIR_NAME" "$TARGET_DIR"
+                    cp "$WORKFLOW" "$TARGET_DIR"/data/*/
+                    cp "$OCRD_WORKFLOW_DIR"/*eval.txt.nf "$TARGET_DIR"/data/*/
+                    rm -rf "$WORKSPACE_DIR"/log.log
+                else
+                    echo "$TARGET_DIR already exists. Skipping."
+                fi
             done
         fi
     done
 }
 
-clean_up_tmp_dirs() {
-    echo "Clean up intermediate dirs …"
-    for DIR in "$WORKSPACE_DIR"/tmp/*
-    do
-        echo "Cleaning up $DIR."
-        DIR_NAME=$(basename "$DIR")
-        mv "$DIR" "$WORKSPACE_DIR"/"$DIR_NAME"
-        cp "$OCRD_WORKFLOW_DIR"/*eval.txt.nf "$WORKSPACE_DIR"/"$DIR_NAME"/data/*/
-    done
-
-    rm -rf "$WORKSPACE_DIR"/tmp
-    rm -rf "$WORKSPACE_DIR"/log.log
-}
-
 execute_wfs_and_extract_benchmarks() {
-    mkdir -p "$ROOT"/workflows/results
     # for all data sets…
     for WS_DIR in "$WORKSPACE_DIR"/*
     do
-        INNER_DIR=$(ls "$WS_DIR"/data/)
+        DATA_DIR="$WS_DIR"/data
+        DIR_NAME=$(basename "$WS_DIR")
+        INNER_DIR=$(ls "$DATA_DIR"/)
 
-        if [ -d "$WS_DIR" ] &&  ! grep -q "OCR-D-OCR" "$WS_DIR/data/$INNER_DIR/mets.xml" ; then
-            echo "Switching to $WS_DIR."
+        if ! grep -q "OCR-D-OCR" "$WS_DIR/data/$INNER_DIR/mets.xml" ; then
+            echo "Switching to $WS_DIR."            
 
-            DIR_NAME=$(basename "$WS_DIR")
-
-            run "$WS_DIR"/data/*/*ocr.txt.nf "$DIR_NAME" "$WS_DIR"
-            run "$WS_DIR"/data/*/*eval.txt.nf "$DIR_NAME" "$WS_DIR"
+            run "$DATA_DIR"/*/*ocr.txt.nf "$DIR_NAME" "$WS_DIR"
+            run "$DATA_DIR"/*/*eval.txt.nf "$DIR_NAME" "$WS_DIR"
 
             # create a result JSON according to the specs          
             echo "Get Benchmark JSON …"
-            quiver benchmarks-extraction "$WS_DIR"/data/* "$WORKFLOW"
+            WORKFLOW=$(basename -s .txt.nf "$DATA_DIR"/*/*ocr.txt.nf)
+            quiver benchmarks-extraction "$DATA_DIR"/* "$WORKFLOW"
             echo "Done."
 
             # move data to results dir
-            mv "$WS_DIR"/data/*/*.json "$WORKFLOW_DIR"/results
+            mv "$DATA_DIR"/*/*result.json "$RESULTS_DIR"
+        else
+            echo "$WS_DIR has already been processed."
         fi
     done
     cd "$ROOT" || exit
 }
 
-adjust_workflow_settings() {
-    # $1: $WORKFLOW
-    # $2: $DIR_NAME
-    sed -i "s CURRENT app/workflows/workspaces/$2/data/*/ g" "$1"
-}
-
 rename_and_move_nextflow_result() {
     # rename NextFlow results in order to properly match them to the workflows
     # $1: $WORKFLOW
     # $2: $DIR_NAME
     WORKFLOW_NAME=$(basename -s .txt.nf "$1")
-    rm "$WORKFLOW_DIR"/nf-results/*process_completed.json
-    mv "$WORKFLOW_DIR"/nf-results/*_completed.json "$WORKFLOW_DIR"/results/"$2"_"$WORKFLOW_NAME"_completed.json
     if [ "$WORKFLOW_NAME" != "dinglehopper_eval" ]; then
         for DIR in "$WORKSPACE_DIR"/work/*
         do
@@ -146,20 +121,19 @@ run() {
     # $1: $WORKFLOW
     # $2: $DIR_NAME
     # $3: $WS_DIR
-    adjust_workflow_settings "$1" "$2"
     nextflow run "$1" -with-weblog http://127.0.0.1:8000/nextflow/ --mets_path "/app/workflows/workspaces/$2/data/*/mets.xml"
     rename_and_move_nextflow_result "$1" "$2"
     save_workspaces "$3"/data "$2" "$1"
 }
 
 save_workspaces() {
-    # $1: $WS_DIR
+    # $1: $WORKFLOW
     # $2: $DIR_NAME
-    # $3: $WORKFLOW
-    echo "Zipping workspace $1"
+    # $3: $WS_DIR
+    echo "Zipping workspace $3"
     ocrd -l ERROR zip bag -d "$DIR_NAME"/data/* -i "$DIR_NAME"/data/* "$DIR_NAME"
-    WORKFLOW_NAME=$(basename -s .txt.nf "$3")
-    mv "$WORKSPACE_DIR"/"$2".zip "$WORKFLOW_DIR"/results/"$2"_"$WORKFLOW_NAME".zip
+    WORKFLOW_NAME=$(basename -s .txt.nf "$1")
+    mv "$WORKSPACE_DIR"/"$2".zip "$RESULTS_DIR"/"$2"_"$WORKFLOW_NAME".zip
 }
 
 summarize_to_data_json() {
@@ -169,24 +143,12 @@ summarize_to_data_json() {
     echo "Done."
 }
 
-
-final_clean_up() {
-    echo "Cleaning up …"
-    rm -rf "$WORKSPACE_DIR"
-    rm -rf "$ROOT"/work
-    rm -rf "$WORKFLOW_DIR"/nf-results
-    rm -rf "$WORKFLOW_DIR"/results
-    rm "$WORKFLOW_DIR"/ocrd_workflows/*.nf
-}
-
 clean_up_dirs
 convert_ocrd_wfs_to_NextFlow
 download_models
 create_wf_specific_workspaces
-clean_up_tmp_dirs
 uvicorn api:app --app-dir "$ROOT"/src & # start webserver for evaluation
-echo "Process is running. See logs at ./logs for more information."
+sleep 2 && >&2 echo "Process is running. See logs at ./logs for more information."
 execute_wfs_and_extract_benchmarks
 summarize_to_data_json
-final_clean_up
 echo "All workflows have been run."
-- 
GitLab