#!/bin/bash
#
# Prepare ground-truth (GT) data below ./gt.
#
# Usage: bash <this script> [ra-full]
#
#   ra-full   Clone UB-Mannheim/reichsanzeiger-gt, download its images and
#             turn it into an OCR-D workspace. With any other (or no)
#             argument, /app/scripts/prepare_reichsanzeiger_sets.sh is run
#             instead.
#
# Steps:
#   1. For every repository URL in data_srcs/default_data_sources.txt, fetch
#      the asset of the latest GitHub release into gt/<repo>.zip (skipped if
#      that file already exists).
#   2. Unpack every release zip and the OCR-D zips it contains, and rearrange
#      the extracted files into gt/<work>/data/<work>/.
#   3. Prepare the Reichsanzeiger GT (see "ra-full" above).
#
# Must be started from the directory that contains data_srcs/; the
# Reichsanzeiger part uses hard-coded paths below /app.
# External tools used: curl, jq, unzip, git, ocrd, python3.
#
# The script is deliberately best-effort (no `set -e`): a failure for one
# repository or one inner zip is reported and the remaining ones are still
# processed.

mkdir -p gt

echo "Prepare OCR-D Ground Truth …"
# `|| [[ -n "$URL" ]]` keeps the last line even without a trailing newline.
while IFS= read -r URL || [[ -n "$URL" ]]; do
  # Ignore blank lines in the source list.
  [[ -n "$URL" ]] || continue

  # Fields 4 and 5 of https://github.com/<owner>/<repo>
  OWNER=$(echo "$URL" | cut -d'/' -f4)
  REPO=$(echo "$URL" | cut -d'/' -f5)
  if [[ -z "$OWNER" || -z "$REPO" ]]; then
    echo "Skipping malformed data source URL: $URL" >&2
    continue
  fi

  if [[ ! -f gt/"$REPO".zip ]]; then
    echo "Downloading $REPO …"
    # --fail: an HTTP error yields a non-zero status and no body instead of
    # an error document that would be parsed/saved as if it were valid.
    RESULT=$(curl -L --fail \
      -H "Accept: application/vnd.github+json" \
      -H "X-GitHub-Api-Version: 2022-11-28" \
      "https://api.github.com/repos/$OWNER/$REPO/releases/latest")

    # Take exactly one URL. A release with several assets would otherwise
    # produce a newline-separated list that curl cannot use.
    # NOTE(review): assumes the first asset is the GT zip - confirm for
    # repositories that publish more than one asset.
    ZIP_URL=$(echo "$RESULT" \
      | jq -r '[.assets[]?.browser_download_url] | first // empty')
    if [[ -z "$ZIP_URL" ]]; then
      echo "No release asset found for $OWNER/$REPO; skipping." >&2
      continue
    fi

    # Download to a temporary name first: a broken download must not leave
    # a gt/<repo>.zip behind, because its existence disables the download
    # on every later run.
    if curl -L --fail -o gt/"$REPO".zip.part "$ZIP_URL"; then
      mv gt/"$REPO".zip.part gt/"$REPO".zip
    else
      echo "Download of $ZIP_URL failed; skipping $REPO." >&2
      rm -f gt/"$REPO".zip.part
    fi
  fi
done < data_srcs/default_data_sources.txt

cd gt || exit 1

# the default data is structured like this:
# repository_name.zip
#   |___ subordinate_work_1.zip
#   |___ subordinate_work_2.zip
#   |___ ...
# $ZIP refers to the release itself which is on level "repository_name.zip".
# the subordinate works are also OCR-D BagIts / zips. these are referred to
# by $INNER_ZIP.
for ZIP in *.zip; do
  # Without any zip the glob stays literal ("*.zip"); do not process that.
  [[ -e "$ZIP" ]] || continue

  # Everything before the first dot, e.g. "foo.zip" -> "foo".
  NAME=${ZIP%%.*}
  echo "Processing $NAME"
  if [[ ! -d "$NAME" && "$NAME" != "reichsanzeiger-gt" ]]; then
    if ! unzip -qq -d "$NAME" "$ZIP"; then
      echo "Could not unpack $ZIP; skipping." >&2
      rm -rf -- "${NAME:?}"
      continue
    fi
    # The release content is wrapped in ocrdzip_out/; lift it one level up.
    mv "$NAME"/ocrdzip_out/* "$NAME" && rm -r "$NAME"/ocrdzip_out

    for INNER_ZIP in "$NAME"/*.zip; do
      # Same literal-glob guard as above.
      [[ -e "$INNER_ZIP" ]] || continue

      echo "Dealing with inner zip files …"
      INNER_ZIP_NAME=$(basename "$INNER_ZIP" .ocrd.zip)
      WORK_DIR="$NAME"/"$INNER_ZIP_NAME"
      TARGET_DIR="$WORK_DIR"/data/"$INNER_ZIP_NAME"

      if ! unzip -qq -d "$WORK_DIR" "$INNER_ZIP"; then
        echo "Could not unpack $INNER_ZIP; skipping." >&2
        continue
      fi
      rm "$INNER_ZIP"

      echo "Recreate required directory structure for $INNER_ZIP_NAME."
      mkdir -p "$TARGET_DIR"
      mv "$WORK_DIR"/data/OCR-* "$TARGET_DIR"
      mv "$WORK_DIR"/data/mets.xml "$TARGET_DIR"
      # Every work gets a copy of the repository-level metadata.
      cp "$NAME"/metadata.json "$TARGET_DIR"/metadata.json

      echo "Moving $INNER_ZIP_NAME higher in dir structure."
      mv "$WORK_DIR" .
      echo "Done."
    done

    # All works have been moved out; drop the now obsolete release directory.
    # ${NAME:?} aborts instead of expanding to an empty path.
    rm -rf -- "${NAME:?}"
  fi
done

echo "Prepare Reichsanzeiger GT …"

if [[ "${1:-}" == "ra-full" ]]; then
  echo "Preparing the full Reichsanzeiger GT."
  # Still inside gt/, so the clone ends up in gt/reichsanzeiger-gt.
  if [[ ! -d reichsanzeiger-gt ]]; then
    git clone https://github.com/UB-Mannheim/reichsanzeiger-gt
  fi

  RA_GT=/app/gt/reichsanzeiger-gt
  DATA_DIR="$RA_GT"/data
  cd "$DATA_DIR" || exit 1

  if [[ -d reichsanzeiger-1820-1939/OCR-D-IMG ]]; then
    echo "Skip downloading Reichsanzeiger images."
  else
    bash download_images.sh
  fi

  cd reichsanzeiger-1820-1939 || exit 1
  ocrd workspace init
  mkdir -p OCR-D-IMG
  # ../images only exists right after a download; on a re-run it is gone.
  if [[ -d ../images ]]; then
    cp ../images/* OCR-D-IMG
    rm -rf ../images
  fi
  rm -rf ../reichsanzeiger-1820-1939_with-TableRegion
  cp -r GT-PAGE OCR-D-GT-SEG-LINE

  echo "Adding images to mets …"
  FILEGRP="OCR-D-IMG"
  EXT=".jpg"             # the actual extension of the image files
  MEDIATYPE='image/jpeg' # the actual media type of the image files
  for i in "$FILEGRP"/*"$EXT"; do
    # Skip the literal pattern if the directory holds no images.
    [[ -e "$i" ]] || continue
    BASE=$(basename "$i" "$EXT")
    ocrd workspace add -G "$FILEGRP" -i "${FILEGRP}_${BASE}" \
      -g "P_${BASE}" -m "$MEDIATYPE" "$i"
  done

  python3 /app/scripts/convert-yml-to-json.py --indent 2 \
    "$RA_GT"/METADATA.yml "$RA_GT"/metadata.json
  echo " … and ready to go!"
else
  echo "Prepare smaller sets of Reichsanzeiger GT."
  cd /app || exit 1
  bash /app/scripts/prepare_reichsanzeiger_sets.sh
fi