Newer
Older
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# Download the release archive of every OCR-D ground truth repository listed
# in data_srcs/default_data_sources.txt (one GitHub repository URL per line)
# and store it as gt/<repository>.zip. Archives that already exist are kept
# and not downloaded again.
echo "Prepare OCR-D Ground Truth …"
# The archives are written to gt/, so make sure the directory exists.
mkdir -p gt
# "|| [[ -n $URL ]]" also processes a last line without trailing newline.
while IFS= read -r URL || [[ -n $URL ]]; do
  # Tolerate Windows line endings and skip blank lines.
  URL=${URL%$'\r'}
  [[ -n $URL ]] || continue
  # Splitting https://github.com/<owner>/<repo> on "/" yields the owner as
  # fourth and the repository as fifth field.
  IFS=/ read -r URL_SCHEME URL_EMPTY URL_HOST OWNER REPO URL_REST <<<"$URL"
  if [[ -z $OWNER || -z $REPO ]]; then
    echo "Skipping malformed data source URL: $URL" >&2
    continue
  fi
  if [[ ! -f gt/"$REPO".zip ]]; then
    echo "Downloading $REPO …"
    # -f turns HTTP errors (rate limit, unknown repository, no release) into
    # a non-zero exit status instead of handing an error document to jq.
    if ! RESULT=$(curl -fsSL \
      -H "Accept: application/vnd.github+json" \
      -H "X-GitHub-Api-Version: 2022-11-28" \
      "https://api.github.com/repos/$OWNER/$REPO/releases/latest"); then
      echo "Could not query the latest release of $OWNER/$REPO." >&2
      continue
    fi
    # Use the first release asset only; "// empty" yields an empty string
    # instead of "null" when the release has no assets.
    ZIP_URL=$(jq -r '.assets[0].browser_download_url // empty' <<<"$RESULT")
    if [[ -z $ZIP_URL ]]; then
      echo "Latest release of $OWNER/$REPO has no downloadable asset." >&2
      continue
    fi
    # Download to a temporary name first so that an aborted transfer never
    # leaves a truncated gt/<repo>.zip that would be skipped on the next run.
    if curl -fL -o gt/"$REPO".zip.part "$ZIP_URL"; then
      mv gt/"$REPO".zip.part gt/"$REPO".zip
    else
      echo "Download of $ZIP_URL failed." >&2
      rm -f gt/"$REPO".zip.part
    fi
  fi
done < data_srcs/default_data_sources.txt
# Unpack every downloaded release archive in gt/ and move the subordinate
# works it contains to the top level of gt/.
cd gt || { echo "Cannot change into directory gt." >&2; exit 1; }
# The default data is structured like this:
# repository_name.zip
# |___ subordinate_work_1.zip
# |___ subordinate_work_2.zip
# |___ ...
# $ZIP refers to the release itself, i.e. the level "repository_name.zip".
# The subordinate works are OCR-D BagIts / zips as well; these are referred
# to by $INNER_ZIP.
for ZIP in *.zip; do
  # Without any archive the glob stays the literal "*.zip"; skip it.
  [[ -e $ZIP ]] || continue
  # Strip only the ".zip" suffix so repository names containing dots survive.
  NAME=${ZIP%.zip}
  echo "Processing $NAME"
  if [[ ! -d $NAME && $NAME != "reichsanzeiger-gt" ]]; then
    # unzip exit status 1 only signals warnings; anything above is an error.
    STATUS=0
    unzip -qq -d "$NAME" "$ZIP" || STATUS=$?
    if ((STATUS > 1)); then
      echo "Could not unpack $ZIP (unzip exit status $STATUS)." >&2
      rm -rf -- "${NAME:?}"
      continue
    fi
    # Lift the content of the ocrdzip_out wrapper directory one level up.
    if [[ -d $NAME/ocrdzip_out ]]; then
      mv "$NAME"/ocrdzip_out/* "$NAME" && rm -r "$NAME"/ocrdzip_out
    fi
    for INNER_ZIP in "$NAME"/*.zip; do
      # Skip the literal pattern when the release contains no inner zips.
      [[ -e $INNER_ZIP ]] || continue
      echo "Dealing with inner zip files …"
      # Remove ".zip" and an optional ".ocrd" so the target directory never
      # carries the same name as the archive it is unpacked from.
      INNER_ZIP_NAME=$(basename "$INNER_ZIP" .zip)
      INNER_ZIP_NAME=${INNER_ZIP_NAME%.ocrd}
      # A work prepared by an earlier run cannot be replaced by the final mv;
      # skip it instead of unpacking it again for nothing.
      if [[ -e $INNER_ZIP_NAME ]]; then
        echo "$INNER_ZIP_NAME already exists, skipping." >&2
        continue
      fi
      WORK_DIR=$NAME/$INNER_ZIP_NAME
      STATUS=0
      unzip -qq -d "$WORK_DIR" "$INNER_ZIP" || STATUS=$?
      if ((STATUS > 1)); then
        echo "Could not unpack $INNER_ZIP (unzip exit status $STATUS)." >&2
        continue
      fi
      rm -f -- "$INNER_ZIP"
      echo "Recreate required directory structure for $INNER_ZIP_NAME."
      # Target layout: <work>/data/<work>/{OCR-*,mets.xml,metadata.json}
      mkdir -p "$WORK_DIR"/data/"$INNER_ZIP_NAME"
      mv "$WORK_DIR"/data/OCR-* "$WORK_DIR"/data/"$INNER_ZIP_NAME"
      mv "$WORK_DIR"/data/mets.xml "$WORK_DIR"/data/"$INNER_ZIP_NAME"
      # Every work gets a copy of the metadata shipped with the release.
      cp "$NAME"/metadata.json "$WORK_DIR"/data/"$INNER_ZIP_NAME"/metadata.json
      echo "Moving $INNER_ZIP_NAME higher in dir structure."
      mv "$WORK_DIR" .
      echo "Done."
    done
    # The unpacked release directory is only a staging area.
    rm -rf -- "${NAME:?}"
  fi
done
# Prepare the Reichsanzeiger ground truth. With "ra-full" as first script
# argument the complete data set is cloned and turned into an OCR-D
# workspace; otherwise the work is delegated to
# /app/scripts/prepare_reichsanzeiger_sets.sh.
echo "Prepare Reichsanzeiger GT …"
if [[ ${1:-} == "ra-full" ]]; then
  echo "Preparing the full Reichsanzeiger GT."
  RA_GT=/app/gt/reichsanzeiger-gt
  DATA_DIR=$RA_GT/data
  # Check and clone with the same absolute path that is used afterwards.
  if [[ ! -d $RA_GT ]]; then
    git clone https://github.com/UB-Mannheim/reichsanzeiger-gt "$RA_GT"
  fi
  cd "$DATA_DIR" || exit
  if [[ -d reichsanzeiger-1820-1939/OCR-D-IMG ]]; then
    echo "Skip downloading Reichsanzeiger images."
  else
    bash download_images.sh \
      || echo "download_images.sh reported an error." >&2
  fi
  cd reichsanzeiger-1820-1939 || exit
  # NOTE(review): assumes "ocrd workspace init" writes mets.xml into the
  # current directory, so an existing file means the workspace is set up.
  if [[ ! -f mets.xml ]]; then
    ocrd workspace init
  fi
  mkdir -p OCR-D-IMG
  # ../images only exists after a fresh download; delete it only once the
  # images have been copied successfully.
  if [[ -d ../images ]]; then
    cp ../images/* OCR-D-IMG && rm -rf ../images
  fi
  rm -rf ../reichsanzeiger-1820-1939_with-TableRegion
  # On a repeated run "cp -r" would nest GT-PAGE inside the existing
  # directory, so only copy when the target is missing.
  if [[ ! -d OCR-D-GT-SEG-LINE ]]; then
    cp -r GT-PAGE OCR-D-GT-SEG-LINE
  fi
  echo "Adding images to mets …"
  FILEGRP="OCR-D-IMG"
  EXT=".jpg"             # the actual extension of the image files
  MEDIATYPE='image/jpeg' # the actual media type of the image files
  for i in "$FILEGRP"/*"$EXT"; do
    # Skip the literal pattern when the directory contains no image.
    [[ -e $i ]] || continue
    BASE=$(basename "$i" "$EXT")
    ocrd workspace add -G "$FILEGRP" -i "${FILEGRP}_${BASE}" \
      -g "P_${BASE}" -m "$MEDIATYPE" "$i"
  done
  python3 /app/scripts/convert-yml-to-json.py --indent 2 \
    "$RA_GT"/METADATA.yml "$RA_GT"/metadata.json
  echo " … and ready to go!"
else
  echo "Prepare smaller sets of Reichsanzeiger GT."
  cd /app || exit
  bash /app/scripts/prepare_reichsanzeiger_sets.sh
fi