-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathocr_images_old.sh
executable file
·218 lines (201 loc) · 6.37 KB
/
ocr_images_old.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#!/bin/bash
#
# OCR images of text and combine result into PDF
#
# 1. If they're not 1-bit, make sure they're png, otherwise convert
# 2. If size is too small for tesseract, enlarge
# 3. Create OCR and output to PDF with tesseract
# For enlarged files, create blank version with text
# to merge with original-sized images
# 4. Combine into final PDF
# For 1-bit images:
# output and then convert to final tiff in this way:
# convert imagefile -alpha off -monochrome -compress Group4 -quality 100 output.tiff
# Estimate a resolution (dpi) at which an image fits on 8.5 x 11 in paper.
#
# Lengths are handled in tenths of an inch so that integer arithmetic is
# enough: the first guess maps the longer side onto 11.5 in (115 tenths),
# then the dpi is raised in steps of 10 until the image height fits the
# page (8.5 in when the image is wider than tall, 11 in otherwise).
# Not quite right. Need to check ratio of sides to determine which to use.
#
# Arguments: $1 - path of the image, passed to "magick identify"
# Outputs:   the dpi estimate on stdout
# Returns:   0 on success, 1 if the image dimensions cannot be read
function get_dpi {
  local size width height dpi h maxheight

  # "%w x %h" prints e.g. "2550 x 3300"
  if ! size=$(magick identify -format "%w x %h" "$1") || [[ -z $size ]]; then
    printf 'get_dpi: cannot read the dimensions of %s\n' "$1" >&2
    return 1
  fi
  width=${size%% *}
  height=${size##* }
  if ! [[ $width =~ ^[0-9]+$ && $height =~ ^[0-9]+$ ]]; then
    printf 'get_dpi: unexpected dimensions "%s" for %s\n' "$size" "$1" >&2
    return 1
  fi
  # Force base 10 so that a value with leading zeros is not read as octal
  width=$(( 10#$width ))
  height=$(( 10#$height ))

  if (( height < width )); then
    # landscape orientation
    dpi=$(( width * 10 / (110 + 5) ))
    maxheight=85
  else
    # portrait orientation
    dpi=$(( height * 10 / (110 + 5) ))
    maxheight=110
  fi

  # Very small images give a first guess of 0, which would divide by zero below
  (( dpi > 0 )) || dpi=1

  h=$(( 10 * height / dpi ))
  while (( h > maxheight )); do
    dpi=$(( dpi + 10 ))
    h=$(( 10 * height / dpi ))
  done
  echo "$dpi"
}
# Work out how much an image must be enlarged for optimal tesseract OCR.
# Recommended at least 300 dpi, so the result is the smallest whole-number
# multiple of the image dpi that reaches 300.
#
# Arguments: $1 - dpi of the image (positive integer)
# Outputs:   a resize percentage on stdout, e.g. "100%" for 300 dpi or more,
#            "200%" for 150 dpi
# Returns:   0 on success, 1 if $1 is not a positive integer
function get_enlargement {
  local imgDPI=$1
  local -r minDPI=300
  local enlarge=1

  # A dpi of 0 (or garbage) could never reach minDPI and would loop forever
  if ! [[ $imgDPI =~ ^[0-9]+$ ]]; then
    printf 'get_enlargement: invalid dpi "%s"\n' "$imgDPI" >&2
    return 1
  fi
  # Force base 10 so that a value with leading zeros is not read as octal
  imgDPI=$(( 10#$imgDPI ))
  if (( imgDPI <= 0 )); then
    printf 'get_enlargement: dpi must be greater than 0\n' >&2
    return 1
  fi

  while (( imgDPI * enlarge < minDPI )); do
    enlarge=$(( enlarge + 1 ))
  done
  # The factor followed by "00%" turns 2 into "200%"
  echo "${enlarge}00%"
}
# Set some variables
png=true
lang=''
# Random 5-digit, zero-padded suffix shared by the default output name and
# the work directory. printf pads reliably; slicing the last 5 characters of
# a "00"-prefixed string comes out empty when $RANDOM is below 100.
printf -v number '%05d' "$RANDOM"
finalname="final_${number}.pdf"
# Name of this script as invoked, shown in the usage text
PROGNAME=$(basename -- "$0")
# Read flags
# With no arguments at all, behave as if -h had been given
(( $# )) || set -- "-h"

# Consume leading options; the first word that is not a known option ends
# the loop and is left in place as the input file name.
until (( $# == 0 )); do
  case "$1" in
    -h | --help)
      printf '%s\n' \
        "$PROGNAME [options] input_file [output_file]" \
        '' \
        "options:" \
        "-h, --help show brief help" \
        "-l <language> specify a tesseract 3-letter language code" \
        '' \
        "This script takes an input file and uses tesseract to perform OCR on it," \
        "and then creates an output PDF in the same directory." \
        '' \
        "All files in the directory with the same starting letter and extension" \
        "as the input file will be processed." \
        '' \
        "If an output file is provided, it will be used with the extension \"pdf\" appended to it, if needed." \
        "If no output file is provided \"final_xxxxx.pdf\" will be used, where xxxxx are random numbers." \
        "Any existing file with the same name will be overwritten silently."
      exit 0
      ;;
    -l)
      # The word after -l is the language code; drop both words.
      # Two single shifts (not "shift 2") so that a trailing -l is still consumed.
      lang=$2
      shift
      shift
      ;;
    *)
      break
      ;;
  esac
done
# Remaining input should be the input file name with optional output filename
input="$1"
# An output name was supplied: use it, appending ".pdf" unless it already
# ends in that extension. The count is compared numerically.
if (( $# > 1 )); then
  finalname="$2"
  finalextension="${finalname##*.}"
  if [[ $finalextension != 'pdf' ]]; then
    finalname="${finalname}.pdf"
  fi
fi
# Make sure file exists (-s also rejects an empty file).
# This is an error, so report it on stderr and exit with a failure status.
if [[ ! -s "$1" ]]; then
  {
    echo ''
    printf '\aFile %s does not appear to exist! Exiting...\n' "$1"
    echo ''
  } >&2
  exit 1
fi
# Get absolute path & file info
# filename   - last path component of the first argument
# input      - absolute directory of the first argument joined with filename
# origin_dir - directory part of that absolute path
# firstchar  - first character of filename
# extension  - text after the last "." of the absolute path
filename="$(basename "$1")"
input="$(cd "$(dirname "$1")"; pwd)/${filename}"
origin_dir="$(dirname "$input")"
firstchar="${filename:0:1}"
extension="${input##*.}"
# Create work directories in the system temp folder
# Fall back to /tmp when TMPDIR is unset, and tolerate TMPDIR with or
# without a trailing slash.
tempdir=${TMPDIR:-/tmp}
workdir="${tempdir%/}/${number}_tesseract"
finaldir=$workdir
# Sub-directories: big/ for enlarged images, text/ for the text-only PDFs,
# final/ for the merged per-page PDFs
if ! mkdir -p -- "$workdir/big" "$workdir/final" "$workdir/text"; then
  printf 'Could not create work directory %s\n' "$workdir" >&2
  exit 1
fi
# Check for language. Easy to forget to specify.
# Needs some error checking
# When no -l option was given, ask for a code; an empty answer selects English.
if [[ -z $lang ]]; then
  read -p 'No language was specified. Hit enter to use English or supply the 3-letter language code: ' langInput
  lang=${langInput:-eng}
fi
# OCR and save to PDF
# Convert to correctly sized png which tesseract leaves alone when making PDF
# Testing: don't understand why I'm converting to png here
# for i in "$origin_dir"/"$firstchar"*."$extension"
# do
# dpi=`get_dpi "$i"`
# output=`basename "$i"`
# # output=${output%.*}
# # cp "$i" "$workdir/$output"
# # magick "$i" -units PixelsPerInch -density $dpi +repage -define png:compression-filter=1 -define png:compression-level=3 -define png:compression-strategy=0 "$workdir/$output.png"
# done
# Enlarge if the dpi is too small for a good tesseract reading
# A bit arbitrary to use 300 dpi as minimum. Should estimate better.
# Make a PDF of original and combine with no-image PDF from tesseract.
#for i in "$workdir/"*".png"
# Creating pdf with magick doesn't affect image size, so do that always,
# then tesseract the appropriate image and merge
finaldir="$workdir/final"
# Process every file in the input's directory that shares its first
# character and extension. For each one: build an image PDF at the estimated
# dpi, build a text-only PDF with tesseract, then merge the two.
for i in "$origin_dir"/"$firstchar"*."$extension"; do
  # An unmatched glob leaves the literal pattern in $i; skip it
  [[ -e $i ]] || continue

  # Spit out a little output for user feedback: the part of the file name
  # after the last "-" and before the first "."
  filename=$(basename "$i")
  label=${filename##*-}
  printf '%s\n' "${label%%.*}"

  # Ideally files have the same size and dpi, but in case they don't, do this for each
  dpi=$(get_dpi "$i")
  if [[ -z $dpi ]]; then
    printf 'Skipping %s: could not determine its dpi\n' "$filename" >&2
    continue
  fi
  enlarge=$(get_enlargement "$dpi")

  # Create PDF at appropriate size
  if ! magick "$i" -units pixelsperinch -density "$dpi" "$workdir/$filename.pdf"; then
    printf 'Skipping %s: could not create the image PDF\n' "$filename" >&2
    continue
  fi

  ocr_ok=true
  if [[ $dpi -lt 300 ]]; then
    # Create enlarged file for tesseract, then a text-only PDF from it
    magick "$i" -resize "$enlarge" "$workdir/big/$filename" \
      && tesseract -l "$lang" -c textonly_pdf=1 "$workdir/big/$filename" "$workdir/text/$filename" pdf \
      || ocr_ok=false
  else
    # Just create text-only PDF with tesseract
    tesseract "$i" "$workdir/text/$filename" -l "$lang" -c textonly_pdf=1 pdf \
      || ocr_ok=false
    # v 4 command: tesseract "$i" "$workdir/text/$filename" --dpi $dpi -l $lang -c textonly_pdf=1 pdf
  fi
  if [[ $ocr_ok != true ]]; then
    printf 'Skipping %s: OCR failed\n' "$filename" >&2
    continue
  fi

  # Combine two PDF files for final version
  if ! pdftk "$workdir/$filename.pdf" multibackground "$workdir/text/$filename.pdf" output "$finaldir/$filename.pdf"; then
    printf 'Could not merge image and text PDFs for %s\n' "$filename" >&2
  fi
done
# Sleeping to avoid some unexpected terminations
sleep 5
# Join the per-page PDFs into the final document next to the input file,
# and only announce success when that actually worked.
if pdfunite "$finaldir/"*.pdf "$origin_dir/$finalname"; then
  # terminal-notifier may not be installed; fall back to a plain message
  if command -v terminal-notifier >/dev/null 2>&1; then
    terminal-notifier -message "Your OCR is complete." -title "Yay!" -sound default
  else
    echo "Your OCR is complete."
  fi
else
  printf 'Could not combine the pages into %s\n' "$origin_dir/$finalname" >&2
  exit 1
fi