-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathscribd_download.sh
executable file
·433 lines (371 loc) · 12 KB
/
scribd_download.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
#!/bin/bash
# This script created by Tobias Bora is under GPLv3 Licence
# This script downloads a document from scribd.com and converts it into pdf
# ImageMagick, Phantomjs and pdftk must be installed
# Doc : https://github.com/ariya/phantomjs/wiki/API-Reference-WebPage#wiki-webpage-viewportSize
# Working examples :
# http://fr.scribd.com/doc/63942746/chopin-nocturne-n-20-partition
# http://fr.scribd.com/doc/48491291/partition
# If you don't want to install phantomjs/imagemagick,
# you can just put the phantomjs and convert executables
# in the current directory

# Exit on any unhandled error. Set here rather than in the shebang:
# options on the shebang line are silently lost when the script is
# invoked as 'bash scribd_download.sh'.
set -e

# Backend used to merge the per-page PDFs ("pdftk" or "convert").
pdf_convert_mode="pdftk"
# Uncomment this if you prefer to use convert
# instead of pdftk for the conversion (you can have memory issues)
# pdf_convert_mode="convert"
# No URL supplied: print the usage text and stop.
if [ -z "$1" ]
then
  cat <<'USAGE'
scribd_download.sh <url>
or if you want to change the number of pages :
scribd_download.sh <url> <number of pages>
or if you want to specify the width/height manually :
scribd_download.sh <url> <number of pages> <width> <height>
If you don't want to specify the number of pages :
scribd_download.sh <url> 0 <width> <height>
USAGE
  exit 1
fi
# If convert isn't installed system-wide, fall back to a 'convert'
# binary placed in the current directory.
# ('command -v' replaces the non-portable 'which'; the original
# '$(dirname $(readlink -f .))' wrongly looked in the PARENT directory,
# contradicting the "current dir" messages below.)
if [ -z "$(command -v convert)" ]
then
    file="$(readlink -f .)/convert"
    # Even in the current dir
    if [ ! -f "$file" ]
    then
        echo "You must install 'convert' from the package imagemagick."
        echo "On ubuntu run :"
        echo "sudo apt-get install imagemagick"
        exit 1
    else
        echo "The convert command has been found in the current dir."
        echo "I'll use it."
        exec_convert="$file"
    fi
else
    exec_convert="convert"
fi
# If phantomjs isn't installed system-wide, fall back to a 'phantomjs'
# binary placed in the current directory.
# ('command -v' replaces the non-portable 'which'; the original
# '$(dirname $(readlink -f .))' wrongly looked in the PARENT directory,
# contradicting the "current dir" messages below.)
if [ -z "$(command -v phantomjs)" ]
then
    file="$(readlink -f .)/phantomjs"
    # Even in the current dir
    if [ ! -f "$file" ]
    then
        echo "You must install phantomjs."
        echo "On ubuntu run :"
        echo "sudo apt-get install phantomjs"
        exit 1
    else
        echo "The phantomjs command has been found in the current dir."
        echo "I'll use it."
        exec_phantomjs="$file"
    fi
else
    exec_phantomjs="phantomjs"
fi
# If pdftk isn't installed (only needed when pdf_convert_mode != "convert"),
# fall back to a 'pdftk' binary placed in the current directory.
# ('command -v' replaces the non-portable 'which'; the original
# '$(dirname $(readlink -f .))' wrongly looked in the PARENT directory,
# contradicting the "current dir" messages below.)
if [ ! "$pdf_convert_mode" = "convert" ] && [ -z "$(command -v pdftk)" ]
then
    file="$(readlink -f .)/pdftk"
    # Even in the current dir
    if [ ! -f "$file" ]
    then
        echo "You must install pdftk."
        echo "On ubuntu run :"
        echo "sudo apt-get install pdftk"
        echo ""
        echo "(Or modify the script conf if you don't want it)"
        exit 1
    else
        echo "The pdftk command has been found in the current dir."
        echo "I'll use it."
        exec_pdftk="$file"
    fi
else
    exec_pdftk="pdftk"
fi
url="$1"
# Pages are rendered at this zoom factor for a sharper final PDF;
# all pixel sizes below are multiplied by it.
zoom_precision=2
# All intermediate files live in a scratch directory, wiped on each run.
rm -rf .tmp
mkdir .tmp
cd .tmp
# Get the number of pages
echo "Getting informations..."
echo "(It can be quite long, and don't worry if"
echo "you see some errors during the conversion)"
echo -n " Number of pages... "
# PhantomJS script: load the document URL and dump the resulting DOM
# to stdout. NOTE(review): onError is assigned after page.open() is
# *called* but before its async callback fires, so load-time errors
# should still be silenced — confirm against PhantomJS semantics.
echo "var page = require('webpage').create();
url = \"$url\"
page.open(url, function () {
console.log(page.content);
phantom.exit();
});
// Avoid error messages
page.onError = function(msg, trace) {
};
" > phantom_nb_pages.js
# Update of Scribd
# --load-images=no speeds up the fetch: only the HTML is needed here.
$exec_phantomjs --load-images=no phantom_nb_pages.js > page.html
# Count the 'outer_page' containers in the fetched HTML: one per page.
# grep -c replaces the useless 'cat | grep | wc -l'; '|| true' keeps the
# count-is-zero exit status from killing the script under 'set -e'.
nb_pages="$(grep -c 'document.getElementById(\"outer_page' page.html || true)"
if [ -z "$2" ] || [ "$2" = "0" ]
then
    # grep -c prints "0" (not an empty string) when nothing matches, so the
    # original '[ -z "$nb_pages" ]' alone could never trigger the manual
    # prompt; treat both "empty" and "0" as a failed detection.
    if [ -z "$nb_pages" ] || [ "$nb_pages" = "0" ]
    then
        echo "I can't find the number of pages... Please, how many pages are there in the file ?"
        read nb_pages
    fi
else
    # The user forced a page count on the command line.
    nb_pages="$2"
fi
echo "$nb_pages"
# Build a filesystem-safe document title from the <title> tag:
# spaces become underscores, then keep only alnum, '.', '_' and '-'.
page_name="$(grep -E -o "<title>.*</title>" page.html | sed -E 's/<title>(.*)<\/title>/\1/' | sed -e 's/ /_/g' | tr -cd '[[:alnum:]]._-')"
echo " Title... $page_name"
echo "Done."
# We remove useless parts in files
echo "Removing useless parts..."
# We make a new line for each html element.
# The awk-based node-removal helpers below work line by line, so every
# tag must start on its own line.
sed -i -e "s/</\\n</g" page.html
# NOTE(review): this pattern also swallows the first character after
# each '>' (the matched non-newline char is not re-emitted). It appears
# tolerated because only tag names/attributes matter downstream — confirm.
sed -i -e "s/>[^\\n]/>\\n/g" page.html
# sed -i -e "s/>/>\\n/g" page.html
# exit 0
# Delete the first HTML node whose opening line matches a pattern,
# together with everything up to its matching closing </div>.
#   $1 - regex matched against each line (the node opener)
#   $2 - HTML file, rewritten in place (via a scratch file 'tmp')
remove_node() {
  local opener_re="$1"
  local html_file="$2"
  # Depth counter i: 0 outside the node, >0 inside it. Lines are only
  # printed while outside, so the whole matched subtree disappears.
  local awk_prog="{if(!i && /${opener_re}/){i=1}else{if(i){if(/<div/){i++} if(/<\/div>/){i--}}else{if(!i){print \$0}} }}"
  awk "$awk_prog" "$html_file" > tmp
  mv tmp "$html_file"
}
# Delete the first N HTML nodes whose opening lines match a pattern,
# each together with its matching closing </div>.
#   $1 - regex matched against each line (the node opener)
#   $2 - HTML file, rewritten in place (via a scratch file 'tmp')
#   $3 - how many matching nodes to drop
remove_n_node() {
  local opener_re="$1"
  local html_file="$2"
  local how_many="$3"
  # l counts removals still allowed; i is the inside-a-node depth.
  local awk_prog="BEGIN {l=${how_many}} {if( !i && l>0 && /${opener_re}/ ){i=1;l--}else{if(i){if(/<div/){i++} if(/<\/div>/){i--}}else{if(!i){print \$0}} }}"
  awk "$awk_prog" "$html_file" > tmp
  mv tmp "$html_file"
}
# Keep only the first N HTML nodes whose opening lines match a pattern;
# every later matching node (with its closing </div>) is dropped.
# Non-matching content outside nodes is always kept.
#   $1 - regex matched against each line (the node opener)
#   $2 - HTML file, rewritten in place (via a scratch file 'tmp')
#   $3 - how many matching nodes to keep
keep_n_node() {
  local opener_re="$1"
  local html_file="$2"
  local how_many="$3"
  # While l>0, matching openers are printed (kept); once the quota is
  # used up, each further match raises depth i and its subtree is skipped.
  local awk_prog="BEGIN {l=${how_many}} {if(l > 0 && /${opener_re}/ ){l--;print \$0}else{if(!i && /${opener_re}/ ){i=1;l--}else{if(i){if(/<div/){i++} if(/<\/div>/){i--}}else{if(!i){print \$0}} }}}"
  awk "$awk_prog" "$html_file" > tmp
  mv tmp "$html_file"
}
# Drop everything before the first line containing '<': phantomjs can
# print error noise ahead of the real HTML, and this strips it.
#   $1 - file rewritten in place (via a scratch file 'tmp')
remove_errors() {
  local html_file="$1"
  awk '/</{i++}i' "$html_file" > tmp
  mv tmp "$html_file"
}
# We remove the margin on the left of the main block
# (inline CSS overrides injected straight into the scraped HTML).
sed -i -e 's/id="doc_container"/id="doc_container" style="min-width:0px;margin-left : 0px;"/g' page.html
sed -i -e 's/class="sidebar_column"/class="sidebar_column" style="display:none;"/g' page.html
sed -i -e 's/class="document_column"/class="document_column" style="left : 0px;"/g' page.html
# We remove all html elements which are useless (menus...)
# Each '-' below is a progress tick printed on a single line.
echo -n "-"
remove_errors "page.html"
echo -n "-"
remove_node '<div.*id="global_header"' "page.html"
echo -n "-"
remove_node '<div class="header_spacer"' "page.html"
echo -n "-"
remove_node '<div class="page_blur_promo"' "page.html"
echo -n "-"
remove_node '<div.*id="doc_info"' "page.html"
echo -n "-"
remove_node '<div.*class="toolbar_spacer"' "page.html"
echo -n "-"
remove_node '<div.*between_page_ads_1' "page.html"
echo -n "-"
remove_node 'id="leaderboard_ad_main">' "page.html"
echo -n "-"
# Remove the space between pages
remove_node 'class="page_missing_explanation ' "page.html"
echo -n "-"
remove_node '<div id="between_page_ads' "page.html"
echo -n "-"
remove_node '<div class="b_..">' "page.html"
echo -n "-"
remove_node '<div class="buy_doc_bar' "page.html"
# Kill the per-page margin so pages stack flush for the screenshot crop.
sed -i -e 's/<div class="outer_page/<div style="margin: 0px;" class="outer_page/g' page.html
# Remove shadow on forbidden pages
echo -n "-"
remove_node '<div class="shadow_overlay">' "page.html"
echo -n "-"
remove_node 'grab_blur_promo_here' "page.html"
echo -n "-"
remove_node 'missing_page_buy_button' "page.html"
echo -e "\nDone"
# We download the page with images
echo "Downloading page..."
# Automatic size detection unless width/height were given on the command line
if [ -z "$4" ]
then
    #### The page size is detected automatically.
    # New way : read Scribd's own origWidth/origHeight metadata; with this
    # way it should be possible to choose the size of each page.
    # (useless 'cat | grep' pipes replaced by direct grep on the file,
    # and deprecated 'egrep' by 'grep -E'.)
    width_no_zoom="$(grep -o '\"origWidth\": [0-9]*' page.html | head -n 1 | awk -F ' ' '{print $2}')"
    height_no_zoom="$(grep -o '\"origHeight\": [0-9]*' page.html | head -n 1 | awk -F ' ' '{print $2}')"
    # Fallback: take the first two pixel values on the first page container.
    if [ -z "$width_no_zoom" ]
    then
        echo "The first detection didn't work..."
        width_no_zoom="$(grep 'id=\"outer_page_1' page.html | grep -E -o '[0-9]+px' | grep -E -o '[0-9]+' | awk 'NR == 1')"
        height_no_zoom="$(grep 'id=\"outer_page_1' page.html | grep -E -o '[0-9]+px' | grep -E -o '[0-9]+' | awk 'NR == 2')"
    else
        echo "Detection successfull !"
        # If it works we modify the Javascript to have the good width
        sed -i -e "s/var defaultViewWidth .*;/var defaultViewWidth = defaultViewWidth || $width_no_zoom;/g" page.html
    fi
else
    # Manual override: <width> is $3, <height> is $4.
    width_no_zoom="$3"
    height_no_zoom="$4"
fi
# space_no_zoom=100
# Vertical gap (in px, pre-zoom) left between stacked pages.
space_no_zoom=0
echo "Width : $width_no_zoom px"
echo "Height : $height_no_zoom px"
# Scale everything by the rendering zoom factor.
width=$(($width_no_zoom * $zoom_precision))
height=$(($height_no_zoom * $zoom_precision))
space=$(($space_no_zoom * $zoom_precision))
# We treat each pages 10 by 10 because phantomjs can't manage to deal
# with big documents (something like 20 pages)
# current_page : pages already rendered to PNG so far.
current_page=0
# leaving_pages : pages still to render.
leaving_pages="$nb_pages"
# max_treat : batch size per phantomjs run.
max_treat=10
# We make a copy in order to remove useless pages
# page_svg.html contains all pages which hasn't been recorded
cp page.html page_svg.html
# We treat pages until all pages are treated.
# Each iteration: keep the next batch of pages in page.html, render them
# to one tall screenshot with phantomjs, crop it into per-page PNGs,
# then drop the rendered pages from page_svg.html.
while [ "$leaving_pages" -gt "0" ]
do
# Size of this batch: a full max_treat batch, or whatever is left.
if [ "$leaving_pages" -lt "$max_treat" ]
then
nb_pages_to_treat="$leaving_pages"
leaving_pages=0
else
nb_pages_to_treat="$max_treat"
leaving_pages="$(($leaving_pages - $max_treat))"
fi
echo "Downloading $nb_pages_to_treat pages ($leaving_pages leaving pages after that, $current_page already downloaded)"
# Restore the not-yet-rendered pages, then keep only this batch.
cp page_svg.html page.html
keep_n_node 'id="outer_page_' "page.html" "$nb_pages_to_treat"
# PhantomJS script: render the local page.html into one tall out.png.
# The viewport is sized to hold the whole batch stacked vertically
# (768 px of slack plus height+space per page, all pre-zoomed values).
echo "var page = require('webpage').create();
output='out.png';
address = 'page.html';
nb_pages = $nb_pages_to_treat;
zoom = $zoom_precision;
width = $width
height = (768+($height+$space)*nb_pages);
page.viewportSize = { width: width, height: height };
page.zoomFactor = zoom;
page.open(address, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
} else {
page.clipRect = { top: 0, left: 0, width: width, height: height };
window.setTimeout(function () {
page.render(output);
phantom.exit();
}, 200);
}
});
// Avoid error messages
page.onError = function(msg, trace) {
};
" > phantom_render.js
$exec_phantomjs phantom_render.js
echo "Done"
### Treatment of the picture
# Separate pages
echo "Treatment... "
for i in `seq 0 $(( $nb_pages_to_treat - 1))`
do
# We add zeros to fill the page number in file name
# (zero-padded so later 0_*.png/0_*.pdf globs sort in page order).
printf -v page_filename "0_%05d.png" "$current_page"
# We select the good page and save it in a new file
$exec_convert out.png -gravity NorthWest -crop ${width}x${height}+0+$(( $i*($height + $space) )) $page_filename
current_page="$(($current_page + 1))"
done
### Remove useless pages in page.html
if [ "$leaving_pages" -ne "0" ]
then
remove_n_node 'id="outer_page_' "page_svg.html" "$nb_pages_to_treat"
fi
done
# Create the pdf file
echo "All pages have been downloaded, I will now create the pdf file"
# This function is used in the pdftk mode
# It combines each pdf two by two (avoid memory error)
# This function modify the input pdf array
function combine_pdf {
# $1 = pdf array (passed by NAME with the files[@] indirection syntax)
# $2 = base new name (prefix for the merged output files)
# $3 = output variable name (assigned via eval at the end)
# One pass merges neighbouring PDFs pairwise, roughly halving the list;
# the caller loops until a single file remains.
declare -a pdf=("${!1}")
base_new_name="$2"
# Empty array
out_pdf=()
i=0
# For each file and it's neightbour...
while [ "$i" -lt "$(( ${#pdf[*]} - 1))" ]
do
output_name="${base_new_name}_${i}.pdf"
# echo "${pdf[$i]} & ${pdf[ $(( $i + 1 )) ]} => $output_name"
# Combine two by two
$exec_pdftk "${pdf[$i]}" "${pdf[ $(( $i + 1 )) ]}" cat output "$output_name"
# Add in the output array
# (indices are sparse — 0, 2, 4, ... — but "${out_pdf[@]}" compacts them)
out_pdf["$i"]="$output_name"
i="$(( $i + 2 ))"
echo -n "-"
done
# If one element hasn't been treated we add it in the output
# (odd-length input: the last file is carried over unmerged)
if [ "$i" -ne "${#pdf[*]}" ]
then
out_pdf["$i"]="${pdf[ $(( ${#pdf[*]} - 1 )) ]}"
fi
# Copy
# eval writes the result back into the caller's variable named by $3.
eval "$3=(\"\${out_pdf[@]}\")"
}
if [ "$pdf_convert_mode" = "convert" ]
then
# Single-shot mode: one big convert call over all PNGs.
# 1240x1753 px is approximately A4 at 150 DPI.
echo "Using convert (can not work with low memory)"
$exec_convert 0_*.png -quality 100 -compress jpeg -gravity center -resize 1240x1753 -extent 1240x1753 -gravity SouthWest -page a4 ../${page_name}.pdf
else
echo "Using pdftk (maybe longer but no memory error)"
echo "You can change the configuration if you prefer convert"
# Convertion of each picture one by one
# (one small PDF per page keeps convert's memory usage low)
for picture in 0_*.png
do
$exec_convert "$picture" -quality 100 -compress jpeg -gravity center -resize 1240x1753 -extent 1240x1753 -gravity SouthWest -page a4 "$picture.pdf"
echo -n "-"
done
echo ""
echo "Listing files..."
files=()
i=0
# List all files
# (glob order is lexicographic, which matches page order thanks to the
# zero-padded names)
for line in 0_*.pdf
do
files[ $i ]="$line"
i="$(( $i + 1 ))"
done
echo "Putting files together..."
# Combine
# Repeatedly merge pairwise until a single PDF remains; combine_pdf
# rewrites 'files' in place on every round.
j=1
while [ "${#files[*]}" -gt "1" ]
do
combine_pdf files[@] "$j" "files"
j="$(( $j + 1 ))"
echo "#"
done
eval "cp \"${files[0]}\" \"../${page_name}.pdf\""
fi
# Back to the directory the script was started from.
cd ..
echo "Done"
echo "The outputfile is $(pwd)/${page_name}.pdf"
echo "Name : ${page_name}.pdf"
# The .tmp scratch directory is kept for debugging; uncomment to clean up.
# rm -rf .tmp