└── scribd-dl
/scribd-dl:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Renders PDF files of books on Scribd.
# Usage: scribd-dl scribd_book_url

# Scratch directory: a dot-directory relative to the current directory,
# named after the script's base name and the shell's PID.
#WORKINGDIR="/tmp/${0##*/}-$$"
WORKINGDIR=".${0##*/}-$$"

# Exactly one argument (the book URL) is accepted; any other argument count,
# or an explicit --help, prints the usage text and exits with status 1.
if [[ $# -ne 1 || $1 = '--help' ]]; then
  printf '%s\n' \
    "Renders PDF files of books on Scribd." \
    "Usage: ${0##*/} scribd_book_url"
  exit 1
fi

# Each entry is "command:product name". The commands are probed in this
# order and the first one that is missing aborts the script.
for requirement in wget:wget convert:ImageMagick pdftk:PDFTK; do
  if ! command -v "${requirement%%:*}" > /dev/null; then
    echo "This script requires ${requirement#*:}."
    exit 1
  fi
done
21 |
# Fetch the book page given as the first argument; wget writes the document
# to stdout, which is captured in $html. A failed download ends the script.
printf 'Getting HTML content... '
html="$(wget -qO - "$1")" || {
  printf '%s\n' "error." "Could not download \"$1\"."
  exit 1
}
echo "done."
28 |
# Prints the base name (without ".pdf") to use for the destination file,
# derived from the text of the first <title> element in the given markup.
# Arguments: $1 - HTML document text
# Outputs:   the name on stdout, with every "/" replaced by "-" so it cannot
#            be taken for a path; "scribd-book" when no non-empty title
#            is found.
#
# NOTE(review): the delimiters of the two parameter expansions had been
# reduced to a bare newline and "*", which always produced an empty name
# (and therefore the output file ".pdf"). "<title>" / "</title>" is assumed
# to be what was meant - confirm against the upstream script.
book-filename() {
  local markup=$1 name=''

  if [[ $markup == *'<title>'*'</title>'* ]]; then
    name="${markup#*<title>}"    # drop everything up to the first <title>
    name="${name%%</title>*}"    # drop the first </title> and what follows
    name="${name//\//-}"
  fi

  printf '%s' "${name:-scribd-book}"
}

filename="$(book-filename "$html")"

echo "Destination PDF: \"$filename.pdf\"."
34 |
# Locate the docManager.assetPrefix assignment in the page. Its absence is
# treated as "this is not a Scribd book page" and ends the script.
printf 'Searching for assetPrefix... '
regexPrefix="docManager.assetPrefix = \""
[[ $html =~ $regexPrefix ]] || {
  printf '%s\n' "error." "\"$1\" is not a valid Scribd book URL."
  exit 1
}

# The prefix is the text between the marker and the next double quote.
prefix="${html#*$regexPrefix}"
prefix="${prefix%%\"*}"
echo "\"$prefix\"."
44 |
# Scan the page line by line and collect one page identifier per line that
# references either a ".jsonp" page asset or a ".jpg" page image under the
# asset prefix. The identifier is the file name without its extension.
printf 'Finding pages to download... '
regexJpg="http://html.scribd.com/$prefix/images/[0-9]+-[0-9a-z]+.jpg"
regexJsonp="http://html[1-4].scribdassets.com/$prefix/pages/[0-9]+-[0-9a-z]+.jsonp"
unset -v pages
while read line; do
  if [[ $line =~ $regexJsonp ]]; then
    # The .jsonp form is checked first and wins when both forms match.
    page="${line##*/pages/}"
    page="${page%%.jsonp*}"
    pages+=("$page")
  elif [[ $line =~ $regexJpg ]]; then
    page="${line##*/images/}"
    page="${page%%.jpg*}"
    pages+=("$page")
  fi
done <<< "$html"

totalPages="${#pages[@]}"
echo "$totalPages pages."
65 |
# Removes the working directory and everything in it, reporting progress
# on stdout.
# Globals: WORKINGDIR (read) - path of the directory to delete
# Outputs: "Cleaning up... done." on stdout
clean-up() {
  printf 'Cleaning up... '
  # ${WORKINGDIR:?} aborts rather than handing rm an empty path, -f keeps a
  # directory that was never created from raising an error (the signal trap
  # can fire before it exists), and -- stops option parsing.
  rm -rf -- "${WORKINGDIR:?}"
  echo "done."
}
71 |
# On SIGINT or SIGTERM: finish the current output line, remove the working
# directory and exit with a failure status.
trap 'echo; clean-up; exit 1' SIGINT SIGTERM

echo "Using working directory \"$WORKINGDIR\"."
# Every later step writes into this directory, so a failure to create it
# (for example an unwritable current directory) must stop the script here.
if ! mkdir -- "$WORKINGDIR"; then
  printf '%s\n' "Could not create working directory \"$WORKINGDIR\"." >&2
  exit 1
fi
76 |
# Downloads one page image with wget, making at most 10 attempts and
# sleeping 2 seconds after each failed one.
# Arguments: $1 - URL of the image
#            $2 - path of the file to write
# Returns:   0 as soon as an attempt succeeds, 1 when all 10 attempts fail
download-page() {
  local url=$1 target=$2 attempt

  # The attempt counter advances on every failure, so the loop is bounded.
  for (( attempt = 0; attempt < 10; attempt++ )); do
    wget -qO "$target" "$url" && return 0
    sleep 2
  done
  return 1
}

# Page files are zero-padded to the number of digits in the page count, so
# that the "*.pdf" glob used when merging lists them in page order.
padding="${#totalPages}"
for pageIndex in "${!pages[@]}"; do
  pageNumber="$((pageIndex + 1))"
  printf -v file "%.${padding}d" "$pageNumber"
  pageUrl="http://html.scribd.com/$prefix/images/${pages[$pageIndex]}.jpg"

  printf 'Rendering PDF data for page %s (of %s)... ' "$pageNumber" "$totalPages"
  if ! download-page "$pageUrl" "$WORKINGDIR/$file.jpg"; then
    printf '%s\n' "error." "Error while downloading \"$pageUrl\"."
    clean-up
    exit 1
  fi

  if ! convert "$WORKINGDIR/$file.jpg" "$WORKINGDIR/$file.pdf"; then
    printf '%s\n' "error." "Could not convert \"$WORKINGDIR/$file.jpg\" to PDF."
    clean-up
    exit 1
  fi

  # Only the single-page PDF is kept; report its size in KB.
  rm "$WORKINGDIR/$file.jpg"
  echo "$(($(stat --printf '%s' "$WORKINGDIR/$file.pdf") / 1024)) KB."
done
105 |
# Merge the per-page PDFs (taken in glob order) into one file inside the
# working directory, then move the result into the current directory.
printf 'Combining all PDF files to one file... '
# A failed merge (for example when no page PDFs exist) must not be reported
# as success further down.
if ! pdftk "$WORKINGDIR/"*".pdf" cat output "$WORKINGDIR/$filename.pdf"; then
  printf '%s\n' "error." "Could not combine the page PDF files."
  clean-up
  exit 1
fi

if ! mv -- "$WORKINGDIR/$filename.pdf" "$filename.pdf"; then
  printf '%s\n' "error." "Could not move the PDF to \"$filename.pdf\"."
  clean-up
  exit 1
fi
echo "$(($(stat --printf '%s' -- "$filename.pdf") / 1024)) KB."

clean-up

echo "PDF file saved as \"$filename.pdf\"."
114 |
--------------------------------------------------------------------------------