No Description
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

scribd_download.sh 12KB


  1. #!/bin/bash -e
  2. # This script created by Tobias Bora is under GPLv3 Licence
  3. # This script downloads a document from scribd.com and converts it to PDF
  4. # ImageMagick, Phantomjs and pdftk must be installed
  5. # Doc : https://github.com/ariya/phantomjs/wiki/API-Reference-WebPage#wiki-webpage-viewportSize
  6. # Working examples :
  7. # http://fr.scribd.com/doc/63942746/chopin-nocturne-n-20-partition
  8. # http://fr.scribd.com/doc/48491291/partition
  9. # If you don't want to install phantomjs/imagemagick,
  10. # you can just put the phantomjs and convert executables
  11. # in the current directory
  # Backend used to assemble the final PDF. "pdftk" is the default; enable the
  # commented assignment below to use convert instead (it can run into memory
  # issues).
  pdf_convert_mode="pdftk"
  # pdf_convert_mode="convert"

  # A URL is mandatory: without one, list the accepted invocations and stop.
  if [[ -z "$1" ]]; then
    printf '%s\n' \
      "scribd_download.sh <url>" \
      "or if you want to change the number of pages :" \
      "scribd_download.sh <url> <number of pages>" \
      "or if you want to specify the width/height manually :" \
      "scribd_download.sh <url> <number of pages> <width> <height>" \
      "If you don't want to specify the number of pages :" \
      "scribd_download.sh <url> 0 <width> <height>"
    exit 1
  fi
  # Work out how to invoke an external tool and store the result in a variable.
  #
  # The tool is looked up on PATH first. When it is not there, an executable
  # of the same name in the current directory is accepted; its absolute path
  # is stored because the script later changes directory. If neither exists,
  # installation instructions are printed and the script exits with status 1.
  #
  # Arguments:
  #   $1 - tool name (also the file name searched in the current directory)
  #   $2 - name of the variable receiving the command to run
  #   $3 - first line of the "please install" message
  #   $4 - package name shown in the apt-get hint
  #   $5.. - optional extra lines printed after the hint
  locate_tool() {
    local tool=$1 out_var=$2 missing_msg=$3 package=$4
    shift 4
    local candidate

    if command -v "$tool" > /dev/null 2>&1; then
      printf -v "$out_var" '%s' "$tool"
      return 0
    fi

    # The original looked in the *parent* of the current directory
    # (dirname of "readlink -f ."), which contradicted its own messages.
    candidate="$(pwd -P)/$tool"
    if [[ -f "$candidate" && -x "$candidate" ]]; then
      echo "The $tool command has been found in the current dir."
      echo "I'll use it."
      printf -v "$out_var" '%s' "$candidate"
      return 0
    fi

    echo "$missing_msg"
    echo "On ubuntu run :"
    echo "sudo apt-get install $package"
    if [[ $# -gt 0 ]]; then
      printf '%s\n' "$@"
    fi
    exit 1
  }

  locate_tool convert exec_convert \
    "You must install 'convert' from the package imagemagick." imagemagick
  locate_tool phantomjs exec_phantomjs \
    "You must install phantomjs." phantomjs

  # pdftk is only required when it is the selected PDF backend.
  if [[ "$pdf_convert_mode" == "convert" ]]; then
    exec_pdftk="pdftk"
  else
    locate_tool pdftk exec_pdftk \
      "You must install pdftk." pdftk \
      "" \
      "(Or modify the script conf if you don't want it)"
  fi
  url="$1"
  zoom_precision=2

  # Work in a fresh scratch directory below the current one.
  rm -rf .tmp
  mkdir .tmp
  cd .tmp

  # Get the number of pages
  echo "Getting informations..."
  echo "(It can be quite long, and don't worry if"
  echo "you see some errors during the conversion)"
  echo -n " Number of pages... "

  # PhantomJS script that dumps the rendered HTML of the page. The URL is
  # passed as a command-line argument (system.args[1]) instead of being pasted
  # into the JavaScript source, so quotes or backslashes in it cannot break or
  # alter the script.
  printf '%s\n' \
    'var system = require("system");' \
    'var page = require("webpage").create();' \
    '// Avoid error messages' \
    'page.onError = function (msg, trace) {};' \
    'page.open(system.args[1], function () {' \
    '  console.log(page.content);' \
    '  phantom.exit();' \
    '});' \
    > phantom_nb_pages.js
  "$exec_phantomjs" --load-images=no phantom_nb_pages.js "$url" > page.html

  # One matching line per page. grep -c exits with status 1 when the count is
  # 0, which must not abort a script running under "bash -e".
  nb_pages="$(grep -F -c 'document.getElementById("outer_page' page.html || true)"

  if [[ -n "$2" && "$2" != "0" ]]; then
    # An explicit, non-zero page count on the command line wins.
    nb_pages="$2"
  elif [[ -z "$nb_pages" || "$nb_pages" == "0" ]]; then
    # The count is "0", never empty, when nothing matched, so the original
    # emptiness test could not trigger this prompt.
    echo "I can't find the number of pages... Please, how many pages are there in the file ?"
    read -r nb_pages
  fi
  if ! [[ "$nb_pages" =~ ^[0-9]+$ ]]; then
    echo "Invalid number of pages: '$nb_pages'" >&2
    exit 1
  fi
  echo "$nb_pages"

  # Output file name: the page title with spaces turned into underscores and
  # every character other than letters, digits, '.', '_' and '-' removed.
  # (The former tr set '[[:alnum:]]._-' also let '[' and ']' through.)
  page_name="$(grep -E -o '<title>.*</title>' page.html \
    | sed -E 's/<title>(.*)<\/title>/\1/' \
    | sed -e 's/ /_/g' \
    | tr -cd '[:alnum:]._-')"
  # Without a usable title the output would be named ".pdf".
  page_name="${page_name:-scribd_document}"
  echo " Title... $page_name"
  echo "Done."
  # We remove useless parts in files
  echo "Removing useless parts..."

  # Rewrite an HTML file in place so that every tag starts on its own line and
  # any text following a tag is moved to the next line.
  #
  # Arguments:
  #   $1 - file to rewrite (GNU sed: relies on -i and on \n in the replacement)
  split_html_tags() {
    local html_file=$1
    # Start a new line in front of every '<'.
    sed -i -e 's/</\n</g' -- "$html_file" || return 1
    # Break the line after every '>' that is followed by more text. The
    # character after '>' is captured and written back: the previous
    # expression (s/>[^\n]/>\n/g) silently deleted it.
    sed -i -e 's/>\(.\)/>\n\1/g' -- "$html_file"
  }

  split_html_tags page.html
  # Remove every element whose opening line matches a pattern, in place.
  #
  # The file is read line by line. A matching line starts a removed region;
  # inside it each line containing "<div" raises the nesting depth and each
  # line containing "</div>" lowers it, and the region ends when the depth
  # returns to zero. Lines outside removed regions are kept unchanged.
  #
  # Arguments:
  #   $1 - awk regular expression identifying the opening line
  #   $2 - file to rewrite
  # Returns: non-zero if the temporary file or the awk pass fails; the file is
  #          left untouched in that case.
  function remove_node {
    local pattern=$1 target=$2
    local scratch status=0

    # A private temporary file: the fixed name "tmp" used before would
    # overwrite any unrelated file of that name in the working directory.
    scratch=$(mktemp "${target}.XXXXXX") || return 1

    # The pattern travels through the environment rather than being pasted
    # into the awk program, so a '/' in it cannot end the regex early.
    NODE_RE="$pattern" awk '
      BEGIN { re = ENVIRON["NODE_RE"] }
      !depth && $0 ~ re { depth = 1; next }
      depth {
        if (/<div/) depth++
        if (/<\/div>/) depth--
        next
      }
      { print }
    ' "$target" > "$scratch" && cat -- "$scratch" > "$target" || status=$?

    rm -f -- "$scratch"
    return "$status"
  }
  # Remove the first N elements whose opening line matches a pattern, in place.
  #
  # Works like a line-based filter: a matching line starts a removed region
  # (as long as fewer than N regions were removed so far); inside it, lines
  # containing "<div" raise the nesting depth and lines containing "</div>"
  # lower it, and the region ends when the depth returns to zero. Everything
  # else, including later matches, is kept.
  #
  # Arguments:
  #   $1 - awk regular expression identifying the opening line
  #   $2 - file to rewrite
  #   $3 - number of matching elements to remove
  # Returns: non-zero if the temporary file or the awk pass fails; the file is
  #          left untouched in that case.
  function remove_n_node {
    local pattern=$1 target=$2 limit=$3
    local scratch status=0

    # A private temporary file: the fixed name "tmp" used before would
    # overwrite any unrelated file of that name in the working directory.
    scratch=$(mktemp "${target}.XXXXXX") || return 1

    # Pattern and count are passed as data (environment / -v) instead of being
    # pasted into the awk program text.
    NODE_RE="$pattern" awk -v left="$limit" '
      BEGIN { re = ENVIRON["NODE_RE"] }
      !depth && left > 0 && $0 ~ re { depth = 1; left--; next }
      depth {
        if (/<div/) depth++
        if (/<\/div>/) depth--
        next
      }
      { print }
    ' "$target" > "$scratch" && cat -- "$scratch" > "$target" || status=$?

    rm -f -- "$scratch"
    return "$status"
  }
  # Keep the first N elements whose opening line matches a pattern and remove
  # every later matching element, in place.
  #
  # The first N matching lines are printed like any other line, so those
  # elements stay intact. Each later matching line starts a removed region;
  # inside it, lines containing "<div" raise the nesting depth and lines
  # containing "</div>" lower it, and the region ends when the depth returns
  # to zero. Non-matching lines outside removed regions are kept.
  #
  # Arguments:
  #   $1 - awk regular expression identifying the opening line
  #   $2 - file to rewrite
  #   $3 - number of matching elements to keep
  # Returns: non-zero if the temporary file or the awk pass fails; the file is
  #          left untouched in that case.
  function keep_n_node {
    local pattern=$1 target=$2 quota=$3
    local scratch status=0

    # A private temporary file: the fixed name "tmp" used before would
    # overwrite any unrelated file of that name in the working directory.
    scratch=$(mktemp "${target}.XXXXXX") || return 1

    # Pattern and count are passed as data (environment / -v) instead of being
    # pasted into the awk program text.
    NODE_RE="$pattern" awk -v keep="$quota" '
      BEGIN { re = ENVIRON["NODE_RE"] }
      keep > 0 && $0 ~ re { keep--; print; next }
      !depth && $0 ~ re { depth = 1; next }
      depth {
        if (/<div/) depth++
        if (/<\/div>/) depth--
        next
      }
      { print }
    ' "$target" > "$scratch" && cat -- "$scratch" > "$target" || status=$?

    rm -f -- "$scratch"
    return "$status"
  }
  # Drop every line that precedes the first line containing '<', in place.
  # From that line onwards the file is kept unchanged.
  #
  # Arguments:
  #   $1 - file to rewrite
  # Returns: non-zero if the temporary file or the awk pass fails; the file is
  #          left untouched in that case.
  function remove_errors {
    local target=$1
    local scratch status=0

    # A private temporary file: the fixed name "tmp" used before would
    # overwrite any unrelated file of that name in the working directory.
    scratch=$(mktemp "${target}.XXXXXX") || return 1

    awk '/</ { seen = 1 } seen' "$target" > "$scratch" \
      && cat -- "$scratch" > "$target" || status=$?

    rm -f -- "$scratch"
    return "$status"
  }
  # Cancel the left margin of the main document container.
  sed -i -e 's/id="doc_container"/id="doc_container" style="min-width:0px;margin-left : 0px;"/g' page.html

  # One dash is printed before each clean-up step as a progress indicator.
  printf '%s' "-"
  remove_errors "page.html"

  # Elements that are useless for rendering (menus, ads, the space between
  # pages, purchase bars), removed in this order.
  for useless_node in \
    '<div.*id="global_header"' \
    '<div class="header_spacer"' \
    '<div.*id="doc_info"' \
    '<div.*class="toolbar_spacer"' \
    '<div.*between_page_ads_1' \
    'id="leaderboard_ad_main">' \
    'class="page_missing_explanation ' \
    '<div id="between_page_ads' \
    '<div class="b_..">' \
    '<div class="buy_doc_bar'
  do
    printf '%s' "-"
    remove_node "$useless_node" "page.html"
  done

  # Pages are rendered without any margin around them.
  sed -i -e 's/<div class="outer_page/<div style="margin: 0px;" class="outer_page/g' page.html

  # Overlays placed on forbidden pages.
  for useless_node in \
    '<div class="shadow_overlay">' \
    'grab_blur_promo_here' \
    'missing_page_buy_button'
  do
    printf '%s' "-"
    remove_node "$useless_node" "page.html"
  done
  printf '\nDone\n'
  # We download the page with images
  echo "Downloading page..."

  if [[ -z "$4" ]]; then
    # Automatic detection: take the first "origWidth" / "origHeight" values
    # found in the page.
    width_no_zoom="$(grep -o '"origWidth": [0-9]*' page.html | head -n 1 | awk '{print $2}')"
    height_no_zoom="$(grep -o '"origHeight": [0-9]*' page.html | head -n 1 | awk '{print $2}')"

    if [[ -z "$width_no_zoom" ]]; then
      # Fallback: the first two "<number>px" values found on the lines that
      # mention id="outer_page_1" are used as width and height.
      echo "The first detection didn't work..."
      width_no_zoom="$(grep 'id="outer_page_1' page.html | grep -E -o '[0-9]+px' | grep -E -o '[0-9]+' | awk 'NR == 1')"
      height_no_zoom="$(grep 'id="outer_page_1' page.html | grep -E -o '[0-9]+px' | grep -E -o '[0-9]+' | awk 'NR == 2')"
    else
      echo "Detection successfull !"
      # Make the page's own JavaScript use the detected width.
      sed -i -e "s/var defaultViewWidth .*;/var defaultViewWidth = defaultViewWidth || $width_no_zoom;/g" page.html
    fi
  else
    # Width and height given on the command line.
    width_no_zoom="$3"
    height_no_zoom="$4"
  fi

  # space_no_zoom=100
  space_no_zoom=0
  echo "Width : $width_no_zoom px"
  echo "Height : $height_no_zoom px"

  # Stop with a clear message instead of an arithmetic syntax error when a
  # dimension is missing or not a plain number.
  for dimension in "$width_no_zoom" "$height_no_zoom"; do
    if ! [[ "$dimension" =~ ^[0-9]+$ ]]; then
      echo "Unable to determine the page size; please give <width> and <height> on the command line." >&2
      exit 1
    fi
  done

  # 10# forces base 10 so a value with leading zeros is not read as octal.
  width=$(( 10#$width_no_zoom * zoom_precision ))
  height=$(( 10#$height_no_zoom * zoom_precision ))
  space=$(( space_no_zoom * zoom_precision ))
  # Pages are handled in batches of at most 10 because phantomjs can't manage
  # big documents (something like 20 pages).
  current_page=0
  leaving_pages="$nb_pages"
  max_treat=10

  # page_svg.html holds all the pages which haven't been recorded yet; page.html
  # is rebuilt from it for every batch.
  cp page.html page_svg.html

  while (( leaving_pages > 0 )); do
    if (( leaving_pages < max_treat )); then
      nb_pages_to_treat="$leaving_pages"
      leaving_pages=0
    else
      nb_pages_to_treat="$max_treat"
      leaving_pages=$(( leaving_pages - max_treat ))
    fi
    echo "Downloading $nb_pages_to_treat pages ($leaving_pages leaving pages after that, $current_page already downloaded)"

    # Only the pages of this batch stay in page.html.
    cp page_svg.html page.html
    keep_n_node 'id="outer_page_' "page.html" "$nb_pages_to_treat"

    # PhantomJS script rendering page.html into one tall picture, out.png.
    # When the page cannot be loaded the script now exits with status 1;
    # previously phantom.exit() was never reached on that path, which left
    # PhantomJS running forever.
    printf '%s\n' \
      "var page = require('webpage').create();" \
      "output='out.png';" \
      "address = 'page.html';" \
      "nb_pages = $nb_pages_to_treat;" \
      "zoom = $zoom_precision;" \
      "width = $width;" \
      "height = (768+($height+$space)*nb_pages);" \
      "page.viewportSize = { width: width, height: height };" \
      "page.zoomFactor = zoom;" \
      "page.open(address, function (status) {" \
      "  if (status !== 'success') {" \
      "    console.log('Unable to load the address!');" \
      "    phantom.exit(1);" \
      "  } else {" \
      "    page.clipRect = { top: 0, left: 0, width: width, height: height };" \
      "    window.setTimeout(function () {" \
      "      page.render(output);" \
      "      phantom.exit();" \
      "    }, 200);" \
      "  }" \
      "});" \
      "// Avoid error messages" \
      "page.onError = function(msg, trace) {" \
      "};" \
      > phantom_render.js
    # Quoted: the tool may be an absolute path containing spaces.
    "$exec_phantomjs" phantom_render.js
    echo "Done"

    ### Treatment of the picture
    # Cut out.png into one picture per page.
    echo "Treatment... "
    for (( i = 0; i < nb_pages_to_treat; i++ )); do
      # Zero-padded page number so the files sort in page order.
      printf -v page_filename "0_%05d.png" "$current_page"
      # Page i starts i * (height + space) pixels below the top.
      "$exec_convert" out.png -gravity NorthWest \
        -crop "${width}x${height}+0+$(( i * (height + space) ))" "$page_filename"
      current_page=$(( current_page + 1 ))
    done

    ### Remove the pages just rendered from the pending document
    if (( leaving_pages != 0 )); then
      remove_n_node 'id="outer_page_' "page_svg.html" "$nb_pages_to_treat"
    fi
  done

  # Create the pdf file
  echo "All pages have been downloaded, I will now create the pdf file"
  # Merge a list of PDF files two by two (used in the pdftk mode; merging in
  # pairs avoids memory errors).
  #
  # Files 0 and 1 are merged into "<base>_0.pdf", files 2 and 3 into
  # "<base>_2.pdf", and so on, by running: $exec_pdftk A B cat output OUT.
  # With an odd number of files the last one is passed through unchanged.
  # One "-" is printed per merge.
  #
  # Arguments:
  #   $1 - input array reference, written as name[@]
  #   $2 - base name of the merged files
  #   $3 - name of the array variable receiving the resulting file list
  # Globals: exec_pdftk (read)
  # Returns: 1 if $3 is not a valid variable name or a merge fails.
  #
  # All working variables are local and prefixed, so the caller's own
  # variables (for instance a loop counter named i) are left alone.
  function combine_pdf {
    local -a cp_in=("${!1}")
    local cp_base=$2 cp_result_name=$3
    local -a cp_out=()
    local cp_idx=0 cp_count=${#cp_in[@]} cp_merged

    # The name is evaluated below, so only a plain identifier is accepted.
    if ! [[ "$cp_result_name" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
      echo "combine_pdf: invalid output variable name '$cp_result_name'" >&2
      return 1
    fi

    # For each file and its neighbour...
    while (( cp_idx + 1 < cp_count )); do
      cp_merged="${cp_base}_${cp_idx}.pdf"
      "$exec_pdftk" "${cp_in[cp_idx]}" "${cp_in[cp_idx + 1]}" cat output "$cp_merged" || return 1
      cp_out+=("$cp_merged")
      cp_idx=$(( cp_idx + 2 ))
      echo -n "-"
    done

    # An element without a neighbour is carried over as is.
    if (( cp_idx < cp_count )); then
      cp_out+=("${cp_in[cp_count - 1]}")
    fi

    # Copy the result into the caller's array.
    eval "$cp_result_name=(\"\${cp_out[@]}\")"
  }
  # Build ../<page_name>.pdf from the page pictures (0_*.png). File names and
  # tool paths are quoted so that spaces or glob characters in them are taken
  # literally.
  if [[ "$pdf_convert_mode" == "convert" ]]; then
    echo "Using convert (can not work with low memory)"
    # All pictures are converted and assembled in a single convert run.
    "$exec_convert" 0_*.png -quality 100 -compress jpeg -gravity center \
      -resize 1240x1753 -extent 1240x1753 -gravity SouthWest -page a4 \
      "../${page_name}.pdf"
  else
    echo "Using pdftk (maybe longer but no memory error)"
    echo "You can change the configuration if you prefer convert"

    # Conversion of each picture, one by one, into a one-page PDF.
    for picture in 0_*.png; do
      "$exec_convert" "$picture" -quality 100 -compress jpeg -gravity center \
        -resize 1240x1753 -extent 1240x1753 -gravity SouthWest -page a4 \
        "$picture.pdf"
      echo -n "-"
    done
    echo ""

    echo "Listing files..."
    files=(0_*.pdf)

    echo "Putting files together..."
    # Every pass merges the files in pairs; j gives each pass its own base
    # name for the merged files. Stop when a single file is left.
    j=1
    while (( ${#files[@]} > 1 )); do
      combine_pdf 'files[@]' "$j" "files"
      j=$(( j + 1 ))
      echo "#"
    done

    # A plain cp: wrapping it in eval re-parsed the file names as shell code.
    cp -- "${files[0]}" "../${page_name}.pdf"
  fi

  cd ..
  echo "Done"
  echo "The outputfile is $(pwd)/${page_name}.pdf"
  echo "Name : ${page_name}.pdf"
  # rm -rf .tmp