├── .gitignore
├── README.md
└── gbd.js

/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
*.png
*.sh
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
google-books-downloader
=======================

Just a proof-of-concept Google Books downloader.

This script started as a pet project to learn about CasperJS, but since it might be helpful to other people, I have decided to release it.

Google Books is a great source of information, but I would love to have an offline version of some books to read later on my iPad.

I wanted to scrape the HTML and extract the image URLs that contain the actual pages, but Google Books lazy-loads the pages as you scroll down.

Thus, the only way to extract the images is to use an actual, JavaScript-enabled browser, or a headless implementation of one, and this is where PhantomJS comes in :)

I have put together some very ugly code that loads a book on Google Books, scrolls all the way down, and saves all the pages in PNG format. Now that CasperJS supports file downloading properly, the script downloads the pages directly instead of printing wget commands for them, but I would like to keep the wget output as an option.

In the future, I would like to generate a PDF with all the pages together; a rough sketch of that idea follows below. Wanna help me out? :)
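None of the PDF part is implemented yet, but here is a rough, untested sketch of how it could work with plain PhantomJS (which the script already depends on): write a throwaway HTML page that embeds every downloaded PNG and let PhantomJS render it to a single PDF. The file name `pdfmerge.js`, the A4 paper size, and the assumption that the pages sit as `*.png` files in the working directory are illustrative choices, not something gbd.js guarantees.

```javascript
// pdfmerge.js -- rough, untested sketch: collect the downloaded *.png files,
// embed them in a temporary HTML page, and render that page to one PDF.
var fs = require('fs'),
    page = require('webpage').create(),
    pngs = fs.list(fs.workingDirectory).filter(function (name) {
        return /\.png$/.test(name);
    }).sort(), // naive lexicographic sort; page order may need fixing
    html = '<html><body style="margin:0">';

pngs.forEach(function (name) {
    html += '<img src="' + name + '" style="width:100%;page-break-after:always;">';
});
html += '</body></html>';

// Temporary page that lists all the images.
fs.write('book.html', html, 'w');

page.paperSize = { format: 'A4', orientation: 'portrait', margin: '0cm' };
page.open(fs.workingDirectory + fs.separator + 'book.html', function (status) {
    if (status === 'success') {
        // render() picks the output format from the file extension.
        page.render('book.pdf');
    } else {
        console.log('Could not open book.html');
    }
    phantom.exit();
});
```

Running `phantomjs pdfmerge.js` in the directory with the downloaded pages should then leave a `book.pdf` behind; the naive sort order is exactly why "change the filenames" is on the TODO list below.
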
Dependencies
------------

- CasperJS 1.0.0-RC1
- PhantomJS 1.6.1

Known Limitations
-----------------

- The book URL is hardcoded
- The script downloads the pages automatically, since the CasperJS download bug was fixed
- It does not download books with limited preview (though I have plans to add proxy and Tor support to bypass the preview limitation :) )

Usage
-----

`casperjs gbd.js`

TODO
----

- [ ] Load the URL from the command line (see the sketch after the script below)
- [ ] Change the filenames to make it easier to combine or read all the files
- [ ] Merge the files to PDF
--------------------------------------------------------------------------------
/gbd.js:
--------------------------------------------------------------------------------
/*jslint node: true */
/*jslint browser: true */
/*jslint plusplus: true */

// URLs of page images that have already been seen, so each page is downloaded once.
var pages = [];

//TODO: load the URL from the command line
var url = "http://books.google.com.br/books?id=AS4DAAAAMBAJ&printsec=frontcover&hl=pt-BR&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false";

// Returns true if `search` is already present in the array.
Array.prototype.exists = function (search) {
    "use strict";
    var i;
    for (i = 0; i < this.length; i++) {
        if (this[i] === search) {
            return true;
        }
    }
    return false;
};

var casper = require('casper').create({
    onError: function (self, m) {
        "use strict";
        console.log('FATAL: ' + m);
        self.exit();
    },
    viewportSize: {
        width: 1024,
        height: 600
    }
});

casper.start(url, function () {
    "use strict";
    // Selector for the grey "loading" indicator (currently unused) and for the
    // viewer's "page down" button.
    var loading_div, page_down;
    loading_div = "div[style*='position: absolute; left: 0px; color: rgb(128, 128, 128); font-size: 13px; background-color: white; bottom: 0px; -webkit-user-select: none;']";
    page_down = ".SPRITE_page_down";
    casper.then(function () {

        this.waitFor(
            function () {
                // Keep paging down until the viewer is scrolled all the way to the bottom.
                this.click(page_down);

                // Compute the remaining scroll distance inside the page context;
                // evaluate() can only return plain values, not DOM nodes.
                var remaining = this.evaluate(function () {
                    var viewport_div, elem;
                    viewport_div = "div[id='viewport'] > div > div > div";
                    elem = document.querySelector(viewport_div).offsetParent;
                    return elem.scrollHeight - elem.scrollTop - elem.offsetHeight;
                });

                return remaining === 0;
            },
            function () {
                // this.echo("Done scrolling");
            },
            function () {
                // this.echo("Timeout!?");
            },
            Infinity //This is a strong candidate to win a TheDailyWTF award :)
        );
    });
});

// Every page image the viewer lazy-loads shows up here as a received resource.
casper.on('resource.received', function (resource) {
    "use strict";
    // Keep only the page image requests (they carry "&pg=P...") and skip the
    // JSON metadata requests ("&jscmd=click3").
    if ((resource.url.indexOf("&pg=P") !== -1) && (resource.url.indexOf("&jscmd=click3") === -1)) {
        if (!pages.exists(resource.url)) {
            pages.push(resource.url);
            var url, file;
            url = resource.url;
            // Name the file after the page identifier in the URL, e.g. "PA12.png".
            file = (url.substring(url.indexOf("&pg=") + 4, url.indexOf("&img"))) + ".png";
            //TODO: Add an option to either output a wget script or download right away
            //this.echo("wget --quiet --output-document='" + file + "' --user-agent=\"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.3) Gecko/2008092416 Firefox/3.0.3\" '" + resource.url + "'");
            try {
                this.echo("Attempting to download file " + file);
                casper.download(resource.url, file);
            } catch (e) {
                this.echo(e);
            }
        }
    }
});

casper.run();
--------------------------------------------------------------------------------
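
A note on the first TODO item: CasperJS already parses the command line into `casper.cli`, so loading the URL from the command line could look roughly like the sketch below. This is untested and only illustrative; the `--wget` flag in particular is a hypothetical option (for the "wget script vs. direct download" TODO comment in gbd.js), not something the script supports today. In gbd.js the `casper` instance would also have to be created before `url` is assigned.

```javascript
// Sketch: read the book URL as the first positional argument, keeping the
// current hardcoded URL as a fallback. Run as: casperjs gbd.js <book-url>
var casper = require('casper').create();

var fallback = "http://books.google.com.br/books?id=AS4DAAAAMBAJ&printsec=frontcover&hl=pt-BR&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false";
var url = casper.cli.args.length > 0 ? casper.cli.args[0] : fallback;

// Hypothetical named option: `casperjs gbd.js --wget <book-url>` could switch
// from downloading the pages to printing wget commands for them.
var printWget = casper.cli.has("wget");

casper.echo("URL: " + url + (printWget ? " (wget mode)" : ""));
casper.exit();
```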