├── .gitignore
├── README.md
└── gbd.js

/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
*.png
*.sh
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
google-books-downloader
=======================

Just a proof-of-concept Google Books downloader.

This script started as a pet project to learn about CasperJS, but since it might be helpful to other people, I have decided to release it.

Google Books is a great source of information, but I would love to have an offline version of some books to read later on my iPad.

I wanted to scrape the HTML and extract the image URLs that contain the actual pages, but Google Books lazy-loads the pages as you scroll down.

Thus, the only way to extract the images is to use an actual, JavaScript-enabled browser, or a headless implementation of one, and this is where PhantomJS comes in :)

I have put together some very ugly code that loads a book on Google Books, scrolls all the way down, and saves all the pages in PNG format. Now that CasperJS supports file downloading properly, the script downloads the pages directly instead of printing wget commands for them, but I would like to keep the wget output as an option.

In the future, I would like to generate a PDF with all the pages together; a rough sketch of that idea follows below. Wanna help me out? :)
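None of the PDF part is implemented yet, but here is a rough, untested sketch of how it could work with plain PhantomJS (which the script already depends on): write a throwaway HTML page that embeds every downloaded PNG and let PhantomJS render it to a single PDF. The file name `pdfmerge.js`, the A4 paper size, and the assumption that the pages sit as `*.png` files in the working directory are illustrative choices, not something gbd.js guarantees.

```javascript
// pdfmerge.js -- rough, untested sketch: collect the downloaded *.png files,
// embed them in a temporary HTML page, and render that page to one PDF.
var fs = require('fs'),
    page = require('webpage').create(),
    pngs = fs.list(fs.workingDirectory).filter(function (name) {
        return /\.png$/.test(name);
    }).sort(), // naive lexicographic sort; page order may need fixing
    html = '<html><body style="margin:0">';

pngs.forEach(function (name) {
    html += '<img src="' + name + '" style="width:100%;page-break-after:always;">';
});
html += '</body></html>';

// Temporary page that lists all the images.
fs.write('book.html', html, 'w');

page.paperSize = { format: 'A4', orientation: 'portrait', margin: '0cm' };
page.open(fs.workingDirectory + fs.separator + 'book.html', function (status) {
    if (status === 'success') {
        // render() picks the output format from the file extension.
        page.render('book.pdf');
    } else {
        console.log('Could not open book.html');
    }
    phantom.exit();
});
```

Running `phantomjs pdfmerge.js` in the directory with the downloaded pages should then leave a `book.pdf` behind; the naive sort order is exactly why "change the filenames" is on the TODO list below.
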
Dependencies
------------

- CasperJS 1.0.0-RC1
- PhantomJS 1.6.1

Known Limitations
-----------------

- The book URL is hardcoded
- The script downloads the pages automatically, since the CasperJS download bug was fixed
- It does not download books with limited preview (though I have plans to add proxy and Tor support to bypass the preview limitation :) )

Usage
-----

`casperjs gbd.js`

TODO
----

- [ ] Load the URL from the command line (see the sketch after the script below)
- [ ] Change the filenames to make it easier to combine or read all the files
- [ ] Merge the files to PDF
--------------------------------------------------------------------------------
/gbd.js:
--------------------------------------------------------------------------------
/*jslint node: true */
/*jslint browser: true */
/*jslint plusplus: true */

// URLs of page images that have already been seen, so each page is downloaded once.
var pages = [];

//TODO: load the URL from the command line
var url = "http://books.google.com.br/books?id=AS4DAAAAMBAJ&printsec=frontcover&hl=pt-BR&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false";

// Returns true if `search` is already present in the array.
Array.prototype.exists = function (search) {
    "use strict";
    var i;
    for (i = 0; i < this.length; i++) {
        if (this[i] === search) {
            return true;
        }
    }
    return false;
};

var casper = require('casper').create({
    onError: function (self, m) {
        "use strict";
        console.log('FATAL: ' + m);
        self.exit();
    },
    viewportSize: {
        width: 1024,
        height: 600
    }
});

casper.start(url, function () {
    "use strict";
    // Selector for the grey "loading" indicator (currently unused) and for the
    // viewer's "page down" button.
    var loading_div, page_down;
    loading_div = "div[style*='position: absolute; left: 0px; color: rgb(128, 128, 128); font-size: 13px; background-color: white; bottom: 0px; -webkit-user-select: none;']";
    page_down = ".SPRITE_page_down";
    casper.then(function () {

        this.waitFor(
            function () {
                // Keep paging down until the viewer is scrolled all the way to the bottom.
                this.click(page_down);

                // Compute the remaining scroll distance inside the page context;
                // evaluate() can only return plain values, not DOM nodes.
                var remaining = this.evaluate(function () {
                    var viewport_div, elem;
                    viewport_div = "div[id='viewport'] > div > div > div";
                    elem = document.querySelector(viewport_div).offsetParent;
                    return elem.scrollHeight - elem.scrollTop - elem.offsetHeight;
                });

                return remaining === 0;
            },
            function () {
                // this.echo("Done scrolling");
            },
            function () {
                // this.echo("Timeout!?");
            },
            Infinity //This is a strong candidate to win a TheDailyWTF award :)
        );
    });
});

// Every page image the viewer lazy-loads shows up here as a received resource.
casper.on('resource.received', function (resource) {
    "use strict";
    // Keep only the page image requests (they carry "&pg=P...") and skip the
    // JSON metadata requests ("&jscmd=click3").
    if ((resource.url.indexOf("&pg=P") !== -1) && (resource.url.indexOf("&jscmd=click3") === -1)) {
        if (!pages.exists(resource.url)) {
            pages.push(resource.url);
            var url, file;
            url = resource.url;
            // Name the file after the page identifier in the URL, e.g. "PA12.png".
            file = (url.substring(url.indexOf("&pg=") + 4, url.indexOf("&img"))) + ".png";
            //TODO: Add an option to either output a wget script or download right away
            //this.echo("wget --quiet --output-document='" + file + "' --user-agent=\"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.3) Gecko/2008092416 Firefox/3.0.3\" '" + resource.url + "'");
            try {
                this.echo("Attempting to download file " + file);
                casper.download(resource.url, file);
            } catch (e) {
                this.echo(e);
            }
        }
    }
});

casper.run();
--------------------------------------------------------------------------------
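
A note on the first TODO item: CasperJS already parses the command line into `casper.cli`, so loading the URL from the command line could look roughly like the sketch below. This is untested and only illustrative; the `--wget` flag in particular is a hypothetical option (for the "wget script vs. direct download" TODO comment in gbd.js), not something the script supports today. In gbd.js the `casper` instance would also have to be created before `url` is assigned.

```javascript
// Sketch: read the book URL as the first positional argument, keeping the
// current hardcoded URL as a fallback. Run as: casperjs gbd.js <book-url>
var casper = require('casper').create();

var fallback = "http://books.google.com.br/books?id=AS4DAAAAMBAJ&printsec=frontcover&hl=pt-BR&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false";
var url = casper.cli.args.length > 0 ? casper.cli.args[0] : fallback;

// Hypothetical named option: `casperjs gbd.js --wget <book-url>` could switch
// from downloading the pages to printing wget commands for them.
var printWget = casper.cli.has("wget");

casper.echo("URL: " + url + (printWget ? " (wget mode)" : ""));
casper.exit();
```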