├── .editorconfig ├── .gitignore ├── .jshintrc ├── .travis.yml ├── CHANGELOG.md ├── Gruntfile.js ├── LICENSE ├── README.md ├── bin └── scraperjs ├── doc └── examples │ ├── ErrorHandling.js │ ├── HackerNews.js │ ├── IMDBOpeningThisWeek.js │ ├── LinkGetter.js │ └── WikimediaScraper.js ├── package.json ├── src ├── AbstractScraper.js ├── DynamicScraper.js ├── PhantomPoll.js ├── PhantomWrapper.js ├── Router.js ├── Scraper.js ├── ScraperError.js ├── ScraperPromise.js └── StaticScraper.js └── test ├── AbstractScraper.js ├── DynamicScraper.js ├── Router.js ├── ScraperError.js ├── ScraperPromise.js ├── StaticScraper.js ├── commandLine.js ├── setupServer.js └── static ├── code.js └── hacker-news-clone.html /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | 4 | [*] 5 | 6 | indent_style = tab 7 | indent_size = 2 8 | 9 | end_of_line = lf 10 | charset = utf-8 11 | trim_trailing_whitespace = true 12 | insert_final_newline = true 13 | 14 | [*.md] 15 | trim_trailing_whitespace = false 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | .project 4 | .DS_Store? 5 | ._* 6 | .Spotlight-V100 7 | .Trashes 8 | Icon? 
9 | ehthumbs.db 10 | Thumbs.db 11 | *.sublime-project 12 | *.sublime-workspace 13 | *.ignore 14 | node_modules 15 | bower_components 16 | docs 17 | coverage -------------------------------------------------------------------------------- /.jshintrc: -------------------------------------------------------------------------------- 1 | { 2 | "bitwise": false, 3 | "camelcase": true, 4 | "curly": true, 5 | "eqeqeq": false, 6 | "es3": false, 7 | "freeze": true, 8 | "immed": true, 9 | "latedef": true, 10 | "newcap": true, 11 | "noarg": false, 12 | "noempty": true, 13 | "nonbsp": true, 14 | "nonew": true, 15 | "plusplus": false, 16 | "quotmark": "single", 17 | "undef": true, 18 | "unused": true, 19 | 20 | "trailing": true, 21 | "boss": true, 22 | "browser": false, 23 | "node": true 24 | } 25 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: node_js 3 | node_js: 4 | - "0.12" 5 | - "4.0" 6 | - "0.11" 7 | - "0.10" 8 | script: grunt coveralls && grunt clean 9 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | ## [1.2.0] - 2015-12-14 4 | - Change order of parameters when continuing promise chain. 5 | - ``` async ```'s callback function receives (err, result) parameters. 6 | - Last result is passed to ``` done ``` promise. 7 | 8 | ## [1.1.0] - 2015-12-12 9 | - Result of last promise is passed as first parameter to the ``` async ``` promise. 10 | 11 | ## [1.0.2] - 2015-12-10 12 | - Dependency bump 13 | 14 | ## [1.0.0] - 2015-10-17 15 | - ``` catch ``` promise is the new standard way to deal with errors. ``` onError ``` is being deprecated, the two work the same way. 
16 | - ``` then ``` promise receives the value returned in the last promise as the first parameter, the second parameter is the ``` utils ``` object. 17 | - Errors generated inside the dynamic scraper's scraping function will fire the ``` catch ``` promise. 18 | 19 | ## [0.4.1] - 2015-10-04 20 | - Url of the page being scraped can now be easily accessed using ``` utils.url ```. 21 | - Added error handling example. 22 | 23 | ## [0.4.0] - 2015-09-19 24 | - Passing utils to the error callback by @rvernica 25 | - Add an 'options' argument to DynamicScraper that get passed to Phantom by @vdraceil 26 | - Updated dependencies 27 | 28 | ## [0.3.4] - 2015-04-26 29 | - Minor fixes related with documentation. 30 | - Fixed ``` async ``` promise works. It can receive values to be passed to the next promise. Internally it now uses this mechanism. 31 | - Support for node 0.12. 32 | - Changes in the command-line interface. 33 | 34 | ## [0.3.3] - 2015-02-11 35 | - Fixed bug where no argument was given to the ```done``` promise when there was an error. 36 | - Added experimental support for command-line interface. 37 | - Added example. 38 | 39 | ## [0.3.2] - 2014-12-20 40 | - The ```lastResult``` is now made accessible to a ```Router```. 
41 | -------------------------------------------------------------------------------- /Gruntfile.js: -------------------------------------------------------------------------------- 1 | var testServer = require('./test/setupServer'); 2 | 3 | var MOCHA_TIMEOUT_S = 10, 4 | MOCHA_TIMEOUT_MS = MOCHA_TIMEOUT_S * 1000, 5 | MOCHA_OPTIONS = { 6 | reporter: 'spec', 7 | timeout: MOCHA_TIMEOUT_MS 8 | }, 9 | COVERAGE_THRESHOLD = 95; 10 | 11 | module.exports = function(grunt) { 12 | grunt.loadNpmTasks('grunt-mocha-test'); 13 | grunt.loadNpmTasks('grunt-contrib-jshint'); 14 | grunt.loadNpmTasks('grunt-contrib-watch'); 15 | grunt.loadNpmTasks('grunt-contrib-clean'); 16 | grunt.loadNpmTasks('grunt-exec'); 17 | 18 | grunt.initConfig({ 19 | exec: { 20 | coverage: { 21 | command: 'istanbul cover ./node_modules/mocha/bin/_mocha -x src/PhantomWrapper.js -- -t ' + MOCHA_TIMEOUT_MS + ' --root src/ test/' 22 | }, 23 | coveralls: { 24 | command: 'istanbul cover ./node_modules/mocha/bin/_mocha -x src/PhantomWrapper.js --report lcovonly -- -t ' + MOCHA_TIMEOUT_MS + ' -x src/PhantomWrapper.js --root src/ test/ && cat ./coverage/lcov.info | ./node_modules/coveralls/bin/coveralls.js' 25 | }, 26 | 'check-coverage': { 27 | command: 'istanbul check-coverage --lines ' + COVERAGE_THRESHOLD + ' --statements ' + COVERAGE_THRESHOLD + ' --functions ' + COVERAGE_THRESHOLD + ' --branches ' + COVERAGE_THRESHOLD + ' ./coverage/coverage.json' 28 | } 29 | }, 30 | clean: { 31 | coverage: { 32 | src: ['coverage/'] 33 | } 34 | }, 35 | watch: { 36 | common: { 37 | files: ['src/**/*.js', 'test/**/*.js', 'Gruntfile.js'], 38 | tasks: ['test'] 39 | } 40 | }, 41 | jshint: { 42 | all: ['src/**/*.js', 'test/**/*.js'] 43 | }, 44 | mochaTest: { 45 | abstractScraper: { 46 | src: 'test/AbstractScraper.js', 47 | options: MOCHA_OPTIONS 48 | }, 49 | staticScraper: { 50 | src: 'test/StaticScraper.js', 51 | options: MOCHA_OPTIONS 52 | }, 53 | dynamicScraper: { 54 | src: 'test/DynamicScraper.js', 55 | options: MOCHA_OPTIONS 56 | 
}, 57 | scraperPromise: { 58 | src: 'test/ScraperPromise.js', 59 | options: MOCHA_OPTIONS 60 | }, 61 | router: { 62 | src: 'test/Router.js', 63 | options: MOCHA_OPTIONS 64 | }, 65 | scraperError: { 66 | src: 'test/ScraperError.js', 67 | options: MOCHA_OPTIONS 68 | }, 69 | commandLine: { 70 | src: 'test/commandLine.js', 71 | options: MOCHA_OPTIONS 72 | }, 73 | all: { 74 | src: ['test/AbstractScraper.js', 'test/StaticScraper.js', 'test/DynamicScraper.js', 'test/ScraperPromise.js', 'test/Router.js', 'test/ScraperError.js', 'test/commandLine.js'], 75 | options: MOCHA_OPTIONS 76 | } 77 | } 78 | }); 79 | 80 | var server; 81 | 82 | grunt.registerTask('serve', 'Starts express testing server', function() { 83 | server = testServer(grunt); 84 | }); 85 | 86 | grunt.registerTask('unserve', function() { 87 | if (server) { 88 | server.close(); 89 | } 90 | }); 91 | 92 | grunt.registerTask('serve-and-test', ['serve', 'mochaTest:all', 'unserve']); 93 | 94 | grunt.registerTask('coverage', ['clean', 'jshint', 'serve', 'exec:coverage', 'exec:check-coverage', 'unserve']); 95 | grunt.registerTask('coveralls', ['clean', 'jshint', 'serve', 'exec:coveralls', 'exec:check-coverage', 'unserve']); 96 | 97 | grunt.registerTask('unit', ['jshint', 'serve-and-test']); 98 | grunt.registerTask('test', ['coverage']); 99 | 100 | grunt.registerTask('watch-all', ['serve', 'watch', 'unserve']); 101 | }; -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Copyright (C) 2013 Rui Gil 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | 
furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scraperjs 2 | [![Build Status](https://travis-ci.org/ruipgil/scraperjs.svg?branch=master)](https://travis-ci.org/ruipgil/scraperjs) [![Dependency Status](https://gemnasium.com/ruipgil/scraperjs.svg)](https://gemnasium.com/ruipgil/scraperjs) [![Coverage Status](https://coveralls.io/repos/ruipgil/scraperjs/badge.svg?branch=master)](https://coveralls.io/r/ruipgil/scraperjs?branch=master) [![NPM version](https://badge.fury.io/js/scraperjs.svg)](http://badge.fury.io/js/scraperjs) [![Inline docs](http://inch-ci.org/github/ruipgil/scraperjs.svg?branch=master)](http://inch-ci.org/github/ruipgil/scraperjs) 3 | 4 | Scraperjs is a web scraper module that makes scraping the web an easy job. 
5 | 6 | ## Installing 7 | 8 | ``` 9 | npm install scraperjs 10 | ``` 11 | 12 | If you would like to test (this is optional and requires the installation with the ``` --save-dev ``` tag), 13 | ``` 14 | grunt test 15 | ``` 16 | 17 | *To use some features you’ll need to install [phantomjs](http://phantomjs.org/download.html), if you haven’t already* 18 | 19 | # Getting started 20 | 21 | Scraperjs exposes two different scrapers, 22 | + a **StaticScraper**, that is light fast and with a low footprint, however it doesn't allow for more complex situations, like scraping dynamic content. 23 | + a **DynamicScraper**, that is a bit more heavy, but allows you to scrape dynamic content, like in the browser console. 24 | both scrapers expose a *very* similar API, with some minor differences when it comes to scraping. 25 | 26 | ## Lets scrape [Hacker News](https://news.ycombinator.com/), with both scrapers. 27 | 28 | Try to spot the differences. 29 | 30 | ### Static Scraper 31 | 32 | ```javascript 33 | var scraperjs = require('scraperjs'); 34 | scraperjs.StaticScraper.create('https://news.ycombinator.com/') 35 | .scrape(function($) { 36 | return $(".title a").map(function() { 37 | return $(this).text(); 38 | }).get(); 39 | }) 40 | .then(function(news) { 41 | console.log(news); 42 | }) 43 | ``` 44 | 45 | The ```scrape``` promise receives a function that will scrape the page and return the result, it only receives jQuery a parameter to scrape the page. Still, very powerful. It uses [cheerio](https://github.com/cheeriojs/cheerio) to do the magic behind the scenes. 
46 | 47 | ### Dynamic Scraper 48 | 49 | ```javascript 50 | var scraperjs = require('scraperjs'); 51 | scraperjs.DynamicScraper.create('https://news.ycombinator.com/') 52 | .scrape(function($) { 53 | return $(".title a").map(function() { 54 | return $(this).text(); 55 | }).get(); 56 | }) 57 | .then(function(news) { 58 | console.log(news); 59 | }) 60 | ``` 61 | 62 | Again, the ```scrape``` promise receives a function to scrape the page, the only difference is that, because we're using a dynamic scraper, the scraping function is [sandboxed](https://github.com/sgentle/phantomjs-node/wiki#evaluating-pages) only with the page scope, so **no closures!** This means that in *this* (and only in this) scraper you can't call a function that has not been defined inside the scraping function. Also, the result of the scraping function must be [JSON-serializable](https://github.com/sgentle/phantomjs-node/wiki#evaluating-pages). 63 | We use [phantom](https://github.com/sgentle/phantomjs-node) and [phantomjs](https://github.com/ariya/phantomjs) to make it happen, we also inject jQuery for you. 64 | 65 | However, it's possible to [pass JSON-serializable data](**Example**) to *any* scraper. 66 | 67 | *The ```$``` variable received by the scraping function is, only for the dynamic scraper, hardcoded.* 68 | 69 | ## Show me the way! (aka Routes) 70 | 71 | For a more flexible scraping and crawling of the web sometimes we need to go through multiple web sites and we don't want to map every possible url format. For that scraperjs provides the Router class. 
72 | 73 | ### Example 74 | 75 | ```javascript 76 | var scraperjs = require('scraperjs'), 77 | router = new scraperjs.Router(); 78 | 79 | router 80 | .otherwise(function(url) { 81 | console.log("Url '"+url+"' couldn't be routed."); 82 | }); 83 | 84 | var path = {}; 85 | 86 | router.on('https?://(www.)?youtube.com/watch/:id') 87 | .createStatic() 88 | .scrape(function($) { 89 | return $("a").map(function() { 90 | return $(this).attr("href"); 91 | }).get(); 92 | }) 93 | .then(function(links, utils) { 94 | path[utils.params.id] = links 95 | }) 96 | 97 | router.route("https://www.youtube.com/watch/YE7VzlLtp-4", function() { 98 | console.log("i'm done"); 99 | }); 100 | ``` 101 | 102 | Code that allows for parameters in paths is from the project [Routes.js](https://github.com/aaronblohowiak/routes.js), information about the [path formating](https://github.com/aaronblohowiak/routes.js#path-formats) is there too. 103 | 104 | # API overview 105 | 106 | Scraperjs uses promises whenever possible. 107 | 108 | #### StaticScraper, DynamicScraper and ScraperPromise 109 | 110 | So, the scrapers should be used with the ScraperPromise. By creating a scraper 111 | ```javascript 112 | var scraperPromise = scraperjs.StaticScraper.create() // or DynamicScraper 113 | ``` 114 | The following promises can be made over it, they all return a scraper promise, 115 | + ```onStatusCode(code:number, callback:function(utils:Object))```, executes the callback when the status code is equal to the code, 116 | + ```onStatusCode(callback:function(code:number, utils:Object))```, executes the callback when receives the status code. 
The callback receives the current status code, 117 | + ```delay(time:number, callback:function(last:?, utils:Object))```, delays the execution of the chain by time (in milliseconds), 118 | + ```timeout(time:number, callback:function(last:?, utils:Object))```, executes the callback function after time (in milliseconds), 119 | + ```then(lastResult:?, callback:function(last:?, utils:Object))```, executes the callback after the last promise, 120 | + ```async(callback:function(last:?, done:function(result:?, err:?), utils))```, executes the callback, stopping the promise chain, resuming it when the ```done``` function is called. You can provide a result to be passed down the promise chain, or an error to trigger the catch promise, 121 | + ```catch(callback:function(error:Error, utils:Object))```, executes the callback when there was an error, errors block the execution of the chain even if the promise was not defined, 122 | + ```done(callback:function(last:?, utils:Object))```, executes the callback at the end of the promise chain, this is always executed, even if there was an error, 123 | + ```get(url:string)```, makes a simple HTTP GET request to the url. This promise should be used only once per scraper. 124 | + ```request(options:Object)```, makes a (possibly) more complex HTTP request, scraperjs uses the [request](https://github.com/mikeal/request) module, and this method is a simple wrapper of ```request.request()```. This promise should be used only once per scraper. 125 | + ```scrape(scrapeFn:function(...?), callback:function(result:?, utils:Object)=, ...?)```, scrapes the page. It executes the scrapeFn and passes it's result to the callback. When using the StaticScraper, the scrapeFn receives a jQuery function that is used to scrape the page. When using the DynamicScraper, the scrapeFn doesn't receive anything and can only return a [JSON-serializable](https://github.com/sgentle/phantomjs-node/wiki#evaluating-pages) type. 
Optionally an arbitrary number of arguments can be passed to the scraping function. The callback may be omitted, if so, the result of the scraping may be accessed with the ``` then ``` promise or ``` utils.lastReturn ``` in the next promise. 126 | 127 | All callback functions receive as their last parameter a utils object, with it the parameters of an url from a router can be accessed. Also the chain can be stopped. 128 | ```javascript 129 | DynamicScraper.create() 130 | .get("http://news.ycombinator.com") 131 | .then(function(_, utils) { 132 | utils.stop(); 133 | // utils.params.paramName 134 | }); 135 | ``` 136 | 137 | The promise chain is fired with the same sequence it was declared, with the exception of the promises get and request that fire the chain when they've received a valid response, and the promises ``` done ``` and ``` catch ```, which were explained above. 138 | 139 | You can also waterfall values between promises by returning them (with the exception of the promise ```timeout```, that will always return ```undefined```) and it can be access through ```utils.lastReturn```. 140 | 141 | ##### The ``` utils ``` object 142 | 143 | You've seen the ``` utils ``` object that is passed to promises, it provides useful information and methods to your promises. Here's what you can do with it: 144 | + ``` .lastResult ```, value returned in the last promise 145 | + ``` .stop() ```, function to stop the promise chain, 146 | + ``` .url ```, url provided to do the scraper, 147 | + ``` .params ```, object with the parameters defined in the router matching pattern. 148 | 149 | 150 | ##### A more powerful DynamicScraper. 151 | 152 | When lots of instances of DynamicScraper are needed, it's creation gets really heavy on resources and takes a lot of time. To make this more lighter you can use a *factory*, that will create only one PhantomJS instance, and every DynamicScraper will request a page to work with. 
To use it you must start the factory before any DynamicScraper is created, ``` scraperjs.DynamicScraper.startFactory() ``` and then close the factory after the execution of your program, ``` scraperjs.DynamicScraper.closeFactory() ```. 153 | To make the scraping function more robust you can inject code into the page, 154 | ```js 155 | var ds = scraperjs.DynamicScraper 156 | .create('http://news.ycombinator.com') 157 | .async(function(_, done, utils) { 158 | utils.scraper.inject(__dirname+'/path/to/code.js', function(err) { 159 | // in this case if there was an error won't fire catch promise. 160 | if(err) { 161 | done(err); 162 | } else { 163 | done(); 164 | } 165 | }); 166 | }) 167 | .scrape(function() { 168 | return functionInTheCodeInjected(); 169 | }) 170 | .then(function(result) { 171 | console.log(result); 172 | }); 173 | ``` 174 | 175 | #### Router 176 | 177 | The router should be initialized like a class 178 | ```javascript 179 | var router = new scraperjs.Router(options); 180 | ``` 181 | 182 | The options object is optional, and these are the options: 183 | + ``` firstMatch ```, a boolean, if true the routing will stop once the first path is matched, the default is false. 184 | 185 | The following promises can be made over it, 186 | + ```on(path:string|RegExp|function(url:string))```, makes the promise for the match url or regular expression, alternatively you can use a function to accept or not a passed url. The promises ```get``` or ```request``` and ```createStatic``` or ```createDynamic``` are expected after the on promise. 
187 | + ```get()```, makes so that the page matched will be requested with a simple HTTP request, 188 | + ```request(options:Object)```, makes so that the page matched will be requested with a possible more complex HTTP request, , scraperjs uses the [request](https://github.com/mikeal/request) module, and this method is a simple wrapper of [request.request()](https://github.com/mikeal/request#requestoptions-callback), 189 | + ```createStatic()```, associates a static scraper to use to scrape the matched page, this returns ScraperPromise, so any promise made from now on will be made over a ScraperPromise of a StaticScraper. Also the ```done``` promise of the scraper will not be available. 190 | + ```createDynamic()```, associates a dynamic scraper to use to scrape the matched page, this returns ScraperPromise, so any promise made from now on will be made over a ScraperPromise of a DynamicScraper. Also the ```done``` promise of the scraper will not be available. 191 | + ```route(url:string, callback:function(boolean))```, routes an url through all matched paths, calls the callback when it's executed, true is passed if the route was successful, false otherwise. 192 | + ```use(scraperInstance:ScraperPromise)```, uses a ScraperPromise already instantiated. 193 | + ```otherwise(callback:function(url:string))```, executes the callback function if the routing url didn't match any path. 194 | + ```catch(callback:function(url:string, error:Error))```, executes the callback when an error occurred on the routing scope, not on any scraper, for that situations you should use the ```catch``` promise of the scraper. 195 | 196 | #### Notes 197 | 198 | * Scraperjs **always** fetches the document with `request`, and then when using a DynamicScraper, leverages phantom's `setContent()` to set the body of the page object. This will result in subtly different processing of web pages compared to directly loading a URL in PhantomJS. 
199 | 200 | #### More 201 | 202 | Check the [examples](./doc/examples), the [tests](./test) or just dig into the code, it's well documented and it's simple to understand. 203 | 204 | # Dependencies 205 | 206 | As mentioned above, scraperjs uses some dependencies to do the heavy work, such as 207 | + [```async```](https://github.com/caolan/async), for flow control 208 | + [```request```](https://github.com/mikeal/request), to make HTTP requests, again, if you want more complex requests see it's [documentation](https://github.com/mikeal/request#requestoptions-callback) 209 | + [```phantom```](https://github.com/sgentle/phantomjs-node) + [```phantomjs```](https://github.com/ariya/phantomjs), phantom is an awesome module that links node to phantom, used in the DynamicScraper 210 | + [```cheerio```](https://github.com/cheeriojs/cheerio), light and fast DOM manipulation, used to implement the StaticScraper 211 | + [```jquery```](https://github.com/jquery/jquery), to include jquery in the DynamicScraper 212 | + although [```Routes.js```](https://github.com/aaronblohowiak/routes.js) is great, scraperjs doesn't use it to maintain it's "interface layout", but the code to transform the path given on the on promise to regular expressions is from them 213 | 214 | # License 215 | 216 | This project is under the [MIT](./LICENSE) license. 
217 | -------------------------------------------------------------------------------- /bin/scraperjs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | /* 4 | scraperjs http://example.com --text #title-id // returns array with text of #title-id's 5 | scraperjs http://example.com --html #title-id // returns the html 6 | scraperjs http://example.com --text #title-id -D // uses dynamic scraper 7 | scraperjs http://example.com --text #title-id -S // uses static scraper 8 | scraperjs http://example.com --eval "function($) { return $('#title-id').map(function() { return $(this).text(); }).get(); }" 9 | scraperjs http://example.com http://example.org ... --text #title-id 10 | 11 | 12 | 13 | scraperjs http://example.com --delay 15 --text #title-id -D 14 | scraperjs http://example.com --text #title-id --then "function(utils) { return utils.lastResult; }" --delay 15 -D 15 | */ 16 | 17 | var program = require('commander'), 18 | sjs = require('../'), 19 | async = require('async'); 20 | 21 | program 22 | .version(require('../package.json').version) 23 | .usage(['url [url ...] --text --selector -s', 24 | 'url [url ...] --html --selector -s', 25 | 'url [url ...] --attr --selector -s', 26 | 'url [url ...] --eval -s'].join('\n\t\t')) 27 | .option('--selector ', 'Selects an element') 28 | .option('--text', 'Extracts the text from the selector element.') 29 | .option('--html', 'Extracts the html from the selector element.') 30 | .option('--attr ', 'Extracts an atribute of the selector element.') 31 | .option('--eval ', 'Uses a function to scrape, providing only it\'s body.') 32 | .option('-s, --static', 'Uses the static scraper. Used by default.', true) 33 | .option('-d, --dynamic', 'Uses the dynamic scraper.', false) 34 | .parse(process.argv); 35 | 36 | var urls = program.args; 37 | var ScraperType = (program.dynamic ? 
'Dynamic' : 'Static' ) + 'Scraper'; 38 | var fn, args; 39 | 40 | if(program.selector && (program.text || program.html || program.attr)) { 41 | fn = function athScraper($, obj) { 42 | return $(obj.selector).map(function(){ 43 | return obj.which?$(this)[obj.what](obj.which):$(this)[obj.what](); 44 | }).get(); 45 | }; 46 | args = { 47 | selector: program.selector, 48 | what: undefined, 49 | which: undefined 50 | }; 51 | 52 | var temp; 53 | if(program.attr) { 54 | args.what = 'attr'; 55 | args.which = program.attr; 56 | }else if(program.html) { 57 | args.what = 'html'; 58 | }else if(program.text){ 59 | args.what = 'text'; 60 | } 61 | } else if (program.eval) { 62 | fn = new Function("$", "obj", program.eval); 63 | args = {}; 64 | } else { 65 | console.error('Invalid usage. Run scraperjs --help for help'); 66 | return; 67 | } 68 | 69 | var scraper = sjs[ScraperType].create(); 70 | scraper.scrape.apply(scraper, [fn, function(result) { 71 | console.log(JSON.stringify(result)); 72 | }, args]); 73 | async.eachSeries(urls, function(url, done) { 74 | scraper 75 | .get(url) 76 | .catch(function(err) { 77 | done(err); 78 | }) 79 | .done(function() { 80 | done(); 81 | }); 82 | }, function(err) { 83 | if(err) { 84 | console.error(err.stack); 85 | } 86 | }); -------------------------------------------------------------------------------- /doc/examples/ErrorHandling.js: -------------------------------------------------------------------------------- 1 | /** 2 | * DISCLAIMER 3 | * This is a relatively simple example, to illustrate some of the 4 | * possible functionalities and how to achieve them. 5 | * There is no guarantee that this example will provide useful 6 | * results. 7 | * Use this example with and at your own responsibility. 8 | * 9 | * In this example we run through some urls and try to extract their 10 | * 30th link. It demonstrates how to deal with errors. 
11 | * 12 | * To run: 13 | * 'node ErrorHandling.js' 14 | */ 15 | 16 | var sjs = require('../../'); 17 | 18 | var log = console.log; 19 | var router = new sjs.Router(); 20 | 21 | function create30thLinkError() { 22 | var err = new Error("Page doesn't have 30th link"); 23 | err.code = '30THLINK'; 24 | return err; 25 | } 26 | 27 | router 28 | .on('*') 29 | .createStatic() 30 | .onStatusCode(function(code, utils) { 31 | // if it's not Ok pause and log. 32 | if (code != 200) { 33 | log("Page '%s' has status code %d", utils.url, code); 34 | utils.stop(); 35 | } 36 | }) 37 | .catch(function(err, utils) { 38 | // deal identify with errors and recover or panic 39 | // this has the same problems as js error handling, 40 | // it's messy and ugly 41 | switch (err.code) { 42 | case 'ENOTFOUND': 43 | log("Page '%s' not found", err.hostname); 44 | break; 45 | case '30THLINK': 46 | log("Page '%s' doesn't have a 30th link", utils.url); 47 | break; 48 | default: 49 | log('Unknown error found %s', err); 50 | } 51 | }) 52 | .scrape(function($) { 53 | var thirty = $('a')[30]; 54 | if (thirty) { 55 | return $(thirty).attr('href'); 56 | } else { 57 | throw create30thLinkError(); 58 | } 59 | }) 60 | .then(function(thirty, utils) { 61 | log("'%s' has '%s' as it's 30th link", utils.url, thirty); 62 | }); 63 | 64 | // Front page of google doesn't have a 30th link 65 | router.route('http://google.com'); 66 | // This page doesn't exist 67 | router.route('http://wouvoogle.com'); 68 | // Hacker new have a 30th link 69 | router.route('http://news.ycombinator.com'); -------------------------------------------------------------------------------- /doc/examples/HackerNews.js: -------------------------------------------------------------------------------- 1 | var sjs = require('../../src/Scraper'); 2 | /* 3 | Scrape the news in Hacker News. 
4 | */ 5 | sjs.StaticScraper 6 | .create('https://news.ycombinator.com') 7 | .scrape(function($) { 8 | return $('.title a').map(function() { 9 | return $(this).text(); 10 | }).get().filter(function(elm) { 11 | return elm != 'More'; 12 | }); 13 | }) 14 | .then(function(news) { 15 | news.forEach(function(elm) { 16 | console.log(elm); 17 | }); 18 | }); -------------------------------------------------------------------------------- /doc/examples/IMDBOpeningThisWeek.js: -------------------------------------------------------------------------------- 1 | /* global $ */ 2 | var sjs = require('../../'); 3 | /** 4 | * Displays the movies opening this week, from IMDB. 5 | * This example is inspired by user jasode at Hacker News. 6 | * {@link https://news.ycombinator.com/item?id=8193522} 7 | * Note that the list of movies opening this week is loaded 8 | * dynamically. A static scraper can't scrape this content, this way. 9 | */ 10 | sjs.DynamicScraper 11 | .create('https://www.imdb.com') 12 | .scrape(function($) { 13 | return $('.otw-title').map(function() { 14 | return $(this).text().trim(); 15 | }).get(); 16 | }) 17 | .then(function(movies) { 18 | movies.forEach(function(movie) { 19 | console.log(movie); 20 | }); 21 | }); -------------------------------------------------------------------------------- /doc/examples/LinkGetter.js: -------------------------------------------------------------------------------- 1 | var sjs = require('../../src/Scraper'), 2 | url = process.argv.slice(2)[0]; 3 | 4 | if(!url) { 5 | console.log('Usage: node LinkGetter.js '); 6 | return; 7 | } 8 | 9 | /* 10 | Get all the links in a page. 
11 | */ 12 | sjs.StaticScraper 13 | .create() 14 | .onStatusCode(function(code) { 15 | console.log(code); 16 | }) 17 | .scrape(function($) { 18 | return $('a').map(function() { 19 | return $(this).attr('href'); 20 | }).get(); 21 | }) 22 | .then(function(links) { 23 | links.forEach(function(link) { 24 | console.log(link); 25 | }); 26 | }) 27 | .get(url); -------------------------------------------------------------------------------- /doc/examples/WikimediaScraper.js: -------------------------------------------------------------------------------- 1 | /** 2 | * DISCLAIMER 3 | * This is a relatively simple example, to illustrate some of the 4 | * possible functionalities and how to achieve them. 5 | * There is no guarantee that this example will provide useful 6 | * results. 7 | * Use this example with and at your own responsibility. 8 | * 9 | * In this example we run through a list of links, if they have a 10 | * route defined they will be scraped. Their title, language and 11 | * first paragraph. 12 | * 13 | * To run: 14 | * 'node WikimediaScraper.js link1 [... 
linkN]' 15 | */ 16 | 17 | var sjs = require('../../src/Scraper'), 18 | async = require('../../node_modules/async'), 19 | parseUrl = require('url').parse, 20 | urls = process.argv.slice(2); 21 | 22 | if(!urls || !urls.length) { 23 | console.log("Usage: node WikimediaScraper.js url [...url]"); 24 | return; 25 | } 26 | 27 | var IMDB_SELECTOR = '[itemprop=description]', 28 | gatheredInformation = [], 29 | unknownRoutes = []; 30 | 31 | var router = new sjs.Router({ 32 | firstMatch: true 33 | }); 34 | 35 | router 36 | .on('https?://:lang.wikipedia.org/wiki/:article') 37 | .get() 38 | .createStatic() 39 | // if the status code is different from OK (200) we stop 40 | .onStatusCode(function(statusCode, utils) { 41 | if(statusCode!==200) { 42 | utils.stop(); 43 | } 44 | }) 45 | .scrape(function($) { 46 | return { 47 | title: $('h1').first().text(), 48 | text: $('p').first().text() 49 | }; 50 | }) 51 | .then(function(last, utils) { 52 | last.lang = utils.params.lang; 53 | return last; 54 | }); 55 | 56 | // the same functionality than the above 57 | var scraperForWiki = sjs.StaticScraper 58 | .create() 59 | .onStatusCode(function(statusCode, utils) { 60 | if(statusCode!==200) { 61 | utils.stop(); 62 | } 63 | }) 64 | .scrape(function($) { 65 | return { 66 | title: $('h1').first().text(), 67 | text: $('p').first().text() 68 | }; 69 | }) 70 | .then(function(last, utils) { 71 | if(utils.params) { 72 | last.lang = utils.params.lang; 73 | } else { 74 | last.lang = "?"; 75 | } 76 | return last; 77 | }); 78 | 79 | router 80 | .on(function(url) { 81 | return parseUrl(url).host === 'en.wikiquote.com'; 82 | }) 83 | .use(scraperForWiki); 84 | 85 | router 86 | .on('https?://:lang.wikinews.org/wiki/:place') 87 | .use(scraperForWiki); 88 | 89 | router.otherwise(function(url) { 90 | unknownRoutes.push(url); 91 | }); 92 | 93 | async.eachLimit(urls, 2, function(url, done) { 94 | router.route(url, function(found, returned) { 95 | if(found && returned) { 96 | gatheredInformation.push(returned); 
97 | } 98 | done(); 99 | }); 100 | }, function(err) { 101 | if(err) { 102 | return; 103 | } 104 | 105 | gatheredInformation.forEach(function(item) { 106 | console.log(item.title.toUpperCase()+" ("+item.lang+")"); 107 | console.log("\t"+item.text); 108 | }); 109 | }) -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scraperjs", 3 | "version": "1.2.0", 4 | "description": "A complete and versatile web scraper.", 5 | "main": "./src/Scraper.js", 6 | "keywords": [ 7 | "scraper", 8 | "scraping", 9 | "web" 10 | ], 11 | "bin": "./bin/scraperjs", 12 | "scripts": { 13 | "test": "grunt test" 14 | }, 15 | "repository": { 16 | "type": "git", 17 | "url": "git://github.com/ruipgil/scraperjs.git" 18 | }, 19 | "bugs": { 20 | "url": "https://github.com/ruipgil/scraperjs/issues" 21 | }, 22 | "gitHead": "c58be022438e49564597bbb3ad7c036d610744f8", 23 | "homepage": "https://github.com/ruipgil/scraperjs", 24 | "author": "Rui Gil", 25 | "license": "MIT", 26 | "dependencies": { 27 | "async": "^1.5.0", 28 | "cheerio": "^0.19.0", 29 | "jquery": "^2.1.4", 30 | "phantom": "^0.8.4", 31 | "request": "^2.67.0", 32 | "commander": "^2.9.0" 33 | }, 34 | "readmeFilename": "README.md", 35 | "directories": { 36 | "test": "test" 37 | }, 38 | "devDependencies": { 39 | "coveralls": "^2.11.1", 40 | "express": "^4.8.3", 41 | "grunt": "^0.4.5", 42 | "grunt-cli": "~0.1.9", 43 | "grunt-contrib-clean": "^0.6.0", 44 | "grunt-contrib-jshint": "^0.10.0", 45 | "grunt-contrib-watch": "^0.6.1", 46 | "grunt-exec": "^0.4.6", 47 | "grunt-mocha-test": "^0.11.0", 48 | "istanbul": "^0.3.0", 49 | "mocha": "^1.21.4" 50 | }, 51 | "engines": { 52 | "node": ">=0.10" 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/AbstractScraper.js: -------------------------------------------------------------------------------- 1 | var request = 
require('request'), 2 | ScraperPromise = require('./ScraperPromise'); 3 | 4 | /** 5 | * An abstract scraper, this class should not be used directly as a 6 | * scraper, instead a concrete scraper should inherit or use this 7 | * class as a composite this class. 8 | * 9 | * @constructor 10 | */ 11 | var AbstractScraper = function() { 12 | /** 13 | * Status code of the last requested page. 14 | * 15 | * @type {!number} 16 | * @protected 17 | */ 18 | this.statusCode = null; 19 | /** 20 | * Response of the last requested page. 21 | * 22 | * @type {!Object} 23 | * @protected 24 | */ 25 | this.response = null; 26 | /** 27 | * Body of the last webpage, as a string. 28 | * 29 | * @type {!string} 30 | * @protected 31 | */ 32 | this.body = null; 33 | /** 34 | * URL. 35 | * 36 | * @type {!string} 37 | * @protected 38 | */ 39 | this.url = ''; 40 | }; 41 | AbstractScraper.prototype = { 42 | constructor: AbstractScraper, 43 | /** 44 | * Executes a simple HTTP GET request to the given url. 45 | * 46 | * @param {!string} url URL to request. 47 | * @param {!function(Error=)} callback Function to call when the 48 | * request is done. If the request was successful then it's 49 | * called with no arguments or null argument. Otherwise, if 50 | * there was an error the it's called with one argument not 51 | * null, that should be an error instance. 52 | * @return {!AbstractScraper} This scraper. 53 | * @public 54 | */ 55 | get: function(url, callback) { 56 | var that = this; 57 | request.get(url, function processGet(error, response, body) { 58 | if (error) { 59 | callback(error); 60 | } else { 61 | that.response = response; 62 | that.statusCode = response.statusCode; 63 | that.body = body; 64 | that.url = response.request.href; 65 | that.loadBody(function(err) { 66 | callback(err); 67 | }); 68 | } 69 | }); 70 | return this; 71 | }, 72 | /** 73 | * Executes an HTTP request to an url. 
This method allows for the 74 | * powerful use of the request package {@link https://github.com/mikeal/request}, 75 | * since it's basically a wrapper around the method request. 76 | * For more information about how it's used refer to {@link https://github.com/mikeal/request#requestoptions-callback}. 77 | * 78 | * @param {!(Object|string)} options Options of the request. 79 | * @param {!function(Error=)} callback Function to call when the 80 | * request is done. If the request was successful then it's 81 | * called with no arguments or null argument. Otherwise, if 82 | * there was an error the it's called with one argument not 83 | * null, that should be an error instance. 84 | * @return {!AbstractScraper} This scraper. 85 | * @public 86 | */ 87 | request: function(options, callback) { 88 | var that = this; 89 | request(options, function processRequest(error, response, body) { 90 | if (error) { 91 | callback(error); 92 | } else { 93 | that.response = response; 94 | that.statusCode = response.statusCode; 95 | that.body = body; 96 | that.url = response.request.href; 97 | that.loadBody(function(err) { 98 | callback(err); 99 | }); 100 | } 101 | }); 102 | return this; 103 | }, 104 | /** 105 | * Gets the status code of the last request. 106 | * 107 | * @return {?number} The status code, if a there was a successful 108 | * request, null otherwise. 109 | * @public 110 | */ 111 | getStatusCode: function() { 112 | return this.statusCode; 113 | }, 114 | /** 115 | * Gets the response of the last request. 116 | * 117 | * @return {?number} The status code, if a there was a successful 118 | * request, null otherwise. 119 | * @public 120 | */ 121 | getResponse: function() { 122 | return this.response; 123 | }, 124 | /** 125 | * Gets the body of the last request. 126 | * 127 | * @return {?number} The status code, if a there was a successful 128 | * request, null otherwise. 
129 | * @public 130 | */ 131 | getBody: function() { 132 | return this.body; 133 | }, 134 | /* jshint unused:false */ 135 | /** 136 | * Loads the string, to a representation that can be used in the 137 | * scraping process. 138 | * 139 | * @param {!function()} done Callback function, for when the body 140 | * is done loading. 141 | * @return {!AbstractScraper} This scraper. 142 | * @protected 143 | */ 144 | loadBody: function(done) { 145 | done(); 146 | return this; 147 | }, 148 | /** 149 | * Scrapes the webpage. According to a function, and a callback. 150 | * 151 | * @param {!function(...?)} scraperFn Function to scrape the 152 | * content. 153 | * @param {!function(?)} callbackFn Function that receives the 154 | * result of the scraping. 155 | * @param {!Array} args Aditional arguments to pass to the 156 | * scraping function. 157 | * @return {!AbstractScraper} This scraper. 158 | * @public 159 | */ 160 | scrape: function(scraperFn, callbackFn, args) {}, 161 | /** 162 | * Closes the scraper. 163 | * 164 | * @return {!AbstractScraper} This scraper. 165 | * @public 166 | */ 167 | close: function() {}, 168 | /** 169 | * Clones the scraper. 170 | * 171 | * @return {!AbstractScraper} Empty clone. 172 | * @public 173 | */ 174 | clone: function() {} 175 | }; 176 | /* jshint unused:true */ 177 | 178 | /** 179 | * Creates a scraper, based on a scraper type, and creates it's 180 | * promise. 181 | * 182 | * @param {!AbstractScraper} ScraperType Some concrete implementation 183 | * of an abstract scraper. 184 | * @param {!string=} url Url to make an HTTP GET request. 185 | * @return {!ScraperPromise} A scraper promise. 
186 | * @public 187 | * @static 188 | */ 189 | AbstractScraper.create = function(ScraperType, url, options) { 190 | var promise = new ScraperPromise(new ScraperType(options)); 191 | if (url) { 192 | promise.get(url); 193 | } 194 | return promise; 195 | }; 196 | 197 | module.exports = AbstractScraper; -------------------------------------------------------------------------------- /src/DynamicScraper.js: -------------------------------------------------------------------------------- 1 | var phantomOrig = require('phantom'), 2 | PhantomPoll = require('./PhantomPoll.js'), 3 | phantom = phantomOrig, 4 | AbstractScraper = require('./AbstractScraper'), 5 | ScraperError = require('./ScraperError'), 6 | PhantomWrapper = require('./PhantomWrapper'); 7 | 8 | /** 9 | * A dynamic scraper. This is a very versatile and powerful. This 10 | * solution is a little heavier and slower than the {@see StaticScraper}. 11 | * This version uses phantomjs {@link http://phantomjs.org/}, and {@link https://github.com/sgentle/phantomjs-node}. 12 | * 13 | * @extends {AbstractScraper} 14 | */ 15 | var DynamicScraper = function(options) { 16 | AbstractScraper.call(this); 17 | /** 18 | * Phantom instance. 19 | * 20 | * @type {?} 21 | * @private 22 | */ 23 | this.ph = null; 24 | /** 25 | * Phantom's page. 
26 | * 27 | * @type {?} 28 | * @private 29 | */ 30 | this.page = null; 31 | /** 32 | * Phantom's options 33 | * 34 | * @type {?} 35 | * @private 36 | */ 37 | this.options = { 38 | onStdout: function() {}, 39 | onStderr: function() {} 40 | }; 41 | for (var key in options) { this.options[key] = options[key]; } 42 | }; 43 | DynamicScraper.prototype = Object.create(AbstractScraper.prototype); 44 | /** 45 | * @override 46 | * @inheritDoc 47 | */ 48 | DynamicScraper.prototype.loadBody = function(done) { 49 | var that = this; 50 | phantom.create('--load-images=no', that.options, function(ph) { 51 | that.ph = ph; 52 | ph.createPage(function(page) { 53 | that.page = page; 54 | page.setContent(that.body, that.url, function() { 55 | that.inject(DynamicScraper.JQUERY_FILE, function(err) { 56 | done(err ? new ScraperError('Couldn\'t inject jQuery into the page.') : undefined); 57 | }); 58 | }); 59 | }); 60 | }); 61 | return this; 62 | }; 63 | /** 64 | * The scraper function has it's own scope (can't access outside its 65 | * own scope), and only JSON serializable information can be return 66 | * by the function. For more information {@link https://github.com/sgentle/phantomjs-node}. 67 | * 68 | * @param {!function(...?)} scraperFn Function to scrape the content. 69 | * It receives the args as parameters, if passed. 70 | * @param {!function(?)} callbackFn Function that receives the 71 | * result of the scraping. 72 | * @param {!Array=} args Additional arguments to pass to the scraping 73 | * function. They must be JSON serializable. 74 | * @param {!string=} stackTrace Stack trace to produce better error 75 | * messages. 76 | * @return {!AbstractScraper} This scraper. 
77 | * @override 78 | * @public 79 | */ 80 | DynamicScraper.prototype.scrape = function(scraperFn, callbackFn, args, stackTrace) { 81 | args = args || []; 82 | 83 | args.unshift(scraperFn.toString()); 84 | args.unshift(function(result) { 85 | if(result.error) { 86 | callbackFn(DynamicScraper.generateMockErrorMessage(result.error, stackTrace), null); 87 | } else { 88 | callbackFn(null, result.result); 89 | } 90 | }); 91 | args.unshift(PhantomWrapper); 92 | 93 | this.page.evaluate.apply(this.page, args); 94 | return this; 95 | }; 96 | /** 97 | * Injects a javascript file into the page. 98 | * 99 | * @param {!string} file File to inject. 100 | * @param {!function(!ScraperError=)} callback Function to be called 101 | * when the file has injected. If the injection fails, then the 102 | * first argument is not is a {@see ScraperError}. 103 | * @public 104 | */ 105 | DynamicScraper.prototype.inject = function(file, callback) { 106 | if (this.page) { 107 | this.page.injectJs(file, function(success) { 108 | if (success) { 109 | callback(); 110 | } else { 111 | callback(new ScraperError('Couldn\'t inject code, at "' + file + '".')); 112 | } 113 | }); 114 | } else { 115 | throw new ScraperError('Couldn\'t inject code, at "' + file + '". The page has not been initialized yet.'); 116 | } 117 | }; 118 | /** 119 | * @override 120 | * @inheritDoc 121 | */ 122 | DynamicScraper.prototype.close = function() { 123 | if (this.page) { 124 | this.page.close(); 125 | } 126 | if (this.ph) { 127 | this.ph.exit(); 128 | } 129 | return this; 130 | }; 131 | /** 132 | * @override 133 | * @inheritDoc 134 | */ 135 | DynamicScraper.prototype.clone = function() { 136 | return new DynamicScraper(); 137 | }; 138 | /** 139 | * Creates a dynamic scraper, wrapped around a scraper promise. 140 | * 141 | * @param {!string=} url If provided makes an HTTP GET request to the 142 | * given URL. 143 | * @return {!ScraperPromise} Scraper promise, with a dynamic scraper. 
144 | * @public 145 | * @static 146 | */ 147 | DynamicScraper.create = function(url, options) { 148 | return AbstractScraper.create(DynamicScraper, url, options); 149 | }; 150 | /** 151 | * Starts the factory. A factory should only be open once, and after 152 | * it's open it must be closed with {@see DynamicScraper#closeFactory}. 153 | * A factory makes so that there's only one instance of phantom at a 154 | * time, which makes the creation/usage of dynamic scrapers much 155 | * more efficient. 156 | * 157 | * @return {!DynamicScraper} 158 | * @public 159 | * @static 160 | */ 161 | DynamicScraper.startFactory = function() { 162 | phantom = new PhantomPoll(); 163 | return DynamicScraper; 164 | }; 165 | /** 166 | * Closes the factory. For more information {@see DynamicScraper#closeFactory} 167 | * 168 | * @return {!DynamicScraper} 169 | * @public 170 | * @static 171 | */ 172 | DynamicScraper.closeFactory = function() { 173 | if (phantom instanceof PhantomPoll) { 174 | phantom.close(); 175 | } 176 | phantom = phantomOrig; 177 | return DynamicScraper; 178 | }; 179 | /** 180 | * Generates a mock error message that is similar to one produced 181 | * by a function runned in node, and not phantomjs. 182 | * @param {!Object} err Error object sent by Phantom. 183 | * @param {!string} stackTrace Stack trace of where the promise was defined. 184 | * @return {!Error} Error message. 185 | * @private 186 | * @static 187 | */ 188 | DynamicScraper.generateMockErrorMessage = function(err, stackTrace) { 189 | var rg = /^\s{4}at ([^\s]+) \(([^\s]*)\:(\d+):(\d+)\)$/mg; 190 | rg.exec(stackTrace); 191 | var emsg = rg.exec(stackTrace); 192 | var sob = emsg[1]; 193 | var sfile = emsg[2]; 194 | var sline = emsg[3]; 195 | var sc = emsg[4]; 196 | 197 | var line = Number(sline) + Math.max(err.line-1, 0); 198 | 199 | var mock = new Error(err.message); 200 | // Prevents the use of a property named 'line'! 
201 | delete err.line; 202 | for(var x in err) { 203 | mock[x] = err[x]; 204 | } 205 | mock.stack = mock.stack.replace(/\t/g, ' '); 206 | 207 | var ats = mock.stack.split('\n'); 208 | ats.unshift(' at ' + sob + ' (' + sfile + ':' + line + ':' + sc + ')'); 209 | ats.unshift('Error' + (err.message?': '+err.message:'')); 210 | mock.stack = ats.join('\n'); 211 | 212 | return mock; 213 | }; 214 | /** 215 | * Location of the jquery file. 216 | * 217 | * @type {!string} 218 | * @private 219 | * @static 220 | */ 221 | DynamicScraper.JQUERY_FILE = require.resolve('jquery'); 222 | 223 | module.exports = DynamicScraper; 224 | -------------------------------------------------------------------------------- /src/PhantomPoll.js: -------------------------------------------------------------------------------- 1 | var phantom = require('phantom'); 2 | 3 | /** 4 | * This maintains only one PhantomJS instance. It works like a proxy 5 | * between the phantom package, and should expose the methods same 6 | * methods. An additional call to close the phantomJS instance 7 | * properly is needed. 8 | * 9 | * @constructor 10 | */ 11 | var PhantomPoll = function() { 12 | /** 13 | * The real PhantomJS instance. 14 | * 15 | * @type {?} 16 | * @private 17 | */ 18 | this.instance = null; 19 | /** 20 | * The PhantomJS instance is being created. 21 | * 22 | * @type {!boolean} 23 | * @private 24 | */ 25 | this.creating = false; 26 | /** 27 | * PhantomJS flags. 28 | * 29 | * @type {!string} 30 | * @private 31 | */ 32 | this.flags = ''; 33 | /** 34 | * PhantomJS options. 35 | * 36 | * @type {!Object} 37 | * @private 38 | */ 39 | this.options = { 40 | onStdout: function() {}, 41 | onStderr: function() {} 42 | }; 43 | /** 44 | * List of functions waiting to be called after the PhantomJS 45 | * instance is created. 
46 | * 47 | * @type {!Array.} 48 | * @private 49 | */ 50 | this.waiting = []; 51 | this._createInstance(); 52 | }; 53 | PhantomPoll.prototype = { 54 | constructor: PhantomPoll, 55 | /** 56 | * Creates a PhantomJS page, to be called with a callback, which 57 | * will receive the page. 58 | * 59 | * @param {!function(?)} callback Function to be called after the 60 | * page is created, it receives the page object. 61 | * @public 62 | */ 63 | createPage: function(callback) { 64 | if (this.instance) { 65 | this.instance.createPage(function(page) { 66 | callback(page); 67 | }); 68 | } else { 69 | var that = this; 70 | this._createInstance(function() { 71 | that.createPage(callback); 72 | }); 73 | } 74 | }, 75 | /** 76 | * Creates a PhantomJS instance. 77 | * 78 | * @param {!string} flags Creation flags. 79 | * @param {!Object} options Creation options. 80 | * @param {!function(?)} callback Function to be called after 81 | * the phantom instance is created. 82 | * 83 | * @public 84 | */ 85 | create: function(flags, options, callback) { 86 | this.flags = flags; 87 | this.options = options; 88 | callback(this); 89 | }, 90 | /** 91 | * Creates PhantomJS instance if needed be, and when it's done 92 | * triggers all the callbacks. 93 | * 94 | * @param {!function(?)} callback Function to be called when the 95 | * instance is created, if a phantom instance is waiting to be 96 | * created the callback will be added to a waiting list. 
97 | * @private 98 | */ 99 | _createInstance: function(callback) { 100 | if (this.creating && callback) { 101 | this.waiting.push(callback); 102 | } else { 103 | var that = this; 104 | this.creating = true; 105 | phantom.create(this.flags, this.options, function(ph) { 106 | that.instance = ph; 107 | that.creating = false; 108 | that.waiting.forEach(function(callback) { 109 | callback(ph); 110 | }); 111 | that.waiting = []; 112 | }); 113 | } 114 | }, 115 | /** 116 | * This is a function just to maintain the same interface 117 | * with the phantom module. If the PhantomJS instance needs be 118 | * destroyed the method close must be used. 119 | * 120 | * @public 121 | */ 122 | exit: function() {}, 123 | /** 124 | * Exits the phantom instance. 125 | * 126 | * @public 127 | */ 128 | close: function() { 129 | if (this.instance) { 130 | this.instance.exit(); 131 | } 132 | } 133 | }; 134 | 135 | module.exports = PhantomPoll; -------------------------------------------------------------------------------- /src/PhantomWrapper.js: -------------------------------------------------------------------------------- 1 | module.exports = function wrapper(fnStr) { 2 | var args = Array.prototype.slice.call(arguments); 3 | var rg = /^function\s+([a-zA-Z_$][a-zA-Z_$0-9]*)?\((.*?)\) {/g; 4 | var a = rg.exec(fnStr); 5 | var fnArgs = a[2].match(/([^,\s]+)/g) || []; 6 | var fnBody = fnStr.slice(fnStr.indexOf("{")+1, fnStr.lastIndexOf("}")); 7 | fnArgs.push(fnBody); 8 | var scraperFn = Function.apply(this, fnArgs); 9 | 10 | try { 11 | var gs = args.slice(1); 12 | gs.unshift($); 13 | var result = scraperFn.apply(this, gs); 14 | return { 15 | error: null, 16 | result: result 17 | }; 18 | } catch(e) { 19 | var errObj = { 20 | message: e.message 21 | }; 22 | for(var x in e) { 23 | errObj[x] = e[x]; 24 | } 25 | return { 26 | error: errObj, 27 | result: null 28 | }; 29 | } 30 | }; 31 | -------------------------------------------------------------------------------- /src/Router.js: 
-------------------------------------------------------------------------------- 1 | var async = require('async'), 2 | StaticScraper = require('./StaticScraper'), 3 | DynamicScraper = require('./DynamicScraper'), 4 | ScraperError = require('./ScraperError'); 5 | 6 | /** 7 | * Transforms a string into a regular expression. 8 | * This function is from the project Routes.js, under the MIT licence, 9 | * {@link https://github.com/aaronblohowiak/routes.js} it's present 10 | * in the file {@link https://github.com/aaronblohowiak/routes.js/blob/bdad0a1ae10d11981bb286550bb3b8a1a71909bd/dist/routes.js#L49}. 11 | * 12 | * @param {!string} path String path. 13 | * @param {!Array.} keys Empty array to be filled with the 14 | * keys ids. 15 | * @return {!RegExp} Regular expression. 16 | */ 17 | function pathToRegExp(path, keys) { 18 | path = path 19 | .concat('/?') 20 | .replace(/\/\(/g, '(?:/') 21 | .replace(/(\/)?(\.)?:(\w+)(?:(\(.*?\)))?(\?)?|\*/g, function(_, slash, format, key, capture, optional) { 22 | if (_ === '*') { 23 | return _; 24 | } 25 | 26 | keys.push(key); 27 | slash = slash || ''; 28 | return '' + (optional ? '' : slash) + '(?:' + (optional ? slash : '') + (format || '') + (capture || '([^/]+?)') + ')' + (optional || ''); 29 | }) 30 | .replace(/([\/.])/g, '\\$1') 31 | .replace(/\*/g, '(.*)'); 32 | return new RegExp('^' + path + '$', 'i'); 33 | } 34 | 35 | /** 36 | * Routes an url thought a valid, predefined, path. 37 | * 38 | * @param {!Object=} options Setup options. 39 | * @param {!boolean=} options.firstMatch If true the router will stop 40 | * at the first path matched. The default is false, and tries to 41 | * match every path. 42 | * @constructor 43 | */ 44 | var Router = function(options) { 45 | options = options || {}; 46 | /** 47 | * Stops routing at first successful match. 48 | * 49 | * @type {!boolean} 50 | * @private 51 | */ 52 | this.firstMatchStop = options.firstMatch || false; 53 | /** 54 | * Chain of promises. 
55 | * 56 | * @type {!Array.} 57 | * @private 58 | */ 59 | this.promises = []; 60 | /** 61 | * Otherwise promise. 62 | * 63 | * @type {!function(!string=)} 64 | * @private 65 | */ 66 | this.otherwiseFn = function() {}; 67 | }; 68 | Router.prototype = { 69 | constructor: Router, 70 | /** 71 | * Promise to url match. It's promise will fire only if the path 72 | * matches with and url being routed. 73 | * 74 | * @param {!(string|RegExp|function(string):?)} path The 75 | * path or regular expression to match an url. 76 | * Alternatively a function that receives the url to be matched 77 | * can be passed. If the result is false, or any 78 | * !!result===false), the path is considered valid and the 79 | * scraping should be done. If ,in case of a valid path, an Object is returned, it will be associated with the params of this 80 | * route/path. 81 | * For more information on the path matching refer to {@link https://github.com/aaronblohowiak/routes.js/blob/76bc517037a0321507c4d84a0cdaca6db31ebaa4/README.md#path-formats} 82 | * @return {!Router} This router. 83 | * @public 84 | */ 85 | on: function(path) { 86 | var callback; 87 | if (typeof path === 'function') { 88 | callback = path; 89 | } 90 | 91 | this.promises.push({ 92 | callback: callback ? function(url) { 93 | return callback(url); 94 | } : Router.pathMatcher(path), 95 | scraper: null, 96 | rqMethod: null 97 | }); 98 | return this.get(); 99 | }, 100 | /** 101 | * Sets the request method to be a simple HTTP GET. 102 | * {@see AbstractScraper.get} 103 | * 104 | * @return {!Router} This router. 105 | * @public 106 | */ 107 | get: function() { 108 | var length = this.promises.length, 109 | last = this.promises[length - 1]; 110 | if (length && last) { 111 | last.rqMethod = function(scraper, url) { 112 | scraper.get(url); 113 | }; 114 | return this; 115 | } else { 116 | throw new ScraperError(''); 117 | } 118 | }, 119 | /** 120 | * Sets the request method to be according to the options. 
121 | * {@see AbstractScraper.request} 122 | * 123 | * @param {!Object} options Request options. 124 | * @return {!Router} This router. 125 | * @public 126 | */ 127 | request: function(options) { 128 | var length = this.promises.length, 129 | last = this.promises[length - 1]; 130 | if (length && last) { 131 | last.rqMethod = function(scraper, url) { 132 | options.uri = url; 133 | scraper.request(options); 134 | }; 135 | return this; 136 | } else { 137 | throw new ScraperError(''); 138 | } 139 | }, 140 | /** 141 | * A promise to be triggered when none of the paths where matched. 142 | * This is a one time promise, which means that the last promise 143 | * is gonna be the one to be executed. 144 | * 145 | * @param {!function(!string=)} callback Function with the url as 146 | * a parameter. 147 | * @return {!Router} This router. 148 | * @public 149 | */ 150 | otherwise: function(callback) { 151 | this.otherwiseFn = callback; 152 | return this; 153 | }, 154 | /** 155 | * Creates a static scraper, and associates it with the current 156 | * router promise chain. Note that this method returns a 157 | * {@see ScraperPromise} of a {@see StaticScraper}. 158 | * 159 | * @return {!ScraperPromise} A promise for the scraper. 160 | * @public 161 | */ 162 | createStatic: function() { 163 | var length = this.promises.length, 164 | last = this.promises[length - 1]; 165 | if (length && last && !last.scraper) { 166 | var ss = StaticScraper.create(); 167 | last.scraper = ss; 168 | return ss; 169 | } else { 170 | throw new ScraperError(''); 171 | } 172 | }, 173 | /** 174 | * Associates the current route with the a scraper (promise) 175 | * instance. Keep in mind that the done promise will not be 176 | * available. 177 | * 178 | * @param {!AbstractScraper} scraper A scraper instance to use. 179 | * @return {!Router} This router. 
180 | * @public 181 | */ 182 | use: function(scraper) { 183 | var length = this.promises.length, 184 | last = this.promises[length - 1]; 185 | if (length && last && !last.scraper) { 186 | last.scraper = scraper; 187 | return this; 188 | } else { 189 | throw new ScraperError(''); 190 | } 191 | }, 192 | /** 193 | * Creates a dynamic scraper, and associates it with the current 194 | * router promise chain. Note that this method returns a 195 | * {@see ScraperPromise} of a {@see DynamicScraper}. 196 | * 197 | * @return {!ScraperPromise} A promise for the scraper. 198 | * @public 199 | */ 200 | createDynamic: function() { 201 | var length = this.promises.length, 202 | last = this.promises[length - 1]; 203 | if (length && last && !last.scraper) { 204 | var ss = DynamicScraper.create(); 205 | last.scraper = ss; 206 | return ss; 207 | } else { 208 | throw new ScraperError(''); 209 | } 210 | }, 211 | /** 212 | * Routes a url through every path that matches it. 213 | * 214 | * @param {!string} url The url to route. 215 | * @param {!function(boolean)} callback Function to call when the 216 | * routing is complete. If any of the paths was found the 217 | * parameter is true, false otherwise. 218 | * @return {!Router} This router. 219 | * @public 220 | */ 221 | route: function(url, callback) { 222 | var that = this, 223 | atLeastOne = false, 224 | stopFlag = {}, 225 | lastReturn; 226 | callback = callback || function() {}; 227 | async.eachSeries(this.promises, function(promiseObj, done) { 228 | 229 | var matcher = promiseObj.callback, 230 | scraper, 231 | reqMethod = promiseObj.rqMethod; 232 | var result = matcher(url); 233 | if (!!result) { 234 | scraper = promiseObj.scraper.clone(); 235 | atLeastOne = true; 236 | scraper._setChainParameter(result); 237 | scraper.done(function(lr, utils) { 238 | lastReturn = lr; 239 | done(that.firstMatchStop ? 
stopFlag : undefined); 240 | }); 241 | reqMethod(scraper, url); 242 | } else { 243 | done(); 244 | } 245 | 246 | }, function() { 247 | if (!atLeastOne) { 248 | that.otherwiseFn(url); 249 | } 250 | callback(atLeastOne, lastReturn); 251 | }); 252 | return this; 253 | } 254 | }; 255 | /** 256 | * Creates a function to match a path against a string. 257 | * 258 | * @param {!(string|RegExp)} pathOrRE Pattern to match, if it's a 259 | * string it will be transformed into a regular expression. 260 | * @return {!function(string):(Object|booelan)} A matching function, 261 | * that given a string will check if it matches the path. If the 262 | * path has parameters it will return an object with the parameters 263 | * as keys and the values as the values of the parameters. An empty 264 | * object if there were no valid parameters or false if the path 265 | * doesn't match with the string. 266 | * @public 267 | * @static 268 | */ 269 | Router.pathMatcher = function(pathOrRE) { 270 | var pattern, 271 | keys = ['url']; 272 | if (pathOrRE instanceof RegExp) { 273 | pattern = pathOrRE; 274 | } else if (typeof pathOrRE === 'string') { 275 | pattern = pathToRegExp(pathOrRE, keys); 276 | } else { 277 | throw new ScraperError('A path must be a string or a regular expression.'); 278 | } 279 | 280 | return function patternMatchingFunction(url) { 281 | var match = pattern.exec(url); 282 | if (!match) { 283 | return false; 284 | } else { 285 | return keys.reduce(function(obj, value, index) { 286 | obj[value] = match[index]; 287 | return obj; 288 | }, {}); 289 | } 290 | }; 291 | }; 292 | 293 | module.exports = Router; 294 | -------------------------------------------------------------------------------- /src/Scraper.js: -------------------------------------------------------------------------------- 1 | var StaticScraper = require('./StaticScraper.js'), 2 | DynamicScraper = require('./DynamicScraper.js'), 3 | ScraperPromise = require('./ScraperPromise.js'), 4 | Router = 
require('./Router'); 5 | 6 | module.exports = { 7 | StaticScraper: StaticScraper, 8 | DynamicScraper: DynamicScraper, 9 | ScraperPromise: ScraperPromise, 10 | Router: Router 11 | }; -------------------------------------------------------------------------------- /src/ScraperError.js: -------------------------------------------------------------------------------- 1 | /** 2 | * A scraper error, to refer error occurred in the scope of this 3 | * package. For more information about the error use it's message 4 | * property. 5 | * 6 | * @param {!string} message Error message. 7 | * @extends {Error} 8 | */ 9 | var ScraperError = function(message) { 10 | /** 11 | * Error message. 12 | * 13 | * @type {!string} 14 | * @public 15 | */ 16 | this.message = message; 17 | /** 18 | * This type. 19 | * 20 | * @type {!string} 21 | * @public 22 | */ 23 | this.name = 'ScraperError'; 24 | /** 25 | * Stack message. 26 | * 27 | * @type {!string} 28 | * @public 29 | */ 30 | this.stack = (new Error()).stack; 31 | }; 32 | ScraperError.prototype = new Error(); 33 | ScraperError.prototype.constructor = ScraperError; 34 | 35 | module.exports = ScraperError; -------------------------------------------------------------------------------- /src/ScraperPromise.js: -------------------------------------------------------------------------------- 1 | var async = require('async'); 2 | 3 | /** 4 | * @constructor 5 | */ 6 | var ScraperPromise = function(scraper) { 7 | /** 8 | * Scraper to use.. 9 | * 10 | * @type {!Scraper} 11 | * @private 12 | */ 13 | this.scraper = scraper; 14 | /** 15 | * Promise chunks. 16 | * 17 | * @type {!Array.} 18 | * @private 19 | */ 20 | this.promises = []; 21 | /** 22 | * Function to call when all the promises are fulfilled. 23 | * 24 | * @type {!function(?, ?)} 25 | * @private 26 | */ 27 | this.doneCallback = function(last, utils) { 28 | return last; 29 | }; 30 | /** 31 | * Function to call when there's an error. 
32 | * 33 | * @type {!function(?)} 34 | * @private 35 | */ 36 | this.errorCallback = function(err) { 37 | throw err; 38 | }; 39 | /** 40 | * A parameter object to be passed to the chain, at the _fire 41 | * method. This should be set immediately before the call, and 42 | * reset to null right after the call, or after it's been stored 43 | * elsewhere. 44 | * 45 | * @type {?} 46 | * @private 47 | */ 48 | this.chainParameter = null; 49 | }; 50 | ScraperPromise.prototype = { 51 | constructor: ScraperPromise, 52 | /** 53 | * Sets a promise for a status code, of a response of a request. 54 | * 55 | * @param {!(number|function(number))} code Status code to 56 | * dispatch the message. Or a callback function, in this case 57 | * the function's first parameter is the status code, as a 58 | * number. 59 | * @param {!function()} callback Callback function for the case 60 | * where the status code is provided. 61 | * @return {!ScraperPromise} This object, so that new promises can 62 | * be made. 63 | * @public 64 | */ 65 | onStatusCode: function(code, callback) { 66 | if (typeof code == 'function') { 67 | callback = code; 68 | this.promises.push(function onGenericStatusCode(done, utils) { 69 | done(null, callback(this.scraper.getStatusCode(), utils)); 70 | }); 71 | } else { 72 | this.promises.push(function onStatusCode(done, utils) { 73 | if (code === this.scraper.getStatusCode()) { 74 | done(null, callback(utils)); 75 | } else { 76 | done(null, utils.lastReturn); 77 | } 78 | 79 | }); 80 | } 81 | return this; 82 | }, 83 | /** 84 | * Sets a promise to scrape the retrieved webpage. 85 | * 86 | * @param {!function(?, ?)} scrapeFn Function to scrape the 87 | * webpage. The parameters depend on what kind of scraper. 88 | * @param {!function(?)=} callback Callback function with the 89 | * result of the scraping function. If none is provided, the 90 | * result can be accessed in the next promise with 91 | * utils.lastReturn. 
92 | * @param {...?} var_args Optional arguments to pass as 93 | * parameters to the scraping function. 94 | * @return {!ScraperPromise} This object, so that new promises can 95 | * be made. 96 | * @public 97 | */ 98 | scrape: function(scrapeFn, callback) { 99 | var stackTrace = new Error().stack; 100 | 101 | var extraArguments = Array.prototype.slice.call(arguments, 2); 102 | callback = callback || function(result) { 103 | return result; 104 | }; 105 | this.promises.push(function scrape(done, utils) { 106 | this.scraper.scrape(scrapeFn, function(err, result) { 107 | if (err) { 108 | done(err, undefined); 109 | } else { 110 | done(null, callback(result, utils)); 111 | } 112 | }, extraArguments, stackTrace); 113 | }); 114 | return this; 115 | }, 116 | /** 117 | * Sets a promise to delay the execution of the promises. 118 | * 119 | * @param {!number} time Time in milliseconds to delay the 120 | * execution. 121 | * @param {!function()=} callback Function to call after the 122 | * delay. 123 | * @return {!ScraperPromise} This object, so that new promises can 124 | * be made. 125 | * @public 126 | */ 127 | delay: function(time, callback) { 128 | callback = callback || function() {}; 129 | this.promises.push(function delay(done, utils) { 130 | setTimeout(function() { 131 | done(null, callback(utils)); 132 | }, time); 133 | }); 134 | return this; 135 | }, 136 | /** 137 | * Sets a promise to execute a promise after a time period. This 138 | * does not cause the promise chain to block. 139 | * 140 | * @param {!number} time Time in milliseconds to the execution of 141 | * the callback. 142 | * @param {!function()} callback Function to call after the 143 | * time period has passed. 144 | * @return {!ScraperPromise} This object, so that new promises can 145 | * be made. 
146 | * @public 147 | */ 148 | timeout: function(time, callback) { 149 | this.promises.push(function timeout(done, utils) { 150 | setTimeout(function() { 151 | callback(utils); 152 | }, time); 153 | done(null, null); 154 | }); 155 | return this; 156 | }, 157 | /** 158 | * Sets the end of the promise chain callback, if there were no 159 | * errors. 160 | * 161 | * @param {!function()} doneFn Callback function. 162 | * @return {!ScraperPromise} This object, so that new promises can 163 | * be made. 164 | * @public 165 | */ 166 | done: function(doneFn) { 167 | this.doneCallback = doneFn; 168 | return this; 169 | }, 170 | /** 171 | * Sets a generic promise. 172 | * 173 | * @param {!function()} callback Callback. 174 | * @return {!ScraperPromise} This object, so that new promises can 175 | * be made. 176 | * @public 177 | */ 178 | then: function(callback) { 179 | this.promises.push(function then(done, utils) { 180 | done(null, callback(utils.lastReturn, utils)); 181 | }); 182 | return this; 183 | }, 184 | /** 185 | * Stops the promise chain and resumes it after a callback 186 | * function. 187 | * 188 | * @param {!function(!function, !Object)} callback Callback. 189 | * @return {!ScraperPromise} This object, so that new promises can 190 | * be made. 191 | * @public 192 | */ 193 | async: function(callback) { 194 | this.promises.push(function async(done, utils) { 195 | callback(utils.lastReturn, done, utils); 196 | }); 197 | return this; 198 | }, 199 | /** 200 | * @deprecated 201 | */ 202 | onError: function(callback) { 203 | console.warn("The 'onError' is being DEPRECATED in favor of 'catch'"); 204 | return this.catch(callback); 205 | }, 206 | /** 207 | * Sets a promise to when an error occur, note that an error will 208 | * break the promise chain, so this is the next promise to be 209 | * called and if the done promise is not set the last. To avoid 210 | * silent errors, if this promise is not defined the error will 211 | * be thrown up. 
212 |  *
213 |  * @param {!function(?, !Object)} callback Callback; receives (error, utils).
214 |  * @return {!ScraperPromise} This object, so that new promises can
215 |  * be made.
216 |  * @public
217 |  */
218 | "catch": function(callback) {
219 | this.errorCallback = callback;
220 | return this;
221 | },
222 | /**
223 |  * Makes an HTTP GET request to the url.
224 |  *
225 |  * @param {!string} url Url to make the request.
226 |  * @return {!ScraperPromise} This object, so that new promises can
227 |  * be made.
228 |  * @public
229 |  */
230 | get: function(url) {
231 | var that = this;
232 | this.scraper.get(url, function(err) {
233 | that._fire(err);
234 | });
235 | return this;
236 | },
237 | /**
238 |  * Makes a (possibly more complex) HTTP request. For more
239 |  * information refer to {@link https://github.com/mikeal/request#requestoptions-callback}.
240 |  *
241 |  * @param {!Object} options Options of the request.
242 |  * @return {!ScraperPromise} This object, so that new promises can
243 |  * be made.
244 |  * @public
245 |  */
246 | request: function(options) {
247 | var that = this;
248 | this.scraper.request(options, function(err) {
249 | that._fire(err);
250 | });
251 | return this;
252 | },
253 | /**
254 |  * Sets a parameter to be used in the next _fire call.
255 |  *
256 |  * @param {?Object} param Parameter.
257 |  * @public
258 |  */
259 | _setChainParameter: function(param) {
260 | this.chainParameter = param;
261 | },
262 | /**
263 |  * Starts the promise chain.
264 |  *
265 |  * @param {?} error Error object, to fire the error callback,
266 |  * from an error that happened before. The chain always runs
267 |  * against this instance's own scraper.
268 | * @protected 269 | */ 270 | _fire: function(error) { 271 | var that = this, 272 | param = this.chainParameter, 273 | stopPointer = {}, 274 | utils = { 275 | stop: null, 276 | url: this.scraper.url, 277 | scraper: this, 278 | params: param, 279 | lastReturn: undefined 280 | }, 281 | keep = true; 282 | this.chainParameter = null; 283 | 284 | if (error) { 285 | this.errorCallback(error, utils); 286 | this.doneCallback(utils); 287 | return; 288 | } 289 | 290 | async.eachSeries(this.promises, function dispatcher(fn, callback) { 291 | var done = function(err, lastReturn) { 292 | utils.lastReturn = lastReturn; 293 | if (err === stopPointer) { 294 | keep = false; 295 | callback(err); 296 | } else if (err) { 297 | callback(err); 298 | } else if (keep) { 299 | callback(); 300 | } 301 | }; 302 | utils.stop = function() { 303 | done(stopPointer, null); 304 | }; 305 | 306 | try { 307 | fn.call(that, done, utils); 308 | } catch (err) { 309 | done(err, null); 310 | } 311 | }, function(err) { 312 | utils.stop = null; 313 | if (err && err !== stopPointer) { 314 | that.errorCallback(err, utils); 315 | } 316 | that.doneCallback(utils.lastReturn, utils); 317 | that.scraper.close(); 318 | }); 319 | }, 320 | /** 321 | * Sets the promises. 322 | * 323 | * @param {!Array.} promises Promises array. 324 | * @public 325 | */ 326 | _setPromises: function(promises) { 327 | this.promises = promises; 328 | }, 329 | /** 330 | * Clones the promise and the scraper. 331 | * 332 | * @return {!ScraperPromise} Scraper promise with an empty scraper 333 | * clone. 
334 | * @public 335 | */ 336 | clone: function() { 337 | var instance = this.scraper.clone(), 338 | promise = new ScraperPromise(instance); 339 | promise._setPromises(this.promises); 340 | promise.done(this.doneCallback); 341 | promise.catch(this.errorCallback); 342 | return promise; 343 | } 344 | }; 345 | 346 | module.exports = ScraperPromise; 347 | -------------------------------------------------------------------------------- /src/StaticScraper.js: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio'), 2 | AbstractScraper = require('./AbstractScraper'); 3 | 4 | /** 5 | * A static scraper. This can only scrape static content, with the 6 | * help of jQuery. 7 | * This version uses cheerio {@link https://github.com/cheeriojs/cheerio}. 8 | * 9 | * @extends {AbstractScraper} 10 | */ 11 | var StaticScraper = function() { 12 | AbstractScraper.call(this); 13 | /** 14 | * jQuery. 15 | * 16 | * @type {!function} 17 | * @private 18 | */ 19 | this.$ = null; 20 | }; 21 | StaticScraper.prototype = Object.create(AbstractScraper.prototype); 22 | /** 23 | * @override 24 | * @inheritDoc 25 | */ 26 | StaticScraper.prototype.loadBody = function(done) { 27 | this.$ = cheerio.load(this.body); 28 | done(); 29 | return this; 30 | }; 31 | /** 32 | * Scrapes the webpage. According to a function, and a callback. 33 | * 34 | * @param {!function(function(), ...?)} scraperFn Function to scrape 35 | * the content. It receives the jQuery function to manipulate the 36 | * DOM, and the args as parameters, if passed. 37 | * @param {!function(?)} callbackFn Function that receives the 38 | * result of the scraping. 39 | * @param {!Array=} args Extra arguments to pass to the scraping 40 | * function. 41 | * @return {!AbstractScraper} This scraper. 
42 | * @override 43 | * @public 44 | */ 45 | StaticScraper.prototype.scrape = function(scraperFn, callbackFn, args) { 46 | var result = null, err = null; 47 | args = args || []; 48 | args.unshift(this.$); 49 | try { 50 | result = scraperFn.apply(null, args); 51 | } catch (e) { 52 | err = e; 53 | } 54 | callbackFn(err, result); 55 | return this; 56 | }; 57 | /** 58 | * @override 59 | * @inheritDoc 60 | */ 61 | StaticScraper.prototype.close = function() { 62 | return this; 63 | }; 64 | /** 65 | * @override 66 | * @inheritDoc 67 | */ 68 | StaticScraper.prototype.clone = function() { 69 | return new StaticScraper(); 70 | }; 71 | /** 72 | * Creates a static scraper, wrapped around a scraper promise. 73 | * 74 | * @param {!string=} url If provided makes an HTTP GET request to the 75 | * given URL. 76 | * @return {!ScraperPromise} Scraper promise, with a static scraper. 77 | * @public 78 | * @static 79 | */ 80 | StaticScraper.create = function(url) { 81 | return AbstractScraper.create(StaticScraper, url); 82 | }; 83 | 84 | module.exports = StaticScraper; -------------------------------------------------------------------------------- /test/AbstractScraper.js: -------------------------------------------------------------------------------- 1 | /* global describe, it */ 2 | var AbstractScraper = require('../src/AbstractScraper'), 3 | fs = require('fs'), 4 | assert = require('assert'), 5 | MISSING = 'http://0.0.0.0', 6 | HN_CLONE = 'http://localhost:3000/hacker-news-clone'; 7 | 8 | 9 | describe('AbstractScraper', function() { 10 | it('get', function(done) { 11 | var as = new AbstractScraper(); 12 | as.get(MISSING, function(err) { 13 | assert.ok((err.code == 'EADDRNOTAVAIL') || (err.code == 'ECONNREFUSED')); 14 | done(); 15 | }); 16 | }); 17 | it('request', function(done) { 18 | var as = new AbstractScraper(); 19 | as.request({ 20 | url: MISSING 21 | }, function(err) { 22 | assert.ok((err.code == 'EADDRNOTAVAIL') || (err.code == 'ECONNREFUSED')); 23 | done(); 24 | }); 25 | 
}); 26 | it('getStatusCode', function(done) { 27 | var as = new AbstractScraper(); 28 | as.get(HN_CLONE, function(err) { 29 | assert.ok(!err); 30 | assert.equal(as.getStatusCode(), 200); 31 | done(); 32 | }); 33 | }); 34 | it('getResponse', function(done) { 35 | var as = new AbstractScraper(); 36 | as.get(HN_CLONE, function(err) { 37 | assert.ok(!err); 38 | assert.ok(!!as.getResponse()); 39 | assert.equal(as.getResponse().statusCode, 200); 40 | done(); 41 | }); 42 | }); 43 | it('getBody', function(done) { 44 | var as = new AbstractScraper(); 45 | as.get(HN_CLONE, function(err) { 46 | assert.ok(!err); 47 | assert.equal(as.getBody(), fs.readFileSync(__dirname + '/static/hacker-news-clone.html').toString()); 48 | done(); 49 | }); 50 | }); 51 | it('loadBody', function(done) { 52 | var as = new AbstractScraper(); 53 | as.loadBody(function() { 54 | done(); 55 | }); 56 | }); 57 | it('scrape', function() { 58 | var as = new AbstractScraper(); 59 | as.scrape(function() { 60 | assert.fail('Function shouldn\'t be called'); 61 | }, function() { 62 | assert.fail('Function shouldn\'t be called'); 63 | }); 64 | }); 65 | it('close', function() { 66 | var as = new AbstractScraper(); 67 | assert.ok(as.close() === undefined); 68 | }); 69 | it('clone', function() { 70 | var as = new AbstractScraper(); 71 | assert.ok(as.clone() === undefined); 72 | }); 73 | }); -------------------------------------------------------------------------------- /test/DynamicScraper.js: -------------------------------------------------------------------------------- 1 | /* global describe, it, $ */ 2 | var sjs = require('../src/Scraper'), 3 | ScraperPromise = sjs.ScraperPromise, 4 | DynamicScraper = sjs.DynamicScraper, 5 | assert = require('assert'), 6 | HN_CLONE = 'http://localhost:3000/hacker-news-clone'; 7 | 8 | 9 | describe('DynamicScraper', function() { 10 | 11 | describe('#create', function() { 12 | it('with argument', function(done) { 13 | var ds = DynamicScraper.create(HN_CLONE); 14 | ds 15 | 
.done(function() { 16 | assert.ok(ds instanceof ScraperPromise); 17 | done(); 18 | }); 19 | }); 20 | 21 | it('without argument', function(done) { 22 | var ds = DynamicScraper.create(); 23 | ds 24 | .get(HN_CLONE) 25 | .done(function() { 26 | assert.ok(ds instanceof ScraperPromise); 27 | done(); 28 | }); 29 | }); 30 | }); 31 | 32 | it('.loadBody, .scrape, .close', function(done) { 33 | var ds = new DynamicScraper(); 34 | ds.body = '
text
'; 35 | var temp = ds.loadBody(function() { 36 | var temp = ds.scrape(function() { 37 | return $('#f').text(); 38 | }, function(err, result) { 39 | assert.equal(err, null); 40 | assert.equal(result, 'text'); 41 | assert.ok(ds.close() === ds); 42 | assert.ok(!!ds.ph); 43 | assert.ok(!!ds.page); 44 | done(); 45 | }); 46 | assert.ok(temp === ds); 47 | }); 48 | assert.ok(temp === ds); 49 | }); 50 | 51 | describe('.inject', function() { 52 | it('page not loaded', function() { 53 | var ds = new DynamicScraper(); 54 | try { 55 | ds.inject(''); 56 | assert.fail('Should have thrown'); 57 | } catch (e) { 58 | assert.equal(e.message, 'Couldn\'t inject code, at "". The page has not been initialized yet.'); 59 | } 60 | }); 61 | 62 | it('success', function(done) { 63 | var ds = new DynamicScraper(); 64 | ds.get(HN_CLONE, function(err) { 65 | if (err) { 66 | assert.fail('Shouldn\'t have returned an error.'); 67 | } 68 | ds.inject(__dirname + '/static/code.js', function(err) { 69 | if (err) { 70 | assert.fail('Should load code successfully.'); 71 | } else { 72 | done(); 73 | } 74 | }); 75 | 76 | }); 77 | }); 78 | 79 | it('fails', function(done) { 80 | var ds = new DynamicScraper(); 81 | ds.get(HN_CLONE, function(err) { 82 | if (err) { 83 | assert.fail('Shouldn\'t have returned an error.'); 84 | } 85 | var file = __dirname + '/static/invalid-code.js'; 86 | ds.inject(file, function(err) { 87 | if (err) { 88 | assert.equal(err.message, 'Couldn\'t inject code, at "' + file + '".'); 89 | done(); 90 | } else { 91 | assert.fail('Shouldn\'t load code successfully.'); 92 | } 93 | }); 94 | 95 | }); 96 | }); 97 | 98 | it('fails jQuery', function(done) { 99 | var jq = DynamicScraper.JQUERY_FILE; 100 | DynamicScraper.JQUERY_FILE += '.non'; 101 | var ds = new DynamicScraper(); 102 | ds.get(HN_CLONE, function(err) { 103 | DynamicScraper.JQUERY_FILE = jq; 104 | if (err) { 105 | assert.equal(err.message, 'Couldn\'t inject jQuery into the page.'); 106 | } else { 107 | assert.fail('Should have 
returned an error.'); 108 | } 109 | done(); 110 | }); 111 | }); 112 | }); 113 | 114 | it('.clone', function() { 115 | var ds = new DynamicScraper(), 116 | clone = ds.clone(); 117 | assert.ok(clone instanceof DynamicScraper); 118 | assert.ok(ds != clone); 119 | }); 120 | 121 | it('#startFactory, #closeFactory', function() { 122 | var temp; 123 | temp = DynamicScraper.startFactory(); 124 | assert.ok(temp === DynamicScraper); 125 | temp = DynamicScraper.closeFactory(); 126 | assert.ok(temp === DynamicScraper); 127 | temp = DynamicScraper.closeFactory(); 128 | assert.ok(temp === DynamicScraper); 129 | }); 130 | }); -------------------------------------------------------------------------------- /test/Router.js: -------------------------------------------------------------------------------- 1 | /* global describe, it, $ */ 2 | var scraper = require('../src/Scraper'), 3 | Router = scraper.Router, 4 | assert = require('assert'), 5 | LH = 'http://localhost:3000'; 6 | 7 | function compareObjects(obj1, obj2) { 8 | function co(a, b) { 9 | for (var x in a) { 10 | if (a[x] !== b[x]) { 11 | return false; 12 | } 13 | } 14 | return true; 15 | } 16 | return co(obj1, obj2) && co(obj2, obj1); 17 | } 18 | 19 | describe('Router', function() { 20 | 21 | describe('#pathMatcher', function() { 22 | it('with string', function(done) { 23 | var fn = Router.pathMatcher(':protocol(https?://)?:www(www.)?youtube.com/(watch/:id)?'); 24 | assert.equal(typeof fn, 'function'); 25 | assert.ok(compareObjects(fn('youtube.com/'), { 26 | url: 'youtube.com/', 27 | protocol: undefined, 28 | www: undefined, 29 | id: undefined 30 | })); 31 | assert.ok(compareObjects(fn('https://youtube.com/'), { 32 | url: 'https://youtube.com/', 33 | protocol: 'https://', 34 | www: undefined, 35 | id: undefined 36 | })); 37 | assert.ok(compareObjects(fn('https://www.youtube.com/'), { 38 | url: 'https://www.youtube.com/', 39 | protocol: 'https://', 40 | www: 'www.', 41 | id: undefined 42 | })); 43 | 
assert.ok(compareObjects(fn('https://www.youtube.com/watch/mNhMogx3YmU'), { 44 | url: 'https://www.youtube.com/watch/mNhMogx3YmU', 45 | protocol: 'https://', 46 | www: 'www.', 47 | id: 'mNhMogx3YmU' 48 | })); 49 | assert.ok(compareObjects(Router.pathMatcher('*')('https://www.youtube.com/watch/mNhMogx3YmU'), { 50 | url: 'https://www.youtube.com/watch/mNhMogx3YmU' 51 | })); 52 | try { 53 | Router.pathMatcher(function() {}); 54 | } catch (e) { 55 | assert.equal(e.name, 'ScraperError'); 56 | done(); 57 | } 58 | }); 59 | it('with regular expression', function(done) { 60 | var fn = Router.pathMatcher(/s*crape/); 61 | assert.equal(typeof fn, 'function'); 62 | assert.ok(!!fn('craper')); 63 | assert.ok(!!fn('scraper')); 64 | assert.ok(!!fn('ssscraper')); 65 | done(); 66 | }); 67 | }); 68 | describe('on', function() { 69 | var r = new Router(); 70 | it('with path', function(done) { 71 | r.on(LH + '/info/:id') 72 | .createStatic() 73 | .onStatusCode(200, function() { 74 | done(); 75 | }); 76 | r.route(LH + '/info/ajhfdhsgf', function(found) { 77 | assert.ok(found); 78 | }); 79 | }); 80 | it('with function', function(done) { 81 | r 82 | .on(Router.pathMatcher(LH + '/watch/:id')) 83 | .createStatic() 84 | .onStatusCode(200, function() { 85 | done(); 86 | }); 87 | r.route(LH + '/watch/hjsgdfhdgf', function(found) { 88 | assert.ok(found); 89 | }); 90 | }); 91 | }); 92 | it('get', function(done) { 93 | var r = new Router(); 94 | r 95 | .on(LH + '/info/:id') 96 | .get() 97 | .createStatic() 98 | .onStatusCode(200, function() { 99 | done(); 100 | }); 101 | r.route(LH + '/info/8973iuhrwjhef'); 102 | }); 103 | it('request', function(done) { 104 | var r = new Router(); 105 | r 106 | .on(LH + '/watch/:id') 107 | .request({ 108 | method: 'POST' 109 | }) 110 | .createStatic() 111 | .onStatusCode(200, function() { 112 | done(); 113 | }); 114 | r.route(LH + '/watch/8973iuhrwjhef', function(found) { 115 | assert.ok(found); 116 | }); 117 | }); 118 | it('otherwise', function(done) { 119 | var 
r = new Router(), 120 | testURL = LH + 'infoo/fjsdgfmhgsdf'; 121 | r.on(LH + '/watch/:id'); 122 | r.otherwise(function(url) { 123 | assert.equal(url, testURL); 124 | done(); 125 | }); 126 | r.route(testURL, function(found) { 127 | assert.ok(!found); 128 | }); 129 | }); 130 | it('route', function(done) { 131 | var r = new Router(); 132 | r.on(LH + '/watch/:id') 133 | .createStatic(); 134 | r.route(LH + '/watch/fjsdgfmhgsdf', function(found) { 135 | assert.ok(found); 136 | r.route(LH + '/scrpng', function(found) { 137 | assert.ok(!found); 138 | done(); 139 | }); 140 | }); 141 | }); 142 | it('createStatic', function(done) { 143 | var r = new Router(); 144 | r.on(LH + '/hacker-news-clone') 145 | .createStatic() 146 | .scrape(function($) { 147 | return $('.title a').map(function() { 148 | return $(this).text(); 149 | }).get(); 150 | }, function(news) { 151 | assert.equal(news.length, 10); 152 | done(); 153 | }); 154 | r.route(LH + '/hacker-news-clone', function(found) { 155 | assert.ok(found); 156 | }); 157 | }); 158 | it('createDynamic', function(done) { 159 | var r = new Router(); 160 | r.on(LH + '/hacker-news-clone') 161 | .createDynamic() 162 | .delay(100) 163 | .scrape(function() { 164 | return $('.title a').map(function() { 165 | return $(this).text(); 166 | }).get(); 167 | }, function(news) { 168 | assert.equal(news.length, 9); 169 | done(); 170 | }); 171 | r.route(LH + '/hacker-news-clone', function(found) { 172 | assert.ok(found); 173 | }); 174 | }); 175 | it('use', function(done) { 176 | var r = new Router(), 177 | stInstance; 178 | stInstance = scraper.StaticScraper 179 | .create() 180 | .scrape(function($) { 181 | return $('.title a').map(function() { 182 | return $(this).text(); 183 | }).get(); 184 | }, function(news) { 185 | assert.equal(news.length, 10); 186 | done(); 187 | }); 188 | r.on(LH + '/hacker-news-clone') 189 | .use(stInstance); 190 | r.route(LH + '/hacker-news-clone', function(found) { 191 | assert.ok(found); 192 | }); 193 | }); 194 | 195 | 
it('usage of params', function(done) { 196 | var r = new Router(); 197 | r 198 | .on(LH + '/info/:id') 199 | .createStatic() 200 | .then(function(last, utils) { 201 | assert.ok(utils.params.id, '7623hgjfs73'); 202 | }); 203 | r.route(LH + '/info/7623hgjfs73', function(found) { 204 | assert.ok(found); 205 | done(); 206 | }); 207 | }); 208 | 209 | describe('instantiation', function() { 210 | function testCase(firstMatch, expected) { 211 | it('with' + (firstMatch ? '' : 'out') + ' firstMatch', function(done) { 212 | var c = 0; 213 | var r = new Router({ 214 | firstMatch: !!firstMatch 215 | }); 216 | r.on(LH + '/info/:id') 217 | .createStatic() 218 | .then(function() { 219 | c++; 220 | }); 221 | r.on(LH + '/info/:id') 222 | .createStatic() 223 | .then(function() { 224 | c++; 225 | }); 226 | r.route(LH + '/info/7623hgjfs73', function(found) { 227 | assert.ok(found); 228 | assert.equal(c, expected); 229 | done(); 230 | }); 231 | }); 232 | } 233 | 234 | testCase(true, 1); 235 | testCase(false, 2); 236 | }); 237 | 238 | describe('bad formatting', function() { 239 | it('get', function(done) { 240 | var r = new Router(); 241 | try { 242 | r.get(); 243 | } catch (e) { 244 | assert.equal(e.name, 'ScraperError'); 245 | done(); 246 | } 247 | }); 248 | it('request', function(done) { 249 | var r = new Router(); 250 | try { 251 | r.request(); 252 | } catch (e) { 253 | assert.equal(e.name, 'ScraperError'); 254 | done(); 255 | } 256 | }); 257 | it('createStatic', function(done) { 258 | var r = new Router(); 259 | try { 260 | r.createStatic(); 261 | } catch (e) { 262 | assert.equal(e.name, 'ScraperError'); 263 | done(); 264 | } 265 | }); 266 | it('createDynamic', function(done) { 267 | var r = new Router(); 268 | try { 269 | r.createDynamic(); 270 | } catch (e) { 271 | assert.equal(e.name, 'ScraperError'); 272 | done(); 273 | } 274 | }); 275 | it('use', function(done) { 276 | var r = new Router(); 277 | try { 278 | r.use(scraper.StaticScraper.create()); 279 | } catch (e) { 280 | 
assert.equal(e.name, 'ScraperError'); 281 | done(); 282 | } 283 | }); 284 | }); 285 | }); -------------------------------------------------------------------------------- /test/ScraperError.js: -------------------------------------------------------------------------------- 1 | /* global it */ 2 | var ScraperError = require('../src/ScraperError'), 3 | assert = require('assert'); 4 | 5 | it('ScraperError', function() { 6 | var err = new ScraperError('random message'); 7 | assert.equal(err.message, 'random message'); 8 | assert.equal(err.name, 'ScraperError'); 9 | assert.ok(err.stack); 10 | }); -------------------------------------------------------------------------------- /test/ScraperPromise.js: -------------------------------------------------------------------------------- 1 | /* global describe, it, beforeEach, afterEach, $ */ 2 | var assert = require('assert'), 3 | sjs = require('../src/Scraper'), 4 | StaticScraper = sjs.StaticScraper, 5 | DynamicScraper = sjs.DynamicScraper, 6 | ScraperPromise = sjs.ScraperPromise, 7 | HN_CLONE = 'http://localhost:3000/hacker-news-clone', 8 | domain = require('domain'); 9 | 10 | function exec(ScraperType) { 11 | function isDynamic() { 12 | return ScraperType === DynamicScraper; 13 | } 14 | 15 | describe('onStatusCode', function() { 16 | it('with code', function(done) { 17 | var s = new ScraperPromise(new ScraperType()) 18 | .get(HN_CLONE); 19 | var temp = s 20 | .onStatusCode(202, function() { 21 | assert.fail('This status code should not trigger.'); 22 | }) 23 | .onStatusCode(200, function() { 24 | done(); 25 | }); 26 | assert.ok(temp === s); 27 | }); 28 | 29 | it('without code', function(done) { 30 | var s = new ScraperPromise(new ScraperType()) 31 | .get(HN_CLONE); 32 | var temp = s 33 | .onStatusCode(function(code) { 34 | assert.equal(code, 200); 35 | done(); 36 | }); 37 | assert.ok(temp === s); 38 | }); 39 | }); 40 | 41 | it('timeout', function(done) { 42 | var s = new ScraperPromise(new ScraperType()) 43 | 
.get(HN_CLONE) 44 | .onStatusCode(function(code) { 45 | assert.equal(code, 200); 46 | }); 47 | var temp = s.timeout(100, function() { 48 | done(); 49 | }); 50 | assert.ok(temp === s); 51 | }); 52 | 53 | it('then', function(done) { 54 | var s = new ScraperPromise(new ScraperType()) 55 | .get(HN_CLONE); 56 | var temp = s.then(function() { 57 | done(); 58 | }); 59 | assert.ok(temp === s); 60 | }); 61 | 62 | it('then', function(done) { 63 | var s = new ScraperPromise(new ScraperType()) 64 | .get(HN_CLONE); 65 | var temp = s.async(function(_, done) { 66 | done(); 67 | }); 68 | s.done(function() { 69 | done(); 70 | }); 71 | assert.ok(temp === s); 72 | }); 73 | 74 | describe('catch', function() { 75 | it('on sync', function(done) { 76 | var s = new ScraperPromise(new ScraperType()) 77 | .get(HN_CLONE) 78 | .then(function() { 79 | throw new Error('random message'); 80 | }); 81 | var temp = s.catch(function(err) { 82 | assert.equal(err.message, 'random message'); 83 | done(); 84 | }); 85 | assert.ok(s === temp); 86 | }); 87 | it('on async', function(done) { 88 | var s = new ScraperPromise(new ScraperType()) 89 | .get(HN_CLONE) 90 | .async(function(_, done) { 91 | done(new Error('random message')); 92 | }); 93 | var temp = s.catch(function(err) { 94 | assert.equal(err.message, 'random message'); 95 | done(); 96 | }); 97 | assert.ok(s === temp); 98 | }); 99 | }); 100 | 101 | // FIXME - this is not working for the dynamic scraper with factory 102 | if (!isDynamic()) { 103 | it('error without catch', function(done) { 104 | var d = domain.create(); 105 | d.on('error', function(err) { 106 | assert.equal(err.message, 'random message'); 107 | done(); 108 | }); 109 | d.run(function() { 110 | new ScraperPromise(new ScraperType()) 111 | .get(HN_CLONE) 112 | .then(function() { 113 | throw new Error('random message'); 114 | }); 115 | }); 116 | }); 117 | } 118 | 119 | describe('scrape', function() { 120 | var expectedVal; 121 | if (isDynamic()) { 122 | expectedVal = 9; 123 | } else { 124 
| expectedVal = 10; 125 | } 126 | it('without extra arguments', function(done) { 127 | var s = new ScraperPromise(new ScraperType()) 128 | .get(HN_CLONE); 129 | var fn = function($) { 130 | return $('.title a').map(function() { 131 | return $(this).text(); 132 | }).get(); 133 | }; 134 | var temp = s.scrape(fn, function(news) { 135 | assert.equal(news.length, expectedVal); 136 | done(); 137 | }); 138 | assert.ok(temp === s); 139 | }); 140 | 141 | it('without extra arguments', function(done) { 142 | var s = new ScraperPromise(new ScraperType()) 143 | .get(HN_CLONE); 144 | var fn = function($, selector) { 145 | return $(selector).map(function() { 146 | return $(this).text(); 147 | }).get(); 148 | }; 149 | var temp = s.scrape(fn, function(news) { 150 | assert.equal(news.length, expectedVal); 151 | done(); 152 | }, '.title a'); 153 | assert.ok(temp === s); 154 | }); 155 | 156 | it('with only the scraping function', function(done) { 157 | var s = new ScraperPromise(new ScraperType()) 158 | .get(HN_CLONE); 159 | var fn = function($) { 160 | return $('.title a').map(function() { 161 | return $(this).text(); 162 | }).get(); 163 | }; 164 | var temp = s.scrape(fn); 165 | temp.then(function(news, utils) { 166 | assert.equal(news.length, expectedVal); 167 | done(); 168 | }); 169 | assert.ok(temp === s); 170 | }); 171 | 172 | it('with error', function(done) { 173 | var s = new ScraperPromise(new ScraperType()) 174 | .get(HN_CLONE); 175 | var temp; 176 | temp = s 177 | .catch(function(err) { 178 | assert.equal(err.message, 'Error inside scraping fn.'); 179 | }) 180 | .scrape(function() { 181 | throw new Error('Error inside scraping fn.'); 182 | }, function() { 183 | assert.fail('Invalid call.'); 184 | }) 185 | .done(function() { 186 | done(); 187 | }); 188 | assert.ok(temp === s); 189 | }); 190 | }); 191 | 192 | it('delay', function(done) { 193 | var s = new ScraperPromise(new ScraperType()) 194 | .get(HN_CLONE) 195 | .onStatusCode(function(code) { 196 | assert.equal(code, 200); 
197 | }); 198 | var temp = s.delay(100); 199 | assert.ok(temp === s); 200 | var expectedContent = isDynamic()?'Dynamic Content':0; 201 | s.scrape(function($) { 202 | return $('.dynamic').text(); 203 | }, function(result) { 204 | assert.equal(result, expectedContent); 205 | done(); 206 | }); 207 | }); 208 | 209 | it('request', function(done) { 210 | var s = new ScraperPromise(new ScraperType()); 211 | var temp = s.request({ 212 | url: HN_CLONE, 213 | method: 'POST' 214 | }); 215 | assert.ok(temp === s); 216 | var fn = function($) { 217 | return $('#POST_MESSAGE').text(); 218 | }; 219 | s.scrape(fn, function(result) { 220 | assert.equal(result, 'random text'); 221 | done(); 222 | }); 223 | }); 224 | 225 | it('done', function(done) { 226 | var s = new ScraperPromise(new ScraperType()); 227 | s.get(HN_CLONE); 228 | var temp = s.done(function() { 229 | done(); 230 | }); 231 | assert.ok(temp === s); 232 | }); 233 | 234 | it('_setChainParameter', function() { 235 | var s = new ScraperPromise(new ScraperType()); 236 | s._setChainParameter(5); 237 | assert.equal(s.chainParameter, 5); 238 | }); 239 | 240 | describe('_fire', function() { 241 | it('without error', function(done) { 242 | var s = new ScraperPromise(new ScraperType()); 243 | s.done(function() { 244 | done(); 245 | }); 246 | s._fire(); 247 | }); 248 | it('with error', function(done) { 249 | var c = 0; 250 | var s = new ScraperPromise(new ScraperType()) 251 | .done(function() { 252 | assert.equal(c, 1); 253 | done(); 254 | }) 255 | .catch(function(err) { 256 | c++; 257 | assert.equal(err.message, 'msg'); 258 | }); 259 | s._fire(new Error('msg')); 260 | }); 261 | }); 262 | 263 | it('_setPromises', function() { 264 | var s = new ScraperPromise(new ScraperType()); 265 | var promises = [ 266 | 267 | function() {} 268 | ]; 269 | s._setPromises(promises); 270 | assert.ok(s.promises === promises); 271 | }); 272 | 273 | it('clone', function() { 274 | var s = new ScraperPromise(new ScraperType()) 275 | .catch(function() {}) 
276 | .done(function() {}) 277 | .then(function() {}) 278 | .onStatusCode(200, function() {}) 279 | .onStatusCode(function() {}) 280 | .timeout(10) 281 | .delay(10); 282 | var clone = s.clone(); 283 | assert.ok(clone instanceof ScraperPromise); 284 | assert.ok(clone.promises === s.promises); 285 | assert.ok(clone.scraper !== s.scraper); 286 | assert.ok(clone.doneCallback === s.doneCallback); 287 | assert.ok(clone.errorCallback === s.errorCallback); 288 | assert.ok(clone.chainParameter === s.chainParameter); 289 | }); 290 | 291 | it('passing values between promises', function(done) { 292 | new ScraperPromise(new ScraperType()) 293 | .done(function(result, utils) { 294 | assert.deepEqual(result, 5); 295 | done(); 296 | }) 297 | .then(function(last, utils) { 298 | assert.deepEqual(last, undefined); 299 | return 1; 300 | }) 301 | .onStatusCode(200, function(utils) { 302 | assert.deepEqual(utils.lastReturn, 1); 303 | return utils.lastReturn + 1; 304 | }) 305 | .onStatusCode(function(code, utils) { 306 | assert.deepEqual(utils.lastReturn, 2); 307 | return utils.lastReturn + 1; 308 | }) 309 | .delay(10, function(utils) { 310 | assert.deepEqual(utils.lastReturn, 3); 311 | return utils.lastReturn + 1; 312 | }) 313 | .scrape(function() {}, function(result, utils) { 314 | assert.deepEqual(utils.lastReturn, 4); 315 | return utils.lastReturn + 1; 316 | }) 317 | .get(HN_CLONE); 318 | }); 319 | 320 | describe('usage of utils', function() { 321 | it('stop()', function(done) { 322 | var c = 0; 323 | new ScraperPromise(new ScraperType()) 324 | .get(HN_CLONE) 325 | .then(function() { 326 | c++; 327 | }) 328 | .then(function(last, utils) { 329 | c++; 330 | utils.stop(); 331 | }) 332 | .then(function() { 333 | c++; 334 | }) 335 | .done(function() { 336 | assert.equal(c, 2); 337 | done(); 338 | }); 339 | }); 340 | it('scraper', function(done) { 341 | var s = new ScraperPromise(new ScraperType()); 342 | s.get(HN_CLONE) 343 | .done(function(_, utils) { 344 | assert.ok(utils.scraper === 
s); 345 | done(); 346 | }); 347 | }); 348 | it('params', function(done) { 349 | var s = new ScraperPromise(new ScraperType()); 350 | s.get(HN_CLONE) 351 | .done(function(_, utils) { 352 | assert.ok(!utils.params); 353 | done(); 354 | }); 355 | }); 356 | 357 | }); 358 | } 359 | 360 | describe('Scraper Promise', function() { 361 | 362 | describe('with StaticScraper', function() { 363 | exec(StaticScraper); 364 | }); 365 | describe('with DynamicScraper', function() { 366 | describe('with Factory', function() { 367 | beforeEach(function() { 368 | DynamicScraper.startFactory(); 369 | }); 370 | afterEach(function() { 371 | DynamicScraper.closeFactory(); 372 | }); 373 | exec(DynamicScraper); 374 | }); 375 | describe('without Factory', function() { 376 | exec(DynamicScraper); 377 | }); 378 | }); 379 | 380 | 381 | }); 382 | -------------------------------------------------------------------------------- /test/StaticScraper.js: -------------------------------------------------------------------------------- 1 | /* global describe, it */ 2 | var sjs = require('../src/Scraper'), 3 | ScraperPromise = sjs.ScraperPromise, 4 | StaticScraper = sjs.StaticScraper, 5 | assert = require('assert'), 6 | HN_CLONE = 'http://localhost:3000/hacker-news-clone'; 7 | 8 | describe('StaticScraper', function() { 9 | 10 | describe('#create', function() { 11 | it('with argument', function(done) { 12 | var ds = StaticScraper.create(HN_CLONE); 13 | ds 14 | .done(function() { 15 | assert.ok(ds instanceof ScraperPromise); 16 | done(); 17 | }); 18 | }); 19 | 20 | it('without argument', function(done) { 21 | var ds = StaticScraper.create(); 22 | ds 23 | .get(HN_CLONE) 24 | .done(function() { 25 | assert.ok(ds instanceof ScraperPromise); 26 | done(); 27 | }); 28 | }); 29 | }); 30 | 31 | describe('.loadBody, .scrape, .close', function() { 32 | it('without errors', function(done) { 33 | var ds = new StaticScraper(); 34 | ds.body = '
text
'; 35 | var temp = ds.loadBody(function() { 36 | var temp2 = ds.scrape(function($) { 37 | return $('#f').text(); 38 | }, function(err, result) { 39 | assert.ok(err === null); 40 | assert.equal(result, 'text'); 41 | assert.ok(ds.close() === ds); 42 | assert.ok(ds.$); 43 | done(); 44 | }); 45 | assert.ok(temp2 === ds); 46 | }); 47 | assert.ok(temp === ds); 48 | }); 49 | 50 | it('with errors', function(done) { 51 | var ds = new StaticScraper(); 52 | ds.body = '
text
'; 53 | var temp = ds.loadBody(function() { 54 | var temp2 = ds.scrape(function() { 55 | throw new Error('Error in scraping fn.'); 56 | }, function(err) { 57 | if (err) { 58 | assert.ok(!!err); 59 | assert.equal(err.message, 'Error in scraping fn.'); 60 | assert.ok(ds.close() === ds); 61 | assert.ok(ds.$); 62 | done(); 63 | } else { 64 | assert.fail('Should return an error.'); 65 | } 66 | }); 67 | assert.ok(temp2 === ds); 68 | }); 69 | assert.ok(temp === ds); 70 | }); 71 | }); 72 | 73 | it('.clone', function() { 74 | var ds = new StaticScraper(), 75 | clone = ds.clone(); 76 | assert.ok(clone instanceof StaticScraper); 77 | assert.ok(ds != clone); 78 | }); 79 | }); -------------------------------------------------------------------------------- /test/commandLine.js: -------------------------------------------------------------------------------- 1 | /* global describe, it, $ */ 2 | var scraper = require('../src/Scraper'), 3 | exec = require('child_process').exec, 4 | Router = scraper.Router, 5 | assert = require('assert'), 6 | LH = 'http://localhost:3000'; 7 | 8 | function execSjs(more, callback) { 9 | var command = 'node ./bin/scraperjs ' + LH + '/hacker-news-clone ' + more; 10 | exec(command, function(error, out, err) { 11 | if(err || error) { 12 | return; 13 | } else { 14 | callback(JSON.parse(out)); 15 | } 16 | }); 17 | } 18 | 19 | describe('Command line tool', function() { 20 | describe('--text', function() { 21 | describe('--selector', function(done) { 22 | it('--static', function(done) { 23 | execSjs('--selector ".title a" --text -s', function(result) { 24 | assert.equal(result.length, 10); 25 | done(); 26 | }); 27 | }); 28 | it('--dynamic', function(done) { 29 | execSjs('--selector ".title a" --text -d', function(result) { 30 | assert.equal(result.length, 9); 31 | done(); 32 | }); 33 | }); 34 | }); 35 | }); 36 | 37 | describe('--html', function() { 38 | describe('--selector', function(done) { 39 | it('--static', function(done) { 40 | execSjs('--selector 
".title a" --html -s', function(result) { 41 | assert.equal(result.length, 10); 42 | done(); 43 | }); 44 | }); 45 | it('--dynamic', function(done) { 46 | execSjs('--selector ".title a" --html -d', function(result) { 47 | assert.equal(result.length, 9); 48 | done(); 49 | }); 50 | }); 51 | }); 52 | }); 53 | 54 | describe('--attr', function() { 55 | describe('--selector', function(done) { 56 | it('--static', function(done) { 57 | execSjs('--selector ".title a" --attr href -s', function(result) { 58 | assert.equal(result.length, 10); 59 | done(); 60 | }); 61 | }); 62 | it('--dynamic', function(done) { 63 | execSjs('--selector ".title a" --attr href -d', function(result) { 64 | assert.equal(result.length, 9); 65 | done(); 66 | }); 67 | }); 68 | }); 69 | }); 70 | }); 71 | -------------------------------------------------------------------------------- /test/setupServer.js: -------------------------------------------------------------------------------- 1 | var express = require('express'), 2 | fs = require('fs'); 3 | 4 | module.exports = function(grunt, port) { 5 | var app = express(), 6 | HN_CLONE = fs.readFileSync(__dirname + '/static/hacker-news-clone.html'); 7 | 8 | app.get('/hacker-news-clone', function(req, res) { 9 | res.status(200); 10 | res.send(HN_CLONE); 11 | }); 12 | 13 | app.post('/hacker-news-clone', function(req, res) { 14 | res.status(200); 15 | res.send('
random text
'); 16 | }); 17 | 18 | app.param('id', function(req, res, next, id) { 19 | var regex = /^[\d\w]+$/; 20 | if (regex.test(id)) { 21 | next(); 22 | } else { 23 | next('route'); 24 | } 25 | }); 26 | app.get('/watch/:id', function(req, res, next) { 27 | res.status(200); 28 | res.send(req.params.id); 29 | }); 30 | app.get('/info/:id', function(req, res, next) { 31 | res.status(200); 32 | res.send(req.params.id); 33 | }); 34 | app.post('/watch/:id', function(req, res, next) { 35 | res.status(200); 36 | res.send(req.params.id + "post"); 37 | }); 38 | 39 | var server = app.listen(port || 3000, function() { 40 | console.log('Listening on port %d', server.address().port); 41 | }); 42 | 43 | return server; 44 | }; -------------------------------------------------------------------------------- /test/static/code.js: -------------------------------------------------------------------------------- 1 | /* global window */ 2 | window.someFn = function(n) { 3 | return 'SomeFN ' + n + ' to '+ (n+1); 4 | }; -------------------------------------------------------------------------------- /test/static/hacker-news-clone.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Hacker news clone 5 | 6 | 7 | 10 |
11 | Random Article 2 12 |
13 |
14 | Random Article 3 15 |
16 |
17 | Random Article 4 18 |
19 |
20 | Random Article 5 21 |
22 |
23 | Random Article 6 24 |
25 |
26 | Random Article 7 27 |
28 |
29 | Random Article 8 30 |
31 |
32 | Random Article 9 33 |
34 |
35 | Random Article 10 36 |
37 | 42 | 43 | --------------------------------------------------------------------------------