├── .editorconfig ├── .gitignore ├── .jshintrc ├── .travis.yml ├── CHANGELOG.md ├── Gruntfile.js ├── LICENSE ├── README.md ├── bin └── scraperjs ├── doc └── examples │ ├── ErrorHandling.js │ ├── HackerNews.js │ ├── IMDBOpeningThisWeek.js │ ├── LinkGetter.js │ └── WikimediaScraper.js ├── package.json ├── src ├── AbstractScraper.js ├── DynamicScraper.js ├── PhantomPoll.js ├── PhantomWrapper.js ├── Router.js ├── Scraper.js ├── ScraperError.js ├── ScraperPromise.js └── StaticScraper.js └── test ├── AbstractScraper.js ├── DynamicScraper.js ├── Router.js ├── ScraperError.js ├── ScraperPromise.js ├── StaticScraper.js ├── commandLine.js ├── setupServer.js └── static ├── code.js └── hacker-news-clone.html /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | 4 | [*] 5 | 6 | indent_style = tab 7 | indent_size = 2 8 | 9 | end_of_line = lf 10 | charset = utf-8 11 | trim_trailing_whitespace = true 12 | insert_final_newline = true 13 | 14 | [*.md] 15 | trim_trailing_whitespace = false 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | .project 4 | .DS_Store? 5 | ._* 6 | .Spotlight-V100 7 | .Trashes 8 | Icon? 
9 | ehthumbs.db 10 | Thumbs.db 11 | *.sublime-project 12 | *.sublime-workspace 13 | *.ignore 14 | node_modules 15 | bower_components 16 | docs 17 | coverage -------------------------------------------------------------------------------- /.jshintrc: -------------------------------------------------------------------------------- 1 | { 2 | "bitwise": false, 3 | "camelcase": true, 4 | "curly": true, 5 | "eqeqeq": false, 6 | "es3": false, 7 | "freeze": true, 8 | "immed": true, 9 | "latedef": true, 10 | "newcap": true, 11 | "noarg": false, 12 | "noempty": true, 13 | "nonbsp": true, 14 | "nonew": true, 15 | "plusplus": false, 16 | "quotmark": "single", 17 | "undef": true, 18 | "unused": true, 19 | 20 | "trailing": true, 21 | "boss": true, 22 | "browser": false, 23 | "node": true 24 | } 25 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: node_js 3 | node_js: 4 | - "0.12" 5 | - "4.0" 6 | - "0.11" 7 | - "0.10" 8 | script: grunt coveralls && grunt clean 9 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | ## [1.2.0] - 2015-12-14 4 | - Change order of parameters when continuing promise chain. 5 | - ``` async ```'s callback function receives (err, result) parameters. 6 | - Last result is passed to ``` done ``` promise. 7 | 8 | ## [1.1.0] - 2015-12-12 9 | - Result of last promise is passed as first parameter to the ``` async ``` promise. 10 | 11 | ## [1.0.2] - 2015-12-10 12 | - Dependency bump 13 | 14 | ## [1.0.0] - 2015-10-17 15 | - ``` catch ``` promise is the new standard way to deal with errors. ``` onError ``` is being deprecated, the two work the same way. 
16 | - ``` then ``` promise receives the value returned in the last promise as the first parameter, the second parameter is the ``` utils ``` object. 17 | - Errors generated inside the dynamic scraper's scraping function will fire the ``` catch ``` promise. 18 | 19 | ## [0.4.1] - 2015-10-04 20 | - Url of the page being scraped can now be easily accessed using ``` utils.url ```. 21 | - Added error handling example. 22 | 23 | ## [0.4.0] - 2015-09-19 24 | - Passing utils to the error callback by @rvernica 25 | - Add an 'options' argument to DynamicScraper that get passed to Phantom by @vdraceil 26 | - Updated dependencies 27 | 28 | ## [0.3.4] - 2015-04-26 29 | - Minor fixes related with documentation. 30 | - Fixed ``` async ``` promise works. It can receive values to be passed to the next promise. Internally it now uses this mechanism. 31 | - Support for node 0.12. 32 | - Changes in the command-line interface. 33 | 34 | ## [0.3.3] - 2015-02-11 35 | - Fixed bug where no argument was given to the ```done``` promise when there was an error. 36 | - Added experimental support for command-line interface. 37 | - Added example. 38 | 39 | ## [0.3.2] - 2014-12-20 40 | - The ```lastResult``` is now made accessible to a ```Router```. 
41 | -------------------------------------------------------------------------------- /Gruntfile.js: -------------------------------------------------------------------------------- 1 | var testServer = require('./test/setupServer'); 2 | 3 | var MOCHA_TIMEOUT_S = 10, 4 | MOCHA_TIMEOUT_MS = MOCHA_TIMEOUT_S * 1000, 5 | MOCHA_OPTIONS = { 6 | reporter: 'spec', 7 | timeout: MOCHA_TIMEOUT_MS 8 | }, 9 | COVERAGE_THRESHOLD = 95; 10 | 11 | module.exports = function(grunt) { 12 | grunt.loadNpmTasks('grunt-mocha-test'); 13 | grunt.loadNpmTasks('grunt-contrib-jshint'); 14 | grunt.loadNpmTasks('grunt-contrib-watch'); 15 | grunt.loadNpmTasks('grunt-contrib-clean'); 16 | grunt.loadNpmTasks('grunt-exec'); 17 | 18 | grunt.initConfig({ 19 | exec: { 20 | coverage: { 21 | command: 'istanbul cover ./node_modules/mocha/bin/_mocha -x src/PhantomWrapper.js -- -t ' + MOCHA_TIMEOUT_MS + ' --root src/ test/' 22 | }, 23 | coveralls: { 24 | command: 'istanbul cover ./node_modules/mocha/bin/_mocha -x src/PhantomWrapper.js --report lcovonly -- -t ' + MOCHA_TIMEOUT_MS + ' -x src/PhantomWrapper.js --root src/ test/ && cat ./coverage/lcov.info | ./node_modules/coveralls/bin/coveralls.js' 25 | }, 26 | 'check-coverage': { 27 | command: 'istanbul check-coverage --lines ' + COVERAGE_THRESHOLD + ' --statements ' + COVERAGE_THRESHOLD + ' --functions ' + COVERAGE_THRESHOLD + ' --branches ' + COVERAGE_THRESHOLD + ' ./coverage/coverage.json' 28 | } 29 | }, 30 | clean: { 31 | coverage: { 32 | src: ['coverage/'] 33 | } 34 | }, 35 | watch: { 36 | common: { 37 | files: ['src/**/*.js', 'test/**/*.js', 'Gruntfile.js'], 38 | tasks: ['test'] 39 | } 40 | }, 41 | jshint: { 42 | all: ['src/**/*.js', 'test/**/*.js'] 43 | }, 44 | mochaTest: { 45 | abstractScraper: { 46 | src: 'test/AbstractScraper.js', 47 | options: MOCHA_OPTIONS 48 | }, 49 | staticScraper: { 50 | src: 'test/StaticScraper.js', 51 | options: MOCHA_OPTIONS 52 | }, 53 | dynamicScraper: { 54 | src: 'test/DynamicScraper.js', 55 | options: MOCHA_OPTIONS 56 | 
}, 57 | scraperPromise: { 58 | src: 'test/ScraperPromise.js', 59 | options: MOCHA_OPTIONS 60 | }, 61 | router: { 62 | src: 'test/Router.js', 63 | options: MOCHA_OPTIONS 64 | }, 65 | scraperError: { 66 | src: 'test/ScraperError.js', 67 | options: MOCHA_OPTIONS 68 | }, 69 | commandLine: { 70 | src: 'test/commandLine.js', 71 | options: MOCHA_OPTIONS 72 | }, 73 | all: { 74 | src: ['test/AbstractScraper.js', 'test/StaticScraper.js', 'test/DynamicScraper.js', 'test/ScraperPromise.js', 'test/Router.js', 'test/ScraperError.js', 'test/commandLine.js'], 75 | options: MOCHA_OPTIONS 76 | } 77 | } 78 | }); 79 | 80 | var server; 81 | 82 | grunt.registerTask('serve', 'Starts express testing server', function() { 83 | server = testServer(grunt); 84 | }); 85 | 86 | grunt.registerTask('unserve', function() { 87 | if (server) { 88 | server.close(); 89 | } 90 | }); 91 | 92 | grunt.registerTask('serve-and-test', ['serve', 'mochaTest:all', 'unserve']); 93 | 94 | grunt.registerTask('coverage', ['clean', 'jshint', 'serve', 'exec:coverage', 'exec:check-coverage', 'unserve']); 95 | grunt.registerTask('coveralls', ['clean', 'jshint', 'serve', 'exec:coveralls', 'exec:check-coverage', 'unserve']); 96 | 97 | grunt.registerTask('unit', ['jshint', 'serve-and-test']); 98 | grunt.registerTask('test', ['coverage']); 99 | 100 | grunt.registerTask('watch-all', ['serve', 'watch', 'unserve']); 101 | }; -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Copyright (C) 2013 Rui Gil 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | 
furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scraperjs 2 | [![Build Status](https://travis-ci.org/ruipgil/scraperjs.svg?branch=master)](https://travis-ci.org/ruipgil/scraperjs) [![Dependency Status](https://gemnasium.com/ruipgil/scraperjs.svg)](https://gemnasium.com/ruipgil/scraperjs) [![Coverage Status](https://coveralls.io/repos/ruipgil/scraperjs/badge.svg?branch=master)](https://coveralls.io/r/ruipgil/scraperjs?branch=master) [![NPM version](https://badge.fury.io/js/scraperjs.svg)](http://badge.fury.io/js/scraperjs) [![Inline docs](http://inch-ci.org/github/ruipgil/scraperjs.svg?branch=master)](http://inch-ci.org/github/ruipgil/scraperjs) 3 | 4 | Scraperjs is a web scraper module that makes scraping the web an easy job. 
5 | 6 | ## Installing 7 | 8 | ``` 9 | npm install scraperjs 10 | ``` 11 | 12 | If you would like to test (this is optional and requires the installation with the ``` --save-dev ``` tag), 13 | ``` 14 | grunt test 15 | ``` 16 | 17 | *To use some features you’ll need to install [phantomjs](http://phantomjs.org/download.html), if you haven’t already* 18 | 19 | # Getting started 20 | 21 | Scraperjs exposes two different scrapers, 22 | + a **StaticScraper**, that is light fast and with a low footprint, however it doesn't allow for more complex situations, like scraping dynamic content. 23 | + a **DynamicScraper**, that is a bit more heavy, but allows you to scrape dynamic content, like in the browser console. 24 | both scrapers expose a *very* similar API, with some minor differences when it comes to scraping. 25 | 26 | ## Lets scrape [Hacker News](https://news.ycombinator.com/), with both scrapers. 27 | 28 | Try to spot the differences. 29 | 30 | ### Static Scraper 31 | 32 | ```javascript 33 | var scraperjs = require('scraperjs'); 34 | scraperjs.StaticScraper.create('https://news.ycombinator.com/') 35 | .scrape(function($) { 36 | return $(".title a").map(function() { 37 | return $(this).text(); 38 | }).get(); 39 | }) 40 | .then(function(news) { 41 | console.log(news); 42 | }) 43 | ``` 44 | 45 | The ```scrape``` promise receives a function that will scrape the page and return the result, it only receives jQuery a parameter to scrape the page. Still, very powerful. It uses [cheerio](https://github.com/cheeriojs/cheerio) to do the magic behind the scenes. 
46 | 47 | ### Dynamic Scraper 48 | 49 | ```javascript 50 | var scraperjs = require('scraperjs'); 51 | scraperjs.DynamicScraper.create('https://news.ycombinator.com/') 52 | .scrape(function($) { 53 | return $(".title a").map(function() { 54 | return $(this).text(); 55 | }).get(); 56 | }) 57 | .then(function(news) { 58 | console.log(news); 59 | }) 60 | ``` 61 | 62 | Again, the ```scrape``` promise receives a function to scrape the page, the only difference is that, because we're using a dynamic scraper, the scraping function is [sandboxed](https://github.com/sgentle/phantomjs-node/wiki#evaluating-pages) only with the page scope, so **no closures!** This means that in *this* (and only in this) scraper you can't call a function that has not been defined inside the scraping function. Also, the result of the scraping function must be [JSON-serializable](https://github.com/sgentle/phantomjs-node/wiki#evaluating-pages). 63 | We use [phantom](https://github.com/sgentle/phantomjs-node) and [phantomjs](https://github.com/ariya/phantomjs) to make it happen, we also inject jQuery for you. 64 | 65 | However, it's possible to [pass JSON-serializable data](**Example**) to *any* scraper. 66 | 67 | *The ```$``` variable received by the scraping function is, only for the dynamic scraper, hardcoded.* 68 | 69 | ## Show me the way! (aka Routes) 70 | 71 | For a more flexible scraping and crawling of the web sometimes we need to go through multiple web sites and we don't want to map every possible url format. For that scraperjs provides the Router class. 
72 | 73 | ### Example 74 | 75 | ```javascript 76 | var scraperjs = require('scraperjs'), 77 | router = new scraperjs.Router(); 78 | 79 | router 80 | .otherwise(function(url) { 81 | console.log("Url '"+url+"' couldn't be routed."); 82 | }); 83 | 84 | var path = {}; 85 | 86 | router.on('https?://(www.)?youtube.com/watch/:id') 87 | .createStatic() 88 | .scrape(function($) { 89 | return $("a").map(function() { 90 | return $(this).attr("href"); 91 | }).get(); 92 | }) 93 | .then(function(links, utils) { 94 | path[utils.params.id] = links 95 | }) 96 | 97 | router.route("https://www.youtube.com/watch/YE7VzlLtp-4", function() { 98 | console.log("i'm done"); 99 | }); 100 | ``` 101 | 102 | Code that allows for parameters in paths is from the project [Routes.js](https://github.com/aaronblohowiak/routes.js), information about the [path formating](https://github.com/aaronblohowiak/routes.js#path-formats) is there too. 103 | 104 | # API overview 105 | 106 | Scraperjs uses promises whenever possible. 107 | 108 | #### StaticScraper, DynamicScraper and ScraperPromise 109 | 110 | So, the scrapers should be used with the ScraperPromise. By creating a scraper 111 | ```javascript 112 | var scraperPromise = scraperjs.StaticScraper.create() // or DynamicScraper 113 | ``` 114 | The following promises can be made over it, they all return a scraper promise, 115 | + ```onStatusCode(code:number, callback:function(utils:Object))```, executes the callback when the status code is equal to the code, 116 | + ```onStatusCode(callback:function(code:number, utils:Object))```, executes the callback when receives the status code. 
The callback receives the current status code, 117 | + ```delay(time:number, callback:function(last:?, utils:Object))```, delays the execution of the chain by time (in milliseconds), 118 | + ```timeout(time:number, callback:function(last:?, utils:Object))```, executes the callback function after time (in milliseconds), 119 | + ```then(lastResult:?, callback:function(last:?, utils:Object))```, executes the callback after the last promise, 120 | + ```async(callback:function(last:?, done:function(result:?, err:?), utils))```, executes the callback, stopping the promise chain, resuming it when the ```done``` function is called. You can provide a result to be passed down the promise chain, or an error to trigger the catch promise, 121 | + ```catch(callback:function(error:Error, utils:Object))```, executes the callback when there was an error, errors block the execution of the chain even if the promise was not defined, 122 | + ```done(callback:function(last:?, utils:Object))```, executes the callback at the end of the promise chain, this is always executed, even if there was an error, 123 | + ```get(url:string)```, makes a simple HTTP GET request to the url. This promise should be used only once per scraper. 124 | + ```request(options:Object)```, makes a (possibly) more complex HTTP request, scraperjs uses the [request](https://github.com/mikeal/request) module, and this method is a simple wrapper of ```request.request()```. This promise should be used only once per scraper. 125 | + ```scrape(scrapeFn:function(...?), callback:function(result:?, utils:Object)=, ...?)```, scrapes the page. It executes the scrapeFn and passes it's result to the callback. When using the StaticScraper, the scrapeFn receives a jQuery function that is used to scrape the page. When using the DynamicScraper, the scrapeFn doesn't receive anything and can only return a [JSON-serializable](https://github.com/sgentle/phantomjs-node/wiki#evaluating-pages) type. 
Optionally an arbitrary number of arguments can be passed to the scraping function. The callback may be omitted, if so, the result of the scraping may be accessed with the ``` then ``` promise or ``` utils.lastReturn ``` in the next promise. 126 | 127 | All callback functions receive as their last parameter a utils object, with it the parameters of an url from a router can be accessed. Also the chain can be stopped. 128 | ```javascript 129 | DynamicScraper.create() 130 | .get("http://news.ycombinator.com") 131 | .then(function(_, utils) { 132 | utils.stop(); 133 | // utils.params.paramName 134 | }); 135 | ``` 136 | 137 | The promise chain is fired with the same sequence it was declared, with the exception of the promises get and request that fire the chain when they've received a valid response, and the promises ``` done ``` and ``` catch ```, which were explained above. 138 | 139 | You can also waterfall values between promises by returning them (with the exception of the promise ```timeout```, that will always return ```undefined```) and it can be access through ```utils.lastReturn```. 140 | 141 | ##### The ``` utils ``` object 142 | 143 | You've seen the ``` utils ``` object that is passed to promises, it provides useful information and methods to your promises. Here's what you can do with it: 144 | + ``` .lastResult ```, value returned in the last promise 145 | + ``` .stop() ```, function to stop the promise chain, 146 | + ``` .url ```, url provided to do the scraper, 147 | + ``` .params ```, object with the parameters defined in the router matching pattern. 148 | 149 | 150 | ##### A more powerful DynamicScraper. 151 | 152 | When lots of instances of DynamicScraper are needed, it's creation gets really heavy on resources and takes a lot of time. To make this more lighter you can use a *factory*, that will create only one PhantomJS instance, and every DynamicScraper will request a page to work with. 
To use it you must start the factory before any DynamicScraper is created, ``` scraperjs.DynamicScraper.startFactory() ``` and then close the factory after the execution of your program, ``` scraperjs.DynamicScraper.closeFactory() ```. 153 | To make the scraping function more robust you can inject code into the page, 154 | ```js 155 | var ds = scraperjs.DynamicScraper 156 | .create('http://news.ycombinator.com') 157 | .async(function(_, done, utils) { 158 | utils.scraper.inject(__dirname+'/path/to/code.js', function(err) { 159 | // in this case if there was an error won't fire catch promise. 160 | if(err) { 161 | done(err); 162 | } else { 163 | done(); 164 | } 165 | }); 166 | }) 167 | .scrape(function() { 168 | return functionInTheCodeInjected(); 169 | }) 170 | .then(function(result) { 171 | console.log(result); 172 | }); 173 | ``` 174 | 175 | #### Router 176 | 177 | The router should be initialized like a class 178 | ```javascript 179 | var router = new scraperjs.Router(options); 180 | ``` 181 | 182 | The options object is optional, and these are the options: 183 | + ``` firstMatch ```, a boolean, if true the routing will stop once the first path is matched, the default is false. 184 | 185 | The following promises can be made over it, 186 | + ```on(path:string|RegExp|function(url:string))```, makes the promise for the match url or regular expression, alternatively you can use a function to accept or not a passed url. The promises ```get``` or ```request``` and ```createStatic``` or ```createDynamic``` are expected after the on promise. 
187 | + ```get()```, makes so that the page matched will be requested with a simple HTTP request, 188 | + ```request(options:Object)```, makes so that the page matched will be requested with a possible more complex HTTP request, , scraperjs uses the [request](https://github.com/mikeal/request) module, and this method is a simple wrapper of [request.request()](https://github.com/mikeal/request#requestoptions-callback), 189 | + ```createStatic()```, associates a static scraper to use to scrape the matched page, this returns ScraperPromise, so any promise made from now on will be made over a ScraperPromise of a StaticScraper. Also the ```done``` promise of the scraper will not be available. 190 | + ```createDynamic()```, associates a dynamic scraper to use to scrape the matched page, this returns ScraperPromise, so any promise made from now on will be made over a ScraperPromise of a DynamicScraper. Also the ```done``` promise of the scraper will not be available. 191 | + ```route(url:string, callback:function(boolean))```, routes an url through all matched paths, calls the callback when it's executed, true is passed if the route was successful, false otherwise. 192 | + ```use(scraperInstance:ScraperPromise)```, uses a ScraperPromise already instantiated. 193 | + ```otherwise(callback:function(url:string))```, executes the callback function if the routing url didn't match any path. 194 | + ```catch(callback:function(url:string, error:Error))```, executes the callback when an error occurred on the routing scope, not on any scraper, for that situations you should use the ```catch``` promise of the scraper. 195 | 196 | #### Notes 197 | 198 | * Scraperjs **always** fetches the document with `request`, and then when using a DynamicScraper, leverages phantom's `setContent()` to set the body of the page object. This will result in subtly different processing of web pages compared to directly loading a URL in PhantomJS. 
199 | 200 | #### More 201 | 202 | Check the [examples](./doc/examples), the [tests](./test) or just dig into the code, it's well documented and it's simple to understand. 203 | 204 | # Dependencies 205 | 206 | As mentioned above, scraperjs uses some dependencies to do the heavy work, such as 207 | + [```async```](https://github.com/caolan/async), for flow control 208 | + [```request```](https://github.com/mikeal/request), to make HTTP requests, again, if you want more complex requests see it's [documentation](https://github.com/mikeal/request#requestoptions-callback) 209 | + [```phantom```](https://github.com/sgentle/phantomjs-node) + [```phantomjs```](https://github.com/ariya/phantomjs), phantom is an awesome module that links node to phantom, used in the DynamicScraper 210 | + [```cheerio```](https://github.com/cheeriojs/cheerio), light and fast DOM manipulation, used to implement the StaticScraper 211 | + [```jquery```](https://github.com/jquery/jquery), to include jquery in the DynamicScraper 212 | + although [```Routes.js```](https://github.com/aaronblohowiak/routes.js) is great, scraperjs doesn't use it to maintain it's "interface layout", but the code to transform the path given on the on promise to regular expressions is from them 213 | 214 | # License 215 | 216 | This project is under the [MIT](./LICENSE) license. 
217 | -------------------------------------------------------------------------------- /bin/scraperjs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | /* 4 | scraperjs http://example.com --text #title-id // returns array with text of #title-id's 5 | scraperjs http://example.com --html #title-id // returns the html 6 | scraperjs http://example.com --text #title-id -D // uses dynamic scraper 7 | scraperjs http://example.com --text #title-id -S // uses static scraper 8 | scraperjs http://example.com --eval "function($) { return $('#title-id').map(function() { return $(this).text(); }).get(); }" 9 | scraperjs http://example.com http://example.org ... --text #title-id 10 | 11 | 12 | 13 | scraperjs http://example.com --delay 15 --text #title-id -D 14 | scraperjs http://example.com --text #title-id --then "function(utils) { return utils.lastResult; }" --delay 15 -D 15 | */ 16 | 17 | var program = require('commander'), 18 | sjs = require('../'), 19 | async = require('async'); 20 | 21 | program 22 | .version(require('../package.json').version) 23 | .usage(['url [url ...] --text --selector -s', 24 | 'url [url ...] --html --selector -s', 25 | 'url [url ...] --attr --selector -s', 26 | 'url [url ...] --eval -s'].join('\n\t\t')) 27 | .option('--selector ', 'Selects an element') 28 | .option('--text', 'Extracts the text from the selector element.') 29 | .option('--html', 'Extracts the html from the selector element.') 30 | .option('--attr ', 'Extracts an atribute of the selector element.') 31 | .option('--eval ', 'Uses a function to scrape, providing only it\'s body.') 32 | .option('-s, --static', 'Uses the static scraper. Used by default.', true) 33 | .option('-d, --dynamic', 'Uses the dynamic scraper.', false) 34 | .parse(process.argv); 35 | 36 | var urls = program.args; 37 | var ScraperType = (program.dynamic ? 
'Dynamic' : 'Static' ) + 'Scraper'; 38 | var fn, args; 39 | 40 | if(program.selector && (program.text || program.html || program.attr)) { 41 | fn = function athScraper($, obj) { 42 | return $(obj.selector).map(function(){ 43 | return obj.which?$(this)[obj.what](obj.which):$(this)[obj.what](); 44 | }).get(); 45 | }; 46 | args = { 47 | selector: program.selector, 48 | what: undefined, 49 | which: undefined 50 | }; 51 | 52 | var temp; 53 | if(program.attr) { 54 | args.what = 'attr'; 55 | args.which = program.attr; 56 | }else if(program.html) { 57 | args.what = 'html'; 58 | }else if(program.text){ 59 | args.what = 'text'; 60 | } 61 | } else if (program.eval) { 62 | fn = new Function("$", "obj", program.eval); 63 | args = {}; 64 | } else { 65 | console.error('Invalid usage. Run scraperjs --help for help'); 66 | return; 67 | } 68 | 69 | var scraper = sjs[ScraperType].create(); 70 | scraper.scrape.apply(scraper, [fn, function(result) { 71 | console.log(JSON.stringify(result)); 72 | }, args]); 73 | async.eachSeries(urls, function(url, done) { 74 | scraper 75 | .get(url) 76 | .catch(function(err) { 77 | done(err); 78 | }) 79 | .done(function() { 80 | done(); 81 | }); 82 | }, function(err) { 83 | if(err) { 84 | console.error(err.stack); 85 | } 86 | }); -------------------------------------------------------------------------------- /doc/examples/ErrorHandling.js: -------------------------------------------------------------------------------- 1 | /** 2 | * DISCLAIMER 3 | * This is a relatively simple example, to illustrate some of the 4 | * possible functionalities and how to achieve them. 5 | * There is no guarantee that this example will provide useful 6 | * results. 7 | * Use this example with and at your own responsibility. 8 | * 9 | * In this example we run through some urls and try to extract their 10 | * 30th link. It demonstrates how to deal with errors. 
11 | * 12 | * To run: 13 | * 'node ErrorHandling.js' 14 | */ 15 | 16 | var sjs = require('../../'); 17 | 18 | var log = console.log; 19 | var router = new sjs.Router(); 20 | 21 | function create30thLinkError() { 22 | var err = new Error("Page doesn't have 30th link"); 23 | err.code = '30THLINK'; 24 | return err; 25 | } 26 | 27 | router 28 | .on('*') 29 | .createStatic() 30 | .onStatusCode(function(code, utils) { 31 | // if it's not Ok pause and log. 32 | if (code != 200) { 33 | log("Page '%s' has status code %d", utils.url, code); 34 | utils.stop(); 35 | } 36 | }) 37 | .catch(function(err, utils) { 38 | // deal identify with errors and recover or panic 39 | // this has the same problems as js error handling, 40 | // it's messy and ugly 41 | switch (err.code) { 42 | case 'ENOTFOUND': 43 | log("Page '%s' not found", err.hostname); 44 | break; 45 | case '30THLINK': 46 | log("Page '%s' doesn't have a 30th link", utils.url); 47 | break; 48 | default: 49 | log('Unknown error found %s', err); 50 | } 51 | }) 52 | .scrape(function($) { 53 | var thirty = $('a')[30]; 54 | if (thirty) { 55 | return $(thirty).attr('href'); 56 | } else { 57 | throw create30thLinkError(); 58 | } 59 | }) 60 | .then(function(thirty, utils) { 61 | log("'%s' has '%s' as it's 30th link", utils.url, thirty); 62 | }); 63 | 64 | // Front page of google doesn't have a 30th link 65 | router.route('http://google.com'); 66 | // This page doesn't exist 67 | router.route('http://wouvoogle.com'); 68 | // Hacker new have a 30th link 69 | router.route('http://news.ycombinator.com'); -------------------------------------------------------------------------------- /doc/examples/HackerNews.js: -------------------------------------------------------------------------------- 1 | var sjs = require('../../src/Scraper'); 2 | /* 3 | Scrape the news in Hacker News. 
4 | */ 5 | sjs.StaticScraper 6 | .create('https://news.ycombinator.com') 7 | .scrape(function($) { 8 | return $('.title a').map(function() { 9 | return $(this).text(); 10 | }).get().filter(function(elm) { 11 | return elm != 'More'; 12 | }); 13 | }) 14 | .then(function(news) { 15 | news.forEach(function(elm) { 16 | console.log(elm); 17 | }); 18 | }); -------------------------------------------------------------------------------- /doc/examples/IMDBOpeningThisWeek.js: -------------------------------------------------------------------------------- 1 | /* global $ */ 2 | var sjs = require('../../'); 3 | /** 4 | * Displays the movies opening this week, from IMDB. 5 | * This example is inspired by user jasode at Hacker News. 6 | * {@link https://news.ycombinator.com/item?id=8193522} 7 | * Note that the list of movies opening this week is loaded 8 | * dynamically. A static scraper can't scrape this content, this way. 9 | */ 10 | sjs.DynamicScraper 11 | .create('https://www.imdb.com') 12 | .scrape(function($) { 13 | return $('.otw-title').map(function() { 14 | return $(this).text().trim(); 15 | }).get(); 16 | }) 17 | .then(function(movies) { 18 | movies.forEach(function(movie) { 19 | console.log(movie); 20 | }); 21 | }); -------------------------------------------------------------------------------- /doc/examples/LinkGetter.js: -------------------------------------------------------------------------------- 1 | var sjs = require('../../src/Scraper'), 2 | url = process.argv.slice(2)[0]; 3 | 4 | if(!url) { 5 | console.log('Usage: node LinkGetter.js '); 6 | return; 7 | } 8 | 9 | /* 10 | Get all the links in a page. 
11 | */ 12 | sjs.StaticScraper 13 | .create() 14 | .onStatusCode(function(code) { 15 | console.log(code); 16 | }) 17 | .scrape(function($) { 18 | return $('a').map(function() { 19 | return $(this).attr('href'); 20 | }).get(); 21 | }) 22 | .then(function(links) { 23 | links.forEach(function(link) { 24 | console.log(link); 25 | }); 26 | }) 27 | .get(url); -------------------------------------------------------------------------------- /doc/examples/WikimediaScraper.js: -------------------------------------------------------------------------------- 1 | /** 2 | * DISCLAIMER 3 | * This is a relatively simple example, to illustrate some of the 4 | * possible functionalities and how to achieve them. 5 | * There is no guarantee that this example will provide useful 6 | * results. 7 | * Use this example with and at your own responsibility. 8 | * 9 | * In this example we run through a list of links, if they have a 10 | * route defined they will be scraped. Their title, language and 11 | * first paragraph. 12 | * 13 | * To run: 14 | * 'node WikimediaScraper.js link1 [... 
linkN]' 15 | */ 16 | 17 | var sjs = require('../../src/Scraper'), 18 | async = require('../../node_modules/async'), 19 | parseUrl = require('url').parse, 20 | urls = process.argv.slice(2); 21 | 22 | if(!urls || !urls.length) { 23 | console.log("Usage: node WikimediaScraper.js url [...url]"); 24 | return; 25 | } 26 | 27 | var IMDB_SELECTOR = '[itemprop=description]', 28 | gatheredInformation = [], 29 | unknownRoutes = []; 30 | 31 | var router = new sjs.Router({ 32 | firstMatch: true 33 | }); 34 | 35 | router 36 | .on('https?://:lang.wikipedia.org/wiki/:article') 37 | .get() 38 | .createStatic() 39 | // if the status code is different from OK (200) we stop 40 | .onStatusCode(function(statusCode, utils) { 41 | if(statusCode!==200) { 42 | utils.stop(); 43 | } 44 | }) 45 | .scrape(function($) { 46 | return { 47 | title: $('h1').first().text(), 48 | text: $('p').first().text() 49 | }; 50 | }) 51 | .then(function(last, utils) { 52 | last.lang = utils.params.lang; 53 | return last; 54 | }); 55 | 56 | // the same functionality than the above 57 | var scraperForWiki = sjs.StaticScraper 58 | .create() 59 | .onStatusCode(function(statusCode, utils) { 60 | if(statusCode!==200) { 61 | utils.stop(); 62 | } 63 | }) 64 | .scrape(function($) { 65 | return { 66 | title: $('h1').first().text(), 67 | text: $('p').first().text() 68 | }; 69 | }) 70 | .then(function(last, utils) { 71 | if(utils.params) { 72 | last.lang = utils.params.lang; 73 | } else { 74 | last.lang = "?"; 75 | } 76 | return last; 77 | }); 78 | 79 | router 80 | .on(function(url) { 81 | return parseUrl(url).host === 'en.wikiquote.com'; 82 | }) 83 | .use(scraperForWiki); 84 | 85 | router 86 | .on('https?://:lang.wikinews.org/wiki/:place') 87 | .use(scraperForWiki); 88 | 89 | router.otherwise(function(url) { 90 | unknownRoutes.push(url); 91 | }); 92 | 93 | async.eachLimit(urls, 2, function(url, done) { 94 | router.route(url, function(found, returned) { 95 | if(found && returned) { 96 | gatheredInformation.push(returned); 
97 | } 98 | done(); 99 | }); 100 | }, function(err) { 101 | if(err) { 102 | return; 103 | } 104 | 105 | gatheredInformation.forEach(function(item) { 106 | console.log(item.title.toUpperCase()+" ("+item.lang+")"); 107 | console.log("\t"+item.text); 108 | }); 109 | }) -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scraperjs", 3 | "version": "1.2.0", 4 | "description": "A complete and versatile web scraper.", 5 | "main": "./src/Scraper.js", 6 | "keywords": [ 7 | "scraper", 8 | "scraping", 9 | "web" 10 | ], 11 | "bin": "./bin/scraperjs", 12 | "scripts": { 13 | "test": "grunt test" 14 | }, 15 | "repository": { 16 | "type": "git", 17 | "url": "git://github.com/ruipgil/scraperjs.git" 18 | }, 19 | "bugs": { 20 | "url": "https://github.com/ruipgil/scraperjs/issues" 21 | }, 22 | "gitHead": "c58be022438e49564597bbb3ad7c036d610744f8", 23 | "homepage": "https://github.com/ruipgil/scraperjs", 24 | "author": "Rui Gil", 25 | "license": "MIT", 26 | "dependencies": { 27 | "async": "^1.5.0", 28 | "cheerio": "^0.19.0", 29 | "jquery": "^2.1.4", 30 | "phantom": "^0.8.4", 31 | "request": "^2.67.0", 32 | "commander": "^2.9.0" 33 | }, 34 | "readmeFilename": "README.md", 35 | "directories": { 36 | "test": "test" 37 | }, 38 | "devDependencies": { 39 | "coveralls": "^2.11.1", 40 | "express": "^4.8.3", 41 | "grunt": "^0.4.5", 42 | "grunt-cli": "~0.1.9", 43 | "grunt-contrib-clean": "^0.6.0", 44 | "grunt-contrib-jshint": "^0.10.0", 45 | "grunt-contrib-watch": "^0.6.1", 46 | "grunt-exec": "^0.4.6", 47 | "grunt-mocha-test": "^0.11.0", 48 | "istanbul": "^0.3.0", 49 | "mocha": "^1.21.4" 50 | }, 51 | "engines": { 52 | "node": ">=0.10" 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/AbstractScraper.js: -------------------------------------------------------------------------------- 1 | var request = 
require('request'), 2 | ScraperPromise = require('./ScraperPromise'); 3 | 4 | /** 5 | * An abstract scraper, this class should not be used directly as a 6 | * scraper, instead a concrete scraper should inherit or use this 7 | * class as a composite this class. 8 | * 9 | * @constructor 10 | */ 11 | var AbstractScraper = function() { 12 | /** 13 | * Status code of the last requested page. 14 | * 15 | * @type {!number} 16 | * @protected 17 | */ 18 | this.statusCode = null; 19 | /** 20 | * Response of the last requested page. 21 | * 22 | * @type {!Object} 23 | * @protected 24 | */ 25 | this.response = null; 26 | /** 27 | * Body of the last webpage, as a string. 28 | * 29 | * @type {!string} 30 | * @protected 31 | */ 32 | this.body = null; 33 | /** 34 | * URL. 35 | * 36 | * @type {!string} 37 | * @protected 38 | */ 39 | this.url = ''; 40 | }; 41 | AbstractScraper.prototype = { 42 | constructor: AbstractScraper, 43 | /** 44 | * Executes a simple HTTP GET request to the given url. 45 | * 46 | * @param {!string} url URL to request. 47 | * @param {!function(Error=)} callback Function to call when the 48 | * request is done. If the request was successful then it's 49 | * called with no arguments or null argument. Otherwise, if 50 | * there was an error the it's called with one argument not 51 | * null, that should be an error instance. 52 | * @return {!AbstractScraper} This scraper. 53 | * @public 54 | */ 55 | get: function(url, callback) { 56 | var that = this; 57 | request.get(url, function processGet(error, response, body) { 58 | if (error) { 59 | callback(error); 60 | } else { 61 | that.response = response; 62 | that.statusCode = response.statusCode; 63 | that.body = body; 64 | that.url = response.request.href; 65 | that.loadBody(function(err) { 66 | callback(err); 67 | }); 68 | } 69 | }); 70 | return this; 71 | }, 72 | /** 73 | * Executes an HTTP request to an url. 
This method allows for the 74 | * powerful use of the request package {@link https://github.com/mikeal/request}, 75 | * since it's basically a wrapper around the method request. 76 | * For more information about how it's used refer to {@link https://github.com/mikeal/request#requestoptions-callback}. 77 | * 78 | * @param {!(Object|string)} options Options of the request. 79 | * @param {!function(Error=)} callback Function to call when the 80 | * request is done. If the request was successful then it's 81 | * called with no arguments or null argument. Otherwise, if 82 | * there was an error the it's called with one argument not 83 | * null, that should be an error instance. 84 | * @return {!AbstractScraper} This scraper. 85 | * @public 86 | */ 87 | request: function(options, callback) { 88 | var that = this; 89 | request(options, function processRequest(error, response, body) { 90 | if (error) { 91 | callback(error); 92 | } else { 93 | that.response = response; 94 | that.statusCode = response.statusCode; 95 | that.body = body; 96 | that.url = response.request.href; 97 | that.loadBody(function(err) { 98 | callback(err); 99 | }); 100 | } 101 | }); 102 | return this; 103 | }, 104 | /** 105 | * Gets the status code of the last request. 106 | * 107 | * @return {?number} The status code, if a there was a successful 108 | * request, null otherwise. 109 | * @public 110 | */ 111 | getStatusCode: function() { 112 | return this.statusCode; 113 | }, 114 | /** 115 | * Gets the response of the last request. 116 | * 117 | * @return {?number} The status code, if a there was a successful 118 | * request, null otherwise. 119 | * @public 120 | */ 121 | getResponse: function() { 122 | return this.response; 123 | }, 124 | /** 125 | * Gets the body of the last request. 126 | * 127 | * @return {?number} The status code, if a there was a successful 128 | * request, null otherwise. 
129 | * @public 130 | */ 131 | getBody: function() { 132 | return this.body; 133 | }, 134 | /* jshint unused:false */ 135 | /** 136 | * Loads the string, to a representation that can be used in the 137 | * scraping process. 138 | * 139 | * @param {!function()} done Callback function, for when the body 140 | * is done loading. 141 | * @return {!AbstractScraper} This scraper. 142 | * @protected 143 | */ 144 | loadBody: function(done) { 145 | done(); 146 | return this; 147 | }, 148 | /** 149 | * Scrapes the webpage. According to a function, and a callback. 150 | * 151 | * @param {!function(...?)} scraperFn Function to scrape the 152 | * content. 153 | * @param {!function(?)} callbackFn Function that receives the 154 | * result of the scraping. 155 | * @param {!Array} args Aditional arguments to pass to the 156 | * scraping function. 157 | * @return {!AbstractScraper} This scraper. 158 | * @public 159 | */ 160 | scrape: function(scraperFn, callbackFn, args) {}, 161 | /** 162 | * Closes the scraper. 163 | * 164 | * @return {!AbstractScraper} This scraper. 165 | * @public 166 | */ 167 | close: function() {}, 168 | /** 169 | * Clones the scraper. 170 | * 171 | * @return {!AbstractScraper} Empty clone. 172 | * @public 173 | */ 174 | clone: function() {} 175 | }; 176 | /* jshint unused:true */ 177 | 178 | /** 179 | * Creates a scraper, based on a scraper type, and creates it's 180 | * promise. 181 | * 182 | * @param {!AbstractScraper} ScraperType Some concrete implementation 183 | * of an abstract scraper. 184 | * @param {!string=} url Url to make an HTTP GET request. 185 | * @return {!ScraperPromise} A scraper promise. 
186 | * @public 187 | * @static 188 | */ 189 | AbstractScraper.create = function(ScraperType, url, options) { 190 | var promise = new ScraperPromise(new ScraperType(options)); 191 | if (url) { 192 | promise.get(url); 193 | } 194 | return promise; 195 | }; 196 | 197 | module.exports = AbstractScraper; -------------------------------------------------------------------------------- /src/DynamicScraper.js: -------------------------------------------------------------------------------- 1 | var phantomOrig = require('phantom'), 2 | PhantomPoll = require('./PhantomPoll.js'), 3 | phantom = phantomOrig, 4 | AbstractScraper = require('./AbstractScraper'), 5 | ScraperError = require('./ScraperError'), 6 | PhantomWrapper = require('./PhantomWrapper'); 7 | 8 | /** 9 | * A dynamic scraper. This is a very versatile and powerful. This 10 | * solution is a little heavier and slower than the {@see StaticScraper}. 11 | * This version uses phantomjs {@link http://phantomjs.org/}, and {@link https://github.com/sgentle/phantomjs-node}. 12 | * 13 | * @extends {AbstractScraper} 14 | */ 15 | var DynamicScraper = function(options) { 16 | AbstractScraper.call(this); 17 | /** 18 | * Phantom instance. 19 | * 20 | * @type {?} 21 | * @private 22 | */ 23 | this.ph = null; 24 | /** 25 | * Phantom's page. 
26 | * 27 | * @type {?} 28 | * @private 29 | */ 30 | this.page = null; 31 | /** 32 | * Phantom's options 33 | * 34 | * @type {?} 35 | * @private 36 | */ 37 | this.options = { 38 | onStdout: function() {}, 39 | onStderr: function() {} 40 | }; 41 | for (var key in options) { this.options[key] = options[key]; } 42 | }; 43 | DynamicScraper.prototype = Object.create(AbstractScraper.prototype); 44 | /** 45 | * @override 46 | * @inheritDoc 47 | */ 48 | DynamicScraper.prototype.loadBody = function(done) { 49 | var that = this; 50 | phantom.create('--load-images=no', that.options, function(ph) { 51 | that.ph = ph; 52 | ph.createPage(function(page) { 53 | that.page = page; 54 | page.setContent(that.body, that.url, function() { 55 | that.inject(DynamicScraper.JQUERY_FILE, function(err) { 56 | done(err ? new ScraperError('Couldn\'t inject jQuery into the page.') : undefined); 57 | }); 58 | }); 59 | }); 60 | }); 61 | return this; 62 | }; 63 | /** 64 | * The scraper function has it's own scope (can't access outside its 65 | * own scope), and only JSON serializable information can be return 66 | * by the function. For more information {@link https://github.com/sgentle/phantomjs-node}. 67 | * 68 | * @param {!function(...?)} scraperFn Function to scrape the content. 69 | * It receives the args as parameters, if passed. 70 | * @param {!function(?)} callbackFn Function that receives the 71 | * result of the scraping. 72 | * @param {!Array=} args Additional arguments to pass to the scraping 73 | * function. They must be JSON serializable. 74 | * @param {!string=} stackTrace Stack trace to produce better error 75 | * messages. 76 | * @return {!AbstractScraper} This scraper. 
77 | * @override 78 | * @public 79 | */ 80 | DynamicScraper.prototype.scrape = function(scraperFn, callbackFn, args, stackTrace) { 81 | args = args || []; 82 | 83 | args.unshift(scraperFn.toString()); 84 | args.unshift(function(result) { 85 | if(result.error) { 86 | callbackFn(DynamicScraper.generateMockErrorMessage(result.error, stackTrace), null); 87 | } else { 88 | callbackFn(null, result.result); 89 | } 90 | }); 91 | args.unshift(PhantomWrapper); 92 | 93 | this.page.evaluate.apply(this.page, args); 94 | return this; 95 | }; 96 | /** 97 | * Injects a javascript file into the page. 98 | * 99 | * @param {!string} file File to inject. 100 | * @param {!function(!ScraperError=)} callback Function to be called 101 | * when the file has injected. If the injection fails, then the 102 | * first argument is not is a {@see ScraperError}. 103 | * @public 104 | */ 105 | DynamicScraper.prototype.inject = function(file, callback) { 106 | if (this.page) { 107 | this.page.injectJs(file, function(success) { 108 | if (success) { 109 | callback(); 110 | } else { 111 | callback(new ScraperError('Couldn\'t inject code, at "' + file + '".')); 112 | } 113 | }); 114 | } else { 115 | throw new ScraperError('Couldn\'t inject code, at "' + file + '". The page has not been initialized yet.'); 116 | } 117 | }; 118 | /** 119 | * @override 120 | * @inheritDoc 121 | */ 122 | DynamicScraper.prototype.close = function() { 123 | if (this.page) { 124 | this.page.close(); 125 | } 126 | if (this.ph) { 127 | this.ph.exit(); 128 | } 129 | return this; 130 | }; 131 | /** 132 | * @override 133 | * @inheritDoc 134 | */ 135 | DynamicScraper.prototype.clone = function() { 136 | return new DynamicScraper(); 137 | }; 138 | /** 139 | * Creates a dynamic scraper, wrapped around a scraper promise. 140 | * 141 | * @param {!string=} url If provided makes an HTTP GET request to the 142 | * given URL. 143 | * @return {!ScraperPromise} Scraper promise, with a dynamic scraper. 
144 | * @public 145 | * @static 146 | */ 147 | DynamicScraper.create = function(url, options) { 148 | return AbstractScraper.create(DynamicScraper, url, options); 149 | }; 150 | /** 151 | * Starts the factory. A factory should only be open once, and after 152 | * it's open it must be closed with {@see DynamicScraper#closeFactory}. 153 | * A factory makes so that there's only one instance of phantom at a 154 | * time, which makes the creation/usage of dynamic scrapers much 155 | * more efficient. 156 | * 157 | * @return {!DynamicScraper} 158 | * @public 159 | * @static 160 | */ 161 | DynamicScraper.startFactory = function() { 162 | phantom = new PhantomPoll(); 163 | return DynamicScraper; 164 | }; 165 | /** 166 | * Closes the factory. For more information {@see DynamicScraper#closeFactory} 167 | * 168 | * @return {!DynamicScraper} 169 | * @public 170 | * @static 171 | */ 172 | DynamicScraper.closeFactory = function() { 173 | if (phantom instanceof PhantomPoll) { 174 | phantom.close(); 175 | } 176 | phantom = phantomOrig; 177 | return DynamicScraper; 178 | }; 179 | /** 180 | * Generates a mock error message that is similar to one produced 181 | * by a function runned in node, and not phantomjs. 182 | * @param {!Object} err Error object sent by Phantom. 183 | * @param {!string} stackTrace Stack trace of where the promise was defined. 184 | * @return {!Error} Error message. 185 | * @private 186 | * @static 187 | */ 188 | DynamicScraper.generateMockErrorMessage = function(err, stackTrace) { 189 | var rg = /^\s{4}at ([^\s]+) \(([^\s]*)\:(\d+):(\d+)\)$/mg; 190 | rg.exec(stackTrace); 191 | var emsg = rg.exec(stackTrace); 192 | var sob = emsg[1]; 193 | var sfile = emsg[2]; 194 | var sline = emsg[3]; 195 | var sc = emsg[4]; 196 | 197 | var line = Number(sline) + Math.max(err.line-1, 0); 198 | 199 | var mock = new Error(err.message); 200 | // Prevents the use of a property named 'line'! 
201 | delete err.line; 202 | for(var x in err) { 203 | mock[x] = err[x]; 204 | } 205 | mock.stack = mock.stack.replace(/\t/g, ' '); 206 | 207 | var ats = mock.stack.split('\n'); 208 | ats.unshift(' at ' + sob + ' (' + sfile + ':' + line + ':' + sc + ')'); 209 | ats.unshift('Error' + (err.message?': '+err.message:'')); 210 | mock.stack = ats.join('\n'); 211 | 212 | return mock; 213 | }; 214 | /** 215 | * Location of the jquery file. 216 | * 217 | * @type {!string} 218 | * @private 219 | * @static 220 | */ 221 | DynamicScraper.JQUERY_FILE = require.resolve('jquery'); 222 | 223 | module.exports = DynamicScraper; 224 | -------------------------------------------------------------------------------- /src/PhantomPoll.js: -------------------------------------------------------------------------------- 1 | var phantom = require('phantom'); 2 | 3 | /** 4 | * This maintains only one PhantomJS instance. It works like a proxy 5 | * between the phantom package, and should expose the methods same 6 | * methods. An additional call to close the phantomJS instance 7 | * properly is needed. 8 | * 9 | * @constructor 10 | */ 11 | var PhantomPoll = function() { 12 | /** 13 | * The real PhantomJS instance. 14 | * 15 | * @type {?} 16 | * @private 17 | */ 18 | this.instance = null; 19 | /** 20 | * The PhantomJS instance is being created. 21 | * 22 | * @type {!boolean} 23 | * @private 24 | */ 25 | this.creating = false; 26 | /** 27 | * PhantomJS flags. 28 | * 29 | * @type {!string} 30 | * @private 31 | */ 32 | this.flags = ''; 33 | /** 34 | * PhantomJS options. 35 | * 36 | * @type {!Object} 37 | * @private 38 | */ 39 | this.options = { 40 | onStdout: function() {}, 41 | onStderr: function() {} 42 | }; 43 | /** 44 | * List of functions waiting to be called after the PhantomJS 45 | * instance is created. 
46 | * 47 | * @type {!Array.} 48 | * @private 49 | */ 50 | this.waiting = []; 51 | this._createInstance(); 52 | }; 53 | PhantomPoll.prototype = { 54 | constructor: PhantomPoll, 55 | /** 56 | * Creates a PhantomJS page, to be called with a callback, which 57 | * will receive the page. 58 | * 59 | * @param {!function(?)} callback Function to be called after the 60 | * page is created, it receives the page object. 61 | * @public 62 | */ 63 | createPage: function(callback) { 64 | if (this.instance) { 65 | this.instance.createPage(function(page) { 66 | callback(page); 67 | }); 68 | } else { 69 | var that = this; 70 | this._createInstance(function() { 71 | that.createPage(callback); 72 | }); 73 | } 74 | }, 75 | /** 76 | * Creates a PhantomJS instance. 77 | * 78 | * @param {!string} flags Creation flags. 79 | * @param {!Object} options Creation options. 80 | * @param {!function(?)} callback Function to be called after 81 | * the phantom instance is created. 82 | * 83 | * @public 84 | */ 85 | create: function(flags, options, callback) { 86 | this.flags = flags; 87 | this.options = options; 88 | callback(this); 89 | }, 90 | /** 91 | * Creates PhantomJS instance if needed be, and when it's done 92 | * triggers all the callbacks. 93 | * 94 | * @param {!function(?)} callback Function to be called when the 95 | * instance is created, if a phantom instance is waiting to be 96 | * created the callback will be added to a waiting list. 
97 | * @private 98 | */ 99 | _createInstance: function(callback) { 100 | if (this.creating && callback) { 101 | this.waiting.push(callback); 102 | } else { 103 | var that = this; 104 | this.creating = true; 105 | phantom.create(this.flags, this.options, function(ph) { 106 | that.instance = ph; 107 | that.creating = false; 108 | that.waiting.forEach(function(callback) { 109 | callback(ph); 110 | }); 111 | that.waiting = []; 112 | }); 113 | } 114 | }, 115 | /** 116 | * This is a function just to maintain the same interface 117 | * with the phantom module. If the PhantomJS instance needs be 118 | * destroyed the method close must be used. 119 | * 120 | * @public 121 | */ 122 | exit: function() {}, 123 | /** 124 | * Exits the phantom instance. 125 | * 126 | * @public 127 | */ 128 | close: function() { 129 | if (this.instance) { 130 | this.instance.exit(); 131 | } 132 | } 133 | }; 134 | 135 | module.exports = PhantomPoll; -------------------------------------------------------------------------------- /src/PhantomWrapper.js: -------------------------------------------------------------------------------- 1 | module.exports = function wrapper(fnStr) { 2 | var args = Array.prototype.slice.call(arguments); 3 | var rg = /^function\s+([a-zA-Z_$][a-zA-Z_$0-9]*)?\((.*?)\) {/g; 4 | var a = rg.exec(fnStr); 5 | var fnArgs = a[2].match(/([^,\s]+)/g) || []; 6 | var fnBody = fnStr.slice(fnStr.indexOf("{")+1, fnStr.lastIndexOf("}")); 7 | fnArgs.push(fnBody); 8 | var scraperFn = Function.apply(this, fnArgs); 9 | 10 | try { 11 | var gs = args.slice(1); 12 | gs.unshift($); 13 | var result = scraperFn.apply(this, gs); 14 | return { 15 | error: null, 16 | result: result 17 | }; 18 | } catch(e) { 19 | var errObj = { 20 | message: e.message 21 | }; 22 | for(var x in e) { 23 | errObj[x] = e[x]; 24 | } 25 | return { 26 | error: errObj, 27 | result: null 28 | }; 29 | } 30 | }; 31 | -------------------------------------------------------------------------------- /src/Router.js: 
-------------------------------------------------------------------------------- 1 | var async = require('async'), 2 | StaticScraper = require('./StaticScraper'), 3 | DynamicScraper = require('./DynamicScraper'), 4 | ScraperError = require('./ScraperError'); 5 | 6 | /** 7 | * Transforms a string into a regular expression. 8 | * This function is from the project Routes.js, under the MIT licence, 9 | * {@link https://github.com/aaronblohowiak/routes.js} it's present 10 | * in the file {@link https://github.com/aaronblohowiak/routes.js/blob/bdad0a1ae10d11981bb286550bb3b8a1a71909bd/dist/routes.js#L49}. 11 | * 12 | * @param {!string} path String path. 13 | * @param {!Array.} keys Empty array to be filled with the 14 | * keys ids. 15 | * @return {!RegExp} Regular expression. 16 | */ 17 | function pathToRegExp(path, keys) { 18 | path = path 19 | .concat('/?') 20 | .replace(/\/\(/g, '(?:/') 21 | .replace(/(\/)?(\.)?:(\w+)(?:(\(.*?\)))?(\?)?|\*/g, function(_, slash, format, key, capture, optional) { 22 | if (_ === '*') { 23 | return _; 24 | } 25 | 26 | keys.push(key); 27 | slash = slash || ''; 28 | return '' + (optional ? '' : slash) + '(?:' + (optional ? slash : '') + (format || '') + (capture || '([^/]+?)') + ')' + (optional || ''); 29 | }) 30 | .replace(/([\/.])/g, '\\$1') 31 | .replace(/\*/g, '(.*)'); 32 | return new RegExp('^' + path + '$', 'i'); 33 | } 34 | 35 | /** 36 | * Routes an url thought a valid, predefined, path. 37 | * 38 | * @param {!Object=} options Setup options. 39 | * @param {!boolean=} options.firstMatch If true the router will stop 40 | * at the first path matched. The default is false, and tries to 41 | * match every path. 42 | * @constructor 43 | */ 44 | var Router = function(options) { 45 | options = options || {}; 46 | /** 47 | * Stops routing at first successful match. 48 | * 49 | * @type {!boolean} 50 | * @private 51 | */ 52 | this.firstMatchStop = options.firstMatch || false; 53 | /** 54 | * Chain of promises. 
55 | * 56 | * @type {!Array.} 57 | * @private 58 | */ 59 | this.promises = []; 60 | /** 61 | * Otherwise promise. 62 | * 63 | * @type {!function(!string=)} 64 | * @private 65 | */ 66 | this.otherwiseFn = function() {}; 67 | }; 68 | Router.prototype = { 69 | constructor: Router, 70 | /** 71 | * Promise to url match. It's promise will fire only if the path 72 | * matches with and url being routed. 73 | * 74 | * @param {!(string|RegExp|function(string):?)} path The 75 | * path or regular expression to match an url. 76 | * Alternatively a function that receives the url to be matched 77 | * can be passed. If the result is false, or any 78 | * !!result===false), the path is considered valid and the 79 | * scraping should be done. If ,in case of a valid path, an Object is returned, it will be associated with the params of this 80 | * route/path. 81 | * For more information on the path matching refer to {@link https://github.com/aaronblohowiak/routes.js/blob/76bc517037a0321507c4d84a0cdaca6db31ebaa4/README.md#path-formats} 82 | * @return {!Router} This router. 83 | * @public 84 | */ 85 | on: function(path) { 86 | var callback; 87 | if (typeof path === 'function') { 88 | callback = path; 89 | } 90 | 91 | this.promises.push({ 92 | callback: callback ? function(url) { 93 | return callback(url); 94 | } : Router.pathMatcher(path), 95 | scraper: null, 96 | rqMethod: null 97 | }); 98 | return this.get(); 99 | }, 100 | /** 101 | * Sets the request method to be a simple HTTP GET. 102 | * {@see AbstractScraper.get} 103 | * 104 | * @return {!Router} This router. 105 | * @public 106 | */ 107 | get: function() { 108 | var length = this.promises.length, 109 | last = this.promises[length - 1]; 110 | if (length && last) { 111 | last.rqMethod = function(scraper, url) { 112 | scraper.get(url); 113 | }; 114 | return this; 115 | } else { 116 | throw new ScraperError(''); 117 | } 118 | }, 119 | /** 120 | * Sets the request method to be according to the options. 
121 | * {@see AbstractScraper.request} 122 | * 123 | * @param {!Object} options Request options. 124 | * @return {!Router} This router. 125 | * @public 126 | */ 127 | request: function(options) { 128 | var length = this.promises.length, 129 | last = this.promises[length - 1]; 130 | if (length && last) { 131 | last.rqMethod = function(scraper, url) { 132 | options.uri = url; 133 | scraper.request(options); 134 | }; 135 | return this; 136 | } else { 137 | throw new ScraperError(''); 138 | } 139 | }, 140 | /** 141 | * A promise to be triggered when none of the paths where matched. 142 | * This is a one time promise, which means that the last promise 143 | * is gonna be the one to be executed. 144 | * 145 | * @param {!function(!string=)} callback Function with the url as 146 | * a parameter. 147 | * @return {!Router} This router. 148 | * @public 149 | */ 150 | otherwise: function(callback) { 151 | this.otherwiseFn = callback; 152 | return this; 153 | }, 154 | /** 155 | * Creates a static scraper, and associates it with the current 156 | * router promise chain. Note that this method returns a 157 | * {@see ScraperPromise} of a {@see StaticScraper}. 158 | * 159 | * @return {!ScraperPromise} A promise for the scraper. 160 | * @public 161 | */ 162 | createStatic: function() { 163 | var length = this.promises.length, 164 | last = this.promises[length - 1]; 165 | if (length && last && !last.scraper) { 166 | var ss = StaticScraper.create(); 167 | last.scraper = ss; 168 | return ss; 169 | } else { 170 | throw new ScraperError(''); 171 | } 172 | }, 173 | /** 174 | * Associates the current route with the a scraper (promise) 175 | * instance. Keep in mind that the done promise will not be 176 | * available. 177 | * 178 | * @param {!AbstractScraper} scraper A scraper instance to use. 179 | * @return {!Router} This router. 
180 | * @public 181 | */ 182 | use: function(scraper) { 183 | var length = this.promises.length, 184 | last = this.promises[length - 1]; 185 | if (length && last && !last.scraper) { 186 | last.scraper = scraper; 187 | return this; 188 | } else { 189 | throw new ScraperError(''); 190 | } 191 | }, 192 | /** 193 | * Creates a dynamic scraper, and associates it with the current 194 | * router promise chain. Note that this method returns a 195 | * {@see ScraperPromise} of a {@see DynamicScraper}. 196 | * 197 | * @return {!ScraperPromise} A promise for the scraper. 198 | * @public 199 | */ 200 | createDynamic: function() { 201 | var length = this.promises.length, 202 | last = this.promises[length - 1]; 203 | if (length && last && !last.scraper) { 204 | var ss = DynamicScraper.create(); 205 | last.scraper = ss; 206 | return ss; 207 | } else { 208 | throw new ScraperError(''); 209 | } 210 | }, 211 | /** 212 | * Routes a url through every path that matches it. 213 | * 214 | * @param {!string} url The url to route. 215 | * @param {!function(boolean)} callback Function to call when the 216 | * routing is complete. If any of the paths was found the 217 | * parameter is true, false otherwise. 218 | * @return {!Router} This router. 219 | * @public 220 | */ 221 | route: function(url, callback) { 222 | var that = this, 223 | atLeastOne = false, 224 | stopFlag = {}, 225 | lastReturn; 226 | callback = callback || function() {}; 227 | async.eachSeries(this.promises, function(promiseObj, done) { 228 | 229 | var matcher = promiseObj.callback, 230 | scraper, 231 | reqMethod = promiseObj.rqMethod; 232 | var result = matcher(url); 233 | if (!!result) { 234 | scraper = promiseObj.scraper.clone(); 235 | atLeastOne = true; 236 | scraper._setChainParameter(result); 237 | scraper.done(function(lr, utils) { 238 | lastReturn = lr; 239 | done(that.firstMatchStop ? 
stopFlag : undefined); 240 | }); 241 | reqMethod(scraper, url); 242 | } else { 243 | done(); 244 | } 245 | 246 | }, function() { 247 | if (!atLeastOne) { 248 | that.otherwiseFn(url); 249 | } 250 | callback(atLeastOne, lastReturn); 251 | }); 252 | return this; 253 | } 254 | }; 255 | /** 256 | * Creates a function to match a path against a string. 257 | * 258 | * @param {!(string|RegExp)} pathOrRE Pattern to match, if it's a 259 | * string it will be transformed into a regular expression. 260 | * @return {!function(string):(Object|booelan)} A matching function, 261 | * that given a string will check if it matches the path. If the 262 | * path has parameters it will return an object with the parameters 263 | * as keys and the values as the values of the parameters. An empty 264 | * object if there were no valid parameters or false if the path 265 | * doesn't match with the string. 266 | * @public 267 | * @static 268 | */ 269 | Router.pathMatcher = function(pathOrRE) { 270 | var pattern, 271 | keys = ['url']; 272 | if (pathOrRE instanceof RegExp) { 273 | pattern = pathOrRE; 274 | } else if (typeof pathOrRE === 'string') { 275 | pattern = pathToRegExp(pathOrRE, keys); 276 | } else { 277 | throw new ScraperError('A path must be a string or a regular expression.'); 278 | } 279 | 280 | return function patternMatchingFunction(url) { 281 | var match = pattern.exec(url); 282 | if (!match) { 283 | return false; 284 | } else { 285 | return keys.reduce(function(obj, value, index) { 286 | obj[value] = match[index]; 287 | return obj; 288 | }, {}); 289 | } 290 | }; 291 | }; 292 | 293 | module.exports = Router; 294 | -------------------------------------------------------------------------------- /src/Scraper.js: -------------------------------------------------------------------------------- 1 | var StaticScraper = require('./StaticScraper.js'), 2 | DynamicScraper = require('./DynamicScraper.js'), 3 | ScraperPromise = require('./ScraperPromise.js'), 4 | Router = 
require('./Router'); 5 | 6 | module.exports = { 7 | StaticScraper: StaticScraper, 8 | DynamicScraper: DynamicScraper, 9 | ScraperPromise: ScraperPromise, 10 | Router: Router 11 | }; -------------------------------------------------------------------------------- /src/ScraperError.js: -------------------------------------------------------------------------------- 1 | /** 2 | * A scraper error, to refer error occurred in the scope of this 3 | * package. For more information about the error use it's message 4 | * property. 5 | * 6 | * @param {!string} message Error message. 7 | * @extends {Error} 8 | */ 9 | var ScraperError = function(message) { 10 | /** 11 | * Error message. 12 | * 13 | * @type {!string} 14 | * @public 15 | */ 16 | this.message = message; 17 | /** 18 | * This type. 19 | * 20 | * @type {!string} 21 | * @public 22 | */ 23 | this.name = 'ScraperError'; 24 | /** 25 | * Stack message. 26 | * 27 | * @type {!string} 28 | * @public 29 | */ 30 | this.stack = (new Error()).stack; 31 | }; 32 | ScraperError.prototype = new Error(); 33 | ScraperError.prototype.constructor = ScraperError; 34 | 35 | module.exports = ScraperError; -------------------------------------------------------------------------------- /src/ScraperPromise.js: -------------------------------------------------------------------------------- 1 | var async = require('async'); 2 | 3 | /** 4 | * @constructor 5 | */ 6 | var ScraperPromise = function(scraper) { 7 | /** 8 | * Scraper to use.. 9 | * 10 | * @type {!Scraper} 11 | * @private 12 | */ 13 | this.scraper = scraper; 14 | /** 15 | * Promise chunks. 16 | * 17 | * @type {!Array.} 18 | * @private 19 | */ 20 | this.promises = []; 21 | /** 22 | * Function to call when all the promises are fulfilled. 23 | * 24 | * @type {!function(?, ?)} 25 | * @private 26 | */ 27 | this.doneCallback = function(last, utils) { 28 | return last; 29 | }; 30 | /** 31 | * Function to call when there's an error. 
32 | * 33 | * @type {!function(?)} 34 | * @private 35 | */ 36 | this.errorCallback = function(err) { 37 | throw err; 38 | }; 39 | /** 40 | * A parameter object to be passed to the chain, at the _fire 41 | * method. This should be set immediately before the call, and 42 | * reset to null right after the call, or after it's been stored 43 | * elsewhere. 44 | * 45 | * @type {?} 46 | * @private 47 | */ 48 | this.chainParameter = null; 49 | }; 50 | ScraperPromise.prototype = { 51 | constructor: ScraperPromise, 52 | /** 53 | * Sets a promise for a status code, of a response of a request. 54 | * 55 | * @param {!(number|function(number))} code Status code to 56 | * dispatch the message. Or a callback function, in this case 57 | * the function's first parameter is the status code, as a 58 | * number. 59 | * @param {!function()} callback Callback function for the case 60 | * where the status code is provided. 61 | * @return {!ScraperPromise} This object, so that new promises can 62 | * be made. 63 | * @public 64 | */ 65 | onStatusCode: function(code, callback) { 66 | if (typeof code == 'function') { 67 | callback = code; 68 | this.promises.push(function onGenericStatusCode(done, utils) { 69 | done(null, callback(this.scraper.getStatusCode(), utils)); 70 | }); 71 | } else { 72 | this.promises.push(function onStatusCode(done, utils) { 73 | if (code === this.scraper.getStatusCode()) { 74 | done(null, callback(utils)); 75 | } else { 76 | done(null, utils.lastReturn); 77 | } 78 | 79 | }); 80 | } 81 | return this; 82 | }, 83 | /** 84 | * Sets a promise to scrape the retrieved webpage. 85 | * 86 | * @param {!function(?, ?)} scrapeFn Function to scrape the 87 | * webpage. The parameters depend on what kind of scraper. 88 | * @param {!function(?)=} callback Callback function with the 89 | * result of the scraping function. If none is provided, the 90 | * result can be accessed in the next promise with 91 | * utils.lastReturn. 
92 | * @param {...?} var_args Optional arguments to pass as 93 | * parameters to the scraping function. 94 | * @return {!ScraperPromise} This object, so that new promises can 95 | * be made. 96 | * @public 97 | */ 98 | scrape: function(scrapeFn, callback) { 99 | var stackTrace = new Error().stack; 100 | 101 | var extraArguments = Array.prototype.slice.call(arguments, 2); 102 | callback = callback || function(result) { 103 | return result; 104 | }; 105 | this.promises.push(function scrape(done, utils) { 106 | this.scraper.scrape(scrapeFn, function(err, result) { 107 | if (err) { 108 | done(err, undefined); 109 | } else { 110 | done(null, callback(result, utils)); 111 | } 112 | }, extraArguments, stackTrace); 113 | }); 114 | return this; 115 | }, 116 | /** 117 | * Sets a promise to delay the execution of the promises. 118 | * 119 | * @param {!number} time Time in milliseconds to delay the 120 | * execution. 121 | * @param {!function()=} callback Function to call after the 122 | * delay. 123 | * @return {!ScraperPromise} This object, so that new promises can 124 | * be made. 125 | * @public 126 | */ 127 | delay: function(time, callback) { 128 | callback = callback || function() {}; 129 | this.promises.push(function delay(done, utils) { 130 | setTimeout(function() { 131 | done(null, callback(utils)); 132 | }, time); 133 | }); 134 | return this; 135 | }, 136 | /** 137 | * Sets a promise to execute a promise after a time period. This 138 | * does not cause the promise chain to block. 139 | * 140 | * @param {!number} time Time in milliseconds to the execution of 141 | * the callback. 142 | * @param {!function()} callback Function to call after the 143 | * time period has passed. 144 | * @return {!ScraperPromise} This object, so that new promises can 145 | * be made. 
146 | * @public 147 | */ 148 | timeout: function(time, callback) { 149 | this.promises.push(function timeout(done, utils) { 150 | setTimeout(function() { 151 | callback(utils); 152 | }, time); 153 | done(null, null); 154 | }); 155 | return this; 156 | }, 157 | /** 158 | * Sets the end of the promise chain callback, if there were no 159 | * errors. 160 | * 161 | * @param {!function()} doneFn Callback function. 162 | * @return {!ScraperPromise} This object, so that new promises can 163 | * be made. 164 | * @public 165 | */ 166 | done: function(doneFn) { 167 | this.doneCallback = doneFn; 168 | return this; 169 | }, 170 | /** 171 | * Sets a generic promise. 172 | * 173 | * @param {!function()} callback Callback. 174 | * @return {!ScraperPromise} This object, so that new promises can 175 | * be made. 176 | * @public 177 | */ 178 | then: function(callback) { 179 | this.promises.push(function then(done, utils) { 180 | done(null, callback(utils.lastReturn, utils)); 181 | }); 182 | return this; 183 | }, 184 | /** 185 | * Stops the promise chain and resumes it after a callback 186 | * function. 187 | * 188 | * @param {!function(!function, !Object)} callback Callback. 189 | * @return {!ScraperPromise} This object, so that new promises can 190 | * be made. 191 | * @public 192 | */ 193 | async: function(callback) { 194 | this.promises.push(function async(done, utils) { 195 | callback(utils.lastReturn, done, utils); 196 | }); 197 | return this; 198 | }, 199 | /** 200 | * @deprecated 201 | */ 202 | onError: function(callback) { 203 | console.warn("The 'onError' is being DEPRECATED in favor of 'catch'"); 204 | return this.catch(callback); 205 | }, 206 | /** 207 | * Sets a promise to when an error occur, note that an error will 208 | * break the promise chain, so this is the next promise to be 209 | * called and if the done promise is not set the last. To avoid 210 | * silent errors, if this promise is not defined the error will 211 | * be thrown up. 
212 |  *
213 |  * @param {!function(?, !Object)} callback Callback; receives (error, utils).
214 |  * @return {!ScraperPromise} This object, so that new promises can
215 |  * be made.
216 |  * @public
217 |  */
218 | "catch": function(callback) {
219 | this.errorCallback = callback;
220 | return this;
221 | },
222 | /**
223 |  * Makes an HTTP GET request to the url.
224 |  *
225 |  * @param {!string} url Url to make the request.
226 |  * @return {!ScraperPromise} This object, so that new promises can
227 |  * be made.
228 |  * @public
229 |  */
230 | get: function(url) {
231 | var that = this;
232 | this.scraper.get(url, function(err) {
233 | that._fire(err);
234 | });
235 | return this;
236 | },
237 | /**
238 |  * Makes a (possibly more complex) HTTP request. For more
239 |  * information refer to {@link https://github.com/mikeal/request#requestoptions-callback}.
240 |  *
241 |  * @param {!Object} options Options of the request.
242 |  * @return {!ScraperPromise} This object, so that new promises can
243 |  * be made.
244 |  * @public
245 |  */
246 | request: function(options) {
247 | var that = this;
248 | this.scraper.request(options, function(err) {
249 | that._fire(err);
250 | });
251 | return this;
252 | },
253 | /**
254 |  * Sets a parameter to be used in the next _fire call.
255 |  *
256 |  * @param {?Object} param Parameter.
257 |  * @public
258 |  */
259 | _setChainParameter: function(param) {
260 | this.chainParameter = param;
261 | },
262 | /**
263 |  * Starts the promise chain.
264 |  *
265 |  * @param {?} error Error object, to fire the error callback,
266 |  * from an error that happened before. The chain always runs
267 |  * against this instance's own scraper.
268 | * @protected 269 | */ 270 | _fire: function(error) { 271 | var that = this, 272 | param = this.chainParameter, 273 | stopPointer = {}, 274 | utils = { 275 | stop: null, 276 | url: this.scraper.url, 277 | scraper: this, 278 | params: param, 279 | lastReturn: undefined 280 | }, 281 | keep = true; 282 | this.chainParameter = null; 283 | 284 | if (error) { 285 | this.errorCallback(error, utils); 286 | this.doneCallback(utils); 287 | return; 288 | } 289 | 290 | async.eachSeries(this.promises, function dispatcher(fn, callback) { 291 | var done = function(err, lastReturn) { 292 | utils.lastReturn = lastReturn; 293 | if (err === stopPointer) { 294 | keep = false; 295 | callback(err); 296 | } else if (err) { 297 | callback(err); 298 | } else if (keep) { 299 | callback(); 300 | } 301 | }; 302 | utils.stop = function() { 303 | done(stopPointer, null); 304 | }; 305 | 306 | try { 307 | fn.call(that, done, utils); 308 | } catch (err) { 309 | done(err, null); 310 | } 311 | }, function(err) { 312 | utils.stop = null; 313 | if (err && err !== stopPointer) { 314 | that.errorCallback(err, utils); 315 | } 316 | that.doneCallback(utils.lastReturn, utils); 317 | that.scraper.close(); 318 | }); 319 | }, 320 | /** 321 | * Sets the promises. 322 | * 323 | * @param {!Array.} promises Promises array. 324 | * @public 325 | */ 326 | _setPromises: function(promises) { 327 | this.promises = promises; 328 | }, 329 | /** 330 | * Clones the promise and the scraper. 331 | * 332 | * @return {!ScraperPromise} Scraper promise with an empty scraper 333 | * clone. 
334 | * @public 335 | */ 336 | clone: function() { 337 | var instance = this.scraper.clone(), 338 | promise = new ScraperPromise(instance); 339 | promise._setPromises(this.promises); 340 | promise.done(this.doneCallback); 341 | promise.catch(this.errorCallback); 342 | return promise; 343 | } 344 | }; 345 | 346 | module.exports = ScraperPromise; 347 | -------------------------------------------------------------------------------- /src/StaticScraper.js: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio'), 2 | AbstractScraper = require('./AbstractScraper'); 3 | 4 | /** 5 | * A static scraper. This can only scrape static content, with the 6 | * help of jQuery. 7 | * This version uses cheerio {@link https://github.com/cheeriojs/cheerio}. 8 | * 9 | * @extends {AbstractScraper} 10 | */ 11 | var StaticScraper = function() { 12 | AbstractScraper.call(this); 13 | /** 14 | * jQuery. 15 | * 16 | * @type {!function} 17 | * @private 18 | */ 19 | this.$ = null; 20 | }; 21 | StaticScraper.prototype = Object.create(AbstractScraper.prototype); 22 | /** 23 | * @override 24 | * @inheritDoc 25 | */ 26 | StaticScraper.prototype.loadBody = function(done) { 27 | this.$ = cheerio.load(this.body); 28 | done(); 29 | return this; 30 | }; 31 | /** 32 | * Scrapes the webpage. According to a function, and a callback. 33 | * 34 | * @param {!function(function(), ...?)} scraperFn Function to scrape 35 | * the content. It receives the jQuery function to manipulate the 36 | * DOM, and the args as parameters, if passed. 37 | * @param {!function(?)} callbackFn Function that receives the 38 | * result of the scraping. 39 | * @param {!Array=} args Extra arguments to pass to the scraping 40 | * function. 41 | * @return {!AbstractScraper} This scraper. 
42 | * @override 43 | * @public 44 | */ 45 | StaticScraper.prototype.scrape = function(scraperFn, callbackFn, args) { 46 | var result = null, err = null; 47 | args = args || []; 48 | args.unshift(this.$); 49 | try { 50 | result = scraperFn.apply(null, args); 51 | } catch (e) { 52 | err = e; 53 | } 54 | callbackFn(err, result); 55 | return this; 56 | }; 57 | /** 58 | * @override 59 | * @inheritDoc 60 | */ 61 | StaticScraper.prototype.close = function() { 62 | return this; 63 | }; 64 | /** 65 | * @override 66 | * @inheritDoc 67 | */ 68 | StaticScraper.prototype.clone = function() { 69 | return new StaticScraper(); 70 | }; 71 | /** 72 | * Creates a static scraper, wrapped around a scraper promise. 73 | * 74 | * @param {!string=} url If provided makes an HTTP GET request to the 75 | * given URL. 76 | * @return {!ScraperPromise} Scraper promise, with a static scraper. 77 | * @public 78 | * @static 79 | */ 80 | StaticScraper.create = function(url) { 81 | return AbstractScraper.create(StaticScraper, url); 82 | }; 83 | 84 | module.exports = StaticScraper; -------------------------------------------------------------------------------- /test/AbstractScraper.js: -------------------------------------------------------------------------------- 1 | /* global describe, it */ 2 | var AbstractScraper = require('../src/AbstractScraper'), 3 | fs = require('fs'), 4 | assert = require('assert'), 5 | MISSING = 'http://0.0.0.0', 6 | HN_CLONE = 'http://localhost:3000/hacker-news-clone'; 7 | 8 | 9 | describe('AbstractScraper', function() { 10 | it('get', function(done) { 11 | var as = new AbstractScraper(); 12 | as.get(MISSING, function(err) { 13 | assert.ok((err.code == 'EADDRNOTAVAIL') || (err.code == 'ECONNREFUSED')); 14 | done(); 15 | }); 16 | }); 17 | it('request', function(done) { 18 | var as = new AbstractScraper(); 19 | as.request({ 20 | url: MISSING 21 | }, function(err) { 22 | assert.ok((err.code == 'EADDRNOTAVAIL') || (err.code == 'ECONNREFUSED')); 23 | done(); 24 | }); 25 | 
}); 26 | it('getStatusCode', function(done) { 27 | var as = new AbstractScraper(); 28 | as.get(HN_CLONE, function(err) { 29 | assert.ok(!err); 30 | assert.equal(as.getStatusCode(), 200); 31 | done(); 32 | }); 33 | }); 34 | it('getResponse', function(done) { 35 | var as = new AbstractScraper(); 36 | as.get(HN_CLONE, function(err) { 37 | assert.ok(!err); 38 | assert.ok(!!as.getResponse()); 39 | assert.equal(as.getResponse().statusCode, 200); 40 | done(); 41 | }); 42 | }); 43 | it('getBody', function(done) { 44 | var as = new AbstractScraper(); 45 | as.get(HN_CLONE, function(err) { 46 | assert.ok(!err); 47 | assert.equal(as.getBody(), fs.readFileSync(__dirname + '/static/hacker-news-clone.html').toString()); 48 | done(); 49 | }); 50 | }); 51 | it('loadBody', function(done) { 52 | var as = new AbstractScraper(); 53 | as.loadBody(function() { 54 | done(); 55 | }); 56 | }); 57 | it('scrape', function() { 58 | var as = new AbstractScraper(); 59 | as.scrape(function() { 60 | assert.fail('Function shouldn\'t be called'); 61 | }, function() { 62 | assert.fail('Function shouldn\'t be called'); 63 | }); 64 | }); 65 | it('close', function() { 66 | var as = new AbstractScraper(); 67 | assert.ok(as.close() === undefined); 68 | }); 69 | it('clone', function() { 70 | var as = new AbstractScraper(); 71 | assert.ok(as.clone() === undefined); 72 | }); 73 | }); -------------------------------------------------------------------------------- /test/DynamicScraper.js: -------------------------------------------------------------------------------- 1 | /* global describe, it, $ */ 2 | var sjs = require('../src/Scraper'), 3 | ScraperPromise = sjs.ScraperPromise, 4 | DynamicScraper = sjs.DynamicScraper, 5 | assert = require('assert'), 6 | HN_CLONE = 'http://localhost:3000/hacker-news-clone'; 7 | 8 | 9 | describe('DynamicScraper', function() { 10 | 11 | describe('#create', function() { 12 | it('with argument', function(done) { 13 | var ds = DynamicScraper.create(HN_CLONE); 14 | ds 15 | 
.done(function() { 16 | assert.ok(ds instanceof ScraperPromise); 17 | done(); 18 | }); 19 | }); 20 | 21 | it('without argument', function(done) { 22 | var ds = DynamicScraper.create(); 23 | ds 24 | .get(HN_CLONE) 25 | .done(function() { 26 | assert.ok(ds instanceof ScraperPromise); 27 | done(); 28 | }); 29 | }); 30 | }); 31 | 32 | it('.loadBody, .scrape, .close', function(done) { 33 | var ds = new DynamicScraper(); 34 | ds.body = '
text
'; 35 | var temp = ds.loadBody(function() { 36 | var temp = ds.scrape(function() { 37 | return $('#f').text(); 38 | }, function(err, result) { 39 | assert.equal(err, null); 40 | assert.equal(result, 'text'); 41 | assert.ok(ds.close() === ds); 42 | assert.ok(!!ds.ph); 43 | assert.ok(!!ds.page); 44 | done(); 45 | }); 46 | assert.ok(temp === ds); 47 | }); 48 | assert.ok(temp === ds); 49 | }); 50 | 51 | describe('.inject', function() { 52 | it('page not loaded', function() { 53 | var ds = new DynamicScraper(); 54 | try { 55 | ds.inject(''); 56 | assert.fail('Should have thrown'); 57 | } catch (e) { 58 | assert.equal(e.message, 'Couldn\'t inject code, at "". The page has not been initialized yet.'); 59 | } 60 | }); 61 | 62 | it('success', function(done) { 63 | var ds = new DynamicScraper(); 64 | ds.get(HN_CLONE, function(err) { 65 | if (err) { 66 | assert.fail('Shouldn\'t have returned an error.'); 67 | } 68 | ds.inject(__dirname + '/static/code.js', function(err) { 69 | if (err) { 70 | assert.fail('Should load code successfully.'); 71 | } else { 72 | done(); 73 | } 74 | }); 75 | 76 | }); 77 | }); 78 | 79 | it('fails', function(done) { 80 | var ds = new DynamicScraper(); 81 | ds.get(HN_CLONE, function(err) { 82 | if (err) { 83 | assert.fail('Shouldn\'t have returned an error.'); 84 | } 85 | var file = __dirname + '/static/invalid-code.js'; 86 | ds.inject(file, function(err) { 87 | if (err) { 88 | assert.equal(err.message, 'Couldn\'t inject code, at "' + file + '".'); 89 | done(); 90 | } else { 91 | assert.fail('Shouldn\'t load code successfully.'); 92 | } 93 | }); 94 | 95 | }); 96 | }); 97 | 98 | it('fails jQuery', function(done) { 99 | var jq = DynamicScraper.JQUERY_FILE; 100 | DynamicScraper.JQUERY_FILE += '.non'; 101 | var ds = new DynamicScraper(); 102 | ds.get(HN_CLONE, function(err) { 103 | DynamicScraper.JQUERY_FILE = jq; 104 | if (err) { 105 | assert.equal(err.message, 'Couldn\'t inject jQuery into the page.'); 106 | } else { 107 | assert.fail('Should have 
returned an error.'); 108 | } 109 | done(); 110 | }); 111 | }); 112 | }); 113 | 114 | it('.clone', function() { 115 | var ds = new DynamicScraper(), 116 | clone = ds.clone(); 117 | assert.ok(clone instanceof DynamicScraper); 118 | assert.ok(ds != clone); 119 | }); 120 | 121 | it('#startFactory, #closeFactory', function() { 122 | var temp; 123 | temp = DynamicScraper.startFactory(); 124 | assert.ok(temp === DynamicScraper); 125 | temp = DynamicScraper.closeFactory(); 126 | assert.ok(temp === DynamicScraper); 127 | temp = DynamicScraper.closeFactory(); 128 | assert.ok(temp === DynamicScraper); 129 | }); 130 | }); -------------------------------------------------------------------------------- /test/Router.js: -------------------------------------------------------------------------------- 1 | /* global describe, it, $ */ 2 | var scraper = require('../src/Scraper'), 3 | Router = scraper.Router, 4 | assert = require('assert'), 5 | LH = 'http://localhost:3000'; 6 | 7 | function compareObjects(obj1, obj2) { 8 | function co(a, b) { 9 | for (var x in a) { 10 | if (a[x] !== b[x]) { 11 | return false; 12 | } 13 | } 14 | return true; 15 | } 16 | return co(obj1, obj2) && co(obj2, obj1); 17 | } 18 | 19 | describe('Router', function() { 20 | 21 | describe('#pathMatcher', function() { 22 | it('with string', function(done) { 23 | var fn = Router.pathMatcher(':protocol(https?://)?:www(www.)?youtube.com/(watch/:id)?'); 24 | assert.equal(typeof fn, 'function'); 25 | assert.ok(compareObjects(fn('youtube.com/'), { 26 | url: 'youtube.com/', 27 | protocol: undefined, 28 | www: undefined, 29 | id: undefined 30 | })); 31 | assert.ok(compareObjects(fn('https://youtube.com/'), { 32 | url: 'https://youtube.com/', 33 | protocol: 'https://', 34 | www: undefined, 35 | id: undefined 36 | })); 37 | assert.ok(compareObjects(fn('https://www.youtube.com/'), { 38 | url: 'https://www.youtube.com/', 39 | protocol: 'https://', 40 | www: 'www.', 41 | id: undefined 42 | })); 43 | 
assert.ok(compareObjects(fn('https://www.youtube.com/watch/mNhMogx3YmU'), { 44 | url: 'https://www.youtube.com/watch/mNhMogx3YmU', 45 | protocol: 'https://', 46 | www: 'www.', 47 | id: 'mNhMogx3YmU' 48 | })); 49 | assert.ok(compareObjects(Router.pathMatcher('*')('https://www.youtube.com/watch/mNhMogx3YmU'), { 50 | url: 'https://www.youtube.com/watch/mNhMogx3YmU' 51 | })); 52 | try { 53 | Router.pathMatcher(function() {}); 54 | } catch (e) { 55 | assert.equal(e.name, 'ScraperError'); 56 | done(); 57 | } 58 | }); 59 | it('with regular expression', function(done) { 60 | var fn = Router.pathMatcher(/s*crape/); 61 | assert.equal(typeof fn, 'function'); 62 | assert.ok(!!fn('craper')); 63 | assert.ok(!!fn('scraper')); 64 | assert.ok(!!fn('ssscraper')); 65 | done(); 66 | }); 67 | }); 68 | describe('on', function() { 69 | var r = new Router(); 70 | it('with path', function(done) { 71 | r.on(LH + '/info/:id') 72 | .createStatic() 73 | .onStatusCode(200, function() { 74 | done(); 75 | }); 76 | r.route(LH + '/info/ajhfdhsgf', function(found) { 77 | assert.ok(found); 78 | }); 79 | }); 80 | it('with function', function(done) { 81 | r 82 | .on(Router.pathMatcher(LH + '/watch/:id')) 83 | .createStatic() 84 | .onStatusCode(200, function() { 85 | done(); 86 | }); 87 | r.route(LH + '/watch/hjsgdfhdgf', function(found) { 88 | assert.ok(found); 89 | }); 90 | }); 91 | }); 92 | it('get', function(done) { 93 | var r = new Router(); 94 | r 95 | .on(LH + '/info/:id') 96 | .get() 97 | .createStatic() 98 | .onStatusCode(200, function() { 99 | done(); 100 | }); 101 | r.route(LH + '/info/8973iuhrwjhef'); 102 | }); 103 | it('request', function(done) { 104 | var r = new Router(); 105 | r 106 | .on(LH + '/watch/:id') 107 | .request({ 108 | method: 'POST' 109 | }) 110 | .createStatic() 111 | .onStatusCode(200, function() { 112 | done(); 113 | }); 114 | r.route(LH + '/watch/8973iuhrwjhef', function(found) { 115 | assert.ok(found); 116 | }); 117 | }); 118 | it('otherwise', function(done) { 119 | var 
r = new Router(), 120 | testURL = LH + 'infoo/fjsdgfmhgsdf'; 121 | r.on(LH + '/watch/:id'); 122 | r.otherwise(function(url) { 123 | assert.equal(url, testURL); 124 | done(); 125 | }); 126 | r.route(testURL, function(found) { 127 | assert.ok(!found); 128 | }); 129 | }); 130 | it('route', function(done) { 131 | var r = new Router(); 132 | r.on(LH + '/watch/:id') 133 | .createStatic(); 134 | r.route(LH + '/watch/fjsdgfmhgsdf', function(found) { 135 | assert.ok(found); 136 | r.route(LH + '/scrpng', function(found) { 137 | assert.ok(!found); 138 | done(); 139 | }); 140 | }); 141 | }); 142 | it('createStatic', function(done) { 143 | var r = new Router(); 144 | r.on(LH + '/hacker-news-clone') 145 | .createStatic() 146 | .scrape(function($) { 147 | return $('.title a').map(function() { 148 | return $(this).text(); 149 | }).get(); 150 | }, function(news) { 151 | assert.equal(news.length, 10); 152 | done(); 153 | }); 154 | r.route(LH + '/hacker-news-clone', function(found) { 155 | assert.ok(found); 156 | }); 157 | }); 158 | it('createDynamic', function(done) { 159 | var r = new Router(); 160 | r.on(LH + '/hacker-news-clone') 161 | .createDynamic() 162 | .delay(100) 163 | .scrape(function() { 164 | return $('.title a').map(function() { 165 | return $(this).text(); 166 | }).get(); 167 | }, function(news) { 168 | assert.equal(news.length, 9); 169 | done(); 170 | }); 171 | r.route(LH + '/hacker-news-clone', function(found) { 172 | assert.ok(found); 173 | }); 174 | }); 175 | it('use', function(done) { 176 | var r = new Router(), 177 | stInstance; 178 | stInstance = scraper.StaticScraper 179 | .create() 180 | .scrape(function($) { 181 | return $('.title a').map(function() { 182 | return $(this).text(); 183 | }).get(); 184 | }, function(news) { 185 | assert.equal(news.length, 10); 186 | done(); 187 | }); 188 | r.on(LH + '/hacker-news-clone') 189 | .use(stInstance); 190 | r.route(LH + '/hacker-news-clone', function(found) { 191 | assert.ok(found); 192 | }); 193 | }); 194 | 195 | 
it('usage of params', function(done) { 196 | var r = new Router(); 197 | r 198 | .on(LH + '/info/:id') 199 | .createStatic() 200 | .then(function(last, utils) { 201 | assert.ok(utils.params.id, '7623hgjfs73'); 202 | }); 203 | r.route(LH + '/info/7623hgjfs73', function(found) { 204 | assert.ok(found); 205 | done(); 206 | }); 207 | }); 208 | 209 | describe('instantiation', function() { 210 | function testCase(firstMatch, expected) { 211 | it('with' + (firstMatch ? '' : 'out') + ' firstMatch', function(done) { 212 | var c = 0; 213 | var r = new Router({ 214 | firstMatch: !!firstMatch 215 | }); 216 | r.on(LH + '/info/:id') 217 | .createStatic() 218 | .then(function() { 219 | c++; 220 | }); 221 | r.on(LH + '/info/:id') 222 | .createStatic() 223 | .then(function() { 224 | c++; 225 | }); 226 | r.route(LH + '/info/7623hgjfs73', function(found) { 227 | assert.ok(found); 228 | assert.equal(c, expected); 229 | done(); 230 | }); 231 | }); 232 | } 233 | 234 | testCase(true, 1); 235 | testCase(false, 2); 236 | }); 237 | 238 | describe('bad formatting', function() { 239 | it('get', function(done) { 240 | var r = new Router(); 241 | try { 242 | r.get(); 243 | } catch (e) { 244 | assert.equal(e.name, 'ScraperError'); 245 | done(); 246 | } 247 | }); 248 | it('request', function(done) { 249 | var r = new Router(); 250 | try { 251 | r.request(); 252 | } catch (e) { 253 | assert.equal(e.name, 'ScraperError'); 254 | done(); 255 | } 256 | }); 257 | it('createStatic', function(done) { 258 | var r = new Router(); 259 | try { 260 | r.createStatic(); 261 | } catch (e) { 262 | assert.equal(e.name, 'ScraperError'); 263 | done(); 264 | } 265 | }); 266 | it('createDynamic', function(done) { 267 | var r = new Router(); 268 | try { 269 | r.createDynamic(); 270 | } catch (e) { 271 | assert.equal(e.name, 'ScraperError'); 272 | done(); 273 | } 274 | }); 275 | it('use', function(done) { 276 | var r = new Router(); 277 | try { 278 | r.use(scraper.StaticScraper.create()); 279 | } catch (e) { 280 | 
assert.equal(e.name, 'ScraperError'); 281 | done(); 282 | } 283 | }); 284 | }); 285 | }); -------------------------------------------------------------------------------- /test/ScraperError.js: -------------------------------------------------------------------------------- 1 | /* global it */ 2 | var ScraperError = require('../src/ScraperError'), 3 | assert = require('assert'); 4 | 5 | it('ScraperError', function() { 6 | var err = new ScraperError('random message'); 7 | assert.equal(err.message, 'random message'); 8 | assert.equal(err.name, 'ScraperError'); 9 | assert.ok(err.stack); 10 | }); -------------------------------------------------------------------------------- /test/ScraperPromise.js: -------------------------------------------------------------------------------- 1 | /* global describe, it, beforeEach, afterEach, $ */ 2 | var assert = require('assert'), 3 | sjs = require('../src/Scraper'), 4 | StaticScraper = sjs.StaticScraper, 5 | DynamicScraper = sjs.DynamicScraper, 6 | ScraperPromise = sjs.ScraperPromise, 7 | HN_CLONE = 'http://localhost:3000/hacker-news-clone', 8 | domain = require('domain'); 9 | 10 | function exec(ScraperType) { 11 | function isDynamic() { 12 | return ScraperType === DynamicScraper; 13 | } 14 | 15 | describe('onStatusCode', function() { 16 | it('with code', function(done) { 17 | var s = new ScraperPromise(new ScraperType()) 18 | .get(HN_CLONE); 19 | var temp = s 20 | .onStatusCode(202, function() { 21 | assert.fail('This status code should not trigger.'); 22 | }) 23 | .onStatusCode(200, function() { 24 | done(); 25 | }); 26 | assert.ok(temp === s); 27 | }); 28 | 29 | it('without code', function(done) { 30 | var s = new ScraperPromise(new ScraperType()) 31 | .get(HN_CLONE); 32 | var temp = s 33 | .onStatusCode(function(code) { 34 | assert.equal(code, 200); 35 | done(); 36 | }); 37 | assert.ok(temp === s); 38 | }); 39 | }); 40 | 41 | it('timeout', function(done) { 42 | var s = new ScraperPromise(new ScraperType()) 43 | 
.get(HN_CLONE) 44 | .onStatusCode(function(code) { 45 | assert.equal(code, 200); 46 | }); 47 | var temp = s.timeout(100, function() { 48 | done(); 49 | }); 50 | assert.ok(temp === s); 51 | }); 52 | 53 | it('then', function(done) { 54 | var s = new ScraperPromise(new ScraperType()) 55 | .get(HN_CLONE); 56 | var temp = s.then(function() { 57 | done(); 58 | }); 59 | assert.ok(temp === s); 60 | }); 61 | 62 | it('then', function(done) { 63 | var s = new ScraperPromise(new ScraperType()) 64 | .get(HN_CLONE); 65 | var temp = s.async(function(_, done) { 66 | done(); 67 | }); 68 | s.done(function() { 69 | done(); 70 | }); 71 | assert.ok(temp === s); 72 | }); 73 | 74 | describe('catch', function() { 75 | it('on sync', function(done) { 76 | var s = new ScraperPromise(new ScraperType()) 77 | .get(HN_CLONE) 78 | .then(function() { 79 | throw new Error('random message'); 80 | }); 81 | var temp = s.catch(function(err) { 82 | assert.equal(err.message, 'random message'); 83 | done(); 84 | }); 85 | assert.ok(s === temp); 86 | }); 87 | it('on async', function(done) { 88 | var s = new ScraperPromise(new ScraperType()) 89 | .get(HN_CLONE) 90 | .async(function(_, done) { 91 | done(new Error('random message')); 92 | }); 93 | var temp = s.catch(function(err) { 94 | assert.equal(err.message, 'random message'); 95 | done(); 96 | }); 97 | assert.ok(s === temp); 98 | }); 99 | }); 100 | 101 | // FIXME - this is not working for the dynamic scraper with factory 102 | if (!isDynamic()) { 103 | it('error without catch', function(done) { 104 | var d = domain.create(); 105 | d.on('error', function(err) { 106 | assert.equal(err.message, 'random message'); 107 | done(); 108 | }); 109 | d.run(function() { 110 | new ScraperPromise(new ScraperType()) 111 | .get(HN_CLONE) 112 | .then(function() { 113 | throw new Error('random message'); 114 | }); 115 | }); 116 | }); 117 | } 118 | 119 | describe('scrape', function() { 120 | var expectedVal; 121 | if (isDynamic()) { 122 | expectedVal = 9; 123 | } else { 124 
| expectedVal = 10; 125 | } 126 | it('without extra arguments', function(done) { 127 | var s = new ScraperPromise(new ScraperType()) 128 | .get(HN_CLONE); 129 | var fn = function($) { 130 | return $('.title a').map(function() { 131 | return $(this).text(); 132 | }).get(); 133 | }; 134 | var temp = s.scrape(fn, function(news) { 135 | assert.equal(news.length, expectedVal); 136 | done(); 137 | }); 138 | assert.ok(temp === s); 139 | }); 140 | 141 | it('without extra arguments', function(done) { 142 | var s = new ScraperPromise(new ScraperType()) 143 | .get(HN_CLONE); 144 | var fn = function($, selector) { 145 | return $(selector).map(function() { 146 | return $(this).text(); 147 | }).get(); 148 | }; 149 | var temp = s.scrape(fn, function(news) { 150 | assert.equal(news.length, expectedVal); 151 | done(); 152 | }, '.title a'); 153 | assert.ok(temp === s); 154 | }); 155 | 156 | it('with only the scraping function', function(done) { 157 | var s = new ScraperPromise(new ScraperType()) 158 | .get(HN_CLONE); 159 | var fn = function($) { 160 | return $('.title a').map(function() { 161 | return $(this).text(); 162 | }).get(); 163 | }; 164 | var temp = s.scrape(fn); 165 | temp.then(function(news, utils) { 166 | assert.equal(news.length, expectedVal); 167 | done(); 168 | }); 169 | assert.ok(temp === s); 170 | }); 171 | 172 | it('with error', function(done) { 173 | var s = new ScraperPromise(new ScraperType()) 174 | .get(HN_CLONE); 175 | var temp; 176 | temp = s 177 | .catch(function(err) { 178 | assert.equal(err.message, 'Error inside scraping fn.'); 179 | }) 180 | .scrape(function() { 181 | throw new Error('Error inside scraping fn.'); 182 | }, function() { 183 | assert.fail('Invalid call.'); 184 | }) 185 | .done(function() { 186 | done(); 187 | }); 188 | assert.ok(temp === s); 189 | }); 190 | }); 191 | 192 | it('delay', function(done) { 193 | var s = new ScraperPromise(new ScraperType()) 194 | .get(HN_CLONE) 195 | .onStatusCode(function(code) { 196 | assert.equal(code, 200); 
197 | }); 198 | var temp = s.delay(100); 199 | assert.ok(temp === s); 200 | var expectedContent = isDynamic()?'Dynamic Content':0; 201 | s.scrape(function($) { 202 | return $('.dynamic').text(); 203 | }, function(result) { 204 | assert.equal(result, expectedContent); 205 | done(); 206 | }); 207 | }); 208 | 209 | it('request', function(done) { 210 | var s = new ScraperPromise(new ScraperType()); 211 | var temp = s.request({ 212 | url: HN_CLONE, 213 | method: 'POST' 214 | }); 215 | assert.ok(temp === s); 216 | var fn = function($) { 217 | return $('#POST_MESSAGE').text(); 218 | }; 219 | s.scrape(fn, function(result) { 220 | assert.equal(result, 'random text'); 221 | done(); 222 | }); 223 | }); 224 | 225 | it('done', function(done) { 226 | var s = new ScraperPromise(new ScraperType()); 227 | s.get(HN_CLONE); 228 | var temp = s.done(function() { 229 | done(); 230 | }); 231 | assert.ok(temp === s); 232 | }); 233 | 234 | it('_setChainParameter', function() { 235 | var s = new ScraperPromise(new ScraperType()); 236 | s._setChainParameter(5); 237 | assert.equal(s.chainParameter, 5); 238 | }); 239 | 240 | describe('_fire', function() { 241 | it('without error', function(done) { 242 | var s = new ScraperPromise(new ScraperType()); 243 | s.done(function() { 244 | done(); 245 | }); 246 | s._fire(); 247 | }); 248 | it('with error', function(done) { 249 | var c = 0; 250 | var s = new ScraperPromise(new ScraperType()) 251 | .done(function() { 252 | assert.equal(c, 1); 253 | done(); 254 | }) 255 | .catch(function(err) { 256 | c++; 257 | assert.equal(err.message, 'msg'); 258 | }); 259 | s._fire(new Error('msg')); 260 | }); 261 | }); 262 | 263 | it('_setPromises', function() { 264 | var s = new ScraperPromise(new ScraperType()); 265 | var promises = [ 266 | 267 | function() {} 268 | ]; 269 | s._setPromises(promises); 270 | assert.ok(s.promises === promises); 271 | }); 272 | 273 | it('clone', function() { 274 | var s = new ScraperPromise(new ScraperType()) 275 | .catch(function() {}) 
276 | .done(function() {}) 277 | .then(function() {}) 278 | .onStatusCode(200, function() {}) 279 | .onStatusCode(function() {}) 280 | .timeout(10) 281 | .delay(10); 282 | var clone = s.clone(); 283 | assert.ok(clone instanceof ScraperPromise); 284 | assert.ok(clone.promises === s.promises); 285 | assert.ok(clone.scraper !== s.scraper); 286 | assert.ok(clone.doneCallback === s.doneCallback); 287 | assert.ok(clone.errorCallback === s.errorCallback); 288 | assert.ok(clone.chainParameter === s.chainParameter); 289 | }); 290 | 291 | it('passing values between promises', function(done) { 292 | new ScraperPromise(new ScraperType()) 293 | .done(function(result, utils) { 294 | assert.deepEqual(result, 5); 295 | done(); 296 | }) 297 | .then(function(last, utils) { 298 | assert.deepEqual(last, undefined); 299 | return 1; 300 | }) 301 | .onStatusCode(200, function(utils) { 302 | assert.deepEqual(utils.lastReturn, 1); 303 | return utils.lastReturn + 1; 304 | }) 305 | .onStatusCode(function(code, utils) { 306 | assert.deepEqual(utils.lastReturn, 2); 307 | return utils.lastReturn + 1; 308 | }) 309 | .delay(10, function(utils) { 310 | assert.deepEqual(utils.lastReturn, 3); 311 | return utils.lastReturn + 1; 312 | }) 313 | .scrape(function() {}, function(result, utils) { 314 | assert.deepEqual(utils.lastReturn, 4); 315 | return utils.lastReturn + 1; 316 | }) 317 | .get(HN_CLONE); 318 | }); 319 | 320 | describe('usage of utils', function() { 321 | it('stop()', function(done) { 322 | var c = 0; 323 | new ScraperPromise(new ScraperType()) 324 | .get(HN_CLONE) 325 | .then(function() { 326 | c++; 327 | }) 328 | .then(function(last, utils) { 329 | c++; 330 | utils.stop(); 331 | }) 332 | .then(function() { 333 | c++; 334 | }) 335 | .done(function() { 336 | assert.equal(c, 2); 337 | done(); 338 | }); 339 | }); 340 | it('scraper', function(done) { 341 | var s = new ScraperPromise(new ScraperType()); 342 | s.get(HN_CLONE) 343 | .done(function(_, utils) { 344 | assert.ok(utils.scraper === 
s); 345 | done(); 346 | }); 347 | }); 348 | it('params', function(done) { 349 | var s = new ScraperPromise(new ScraperType()); 350 | s.get(HN_CLONE) 351 | .done(function(_, utils) { 352 | assert.ok(!utils.params); 353 | done(); 354 | }); 355 | }); 356 | 357 | }); 358 | } 359 | 360 | describe('Scraper Promise', function() { 361 | 362 | describe('with StaticScraper', function() { 363 | exec(StaticScraper); 364 | }); 365 | describe('with DynamicScraper', function() { 366 | describe('with Factory', function() { 367 | beforeEach(function() { 368 | DynamicScraper.startFactory(); 369 | }); 370 | afterEach(function() { 371 | DynamicScraper.closeFactory(); 372 | }); 373 | exec(DynamicScraper); 374 | }); 375 | describe('without Factory', function() { 376 | exec(DynamicScraper); 377 | }); 378 | }); 379 | 380 | 381 | }); 382 | -------------------------------------------------------------------------------- /test/StaticScraper.js: -------------------------------------------------------------------------------- 1 | /* global describe, it */ 2 | var sjs = require('../src/Scraper'), 3 | ScraperPromise = sjs.ScraperPromise, 4 | StaticScraper = sjs.StaticScraper, 5 | assert = require('assert'), 6 | HN_CLONE = 'http://localhost:3000/hacker-news-clone'; 7 | 8 | describe('StaticScraper', function() { 9 | 10 | describe('#create', function() { 11 | it('with argument', function(done) { 12 | var ds = StaticScraper.create(HN_CLONE); 13 | ds 14 | .done(function() { 15 | assert.ok(ds instanceof ScraperPromise); 16 | done(); 17 | }); 18 | }); 19 | 20 | it('without argument', function(done) { 21 | var ds = StaticScraper.create(); 22 | ds 23 | .get(HN_CLONE) 24 | .done(function() { 25 | assert.ok(ds instanceof ScraperPromise); 26 | done(); 27 | }); 28 | }); 29 | }); 30 | 31 | describe('.loadBody, .scrape, .close', function() { 32 | it('without errors', function(done) { 33 | var ds = new StaticScraper(); 34 | ds.body = '
text
'; 35 | var temp = ds.loadBody(function() { 36 | var temp2 = ds.scrape(function($) { 37 | return $('#f').text(); 38 | }, function(err, result) { 39 | assert.ok(err === null); 40 | assert.equal(result, 'text'); 41 | assert.ok(ds.close() === ds); 42 | assert.ok(ds.$); 43 | done(); 44 | }); 45 | assert.ok(temp2 === ds); 46 | }); 47 | assert.ok(temp === ds); 48 | }); 49 | 50 | it('with errors', function(done) { 51 | var ds = new StaticScraper(); 52 | ds.body = '
text
'; 53 | var temp = ds.loadBody(function() { 54 | var temp2 = ds.scrape(function() { 55 | throw new Error('Error in scraping fn.'); 56 | }, function(err) { 57 | if (err) { 58 | assert.ok(!!err); 59 | assert.equal(err.message, 'Error in scraping fn.'); 60 | assert.ok(ds.close() === ds); 61 | assert.ok(ds.$); 62 | done(); 63 | } else { 64 | assert.fail('Should return an error.'); 65 | } 66 | }); 67 | assert.ok(temp2 === ds); 68 | }); 69 | assert.ok(temp === ds); 70 | }); 71 | }); 72 | 73 | it('.clone', function() { 74 | var ds = new StaticScraper(), 75 | clone = ds.clone(); 76 | assert.ok(clone instanceof StaticScraper); 77 | assert.ok(ds != clone); 78 | }); 79 | }); -------------------------------------------------------------------------------- /test/commandLine.js: -------------------------------------------------------------------------------- 1 | /* global describe, it, $ */ 2 | var scraper = require('../src/Scraper'), 3 | exec = require('child_process').exec, 4 | Router = scraper.Router, 5 | assert = require('assert'), 6 | LH = 'http://localhost:3000'; 7 | 8 | function execSjs(more, callback) { 9 | var command = 'node ./bin/scraperjs ' + LH + '/hacker-news-clone ' + more; 10 | exec(command, function(error, out, err) { 11 | if(err || error) { 12 | return; 13 | } else { 14 | callback(JSON.parse(out)); 15 | } 16 | }); 17 | } 18 | 19 | describe('Command line tool', function() { 20 | describe('--text', function() { 21 | describe('--selector', function(done) { 22 | it('--static', function(done) { 23 | execSjs('--selector ".title a" --text -s', function(result) { 24 | assert.equal(result.length, 10); 25 | done(); 26 | }); 27 | }); 28 | it('--dynamic', function(done) { 29 | execSjs('--selector ".title a" --text -d', function(result) { 30 | assert.equal(result.length, 9); 31 | done(); 32 | }); 33 | }); 34 | }); 35 | }); 36 | 37 | describe('--html', function() { 38 | describe('--selector', function(done) { 39 | it('--static', function(done) { 40 | execSjs('--selector 
".title a" --html -s', function(result) { 41 | assert.equal(result.length, 10); 42 | done(); 43 | }); 44 | }); 45 | it('--dynamic', function(done) { 46 | execSjs('--selector ".title a" --html -d', function(result) { 47 | assert.equal(result.length, 9); 48 | done(); 49 | }); 50 | }); 51 | }); 52 | }); 53 | 54 | describe('--attr', function() { 55 | describe('--selector', function(done) { 56 | it('--static', function(done) { 57 | execSjs('--selector ".title a" --attr href -s', function(result) { 58 | assert.equal(result.length, 10); 59 | done(); 60 | }); 61 | }); 62 | it('--dynamic', function(done) { 63 | execSjs('--selector ".title a" --attr href -d', function(result) { 64 | assert.equal(result.length, 9); 65 | done(); 66 | }); 67 | }); 68 | }); 69 | }); 70 | }); 71 | -------------------------------------------------------------------------------- /test/setupServer.js: -------------------------------------------------------------------------------- 1 | var express = require('express'), 2 | fs = require('fs'); 3 | 4 | module.exports = function(grunt, port) { 5 | var app = express(), 6 | HN_CLONE = fs.readFileSync(__dirname + '/static/hacker-news-clone.html'); 7 | 8 | app.get('/hacker-news-clone', function(req, res) { 9 | res.status(200); 10 | res.send(HN_CLONE); 11 | }); 12 | 13 | app.post('/hacker-news-clone', function(req, res) { 14 | res.status(200); 15 | res.send('
random text
'); 16 | }); 17 | 18 | app.param('id', function(req, res, next, id) { 19 | var regex = /^[\d\w]+$/; 20 | if (regex.test(id)) { 21 | next(); 22 | } else { 23 | next('route'); 24 | } 25 | }); 26 | app.get('/watch/:id', function(req, res, next) { 27 | res.status(200); 28 | res.send(req.params.id); 29 | }); 30 | app.get('/info/:id', function(req, res, next) { 31 | res.status(200); 32 | res.send(req.params.id); 33 | }); 34 | app.post('/watch/:id', function(req, res, next) { 35 | res.status(200); 36 | res.send(req.params.id + "post"); 37 | }); 38 | 39 | var server = app.listen(port || 3000, function() { 40 | console.log('Listening on port %d', server.address().port); 41 | }); 42 | 43 | return server; 44 | }; -------------------------------------------------------------------------------- /test/static/code.js: -------------------------------------------------------------------------------- 1 | /* global window */ 2 | window.someFn = function(n) { 3 | return 'SomeFN ' + n + ' to '+ (n+1); 4 | }; -------------------------------------------------------------------------------- /test/static/hacker-news-clone.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Hacker news clone 5 | 6 | 7 | 10 |
11 | Random Article 2 12 |
13 |
14 | Random Article 3 15 |
16 |
17 | Random Article 4 18 |
19 |
20 | Random Article 5 21 |
22 |
23 | Random Article 6 24 |
25 |
26 | Random Article 7 27 |
28 |
29 | Random Article 8 30 |
31 |
32 | Random Article 9 33 |
34 |
35 | Random Article 10 36 |
37 | 42 | 43 | --------------------------------------------------------------------------------