├── .gitignore ├── README.md ├── bin ├── noodle-server └── tests ├── docs ├── .satya-config.yml ├── 1. Overview.md ├── 10. Server quick start ├── 2. Try it out.md ├── 3. Web service.md ├── 4. Query syntax.md ├── 5. Noodle as node module.md ├── 6. Error handling.md ├── 7. Caching.md ├── 8. Adding to noodle.md └── 9. Tests.md ├── index.js ├── lib ├── cache.js ├── config.json ├── logger.js ├── noodle-middleware.js ├── noodle.js └── types │ ├── feed.js │ ├── html.js │ ├── json.js │ └── xml.js ├── package.json └── tests ├── document.atom ├── document.html ├── document.json ├── document.xml ├── fixtures.js ├── server.js └── tests.js /.gitignore: -------------------------------------------------------------------------------- 1 | _site 2 | _bin 3 | _config-local.yml 4 | FiddlerRules.farx 5 | .DS_Store 6 | 7 | lib-cov 8 | *.seed 9 | *.log 10 | *.csv 11 | *.dat 12 | *.out 13 | *.pid 14 | *.gz 15 | 16 | pids 17 | logs 18 | results 19 | 20 | node_modules 21 | npm-debug.log 22 | .satya -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [noodle](https://noodle.dharmafly.com) 2 | ============================= 3 | 4 | noodle is a Node.js server and module for querying and scraping data from web documents. 
It features: 5 | 6 | ```JSON 7 | { 8 | "url": "https://github.com/explore", 9 | "selector": "ol.ranked-repositories h3 a", 10 | "extract": "href" 11 | } 12 | ``` 13 | 14 | Features 15 | -------- 16 | 17 | - Cross domain document querying (html, json, xml, atom, rss feeds) 18 | - Server supports querying via JSONP and JSON POST 19 | - Multiple queries per request 20 | - Access to queried server headers 21 | - Allows for POSTing to web documents 22 | - In memory caching for query results and web documents 23 | 24 | Server quick start 25 | ------------------ 26 | 27 | Setup 28 | 29 | $ npm install noodlejs 30 | 31 | or 32 | 33 | $ git clone git@github.com:dharmafly/noodle.git 34 | $ cd noodle 35 | $ npm install 36 | 37 | Start the server by running the binary 38 | 39 | $ bin/noodle-server 40 | Noodle node server started 41 | ├ process title node-noodle 42 | ├ process pid 4739 43 | └ server port 8888 44 | 45 | 46 | You may specify a port number as an argument 47 | 48 | $ bin/noodle-server 9090 49 | Noodle node server started 50 | ├ process title node-noodle 51 | ├ process pid 4739 52 | └ server port 9090 53 | 54 | 55 | Noodle as a node module 56 | ----------------------- 57 | 58 | If you are interested in the node module just run ```npm install noodlejs```, 59 | require it and check out the [noodle api](https://noodle.dharmafly.com/reference/#Noodle-as-node-module) 60 | 61 | ```javascript 62 | var noodle = require('noodlejs'); 63 | 64 | noodle.query({ 65 | url: 'https://github.com/explore', 66 | selector: 'ol.ranked-repositories h3 a', 67 | extract: 'href' 68 | }) 69 | .then(function (results) { 70 | console.log(results); 71 | }); 72 | ``` 73 | 74 | Tests 75 | ----- 76 | 77 | The noodle tests create a temporary server on port `8889` which the automated 78 | tests tell noodle to query against. 
79 | 80 | To run tests you can use the provided binary *from the noodle package 81 | root directory*: 82 | 83 | $ cd noodle 84 | $ bin/tests 85 | 86 | Contribute 87 | ---------- 88 | 89 | Contributors and suggestions welcomed. 90 | 91 | - [https://noodle.dharmafly.com](https://noodle.dharmafly.com) 92 | - [https://github.com/dharmafly/noodle](https://github.com/dharmafly/noodle) 93 | -------------------------------------------------------------------------------- /bin/noodle-server: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | process.title = 'node-noodle'; 4 | 5 | var connect = require('connect'), 6 | http = require('http'), 7 | url = require('url'), 8 | fs = require('fs'), 9 | limiter = require('connect-ratelimit'), 10 | noodlemw = require('../lib/noodle-middleware'), 11 | version = getVersion(), 12 | limits = getConfig().rateLimit, 13 | port = process.argv[2] || 8888, 14 | app; 15 | 16 | limits.end = false; 17 | 18 | app = connect() 19 | .use(function (req, res, next) { 20 | if (url.parse(req.url).pathname === '/version') { 21 | res.writeHead(200, { 22 | 'Content-Type': 'application/json; charset=utf-8' 23 | }); 24 | res.end('{"version":' + version + '}'); 25 | } else { 26 | next(); 27 | } 28 | }) 29 | .use(limiter(limits)) 30 | .use(function (req, res, next) { 31 | if (res.ratelimit.exceeded) { 32 | res.statusCode = 429; 33 | res.end('[{"results": [], "error": "Rate limit exceeded"}]'); 34 | } else { 35 | next(); 36 | } 37 | }) 38 | .use(connect.query()) 39 | .use(connect.json()) 40 | .use(noodlemw.parseQueries) 41 | .use(noodlemw.noodleQueries) 42 | .use(noodlemw.respond); 43 | 44 | http.createServer(app).listen(port, function () { 45 | require('colors'); 46 | with (console) { 47 | log(' Noodle node server started'.magenta); 48 | log(' ├ process title '.magenta, process.title.toString().green); 49 | log(' ├ process pid '.magenta, process.pid.toString().green); 50 | log(' └ server port 
'.magenta, port.toString().green); 51 | } 52 | }); 53 | 54 | // Return the noodle config as an object 55 | 56 | function getConfig () { 57 | var path = require('path').resolve(__dirname, '../lib/config.json'), 58 | config = fs.readFileSync(path).toString(); 59 | return JSON.parse(config); 60 | } 61 | 62 | // Return the noodle version number 63 | 64 | function getVersion () { 65 | var path = require('path').resolve(__dirname, '../package.json'); 66 | return JSON.parse(fs.readFileSync(path).toString()).version; 67 | } -------------------------------------------------------------------------------- /bin/tests: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Start the testing server and store its pid 4 | node tests/server.js & pid=$! 5 | 6 | # Run the tests 7 | node_modules/mocha/bin/mocha tests/tests.js --timeout 4000 --reporter list 8 | 9 | # Kill the test server via its pid 10 | kill $! 11 | -------------------------------------------------------------------------------- /docs/.satya-config.yml: -------------------------------------------------------------------------------- 1 | ######################################### 2 | # project site config 3 | 4 | project_name: noodle 5 | project_url: https://github.com/dharmafly/noodle 6 | version: '0.3.2' 7 | # options: forest, ocean, horus, seagrass, sundae, slate 8 | theme: ocean 9 | twitter_url: https://twitter.com/dharmafly 10 | # options: javascript, css, html5 11 | lang: javascript 12 | scripts: 13 | - src: //ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js 14 | # - src: https://raw.github.com/dharmafly/PROJECT-REPO/master/PROJECT.js 15 | quote: 16 | #- quote: 17 | #- cite: 18 | # analytics 19 | ga_id: UA-34978047-5 20 | download_links: 21 | - text: Edge 22 | subtext: (master) 23 | href: https://github.com/dharmafly/noodle/zipball/master 24 | title: The repo's latest codebase (zip). Potentially unstable. 
25 | sections: 26 | - path: /index.html 27 | name: Overview 28 | - path: /reference/index.html 29 | name: Reference 30 | 31 | ######## END project site config ######## 32 | -------------------------------------------------------------------------------- /docs/1. Overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: overview 3 | heading: 'Overview' 4 | --- 5 | 6 |  noodle is a Node.js server and module for querying and scraping data from web documents. It features: 7 | 8 | - Cross domain document querying (html, json, xml, atom, rss feeds) 9 | - Server supports querying via JSONP and JSON POST 10 | - Multiple queries per request 11 | - Access to queried server headers 12 | - Allows for POSTing to web documents 13 | - In memory caching for query results and web documents -------------------------------------------------------------------------------- /docs/10. Server quick start: -------------------------------------------------------------------------------- 1 | --- 2 | category: overview 3 | heading: 'Server quick setup' 4 | --- 5 | 6 | Setup 7 | 8 | $ git clone https://github.com/dharmafly/noodle.git 9 | $ cd noodle 10 | $ npm install 11 | 12 | Start the server by running the binary 13 | 14 | $ bin/noodle-server 15 | Server running on port 8888 16 | 17 | You may specify a port number as an argument 18 | 19 | $ bin/noodle-server 9090 20 | Server running on port 9090 21 | -------------------------------------------------------------------------------- /docs/2. 
Try it out.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: overview 3 | heading: 'Try it out' 4 | --- 5 | 6 | ## Install via NPM 7 | 8 | $ npm install noodlejs 9 | 10 | ## Install via Git 11 | 12 | $ git clone https://github.com/dharmafly/noodle.git 13 | 14 | ## Run the server and GET or POST queries on `localhost:8888` 15 | 16 | $ cd noodle 17 | # or `cd node_modules/noodlejs` if installed via npm 18 | $ bin/noodle-server 19 | Server running on port 8888 20 | 21 | ## Or use as a node module 22 | 23 | $ var noodle = require('noodlejs'); 24 | 25 | 26 | ## Editor 27 | 28 | Below is an editor where you can try writing a query yourself. 29 | 30 | The query below tells noodle to go to the google search result for 31 | JavaScript and expect a html file. Then using the selector pick out 32 | all of the result anchors. Finally the query says to extract the 33 | text for each of those anchor elements. 34 | 35 | Press run below to see the output: 36 | 37 | var query = { 38 | url: 'https://google.com/search?q=javascript', 39 | type: 'html', 40 | selector: 'h3.r a', 41 | extract: 'text' 42 | }, 43 | uriQuery = encodeURIComponent(JSON.stringify(query)), 44 | request = 'https://example.noodle.dharmafly.com/?q=' + 45 | uriQuery + '&callback=?'; 46 | 47 | // Make Ajax request to Noodle server 48 | jQuery.getJSON(request, function (data) { 49 | alert(data[0].results); 50 | }); 51 | 52 | Noodle queries don't just support html but also json, feeds and plain xml. They can be a lot more powerful too. 53 | [Read the reference for more details.](https://noodle.dharmafly.com/reference) 54 | -------------------------------------------------------------------------------- /docs/3. Web service.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Web service' 4 | --- 5 | 6 | Noodle can be used as both a web service and a node module. 
In each case, key/value objects are used as queries to fetch and extract data from web documents. 7 | 8 | noodle currently supports multiple web documents with an almost uniform query syntax for grabbing data from the different types (html, json, feeds, xml). 9 | 10 | noodle is ready to run as a web service from `bin/noodle-server`. 11 | 12 | 13 | ## Run the server 14 | 15 | $ cd noodle 16 | # or `cd node_modules/noodlejs` if installed via npm 17 | $ bin/noodle-server 18 | Server running on port 8888 19 | 20 | 21 | ## GET or POST 22 | 23 | The server supports queries via both GET and POST. 24 | 25 | ### GET 26 | 27 | The query itself can be sent in the `q` parameter either as a url encoded JSON blob or as a querystring serialised representation (`jQuery.param()`). 28 | 29 | noodle supports JSONP if a `callback` parameter is supplied. 30 | 31 | GET https://example.noodle.dharmaflt.com?q={JSONBLOB}&callback=foo 32 | 33 | 34 | ### POST 35 | 36 | noodle also supports a query sent as JSON in the POST body. 37 | 38 | POST https://example.noodle.dharmafly.com 39 | 40 | 41 | ## Rate limiting 42 | 43 | The web service also provides rate limiting out of the box with 44 | [connect-ratelimit](https://github.com/dharmafly/connect-ratelimit). 45 | 46 | 47 | ## Configuration 48 | 49 | ### Server port 50 | 51 | The specify what port the noodle web service serves on just write it as the 52 | first argument to the binary. 53 | 54 | $ bin/noodle-server 9000 55 | Server running on port 9000 56 | 57 | ### Behaviour settings 58 | 59 | Various noodle settings like cache and ratelimit settings are exposed 60 | and can be edited in `lib/config.json`. 
61 | 62 | { 63 | // Setting to true will log out information to the 64 | // terminal 65 | 66 | "debug": true, 67 | 68 | "resultsCacheMaxTime": 3600000, 69 | "resultsCachePurgeTime": 60480000, // -1 will turn purging off 70 | "resultsCacheMaxSize": 124, 71 | 72 | "pageCacheMaxTime": 3600000, 73 | "pageCachePurgeTime": 60480000, // -1 will turn purging off 74 | "pageCacheMaxSize": 32, 75 | 76 | // If no query type option is supplied then 77 | // what should noodle assume 78 | 79 | "defaultDocumentType": "html", 80 | 81 | // How the noodle scraper identifies itself 82 | // to scrape targets 83 | 84 | "userAgent": "", 85 | 86 | // Rate limit settings 87 | // https://npmjs.org/package/connect-ratelimit#readme 88 | 89 | "rateLimit": { 90 | "whitelist": ["127.0.0.1", "localhost"], 91 | "blacklist": [], 92 | "categories": { 93 | "normal": { 94 | "totalRequests": 1000, 95 | "every": 3600000000 96 | }, 97 | "whitelist": { 98 | "totalRequests": 10000, 99 | "every": 60000000 100 | }, 101 | "blacklist": { 102 | "totalRequests": 0, 103 | "every": 0 104 | } 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /docs/4. Query syntax.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Query syntax' 4 | --- 5 | 6 | A simple query looks like this: 7 | 8 | { 9 | "url": "http://chrisnewtn.com", 10 | "type": "html", 11 | "selector": "ul.social li a", 12 | "extract": "href", 13 | } 14 | 15 | It says to go to a friend's website and for noodle to expect a html document. 16 | Then to select anchor elements in a list and for each one extract the href 17 | attribute's value. 18 | 19 | The `type` property is used to tell noodle if you are wanting to scrape a html 20 | page, json document etc. If no type is specified then a html page will be 21 | assumed by default. 22 | 23 | A similar query can be constructed to extract information from a JSON document. 
24 | JSONSelect is used as the underlying library to do this. It supports common CSS3 25 | selector functionality. You can [familiarize yourself with it here.](http://jsonselect.org/#tryit) 26 | 27 | { 28 | "url": "https://search.twitter.com/search.json?q=friendship", 29 | "selector": ".results .from_user", 30 | "type": "json" 31 | } 32 | 33 | An `extract` property is not needed for a query on JSON documents as json 34 | properties have no metadata and just a single value were as a html element 35 | can have text, the inner html or an attribute like `href`. 36 | 37 | ## Different types (html, json, feed & xml) 38 | 39 | ### html 40 | 41 | **Note:** Some xml documents can be parsed by noodle under the html type! 42 | 43 | The html type is the only type to have the `extract` property. This is because 44 | the other types are converted to JSON. 45 | 46 | The `extract` property (optional) could be the HTML element's attribute 47 | but it is not required. 48 | 49 | Having `"html"` or `"innerHTML"` as the `extract` value will return the 50 | containing HTML within that element. 51 | 52 | Having `"text"` as the `extract` value will return only the text. noodle will 53 | strip out any new line characters found in the text. 54 | 55 | Return data looks like this: 56 | 57 | [ 58 | { 59 | "results": [ 60 | "http://twitter.com/chrisnewtn", 61 | "http://plus.google.com/u/0/111845796843095584341" 62 | ], 63 | "created": "2012-08-01T16:22:14.705Z" 64 | } 65 | ] 66 | 67 | Having no specific extract rule will assume a default of extracting `"text"` 68 | from the `selector`. 69 | 70 | It is also possible to request multiple properties to extract in one query if 71 | one uses an array. 
72 | 73 | Query: 74 | 75 | { 76 | "url": "http://chrisnewtn.com", 77 | "selector": "ul.social li a", 78 | "extract": ["href", "text"] 79 | } 80 | 81 | Response: 82 | 83 | [ 84 | { 85 | "results": [ 86 | { 87 | "href": "http://twitter.com/chrisnewtn", 88 | "text": "Twitter" 89 | }, 90 | { 91 | "href": "http://plus.google.com/u/0/111845796843095584341", 92 | "text": "Google+" 93 | } 94 | ], 95 | "created": "2012-08-01T16:23:41.913Z" 96 | } 97 | ] 98 | 99 | In the query's `selector` property use the standard CSS DOM selectors. 100 | 101 | ### json and xml 102 | 103 | The same rules apply from html to the json and xml types. Only that the 104 | `extract` property should be ommitted from queries as the JSON node value(s) 105 | targetted by the `selector` is always assumed. 106 | 107 | In the query's `selector` property use 108 | [JSONSelect](http://jsonselect.org/#tryit) style selectors. 109 | 110 | ### feeds 111 | 112 | The same rules apply to the json and xml types. Only that the `extract` property 113 | should be ommitted from queries as the JSON node value(s) targetted by the 114 | `selector` is always assumed. 115 | 116 | In the query's `selector` property use 117 | [JSONSelect](http://jsonselect.org/#tryit) style selectors. 118 | 119 | The feed type is based upon 120 | [node-feedparser](https://github.com/danmactough/node-feedparser) so it 121 | supports Robust RSS, Atom, and RDF standards. 122 | 123 | [Familiarize yourself with its](https://github.com/danmactough/node-feedparser#what-is-the-parsed-output-produced-by-feedparser) normalisation format before you use JSONSelect style 124 | selector. 125 | 126 | ## Getting the entire web document 127 | 128 | If no `selector` is specified than the entire document is returned. This is a 129 | rule applied to all types of docments. The `extract` rule will be ignored if 130 | included. 
131 | 132 | Query: 133 | 134 | { 135 | "url": "https://search.twitter.com/search.json?q=friendship" 136 | } 137 | 138 | Response: 139 | 140 | [ 141 | { 142 | "results": [""], 143 | "created": "2012-10-24T15:37:29.796Z" 144 | } 145 | ] 146 | 147 | ## Mapping a query to familiar properties 148 | 149 | Queries can also be written in noodle's map notation. The map notation allows 150 | for the results to be accessible by your own more helpful property names. 151 | 152 | In the example below map is used to create a result object of a person and 153 | their repos. 154 | 155 | { 156 | "url": "https://github.com/chrisnewtn", 157 | "type": "html", 158 | "map": { 159 | "person": { 160 | "selector": "span[itemprop=name]", 161 | "extract": "text" 162 | }, 163 | "repos": { 164 | "selector": "li span.repo", 165 | "extract": "text" 166 | } 167 | } 168 | } 169 | 170 | With results looking like this: 171 | 172 | [ 173 | { 174 | "results": { 175 | "person": [ 176 | "Chris Newton" 177 | ], 178 | "repos": [ 179 | "cmd.js", 180 | "simplechat", 181 | "sitestatus", 182 | "jquery-async-uploader", 183 | "cmd-async-slides", 184 | "elsewhere", 185 | "pablo", 186 | "jsonpatch.js", 187 | "jquery.promises", 188 | "llamarama" 189 | ] 190 | }, 191 | "created": "2013-03-25T15:38:01.918Z" 192 | } 193 | ] 194 | 195 | ## Getting hold of page headers 196 | 197 | Within a query include the `headers` property with an array value listing the 198 | headers you wish to recieve back as an object structure. `'all'` may also be 199 | used as a value to return all of the server headers. 200 | 201 | Headers are treated case-insensitive and the returned property names will 202 | match exactly to the string you requested with. 
203 | 204 | Query: 205 | 206 | { 207 | "url": "http://github.com", 208 | "headers": ["connection", "content-TYPE"] 209 | } 210 | 211 | Result: 212 | 213 | [ 214 | { 215 | "results": [...], 216 | "headers": { 217 | "connection": "keep-alive", 218 | "content-TYPE": "text/html" 219 | } 220 | "created":"2012-11-14T13:06:02.521Z" 221 | } 222 | ] 223 | 224 | ### Link headers for pagination 225 | 226 | noodle provides a shortcut to the server Link header with the query 227 | `linkHeader` property set to `true`. Link headers are useful as some web APIs 228 | use them to expose their pagination. 229 | 230 | The Link header will be parsed to an object structure. If you wish to have the Link header in its usual formatting then include it in the `headers` array instead. 231 | 232 | Query: 233 | 234 | { 235 | "url": "https://api.github.com/users/premasagar/starred", 236 | "type": "json", 237 | "selector": ".language", 238 | "headers": ["connection"], 239 | "linkHeader": true 240 | } 241 | 242 | Result: 243 | 244 | [ 245 | { 246 | "results": [ 247 | "JavaScript", 248 | "Ruby", 249 | "JavaScript", 250 | ], 251 | "headers": { 252 | "connection": "keep-alive", 253 | "link": { 254 | "next": "https://api.github.com/users/premasagar/starred?page=2", 255 | "last": "https://api.github.com/users/premasagar/starred?page=21" 256 | } 257 | }, 258 | "created": "2012-11-16T15:48:33.866Z" 259 | } 260 | ] 261 | 262 | 263 | ## Querying to a POST url 264 | 265 | noodle allows for post data to be passed along to the target web server 266 | specified in the url. This can be optionally done with the `post` property 267 | which takes an object map of the post data key/values. 268 | 269 | { 270 | "url": "http://example.com/login.php", 271 | "post": { 272 | "username": "john", 273 | "password": "123" 274 | }, 275 | "select": "h1.username", 276 | "type": "html" 277 | } 278 | 279 | Take not however that queries with the `post` property will not be cached. 
280 | 281 | ## Querying without caching 282 | 283 | If `cache` is set to `false` in your query then noodle will not cache the 284 | results or associated page and it will get the data fresh. This is useful for 285 | debugging. 286 | 287 | { 288 | "url": "http://example.com", 289 | "selector": "h1", 290 | "cache": "false" 291 | } 292 | 293 | ## Query errors 294 | 295 | noodle aims to give errors for the possible use cases were a query does 296 | not yield any results. 297 | 298 | Each error is specific to one result object and are contained in the `error` 299 | property as a string message. 300 | 301 | Response: 302 | 303 | [ 304 | { 305 | "results": [], 306 | "error": "Document not found" 307 | } 308 | ] 309 | 310 | noodle also falls silently with the `'extract'` property by ommitting any 311 | extract results from the results object. 312 | 313 | Consider the following JSON response to a partially incorrect query. 314 | 315 | Query: 316 | 317 | { 318 | "url": "http://chrisnewtn.com", 319 | "selector": "ul.social li a", 320 | "extract": ["href", "nonexistent"] 321 | } 322 | 323 | Response: 324 | 325 | The extract "nonexistent" property is left out because it was not found 326 | on the element. 327 | 328 | [ 329 | { 330 | "results": [ 331 | { 332 | "href": "http://twitter.com/chrisnewtn" 333 | }, 334 | { 335 | "href": "http://plus.google.com/u/0/111845796843095584341" 336 | } 337 | ], 338 | "created": "2012-08-01T16:28:19.167Z" 339 | } 340 | ] 341 | 342 | ## Multiple queries 343 | 344 | Multiple queries can be made per request to the server. You can mix between 345 | different types of queries in the same request as well as queries in the map 346 | notation. 
347 | 348 | Query: 349 | 350 | [ 351 | { 352 | "url": "http://chrisnewtn.com", 353 | "selector": "ul.social li a", 354 | "extract": ["text", "href"] 355 | }, 356 | { 357 | "url": "http://premasagar.com", 358 | "selector": "#social_networks li a.url", 359 | "extract": "href" 360 | } 361 | ] 362 | 363 | Response: 364 | 365 | [ 366 | { 367 | "results": [ 368 | { 369 | "href": "http://twitter.com/chrisnewtn", 370 | "text": "Twitter" 371 | }, 372 | { 373 | "href": "http://plus.google.com/u/0/111845796843095584341", 374 | "text": "Google+" 375 | } 376 | ], 377 | "created": "2012-08-01T16:23:41.913Z" 378 | }, 379 | { 380 | "results": [ 381 | "http://dharmafly.com/blog", 382 | "http://twitter.com/premasagar", 383 | "https://github.com/premasagar", 384 | ], 385 | "created": "2012-08-01T16:22:13.339Z" 386 | } 387 | ] 388 | 389 | ## Proxy Support 390 | 391 | When calling a page multiple times some sites can and will ban your IP address, Adding support for proxy IP addresses allows the rotation of IP addresses. 392 | 393 | Query: 394 | 395 | { 396 | "url": "http://chrisnewtn.com", 397 | "selector": "ul.social li a", 398 | "extract": ["text", "href"], 399 | "proxy": "XXX.XXX.XXX.XXX" 400 | } 401 | -------------------------------------------------------------------------------- /docs/5. Noodle as node module.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Noodle as node module' 4 | --- 5 | 6 | **Note:** Since noodle's internal cache uses an interval this will keep the 7 | related node process running indefinately. Be sure to run `noodle.stopCache()` 8 | in your code when you're finished with noodle. 9 | 10 | ## Methods 11 | 12 | ### noodle.query 13 | 14 | The main entry point to noodle's functionality is the `query` method. This 15 | method accepts a query or an array of queries as its only parameter and returns 16 | a [promise](https://github.com/kriskowal/q). 
17 | 18 | var noodle = require('noodlejs'); 19 | noodle.query(queries).then(function (results) { 20 | console.log(results); 21 | }); 22 | 23 | The makeup of query(s) is analagous to using noodle as a web service (as 24 | [stated above](http://noodlejs.com/reference/#query-syntax)). The 25 | exception being that you supply a proper object and not JSON. 26 | 27 | ### noodle.fetch 28 | 29 | This method returns a [promises](https://github.com/kriskowal/q). Which upon 30 | resolutions hands over the requested web document. 31 | 32 | noodle.fetch(url).then(function (page) { 33 | console.log(page); 34 | }); 35 | 36 | 37 | ### noodle.html.select 38 | 39 | For applying one query to a html string and retrieving the results. 40 | 41 | noodle.html.select(html, {selector: 'title', extract: 'innerHTML'}) 42 | .then(function (result) { 43 | console.log(result); 44 | }); 45 | 46 | 47 | ### noodle.json.select 48 | 49 | For applying one query to a parsed JSON representation (object). 50 | 51 | var parsed = JSON.parse(json); 52 | noodle.html.select(parsed, {selector: '.name'}) 53 | .then(function (result) { 54 | console.log(result); 55 | }); 56 | 57 | ## noodle.feed.select 58 | 59 | Normalises an RSS, ATOM or RDF string with 60 | [node-feedparser](https://github.com/danmactough/node-feedparser) then proxies 61 | that normalised object to `noodle.json.select`. 62 | 63 | ### noodle.xml.select 64 | 65 | Proxies to `noodle.json.select`. 66 | 67 | ### noodle events 68 | 69 | noodle's `noodle.events` namespace allows one to listen for emitted cache 70 | related events. Noodle inherits from node's [EventEmitter](http://nodejs.org/api/events.html#events_class_events_eventemitter). 
71 | 72 | // Called when a page is cached 73 | noodle.events.on('cache/page', function (obj) { 74 | //obj is the page cache object detailing the page, its headers 75 | //and when it was first cached 76 | }); 77 | 78 | // Called when a result is cached 79 | noodle.events.on('cache/result', function (obj) { 80 | //obj is the result cache object detailing the result and when 81 | //it was first cached 82 | }); 83 | 84 | // Called when the cache is purged 85 | noodle.events.on('cache/purge', function (arg1, arg2) { 86 | //arg1 is a javascript date representing when the cache was purged 87 | //arg2 is the time in milliseconds until the next cache purge 88 | }); 89 | 90 | // Called when a cached item has expired from the cache 91 | noodle.events.on('cache/expire', function (obj) { 92 | //obj is the cache item 93 | }); 94 | 95 | ### Configuration 96 | 97 | Configuration is possible programmatically via `noodle.configure(obj)`. 98 | 99 | This accepts a conig object which can be partly or fully representing the 100 | config options. 101 | 102 | This object is applied over the existing config found in the `config.json`. 103 | 104 | Example for change just two settings: 105 | 106 | var noodle = require('noodlejs'); 107 | 108 | // Do not display messages to the terminal and set 109 | // the default document type to json 110 | 111 | noodle.configure({ 112 | debug: false, 113 | defaultDocumentType: "json" 114 | }); 115 | -------------------------------------------------------------------------------- /docs/6. Error handling.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Error handling' 4 | --- 5 | 6 | Noodle will fire various errors which one can listen for with the `fail()` 7 | handler. 
8 | 9 | noodle.html.fetch(query) 10 | .then(function (result) { 11 | console.log('The results are', results); 12 | }) 13 | .fail(function (error) { 14 | console.log('Uh oh', error.message); 15 | }); 16 | 17 | ## Possible errors 18 | 19 | The noodle module itself emits only one error: 20 | 21 | - `"Document not found"` when a targetted url is not found. 22 | 23 | Were as the specific document type modules emit their own but should bubble 24 | up to the main `noodle.query` method. 25 | 26 | - `'Could not parse XML to JSON'` 27 | - `'Could not parse JSON document'` 28 | - `'Could not match with that selector'` 29 | - `'Could not match with that selector or extract value'` 30 | -------------------------------------------------------------------------------- /docs/7. Caching.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Caching' 4 | --- 5 | 6 | noodle includes an in memory cache for both queried pages and the query 7 | results to help with the speed of requests. 8 | 9 | This cache can be configured in the `noodlejs/lib/config.json` file. 10 | 11 | This cache is included in the noodle library core not at its web service. 12 | 13 | Caching is done on a singular query basis and not per all queries in a request. 14 | 15 | By default the page cache and results cache's individual items have a life time 16 | of an hour. With a cache itself having total size of 124 recorded items in 17 | memory at one time. A cache is also cleared entirely on a weekly basis. 18 | 19 | These values can all be changed from noodle's json config. 20 | 21 | ## HTTP caching headers 22 | 23 | The noodle web service includes `Expires` header. This is always set to the 24 | oldest to expire query result in a result set. 25 | 26 | Take not however that the browser [may not cache](http://stackoverflow.com/questions/626057/is-it-possible-to-cache-post-methods-in-http) POST requests to the noodle server. 
-------------------------------------------------------------------------------- /docs/8. Adding to noodle.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Adding to noodle' 4 | --- 5 | 6 | noodle is an open-source project 7 | [maintained on github](https://github.com/dharmafly/premasagar) so raising 8 | issues and forking is encouraged. 9 | 10 | ## Supporting different web documents 11 | 12 | By default noodle supports html, json, standard feeds and xml web documents but 13 | noodle also provides a concise environment for developers to write their own 14 | type modules with prior knowledge only needed in 15 | [promises](https://github.com/kriskowal/q). 16 | 17 | To add their own type, one creates the script for that type in 18 | `noodlejs/lib/types` with the name being what one would type in a query. 19 | 20 | ` $ touch noodlejs/lib/types/csv.js` 21 | 22 | As for the content of the script a developer should expose at least 2 methods 23 | (`_init` & `fetch`) and is recommended to expose a `select` method. These 24 | methods must be written with a promise interface interoperable with 25 | [the q library](https://github.com/kriskowal/q). It is reccomended you just use 26 | [q](https://github.com/kriskowal/q). 27 | 28 | **Required methods** 29 | 30 | `exports._init = function (noodle) {}` 31 | 32 | This function is passed the main noodle library. You should keep hold of this 33 | reference so you can make use of some important noodle methods covered in a bit. 34 | 35 | `exports.fetch = function (url, query) {}` 36 | 37 | This method is the entry point to your module by noodle and possibly other 38 | developers. This is the function which leads to all of your processing. 39 | 40 | Make use of `noodle.cache.get` to resolve your promise early with a cached 41 | results without the need to fetch the page and process the query. 
42 | 43 | It is higly recommended you do not fetch the page yourself but use the core 44 | `noodle.fetch` since this handles page caching for you. 45 | 46 | When you have the document pass it and the query to your `select` function for 47 | processing with the query. 48 | function fetch (url, query) { 49 | var deferred = q.defer(); 50 | if (noodle.cache.check(query)) { 51 | deferred.resolve(noodle.cache.get(query).value); 52 | return deferred.promise; 53 | } else { 54 | return noodle.fetch(url, query).then(function (page) { 55 | return select(page, query); 56 | }); 57 | } 58 | } 59 | 60 | **Recommended methods** 61 | 62 | `exports.select = function (document, query) {}` 63 | 64 | This method is where you do your actual selecting of the data using the web 65 | document given from your `fetch` method via `noodle.fetch`. 66 | 67 | In your algorithm do not account for multiple queries. This is done at a higher 68 | level by noodle which iterates over your type module. 69 | 70 | It is also highly recommended that you cache your result this is done simply by 71 | wrapping it in the `noodle._wrapResults` method. 72 | 73 | `deferred.resolve(noodle._wrapResults(results, query));` 74 | 75 | What defines query properties like `extract` or `select` is what your own 76 | select function expects to find in the `query` object passed in. For example: 77 | 78 | 79 | // Query 80 | { 81 | "url": "http://example.com/data.csv", 82 | "type": "csv", 83 | "from": "row1", 84 | "to": "row10" 85 | } 86 | 87 | // Your interpretation 88 | function select (document, query) { 89 | ... 90 | csvparser.slice(query.from, query.to); 91 | ... 
92 | } 93 | 94 | **Example script** 95 | 96 | An example implementation could look like this: 97 | 98 | var q = require('q'), 99 | noodle = null; 100 | 101 | exports._init = function (n) { 102 | noodle = n; 103 | } 104 | 105 | exports.fetch = function (url, query) { 106 | var deferred = q.Defer(); 107 | if (noodle.cache.check(query)) { 108 | deferred.resolve(noodle.cache.get(query).value); 109 | return deferred.promise; 110 | } else { 111 | return noodle.fetch(url).then(function (page) { 112 | return exports.select(page, query); 113 | }); 114 | } 115 | } 116 | 117 | exports.select = function (page, query) { 118 | var deferred = q.Defer(), 119 | myResults = []; 120 | 121 | /* 122 | your algorithm here, dont forget to 123 | deferred.resolve(noodle._wrapResults(myResults, query)) 124 | or 125 | deferred.fail(new Error("Selector was bad or something like that")) 126 | */ 127 | 128 | return deferred.promise; 129 | } 130 | -------------------------------------------------------------------------------- /docs/9. Tests.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Tests' 4 | --- 5 | 6 | The noodle tests create a temporary server on port `8889` which the automated 7 | tests tell noodle to query against. 
8 | 9 | To run tests you can use the provided binary from the noodle package root 10 | directory: 11 | 12 | $ bin/tests 13 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | exports = require('./lib/noodle.js'); -------------------------------------------------------------------------------- /lib/cache.js: -------------------------------------------------------------------------------- 1 | var _ = require('underscore'); 2 | 3 | // ------------------------------------------------------------ 4 | // Cache class which can store, expose, expire and purge all 5 | // items in its memory. 6 | // 7 | // Two instances of Cache exist as pageCache and resultsCache 8 | // in noodle.js. 9 | // ------------------------------------------------------------ 10 | 11 | module.exports = function Cache (config, noodle) { 12 | var cache = [], 13 | intervalId1, 14 | intervalId2; 15 | 16 | // ------------------------------------------------------------ 17 | // Starts the interval for cache purging and cache expiry. 18 | // Called from noodle.js. 19 | // ------------------------------------------------------------ 20 | 21 | this.start = function () { 22 | 23 | // Check to see if a cache item is to be removed from the 24 | // cache (expired). 25 | 26 | intervalId1 = setInterval(function () { 27 | var now = new Date().getTime(), 28 | initialLength = cache.length, 29 | x = 0, 30 | keep = []; 31 | 32 | while (x < initialLength) { 33 | if ((now - cache[x].created) < config.cacheMaxTime) { 34 | keep.unshift(cache[x]); 35 | } else { 36 | noodle.events.emit('cache/expire', cache[x], config.cacheMaxTime); 37 | } 38 | x++; 39 | } 40 | 41 | cache = keep; 42 | }, 10000); 43 | 44 | // Remove all cache entries every time the cache purge time 45 | // is reached. 
46 | 47 | if (config.cachePurgeTime > 0) { 48 | intervalId2 = setInterval(function () { 49 | cache = []; 50 | noodle.events.emit('cache/purge', new Date(), config.cachePurgeTime); 51 | }, config.cachePurgeTime); 52 | } 53 | }; 54 | 55 | // ------------------------------------------------------------ 56 | // Store an object in the cache tied to specific key. 57 | // 58 | // In noodle: resultsCache stores a result set with the query 59 | // being the key. pageCache stores a document and its headers 60 | // with the url being the key. 61 | // ------------------------------------------------------------ 62 | 63 | this.put = function (key, value) { 64 | var item = { 65 | key: key, 66 | value: value, 67 | created: new Date() 68 | }; 69 | 70 | if (cache.length >= config.maxSize) { 71 | cache.pop(); 72 | } 73 | 74 | cache.unshift(item); 75 | return this.get(key); 76 | }; 77 | 78 | // ------------------------------------------------------------ 79 | // Boolean representing if an item exists for a specific key. 80 | // ------------------------------------------------------------ 81 | 82 | this.check = function (key) { 83 | return (find(key)) ? true : false; 84 | }; 85 | 86 | // ------------------------------------------------------------ 87 | // Returns a cached item based on a specific key. 88 | // 89 | // Cached items are objects with the following structure: 90 | // 91 | // { 92 | // created: 93 | // value: 94 | // } 95 | // ------------------------------------------------------------ 96 | 97 | this.get = function (key) { 98 | var item = find(key), 99 | clone = _.clone(item); 100 | 101 | delete clone.key; 102 | return clone; 103 | }; 104 | 105 | // ------------------------------------------------------------ 106 | // The cache array is exposed. Useful for debugging purposes. 
107 | // ------------------------------------------------------------ 108 | 109 | this.getCache = function () { 110 | return cache; 111 | }; 112 | 113 | // ------------------------------------------------------------ 114 | // Stops running the intervals for the cache checking. Useful 115 | // for removing cache objects from the event loop and keeping 116 | // the node process from running indefinitely. 117 | // ------------------------------------------------------------ 118 | 119 | this.stop = function () { 120 | clearInterval(intervalId1); 121 | clearInterval(intervalId2); 122 | }; 123 | 124 | // Loops through the cache array finding the cached item 125 | // associated with the key. 126 | 127 | function find (key) { 128 | var i = 0; 129 | for (i; i < cache.length; i++) { 130 | if (_.isEqual(key, cache[i].key)) { 131 | return cache[i]; 132 | } 133 | } 134 | } 135 | }; 136 | -------------------------------------------------------------------------------- /lib/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "debug": true, 3 | 4 | "resultsCacheMaxTime": 3600000, 5 | "resultsCachePurgeTime": 60480000, 6 | "resultsCacheMaxSize": 124, 7 | 8 | "pageCacheMaxTime": 3600000, 9 | "pageCachePurgeTime": 60480000, 10 | "pageCacheMaxSize": 32, 11 | 12 | "defaultDocumentType": "html", 13 | 14 | "userAgent": "", 15 | 16 | "rateLimit": { 17 | "whitelist": ["127.0.0.1", "localhost"], 18 | "blacklist": [], 19 | "catagories": { 20 | "normal": { 21 | "totalRequests": 1000, 22 | "every": 3600000000 23 | }, 24 | "whitelist": { 25 | "totalRequests": 10000, 26 | "every": 60000000 27 | }, 28 | "blacklist": { 29 | "totalRequests": 0, 30 | "every": 0 31 | } 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /lib/logger.js: -------------------------------------------------------------------------------- 1 | require('colors'); 2 | 3 | var messages = 0; 4 | 5 | module.exports = function 
(noodle) { 6 | var events = noodle.events, 7 | config = noodle.config; 8 | 9 | function toTerminal(message) { 10 | if (config.debug) { 11 | console.log(('\n [noodle log #' + ++messages + ']').green); 12 | console.log('', new Date().toString().magenta); 13 | console.log('', memUsage().magenta); 14 | console.log('', (message + '\n').magenta); 15 | } 16 | } 17 | 18 | // Called on a query 19 | events.on('noodle/query', function (query) { 20 | toTerminal('Noodle: The query follows...\n ' + JSON.stringify(query)); 21 | }); 22 | 23 | // Called when a page is cached 24 | events.on('cache/page', function (cachePage) { 25 | toTerminal('Cache: Page has been cached'); 26 | }); 27 | 28 | // Called when a result is cached 29 | events.on('cache/result', function (cacheResult) { 30 | toTerminal('Cache: Result has been cached'); 31 | }); 32 | 33 | // Called when the cache is purged 34 | events.on('cache/purge', function (when, next) { 35 | toTerminal('Cache: Purge @ ' + when + ' next in ' + next); 36 | }); 37 | 38 | // Called when a cached item has expired from the cache 39 | events.on('cache/expire', function (item, next) { 40 | toTerminal('Cache: An item expired from cache, next in ' + next); 41 | }); 42 | }; 43 | 44 | function memUsage () { 45 | var heapTotal = process.memoryUsage().heapTotal; 46 | return 'Memory: ' + (heapTotal / 1048576).toFixed(2) + 47 | 'mb (' + heapTotal + ' bytes)'; 48 | } -------------------------------------------------------------------------------- /lib/noodle-middleware.js: -------------------------------------------------------------------------------- 1 | var zlib = require('zlib'), 2 | moment = require('moment'), 3 | _ = require('underscore'), 4 | noodle = require('../lib/noodle'); 5 | 6 | exports.parseQueries = function (req, res, next) { 7 | var hasJSON = (Object.keys(req.body).length > 0), 8 | hasQueryString = (Object.keys(req.query).length > 0), 9 | queries; 10 | 11 | // Handle for different request types 12 | 13 | // Take JSON from request 
body (http post) 14 | queries = (hasJSON) ? req.body : false; 15 | // Take only single query JSON from request querystring (http get) 16 | queries = (queries === false && hasQueryString) ? req.query : queries; 17 | // Take JSON from request querystring (http get) 18 | queries = (req.query.q) ? toJSON(req.query.q) : queries; 19 | 20 | // Handle query(s) with noodle or fail early 21 | 22 | if (queries) { 23 | res.queries = queries; 24 | next(); 25 | } else { 26 | res.noodleData = {error: 'Malformed or no query'}; 27 | exports.respond(req, res); 28 | } 29 | }; 30 | 31 | exports.noodleQueries = function (req, res, next) { 32 | noodle.query(res.queries).then(function (results) { 33 | res.noodleData = results; 34 | next(); 35 | }); 36 | }; 37 | 38 | exports.respond = function (req, res) { 39 | var error = res.noodleData.error, 40 | callback = req.query.callback, 41 | responseBody; 42 | 43 | if (error) { 44 | res.statusCode = 401; 45 | responseBody = '[{"results": [], "error":"' + error + '"}]'; 46 | } else { 47 | res.statusCode = 200; 48 | res.setHeader('Expires', setExpiresHeader(res.noodleData.results)); 49 | responseBody = JSON.stringify(res.noodleData.results); 50 | } 51 | 52 | if (callback) { 53 | res.setHeader('Content-Type', 'application/javascript'); 54 | responseBody = callback + '(' + responseBody + ')'; 55 | } else { 56 | res.setHeader('Content-Type', 'application/json; charset=utf-8'); 57 | } 58 | 59 | responseBody = new Buffer(responseBody, 'utf8'); 60 | 61 | if (req.headers['accept-encoding']) { 62 | res.setHeader('content-encoding', 'gzip'); 63 | zlib.gzip(responseBody, function (err, buffer) { 64 | res.end(buffer); 65 | }); 66 | } else { 67 | res.end(responseBody); 68 | } 69 | }; 70 | 71 | function setExpiresHeader (results) { 72 | var temp; 73 | 74 | results = (_.isArray(results)) ? 
results : [results]; 75 | 76 | // Get the earliest time first (last to expire) 77 | // use concat() to not mutate the original results order 78 | 79 | temp = results.concat().sort(function (a, b) { 80 | return (b.created || 0) - (a.created || 0); 81 | }); 82 | 83 | // Return oldest to expire or return the present time for 84 | // a bad result which was not cached 85 | 86 | if (temp[0].created) { 87 | return moment(temp[0].created.getTime() + noodle.config.resultsCacheMaxTime) 88 | .format('ddd, D MMM YYYY HH:mm:ss') + ' GMT'; 89 | } else { 90 | return moment(new Date()) 91 | .format('ddd, D MMM YYYY HH:mm:ss') + ' GMT'; 92 | } 93 | }; 94 | 95 | 96 | // Wraps JSON.parse so that numbers are treated as an invalid argument 97 | 98 | function toJSON (str) { 99 | var x; 100 | try { 101 | x = JSON.parse(str); 102 | if (typeof x === 'number') { 103 | return false; 104 | } 105 | return x; 106 | } catch (e) { 107 | return false; 108 | } 109 | } -------------------------------------------------------------------------------- /lib/noodle.js: -------------------------------------------------------------------------------- 1 | var q = require('q'), 2 | fs = require('fs'), 3 | events = require('events'), 4 | request = require('request'), 5 | _ = require('underscore'), 6 | Cache = require('./cache'), 7 | pageCache, 8 | resultsCache; 9 | 10 | 11 | // ------------------------------------------------------------ 12 | // Main noodle entry point for usage. 13 | // 14 | // Accepts one or an array of noodle queries. Based on the 15 | // query type it will make use of the appropriate type module 16 | // to do the processing. 17 | // 18 | // See docs/ for information on what and noodle queries can 19 | // be written. 20 | // ------------------------------------------------------------ 21 | 22 | exports.query = function (queries) { 23 | var deferred = q.defer(), 24 | promises = []; 25 | 26 | // Normalise one query to an array 27 | 28 | queries = _.isArray(queries) ? 
queries : [queries], 29 | 30 | // For each query route resolve it as either a normal query 31 | // or a map query 32 | 33 | queries.forEach(function (query, i) { 34 | var deferred = q.defer(); 35 | 36 | query.type = query.type || exports.config.defaultDocumentType; 37 | query.cache = (query.cache === false) ? false : true; 38 | 39 | exports.events.emit('noodle/query', query); 40 | 41 | if (exports[query.type]) { 42 | if (query.map) { 43 | handleQueryMap(query, deferred, i); 44 | } else { 45 | handleQuery(query, deferred, i); 46 | } 47 | } else { 48 | deferred.resolve({results: [], error: 'Document type not supported'}); 49 | } 50 | promises.push(deferred.promise); 51 | }); 52 | 53 | // Return master promise when all queries have resolved 54 | // and ensure that the order they were evaluated is 55 | // maintained 56 | 57 | q.all(promises) 58 | .then(function (results) { 59 | results = results.sort(function (a, b) { 60 | return a.orderNo - b.orderNo; 61 | }); 62 | 63 | results.forEach(function (result) { 64 | delete result.orderNo; 65 | }); 66 | 67 | deferred.resolve({results: results}); 68 | }); 69 | 70 | return deferred.promise; 71 | }; 72 | 73 | function handleQuery (query, deferred, i) { 74 | exports[query.type].fetch(query.url, query) 75 | .then(function (result) { 76 | result.orderNo = i; 77 | if (query.cache) { 78 | result.created = resultsCache.get(query).created; 79 | } 80 | deferred.resolve(result); 81 | }) 82 | .fail(function (error) { 83 | deferred.resolve({results: [], error: error.message, orderNo: i}); 84 | }); 85 | } 86 | 87 | function handleQueryMap (query, deferred, i) { 88 | map(query, function (error, result) { 89 | if (!error) { 90 | result.orderNo = i; 91 | if (query.cache) { 92 | result.created = resultsCache.get(query).created; 93 | } 94 | deferred.resolve(result); 95 | } else { 96 | deferred.resolve({results: [], error: error.message, orderNo: i}); 97 | } 98 | }); 99 | } 100 | 101 | // 
------------------------------------------------------------ 102 | // Fetch a web document (possibly from cache) with a url. 103 | // 104 | // The query should also be passed in as it contains 105 | // details if it should bypass the cache or if it is a POST 106 | // request. 107 | // 108 | // This fetch method is used by the different type modules to 109 | // get the document before they do they interpret the query 110 | // process the document. 111 | // ------------------------------------------------------------ 112 | 113 | exports.fetch = function (url, query, extendedHeaders) { 114 | var deferred = q.defer(), 115 | requestOptions = { 116 | method: 'GET', 117 | uri: url, 118 | headers: {'user-agent': exports.config.userAgent} 119 | }; 120 | 121 | if (query.proxy) { 122 | requestOptions.proxy = query.proxy; 123 | } 124 | 125 | 126 | if (query.post) { 127 | requestOptions.method = 'POST'; 128 | requestOptions.body = serialize(query.post); 129 | requestOptions.headers = _.extend(requestOptions.headers, { 130 | 'Content-Type': 'application/x-www-form-urlencoded', 131 | 'Content-Length': requestOptions.body.length 132 | }); 133 | query.cache = false; 134 | } 135 | 136 | if (extendedHeaders) { 137 | _.extend(requestOptions.headers, extendedHeaders); 138 | } 139 | 140 | // (!) This aspect should be revised. 141 | // Force cache true if the person wants header information 142 | // since header data is read from cache 143 | query.cache = (query.headers || query.linkHeader) ? 
true : query.cache; 144 | 145 | if (pageCache.check(url) && query.cache) { 146 | deferred.resolve(pageCache.get(url).value.body); 147 | } else { 148 | getDocument(query.cache, requestOptions, deferred); 149 | } 150 | 151 | return deferred.promise; 152 | }; 153 | 154 | function getDocument (shouldCache, options, deferred) { 155 | request(options, function (err, response, body) { 156 | if (err || response.statusCode !== 200) { 157 | deferred.reject(new Error('Document not found')); 158 | } else { 159 | if (shouldCache && !pageCache.check(options.uri)) { 160 | //added response.request in order to get the details like location and domain 161 | pageCache.put(options.uri, {body: body, headers: response.headers, request: response.request}); 162 | exports.events.emit('cache/page', pageCache.get(options.uri 163 | )); 164 | } 165 | deferred.resolve(body); 166 | } 167 | }); 168 | } 169 | 170 | // ------------------------------------------------------------ 171 | // Returns an object representing a result set which comprises 172 | // of an array of 1 or more results and the associate page 173 | // header information. 174 | // 175 | // (!!) This is where a result set is cached in resultsCache. 176 | // 177 | // Exposed as it is also called from some type modules. 
178 | // ------------------------------------------------------------ 179 | 180 | exports._wrapResults = function (results, query) { 181 | var resultSet = {}; 182 | 183 | if (results.length || Object.keys(results).length) { 184 | resultSet.results = results; 185 | 186 | if (query.headers) { 187 | resultSet.headers = getHeadersForResultSet(query); 188 | } 189 | 190 | if (query.request) { 191 | resultSet.request = getRequestDetailsForResultSet(query); 192 | } 193 | 194 | if (query.linkHeader) { 195 | resultSet.headers = resultSet.headers || {}; 196 | resultSet.headers.link = getLinkHeaders(query) || null; 197 | } 198 | 199 | if (query.cache) { 200 | if (resultsCache.check(query) === false) { 201 | resultsCache.put(query, resultSet); 202 | exports.events.emit('cache/result', resultsCache.get(query)); 203 | } 204 | } 205 | 206 | return resultSet; 207 | } 208 | 209 | return []; 210 | }; 211 | 212 | // ------------------------------------------------------------ 213 | // The namespace for noodles events. 214 | // 215 | // Events are emitted from both this file and cache.js. 216 | // 217 | // One can subscribe to the following events: 218 | // - cache/page 219 | // - cache/result 220 | // - cache/purge 221 | // - cache/expire 222 | // 223 | // ------------------------------------------------------------ 224 | 225 | exports.events = new events.EventEmitter(); 226 | 227 | // ------------------------------------------------------------ 228 | // An exposed noodle config initialized by an editable 229 | // json representation at lib/config.json 230 | // ------------------------------------------------------------ 231 | 232 | exports.config = JSON.parse(fs.readFileSync(__dirname +'/config.json')); 233 | 234 | // ------------------------------------------------------------ 235 | // Accepts a full or part config object an extends it over 236 | // the existing noodle config. 
237 | // 238 | // This is a way to programmatically configure the config 239 | // without touching lib/config.json 240 | // ------------------------------------------------------------ 241 | 242 | exports.configure = function (obj) { 243 | exports.config = _.extend(exports.config, obj); 244 | }; 245 | 246 | // ------------------------------------------------------------ 247 | // Stops the cache intervals from running in the event loop. 248 | // Allows for the node process to exit. 249 | // ------------------------------------------------------------ 250 | 251 | exports.stopCache = function () { 252 | resultsCache.stop(); 253 | pageCache.stop(); 254 | }; 255 | 256 | // Function called from exports.query() 257 | // 258 | // Takes in a query in the map notation 259 | // 260 | // For each map property, a call to the appropriate type module 261 | // is done and the result is grabbed for that map property's 262 | // value. 263 | // 264 | // When all properties are mapped with values this function calls 265 | // back to exports.query(). 
266 | 267 | function map (query, callback) { 268 | var promises = [], 269 | mappedContainer = {}, 270 | getResultSet, 271 | toPush, 272 | mapTo; 273 | 274 | getResultSet = function (mapTo, query) { 275 | query.map[mapTo].url = query.url; 276 | query.map[mapTo].cache = query.cache; 277 | 278 | return exports[query.type].fetch(query.url, query.map[mapTo]) 279 | .then(function (result) { 280 | mappedContainer[mapTo] = result.results; 281 | }) 282 | .fail(function (error) { 283 | mappedContainer[mapTo] = {results: [], error: error.message}; 284 | }); 285 | }; 286 | 287 | for (mapTo in query.map) { 288 | promises.push(getResultSet(mapTo, query)); 289 | } 290 | 291 | q.all(promises) 292 | .then(function () { 293 | callback(null, exports._wrapResults(mappedContainer, query)); 294 | }) 295 | .fail(function (error) { 296 | callback(error); 297 | }); 298 | } 299 | 300 | // Function called from exports._wrapResults() 301 | // 302 | // Passed in a query and returns the full page headers 303 | // or specific page headers as specified by the query. 304 | 305 | function getHeadersForResultSet (query) { 306 | var bucket = {}, 307 | pageHeaders = pageCache.get(query.url).value.headers, 308 | prop; 309 | 310 | if (query.headers !== 'all' && _.isArray(query.headers)) { 311 | for (prop in pageHeaders) { 312 | query.headers.forEach(function (name) { 313 | if (prop.toLowerCase() === name.toLowerCase()) { 314 | bucket[name] = pageHeaders[prop]; 315 | } 316 | }); 317 | } 318 | return bucket; 319 | } else { 320 | return pageHeaders; 321 | } 322 | } 323 | 324 | 325 | // Function called from exports._wrapResults() 326 | // 327 | // Passed in a query and returns the full request headers 328 | // or specific request headers as specified by the query. 
329 | function getRequestDetailsForResultSet(query) { 330 | var bucket = {}, 331 | requestHeaders = pageCache.get(query.url).value.request, 332 | prop; 333 | 334 | if (query.request !== 'all' && _.isArray(query.request)) { 335 | for (prop in requestHeaders) { 336 | query.request.forEach(function (name) { 337 | if(prop.toLowerCase() === name.toLowerCase()) { 338 | bucket[name] = requestHeaders[prop]; 339 | } 340 | }); 341 | } 342 | return bucket; 343 | } else { 344 | return requestHeaders; 345 | } 346 | } 347 | 348 | // Function called from exports._wrapResults() 349 | // 350 | // Passed in a query this function returns a parsed representation 351 | // of the Link header values (intended to aid people with navigation). 352 | 353 | function getLinkHeaders (query) { 354 | var header = pageCache.get(query.url).value.headers.link, 355 | links = {}, 356 | parts; 357 | 358 | if (header) { 359 | parts = header.split(','); 360 | } else { 361 | return false; 362 | } 363 | 364 | // Parse each part into a named link 365 | parts.forEach(function(p) { 366 | var section = p.split(';'), 367 | url = section[0].replace(/<(.*)>/, '$1').trim(), 368 | name = section[1].replace(/rel="(.*)"/, '$1').trim(); 369 | links[name] = url; 370 | }); 371 | 372 | return links; 373 | } 374 | 375 | // Function called from exports.query 376 | // 377 | // Will return a query parameter string from an object. 378 | 379 | function serialize (obj) { 380 | var str = [], p; 381 | for (p in obj) { 382 | str.push(encodeURIComponent(p) + "=" + encodeURIComponent(obj[p])); 383 | } 384 | return str.join("&"); 385 | } 386 | 387 | // .---------------------------. 
388 | // |noodle initialization stuff| 389 | // '---------------------------' 390 | 391 | // Initialize supported document types 392 | 393 | fs.readdirSync(__dirname + '/types/').forEach(function (file) { 394 | file = file.substr(0, file.lastIndexOf('.')); 395 | exports[file] = require('./types/' + file); 396 | exports[file]._init(exports); 397 | }); 398 | 399 | // Start the logger. 400 | // The logger will output to terminal if config.debug is set 401 | // to true. 402 | 403 | require('./logger')(exports); 404 | 405 | // Initialize caches 406 | 407 | // ------------------------------------------------------------ 408 | // The results cache is exposed for different type modules 409 | // so they can cache their results. 410 | // ------------------------------------------------------------ 411 | 412 | exports.cache = resultsCache = new Cache({ 413 | cacheMaxTime: exports.config.resultsCacheMaxTime, 414 | cachePurgeTime: exports.config.resultsCachePurgeTime, 415 | cacheMaxSize: exports.config.resultsCacheMaxSize 416 | }, exports); 417 | 418 | pageCache = new Cache({ 419 | cacheMaxTime: exports.config.pageCacheMaxTime, 420 | cachePurgeTime: exports.config.pageCachePurgeTime, 421 | cacheMaxSize: exports.config.pageCacheMaxSize 422 | }, exports); 423 | 424 | resultsCache.start(); 425 | pageCache.start(); 426 | -------------------------------------------------------------------------------- /lib/types/feed.js: -------------------------------------------------------------------------------- 1 | var q = require('q'), 2 | feedparser = require('feedparser'), 3 | noodle; 4 | 5 | exports._init = function (n) { 6 | noodle = n; 7 | }; 8 | 9 | exports.fetch = fetch; 10 | exports.select = select; 11 | 12 | function fetch (url, query) { 13 | var deferred = q.defer(); 14 | 15 | if (noodle.cache.check(query)) { 16 | deferred.resolve(noodle.cache.get(query).value); 17 | return deferred.promise; 18 | } else { 19 | return noodle.fetch(url, query).then(function (data) { 20 | return 
select(data, query); 21 | }); 22 | } 23 | } 24 | 25 | function select (data, query) { 26 | return normalise(data).then(function (normalised) { 27 | if (normalised.length === 0) { 28 | throw new Error('The provided document couldn\'t be normalised'); 29 | } 30 | return noodle.json.select(normalised, query); 31 | }); 32 | } 33 | 34 | function normalise (body) { 35 | var deferred = q.defer(), 36 | articles = []; 37 | 38 | feedparser 39 | .parseString(body) 40 | .on('article', function (a) { 41 | articles.push(a); 42 | }) 43 | .on('error', deferred.reject) 44 | .on('complete', function () { 45 | deferred.resolve(articles); 46 | }); 47 | 48 | return deferred.promise; 49 | } -------------------------------------------------------------------------------- /lib/types/html.js: -------------------------------------------------------------------------------- 1 | var q = require('q'), 2 | util = require('util'), 3 | cheerio = require('cheerio'), 4 | noodle; 5 | 6 | exports._init = function (n) { 7 | noodle = n; 8 | }; 9 | 10 | exports.fetch = fetch; 11 | exports.select = select; 12 | 13 | function fetch (url, query) { 14 | var deferred = q.defer(); 15 | 16 | if (noodle.cache.check(query)) { 17 | deferred.resolve(noodle.cache.get(query).value); 18 | return deferred.promise; 19 | } else { 20 | return noodle.fetch(url, query).then(function (page) { 21 | return select(page, query); 22 | }); 23 | } 24 | } 25 | 26 | function select (body, query) { 27 | var deferred = q.defer(), 28 | extract = query.extract || 'text', 29 | selector = query.selector, 30 | page = cheerio.load(body, { lowerCaseTags: true, lowerCaseAttributeNames: true }), 31 | selected = page(selector), 32 | results = []; 33 | 34 | if (!selector) { 35 | deferred.resolve(noodle._wrapResults(body.trim(), query)); 36 | return deferred.promise; 37 | } 38 | else if (util.isArray(extract)) { 39 | selected.each(function (i, elem) { 40 | var item = {}, 41 | notEmpty; 42 | 43 | extract.forEach(function (property) { 44 | 
item[property] = extractProperty(page, elem, property); 45 | notEmpty = notEmpty || item[property]; 46 | }); 47 | 48 | if (notEmpty) { 49 | results.push(item); 50 | } 51 | }); 52 | } 53 | else { 54 | selected.each(function (i, elem) { 55 | results.push(extractProperty(page, elem, extract)); 56 | }); 57 | } 58 | 59 | // Pass back the extracted results from the DOM 60 | 61 | if (results.length === 0) { 62 | deferred.reject(new Error('Could not match with that selector or extract value')); 63 | } else { 64 | deferred.resolve(noodle._wrapResults(results, query)); 65 | } 66 | 67 | return deferred.promise; 68 | } 69 | 70 | function extractProperty (page, elem, property) { 71 | if (property === 'text') { 72 | return page(elem).text().replace(/(\r\n|\n|\r)/gm, "").trim(); 73 | } 74 | else if (property === 'html' || property === 'innerHTML') { 75 | return page(elem).html(); 76 | } 77 | else { 78 | return page(elem).attr(property); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /lib/types/json.js: -------------------------------------------------------------------------------- 1 | var q = require('q'), 2 | jsonSelect = require('JSONSelect'), 3 | noodle; 4 | 5 | exports._init = function (n) { 6 | noodle = n; 7 | }; 8 | 9 | exports.fetch = fetch; 10 | exports.select = select; 11 | 12 | function fetch (url, query) { 13 | var deferred = q.defer(); 14 | 15 | if (noodle.cache.check(query)) { 16 | deferred.resolve(noodle.cache.get(query).value); 17 | return deferred.promise; 18 | } else { 19 | return noodle.fetch(url, query).then(function (data) { 20 | try { 21 | var parsed = JSON.parse(data); 22 | return select(parsed, query); 23 | } catch (e) { 24 | throw new Error('Could not parse JSON document'); 25 | } 26 | }); 27 | } 28 | } 29 | 30 | function select (parsed, query) { 31 | var deferred = q.defer(), 32 | results; 33 | 34 | try { 35 | if (!query.selector) { 36 | deferred.resolve(noodle._wrapResults([parsed], query)); 37 | } 
else { 38 | results = jsonSelect.match(query.selector, [], parsed); 39 | if (results.length === 0) { 40 | deferred.reject(new Error('Could not match with that selector')); 41 | } else { 42 | deferred.resolve(noodle._wrapResults(results, query)); 43 | } 44 | } 45 | } catch (e) { 46 | deferred.reject(new Error('Could not match with that selector')); 47 | } 48 | 49 | return deferred.promise; 50 | } -------------------------------------------------------------------------------- /lib/types/xml.js: -------------------------------------------------------------------------------- 1 | var q = require('q'), 2 | xml2json = require('xml2json'), 3 | noodle; 4 | 5 | exports._init = function (n) { 6 | noodle = n; 7 | }; 8 | 9 | exports.fetch = fetch; 10 | exports.select = select; 11 | 12 | function fetch (url, query) { 13 | var deferred = q.defer(); 14 | 15 | if (noodle.cache.check(query)) { 16 | deferred.resolve(noodle.cache.get(query).value); 17 | return deferred.promise; 18 | } else { 19 | return noodle.fetch(url, query).then(function (xml) { 20 | try { 21 | var parsed = JSON.parse(xml2json.toJson(xml)); 22 | return select(parsed, query); 23 | } catch (e) { 24 | throw new Error('Could not parse XML to JSON'); 25 | } 26 | }); 27 | } 28 | } 29 | 30 | function select (obj, query) { 31 | return noodle.json.select(obj, query); 32 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "noodlejs", 3 | "version": "0.3.2", 4 | "description": "noodle is a proxy server which serves for cross domain data extraction from web documents for any client.", 5 | "main": "./lib/noodle", 6 | "bin": { 7 | "noodle": "./bin/noodle-server" 8 | }, 9 | "dependencies": { 10 | "connect": "~2.3.5", 11 | "connect-ratelimit": "0.0.5", 12 | "JSONSelect": "0.4.0", 13 | "feedparser": "0.10.7", 14 | "moment": "1.7.2", 15 | "cheerio": "0.10.1", 16 | "request": "2.11.4", 
17 | "q": "0.8.9", 18 | "xml2json": "^0.5.1", 19 | "underscore": "1.4.2", 20 | "mocha": "1.7.4", 21 | "chai": "1.4.2", 22 | "colors": "0.6.0-1" 23 | }, 24 | "devDependencies": {}, 25 | "scripts": { 26 | "test": "echo \"Error: no test specified\" && exit 1", 27 | "start": "bin/noodle-server" 28 | }, 29 | "engines": { 30 | "node": "0.6.x" 31 | }, 32 | "repository": { 33 | "type": "git", 34 | "url": "git://github.com/dharmafly/noodle.git" 35 | }, 36 | "keywords": [ 37 | "scraper", 38 | "proxy", 39 | "cross-domain", 40 | "cross domain", 41 | "selectors", 42 | "JSONSelect", 43 | "json", 44 | "html", 45 | "web service", 46 | "rate limit" 47 | ], 48 | "author": "Dharmafly", 49 | "license": "BSD" 50 | } -------------------------------------------------------------------------------- /tests/document.atom: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example Feed 5 | 6 | 2003-12-13T18:30:02Z 7 | 8 | John Doe 9 | 10 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 11 | 12 | 13 | Atom-Powered Robots Run Amok 14 | 15 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 16 | 2003-12-13T18:30:02Z 17 | Some text. 
18 | 19 | 20 | -------------------------------------------------------------------------------- /tests/document.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dharmafly/noodle/52f88f5df0d2a92506aad702d6719350d2e79459/tests/document.html -------------------------------------------------------------------------------- /tests/document.json: -------------------------------------------------------------------------------- 1 | { 2 | "completed_in": 0.013, 3 | "max_id": 288657757152870400, 4 | "max_id_str": "288657757152870400", 5 | "next_page": "?page=2&max_id=288657757152870400&q=dinosaurs", 6 | "page": 1, 7 | "query": "dinosaurs", 8 | "refresh_url": "?since_id=288657757152870400&q=dinosaurs", 9 | "results": [ 10 | { 11 | "created_at": "Tue, 08 Jan 2013 14:45:46 +0000", 12 | "from_user": "_MsMindless", 13 | "from_user_id": 878142511, 14 | "from_user_id_str": "878142511", 15 | "from_user_name": "uh uh", 16 | "geo": null, 17 | "id": 288657757152870400, 18 | "id_str": "288657757152870400", 19 | "iso_language_code": "en", 20 | "metadata": { 21 | "result_type": "recent" 22 | }, 23 | "profile_image_url": "http://a0.twimg.com/profile_images/3081235144/d762e9c3edb63360aa49d7e6e62b683a_normal.jpeg", 24 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3081235144/d762e9c3edb63360aa49d7e6e62b683a_normal.jpeg", 25 | "source": "<a href="http://twitter.com/#!/download/ipad">Twitter for iPad</a>", 26 | "text": "@AmiMindless do they transform into dinosaurs in the night?", 27 | "to_user": "AmiMindless", 28 | "to_user_id": 1021682551, 29 | "to_user_id_str": "1021682551", 30 | "to_user_name": "amandaspiffytho.", 31 | "in_reply_to_status_id": 288657189856505860, 32 | "in_reply_to_status_id_str": "288657189856505856" 33 | }, 34 | { 35 | "created_at": "Tue, 08 Jan 2013 14:43:34 +0000", 36 | "from_user": "jirouishi", 37 | "from_user_id": 157924053, 38 | "from_user_id_str": "157924053", 39 | 
"from_user_name": "Aron ", 40 | "geo": { 41 | "coordinates": [ 42 | 37.596182, 43 | 127.056834 44 | ], 45 | "type": "Point" 46 | }, 47 | "id": 288657201952849900, 48 | "id_str": "288657201952849920", 49 | "iso_language_code": "ko", 50 | "metadata": { 51 | "result_type": "recent" 52 | }, 53 | "profile_image_url": "http://a0.twimg.com/profile_images/3080515500/b0b7d315abba887f50af14acdccfc1ab_normal.jpeg", 54 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3080515500/b0b7d315abba887f50af14acdccfc1ab_normal.jpeg", 55 | "source": "<a href="http://twitter.com/download/android">Twitter for Android</a>", 56 | "text": "omg ㅠㅠ nver thought a docu about dinosaurs would hurt this much https://t.co/TFsRXbMN", 57 | "to_user": null, 58 | "to_user_id": 0, 59 | "to_user_id_str": "0", 60 | "to_user_name": null 61 | }, 62 | { 63 | "created_at": "Tue, 08 Jan 2013 14:43:06 +0000", 64 | "from_user": "imexdanny", 65 | "from_user_id": 302811136, 66 | "from_user_id_str": "302811136", 67 | "from_user_name": "Danny Hughes", 68 | "geo": null, 69 | "id": 288657084260683800, 70 | "id_str": "288657084260683777", 71 | "iso_language_code": "en", 72 | "metadata": { 73 | "result_type": "recent" 74 | }, 75 | "profile_image_url": "http://a0.twimg.com/profile_images/3037826375/3530f8d5cf699b4d024a82e9ac4d8624_normal.jpeg", 76 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3037826375/3530f8d5cf699b4d024a82e9ac4d8624_normal.jpeg", 77 | "source": "<a href="http://twitter.com/download/iphone">Twitter for iPhone</a>", 78 | "text": "RT @AndyRothwell1: @imexdanny what killed ze dinosaurs? 
ZE ICE AGE!", 79 | "to_user": null, 80 | "to_user_id": 0, 81 | "to_user_id_str": "0", 82 | "to_user_name": null, 83 | "in_reply_to_status_id": 288656063555858400, 84 | "in_reply_to_status_id_str": "288656063555858432" 85 | }, 86 | { 87 | "created_at": "Tue, 08 Jan 2013 14:42:42 +0000", 88 | "from_user": "M3LbReEzY", 89 | "from_user_id": 89102612, 90 | "from_user_id_str": "89102612", 91 | "from_user_name": "Don Melocino", 92 | "geo": null, 93 | "id": 288656983974871040, 94 | "id_str": "288656983974871040", 95 | "iso_language_code": "en", 96 | "metadata": { 97 | "result_type": "recent" 98 | }, 99 | "profile_image_url": "http://a0.twimg.com/profile_images/1645636429/IMG_0747_normal.JPG", 100 | "profile_image_url_https": "https://si0.twimg.com/profile_images/1645636429/IMG_0747_normal.JPG", 101 | "source": "<a href="http://twitter.com/">web</a>", 102 | "text": "Some dinosaurs were as small as chickens #RandomSnappleFact.. Good morning Tweepsters", 103 | "to_user": null, 104 | "to_user_id": 0, 105 | "to_user_id_str": "0", 106 | "to_user_name": null 107 | }, 108 | { 109 | "created_at": "Tue, 08 Jan 2013 14:41:42 +0000", 110 | "from_user": "merissa_ariff", 111 | "from_user_id": 401955951, 112 | "from_user_id_str": "401955951", 113 | "from_user_name": "MissCaprisss", 114 | "geo": null, 115 | "id": 288656733193244700, 116 | "id_str": "288656733193244674", 117 | "iso_language_code": "en", 118 | "metadata": { 119 | "result_type": "recent" 120 | }, 121 | "profile_image_url": "http://a0.twimg.com/profile_images/3067213647/f7c0775af4a6d17d036192fd0ffea3d0_normal.jpeg", 122 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3067213647/f7c0775af4a6d17d036192fd0ffea3d0_normal.jpeg", 123 | "source": "<a href="http://twitter.com/">web</a>", 124 | "text": "@Sya_fxqxh Mind u i hate dinosaurs. If u know what i mean. 
ITS NNNIIINNNOOOOOOOOOOOOO", 125 | "to_user": "Sya_fxqxh", 126 | "to_user_id": 303860298, 127 | "to_user_id_str": "303860298", 128 | "to_user_name": "", 129 | "in_reply_to_status_id": 288655990411378700, 130 | "in_reply_to_status_id_str": "288655990411378688" 131 | }, 132 | { 133 | "created_at": "Tue, 08 Jan 2013 14:41:42 +0000", 134 | "from_user": "TwycrossZoo", 135 | "from_user_id": 66683145, 136 | "from_user_id_str": "66683145", 137 | "from_user_name": "Twycross Zoo", 138 | "geo": null, 139 | "id": 288656729946865660, 140 | "id_str": "288656729946865666", 141 | "iso_language_code": "en", 142 | "metadata": { 143 | "result_type": "recent" 144 | }, 145 | "profile_image_url": "http://a0.twimg.com/profile_images/1367172040/TZoo_WPS_Logo_blue_portrait_normal.png", 146 | "profile_image_url_https": "https://si0.twimg.com/profile_images/1367172040/TZoo_WPS_Logo_blue_portrait_normal.png", 147 | "source": "<a href="http://www.facebook.com/twitter">Facebook</a>", 148 | "text": "DINOSAUR FACT OF THE DAY: \nBaryonyx was a carnivore. It is one of the only dinosaurs known to feed on fish,... http://t.co/cgtbjyVL", 149 | "to_user": null, 150 | "to_user_id": 0, 151 | "to_user_id_str": "0", 152 | "to_user_name": null 153 | }, 154 | { 155 | "created_at": "Tue, 08 Jan 2013 14:41:29 +0000", 156 | "from_user": "Hippobatman", 157 | "from_user_id": 42901828, 158 | "from_user_id_str": "42901828", 159 | "from_user_name": "Ulf Martinsen", 160 | "geo": null, 161 | "id": 288656677736157200, 162 | "id_str": "288656677736157187", 163 | "iso_language_code": "en", 164 | "metadata": { 165 | "result_type": "recent" 166 | }, 167 | "profile_image_url": "http://a0.twimg.com/profile_images/248772203/thumb-super-mario-bros-8bit-Mario_normal.jpg", 168 | "profile_image_url_https": "https://si0.twimg.com/profile_images/248772203/thumb-super-mario-bros-8bit-Mario_normal.jpg", 169 | "source": "<a href="http://twitter.com/">web</a>", 170 | "text": "Why aren't there more games with dinosaurs in them? 
There need to be more games with dinosaurs in them. Dinosaurs are cool.", 171 | "to_user": null, 172 | "to_user_id": 0, 173 | "to_user_id_str": "0", 174 | "to_user_name": null 175 | }, 176 | { 177 | "created_at": "Tue, 08 Jan 2013 14:40:47 +0000", 178 | "from_user": "JessiJ0108", 179 | "from_user_id": 737981952, 180 | "from_user_id_str": "737981952", 181 | "from_user_name": "Jessica Johnson", 182 | "geo": null, 183 | "id": 288656499142696960, 184 | "id_str": "288656499142696961", 185 | "iso_language_code": "en", 186 | "metadata": { 187 | "result_type": "recent" 188 | }, 189 | "profile_image_url": "http://a0.twimg.com/profile_images/2936728482/d9911742cdd1f022bd23755829f0dce4_normal.jpeg", 190 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2936728482/d9911742cdd1f022bd23755829f0dce4_normal.jpeg", 191 | "source": "<a href="http://twitter.com/download/android">Twitter for Android</a>", 192 | "text": "RT @QuotingJokes: Kiss me if I'm wrong. But dinosaurs still exist right?", 193 | "to_user": null, 194 | "to_user_id": 0, 195 | "to_user_id_str": "0", 196 | "to_user_name": null 197 | }, 198 | { 199 | "created_at": "Tue, 08 Jan 2013 14:40:46 +0000", 200 | "from_user": "AndyRothwell1", 201 | "from_user_id": 288397628, 202 | "from_user_id_str": "288397628", 203 | "from_user_name": "Andy Rothwell", 204 | "geo": null, 205 | "id": 288656498899419140, 206 | "id_str": "288656498899419137", 207 | "iso_language_code": "en", 208 | "metadata": { 209 | "result_type": "recent" 210 | }, 211 | "profile_image_url": "http://a0.twimg.com/profile_images/2954727240/f16de28072350fdfa393679664894c8e_normal.jpeg", 212 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2954727240/f16de28072350fdfa393679664894c8e_normal.jpeg", 213 | "source": "<a href="http://twitter.com/">web</a>", 214 | "text": "@imexdanny what killed ze dinosaurs? 
ZE ICE AGE!", 215 | "to_user": "imexdanny", 216 | "to_user_id": 302811136, 217 | "to_user_id_str": "302811136", 218 | "to_user_name": "Danny Hughes", 219 | "in_reply_to_status_id": 288656063555858400, 220 | "in_reply_to_status_id_str": "288656063555858432" 221 | }, 222 | { 223 | "created_at": "Tue, 08 Jan 2013 14:40:35 +0000", 224 | "from_user": "BertBannister", 225 | "from_user_id": 939905077, 226 | "from_user_id_str": "939905077", 227 | "from_user_name": "bert bannister", 228 | "geo": null, 229 | "id": 288656448794279940, 230 | "id_str": "288656448794279936", 231 | "iso_language_code": "en", 232 | "metadata": { 233 | "result_type": "recent" 234 | }, 235 | "profile_image_url": "http://a0.twimg.com/profile_images/3008530754/bea3fcbc9efacf7144688f499bddf587_normal.jpeg", 236 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3008530754/bea3fcbc9efacf7144688f499bddf587_normal.jpeg", 237 | "source": "<a href="http://twitter.com/download/iphone">Twitter for iPhone</a>", 238 | "text": "@rogers2116 @sludgiesly Everyone: \"steak and a pint please mate\" Roge: \"can I have turkey dinosaurs with ketchup and a fanta please\"", 239 | "to_user": "rogers2116", 240 | "to_user_id": 316789507, 241 | "to_user_id_str": "316789507", 242 | "to_user_name": "Jamie Rogers", 243 | "in_reply_to_status_id": 288656043410599940, 244 | "in_reply_to_status_id_str": "288656043410599936" 245 | }, 246 | { 247 | "created_at": "Tue, 08 Jan 2013 14:40:34 +0000", 248 | "from_user": "johnkatez", 249 | "from_user_id": 6795122, 250 | "from_user_id_str": "6795122", 251 | "from_user_name": "johnkatez", 252 | "geo": null, 253 | "id": 288656443266170900, 254 | "id_str": "288656443266170880", 255 | "iso_language_code": "en", 256 | "metadata": { 257 | "result_type": "recent" 258 | }, 259 | "profile_image_url": "http://a0.twimg.com/profile_images/3062733248/6f3634e3bb847292e75f9e834ee5963d_normal.jpeg", 260 | "profile_image_url_https": 
"https://si0.twimg.com/profile_images/3062733248/6f3634e3bb847292e75f9e834ee5963d_normal.jpeg", 261 | "source": "<a href="http://tapbots.com/software/tweetbot/mac">Twееtbot for Mac</a>", 262 | "text": "You heard it here first- dinosaurs were gay. http://t.co/vSPlvGRc", 263 | "to_user": null, 264 | "to_user_id": 0, 265 | "to_user_id_str": "0", 266 | "to_user_name": null 267 | }, 268 | { 269 | "created_at": "Tue, 08 Jan 2013 14:40:24 +0000", 270 | "from_user": "fuckl0nely", 271 | "from_user_id": 549766379, 272 | "from_user_id_str": "549766379", 273 | "from_user_name": "† .", 274 | "geo": null, 275 | "id": 288656403944591360, 276 | "id_str": "288656403944591360", 277 | "iso_language_code": "pt", 278 | "metadata": { 279 | "result_type": "recent" 280 | }, 281 | "profile_image_url": "http://a0.twimg.com/profile_images/2979531343/9d7e3fcce5d469d9da80c1661b7dee60_normal.jpeg", 282 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2979531343/9d7e3fcce5d469d9da80c1661b7dee60_normal.jpeg", 283 | "source": "<a href="http://www.tumblr.com/">Tumblr</a>", 284 | "text": "tumblrbot ha preguntado: ROBOTS OR DINOSAURS? 
http://t.co/uovAaks7", 285 | "to_user": null, 286 | "to_user_id": 0, 287 | "to_user_id_str": "0", 288 | "to_user_name": null 289 | }, 290 | { 291 | "created_at": "Tue, 08 Jan 2013 14:39:34 +0000", 292 | "from_user": "Phil_Savage", 293 | "from_user_id": 23068681, 294 | "from_user_id_str": "23068681", 295 | "from_user_name": "Phil Savage", 296 | "geo": null, 297 | "id": 288656193604419600, 298 | "id_str": "288656193604419584", 299 | "iso_language_code": "en", 300 | "metadata": { 301 | "result_type": "recent" 302 | }, 303 | "profile_image_url": "http://a0.twimg.com/profile_images/2856097536/e45f89e574c84c3e48f6e2df2c159f03_normal.jpeg", 304 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2856097536/e45f89e574c84c3e48f6e2df2c159f03_normal.jpeg", 305 | "source": "<a href="http://www.tweetdeck.com">TweetDeck</a>", 306 | "text": "RT @EwingCalvin: Just so everyone's clear dinosaurs are extinct. That means there are none left. Whatsoever.", 307 | "to_user": null, 308 | "to_user_id": 0, 309 | "to_user_id_str": "0", 310 | "to_user_name": null 311 | }, 312 | { 313 | "created_at": "Tue, 08 Jan 2013 14:38:22 +0000", 314 | "from_user": "EwingCalvin", 315 | "from_user_id": 442655243, 316 | "from_user_id_str": "442655243", 317 | "from_user_name": "Calvin Ewing", 318 | "geo": null, 319 | "id": 288655891270623200, 320 | "id_str": "288655891270623232", 321 | "iso_language_code": "en", 322 | "metadata": { 323 | "result_type": "recent" 324 | }, 325 | "profile_image_url": "http://a0.twimg.com/profile_images/2860919035/3e8339791d1e32f64f329b4d10abdf6c_normal.jpeg", 326 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2860919035/3e8339791d1e32f64f329b4d10abdf6c_normal.jpeg", 327 | "source": "<a href="http://twitter.com/">web</a>", 328 | "text": "Just so everyone's clear dinosaurs are extinct. That means there are none left. 
Whatsoever.", 329 | "to_user": null, 330 | "to_user_id": 0, 331 | "to_user_id_str": "0", 332 | "to_user_name": null 333 | }, 334 | { 335 | "created_at": "Tue, 08 Jan 2013 14:38:09 +0000", 336 | "from_user": "DinosaursTrap", 337 | "from_user_id": 550537265, 338 | "from_user_id_str": "550537265", 339 | "from_user_name": "The Dinosaurs Trap", 340 | "geo": null, 341 | "id": 288655840188194800, 342 | "id_str": "288655840188194816", 343 | "iso_language_code": "en", 344 | "metadata": { 345 | "result_type": "recent" 346 | }, 347 | "profile_image_url": "http://a0.twimg.com/profile_images/2483238349/31yj26hu61vfv4umwony_normal.png", 348 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2483238349/31yj26hu61vfv4umwony_normal.png", 349 | "source": "<a href="http://trap.it">Trapit</a>", 350 | "text": "Ichthyosaur Fossil Spotlights Ancient 'Sea Monster,' World's Recovery After Mass Extinction http://t.co/UqQTcJG7 #dinos #dinosaurs", 351 | "to_user": null, 352 | "to_user_id": 0, 353 | "to_user_id_str": "0", 354 | "to_user_name": null 355 | } 356 | ], 357 | "results_per_page": 15, 358 | "since_id": 0, 359 | "since_id_str": "0" 360 | } -------------------------------------------------------------------------------- /tests/document.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 2003/07/04 4 | 123 5 | Acme Alpha 6 | 7 | 987 8 | Coupler 9 | 5 10 | 11 | 12 | 654 13 | Connector 14 | 3 15 | 16 | 17 | 579 18 | Clasp 19 | 1 20 | 21 | -------------------------------------------------------------------------------- /tests/fixtures.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | 3 | // Web document samples for the test server to serve 4 | 5 | exports.documents = { 6 | html: fs.readFileSync("tests/document.html"), 7 | json: fs.readFileSync("tests/document.json"), 8 | feed: fs.readFileSync("tests/document.atom"), 9 | xml: fs.readFileSync("tests/document.xml") 
10 | }; 11 | 12 | // Queries 13 | 14 | exports.queries = { 15 | html: { 16 | simple: { 17 | "url": "http://localhost:8889/html", 18 | "type": "html", 19 | "selector": "title", 20 | "extract": "text", 21 | "cache": false 22 | }, 23 | withCache: { 24 | "url": "http://localhost:8889/html", 25 | "type": "html", 26 | "selector": "h1", 27 | "extract": "text", 28 | "cache": true 29 | }, 30 | noSelector: { 31 | "url": "http://localhost:8889/html", 32 | "type": "html", 33 | "cache": false 34 | }, 35 | noExtract: { 36 | "url": "http://localhost:8889/html", 37 | "type": "html", 38 | "selector": "title", 39 | "cache": false 40 | }, 41 | noType: { 42 | "url": "http://localhost:8889/html", 43 | "selector": "title", 44 | "extract": "text", 45 | "cache": false 46 | }, 47 | badSelector: { 48 | "url": "http://localhost:8889/html", 49 | "type": "html", 50 | "selector": "BAD SELECTOR", 51 | "extract": "text", 52 | "cache": false 53 | }, 54 | badExtract: { 55 | "url": "http://localhost:8889/html", 56 | "type": "html", 57 | "selector": "title", 58 | "extract": "BAD EXTRACT", 59 | "cache": false 60 | } 61 | }, 62 | json: { 63 | simple: { 64 | "url": "http://localhost:8889/json", 65 | "type": "json", 66 | "selector": ".query", 67 | "cache": false 68 | }, 69 | noSelector: { 70 | "url": "http://localhost:8889/json", 71 | "type": "json", 72 | "cache": false 73 | }, 74 | noType: { 75 | "url": "http://localhost:8889/json", 76 | "selector": ".query", 77 | "cache": false 78 | }, 79 | badSelector: { 80 | "url": "http://localhost:8889/json", 81 | "type": "json", 82 | "selector": "BAD SELECTOR", 83 | "cache": false 84 | }, 85 | badParse: { 86 | "url": "http://localhost:8889/html", 87 | "type": "json", 88 | "selector": ".query", 89 | "cache": false 90 | } 91 | }, 92 | feed: { 93 | simple: { 94 | "url": "http://localhost:8889/feed", 95 | "type": "feed", 96 | "selector": ".title", 97 | "cache": false 98 | }, 99 | noSelector: { 100 | "url": "http://localhost:8889/feed", 101 | "type": "feed", 102 | 
"cache": false 103 | }, 104 | noType: { 105 | "url": "http://localhost:8889/feed", 106 | "selector": ".title", 107 | "cache": false 108 | }, 109 | badSelector: { 110 | "url": "http://localhost:8889/feed", 111 | "type": "feed", 112 | "selector": "BAD SELECTOR", 113 | "cache": false 114 | }, 115 | badParse: { 116 | "url": "http://localhost:8889/html", 117 | "type": "feed", 118 | "selector": ".title", 119 | "cache": false 120 | } 121 | }, 122 | xml: { 123 | simple: { 124 | "url": "http://localhost:8889/xml", 125 | "type": "xml", 126 | "selector": ".CustomerName", 127 | "cache": false 128 | }, 129 | noSelector: { 130 | "url": "http://localhost:8889/xml", 131 | "type": "xml", 132 | "cache": false 133 | }, 134 | noType: { 135 | "url": "http://localhost:8889/xml", 136 | "selector": ".CustomerName", 137 | "cache": false 138 | }, 139 | badSelector: { 140 | "url": "http://localhost:8889/xml", 141 | "type": "xml", 142 | "selector": "BAD SELECTOR", 143 | "cache": false 144 | }, 145 | badParse: { 146 | "url": "http://localhost:8889/html", 147 | "type": "xml", 148 | "selector": ".CustomerName", 149 | "cache": false 150 | } 151 | }, 152 | misc: { 153 | badUrl: { 154 | "url": "BAD URL", 155 | "cache": false 156 | }, 157 | badType: { 158 | "url": "http://localhost:8889/html", 159 | "type": "BAD TYPE", 160 | "cache": false 161 | } 162 | }, 163 | map: { 164 | simple: { 165 | "url": "http://localhost:8889/html", 166 | "type": "html", 167 | "map": { 168 | "foo": { 169 | "selector": "h1" 170 | }, 171 | "bar": { 172 | "selector": "title" 173 | } 174 | }, 175 | "cache": false 176 | } 177 | }, 178 | post: { 179 | simple: { 180 | "url": "http://localhost:8889", 181 | "type": "html", 182 | "selector": "h1", 183 | "extract": "text", 184 | "post": { 185 | "foo": "bar" 186 | }, 187 | "cache": false 188 | } 189 | }, 190 | headers: { 191 | simple: { 192 | "url": "http://localhost:8889/html", 193 | "type": "html", 194 | "selector": "h1", 195 | "headers": ["X-Powered-By"], 196 | "cache": "false" 
197 | }, 198 | linkHeaders: { 199 | "url": "http://localhost:8889/html", 200 | "type": "html", 201 | "selector": "h1", 202 | "linkHeader": true, 203 | "cache": "false" 204 | } 205 | } 206 | }; 207 | 208 | // Query answers 209 | 210 | exports.queries.answers = { 211 | html: { 212 | simple: [ 213 | { 214 | "results": ["css Zen Garden: The Beauty in CSS Design"] 215 | } 216 | ], 217 | noExtract: [ 218 | { 219 | "results": ["css Zen Garden: The Beauty in CSS Design"] 220 | } 221 | ], 222 | noType: [ 223 | { 224 | "results": ["css Zen Garden: The Beauty in CSS Design"] 225 | } 226 | ], 227 | badSelector: [ 228 | { 229 | "results": [], 230 | "error": "Could not match with that selector or extract value" 231 | } 232 | ], 233 | badExtract: [ 234 | { 235 | "results": [], 236 | "error": "Could not match with that selector or extract value" 237 | } 238 | ] 239 | }, 240 | json: { 241 | simple: [ 242 | { 243 | "results": [ 244 | "dinosaurs" 245 | ] 246 | } 247 | ], 248 | noType: [ 249 | { 250 | "results": [ 251 | "dinosaurs" 252 | ] 253 | } 254 | ], 255 | badSelector: [ 256 | { 257 | "results": [], 258 | "error": "Could not match with that selector" 259 | } 260 | ], 261 | badParse: [ 262 | { 263 | "results": [], 264 | "error": "Could not parse JSON document" 265 | } 266 | ] 267 | }, 268 | feed: { 269 | simple: [ 270 | { 271 | "results": [ 272 | "Atom-Powered Robots Run Amok", 273 | "Example Feed" 274 | ] 275 | } 276 | ], 277 | noSelector: [ 278 | { 279 | "results": [ 280 | [ 281 | { 282 | "title": "Atom-Powered Robots Run Amok", 283 | "description": "Some text.", 284 | "summary": "Some text.", 285 | "date": "2003-12-13T18:30:02.000Z", 286 | "pubdate": "2003-12-13T18:30:02.000Z", 287 | "pubDate": "2003-12-13T18:30:02.000Z", 288 | "link": "http://example.org/2003/12/13/atom03", 289 | "guid": "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a", 290 | "author": "John Doe", 291 | "comments": null, 292 | "origlink": null, 293 | "image": {}, 294 | "source": {}, 295 | "categories": [], 296 
| "enclosures": [], 297 | "atom:@": {}, 298 | "atom:title": { 299 | "@": {}, 300 | "#": "Atom-Powered Robots Run Amok" 301 | }, 302 | "atom:link": { 303 | "@": { 304 | "href": "http://example.org/2003/12/13/atom03" 305 | } 306 | }, 307 | "atom:id": { 308 | "@": {}, 309 | "#": "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" 310 | }, 311 | "atom:updated": { 312 | "@": {}, 313 | "#": "2003-12-13T18:30:02Z" 314 | }, 315 | "atom:summary": { 316 | "@": {}, 317 | "#": "Some text." 318 | }, 319 | "meta": { 320 | "#ns": [ 321 | { 322 | "xmlns": "http://www.w3.org/2005/Atom" 323 | } 324 | ], 325 | "@": [ 326 | { 327 | "xmlns": "http://www.w3.org/2005/Atom" 328 | } 329 | ], 330 | "#type": "atom", 331 | "#version": "1.0", 332 | "title": "Example Feed", 333 | "description": null, 334 | "date": "2003-12-13T18:30:02.000Z", 335 | "pubdate": "2003-12-13T18:30:02.000Z", 336 | "pubDate": "2003-12-13T18:30:02.000Z", 337 | "link": "http://example.org/", 338 | "xmlurl": null, 339 | "xmlUrl": null, 340 | "author": "John Doe", 341 | "language": null, 342 | "favicon": null, 343 | "copyright": null, 344 | "generator": null, 345 | "image": {}, 346 | "categories": [], 347 | "atom:@": { 348 | "xmlns": "http://www.w3.org/2005/Atom" 349 | }, 350 | "atom:title": { 351 | "@": {}, 352 | "#": "Example Feed" 353 | }, 354 | "atom:link": { 355 | "@": { 356 | "href": "http://example.org/" 357 | } 358 | }, 359 | "atom:updated": { 360 | "@": {}, 361 | "#": "2003-12-13T18:30:02Z" 362 | }, 363 | "atom:author": { 364 | "@": {}, 365 | "name": { 366 | "@": {}, 367 | "#": "John Doe" 368 | } 369 | }, 370 | "atom:id": { 371 | "@": {}, 372 | "#": "urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6" 373 | } 374 | } 375 | } 376 | ] 377 | ] 378 | } 379 | ], 380 | noType: [ 381 | { 382 | "results": [ 383 | "Atom-Powered Robots Run Amok", 384 | "Example Feed" 385 | ] 386 | } 387 | ], 388 | badSelector: [ 389 | { 390 | "results": [], 391 | "error": "Could not match with that selector" 392 | } 393 | ], 394 | badParse: [ 395 | 
{ 396 | "results": [], 397 | "error": "The provided document couldn't be normalised" 398 | } 399 | ] 400 | }, 401 | xml: { 402 | simple: [ 403 | { 404 | "results": [ 405 | "Acme Alpha" 406 | ] 407 | } 408 | ], 409 | noSelector: [ 410 | { 411 | "results": [ 412 | { 413 | "Order": { 414 | "Date": "2003/07/04", 415 | "CustomerId": 123, 416 | "CustomerName": "Acme Alpha", 417 | "Item": [ 418 | { 419 | "ItemId": 987, 420 | "ItemName": "Coupler", 421 | "Quantity": 5 422 | }, 423 | { 424 | "ItemId": 654, 425 | "ItemName": "Connector", 426 | "Quantity": { 427 | "unit": 12, 428 | "$t": 3 429 | } 430 | }, 431 | { 432 | "ItemId": 579, 433 | "ItemName": "Clasp", 434 | "Quantity": 1 435 | } 436 | ] 437 | } 438 | } 439 | ] 440 | } 441 | ], 442 | noType: [ 443 | { 444 | "results": [ 445 | "Acme Alpha" 446 | ] 447 | } 448 | ], 449 | badSelector: [ 450 | { 451 | "results": [], 452 | "error": "Could not match with that selector" 453 | } 454 | ], 455 | badParse: [ 456 | { 457 | "results": [], 458 | "error": "Could not parse XML to JSON" 459 | } 460 | ] 461 | }, 462 | misc: { 463 | badUrl: [ 464 | { 465 | "results": [], 466 | "error": "Document not found" 467 | } 468 | ], 469 | badType: [ 470 | { 471 | "results": [], 472 | "error": "Document type not supported" 473 | } 474 | ] 475 | }, 476 | map : { 477 | simple: [ 478 | { 479 | "results": { 480 | "bar": ["css Zen Garden: The Beauty in CSS Design"], 481 | "foo": ["css Zen Garden"] 482 | } 483 | } 484 | ] 485 | }, 486 | post: { 487 | simple: [ 488 | { 489 | "results": ["was posted"] 490 | } 491 | ] 492 | }, 493 | headers: { 494 | simple: [ 495 | { 496 | "results": ["css Zen Garden"], 497 | "headers": { 498 | "X-Powered-By": "Noodle testing server" 499 | } 500 | } 501 | ], 502 | linkHeaders: [ 503 | { 504 | "results": ["css Zen Garden"], 505 | "headers": { 506 | "link": { 507 | "next": "foo", 508 | "last": "bar" 509 | } 510 | } 511 | } 512 | ] 513 | } 514 | }; 
-------------------------------------------------------------------------------- /tests/server.js: -------------------------------------------------------------------------------- 1 | var url = require('url'), 2 | fixtures = require('./fixtures'); 3 | 4 | require('http').createServer(function (req, res) { 5 | var serve = url.parse(req.url).pathname.split('/')[1]; 6 | 7 | if (req.method === 'POST') { 8 | parsePostData(req, function (data) { 9 | var respondWith = (data.foo === 'bar') ? '

was posted

' 10 | : '

test should fail

' 11 | res.writeHead(200, getResponseHeaders('html')); 12 | res.end(respondWith); 13 | }); 14 | } else { 15 | res.writeHead(200, getResponseHeaders(serve)); 16 | res.end(fixtures.documents[serve]); 17 | } 18 | }) 19 | .listen(8889, function () { 20 | console.log('Test server temporarily running on port 8889'); 21 | }); 22 | 23 | /* Buffer the incoming POST body and pass the querystring-parsed key/value object to `cb` once the request ends. */ function parsePostData (req, cb) { 24 | var body = ''; 25 | 26 | req.on('data', function (data) { 27 | body += data; 28 | }); 29 | 30 | req.on('end', function () { 31 | cb(require('querystring').parse(body)); 32 | }); 33 | } 34 | 35 | /* Build the response headers for a served fixture type ('html' | 'json' | 'feed' | 'xml'): the matching Content-type plus the X-Powered-By and Link headers the header-extraction tests assert against. NOTE(review): the Link value looks truncated here — the angle-bracketed URIs before each rel appear to have been stripped; confirm against the original file. */ function getResponseHeaders (serve) { 36 | var ct = { 37 | 'html': 'text/html', 38 | 'json': 'application/json', 39 | 'feed': 'application/atom+xml', 40 | 'xml' : 'text/xml' 41 | }; 42 | return { 43 | 'Content-type': ct[serve], 44 | 'X-Powered-By': 'Noodle testing server', 45 | 'Link' : '; rel="next",; rel="last"' 46 | }; 47 | } -------------------------------------------------------------------------------- /tests/tests.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'), 2 | _ = require('underscore'), 3 | fixtures = require('./fixtures'), 4 | noodle = require('../lib/noodle'), 5 | cache = require('../lib/cache'), 6 | html = require('../lib/types/html'), 7 | json = require('../lib/types/json'), 8 | feed = require('../lib/types/feed'), 9 | xml = require('../lib/types/xml'), 10 | stringify = JSON.stringify; 11 | 12 | noodle.configure({ 13 | "debug": false 14 | }); 15 | 16 | /* True when every element strictly equals the first (vacuously true for []). NOTE(review): extending Array.prototype is an anti-pattern — a standalone helper function would avoid leaking into every array in the process. */ Array.prototype.AllValuesSame = function(){ 17 | if(this.length > 0) { 18 | for(var i = 1; i < this.length; i++) { 19 | if(this[i] !== this[0]) { 20 | return false; 21 | } 22 | } 23 | } 24 | return true; 25 | }; 26 | 27 | /* Duck-type promise check; relies on the internal `promiseSend` property of Q 0.8.x promises, so it is coupled to that specific Q version. */ function isPromise (obj) { 28 | return !!obj.promiseSend; 29 | } 30 | 31 | // Noodle library 32 | 33 | describe('Noodle', function () { 34 | describe('noodle.query', function () { 35 | it('should return a promise', function () { 36 | var promise =
noodle.query({url: 'foo'}); 37 | assert.equal(true, isPromise(promise)); 38 | }); 39 | }); 40 | 41 | describe('fetch()', function () { 42 | it('should return a promise', function () { 43 | var promise = noodle.fetch('foo', {}); 44 | assert.equal(true, isPromise(promise)); 45 | }); 46 | }); 47 | }); 48 | 49 | // Tests regarding the noodle library's type modules 50 | 51 | describe('Types', function () { 52 | describe('noodle.html', function () { 53 | it('its promise should resolve to an object containing results', function (done) { 54 | noodle.query(fixtures.queries.html.simple) 55 | .then(function (results) { 56 | if (_.isArray(results.results)) { 57 | done(); 58 | } else { 59 | done(new Error('results.results was not an array')); 60 | } 61 | }); 62 | }); 63 | }); 64 | 65 | describe('noodle.json', function () { 66 | it('promise should resolve to an array', function (done) { 67 | noodle.query(fixtures.queries.json.simple) 68 | .then(function (results) { 69 | if (_.isArray(results.results)) { 70 | done(); 71 | } else { 72 | done(new Error('results.results was not an array')); 73 | } 74 | }); 75 | }); 76 | }); 77 | 78 | describe('noodle.feed', function () { 79 | it('promise should resolve to an array', function (done) { 80 | noodle.query(fixtures.queries.feed.simple) 81 | .then(function (results) { 82 | if (_.isArray(results.results)) { 83 | done(); 84 | } else { 85 | done(new Error('results.results was not an array')); 86 | } 87 | }); 88 | }); 89 | }); 90 | 91 | describe('noodle.xml', function () { 92 | it('promise should resolve to an array', function (done) { 93 | noodle.query(fixtures.queries.xml.simple) 94 | .then(function (results) { 95 | if (_.isArray(results.results)) { 96 | done(); 97 | } else { 98 | done(new Error('results.results was not an array')); 99 | } 100 | }); 101 | }); 102 | }); 103 | }); 104 | 105 | 106 | // Noodle's cache 107 | 108 | describe('cache', function () { 109 | 110 | }); 111 | 112 | 113 | // Noodle query api 114 | 115 | describe('Noodle 
object query API', function () { 116 | var allArrays = []; 117 | 118 | describe('type: html', function () { 119 | it('should have accurate result data', function (done) { 120 | noodle.query(fixtures.queries.html.simple) 121 | .then(function (results) { 122 | allArrays.push(_.isArray(results.results)); 123 | if (_.isEqual(results.results, fixtures.queries.answers.html.simple)) { 124 | done(); 125 | } else { 126 | done(new Error('Results and fixtures do not match up.')); 127 | } 128 | }); 129 | }); 130 | 131 | it('should still return full document if no selector is specified', function (done) { 132 | noodle.query(fixtures.queries.html.noSelector) 133 | .then(function (results) { 134 | var expectedHTMLDoc = results.results[0].results; 135 | allArrays.push(_.isArray(results.results)); 136 | if (typeof expectedHTMLDoc === 'string' && expectedHTMLDoc.length > 1000) { 137 | done(); 138 | } else { 139 | done(new Error('Results did not contain full document')); 140 | } 141 | }); 142 | }); 143 | 144 | it('should still return some data if no extract is specified', function (done) { 145 | noodle.query(fixtures.queries.html.noExtract) 146 | .then(function (results) { 147 | allArrays.push(_.isArray(results.results)); 148 | if (_.isEqual(results.results, fixtures.queries.answers.html.noExtract)) { 149 | done(); 150 | } else { 151 | done(new Error('Results and fixtures do not match up.')); 152 | } 153 | }); 154 | }); 155 | 156 | it('should still return some data if no type is specified', function (done) { 157 | noodle.query(fixtures.queries.html.noType) 158 | .then(function (results) { 159 | allArrays.push(_.isArray(results.results)); 160 | if (_.isEqual(results.results, fixtures.queries.answers.html.noType)) { 161 | done(); 162 | } else { 163 | done(new Error('Results and fixtures do not match up.')); 164 | } 165 | }); 166 | }); 167 | 168 | describe('errors', function () { 169 | it('should report on a poor selector', function (done) { 170 | 
noodle.query(fixtures.queries.html.badSelector) 171 | .then(function (results) { 172 | allArrays.push(_.isArray(results.results)); 173 | if (_.isEqual(results.results, fixtures.queries.answers.html.badSelector)) { 174 | done(); 175 | } else { 176 | done(new Error('Results and fixtures do not match up.')); 177 | } 178 | }); 179 | }); 180 | 181 | it('should default to selecting text if no extract is supplied', function (done){ 182 | noodle.query(fixtures.queries.html.noExtract) 183 | .then(function (results) { 184 | allArrays.push(_.isArray(results.results)); 185 | if (_.isEqual(results.results, fixtures.queries.answers.html.noExtract)) { 186 | done(); 187 | } else { 188 | done(new Error('Results and fixtures do not match up.')); 189 | } 190 | }); 191 | }); 192 | }); 193 | }); 194 | 195 | describe('type: json', function () { 196 | before(function () { 197 | noodle.configure({ 198 | defaultDocumentType: 'json' 199 | }); 200 | }); 201 | 202 | it('should have result data', function (done) { 203 | noodle.query(fixtures.queries.json.simple) 204 | .then(function (results) { 205 | allArrays.push(_.isArray(results.results)); 206 | if (_.isEqual(results.results, fixtures.queries.answers.json.simple)) { 207 | done(); 208 | } else { 209 | done(new Error('Results and fixtures do not match up.')); 210 | } 211 | }); 212 | }); 213 | 214 | it('should still return some data if no selector is specified', function (done) { 215 | noodle.query(fixtures.queries.json.noSelector) 216 | .then(function (results) { 217 | var expectedJSONDoc = results.results[0].results; 218 | allArrays.push(_.isArray(results.results)); 219 | if (typeof expectedJSONDoc === 'object') { 220 | done(); 221 | } else { 222 | done(new Error('Results and fixtures do not match up.')); 223 | } 224 | }); 225 | }); 226 | 227 | it('should still return some data if no type is specified', function (done) { 228 | noodle.query(fixtures.queries.json.noType) 229 | .then(function (results) { 230 | 
allArrays.push(_.isArray(results.results)); 231 | if (_.isEqual(results.results, fixtures.queries.answers.json.noType)) { 232 | done(); 233 | } else { 234 | done(new Error('Results and fixtures do not match up.')); 235 | } 236 | }); 237 | }); 238 | 239 | describe('errors', function () { 240 | it('should report on a poor selector', function (done) { 241 | noodle.query(fixtures.queries.json.badSelector) 242 | .then(function (results) { 243 | allArrays.push(_.isArray(results.results)); 244 | if (_.isEqual(results.results, fixtures.queries.answers.json.badSelector)) { 245 | done(); 246 | } else { 247 | done(new Error('Results and fixtures do not match up.')); 248 | } 249 | }); 250 | }); 251 | 252 | it('should report on a parse error', function (done) { 253 | noodle.query(fixtures.queries.json.badParse) 254 | .then(function (results) { 255 | allArrays.push(_.isArray(results.results)); 256 | if (_.isEqual(results.results, fixtures.queries.answers.json.badParse)) { 257 | done(); 258 | } else { 259 | done(new Error('Results and fixtures do not match up.')); 260 | } 261 | }); 262 | }); 263 | }); 264 | }); 265 | 266 | describe('type: feed', function () { 267 | before(function () { 268 | noodle.configure({ 269 | defaultDocumentType: 'feed' 270 | }); 271 | }); 272 | 273 | it('should have result data', function (done) { 274 | noodle.query(fixtures.queries.feed.simple) 275 | .then(function (results) { 276 | allArrays.push(_.isArray(results.results)); 277 | if (_.isEqual(results.results) === _.isEqual(fixtures.queries.answers.feed.simple)) { 278 | done(); 279 | } else { 280 | done(new Error('Results and fixtures do not match up.')); 281 | } 282 | }); 283 | }); 284 | 285 | it('should still return some data if no selector is specified', function (done) { 286 | noodle.query(fixtures.queries.feed.noSelector) 287 | .then(function (results) { 288 | allArrays.push(_.isArray(results.results)); 289 | if (stringify(results.results) === stringify(fixtures.queries.answers.feed.noSelector)) { 
290 | done(); 291 | } else { 292 | done(new Error('Results and fixtures do not match up.')); 293 | } 294 | }); 295 | }); 296 | 297 | it('should still return some data if no type is specified', function (done) { 298 | noodle.query(fixtures.queries.feed.noType) 299 | .then(function (results) { 300 | allArrays.push(_.isArray(results.results)); 301 | if (_.isEqual(results.results, fixtures.queries.answers.feed.noType)) { 302 | done(); 303 | } else { 304 | done(new Error('Results and fixtures do not match up.')); 305 | } 306 | }); 307 | }); 308 | 309 | describe('errors', function () { 310 | it('should report on a poor selector', function (done) { 311 | noodle.query(fixtures.queries.feed.badSelector) 312 | .then(function (results) { 313 | allArrays.push(_.isArray(results.results)); 314 | if (_.isEqual(results.results, fixtures.queries.answers.feed.badSelector)) { 315 | done(); 316 | } else { 317 | done(new Error('Results and fixtures do not match up.')); 318 | } 319 | }); 320 | }); 321 | 322 | it('should report on a parse error', function (done) { 323 | noodle.query(fixtures.queries.feed.badParse) 324 | .then(function (results) { 325 | allArrays.push(_.isArray(results.results)); 326 | if (_.isEqual(results.results, fixtures.queries.answers.feed.badParse)) { 327 | done(); 328 | } else { 329 | done(new Error('Results and fixtures do not match up.')); 330 | } 331 | }); 332 | }); 333 | }); 334 | }); 335 | 336 | describe('type: xml', function () { 337 | before(function () { 338 | noodle.configure({ 339 | defaultDocumentType: 'xml' 340 | }); 341 | }); 342 | 343 | it('should have result data', function (done) { 344 | noodle.query(fixtures.queries.xml.simple) 345 | .then(function (results) { 346 | allArrays.push(_.isArray(results.results)); 347 | if (_.isEqual(results.results, fixtures.queries.answers.xml.simple)) { 348 | done(); 349 | } else { 350 | done(new Error('Results and fixtures do not match up.')); 351 | } 352 | }); 353 | }); 354 | 355 | it('should still return some 
data if no selector is specified', function (done) { 356 | noodle.query(fixtures.queries.xml.noSelector) 357 | .then(function (results) { 358 | allArrays.push(_.isArray(results.results)); 359 | if (_.isEqual(results.results, fixtures.queries.answers.xml.noSelector)) { 360 | done(); 361 | } else { 362 | done(new Error('Results and fixtures do not match up.')); 363 | } 364 | }); 365 | }); 366 | 367 | it('should still return some data if no type is specified', function (done) { 368 | noodle.query(fixtures.queries.xml.noType) 369 | .then(function (results) { 370 | allArrays.push(_.isArray(results.results)); 371 | if (_.isEqual(results.results, fixtures.queries.answers.xml.noType)) { 372 | done(); 373 | } else { 374 | done(new Error('Results and fixtures do not match up.')); 375 | } 376 | }); 377 | }); 378 | 379 | describe('errors', function () { 380 | it('should report on a poor selector', function (done) { 381 | noodle.query(fixtures.queries.xml.badSelector) 382 | .then(function (results) { 383 | allArrays.push(_.isArray(results.results)); 384 | if (_.isEqual(results.results, fixtures.queries.answers.xml.badSelector)) { 385 | done(); 386 | } else { 387 | done(new Error('Results and fixtures do not match up.')); 388 | } 389 | }); 390 | }); 391 | 392 | it('should report on a parse error', function (done) { 393 | noodle.query(fixtures.queries.xml.badParse) 394 | .then(function (results) { 395 | allArrays.push(_.isArray(results.results)); 396 | if (_.isEqual(results.results, fixtures.queries.answers.xml.badParse)) { 397 | done(); 398 | } else { 399 | done(new Error('Results and fixtures do not match up.')); 400 | } 401 | }); 402 | }); 403 | }); 404 | }); 405 | 406 | describe('generic query error messages', function () { 407 | it('errors if no url is specified', function (done) { 408 | noodle.query(fixtures.queries.misc.badUrl) 409 | .then(function (results) { 410 | if (_.isEqual(results.results, fixtures.queries.answers.misc.badUrl)) { 411 | done(); 412 | } else { 413 | 
done(new Error('Results and fixtures do not match up.')); 414 | } 415 | }); 416 | }); 417 | 418 | it('errors if a non-supported type is specified', function (done) { 419 | noodle.query(fixtures.queries.misc.badType) 420 | .then(function (results) { 421 | if (_.isEqual(results.results, fixtures.queries.answers.misc.badType)) { 422 | done(); 423 | } else { 424 | done(new Error('Results and fixtures do not match up.')); 425 | } 426 | }); 427 | }); 428 | }); 429 | 430 | describe('map notation', function () { 431 | it('result should contain properties as specified in the map as well as data', function (done) { 432 | noodle.query(fixtures.queries.map.simple) 433 | .then(function (results) { 434 | allArrays.push(_.isArray(results.results)); 435 | if (_.isEqual(results.results, fixtures.queries.answers.map.simple)) { 436 | done(); 437 | } else { 438 | done(new Error('Results and fixtures do not match up.')); 439 | } 440 | }); 441 | }); 442 | }); 443 | 444 | describe('post data', function () { 445 | it('should return data from post requests', function (done) { 446 | noodle.query(fixtures.queries.post.simple) 447 | .then(function (results) { 448 | if (_.isEqual(results.results, fixtures.queries.answers.post.simple)) { 449 | done(); 450 | } else { 451 | done(new Error('Results and fixtures do not match up.')); 452 | } 453 | }); 454 | }); 455 | }); 456 | 457 | describe('headers', function () { 458 | it('should parse headers', function (done) { 459 | noodle.query(fixtures.queries.headers.simple) 460 | .then(function (results) { 461 | var fix = fixtures.queries.answers.headers.simple[0]; 462 | if (_.isEqual(results.results[0].results, fix.results) && 463 | _.isEqual(results.results[0].headers, fix.headers)) { 464 | done(); 465 | } else { 466 | done(new Error('Results and fixtures do not match up.')); 467 | } 468 | }); 469 | }); 470 | 471 | it('should parse link headers', function (done) { 472 | noodle.query(fixtures.queries.headers.linkHeaders) 473 | .then(function (results) { 
474 | var fix = fixtures.queries.answers.headers.linkHeaders[0]; 475 | if (_.isEqual(results.results[0].results, fix.results) && 476 | _.isEqual(results.results[0].headers, fix.headers)) { 477 | done(); 478 | } else { 479 | done(new Error('Results and fixtures do not match up.')); 480 | } 481 | }); 482 | }); 483 | }); 484 | 485 | describe('multiple queries', function () { 486 | it('(A) the returned order should match the order of the sent query', function (done) { 487 | var queries = [ 488 | fixtures.queries.html.simple, 489 | fixtures.queries.json.simple, 490 | fixtures.queries.feed.simple, 491 | fixtures.queries.xml.simple 492 | ]; 493 | noodle.query(queries) 494 | .then(function (results) { 495 | var match1 = _.isEqual(results.results[0], fixtures.queries.answers.html.simple[0]), 496 | match2 = _.isEqual(results.results[1], fixtures.queries.answers.json.simple[0]), 497 | match3 = _.isEqual(results.results[2], fixtures.queries.answers.feed.simple[0]), 498 | match4 = _.isEqual(results.results[3], fixtures.queries.answers.xml.simple[0]); 499 | if (match1 && match2 && match3 && match4) { 500 | done(); 501 | } else { 502 | done(new Error('Order was not maintained or results/fixtures mismatch')); 503 | } 504 | }); 505 | }); 506 | it('(B) the returned order should match the order of the sent query', function (done) { 507 | var queries = [ 508 | fixtures.queries.json.simple, 509 | fixtures.queries.html.simple, 510 | fixtures.queries.xml.simple, 511 | fixtures.queries.feed.simple 512 | ]; 513 | noodle.query(queries) 514 | .then(function (results) { 515 | var match1 = _.isEqual(results.results[1], fixtures.queries.answers.html.simple[0]), 516 | match2 = _.isEqual(results.results[0], fixtures.queries.answers.json.simple[0]), 517 | match3 = _.isEqual(results.results[3], fixtures.queries.answers.feed.simple[0]), 518 | match4 = _.isEqual(results.results[2], fixtures.queries.answers.xml.simple[0]); 519 | if (match1 && match2 && match3 && match4) { 520 | done(); 521 | } else { 
522 | done(new Error('Order was not maintained or results/fixtures mismatch')); 523 | } 524 | }); 525 | }); 526 | it('(C) the returned order should match the order of the sent query', function (done) { 527 | var queries = [ 528 | fixtures.queries.html.simple, 529 | fixtures.queries.json.simple, 530 | fixtures.queries.feed.simple, 531 | fixtures.queries.xml.simple 532 | ]; 533 | noodle.query(queries) 534 | .then(function (results) { 535 | var match1 = _.isEqual(results.results[0], fixtures.queries.answers.html.simple[0]), 536 | match2 = _.isEqual(results.results[1], fixtures.queries.answers.json.simple[0]), 537 | match3 = _.isEqual(results.results[2], fixtures.queries.answers.feed.simple[0]), 538 | match4 = _.isEqual(results.results[3], fixtures.queries.answers.xml.simple[0]); 539 | if (match1 && match2 && match3 && match4) { 540 | done(); 541 | } else { 542 | done(new Error('Order was not maintained or results/fixtures mismatch')); 543 | } 544 | }); 545 | }); 546 | }); 547 | 548 | describe('consistent response format', function () { 549 | it('should return all responses as arrays', function () { 550 | assert.equal(true, allArrays.indexOf(false) === -1); 551 | }); 552 | 553 | it('should always return the "created" property from cache', function (done) { 554 | noodle.query(fixtures.queries.html.withCache) 555 | .then(function (results) { 556 | if (results.results[0].created) { 557 | done(); 558 | } else { 559 | done(new Error('"created" property wasn\'t included with cached response.')); 560 | } 561 | }); 562 | }); 563 | }); 564 | }); --------------------------------------------------------------------------------