├── .gitignore ├── README.md ├── bin ├── noodle-server └── tests ├── docs ├── .satya-config.yml ├── 1. Overview.md ├── 10. Server quick start ├── 2. Try it out.md ├── 3. Web service.md ├── 4. Query syntax.md ├── 5. Noodle as node module.md ├── 6. Error handling.md ├── 7. Caching.md ├── 8. Adding to noodle.md └── 9. Tests.md ├── index.js ├── lib ├── cache.js ├── config.json ├── logger.js ├── noodle-middleware.js ├── noodle.js └── types │ ├── feed.js │ ├── html.js │ ├── json.js │ └── xml.js ├── package.json └── tests ├── document.atom ├── document.html ├── document.json ├── document.xml ├── fixtures.js ├── server.js └── tests.js /.gitignore: -------------------------------------------------------------------------------- 1 | _site 2 | _bin 3 | _config-local.yml 4 | FiddlerRules.farx 5 | .DS_Store 6 | 7 | lib-cov 8 | *.seed 9 | *.log 10 | *.csv 11 | *.dat 12 | *.out 13 | *.pid 14 | *.gz 15 | 16 | pids 17 | logs 18 | results 19 | 20 | node_modules 21 | npm-debug.log 22 | .satya -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [noodle](https://noodle.dharmafly.com) 2 | ============================= 3 | 4 | noodle is a Node.js server and module for querying and scraping data from web documents. 
It features: 5 | 6 | ```JSON 7 | { 8 | "url": "https://github.com/explore", 9 | "selector": "ol.ranked-repositories h3 a", 10 | "extract": "href" 11 | } 12 | ``` 13 | 14 | Features 15 | -------- 16 | 17 | - Cross domain document querying (html, json, xml, atom, rss feeds) 18 | - Server supports querying via JSONP and JSON POST 19 | - Multiple queries per request 20 | - Access to queried server headers 21 | - Allows for POSTing to web documents 22 | - In memory caching for query results and web documents 23 | 24 | Server quick start 25 | ------------------ 26 | 27 | Setup 28 | 29 | $ npm install noodlejs 30 | 31 | or 32 | 33 | $ git clone git@github.com:dharmafly/noodle.git 34 | $ cd noodle 35 | $ npm install 36 | 37 | Start the server by running the binary 38 | 39 | $ bin/noodle-server 40 | Noodle node server started 41 | ├ process title node-noodle 42 | ├ process pid 4739 43 | └ server port 8888 44 | 45 | 46 | You may specify a port number as an argument 47 | 48 | $ bin/noodle-server 9090 49 | Noodle node server started 50 | ├ process title node-noodle 51 | ├ process pid 4739 52 | └ server port 9090 53 | 54 | 55 | Noodle as a node module 56 | ----------------------- 57 | 58 | If you are interested in the node module just run ```npm install noodlejs```, 59 | require it and check out the [noodle api](https://noodle.dharmafly.com/reference/#Noodle-as-node-module) 60 | 61 | ```javascript 62 | var noodle = require('noodlejs'); 63 | 64 | noodle.query({ 65 | url: 'https://github.com/explore', 66 | selector: 'ol.ranked-repositories h3 a', 67 | extract: 'href' 68 | }) 69 | .then(function (results) { 70 | console.log(results); 71 | }); 72 | ``` 73 | 74 | Tests 75 | ----- 76 | 77 | The noodle tests create a temporary server on port `8889` which the automated 78 | tests tell noodle to query against. 
79 | 80 | To run tests you can use the provided binary *from the noodle package 81 | root directory*: 82 | 83 | $ cd noodle 84 | $ bin/tests 85 | 86 | Contribute 87 | ---------- 88 | 89 | Contributors and suggestions welcomed. 90 | 91 | - [https://noodle.dharmafly.com](https://noodle.dharmafly.com) 92 | - [https://github.com/dharmafly/noodle](https://github.com/dharmafly/noodle) 93 | -------------------------------------------------------------------------------- /bin/noodle-server: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | process.title = 'node-noodle'; 4 | 5 | var connect = require('connect'), 6 | http = require('http'), 7 | url = require('url'), 8 | fs = require('fs'), 9 | limiter = require('connect-ratelimit'), 10 | noodlemw = require('../lib/noodle-middleware'), 11 | version = getVersion(), 12 | limits = getConfig().rateLimit, 13 | port = process.argv[2] || 8888, 14 | app; 15 | 16 | limits.end = false; 17 | 18 | app = connect() 19 | .use(function (req, res, next) { 20 | if (url.parse(req.url).pathname === '/version') { 21 | res.writeHead(200, { 22 | 'Content-Type': 'application/json; charset=utf-8' 23 | }); 24 | res.end('{"version":' + version + '}'); 25 | } else { 26 | next(); 27 | } 28 | }) 29 | .use(limiter(limits)) 30 | .use(function (req, res, next) { 31 | if (res.ratelimit.exceeded) { 32 | res.statusCode = 429; 33 | res.end('[{"results": [], "error": "Rate limit exceeded"}]'); 34 | } else { 35 | next(); 36 | } 37 | }) 38 | .use(connect.query()) 39 | .use(connect.json()) 40 | .use(noodlemw.parseQueries) 41 | .use(noodlemw.noodleQueries) 42 | .use(noodlemw.respond); 43 | 44 | http.createServer(app).listen(port, function () { 45 | require('colors'); 46 | with (console) { 47 | log(' Noodle node server started'.magenta); 48 | log(' ├ process title '.magenta, process.title.toString().green); 49 | log(' ├ process pid '.magenta, process.pid.toString().green); 50 | log(' └ server port 
'.magenta, port.toString().green); 51 | } 52 | }); 53 | 54 | // Return the noodle config as an object 55 | 56 | function getConfig () { 57 | var path = require('path').resolve(__dirname, '../lib/config.json'), 58 | config = fs.readFileSync(path).toString(); 59 | return JSON.parse(config); 60 | } 61 | 62 | // Return the noodle version number 63 | 64 | function getVersion () { 65 | var path = require('path').resolve(__dirname, '../package.json'); 66 | return JSON.parse(fs.readFileSync(path).toString()).version; 67 | } -------------------------------------------------------------------------------- /bin/tests: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Start the testing server and store its pid 4 | node tests/server.js & pid=$! 5 | 6 | # Run the tests 7 | node_modules/mocha/bin/mocha tests/tests.js --timeout 4000 --reporter list 8 | 9 | # Kill the test server via its pid 10 | kill $! 11 | -------------------------------------------------------------------------------- /docs/.satya-config.yml: -------------------------------------------------------------------------------- 1 | ######################################### 2 | # project site config 3 | 4 | project_name: noodle 5 | project_url: https://github.com/dharmafly/noodle 6 | version: '0.3.2' 7 | # options: forest, ocean, horus, seagrass, sundae, slate 8 | theme: ocean 9 | twitter_url: https://twitter.com/dharmafly 10 | # options: javascript, css, html5 11 | lang: javascript 12 | scripts: 13 | - src: //ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js 14 | # - src: https://raw.github.com/dharmafly/PROJECT-REPO/master/PROJECT.js 15 | quote: 16 | #- quote: 17 | #- cite: 18 | # analytics 19 | ga_id: UA-34978047-5 20 | download_links: 21 | - text: Edge 22 | subtext: (master) 23 | href: https://github.com/dharmafly/noodle/zipball/master 24 | title: The repo's latest codebase (zip). Potentially unstable. 
25 | sections: 26 | - path: /index.html 27 | name: Overview 28 | - path: /reference/index.html 29 | name: Reference 30 | 31 | ######## END project site config ######## 32 | -------------------------------------------------------------------------------- /docs/1. Overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: overview 3 | heading: 'Overview' 4 | --- 5 | 6 |  noodle is a Node.js server and module for querying and scraping data from web documents. It features: 7 | 8 | - Cross domain document querying (html, json, xml, atom, rss feeds) 9 | - Server supports querying via JSONP and JSON POST 10 | - Multiple queries per request 11 | - Access to queried server headers 12 | - Allows for POSTing to web documents 13 | - In memory caching for query results and web documents -------------------------------------------------------------------------------- /docs/10. Server quick start: -------------------------------------------------------------------------------- 1 | --- 2 | category: overview 3 | heading: 'Server quick setup' 4 | --- 5 | 6 | Setup 7 | 8 | $ git clone https://github.com/dharmafly/noodle.git 9 | $ cd noodle 10 | $ npm install 11 | 12 | Start the server by running the binary 13 | 14 | $ bin/noodle-server 15 | Server running on port 8888 16 | 17 | You may specify a port number as an argument 18 | 19 | $ bin/noodle-server 9090 20 | Server running on port 9090 21 | -------------------------------------------------------------------------------- /docs/2. 
Try it out.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: overview 3 | heading: 'Try it out' 4 | --- 5 | 6 | ## Install via NPM 7 | 8 | $ npm install noodlejs 9 | 10 | ## Install via Git 11 | 12 | $ git clone https://github.com/dharmafly/noodle.git 13 | 14 | ## Run the server and GET or POST queries on `localhost:8888` 15 | 16 | $ cd noodle 17 | # or `cd node_modules/noodlejs` if installed via npm 18 | $ bin/noodle-server 19 | Server running on port 8888 20 | 21 | ## Or use as a node module 22 | 23 | $ var noodle = require('noodlejs'); 24 | 25 | 26 | ## Editor 27 | 28 | Below is an editor where you can try writing a query yourself. 29 | 30 | The query below tells noodle to go to the google search result for 31 | JavaScript and expect a html file. Then using the selector pick out 32 | all of the result anchors. Finally the query says to extract the 33 | text for each of those anchor elements. 34 | 35 | Press run below to see the output: 36 | 37 | var query = { 38 | url: 'https://google.com/search?q=javascript', 39 | type: 'html', 40 | selector: 'h3.r a', 41 | extract: 'text' 42 | }, 43 | uriQuery = encodeURIComponent(JSON.stringify(query)), 44 | request = 'https://example.noodle.dharmafly.com/?q=' + 45 | uriQuery + '&callback=?'; 46 | 47 | // Make Ajax request to Noodle server 48 | jQuery.getJSON(request, function (data) { 49 | alert(data[0].results); 50 | }); 51 | 52 | Noodle queries don't just support html but also json, feeds and plain xml. They can be a lot more powerful too. 53 | [Read the reference for more details.](https://noodle.dharmafly.com/reference) 54 | -------------------------------------------------------------------------------- /docs/3. Web service.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Web service' 4 | --- 5 | 6 | Noodle can be used as both a web service and a node module. 
In each case, key/value objects are used as queries to fetch and extract data from web documents. 7 | 8 | noodle currently supports multiple web documents with an almost uniform query syntax for grabbing data from the different types (html, json, feeds, xml). 9 | 10 | noodle is ready to run as a web service from `bin/noodle-server`. 11 | 12 | 13 | ## Run the server 14 | 15 | $ cd noodle 16 | # or `cd node_modules/noodlejs` if installed via npm 17 | $ bin/noodle-server 18 | Server running on port 8888 19 | 20 | 21 | ## GET or POST 22 | 23 | The server supports queries via both GET and POST. 24 | 25 | ### GET 26 | 27 | The query itself can be sent in the `q` parameter either as a url encoded JSON blob or as a querystring serialised representation (`jQuery.param()`). 28 | 29 | noodle supports JSONP if a `callback` parameter is supplied. 30 | 31 | GET https://example.noodle.dharmaflt.com?q={JSONBLOB}&callback=foo 32 | 33 | 34 | ### POST 35 | 36 | noodle also supports a query sent as JSON in the POST body. 37 | 38 | POST https://example.noodle.dharmafly.com 39 | 40 | 41 | ## Rate limiting 42 | 43 | The web service also provides rate limiting out of the box with 44 | [connect-ratelimit](https://github.com/dharmafly/connect-ratelimit). 45 | 46 | 47 | ## Configuration 48 | 49 | ### Server port 50 | 51 | The specify what port the noodle web service serves on just write it as the 52 | first argument to the binary. 53 | 54 | $ bin/noodle-server 9000 55 | Server running on port 9000 56 | 57 | ### Behaviour settings 58 | 59 | Various noodle settings like cache and ratelimit settings are exposed 60 | and can be edited in `lib/config.json`. 
61 | 62 | { 63 | // Setting to true will log out information to the 64 | // terminal 65 | 66 | "debug": true, 67 | 68 | "resultsCacheMaxTime": 3600000, 69 | "resultsCachePurgeTime": 60480000, // -1 will turn purging off 70 | "resultsCacheMaxSize": 124, 71 | 72 | "pageCacheMaxTime": 3600000, 73 | "pageCachePurgeTime": 60480000, // -1 will turn purging off 74 | "pageCacheMaxSize": 32, 75 | 76 | // If no query type option is supplied then 77 | // what should noodle assume 78 | 79 | "defaultDocumentType": "html", 80 | 81 | // How the noodle scraper identifies itself 82 | // to scrape targets 83 | 84 | "userAgent": "", 85 | 86 | // Rate limit settings 87 | // https://npmjs.org/package/connect-ratelimit#readme 88 | 89 | "rateLimit": { 90 | "whitelist": ["127.0.0.1", "localhost"], 91 | "blacklist": [], 92 | "categories": { 93 | "normal": { 94 | "totalRequests": 1000, 95 | "every": 3600000000 96 | }, 97 | "whitelist": { 98 | "totalRequests": 10000, 99 | "every": 60000000 100 | }, 101 | "blacklist": { 102 | "totalRequests": 0, 103 | "every": 0 104 | } 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /docs/4. Query syntax.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Query syntax' 4 | --- 5 | 6 | A simple query looks like this: 7 | 8 | { 9 | "url": "http://chrisnewtn.com", 10 | "type": "html", 11 | "selector": "ul.social li a", 12 | "extract": "href", 13 | } 14 | 15 | It says to go to a friend's website and for noodle to expect a html document. 16 | Then to select anchor elements in a list and for each one extract the href 17 | attribute's value. 18 | 19 | The `type` property is used to tell noodle if you are wanting to scrape a html 20 | page, json document etc. If no type is specified then a html page will be 21 | assumed by default. 22 | 23 | A similar query can be constructed to extract information from a JSON document. 
24 | JSONSelect is used as the underlying library to do this. It supports common CSS3 25 | selector functionality. You can [familiarize yourself with it here.](http://jsonselect.org/#tryit) 26 | 27 | { 28 | "url": "https://search.twitter.com/search.json?q=friendship", 29 | "selector": ".results .from_user", 30 | "type": "json" 31 | } 32 | 33 | An `extract` property is not needed for a query on JSON documents as json 34 | properties have no metadata and just a single value were as a html element 35 | can have text, the inner html or an attribute like `href`. 36 | 37 | ## Different types (html, json, feed & xml) 38 | 39 | ### html 40 | 41 | **Note:** Some xml documents can be parsed by noodle under the html type! 42 | 43 | The html type is the only type to have the `extract` property. This is because 44 | the other types are converted to JSON. 45 | 46 | The `extract` property (optional) could be the HTML element's attribute 47 | but it is not required. 48 | 49 | Having `"html"` or `"innerHTML"` as the `extract` value will return the 50 | containing HTML within that element. 51 | 52 | Having `"text"` as the `extract` value will return only the text. noodle will 53 | strip out any new line characters found in the text. 54 | 55 | Return data looks like this: 56 | 57 | [ 58 | { 59 | "results": [ 60 | "http://twitter.com/chrisnewtn", 61 | "http://plus.google.com/u/0/111845796843095584341" 62 | ], 63 | "created": "2012-08-01T16:22:14.705Z" 64 | } 65 | ] 66 | 67 | Having no specific extract rule will assume a default of extracting `"text"` 68 | from the `selector`. 69 | 70 | It is also possible to request multiple properties to extract in one query if 71 | one uses an array. 
72 | 73 | Query: 74 | 75 | { 76 | "url": "http://chrisnewtn.com", 77 | "selector": "ul.social li a", 78 | "extract": ["href", "text"] 79 | } 80 | 81 | Response: 82 | 83 | [ 84 | { 85 | "results": [ 86 | { 87 | "href": "http://twitter.com/chrisnewtn", 88 | "text": "Twitter" 89 | }, 90 | { 91 | "href": "http://plus.google.com/u/0/111845796843095584341", 92 | "text": "Google+" 93 | } 94 | ], 95 | "created": "2012-08-01T16:23:41.913Z" 96 | } 97 | ] 98 | 99 | In the query's `selector` property use the standard CSS DOM selectors. 100 | 101 | ### json and xml 102 | 103 | The same rules apply from html to the json and xml types. Only that the 104 | `extract` property should be ommitted from queries as the JSON node value(s) 105 | targetted by the `selector` is always assumed. 106 | 107 | In the query's `selector` property use 108 | [JSONSelect](http://jsonselect.org/#tryit) style selectors. 109 | 110 | ### feeds 111 | 112 | The same rules apply to the json and xml types. Only that the `extract` property 113 | should be ommitted from queries as the JSON node value(s) targetted by the 114 | `selector` is always assumed. 115 | 116 | In the query's `selector` property use 117 | [JSONSelect](http://jsonselect.org/#tryit) style selectors. 118 | 119 | The feed type is based upon 120 | [node-feedparser](https://github.com/danmactough/node-feedparser) so it 121 | supports Robust RSS, Atom, and RDF standards. 122 | 123 | [Familiarize yourself with its](https://github.com/danmactough/node-feedparser#what-is-the-parsed-output-produced-by-feedparser) normalisation format before you use JSONSelect style 124 | selector. 125 | 126 | ## Getting the entire web document 127 | 128 | If no `selector` is specified than the entire document is returned. This is a 129 | rule applied to all types of docments. The `extract` rule will be ignored if 130 | included. 
131 | 132 | Query: 133 | 134 | { 135 | "url": "https://search.twitter.com/search.json?q=friendship" 136 | } 137 | 138 | Response: 139 | 140 | [ 141 | { 142 | "results": [""], 143 | "created": "2012-10-24T15:37:29.796Z" 144 | } 145 | ] 146 | 147 | ## Mapping a query to familiar properties 148 | 149 | Queries can also be written in noodle's map notation. The map notation allows 150 | for the results to be accessible by your own more helpful property names. 151 | 152 | In the example below map is used to create a result object of a person and 153 | their repos. 154 | 155 | { 156 | "url": "https://github.com/chrisnewtn", 157 | "type": "html", 158 | "map": { 159 | "person": { 160 | "selector": "span[itemprop=name]", 161 | "extract": "text" 162 | }, 163 | "repos": { 164 | "selector": "li span.repo", 165 | "extract": "text" 166 | } 167 | } 168 | } 169 | 170 | With results looking like this: 171 | 172 | [ 173 | { 174 | "results": { 175 | "person": [ 176 | "Chris Newton" 177 | ], 178 | "repos": [ 179 | "cmd.js", 180 | "simplechat", 181 | "sitestatus", 182 | "jquery-async-uploader", 183 | "cmd-async-slides", 184 | "elsewhere", 185 | "pablo", 186 | "jsonpatch.js", 187 | "jquery.promises", 188 | "llamarama" 189 | ] 190 | }, 191 | "created": "2013-03-25T15:38:01.918Z" 192 | } 193 | ] 194 | 195 | ## Getting hold of page headers 196 | 197 | Within a query include the `headers` property with an array value listing the 198 | headers you wish to recieve back as an object structure. `'all'` may also be 199 | used as a value to return all of the server headers. 200 | 201 | Headers are treated case-insensitive and the returned property names will 202 | match exactly to the string you requested with. 
203 | 204 | Query: 205 | 206 | { 207 | "url": "http://github.com", 208 | "headers": ["connection", "content-TYPE"] 209 | } 210 | 211 | Result: 212 | 213 | [ 214 | { 215 | "results": [...], 216 | "headers": { 217 | "connection": "keep-alive", 218 | "content-TYPE": "text/html" 219 | } 220 | "created":"2012-11-14T13:06:02.521Z" 221 | } 222 | ] 223 | 224 | ### Link headers for pagination 225 | 226 | noodle provides a shortcut to the server Link header with the query 227 | `linkHeader` property set to `true`. Link headers are useful as some web APIs 228 | use them to expose their pagination. 229 | 230 | The Link header will be parsed to an object structure. If you wish to have the Link header in its usual formatting then include it in the `headers` array instead. 231 | 232 | Query: 233 | 234 | { 235 | "url": "https://api.github.com/users/premasagar/starred", 236 | "type": "json", 237 | "selector": ".language", 238 | "headers": ["connection"], 239 | "linkHeader": true 240 | } 241 | 242 | Result: 243 | 244 | [ 245 | { 246 | "results": [ 247 | "JavaScript", 248 | "Ruby", 249 | "JavaScript", 250 | ], 251 | "headers": { 252 | "connection": "keep-alive", 253 | "link": { 254 | "next": "https://api.github.com/users/premasagar/starred?page=2", 255 | "last": "https://api.github.com/users/premasagar/starred?page=21" 256 | } 257 | }, 258 | "created": "2012-11-16T15:48:33.866Z" 259 | } 260 | ] 261 | 262 | 263 | ## Querying to a POST url 264 | 265 | noodle allows for post data to be passed along to the target web server 266 | specified in the url. This can be optionally done with the `post` property 267 | which takes an object map of the post data key/values. 268 | 269 | { 270 | "url": "http://example.com/login.php", 271 | "post": { 272 | "username": "john", 273 | "password": "123" 274 | }, 275 | "select": "h1.username", 276 | "type": "html" 277 | } 278 | 279 | Take not however that queries with the `post` property will not be cached. 
280 | 281 | ## Querying without caching 282 | 283 | If `cache` is set to `false` in your query then noodle will not cache the 284 | results or associated page and it will get the data fresh. This is useful for 285 | debugging. 286 | 287 | { 288 | "url": "http://example.com", 289 | "selector": "h1", 290 | "cache": "false" 291 | } 292 | 293 | ## Query errors 294 | 295 | noodle aims to give errors for the possible use cases were a query does 296 | not yield any results. 297 | 298 | Each error is specific to one result object and are contained in the `error` 299 | property as a string message. 300 | 301 | Response: 302 | 303 | [ 304 | { 305 | "results": [], 306 | "error": "Document not found" 307 | } 308 | ] 309 | 310 | noodle also falls silently with the `'extract'` property by ommitting any 311 | extract results from the results object. 312 | 313 | Consider the following JSON response to a partially incorrect query. 314 | 315 | Query: 316 | 317 | { 318 | "url": "http://chrisnewtn.com", 319 | "selector": "ul.social li a", 320 | "extract": ["href", "nonexistent"] 321 | } 322 | 323 | Response: 324 | 325 | The extract "nonexistent" property is left out because it was not found 326 | on the element. 327 | 328 | [ 329 | { 330 | "results": [ 331 | { 332 | "href": "http://twitter.com/chrisnewtn" 333 | }, 334 | { 335 | "href": "http://plus.google.com/u/0/111845796843095584341" 336 | } 337 | ], 338 | "created": "2012-08-01T16:28:19.167Z" 339 | } 340 | ] 341 | 342 | ## Multiple queries 343 | 344 | Multiple queries can be made per request to the server. You can mix between 345 | different types of queries in the same request as well as queries in the map 346 | notation. 
347 | 348 | Query: 349 | 350 | [ 351 | { 352 | "url": "http://chrisnewtn.com", 353 | "selector": "ul.social li a", 354 | "extract": ["text", "href"] 355 | }, 356 | { 357 | "url": "http://premasagar.com", 358 | "selector": "#social_networks li a.url", 359 | "extract": "href" 360 | } 361 | ] 362 | 363 | Response: 364 | 365 | [ 366 | { 367 | "results": [ 368 | { 369 | "href": "http://twitter.com/chrisnewtn", 370 | "text": "Twitter" 371 | }, 372 | { 373 | "href": "http://plus.google.com/u/0/111845796843095584341", 374 | "text": "Google+" 375 | } 376 | ], 377 | "created": "2012-08-01T16:23:41.913Z" 378 | }, 379 | { 380 | "results": [ 381 | "http://dharmafly.com/blog", 382 | "http://twitter.com/premasagar", 383 | "https://github.com/premasagar", 384 | ], 385 | "created": "2012-08-01T16:22:13.339Z" 386 | } 387 | ] 388 | 389 | ## Proxy Support 390 | 391 | When calling a page multiple times some sites can and will ban your IP address, Adding support for proxy IP addresses allows the rotation of IP addresses. 392 | 393 | Query: 394 | 395 | { 396 | "url": "http://chrisnewtn.com", 397 | "selector": "ul.social li a", 398 | "extract": ["text", "href"], 399 | "proxy": "XXX.XXX.XXX.XXX" 400 | } 401 | -------------------------------------------------------------------------------- /docs/5. Noodle as node module.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Noodle as node module' 4 | --- 5 | 6 | **Note:** Since noodle's internal cache uses an interval this will keep the 7 | related node process running indefinately. Be sure to run `noodle.stopCache()` 8 | in your code when you're finished with noodle. 9 | 10 | ## Methods 11 | 12 | ### noodle.query 13 | 14 | The main entry point to noodle's functionality is the `query` method. This 15 | method accepts a query or an array of queries as its only parameter and returns 16 | a [promise](https://github.com/kriskowal/q). 
17 | 18 | var noodle = require('noodlejs'); 19 | noodle.query(queries).then(function (results) { 20 | console.log(results); 21 | }); 22 | 23 | The makeup of query(s) is analagous to using noodle as a web service (as 24 | [stated above](http://noodlejs.com/reference/#query-syntax)). The 25 | exception being that you supply a proper object and not JSON. 26 | 27 | ### noodle.fetch 28 | 29 | This method returns a [promises](https://github.com/kriskowal/q). Which upon 30 | resolutions hands over the requested web document. 31 | 32 | noodle.fetch(url).then(function (page) { 33 | console.log(page); 34 | }); 35 | 36 | 37 | ### noodle.html.select 38 | 39 | For applying one query to a html string and retrieving the results. 40 | 41 | noodle.html.select(html, {selector: 'title', extract: 'innerHTML'}) 42 | .then(function (result) { 43 | console.log(result); 44 | }); 45 | 46 | 47 | ### noodle.json.select 48 | 49 | For applying one query to a parsed JSON representation (object). 50 | 51 | var parsed = JSON.parse(json); 52 | noodle.html.select(parsed, {selector: '.name'}) 53 | .then(function (result) { 54 | console.log(result); 55 | }); 56 | 57 | ## noodle.feed.select 58 | 59 | Normalises an RSS, ATOM or RDF string with 60 | [node-feedparser](https://github.com/danmactough/node-feedparser) then proxies 61 | that normalised object to `noodle.json.select`. 62 | 63 | ### noodle.xml.select 64 | 65 | Proxies to `noodle.json.select`. 66 | 67 | ### noodle events 68 | 69 | noodle's `noodle.events` namespace allows one to listen for emitted cache 70 | related events. Noodle inherits from node's [EventEmitter](http://nodejs.org/api/events.html#events_class_events_eventemitter). 
71 | 72 | // Called when a page is cached 73 | noodle.events.on('cache/page', function (obj) { 74 | //obj is the page cache object detailing the page, its headers 75 | //and when it was first cached 76 | }); 77 | 78 | // Called when a result is cached 79 | noodle.events.on('cache/result', function (obj) { 80 | //obj is the result cache object detailing the result and when 81 | //it was first cached 82 | }); 83 | 84 | // Called when the cache is purged 85 | noodle.events.on('cache/purge', function (arg1, arg2) { 86 | //arg1 is a javascript date representing when the cache was purged 87 | //arg2 is the time in milliseconds until the next cache purge 88 | }); 89 | 90 | // Called when a cached item has expired from the cache 91 | noodle.events.on('cache/expire', function (obj) { 92 | //obj is the cache item 93 | }); 94 | 95 | ### Configuration 96 | 97 | Configuration is possible programmatically via `noodle.configure(obj)`. 98 | 99 | This accepts a conig object which can be partly or fully representing the 100 | config options. 101 | 102 | This object is applied over the existing config found in the `config.json`. 103 | 104 | Example for change just two settings: 105 | 106 | var noodle = require('noodlejs'); 107 | 108 | // Do not display messages to the terminal and set 109 | // the default document type to json 110 | 111 | noodle.configure({ 112 | debug: false, 113 | defaultDocumentType: "json" 114 | }); 115 | -------------------------------------------------------------------------------- /docs/6. Error handling.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Error handling' 4 | --- 5 | 6 | Noodle will fire various errors which one can listen for with the `fail()` 7 | handler. 
8 | 9 | noodle.html.fetch(query) 10 | .then(function (result) { 11 | console.log('The results are', results); 12 | }) 13 | .fail(function (error) { 14 | console.log('Uh oh', error.message); 15 | }); 16 | 17 | ## Possible errors 18 | 19 | The noodle module itself emits only one error: 20 | 21 | - `"Document not found"` when a targetted url is not found. 22 | 23 | Were as the specific document type modules emit their own but should bubble 24 | up to the main `noodle.query` method. 25 | 26 | - `'Could not parse XML to JSON'` 27 | - `'Could not parse JSON document'` 28 | - `'Could not match with that selector'` 29 | - `'Could not match with that selector or extract value'` 30 | -------------------------------------------------------------------------------- /docs/7. Caching.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Caching' 4 | --- 5 | 6 | noodle includes an in memory cache for both queried pages and the query 7 | results to help with the speed of requests. 8 | 9 | This cache can be configured in the `noodlejs/lib/config.json` file. 10 | 11 | This cache is included in the noodle library core not at its web service. 12 | 13 | Caching is done on a singular query basis and not per all queries in a request. 14 | 15 | By default the page cache and results cache's individual items have a life time 16 | of an hour. With a cache itself having total size of 124 recorded items in 17 | memory at one time. A cache is also cleared entirely on a weekly basis. 18 | 19 | These values can all be changed from noodle's json config. 20 | 21 | ## HTTP caching headers 22 | 23 | The noodle web service includes `Expires` header. This is always set to the 24 | oldest to expire query result in a result set. 25 | 26 | Take not however that the browser [may not cache](http://stackoverflow.com/questions/626057/is-it-possible-to-cache-post-methods-in-http) POST requests to the noodle server. 
-------------------------------------------------------------------------------- /docs/8. Adding to noodle.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Adding to noodle' 4 | --- 5 | 6 | noodle is an open-source project 7 | [maintained on github](https://github.com/dharmafly/premasagar) so raising 8 | issues and forking is encouraged. 9 | 10 | ## Supporting different web documents 11 | 12 | By default noodle supports html, json, standard feeds and xml web documents but 13 | noodle also provides a concise environment for developers to write their own 14 | type modules with prior knowledge only needed in 15 | [promises](https://github.com/kriskowal/q). 16 | 17 | To add their own type, one creates the script for that type in 18 | `noodlejs/lib/types` with the name being what one would type in a query. 19 | 20 | ` $ touch noodlejs/lib/types/csv.js` 21 | 22 | As for the content of the script a developer should expose at least 2 methods 23 | (`_init` & `fetch`) and is recommended to expose a `select` method. These 24 | methods must be written with a promise interface interoperable with 25 | [the q library](https://github.com/kriskowal/q). It is reccomended you just use 26 | [q](https://github.com/kriskowal/q). 27 | 28 | **Required methods** 29 | 30 | `exports._init = function (noodle) {}` 31 | 32 | This function is passed the main noodle library. You should keep hold of this 33 | reference so you can make use of some important noodle methods covered in a bit. 34 | 35 | `exports.fetch = function (url, query) {}` 36 | 37 | This method is the entry point to your module by noodle and possibly other 38 | developers. This is the function which leads to all of your processing. 39 | 40 | Make use of `noodle.cache.get` to resolve your promise early with a cached 41 | results without the need to fetch the page and process the query. 
42 | 43 | It is higly recommended you do not fetch the page yourself but use the core 44 | `noodle.fetch` since this handles page caching for you. 45 | 46 | When you have the document pass it and the query to your `select` function for 47 | processing with the query. 48 | function fetch (url, query) { 49 | var deferred = q.defer(); 50 | if (noodle.cache.check(query)) { 51 | deferred.resolve(noodle.cache.get(query).value); 52 | return deferred.promise; 53 | } else { 54 | return noodle.fetch(url, query).then(function (page) { 55 | return select(page, query); 56 | }); 57 | } 58 | } 59 | 60 | **Recommended methods** 61 | 62 | `exports.select = function (document, query) {}` 63 | 64 | This method is where you do your actual selecting of the data using the web 65 | document given from your `fetch` method via `noodle.fetch`. 66 | 67 | In your algorithm do not account for multiple queries. This is done at a higher 68 | level by noodle which iterates over your type module. 69 | 70 | It is also highly recommended that you cache your result this is done simply by 71 | wrapping it in the `noodle._wrapResults` method. 72 | 73 | `deferred.resolve(noodle._wrapResults(results, query));` 74 | 75 | What defines query properties like `extract` or `select` is what your own 76 | select function expects to find in the `query` object passed in. For example: 77 | 78 | 79 | // Query 80 | { 81 | "url": "http://example.com/data.csv", 82 | "type": "csv", 83 | "from": "row1", 84 | "to": "row10" 85 | } 86 | 87 | // Your interpretation 88 | function select (document, query) { 89 | ... 90 | csvparser.slice(query.from, query.to); 91 | ... 
92 | } 93 | 94 | **Example script** 95 | 96 | An example implementation could look like this: 97 | 98 | var q = require('q'), 99 | noodle = null; 100 | 101 | exports._init = function (n) { 102 | noodle = n; 103 | } 104 | 105 | exports.fetch = function (url, query) { 106 | var deferred = q.Defer(); 107 | if (noodle.cache.check(query)) { 108 | deferred.resolve(noodle.cache.get(query).value); 109 | return deferred.promise; 110 | } else { 111 | return noodle.fetch(url).then(function (page) { 112 | return exports.select(page, query); 113 | }); 114 | } 115 | } 116 | 117 | exports.select = function (page, query) { 118 | var deferred = q.Defer(), 119 | myResults = []; 120 | 121 | /* 122 | your algorithm here, dont forget to 123 | deferred.resolve(noodle._wrapResults(myResults, query)) 124 | or 125 | deferred.fail(new Error("Selector was bad or something like that")) 126 | */ 127 | 128 | return deferred.promise; 129 | } 130 | -------------------------------------------------------------------------------- /docs/9. Tests.md: -------------------------------------------------------------------------------- 1 | --- 2 | category: reference 3 | heading: 'Tests' 4 | --- 5 | 6 | The noodle tests create a temporary server on port `8889` which the automated 7 | tests tell noodle to query against. 
8 | 9 | To run tests you can use the provided binary from the noodle package root 10 | directory: 11 | 12 | $ bin/tests 13 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | exports = require('./lib/noodle.js'); -------------------------------------------------------------------------------- /lib/cache.js: -------------------------------------------------------------------------------- 1 | var _ = require('underscore'); 2 | 3 | // ------------------------------------------------------------ 4 | // Cache class which can store, expose, expire and purge all 5 | // items in its memory. 6 | // 7 | // Two instances of Cache exist as pageCache and resultsCache 8 | // in noodle.js. 9 | // ------------------------------------------------------------ 10 | 11 | module.exports = function Cache (config, noodle) { 12 | var cache = [], 13 | intervalId1, 14 | intervalId2; 15 | 16 | // ------------------------------------------------------------ 17 | // Starts the interval for cache purging and cache expiry. 18 | // Called from noodle.js. 19 | // ------------------------------------------------------------ 20 | 21 | this.start = function () { 22 | 23 | // Check to see if a cache item is to be removed from the 24 | // cache (expired). 25 | 26 | intervalId1 = setInterval(function () { 27 | var now = new Date().getTime(), 28 | initialLength = cache.length, 29 | x = 0, 30 | keep = []; 31 | 32 | while (x < initialLength) { 33 | if ((now - cache[x].created) < config.cacheMaxTime) { 34 | keep.unshift(cache[x]); 35 | } else { 36 | noodle.events.emit('cache/expire', cache[x], config.cacheMaxTime); 37 | } 38 | x++; 39 | } 40 | 41 | cache = keep; 42 | }, 10000); 43 | 44 | // Remove all cache entries every time the cache purge time 45 | // is reached. 
46 | 47 | if (config.cachePurgeTime > 0) { 48 | intervalId2 = setInterval(function () { 49 | cache = []; 50 | noodle.events.emit('cache/purge', new Date(), config.cachePurgeTime); 51 | }, config.cachePurgeTime); 52 | } 53 | }; 54 | 55 | // ------------------------------------------------------------ 56 | // Store an object in the cache tied to specific key. 57 | // 58 | // In noodle: resultsCache stores a result set with the query 59 | // being the key. pageCache stores a document and its headers 60 | // with the url being the key. 61 | // ------------------------------------------------------------ 62 | 63 | this.put = function (key, value) { 64 | var item = { 65 | key: key, 66 | value: value, 67 | created: new Date() 68 | }; 69 | 70 | if (cache.length >= config.maxSize) { 71 | cache.pop(); 72 | } 73 | 74 | cache.unshift(item); 75 | return this.get(key); 76 | }; 77 | 78 | // ------------------------------------------------------------ 79 | // Boolean representing if an item exists for a specific key. 80 | // ------------------------------------------------------------ 81 | 82 | this.check = function (key) { 83 | return (find(key)) ? true : false; 84 | }; 85 | 86 | // ------------------------------------------------------------ 87 | // Returns a cached item based on a specific key. 88 | // 89 | // Cached items are objects with the following structure: 90 | // 91 | // { 92 | // created: 93 | // value: 94 | // } 95 | // ------------------------------------------------------------ 96 | 97 | this.get = function (key) { 98 | var item = find(key), 99 | clone = _.clone(item); 100 | 101 | delete clone.key; 102 | return clone; 103 | }; 104 | 105 | // ------------------------------------------------------------ 106 | // The cache array is exposed. Useful for debugging purposes. 
107 | // ------------------------------------------------------------ 108 | 109 | this.getCache = function () { 110 | return cache; 111 | }; 112 | 113 | // ------------------------------------------------------------ 114 | // Stops running the intervals for the cache checking. Useful 115 | // for removing cache objects from the event loop and keeping 116 | // the node process from running indefinitely. 117 | // ------------------------------------------------------------ 118 | 119 | this.stop = function () { 120 | clearInterval(intervalId1); 121 | clearInterval(intervalId2); 122 | }; 123 | 124 | // Loops through the cache array finding the cached item 125 | // associated with the key. 126 | 127 | function find (key) { 128 | var i = 0; 129 | for (i; i < cache.length; i++) { 130 | if (_.isEqual(key, cache[i].key)) { 131 | return cache[i]; 132 | } 133 | } 134 | } 135 | }; 136 | -------------------------------------------------------------------------------- /lib/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "debug": true, 3 | 4 | "resultsCacheMaxTime": 3600000, 5 | "resultsCachePurgeTime": 60480000, 6 | "resultsCacheMaxSize": 124, 7 | 8 | "pageCacheMaxTime": 3600000, 9 | "pageCachePurgeTime": 60480000, 10 | "pageCacheMaxSize": 32, 11 | 12 | "defaultDocumentType": "html", 13 | 14 | "userAgent": "", 15 | 16 | "rateLimit": { 17 | "whitelist": ["127.0.0.1", "localhost"], 18 | "blacklist": [], 19 | "catagories": { 20 | "normal": { 21 | "totalRequests": 1000, 22 | "every": 3600000000 23 | }, 24 | "whitelist": { 25 | "totalRequests": 10000, 26 | "every": 60000000 27 | }, 28 | "blacklist": { 29 | "totalRequests": 0, 30 | "every": 0 31 | } 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /lib/logger.js: -------------------------------------------------------------------------------- 1 | require('colors'); 2 | 3 | var messages = 0; 4 | 5 | module.exports = function 
(noodle) { 6 | var events = noodle.events, 7 | config = noodle.config; 8 | 9 | function toTerminal(message) { 10 | if (config.debug) { 11 | console.log(('\n [noodle log #' + ++messages + ']').green); 12 | console.log('', new Date().toString().magenta); 13 | console.log('', memUsage().magenta); 14 | console.log('', (message + '\n').magenta); 15 | } 16 | } 17 | 18 | // Called on a query 19 | events.on('noodle/query', function (query) { 20 | toTerminal('Noodle: The query follows...\n ' + JSON.stringify(query)); 21 | }); 22 | 23 | // Called when a page is cached 24 | events.on('cache/page', function (cachePage) { 25 | toTerminal('Cache: Page has been cached'); 26 | }); 27 | 28 | // Called when a result is cached 29 | events.on('cache/result', function (cacheResult) { 30 | toTerminal('Cache: Result has been cached'); 31 | }); 32 | 33 | // Called when the cache is purged 34 | events.on('cache/purge', function (when, next) { 35 | toTerminal('Cache: Purge @ ' + when + ' next in ' + next); 36 | }); 37 | 38 | // Called when a cached item has expired from the cache 39 | events.on('cache/expire', function (item, next) { 40 | toTerminal('Cache: An item expired from cache, next in ' + next); 41 | }); 42 | }; 43 | 44 | function memUsage () { 45 | var heapTotal = process.memoryUsage().heapTotal; 46 | return 'Memory: ' + (heapTotal / 1048576).toFixed(2) + 47 | 'mb (' + heapTotal + ' bytes)'; 48 | } -------------------------------------------------------------------------------- /lib/noodle-middleware.js: -------------------------------------------------------------------------------- 1 | var zlib = require('zlib'), 2 | moment = require('moment'), 3 | _ = require('underscore'), 4 | noodle = require('../lib/noodle'); 5 | 6 | exports.parseQueries = function (req, res, next) { 7 | var hasJSON = (Object.keys(req.body).length > 0), 8 | hasQueryString = (Object.keys(req.query).length > 0), 9 | queries; 10 | 11 | // Handle for different request types 12 | 13 | // Take JSON from request 
body (http post) 14 | queries = (hasJSON) ? req.body : false; 15 | // Take only single query JSON from request querystring (http get) 16 | queries = (queries === false && hasQueryString) ? req.query : queries; 17 | // Take JSON from request querystring (http get) 18 | queries = (req.query.q) ? toJSON(req.query.q) : queries; 19 | 20 | // Handle query(s) with noodle or fail early 21 | 22 | if (queries) { 23 | res.queries = queries; 24 | next(); 25 | } else { 26 | res.noodleData = {error: 'Malformed or no query'}; 27 | exports.respond(req, res); 28 | } 29 | }; 30 | 31 | exports.noodleQueries = function (req, res, next) { 32 | noodle.query(res.queries).then(function (results) { 33 | res.noodleData = results; 34 | next(); 35 | }); 36 | }; 37 | 38 | exports.respond = function (req, res) { 39 | var error = res.noodleData.error, 40 | callback = req.query.callback, 41 | responseBody; 42 | 43 | if (error) { 44 | res.statusCode = 401; 45 | responseBody = '[{"results": [], "error":"' + error + '"}]'; 46 | } else { 47 | res.statusCode = 200; 48 | res.setHeader('Expires', setExpiresHeader(res.noodleData.results)); 49 | responseBody = JSON.stringify(res.noodleData.results); 50 | } 51 | 52 | if (callback) { 53 | res.setHeader('Content-Type', 'application/javascript'); 54 | responseBody = callback + '(' + responseBody + ')'; 55 | } else { 56 | res.setHeader('Content-Type', 'application/json; charset=utf-8'); 57 | } 58 | 59 | responseBody = new Buffer(responseBody, 'utf8'); 60 | 61 | if (req.headers['accept-encoding']) { 62 | res.setHeader('content-encoding', 'gzip'); 63 | zlib.gzip(responseBody, function (err, buffer) { 64 | res.end(buffer); 65 | }); 66 | } else { 67 | res.end(responseBody); 68 | } 69 | }; 70 | 71 | function setExpiresHeader (results) { 72 | var temp; 73 | 74 | results = (_.isArray(results)) ? 
results : [results]; 75 | 76 | // Get the earliest time first (last to expire) 77 | // use concat() to not mutate the original results order 78 | 79 | temp = results.concat().sort(function (a, b) { 80 | return (b.created || 0) - (a.created || 0); 81 | }); 82 | 83 | // Return oldest to expire or return the present time for 84 | // a bad result which was not cached 85 | 86 | if (temp[0].created) { 87 | return moment(temp[0].created.getTime() + noodle.config.resultsCacheMaxTime) 88 | .format('ddd, D MMM YYYY HH:mm:ss') + ' GMT'; 89 | } else { 90 | return moment(new Date()) 91 | .format('ddd, D MMM YYYY HH:mm:ss') + ' GMT'; 92 | } 93 | }; 94 | 95 | 96 | // Wraps JSON.parse so that numbers are treated as an invalid argument 97 | 98 | function toJSON (str) { 99 | var x; 100 | try { 101 | x = JSON.parse(str); 102 | if (typeof x === 'number') { 103 | return false; 104 | } 105 | return x; 106 | } catch (e) { 107 | return false; 108 | } 109 | } -------------------------------------------------------------------------------- /lib/noodle.js: -------------------------------------------------------------------------------- 1 | var q = require('q'), 2 | fs = require('fs'), 3 | events = require('events'), 4 | request = require('request'), 5 | _ = require('underscore'), 6 | Cache = require('./cache'), 7 | pageCache, 8 | resultsCache; 9 | 10 | 11 | // ------------------------------------------------------------ 12 | // Main noodle entry point for usage. 13 | // 14 | // Accepts one or an array of noodle queries. Based on the 15 | // query type it will make use of the appropriate type module 16 | // to do the processing. 17 | // 18 | // See docs/ for information on what and noodle queries can 19 | // be written. 20 | // ------------------------------------------------------------ 21 | 22 | exports.query = function (queries) { 23 | var deferred = q.defer(), 24 | promises = []; 25 | 26 | // Normalise one query to an array 27 | 28 | queries = _.isArray(queries) ? 
queries : [queries], 29 | 30 | // For each query route resolve it as either a normal query 31 | // or a map query 32 | 33 | queries.forEach(function (query, i) { 34 | var deferred = q.defer(); 35 | 36 | query.type = query.type || exports.config.defaultDocumentType; 37 | query.cache = (query.cache === false) ? false : true; 38 | 39 | exports.events.emit('noodle/query', query); 40 | 41 | if (exports[query.type]) { 42 | if (query.map) { 43 | handleQueryMap(query, deferred, i); 44 | } else { 45 | handleQuery(query, deferred, i); 46 | } 47 | } else { 48 | deferred.resolve({results: [], error: 'Document type not supported'}); 49 | } 50 | promises.push(deferred.promise); 51 | }); 52 | 53 | // Return master promise when all queries have resolved 54 | // and ensure that the order they were evaluated is 55 | // maintained 56 | 57 | q.all(promises) 58 | .then(function (results) { 59 | results = results.sort(function (a, b) { 60 | return a.orderNo - b.orderNo; 61 | }); 62 | 63 | results.forEach(function (result) { 64 | delete result.orderNo; 65 | }); 66 | 67 | deferred.resolve({results: results}); 68 | }); 69 | 70 | return deferred.promise; 71 | }; 72 | 73 | function handleQuery (query, deferred, i) { 74 | exports[query.type].fetch(query.url, query) 75 | .then(function (result) { 76 | result.orderNo = i; 77 | if (query.cache) { 78 | result.created = resultsCache.get(query).created; 79 | } 80 | deferred.resolve(result); 81 | }) 82 | .fail(function (error) { 83 | deferred.resolve({results: [], error: error.message, orderNo: i}); 84 | }); 85 | } 86 | 87 | function handleQueryMap (query, deferred, i) { 88 | map(query, function (error, result) { 89 | if (!error) { 90 | result.orderNo = i; 91 | if (query.cache) { 92 | result.created = resultsCache.get(query).created; 93 | } 94 | deferred.resolve(result); 95 | } else { 96 | deferred.resolve({results: [], error: error.message, orderNo: i}); 97 | } 98 | }); 99 | } 100 | 101 | // 
------------------------------------------------------------ 102 | // Fetch a web document (possibly from cache) with a url. 103 | // 104 | // The query should also be passed in as it contains 105 | // details if it should bypass the cache or if it is a POST 106 | // request. 107 | // 108 | // This fetch method is used by the different type modules to 109 | // get the document before they do they interpret the query 110 | // process the document. 111 | // ------------------------------------------------------------ 112 | 113 | exports.fetch = function (url, query, extendedHeaders) { 114 | var deferred = q.defer(), 115 | requestOptions = { 116 | method: 'GET', 117 | uri: url, 118 | headers: {'user-agent': exports.config.userAgent} 119 | }; 120 | 121 | if (query.proxy) { 122 | requestOptions.proxy = query.proxy; 123 | } 124 | 125 | 126 | if (query.post) { 127 | requestOptions.method = 'POST'; 128 | requestOptions.body = serialize(query.post); 129 | requestOptions.headers = _.extend(requestOptions.headers, { 130 | 'Content-Type': 'application/x-www-form-urlencoded', 131 | 'Content-Length': requestOptions.body.length 132 | }); 133 | query.cache = false; 134 | } 135 | 136 | if (extendedHeaders) { 137 | _.extend(requestOptions.headers, extendedHeaders); 138 | } 139 | 140 | // (!) This aspect should be revised. 141 | // Force cache true if the person wants header information 142 | // since header data is read from cache 143 | query.cache = (query.headers || query.linkHeader) ? 
true : query.cache; 144 | 145 | if (pageCache.check(url) && query.cache) { 146 | deferred.resolve(pageCache.get(url).value.body); 147 | } else { 148 | getDocument(query.cache, requestOptions, deferred); 149 | } 150 | 151 | return deferred.promise; 152 | }; 153 | 154 | function getDocument (shouldCache, options, deferred) { 155 | request(options, function (err, response, body) { 156 | if (err || response.statusCode !== 200) { 157 | deferred.reject(new Error('Document not found')); 158 | } else { 159 | if (shouldCache && !pageCache.check(options.uri)) { 160 | //added response.request in order to get the details like location and domain 161 | pageCache.put(options.uri, {body: body, headers: response.headers, request: response.request}); 162 | exports.events.emit('cache/page', pageCache.get(options.uri 163 | )); 164 | } 165 | deferred.resolve(body); 166 | } 167 | }); 168 | } 169 | 170 | // ------------------------------------------------------------ 171 | // Returns an object representing a result set which comprises 172 | // of an array of 1 or more results and the associate page 173 | // header information. 174 | // 175 | // (!!) This is where a result set is cached in resultsCache. 176 | // 177 | // Exposed as it is also called from some type modules. 
178 | // ------------------------------------------------------------ 179 | 180 | exports._wrapResults = function (results, query) { 181 | var resultSet = {}; 182 | 183 | if (results.length || Object.keys(results).length) { 184 | resultSet.results = results; 185 | 186 | if (query.headers) { 187 | resultSet.headers = getHeadersForResultSet(query); 188 | } 189 | 190 | if (query.request) { 191 | resultSet.request = getRequestDetailsForResultSet(query); 192 | } 193 | 194 | if (query.linkHeader) { 195 | resultSet.headers = resultSet.headers || {}; 196 | resultSet.headers.link = getLinkHeaders(query) || null; 197 | } 198 | 199 | if (query.cache) { 200 | if (resultsCache.check(query) === false) { 201 | resultsCache.put(query, resultSet); 202 | exports.events.emit('cache/result', resultsCache.get(query)); 203 | } 204 | } 205 | 206 | return resultSet; 207 | } 208 | 209 | return []; 210 | }; 211 | 212 | // ------------------------------------------------------------ 213 | // The namespace for noodles events. 214 | // 215 | // Events are emitted from both this file and cache.js. 216 | // 217 | // One can subscribe to the following events: 218 | // - cache/page 219 | // - cache/result 220 | // - cache/purge 221 | // - cache/expire 222 | // 223 | // ------------------------------------------------------------ 224 | 225 | exports.events = new events.EventEmitter(); 226 | 227 | // ------------------------------------------------------------ 228 | // An exposed noodle config initialized by an editable 229 | // json representation at lib/config.json 230 | // ------------------------------------------------------------ 231 | 232 | exports.config = JSON.parse(fs.readFileSync(__dirname +'/config.json')); 233 | 234 | // ------------------------------------------------------------ 235 | // Accepts a full or part config object an extends it over 236 | // the existing noodle config. 
237 | // 238 | // This is a way to programmatically configure the config 239 | // without touching lib/config.json 240 | // ------------------------------------------------------------ 241 | 242 | exports.configure = function (obj) { 243 | exports.config = _.extend(exports.config, obj); 244 | }; 245 | 246 | // ------------------------------------------------------------ 247 | // Stops the cache intervals from running in the event loop. 248 | // Allows for the node process to exit. 249 | // ------------------------------------------------------------ 250 | 251 | exports.stopCache = function () { 252 | resultsCache.stop(); 253 | pageCache.stop(); 254 | }; 255 | 256 | // Function called from exports.query() 257 | // 258 | // Takes in a query in the map notation 259 | // 260 | // For each map property, a call to the appropriate type module 261 | // is done and the result is grabbed for that map property's 262 | // value. 263 | // 264 | // When all properties are mapped with values this function calls 265 | // back to exports.query(). 
266 | 267 | function map (query, callback) { 268 | var promises = [], 269 | mappedContainer = {}, 270 | getResultSet, 271 | toPush, 272 | mapTo; 273 | 274 | getResultSet = function (mapTo, query) { 275 | query.map[mapTo].url = query.url; 276 | query.map[mapTo].cache = query.cache; 277 | 278 | return exports[query.type].fetch(query.url, query.map[mapTo]) 279 | .then(function (result) { 280 | mappedContainer[mapTo] = result.results; 281 | }) 282 | .fail(function (error) { 283 | mappedContainer[mapTo] = {results: [], error: error.message}; 284 | }); 285 | }; 286 | 287 | for (mapTo in query.map) { 288 | promises.push(getResultSet(mapTo, query)); 289 | } 290 | 291 | q.all(promises) 292 | .then(function () { 293 | callback(null, exports._wrapResults(mappedContainer, query)); 294 | }) 295 | .fail(function (error) { 296 | callback(error); 297 | }); 298 | } 299 | 300 | // Function called from exports._wrapResults() 301 | // 302 | // Passed in a query and returns the full page headers 303 | // or specific page headers as specified by the query. 304 | 305 | function getHeadersForResultSet (query) { 306 | var bucket = {}, 307 | pageHeaders = pageCache.get(query.url).value.headers, 308 | prop; 309 | 310 | if (query.headers !== 'all' && _.isArray(query.headers)) { 311 | for (prop in pageHeaders) { 312 | query.headers.forEach(function (name) { 313 | if (prop.toLowerCase() === name.toLowerCase()) { 314 | bucket[name] = pageHeaders[prop]; 315 | } 316 | }); 317 | } 318 | return bucket; 319 | } else { 320 | return pageHeaders; 321 | } 322 | } 323 | 324 | 325 | // Function called from exports._wrapResults() 326 | // 327 | // Passed in a query and returns the full request headers 328 | // or specific request headers as specified by the query. 
329 | function getRequestDetailsForResultSet(query) { 330 | var bucket = {}, 331 | requestHeaders = pageCache.get(query.url).value.request, 332 | prop; 333 | 334 | if (query.request !== 'all' && _.isArray(query.request)) { 335 | for (prop in requestHeaders) { 336 | query.request.forEach(function (name) { 337 | if(prop.toLowerCase() === name.toLowerCase()) { 338 | bucket[name] = requestHeaders[prop]; 339 | } 340 | }); 341 | } 342 | return bucket; 343 | } else { 344 | return requestHeaders; 345 | } 346 | } 347 | 348 | // Function called from exports._wrapResults() 349 | // 350 | // Passed in a query this function returns a parsed representation 351 | // of the Link header values (intended to aid people with navigation). 352 | 353 | function getLinkHeaders (query) { 354 | var header = pageCache.get(query.url).value.headers.link, 355 | links = {}, 356 | parts; 357 | 358 | if (header) { 359 | parts = header.split(','); 360 | } else { 361 | return false; 362 | } 363 | 364 | // Parse each part into a named link 365 | parts.forEach(function(p) { 366 | var section = p.split(';'), 367 | url = section[0].replace(/<(.*)>/, '$1').trim(), 368 | name = section[1].replace(/rel="(.*)"/, '$1').trim(); 369 | links[name] = url; 370 | }); 371 | 372 | return links; 373 | } 374 | 375 | // Function called from exports.query 376 | // 377 | // Will return a query parameter string from an object. 378 | 379 | function serialize (obj) { 380 | var str = [], p; 381 | for (p in obj) { 382 | str.push(encodeURIComponent(p) + "=" + encodeURIComponent(obj[p])); 383 | } 384 | return str.join("&"); 385 | } 386 | 387 | // .---------------------------. 
388 | // |noodle initialization stuff| 389 | // '---------------------------' 390 | 391 | // Initialize supported document types 392 | 393 | fs.readdirSync(__dirname + '/types/').forEach(function (file) { 394 | file = file.substr(0, file.lastIndexOf('.')); 395 | exports[file] = require('./types/' + file); 396 | exports[file]._init(exports); 397 | }); 398 | 399 | // Start the logger. 400 | // The logger will output to terminal if config.debug is set 401 | // to true. 402 | 403 | require('./logger')(exports); 404 | 405 | // Initialize caches 406 | 407 | // ------------------------------------------------------------ 408 | // The results cache is exposed for different type modules 409 | // so they can cache their results. 410 | // ------------------------------------------------------------ 411 | 412 | exports.cache = resultsCache = new Cache({ 413 | cacheMaxTime: exports.config.resultsCacheMaxTime, 414 | cachePurgeTime: exports.config.resultsCachePurgeTime, 415 | cacheMaxSize: exports.config.resultsCacheMaxSize 416 | }, exports); 417 | 418 | pageCache = new Cache({ 419 | cacheMaxTime: exports.config.pageCacheMaxTime, 420 | cachePurgeTime: exports.config.pageCachePurgeTime, 421 | cacheMaxSize: exports.config.pageCacheMaxSize 422 | }, exports); 423 | 424 | resultsCache.start(); 425 | pageCache.start(); 426 | -------------------------------------------------------------------------------- /lib/types/feed.js: -------------------------------------------------------------------------------- 1 | var q = require('q'), 2 | feedparser = require('feedparser'), 3 | noodle; 4 | 5 | exports._init = function (n) { 6 | noodle = n; 7 | }; 8 | 9 | exports.fetch = fetch; 10 | exports.select = select; 11 | 12 | function fetch (url, query) { 13 | var deferred = q.defer(); 14 | 15 | if (noodle.cache.check(query)) { 16 | deferred.resolve(noodle.cache.get(query).value); 17 | return deferred.promise; 18 | } else { 19 | return noodle.fetch(url, query).then(function (data) { 20 | return 
select(data, query); 21 | }); 22 | } 23 | } 24 | 25 | function select (data, query) { 26 | return normalise(data).then(function (normalised) { 27 | if (normalised.length === 0) { 28 | throw new Error('The provided document couldn\'t be normalised'); 29 | } 30 | return noodle.json.select(normalised, query); 31 | }); 32 | } 33 | 34 | function normalise (body) { 35 | var deferred = q.defer(), 36 | articles = []; 37 | 38 | feedparser 39 | .parseString(body) 40 | .on('article', function (a) { 41 | articles.push(a); 42 | }) 43 | .on('error', deferred.reject) 44 | .on('complete', function () { 45 | deferred.resolve(articles); 46 | }); 47 | 48 | return deferred.promise; 49 | } -------------------------------------------------------------------------------- /lib/types/html.js: -------------------------------------------------------------------------------- 1 | var q = require('q'), 2 | util = require('util'), 3 | cheerio = require('cheerio'), 4 | noodle; 5 | 6 | exports._init = function (n) { 7 | noodle = n; 8 | }; 9 | 10 | exports.fetch = fetch; 11 | exports.select = select; 12 | 13 | function fetch (url, query) { 14 | var deferred = q.defer(); 15 | 16 | if (noodle.cache.check(query)) { 17 | deferred.resolve(noodle.cache.get(query).value); 18 | return deferred.promise; 19 | } else { 20 | return noodle.fetch(url, query).then(function (page) { 21 | return select(page, query); 22 | }); 23 | } 24 | } 25 | 26 | function select (body, query) { 27 | var deferred = q.defer(), 28 | extract = query.extract || 'text', 29 | selector = query.selector, 30 | page = cheerio.load(body, { lowerCaseTags: true, lowerCaseAttributeNames: true }), 31 | selected = page(selector), 32 | results = []; 33 | 34 | if (!selector) { 35 | deferred.resolve(noodle._wrapResults(body.trim(), query)); 36 | return deferred.promise; 37 | } 38 | else if (util.isArray(extract)) { 39 | selected.each(function (i, elem) { 40 | var item = {}, 41 | notEmpty; 42 | 43 | extract.forEach(function (property) { 44 | 
item[property] = extractProperty(page, elem, property); 45 | notEmpty = notEmpty || item[property]; 46 | }); 47 | 48 | if (notEmpty) { 49 | results.push(item); 50 | } 51 | }); 52 | } 53 | else { 54 | selected.each(function (i, elem) { 55 | results.push(extractProperty(page, elem, extract)); 56 | }); 57 | } 58 | 59 | // Pass back the extracted results from the DOM 60 | 61 | if (results.length === 0) { 62 | deferred.reject(new Error('Could not match with that selector or extract value')); 63 | } else { 64 | deferred.resolve(noodle._wrapResults(results, query)); 65 | } 66 | 67 | return deferred.promise; 68 | } 69 | 70 | function extractProperty (page, elem, property) { 71 | if (property === 'text') { 72 | return page(elem).text().replace(/(\r\n|\n|\r)/gm, "").trim(); 73 | } 74 | else if (property === 'html' || property === 'innerHTML') { 75 | return page(elem).html(); 76 | } 77 | else { 78 | return page(elem).attr(property); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /lib/types/json.js: -------------------------------------------------------------------------------- 1 | var q = require('q'), 2 | jsonSelect = require('JSONSelect'), 3 | noodle; 4 | 5 | exports._init = function (n) { 6 | noodle = n; 7 | }; 8 | 9 | exports.fetch = fetch; 10 | exports.select = select; 11 | 12 | function fetch (url, query) { 13 | var deferred = q.defer(); 14 | 15 | if (noodle.cache.check(query)) { 16 | deferred.resolve(noodle.cache.get(query).value); 17 | return deferred.promise; 18 | } else { 19 | return noodle.fetch(url, query).then(function (data) { 20 | try { 21 | var parsed = JSON.parse(data); 22 | return select(parsed, query); 23 | } catch (e) { 24 | throw new Error('Could not parse JSON document'); 25 | } 26 | }); 27 | } 28 | } 29 | 30 | function select (parsed, query) { 31 | var deferred = q.defer(), 32 | results; 33 | 34 | try { 35 | if (!query.selector) { 36 | deferred.resolve(noodle._wrapResults([parsed], query)); 37 | } 
else { 38 | results = jsonSelect.match(query.selector, [], parsed); 39 | if (results.length === 0) { 40 | deferred.reject(new Error('Could not match with that selector')); 41 | } else { 42 | deferred.resolve(noodle._wrapResults(results, query)); 43 | } 44 | } 45 | } catch (e) { 46 | deferred.reject(new Error('Could not match with that selector')); 47 | } 48 | 49 | return deferred.promise; 50 | } -------------------------------------------------------------------------------- /lib/types/xml.js: -------------------------------------------------------------------------------- 1 | var q = require('q'), 2 | xml2json = require('xml2json'), 3 | noodle; 4 | 5 | exports._init = function (n) { 6 | noodle = n; 7 | }; 8 | 9 | exports.fetch = fetch; 10 | exports.select = select; 11 | 12 | function fetch (url, query) { 13 | var deferred = q.defer(); 14 | 15 | if (noodle.cache.check(query)) { 16 | deferred.resolve(noodle.cache.get(query).value); 17 | return deferred.promise; 18 | } else { 19 | return noodle.fetch(url, query).then(function (xml) { 20 | try { 21 | var parsed = JSON.parse(xml2json.toJson(xml)); 22 | return select(parsed, query); 23 | } catch (e) { 24 | throw new Error('Could not parse XML to JSON'); 25 | } 26 | }); 27 | } 28 | } 29 | 30 | function select (obj, query) { 31 | return noodle.json.select(obj, query); 32 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "noodlejs", 3 | "version": "0.3.2", 4 | "description": "noodle is a proxy server which serves for cross domain data extraction from web documents for any client.", 5 | "main": "./lib/noodle", 6 | "bin": { 7 | "noodle": "./bin/noodle-server" 8 | }, 9 | "dependencies": { 10 | "connect": "~2.3.5", 11 | "connect-ratelimit": "0.0.5", 12 | "JSONSelect": "0.4.0", 13 | "feedparser": "0.10.7", 14 | "moment": "1.7.2", 15 | "cheerio": "0.10.1", 16 | "request": "2.11.4", 
17 | "q": "0.8.9", 18 | "xml2json": "^0.5.1", 19 | "underscore": "1.4.2", 20 | "mocha": "1.7.4", 21 | "chai": "1.4.2", 22 | "colors": "0.6.0-1" 23 | }, 24 | "devDependencies": {}, 25 | "scripts": { 26 | "test": "echo \"Error: no test specified\" && exit 1", 27 | "start": "bin/noodle-server" 28 | }, 29 | "engines": { 30 | "node": "0.6.x" 31 | }, 32 | "repository": { 33 | "type": "git", 34 | "url": "git://github.com/dharmafly/noodle.git" 35 | }, 36 | "keywords": [ 37 | "scraper", 38 | "proxy", 39 | "cross-domain", 40 | "cross domain", 41 | "selectors", 42 | "JSONSelect", 43 | "json", 44 | "html", 45 | "web service", 46 | "rate limit" 47 | ], 48 | "author": "Dharmafly", 49 | "license": "BSD" 50 | } -------------------------------------------------------------------------------- /tests/document.atom: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example Feed 5 | 6 | 2003-12-13T18:30:02Z 7 | 8 | John Doe 9 | 10 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 11 | 12 | 13 | Atom-Powered Robots Run Amok 14 | 15 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 16 | 2003-12-13T18:30:02Z 17 | Some text. 
18 | 19 | 20 | -------------------------------------------------------------------------------- /tests/document.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dharmafly/noodle/52f88f5df0d2a92506aad702d6719350d2e79459/tests/document.html -------------------------------------------------------------------------------- /tests/document.json: -------------------------------------------------------------------------------- 1 | { 2 | "completed_in": 0.013, 3 | "max_id": 288657757152870400, 4 | "max_id_str": "288657757152870400", 5 | "next_page": "?page=2&max_id=288657757152870400&q=dinosaurs", 6 | "page": 1, 7 | "query": "dinosaurs", 8 | "refresh_url": "?since_id=288657757152870400&q=dinosaurs", 9 | "results": [ 10 | { 11 | "created_at": "Tue, 08 Jan 2013 14:45:46 +0000", 12 | "from_user": "_MsMindless", 13 | "from_user_id": 878142511, 14 | "from_user_id_str": "878142511", 15 | "from_user_name": "uh uh", 16 | "geo": null, 17 | "id": 288657757152870400, 18 | "id_str": "288657757152870400", 19 | "iso_language_code": "en", 20 | "metadata": { 21 | "result_type": "recent" 22 | }, 23 | "profile_image_url": "http://a0.twimg.com/profile_images/3081235144/d762e9c3edb63360aa49d7e6e62b683a_normal.jpeg", 24 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3081235144/d762e9c3edb63360aa49d7e6e62b683a_normal.jpeg", 25 | "source": "<a href="http://twitter.com/#!/download/ipad">Twitter for iPad</a>", 26 | "text": "@AmiMindless do they transform into dinosaurs in the night?", 27 | "to_user": "AmiMindless", 28 | "to_user_id": 1021682551, 29 | "to_user_id_str": "1021682551", 30 | "to_user_name": "amandaspiffytho.", 31 | "in_reply_to_status_id": 288657189856505860, 32 | "in_reply_to_status_id_str": "288657189856505856" 33 | }, 34 | { 35 | "created_at": "Tue, 08 Jan 2013 14:43:34 +0000", 36 | "from_user": "jirouishi", 37 | "from_user_id": 157924053, 38 | "from_user_id_str": "157924053", 39 | 
"from_user_name": "Aron ", 40 | "geo": { 41 | "coordinates": [ 42 | 37.596182, 43 | 127.056834 44 | ], 45 | "type": "Point" 46 | }, 47 | "id": 288657201952849900, 48 | "id_str": "288657201952849920", 49 | "iso_language_code": "ko", 50 | "metadata": { 51 | "result_type": "recent" 52 | }, 53 | "profile_image_url": "http://a0.twimg.com/profile_images/3080515500/b0b7d315abba887f50af14acdccfc1ab_normal.jpeg", 54 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3080515500/b0b7d315abba887f50af14acdccfc1ab_normal.jpeg", 55 | "source": "<a href="http://twitter.com/download/android">Twitter for Android</a>", 56 | "text": "omg ㅠㅠ nver thought a docu about dinosaurs would hurt this much https://t.co/TFsRXbMN", 57 | "to_user": null, 58 | "to_user_id": 0, 59 | "to_user_id_str": "0", 60 | "to_user_name": null 61 | }, 62 | { 63 | "created_at": "Tue, 08 Jan 2013 14:43:06 +0000", 64 | "from_user": "imexdanny", 65 | "from_user_id": 302811136, 66 | "from_user_id_str": "302811136", 67 | "from_user_name": "Danny Hughes", 68 | "geo": null, 69 | "id": 288657084260683800, 70 | "id_str": "288657084260683777", 71 | "iso_language_code": "en", 72 | "metadata": { 73 | "result_type": "recent" 74 | }, 75 | "profile_image_url": "http://a0.twimg.com/profile_images/3037826375/3530f8d5cf699b4d024a82e9ac4d8624_normal.jpeg", 76 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3037826375/3530f8d5cf699b4d024a82e9ac4d8624_normal.jpeg", 77 | "source": "<a href="http://twitter.com/download/iphone">Twitter for iPhone</a>", 78 | "text": "RT @AndyRothwell1: @imexdanny what killed ze dinosaurs? 
ZE ICE AGE!", 79 | "to_user": null, 80 | "to_user_id": 0, 81 | "to_user_id_str": "0", 82 | "to_user_name": null, 83 | "in_reply_to_status_id": 288656063555858400, 84 | "in_reply_to_status_id_str": "288656063555858432" 85 | }, 86 | { 87 | "created_at": "Tue, 08 Jan 2013 14:42:42 +0000", 88 | "from_user": "M3LbReEzY", 89 | "from_user_id": 89102612, 90 | "from_user_id_str": "89102612", 91 | "from_user_name": "Don Melocino", 92 | "geo": null, 93 | "id": 288656983974871040, 94 | "id_str": "288656983974871040", 95 | "iso_language_code": "en", 96 | "metadata": { 97 | "result_type": "recent" 98 | }, 99 | "profile_image_url": "http://a0.twimg.com/profile_images/1645636429/IMG_0747_normal.JPG", 100 | "profile_image_url_https": "https://si0.twimg.com/profile_images/1645636429/IMG_0747_normal.JPG", 101 | "source": "<a href="http://twitter.com/">web</a>", 102 | "text": "Some dinosaurs were as small as chickens #RandomSnappleFact.. Good morning Tweepsters", 103 | "to_user": null, 104 | "to_user_id": 0, 105 | "to_user_id_str": "0", 106 | "to_user_name": null 107 | }, 108 | { 109 | "created_at": "Tue, 08 Jan 2013 14:41:42 +0000", 110 | "from_user": "merissa_ariff", 111 | "from_user_id": 401955951, 112 | "from_user_id_str": "401955951", 113 | "from_user_name": "MissCaprisss", 114 | "geo": null, 115 | "id": 288656733193244700, 116 | "id_str": "288656733193244674", 117 | "iso_language_code": "en", 118 | "metadata": { 119 | "result_type": "recent" 120 | }, 121 | "profile_image_url": "http://a0.twimg.com/profile_images/3067213647/f7c0775af4a6d17d036192fd0ffea3d0_normal.jpeg", 122 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3067213647/f7c0775af4a6d17d036192fd0ffea3d0_normal.jpeg", 123 | "source": "<a href="http://twitter.com/">web</a>", 124 | "text": "@Sya_fxqxh Mind u i hate dinosaurs. If u know what i mean. 
ITS NNNIIINNNOOOOOOOOOOOOO", 125 | "to_user": "Sya_fxqxh", 126 | "to_user_id": 303860298, 127 | "to_user_id_str": "303860298", 128 | "to_user_name": "", 129 | "in_reply_to_status_id": 288655990411378700, 130 | "in_reply_to_status_id_str": "288655990411378688" 131 | }, 132 | { 133 | "created_at": "Tue, 08 Jan 2013 14:41:42 +0000", 134 | "from_user": "TwycrossZoo", 135 | "from_user_id": 66683145, 136 | "from_user_id_str": "66683145", 137 | "from_user_name": "Twycross Zoo", 138 | "geo": null, 139 | "id": 288656729946865660, 140 | "id_str": "288656729946865666", 141 | "iso_language_code": "en", 142 | "metadata": { 143 | "result_type": "recent" 144 | }, 145 | "profile_image_url": "http://a0.twimg.com/profile_images/1367172040/TZoo_WPS_Logo_blue_portrait_normal.png", 146 | "profile_image_url_https": "https://si0.twimg.com/profile_images/1367172040/TZoo_WPS_Logo_blue_portrait_normal.png", 147 | "source": "<a href="http://www.facebook.com/twitter">Facebook</a>", 148 | "text": "DINOSAUR FACT OF THE DAY: \nBaryonyx was a carnivore. It is one of the only dinosaurs known to feed on fish,... http://t.co/cgtbjyVL", 149 | "to_user": null, 150 | "to_user_id": 0, 151 | "to_user_id_str": "0", 152 | "to_user_name": null 153 | }, 154 | { 155 | "created_at": "Tue, 08 Jan 2013 14:41:29 +0000", 156 | "from_user": "Hippobatman", 157 | "from_user_id": 42901828, 158 | "from_user_id_str": "42901828", 159 | "from_user_name": "Ulf Martinsen", 160 | "geo": null, 161 | "id": 288656677736157200, 162 | "id_str": "288656677736157187", 163 | "iso_language_code": "en", 164 | "metadata": { 165 | "result_type": "recent" 166 | }, 167 | "profile_image_url": "http://a0.twimg.com/profile_images/248772203/thumb-super-mario-bros-8bit-Mario_normal.jpg", 168 | "profile_image_url_https": "https://si0.twimg.com/profile_images/248772203/thumb-super-mario-bros-8bit-Mario_normal.jpg", 169 | "source": "<a href="http://twitter.com/">web</a>", 170 | "text": "Why aren't there more games with dinosaurs in them? 
There need to be more games with dinosaurs in them. Dinosaurs are cool.", 171 | "to_user": null, 172 | "to_user_id": 0, 173 | "to_user_id_str": "0", 174 | "to_user_name": null 175 | }, 176 | { 177 | "created_at": "Tue, 08 Jan 2013 14:40:47 +0000", 178 | "from_user": "JessiJ0108", 179 | "from_user_id": 737981952, 180 | "from_user_id_str": "737981952", 181 | "from_user_name": "Jessica Johnson", 182 | "geo": null, 183 | "id": 288656499142696960, 184 | "id_str": "288656499142696961", 185 | "iso_language_code": "en", 186 | "metadata": { 187 | "result_type": "recent" 188 | }, 189 | "profile_image_url": "http://a0.twimg.com/profile_images/2936728482/d9911742cdd1f022bd23755829f0dce4_normal.jpeg", 190 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2936728482/d9911742cdd1f022bd23755829f0dce4_normal.jpeg", 191 | "source": "<a href="http://twitter.com/download/android">Twitter for Android</a>", 192 | "text": "RT @QuotingJokes: Kiss me if I'm wrong. But dinosaurs still exist right?", 193 | "to_user": null, 194 | "to_user_id": 0, 195 | "to_user_id_str": "0", 196 | "to_user_name": null 197 | }, 198 | { 199 | "created_at": "Tue, 08 Jan 2013 14:40:46 +0000", 200 | "from_user": "AndyRothwell1", 201 | "from_user_id": 288397628, 202 | "from_user_id_str": "288397628", 203 | "from_user_name": "Andy Rothwell", 204 | "geo": null, 205 | "id": 288656498899419140, 206 | "id_str": "288656498899419137", 207 | "iso_language_code": "en", 208 | "metadata": { 209 | "result_type": "recent" 210 | }, 211 | "profile_image_url": "http://a0.twimg.com/profile_images/2954727240/f16de28072350fdfa393679664894c8e_normal.jpeg", 212 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2954727240/f16de28072350fdfa393679664894c8e_normal.jpeg", 213 | "source": "<a href="http://twitter.com/">web</a>", 214 | "text": "@imexdanny what killed ze dinosaurs? 
ZE ICE AGE!", 215 | "to_user": "imexdanny", 216 | "to_user_id": 302811136, 217 | "to_user_id_str": "302811136", 218 | "to_user_name": "Danny Hughes", 219 | "in_reply_to_status_id": 288656063555858400, 220 | "in_reply_to_status_id_str": "288656063555858432" 221 | }, 222 | { 223 | "created_at": "Tue, 08 Jan 2013 14:40:35 +0000", 224 | "from_user": "BertBannister", 225 | "from_user_id": 939905077, 226 | "from_user_id_str": "939905077", 227 | "from_user_name": "bert bannister", 228 | "geo": null, 229 | "id": 288656448794279940, 230 | "id_str": "288656448794279936", 231 | "iso_language_code": "en", 232 | "metadata": { 233 | "result_type": "recent" 234 | }, 235 | "profile_image_url": "http://a0.twimg.com/profile_images/3008530754/bea3fcbc9efacf7144688f499bddf587_normal.jpeg", 236 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3008530754/bea3fcbc9efacf7144688f499bddf587_normal.jpeg", 237 | "source": "<a href="http://twitter.com/download/iphone">Twitter for iPhone</a>", 238 | "text": "@rogers2116 @sludgiesly Everyone: \"steak and a pint please mate\" Roge: \"can I have turkey dinosaurs with ketchup and a fanta please\"", 239 | "to_user": "rogers2116", 240 | "to_user_id": 316789507, 241 | "to_user_id_str": "316789507", 242 | "to_user_name": "Jamie Rogers", 243 | "in_reply_to_status_id": 288656043410599940, 244 | "in_reply_to_status_id_str": "288656043410599936" 245 | }, 246 | { 247 | "created_at": "Tue, 08 Jan 2013 14:40:34 +0000", 248 | "from_user": "johnkatez", 249 | "from_user_id": 6795122, 250 | "from_user_id_str": "6795122", 251 | "from_user_name": "johnkatez", 252 | "geo": null, 253 | "id": 288656443266170900, 254 | "id_str": "288656443266170880", 255 | "iso_language_code": "en", 256 | "metadata": { 257 | "result_type": "recent" 258 | }, 259 | "profile_image_url": "http://a0.twimg.com/profile_images/3062733248/6f3634e3bb847292e75f9e834ee5963d_normal.jpeg", 260 | "profile_image_url_https": 
"https://si0.twimg.com/profile_images/3062733248/6f3634e3bb847292e75f9e834ee5963d_normal.jpeg", 261 | "source": "<a href="http://tapbots.com/software/tweetbot/mac">Twееtbot for Mac</a>", 262 | "text": "You heard it here first- dinosaurs were gay. http://t.co/vSPlvGRc", 263 | "to_user": null, 264 | "to_user_id": 0, 265 | "to_user_id_str": "0", 266 | "to_user_name": null 267 | }, 268 | { 269 | "created_at": "Tue, 08 Jan 2013 14:40:24 +0000", 270 | "from_user": "fuckl0nely", 271 | "from_user_id": 549766379, 272 | "from_user_id_str": "549766379", 273 | "from_user_name": "† .", 274 | "geo": null, 275 | "id": 288656403944591360, 276 | "id_str": "288656403944591360", 277 | "iso_language_code": "pt", 278 | "metadata": { 279 | "result_type": "recent" 280 | }, 281 | "profile_image_url": "http://a0.twimg.com/profile_images/2979531343/9d7e3fcce5d469d9da80c1661b7dee60_normal.jpeg", 282 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2979531343/9d7e3fcce5d469d9da80c1661b7dee60_normal.jpeg", 283 | "source": "<a href="http://www.tumblr.com/">Tumblr</a>", 284 | "text": "tumblrbot ha preguntado: ROBOTS OR DINOSAURS? 
http://t.co/uovAaks7", 285 | "to_user": null, 286 | "to_user_id": 0, 287 | "to_user_id_str": "0", 288 | "to_user_name": null 289 | }, 290 | { 291 | "created_at": "Tue, 08 Jan 2013 14:39:34 +0000", 292 | "from_user": "Phil_Savage", 293 | "from_user_id": 23068681, 294 | "from_user_id_str": "23068681", 295 | "from_user_name": "Phil Savage", 296 | "geo": null, 297 | "id": 288656193604419600, 298 | "id_str": "288656193604419584", 299 | "iso_language_code": "en", 300 | "metadata": { 301 | "result_type": "recent" 302 | }, 303 | "profile_image_url": "http://a0.twimg.com/profile_images/2856097536/e45f89e574c84c3e48f6e2df2c159f03_normal.jpeg", 304 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2856097536/e45f89e574c84c3e48f6e2df2c159f03_normal.jpeg", 305 | "source": "<a href="http://www.tweetdeck.com">TweetDeck</a>", 306 | "text": "RT @EwingCalvin: Just so everyone's clear dinosaurs are extinct. That means there are none left. Whatsoever.", 307 | "to_user": null, 308 | "to_user_id": 0, 309 | "to_user_id_str": "0", 310 | "to_user_name": null 311 | }, 312 | { 313 | "created_at": "Tue, 08 Jan 2013 14:38:22 +0000", 314 | "from_user": "EwingCalvin", 315 | "from_user_id": 442655243, 316 | "from_user_id_str": "442655243", 317 | "from_user_name": "Calvin Ewing", 318 | "geo": null, 319 | "id": 288655891270623200, 320 | "id_str": "288655891270623232", 321 | "iso_language_code": "en", 322 | "metadata": { 323 | "result_type": "recent" 324 | }, 325 | "profile_image_url": "http://a0.twimg.com/profile_images/2860919035/3e8339791d1e32f64f329b4d10abdf6c_normal.jpeg", 326 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2860919035/3e8339791d1e32f64f329b4d10abdf6c_normal.jpeg", 327 | "source": "<a href="http://twitter.com/">web</a>", 328 | "text": "Just so everyone's clear dinosaurs are extinct. That means there are none left. 
Whatsoever.", 329 | "to_user": null, 330 | "to_user_id": 0, 331 | "to_user_id_str": "0", 332 | "to_user_name": null 333 | }, 334 | { 335 | "created_at": "Tue, 08 Jan 2013 14:38:09 +0000", 336 | "from_user": "DinosaursTrap", 337 | "from_user_id": 550537265, 338 | "from_user_id_str": "550537265", 339 | "from_user_name": "The Dinosaurs Trap", 340 | "geo": null, 341 | "id": 288655840188194800, 342 | "id_str": "288655840188194816", 343 | "iso_language_code": "en", 344 | "metadata": { 345 | "result_type": "recent" 346 | }, 347 | "profile_image_url": "http://a0.twimg.com/profile_images/2483238349/31yj26hu61vfv4umwony_normal.png", 348 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2483238349/31yj26hu61vfv4umwony_normal.png", 349 | "source": "<a href="http://trap.it">Trapit</a>", 350 | "text": "Ichthyosaur Fossil Spotlights Ancient 'Sea Monster,' World's Recovery After Mass Extinction http://t.co/UqQTcJG7 #dinos #dinosaurs", 351 | "to_user": null, 352 | "to_user_id": 0, 353 | "to_user_id_str": "0", 354 | "to_user_name": null 355 | } 356 | ], 357 | "results_per_page": 15, 358 | "since_id": 0, 359 | "since_id_str": "0" 360 | } -------------------------------------------------------------------------------- /tests/document.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 2003/07/04 4 | 123 5 | Acme Alpha 6 | 7 | 987 8 | Coupler 9 | 5 10 | 11 | 12 | 654 13 | Connector 14 | 3 15 | 16 | 17 | 579 18 | Clasp 19 | 1 20 | 21 | -------------------------------------------------------------------------------- /tests/fixtures.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | 3 | // Web document samples for the test server to serve 4 | 5 | exports.documents = { 6 | html: fs.readFileSync("tests/document.html"), 7 | json: fs.readFileSync("tests/document.json"), 8 | feed: fs.readFileSync("tests/document.atom"), 9 | xml: fs.readFileSync("tests/document.xml") 
10 | }; 11 | 12 | // Queries 13 | 14 | exports.queries = { 15 | html: { 16 | simple: { 17 | "url": "http://localhost:8889/html", 18 | "type": "html", 19 | "selector": "title", 20 | "extract": "text", 21 | "cache": false 22 | }, 23 | withCache: { 24 | "url": "http://localhost:8889/html", 25 | "type": "html", 26 | "selector": "h1", 27 | "extract": "text", 28 | "cache": true 29 | }, 30 | noSelector: { 31 | "url": "http://localhost:8889/html", 32 | "type": "html", 33 | "cache": false 34 | }, 35 | noExtract: { 36 | "url": "http://localhost:8889/html", 37 | "type": "html", 38 | "selector": "title", 39 | "cache": false 40 | }, 41 | noType: { 42 | "url": "http://localhost:8889/html", 43 | "selector": "title", 44 | "extract": "text", 45 | "cache": false 46 | }, 47 | badSelector: { 48 | "url": "http://localhost:8889/html", 49 | "type": "html", 50 | "selector": "BAD SELECTOR", 51 | "extract": "text", 52 | "cache": false 53 | }, 54 | badExtract: { 55 | "url": "http://localhost:8889/html", 56 | "type": "html", 57 | "selector": "title", 58 | "extract": "BAD EXTRACT", 59 | "cache": false 60 | } 61 | }, 62 | json: { 63 | simple: { 64 | "url": "http://localhost:8889/json", 65 | "type": "json", 66 | "selector": ".query", 67 | "cache": false 68 | }, 69 | noSelector: { 70 | "url": "http://localhost:8889/json", 71 | "type": "json", 72 | "cache": false 73 | }, 74 | noType: { 75 | "url": "http://localhost:8889/json", 76 | "selector": ".query", 77 | "cache": false 78 | }, 79 | badSelector: { 80 | "url": "http://localhost:8889/json", 81 | "type": "json", 82 | "selector": "BAD SELECTOR", 83 | "cache": false 84 | }, 85 | badParse: { 86 | "url": "http://localhost:8889/html", 87 | "type": "json", 88 | "selector": ".query", 89 | "cache": false 90 | } 91 | }, 92 | feed: { 93 | simple: { 94 | "url": "http://localhost:8889/feed", 95 | "type": "feed", 96 | "selector": ".title", 97 | "cache": false 98 | }, 99 | noSelector: { 100 | "url": "http://localhost:8889/feed", 101 | "type": "feed", 102 | 
"cache": false 103 | }, 104 | noType: { 105 | "url": "http://localhost:8889/feed", 106 | "selector": ".title", 107 | "cache": false 108 | }, 109 | badSelector: { 110 | "url": "http://localhost:8889/feed", 111 | "type": "feed", 112 | "selector": "BAD SELECTOR", 113 | "cache": false 114 | }, 115 | badParse: { 116 | "url": "http://localhost:8889/html", 117 | "type": "feed", 118 | "selector": ".title", 119 | "cache": false 120 | } 121 | }, 122 | xml: { 123 | simple: { 124 | "url": "http://localhost:8889/xml", 125 | "type": "xml", 126 | "selector": ".CustomerName", 127 | "cache": false 128 | }, 129 | noSelector: { 130 | "url": "http://localhost:8889/xml", 131 | "type": "xml", 132 | "cache": false 133 | }, 134 | noType: { 135 | "url": "http://localhost:8889/xml", 136 | "selector": ".CustomerName", 137 | "cache": false 138 | }, 139 | badSelector: { 140 | "url": "http://localhost:8889/xml", 141 | "type": "xml", 142 | "selector": "BAD SELECTOR", 143 | "cache": false 144 | }, 145 | badParse: { 146 | "url": "http://localhost:8889/html", 147 | "type": "xml", 148 | "selector": ".CustomerName", 149 | "cache": false 150 | } 151 | }, 152 | misc: { 153 | badUrl: { 154 | "url": "BAD URL", 155 | "cache": false 156 | }, 157 | badType: { 158 | "url": "http://localhost:8889/html", 159 | "type": "BAD TYPE", 160 | "cache": false 161 | } 162 | }, 163 | map: { 164 | simple: { 165 | "url": "http://localhost:8889/html", 166 | "type": "html", 167 | "map": { 168 | "foo": { 169 | "selector": "h1" 170 | }, 171 | "bar": { 172 | "selector": "title" 173 | } 174 | }, 175 | "cache": false 176 | } 177 | }, 178 | post: { 179 | simple: { 180 | "url": "http://localhost:8889", 181 | "type": "html", 182 | "selector": "h1", 183 | "extract": "text", 184 | "post": { 185 | "foo": "bar" 186 | }, 187 | "cache": false 188 | } 189 | }, 190 | headers: { 191 | simple: { 192 | "url": "http://localhost:8889/html", 193 | "type": "html", 194 | "selector": "h1", 195 | "headers": ["X-Powered-By"], 196 | "cache": "false" 
197 | }, 198 | linkHeaders: { 199 | "url": "http://localhost:8889/html", 200 | "type": "html", 201 | "selector": "h1", 202 | "linkHeader": true, 203 | "cache": "false" 204 | } 205 | } 206 | }; 207 | 208 | // Query answers 209 | 210 | exports.queries.answers = { 211 | html: { 212 | simple: [ 213 | { 214 | "results": ["css Zen Garden: The Beauty in CSS Design"] 215 | } 216 | ], 217 | noExtract: [ 218 | { 219 | "results": ["css Zen Garden: The Beauty in CSS Design"] 220 | } 221 | ], 222 | noType: [ 223 | { 224 | "results": ["css Zen Garden: The Beauty in CSS Design"] 225 | } 226 | ], 227 | badSelector: [ 228 | { 229 | "results": [], 230 | "error": "Could not match with that selector or extract value" 231 | } 232 | ], 233 | badExtract: [ 234 | { 235 | "results": [], 236 | "error": "Could not match with that selector or extract value" 237 | } 238 | ] 239 | }, 240 | json: { 241 | simple: [ 242 | { 243 | "results": [ 244 | "dinosaurs" 245 | ] 246 | } 247 | ], 248 | noType: [ 249 | { 250 | "results": [ 251 | "dinosaurs" 252 | ] 253 | } 254 | ], 255 | badSelector: [ 256 | { 257 | "results": [], 258 | "error": "Could not match with that selector" 259 | } 260 | ], 261 | badParse: [ 262 | { 263 | "results": [], 264 | "error": "Could not parse JSON document" 265 | } 266 | ] 267 | }, 268 | feed: { 269 | simple: [ 270 | { 271 | "results": [ 272 | "Atom-Powered Robots Run Amok", 273 | "Example Feed" 274 | ] 275 | } 276 | ], 277 | noSelector: [ 278 | { 279 | "results": [ 280 | [ 281 | { 282 | "title": "Atom-Powered Robots Run Amok", 283 | "description": "Some text.", 284 | "summary": "Some text.", 285 | "date": "2003-12-13T18:30:02.000Z", 286 | "pubdate": "2003-12-13T18:30:02.000Z", 287 | "pubDate": "2003-12-13T18:30:02.000Z", 288 | "link": "http://example.org/2003/12/13/atom03", 289 | "guid": "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a", 290 | "author": "John Doe", 291 | "comments": null, 292 | "origlink": null, 293 | "image": {}, 294 | "source": {}, 295 | "categories": [], 296 
| "enclosures": [], 297 | "atom:@": {}, 298 | "atom:title": { 299 | "@": {}, 300 | "#": "Atom-Powered Robots Run Amok" 301 | }, 302 | "atom:link": { 303 | "@": { 304 | "href": "http://example.org/2003/12/13/atom03" 305 | } 306 | }, 307 | "atom:id": { 308 | "@": {}, 309 | "#": "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" 310 | }, 311 | "atom:updated": { 312 | "@": {}, 313 | "#": "2003-12-13T18:30:02Z" 314 | }, 315 | "atom:summary": { 316 | "@": {}, 317 | "#": "Some text." 318 | }, 319 | "meta": { 320 | "#ns": [ 321 | { 322 | "xmlns": "http://www.w3.org/2005/Atom" 323 | } 324 | ], 325 | "@": [ 326 | { 327 | "xmlns": "http://www.w3.org/2005/Atom" 328 | } 329 | ], 330 | "#type": "atom", 331 | "#version": "1.0", 332 | "title": "Example Feed", 333 | "description": null, 334 | "date": "2003-12-13T18:30:02.000Z", 335 | "pubdate": "2003-12-13T18:30:02.000Z", 336 | "pubDate": "2003-12-13T18:30:02.000Z", 337 | "link": "http://example.org/", 338 | "xmlurl": null, 339 | "xmlUrl": null, 340 | "author": "John Doe", 341 | "language": null, 342 | "favicon": null, 343 | "copyright": null, 344 | "generator": null, 345 | "image": {}, 346 | "categories": [], 347 | "atom:@": { 348 | "xmlns": "http://www.w3.org/2005/Atom" 349 | }, 350 | "atom:title": { 351 | "@": {}, 352 | "#": "Example Feed" 353 | }, 354 | "atom:link": { 355 | "@": { 356 | "href": "http://example.org/" 357 | } 358 | }, 359 | "atom:updated": { 360 | "@": {}, 361 | "#": "2003-12-13T18:30:02Z" 362 | }, 363 | "atom:author": { 364 | "@": {}, 365 | "name": { 366 | "@": {}, 367 | "#": "John Doe" 368 | } 369 | }, 370 | "atom:id": { 371 | "@": {}, 372 | "#": "urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6" 373 | } 374 | } 375 | } 376 | ] 377 | ] 378 | } 379 | ], 380 | noType: [ 381 | { 382 | "results": [ 383 | "Atom-Powered Robots Run Amok", 384 | "Example Feed" 385 | ] 386 | } 387 | ], 388 | badSelector: [ 389 | { 390 | "results": [], 391 | "error": "Could not match with that selector" 392 | } 393 | ], 394 | badParse: [ 395 | 
{ 396 | "results": [], 397 | "error": "The provided document couldn't be normalised" 398 | } 399 | ] 400 | }, 401 | xml: { 402 | simple: [ 403 | { 404 | "results": [ 405 | "Acme Alpha" 406 | ] 407 | } 408 | ], 409 | noSelector: [ 410 | { 411 | "results": [ 412 | { 413 | "Order": { 414 | "Date": "2003/07/04", 415 | "CustomerId": 123, 416 | "CustomerName": "Acme Alpha", 417 | "Item": [ 418 | { 419 | "ItemId": 987, 420 | "ItemName": "Coupler", 421 | "Quantity": 5 422 | }, 423 | { 424 | "ItemId": 654, 425 | "ItemName": "Connector", 426 | "Quantity": { 427 | "unit": 12, 428 | "$t": 3 429 | } 430 | }, 431 | { 432 | "ItemId": 579, 433 | "ItemName": "Clasp", 434 | "Quantity": 1 435 | } 436 | ] 437 | } 438 | } 439 | ] 440 | } 441 | ], 442 | noType: [ 443 | { 444 | "results": [ 445 | "Acme Alpha" 446 | ] 447 | } 448 | ], 449 | badSelector: [ 450 | { 451 | "results": [], 452 | "error": "Could not match with that selector" 453 | } 454 | ], 455 | badParse: [ 456 | { 457 | "results": [], 458 | "error": "Could not parse XML to JSON" 459 | } 460 | ] 461 | }, 462 | misc: { 463 | badUrl: [ 464 | { 465 | "results": [], 466 | "error": "Document not found" 467 | } 468 | ], 469 | badType: [ 470 | { 471 | "results": [], 472 | "error": "Document type not supported" 473 | } 474 | ] 475 | }, 476 | map : { 477 | simple: [ 478 | { 479 | "results": { 480 | "bar": ["css Zen Garden: The Beauty in CSS Design"], 481 | "foo": ["css Zen Garden"] 482 | } 483 | } 484 | ] 485 | }, 486 | post: { 487 | simple: [ 488 | { 489 | "results": ["was posted"] 490 | } 491 | ] 492 | }, 493 | headers: { 494 | simple: [ 495 | { 496 | "results": ["css Zen Garden"], 497 | "headers": { 498 | "X-Powered-By": "Noodle testing server" 499 | } 500 | } 501 | ], 502 | linkHeaders: [ 503 | { 504 | "results": ["css Zen Garden"], 505 | "headers": { 506 | "link": { 507 | "next": "foo", 508 | "last": "bar" 509 | } 510 | } 511 | } 512 | ] 513 | } 514 | }; 
-------------------------------------------------------------------------------- /tests/server.js: -------------------------------------------------------------------------------- 1 | var url = require('url'), 2 | fixtures = require('./fixtures'); 3 | 4 | require('http').createServer(function (req, res) { 5 | var serve = url.parse(req.url).pathname.split('/')[1]; 6 | 7 | if (req.method === 'POST') { 8 | parsePostData(req, function (data) { 9 | var respondWith = (data.foo === 'bar') ? '

was posted

' 10 | : '

test should fail

' 11 | res.writeHead(200, getResponseHeaders('html')); 12 | res.end(respondWith); 13 | }); 14 | } else { 15 | res.writeHead(200, getResponseHeaders(serve)); 16 | res.end(fixtures.documents[serve]); 17 | } 18 | }) 19 | .listen(8889, function () { 20 | console.log('Test server temporarily running on port 8889'); 21 | }); 22 | 23 | /* Buffer the incoming POST body and pass the querystring-parsed key/value object to `cb` once the request ends. */ function parsePostData (req, cb) { 24 | var body = ''; 25 | 26 | req.on('data', function (data) { 27 | body += data; 28 | }); 29 | 30 | req.on('end', function () { 31 | cb(require('querystring').parse(body)); 32 | }); 33 | } 34 | 35 | /* Build the response headers for a served fixture type ('html' | 'json' | 'feed' | 'xml'): the matching Content-type plus the X-Powered-By and Link headers the header-extraction tests assert against. NOTE(review): the Link value looks truncated here — the angle-bracketed URIs before each rel appear to have been stripped; confirm against the original file. */ function getResponseHeaders (serve) { 36 | var ct = { 37 | 'html': 'text/html', 38 | 'json': 'application/json', 39 | 'feed': 'application/atom+xml', 40 | 'xml' : 'text/xml' 41 | }; 42 | return { 43 | 'Content-type': ct[serve], 44 | 'X-Powered-By': 'Noodle testing server', 45 | 'Link' : '; rel="next",; rel="last"' 46 | }; 47 | } -------------------------------------------------------------------------------- /tests/tests.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'), 2 | _ = require('underscore'), 3 | fixtures = require('./fixtures'), 4 | noodle = require('../lib/noodle'), 5 | cache = require('../lib/cache'), 6 | html = require('../lib/types/html'), 7 | json = require('../lib/types/json'), 8 | feed = require('../lib/types/feed'), 9 | xml = require('../lib/types/xml'), 10 | stringify = JSON.stringify; 11 | 12 | noodle.configure({ 13 | "debug": false 14 | }); 15 | 16 | /* True when every element strictly equals the first (vacuously true for []). NOTE(review): extending Array.prototype is an anti-pattern — a standalone helper function would avoid leaking into every array in the process. */ Array.prototype.AllValuesSame = function(){ 17 | if(this.length > 0) { 18 | for(var i = 1; i < this.length; i++) { 19 | if(this[i] !== this[0]) { 20 | return false; 21 | } 22 | } 23 | } 24 | return true; 25 | }; 26 | 27 | /* Duck-type promise check; relies on the internal `promiseSend` property of Q 0.8.x promises, so it is coupled to that specific Q version. */ function isPromise (obj) { 28 | return !!obj.promiseSend; 29 | } 30 | 31 | // Noodle library 32 | 33 | describe('Noodle', function () { 34 | describe('noodle.query', function () { 35 | it('should return a promise', function () { 36 | var promise =
noodle.query({url: 'foo'}); 37 | assert.equal(true, isPromise(promise)); 38 | }); 39 | }); 40 | 41 | describe('fetch()', function () { 42 | it('should return a promise', function () { 43 | var promise = noodle.fetch('foo', {}); 44 | assert.equal(true, isPromise(promise)); 45 | }); 46 | }); 47 | }); 48 | 49 | // Tests regarding the noodle library's type modules 50 | 51 | describe('Types', function () { 52 | describe('noodle.html', function () { 53 | it('its promise should resolve to an object containing results', function (done) { 54 | noodle.query(fixtures.queries.html.simple) 55 | .then(function (results) { 56 | if (_.isArray(results.results)) { 57 | done(); 58 | } else { 59 | done(new Error('results.results was not an array')); 60 | } 61 | }); 62 | }); 63 | }); 64 | 65 | describe('noodle.json', function () { 66 | it('promise should resolve to an array', function (done) { 67 | noodle.query(fixtures.queries.json.simple) 68 | .then(function (results) { 69 | if (_.isArray(results.results)) { 70 | done(); 71 | } else { 72 | done(new Error('results.results was not an array')); 73 | } 74 | }); 75 | }); 76 | }); 77 | 78 | describe('noodle.feed', function () { 79 | it('promise should resolve to an array', function (done) { 80 | noodle.query(fixtures.queries.feed.simple) 81 | .then(function (results) { 82 | if (_.isArray(results.results)) { 83 | done(); 84 | } else { 85 | done(new Error('results.results was not an array')); 86 | } 87 | }); 88 | }); 89 | }); 90 | 91 | describe('noodle.xml', function () { 92 | it('promise should resolve to an array', function (done) { 93 | noodle.query(fixtures.queries.xml.simple) 94 | .then(function (results) { 95 | if (_.isArray(results.results)) { 96 | done(); 97 | } else { 98 | done(new Error('results.results was not an array')); 99 | } 100 | }); 101 | }); 102 | }); 103 | }); 104 | 105 | 106 | // Noodle's cache 107 | 108 | describe('cache', function () { 109 | 110 | }); 111 | 112 | 113 | // Noodle query api 114 | 115 | describe('Noodle 
object query API', function () { 116 | var allArrays = []; 117 | 118 | describe('type: html', function () { 119 | it('should have accurate result data', function (done) { 120 | noodle.query(fixtures.queries.html.simple) 121 | .then(function (results) { 122 | allArrays.push(_.isArray(results.results)); 123 | if (_.isEqual(results.results, fixtures.queries.answers.html.simple)) { 124 | done(); 125 | } else { 126 | done(new Error('Results and fixtures do not match up.')); 127 | } 128 | }); 129 | }); 130 | 131 | it('should still return full document if no selector is specified', function (done) { 132 | noodle.query(fixtures.queries.html.noSelector) 133 | .then(function (results) { 134 | var expectedHTMLDoc = results.results[0].results; 135 | allArrays.push(_.isArray(results.results)); 136 | if (typeof expectedHTMLDoc === 'string' && expectedHTMLDoc.length > 1000) { 137 | done(); 138 | } else { 139 | done(new Error('Results did not contain full document')); 140 | } 141 | }); 142 | }); 143 | 144 | it('should still return some data if no extract is specified', function (done) { 145 | noodle.query(fixtures.queries.html.noExtract) 146 | .then(function (results) { 147 | allArrays.push(_.isArray(results.results)); 148 | if (_.isEqual(results.results, fixtures.queries.answers.html.noExtract)) { 149 | done(); 150 | } else { 151 | done(new Error('Results and fixtures do not match up.')); 152 | } 153 | }); 154 | }); 155 | 156 | it('should still return some data if no type is specified', function (done) { 157 | noodle.query(fixtures.queries.html.noType) 158 | .then(function (results) { 159 | allArrays.push(_.isArray(results.results)); 160 | if (_.isEqual(results.results, fixtures.queries.answers.html.noType)) { 161 | done(); 162 | } else { 163 | done(new Error('Results and fixtures do not match up.')); 164 | } 165 | }); 166 | }); 167 | 168 | describe('errors', function () { 169 | it('should report on a poor selector', function (done) { 170 | 
noodle.query(fixtures.queries.html.badSelector) 171 | .then(function (results) { 172 | allArrays.push(_.isArray(results.results)); 173 | if (_.isEqual(results.results, fixtures.queries.answers.html.badSelector)) { 174 | done(); 175 | } else { 176 | done(new Error('Results and fixtures do not match up.')); 177 | } 178 | }); 179 | }); 180 | 181 | it('should default to selecting text if no extract is supplied', function (done){ 182 | noodle.query(fixtures.queries.html.noExtract) 183 | .then(function (results) { 184 | allArrays.push(_.isArray(results.results)); 185 | if (_.isEqual(results.results, fixtures.queries.answers.html.noExtract)) { 186 | done(); 187 | } else { 188 | done(new Error('Results and fixtures do not match up.')); 189 | } 190 | }); 191 | }); 192 | }); 193 | }); 194 | 195 | describe('type: json', function () { 196 | before(function () { 197 | noodle.configure({ 198 | defaultDocumentType: 'json' 199 | }); 200 | }); 201 | 202 | it('should have result data', function (done) { 203 | noodle.query(fixtures.queries.json.simple) 204 | .then(function (results) { 205 | allArrays.push(_.isArray(results.results)); 206 | if (_.isEqual(results.results, fixtures.queries.answers.json.simple)) { 207 | done(); 208 | } else { 209 | done(new Error('Results and fixtures do not match up.')); 210 | } 211 | }); 212 | }); 213 | 214 | it('should still return some data if no selector is specified', function (done) { 215 | noodle.query(fixtures.queries.json.noSelector) 216 | .then(function (results) { 217 | var expectedJSONDoc = results.results[0].results; 218 | allArrays.push(_.isArray(results.results)); 219 | if (typeof expectedJSONDoc === 'object') { 220 | done(); 221 | } else { 222 | done(new Error('Results and fixtures do not match up.')); 223 | } 224 | }); 225 | }); 226 | 227 | it('should still return some data if no type is specified', function (done) { 228 | noodle.query(fixtures.queries.json.noType) 229 | .then(function (results) { 230 | 
allArrays.push(_.isArray(results.results)); 231 | if (_.isEqual(results.results, fixtures.queries.answers.json.noType)) { 232 | done(); 233 | } else { 234 | done(new Error('Results and fixtures do not match up.')); 235 | } 236 | }); 237 | }); 238 | 239 | describe('errors', function () { 240 | it('should report on a poor selector', function (done) { 241 | noodle.query(fixtures.queries.json.badSelector) 242 | .then(function (results) { 243 | allArrays.push(_.isArray(results.results)); 244 | if (_.isEqual(results.results, fixtures.queries.answers.json.badSelector)) { 245 | done(); 246 | } else { 247 | done(new Error('Results and fixtures do not match up.')); 248 | } 249 | }); 250 | }); 251 | 252 | it('should report on a parse error', function (done) { 253 | noodle.query(fixtures.queries.json.badParse) 254 | .then(function (results) { 255 | allArrays.push(_.isArray(results.results)); 256 | if (_.isEqual(results.results, fixtures.queries.answers.json.badParse)) { 257 | done(); 258 | } else { 259 | done(new Error('Results and fixtures do not match up.')); 260 | } 261 | }); 262 | }); 263 | }); 264 | }); 265 | 266 | describe('type: feed', function () { 267 | before(function () { 268 | noodle.configure({ 269 | defaultDocumentType: 'feed' 270 | }); 271 | }); 272 | 273 | it('should have result data', function (done) { 274 | noodle.query(fixtures.queries.feed.simple) 275 | .then(function (results) { 276 | allArrays.push(_.isArray(results.results)); 277 | if (_.isEqual(results.results) === _.isEqual(fixtures.queries.answers.feed.simple)) { 278 | done(); 279 | } else { 280 | done(new Error('Results and fixtures do not match up.')); 281 | } 282 | }); 283 | }); 284 | 285 | it('should still return some data if no selector is specified', function (done) { 286 | noodle.query(fixtures.queries.feed.noSelector) 287 | .then(function (results) { 288 | allArrays.push(_.isArray(results.results)); 289 | if (stringify(results.results) === stringify(fixtures.queries.answers.feed.noSelector)) { 
290 | done(); 291 | } else { 292 | done(new Error('Results and fixtures do not match up.')); 293 | } 294 | }); 295 | }); 296 | 297 | it('should still return some data if no type is specified', function (done) { 298 | noodle.query(fixtures.queries.feed.noType) 299 | .then(function (results) { 300 | allArrays.push(_.isArray(results.results)); 301 | if (_.isEqual(results.results, fixtures.queries.answers.feed.noType)) { 302 | done(); 303 | } else { 304 | done(new Error('Results and fixtures do not match up.')); 305 | } 306 | }); 307 | }); 308 | 309 | describe('errors', function () { 310 | it('should report on a poor selector', function (done) { 311 | noodle.query(fixtures.queries.feed.badSelector) 312 | .then(function (results) { 313 | allArrays.push(_.isArray(results.results)); 314 | if (_.isEqual(results.results, fixtures.queries.answers.feed.badSelector)) { 315 | done(); 316 | } else { 317 | done(new Error('Results and fixtures do not match up.')); 318 | } 319 | }); 320 | }); 321 | 322 | it('should report on a parse error', function (done) { 323 | noodle.query(fixtures.queries.feed.badParse) 324 | .then(function (results) { 325 | allArrays.push(_.isArray(results.results)); 326 | if (_.isEqual(results.results, fixtures.queries.answers.feed.badParse)) { 327 | done(); 328 | } else { 329 | done(new Error('Results and fixtures do not match up.')); 330 | } 331 | }); 332 | }); 333 | }); 334 | }); 335 | 336 | describe('type: xml', function () { 337 | before(function () { 338 | noodle.configure({ 339 | defaultDocumentType: 'xml' 340 | }); 341 | }); 342 | 343 | it('should have result data', function (done) { 344 | noodle.query(fixtures.queries.xml.simple) 345 | .then(function (results) { 346 | allArrays.push(_.isArray(results.results)); 347 | if (_.isEqual(results.results, fixtures.queries.answers.xml.simple)) { 348 | done(); 349 | } else { 350 | done(new Error('Results and fixtures do not match up.')); 351 | } 352 | }); 353 | }); 354 | 355 | it('should still return some 
data if no selector is specified', function (done) { 356 | noodle.query(fixtures.queries.xml.noSelector) 357 | .then(function (results) { 358 | allArrays.push(_.isArray(results.results)); 359 | if (_.isEqual(results.results, fixtures.queries.answers.xml.noSelector)) { 360 | done(); 361 | } else { 362 | done(new Error('Results and fixtures do not match up.')); 363 | } 364 | }); 365 | }); 366 | 367 | it('should still return some data if no type is specified', function (done) { 368 | noodle.query(fixtures.queries.xml.noType) 369 | .then(function (results) { 370 | allArrays.push(_.isArray(results.results)); 371 | if (_.isEqual(results.results, fixtures.queries.answers.xml.noType)) { 372 | done(); 373 | } else { 374 | done(new Error('Results and fixtures do not match up.')); 375 | } 376 | }); 377 | }); 378 | 379 | describe('errors', function () { 380 | it('should report on a poor selector', function (done) { 381 | noodle.query(fixtures.queries.xml.badSelector) 382 | .then(function (results) { 383 | allArrays.push(_.isArray(results.results)); 384 | if (_.isEqual(results.results, fixtures.queries.answers.xml.badSelector)) { 385 | done(); 386 | } else { 387 | done(new Error('Results and fixtures do not match up.')); 388 | } 389 | }); 390 | }); 391 | 392 | it('should report on a parse error', function (done) { 393 | noodle.query(fixtures.queries.xml.badParse) 394 | .then(function (results) { 395 | allArrays.push(_.isArray(results.results)); 396 | if (_.isEqual(results.results, fixtures.queries.answers.xml.badParse)) { 397 | done(); 398 | } else { 399 | done(new Error('Results and fixtures do not match up.')); 400 | } 401 | }); 402 | }); 403 | }); 404 | }); 405 | 406 | describe('generic query error messages', function () { 407 | it('errors if no url is specified', function (done) { 408 | noodle.query(fixtures.queries.misc.badUrl) 409 | .then(function (results) { 410 | if (_.isEqual(results.results, fixtures.queries.answers.misc.badUrl)) { 411 | done(); 412 | } else { 413 | 
done(new Error('Results and fixtures do not match up.')); 414 | } 415 | }); 416 | }); 417 | 418 | it('errors if a non-supported type is specified', function (done) { 419 | noodle.query(fixtures.queries.misc.badType) 420 | .then(function (results) { 421 | if (_.isEqual(results.results, fixtures.queries.answers.misc.badType)) { 422 | done(); 423 | } else { 424 | done(new Error('Results and fixtures do not match up.')); 425 | } 426 | }); 427 | }); 428 | }); 429 | 430 | describe('map notation', function () { 431 | it('result should contain properties as specified in the map as well as data', function (done) { 432 | noodle.query(fixtures.queries.map.simple) 433 | .then(function (results) { 434 | allArrays.push(_.isArray(results.results)); 435 | if (_.isEqual(results.results, fixtures.queries.answers.map.simple)) { 436 | done(); 437 | } else { 438 | done(new Error('Results and fixtures do not match up.')); 439 | } 440 | }); 441 | }); 442 | }); 443 | 444 | describe('post data', function () { 445 | it('should return data from post requests', function (done) { 446 | noodle.query(fixtures.queries.post.simple) 447 | .then(function (results) { 448 | if (_.isEqual(results.results, fixtures.queries.answers.post.simple)) { 449 | done(); 450 | } else { 451 | done(new Error('Results and fixtures do not match up.')); 452 | } 453 | }); 454 | }); 455 | }); 456 | 457 | describe('headers', function () { 458 | it('should parse headers', function (done) { 459 | noodle.query(fixtures.queries.headers.simple) 460 | .then(function (results) { 461 | var fix = fixtures.queries.answers.headers.simple[0]; 462 | if (_.isEqual(results.results[0].results, fix.results) && 463 | _.isEqual(results.results[0].headers, fix.headers)) { 464 | done(); 465 | } else { 466 | done(new Error('Results and fixtures do not match up.')); 467 | } 468 | }); 469 | }); 470 | 471 | it('should parse link headers', function (done) { 472 | noodle.query(fixtures.queries.headers.linkHeaders) 473 | .then(function (results) { 
474 | var fix = fixtures.queries.answers.headers.linkHeaders[0]; 475 | if (_.isEqual(results.results[0].results, fix.results) && 476 | _.isEqual(results.results[0].headers, fix.headers)) { 477 | done(); 478 | } else { 479 | done(new Error('Results and fixtures do not match up.')); 480 | } 481 | }); 482 | }); 483 | }); 484 | 485 | describe('multiple queries', function () { 486 | it('(A) the returned order should match the order of the sent query', function (done) { 487 | var queries = [ 488 | fixtures.queries.html.simple, 489 | fixtures.queries.json.simple, 490 | fixtures.queries.feed.simple, 491 | fixtures.queries.xml.simple 492 | ]; 493 | noodle.query(queries) 494 | .then(function (results) { 495 | var match1 = _.isEqual(results.results[0], fixtures.queries.answers.html.simple[0]), 496 | match2 = _.isEqual(results.results[1], fixtures.queries.answers.json.simple[0]), 497 | match3 = _.isEqual(results.results[2], fixtures.queries.answers.feed.simple[0]), 498 | match4 = _.isEqual(results.results[3], fixtures.queries.answers.xml.simple[0]); 499 | if (match1 && match2 && match3 && match4) { 500 | done(); 501 | } else { 502 | done(new Error('Order was not maintained or results/fixtures mismatch')); 503 | } 504 | }); 505 | }); 506 | it('(B) the returned order should match the order of the sent query', function (done) { 507 | var queries = [ 508 | fixtures.queries.json.simple, 509 | fixtures.queries.html.simple, 510 | fixtures.queries.xml.simple, 511 | fixtures.queries.feed.simple 512 | ]; 513 | noodle.query(queries) 514 | .then(function (results) { 515 | var match1 = _.isEqual(results.results[1], fixtures.queries.answers.html.simple[0]), 516 | match2 = _.isEqual(results.results[0], fixtures.queries.answers.json.simple[0]), 517 | match3 = _.isEqual(results.results[3], fixtures.queries.answers.feed.simple[0]), 518 | match4 = _.isEqual(results.results[2], fixtures.queries.answers.xml.simple[0]); 519 | if (match1 && match2 && match3 && match4) { 520 | done(); 521 | } else { 
522 | done(new Error('Order was not maintained or results/fixtures mismatch')); 523 | } 524 | }); 525 | }); 526 | it('(C) the returned order should match the order of the sent query', function (done) { 527 | var queries = [ 528 | fixtures.queries.html.simple, 529 | fixtures.queries.json.simple, 530 | fixtures.queries.feed.simple, 531 | fixtures.queries.xml.simple 532 | ]; 533 | noodle.query(queries) 534 | .then(function (results) { 535 | var match1 = _.isEqual(results.results[0], fixtures.queries.answers.html.simple[0]), 536 | match2 = _.isEqual(results.results[1], fixtures.queries.answers.json.simple[0]), 537 | match3 = _.isEqual(results.results[2], fixtures.queries.answers.feed.simple[0]), 538 | match4 = _.isEqual(results.results[3], fixtures.queries.answers.xml.simple[0]); 539 | if (match1 && match2 && match3 && match4) { 540 | done(); 541 | } else { 542 | done(new Error('Order was not maintained or results/fixtures mismatch')); 543 | } 544 | }); 545 | }); 546 | }); 547 | 548 | describe('consistent response format', function () { 549 | it('should return all responses as arrays', function () { 550 | assert.equal(true, allArrays.indexOf(false) === -1); 551 | }); 552 | 553 | it('should always return the "created" property from cache', function (done) { 554 | noodle.query(fixtures.queries.html.withCache) 555 | .then(function (results) { 556 | if (results.results[0].created) { 557 | done(); 558 | } else { 559 | done(new Error('"created" property wasn\'t included with cached response.')); 560 | } 561 | }); 562 | }); 563 | }); 564 | }); --------------------------------------------------------------------------------