├── .gitignore
├── README.md
├── bin
├── noodle-server
└── tests
├── docs
├── .satya-config.yml
├── 1. Overview.md
├── 10. Server quick start
├── 2. Try it out.md
├── 3. Web service.md
├── 4. Query syntax.md
├── 5. Noodle as node module.md
├── 6. Error handling.md
├── 7. Caching.md
├── 8. Adding to noodle.md
└── 9. Tests.md
├── index.js
├── lib
├── cache.js
├── config.json
├── logger.js
├── noodle-middleware.js
├── noodle.js
└── types
│ ├── feed.js
│ ├── html.js
│ ├── json.js
│ └── xml.js
├── package.json
└── tests
├── document.atom
├── document.html
├── document.json
├── document.xml
├── fixtures.js
├── server.js
└── tests.js
/.gitignore:
--------------------------------------------------------------------------------
1 | _site
2 | _bin
3 | _config-local.yml
4 | FiddlerRules.farx
5 | .DS_Store
6 |
7 | lib-cov
8 | *.seed
9 | *.log
10 | *.csv
11 | *.dat
12 | *.out
13 | *.pid
14 | *.gz
15 |
16 | pids
17 | logs
18 | results
19 |
20 | node_modules
21 | npm-debug.log
22 | .satya
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [noodle](https://noodle.dharmafly.com)
2 | =============================
3 |
4 | noodle is a Node.js server and module for querying and scraping data from web documents. It features:
5 |
6 | ```JSON
7 | {
8 | "url": "https://github.com/explore",
9 | "selector": "ol.ranked-repositories h3 a",
10 | "extract": "href"
11 | }
12 | ```
13 |
14 | Features
15 | --------
16 |
17 | - Cross domain document querying (html, json, xml, atom, rss feeds)
18 | - Server supports querying via JSONP and JSON POST
19 | - Multiple queries per request
20 | - Access to queried server headers
21 | - Allows for POSTing to web documents
22 | - In memory caching for query results and web documents
23 |
24 | Server quick start
25 | ------------------
26 |
27 | Setup
28 |
29 | $ npm install noodlejs
30 |
31 | or
32 |
33 | $ git clone git@github.com:dharmafly/noodle.git
34 | $ cd noodle
35 | $ npm install
36 |
37 | Start the server by running the binary
38 |
39 | $ bin/noodle-server
40 | Noodle node server started
41 | ├ process title node-noodle
42 | ├ process pid 4739
43 | └ server port 8888
44 |
45 |
46 | You may specify a port number as an argument
47 |
48 | $ bin/noodle-server 9090
49 | Noodle node server started
50 | ├ process title node-noodle
51 | ├ process pid 4739
52 | └ server port 9090
53 |
54 |
55 | Noodle as a node module
56 | -----------------------
57 |
58 | If you are interested in the node module just run ```npm install noodlejs```,
59 | require it and check out the [noodle api](https://noodle.dharmafly.com/reference/#Noodle-as-node-module)
60 |
61 | ```javascript
62 | var noodle = require('noodlejs');
63 |
64 | noodle.query({
65 | url: 'https://github.com/explore',
66 | selector: 'ol.ranked-repositories h3 a',
67 | extract: 'href'
68 | })
69 | .then(function (results) {
70 | console.log(results);
71 | });
72 | ```
73 |
74 | Tests
75 | -----
76 |
77 | The noodle tests create a temporary server on port `8889` which the automated
78 | tests tell noodle to query against.
79 |
80 | To run tests you can use the provided binary *from the noodle package
81 | root directory*:
82 |
83 | $ cd noodle
84 | $ bin/tests
85 |
86 | Contribute
87 | ----------
88 |
89 | Contributors and suggestions welcomed.
90 |
91 | - [https://noodle.dharmafly.com](https://noodle.dharmafly.com)
92 | - [https://github.com/dharmafly/noodle](https://github.com/dharmafly/noodle)
93 |
--------------------------------------------------------------------------------
/bin/noodle-server:
--------------------------------------------------------------------------------
#!/usr/bin/env node

// Noodle web service entry point: wires up the connect middleware
// stack (version endpoint, rate limiting, query parsing/response)
// and starts an HTTP server on the requested port.

process.title = 'node-noodle';

var connect = require('connect'),
    http = require('http'),
    url = require('url'),
    fs = require('fs'),
    limiter = require('connect-ratelimit'),
    noodlemw = require('../lib/noodle-middleware'),
    version = getVersion(),
    limits = getConfig().rateLimit,
    port = process.argv[2] || 8888,
    app;

// Let requests continue past the limiter so the custom 429 body
// below is sent instead of connect-ratelimit's default response.
limits.end = false;

app = connect()
  .use(function (req, res, next) {
    // Lightweight version endpoint, served before rate limiting.
    if (url.parse(req.url).pathname === '/version') {
      res.writeHead(200, {
        'Content-Type': 'application/json; charset=utf-8'
      });
      // JSON.stringify quotes the version string; the previous string
      // concatenation emitted invalid JSON (e.g. {"version":0.3.2}).
      res.end(JSON.stringify({ version: version }));
    } else {
      next();
    }
  })
  .use(limiter(limits))
  .use(function (req, res, next) {
    // Reject rate-limited clients with the noodle result-set shape.
    if (res.ratelimit.exceeded) {
      res.statusCode = 429;
      res.end('[{"results": [], "error": "Rate limit exceeded"}]');
    } else {
      next();
    }
  })
  .use(connect.query())
  .use(connect.json())
  .use(noodlemw.parseQueries)
  .use(noodlemw.noodleQueries)
  .use(noodlemw.respond);

http.createServer(app).listen(port, function () {
  require('colors');
  // `with (console)` is deprecated and a SyntaxError in strict mode;
  // call console.log directly instead.
  console.log(' Noodle node server started'.magenta);
  console.log(' ├ process title '.magenta, process.title.toString().green);
  console.log(' ├ process pid   '.magenta, process.pid.toString().green);
  console.log(' └ server port   '.magenta, port.toString().green);
});
53 |
// Read lib/config.json (relative to this script) and return the
// parsed noodle configuration object.

function getConfig () {
  var configPath = require('path').resolve(__dirname, '../lib/config.json');
  return JSON.parse(fs.readFileSync(configPath).toString());
}
61 |
// Read package.json (relative to this script) and return the
// noodle version string declared there.

function getVersion () {
  var pkgPath = require('path').resolve(__dirname, '../package.json'),
      pkg = JSON.parse(fs.readFileSync(pkgPath).toString());
  return pkg.version;
}
--------------------------------------------------------------------------------
/bin/tests:
--------------------------------------------------------------------------------
#!/bin/bash

# Start the testing server in the background and store its pid
node tests/server.js & pid=$!

# Run the tests, remembering mocha's exit status so it can be
# propagated to the caller (previously it was silently discarded)
node_modules/mocha/bin/mocha tests/tests.js --timeout 4000 --reporter list
status=$?

# Kill the test server via its stored pid ($! was captured into
# $pid at launch; use the variable, not $!, which tracks the most
# recent background job)
kill $pid

# Exit with the test result so CI sees failures
exit $status
--------------------------------------------------------------------------------
/docs/.satya-config.yml:
--------------------------------------------------------------------------------
1 | #########################################
2 | # project site config
3 |
4 | project_name: noodle
5 | project_url: https://github.com/dharmafly/noodle
6 | version: '0.3.2'
7 | # options: forest, ocean, horus, seagrass, sundae, slate
8 | theme: ocean
9 | twitter_url: https://twitter.com/dharmafly
10 | # options: javascript, css, html5
11 | lang: javascript
12 | scripts:
13 | - src: //ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js
14 | # - src: https://raw.github.com/dharmafly/PROJECT-REPO/master/PROJECT.js
15 | quote:
16 | #- quote:
17 | #- cite:
18 | # analytics
19 | ga_id: UA-34978047-5
20 | download_links:
21 | - text: Edge
22 | subtext: (master)
23 | href: https://github.com/dharmafly/noodle/zipball/master
24 | title: The repo's latest codebase (zip). Potentially unstable.
25 | sections:
26 | - path: /index.html
27 | name: Overview
28 | - path: /reference/index.html
29 | name: Reference
30 |
31 | ######## END project site config ########
32 |
--------------------------------------------------------------------------------
/docs/1. Overview.md:
--------------------------------------------------------------------------------
1 | ---
2 | category: overview
3 | heading: 'Overview'
4 | ---
5 |
6 | noodle is a Node.js server and module for querying and scraping data from web documents. It features:
7 |
8 | - Cross domain document querying (html, json, xml, atom, rss feeds)
9 | - Server supports querying via JSONP and JSON POST
10 | - Multiple queries per request
11 | - Access to queried server headers
12 | - Allows for POSTing to web documents
13 | - In memory caching for query results and web documents
--------------------------------------------------------------------------------
/docs/10. Server quick start:
--------------------------------------------------------------------------------
1 | ---
2 | category: overview
3 | heading: 'Server quick setup'
4 | ---
5 |
6 | Setup
7 |
8 | $ git clone https://github.com/dharmafly/noodle.git
9 | $ cd noodle
10 | $ npm install
11 |
12 | Start the server by running the binary
13 |
14 | $ bin/noodle-server
15 | Server running on port 8888
16 |
17 | You may specify a port number as an argument
18 |
19 | $ bin/noodle-server 9090
20 | Server running on port 9090
21 |
--------------------------------------------------------------------------------
/docs/2. Try it out.md:
--------------------------------------------------------------------------------
1 | ---
2 | category: overview
3 | heading: 'Try it out'
4 | ---
5 |
6 | ## Install via NPM
7 |
8 | $ npm install noodlejs
9 |
10 | ## Install via Git
11 |
12 | $ git clone https://github.com/dharmafly/noodle.git
13 |
14 | ## Run the server and GET or POST queries on `localhost:8888`
15 |
16 | $ cd noodle
17 | # or `cd node_modules/noodlejs` if installed via npm
18 | $ bin/noodle-server
19 | Server running on port 8888
20 |
21 | ## Or use as a node module
22 |
23 | $ var noodle = require('noodlejs');
24 |
25 |
26 | ## Editor
27 |
28 | Below is an editor where you can try writing a query yourself.
29 |
30 | The query below tells noodle to go to the google search result for
31 | JavaScript and expect a html file. Then using the selector pick out
32 | all of the result anchors. Finally the query says to extract the
33 | text for each of those anchor elements.
34 |
35 | Press run below to see the output:
36 |
37 | var query = {
38 | url: 'https://google.com/search?q=javascript',
39 | type: 'html',
40 | selector: 'h3.r a',
41 | extract: 'text'
42 | },
43 | uriQuery = encodeURIComponent(JSON.stringify(query)),
44 | request = 'https://example.noodle.dharmafly.com/?q=' +
45 | uriQuery + '&callback=?';
46 |
47 | // Make Ajax request to Noodle server
48 | jQuery.getJSON(request, function (data) {
49 | alert(data[0].results);
50 | });
51 |
52 | Noodle queries don't just support html but also json, feeds and plain xml. They can be a lot more powerful too.
53 | [Read the reference for more details.](https://noodle.dharmafly.com/reference)
54 |
--------------------------------------------------------------------------------
/docs/3. Web service.md:
--------------------------------------------------------------------------------
1 | ---
2 | category: reference
3 | heading: 'Web service'
4 | ---
5 |
6 | Noodle can be used as both a web service and a node module. In each case, key/value objects are used as queries to fetch and extract data from web documents.
7 |
8 | noodle currently supports multiple web documents with an almost uniform query syntax for grabbing data from the different types (html, json, feeds, xml).
9 |
10 | noodle is ready to run as a web service from `bin/noodle-server`.
11 |
12 |
13 | ## Run the server
14 |
15 | $ cd noodle
16 | # or `cd node_modules/noodlejs` if installed via npm
17 | $ bin/noodle-server
18 | Server running on port 8888
19 |
20 |
21 | ## GET or POST
22 |
23 | The server supports queries via both GET and POST.
24 |
25 | ### GET
26 |
27 | The query itself can be sent in the `q` parameter either as a url encoded JSON blob or as a querystring serialised representation (`jQuery.param()`).
28 |
29 | noodle supports JSONP if a `callback` parameter is supplied.
30 |
31 |     GET https://example.noodle.dharmafly.com?q={JSONBLOB}&callback=foo
32 |
33 |
34 | ### POST
35 |
36 | noodle also supports a query sent as JSON in the POST body.
37 |
38 | POST https://example.noodle.dharmafly.com
39 |
40 |
41 | ## Rate limiting
42 |
43 | The web service also provides rate limiting out of the box with
44 | [connect-ratelimit](https://github.com/dharmafly/connect-ratelimit).
45 |
46 |
47 | ## Configuration
48 |
49 | ### Server port
50 |
51 | To specify what port the noodle web service serves on, just write it as the
52 | first argument to the binary.
53 |
54 | $ bin/noodle-server 9000
55 | Server running on port 9000
56 |
57 | ### Behaviour settings
58 |
59 | Various noodle settings like cache and ratelimit settings are exposed
60 | and can be edited in `lib/config.json`.
61 |
62 | {
63 | // Setting to true will log out information to the
64 | // terminal
65 |
66 | "debug": true,
67 |
68 | "resultsCacheMaxTime": 3600000,
69 | "resultsCachePurgeTime": 60480000, // -1 will turn purging off
70 | "resultsCacheMaxSize": 124,
71 |
72 | "pageCacheMaxTime": 3600000,
73 | "pageCachePurgeTime": 60480000, // -1 will turn purging off
74 | "pageCacheMaxSize": 32,
75 |
76 | // If no query type option is supplied then
77 | // what should noodle assume
78 |
79 | "defaultDocumentType": "html",
80 |
81 | // How the noodle scraper identifies itself
82 | // to scrape targets
83 |
84 | "userAgent": "",
85 |
86 | // Rate limit settings
87 | // https://npmjs.org/package/connect-ratelimit#readme
88 |
89 | "rateLimit": {
90 | "whitelist": ["127.0.0.1", "localhost"],
91 | "blacklist": [],
92 | "categories": {
93 | "normal": {
94 | "totalRequests": 1000,
95 | "every": 3600000000
96 | },
97 | "whitelist": {
98 | "totalRequests": 10000,
99 | "every": 60000000
100 | },
101 | "blacklist": {
102 | "totalRequests": 0,
103 | "every": 0
104 | }
105 | }
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/docs/4. Query syntax.md:
--------------------------------------------------------------------------------
1 | ---
2 | category: reference
3 | heading: 'Query syntax'
4 | ---
5 |
6 | A simple query looks like this:
7 |
8 | {
9 | "url": "http://chrisnewtn.com",
10 | "type": "html",
11 | "selector": "ul.social li a",
12 | "extract": "href",
13 | }
14 |
15 | It says to go to a friend's website and for noodle to expect a html document.
16 | Then to select anchor elements in a list and for each one extract the href
17 | attribute's value.
18 |
19 | The `type` property is used to tell noodle if you are wanting to scrape a html
20 | page, json document etc. If no type is specified then a html page will be
21 | assumed by default.
22 |
23 | A similar query can be constructed to extract information from a JSON document.
24 | JSONSelect is used as the underlying library to do this. It supports common CSS3
25 | selector functionality. You can [familiarize yourself with it here.](http://jsonselect.org/#tryit)
26 |
27 | {
28 | "url": "https://search.twitter.com/search.json?q=friendship",
29 | "selector": ".results .from_user",
30 | "type": "json"
31 | }
32 |
33 | An `extract` property is not needed for a query on JSON documents as json
34 | properties have no metadata and just a single value, whereas an html element
35 | can have text, the inner html or an attribute like `href`.
36 |
37 | ## Different types (html, json, feed & xml)
38 |
39 | ### html
40 |
41 | **Note:** Some xml documents can be parsed by noodle under the html type!
42 |
43 | The html type is the only type to have the `extract` property. This is because
44 | the other types are converted to JSON.
45 |
46 | The `extract` property (optional) could be the HTML element's attribute
47 | but it is not required.
48 |
49 | Having `"html"` or `"innerHTML"` as the `extract` value will return the
50 | containing HTML within that element.
51 |
52 | Having `"text"` as the `extract` value will return only the text. noodle will
53 | strip out any new line characters found in the text.
54 |
55 | Return data looks like this:
56 |
57 | [
58 | {
59 | "results": [
60 | "http://twitter.com/chrisnewtn",
61 | "http://plus.google.com/u/0/111845796843095584341"
62 | ],
63 | "created": "2012-08-01T16:22:14.705Z"
64 | }
65 | ]
66 |
67 | Having no specific extract rule will assume a default of extracting `"text"`
68 | from the `selector`.
69 |
70 | It is also possible to request multiple properties to extract in one query if
71 | one uses an array.
72 |
73 | Query:
74 |
75 | {
76 | "url": "http://chrisnewtn.com",
77 | "selector": "ul.social li a",
78 | "extract": ["href", "text"]
79 | }
80 |
81 | Response:
82 |
83 | [
84 | {
85 | "results": [
86 | {
87 | "href": "http://twitter.com/chrisnewtn",
88 | "text": "Twitter"
89 | },
90 | {
91 | "href": "http://plus.google.com/u/0/111845796843095584341",
92 | "text": "Google+"
93 | }
94 | ],
95 | "created": "2012-08-01T16:23:41.913Z"
96 | }
97 | ]
98 |
99 | In the query's `selector` property use the standard CSS DOM selectors.
100 |
101 | ### json and xml
102 |
103 | The same rules apply from html to the json and xml types. Only that the
104 | `extract` property should be omitted from queries as the JSON node value(s)
105 | targeted by the `selector` is always assumed.
106 |
107 | In the query's `selector` property use
108 | [JSONSelect](http://jsonselect.org/#tryit) style selectors.
109 |
110 | ### feeds
111 |
112 | The same rules as for the json and xml types apply. The `extract` property
113 | should be omitted from queries as the JSON node value(s) targeted by the
114 | `selector` is always assumed.
115 |
116 | In the query's `selector` property use
117 | [JSONSelect](http://jsonselect.org/#tryit) style selectors.
118 |
119 | The feed type is based upon
120 | [node-feedparser](https://github.com/danmactough/node-feedparser) so it
121 | supports Robust RSS, Atom, and RDF standards.
122 |
123 | [Familiarize yourself with its](https://github.com/danmactough/node-feedparser#what-is-the-parsed-output-produced-by-feedparser) normalisation format before you use JSONSelect style
124 | selector.
125 |
126 | ## Getting the entire web document
127 |
128 | If no `selector` is specified then the entire document is returned. This is a
129 | rule applied to all types of documents. The `extract` rule will be ignored if
130 | included.
131 |
132 | Query:
133 |
134 | {
135 | "url": "https://search.twitter.com/search.json?q=friendship"
136 | }
137 |
138 | Response:
139 |
140 | [
141 | {
142 | "results": [""],
143 | "created": "2012-10-24T15:37:29.796Z"
144 | }
145 | ]
146 |
147 | ## Mapping a query to familiar properties
148 |
149 | Queries can also be written in noodle's map notation. The map notation allows
150 | for the results to be accessible by your own more helpful property names.
151 |
152 | In the example below map is used to create a result object of a person and
153 | their repos.
154 |
155 | {
156 | "url": "https://github.com/chrisnewtn",
157 | "type": "html",
158 | "map": {
159 | "person": {
160 | "selector": "span[itemprop=name]",
161 | "extract": "text"
162 | },
163 | "repos": {
164 | "selector": "li span.repo",
165 | "extract": "text"
166 | }
167 | }
168 | }
169 |
170 | With results looking like this:
171 |
172 | [
173 | {
174 | "results": {
175 | "person": [
176 | "Chris Newton"
177 | ],
178 | "repos": [
179 | "cmd.js",
180 | "simplechat",
181 | "sitestatus",
182 | "jquery-async-uploader",
183 | "cmd-async-slides",
184 | "elsewhere",
185 | "pablo",
186 | "jsonpatch.js",
187 | "jquery.promises",
188 | "llamarama"
189 | ]
190 | },
191 | "created": "2013-03-25T15:38:01.918Z"
192 | }
193 | ]
194 |
195 | ## Getting hold of page headers
196 |
197 | Within a query include the `headers` property with an array value listing the
198 | headers you wish to receive back as an object structure. `'all'` may also be
199 | used as a value to return all of the server headers.
200 |
201 | Headers are treated case-insensitive and the returned property names will
202 | match exactly to the string you requested with.
203 |
204 | Query:
205 |
206 | {
207 | "url": "http://github.com",
208 | "headers": ["connection", "content-TYPE"]
209 | }
210 |
211 | Result:
212 |
213 | [
214 | {
215 | "results": [...],
216 | "headers": {
217 | "connection": "keep-alive",
218 | "content-TYPE": "text/html"
219 | }
220 | "created":"2012-11-14T13:06:02.521Z"
221 | }
222 | ]
223 |
224 | ### Link headers for pagination
225 |
226 | noodle provides a shortcut to the server Link header with the query
227 | `linkHeader` property set to `true`. Link headers are useful as some web APIs
228 | use them to expose their pagination.
229 |
230 | The Link header will be parsed to an object structure. If you wish to have the Link header in its usual formatting then include it in the `headers` array instead.
231 |
232 | Query:
233 |
234 | {
235 | "url": "https://api.github.com/users/premasagar/starred",
236 | "type": "json",
237 | "selector": ".language",
238 | "headers": ["connection"],
239 | "linkHeader": true
240 | }
241 |
242 | Result:
243 |
244 | [
245 | {
246 | "results": [
247 | "JavaScript",
248 | "Ruby",
249 | "JavaScript",
250 | ],
251 | "headers": {
252 | "connection": "keep-alive",
253 | "link": {
254 | "next": "https://api.github.com/users/premasagar/starred?page=2",
255 | "last": "https://api.github.com/users/premasagar/starred?page=21"
256 | }
257 | },
258 | "created": "2012-11-16T15:48:33.866Z"
259 | }
260 | ]
261 |
262 |
263 | ## Querying to a POST url
264 |
265 | noodle allows for post data to be passed along to the target web server
266 | specified in the url. This can be optionally done with the `post` property
267 | which takes an object map of the post data key/values.
268 |
269 | {
270 | "url": "http://example.com/login.php",
271 | "post": {
272 | "username": "john",
273 | "password": "123"
274 | },
275 |         "selector": "h1.username",
276 | "type": "html"
277 | }
278 |
279 | Take note, however, that queries with the `post` property will not be cached.
280 |
281 | ## Querying without caching
282 |
283 | If `cache` is set to `false` in your query then noodle will not cache the
284 | results or associated page and it will get the data fresh. This is useful for
285 | debugging.
286 |
287 | {
288 | "url": "http://example.com",
289 | "selector": "h1",
290 |         "cache": false
291 | }
292 |
293 | ## Query errors
294 |
295 | noodle aims to give errors for the possible use cases where a query does
296 | not yield any results.
297 |
298 | Each error is specific to one result object and are contained in the `error`
299 | property as a string message.
300 |
301 | Response:
302 |
303 | [
304 | {
305 | "results": [],
306 | "error": "Document not found"
307 | }
308 | ]
309 |
310 | noodle also fails silently with the `'extract'` property by omitting any
311 | extract results from the results object.
312 |
313 | Consider the following JSON response to a partially incorrect query.
314 |
315 | Query:
316 |
317 | {
318 | "url": "http://chrisnewtn.com",
319 | "selector": "ul.social li a",
320 | "extract": ["href", "nonexistent"]
321 | }
322 |
323 | Response:
324 |
325 | The extract "nonexistent" property is left out because it was not found
326 | on the element.
327 |
328 | [
329 | {
330 | "results": [
331 | {
332 | "href": "http://twitter.com/chrisnewtn"
333 | },
334 | {
335 | "href": "http://plus.google.com/u/0/111845796843095584341"
336 | }
337 | ],
338 | "created": "2012-08-01T16:28:19.167Z"
339 | }
340 | ]
341 |
342 | ## Multiple queries
343 |
344 | Multiple queries can be made per request to the server. You can mix between
345 | different types of queries in the same request as well as queries in the map
346 | notation.
347 |
348 | Query:
349 |
350 | [
351 | {
352 | "url": "http://chrisnewtn.com",
353 | "selector": "ul.social li a",
354 | "extract": ["text", "href"]
355 | },
356 | {
357 | "url": "http://premasagar.com",
358 | "selector": "#social_networks li a.url",
359 | "extract": "href"
360 | }
361 | ]
362 |
363 | Response:
364 |
365 | [
366 | {
367 | "results": [
368 | {
369 | "href": "http://twitter.com/chrisnewtn",
370 | "text": "Twitter"
371 | },
372 | {
373 | "href": "http://plus.google.com/u/0/111845796843095584341",
374 | "text": "Google+"
375 | }
376 | ],
377 | "created": "2012-08-01T16:23:41.913Z"
378 | },
379 | {
380 | "results": [
381 | "http://dharmafly.com/blog",
382 | "http://twitter.com/premasagar",
383 | "https://github.com/premasagar",
384 | ],
385 | "created": "2012-08-01T16:22:13.339Z"
386 | }
387 | ]
388 |
389 | ## Proxy Support
390 |
391 | When calling a page multiple times some sites can and will ban your IP address, Adding support for proxy IP addresses allows the rotation of IP addresses.
392 |
393 | Query:
394 |
395 | {
396 | "url": "http://chrisnewtn.com",
397 | "selector": "ul.social li a",
398 | "extract": ["text", "href"],
399 | "proxy": "XXX.XXX.XXX.XXX"
400 | }
401 |
--------------------------------------------------------------------------------
/docs/5. Noodle as node module.md:
--------------------------------------------------------------------------------
1 | ---
2 | category: reference
3 | heading: 'Noodle as node module'
4 | ---
5 |
6 | **Note:** Since noodle's internal cache uses an interval this will keep the
7 | related node process running indefinitely. Be sure to run `noodle.stopCache()`
8 | in your code when you're finished with noodle.
9 |
10 | ## Methods
11 |
12 | ### noodle.query
13 |
14 | The main entry point to noodle's functionality is the `query` method. This
15 | method accepts a query or an array of queries as its only parameter and returns
16 | a [promise](https://github.com/kriskowal/q).
17 |
18 | var noodle = require('noodlejs');
19 | noodle.query(queries).then(function (results) {
20 | console.log(results);
21 | });
22 |
23 | The makeup of query(s) is analogous to using noodle as a web service (as
24 | [stated above](http://noodlejs.com/reference/#query-syntax)). The
25 | exception being that you supply a proper object and not JSON.
26 |
27 | ### noodle.fetch
28 |
29 | This method returns a [promises](https://github.com/kriskowal/q). Which upon
30 | resolutions hands over the requested web document.
31 |
32 | noodle.fetch(url).then(function (page) {
33 | console.log(page);
34 | });
35 |
36 |
37 | ### noodle.html.select
38 |
39 | For applying one query to a html string and retrieving the results.
40 |
41 | noodle.html.select(html, {selector: 'title', extract: 'innerHTML'})
42 | .then(function (result) {
43 | console.log(result);
44 | });
45 |
46 |
47 | ### noodle.json.select
48 |
49 | For applying one query to a parsed JSON representation (object).
50 |
51 | var parsed = JSON.parse(json);
52 |     noodle.json.select(parsed, {selector: '.name'})
53 | .then(function (result) {
54 | console.log(result);
55 | });
56 |
57 | ## noodle.feed.select
58 |
59 | Normalises an RSS, ATOM or RDF string with
60 | [node-feedparser](https://github.com/danmactough/node-feedparser) then proxies
61 | that normalised object to `noodle.json.select`.
62 |
63 | ### noodle.xml.select
64 |
65 | Proxies to `noodle.json.select`.
66 |
67 | ### noodle events
68 |
69 | noodle's `noodle.events` namespace allows one to listen for emitted cache
70 | related events. Noodle inherits from node's [EventEmitter](http://nodejs.org/api/events.html#events_class_events_eventemitter).
71 |
72 | // Called when a page is cached
73 | noodle.events.on('cache/page', function (obj) {
74 | //obj is the page cache object detailing the page, its headers
75 | //and when it was first cached
76 | });
77 |
78 | // Called when a result is cached
79 | noodle.events.on('cache/result', function (obj) {
80 | //obj is the result cache object detailing the result and when
81 | //it was first cached
82 | });
83 |
84 | // Called when the cache is purged
85 | noodle.events.on('cache/purge', function (arg1, arg2) {
86 | //arg1 is a javascript date representing when the cache was purged
87 | //arg2 is the time in milliseconds until the next cache purge
88 | });
89 |
90 | // Called when a cached item has expired from the cache
91 | noodle.events.on('cache/expire', function (obj) {
92 | //obj is the cache item
93 | });
94 |
95 | ### Configuration
96 |
97 | Configuration is possible programmatically via `noodle.configure(obj)`.
98 |
99 | This accepts a config object which can partly or fully represent the
100 | config options.
101 |
102 | This object is applied over the existing config found in the `config.json`.
103 |
104 | Example for change just two settings:
105 |
106 | var noodle = require('noodlejs');
107 |
108 | // Do not display messages to the terminal and set
109 | // the default document type to json
110 |
111 | noodle.configure({
112 | debug: false,
113 | defaultDocumentType: "json"
114 | });
115 |
--------------------------------------------------------------------------------
/docs/6. Error handling.md:
--------------------------------------------------------------------------------
1 | ---
2 | category: reference
3 | heading: 'Error handling'
4 | ---
5 |
6 | Noodle will fire various errors which one can listen for with the `fail()`
7 | handler.
8 |
9 | noodle.html.fetch(query)
10 | .then(function (result) {
11 |         console.log('The results are', result);
12 | })
13 | .fail(function (error) {
14 | console.log('Uh oh', error.message);
15 | });
16 |
17 | ## Possible errors
18 |
19 | The noodle module itself emits only one error:
20 |
21 | - `"Document not found"` when a targetted url is not found.
22 |
23 | Whereas the specific document type modules emit their own, which should bubble
24 | up to the main `noodle.query` method.
25 |
26 | - `'Could not parse XML to JSON'`
27 | - `'Could not parse JSON document'`
28 | - `'Could not match with that selector'`
29 | - `'Could not match with that selector or extract value'`
30 |
--------------------------------------------------------------------------------
/docs/7. Caching.md:
--------------------------------------------------------------------------------
1 | ---
2 | category: reference
3 | heading: 'Caching'
4 | ---
5 |
6 | noodle includes an in memory cache for both queried pages and the query
7 | results to help with the speed of requests.
8 |
9 | This cache can be configured in the `noodlejs/lib/config.json` file.
10 |
11 | This cache is included in the noodle library core not at its web service.
12 |
13 | Caching is done on a singular query basis and not per all queries in a request.
14 |
15 | By default the page cache and results cache's individual items have a life time
16 | of an hour. With a cache itself having total size of 124 recorded items in
17 | memory at one time. A cache is also cleared entirely on a weekly basis.
18 |
19 | These values can all be changed from noodle's json config.
20 |
21 | ## HTTP caching headers
22 |
23 | The noodle web service includes `Expires` header. This is always set to the
24 | oldest to expire query result in a result set.
25 |
26 | Take note, however, that the browser [may not cache](http://stackoverflow.com/questions/626057/is-it-possible-to-cache-post-methods-in-http) POST requests to the noodle server.
--------------------------------------------------------------------------------
/docs/8. Adding to noodle.md:
--------------------------------------------------------------------------------
1 | ---
2 | category: reference
3 | heading: 'Adding to noodle'
4 | ---
5 |
6 | noodle is an open-source project
7 | [maintained on github](https://github.com/dharmafly/noodle) so raising
8 | issues and forking is encouraged.
9 |
10 | ## Supporting different web documents
11 |
12 | By default noodle supports html, json, standard feeds and xml web documents but
13 | noodle also provides a concise environment for developers to write their own
14 | type modules with prior knowledge only needed in
15 | [promises](https://github.com/kriskowal/q).
16 |
17 | To add their own type, one creates the script for that type in
18 | `noodlejs/lib/types` with the name being what one would type in a query.
19 |
20 | ` $ touch noodlejs/lib/types/csv.js`
21 |
22 | As for the content of the script a developer should expose at least 2 methods
23 | (`_init` & `fetch`) and is recommended to expose a `select` method. These
24 | methods must be written with a promise interface interoperable with
25 | [the q library](https://github.com/kriskowal/q). It is reccomended you just use
26 | [q](https://github.com/kriskowal/q).
27 |
28 | **Required methods**
29 |
30 | `exports._init = function (noodle) {}`
31 |
32 | This function is passed the main noodle library. You should keep hold of this
33 | reference so you can make use of some important noodle methods covered in a bit.
34 |
35 | `exports.fetch = function (url, query) {}`
36 |
37 | This method is the entry point to your module by noodle and possibly other
38 | developers. This is the function which leads to all of your processing.
39 |
40 | Make use of `noodle.cache.get` to resolve your promise early with a cached
41 | results without the need to fetch the page and process the query.
42 |
43 | It is highly recommended you do not fetch the page yourself but use the core
44 | `noodle.fetch` since this handles page caching for you.
45 |
46 | When you have the document pass it and the query to your `select` function for
47 | processing with the query.
48 | function fetch (url, query) {
49 | var deferred = q.defer();
50 | if (noodle.cache.check(query)) {
51 | deferred.resolve(noodle.cache.get(query).value);
52 | return deferred.promise;
53 | } else {
54 | return noodle.fetch(url, query).then(function (page) {
55 | return select(page, query);
56 | });
57 | }
58 | }
59 |
60 | **Recommended methods**
61 |
62 | `exports.select = function (document, query) {}`
63 |
64 | This method is where you do your actual selecting of the data using the web
65 | document given from your `fetch` method via `noodle.fetch`.
66 |
67 | In your algorithm do not account for multiple queries. This is done at a higher
68 | level by noodle which iterates over your type module.
69 |
70 | It is also highly recommended that you cache your result. This is done simply by
71 | wrapping it in the `noodle._wrapResults` method.
72 |
73 | `deferred.resolve(noodle._wrapResults(results, query));`
74 |
75 | What defines query properties like `extract` or `select` is what your own
76 | select function expects to find in the `query` object passed in. For example:
77 |
78 |
79 | // Query
80 | {
81 | "url": "http://example.com/data.csv",
82 | "type": "csv",
83 | "from": "row1",
84 | "to": "row10"
85 | }
86 |
87 | // Your interpretation
88 | function select (document, query) {
89 | ...
90 | csvparser.slice(query.from, query.to);
91 | ...
92 | }
93 |
94 | **Example script**
95 |
96 | An example implementation could look like this:
97 |
98 | var q = require('q'),
99 | noodle = null;
100 |
101 | exports._init = function (n) {
102 | noodle = n;
103 | }
104 |
105 | exports.fetch = function (url, query) {
106 |       var deferred = q.defer();
107 | if (noodle.cache.check(query)) {
108 | deferred.resolve(noodle.cache.get(query).value);
109 | return deferred.promise;
110 | } else {
111 | return noodle.fetch(url).then(function (page) {
112 | return exports.select(page, query);
113 | });
114 | }
115 | }
116 |
117 | exports.select = function (page, query) {
118 |       var deferred = q.defer(),
119 | myResults = [];
120 |
121 | /*
122 | your algorithm here, dont forget to
123 | deferred.resolve(noodle._wrapResults(myResults, query))
124 | or
125 | deferred.fail(new Error("Selector was bad or something like that"))
126 | */
127 |
128 | return deferred.promise;
129 | }
130 |
--------------------------------------------------------------------------------
/docs/9. Tests.md:
--------------------------------------------------------------------------------
1 | ---
2 | category: reference
3 | heading: 'Tests'
4 | ---
5 |
6 | The noodle tests create a temporary server on port `8889` which the automated
7 | tests tell noodle to query against.
8 |
9 | To run tests you can use the provided binary from the noodle package root
10 | directory:
11 |
12 | $ bin/tests
13 |
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
// Re-export the noodle core so require()ing this file yields the library.
// Fix: assigning to the bare `exports` variable only rebinds the local
// binding — require() returns module.exports, which was left as {}.
module.exports = require('./lib/noodle.js');
--------------------------------------------------------------------------------
/lib/cache.js:
--------------------------------------------------------------------------------
1 | var _ = require('underscore');
2 |
3 | // ------------------------------------------------------------
4 | // Cache class which can store, expose, expire and purge all
5 | // items in its memory.
6 | //
7 | // Two instances of Cache exist as pageCache and resultsCache
8 | // in noodle.js.
9 | // ------------------------------------------------------------
10 |
11 | module.exports = function Cache (config, noodle) {
12 | var cache = [],
13 | intervalId1,
14 | intervalId2;
15 |
16 | // ------------------------------------------------------------
17 | // Starts the interval for cache purging and cache expiry.
18 | // Called from noodle.js.
19 | // ------------------------------------------------------------
20 |
21 | this.start = function () {
22 |
23 | // Check to see if a cache item is to be removed from the
24 | // cache (expired).
25 |
26 | intervalId1 = setInterval(function () {
27 | var now = new Date().getTime(),
28 | initialLength = cache.length,
29 | x = 0,
30 | keep = [];
31 |
32 | while (x < initialLength) {
33 | if ((now - cache[x].created) < config.cacheMaxTime) {
34 | keep.unshift(cache[x]);
35 | } else {
36 | noodle.events.emit('cache/expire', cache[x], config.cacheMaxTime);
37 | }
38 | x++;
39 | }
40 |
41 | cache = keep;
42 | }, 10000);
43 |
44 | // Remove all cache entries every time the cache purge time
45 | // is reached.
46 |
47 | if (config.cachePurgeTime > 0) {
48 | intervalId2 = setInterval(function () {
49 | cache = [];
50 | noodle.events.emit('cache/purge', new Date(), config.cachePurgeTime);
51 | }, config.cachePurgeTime);
52 | }
53 | };
54 |
55 | // ------------------------------------------------------------
56 | // Store an object in the cache tied to specific key.
57 | //
58 | // In noodle: resultsCache stores a result set with the query
59 | // being the key. pageCache stores a document and its headers
60 | // with the url being the key.
61 | // ------------------------------------------------------------
62 |
63 | this.put = function (key, value) {
64 | var item = {
65 | key: key,
66 | value: value,
67 | created: new Date()
68 | };
69 |
70 | if (cache.length >= config.maxSize) {
71 | cache.pop();
72 | }
73 |
74 | cache.unshift(item);
75 | return this.get(key);
76 | };
77 |
78 | // ------------------------------------------------------------
79 | // Boolean representing if an item exists for a specific key.
80 | // ------------------------------------------------------------
81 |
82 | this.check = function (key) {
83 | return (find(key)) ? true : false;
84 | };
85 |
86 | // ------------------------------------------------------------
87 | // Returns a cached item based on a specific key.
88 | //
89 | // Cached items are objects with the following structure:
90 | //
91 | // {
92 | // created:
93 | // value:
94 | // }
95 | // ------------------------------------------------------------
96 |
97 | this.get = function (key) {
98 | var item = find(key),
99 | clone = _.clone(item);
100 |
101 | delete clone.key;
102 | return clone;
103 | };
104 |
105 | // ------------------------------------------------------------
106 | // The cache array is exposed. Useful for debugging purposes.
107 | // ------------------------------------------------------------
108 |
109 | this.getCache = function () {
110 | return cache;
111 | };
112 |
113 | // ------------------------------------------------------------
114 | // Stops running the intervals for the cache checking. Useful
115 | // for removing cache objects from the event loop and keeping
116 | // the node process from running indefinitely.
117 | // ------------------------------------------------------------
118 |
119 | this.stop = function () {
120 | clearInterval(intervalId1);
121 | clearInterval(intervalId2);
122 | };
123 |
124 | // Loops through the cache array finding the cached item
125 | // associated with the key.
126 |
127 | function find (key) {
128 | var i = 0;
129 | for (i; i < cache.length; i++) {
130 | if (_.isEqual(key, cache[i].key)) {
131 | return cache[i];
132 | }
133 | }
134 | }
135 | };
136 |
--------------------------------------------------------------------------------
/lib/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "debug": true,
3 |
4 | "resultsCacheMaxTime": 3600000,
5 |   "resultsCachePurgeTime": 604800000,
6 | "resultsCacheMaxSize": 124,
7 |
8 | "pageCacheMaxTime": 3600000,
9 |   "pageCachePurgeTime": 604800000,
10 | "pageCacheMaxSize": 32,
11 |
12 | "defaultDocumentType": "html",
13 |
14 | "userAgent": "",
15 |
16 | "rateLimit": {
17 | "whitelist": ["127.0.0.1", "localhost"],
18 | "blacklist": [],
19 | "catagories": {
20 | "normal": {
21 | "totalRequests": 1000,
22 | "every": 3600000000
23 | },
24 | "whitelist": {
25 | "totalRequests": 10000,
26 | "every": 60000000
27 | },
28 | "blacklist": {
29 | "totalRequests": 0,
30 | "every": 0
31 | }
32 | }
33 | }
34 | }
--------------------------------------------------------------------------------
/lib/logger.js:
--------------------------------------------------------------------------------
1 | require('colors');
2 |
3 | var messages = 0;
4 |
5 | module.exports = function (noodle) {
6 | var events = noodle.events,
7 | config = noodle.config;
8 |
9 | function toTerminal(message) {
10 | if (config.debug) {
11 | console.log(('\n [noodle log #' + ++messages + ']').green);
12 | console.log('', new Date().toString().magenta);
13 | console.log('', memUsage().magenta);
14 | console.log('', (message + '\n').magenta);
15 | }
16 | }
17 |
18 | // Called on a query
19 | events.on('noodle/query', function (query) {
20 | toTerminal('Noodle: The query follows...\n ' + JSON.stringify(query));
21 | });
22 |
23 | // Called when a page is cached
24 | events.on('cache/page', function (cachePage) {
25 | toTerminal('Cache: Page has been cached');
26 | });
27 |
28 | // Called when a result is cached
29 | events.on('cache/result', function (cacheResult) {
30 | toTerminal('Cache: Result has been cached');
31 | });
32 |
33 | // Called when the cache is purged
34 | events.on('cache/purge', function (when, next) {
35 | toTerminal('Cache: Purge @ ' + when + ' next in ' + next);
36 | });
37 |
38 | // Called when a cached item has expired from the cache
39 | events.on('cache/expire', function (item, next) {
40 | toTerminal('Cache: An item expired from cache, next in ' + next);
41 | });
42 | };
43 |
// Reports the process heap usage as a human-readable string,
// e.g. "Memory: 12.34mb (12935168 bytes)".
function memUsage () {
  var bytes = process.memoryUsage().heapTotal,
      megabytes = (bytes / 1048576).toFixed(2);

  return 'Memory: ' + megabytes + 'mb (' + bytes + ' bytes)';
}
--------------------------------------------------------------------------------
/lib/noodle-middleware.js:
--------------------------------------------------------------------------------
1 | var zlib = require('zlib'),
2 | moment = require('moment'),
3 | _ = require('underscore'),
4 | noodle = require('../lib/noodle');
5 |
6 | exports.parseQueries = function (req, res, next) {
7 | var hasJSON = (Object.keys(req.body).length > 0),
8 | hasQueryString = (Object.keys(req.query).length > 0),
9 | queries;
10 |
11 | // Handle for different request types
12 |
13 | // Take JSON from request body (http post)
14 | queries = (hasJSON) ? req.body : false;
15 | // Take only single query JSON from request querystring (http get)
16 | queries = (queries === false && hasQueryString) ? req.query : queries;
17 | // Take JSON from request querystring (http get)
18 | queries = (req.query.q) ? toJSON(req.query.q) : queries;
19 |
20 | // Handle query(s) with noodle or fail early
21 |
22 | if (queries) {
23 | res.queries = queries;
24 | next();
25 | } else {
26 | res.noodleData = {error: 'Malformed or no query'};
27 | exports.respond(req, res);
28 | }
29 | };
30 |
31 | exports.noodleQueries = function (req, res, next) {
32 | noodle.query(res.queries).then(function (results) {
33 | res.noodleData = results;
34 | next();
35 | });
36 | };
37 |
38 | exports.respond = function (req, res) {
39 | var error = res.noodleData.error,
40 | callback = req.query.callback,
41 | responseBody;
42 |
43 | if (error) {
44 | res.statusCode = 401;
45 | responseBody = '[{"results": [], "error":"' + error + '"}]';
46 | } else {
47 | res.statusCode = 200;
48 | res.setHeader('Expires', setExpiresHeader(res.noodleData.results));
49 | responseBody = JSON.stringify(res.noodleData.results);
50 | }
51 |
52 | if (callback) {
53 | res.setHeader('Content-Type', 'application/javascript');
54 | responseBody = callback + '(' + responseBody + ')';
55 | } else {
56 | res.setHeader('Content-Type', 'application/json; charset=utf-8');
57 | }
58 |
59 | responseBody = new Buffer(responseBody, 'utf8');
60 |
61 | if (req.headers['accept-encoding']) {
62 | res.setHeader('content-encoding', 'gzip');
63 | zlib.gzip(responseBody, function (err, buffer) {
64 | res.end(buffer);
65 | });
66 | } else {
67 | res.end(responseBody);
68 | }
69 | };
70 |
// Builds the Expires header value for a result set.
//
// Results are sorted newest-created first and the first entry's
// creation time plus the results cache lifetime becomes the header.
// Uncached results (no created stamp) — and, fixed here, empty
// result sets, which previously threw a TypeError on temp[0] — fall
// back to the present time.
function setExpiresHeader (results) {
  var temp;

  results = (_.isArray(results)) ? results : [results];

  // concat() copies so the caller's result ordering is not mutated
  // by the in-place sort.
  temp = results.concat().sort(function (a, b) {
    return (b.created || 0) - (a.created || 0);
  });

  if (temp.length > 0 && temp[0].created) {
    return moment(temp[0].created.getTime() + noodle.config.resultsCacheMaxTime)
      .format('ddd, D MMM YYYY HH:mm:ss') + ' GMT';
  }

  return moment(new Date())
    .format('ddd, D MMM YYYY HH:mm:ss') + ' GMT';
}
94 |
95 |
// JSON.parse wrapper: returns the parsed value, or false when the
// string is unparsable or parses to a bare number (a number is not
// a valid noodle query).

function toJSON (str) {
  var parsed;

  try {
    parsed = JSON.parse(str);
  } catch (err) {
    return false;
  }

  return (typeof parsed === 'number') ? false : parsed;
}
--------------------------------------------------------------------------------
/lib/noodle.js:
--------------------------------------------------------------------------------
1 | var q = require('q'),
2 | fs = require('fs'),
3 | events = require('events'),
4 | request = require('request'),
5 | _ = require('underscore'),
6 | Cache = require('./cache'),
7 | pageCache,
8 | resultsCache;
9 |
10 |
11 | // ------------------------------------------------------------
12 | // Main noodle entry point for usage.
13 | //
14 | // Accepts one or an array of noodle queries. Based on the
15 | // query type it will make use of the appropriate type module
16 | // to do the processing.
17 | //
18 | // See docs/ for information on what and noodle queries can
19 | // be written.
20 | // ------------------------------------------------------------
21 |
22 | exports.query = function (queries) {
23 | var deferred = q.defer(),
24 | promises = [];
25 |
26 | // Normalise one query to an array
27 |
28 | queries = _.isArray(queries) ? queries : [queries],
29 |
30 | // For each query route resolve it as either a normal query
31 | // or a map query
32 |
33 | queries.forEach(function (query, i) {
34 | var deferred = q.defer();
35 |
36 | query.type = query.type || exports.config.defaultDocumentType;
37 | query.cache = (query.cache === false) ? false : true;
38 |
39 | exports.events.emit('noodle/query', query);
40 |
41 | if (exports[query.type]) {
42 | if (query.map) {
43 | handleQueryMap(query, deferred, i);
44 | } else {
45 | handleQuery(query, deferred, i);
46 | }
47 | } else {
48 | deferred.resolve({results: [], error: 'Document type not supported'});
49 | }
50 | promises.push(deferred.promise);
51 | });
52 |
53 | // Return master promise when all queries have resolved
54 | // and ensure that the order they were evaluated is
55 | // maintained
56 |
57 | q.all(promises)
58 | .then(function (results) {
59 | results = results.sort(function (a, b) {
60 | return a.orderNo - b.orderNo;
61 | });
62 |
63 | results.forEach(function (result) {
64 | delete result.orderNo;
65 | });
66 |
67 | deferred.resolve({results: results});
68 | });
69 |
70 | return deferred.promise;
71 | };
72 |
// Runs a single (non-map) query through its type module and settles
// the supplied deferred with the result, tagged with its order index.
// Failures are folded into a resolved result object so one bad query
// cannot sink a whole batch.
function handleQuery (query, deferred, i) {
  exports[query.type].fetch(query.url, query)
    .then(function (result) {
      result.orderNo = i;
      if (query.cache) {
        result.created = resultsCache.get(query).created;
      }
      deferred.resolve(result);
    })
    .fail(function (error) {
      deferred.resolve({results: [], error: error.message, orderNo: i});
    });
}
86 |
// Runs a map-notation query via map() and settles the supplied
// deferred in the same shape handleQuery() produces.
function handleQueryMap (query, deferred, i) {
  map(query, function (error, result) {
    if (error) {
      deferred.resolve({results: [], error: error.message, orderNo: i});
      return;
    }

    result.orderNo = i;
    if (query.cache) {
      result.created = resultsCache.get(query).created;
    }
    deferred.resolve(result);
  });
}
100 |
101 | // ------------------------------------------------------------
102 | // Fetch a web document (possibly from cache) with a url.
103 | //
104 | // The query should also be passed in as it contains
105 | // details if it should bypass the cache or if it is a POST
106 | // request.
107 | //
108 | // This fetch method is used by the different type modules to
109 | // get the document before they do they interpret the query
110 | // process the document.
111 | // ------------------------------------------------------------
112 |
113 | exports.fetch = function (url, query, extendedHeaders) {
114 | var deferred = q.defer(),
115 | requestOptions = {
116 | method: 'GET',
117 | uri: url,
118 | headers: {'user-agent': exports.config.userAgent}
119 | };
120 |
121 | if (query.proxy) {
122 | requestOptions.proxy = query.proxy;
123 | }
124 |
125 |
126 | if (query.post) {
127 | requestOptions.method = 'POST';
128 | requestOptions.body = serialize(query.post);
129 | requestOptions.headers = _.extend(requestOptions.headers, {
130 | 'Content-Type': 'application/x-www-form-urlencoded',
131 | 'Content-Length': requestOptions.body.length
132 | });
133 | query.cache = false;
134 | }
135 |
136 | if (extendedHeaders) {
137 | _.extend(requestOptions.headers, extendedHeaders);
138 | }
139 |
140 | // (!) This aspect should be revised.
141 | // Force cache true if the person wants header information
142 | // since header data is read from cache
143 | query.cache = (query.headers || query.linkHeader) ? true : query.cache;
144 |
145 | if (pageCache.check(url) && query.cache) {
146 | deferred.resolve(pageCache.get(url).value.body);
147 | } else {
148 | getDocument(query.cache, requestOptions, deferred);
149 | }
150 |
151 | return deferred.promise;
152 | };
153 |
// Performs the HTTP request for exports.fetch(), optionally caching
// the response, and settles the supplied deferred with the body.
function getDocument (shouldCache, options, deferred) {
  request(options, function (err, response, body) {
    if (err || response.statusCode !== 200) {
      deferred.reject(new Error('Document not found'));
      return;
    }

    if (shouldCache && !pageCache.check(options.uri)) {
      // response.request is stored too so callers can read request
      // details such as location and domain.
      pageCache.put(options.uri, {
        body: body,
        headers: response.headers,
        request: response.request
      });
      exports.events.emit('cache/page', pageCache.get(options.uri));
    }

    deferred.resolve(body);
  });
}
169 |
170 | // ------------------------------------------------------------
171 | // Returns an object representing a result set which comprises
172 | // of an array of 1 or more results and the associate page
173 | // header information.
174 | //
175 | // (!!) This is where a result set is cached in resultsCache.
176 | //
177 | // Exposed as it is also called from some type modules.
178 | // ------------------------------------------------------------
179 |
180 | exports._wrapResults = function (results, query) {
181 | var resultSet = {};
182 |
183 | if (results.length || Object.keys(results).length) {
184 | resultSet.results = results;
185 |
186 | if (query.headers) {
187 | resultSet.headers = getHeadersForResultSet(query);
188 | }
189 |
190 | if (query.request) {
191 | resultSet.request = getRequestDetailsForResultSet(query);
192 | }
193 |
194 | if (query.linkHeader) {
195 | resultSet.headers = resultSet.headers || {};
196 | resultSet.headers.link = getLinkHeaders(query) || null;
197 | }
198 |
199 | if (query.cache) {
200 | if (resultsCache.check(query) === false) {
201 | resultsCache.put(query, resultSet);
202 | exports.events.emit('cache/result', resultsCache.get(query));
203 | }
204 | }
205 |
206 | return resultSet;
207 | }
208 |
209 | return [];
210 | };
211 |
// ------------------------------------------------------------
// The namespace for noodle's events.
//
// Events are emitted from both this file and cache.js.
//
// One can subscribe to the following events:
// - noodle/query (emitted per query in exports.query)
// - cache/page
// - cache/result
// - cache/purge
// - cache/expire
//
// ------------------------------------------------------------

exports.events = new events.EventEmitter();

// ------------------------------------------------------------
// An exposed noodle config initialized by an editable
// json representation at lib/config.json.
// See exports.configure below for programmatic overrides.
// ------------------------------------------------------------

exports.config = JSON.parse(fs.readFileSync(__dirname +'/config.json'));
233 |
// ------------------------------------------------------------
// Accepts a full or partial config object and extends it over
// the existing noodle config.
//
// This is a way to programmatically configure the config
// without touching lib/config.json
// ------------------------------------------------------------

exports.configure = function (obj) {
  exports.config = _.extend(exports.config, obj);
};

// ------------------------------------------------------------
// Stops the cache intervals from running in the event loop.
// Allows for the node process to exit (useful after a one-off
// script or a test run).
// ------------------------------------------------------------

exports.stopCache = function () {
  resultsCache.stop();
  pageCache.stop();
};
255 |
// Function called from exports.query()
//
// Resolves a map-notation query: every property of query.map is run
// as its own sub-query (inheriting the parent's url and cache flag)
// and its results are collected under that property name. When all
// properties are filled in, the wrapped result set is handed to the
// callback as (null, resultSet); a q.all failure yields (error).

function map (query, callback) {
  var mappedContainer = {},
      promises = [],
      key;

  function getResultSet (mapTo, query) {
    query.map[mapTo].url = query.url;
    query.map[mapTo].cache = query.cache;

    return exports[query.type].fetch(query.url, query.map[mapTo])
      .then(function (result) {
        mappedContainer[mapTo] = result.results;
      })
      .fail(function (error) {
        // A failed sub-query becomes an inline error entry rather
        // than failing the whole map.
        mappedContainer[mapTo] = {results: [], error: error.message};
      });
  }

  for (key in query.map) {
    promises.push(getResultSet(key, query));
  }

  q.all(promises)
    .then(function () {
      callback(null, exports._wrapResults(mappedContainer, query));
    })
    .fail(function (error) {
      callback(error);
    });
}
299 |
// Function called from exports._wrapResults()
//
// Returns the page headers for a result set: either the full header
// object, or only the headers named by the query (matched
// case-insensitively, returned under the caller's spelling).

function getHeadersForResultSet (query) {
  var pageHeaders = pageCache.get(query.url).value.headers,
      picked = {},
      prop;

  if (query.headers === 'all' || !_.isArray(query.headers)) {
    return pageHeaders;
  }

  for (prop in pageHeaders) {
    query.headers.forEach(function (name) {
      if (prop.toLowerCase() === name.toLowerCase()) {
        picked[name] = pageHeaders[prop];
      }
    });
  }

  return picked;
}
323 |
324 |
// Function called from exports._wrapResults()
//
// Returns the cached request details for a result set: either
// everything recorded about the request, or only the fields named by
// the query (matched case-insensitively, under the caller's spelling).
function getRequestDetailsForResultSet (query) {
  var requestDetails = pageCache.get(query.url).value.request,
      picked = {},
      prop;

  if (query.request === 'all' || !_.isArray(query.request)) {
    return requestDetails;
  }

  for (prop in requestDetails) {
    query.request.forEach(function (name) {
      if (prop.toLowerCase() === name.toLowerCase()) {
        picked[name] = requestDetails[prop];
      }
    });
  }

  return picked;
}
347 |
// Function called from exports._wrapResults()
//
// Parses the cached page's Link header into a {rel: url} object
// (intended to aid people with navigation/pagination). Returns false
// when the page has no Link header.

function getLinkHeaders (query) {
  var header = pageCache.get(query.url).value.headers.link,
      links = {};

  if (!header) {
    return false;
  }

  header.split(',').forEach(function (part) {
    var sections = part.split(';'),
        url,
        name;

    // Fix: skip malformed parts (e.g. missing the `; rel="..."`
    // attribute) instead of throwing on sections[1] being undefined.
    if (sections.length < 2) {
      return;
    }

    url = sections[0].replace(/<(.*)>/, '$1').trim();
    name = sections[1].replace(/rel="(.*)"/, '$1').trim();
    links[name] = url;
  });

  return links;
}
374 |
// Function called from exports.fetch()
//
// Serialises a flat object into an application/x-www-form-urlencoded
// style query string, URI-encoding both keys and values.

function serialize (obj) {
  var pairs = [],
      key;

  for (key in obj) {
    pairs.push(encodeURIComponent(key) + '=' + encodeURIComponent(obj[key]));
  }

  return pairs.join('&');
}
386 |
// .---------------------------.
// |noodle initialization stuff|
// '---------------------------'

// Initialize supported document types: every file in lib/types/ is
// loaded, exposed as exports.<typename> (e.g. exports.html) and
// handed a reference back to this module via its _init() hook.

fs.readdirSync(__dirname + '/types/').forEach(function (file) {
  file = file.substr(0, file.lastIndexOf('.'));
  exports[file] = require('./types/' + file);
  exports[file]._init(exports);
});

// Start the logger.
// The logger will output to terminal if config.debug is set
// to true.

require('./logger')(exports);

// Initialize caches

// ------------------------------------------------------------
// The results cache is exposed (as exports.cache) for the
// different type modules so they can cache their results.
// ------------------------------------------------------------

exports.cache = resultsCache = new Cache({
  cacheMaxTime: exports.config.resultsCacheMaxTime,
  cachePurgeTime: exports.config.resultsCachePurgeTime,
  cacheMaxSize: exports.config.resultsCacheMaxSize
}, exports);

// The page cache is private to this module: documents, their
// response headers and request details are stored keyed by url.

pageCache = new Cache({
  cacheMaxTime: exports.config.pageCacheMaxTime,
  cachePurgeTime: exports.config.pageCachePurgeTime,
  cacheMaxSize: exports.config.pageCacheMaxSize
}, exports);

// Begin the expiry/purge intervals (see lib/cache.js);
// exports.stopCache clears them again.

resultsCache.start();
pageCache.start();
426 |
--------------------------------------------------------------------------------
/lib/types/feed.js:
--------------------------------------------------------------------------------
1 | var q = require('q'),
2 | feedparser = require('feedparser'),
3 | noodle;
4 |
// Called once from lib/noodle.js at load time; stores a reference to
// the noodle core so this module can use its cache/fetch helpers.
exports._init = function (n) {
  noodle = n;
};

exports.fetch = fetch;
exports.select = select;
11 |
// Entry point for feed queries: answers from the results cache when
// possible, otherwise fetches the document and delegates to select().
function fetch (url, query) {
  var cached = q.defer();

  if (!noodle.cache.check(query)) {
    return noodle.fetch(url, query).then(function (data) {
      return select(data, query);
    });
  }

  cached.resolve(noodle.cache.get(query).value);
  return cached.promise;
}
24 |
// Normalises the raw feed into an array of articles, then reuses the
// json type module's selector logic on that array.
function select (data, query) {
  return normalise(data).then(function (articles) {
    if (!articles.length) {
      throw new Error('The provided document couldn\'t be normalised');
    }
    return noodle.json.select(articles, query);
  });
}
33 |
// Parses a feed body with feedparser, resolving with the accumulated
// list of articles and rejecting on any parse error.
function normalise (body) {
  var deferred = q.defer(),
      articles = [];

  feedparser
    .parseString(body)
    .on('error', deferred.reject)
    .on('article', function (article) {
      articles.push(article);
    })
    .on('complete', function () {
      deferred.resolve(articles);
    });

  return deferred.promise;
}
--------------------------------------------------------------------------------
/lib/types/html.js:
--------------------------------------------------------------------------------
1 | var q = require('q'),
2 | util = require('util'),
3 | cheerio = require('cheerio'),
4 | noodle;
5 |
// Called once from lib/noodle.js at load time; stores a reference to
// the noodle core so this module can use its cache/fetch helpers.
exports._init = function (n) {
  noodle = n;
};

exports.fetch = fetch;
exports.select = select;
12 |
// Entry point for HTML queries: serves a cached result when one
// exists, otherwise fetches the page and runs the selector over it.
function fetch (url, query) {
  var cached = q.defer();

  if (!noodle.cache.check(query)) {
    return noodle.fetch(url, query).then(function (page) {
      return select(page, query);
    });
  }

  cached.resolve(noodle.cache.get(query).value);
  return cached.promise;
}
25 |
// Runs the query's CSS selector over the document with cheerio and
// extracts the requested property (or properties) from each match.
function select (body, query) {
  var deferred = q.defer(),
      extract = query.extract || 'text',
      page = cheerio.load(body, { lowerCaseTags: true, lowerCaseAttributeNames: true }),
      matched = page(query.selector),
      results = [];

  // No selector: hand back the whole (trimmed) document.
  if (!query.selector) {
    deferred.resolve(noodle._wrapResults(body.trim(), query));
    return deferred.promise;
  }

  if (util.isArray(extract)) {
    // Several properties requested: build an object per element and
    // keep it only when at least one property produced a value.
    matched.each(function (i, elem) {
      var item = {},
          hasValue;

      extract.forEach(function (property) {
        item[property] = extractProperty(page, elem, property);
        hasValue = hasValue || item[property];
      });

      if (hasValue) {
        results.push(item);
      }
    });
  } else {
    // Single property: a flat list of extracted values.
    matched.each(function (i, elem) {
      results.push(extractProperty(page, elem, extract));
    });
  }

  // Pass back the extracted results from the DOM

  if (results.length) {
    deferred.resolve(noodle._wrapResults(results, query));
  } else {
    deferred.reject(new Error('Could not match with that selector or extract value'));
  }

  return deferred.promise;
}
69 |
// Pulls one property from a matched element: its text content (with
// newlines stripped and whitespace trimmed), its inner HTML, or an
// arbitrary attribute value.
function extractProperty (page, elem, property) {
  switch (property) {
    case 'text':
      return page(elem).text().replace(/(\r\n|\n|\r)/gm, "").trim();
    case 'html':
    case 'innerHTML':
      return page(elem).html();
    default:
      return page(elem).attr(property);
  }
}
81 |
--------------------------------------------------------------------------------
/lib/types/json.js:
--------------------------------------------------------------------------------
1 | var q = require('q'),
2 | jsonSelect = require('JSONSelect'),
3 | noodle;
4 |
// Called once from lib/noodle.js at load time; stores a reference to
// the noodle core so this module can use its cache/fetch helpers.
exports._init = function (n) {
  noodle = n;
};

exports.fetch = fetch;
exports.select = select;
11 |
// Entry point for JSON queries: serves a cached result when one
// exists, otherwise fetches the document, parses it and selects.
function fetch (url, query) {
  var cached = q.defer();

  if (noodle.cache.check(query)) {
    cached.resolve(noodle.cache.get(query).value);
    return cached.promise;
  }

  return noodle.fetch(url, query).then(function (data) {
    try {
      return select(JSON.parse(data), query);
    } catch (e) {
      throw new Error('Could not parse JSON document');
    }
  });
}
29 |
// Applies a JSONSelect selector to an already-parsed document. With
// no selector the entire document is returned as a single result.
function select (parsed, query) {
  var deferred = q.defer(),
      matches;

  try {
    if (query.selector) {
      matches = jsonSelect.match(query.selector, [], parsed);
      if (matches.length) {
        deferred.resolve(noodle._wrapResults(matches, query));
      } else {
        deferred.reject(new Error('Could not match with that selector'));
      }
    } else {
      deferred.resolve(noodle._wrapResults([parsed], query));
    }
  } catch (e) {
    // JSONSelect throws on invalid selector syntax
    deferred.reject(new Error('Could not match with that selector'));
  }

  return deferred.promise;
}
--------------------------------------------------------------------------------
/lib/types/xml.js:
--------------------------------------------------------------------------------
1 | var q = require('q'),
2 | xml2json = require('xml2json'),
3 | noodle;
4 |
// Called once by the noodle core to hand this type module a reference
// back to the main noodle instance (cache, fetch, json delegate).
exports._init = function (n) {
  noodle = n;
};

// Public type-module interface used by the noodle core.
exports.fetch = fetch;
exports.select = select;
11 |
// Resolve a query against a remote XML document by converting it to a
// plain object (via xml2json) and delegating selection to the JSON
// pipeline. Cached results resolve immediately; a failed conversion
// rejects with "Could not parse XML to JSON".
function fetch (url, query) {
  if (!noodle.cache.check(query)) {
    return noodle.fetch(url, query).then(function (body) {
      try {
        return select(JSON.parse(xml2json.toJson(body)), query);
      } catch (e) {
        throw new Error('Could not parse XML to JSON');
      }
    });
  }

  var deferred = q.defer();
  deferred.resolve(noodle.cache.get(query).value);
  return deferred.promise;
}
29 |
// Selection on an XML document: by the time select() runs, fetch() has
// already converted the XML into a plain object, so selection is
// delegated wholesale to the JSON type module (JSONSelect selectors).
function select (obj, query) {
  return noodle.json.select(obj, query);
}
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "noodlejs",
3 | "version": "0.3.2",
4 | "description": "noodle is a proxy server which serves for cross domain data extraction from web documents for any client.",
5 | "main": "./lib/noodle",
6 | "bin": {
7 | "noodle": "./bin/noodle-server"
8 | },
9 | "dependencies": {
10 | "connect": "~2.3.5",
11 | "connect-ratelimit": "0.0.5",
12 | "JSONSelect": "0.4.0",
13 | "feedparser": "0.10.7",
14 | "moment": "1.7.2",
15 | "cheerio": "0.10.1",
16 | "request": "2.11.4",
17 | "q": "0.8.9",
18 | "xml2json": "^0.5.1",
19 | "underscore": "1.4.2",
20 | "mocha": "1.7.4",
21 | "chai": "1.4.2",
22 | "colors": "0.6.0-1"
23 | },
24 | "devDependencies": {},
25 | "scripts": {
26 | "test": "echo \"Error: no test specified\" && exit 1",
27 | "start": "bin/noodle-server"
28 | },
29 | "engines": {
30 | "node": "0.6.x"
31 | },
32 | "repository": {
33 | "type": "git",
34 | "url": "git://github.com/dharmafly/noodle.git"
35 | },
36 | "keywords": [
37 | "scraper",
38 | "proxy",
39 | "cross-domain",
40 | "cross domain",
41 | "selectors",
42 | "JSONSelect",
43 | "json",
44 | "html",
45 | "web service",
46 | "rate limit"
47 | ],
48 | "author": "Dharmafly",
49 | "license": "BSD"
50 | }
--------------------------------------------------------------------------------
/tests/document.atom:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Example Feed
5 |
6 | 2003-12-13T18:30:02Z
7 |
8 | John Doe
9 |
10 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6
11 |
12 |
13 | Atom-Powered Robots Run Amok
14 |
15 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a
16 | 2003-12-13T18:30:02Z
17 | Some text.
18 |
19 |
20 |
--------------------------------------------------------------------------------
/tests/document.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dharmafly/noodle/52f88f5df0d2a92506aad702d6719350d2e79459/tests/document.html
--------------------------------------------------------------------------------
/tests/document.json:
--------------------------------------------------------------------------------
1 | {
2 | "completed_in": 0.013,
3 | "max_id": 288657757152870400,
4 | "max_id_str": "288657757152870400",
5 | "next_page": "?page=2&max_id=288657757152870400&q=dinosaurs",
6 | "page": 1,
7 | "query": "dinosaurs",
8 | "refresh_url": "?since_id=288657757152870400&q=dinosaurs",
9 | "results": [
10 | {
11 | "created_at": "Tue, 08 Jan 2013 14:45:46 +0000",
12 | "from_user": "_MsMindless",
13 | "from_user_id": 878142511,
14 | "from_user_id_str": "878142511",
15 | "from_user_name": "uh uh",
16 | "geo": null,
17 | "id": 288657757152870400,
18 | "id_str": "288657757152870400",
19 | "iso_language_code": "en",
20 | "metadata": {
21 | "result_type": "recent"
22 | },
23 | "profile_image_url": "http://a0.twimg.com/profile_images/3081235144/d762e9c3edb63360aa49d7e6e62b683a_normal.jpeg",
24 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3081235144/d762e9c3edb63360aa49d7e6e62b683a_normal.jpeg",
25 | "source": "<a href="http://twitter.com/#!/download/ipad">Twitter for iPad</a>",
26 | "text": "@AmiMindless do they transform into dinosaurs in the night?",
27 | "to_user": "AmiMindless",
28 | "to_user_id": 1021682551,
29 | "to_user_id_str": "1021682551",
30 | "to_user_name": "amandaspiffytho.",
31 | "in_reply_to_status_id": 288657189856505860,
32 | "in_reply_to_status_id_str": "288657189856505856"
33 | },
34 | {
35 | "created_at": "Tue, 08 Jan 2013 14:43:34 +0000",
36 | "from_user": "jirouishi",
37 | "from_user_id": 157924053,
38 | "from_user_id_str": "157924053",
39 | "from_user_name": "Aron ",
40 | "geo": {
41 | "coordinates": [
42 | 37.596182,
43 | 127.056834
44 | ],
45 | "type": "Point"
46 | },
47 | "id": 288657201952849900,
48 | "id_str": "288657201952849920",
49 | "iso_language_code": "ko",
50 | "metadata": {
51 | "result_type": "recent"
52 | },
53 | "profile_image_url": "http://a0.twimg.com/profile_images/3080515500/b0b7d315abba887f50af14acdccfc1ab_normal.jpeg",
54 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3080515500/b0b7d315abba887f50af14acdccfc1ab_normal.jpeg",
55 | "source": "<a href="http://twitter.com/download/android">Twitter for Android</a>",
56 | "text": "omg ㅠㅠ nver thought a docu about dinosaurs would hurt this much https://t.co/TFsRXbMN",
57 | "to_user": null,
58 | "to_user_id": 0,
59 | "to_user_id_str": "0",
60 | "to_user_name": null
61 | },
62 | {
63 | "created_at": "Tue, 08 Jan 2013 14:43:06 +0000",
64 | "from_user": "imexdanny",
65 | "from_user_id": 302811136,
66 | "from_user_id_str": "302811136",
67 | "from_user_name": "Danny Hughes",
68 | "geo": null,
69 | "id": 288657084260683800,
70 | "id_str": "288657084260683777",
71 | "iso_language_code": "en",
72 | "metadata": {
73 | "result_type": "recent"
74 | },
75 | "profile_image_url": "http://a0.twimg.com/profile_images/3037826375/3530f8d5cf699b4d024a82e9ac4d8624_normal.jpeg",
76 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3037826375/3530f8d5cf699b4d024a82e9ac4d8624_normal.jpeg",
77 | "source": "<a href="http://twitter.com/download/iphone">Twitter for iPhone</a>",
78 | "text": "RT @AndyRothwell1: @imexdanny what killed ze dinosaurs? ZE ICE AGE!",
79 | "to_user": null,
80 | "to_user_id": 0,
81 | "to_user_id_str": "0",
82 | "to_user_name": null,
83 | "in_reply_to_status_id": 288656063555858400,
84 | "in_reply_to_status_id_str": "288656063555858432"
85 | },
86 | {
87 | "created_at": "Tue, 08 Jan 2013 14:42:42 +0000",
88 | "from_user": "M3LbReEzY",
89 | "from_user_id": 89102612,
90 | "from_user_id_str": "89102612",
91 | "from_user_name": "Don Melocino",
92 | "geo": null,
93 | "id": 288656983974871040,
94 | "id_str": "288656983974871040",
95 | "iso_language_code": "en",
96 | "metadata": {
97 | "result_type": "recent"
98 | },
99 | "profile_image_url": "http://a0.twimg.com/profile_images/1645636429/IMG_0747_normal.JPG",
100 | "profile_image_url_https": "https://si0.twimg.com/profile_images/1645636429/IMG_0747_normal.JPG",
101 | "source": "<a href="http://twitter.com/">web</a>",
102 | "text": "Some dinosaurs were as small as chickens #RandomSnappleFact.. Good morning Tweepsters",
103 | "to_user": null,
104 | "to_user_id": 0,
105 | "to_user_id_str": "0",
106 | "to_user_name": null
107 | },
108 | {
109 | "created_at": "Tue, 08 Jan 2013 14:41:42 +0000",
110 | "from_user": "merissa_ariff",
111 | "from_user_id": 401955951,
112 | "from_user_id_str": "401955951",
113 | "from_user_name": "MissCaprisss",
114 | "geo": null,
115 | "id": 288656733193244700,
116 | "id_str": "288656733193244674",
117 | "iso_language_code": "en",
118 | "metadata": {
119 | "result_type": "recent"
120 | },
121 | "profile_image_url": "http://a0.twimg.com/profile_images/3067213647/f7c0775af4a6d17d036192fd0ffea3d0_normal.jpeg",
122 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3067213647/f7c0775af4a6d17d036192fd0ffea3d0_normal.jpeg",
123 | "source": "<a href="http://twitter.com/">web</a>",
124 | "text": "@Sya_fxqxh Mind u i hate dinosaurs. If u know what i mean. ITS NNNIIINNNOOOOOOOOOOOOO",
125 | "to_user": "Sya_fxqxh",
126 | "to_user_id": 303860298,
127 | "to_user_id_str": "303860298",
128 | "to_user_name": "",
129 | "in_reply_to_status_id": 288655990411378700,
130 | "in_reply_to_status_id_str": "288655990411378688"
131 | },
132 | {
133 | "created_at": "Tue, 08 Jan 2013 14:41:42 +0000",
134 | "from_user": "TwycrossZoo",
135 | "from_user_id": 66683145,
136 | "from_user_id_str": "66683145",
137 | "from_user_name": "Twycross Zoo",
138 | "geo": null,
139 | "id": 288656729946865660,
140 | "id_str": "288656729946865666",
141 | "iso_language_code": "en",
142 | "metadata": {
143 | "result_type": "recent"
144 | },
145 | "profile_image_url": "http://a0.twimg.com/profile_images/1367172040/TZoo_WPS_Logo_blue_portrait_normal.png",
146 | "profile_image_url_https": "https://si0.twimg.com/profile_images/1367172040/TZoo_WPS_Logo_blue_portrait_normal.png",
147 | "source": "<a href="http://www.facebook.com/twitter">Facebook</a>",
148 | "text": "DINOSAUR FACT OF THE DAY: \nBaryonyx was a carnivore. It is one of the only dinosaurs known to feed on fish,... http://t.co/cgtbjyVL",
149 | "to_user": null,
150 | "to_user_id": 0,
151 | "to_user_id_str": "0",
152 | "to_user_name": null
153 | },
154 | {
155 | "created_at": "Tue, 08 Jan 2013 14:41:29 +0000",
156 | "from_user": "Hippobatman",
157 | "from_user_id": 42901828,
158 | "from_user_id_str": "42901828",
159 | "from_user_name": "Ulf Martinsen",
160 | "geo": null,
161 | "id": 288656677736157200,
162 | "id_str": "288656677736157187",
163 | "iso_language_code": "en",
164 | "metadata": {
165 | "result_type": "recent"
166 | },
167 | "profile_image_url": "http://a0.twimg.com/profile_images/248772203/thumb-super-mario-bros-8bit-Mario_normal.jpg",
168 | "profile_image_url_https": "https://si0.twimg.com/profile_images/248772203/thumb-super-mario-bros-8bit-Mario_normal.jpg",
169 | "source": "<a href="http://twitter.com/">web</a>",
170 | "text": "Why aren't there more games with dinosaurs in them? There need to be more games with dinosaurs in them. Dinosaurs are cool.",
171 | "to_user": null,
172 | "to_user_id": 0,
173 | "to_user_id_str": "0",
174 | "to_user_name": null
175 | },
176 | {
177 | "created_at": "Tue, 08 Jan 2013 14:40:47 +0000",
178 | "from_user": "JessiJ0108",
179 | "from_user_id": 737981952,
180 | "from_user_id_str": "737981952",
181 | "from_user_name": "Jessica Johnson",
182 | "geo": null,
183 | "id": 288656499142696960,
184 | "id_str": "288656499142696961",
185 | "iso_language_code": "en",
186 | "metadata": {
187 | "result_type": "recent"
188 | },
189 | "profile_image_url": "http://a0.twimg.com/profile_images/2936728482/d9911742cdd1f022bd23755829f0dce4_normal.jpeg",
190 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2936728482/d9911742cdd1f022bd23755829f0dce4_normal.jpeg",
191 | "source": "<a href="http://twitter.com/download/android">Twitter for Android</a>",
192 | "text": "RT @QuotingJokes: Kiss me if I'm wrong. But dinosaurs still exist right?",
193 | "to_user": null,
194 | "to_user_id": 0,
195 | "to_user_id_str": "0",
196 | "to_user_name": null
197 | },
198 | {
199 | "created_at": "Tue, 08 Jan 2013 14:40:46 +0000",
200 | "from_user": "AndyRothwell1",
201 | "from_user_id": 288397628,
202 | "from_user_id_str": "288397628",
203 | "from_user_name": "Andy Rothwell",
204 | "geo": null,
205 | "id": 288656498899419140,
206 | "id_str": "288656498899419137",
207 | "iso_language_code": "en",
208 | "metadata": {
209 | "result_type": "recent"
210 | },
211 | "profile_image_url": "http://a0.twimg.com/profile_images/2954727240/f16de28072350fdfa393679664894c8e_normal.jpeg",
212 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2954727240/f16de28072350fdfa393679664894c8e_normal.jpeg",
213 | "source": "<a href="http://twitter.com/">web</a>",
214 | "text": "@imexdanny what killed ze dinosaurs? ZE ICE AGE!",
215 | "to_user": "imexdanny",
216 | "to_user_id": 302811136,
217 | "to_user_id_str": "302811136",
218 | "to_user_name": "Danny Hughes",
219 | "in_reply_to_status_id": 288656063555858400,
220 | "in_reply_to_status_id_str": "288656063555858432"
221 | },
222 | {
223 | "created_at": "Tue, 08 Jan 2013 14:40:35 +0000",
224 | "from_user": "BertBannister",
225 | "from_user_id": 939905077,
226 | "from_user_id_str": "939905077",
227 | "from_user_name": "bert bannister",
228 | "geo": null,
229 | "id": 288656448794279940,
230 | "id_str": "288656448794279936",
231 | "iso_language_code": "en",
232 | "metadata": {
233 | "result_type": "recent"
234 | },
235 | "profile_image_url": "http://a0.twimg.com/profile_images/3008530754/bea3fcbc9efacf7144688f499bddf587_normal.jpeg",
236 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3008530754/bea3fcbc9efacf7144688f499bddf587_normal.jpeg",
237 | "source": "<a href="http://twitter.com/download/iphone">Twitter for iPhone</a>",
238 | "text": "@rogers2116 @sludgiesly Everyone: \"steak and a pint please mate\" Roge: \"can I have turkey dinosaurs with ketchup and a fanta please\"",
239 | "to_user": "rogers2116",
240 | "to_user_id": 316789507,
241 | "to_user_id_str": "316789507",
242 | "to_user_name": "Jamie Rogers",
243 | "in_reply_to_status_id": 288656043410599940,
244 | "in_reply_to_status_id_str": "288656043410599936"
245 | },
246 | {
247 | "created_at": "Tue, 08 Jan 2013 14:40:34 +0000",
248 | "from_user": "johnkatez",
249 | "from_user_id": 6795122,
250 | "from_user_id_str": "6795122",
251 | "from_user_name": "johnkatez",
252 | "geo": null,
253 | "id": 288656443266170900,
254 | "id_str": "288656443266170880",
255 | "iso_language_code": "en",
256 | "metadata": {
257 | "result_type": "recent"
258 | },
259 | "profile_image_url": "http://a0.twimg.com/profile_images/3062733248/6f3634e3bb847292e75f9e834ee5963d_normal.jpeg",
260 | "profile_image_url_https": "https://si0.twimg.com/profile_images/3062733248/6f3634e3bb847292e75f9e834ee5963d_normal.jpeg",
261 | "source": "<a href="http://tapbots.com/software/tweetbot/mac">Twееtbot for Mac</a>",
262 | "text": "You heard it here first- dinosaurs were gay. http://t.co/vSPlvGRc",
263 | "to_user": null,
264 | "to_user_id": 0,
265 | "to_user_id_str": "0",
266 | "to_user_name": null
267 | },
268 | {
269 | "created_at": "Tue, 08 Jan 2013 14:40:24 +0000",
270 | "from_user": "fuckl0nely",
271 | "from_user_id": 549766379,
272 | "from_user_id_str": "549766379",
273 | "from_user_name": "† .",
274 | "geo": null,
275 | "id": 288656403944591360,
276 | "id_str": "288656403944591360",
277 | "iso_language_code": "pt",
278 | "metadata": {
279 | "result_type": "recent"
280 | },
281 | "profile_image_url": "http://a0.twimg.com/profile_images/2979531343/9d7e3fcce5d469d9da80c1661b7dee60_normal.jpeg",
282 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2979531343/9d7e3fcce5d469d9da80c1661b7dee60_normal.jpeg",
283 | "source": "<a href="http://www.tumblr.com/">Tumblr</a>",
284 | "text": "tumblrbot ha preguntado: ROBOTS OR DINOSAURS? http://t.co/uovAaks7",
285 | "to_user": null,
286 | "to_user_id": 0,
287 | "to_user_id_str": "0",
288 | "to_user_name": null
289 | },
290 | {
291 | "created_at": "Tue, 08 Jan 2013 14:39:34 +0000",
292 | "from_user": "Phil_Savage",
293 | "from_user_id": 23068681,
294 | "from_user_id_str": "23068681",
295 | "from_user_name": "Phil Savage",
296 | "geo": null,
297 | "id": 288656193604419600,
298 | "id_str": "288656193604419584",
299 | "iso_language_code": "en",
300 | "metadata": {
301 | "result_type": "recent"
302 | },
303 | "profile_image_url": "http://a0.twimg.com/profile_images/2856097536/e45f89e574c84c3e48f6e2df2c159f03_normal.jpeg",
304 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2856097536/e45f89e574c84c3e48f6e2df2c159f03_normal.jpeg",
305 | "source": "<a href="http://www.tweetdeck.com">TweetDeck</a>",
306 | "text": "RT @EwingCalvin: Just so everyone's clear dinosaurs are extinct. That means there are none left. Whatsoever.",
307 | "to_user": null,
308 | "to_user_id": 0,
309 | "to_user_id_str": "0",
310 | "to_user_name": null
311 | },
312 | {
313 | "created_at": "Tue, 08 Jan 2013 14:38:22 +0000",
314 | "from_user": "EwingCalvin",
315 | "from_user_id": 442655243,
316 | "from_user_id_str": "442655243",
317 | "from_user_name": "Calvin Ewing",
318 | "geo": null,
319 | "id": 288655891270623200,
320 | "id_str": "288655891270623232",
321 | "iso_language_code": "en",
322 | "metadata": {
323 | "result_type": "recent"
324 | },
325 | "profile_image_url": "http://a0.twimg.com/profile_images/2860919035/3e8339791d1e32f64f329b4d10abdf6c_normal.jpeg",
326 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2860919035/3e8339791d1e32f64f329b4d10abdf6c_normal.jpeg",
327 | "source": "<a href="http://twitter.com/">web</a>",
328 | "text": "Just so everyone's clear dinosaurs are extinct. That means there are none left. Whatsoever.",
329 | "to_user": null,
330 | "to_user_id": 0,
331 | "to_user_id_str": "0",
332 | "to_user_name": null
333 | },
334 | {
335 | "created_at": "Tue, 08 Jan 2013 14:38:09 +0000",
336 | "from_user": "DinosaursTrap",
337 | "from_user_id": 550537265,
338 | "from_user_id_str": "550537265",
339 | "from_user_name": "The Dinosaurs Trap",
340 | "geo": null,
341 | "id": 288655840188194800,
342 | "id_str": "288655840188194816",
343 | "iso_language_code": "en",
344 | "metadata": {
345 | "result_type": "recent"
346 | },
347 | "profile_image_url": "http://a0.twimg.com/profile_images/2483238349/31yj26hu61vfv4umwony_normal.png",
348 | "profile_image_url_https": "https://si0.twimg.com/profile_images/2483238349/31yj26hu61vfv4umwony_normal.png",
349 | "source": "<a href="http://trap.it">Trapit</a>",
350 | "text": "Ichthyosaur Fossil Spotlights Ancient 'Sea Monster,' World's Recovery After Mass Extinction http://t.co/UqQTcJG7 #dinos #dinosaurs",
351 | "to_user": null,
352 | "to_user_id": 0,
353 | "to_user_id_str": "0",
354 | "to_user_name": null
355 | }
356 | ],
357 | "results_per_page": 15,
358 | "since_id": 0,
359 | "since_id_str": "0"
360 | }
--------------------------------------------------------------------------------
/tests/document.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 2003/07/04
4 | 123
5 | Acme Alpha
6 | -
7 | 987
8 | Coupler
9 | 5
10 |
11 | -
12 | 654
13 | Connector
14 | 3
15 |
16 | -
17 | 579
18 | Clasp
19 | 1
20 |
21 |
--------------------------------------------------------------------------------
/tests/fixtures.js:
--------------------------------------------------------------------------------
1 | var fs = require("fs");
2 |
3 | // Web document samples for the test server to serve
4 |
5 | exports.documents = {
6 | html: fs.readFileSync("tests/document.html"),
7 | json: fs.readFileSync("tests/document.json"),
8 | feed: fs.readFileSync("tests/document.atom"),
9 | xml: fs.readFileSync("tests/document.xml")
10 | };
11 |
12 | // Queries
13 |
14 | exports.queries = {
15 | html: {
16 | simple: {
17 | "url": "http://localhost:8889/html",
18 | "type": "html",
19 | "selector": "title",
20 | "extract": "text",
21 | "cache": false
22 | },
23 | withCache: {
24 | "url": "http://localhost:8889/html",
25 | "type": "html",
26 | "selector": "h1",
27 | "extract": "text",
28 | "cache": true
29 | },
30 | noSelector: {
31 | "url": "http://localhost:8889/html",
32 | "type": "html",
33 | "cache": false
34 | },
35 | noExtract: {
36 | "url": "http://localhost:8889/html",
37 | "type": "html",
38 | "selector": "title",
39 | "cache": false
40 | },
41 | noType: {
42 | "url": "http://localhost:8889/html",
43 | "selector": "title",
44 | "extract": "text",
45 | "cache": false
46 | },
47 | badSelector: {
48 | "url": "http://localhost:8889/html",
49 | "type": "html",
50 | "selector": "BAD SELECTOR",
51 | "extract": "text",
52 | "cache": false
53 | },
54 | badExtract: {
55 | "url": "http://localhost:8889/html",
56 | "type": "html",
57 | "selector": "title",
58 | "extract": "BAD EXTRACT",
59 | "cache": false
60 | }
61 | },
62 | json: {
63 | simple: {
64 | "url": "http://localhost:8889/json",
65 | "type": "json",
66 | "selector": ".query",
67 | "cache": false
68 | },
69 | noSelector: {
70 | "url": "http://localhost:8889/json",
71 | "type": "json",
72 | "cache": false
73 | },
74 | noType: {
75 | "url": "http://localhost:8889/json",
76 | "selector": ".query",
77 | "cache": false
78 | },
79 | badSelector: {
80 | "url": "http://localhost:8889/json",
81 | "type": "json",
82 | "selector": "BAD SELECTOR",
83 | "cache": false
84 | },
85 | badParse: {
86 | "url": "http://localhost:8889/html",
87 | "type": "json",
88 | "selector": ".query",
89 | "cache": false
90 | }
91 | },
92 | feed: {
93 | simple: {
94 | "url": "http://localhost:8889/feed",
95 | "type": "feed",
96 | "selector": ".title",
97 | "cache": false
98 | },
99 | noSelector: {
100 | "url": "http://localhost:8889/feed",
101 | "type": "feed",
102 | "cache": false
103 | },
104 | noType: {
105 | "url": "http://localhost:8889/feed",
106 | "selector": ".title",
107 | "cache": false
108 | },
109 | badSelector: {
110 | "url": "http://localhost:8889/feed",
111 | "type": "feed",
112 | "selector": "BAD SELECTOR",
113 | "cache": false
114 | },
115 | badParse: {
116 | "url": "http://localhost:8889/html",
117 | "type": "feed",
118 | "selector": ".title",
119 | "cache": false
120 | }
121 | },
122 | xml: {
123 | simple: {
124 | "url": "http://localhost:8889/xml",
125 | "type": "xml",
126 | "selector": ".CustomerName",
127 | "cache": false
128 | },
129 | noSelector: {
130 | "url": "http://localhost:8889/xml",
131 | "type": "xml",
132 | "cache": false
133 | },
134 | noType: {
135 | "url": "http://localhost:8889/xml",
136 | "selector": ".CustomerName",
137 | "cache": false
138 | },
139 | badSelector: {
140 | "url": "http://localhost:8889/xml",
141 | "type": "xml",
142 | "selector": "BAD SELECTOR",
143 | "cache": false
144 | },
145 | badParse: {
146 | "url": "http://localhost:8889/html",
147 | "type": "xml",
148 | "selector": ".CustomerName",
149 | "cache": false
150 | }
151 | },
152 | misc: {
153 | badUrl: {
154 | "url": "BAD URL",
155 | "cache": false
156 | },
157 | badType: {
158 | "url": "http://localhost:8889/html",
159 | "type": "BAD TYPE",
160 | "cache": false
161 | }
162 | },
163 | map: {
164 | simple: {
165 | "url": "http://localhost:8889/html",
166 | "type": "html",
167 | "map": {
168 | "foo": {
169 | "selector": "h1"
170 | },
171 | "bar": {
172 | "selector": "title"
173 | }
174 | },
175 | "cache": false
176 | }
177 | },
178 | post: {
179 | simple: {
180 | "url": "http://localhost:8889",
181 | "type": "html",
182 | "selector": "h1",
183 | "extract": "text",
184 | "post": {
185 | "foo": "bar"
186 | },
187 | "cache": false
188 | }
189 | },
190 | headers: {
191 | simple: {
192 | "url": "http://localhost:8889/html",
193 | "type": "html",
194 | "selector": "h1",
195 | "headers": ["X-Powered-By"],
196 | "cache": "false"
197 | },
198 | linkHeaders: {
199 | "url": "http://localhost:8889/html",
200 | "type": "html",
201 | "selector": "h1",
202 | "linkHeader": true,
203 | "cache": "false"
204 | }
205 | }
206 | };
207 |
208 | // Query answers
209 |
// Expected noodle responses for each query in exports.queries above,
// in the shape noodle returns: [{ results: [...], error?, headers? }].
exports.queries.answers = {
  html: {
    simple: [
      {
        "results": ["css Zen Garden: The Beauty in CSS Design"]
      }
    ],
    noExtract: [
      {
        "results": ["css Zen Garden: The Beauty in CSS Design"]
      }
    ],
    noType: [
      {
        "results": ["css Zen Garden: The Beauty in CSS Design"]
      }
    ],
    badSelector: [
      {
        "results": [],
        "error": "Could not match with that selector or extract value"
      }
    ],
    badExtract: [
      {
        "results": [],
        "error": "Could not match with that selector or extract value"
      }
    ]
  },
  json: {
    simple: [
      {
        "results": [
          "dinosaurs"
        ]
      }
    ],
    noType: [
      {
        "results": [
          "dinosaurs"
        ]
      }
    ],
    badSelector: [
      {
        "results": [],
        "error": "Could not match with that selector"
      }
    ],
    badParse: [
      {
        "results": [],
        "error": "Could not parse JSON document"
      }
    ]
  },
  feed: {
    simple: [
      {
        "results": [
          "Atom-Powered Robots Run Amok",
          "Example Feed"
        ]
      }
    ],
    // A selector-less feed query returns the feed's fully normalised
    // article(s) plus meta — presumably feedparser 0.10.x output for
    // tests/document.atom; TODO(review) confirm against that version.
    noSelector: [
      {
        "results": [
          [
            {
              "title": "Atom-Powered Robots Run Amok",
              "description": "Some text.",
              "summary": "Some text.",
              "date": "2003-12-13T18:30:02.000Z",
              "pubdate": "2003-12-13T18:30:02.000Z",
              "pubDate": "2003-12-13T18:30:02.000Z",
              "link": "http://example.org/2003/12/13/atom03",
              "guid": "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a",
              "author": "John Doe",
              "comments": null,
              "origlink": null,
              "image": {},
              "source": {},
              "categories": [],
              "enclosures": [],
              "atom:@": {},
              "atom:title": {
                "@": {},
                "#": "Atom-Powered Robots Run Amok"
              },
              "atom:link": {
                "@": {
                  "href": "http://example.org/2003/12/13/atom03"
                }
              },
              "atom:id": {
                "@": {},
                "#": "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a"
              },
              "atom:updated": {
                "@": {},
                "#": "2003-12-13T18:30:02Z"
              },
              "atom:summary": {
                "@": {},
                "#": "Some text."
              },
              "meta": {
                "#ns": [
                  {
                    "xmlns": "http://www.w3.org/2005/Atom"
                  }
                ],
                "@": [
                  {
                    "xmlns": "http://www.w3.org/2005/Atom"
                  }
                ],
                "#type": "atom",
                "#version": "1.0",
                "title": "Example Feed",
                "description": null,
                "date": "2003-12-13T18:30:02.000Z",
                "pubdate": "2003-12-13T18:30:02.000Z",
                "pubDate": "2003-12-13T18:30:02.000Z",
                "link": "http://example.org/",
                "xmlurl": null,
                "xmlUrl": null,
                "author": "John Doe",
                "language": null,
                "favicon": null,
                "copyright": null,
                "generator": null,
                "image": {},
                "categories": [],
                "atom:@": {
                  "xmlns": "http://www.w3.org/2005/Atom"
                },
                "atom:title": {
                  "@": {},
                  "#": "Example Feed"
                },
                "atom:link": {
                  "@": {
                    "href": "http://example.org/"
                  }
                },
                "atom:updated": {
                  "@": {},
                  "#": "2003-12-13T18:30:02Z"
                },
                "atom:author": {
                  "@": {},
                  "name": {
                    "@": {},
                    "#": "John Doe"
                  }
                },
                "atom:id": {
                  "@": {},
                  "#": "urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6"
                }
              }
            }
          ]
        ]
      }
    ],
    noType: [
      {
        "results": [
          "Atom-Powered Robots Run Amok",
          "Example Feed"
        ]
      }
    ],
    badSelector: [
      {
        "results": [],
        "error": "Could not match with that selector"
      }
    ],
    badParse: [
      {
        "results": [],
        "error": "The provided document couldn't be normalised"
      }
    ]
  },
  xml: {
    simple: [
      {
        "results": [
          "Acme Alpha"
        ]
      }
    ],
    // xml2json output for tests/document.xml when no selector is given.
    noSelector: [
      {
        "results": [
          {
            "Order": {
              "Date": "2003/07/04",
              "CustomerId": 123,
              "CustomerName": "Acme Alpha",
              "Item": [
                {
                  "ItemId": 987,
                  "ItemName": "Coupler",
                  "Quantity": 5
                },
                {
                  "ItemId": 654,
                  "ItemName": "Connector",
                  "Quantity": {
                    "unit": 12,
                    "$t": 3
                  }
                },
                {
                  "ItemId": 579,
                  "ItemName": "Clasp",
                  "Quantity": 1
                }
              ]
            }
          }
        ]
      }
    ],
    noType: [
      {
        "results": [
          "Acme Alpha"
        ]
      }
    ],
    badSelector: [
      {
        "results": [],
        "error": "Could not match with that selector"
      }
    ],
    badParse: [
      {
        "results": [],
        "error": "Could not parse XML to JSON"
      }
    ]
  },
  misc: {
    badUrl: [
      {
        "results": [],
        "error": "Document not found"
      }
    ],
    badType: [
      {
        "results": [],
        "error": "Document type not supported"
      }
    ]
  },
  map : {
    simple: [
      {
        "results": {
          "bar": ["css Zen Garden: The Beauty in CSS Design"],
          "foo": ["css Zen Garden"]
        }
      }
    ]
  },
  post: {
    simple: [
      {
        "results": ["was posted"]
      }
    ]
  },
  headers: {
    simple: [
      {
        "results": ["css Zen Garden"],
        "headers": {
          "X-Powered-By": "Noodle testing server"
        }
      }
    ],
    linkHeaders: [
      {
        "results": ["css Zen Garden"],
        "headers": {
          "link": {
            "next": "foo",
            "last": "bar"
          }
        }
      }
    ]
  }
};
--------------------------------------------------------------------------------
/tests/server.js:
--------------------------------------------------------------------------------
1 | var url = require('url'),
2 | fixtures = require('./fixtures');
3 |
// Fixture HTTP server for the suite. GET /html, /json, /feed or /xml
// serves the matching sample document; POST echoes a page whose <h1>
// records whether the form body arrived intact (the fixtures query it
// with selector "h1" and expect "was posted").
require('http').createServer(function (req, res) {
  // First path segment picks which fixture document to serve
  var serve = url.parse(req.url).pathname.split('/')[1];

  if (req.method === 'POST') {
    parsePostData(req, function (data) {
      // Reconstructed markup: the <h1> wrappers were stripped from this
      // listing; fixtures expect selector "h1" -> "was posted"
      var respondWith = (data.foo === 'bar') ? '<h1>was posted</h1>'
                                             : '<h1>test should fail</h1>';
      res.writeHead(200, getResponseHeaders('html'));
      res.end(respondWith);
    });
  } else {
    res.writeHead(200, getResponseHeaders(serve));
    res.end(fixtures.documents[serve]);
  }
})
.listen(8889, function () {
  console.log('Test server temporarily running on port 8889');
});
22 |
// Buffer the request body, then hand the querystring-parsed form
// data to cb once the request has ended.
function parsePostData (req, cb) {
  var chunks = [];

  req.on('data', function (chunk) {
    chunks.push(chunk);
  });

  req.on('end', function () {
    cb(require('querystring').parse(chunks.join('')));
  });
}
34 |
// Build the response headers for a fixture response.
// `serve` is one of 'html', 'json', 'feed', 'xml'.
function getResponseHeaders (serve) {
  // Content type per fixture document
  var ct = {
    'html': 'text/html',
    'json': 'application/json',
    'feed': 'application/atom+xml',
    'xml' : 'text/xml'
  };
  return {
    'Content-type': ct[serve],
    'X-Powered-By': 'Noodle testing server',
    // Link targets reconstructed (angle brackets were stripped from this
    // listing): the fixtures expect the parsed header to be
    // { next: "foo", last: "bar" } per RFC 5988 web linking syntax.
    'Link' : '<foo>; rel="next",<bar>; rel="last"'
  };
}
--------------------------------------------------------------------------------
/tests/tests.js:
--------------------------------------------------------------------------------
1 | var assert = require('assert'),
2 | _ = require('underscore'),
3 | fixtures = require('./fixtures'),
4 | noodle = require('../lib/noodle'),
5 | cache = require('../lib/cache'),
6 | html = require('../lib/types/html'),
7 | json = require('../lib/types/json'),
8 | feed = require('../lib/types/feed'),
9 | xml = require('../lib/types/xml'),
10 | stringify = JSON.stringify;
11 |
// Silence noodle's debug logging for the duration of the suite.
noodle.configure({
  "debug": false
});
15 |
// Returns true when every element is strictly equal to the first
// (vacuously true for empty and single-element arrays).
// Defined non-enumerably so it does not leak into for...in loops over
// arrays elsewhere; the call sites (arr.AllValuesSame()) are unchanged.
// NOTE(review): extending Array.prototype is an anti-pattern in general —
// kept only because the suite's assertions call it as a method.
Object.defineProperty(Array.prototype, 'AllValuesSame', {
  value: function () {
    for (var i = 1; i < this.length; i++) {
      if (this[i] !== this[0]) {
        return false;
      }
    }
    return true;
  },
  writable: true,
  configurable: true
});
26 |
// Duck-type check for Q (v0.8.x) promises, which expose promiseSend.
function isPromise (obj) {
  return Boolean(obj.promiseSend);
}
30 |
31 | // Noodle library
32 |
describe('Noodle', function () {
  describe('noodle.query', function () {
    it('should return a promise', function () {
      // query() must hand back a Q promise even for a junk url
      assert.equal(true, isPromise(noodle.query({ url: 'foo' })));
    });
  });

  describe('fetch()', function () {
    it('should return a promise', function () {
      assert.equal(true, isPromise(noodle.fetch('foo', {})));
    });
  });
});
48 |
49 | // Tests regarding the noodle library's type modules
50 |
// Each type module's query promise should resolve with an array of results.
describe('Types', function () {
  // Shared check: run the query and pass iff results.results is an array.
  function resolvesToArray (query, done) {
    noodle.query(query)
      .then(function (results) {
        if (_.isArray(results.results)) {
          done();
        } else {
          done(new Error('results.results was not an array'));
        }
      });
  }

  describe('noodle.html', function () {
    it('its promise should resolve to an object containing results', function (done) {
      resolvesToArray(fixtures.queries.html.simple, done);
    });
  });

  describe('noodle.json', function () {
    it('promise should resolve to an array', function (done) {
      resolvesToArray(fixtures.queries.json.simple, done);
    });
  });

  describe('noodle.feed', function () {
    it('promise should resolve to an array', function (done) {
      resolvesToArray(fixtures.queries.feed.simple, done);
    });
  });

  describe('noodle.xml', function () {
    it('promise should resolve to an array', function (done) {
      resolvesToArray(fixtures.queries.xml.simple, done);
    });
  });
});
104 |
105 |
106 | // Noodle's cache
107 |
// Placeholder suite: the cache (lib/cache) has no assertions yet.
describe('cache', function () {
  // TODO: cover cache behaviour (e.g. hits, expiry, the "created" timestamp).
});
111 |
112 |
113 | // Noodle query api
114 |
// End-to-end checks of the object query API for every supported document
// type. Most tests also push whether results.results was an array into
// `allArrays`; the final 'consistent response format' suite asserts that no
// test ever observed a non-array response.
describe('Noodle object query API', function () {
  var allArrays = [];

  // Shared happy-path assertion: run `query`, record whether the response
  // results are an array, and pass iff they deep-equal the fixture `answer`.
  function expectMatch (query, answer, done) {
    noodle.query(query)
      .then(function (results) {
        allArrays.push(_.isArray(results.results));
        if (_.isEqual(results.results, answer)) {
          done();
        } else {
          done(new Error('Results and fixtures do not match up.'));
        }
      });
  }

  describe('type: html', function () {
    it('should have accurate result data', function (done) {
      expectMatch(fixtures.queries.html.simple, fixtures.queries.answers.html.simple, done);
    });

    it('should still return full document if no selector is specified', function (done) {
      noodle.query(fixtures.queries.html.noSelector)
        .then(function (results) {
          var expectedHTMLDoc = results.results[0].results;
          allArrays.push(_.isArray(results.results));
          // No exact fixture here: just require a sizeable document string.
          if (typeof expectedHTMLDoc === 'string' && expectedHTMLDoc.length > 1000) {
            done();
          } else {
            done(new Error('Results did not contain full document'));
          }
        });
    });

    it('should still return some data if no extract is specified', function (done) {
      expectMatch(fixtures.queries.html.noExtract, fixtures.queries.answers.html.noExtract, done);
    });

    it('should still return some data if no type is specified', function (done) {
      expectMatch(fixtures.queries.html.noType, fixtures.queries.answers.html.noType, done);
    });

    describe('errors', function () {
      it('should report on a poor selector', function (done) {
        expectMatch(fixtures.queries.html.badSelector, fixtures.queries.answers.html.badSelector, done);
      });

      // NOTE(review): same fixture as the noExtract test above — kept to
      // preserve the original suite's coverage and test count.
      it('should default to selecting text if no extract is supplied', function (done) {
        expectMatch(fixtures.queries.html.noExtract, fixtures.queries.answers.html.noExtract, done);
      });
    });
  });

  describe('type: json', function () {
    before(function () {
      noodle.configure({
        defaultDocumentType: 'json'
      });
    });

    it('should have result data', function (done) {
      expectMatch(fixtures.queries.json.simple, fixtures.queries.answers.json.simple, done);
    });

    it('should still return some data if no selector is specified', function (done) {
      noodle.query(fixtures.queries.json.noSelector)
        .then(function (results) {
          var expectedJSONDoc = results.results[0].results;
          allArrays.push(_.isArray(results.results));
          // Only the shape is checked here, not an exact fixture.
          if (typeof expectedJSONDoc === 'object') {
            done();
          } else {
            done(new Error('Results and fixtures do not match up.'));
          }
        });
    });

    it('should still return some data if no type is specified', function (done) {
      expectMatch(fixtures.queries.json.noType, fixtures.queries.answers.json.noType, done);
    });

    describe('errors', function () {
      it('should report on a poor selector', function (done) {
        expectMatch(fixtures.queries.json.badSelector, fixtures.queries.answers.json.badSelector, done);
      });

      it('should report on a parse error', function (done) {
        expectMatch(fixtures.queries.json.badParse, fixtures.queries.answers.json.badParse, done);
      });
    });
  });

  describe('type: feed', function () {
    before(function () {
      noodle.configure({
        defaultDocumentType: 'feed'
      });
    });

    it('should have result data', function (done) {
      // BUG FIX: this previously compared `_.isEqual(results.results)` with
      // `_.isEqual(fixtures...)`. Both are one-argument calls that return
      // false, so `false === false` made the test pass unconditionally. It
      // now compares results against the fixture directly.
      expectMatch(fixtures.queries.feed.simple, fixtures.queries.answers.feed.simple, done);
    });

    it('should still return some data if no selector is specified', function (done) {
      noodle.query(fixtures.queries.feed.noSelector)
        .then(function (results) {
          allArrays.push(_.isArray(results.results));
          // Compared via JSON strings rather than _.isEqual (kept as-is).
          if (stringify(results.results) === stringify(fixtures.queries.answers.feed.noSelector)) {
            done();
          } else {
            done(new Error('Results and fixtures do not match up.'));
          }
        });
    });

    it('should still return some data if no type is specified', function (done) {
      expectMatch(fixtures.queries.feed.noType, fixtures.queries.answers.feed.noType, done);
    });

    describe('errors', function () {
      it('should report on a poor selector', function (done) {
        expectMatch(fixtures.queries.feed.badSelector, fixtures.queries.answers.feed.badSelector, done);
      });

      it('should report on a parse error', function (done) {
        expectMatch(fixtures.queries.feed.badParse, fixtures.queries.answers.feed.badParse, done);
      });
    });
  });

  describe('type: xml', function () {
    before(function () {
      noodle.configure({
        defaultDocumentType: 'xml'
      });
    });

    it('should have result data', function (done) {
      expectMatch(fixtures.queries.xml.simple, fixtures.queries.answers.xml.simple, done);
    });

    it('should still return some data if no selector is specified', function (done) {
      expectMatch(fixtures.queries.xml.noSelector, fixtures.queries.answers.xml.noSelector, done);
    });

    it('should still return some data if no type is specified', function (done) {
      expectMatch(fixtures.queries.xml.noType, fixtures.queries.answers.xml.noType, done);
    });

    describe('errors', function () {
      it('should report on a poor selector', function (done) {
        expectMatch(fixtures.queries.xml.badSelector, fixtures.queries.answers.xml.badSelector, done);
      });

      it('should report on a parse error', function (done) {
        expectMatch(fixtures.queries.xml.badParse, fixtures.queries.answers.xml.badParse, done);
      });
    });
  });

  describe('generic query error messages', function () {
    // These two deliberately skip the allArrays bookkeeping (as originally).
    it('errors if no url is specified', function (done) {
      noodle.query(fixtures.queries.misc.badUrl)
        .then(function (results) {
          if (_.isEqual(results.results, fixtures.queries.answers.misc.badUrl)) {
            done();
          } else {
            done(new Error('Results and fixtures do not match up.'));
          }
        });
    });

    it('errors if a non-supported type is specified', function (done) {
      noodle.query(fixtures.queries.misc.badType)
        .then(function (results) {
          if (_.isEqual(results.results, fixtures.queries.answers.misc.badType)) {
            done();
          } else {
            done(new Error('Results and fixtures do not match up.'));
          }
        });
    });
  });

  describe('map notation', function () {
    it('result should contain properties as specified in the map as well as data', function (done) {
      expectMatch(fixtures.queries.map.simple, fixtures.queries.answers.map.simple, done);
    });
  });

  describe('post data', function () {
    // No allArrays bookkeeping here (as originally).
    it('should return data from post requests', function (done) {
      noodle.query(fixtures.queries.post.simple)
        .then(function (results) {
          if (_.isEqual(results.results, fixtures.queries.answers.post.simple)) {
            done();
          } else {
            done(new Error('Results and fixtures do not match up.'));
          }
        });
    });
  });

  describe('headers', function () {
    // Verifies both the scraped results and the parsed response headers of
    // the first result against the first entry of the answer fixture.
    function expectResultsAndHeaders (query, answerFixture, done) {
      noodle.query(query)
        .then(function (results) {
          var fix = answerFixture[0];
          if (_.isEqual(results.results[0].results, fix.results) &&
              _.isEqual(results.results[0].headers, fix.headers)) {
            done();
          } else {
            done(new Error('Results and fixtures do not match up.'));
          }
        });
    }

    it('should parse headers', function (done) {
      expectResultsAndHeaders(fixtures.queries.headers.simple, fixtures.queries.answers.headers.simple, done);
    });

    it('should parse link headers', function (done) {
      expectResultsAndHeaders(fixtures.queries.headers.linkHeaders, fixtures.queries.answers.headers.linkHeaders, done);
    });
  });

  describe('multiple queries', function () {
    // `answers[i]` is the answer fixture expected at position i of the
    // response — i.e. responses must come back in the order queried.
    function expectOrder (queries, answers, done) {
      noodle.query(queries)
        .then(function (results) {
          var inOrder = answers.every(function (answer, i) {
            return _.isEqual(results.results[i], answer[0]);
          });
          if (inOrder) {
            done();
          } else {
            done(new Error('Order was not maintained or results/fixtures mismatch'));
          }
        });
    }

    it('(A) the returned order should match the order of the sent query', function (done) {
      expectOrder([
        fixtures.queries.html.simple,
        fixtures.queries.json.simple,
        fixtures.queries.feed.simple,
        fixtures.queries.xml.simple
      ], [
        fixtures.queries.answers.html.simple,
        fixtures.queries.answers.json.simple,
        fixtures.queries.answers.feed.simple,
        fixtures.queries.answers.xml.simple
      ], done);
    });

    it('(B) the returned order should match the order of the sent query', function (done) {
      expectOrder([
        fixtures.queries.json.simple,
        fixtures.queries.html.simple,
        fixtures.queries.xml.simple,
        fixtures.queries.feed.simple
      ], [
        fixtures.queries.answers.json.simple,
        fixtures.queries.answers.html.simple,
        fixtures.queries.answers.xml.simple,
        fixtures.queries.answers.feed.simple
      ], done);
    });

    // NOTE(review): (C) repeats (A)'s ordering — kept to preserve the
    // original suite's coverage and test count.
    it('(C) the returned order should match the order of the sent query', function (done) {
      expectOrder([
        fixtures.queries.html.simple,
        fixtures.queries.json.simple,
        fixtures.queries.feed.simple,
        fixtures.queries.xml.simple
      ], [
        fixtures.queries.answers.html.simple,
        fixtures.queries.answers.json.simple,
        fixtures.queries.answers.feed.simple,
        fixtures.queries.answers.xml.simple
      ], done);
    });
  });

  describe('consistent response format', function () {
    it('should return all responses as arrays', function () {
      // Every isArray observation pushed by the earlier tests must be true.
      assert.equal(true, allArrays.indexOf(false) === -1);
    });

    it('should always return the "created" property from cache', function (done) {
      noodle.query(fixtures.queries.html.withCache)
        .then(function (results) {
          if (results.results[0].created) {
            done();
          } else {
            done(new Error('"created" property wasn\'t included with cached response.'));
          }
        });
    });
  });
});
--------------------------------------------------------------------------------