├── .gitignore ├── lib ├── index.js ├── eq.js └── j-distiller.js ├── package.json ├── examples └── wikipedia.js ├── test └── j-distiller-test.js └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | npm-debug.log 4 | -------------------------------------------------------------------------------- /lib/index.js: -------------------------------------------------------------------------------- 1 | exports.jDistiller = require('./j-distiller').jDistiller; -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jdistiller", 3 | "directories": { 4 | "lib": "./lib" 5 | }, 6 | "scripts": { 7 | "test": "micro-test" 8 | }, 9 | "main": "./lib/index.js", 10 | "version": "2.0.0", 11 | "author": "Ben Coe ", 12 | "engines": [ 13 | "node" 14 | ], 15 | "description": "A page scraping DSL for extracting structured information from unstructured XHTML, built on Node.js and jQuery.", 16 | "keywords": [ 17 | "crawler", 18 | "jQuery" 19 | ], 20 | "repository": { 21 | "type": "git", 22 | "url": "git://github.com/bcoe/jDistiller.git" 23 | }, 24 | "dependencies": { 25 | "cheerio": "^0.18.0", 26 | "request": "^2.51.0", 27 | "sexy-args": ">=1.1.5" 28 | }, 29 | "devDependencies": { 30 | "micro-test": "^1.0.0" 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /lib/eq.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var regex = /^(.*?)(?:(?:\:(first))|(?:\:(last))|(?:\:eq\((\d+)\)))(.*)/; 4 | 5 | var cheerioEq = function ($, selector) { 6 | var parts = []; 7 | var match; 8 | 9 | while (match = selector.match(regex)) { 10 | parts.push(match[1]); 11 | if (match[2]) parts.push(0); 12 | else if (match[3]) parts.push(-1); 13 | else parts.push(parseInt(match[4], 
10)); 14 | selector = match[5].trim(); 15 | } 16 | parts.push(selector); 17 | 18 | var cursor = $(parts.shift()); 19 | parts 20 | .filter(function (selector) { 21 | return selector !== ''; 22 | }) 23 | .forEach(function (selector) { 24 | cursor = typeof selector === 'number' ? cursor.eq(selector) : cursor.find(selector); 25 | }); 26 | 27 | return cursor; 28 | }; 29 | 30 | // wrap cheerio, exposing the new API. 31 | cheerioEq.wrap = function($) { 32 | var original = $.load; 33 | 34 | $.load = function(body) { 35 | var parsed = original(body); 36 | 37 | return function(selector) { 38 | return cheerioEq(parsed, selector); 39 | } 40 | } 41 | 42 | return $; 43 | } 44 | 45 | module.exports = cheerioEq; 46 | -------------------------------------------------------------------------------- /examples/wikipedia.js: -------------------------------------------------------------------------------- 1 | var jDistiller = require('../lib').jDistiller, 2 | fs = require('fs'); 3 | 4 | new jDistiller() 5 | .set('title', '#firstHeading') 6 | .set('links', '#bodyContent p a', function(element, prev) { 7 | var href = element.attr('href'), 8 | key = href.replace('/wiki/', ''); 9 | 10 | if ( key === href) return; 11 | 12 | element.replaceWith('[[' + key + ']]'); 13 | 14 | return [key, { 15 | title: element.attr('title'), 16 | text: prev[key] ? prev[key].text : element.text(), 17 | occurrences: prev[key] ? prev[key].occurrences + 1 : 1 18 | }] 19 | }) 20 | .set('sections', '#bodyContent p,#firstHeading,h2,h3,img', function(element, prev) { 21 | var images = []; 22 | 23 | if ( element.is('h1') || element.is('h2') ) { 24 | this.heading = element.text().trim(); 25 | return 26 | } 27 | 28 | return [this.heading, { 29 | text: prev[this.heading] ? prev[this.heading].text + element.text() : element.text(), 30 | images: element.is('img') && element.attr('width') > 50 ? 
[element.attr('src').replace('//', 'http://')] : [] 31 | }]; 32 | }) 33 | .distill('http://en.wikipedia.org/wiki/Dog', function(err, distilledPage) { 34 | console.log(JSON.stringify(distilledPage.sections)); 35 | }); 36 | -------------------------------------------------------------------------------- /lib/j-distiller.js: -------------------------------------------------------------------------------- 1 | (function(require) { 2 | 3 | var eq = require('./eq'), 4 | cheerio = eq.wrap(require('cheerio')), 5 | request = require('request'), 6 | sexy = require('sexy-args'); 7 | 8 | jDistiller = function(opts) { 9 | sexy.args([this, 'object1'], { 10 | 'object1': { 11 | request: request, 12 | dsl: {} 13 | } 14 | }, function() { 15 | sexy.extend(this, opts); 16 | }); 17 | } 18 | 19 | jDistiller.prototype.set = function(key, selector, munger) { 20 | 21 | this.dsl[key] = function(element, prev) { 22 | if (munger) { 23 | //this.dsl[key] is executed with a special scope, we mainain this by using another .call(this). 24 | return munger.call(this, element, prev); 25 | } else { 26 | 27 | // If no munger is provided combine 28 | // all the text for the elments that match 29 | // the selector. 
30 | if (prev.content) { 31 | prev.content += ' '; 32 | } else { 33 | prev.content = ''; 34 | } 35 | 36 | prev.content += element.text().trim(); 37 | return prev.content; 38 | } 39 | }; 40 | this.dsl[key].selector = selector; 41 | 42 | return this; 43 | }; 44 | 45 | jDistiller.prototype.distill = function(url, callback) { 46 | if (this._isURL(url)) { 47 | this.distillURL(url, callback); 48 | } 49 | else if (typeof Buffer !== 'undefined' && Buffer.isBuffer(url)) { 50 | this.distillBuffer(url, callback); 51 | } 52 | else if (typeof url === 'string') { 53 | this.distillString(url, callback); 54 | } 55 | else if (typeof url === 'object') { 56 | this.distillJQuery(url, callback); 57 | } 58 | }; 59 | 60 | jDistiller.prototype._isURL = function(string) { 61 | if (typeof string !== 'string') return false; 62 | 63 | var match = string.match(/https?:/); 64 | 65 | if (match && match.index === 0) { 66 | return true; 67 | } 68 | }; 69 | 70 | jDistiller.prototype.distillURL = function(url, callback) { 71 | var _this = this; 72 | 73 | this.request({ 74 | url: url, 75 | followAllRedirects: true, 76 | encoding: 'binary' 77 | }, function(err, res, page) { 78 | 79 | if (!res) { 80 | err = new Error('empty response'); 81 | } else if (res.statusCode < 200 || res.statusCode >= 300) { 82 | err = new Error('http status ', res.statusCode) 83 | } 84 | 85 | if (err) { 86 | callback(err, null); 87 | return; 88 | } 89 | 90 | _this._applyDSL(_this._getJQueryObject(page), callback); 91 | }); 92 | }; 93 | 94 | jDistiller.prototype.distillBuffer = function(url, callback) { 95 | this._applyDSL(this._getJQueryObject(url.toString(), callback), callback); 96 | }; 97 | 98 | jDistiller.prototype.distillString = function(url, callback) { 99 | this._applyDSL(this._getJQueryObject(url, callback), callback); 100 | }; 101 | 102 | jDistiller.prototype.distillJQuery = function(url, callback) { 103 | this._applyDSL(url, callback); 104 | }; 105 | 106 | jDistiller.prototype._getJQueryObject = function(rawPage, 
callback) { 107 | try { 108 | return cheerio.load(rawPage); 109 | } catch (e) { 110 | callback(e, null); 111 | } 112 | }; 113 | 114 | jDistiller.prototype._applyDSL = function(page, callback) { 115 | var distilledPage = {}, 116 | _this = this; 117 | 118 | try { 119 | Object.keys(this.dsl).forEach(function(k) { 120 | distilledPage[k] = _this._function(k, _this.dsl[k], page, distilledPage); 121 | }); 122 | 123 | callback( null, distilledPage ); 124 | 125 | } catch (e) { 126 | callback(e, null); 127 | } 128 | }; 129 | 130 | jDistiller.prototype._function = function(key, func, page, distilledPage) { 131 | var scope = { 132 | distilledSoFar: function() { 133 | return distilledPage; 134 | }, 135 | ___array: [], 136 | ___object: {}, 137 | ___value: null 138 | }, 139 | elements = page(func.selector); 140 | 141 | if (!elements.length) { 142 | scope.___value = ''; // Default to returning an empty string if no elements mat the selector. 143 | } else { 144 | for (var i = 0, element; (element = elements[i]) != null; i++) { 145 | scope.___currentElement = cheerio(element); 146 | this._handleReturn( key, func.call(scope, scope.___currentElement, scope.___object), scope ); 147 | } 148 | } 149 | 150 | if (typeof scope.___value === 'string' || scope.___value) { 151 | return scope.___value; 152 | } 153 | 154 | else if (scope.___array.length > 0) { 155 | return scope.___array; 156 | } 157 | 158 | return scope.___object; 159 | }; 160 | 161 | jDistiller.prototype._handleReturn = function( key, value, scope ) { 162 | // Handle the special case of setting variable named keys on an object value. 
163 | if (Array.isArray(value) && value.length === 2 && typeof value[0] === 'string' && typeof value[1] === 'object') { 164 | if (!scope.___object[value[0]]) { 165 | scope.___object[value[0]] = value[1]; 166 | } else { 167 | scope.___object[value[0]] = this._smartMerge(scope.___object[value[0]], value[1]); 168 | } 169 | } 170 | 171 | else if (Array.isArray(value)) { 172 | scope.___array = scope.___array.concat(value); 173 | } 174 | 175 | else if (typeof(value) === 'object') { 176 | scope.___object = this._smartMerge(scope.___object, value); 177 | } 178 | 179 | else if (value) { 180 | scope.___value = value; 181 | } 182 | }; 183 | 184 | jDistiller.prototype._smartMerge = function(v1, v2) { 185 | var _this = this; 186 | 187 | if (Array.isArray(v1)) { 188 | return v1.concat(v2); 189 | } 190 | 191 | else if (typeof v1 === 'object') { 192 | Object.keys(v2).forEach(function(k) { 193 | v1[k] = _this._smartMerge(v1[k], v2[k]); 194 | }); 195 | return v1; 196 | } 197 | 198 | return v2; 199 | }; 200 | 201 | if (typeof exports !== 'undefined') exports.jDistiller = jDistiller; 202 | })(typeof window === 'undefined' ? 
require : function() {}); 203 | -------------------------------------------------------------------------------- /test/j-distiller-test.js: -------------------------------------------------------------------------------- 1 | var equal = require('assert').equal, 2 | eq = require('../lib/eq'), 3 | fs = require('fs'), 4 | jDistiller = require('../lib').jDistiller, 5 | cheerio = eq.wrap(require('cheerio')); 6 | 7 | var dogArticle = fs.readFileSync('./fixtures/dog.html').toString(), 8 | mockRequest = function(params, callback) { 9 | callback(null, {statusCode: 200}, dogArticle); 10 | }, 11 | page = cheerio.load(dogArticle); 12 | 13 | exports.tests = { 14 | 'set() set with a DOM selector and no closure sets a value on the distlled page equal to text() of the selector': function(finished, prefix) { 15 | new jDistiller({request: mockRequest}) 16 | .set('title', '#firstHeading span') 17 | .set('firstHeadline', '.mw-headline:first') 18 | .distill('http://www.example.com', function(err, distilledPage) { 19 | equal(distilledPage.title, page('#firstHeading span').text(), prefix + ' title was not parsed.'); 20 | equal(distilledPage.firstHeadline, page('.mw-headline:first').text(), prefix + ' first heading was not parsed.'); 21 | finished(); 22 | }); 23 | }, 24 | 'set() with a DOM selector that matches multiple elements should combine text': function(finished, prefix) { 25 | new jDistiller({request: mockRequest}) 26 | .set('h2Text', 'h2') 27 | .distill('http://www.example.com', function(err, distilledPage) { 28 | equal(distilledPage.h2Text.indexOf('Contents') > -1, true); 29 | equal(distilledPage.h2Text.indexOf('Etymology and related terminology') > -1, true); 30 | finished(); 31 | }); 32 | }, 33 | 'set() when an element is not found it should default to an empty string': function(finished, prefix) { 34 | new jDistiller({request: mockRequest}) 35 | .set('h2Text', '.banana') 36 | .distill('http://www.example.com', function(err, distilledPage) { 37 | equal('', 
distilledPage.h2Text); 38 | finished(); 39 | }); 40 | }, 41 | 'set() with a closure returning strings sets a string value on the distilled page': function(finished, prefix) { 42 | new jDistiller({request: mockRequest}) 43 | .set('headline3', '.mw-headline', function(element) { 44 | this.count = this.count || 0; 45 | this.count ++; 46 | // Grab the third headline. 47 | if (this.count === 3) { 48 | return element.text().trim(); 49 | } 50 | }) 51 | .distillURL('http://www.example.com', function(err, distilledPage) { 52 | equal(distilledPage.headline3, page('.mw-headline').eq(2).text(), prefix + ' third headline not found.'); 53 | finished(); 54 | }); 55 | }, 56 | 'set() with a closure returning an object merges object keys': function(finished, prefix) { 57 | new jDistiller({request: mockRequest}) 58 | .set('headlines', '.mw-headline', function(element) { 59 | this.count = this.count || 0; 60 | this.count ++; 61 | if (this.count === 2) { 62 | return { 63 | 'second_heading': element.text().trim() 64 | } 65 | } 66 | if (this.count === 3) { 67 | return { 68 | 'third_heading': element.text().trim() 69 | } 70 | } 71 | }) 72 | .distill('http://www.example.com', function(err, distilledPage) { 73 | equal(distilledPage.headlines['second_heading'], page('.mw-headline').eq(1).text(), prefix + ' third headline not found.'); 74 | equal(distilledPage.headlines['third_heading'], page('.mw-headline').eq(2).text(), prefix + ' third headline not found.'); 75 | finished(); 76 | }); 77 | }, 78 | 'set() with a closure returning an array merges arrays together and sets an array value on the distilled page': function(finished, prefix) { 79 | new jDistiller({request: mockRequest}) 80 | .set('headlines', '.mw-headline', function(element) { 81 | return [element.text().trim()]; 82 | }) 83 | .distill('http://www.example.com', function(err, distilledPage) { 84 | equal(distilledPage.headlines.length, page('.mw-headline').length, prefix + ' did not parse all headlines.'); 85 | finished(); 86 | }); 
87 | }, 88 | 'set() with a closure returning an object merges objects together and sets an object value on the distilled page.': function(finished, prefix) { 89 | new jDistiller({request: mockRequest}) 90 | .set('links', '#bodyContent p a', function(element) { 91 | return [element.attr('href'), { 92 | title: element.attr('title'), 93 | href: element.attr('href') 94 | }] 95 | }) 96 | .distill('http://www.example.com', function(err, distilledPage) { 97 | equal(Object.keys(distilledPage.links).length, 482, prefix + ' did not pull all links from page.'); 98 | finished(); 99 | }); 100 | }, 101 | 'the previous object manipulated is passed into the closure as a parameter': function(finished, prefix) { 102 | new jDistiller({request: mockRequest}) 103 | .set('links', '#bodyContent p a', function(element, prev) { 104 | var key = element.attr('href'); 105 | return [key, { 106 | title: element.attr('title'), 107 | href: key, 108 | occurrences: prev[key] ? prev[key].occurrences + 1 : 1 109 | }] 110 | }) 111 | .distill('http://www.example.com', function(err, distilledPage) { 112 | var linkCount = 0; 113 | Object.keys(distilledPage.links).forEach(function(link) { 114 | linkCount += distilledPage.links[link].occurrences; 115 | }); 116 | equal(linkCount, page('#bodyContent p a').length, prefix + ' previous object was not set.'); 117 | finished(); 118 | }); 119 | }, 120 | 'distill() method accepts a buffer rather than a url': function(finished, prefix) { 121 | new jDistiller({request: mockRequest}) 122 | .set('title', '#firstHeading span') 123 | .set('firstHeadline', '.mw-headline:first') 124 | .distill(fs.readFileSync('./fixtures/dog.html'), function(err, distilledPage) { 125 | equal(distilledPage.title, page('#firstHeading span').text(), prefix + ' title was not parsed.'); 126 | equal(distilledPage.firstHeadline, page('.mw-headline:first').text(), prefix + ' first heading was not parsed.'); 127 | finished(); 128 | }); 129 | }, 130 | 'distill() method accepts a string rather than a 
url': function(finished, prefix) { 131 | new jDistiller({request: mockRequest}) 132 | .set('title', '.title') 133 | .distill('

Hello World!

', function(err, distilledPage) { 134 | equal(distilledPage.title, 'Hello World!', prefix + ' title was not parsed.'); 135 | finished(); 136 | }); 137 | }, 138 | 'distill() method accepts a jQuery object rather than a url': function(finished, prefix) { 139 | new jDistiller({request: mockRequest}) 140 | .set('title', '.title') 141 | .distillJQuery(cheerio.load('

Hello World!

'), function(err, distilledPage) { 142 | equal(distilledPage.title, 'Hello World!', prefix + ' title was not parsed.'); 143 | finished(); 144 | }); 145 | }, 146 | 'this.distilledSoFar() returns the partially distilled page': function(finished, prefix) { 147 | new jDistiller({request: mockRequest}) 148 | .set('title', '#firstHeading span') 149 | .set('firstHeadline', '.mw-headline:first', function() { 150 | equal(this.distilledSoFar().title, page('#firstHeading span').text(), prefix + ' title was not parsed.'); 151 | }) 152 | .distill('http://www.example.com', function(err, distilledPage) { 153 | if (err) throw err; 154 | finished(); 155 | }); 156 | } 157 | }; 158 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | jDistiller 2 | ========= 3 | 4 | **Author:** [@benjamincoe](https://twitter.com/#/benjamincoe) 5 | 6 | Over my past couple years in the industry, there have been several times where I need to scrape structured information from (relatively) unstructured XHTML websites. 7 | 8 | My approach to doing this has gradually evolved to include the following technologies: 9 | 10 | * [Node.js](http://nodejs.org/) 11 | * [jQuery](http://jquery.com/) 12 | * [Request](https://github.com/mikeal/request) 13 | 14 | I was starting to notice a lot of code duplication in my scraping scripts, enter jDistiller: 15 | 16 | What is jDistiller? 17 | ------------------ 18 | 19 | * jDistiller is a simple and powerful DSL for scraping structured information from XHTML websites. 20 | * it is built on jQuery and Node.js. 21 | * it grows out of my experiences, having built several one-off page scrapers. 
22 | 23 | Installation 24 | ----------- 25 | 26 | ```bash 27 | npm install jdistiller 28 | ``` 29 | 30 | The DSL 31 | ------- 32 | 33 | * first you create an instance of the __jDistiller__ object: 34 | 35 | ```javascript 36 | var jDistiller = require('jdistiller').jDistiller; 37 | new jDistiller() 38 | ``` 39 | 40 | * the __set()__ method is used to specify key/css-selector pairs to scrape data from: 41 | 42 | ```javascript 43 | new jDistiller() 44 | .set('headline', '#article h1.articleHeadline') 45 | .set('firstParagraph', '#article .articleBody p:eq(0)'); 46 | ``` 47 | 48 | * when the __distill()__ method is called with a URL as input, a JavaScript object populated with the scraped data is passed to the callback. 49 | 50 | **Simple Example (New York Times)** 51 | 52 | ```javascript 53 | var jDistiller = require('jdistiller').jDistiller; 54 | 55 | new jDistiller() 56 | .set('headline', '#article h1.articleHeadline') 57 | .set('firstParagraph', '#article .articleBody p:eq(0)') 58 | .distill('http://www.nytimes.com/2012/09/09/us/politics/obama-and-romney-battle-for-votes-in-2-swing-states.html?_r=1&hp', function(err, distilledPage) { 59 | console.log(JSON.stringify(distilledPage)) 60 | }); 61 | ``` 62 | 63 | **Output** 64 | 65 | ```json 66 | {"headline":"Obama Tries to Turn Focus to Medicare From Jobs Figures","firstParagraph":"SEMINOLE, Fla. — President Obama on Saturday began hammering away at the Republican ticket’s plans for Medicare, using a campaign swing through Florida, with its large number of retired and elderly voters, to try to turn the page from anemic employment growth, his biggest weakness, to entitlements, a Democratic strength."} 67 | ``` 68 | 69 | An Optional Closure can be Provided for Processing the Value 70 | -------------------------- 71 | 72 | A closure can optionally be provided as the third parameter for the __set()__ method. 
73 | 74 | If a closure is given, the return value of the closure will be set as the key's value, rather than the text value of the selector. 75 | 76 | **DSL Using an Optional Data Processing Closure** 77 | 78 | ```javascript 79 | var jDistiller = require('jdistiller').jDistiller; 80 | 81 | new jDistiller() 82 | .set('headline', '#article h1.articleHeadline') 83 | .set('firstParagraph', '#article .articleBody p:eq(0)') 84 | .set('image', '#article .articleBody .articleSpanImage img', function(element, prev) { 85 | return element.attr('src') 86 | }) 87 | .distill('http://www.nytimes.com/2012/09/09/us/politics/obama-and-romney-battle-for-votes-in-2-swing-states.html?_r=1&hp', function(err, distilledPage) { 88 | console.log(JSON.stringify(distilledPage)) 89 | }); 90 | ``` 91 | 92 | **Output** 93 | 94 | ```json 95 | {"headline":"Obama Tries to Turn Focus to Medicare From Jobs Figures","firstParagraph":"SEMINOLE, Fla. — President Obama on Saturday began hammering away at the Republican ticket’s plans for Medicare, using a campaign swing through Florida, with its large number of retired and elderly voters, to try to turn the page from anemic employment growth, his biggest weakness, to entitlements, a Democratic strength.","image":"http://graphics8.nytimes.com/images/2012/09/09/us/JP-CANDIDATE-1/JP-CANDIDATE-1-articleLarge.jpg"} 96 | ``` 97 | The closure will be passed the following values: 98 | 99 | * **element:** a jQuery element matching the CSS selector specified in __set()__. 100 | * **prev:** if multiple elements on the page match the selector, the closure will be executed once for each. __prev__ can be used to interact with the object created by previous executions of the closure. As an example, we might want to increment a counter if the same link occurs multiple times on the same page. 101 | * **this:** the state is shared between multiple executions of the same closure (see __examples/wikipedia.js__ to get an idea of why this is useful). 
102 | 103 | Closure Return Types 104 | ------------------- 105 | 106 | * **strings:** the last string returned by the closure will be used as the value. 107 | * **numbers:** the last number returned by the closure will be used as the value. 108 | * **arrays:** when an array is returned, it will be merged with all other arrays returned for the given key. The final merged array will be set as the value. 109 | * **objects:** when an object is returned, the object will be merged with all other objects returned. The final object will be used as the value. 110 | * **key/object-pair:** this special return type, a two-element array of the form `[key, object]`, allows the value to be populated with an object that has dynamically generated key names. 111 | 112 | Some Examples 113 | ------------- 114 | 115 | **Array Merging Example** 116 | 117 | ```javascript 118 | var jDistiller = require('jdistiller').jDistiller; 119 | 120 | new jDistiller() 121 | .set('paragraphs', '#article .articleBody p', function(element) { 122 | return [element.text()] 123 | }) 124 | .distill('http://www.nytimes.com/2012/09/09/us/politics/obama-and-romney-battle-for-votes-in-2-swing-states.html?_r=1&hp', function(err, distilledPage) { 125 | console.log(JSON.stringify(distilledPage)) 126 | }); 127 | ``` 128 | 129 | **Output** 130 | 131 | ```json 132 | {"paragraphs": ["SEMINOLE, Fla. 
— President Obama on Saturday began hammering away at the Republican ticket’s...", "Kicking off a two-day bus tour through...", ...]} 133 | ``` 134 | 135 | **Object Merging Example** 136 | 137 | ```javascript 138 | var jDistiller = require('jdistiller').jDistiller; 139 | 140 | new jDistiller() 141 | .set('headlines', '.mw-headline', function(element) { 142 | this.count = this.count || 0; 143 | this.count ++; 144 | if (this.count === 2) { 145 | return { 146 | 'second_heading': element.text().trim() 147 | } 148 | } 149 | if (this.count === 3) { 150 | return { 151 | 'third_heading': element.text().trim() 152 | } 153 | } 154 | }) 155 | .distill('http://en.wikipedia.org/wiki/Dog', function(err, distilledPage) { 156 | console.log(JSON.stringify(distilledPage)); 157 | }); 158 | ``` 159 | 160 | **Output** 161 | 162 | ```json 163 | {"headlines":{"second_heading":"Taxonomy","third_heading":"History and evolution"}} 164 | ``` 165 | 166 | **Key/Object-Pair Example** 167 | 168 | ```javascript 169 | var jDistiller = require('jdistiller').jDistiller; 170 | 171 | new jDistiller() 172 | .set('links', '#bodyContent p a', function(element, prev) { 173 | var key = element.attr('href'); 174 | return [key, { 175 | title: element.attr('title'), 176 | href: key, 177 | occurrences: prev[key] ? 
prev[key].occurrences + 1 : 1 178 | }] 179 | }) 180 | .distill('http://en.wikipedia.org/wiki/Dog', function(err, distilledPage) { 181 | console.log(JSON.stringify(distilledPage)); 182 | }); 183 | ``` 184 | 185 | **Output** 186 | 187 | ```json 188 | {"links":{"#cite_note-MSW3_Lupus-1":{"title":"","href":"#cite_note-MSW3_Lupus-1","occurrences":1},"#cite_note-ADW-2":{"title":"","href":"#cite_note-ADW-2","occurrences":1},"/wiki/Gray_wolf_subspecies":{"title":"Gray wolf subspecies","href":"/wiki/Gray_wolf_subspecies","occurrences":1},"/wiki/Gray_wolf":{"title":"Gray wolf","href":"/wiki/Gray_wolf","occurrences":1},"/wiki/Canidae":{"title":"Canidae","href":"/wiki/Canidae","occurrences":1}}} 189 | ``` 190 | 191 | That's About It 192 | ---------- 193 | 194 | I'm excited about jDistiller, I think it solves the scraping problem in an elegant way. 195 | 196 | Don't be shy with your feedback, and please contribute. 197 | 198 | -- Ben [@benjamincoe](https://twitter.com/#/benjamincoe) 199 | --------------------------------------------------------------------------------