├── .gitignore
├── examples
│   ├── httpstatuses.js
│   └── old
│       ├── tpb.js
│       ├── github.js
│       └── hackernews.js
├── package.json
├── test
│   ├── test.js
│   └── olindining.js
├── README.md
└── skim.js

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
node_modules

--------------------------------------------------------------------------------
/examples/httpstatuses.js:
--------------------------------------------------------------------------------
var skim = require('..')
  , rem = require('rem');

// Scrape the list of HTTP status codes from httpstatus.es.
rem.stream('http://httpstatus.es').get().pipe(skim({
  $query: "#statuses a",
  $each: {
    "number": "(text \\d+)",
    "href": "(attr href)"
  }
}, function (json) {
  console.log(json);
}));

--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "skim",
  "version": "0.1.1",
  "description": "Scrape websites simply. Streaming HTML parser combined with a flexible HTTP client.",
  "main": "skim.js",
  "dependencies": {
    "cssax": "0.0.6",
    "read": "~1.0.4",
    "async": "~0.1.22"
  },
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/tcr/scrapi.git"
  },
  "author": "Tim Cameron Ryan",
  "license": "MIT"
}

--------------------------------------------------------------------------------
/test/test.js:
--------------------------------------------------------------------------------
// NB: this test exercises the older scrapi interface (see examples/old),
// not the skim.js stream API.
var scrapi = require('../scrapi');

// Define a specification for scraping Hacker News.

var spec = {
  base: 'http://news.ycombinator.com/',
  spec: {
    $query: 'table table tr:nth-child(3n+1)',
    $each: {
      title: '(text) a',
      link: '(attr href) a',
      user: '(text) + tr a[href^=user]',
      comments: '(text ^\\d+) + tr a[href^=item]',
      id: '(attr href \\d+$) + tr a[href^=item]'
    },
    $filter: 'id'
  }
};

scrapi(spec, '', function (ret) {
  console.log(ret);
});

--------------------------------------------------------------------------------
/test/olindining.js:
--------------------------------------------------------------------------------
// NB: this test exercises the older scrapi interface (see examples/old),
// not the skim.js stream API.
var scrapi = require('../scrapi');

// Define a specification for scraping the Olin Dining menu.

var spec = {
  base: 'http://olindining.com/WeeklyMenu4_002.htm',
  spec: {
    $query: 'a[name=monday] + table.dayinner tr.lun',
    $each: {
      section: '(text \\S.*) td.station',
      item: '(text \\S.*) td.menuitem'
    },
    $filter: 'section'
  }
};

scrapi(spec, '', function (ret) {
  // Group menu items under the most recently seen section heading.
  var food = {};
  var section = null;
  ret.forEach(function (item) {
    section = item.section || section;
    (food[section] || (food[section] = [])).push(item.item);
  });
  console.log(food);
});

--------------------------------------------------------------------------------
/examples/old/tpb.js:
--------------------------------------------------------------------------------
var scrapi = require('..');
var url = require('url');

// Define a specification for scraping The Pirate Bay.

var manifest = {
  base: 'http://thepiratebay.se',
  spec: {
    '*': {
      torrents: {
        $query: '#searchResult tr',
        $each: {
          title: '(text [^\\t\\n]+) .detName',
          magnet: '(attr href) a[href^=magnet]'
        },
        $filter: 'magnet'
      }
    }
  }
};

var tpb = scrapi(manifest);

tpb('recent/0').get(function (err, json) {
  json.torrents.forEach(function (torrent) {
    var trackers = url.parse(torrent.magnet, true).query.tr;
    console.log(torrent.title, trackers);
  });
});

--------------------------------------------------------------------------------
/examples/old/github.js:
--------------------------------------------------------------------------------
var scrapi = require('..');

// Define a specification for scraping GitHub commits.

var manifest = {
  base: 'https://github.com',
  spec: {
    '*': {
      changes: {
        $query: '#toc p.explain',
        $value: '(text ^\\d+) span + strong'
      },
      added: {
        $query: '#toc p.explain',
        $value: '(text ^\\d+) span + strong + strong'
      },
      deleted: {
        $query: '#toc p.explain',
        $value: '(text ^\\d+) span + strong + strong + strong'
      }
    }
  }
};

var github = scrapi(manifest);

github('tcr/scrapi/commit/ba192e77a0797e64b6dc82542b2a4806c4d7db8e').get(function (err, json) {
  console.log(json);
});

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# skim

Parse a stream of HTML, using CSS selectors to extract data. skim never builds a DOM, so it stays fast and light on memory.

## Example

Define your scraping parameters in a JSON manifest:

```javascript
var skim = require('skim')
  , rem = require('rem');

rem.stream("http://news.ycombinator.com/").pipe(skim({
  "$query": "td.title ~ td ~ td.title > a",
  "$each": {
    "title": "(text)",
    "link": "(attr href)"
  }
}, function (json) {
  console.log(json);
}));
```

Result:

```javascript
[ { link: 'https://www.hackerschool.com/blog/5-learning-c-with-gdb',
    title: 'Learning C with gdb' },
  { link: 'http://blogs.scientificamerican.com/guest-blog/2012/08/27/the-hidden-truths-about-calories/',
    title: 'Hidden Truths about Calories' },
  { link: 'http://cantada.ca/',
    title: 'Can\'tada - Tracking the stuff you can\'t use in Canada' },
  { link: 'https://blog.gregbrockman.com/2012/08/system-design-stripe-capture-the-flag/',
    title: 'Securing Stripe\'s Capture the Flag' },
  { link: 'http://swanson.github.com/blog/2012/08/27/move-your-feet.html',
    title: 'Move your feet' },
  ... ]
```
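
Specs can also nest named fields, pull single strings with `$value`, and drop incomplete matches with `$filter`. A sketch adapted from the spec in `examples/old/hackernews.js` (the selectors assume Hacker News's markup at the time of writing):

```javascript
rem.stream("http://news.ycombinator.com/").pipe(skim({
  "stories": {
    "$query": "table table tr:nth-child(3n+1)",
    "$each": {
      "title": "(text) td.title a",                 // element text
      "link": "(attr href) td.title a",             // attribute value
      "id": "(attr href \\d+$) + tr a[href^=item]"  // attribute, reduced by a regex
    },
    "$filter": "id"  // drop rows that didn't yield an id
  },
  "next": {
    "$query": "table table td:nth-child(1) + td.title",
    "$value": "(attr href) a"  // a single string rather than an array
  }
}, function (json) {
  console.log(json.stories.length, 'stories; next page:', json.next);
}));
```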
--------------------------------------------------------------------------------
/examples/old/hackernews.js:
--------------------------------------------------------------------------------
var scrapi = require('..');

// Define a specification for scraping Hacker News.

var manifest = {
  base: 'http://news.ycombinator.com',
  spec: {
    '/ submitted': {
      stories: {
        $query: 'table table tr:nth-child(3n+1)',
        $each: {
          title: '(text) td.title a',
          link: '(attr href) td.title a',
          user: '(text) + tr a[href^=user]',
          comments_count: '(text ^\\d+) + tr a[href^=item]',
          id: '(attr href \\d+$) + tr a[href^=item]',
          points: '(text ^\\d+) + tr td.subtext',
          age: '(text \\d+ \\S+ ago) + tr td.subtext'
        },
        $filter: 'id'
      },
      next: {
        $query: 'table table td:nth-child(1) + td.title',
        $value: '(attr href) a'
      }
    },
    'item': {
      story: '(html) table table:nth-child(1) tr:nth-child(4) td',
      comments: {
        $query: 'tr tr tr',
        $each: {
          user: '(text) .comhead a',
          age: '(text \\b\\d+ \\S+ \\S+) .comhead',
          id: '(attr href \\d+$) .comhead a:nth-child(2)',
          indent_width: '(attr width) > td:nth-child(1) img',
          text: '(html) .comment font',
          color: '(attr color) .comment font'
        }
      }
    },
    'user': {
      $query: 'table table',
      name: '(text) tr:nth-child(1) td:nth-child(2)',
      created: '(text) tr:nth-child(2) td:nth-child(2)',
      karma: '(text) tr:nth-child(3) td:nth-child(2)',
      average_karma: '(text) tr:nth-child(4) td:nth-child(2)',
      about: '(text) tr:nth-child(5) td:nth-child(2)'
    }
  }
};

// Build a client from the manifest and list front-page stories.
var hnews = scrapi(manifest);

hnews('/').get(function (err, json) {
  json.stories.forEach(function (story, i) {
    console.log('[' + (i + 1) + ']', story.title);
    console.log(JSON.stringify(story));
  });
});

--------------------------------------------------------------------------------
/skim.js:
--------------------------------------------------------------------------------
var cssax = require('cssax');

// Utilities

// Remove all tags from an HTML fragment.
function stripHTML (html) {
  return html.replace(/<.+?>/g, '');
}

// Append selector b (with its leading '(...)' value expression stripped)
// to each comma-separated alternative of selector a.
function combineQueries (a, b) {
  return (a.replace(/(?=,)|$/g, ' ' + b.replace(/^\(.*?\)/, ''))).trim();
}

// Scrapi

// Listen on the stream for `query` and extract the value described by the
// expression in `str`: an attribute, the element's text, or its inner HTML,
// each optionally reduced to the first match of a regex.
function onValue (stream, query, str, callback) {
  stream.query(query).on('match', function (tag, attributes) {
    var match;
    if (match = str.match(/^\(attr( [^)]+?)?( [^)]+?)?\)/)) {
      var value = attributes[match[1].substr(1)] || '';
      callback((match[2] ? (value.match(new RegExp(match[2].substr(1))) || [])[0] : value) || '');
    } else if (match = str.match(/^\(text( [^)]+?)?\)/)) {
      this.readText(function (match, text) {
        callback((match[1] ? (text.match(new RegExp(match[1].substr(1))) || [])[0] : text) || '');
      }.bind(null, match));
    } else if (match = str.match(/^\(html( [^)]+?)?\)/)) {
      this.readHTML(function (match, text) {
        text = text.replace(/^<[^>]+>|<[^>]+>$/g, '');
        callback((match[1] ? (text.match(new RegExp(match[1].substr(1))) || [])[0] : text) || '');
      }.bind(null, match));
    }
  });
}
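
// A value expression has the form '(kind args...) selector', as used by the
// specs in examples/ and parsed by onValue above and parseValueSpec below:
//   (attr href)       the href attribute of the matched element
//   (attr href \d+$)  the href attribute, reduced to its first regex match
//   (text ^\d+)       the element's text, reduced to its first regex match
//   (html)            the element's inner HTML
// Any trailing selector is combined with the enclosing $query (combineQueries).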
// Split a string spec like '(attr href) a' into its value expression
// and trailing selector.
function parseValueSpec (str) {
  return {
    $value: (str.match(/^[^)]+\)/) || [])[0],
    $query: (str.match(/\)\s*(.*)$/) || [])[1]
  };
}

// Set up listeners based on a JSON specification or subspec.
function onSpecification (stream, spec, prefix) {
  prefix = prefix || '';
  spec = (typeof spec == 'string') ? parseValueSpec(spec) : spec;

  // Augment the $query parameter with the enclosing query.
  var query = prefix + (spec.$query ? ' ' + spec.$query : '');

  if ('$each' in spec) {

    // Array to populate.
    var ret = [];
    var parser = onSpecification(stream, spec.$each, query);
    stream.query(query).on('match', function (tag, attributes) {
      this.skip(function () {
        ret.push(parser.result());
        parser.reset();
      });
    });

    return {
      result: function () {
        return ret.filter(function (obj) {
          return '$filter' in spec ? Object.prototype.hasOwnProperty.call(obj, spec.$filter) && obj[spec.$filter] : obj;
        });
      },
      reset: function () {
        ret = [];
      }
    };

  } else if ('$value' in spec) {

    // String to populate.
    var ret = null;
    onValue(stream, combineQueries(query, spec.$value), spec.$value, function (value) {
      ret = value;
    });

    return {
      result: function () {
        return ret;
      },
      reset: function () {
        ret = null;
      }
    };
  }

  // Object of named fields to populate.
  var parsers = {};
  Object.keys(spec).filter(function (key) {
    return key.charAt(0) != '$';
  }).forEach(function (key) {
    parsers[key] = onSpecification(stream, spec[key], query);
  });

  return {
    result: function () {
      var values = ('$query' in spec) ? null : {};
      Object.keys(parsers).forEach(function (key) {
        var res = parsers[key].result();
        if (res !== null) {
          values = values || {};
          values[key] = res;
        }
      });
      return values;
    },
    reset: function () {
      Object.keys(parsers).forEach(function (key) {
        parsers[key].reset();
      });
    }
  };
}

module.exports = function (spec, next) {
  // Build a specification parser and hand back the result once the stream ends.
  var stream = cssax.createStream();
  var parser = onSpecification(stream, spec);
  stream
    .on('error', function () { }) // Swallow parse errors.
    .on('end', function () {
      next(parser.result());
    });
  return stream;
};

--------------------------------------------------------------------------------