├── .gitignore
├── examples
│   ├── httpstatuses.js
│   └── old
│       ├── tpb.js
│       ├── github.js
│       └── hackernews.js
├── package.json
├── test
│   ├── test.js
│   └── olindining.js
├── README.md
└── skim.js

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
node_modules

--------------------------------------------------------------------------------
/examples/httpstatuses.js:
--------------------------------------------------------------------------------
var skim = require('..')
  , rem = require('rem');

// Scrape the list of HTTP status codes from httpstatus.es.
rem.stream('http://httpstatus.es').get().pipe(skim({
  $query: "#statuses a",
  $each: {
    "number": "(text \\d+)",
    "href": "(attr href)"
  }
}, function (json) {
  console.log(json);
}));

--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "skim",
  "version": "0.1.1",
  "description": "Scrape websites simply. Streaming HTML parser combined with a flexible HTTP client.",
  "main": "skim.js",
  "dependencies": {
    "cssax": "0.0.6",
    "read": "~1.0.4",
    "async": "~0.1.22"
  },
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/tcr/scrapi.git"
  },
  "author": "Tim Cameron Ryan",
  "license": "MIT"
}

--------------------------------------------------------------------------------
/test/test.js:
--------------------------------------------------------------------------------
// NB: this test exercises the older scrapi interface (see examples/old),
// not the skim.js stream API.
var scrapi = require('../scrapi');

// Define a specification for scraping Hacker News.

var spec = {
  base: 'http://news.ycombinator.com/',
  spec: {
    $query: 'table table tr:nth-child(3n+1)',
    $each: {
      title: '(text) a',
      link: '(attr href) a',
      user: '(text) + tr a[href^=user]',
      comments: '(text ^\\d+) + tr a[href^=item]',
      id: '(attr href \\d+$) + tr a[href^=item]'
    },
    $filter: 'id'
  }
};

scrapi(spec, '', function (ret) {
  console.log(ret);
});

--------------------------------------------------------------------------------
/test/olindining.js:
--------------------------------------------------------------------------------
// NB: this test exercises the older scrapi interface (see examples/old),
// not the skim.js stream API.
var scrapi = require('../scrapi');

// Define a specification for scraping the Olin Dining menu.

var spec = {
  base: 'http://olindining.com/WeeklyMenu4_002.htm',
  spec: {
    $query: 'a[name=monday] + table.dayinner tr.lun',
    $each: {
      section: '(text \\S.*) td.station',
      item: '(text \\S.*) td.menuitem'
    },
    $filter: 'section'
  }
};

scrapi(spec, '', function (ret) {
  // Group menu items under the most recently seen section heading.
  var food = {};
  var section = null;
  ret.forEach(function (item) {
    section = item.section || section;
    (food[section] || (food[section] = [])).push(item.item);
  });
  console.log(food);
});

--------------------------------------------------------------------------------
/examples/old/tpb.js:
--------------------------------------------------------------------------------
var scrapi = require('..');
var url = require('url');

// Define a specification for scraping The Pirate Bay.

var manifest = {
  base: 'http://thepiratebay.se',
  spec: {
    '*': {
      torrents: {
        $query: '#searchResult tr',
        $each: {
          title: '(text [^\\t\\n]+) .detName',
          magnet: '(attr href) a[href^=magnet]'
        },
        $filter: 'magnet'
      }
    }
  }
};

var tpb = scrapi(manifest);

tpb('recent/0').get(function (err, json) {
  json.torrents.forEach(function (torrent) {
    var trackers = url.parse(torrent.magnet, true).query.tr;
    console.log(torrent.title, trackers);
  });
});

--------------------------------------------------------------------------------
/examples/old/github.js:
--------------------------------------------------------------------------------
var scrapi = require('..');

// Define a specification for scraping GitHub commits.

var manifest = {
  base: 'https://github.com',
  spec: {
    '*': {
      changes: {
        $query: '#toc p.explain',
        $value: '(text ^\\d+) span + strong'
      },
      added: {
        $query: '#toc p.explain',
        $value: '(text ^\\d+) span + strong + strong'
      },
      deleted: {
        $query: '#toc p.explain',
        $value: '(text ^\\d+) span + strong + strong + strong'
      }
    }
  }
};

var github = scrapi(manifest);

github('tcr/scrapi/commit/ba192e77a0797e64b6dc82542b2a4806c4d7db8e').get(function (err, json) {
  console.log(json);
});

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# skim

Parse a stream of HTML, using CSS selectors to extract data. skim never builds a DOM, so it stays fast and light on memory.

## Example

Define your scraping parameters in a JSON manifest:

```javascript
var skim = require('skim')
  , rem = require('rem');

rem.stream("http://news.ycombinator.com/").pipe(skim({
  "$query": "td.title ~ td ~ td.title > a",
  "$each": {
    "title": "(text)",
    "link": "(attr href)"
  }
}, function (json) {
  console.log(json);
}));
```

Result:

```javascript
[ { link: 'https://www.hackerschool.com/blog/5-learning-c-with-gdb',
    title: 'Learning C with gdb' },
  { link: 'http://blogs.scientificamerican.com/guest-blog/2012/08/27/the-hidden-truths-about-calories/',
    title: 'Hidden Truths about Calories' },
  { link: 'http://cantada.ca/',
    title: 'Can\'tada - Tracking the stuff you can\'t use in Canada' },
  { link: 'https://blog.gregbrockman.com/2012/08/system-design-stripe-capture-the-flag/',
    title: 'Securing Stripe\'s Capture the Flag' },
  { link: 'http://swanson.github.com/blog/2012/08/27/move-your-feet.html',
    title: 'Move your feet' },
  ... ]
```
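
Specs can also nest named fields, pull single strings with `$value`, and drop incomplete matches with `$filter`. A sketch adapted from the spec in `examples/old/hackernews.js` (the selectors assume Hacker News's markup at the time of writing):

```javascript
rem.stream("http://news.ycombinator.com/").pipe(skim({
  "stories": {
    "$query": "table table tr:nth-child(3n+1)",
    "$each": {
      "title": "(text) td.title a",                 // element text
      "link": "(attr href) td.title a",             // attribute value
      "id": "(attr href \\d+$) + tr a[href^=item]"  // attribute, reduced by a regex
    },
    "$filter": "id"  // drop rows that didn't yield an id
  },
  "next": {
    "$query": "table table td:nth-child(1) + td.title",
    "$value": "(attr href) a"  // a single string rather than an array
  }
}, function (json) {
  console.log(json.stories.length, 'stories; next page:', json.next);
}));
```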
--------------------------------------------------------------------------------
/examples/old/hackernews.js:
--------------------------------------------------------------------------------
var scrapi = require('..');

// Define a specification for scraping Hacker News.

var manifest = {
  base: 'http://news.ycombinator.com',
  spec: {
    '/ submitted': {
      stories: {
        $query: 'table table tr:nth-child(3n+1)',
        $each: {
          title: '(text) td.title a',
          link: '(attr href) td.title a',
          user: '(text) + tr a[href^=user]',
          comments_count: '(text ^\\d+) + tr a[href^=item]',
          id: '(attr href \\d+$) + tr a[href^=item]',
          points: '(text ^\\d+) + tr td.subtext',
          age: '(text \\d+ \\S+ ago) + tr td.subtext'
        },
        $filter: 'id'
      },
      next: {
        $query: 'table table td:nth-child(1) + td.title',
        $value: '(attr href) a'
      }
    },
    'item': {
      story: '(html) table table:nth-child(1) tr:nth-child(4) td',
      comments: {
        $query: 'tr tr tr',
        $each: {
          user: '(text) .comhead a',
          age: '(text \\b\\d+ \\S+ \\S+) .comhead',
          id: '(attr href \\d+$) .comhead a:nth-child(2)',
          indent_width: '(attr width) > td:nth-child(1) img',
          text: '(html) .comment font',
          color: '(attr color) .comment font'
        }
      }
    },
    'user': {
      $query: 'table table',
      name: '(text) tr:nth-child(1) td:nth-child(2)',
      created: '(text) tr:nth-child(2) td:nth-child(2)',
      karma: '(text) tr:nth-child(3) td:nth-child(2)',
      average_karma: '(text) tr:nth-child(4) td:nth-child(2)',
      about: '(text) tr:nth-child(5) td:nth-child(2)'
    }
  }
};

// Build a client from the manifest and list front-page stories.
var hnews = scrapi(manifest);

hnews('/').get(function (err, json) {
  json.stories.forEach(function (story, i) {
    console.log('[' + (i + 1) + ']', story.title);
    console.log(JSON.stringify(story));
  });
});

--------------------------------------------------------------------------------
/skim.js:
--------------------------------------------------------------------------------
var cssax = require('cssax');

// Utilities

// Remove all tags from an HTML fragment.
function stripHTML (html) {
  return html.replace(/<.+?>/g, '');
}

// Append selector b (with its leading '(...)' value expression stripped)
// to each comma-separated alternative of selector a.
function combineQueries (a, b) {
  return (a.replace(/(?=,)|$/g, ' ' + b.replace(/^\(.*?\)/, ''))).trim();
}

// Scrapi

// Listen on the stream for `query` and extract the value described by the
// expression in `str`: an attribute, the element's text, or its inner HTML,
// each optionally reduced to the first match of a regex.
function onValue (stream, query, str, callback) {
  stream.query(query).on('match', function (tag, attributes) {
    var match;
    if (match = str.match(/^\(attr( [^)]+?)?( [^)]+?)?\)/)) {
      var value = attributes[match[1].substr(1)] || '';
      callback((match[2] ? (value.match(new RegExp(match[2].substr(1))) || [])[0] : value) || '');
    } else if (match = str.match(/^\(text( [^)]+?)?\)/)) {
      this.readText(function (match, text) {
        callback((match[1] ? (text.match(new RegExp(match[1].substr(1))) || [])[0] : text) || '');
      }.bind(null, match));
    } else if (match = str.match(/^\(html( [^)]+?)?\)/)) {
      this.readHTML(function (match, text) {
        text = text.replace(/^<[^>]+>|<[^>]+>$/g, '');
        callback((match[1] ? (text.match(new RegExp(match[1].substr(1))) || [])[0] : text) || '');
      }.bind(null, match));
    }
  });
}
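
// A value expression has the form '(kind args...) selector', as used by the
// specs in examples/ and parsed by onValue above and parseValueSpec below:
//   (attr href)       the href attribute of the matched element
//   (attr href \d+$)  the href attribute, reduced to its first regex match
//   (text ^\d+)       the element's text, reduced to its first regex match
//   (html)            the element's inner HTML
// Any trailing selector is combined with the enclosing $query (combineQueries).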
// Split a string spec like '(attr href) a' into its value expression
// and trailing selector.
function parseValueSpec (str) {
  return {
    $value: (str.match(/^[^)]+\)/) || [])[0],
    $query: (str.match(/\)\s*(.*)$/) || [])[1]
  };
}

// Set up listeners based on a JSON specification or subspec.
function onSpecification (stream, spec, prefix) {
  prefix = prefix || '';
  spec = (typeof spec == 'string') ? parseValueSpec(spec) : spec;

  // Augment the $query parameter with the enclosing query.
  var query = prefix + (spec.$query ? ' ' + spec.$query : '');

  if ('$each' in spec) {

    // Array to populate.
    var ret = [];
    var parser = onSpecification(stream, spec.$each, query);
    stream.query(query).on('match', function (tag, attributes) {
      this.skip(function () {
        ret.push(parser.result());
        parser.reset();
      });
    });

    return {
      result: function () {
        return ret.filter(function (obj) {
          return '$filter' in spec ? Object.prototype.hasOwnProperty.call(obj, spec.$filter) && obj[spec.$filter] : obj;
        });
      },
      reset: function () {
        ret = [];
      }
    };

  } else if ('$value' in spec) {

    // String to populate.
    var ret = null;
    onValue(stream, combineQueries(query, spec.$value), spec.$value, function (value) {
      ret = value;
    });

    return {
      result: function () {
        return ret;
      },
      reset: function () {
        ret = null;
      }
    };
  }

  // Object of named fields to populate.
  var parsers = {};
  Object.keys(spec).filter(function (key) {
    return key.charAt(0) != '$';
  }).forEach(function (key) {
    parsers[key] = onSpecification(stream, spec[key], query);
  });

  return {
    result: function () {
      var values = ('$query' in spec) ? null : {};
      Object.keys(parsers).forEach(function (key) {
        var res = parsers[key].result();
        if (res !== null) {
          values = values || {};
          values[key] = res;
        }
      });
      return values;
    },
    reset: function () {
      Object.keys(parsers).forEach(function (key) {
        parsers[key].reset();
      });
    }
  };
}

module.exports = function (spec, next) {
  // Build a specification parser and hand back the result once the stream ends.
  var stream = cssax.createStream();
  var parser = onSpecification(stream, spec);
  stream
    .on('error', function () { }) // Swallow parse errors.
    .on('end', function () {
      next(parser.result());
    });
  return stream;
};

--------------------------------------------------------------------------------