├── .gitignore ├── lib ├── index.js ├── sources │ ├── blogs │ │ ├── personal │ │ │ └── bencoe.js │ │ └── major │ │ │ ├── mashable.js │ │ │ ├── gizmodo.js │ │ │ ├── techrunch.js │ │ │ ├── fastcompany.js │ │ │ └── fastcolabs.js │ ├── other │ │ ├── oatmeal.js │ │ ├── xkcd.js │ │ └── github.js │ └── news │ │ ├── tech │ │ ├── wired.js │ │ └── arstechnica.js │ │ └── major │ │ ├── latimes.js │ │ ├── usatoday.js │ │ └── newyorktimes.js ├── grabber.js ├── source.js ├── loader.js └── feed-source.js ├── package.json ├── LICENSE.txt ├── bin └── routers-news.js └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .DS_Store 3 | npm-debug.log 4 | -------------------------------------------------------------------------------- /lib/index.js: -------------------------------------------------------------------------------- 1 | exports.loadNewsSources = require('./loader').loadNewsSources; 2 | exports.Source = require('./source').Source; 3 | exports.Loader = require('./loader').Loader; 4 | exports.Grabber = require('./grabber').Grabber; -------------------------------------------------------------------------------- /lib/sources/blogs/personal/bencoe.js: -------------------------------------------------------------------------------- 1 | var jDistiller = require('jdistiller').jDistiller, 2 | FeedSource = require('../../../feed-source').FeedSource; 3 | 4 | exports.source = new FeedSource({ 5 | source: "bencoe", 6 | description: "Codes from the Underground, Ben's blog.", 7 | feedURL: 'http://bencoe.tumblr.com/rss' 8 | }); 9 | -------------------------------------------------------------------------------- /lib/sources/other/oatmeal.js: -------------------------------------------------------------------------------- 1 | var jDistiller = require('jdistiller').jDistiller, 2 | FeedSource = require('../../feed-source').FeedSource; 3 | 4 | exports.source = new FeedSource({ 5 | source: "Oatmeal", 6 | description: 
"Tastes better than stale skittles found under the couch cushions.", 7 | feedURL: 'http://feeds.feedburner.com/oatmealfeed.atom', 8 | type: 'atom' 9 | }); -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "routers-news", 3 | "directories": { 4 | "lib": "./lib", 5 | "bin": "./bin" 6 | }, 7 | "main": "./lib/index.js", 8 | "bin": "./bin/routers-news.js", 9 | "version": "1.0.4", 10 | "author": "Ben Coe ", 11 | "engines": [ 12 | "node" 13 | ], 14 | "description": "A crawler for various popular tech news sources. Read technology news from the comfort of your CLI.", 15 | "keywords": [ 16 | "crawler" 17 | ], 18 | "repository": { 19 | "type": "git", 20 | "url": "git://github.com/bcoe/routers" 21 | }, 22 | "dependencies": { 23 | "jdistiller": ">=1.1.3", 24 | "underscore": ">=1.4.2", 25 | "optimist": ">=0.3.4", 26 | "jquery": ">=1.6.3" 27 | } 28 | } -------------------------------------------------------------------------------- /lib/sources/other/xkcd.js: -------------------------------------------------------------------------------- 1 | var jDistiller = require('jdistiller').jDistiller, 2 | Source = require('../../source').Source; 3 | 4 | exports.source = new Source({ 5 | source: "xkcd", 6 | description: 'A webcomic of romance and math humor.', 7 | headlineURL: 'http://xkcd.com/atom.xml', 8 | headlineDistiller: new jDistiller() 9 | .set('headlines', 'entry', function(element) { 10 | return [{ 11 | title: element.find('title').text(), 12 | href: element.find('link').attr('href') 13 | }] 14 | }), 15 | articleDistiller: new jDistiller() 16 | .set('title', '#ctitle') 17 | .set('body', '#comic img', function(element) { 18 | return element.attr('title'); 19 | }) 20 | .set('img', '#comic img', function(element) { 21 | return element.attr('src'); 22 | }) 23 | }); 24 | 
-------------------------------------------------------------------------------- /lib/sources/news/tech/wired.js: -------------------------------------------------------------------------------- 1 | var jDistiller = require('jdistiller').jDistiller, 2 | Source = require('../../../source').Source; 3 | 4 | exports.source = new Source({ 5 | source: "Wired.com", 6 | description: "Wired magazine is a monthly US technology publication.", 7 | headlineURL: 'http://wired.com', 8 | headlineDistiller: new jDistiller() 9 | .set('headlines', '.headline1 a, .headline2 a', function(element) { 10 | return [{ 11 | title: element.text(), 12 | href: element.attr('href') 13 | }] 14 | }), 15 | articleDistiller: new jDistiller() 16 | .set('title', '.post h1') 17 | .set('img', '.entry img', function(element) { 18 | return element.attr('src'); 19 | }) 20 | .set('body', '.entry p', function(element, prev) { 21 | prev.body = prev.body || ''; 22 | prev.body += element.text().trim() + '\n\n'; 23 | return prev.body; 24 | }) 25 | }); -------------------------------------------------------------------------------- /lib/sources/blogs/major/mashable.js: -------------------------------------------------------------------------------- 1 | 2 | var jDistiller = require('jdistiller').jDistiller, 3 | Source = require('../../../source').Source; 4 | 5 | exports.source = new Source({ 6 | source: "Mashable", 7 | description: "Mashable covers the top social media news on topics like Facebook, YouTube, Gmail, Twitter, Amazon, Pinterest and More.", 8 | headlineURL: 'http://mashable.com/tech/', 9 | headlineDistiller: new jDistiller() 10 | .set('headlines', 'h1 a', function(element) { 11 | return [{ 12 | title: element.text().trim(), 13 | href: element.attr('href') 14 | }] 15 | }), 16 | articleDistiller: new jDistiller() 17 | .set('title', 'h1.title') 18 | .set('img', '.article-image img:first', function(element) { 19 | return element.attr('src'); 20 | }) 21 | .set('body', '.article-content p', function(element, 
prev) { 22 | prev.body = prev.body || ''; 23 | prev.body += element.text().trim() + '\n\n'; 24 | return prev.body; 25 | }) 26 | }); -------------------------------------------------------------------------------- /lib/sources/blogs/major/gizmodo.js: -------------------------------------------------------------------------------- 1 | 2 | var jDistiller = require('jdistiller').jDistiller, 3 | Source = require('../../../source').Source; 4 | 5 | exports.source = new Source({ 6 | source: "Gizmodo", 7 | description: "Gizmodo is the go-to authority for gadget news and digital culture.", 8 | headlineURL: 'http://gizmodo.com/', 9 | headlineDistiller: new jDistiller() 10 | .set('headlines', 'h1 a,h2 a,h3 a', function(element) { 11 | 12 | if (element.attr('href').indexOf('gizmodo') === -1) return; 13 | 14 | return [{ 15 | title: element.text(), 16 | href: element.attr('href') 17 | }] 18 | }), 19 | articleDistiller: new jDistiller() 20 | .set('title', 'h1') 21 | .set('img', '#page img:first', function(element) { 22 | return element.attr('src'); 23 | }) 24 | .set('body', '.post-body p', function(element, prev) { 25 | prev.body = prev.body || ''; 26 | prev.body += element.text().trim() + '\n\n'; 27 | return prev.body; 28 | }) 29 | }); -------------------------------------------------------------------------------- /lib/sources/other/github.js: -------------------------------------------------------------------------------- 1 | var jDistiller = require('jdistiller').jDistiller, 2 | Source = require('../../source').Source; 3 | 4 | exports.source = new Source({ 5 | source: "Github", 6 | description: 'Trending and featured repos on Github.com', 7 | headlineURL: 'https://github.com/explore', 8 | headlineDistiller: new jDistiller() 9 | .set('headlines', '#trending-repositories li h3, .ranked-repositories li h3', function(element) { 10 | return [{ 11 | title: element.text().trim(), 12 | href: 'https://github.com' + element.find('a:last').attr('href') 13 | }] 14 | }), 15 | 
articleDistiller: new jDistiller() 16 | .set('title', '.entry-title', function(element) { 17 | return element.find('.author').text().trim() + ' / ' + element.find('strong').text().trim(); 18 | }) 19 | .set('body', '.markdown-body p', function(element, prev) { 20 | prev.body = prev.body || ''; 21 | prev.body += element.text().trim() + '\n\n'; 22 | return prev.body 23 | }) 24 | }); 25 | -------------------------------------------------------------------------------- /lib/sources/news/major/latimes.js: -------------------------------------------------------------------------------- 1 | var jDistiller = require('jdistiller').jDistiller, 2 | Source = require('../../../source').Source; 3 | 4 | exports.source = new Source({ 5 | source: "LATimes", 6 | description: "The business and culture of our digital lives, from the L.A. Times.", 7 | headlineURL: 'http://www.latimes.com/business/technology/', 8 | headlineDistiller: new jDistiller() 9 | .set('headlines', '.headline a', function(element) { 10 | return [{ 11 | title: element.text().trim(), 12 | href: 'http://www.latimes.com' + element.attr('href') 13 | }] 14 | }), 15 | articleDistiller: new jDistiller() 16 | .set('title', '.story h1') 17 | .set('img', '.story img:first', function(element) { 18 | return element.attr('src'); 19 | }) 20 | .set('body', '#story-body-text p', function(element, prev) { 21 | prev.body = prev.body || ''; 22 | if (element.children().length) return; 23 | prev.body += element.text().trim() + '\n\n'; 24 | return prev.body; 25 | }) 26 | }); -------------------------------------------------------------------------------- /lib/sources/news/major/usatoday.js: -------------------------------------------------------------------------------- 1 | var jDistiller = require('jdistiller').jDistiller, 2 | Source = require('../../../source').Source; 3 | 4 | exports.source = new Source({ 5 | source: "USAToday", 6 | description: "Power up with breaking news on personal technology, electronics, gaming and computers.", 
7 | headlineURL: 'http://www.usatoday.com/tech/', 8 | headlineDistiller: new jDistiller() 9 | .set('headlines', '.hero-list-item a', function(element) { 10 | var title = element.text().split('\r\n')[1].trim(); 11 | return [{ 12 | title: title, 13 | href: 'http://www.usatoday.com' + element.attr('href') + '?ajax=true' 14 | }] 15 | }), 16 | articleDistiller: new jDistiller() 17 | .set('title', '.content h1') 18 | .set('img', '.content img:first', function(element) { 19 | return element.attr('src'); 20 | }) 21 | .set('body', '.content p', function(element, prev) { 22 | prev.body = prev.body || ''; 23 | prev.body += element.text().trim() + '\n\n'; 24 | return prev.body; 25 | }) 26 | }); -------------------------------------------------------------------------------- /lib/sources/news/tech/arstechnica.js: -------------------------------------------------------------------------------- 1 | var jDistiller = require('jdistiller').jDistiller, 2 | Source = require('../../../source').Source; 3 | 4 | exports.source = new Source({ 5 | source: "ArsTechnica", 6 | description: "Ars Technica is a technology news site catering to PC enthusiasts.", 7 | headlineURL: 'http://arstechnica.com/', 8 | headlineDistiller: new jDistiller() 9 | .set('headlines', '.heading', function(element) { 10 | 11 | var href = element.parents('a').attr('href') || element.find('a').attr('href') || element.attr('href'); 12 | 13 | return [{ 14 | title: element.text().trim(), 15 | href: href 16 | }] 17 | }), 18 | articleDistiller: new jDistiller() 19 | .set('title', 'h1.heading:first') 20 | .set('img', 'figure img', function(element) { 21 | return element.attr('src'); 22 | }) 23 | .set('body', '.article-content p', function(element, prev) { 24 | prev.body = prev.body || ''; 25 | prev.body += element.text().trim() + '\n\n'; 26 | return prev.body; 27 | }) 28 | }); -------------------------------------------------------------------------------- /lib/sources/news/major/newyorktimes.js: 
-------------------------------------------------------------------------------- 1 | var jDistiller = require('jdistiller').jDistiller, 2 | Source = require('../../../source').Source; 3 | 4 | exports.source = new Source({ 5 | source: "NewYorkTimes", 6 | description: "The New York Times Bits blog.", 7 | headlineURL: 'http://bits.blogs.nytimes.com', 8 | headlineDistiller: new jDistiller() 9 | .set('headlines', '.postHeading', function(element) { 10 | 11 | var href = element.find('a').attr('href'); 12 | 13 | if (href.indexOf('bits.blogs.nytimes.com') === -1) return; 14 | 15 | return [{ 16 | title: element.text().trim().replace('|', ''), 17 | href: href 18 | }] 19 | }), 20 | articleDistiller: new jDistiller() 21 | .set('title', 'h1:first') 22 | .set('img', '.postContent img:first', function(element) { 23 | return element.attr('src'); 24 | }) 25 | .set('body', '.postContent p', function(element, prev) { 26 | prev.body = prev.body || ''; 27 | prev.body += element.text().trim() + '\n\n'; 28 | return prev.body; 29 | }) 30 | }); -------------------------------------------------------------------------------- /lib/sources/blogs/major/techrunch.js: -------------------------------------------------------------------------------- 1 | 2 | var jDistiller = require('jdistiller').jDistiller, 3 | Source = require('../../../source').Source; 4 | 5 | exports.source = new Source({ 6 | source: "TechCrunch", 7 | description: "A network of technology-oriented blogs and other web properties.", 8 | headlineURL: 'http://techcrunch.com', 9 | headlineDistiller: new jDistiller() 10 | .set('headlines', 'h2.headline', function(element) { 11 | return [{ 12 | title: element.text().trim(), 13 | href: element.find('a').attr('href') 14 | }] 15 | }), 16 | articleDistiller: new jDistiller() 17 | .set('title', 'h1.headline') 18 | .set('img', '.wp-post-image', function(element) { 19 | return element.attr('src'); 20 | }) 21 | .set('body', '.body-copy:first p', function(element, prev) { 22 | 23 | prev.body = 
prev.body || ''; 24 | 25 | // Don't include the text crunchbase bio. 26 | if ( !element.parents('.leftgreen').length ) { 27 | prev.body += element.text().trim() + '\n\n'; 28 | } 29 | 30 | return prev.body; 31 | }) 32 | }); -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 Benjamin Coe and Joshua Hull and Gabriel Silk 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /lib/sources/blogs/major/fastcompany.js: -------------------------------------------------------------------------------- 1 | var jDistiller = require('jdistiller').jDistiller, 2 | Source = require('../../../source').Source; 3 | 4 | exports.source = new Source({ 5 | source: "FastCompany", 6 | description: "Business media brand with a unique editorial focus on innovation in technology, ethical economics, leadership, and design.", 7 | headlineURL: 'http://www.fastcompany.com/', 8 | headlineDistiller: new jDistiller() 9 | .set('headlines', 'h1 a,h2 a', function(element) { 10 | 11 | var href = element.attr('href'); 12 | if (href.indexOf('http://') === -1) { 13 | href = 'http://www.fastcompany.com' + href; 14 | } 15 | 16 | if (href.indexOf('fastcompany') === -1) { 17 | return; 18 | } 19 | 20 | return [{ 21 | title: element.text(), 22 | href: href 23 | }] 24 | 25 | }), 26 | 27 | articleDistiller: new jDistiller() 28 | .set('title', 'h1', function(element) { 29 | return element.text(); 30 | }) 31 | .set('img', '.node-poster img:first', function(element) { 32 | return element.attr('src'); 33 | }) 34 | .set('body', '.node-content', function(element, prev) { 35 | prev.body = prev.body || ''; 36 | prev.body += element.text().trim() + '\n\n'; 37 | return prev.body; 38 | }) 39 | }); -------------------------------------------------------------------------------- /lib/grabber.js: -------------------------------------------------------------------------------- 1 | var _ = require('underscore'), 2 | Loader = require('./loader').Loader; 3 | 4 | function Grabber(opts) { 5 | _.extend(this, { 6 | sources: (new Loader()).loadNewsSources() 7 | }, opts); 8 | } 9 | 10 | Grabber.prototype.grabHeadlines = function(source, callback) { 11 | if (typeof source === 'string') source = source.toLowerCase(); 12 | 13 | if (!this.sources[source]) { 14 | callback(new Error('News source ' + source + ' not found.')); 15 | 
return; 16 | } 17 | 18 | this.sources[source].listHeadlines(function(err, headlines) { 19 | if (!err && (!headlines || !headlines.length)) { 20 | callback(new Error('failed to list headlines')); 21 | } else { 22 | callback(err, headlines); 23 | } 24 | }); 25 | }; 26 | 27 | Grabber.prototype.grabArticle = function(source, headline, callback) { 28 | if (typeof source === 'string') source = source.toLowerCase(); 29 | 30 | if (!this.sources[source]) { 31 | callback(new Error('News source ' + source + ' not found.')); 32 | return; 33 | } 34 | 35 | this.sources[source].loadArticle(headline, function(err, article) { 36 | if (!err && (!article.title.length || !article.body.length)) { 37 | callback(new Error('failed to load article')) 38 | } else { 39 | callback(err, article); 40 | } 41 | }); 42 | }; 43 | 44 | exports.Grabber = Grabber; -------------------------------------------------------------------------------- /lib/sources/blogs/major/fastcolabs.js: -------------------------------------------------------------------------------- 1 | var jDistiller = require('jdistiller').jDistiller, 2 | Source = require('../../../source').Source; 3 | 4 | exports.source = new Source({ 5 | source: "FastCoLabs", 6 | description: "Code + Community by FastCompany", 7 | headlineURL: 'http://www.fastcolabs.com/', 8 | headlineDistiller: new jDistiller() 9 | .set('headlines', 'h1.title,h2.title a', function(element) { 10 | 11 | var href = element.attr('href'); 12 | if (href.indexOf('http://') === -1) { 13 | href = 'http://www.fastcolabs.com' + href; 14 | } 15 | 16 | if (href.indexOf('fastcolabs') === -1) { 17 | return; 18 | } 19 | 20 | return [{ 21 | title: element.text(), 22 | href: href 23 | }] 24 | 25 | }), 26 | 27 | articleDistiller: new jDistiller() 28 | .set('title', 'h1.title', function(element) { 29 | return element.text(); 30 | }) 31 | .set('img', 'figure.poster img:first', function(element) { 32 | var img = element.attr('src'); 33 | 34 | if (img.indexOf('http://') === -1) { 35 | img 
= 'http://www.fastcolabs.com' + img; 36 | } 37 | 38 | return img; 39 | }) 40 | .set('body', 'span.deck,.body', function(element, prev) { 41 | prev.body = prev.body || ''; 42 | prev.body += element.text().trim() + '\n\n'; 43 | return prev.body; 44 | }) 45 | }); -------------------------------------------------------------------------------- /lib/source.js: -------------------------------------------------------------------------------- 1 | var _ = require('underscore'), 2 | crypto = require('crypto'); 3 | 4 | function Source(opts) { 5 | _.extend(this, { 6 | source: null, // name of the news source, e.g., TechCrunch. 7 | description: null, // description of news source. 8 | headlineURL: null// URL to fetch headlines from. 9 | }, opts); 10 | }; 11 | 12 | // Given an URL from the headline listing, load the article: 13 | // 14 | // { 15 | // title: 'More Cats than Ever Using iPads', 16 | // body: 'in a recent study it has been shown that more owners are...', 17 | // img: 'http://techcrunch.com/cats.jpg' 18 | // } 19 | // 20 | Source.prototype.loadArticle = function(headline, callback) { 21 | // articleDistiller must be created in subclass. 22 | this.articleDistiller.distill(headline.href, function(err, distilledPage) { 23 | callback(err, distilledPage); 24 | }); 25 | }; 26 | 27 | // Use jDistiller to parse a eadline listing for this new source. 28 | // 29 | // [{title: 'More Cats Than Ever Using iPads', href: 'http://techcrunch.com/cats'}, ...] 30 | // 31 | Source.prototype.listHeadlines = function(callback) { 32 | // headlineDistiller must be created in subclass. 33 | this.headlineDistiller.distill(this.headlineURL, function(err, distilledPage) { 34 | callback(err, (distilledPage.headlines || []) ); 35 | }); 36 | }; 37 | 38 | // Create a hash for the current headline listing. 
39 | Source.prototype.hash = function(callback) { 40 | var _this = this; 41 | 42 | this.listHeadlines(function(err, headlines) { 43 | 44 | if (err) { 45 | callback(err); 46 | return; 47 | } 48 | 49 | var hash = crypto.createHash('md5') 50 | .update(JSON.stringify(headlines)) 51 | .digest('hex'); 52 | 53 | callback(null, hash); 54 | }); 55 | }; 56 | 57 | exports.Source = Source; -------------------------------------------------------------------------------- /lib/loader.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'), 2 | _ = require('underscore'); 3 | 4 | function Loader(opts) { 5 | _.extend(this, { 6 | sources: {} 7 | }, opts); 8 | }; 9 | 10 | Loader.prototype._loadNewsSourcesShared = function(directories, directory, node) { 11 | fs.readdirSync(directory).forEach(function(source) { 12 | var path = directory + '/' + source; 13 | 14 | if ( fs.statSync(path).isDirectory() ) { 15 | directories.push(path) 16 | } else if (source.indexOf('.js') > -1 ) { 17 | var requirePath = path.replace('.js', ''), 18 | source = require(requirePath).source; 19 | 20 | node[source.source.toLowerCase()] = source; 21 | } 22 | }); 23 | } 24 | 25 | Loader.prototype.loadNewsSourcesHierarchical = function() { 26 | var directories = [__dirname + '/sources']; 27 | 28 | this.sources = {}; 29 | 30 | while (directories.length) { 31 | 32 | var directory = directories.pop(), 33 | node = this._getCurrentNode(directory); 34 | 35 | this._loadNewsSourcesShared(directories, directory, node); 36 | } 37 | 38 | return this.sources; 39 | }; 40 | 41 | Loader.prototype._getCurrentNode = function(directory) { 42 | var node = this.sources; 43 | 44 | directory = directory.replace(__dirname + '/sources', ''); 45 | 46 | directory.split('/').forEach(function(key) { 47 | if (!key.length) return; 48 | node[key] = node[key] || {}; 49 | node = node[key]; 50 | }); 51 | 52 | return node; 53 | }; 54 | 55 | Loader.prototype.loadNewsSources = function() { 56 | 
var directories = [__dirname + '/sources']; 57 | 58 | this.sources = {}; 59 | 60 | while (directories.length) { 61 | 62 | var directory = directories.pop(); 63 | 64 | this._loadNewsSourcesShared(directories, directory, this.sources); 65 | } 66 | 67 | return this.sources; 68 | }; 69 | 70 | exports.Loader = Loader; -------------------------------------------------------------------------------- /lib/feed-source.js: -------------------------------------------------------------------------------- 1 | var Source = require('./source').Source, 2 | util = require('util'), 3 | _ = require('underscore'), 4 | jQuery = require('jquery'); 5 | 6 | function FeedSource(opts) { 7 | _.extend(this, { 8 | source: null, // name of the news source, e.g., TechCrunch. 9 | description: null, // description of news source. 10 | feedURL: null, // a path to the News feed., 11 | type: 'rss', // type of feed, either rss or atom. 12 | headlineDistiller: new jDistiller() 13 | .set('headlines', 'item', function(element) { 14 | 15 | // link is a reserved word in HTML, and is 16 | // collapsed by JSDom into a self-closing 17 | // element, we can grab the link out by looking 18 | // for a dangling text node. 
19 | var link = element 20 | .contents() 21 | .filter(function() { 22 | return this.nodeType === 3; 23 | }); 24 | 25 | return [{ 26 | title: element.find('title').text().trim(), 27 | href: link.text().trim() 28 | }] 29 | }) 30 | }, opts); 31 | } 32 | 33 | util.inherits(FeedSource, Source); 34 | 35 | // Given an URL from the headline listing, load the article: 36 | // 37 | // { 38 | // title: 'More Cats than Ever Using iPads', 39 | // body: 'in a recent study it has been shown that more owners are...', 40 | // img: 'http://techcrunch.com/cats.jpg' 41 | // } 42 | // 43 | FeedSource.prototype.loadArticle = function(headline, callback) { 44 | 45 | var articleDistiller = new jDistiller() 46 | .set('articles', 'item', function(element) { 47 | 48 | var title = element.find('title').text().trim(); 49 | 50 | // Only return the article requested. 51 | if (title !== headline.title) return; 52 | 53 | var innerElement = jQuery( '
' + element.find('description').text() + '
' ), 54 | body = ''; 55 | 56 | // If the content has paragraphs, pull together 57 | // all the paragraphs. 58 | innerElement.find('p').each(function() { 59 | body += jQuery(this).text() + '\n\n'; 60 | }); 61 | 62 | // If the content is just text, grab the text. 63 | if (!body) { 64 | innerElement.find('*').each(function() { 65 | body += jQuery(this).text() + ' '; 66 | }); 67 | } 68 | 69 | var article = { 70 | title: title, 71 | body: body 72 | }; 73 | 74 | // If possible, grab a thumbnail image. 75 | var img = innerElement.find('img:first'); 76 | if (img.length) { 77 | article.img = img.attr('src'); 78 | } 79 | 80 | return [article]; 81 | }); 82 | 83 | articleDistiller.distill(this.feedURL, function(err, distilledPage) { 84 | if (!distilledPage || !distilledPage.articles.length) { 85 | callback(new Error('article not found.')); 86 | } else { 87 | callback(err, distilledPage.articles.pop()); 88 | } 89 | }); 90 | }; 91 | 92 | // Use jDistiller to parse a eadline listing for this new source. 93 | // 94 | // [{title: 'More Cats Than Ever Using iPads', href: 'http://techcrunch.com/cats'}, ...] 95 | // 96 | FeedSource.prototype.listHeadlines = function(callback) { 97 | // headlineDistiller must be created in subclass. 
98 | this.headlineDistiller.distill(this.feedURL, function(err, distilledPage) { 99 | callback(err, (distilledPage.headlines || []) ); 100 | }); 101 | }; 102 | 103 | exports.FeedSource = FeedSource; 104 | -------------------------------------------------------------------------------- /bin/routers-news.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | var exec = require('child_process').exec, 4 | optimist = require('optimist'), 5 | Loader = require('../lib').Loader, 6 | Source = require('../lib').Source, 7 | Grabber = require('../lib').Grabber; 8 | 9 | var argv = optimist 10 | .options('S', { 11 | alias: 'sources', 12 | describe: 'List news sources' 13 | }) 14 | .options('s', { 15 | alias: 'source', 16 | describe: 'List articles from a source' 17 | }) 18 | .options('a', { 19 | alias: 'article', 20 | describe: 'Select an article number' 21 | }) 22 | .options('o', { 23 | alias: 'output', 24 | describe: 'Output format: default, json, url' 25 | }) 26 | .options('O', { 27 | alias: 'open', 28 | describe: 'open the article in your browser' 29 | }) 30 | .usage("Usage:\n\ 31 | \trouters-news --sources\tlist the news sources available.\n\ 32 | \trouters-news --source=[source]\tlist the headlines for a news source.\n\ 33 | \trouters-news --source=[source] --article=[id]\tload an article by integer id.\n\ 34 | ") 35 | .argv; 36 | 37 | var actions = { 38 | sources: sources, 39 | articles: listArticles, 40 | article: showArticle 41 | }; 42 | 43 | function sources () { 44 | var loader = new Loader(), 45 | categories = [{ 46 | name: '\033[1;30mRouters News Sources:\033[m\n', 47 | data: loader.loadNewsSourcesHierarchical(), 48 | indent: '' 49 | }]; 50 | 51 | while (categories.length) { 52 | var category = categories.pop(); 53 | console.log(category.indent + category.name); 54 | Object.keys(category.data).forEach(function(key) { 55 | if (category.data[key] instanceof Source) { 56 | console.log(category.indent + ' ' + 
category.data[key].source + ': ' + category.data[key].description); 57 | } else { 58 | categories.push({ 59 | name: '\033[32m' + key + ':\033[m', 60 | data: category.data[key], 61 | indent: category.indent + ' ' 62 | }); 63 | } 64 | }); 65 | } 66 | } 67 | 68 | function listArticles (source) { 69 | var grabber = new Grabber(); 70 | 71 | grabber.grabHeadlines(source, function(err, headlines) { 72 | if (err) { 73 | console.log(err.message); 74 | return; 75 | } 76 | 77 | if (argv.output === 'json') { 78 | console.log(JSON.stringify(headlines)); 79 | } else { 80 | 81 | for (var i = 0, headline; (headline = headlines[i]) != null; i++) { 82 | console.log('[' + (i + 1) + ']\t' + headline.title); 83 | console.log('\t\033[32m' + headline.href + '\033[m\n'); 84 | } 85 | 86 | } 87 | }); 88 | } 89 | 90 | function showArticle (source, index) { 91 | var grabber = new Grabber(); 92 | index = parseInt(index) - 1; 93 | 94 | grabber.grabHeadlines(source, function(err, headlines) { 95 | 96 | if (err) { 97 | console.log(err.message); 98 | return; 99 | } 100 | 101 | if (!headlines[index]) { 102 | console.log('headline does not exist.'); 103 | return; 104 | } 105 | 106 | grabber.grabArticle(source, headlines[index], function(err, article) { 107 | 108 | if (err) { 109 | console.log(err.message); 110 | return; 111 | } 112 | 113 | article.href = headlines[index].href; 114 | 115 | // open the article using the CLI's default action. 
116 | if (argv.open) { 117 | var command = 'open ' + article.href; 118 | console.log(command); 119 | exec(command); 120 | return; 121 | } 122 | 123 | if (argv.output === 'json') { 124 | console.log(JSON.stringify(article)); 125 | } else if (argv.output === 'url') { 126 | console.log(article.href); 127 | } else { 128 | console.log('\033[1;30m' + article.title + ':\033[m'); 129 | if (article.img) console.log('\n[' + article.img + ']'); 130 | console.log('\n' + article.body + '\n---------' ); 131 | console.log('\033[32m' + article.href + '\033[m\n'); 132 | } 133 | 134 | }); 135 | }); 136 | } 137 | 138 | if (argv.debug) { 139 | console.log('argv', argv) 140 | } 141 | 142 | // Route actions 143 | if (argv.sources) { 144 | 145 | // Display all the news sources Routers currently parses. 146 | return actions.sources(); 147 | 148 | } else if (argv.source && !argv.article) { 149 | 150 | // List the headlines for a source. 151 | return actions.articles(argv.source); 152 | 153 | } else if (argv.article) { 154 | 155 | return actions.article(argv.source, argv.article); 156 | 157 | } else { 158 | console.log(optimist.help()); 159 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Routers News 2 | ------------------ 3 | 4 | Routers is a collection of web-crawlers for various popular technology news sources. 5 | 6 | It exposes a command-line interface to these crawlers, allowing for the distinguishing tech-news enthusiast to avoid leaving the comfort of their terminal. 
7 | 8 | It Currently Supports: 9 | 10 | __Technology News Sources__ 11 | * Ars Technica 12 | * Wired.com 13 | 14 | __Major Technology Blogs__ 15 | 16 | * TechCrunch 17 | * Mashable 18 | * Gizmodo 19 | * Fast Company 20 | * FastCo.Labs 21 | 22 | __Personal Technology Blogs__ 23 | 24 | * Codes From The Underground, my blog 25 | 26 | __Mainstream News Sources__ 27 | 28 | * New York Times 29 | * USA Today 30 | * L.A. Times 31 | 32 | __Other Random Stuff__ 33 | 34 | * Github 35 | * The Oatmeal 36 | * xkcd 37 | 38 | _(this categorization is loose, please feel free to shuffle stuff around.)_ 39 | 40 | It's Also An Experiment 41 | ------------ 42 | 43 | It is my hope that, by open-sourcing a collection of news scrapers, a community can be built around building a powerful set of real-time news aggregation tools. 44 | 45 | 46 | Installation 47 | ------------ 48 | 49 | ```bash 50 | npm install routers-news -g 51 | ``` 52 | 53 | Usage 54 | ----- 55 | 56 | __Listing News Sources__ 57 | 58 | ```bash 59 | routers-news --sources 60 | ``` 61 | 62 | __Outputs__ 63 | 64 | ```bash 65 | Routers News Sources: 66 | 67 | news: 68 | major: 69 | NewYorkTimes: The New York Times Bits blog. 70 | LATimes: The business and culture of our digital lives, from the L.A. Times. 71 | USAToday: Power up with breaking news on personal technology, electronics, gaming and computers. 72 | tech: 73 | Wired.com: Wired magazine is a monthly US technology publication. 74 | ArsTechnica: Ars Technica is a technology news site catering to PC enthusiasts. 75 | TechCrunch: A network of technology-oriented blogs and other web properties. 
76 | other: 77 | Github: Trending and featured repos on Github.com 78 | ``` 79 | 80 | __Displaying Headlines__ 81 | 82 | ```bash 83 | routers-news --source=github 84 | ``` 85 | 86 | __Outputs__ 87 | 88 | ```bash 89 | [1] MacLemon / CongressChecklist 90 | https://github.com/MacLemon/CongressChecklist 91 | 92 | [2] dejan / rails_panel 93 | https://github.com/dejan/rails_panel 94 | 95 | [3] feross / md5-password-cracker.js 96 | https://github.com/feross/md5-password-cracker.js 97 | 98 | [4] shadowsocks / shadowsocks-go 99 | https://github.com/shadowsocks/shadowsocks-go 100 | 101 | [5] bcoe / routers-news 102 | https://github.com/bcoe/routers-news 103 | 104 | [6] andrew / 24pullrequests 105 | https://github.com/andrew/24pullrequests 106 | 107 | [7] nkohari / jwalk 108 | https://github.com/nkohari/jwalk 109 | 110 | [8] lockitron / selfstarter 111 | https://github.com/lockitron/selfstarter 112 | 113 | [9] twitter / bower 114 | https://github.com/twitter/bower 115 | 116 | [10] Spaceman-Labs / SMPageControl 117 | https://github.com/Spaceman-Labs/SMPageControl 118 | ``` 119 | 120 | __Loading Articles__ 121 | 122 | ```bash 123 | routers-news --source=github --article=5 124 | ``` 125 | 126 | __Outputs:__ 127 | 128 | ```bash 129 | bcoe / routers-news: 130 | 131 | 132 | A crawler for various popular tech news sources. Read technology news from the comfort of your CLI. 133 | — Read more 134 | --------- 135 | https://github.com/bcoe/routers-news 136 | ``` 137 | 138 | The Crawlers 139 | ---------- 140 | 141 | The news crawlers used by Routers come in two varieties: 142 | 143 | * Page scrapers which use CSS selectors to extract content from news sources. 144 | * RSS/Atom feed parsers, which crawl articles using an RSS or Atom news feed. 145 | 146 | Examples of both can be found in the __lib/sources__ directory. 147 | 148 | Contributing 149 | ---------- 150 | 151 | It's easy to add a new news source: 152 | 153 | * fork the routers news repo. 154 | * clone it locally. 
155 | * run __npm install__ to install the libraries locally. 156 | * create a new crawler in the __lib/sources__ directory (everything in this hierarchy is automatically loaded). 157 | * to test your crawler run: __node ./bin/routers-news.js__. 158 | 159 | You can also help a ton by: 160 | 161 | * reporting when crawlers are broken. 162 | * extending on the crawlers, I'd love to have: 163 | * Dates. 164 | * Authors. 165 | * Better image extraction. 166 | * improving on the CLI client. 167 | 168 | Help make our dreams of a collaborative web-crawler a reality :) 169 | 170 | Copyright 171 | --------- 172 | 173 | Copyright (c) 2012 Benjamin Coe and Joshua Hull and Gabriel Silk. See LICENSE.txt for further details. --------------------------------------------------------------------------------