├── .gitignore
├── lib
    ├── index.js
    ├── sources
    │   ├── blogs
    │   │   ├── personal
    │   │   │   └── bencoe.js
    │   │   └── major
    │   │   │   ├── mashable.js
    │   │   │   ├── gizmodo.js
    │   │   │   ├── techrunch.js
    │   │   │   ├── fastcompany.js
    │   │   │   └── fastcolabs.js
    │   ├── other
    │   │   ├── oatmeal.js
    │   │   ├── xkcd.js
    │   │   └── github.js
    │   └── news
    │   │   ├── tech
    │   │       ├── wired.js
    │   │       └── arstechnica.js
    │   │   └── major
    │   │       ├── latimes.js
    │   │       ├── usatoday.js
    │   │       └── newyorktimes.js
    ├── grabber.js
    ├── source.js
    ├── loader.js
    └── feed-source.js
├── package.json
├── LICENSE.txt
├── bin
    └── routers-news.js
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .DS_Store
3 | npm-debug.log
4 | 


--------------------------------------------------------------------------------
/lib/index.js:
--------------------------------------------------------------------------------
1 | exports.loadNewsSources = require('./loader').loadNewsSources;
2 | exports.Source = require('./source').Source;
3 | exports.Loader = require('./loader').Loader;
4 | exports.Grabber = require('./grabber').Grabber;


--------------------------------------------------------------------------------
/lib/sources/blogs/personal/bencoe.js:
--------------------------------------------------------------------------------
1 | var jDistiller = require('jdistiller').jDistiller,
2 |   FeedSource = require('../../../feed-source').FeedSource;
3 | 
4 | exports.source = new FeedSource({
5 |   source: "bencoe",
6 |   description: "Codes from the Underground, Ben's blog.",
7 |   feedURL: 'http://bencoe.tumblr.com/rss'
8 | });
9 | 


--------------------------------------------------------------------------------
/lib/sources/other/oatmeal.js:
--------------------------------------------------------------------------------
1 | var jDistiller = require('jdistiller').jDistiller,
2 |   FeedSource = require('../../feed-source').FeedSource;
3 | 
4 | exports.source = new FeedSource({
5 |   source: "Oatmeal",
6 |   description: "Tastes better than stale skittles found under the couch cushions.",
7 |   feedURL: 'http://feeds.feedburner.com/oatmealfeed.atom',
8 |   type: 'atom'
9 | });


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "routers-news",
 3 |   "directories": {
 4 |     "lib": "./lib",
 5 |     "bin": "./bin"
 6 |   },
 7 |   "main": "./lib/index.js",
 8 |   "bin": "./bin/routers-news.js",
 9 |   "version": "1.0.4",
10 |   "author": "Ben Coe <bencoe@gmail.com>",
11 |   "engines": [
12 |     "node"
13 |   ],
14 |   "description": "A crawler for various popular tech news sources. Read technology news from the comfort of your CLI.",
15 |   "keywords": [
16 |     "crawler"
17 |   ],
18 |   "repository": {
19 |     "type": "git",
20 |     "url": "git://github.com/bcoe/routers"
21 |   },
22 |   "dependencies": {
23 |     "jdistiller": ">=1.1.3",
24 |     "underscore": ">=1.4.2",
25 |     "optimist": ">=0.3.4",
26 |     "jquery": ">=1.6.3"
27 |   }
28 | }


--------------------------------------------------------------------------------
/lib/sources/other/xkcd.js:
--------------------------------------------------------------------------------
 1 | var jDistiller = require('jdistiller').jDistiller,
 2 |   Source = require('../../source').Source;
 3 | 
 4 | exports.source = new Source({
 5 |   source: "xkcd",
 6 |   description: 'A webcomic of romance and math humor.',
 7 |   headlineURL: 'http://xkcd.com/atom.xml',
 8 |   headlineDistiller: new jDistiller()
 9 |     .set('headlines', 'entry', function(element) {
10 |       return [{
11 |         title: element.find('title').text(),
12 |         href: element.find('link').attr('href')
13 |       }]
14 |     }),
15 |   articleDistiller: new jDistiller()
16 |     .set('title', '#ctitle')
17 |     .set('body', '#comic img', function(element) {
18 |       return element.attr('title');
19 |     })
20 |     .set('img', '#comic img', function(element) {
21 |       return element.attr('src');
22 |     })
23 | });
24 | 


--------------------------------------------------------------------------------
/lib/sources/news/tech/wired.js:
--------------------------------------------------------------------------------
 1 | var jDistiller = require('jdistiller').jDistiller,
 2 |   Source = require('../../../source').Source;
 3 | 
 4 | exports.source = new Source({
 5 |   source: "Wired.com",
 6 |   description: "Wired magazine is a monthly US technology publication.",
 7 |   headlineURL: 'http://wired.com',
 8 |   headlineDistiller: new jDistiller()
 9 |     .set('headlines', '.headline1 a, .headline2 a', function(element) {
10 |       return [{
11 |         title: element.text(),
12 |         href: element.attr('href')
13 |       }]
14 |     }),
15 |   articleDistiller: new jDistiller()
16 |     .set('title', '.post h1')
17 |     .set('img', '.entry img', function(element) {
18 |         return element.attr('src');
19 |     })
20 |     .set('body', '.entry p', function(element, prev) {
21 |       prev.body = prev.body || '';
22 |       prev.body += element.text().trim() + '\n\n';
23 |       return prev.body;
24 |     })
25 | });


--------------------------------------------------------------------------------
/lib/sources/blogs/major/mashable.js:
--------------------------------------------------------------------------------
 1 | 
 2 | var jDistiller = require('jdistiller').jDistiller,
 3 |   Source = require('../../../source').Source;
 4 | 
 5 | exports.source = new Source({
 6 |   source: "Mashable",
 7 |   description: "Mashable covers the top social media news on topics like Facebook, YouTube, Gmail, Twitter, Amazon, Pinterest and More.",
 8 |   headlineURL: 'http://mashable.com/tech/',
 9 |   headlineDistiller: new jDistiller()
10 |     .set('headlines', 'h1 a', function(element) {
11 |       return [{
12 |         title: element.text().trim(),
13 |         href: element.attr('href')
14 |       }]
15 |     }),
16 |   articleDistiller: new jDistiller()
17 |     .set('title', 'h1.title')
18 |     .set('img', '.article-image img:first', function(element) {
19 |       return element.attr('src');
20 |     })
21 |     .set('body', '.article-content p', function(element, prev) {
22 |       prev.body = prev.body || '';
23 |       prev.body += element.text().trim() + '\n\n';
24 |       return prev.body;
25 |     })
26 | });


--------------------------------------------------------------------------------
/lib/sources/blogs/major/gizmodo.js:
--------------------------------------------------------------------------------
 1 | 
 2 | var jDistiller = require('jdistiller').jDistiller,
 3 |   Source = require('../../../source').Source;
 4 | 
 5 | exports.source = new Source({
 6 |   source: "Gizmodo",
 7 |   description: "Gizmodo is the go-to authority for gadget news and digital culture.",
 8 |   headlineURL: 'http://gizmodo.com/',
 9 |   headlineDistiller: new jDistiller()
10 |     .set('headlines', 'h1 a,h2 a,h3 a', function(element) {
11 | 
12 |       if (element.attr('href').indexOf('gizmodo') === -1) return;
13 | 
14 |       return [{
15 |         title: element.text(),
16 |         href: element.attr('href')
17 |       }]
18 |     }),
19 |   articleDistiller: new jDistiller()
20 |     .set('title', 'h1')
21 |     .set('img', '#page img:first', function(element) {
22 |       return element.attr('src');
23 |     })
24 |     .set('body', '.post-body p', function(element, prev) {
25 |       prev.body = prev.body || '';
26 |       prev.body += element.text().trim() + '\n\n';
27 |       return prev.body;
28 |     })
29 | });


--------------------------------------------------------------------------------
/lib/sources/other/github.js:
--------------------------------------------------------------------------------
 1 | var jDistiller = require('jdistiller').jDistiller,
 2 |   Source = require('../../source').Source;
 3 | 
 4 | exports.source = new Source({
 5 |   source: "Github",
 6 |   description: 'Trending and featured repos on Github.com',
 7 |   headlineURL: 'https://github.com/explore',
 8 |   headlineDistiller: new jDistiller()
 9 |     .set('headlines', '#trending-repositories li h3, .ranked-repositories li h3', function(element) {
10 |       return [{
11 |         title: element.text().trim(),
12 |         href: 'https://github.com' + element.find('a:last').attr('href')
13 |       }]
14 |     }),
15 |   articleDistiller: new jDistiller()
16 |     .set('title', '.entry-title', function(element) {
17 |       return element.find('.author').text().trim() + ' / ' + element.find('strong').text().trim();
18 |     })
19 |     .set('body', '.markdown-body p', function(element, prev) {
20 |       prev.body = prev.body || '';
21 |       prev.body += element.text().trim() + '\n\n';
22 |       return prev.body
23 |     })
24 | });
25 | 


--------------------------------------------------------------------------------
/lib/sources/news/major/latimes.js:
--------------------------------------------------------------------------------
 1 | var jDistiller = require('jdistiller').jDistiller,
 2 |   Source = require('../../../source').Source;
 3 | 
 4 | exports.source = new Source({
 5 |   source: "LATimes",
 6 |   description: "The business and culture of our digital lives, from the L.A. Times.",
 7 |   headlineURL: 'http://www.latimes.com/business/technology/',
 8 |   headlineDistiller: new jDistiller()
 9 |     .set('headlines', '.headline a', function(element) {
10 |       return [{
11 |         title: element.text().trim(),
12 |         href: 'http://www.latimes.com' + element.attr('href')
13 |       }]
14 |     }),
15 |   articleDistiller: new jDistiller()
16 |     .set('title', '.story h1')
17 |     .set('img', '.story img:first', function(element) {
18 |         return element.attr('src');
19 |     })
20 |     .set('body', '#story-body-text p', function(element, prev) {
21 |       prev.body = prev.body || '';
22 |       if (element.children().length) return;
23 |       prev.body += element.text().trim() + '\n\n';
24 |       return prev.body;
25 |     })
26 | });


--------------------------------------------------------------------------------
/lib/sources/news/major/usatoday.js:
--------------------------------------------------------------------------------
 1 | var jDistiller = require('jdistiller').jDistiller,
 2 |   Source = require('../../../source').Source;
 3 | 
 4 | exports.source = new Source({
 5 |   source: "USAToday",
 6 |   description: "Power up with breaking news on personal technology, electronics, gaming and computers.",
 7 |   headlineURL: 'http://www.usatoday.com/tech/',
 8 |   headlineDistiller: new jDistiller()
 9 |     .set('headlines', '.hero-list-item a', function(element) {
10 |       var title = element.text().split('\r\n')[1].trim();
11 |       return [{
12 |         title: title,
13 |         href: 'http://www.usatoday.com' + element.attr('href') + '?ajax=true'
14 |       }]
15 |     }),
16 |   articleDistiller: new jDistiller()
17 |     .set('title', '.content h1')
18 |     .set('img', '.content img:first', function(element) {
19 |         return element.attr('src');
20 |     })
21 |     .set('body', '.content p', function(element, prev) {
22 |       prev.body = prev.body || '';
23 |       prev.body += element.text().trim() + '\n\n';
24 |       return prev.body;
25 |     })
26 | });


--------------------------------------------------------------------------------
/lib/sources/news/tech/arstechnica.js:
--------------------------------------------------------------------------------
 1 | var jDistiller = require('jdistiller').jDistiller,
 2 |   Source = require('../../../source').Source;
 3 | 
 4 | exports.source = new Source({
 5 |   source: "ArsTechnica",
 6 |   description: "Ars Technica is a technology news site catering to PC enthusiasts.",
 7 |   headlineURL: 'http://arstechnica.com/',
 8 |   headlineDistiller: new jDistiller()
 9 |     .set('headlines', '.heading', function(element) {
10 | 
11 |       var href = element.parents('a').attr('href') || element.find('a').attr('href') || element.attr('href');
12 | 
13 |       return [{
14 |         title: element.text().trim(),
15 |         href: href
16 |       }]
17 |     }),
18 |   articleDistiller: new jDistiller()
19 |     .set('title', 'h1.heading:first')
20 |     .set('img', 'figure img', function(element) {
21 |         return element.attr('src');
22 |     })
23 |     .set('body', '.article-content p', function(element, prev) {
24 |       prev.body = prev.body || '';
25 |       prev.body += element.text().trim() + '\n\n';
26 |       return prev.body;
27 |     })
28 | });


--------------------------------------------------------------------------------
/lib/sources/news/major/newyorktimes.js:
--------------------------------------------------------------------------------
 1 | var jDistiller = require('jdistiller').jDistiller,
 2 |   Source = require('../../../source').Source;
 3 | 
 4 | exports.source = new Source({
 5 |   source: "NewYorkTimes",
 6 |   description: "The New York Times Bits blog.",
 7 |   headlineURL: 'http://bits.blogs.nytimes.com',
 8 |   headlineDistiller: new jDistiller()
 9 |     .set('headlines', '.postHeading', function(element) {
10 | 
11 |       var href = element.find('a').attr('href');
12 | 
13 |       if (href.indexOf('bits.blogs.nytimes.com') === -1) return;
14 | 
15 |       return [{
16 |         title: element.text().trim().replace('|', ''),
17 |         href: href
18 |       }]
19 |     }),
20 |   articleDistiller: new jDistiller()
21 |     .set('title', 'h1:first')
22 |     .set('img', '.postContent img:first', function(element) {
23 |         return element.attr('src');
24 |     })
25 |     .set('body', '.postContent p', function(element, prev) {
26 |       prev.body = prev.body || '';
27 |       prev.body += element.text().trim() + '\n\n';
28 |       return prev.body;
29 |     })
30 | });


--------------------------------------------------------------------------------
/lib/sources/blogs/major/techrunch.js:
--------------------------------------------------------------------------------
 1 | 
 2 | var jDistiller = require('jdistiller').jDistiller,
 3 |   Source = require('../../../source').Source;
 4 | 
 5 | exports.source = new Source({
 6 |   source: "TechCrunch",
 7 |   description: "A network of technology-oriented blogs and other web properties.",
 8 |   headlineURL: 'http://techcrunch.com',
 9 |   headlineDistiller: new jDistiller()
10 |     .set('headlines', 'h2.headline', function(element) {
11 |       return [{
12 |         title: element.text().trim(),
13 |         href: element.find('a').attr('href')
14 |       }]
15 |     }),
16 |   articleDistiller: new jDistiller()
17 |     .set('title', 'h1.headline')
18 |     .set('img', '.wp-post-image', function(element) {
19 |       return element.attr('src');
20 |     })
21 |     .set('body', '.body-copy:first p', function(element, prev) {
22 | 
23 |       prev.body = prev.body || '';
24 | 
25 |       // Don't include the text crunchbase bio.
26 |       if ( !element.parents('.leftgreen').length ) {
27 |         prev.body += element.text().trim() + '\n\n';
28 |       }
29 |       
30 |       return prev.body;
31 |     })
32 | });


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2011 Benjamin Coe and Joshua Hull and Gabriel Silk
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/lib/sources/blogs/major/fastcompany.js:
--------------------------------------------------------------------------------
 1 | var jDistiller = require('jdistiller').jDistiller,
 2 |   Source = require('../../../source').Source;
 3 | 
 4 | exports.source = new Source({
 5 |   source: "FastCompany",
 6 |   description: "Business media brand with a unique editorial focus on innovation in technology, ethical economics, leadership, and design.",
 7 |   headlineURL: 'http://www.fastcompany.com/',
 8 |   headlineDistiller: new jDistiller()
 9 |     .set('headlines', 'h1 a,h2 a', function(element) {
10 | 
11 |       var href = element.attr('href');
12 |       if (href.indexOf('http://') === -1) {
13 |         href = 'http://www.fastcompany.com' + href;
14 |       }
15 | 
16 |       if (href.indexOf('fastcompany') === -1) {
17 |        return;
18 |      }
19 | 
20 |       return [{
21 |         title: element.text(),
22 |         href: href
23 |       }]
24 | 
25 |     }),
26 | 
27 |   articleDistiller: new jDistiller()
28 |     .set('title', 'h1', function(element) {
29 |       return element.text();
30 |     })
31 |     .set('img', '.node-poster img:first', function(element) {
32 |       return element.attr('src');
33 |     })
34 |     .set('body', '.node-content', function(element, prev) {
35 |       prev.body = prev.body || '';
36 |       prev.body += element.text().trim() + '\n\n';
37 |       return prev.body;
38 |     })
39 | });


--------------------------------------------------------------------------------
/lib/grabber.js:
--------------------------------------------------------------------------------
 1 | var _ = require('underscore'),
 2 |   Loader = require('./loader').Loader;
 3 | 
 4 | function Grabber(opts) {
 5 |   _.extend(this, {
 6 |     sources: (new Loader()).loadNewsSources()
 7 |   }, opts);
 8 | }
 9 | 
10 | Grabber.prototype.grabHeadlines = function(source, callback) {
11 |   if (typeof source === 'string') source = source.toLowerCase();
12 |   
13 |   if (!this.sources[source]) {
14 |     callback(new Error('News source ' + source + ' not found.'));
15 |     return;
16 |   }
17 | 
18 |   this.sources[source].listHeadlines(function(err, headlines) {
19 |     if (!err && (!headlines || !headlines.length)) {
20 |       callback(new Error('failed to list headlines'));
21 |     } else {
22 |       callback(err, headlines);
23 |     }
24 |   });
25 | };
26 | 
27 | Grabber.prototype.grabArticle = function(source, headline, callback) {
28 |   if (typeof source === 'string') source = source.toLowerCase();
29 | 
30 |   if (!this.sources[source]) {
31 |     callback(new Error('News source ' + source + ' not found.'));
32 |     return;
33 |   }
34 | 
35 |   this.sources[source].loadArticle(headline, function(err, article) {
36 |     if (!err && (!article.title.length || !article.body.length)) {
37 |       callback(new Error('failed to load article'))
38 |     } else {
39 |       callback(err, article);
40 |     }
41 |   });
42 | };
43 | 
44 | exports.Grabber = Grabber;


--------------------------------------------------------------------------------
/lib/sources/blogs/major/fastcolabs.js:
--------------------------------------------------------------------------------
 1 | var jDistiller = require('jdistiller').jDistiller,
 2 |   Source = require('../../../source').Source;
 3 | 
 4 | exports.source = new Source({
 5 |   source: "FastCoLabs",
 6 |   description: "Code + Community by FastCompany",
 7 |   headlineURL: 'http://www.fastcolabs.com/',
 8 |   headlineDistiller: new jDistiller()
 9 |     .set('headlines', 'h1.title,h2.title a', function(element) {
10 | 
11 |       var href = element.attr('href');
12 |       if (href.indexOf('http://') === -1) {
13 |         href = 'http://www.fastcolabs.com' + href;
14 |       }
15 | 
16 |       if (href.indexOf('fastcolabs') === -1) {
17 |        return;
18 |       }
19 | 
20 |       return [{
21 |         title: element.text(),
22 |         href: href
23 |       }]
24 | 
25 |     }),
26 | 
27 |   articleDistiller: new jDistiller()
28 |     .set('title', 'h1.title', function(element) {
29 |       return element.text();
30 |     })
31 |     .set('img', 'figure.poster img:first', function(element) {
32 |       var img = element.attr('src');
33 | 
34 |       if (img.indexOf('http://') === -1) {
35 |         img = 'http://www.fastcolabs.com' + img;
36 |       }
37 | 
38 |       return img;
39 |     })
40 |     .set('body', 'span.deck,.body', function(element, prev) {
41 |       prev.body = prev.body || '';
42 |       prev.body += element.text().trim() + '\n\n';
43 |       return prev.body;
44 |     })
45 | });


--------------------------------------------------------------------------------
/lib/source.js:
--------------------------------------------------------------------------------
 1 | var _ = require('underscore'),
 2 |   crypto = require('crypto');
 3 | 
 4 | function Source(opts) {
 5 |   _.extend(this, {
 6 |     source: null, // name of the news source, e.g., TechCrunch.
 7 |     description: null, // description of news source.
 8 |     headlineURL: null// URL to fetch headlines from.
 9 |   }, opts);
10 | };
11 | 
12 | // Given an URL from the headline listing, load the article:
13 | //
14 | // {
15 | //  title: 'More Cats than Ever Using iPads',
16 | //  body: 'in a recent study it has been shown that more owners are...',
17 | //  img: 'http://techcrunch.com/cats.jpg'
18 | // }
19 | //
20 | Source.prototype.loadArticle = function(headline, callback) {
21 |   // articleDistiller must be created in subclass.
22 |   this.articleDistiller.distill(headline.href, function(err, distilledPage) {
23 |     callback(err, distilledPage);
24 |   });
25 | };
26 | 
27 | // Use jDistiller to parse a eadline listing for this new source.
28 | //
29 | // [{title: 'More Cats Than Ever Using iPads', href: 'http://techcrunch.com/cats'}, ...]
30 | //
31 | Source.prototype.listHeadlines = function(callback) {
32 |   // headlineDistiller must be created in subclass.
33 |   this.headlineDistiller.distill(this.headlineURL, function(err, distilledPage) {
34 |     callback(err, (distilledPage.headlines || []) );
35 |   });
36 | };
37 | 
38 | // Create a hash for the current headline listing.
39 | Source.prototype.hash = function(callback) {
40 |   var _this = this;
41 | 
42 |   this.listHeadlines(function(err, headlines) {
43 |     
44 |     if (err) {
45 |       callback(err);
46 |       return;
47 |     }
48 | 
49 |     var hash = crypto.createHash('md5')
50 |      .update(JSON.stringify(headlines))
51 |      .digest('hex');
52 | 
53 |     callback(null, hash);
54 |   });
55 | };
56 | 
57 | exports.Source = Source;


--------------------------------------------------------------------------------
/lib/loader.js:
--------------------------------------------------------------------------------
 1 | var fs = require('fs'),
 2 |   _ = require('underscore');
 3 | 
 4 | function Loader(opts) {
 5 |   _.extend(this, {
 6 |     sources: {}
 7 |   }, opts);
 8 | }; 
 9 | 
10 | Loader.prototype._loadNewsSourcesShared = function(directories, directory, node) {
11 |   fs.readdirSync(directory).forEach(function(source) {
12 |     var path = directory + '/' + source;
13 | 
14 |     if ( fs.statSync(path).isDirectory() ) {
15 |       directories.push(path)
16 |     } else if (source.indexOf('.js') > -1 ) {
17 |       var requirePath = path.replace('.js', ''),
18 |         source = require(requirePath).source;
19 | 
20 |       node[source.source.toLowerCase()] = source;
21 |     }
22 |   });
23 | }
24 | 
25 | Loader.prototype.loadNewsSourcesHierarchical = function() {
26 |   var directories = [__dirname + '/sources'];
27 | 
28 |   this.sources = {};
29 | 
30 |   while (directories.length) {
31 | 
32 |     var directory = directories.pop(),
33 |       node = this._getCurrentNode(directory);
34 | 
35 |     this._loadNewsSourcesShared(directories, directory, node);
36 |   }
37 | 
38 |   return this.sources;
39 | };
40 | 
41 | Loader.prototype._getCurrentNode = function(directory) {
42 |   var node = this.sources;
43 | 
44 |   directory = directory.replace(__dirname + '/sources', '');
45 | 
46 |   directory.split('/').forEach(function(key) {
47 |     if (!key.length) return;
48 |     node[key] = node[key] || {};
49 |     node = node[key];
50 |   });
51 | 
52 |   return node;
53 | };
54 | 
55 | Loader.prototype.loadNewsSources = function() {
56 |   var directories = [__dirname + '/sources'];
57 | 
58 |   this.sources = {};
59 | 
60 |   while (directories.length) {
61 | 
62 |     var directory = directories.pop();
63 | 
64 |     this._loadNewsSourcesShared(directories, directory, this.sources);
65 |   }
66 | 
67 |   return this.sources;
68 | };
69 | 
70 | exports.Loader = Loader;


--------------------------------------------------------------------------------
/lib/feed-source.js:
--------------------------------------------------------------------------------
  1 | var Source = require('./source').Source,
  2 |     util = require('util'),
  3 |     _ = require('underscore'),
  4 |     jQuery = require('jquery');
  5 | 
  6 | function FeedSource(opts) {
  7 |   _.extend(this, {
  8 |     source: null, // name of the news source, e.g., TechCrunch.
  9 |     description: null, // description of news source.
 10 |     feedURL: null, // a path to the News feed.,
 11 |     type: 'rss', // type of feed, either rss or atom.
 12 |     headlineDistiller: new jDistiller()
 13 |       .set('headlines', 'item', function(element) {
 14 | 
 15 |         // link is a reserved word in HTML, and is
 16 |         // collapsed by JSDom into a self-closing
 17 |         // element, we can grab the link out by looking
 18 |         // for a dangling text node.
 19 |         var link = element
 20 |           .contents()
 21 |           .filter(function() {
 22 |             return this.nodeType === 3;
 23 |           });
 24 | 
 25 |         return [{
 26 |           title: element.find('title').text().trim(),
 27 |           href: link.text().trim()
 28 |         }]
 29 |       })
 30 |   }, opts);
 31 | }
 32 | 
 33 | util.inherits(FeedSource, Source);
 34 | 
 35 | // Given an URL from the headline listing, load the article:
 36 | //
 37 | // {
 38 | //  title: 'More Cats than Ever Using iPads',
 39 | //  body: 'in a recent study it has been shown that more owners are...',
 40 | //  img: 'http://techcrunch.com/cats.jpg'
 41 | // }
 42 | //
 43 | FeedSource.prototype.loadArticle = function(headline, callback) {
 44 | 
 45 |   var articleDistiller = new jDistiller()
 46 |     .set('articles', 'item', function(element) {
 47 |       
 48 |       var title = element.find('title').text().trim();
 49 | 
 50 |       // Only return the article requested.
 51 |       if (title !== headline.title) return;
 52 | 
 53 |       var innerElement = jQuery( '<div>' + element.find('description').text() + '</div>' ),
 54 |         body = '';
 55 | 
 56 |       // If the content has paragraphs, pull together
 57 |       // all the paragraphs.
 58 |       innerElement.find('p').each(function() {
 59 |         body += jQuery(this).text() + '\n\n';
 60 |       });
 61 | 
 62 |       // If the content is just text, grab the text.
 63 |       if (!body) {
 64 |         innerElement.find('*').each(function() {
 65 |           body += jQuery(this).text() + ' ';
 66 |         });
 67 |       }
 68 | 
 69 |       var article = {
 70 |         title: title,
 71 |         body: body
 72 |       };
 73 | 
 74 |       // If possible, grab a thumbnail image.
 75 |       var img = innerElement.find('img:first');
 76 |       if (img.length) {
 77 |         article.img = img.attr('src');
 78 |       }
 79 | 
 80 |       return [article];
 81 |     });
 82 | 
 83 |   articleDistiller.distill(this.feedURL, function(err, distilledPage) {
 84 |     if (!distilledPage || !distilledPage.articles.length) {
 85 |       callback(new Error('article not found.'));
 86 |     } else {
 87 |       callback(err, distilledPage.articles.pop());
 88 |     }
 89 |   });
 90 | };
 91 | 
 92 | // Use jDistiller to parse a eadline listing for this new source.
 93 | //
 94 | // [{title: 'More Cats Than Ever Using iPads', href: 'http://techcrunch.com/cats'}, ...]
 95 | //
 96 | FeedSource.prototype.listHeadlines = function(callback) {
 97 |   // headlineDistiller must be created in subclass.
 98 |   this.headlineDistiller.distill(this.feedURL, function(err, distilledPage) {
 99 |     callback(err, (distilledPage.headlines || []) );
100 |   });
101 | };
102 | 
103 | exports.FeedSource = FeedSource;
104 |   


--------------------------------------------------------------------------------
/bin/routers-news.js:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env node
  2 | 
  3 | var exec = require('child_process').exec,
  4 |   optimist = require('optimist'),
  5 |   Loader = require('../lib').Loader,
  6 |   Source = require('../lib').Source,
  7 |   Grabber = require('../lib').Grabber;
  8 | // Parse the command-line flags with optimist; every short flag has a long alias.
  9 | var argv = optimist
 10 |   .options('S', {
 11 |     alias: 'sources',
 12 |     describe: 'List news sources'
 13 |   })
 14 |   .options('s', {
 15 |     alias: 'source',
 16 |     describe: 'List articles from a source'
 17 |   })
 18 |   .options('a', {
 19 |     alias: 'article',
 20 |     describe: 'Select an article number'
 21 |   })
 22 |   .options('o', {
 23 |     alias: 'output',
 24 |     describe: 'Output format: default, json, url'
 25 |   })
 26 |   .options('O', {
 27 |     alias: 'open',
 28 |     describe: 'open the article in your browser'
 29 |   })
 30 |   .usage("Usage:\n\
 31 |     \trouters-news --sources\tlist the news sources available.\n\
 32 |     \trouters-news --source=[source]\tlist the headlines for a news source.\n\
 33 |     \trouters-news --source=[source] --article=[id]\tload an article by integer id.\n\
 34 |   ")
 35 |   .argv;
 36 | // Handler functions keyed by action name; all three are declared further down.
 37 | var actions = {
 38 |   sources: sources,
 39 |   articles: listArticles,
 40 |   article: showArticle
 41 | };
 42 | 
 43 | function sources () {
 44 |   var loader = new Loader(),
 45 |     categories = [{
 46 |       name: '\033[1;30mRouters News Sources:\033[m\n',
 47 |       data: loader.loadNewsSourcesHierarchical(),
 48 |       indent: ''
 49 |     }];
 50 | 
 51 |   while (categories.length) {
 52 |     var category = categories.pop();
 53 |     console.log(category.indent + category.name);
 54 |     Object.keys(category.data).forEach(function(key) {
 55 |       if (category.data[key] instanceof Source) {
 56 |         console.log(category.indent + '  ' + category.data[key].source + ': ' + category.data[key].description);
 57 |       } else {
 58 |         categories.push({
 59 |           name: '\033[32m' + key + ':\033[m',
 60 |           data: category.data[key],
 61 |           indent: category.indent + '  '
 62 |         });
 63 |       }
 64 |     });
 65 |   }
 66 | }
 67 | 
 68 | function listArticles (source) {
 69 |   var grabber = new Grabber();
 70 | 
 71 |   grabber.grabHeadlines(source, function(err, headlines) {
 72 |     if (err) {
 73 |       console.log(err.message);
 74 |       return;
 75 |     }
 76 | 
 77 |     if (argv.output === 'json') {
 78 |       console.log(JSON.stringify(headlines));
 79 |     } else {
 80 | 
 81 |       for (var i = 0, headline; (headline = headlines[i]) != null; i++) {
 82 |         console.log('[' + (i + 1) + ']\t' + headline.title);
 83 |         console.log('\t\033[32m' + headline.href + '\033[m\n');
 84 |       }
 85 | 
 86 |     }
 87 |   });
 88 | }
 89 | 
 90 | function showArticle (source, index) {
 91 |   var grabber = new Grabber();
 92 |   index = parseInt(index) - 1;
 93 | 
 94 |   grabber.grabHeadlines(source, function(err, headlines) {
 95 | 
 96 |     if (err) {
 97 |       console.log(err.message);
 98 |       return;
 99 |     }
100 | 
101 |     if (!headlines[index]) {
102 |       console.log('headline does not exist.');
103 |       return;
104 |     }
105 | 
106 |     grabber.grabArticle(source, headlines[index], function(err, article) {
107 | 
108 |       if (err) {
109 |         console.log(err.message);
110 |         return;
111 |       }
112 | 
113 |       article.href = headlines[index].href;
114 | 
115 |       // open the article using the CLI's default action.
116 |       if (argv.open) {
117 |         var command = 'open ' + article.href;
118 |         console.log(command);
119 |         exec(command);
120 |         return;
121 |       }
122 | 
123 |       if (argv.output === 'json') {
124 |         console.log(JSON.stringify(article));
125 |       } else if (argv.output === 'url') {
126 |         console.log(article.href);
127 |       } else {
128 |         console.log('\033[1;30m' + article.title + ':\033[m');
129 |         if (article.img) console.log('\n[' + article.img + ']');
130 |         console.log('\n' + article.body + '\n---------' );
131 |         console.log('\033[32m' + article.href + '\033[m\n');
132 |       }
133 | 
134 |     });
135 |   });
136 | }
137 | 
// Optional diagnostics: dump the parsed command-line arguments.
if (argv.debug) {
  console.log('argv', argv);
}

// Dispatch on the parsed flags. Precedence: --sources wins outright; an
// article number selects a single article; a bare --source lists headlines;
// with none of these the usage text is printed.
if (argv.sources) {
  // Display all the news sources Routers currently parses.
  actions.sources();
} else if (argv.article) {
  // Show one article from the chosen source.
  actions.article(argv.source, argv.article);
} else if (argv.source) {
  // List the headlines for a source.
  actions.articles(argv.source);
} else {
  console.log(optimist.help());
}


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Routers News
  2 | ------------------
  3 | 
  4 | Routers is a collection of web-crawlers for various popular technology news sources.
  5 | 
  6 | It exposes a command-line interface to these crawlers, allowing the discerning tech-news enthusiast to avoid leaving the comfort of their terminal.
  7 | 
  8 | It Currently Supports:
  9 | 
 10 | __Technology News Sources__
 11 | * Ars Technica
 12 | * Wired.com
 13 | 
 14 | __Major Technology Blogs__
 15 | 
 16 | * TechCrunch
 17 | * Mashable
 18 | * Gizmodo
 19 | * Fast Company
 20 | * FastCo.Labs
 21 | 
 22 | __Personal Technology Blogs__
 23 | 
 24 | * Codes From The Underground, my blog
 25 | 
 26 | __Mainstream News Sources__
 27 | 
 28 | * New York Times
 29 | * USA Today
 30 | * L.A. Times
 31 | 
 32 | __Other Random Stuff__
 33 | 
 34 | * Github
 35 | * The Oatmeal
 36 | * xkcd
 37 | 
 38 | _(this categorization is loose, please feel free to shuffle stuff around.)_
 39 | 
 40 | It's Also An Experiment
 41 | ------------
 42 | 
 43 | It is my hope that, by open-sourcing a collection of news scrapers, a community can be built around building a powerful set of real-time news aggregation tools.
 44 | 
 45 | 
 46 | Installation
 47 | ------------
 48 | 
 49 | ```bash
 50 | npm install routers-news -g
 51 | ```
 52 | 
 53 | Usage
 54 | -----
 55 | 
 56 | __Listing News Sources__
 57 | 
 58 | ```bash
 59 | routers-news --sources
 60 | ```
 61 | 
 62 | __Outputs__
 63 | 
 64 | ```bash
 65 | Routers News Sources:
 66 | 
 67 |   news:
 68 |     major:
 69 |       NewYorkTimes: The New York Times Bits blog.
 70 |       LATimes: The business and culture of our digital lives, from the L.A. Times.
 71 |       USAToday: Power up with breaking news on personal technology, electronics, gaming and computers.
 72 |     tech:
 73 |       Wired.com: Wired magazine is a monthly US technology publication.
 74 |       ArsTechnica: Ars Technica is a technology news site catering to PC enthusiasts.
 75 |       TechCrunch: A network of technology-oriented blogs and other web properties.
 76 |   other:
 77 |     Github: Trending and featured repos on Github.com
 78 | ```
 79 | 
 80 | __Displaying Headlines__
 81 | 
 82 | ```bash
 83 | routers-news --source=github
 84 | ```
 85 | 
 86 | __Outputs__
 87 | 
 88 | ```bash
 89 | [1] MacLemon / CongressChecklist
 90 |   https://github.com/MacLemon/CongressChecklist
 91 | 
 92 | [2] dejan / rails_panel
 93 |   https://github.com/dejan/rails_panel
 94 | 
 95 | [3] feross / md5-password-cracker.js
 96 |   https://github.com/feross/md5-password-cracker.js
 97 | 
 98 | [4] shadowsocks / shadowsocks-go
 99 |   https://github.com/shadowsocks/shadowsocks-go
100 | 
101 | [5] bcoe / routers-news
102 |   https://github.com/bcoe/routers-news
103 | 
104 | [6] andrew / 24pullrequests
105 |   https://github.com/andrew/24pullrequests
106 | 
107 | [7] nkohari / jwalk
108 |   https://github.com/nkohari/jwalk
109 | 
110 | [8] lockitron / selfstarter
111 |   https://github.com/lockitron/selfstarter
112 | 
113 | [9] twitter / bower
114 |   https://github.com/twitter/bower
115 | 
116 | [10]  Spaceman-Labs / SMPageControl
117 |   https://github.com/Spaceman-Labs/SMPageControl
118 | ```
119 | 
120 | __Loading Articles__
121 | 
122 | ```bash
123 | routers-news --source=github --article=5
124 | ```
125 | 
126 | __Outputs:__
127 | 
128 | ```bash
129 | bcoe / routers-news:
130 | 
131 | 
132 | A crawler for various popular tech news sources. Read technology news from the comfort of your CLI.
133 |       — Read more
134 | ---------
135 | https://github.com/bcoe/routers-news
136 | ```
137 | 
138 | The Crawlers
139 | ----------
140 | 
141 | The news crawlers used by Routers come in two varieties:
142 | 
143 | * Page scrapers which use CSS selectors to extract content from news sources.
144 | * RSS/Atom feed parsers, which crawl articles using an RSS or Atom news feed.
145 | 
146 | Examples of both can be found in the __lib/sources__ directory.
147 | 
148 | Contributing
149 | ----------
150 | 
151 | It's easy to add a new news source:
152 | 
153 | * fork the routers news repo.
154 | * clone it locally.
155 | * run __npm install__ to install the libraries locally.
156 | * create a new crawler in the __lib/sources__ directory (everything in this hierarchy is automatically loaded).
157 | * to test your crawler run: __node ./bin/routers-news.js__.
158 | 
159 | You can also help a ton by:
160 | 
161 | * reporting when crawlers are broken.
162 | * extending the crawlers; I'd love to have:
163 |   * Dates.
164 |   * Authors.
165 |   * Better image extraction.
166 | * improving on the CLI client.
167 | 
168 | Help make our dreams of a collaborative web-crawler a reality :)
169 | 
170 | Copyright
171 | ---------
172 | 
173 | Copyright (c) 2012 Benjamin Coe and Joshua Hull and Gabriel Silk. See LICENSE.txt for further details.


--------------------------------------------------------------------------------