├── .gitignore ├── .npmignore ├── .travis.yml ├── LICENSE ├── README.md ├── index.js ├── package.json └── test ├── fixtures ├── atom-invalid.xml ├── atom.xml ├── google-news.rss ├── rss.xml └── techcrunch.rss └── index.test.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sentientwaffle/feed-read/ce729de607d98b4460203a0b906c5785004865eb/.npmignore -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - 0.4 4 | - 0.6 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 [DJG](https://github.com/sentientwaffle) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files 5 | (the "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject 9 | to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included 12 | in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 15 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Feed-Read 2 | 3 | [![Build Status](https://secure.travis-ci.org/sentientwaffle/feed-read.png?branch=master)](http://travis-ci.org/sentientwaffle/feed-read) 4 | 5 | [Node.js](http://nodejs.org/) module for parsing RSS and ATOM feeds into 6 | a common article object. 7 | 8 | # Installation 9 | 10 | $ npm install feed-read 11 | 12 | # Usage 13 | 14 | var feed = require("feed-read"); 15 | 16 | ## `feed(url, callback)` 17 | Fetch a feed. `url` is a feed url String or an Array of urls. 18 | 19 | feed("http://craphound.com/?feed=rss2", function(err, articles) { 20 | if (err) throw err; 21 | // Each article has the following properties: 22 | // 23 | // * "title" - The article title (String). 24 | // * "author" - The author's name (String). 25 | // * "link" - The original article link (String). 26 | // * "content" - The HTML content of the article (String). 27 | // * "published" - The date that the article was published (Date). 28 | // * "feed" - {name, source, link} 29 | // 30 | }); 31 | 32 | ## `feed.rss(rss_string, callback)` 33 | Parse a string of XML as RSS. 34 | 35 | The callback receives `(err, articles)`. 36 | 37 | ## `feed.atom(atom_string, callback)` 38 | Parse a string of XML as ATOM. 39 | 40 | The callback receives `(err, articles)`. 41 | 42 | ## `feed.identify(xml_string)` // => "atom", "rss", or false 43 | Identify what type of feed the XML represents. 44 | 45 | Returns `false` when it is neither RSS nor ATOM. 46 | 47 | 48 | # License 49 | See LICENSE.
50 | 51 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | var request = require('request') 2 | , sax = require('sax') 3 | , _ = require('underscore'); 4 | 5 | 6 | // Public: Fetch the articles from the RSS or ATOM feed. 7 | // Given an Array, the feeds are fetched one at a time, in order, and their articles are concatenated; the first error is passed to the callback and stops the run. 8 | // url - The String feed url, or an Array of urls. 9 | // callback - Receives `(err, articles)`, where each article has properties: 10 | // 11 | // * "title" 12 | // * "author" 13 | // * "link" 14 | // * "content" 15 | // * "published" 16 | // * "feed" - {name, source, link} 17 | // 18 | // Returns nothing. 19 | var FeedRead = module.exports = function(feed_url, callback) { 20 | if (feed_url instanceof Array) { 21 | var feed_urls = feed_url 22 | , articles = []; 23 | var next = function(i) { 24 | var feed_url = feed_urls[i]; 25 | if (!feed_url) return callback(null, articles); // a falsy entry (including running off the end of the Array) finishes the run 26 | FeedRead.get(feed_url, function(err, _articles) { 27 | if (err) return callback(err); 28 | articles = articles.concat(_articles); 29 | next(i + 1); 30 | }); 31 | }; 32 | next(0); 33 | } else { 34 | FeedRead.get(feed_url, callback); 35 | } 36 | }; 37 | 38 | 39 | // Public: Check if the XML is RSS, ATOM, or neither. 40 | // 41 | // xml - A String of XML. 42 | // NOTE(review): in this listing the text between line 47 and line 71 is missing, so the rest of identify and whatever follows it up to line 71 are not shown. 43 | // Returns "atom", "rss", or false when it is neither. 44 | FeedRead.identify = function(xml) { 45 | if (/<(rss|rdf)\b/i.test(xml)) { 46 | return "rss"; 47 | } else if (/", res.statusCode)); 71 | } 72 | }); 73 | }; 74 | 75 | 76 | 77 | // Public: Parse the articles from some ATOM. 78 | // 79 | // xml - An XML String. 80 | // source - (optional) Value stored as `feed.source` on every article; "" is used when only (xml, callback) are passed. 81 | // callback - Receives `(err, articles)`. 82 | // 83 | // Returns nothing; the articles are delivered to the callback. 84 | FeedRead.atom = function(xml, source, callback) { 85 | if (!callback) return FeedRead.atom(xml, "", source); 86 | 87 | var parser = new FeedParser() 88 | , articles = [] 89 | // Info about the feed itself, not an article.
90 | , meta = {source: source} 91 | // The current article. 92 | , article 93 | // The author for when no author is specified for the post. 94 | , default_author; 95 | 96 | // Remember the entry element while the parser is inside it. 97 | parser.onopentag = function(tag) { 98 | if (tag.name == "entry") article = tag; 99 | }; 100 | // On close: store finished entries; outside an entry take the author as the feed-level default; the first link whose rel is not "self" sets the feed link (this branch does not check whether it is inside an entry); a title directly under the root element sets the feed name. 101 | parser.onclosetag = function(tagname, current_tag) { 102 | if (tagname == "entry") { 103 | articles.push(article); 104 | article = null; 105 | } else if (tagname == "author" && !article) { 106 | default_author = child_data(current_tag, "name"); 107 | } else if (tagname == "link" && current_tag.attributes.rel != "self") { 108 | meta.link || (meta.link = current_tag.attributes.href); 109 | } else if (tagname == "title" && !current_tag.parent.parent) { 110 | meta.name = current_tag.children[0]; 111 | } 112 | }; 113 | // Once parsing ends, turn each collected entry into an article object; entries without children are dropped. 114 | parser.onend = function() { 115 | callback(null, _.filter(_.map(articles, 116 | function(art) { 117 | if (!art.children.length) return false; 118 | var author = child_by_name(art, "author"); 119 | if (author) author = child_data(author, "name"); 120 | // NOTE(review): the link lookup below dereferences .attributes directly, so an entry without a link child would throw if child_by_name returns nothing for it — confirm. 121 | var obj = { 122 | title: child_data(art, "title") 123 | , content: scrub_html(child_data(art, "content")) 124 | , published: child_data(art, "published") 125 | || child_data(art, "updated") 126 | , author: author || default_author 127 | , link: child_by_name(art, "link").attributes.href 128 | , feed: meta 129 | }; 130 | if (obj.published) obj.published = new Date(obj.published); 131 | return obj; 132 | } 133 | ), function(art) { return !!art; })); 134 | }; 135 | 136 | parser.write(xml); 137 | }; 138 | 139 | 140 | // Public: Parse the articles from some RSS. 141 | // 142 | // xml - An XML String. 143 | // source - (optional) Value stored as `feed.source` on every article; "" is used when only (xml, callback) are passed. 144 | // callback - Receives `(err, articles)`. 145 | // 146 | // Returns nothing; the articles are delivered to the callback.
147 | FeedRead.rss = function(xml, source, callback) { 148 | if (!callback) return FeedRead.rss(xml, "", source); 149 | 150 | var parser = new FeedParser() 151 | , articles = [] 152 | // Info about the feed itself, not an article. 153 | , meta = {source: source} 154 | // The current article. 155 | , article; 156 | 157 | // Remember the item element while the parser is inside it. 158 | parser.onopentag = function(tag) { 159 | if (tag.name == "item") article = tag; 160 | }; 161 | // On close: store finished items; when the channel closes, read the feed link and title from it. 162 | parser.onclosetag = function(tagname, current_tag) { 163 | if (tagname == "item") { 164 | articles.push(article); 165 | article = null; 166 | } else if (tagname == "channel") { 167 | meta.link || (meta.link = child_data(current_tag, "link")); 168 | meta.name = child_data(current_tag, "title"); 169 | } 170 | }; 171 | // Once parsing ends, turn each collected item into an article object; items without children are dropped. 172 | parser.onend = function() { 173 | callback(null, _.filter(_.map(articles, 174 | function(art) { 175 | if (!art.children.length) return false; 176 | var obj = { 177 | title: child_data(art, "title") 178 | , content: scrub_html(child_data(art, "content:encoded")) 179 | || scrub_html(child_data(art, "description")) 180 | , published: child_data(art, "pubDate") 181 | , author: child_data(art, "author") 182 | || child_data(art, "dc:creator") 183 | , link: child_data(art, "link") 184 | , feed: meta 185 | }; 186 | if (obj.published) obj.published = new Date(obj.published); 187 | return obj; 188 | } 189 | ), function(art) { return !!art; })); 190 | }; 191 | 192 | parser.write(xml); 193 | }; 194 | 195 | // Wraps a sax parser and builds a parent/children tree out of the tags and text it reports. 196 | // Methods to override: 197 | // 198 | // * onopentag 199 | // * onclosetag 200 | // * onend 201 | // 202 | var FeedParser = (function() { 203 | // Internal: Create a sax parser and forward its events to this object. 204 | // 205 | // Tag events go through `open`/`close`; text and CDATA both go to `ontext`. 206 | // The sax `onend` event is forwarded to `this.onend`.
207 | // 208 | function FeedParser() { 209 | this.current_tag = null; 210 | var parser = this.parser = sax.parser(true, 211 | { trim: true 212 | , normalize: true 213 | }) 214 | , _this = this; 215 | // Route tag events through open/close so the tag tree is maintained. 216 | parser.onopentag = function(tag) { _this.open(tag); }; 217 | parser.onclosetag = function(tag) { _this.close(tag); }; 218 | // The error handler only clears the `error` property of its `this`; nothing is reported to the caller. Presumably this lets sax keep parsing malformed XML — confirm. 219 | parser.onerror = function() { this.error = undefined; } 220 | parser.ontext = function(text) { _this.ontext(text); }; 221 | parser.oncdata = function(text) { _this.ontext(text); }; 222 | parser.onend = function() { _this.onend(); }; 223 | } 224 | 225 | 226 | // Public: Feed the whole XML String to the sax parser, then close it. 227 | FeedParser.prototype.write = function(xml) { 228 | this.parser.write(xml).close(); 229 | }; 230 | 231 | // Internal: Open a tag: link it to its parent, give it an empty children list, make it the current tag, then call the `onopentag` hook. 232 | FeedParser.prototype.open = function(tag) { 233 | tag.parent = this.current_tag; 234 | tag.children = []; 235 | if (tag.parent) tag.parent.children.push(tag); 236 | this.current_tag = tag; 237 | this.onopentag(tag); 238 | }; 239 | 240 | // Internal: Close a tag: call the `onclosetag` hook with the current tag, then step back up to its parent and delete the closed tag's `parent` reference. A tag without a parent stays the current tag. 241 | FeedParser.prototype.close = function(tagname) { 242 | this.onclosetag(tagname, this.current_tag); 243 | if (this.current_tag && this.current_tag.parent) { 244 | var p = this.current_tag.parent; 245 | delete this.current_tag.parent; 246 | this.current_tag = p; 247 | } 248 | }; 249 | 250 | // Internal: Add the text as a child of the current tag. 251 | FeedParser.prototype.ontext = function(text) { 252 | if (this.current_tag) { 253 | this.current_tag.children.push(text); 254 | } 255 | }; 256 | 257 | return FeedParser; 258 | })(); 259 | 260 | 261 | // Internal: Remove