├── .travis.yml ├── .gitignore ├── package.json ├── LICENSE.txt ├── README.md ├── test └── test.js └── lib └── feed.js /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - 0.6 4 | - 0.8 5 | - 0.10 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | *.DS_Store 3 | 4 | *.log 5 | 6 | *.sublime-project 7 | *.sublime-workspace -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "blindparser", 3 | "version": "0.1.1", 4 | "description": "blindparser is an all purpose RSS/ATOM feed parser that parses feeds into a common format so that you do not have to care if they are RSS or ATOM feeds.", 5 | "keywords": [ 6 | "rss", 7 | "atom", 8 | "feed", 9 | "parser" 10 | ], 11 | "repository": 12 | { 13 | "type": "git", 14 | "url": "http://github.com/dropdownmenu/node-blindparser.git" 15 | } 16 | , 17 | "dependencies": { 18 | "xml2js": "0.2.3", 19 | "request":"2.12.0", 20 | "underscore":"*" 21 | }, 22 | "devDependencies":{ 23 | "vows":"*" 24 | }, 25 | "scripts":{ 26 | "test":"vows --spec" 27 | }, 28 | "main": "./lib/feed.js" 29 | } 30 | 31 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 Tim McGowan (dropdownmenu) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to 
whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | node-blindparser 2 | ---------------- 3 | [![build status](https://secure.travis-ci.org/dropdownmenu/node-blindparser.png)](http://travis-ci.org/dropdownmenu/node-blindparser) 4 | 5 | blindparser is a RSS/ATOM feed parser that returns the requested feed urls in a json object that is formatted so that you will not have to worry (much) about the format of the requested feed. 6 | 7 | Motivation 8 | ---------- 9 | 10 | RSS and ATOM feeds are both trying to deliver similar content, but are different enough with their structure to be aggravating. The purpose of blindparser is to allow for the important parts of the feeds (article titles, links, etc) to be returned in a standard format, but to also return the rest of the feed in a reasonable way. 11 | 12 | Installing 13 | ---------- 14 | 15 | Like all node.js modules, just use npm! 
16 | 17 | ``` 18 | npm install blindparser 19 | ``` 20 | 21 | Usage 22 | ----- 23 | 24 | Using blindparser is easy, just call: 25 | 26 | ``` 27 | var parser = require('blindparser'); 28 | 29 | // with no options 30 | parser.parseURL('http://rss.cnn.com/rss/cnn_topstories.rss', function(err, out){ 31 | console.log(out); 32 | }); 33 | 34 | var options = { 35 | followRedirect: false, 36 | timeout: 1000 37 | }; 38 | //rss feeds 39 | parser.parseURL('http://rss.cnn.com/rss/cnn_topstories.rss', options, function(err, out){ 40 | console.log(out); 41 | }); 42 | //atom feeds 43 | parser.parseURL('http://www.blogger.com/feeds/10861780/posts/default', options, function(err, out){ 44 | console.log(out); 45 | }); 46 | ``` 47 | 48 | Options 49 | ------- 50 | 51 | The options hash is passed through to [request](https://github.com/mikeal/request) for fetching a given url. 52 | 53 | Output 54 | ------ 55 | 56 | The point of `blindparser` is to try and hide the format of the originally requested feed. Thus RSS and ATOM feeds are returned in a common format. Similar fields (pubDate vs updated) will be mapped to the same field in the output. 57 | 58 | The 'minimal' output format is: 59 | 60 | ``` 61 | { 62 | type:"rss" or "atom" 63 | metadata:{ 64 | title: Title of the feed 65 | desc: description or subtitle 66 | url: url of the feed 67 | update: pubDate or update time of the feed 68 | }, 69 | items:[ 70 | { 71 | title: Title of article 72 | desc: Description or content of article 73 | link: Link to article 74 | date: Time article was published 75 | }... 76 | ] 77 | ``` 78 | 79 | Tests 80 | ----- 81 | 82 | Tests for blindparser can be run using the command: 83 | 84 | ``` 85 | npm test 86 | ``` 87 | 88 | Make sure that your machine has an internet connection before running the 89 | tests. 
90 | 91 | License 92 | ------- 93 | Copyright (c) 2013 Tim McGowan (dropdownmenu) 94 | 95 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 96 | 97 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 98 | 99 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 100 | -------------------------------------------------------------------------------- /test/test.js: -------------------------------------------------------------------------------- 1 | // string test.js 2 | // why does this not work? 
3 | 4 | var vows = require('vows'); 5 | var assert = require('assert'); 6 | 7 | var parser = require('../lib/feed.js'); 8 | 9 | vows.describe('bindparser').addBatch({ 10 | 11 | 'rss tests':{ 12 | topic:function(){ 13 | parser.parseURL('http://rss.cnn.com/rss/cnn_topstories.rss', {}, this.callback); 14 | }, 15 | 'response is not null':function(err, docs){ 16 | assert.isNull(err); 17 | assert.isNotNull(docs); 18 | }, 19 | 'response is properly formatted':function(err, docs){ 20 | assert.equal(docs.type, 'rss'); 21 | assert.isObject(docs.metadata); 22 | assert.isArray(docs.items); 23 | } 24 | }, 25 | 'atom tests':{ 26 | topic:function(){ 27 | parser.parseURL('http://www.blogger.com/feeds/10861780/posts/default', {}, this.callback); 28 | }, 29 | 'response is not null':function(err, docs){ 30 | assert.isNull(err); 31 | assert.isNotNull(docs); 32 | }, 33 | 'response is properly formatted':function(err, docs){ 34 | assert.equal(docs.type, 'atom'); 35 | assert.isObject(docs.metadata); 36 | assert.isArray(docs.items); 37 | }, 38 | 'response contains items': function (err, docs) { 39 | assert.isArray(docs.items); 40 | assert.ok(docs.items.length > 0); 41 | }, 42 | }, 43 | 'feedburner tests': { 44 | topic: function() { 45 | parser.parseURL('http://feeds.feedburner.com/TechCrunch', this.callback); 46 | }, 47 | 'response is not null':function(err, docs){ 48 | assert.isNull(err); 49 | assert.isNotNull(docs); 50 | }, 51 | 'response is formatted as rss':function(err, docs){ 52 | assert.equal(docs.type, 'rss'); 53 | assert.isObject(docs.metadata); 54 | assert.isArray(docs.items); 55 | }, 56 | 'response contains items':function(err, docs) { 57 | assert.isArray(docs.items); 58 | assert.ok(docs.items.length > 0); 59 | }, 60 | 'response contains images':function(err, docs) { 61 | assert.ok(docs.metadata.image); 62 | docs.items.forEach(function(item){ 63 | assert.ok(item.media.thumbnail); 64 | }); 65 | } 66 | }, 67 | 'oddities':{ 68 | 'empty xml':{ 69 | topic:function(){ 70 | 
parser.parseString('', {}, this.callback); 71 | }, 72 | 'returns an error && docs is null':function(err, docs){ 73 | assert.isNotNull(err); 74 | assert.isNull(docs); 75 | } 76 | } 77 | }, 78 | 'craigslist':{ 79 | topic: function () { 80 | parser.parseURL('http://portland.craigslist.org/sof/index.rss', this.callback); 81 | }, 82 | 'response is formatted as rss': function (err, docs) { 83 | assert.equal(docs.type, 'rss'); 84 | assert.isObject(docs.metadata); 85 | assert.isArray(docs.items); 86 | }, 87 | 'response contains items': function (err, docs) { 88 | assert.isArray(docs.items); 89 | assert.ok(docs.items.length > 0); 90 | }, 91 | 'response items have titles': function (err, docs) { 92 | assert.isArray(docs.items); 93 | assert.ok(docs.items.length > 0); 94 | assert.isNotNull(docs.items[0].title); 95 | }, 96 | 'response items have links': function (err, docs) { 97 | assert.isArray(docs.items); 98 | assert.ok(docs.items.length > 0); 99 | assert.isNotNull(docs.items[0].link); 100 | }, 101 | 'response items have desc': function (err, docs) { 102 | assert.isArray(docs.items); 103 | assert.ok(docs.items.length > 0); 104 | assert.isNotNull(docs.items[0].desc); 105 | }, 106 | 'response items have date': function (err, docs) { 107 | assert.isArray(docs.items); 108 | assert.ok(docs.items.length > 0); 109 | assert.isNotNull(docs.items[0].date); 110 | } 111 | } 112 | }).export(module); 113 | -------------------------------------------------------------------------------- /lib/feed.js: -------------------------------------------------------------------------------- 1 | //feed.js 2 | var xml2js = require('xml2js'); 3 | var _ = require('underscore'); 4 | var request = require('request'); 5 | var URL = require('url'); 6 | 7 | /** 8 | All you need to do is send a feed URL that can be opened via fs 9 | Options are optional, see xml2js for extensive list 10 | And a callback of course 11 | 12 | The returned formats will be structually the same, but you should still check the 
'format' property 13 | **/ 14 | function parseURL(feedURL, options, callback) { 15 | if (typeof options == 'function' && !callback) { 16 | callback = options; 17 | options = {}; 18 | } 19 | var defaults = {uri: feedURL, jar: false, proxy: false, followRedirect: true, timeout: 1000 * 30}; 20 | options = _.extend(defaults, options); 21 | //check that the protocall is either http or https 22 | var u = URL.parse(feedURL); 23 | if (u.protocol == 'http:' || u.protocol == 'https:') { 24 | //make sure to have a 30 second timeout 25 | var req = request(options, function(err, response, xml) { 26 | if (err || xml == null) { 27 | if (err) { 28 | callback(err, null); 29 | }else { 30 | callback('failed to retrive source', null); 31 | } 32 | }else { 33 | parseString(xml, options, callback); 34 | } 35 | }); 36 | }else { 37 | callback({error: 'Only http or https protocalls are accepted'}, null); 38 | } 39 | } 40 | module.exports.parseURL = parseURL; 41 | 42 | function parseString(xml, options, callback) { 43 | // we need to check that the input in not a null input 44 | if (xml.split('<').length >= 3) { 45 | var parser = new xml2js.Parser({trim: false, normalize: true, mergeAttrs: true}); 46 | parser.addListener('end', function(jsonDOM) { 47 | if (jsonDOM) { 48 | //console.log(jsonDOM.rss.channel[0]); 49 | jsonDOM = normalize(jsonDOM); 50 | var err, output; 51 | if (isRSS(jsonDOM)) { 52 | output = formatRSS(jsonDOM); 53 | }else { 54 | output = formatATOM(jsonDOM); 55 | } 56 | callback(null, output); 57 | }else { 58 | callback('failed to parse xml', null); 59 | } 60 | }); 61 | parser.addListener('error', function(err) { 62 | callback(err, null); 63 | }); 64 | parser.parseString(xml); 65 | }else { 66 | callback('malformed xml', null); 67 | } 68 | } 69 | module.exports.parseString = parseString; 70 | 71 | //detects if RSS, otherwise assume atom 72 | function isRSS(json) { 73 | return (json.channel != null); 74 | } 75 | 76 | // normalizes input to make feed burner work 77 | function 
normalize(json) { 78 | if (json.rss || json['rdf:RDF']) { 79 | return json.rss || json['rdf:RDF']; 80 | } 81 | return json; 82 | } 83 | 84 | //xml2js will return commented material in a # tag which can be a pain 85 | //this will remove the # tag and set its child text in it's place 86 | //ment to work on a feed item, so will iterate over json's and check 87 | function flattenComments(json) { 88 | for (key in json) { 89 | if (json[key]['#']) { 90 | json[key] = json[key]['#']; 91 | } 92 | } 93 | return json; 94 | } 95 | 96 | //formats the RSS feed to the needed outpu 97 | //also parses FeedBurner 98 | function formatRSS(json) { 99 | var output = {'type': 'rss', metadata: {}, items: []}; 100 | //Start with the metadata for the feed 101 | var metadata = {}; 102 | var channel = json.channel; 103 | 104 | if (_.isArray(json.channel)) { 105 | channel = json.channel[0]; 106 | } 107 | 108 | var items = json.item || channel.item; 109 | 110 | if (channel.title) { 111 | metadata.title = channel.title; 112 | } 113 | if (channel.description) { 114 | metadata.desc = channel.description; 115 | } 116 | if (channel.link) { 117 | metadata.url = channel.link; 118 | } 119 | if (channel.lastBuildDate) { 120 | metadata.lastBuildDate = channel.lastBuildDate; 121 | } 122 | if (channel.pubDate) { 123 | metadata.update = channel.pubDate; 124 | } 125 | if (channel.ttl) { 126 | metadata.ttl = channel.ttl; 127 | } 128 | if (channel.image) { 129 | 130 | metadata.image = []; 131 | 132 | channel.image.forEach(function(image, index) { 133 | metadata.image[index] = {}; 134 | Object.keys(image).forEach(function(attr) { 135 | metadata.image[index][attr] = image[attr]; 136 | }); 137 | }); 138 | 139 | } 140 | 141 | output.metadata = metadata; 142 | 143 | //ok, now lets get into the meat of the feed 144 | //just double check that it exists 145 | if (items) { 146 | if (!_.isArray(items)) { 147 | items = [items]; 148 | } 149 | _.each(items, function(val, index) { 150 | val = flattenComments(val); 151 | var 
obj = {}; 152 | obj.title = val.title; 153 | obj.desc = val.description; 154 | obj.link = val.link; 155 | if (val.category) { 156 | obj.category = val.category; 157 | } 158 | //since we are going to format the date, we want to make sure it exists 159 | if (val.pubDate || val['dc:date']) { 160 | //lets try basis js date parsing for now 161 | obj.date = Date.parse(val.pubDate || val['dc:date']); 162 | } 163 | //now lets handel the GUID 164 | if (val.guid) { 165 | //xml2js parses this kina odd... 166 | var link = val.guid; 167 | // var param = val.guid['@']; 168 | var isPermaLink = true; 169 | //if(param){ 170 | // isPermaLink = param.isPermaLink; 171 | //} 172 | obj.guid = {'link': link, isPermaLink: isPermaLink}; 173 | } 174 | //Check for images 175 | if (val['media:content']) { 176 | obj.media = val.media || {}; 177 | obj.media.content = val['media:content']; 178 | } 179 | if (val['media:thumbnail']) { 180 | obj.media = val.media || {}; 181 | obj.media.thumbnail = val['media:content']; 182 | } 183 | 184 | if(val['content:encoded']) { 185 | obj.content = val['content:encoded']; 186 | } 187 | //now push the obj onto the stack 188 | output.items.push(obj); 189 | }); 190 | } 191 | return output; 192 | } 193 | 194 | //formats the ATOM feed to the needed output 195 | //yes, this is a shamless copy-pasta of the RSS code (its all the same structure!) 
/**
 * Converts a parsed ATOM document into the common output format.
 *
 * @param {Object} json - Parsed document; the feed is read from `json.feed`
 *     when that member exists, otherwise from `json` itself.
 * @returns {Object} `{type: 'atom', metadata: {...}, items: [...]}` where each
 *     item carries `id`, `title`, `category` (array) and `link`, plus `desc`,
 *     `date`, `updated`, `media` and `content` when the entry provides them.
 */
function formatATOM(json) {
  var output = {'type': 'atom', metadata: {}, items: []};
  //Start with the metadata for the feed
  var metadata = {};
  var channel = json.feed || json;
  if (channel.title) {
    metadata.title = channel.title;
  }
  if (channel.subtitle) {
    metadata.desc = channel.subtitle;
  }
  if (channel.link) {
    metadata.url = channel.link;
  }
  if (channel.id) {
    metadata.id = channel.id;
  }
  // the ATOM element is called `updated`; `update` is still honoured for
  // callers that relied on the previous lookup
  var feedUpdated = channel.updated || channel.update;
  if (feedUpdated) {
    metadata.update = feedUpdated;
  }
  if (channel.author) {
    metadata.author = channel.author;
  }

  output.metadata = metadata;

  //just double check that it exists and that it is an array
  var entries = channel.entry;
  if (entries) {
    if (!Array.isArray(entries)) {
      // wrap locally instead of rewriting the caller's document
      entries = [entries];
    }
    entries.forEach(function(entry) {
      entry = flattenComments(entry);
      var obj = {};
      obj.id = entry.id;
      obj.title = entry.title;
      // full content wins over the summary
      if (entry.content) {
        obj.desc = entry.content;
      } else if (entry.summary) {
        obj.desc = entry.summary;
      }

      //just grab the category text
      var categories = [];
      if (entry.category) {
        if (Array.isArray(entry.category)) {
          entry.category.forEach(function(category) {
            categories.push(category['term']);
          });
        } else {
          categories.push(entry.category);
        }
      }
      obj.category = categories;

      //just get the alternate link (the last one wins when there are several)
      var link = '';
      if (entry.link) {
        if (Array.isArray(entry.link)) {
          entry.link.forEach(function(candidate) {
            // loose equality is deliberate: it also matches a one-element
            // array such as ['alternate']
            if (candidate.rel == 'alternate') {
              link = candidate.href;
            }
          });
        } else {
          link = entry.link.href;
        }
      }
      obj.link = link;

      //since we are going to format the date, we want to make sure it exists
      if (entry.published) {
        //lets try basic js date parsing for now
        obj.date = Date.parse(entry.published);
      }
      if (entry.updated) {
        obj.updated = Date.parse(entry.updated);
      }

      //grab thumbnail and media content if they exist; both share one object
      if (entry['media:thumbnail']) {
        obj.media = obj.media || entry.media || {};
        obj.media.thumbnail = entry['media:thumbnail'];
      }
      if (entry['media:content']) {
        obj.media = obj.media || entry.media || {};
        obj.media.content = entry['media:content'];
      }

      if (entry['content:encoded']) {
        obj.content = entry['content:encoded'];
      }

      //now push the obj onto the stack
      output.items.push(obj);
    });
  }
  return output;
}