├── .travis.yml
├── .gitignore
├── package.json
├── LICENSE.txt
├── README.md
├── test
    └── test.js
└── lib
    └── feed.js


/.travis.yml:
--------------------------------------------------------------------------------
language: node_js
# Versions are quoted so YAML keeps them as strings; a bare 0.10 is a float
# and would be read as 0.1.
node_js:
  - "0.6"
  - "0.8"
  - "0.10"


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | *.DS_Store
3 | 
4 | *.log
5 | 
6 | *.sublime-project
7 | *.sublime-workspace


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "blindparser",
 3 |     "version": "0.1.1",
 4 |     "description": "blindparser is an all purpose RSS/ATOM feed parser that parses feeds into a common format so that you do not have to care if they are RSS or ATOM feeds.",
 5 |     "keywords": [
 6 |         "rss",
 7 |         "atom",
 8 |         "feed",
 9 |         "parser"
10 |     ],
11 |     "repository":
12 |         {
13 |             "type": "git",
14 |             "url": "http://github.com/dropdownmenu/node-blindparser.git"
15 |         }
16 |     ,
17 |     "dependencies": {
18 |       "xml2js": "0.2.3",
19 |       "request":"2.12.0",
20 |       "underscore":"*"
21 |     },
22 |     "devDependencies":{
23 |       "vows":"*"
24 |     },
25 |     "scripts":{
26 |       "test":"vows --spec"
27 |     },
28 |     "main": "./lib/feed.js"
29 | }
30 | 
31 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012 Tim McGowan (dropdownmenu)
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | node-blindparser
  2 | ----------------
  3 | [![build status](https://secure.travis-ci.org/dropdownmenu/node-blindparser.png)](http://travis-ci.org/dropdownmenu/node-blindparser)
  4 | 
  5 | blindparser is an RSS/ATOM feed parser that returns the requested feed URL's content as a JSON object, formatted so that you will not have to worry (much) about the format of the requested feed.
  6 | 
  7 | Motivation
  8 | ----------
  9 | 
 10 | RSS and ATOM feeds are both trying to deliver similar content, but are different enough with their structure to be aggravating. The purpose of blindparser is to allow for the important parts of the feeds (article titles, links, etc) to be returned in a standard format, but to also return the rest of the feed in a reasonable way.
 11 | 
 12 | Installing
 13 | ----------
 14 | 
 15 | Like all node.js modules, just use npm!
 16 | 
 17 | ```
 18 | npm install blindparser
 19 | ```
 20 | 
 21 | Usage
 22 | -----
 23 | 
 24 | Using blind parser is easy, just call:
 25 | 
 26 | ```
 27 | var parser = require('blindparser');
 28 | 
 29 | // with no options
 30 | parser.parseURL('http://rss.cnn.com/rss/cnn_topstories.rss', function(err, out){
 31 | 	console.log(out);
 32 | });
 33 | 
 34 | var options = {
 35 | 	followRedirect: false,
 36 | 	timeout: 1000
 37 | };
 38 | //rss feeds
 39 | parser.parseURL('http://rss.cnn.com/rss/cnn_topstories.rss', options, function(err, out){
 40 | 	console.log(out);
 41 | });
 42 | //atom feeds
 43 | parser.parseURL('http://www.blogger.com/feeds/10861780/posts/default', options, function(err, out){
 44 | 	console.log(out);
 45 | });
 46 | ```
 47 | 
 48 | Options
 49 | -------
 50 | 
 51 | The options hash is passed through to [request](https://github.com/mikeal/request) for fetching a given url.
 52 | 
 53 | Output
 54 | ------
 55 | 
 56 | The point of `blindparser` is to try and hide the format of the originally requested feed. Thus RSS and ATOM feeds are returned in a common format. Similar fields (pubDate vs update) will be mapped to the same field in the output.
 57 | 
 58 | The 'minimal' output format is:
 59 | 
 60 | ```
 61 | {
 62 | 	type:"rss" or "atom"
 63 | 	metadata:{
 64 | 		title: Title of the feed
 65 | 		desc: description or subtitle
 66 | 		url: url of the feed
 67 | 		update: pubDate or update time of the feed
 68 | 	},
 69 | 	items:[
 70 | 		{
 71 | 			title: Title of article
 72 | 			desc:	Description or content of article
 73 | 			link: Link to article
 74 | 			date: Time article was published
 75 | 		}...
 76 | 	]
 77 | ```
 78 | 
 79 | Tests
 80 | -----
 81 | 
 82 | Tests for blindparser can be run using the command:
 83 | 
 84 | ```
 85 | npm test
 86 | ```
 87 | 
 88 | Make sure that your machine has an internet connection before running the
 89 | tests.
 90 | 
 91 | License
 92 | -------
 93 | Copyright (c) 2013 Tim McGowan (dropdownmenu)
 94 | 
 95 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 96 | 
 97 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 98 | 
 99 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
100 | 


--------------------------------------------------------------------------------
/test/test.js:
--------------------------------------------------------------------------------
  1 | // string test.js
  2 | // why does this not work?
  3 | 
  4 | var vows = require('vows');
  5 | var assert = require('assert');
  6 | 
  7 | var parser = require('../lib/feed.js');
  8 | 
  9 | vows.describe('bindparser').addBatch({
 10 | 
 11 |   'rss tests':{
 12 |     topic:function(){
 13 |       parser.parseURL('http://rss.cnn.com/rss/cnn_topstories.rss', {}, this.callback);
 14 |     },
 15 |     'response is not null':function(err, docs){
 16 |       assert.isNull(err);
 17 |       assert.isNotNull(docs);
 18 |     },
 19 |     'response is properly formatted':function(err, docs){
 20 |       assert.equal(docs.type, 'rss');
 21 |       assert.isObject(docs.metadata);
 22 |       assert.isArray(docs.items);
 23 |     }
 24 |   },
 25 |   'atom tests':{
 26 |     topic:function(){
 27 |       parser.parseURL('http://www.blogger.com/feeds/10861780/posts/default', {}, this.callback);
 28 |     },
 29 |     'response is not null':function(err, docs){
 30 |       assert.isNull(err);
 31 |       assert.isNotNull(docs);
 32 |     },
 33 |     'response is properly formatted':function(err, docs){
 34 |       assert.equal(docs.type, 'atom');
 35 |       assert.isObject(docs.metadata);
 36 |       assert.isArray(docs.items);
 37 |     },
 38 |     'response contains items': function (err, docs) {
 39 |       assert.isArray(docs.items);
 40 |       assert.ok(docs.items.length > 0);
 41 |     },
 42 |   },
 43 |   'feedburner tests': {
 44 |     topic: function() {
 45 |       parser.parseURL('http://feeds.feedburner.com/TechCrunch', this.callback);
 46 |     },
 47 |     'response is not null':function(err, docs){
 48 |       assert.isNull(err);
 49 |       assert.isNotNull(docs);
 50 |     },
 51 |     'response is formatted as rss':function(err, docs){
 52 |       assert.equal(docs.type, 'rss');
 53 |       assert.isObject(docs.metadata);
 54 |       assert.isArray(docs.items);
 55 |     },
 56 |     'response contains items':function(err, docs) {
 57 |       assert.isArray(docs.items);
 58 |       assert.ok(docs.items.length > 0);
 59 |     },
 60 |     'response contains images':function(err, docs) {
 61 |       assert.ok(docs.metadata.image);
 62 |       docs.items.forEach(function(item){
 63 |         assert.ok(item.media.thumbnail);
 64 |       });
 65 |     }
 66 |   },
 67 |   'oddities':{
 68 |     'empty xml':{
 69 |       topic:function(){
 70 |         parser.parseString('<?xml version="1.0" ecoding="UTF-8"?>', {}, this.callback);
 71 |       },
 72 |       'returns an error && docs is null':function(err, docs){
 73 |         assert.isNotNull(err);
 74 |         assert.isNull(docs);
 75 |       }
 76 |     }
 77 |   },
 78 |   'craigslist':{
 79 |     topic: function () {
 80 |       parser.parseURL('http://portland.craigslist.org/sof/index.rss', this.callback);
 81 |     },
 82 |     'response is formatted as rss': function (err, docs) {
 83 |       assert.equal(docs.type, 'rss');
 84 |       assert.isObject(docs.metadata);
 85 |       assert.isArray(docs.items);
 86 |     },
 87 |     'response contains items': function (err, docs) {
 88 |       assert.isArray(docs.items);
 89 |       assert.ok(docs.items.length > 0);
 90 |     },
 91 |     'response items have titles': function (err, docs) {
 92 |       assert.isArray(docs.items);
 93 |       assert.ok(docs.items.length > 0);
 94 |       assert.isNotNull(docs.items[0].title);
 95 |     },
 96 |     'response items have links': function (err, docs) {
 97 |       assert.isArray(docs.items);
 98 |       assert.ok(docs.items.length > 0);
 99 |       assert.isNotNull(docs.items[0].link);
100 |     },
101 |     'response items have desc': function (err, docs) {
102 |       assert.isArray(docs.items);
103 |       assert.ok(docs.items.length > 0);
104 |       assert.isNotNull(docs.items[0].desc);
105 |     },
106 |     'response items have date': function (err, docs) {
107 |       assert.isArray(docs.items);
108 |       assert.ok(docs.items.length > 0);
109 |       assert.isNotNull(docs.items[0].date);
110 |     }
111 |   }
112 | }).export(module);
113 | 


--------------------------------------------------------------------------------
/lib/feed.js:
--------------------------------------------------------------------------------
  1 | //feed.js
  2 | var xml2js = require('xml2js');
  3 | var _ = require('underscore');
  4 | var request = require('request');
  5 | var URL = require('url');
  6 | 
  7 | /**
  8 | All you need to do is send a feed URL that can be opened via fs
  9 | Options are optional, see xml2js for extensive list
 10 | And a callback of course
 11 | 
 12 | The returned formats will be structually the same, but you should still check the 'format' property
 13 | **/
 14 | function parseURL(feedURL, options, callback) {
 15 |   if (typeof options == 'function' && !callback) {
 16 |     callback = options;
 17 |     options = {};
 18 |   }
 19 |   var defaults = {uri: feedURL, jar: false, proxy: false, followRedirect: true, timeout: 1000 * 30};
 20 |   options = _.extend(defaults, options);
 21 |   //check that the protocall is either http or https
 22 |   var u = URL.parse(feedURL);
 23 |   if (u.protocol == 'http:' || u.protocol == 'https:') {
 24 |     //make sure to have a 30 second timeout
 25 |     var req = request(options, function(err, response, xml) {
 26 |       if (err || xml == null) {
 27 |         if (err) {
 28 |           callback(err, null);
 29 |         }else {
 30 |           callback('failed to retrive source', null);
 31 |         }
 32 |       }else {
 33 |         parseString(xml, options, callback);
 34 |       }
 35 |     });
 36 |   }else {
 37 |     callback({error: 'Only http or https protocalls are accepted'}, null);
 38 |   }
 39 | }
 40 | module.exports.parseURL = parseURL;
 41 | 
 42 | function parseString(xml, options, callback) {
 43 |   // we need to check that the input in not a null input
 44 |   if (xml.split('<').length >= 3) {
 45 |     var parser = new xml2js.Parser({trim: false, normalize: true, mergeAttrs: true});
 46 |     parser.addListener('end', function(jsonDOM) {
 47 |       if (jsonDOM) {
 48 |         //console.log(jsonDOM.rss.channel[0]);
 49 |         jsonDOM = normalize(jsonDOM);
 50 |         var err, output;
 51 |         if (isRSS(jsonDOM)) {
 52 |           output = formatRSS(jsonDOM);
 53 |         }else {
 54 |           output = formatATOM(jsonDOM);
 55 |         }
 56 |         callback(null, output);
 57 |       }else {
 58 |         callback('failed to parse xml', null);
 59 |       }
 60 |     });
 61 |     parser.addListener('error', function(err) {
 62 |       callback(err, null);
 63 |     });
 64 |     parser.parseString(xml);
 65 |   }else {
 66 |     callback('malformed xml', null);
 67 |   }
 68 | }
 69 | module.exports.parseString = parseString;
 70 | 
 71 | //detects if RSS, otherwise assume atom
 72 | function isRSS(json) {
 73 | 	return (json.channel != null);
 74 | }
 75 | 
 76 | // normalizes input to make feed burner work
 77 | function normalize(json) {
 78 |   if (json.rss || json['rdf:RDF']) {
 79 |     return json.rss || json['rdf:RDF'];
 80 |   }
 81 |   return json;
 82 | }
 83 | 
 84 | //xml2js will return commented material in a # tag which can be a pain
 85 | //this will remove the # tag and set its child text in it's place
 86 | //ment to work on a feed item, so will iterate over json's and check
 87 | function flattenComments(json) {
 88 | 	for (key in json) {
 89 | 		if (json[key]['#']) {
 90 | 			json[key] = json[key]['#'];
 91 | 		}
 92 | 	}
 93 | 	return json;
 94 | }
 95 | 
 96 | //formats the RSS feed to the needed outpu
 97 | //also parses FeedBurner
 98 | function formatRSS(json) {
 99 | 	var output = {'type': 'rss', metadata: {}, items: []};
100 | 	//Start with the metadata for the feed
101 | 	var metadata = {};
102 | 	var channel = json.channel;
103 | 
104 |   if (_.isArray(json.channel)) {
105 |     channel = json.channel[0];
106 |   }
107 | 
108 |   var items = json.item || channel.item;
109 | 
110 | 	if (channel.title) {
111 | 		metadata.title = channel.title;
112 | 	}
113 | 	if (channel.description) {
114 | 		metadata.desc = channel.description;
115 | 	}
116 | 	if (channel.link) {
117 | 		metadata.url = channel.link;
118 | 	}
119 | 	if (channel.lastBuildDate) {
120 | 		metadata.lastBuildDate = channel.lastBuildDate;
121 | 	}
122 | 	if (channel.pubDate) {
123 | 		metadata.update = channel.pubDate;
124 | 	}
125 | 	if (channel.ttl) {
126 | 		metadata.ttl = channel.ttl;
127 | 	}
128 |         if (channel.image) {
129 | 
130 |           metadata.image = [];
131 | 
132 |           channel.image.forEach(function(image, index) {
133 |             metadata.image[index] = {};
134 |             Object.keys(image).forEach(function(attr) {
135 |               metadata.image[index][attr] = image[attr];
136 |             });
137 |           });
138 | 
139 |         }
140 | 
141 | 	output.metadata = metadata;
142 | 
143 | 	//ok, now lets get into the meat of the feed
144 | 	//just double check that it exists
145 | 	if (items) {
146 | 		if (!_.isArray(items)) {
147 | 			items = [items];
148 | 		}
149 | 		_.each(items, function(val, index) {
150 | 			val = flattenComments(val);
151 | 			var obj = {};
152 | 			obj.title = val.title;
153 | 			obj.desc = val.description;
154 | 			obj.link = val.link;
155 | 			if (val.category) {
156 | 				obj.category = val.category;
157 | 			}
158 | 			//since we are going to format the date, we want to make sure it exists
159 | 			if (val.pubDate || val['dc:date']) {
160 | 				//lets try basis js date parsing for now
161 | 				obj.date = Date.parse(val.pubDate || val['dc:date']);
162 | 			}
163 | 			//now lets handel the GUID
164 | 			if (val.guid) {
165 | 				//xml2js parses this kina odd...
166 | 				var link = val.guid;
167 | //					var param = val.guid['@'];
168 | 				var isPermaLink = true;
169 | 				//if(param){
170 | 				//	isPermaLink = param.isPermaLink;
171 | 				//}
172 | 				obj.guid = {'link': link, isPermaLink: isPermaLink};
173 | 			}
174 |             //Check for images
175 |             if (val['media:content']) {
176 |                 obj.media = val.media || {};
177 |                 obj.media.content = val['media:content'];
178 |             }
179 |             if (val['media:thumbnail']) {
180 |                 obj.media = val.media || {};
181 |                 obj.media.thumbnail = val['media:content'];
182 |             }
183 | 
184 |             if(val['content:encoded']) {
185 |                 obj.content = val['content:encoded'];
186 |             }
187 | 			//now push the obj onto the stack
188 | 			output.items.push(obj);
189 | 		});
190 | 	}
191 | 	return output;
192 | }
193 | 
// Formats an ATOM document into the common output structure:
// {type: 'atom', metadata: {...}, items: [...]}.
//
// json - the parsed document; the feed is read from `json.feed` when present,
//        otherwise from `json` itself. The input is not modified here.
//
// Per entry: `desc` prefers content over summary, `category` collects the
// `term` of each category, `link` is the href of the (last) rel="alternate"
// link, and `date`/`updated` are converted with Date.parse.
function formatATOM(json) {
  var output = {'type': 'atom', metadata: {}, items: []};
  var channel = json.feed || json;

  // Start with the metadata for the feed.
  var metadata = {};
  if (channel.title) {
    metadata.title = channel.title;
  }
  if (channel.subtitle) {
    metadata.desc = channel.subtitle;
  }
  if (channel.link) {
    metadata.url = channel.link;
  }
  if (channel.id) {
    metadata.id = channel.id;
  }
  // The Atom element is named `updated`; `update` is kept as a fallback for
  // documents that were already handled under the old name.
  var feedUpdated = channel.updated || channel.update;
  if (feedUpdated) {
    metadata.update = feedUpdated;
  }
  if (channel.author) {
    metadata.author = channel.author;
  }
  output.metadata = metadata;

  // Just double check that entries exist and that they form an array.
  var entries = channel.entry;
  if (entries) {
    if (!Array.isArray(entries)) {
      entries = [entries];
    }
    entries.forEach(function(val) {
      val = flattenComments(val);
      var obj = {};
      obj.id = val.id;
      obj.title = val.title;
      if (val.content) {
        obj.desc = val.content;
      } else if (val.summary) {
        obj.desc = val.summary;
      }

      // Just grab the category text.
      var categories = [];
      if (val.category) {
        if (Array.isArray(val.category)) {
          val.category.forEach(function(category) {
            categories.push(category['term']);
          });
        } else {
          categories.push(val.category);
        }
      }
      obj.category = categories;

      // Just get the alternate link.
      var link = '';
      if (val.link) {
        if (Array.isArray(val.link)) {
          val.link.forEach(function(candidate) {
            if (candidate.rel == 'alternate') {
              link = candidate.href;
            }
          });
        } else {
          link = val.link.href;
        }
      }
      obj.link = link;

      // Only format the dates that exist.
      if (val.published) {
        obj.date = Date.parse(val.published);
      }
      if (val.updated) {
        obj.updated = Date.parse(val.updated);
      }

      // Media: reuse the same container for thumbnail and content so that
      // neither assignment discards the other.
      if (val['media:thumbnail']) {
        obj.media = obj.media || val.media || {};
        obj.media.thumbnail = val['media:thumbnail'];
      }
      if (val['media:content']) {
        obj.media = obj.media || val.media || {};
        obj.media.content = val['media:content'];
      }

      if (val['content:encoded']) {
        obj.content = val['content:encoded'];
      }

      output.items.push(obj);
    });
  }
  return output;
}
294 | 
295 | 
296 | 


--------------------------------------------------------------------------------