├── LICENSE ├── README ├── example.js ├── node-rss.js ├── node-xml.js └── nodeblogs.com.feed.xml /LICENSE: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------------- 2 | node-rss is released under the MIT License 3 | 4 | Copyright (c) 2010 Rob Searles - http://www.robsearles.com 5 | 6 | Permission is hereby granted, free of charge, to any person 7 | obtaining a copy of this software and associated documentation 8 | files (the "Software"), to deal in the Software without 9 | restriction, including without limitation the rights to use, 10 | copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the 12 | Software is furnished to do so, subject to the following 13 | conditions: 14 | 15 | The above copyright notice and this permission notice shall be 16 | included in all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | OTHER DEALINGS IN THE SOFTWARE. 
26 | ---------------------------------------------------------------------- 27 | node-xml, of which node-rss makes heavy use, is also released under the 28 | MIT License - see http://github.com/robrighter/node-xml for more info 29 | ---------------------------------------------------------------------- 30 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------------- 2 | node-rss - an RSS parser for node. 3 | http://github.com/ibrow/node-rss 4 | Rob Searles - http://www.robsearles.com 5 | ---------------------------------------------------------------------- 6 | node-rss makes heavy use of the node-xml module written by 7 | Rob Righter - @robrighter 8 | http://github.com/robrighter/node-xml 9 | ---------------------------------------------------------------------- 10 | node-rss is released under the MIT licence 11 | ---------------------------------------------------------------------- 12 | See example.js for working examples of node-rss 13 | 14 | ---------------------------------------------------------------------- 15 | TODO 16 | ---------------------------------------------------------------------- 17 | Lots, mainly: 18 | - error checking 19 | - writing tests 20 | - make parsing more robust 21 | - conform to all specifications 22 | 23 | ---------------------------------------------------------------------- 24 | HISTORY 25 | ---------------------------------------------------------------------- 26 | 21 May 2010 27 | Initial release, working on v0.1.95-17-g1036aa9 28 | ---------------------------------------------------------------------- 29 | 30 | ---------------------------------------------------------------------- 31 | REFERENCE 32 | ---------------------------------------------------------------------- 33 | RSS 2.0 specification 34 | http://cyber.law.harvard.edu/rss/rss.html 35 
| 36 | RSS 1.0 specification 37 | http://web.resource.org/rss/1.0/spec 38 | 39 | Atom 1.0 specification 40 | http://atompub.org/2005/07/11/draft-ietf-atompub-format-10.html -------------------------------------------------------------------------------- /example.js: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | example.js 3 | Example of the node-rss feed parser 4 | 5 | **********************************************************************/ 6 | var sys = require('sys'); 7 | var rss = require('./node-rss'); 8 | 9 | 10 | /********************************************************************** 11 | Example One: 12 | Getting a remote RSS feed and parsing 13 | rss.parseURL(feed_url, use_excerpt, callback); 14 | **********************************************************************/ 15 | // URL of the feed you want to parse 16 | var feed_url = 'http://feeds.feedburner.com/github'; 17 | 18 | var response = rss.parseURL(feed_url, function(articles) { 19 | sys.puts(articles.length); 20 | for(i=0; i this is 43 | // an article, add container array to the list of articles 44 | cb.onStartElementNS(function(elem, attrs, prefix, uri, namespaces) { 45 | current_element = elem.toLowerCase(); 46 | if(current_element == 'item' || current_element == 'entry') { 47 | in_item = true; 48 | articles[article_count] = Array(); 49 | } 50 | }); 51 | // when we are at the end of an element, save its related content 52 | cb.onEndElementNS(function(elem, prefix, uri) { 53 | if(in_item) { 54 | switch(current_element) 55 | { 56 | case 'description': 57 | case 'summary': 58 | articles[article_count][current_element] = current_chars.replace(/^\s\s*/, '').replace(/\s\s*$/, ''); 59 | break; 60 | case 'content': 61 | case 'encoded': // feedburner is , node-xml reads as 62 | current_element = 'content'; 63 | articles[article_count][current_element] = current_chars.replace(/^\s\s*/, 
'').replace(/\s\s*$/, ''); 64 | break; 65 | case 'link': 66 | case 'title': 67 | articles[article_count][current_element] = current_chars.replace(/^\s\s*/, '').replace(/\s\s*$/, ''); 68 | break; 69 | } 70 | 71 | current_element = false; 72 | current_chars = ''; 73 | if(elem.toLowerCase() == 'item' || elem.toString() == 'entry') { 74 | in_item = false; 75 | article_count ++; 76 | } 77 | } 78 | }); 79 | 80 | cb.onCharacters(addContent); 81 | cb.onCdata(addContent); 82 | function addContent(chars) { 83 | if(in_item) { 84 | current_chars += chars; 85 | } 86 | }; 87 | 88 | // @TODO handle warnings and errors properly 89 | cb.onWarning(function(msg) { 90 | sys.puts(''+msg+""); 91 | }); 92 | cb.onError(function(msg) { 93 | sys.puts(''+JSON.stringify(msg)+""); 94 | }); 95 | }); 96 | 97 | 98 | /** 99 | * parseFile() 100 | * Parses an RSS feed from a file. 101 | * @param file - path to the RSS feed file 102 | * @param cb - callback function to be triggered at end of parsing 103 | */ 104 | exports.parseFile = function(file, cb) { 105 | callback = cb; 106 | parser.parseFile(file); 107 | } 108 | /** 109 | * parseURL() 110 | * Parses an RSS feed from a URL. 
111 | * @param url - URL of the RSS feed file 112 | * @param cb - callback function to be triggered at end of parsing 113 | * 114 | * @TODO - decent error checking 115 | */ 116 | exports.parseURL = function(url, cb) { 117 | callback = cb; 118 | 119 | get_rss(url); 120 | function get_rss(url) { 121 | var u = require('url'), http = require('http'); 122 | var parts = u.parse(url); 123 | //sys.puts(JSON.stringify(parts)); 124 | 125 | // set the default port to 80 126 | if(!parts.port) { parts.port = 80; } 127 | 128 | 129 | var redirection_level = 0; 130 | var client = http.createClient(parts.port, parts.hostname); 131 | var request = client.request('GET', parts.pathname, {'host': parts.hostname}); 132 | request.addListener('response', function (response) { 133 | //sys.puts('STATUS: ' + response.statusCode); 134 | //sys.puts('HEADERS: ' + JSON.stringify(response.headers)); 135 | 136 | // check to see the type of status 137 | switch(response.statusCode) { 138 | // check for ALL OK 139 | case 200: 140 | var body = ''; 141 | response.addListener('data', function (chunk) { 142 | body += chunk; 143 | }); 144 | response.addListener('end', function() { 145 | parser.parseString(body); 146 | }); 147 | break; 148 | // redirect status returned 149 | case 301: 150 | case 302: 151 | if(redirection_level > 10) { 152 | sys.puts("too many redirects"); 153 | } 154 | else { 155 | sys.puts("redirect to "+response.headers.location); 156 | get_rss(response.headers.location); 157 | } 158 | break; 159 | default: 160 | /* 161 | response.setEncoding('utf8'); 162 | response.addListener('data', function (chunk) { 163 | //sys.puts('BODY: ' + chunk); 164 | }); 165 | */ 166 | break; 167 | } 168 | }); 169 | request.end(); 170 | } 171 | }; -------------------------------------------------------------------------------- /node-xml.js: -------------------------------------------------------------------------------- 1 | // node-xml 2 | // An xml parser for node.js 3 | // (C) Rob Righter (@robrighter) 2009 
- 2010, Licensed under the MIT-LICENSE 4 | // Contributions from David Joham 5 | 6 | 7 | (function () { 8 | 9 | // CONSTANTS 10 | var whitespace = "\n\r\t "; 11 | 12 | 13 | //XMLP is a pull-based parser. The calling application passes in a XML string 14 | //to the constructor, then repeatedly calls .next() to parse the next segment. 15 | //.next() returns a flag indicating what type of segment was found, and stores 16 | //data temporarily in couple member variables (name, content, array of 17 | //attributes), which can be accessed by several .get____() methods. 18 | // 19 | //Basically, XMLP is the lowest common denominator parser - an very simple 20 | //API which other wrappers can be built against. 21 | 22 | 23 | var XMLP = function(strXML) { 24 | // Normalize line breaks 25 | strXML = SAXStrings.replace(strXML, null, null, "\r\n", "\n"); 26 | strXML = SAXStrings.replace(strXML, null, null, "\r", "\n"); 27 | 28 | this.m_xml = strXML; 29 | this.m_iP = 0; 30 | this.m_iState = XMLP._STATE_PROLOG; 31 | this.m_stack = new Stack(); 32 | this._clearAttributes(); 33 | this.m_pause = false; 34 | this.m_preInterruptIState = XMLP._STATE_PROLOG; 35 | this.m_namespaceList = new Array(); 36 | this.m_chunkTransitionContinuation = null; 37 | 38 | } 39 | 40 | 41 | // CONSTANTS (these must be below the constructor) 42 | XMLP._NONE = 0; 43 | XMLP._ELM_B = 1; 44 | XMLP._ELM_E = 2; 45 | XMLP._ELM_EMP = 3; 46 | XMLP._ATT = 4; 47 | XMLP._TEXT = 5; 48 | XMLP._ENTITY = 6; 49 | XMLP._PI = 7; 50 | XMLP._CDATA = 8; 51 | XMLP._COMMENT = 9; 52 | XMLP._DTD = 10; 53 | XMLP._ERROR = 11; 54 | XMLP._INTERRUPT = 12; 55 | 56 | XMLP._CONT_XML = 0; 57 | XMLP._CONT_ALT = 1; 58 | 59 | XMLP._ATT_NAME = 0; 60 | XMLP._ATT_VAL = 1; 61 | 62 | XMLP._STATE_PROLOG = 1; 63 | XMLP._STATE_DOCUMENT = 2; 64 | XMLP._STATE_MISC = 3; 65 | 66 | XMLP._errs = new Array(); 67 | XMLP._errs[XMLP.ERR_CLOSE_PI = 0 ] = "PI: missing closing sequence"; 68 | XMLP._errs[XMLP.ERR_CLOSE_DTD = 1 ] = "DTD: missing closing sequence"; 69 
| XMLP._errs[XMLP.ERR_CLOSE_COMMENT = 2 ] = "Comment: missing closing sequence"; 70 | XMLP._errs[XMLP.ERR_CLOSE_CDATA = 3 ] = "CDATA: missing closing sequence"; 71 | XMLP._errs[XMLP.ERR_CLOSE_ELM = 4 ] = "Element: missing closing sequence"; 72 | XMLP._errs[XMLP.ERR_CLOSE_ENTITY = 5 ] = "Entity: missing closing sequence"; 73 | XMLP._errs[XMLP.ERR_PI_TARGET = 6 ] = "PI: target is required"; 74 | XMLP._errs[XMLP.ERR_ELM_EMPTY = 7 ] = "Element: cannot be both empty and closing"; 75 | XMLP._errs[XMLP.ERR_ELM_NAME = 8 ] = "Element: name must immediatly follow \"<\""; 76 | XMLP._errs[XMLP.ERR_ELM_LT_NAME = 9 ] = "Element: \"<\" not allowed in element names"; 77 | XMLP._errs[XMLP.ERR_ATT_VALUES = 10] = "Attribute: values are required and must be in quotes"; 78 | XMLP._errs[XMLP.ERR_ATT_LT_NAME = 11] = "Element: \"<\" not allowed in attribute names"; 79 | XMLP._errs[XMLP.ERR_ATT_LT_VALUE = 12] = "Attribute: \"<\" not allowed in attribute values"; 80 | XMLP._errs[XMLP.ERR_ATT_DUP = 13] = "Attribute: duplicate attributes not allowed"; 81 | XMLP._errs[XMLP.ERR_ENTITY_UNKNOWN = 14] = "Entity: unknown entity"; 82 | XMLP._errs[XMLP.ERR_INFINITELOOP = 15] = "Infininte loop"; 83 | XMLP._errs[XMLP.ERR_DOC_STRUCTURE = 16] = "Document: only comments, processing instructions, or whitespace allowed outside of document element"; 84 | XMLP._errs[XMLP.ERR_ELM_NESTING = 17] = "Element: must be nested correctly"; 85 | 86 | 87 | 88 | XMLP.prototype.continueParsing = function(strXML) { 89 | 90 | if(this.m_chunkTransitionContinuation){ 91 | strXML = this.m_chunkTransitionContinuation + strXML; 92 | } 93 | // Normalize line breaks 94 | strXML = SAXStrings.replace(strXML, null, null, "\r\n", "\n"); 95 | strXML = SAXStrings.replace(strXML, null, null, "\r", "\n"); 96 | 97 | this.m_xml = strXML; 98 | this.m_iP = 0; 99 | this.m_iState = XMLP._STATE_DOCUMENT; 100 | //this.m_stack = new Stack(); 101 | //this._clearAttributes(); 102 | this.m_pause = false; 103 | this.m_preInterruptIState = 
XMLP._STATE_PROLOG; 104 | this.m_chunkTransitionContinuation = null; 105 | 106 | } 107 | 108 | XMLP.prototype._addAttribute = function(name, value) { 109 | this.m_atts[this.m_atts.length] = new Array(name, value); 110 | } 111 | 112 | XMLP.prototype._checkStructure = function(iEvent) { 113 | if(XMLP._STATE_PROLOG == this.m_iState) { 114 | if((XMLP._TEXT == iEvent) || (XMLP._ENTITY == iEvent)) { 115 | if(SAXStrings.indexOfNonWhitespace(this.getContent(), this.getContentBegin(), this.getContentEnd()) != -1) { 116 | return this._setErr(XMLP.ERR_DOC_STRUCTURE); 117 | } 118 | } 119 | 120 | if((XMLP._ELM_B == iEvent) || (XMLP._ELM_EMP == iEvent)) { 121 | this.m_iState = XMLP._STATE_DOCUMENT; 122 | // Don't return - fall through to next state 123 | } 124 | } 125 | if(XMLP._STATE_DOCUMENT == this.m_iState) { 126 | if((XMLP._ELM_B == iEvent) || (XMLP._ELM_EMP == iEvent)) { 127 | this.m_stack.push(this.getName()); 128 | } 129 | 130 | if((XMLP._ELM_E == iEvent) || (XMLP._ELM_EMP == iEvent)) { 131 | var strTop = this.m_stack.pop(); 132 | if((strTop == null) || (strTop != this.getName())) { 133 | return this._setErr(XMLP.ERR_ELM_NESTING); 134 | } 135 | } 136 | 137 | if(this.m_stack.count() == 0) { 138 | this.m_iState = XMLP._STATE_MISC; 139 | return iEvent; 140 | } 141 | } 142 | if(XMLP._STATE_MISC == this.m_iState) { 143 | if((XMLP._ELM_B == iEvent) || (XMLP._ELM_E == iEvent) || (XMLP._ELM_EMP == iEvent) || (XMLP.EVT_DTD == iEvent)) { 144 | return this._setErr(XMLP.ERR_DOC_STRUCTURE); 145 | } 146 | 147 | if((XMLP._TEXT == iEvent) || (XMLP._ENTITY == iEvent)) { 148 | if(SAXStrings.indexOfNonWhitespace(this.getContent(), this.getContentBegin(), this.getContentEnd()) != -1) { 149 | return this._setErr(XMLP.ERR_DOC_STRUCTURE); 150 | } 151 | } 152 | } 153 | 154 | return iEvent; 155 | 156 | } 157 | 158 | XMLP.prototype._clearAttributes = function() { 159 | this.m_atts = new Array(); 160 | } 161 | 162 | XMLP.prototype._findAttributeIndex = function(name) { 163 | for(var i = 0; i < 
this.m_atts.length; i++) { 164 | if(this.m_atts[i][XMLP._ATT_NAME] == name) { 165 | return i; 166 | } 167 | } 168 | return -1; 169 | 170 | } 171 | 172 | XMLP.prototype.getAttributeCount = function() { 173 | return this.m_atts ? this.m_atts.length : 0; 174 | } 175 | 176 | XMLP.prototype.getAttributeName = function(index) { 177 | return ((index < 0) || (index >= this.m_atts.length)) ? null : this.m_atts[index][XMLP._ATT_NAME]; 178 | } 179 | 180 | XMLP.prototype.getAttributeValue = function(index) { 181 | return ((index < 0) || (index >= this.m_atts.length)) ? null : __unescapeString(this.m_atts[index][XMLP._ATT_VAL]); 182 | } 183 | 184 | XMLP.prototype.getAttributeValueByName = function(name) { 185 | return this.getAttributeValue(this._findAttributeIndex(name)); 186 | } 187 | 188 | XMLP.prototype.getColumnNumber = function() { 189 | return SAXStrings.getColumnNumber(this.m_xml, this.m_iP); 190 | } 191 | 192 | XMLP.prototype.getContent = function() { 193 | return (this.m_cSrc == XMLP._CONT_XML) ? 
this.m_xml : this.m_cAlt; 194 | } 195 | 196 | XMLP.prototype.getContentBegin = function() { 197 | return this.m_cB; 198 | } 199 | 200 | XMLP.prototype.getContentEnd = function() { 201 | return this.m_cE; 202 | } 203 | 204 | XMLP.prototype.getLineNumber = function() { 205 | return SAXStrings.getLineNumber(this.m_xml, this.m_iP); 206 | } 207 | 208 | XMLP.prototype.getName = function() { 209 | return this.m_name; 210 | } 211 | 212 | XMLP.prototype.pause = function(){ 213 | this.m_pause = true; 214 | } 215 | 216 | XMLP.prototype.resume = function(){ 217 | this.m_pause = false; 218 | this.m_iState = this.m_preInterruptIState; 219 | } 220 | 221 | XMLP.prototype.next = function() { 222 | if(!this.m_pause){ 223 | return this._checkStructure(this._parse()); 224 | } 225 | else{ 226 | //save off the current event loop state and set the state to interrupt 227 | this.m_preInterruptIState = this.m_iState; 228 | return XMLP._INTERRUPT; 229 | } 230 | } 231 | 232 | XMLP.prototype._parse = function() { 233 | if(this.m_iP == this.m_xml.length) { 234 | return XMLP._NONE; 235 | } 236 | 237 | if(this.m_iP == this.m_xml.indexOf("= 0; i--){ 303 | var item = this.m_namespaceList[i]; 304 | if(item.prefix === ''){ 305 | return item.uri; 306 | } 307 | } 308 | 309 | //still nothing, lets just return an empty string 310 | return ''; 311 | } 312 | 313 | XMLP.prototype._removeExpiredNamesapces = function (closingtagname) { 314 | //remove the expiring namespaces from the list (you can id them by scopetag) 315 | var keeps = []; 316 | this.m_namespaceList.map(function (item){ 317 | if(item.scopetag !== closingtagname){ 318 | keeps.push(item); 319 | } 320 | }); 321 | 322 | this.m_namespaceList = keeps; 323 | 324 | } 325 | 326 | //////////////////////////////////////////////////////////////////////// 327 | 328 | 329 | XMLP.prototype._parseAttribute = function(iB, iE) { 330 | var iNB, iNE, iEq, iVB, iVE; 331 | var cQuote, strN, strV; 332 | 333 | this.m_cAlt = ""; //resets the value so we don't use an 
old one by accident (see testAttribute7 in the test suite) 334 | 335 | iNB = SAXStrings.indexOfNonWhitespace(this.m_xml, iB, iE); 336 | if((iNB == -1) ||(iNB >= iE)) { 337 | return iNB; 338 | } 339 | 340 | iEq = this.m_xml.indexOf("=", iNB); 341 | if((iEq == -1) || (iEq > iE)) { 342 | return this._setErr(XMLP.ERR_ATT_VALUES); 343 | } 344 | 345 | iNE = SAXStrings.lastIndexOfNonWhitespace(this.m_xml, iNB, iEq); 346 | 347 | iVB = SAXStrings.indexOfNonWhitespace(this.m_xml, iEq + 1, iE); 348 | if((iVB == -1) ||(iVB > iE)) { 349 | return this._setErr(XMLP.ERR_ATT_VALUES); 350 | } 351 | 352 | cQuote = this.m_xml.charAt(iVB); 353 | if(SAXStrings.QUOTES.indexOf(cQuote) == -1) { 354 | return this._setErr(XMLP.ERR_ATT_VALUES); 355 | } 356 | 357 | iVE = this.m_xml.indexOf(cQuote, iVB + 1); 358 | if((iVE == -1) ||(iVE > iE)) { 359 | return this._setErr(XMLP.ERR_ATT_VALUES); 360 | } 361 | 362 | strN = this.m_xml.substring(iNB, iNE + 1); 363 | strV = this.m_xml.substring(iVB + 1, iVE); 364 | 365 | if(strN.indexOf("<") != -1) { 366 | return this._setErr(XMLP.ERR_ATT_LT_NAME); 367 | } 368 | 369 | if(strV.indexOf("<") != -1) { 370 | return this._setErr(XMLP.ERR_ATT_LT_VALUE); 371 | } 372 | 373 | strV = SAXStrings.replace(strV, null, null, "\n", " "); 374 | strV = SAXStrings.replace(strV, null, null, "\t", " "); 375 | iRet = this._replaceEntities(strV); 376 | if(iRet == XMLP._ERROR) { 377 | return iRet; 378 | } 379 | 380 | strV = this.m_cAlt; 381 | 382 | if(this._findAttributeIndex(strN) == -1) { 383 | this._addAttribute(strN, strV); 384 | } 385 | else { 386 | return this._setErr(XMLP.ERR_ATT_DUP); 387 | } 388 | 389 | this.m_iP = iVE + 2; 390 | 391 | return XMLP._ATT; 392 | 393 | } 394 | 395 | XMLP.prototype._parseCDATA = function(iB) { 396 | var iE = this.m_xml.indexOf("]]>", iB); 397 | if (iE == -1) { 398 | //This item never closes, although it could be a malformed document, we will assume that we are mid-chunck, save the string and reurn as interrupted 399 | 
this.m_chunkTransitionContinuation = this.m_xml.slice(iB-9);//the '-", iB); 414 | if (iE == -1) { 415 | //This item never closes, although it could be a malformed document, we will assume that we are mid-chunck, save the string and reurn as interrupted 416 | this.m_chunkTransitionContinuation = this.m_xml.slice(iB-4);//the '-4' adds the '