├── README.md ├── example.js ├── lib └── node-xml.js ├── package.json └── sample.xml /README.md: -------------------------------------------------------------------------------- 1 | node-xml 2 | =================== 3 | 4 | (C) Rob Righter (@robrighter) 2009 - 2010, Licensed under the MIT-LICENSE 5 | Contributions from David Joham 6 | 7 | node-xml is an xml parser for node.js written in javascript. 8 | 9 | # Install 10 | 11 | npm install node-xml 12 | 13 | API 14 | --- 15 | 16 | 17 | SaxParser 18 | --------- 19 | 20 | Node-xml provides a SAX2 parser interface that can take a string or a file. The parser can take characters from the document in chunks. To send chunks of the document to the parser use 'parseString(xml)'. 21 | 22 | #SAX Parser# 23 | 24 | ##new xml.SaxParser()## 25 | * Instantiate a new SaxParser 26 | * returns: a SaxParser object 27 | 28 | ##new xml.SaxParser(callback)## 29 | * Instantiate a new SaxParser 30 | * returns: a SaxParser object 31 | * Arguments 32 | * callback - a function that accepts the new sax parser as an argument 33 | 34 | #Parse# 35 | 36 | ##parser.parseString(string)## 37 | 38 | Parse an in-memory string 39 | * return: boolean. true if no errors, false otherwise 40 | * Arguments 41 | * string - a string representing the document to parse 42 | 43 | ##parser.parseFile(filename)## 44 | 45 | Parse a file 46 | * return: boolean. 
true if no errors, false otherwise 47 | * Arguments 48 | * filename - a string representing the file to be parsed 49 | 50 | ##parser.pause()## 51 | Pauses parsing of the document 52 | 53 | ##parser.resume()## 54 | Resumes parsing of the document 55 | 56 | #Callbacks# 57 | 58 | ##parser.onStartDocument(function() {})## 59 | 60 | Called at the start of a document 61 | 62 | ##parser.onEndDocument(function() {})## 63 | 64 | Called at the end of the document parse 65 | 66 | ##parser.onStartElementNS(function(elem, attrs, prefix, uri, namespaces) {})## 67 | 68 | Called on an open element tag 69 | * Arguments 70 | * elem - a string representing the element name 71 | * attrs - an array of arrays: [[key, value], [key, value]] 72 | * prefix - a string representing the namespace prefix of the element 73 | * uri - the namespace URI of the element 74 | * namespaces - an array of arrays: [[prefix, uri], [prefix, uri]] 75 | 76 | ##parser.onEndElementNS(function(elem, prefix, uri) {})## 77 | 78 | Called at the close of an element 79 | * Arguments 80 | * elem - a string representing the element name 81 | * prefix - a string representing the namespace prefix of the element 82 | * uri - the namespace URI of the element 83 | 84 | ##parser.onCharacters(function(chars) {})## 85 | 86 | Called when a set of content characters is encountered 87 | * Arguments 88 | * chars - a string of characters 89 | 90 | ##parser.onCdata(function(cdata) {})## 91 | 92 | Called when a CDATA is encountered 93 | * Arguments 94 | * cdata - a string representing the CDATA 95 | 96 | ##parser.onComment(function(msg) {})## 97 | 98 | Called when a comment is encountered 99 | * Arguments 100 | * msg - a string representing the comment 101 | 102 | ##parser.onWarning(function(msg) {})## 103 | 104 | Called when a warning is encountered 105 | * Arguments 106 | * msg - a string representing the warning message 107 | 108 | ##parser.onError(function(msg) {})## 109 | 110 | Called when an error is encountered 111 | * 
Arguments 112 | * msg - a string representing the error message 113 | 114 | 115 | EXAMPLE USAGE 116 | ------------- 117 | 118 | var util = require('util'); 119 | var xml = require("./lib/node-xml"); 120 | 121 | var parser = new xml.SaxParser(function(cb) { 122 | cb.onStartDocument(function() { 123 | 124 | }); 125 | cb.onEndDocument(function() { 126 | 127 | }); 128 | cb.onStartElementNS(function(elem, attrs, prefix, uri, namespaces) { 129 | util.log("=> Started: " + elem + " uri="+uri +" (Attributes: " + JSON.stringify(attrs) + " )"); 130 | }); 131 | cb.onEndElementNS(function(elem, prefix, uri) { 132 | util.log("<= End: " + elem + " uri="+uri + "\n"); 133 | parser.pause();// pause the parser 134 | setTimeout(function (){parser.resume();}, 200); //resume the parser 135 | }); 136 | cb.onCharacters(function(chars) { 137 | //util.log(''+chars+""); 138 | }); 139 | cb.onCdata(function(cdata) { 140 | util.log(''+cdata+""); 141 | }); 142 | cb.onComment(function(msg) { 143 | util.log(''+msg+""); 144 | }); 145 | cb.onWarning(function(msg) { 146 | util.log(''+msg+""); 147 | }); 148 | cb.onError(function(msg) { 149 | util.log(''+JSON.stringify(msg)+""); 150 | }); 151 | }); 152 | 153 | 154 | //example read from chunks 155 | parser.parseString(""); 156 | parser.parseString(""); 158 | parser.parseString("and lots"); 159 | parser.parseString("and lots of text&am"); 160 | parser.parseString("p;some more."); 161 | parser.parseString(""); 164 | parser.parseString(""); 166 | 167 | //example read from file 168 | parser.parseFile("sample.xml"); 169 | -------------------------------------------------------------------------------- /example.js: -------------------------------------------------------------------------------- 1 | var util = require('util'); 2 | var xml = require("./lib/node-xml"); 3 | 4 | var parser = new xml.SaxParser(function(cb) { 5 | cb.onStartDocument(function() { 6 | 7 | }); 8 | cb.onEndDocument(function() { 9 | 10 | }); 11 | cb.onStartElementNS(function(elem, attrs, 
prefix, uri, namespaces) { 12 | util.log("=> Started: " + elem + " uri="+uri +" (Attributes: " + JSON.stringify(attrs) + " )"); 13 | }); 14 | cb.onEndElementNS(function(elem, prefix, uri) { 15 | util.log("<= End: " + elem + " uri="+uri + "\n"); 16 | parser.pause();// pause the parser 17 | setTimeout(function (){parser.resume();}, 100); //resume the parser 18 | }); 19 | cb.onCharacters(function(chars) { 20 | util.log(''+chars+""); 21 | }); 22 | cb.onCdata(function(cdata) { 23 | util.log(''+cdata+""); 24 | }); 25 | cb.onComment(function(msg) { 26 | util.log(''+msg+""); 27 | }); 28 | cb.onWarning(function(msg) { 29 | util.log(''+msg+""); 30 | }); 31 | cb.onError(function(msg) { 32 | util.log(''+JSON.stringify(msg)+""); 33 | }); 34 | }); 35 | 36 | 37 | //example read from file 38 | parser.parseFile("sample.xml"); 39 | 40 | //example read from chunks 41 | parser.parseString(""); 42 | parser.parseString(""); 44 | parser.parseString("and lots"); 45 | parser.parseString("and lots of text&am"); 46 | parser.parseString("p;some more."); 47 | parser.parseString(""); 50 | parser.parseString(""); 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /lib/node-xml.js: -------------------------------------------------------------------------------- 1 | // node-xml 2 | // An xml parser for node.js 3 | // (C) Rob Righter (@robrighter) 2009 - 2010, Licensed under the MIT-LICENSE 4 | // Contributions from David Joham 5 | 6 | 7 | (function () { 8 | 9 | // CONSTANTS 10 | var whitespace = "\n\r\t "; 11 | 12 | 13 | //XMLP is a pull-based parser. The calling application passes in a XML string 14 | //to the constructor, then repeatedly calls .next() to parse the next segment. 15 | //.next() returns a flag indicating what type of segment was found, and stores 16 | //data temporarily in couple member variables (name, content, array of 17 | //attributes), which can be accessed by several .get____() methods. 
18 | // 19 | //Basically, XMLP is the lowest common denominator parser - an very simple 20 | //API which other wrappers can be built against. 21 | 22 | 23 | var XMLP = function(strXML) { 24 | // Normalize line breaks 25 | strXML = SAXStrings.replace(strXML, null, null, "\r\n", "\n"); 26 | strXML = SAXStrings.replace(strXML, null, null, "\r", "\n"); 27 | 28 | this.m_xml = strXML; 29 | this.m_iP = 0; 30 | this.m_iState = XMLP._STATE_PROLOG; 31 | this.m_stack = new Stack(); 32 | this._clearAttributes(); 33 | this.m_pause = false; 34 | this.m_preInterruptIState = XMLP._STATE_PROLOG; 35 | this.m_namespaceList = new Array(); 36 | this.m_chunkTransitionContinuation = null; 37 | 38 | } 39 | 40 | 41 | // CONSTANTS (these must be below the constructor) 42 | XMLP._NONE = 0; 43 | XMLP._ELM_B = 1; 44 | XMLP._ELM_E = 2; 45 | XMLP._ELM_EMP = 3; 46 | XMLP._ATT = 4; 47 | XMLP._TEXT = 5; 48 | XMLP._ENTITY = 6; 49 | XMLP._PI = 7; 50 | XMLP._CDATA = 8; 51 | XMLP._COMMENT = 9; 52 | XMLP._DTD = 10; 53 | XMLP._ERROR = 11; 54 | XMLP._INTERRUPT = 12; 55 | 56 | XMLP._CONT_XML = 0; 57 | XMLP._CONT_ALT = 1; 58 | 59 | XMLP._ATT_NAME = 0; 60 | XMLP._ATT_VAL = 1; 61 | 62 | XMLP._STATE_PROLOG = 1; 63 | XMLP._STATE_DOCUMENT = 2; 64 | XMLP._STATE_MISC = 3; 65 | 66 | XMLP._errs = new Array(); 67 | XMLP._errs[XMLP.ERR_CLOSE_PI = 0 ] = "PI: missing closing sequence"; 68 | XMLP._errs[XMLP.ERR_CLOSE_DTD = 1 ] = "DTD: missing closing sequence"; 69 | XMLP._errs[XMLP.ERR_CLOSE_COMMENT = 2 ] = "Comment: missing closing sequence"; 70 | XMLP._errs[XMLP.ERR_CLOSE_CDATA = 3 ] = "CDATA: missing closing sequence"; 71 | XMLP._errs[XMLP.ERR_CLOSE_ELM = 4 ] = "Element: missing closing sequence"; 72 | XMLP._errs[XMLP.ERR_CLOSE_ENTITY = 5 ] = "Entity: missing closing sequence"; 73 | XMLP._errs[XMLP.ERR_PI_TARGET = 6 ] = "PI: target is required"; 74 | XMLP._errs[XMLP.ERR_ELM_EMPTY = 7 ] = "Element: cannot be both empty and closing"; 75 | XMLP._errs[XMLP.ERR_ELM_NAME = 8 ] = "Element: name must immediatly follow \"<\""; 
76 | XMLP._errs[XMLP.ERR_ELM_LT_NAME = 9 ] = "Element: \"<\" not allowed in element names"; 77 | XMLP._errs[XMLP.ERR_ATT_VALUES = 10] = "Attribute: values are required and must be in quotes"; 78 | XMLP._errs[XMLP.ERR_ATT_LT_NAME = 11] = "Element: \"<\" not allowed in attribute names"; 79 | XMLP._errs[XMLP.ERR_ATT_LT_VALUE = 12] = "Attribute: \"<\" not allowed in attribute values"; 80 | XMLP._errs[XMLP.ERR_ATT_DUP = 13] = "Attribute: duplicate attributes not allowed"; 81 | XMLP._errs[XMLP.ERR_ENTITY_UNKNOWN = 14] = "Entity: unknown entity"; 82 | XMLP._errs[XMLP.ERR_INFINITELOOP = 15] = "Infininte loop"; 83 | XMLP._errs[XMLP.ERR_DOC_STRUCTURE = 16] = "Document: only comments, processing instructions, or whitespace allowed outside of document element"; 84 | XMLP._errs[XMLP.ERR_ELM_NESTING = 17] = "Element: must be nested correctly"; 85 | 86 | 87 | 88 | XMLP.prototype.continueParsing = function(strXML) { 89 | 90 | if(this.m_chunkTransitionContinuation){ 91 | strXML = this.m_chunkTransitionContinuation + strXML; 92 | } 93 | // Normalize line breaks 94 | strXML = SAXStrings.replace(strXML, null, null, "\r\n", "\n"); 95 | strXML = SAXStrings.replace(strXML, null, null, "\r", "\n"); 96 | 97 | this.m_xml = strXML; 98 | this.m_iP = 0; 99 | this.m_iState = XMLP._STATE_DOCUMENT; 100 | //this.m_stack = new Stack(); 101 | //this._clearAttributes(); 102 | this.m_pause = false; 103 | this.m_preInterruptIState = XMLP._STATE_PROLOG; 104 | this.m_chunkTransitionContinuation = null; 105 | 106 | } 107 | 108 | XMLP.prototype._addAttribute = function(name, value) { 109 | this.m_atts[this.m_atts.length] = new Array(name, value); 110 | } 111 | 112 | XMLP.prototype._checkStructure = function(iEvent) { 113 | if(XMLP._STATE_PROLOG == this.m_iState) { 114 | if((XMLP._TEXT == iEvent) || (XMLP._ENTITY == iEvent)) { 115 | if(SAXStrings.indexOfNonWhitespace(this.getContent(), this.getContentBegin(), this.getContentEnd()) != -1) { 116 | return this._setErr(XMLP.ERR_DOC_STRUCTURE); 117 | } 118 | } 
119 | 120 | if((XMLP._ELM_B == iEvent) || (XMLP._ELM_EMP == iEvent)) { 121 | this.m_iState = XMLP._STATE_DOCUMENT; 122 | // Don't return - fall through to next state 123 | } 124 | } 125 | if(XMLP._STATE_DOCUMENT == this.m_iState) { 126 | if((XMLP._ELM_B == iEvent) || (XMLP._ELM_EMP == iEvent)) { 127 | this.m_stack.push(this.getName()); 128 | } 129 | 130 | if((XMLP._ELM_E == iEvent) || (XMLP._ELM_EMP == iEvent)) { 131 | var strTop = this.m_stack.pop(); 132 | if((strTop == null) || (strTop != this.getName())) { 133 | return this._setErr(XMLP.ERR_ELM_NESTING); 134 | } 135 | } 136 | 137 | if(this.m_stack.count() == 0) { 138 | this.m_iState = XMLP._STATE_MISC; 139 | return iEvent; 140 | } 141 | } 142 | if(XMLP._STATE_MISC == this.m_iState) { 143 | if((XMLP._ELM_B == iEvent) || (XMLP._ELM_E == iEvent) || (XMLP._ELM_EMP == iEvent) || (XMLP.EVT_DTD == iEvent)) { 144 | return this._setErr(XMLP.ERR_DOC_STRUCTURE); 145 | } 146 | 147 | if((XMLP._TEXT == iEvent) || (XMLP._ENTITY == iEvent)) { 148 | if(SAXStrings.indexOfNonWhitespace(this.getContent(), this.getContentBegin(), this.getContentEnd()) != -1) { 149 | return this._setErr(XMLP.ERR_DOC_STRUCTURE); 150 | } 151 | } 152 | } 153 | 154 | return iEvent; 155 | 156 | } 157 | 158 | XMLP.prototype._clearAttributes = function() { 159 | this.m_atts = new Array(); 160 | } 161 | 162 | XMLP.prototype._findAttributeIndex = function(name) { 163 | for(var i = 0; i < this.m_atts.length; i++) { 164 | if(this.m_atts[i][XMLP._ATT_NAME] == name) { 165 | return i; 166 | } 167 | } 168 | return -1; 169 | 170 | } 171 | 172 | XMLP.prototype.getAttributeCount = function() { 173 | return this.m_atts ? this.m_atts.length : 0; 174 | } 175 | 176 | XMLP.prototype.getAttributeName = function(index) { 177 | return ((index < 0) || (index >= this.m_atts.length)) ? null : this.m_atts[index][XMLP._ATT_NAME]; 178 | } 179 | 180 | XMLP.prototype.getAttributeValue = function(index) { 181 | return ((index < 0) || (index >= this.m_atts.length)) ? 
null : __unescapeString(this.m_atts[index][XMLP._ATT_VAL]); 182 | } 183 | 184 | XMLP.prototype.getAttributeValueByName = function(name) { 185 | return this.getAttributeValue(this._findAttributeIndex(name)); 186 | } 187 | 188 | XMLP.prototype.getColumnNumber = function() { 189 | return SAXStrings.getColumnNumber(this.m_xml, this.m_iP); 190 | } 191 | 192 | XMLP.prototype.getContent = function() { 193 | return (this.m_cSrc == XMLP._CONT_XML) ? this.m_xml : this.m_cAlt; 194 | } 195 | 196 | XMLP.prototype.getContentBegin = function() { 197 | return this.m_cB; 198 | } 199 | 200 | XMLP.prototype.getContentEnd = function() { 201 | return this.m_cE; 202 | } 203 | 204 | XMLP.prototype.getLineNumber = function() { 205 | return SAXStrings.getLineNumber(this.m_xml, this.m_iP); 206 | } 207 | 208 | XMLP.prototype.getName = function() { 209 | return this.m_name; 210 | } 211 | 212 | XMLP.prototype.pause = function(){ 213 | this.m_pause = true; 214 | } 215 | 216 | XMLP.prototype.resume = function(){ 217 | this.m_pause = false; 218 | this.m_iState = this.m_preInterruptIState; 219 | } 220 | 221 | XMLP.prototype.next = function() { 222 | if(!this.m_pause){ 223 | return this._checkStructure(this._parse()); 224 | } 225 | else{ 226 | //save off the current event loop state and set the state to interrupt 227 | this.m_preInterruptIState = this.m_iState; 228 | return XMLP._INTERRUPT; 229 | } 230 | } 231 | 232 | XMLP.prototype._parse = function() { 233 | if(this.m_iP == this.m_xml.length) { 234 | return XMLP._NONE; 235 | } 236 | 237 | function _indexOf(needle, haystack, start) { 238 | // This is an improvement over the native indexOf because it stops at the 239 | // end of the needle and doesn't continue to the end of the haystack looking. 
240 | for(var i = 0; i < needle.length; i++) { 241 | if(needle.charAt(i) != haystack.charAt(start + i)) 242 | return -1; 243 | } 244 | return start; 245 | } 246 | 247 | var fc = this.m_xml.charAt(this.m_iP); 248 | if (fc !== '<' && fc !== '&') { 249 | return this._parseText (this.m_iP); 250 | } 251 | else if(this.m_iP == _indexOf("= 0; i--){ 315 | var item = this.m_namespaceList[i]; 316 | if(item.prefix === ''){ 317 | return item.uri; 318 | } 319 | } 320 | 321 | //still nothing, lets just return an empty string 322 | return ''; 323 | } 324 | 325 | XMLP.prototype._removeExpiredNamesapces = function (closingtagname) { 326 | //remove the expiring namespaces from the list (you can id them by scopetag) 327 | var keeps = []; 328 | this.m_namespaceList.map(function (item){ 329 | if(item.scopetag !== closingtagname){ 330 | keeps.push(item); 331 | } 332 | }); 333 | 334 | this.m_namespaceList = keeps; 335 | 336 | } 337 | 338 | //////////////////////////////////////////////////////////////////////// 339 | 340 | 341 | XMLP.prototype._parseAttribute = function(iB, iE) { 342 | var iNB, iNE, iEq, iVB, iVE; 343 | var cQuote, strN, strV; 344 | 345 | this.m_cAlt = ""; //resets the value so we don't use an old one by accident (see testAttribute7 in the test suite) 346 | 347 | iNB = SAXStrings.indexOfNonWhitespace(this.m_xml, iB, iE); 348 | if((iNB == -1) ||(iNB >= iE)) { 349 | return iNB; 350 | } 351 | 352 | iEq = this.m_xml.indexOf("=", iNB); 353 | if((iEq == -1) || (iEq > iE)) { 354 | return this._setErr(XMLP.ERR_ATT_VALUES); 355 | } 356 | 357 | iNE = SAXStrings.lastIndexOfNonWhitespace(this.m_xml, iNB, iEq); 358 | 359 | iVB = SAXStrings.indexOfNonWhitespace(this.m_xml, iEq + 1, iE); 360 | if((iVB == -1) ||(iVB > iE)) { 361 | return this._setErr(XMLP.ERR_ATT_VALUES); 362 | } 363 | 364 | cQuote = this.m_xml.charAt(iVB); 365 | if(SAXStrings.QUOTES.indexOf(cQuote) == -1) { 366 | return this._setErr(XMLP.ERR_ATT_VALUES); 367 | } 368 | 369 | iVE = this.m_xml.indexOf(cQuote, iVB + 1); 
370 | if((iVE == -1) ||(iVE > iE)) { 371 | return this._setErr(XMLP.ERR_ATT_VALUES); 372 | } 373 | 374 | strN = this.m_xml.substring(iNB, iNE + 1); 375 | strV = this.m_xml.substring(iVB + 1, iVE); 376 | 377 | if(strN.indexOf("<") != -1) { 378 | return this._setErr(XMLP.ERR_ATT_LT_NAME); 379 | } 380 | 381 | if(strV.indexOf("<") != -1) { 382 | return this._setErr(XMLP.ERR_ATT_LT_VALUE); 383 | } 384 | 385 | strV = SAXStrings.replace(strV, null, null, "\n", " "); 386 | strV = SAXStrings.replace(strV, null, null, "\t", " "); 387 | iRet = this._replaceEntities(strV); 388 | if(iRet == XMLP._ERROR) { 389 | return iRet; 390 | } 391 | 392 | strV = this.m_cAlt; 393 | 394 | if(this._findAttributeIndex(strN) == -1) { 395 | this._addAttribute(strN, strV); 396 | } 397 | else { 398 | return this._setErr(XMLP.ERR_ATT_DUP); 399 | } 400 | 401 | this.m_iP = iVE + 2; 402 | 403 | return XMLP._ATT; 404 | 405 | } 406 | 407 | XMLP.prototype._parseCDATA = function(iB) { 408 | var iE = this.m_xml.indexOf("]]>", iB); 409 | if (iE == -1) { 410 | //This item never closes, although it could be a malformed document, we will assume that we are mid-chunck, save the string and reurn as interrupted 411 | this.m_chunkTransitionContinuation = this.m_xml.slice(iB-9);//the '-", iB); 426 | if (iE == -1) { 427 | //This item never closes, although it could be a malformed document, we will assume that we are mid-chunck, save the string and reurn as interrupted 428 | this.m_chunkTransitionContinuation = this.m_xml.slice(iB-4);//the '-4' adds the ' --------------------------------------------------------------------------------