├── README.md
├── example.js
├── lib
│   └── node-xml.js
├── package.json
└── sample.xml
/README.md:
--------------------------------------------------------------------------------
1 | node-xml
2 | ===================
3 |
4 | (C) Rob Righter (@robrighter) 2009 - 2010, Licensed under the MIT-LICENSE
5 | Contributions from David Joham
6 |
7 | node-xml is an XML parser for Node.js written in JavaScript.
8 |
9 | # Install
10 |
11 | npm install node-xml
12 |
13 | API
14 | ---
15 |
16 |
17 | SaxParser
18 | ---------
19 |
20 | node-xml provides a SAX2 parser interface that accepts either a string or a file. The parser can also consume a document in chunks: call 'parseString(xml)' once for each chunk.
21 |
22 | #SAX Parser#
23 |
24 | ##new xml.SaxParser()##
25 | * Instantiate a new SaxParser
26 | * returns: a SaxParser object
27 |
28 | ##new xml.SaxParser(callback)##
29 | * Instantiate a new SaxParser
30 | * returns: a SaxParser object
31 | * Arguments
32 | * callback - a function that accepts the new sax parser as an argument
33 |
34 | #Parse#
35 |
36 | ##parser.parseString(string)##
37 |
38 | Parse an in memory string
39 | * return: boolean. true if no errors, false otherwise
40 | * Arguments
41 | * string - a string representing the document to parse
42 |
43 | ##parser.parseFile(filename)##
44 |
45 | Parse a file
46 | * return: boolean. true if no errors, false otherwise
47 | * Arguments
48 | * filename - a string representing the file to be parsed
49 |
50 | ##parser.pause()##
51 | pauses parsing of the document
52 |
53 | ##parser.resume()##
54 | resumes parsing of the document
55 |
56 | #Callbacks#
57 |
58 | ##parser.onStartDocument(function() {})##
59 |
60 | Called at the start of a document
61 |
62 | ##parser.onEndDocument(function() {})##
63 |
64 | Called at the end of the document parse
65 |
66 | ##parser.onStartElementNS(function(elem, attrs, prefix, uri, namespaces) {})##
67 |
68 | Called on an open element tag
69 | * Arguments
70 | * elem - a string representing the element name
71 | * attrs - an array of arrays: [[key, value], [key, value]]
72 | * prefix - a string representing the namespace prefix of the element
73 | * uri - the namespace URI of the element
74 | * namespaces - an array of arrays: [[prefix, uri], [prefix, uri]]
75 |
76 | ##parser.onEndElementNS(function(elem, prefix, uri) {})##
77 |
78 | Called at the close of an element
79 | * Arguments
80 | * elem - a string representing the element name
81 | * prefix - a string representing the namespace prefix of the element
82 | * uri - the namespace URI of the element
83 |
84 | ##parser.onCharacters(function(chars) {})##
85 |
86 | Called when a set of content characters is encountered
87 | * Arguments
88 | * chars - a string of characters
89 |
90 | ##parser.onCdata(function(cdata) {})##
91 |
92 | Called when a CDATA is encountered
93 | * Arguments
94 | * cdata - a string representing the CDATA
95 |
96 | ##parser.onComment(function(msg) {})##
97 |
98 | Called when a comment is encountered
99 | * Arguments
100 | * msg - a string representing the comment
101 |
102 | ##parser.onWarning(function(msg) {})##
103 |
104 | Called when a warning is encountered
105 | * Arguments
106 | * msg - a string representing the warning message
107 |
108 | ##parser.onError(function(msg) {})##
109 |
110 | Called when an error is encountered
111 | * Arguments
112 | * msg - a string representing the error message
113 |
114 |
115 | EXAMPLE USAGE
116 | -------------
117 |
118 | var util = require('util');
119 | var xml = require("./lib/node-xml");
120 |
121 | var parser = new xml.SaxParser(function(cb) {
122 | cb.onStartDocument(function() {
123 |
124 | });
125 | cb.onEndDocument(function() {
126 |
127 | });
128 | cb.onStartElementNS(function(elem, attrs, prefix, uri, namespaces) {
129 | util.log("=> Started: " + elem + " uri="+uri +" (Attributes: " + JSON.stringify(attrs) + " )");
130 | });
131 | cb.onEndElementNS(function(elem, prefix, uri) {
132 | util.log("<= End: " + elem + " uri="+uri + "\n");
133 | parser.pause();// pause the parser
134 | setTimeout(function (){parser.resume();}, 200); //resume the parser
135 | });
136 | cb.onCharacters(function(chars) {
137 | //util.log(''+chars+"");
138 | });
139 | cb.onCdata(function(cdata) {
140 | util.log(''+cdata+"");
141 | });
142 | cb.onComment(function(msg) {
143 | util.log(''+msg+"");
144 | });
145 | cb.onWarning(function(msg) {
146 | util.log(''+msg+"");
147 | });
148 | cb.onError(function(msg) {
149 | util.log(''+JSON.stringify(msg)+"");
150 | });
151 | });
152 |
153 |
154 | //example read from chunks
155 | parser.parseString("
");
156 | parser.parseString("");
158 | parser.parseString("and lots");
159 | parser.parseString("and lots of text&am");
160 | parser.parseString("p;some more.");
161 | parser.parseString("");
164 | parser.parseString("");
166 |
167 | //example read from file
168 | parser.parseFile("sample.xml");
169 |
--------------------------------------------------------------------------------
/example.js:
--------------------------------------------------------------------------------
// Example: wire up every SAX callback, then feed the parser from a file and
// from a sequence of string chunks.
//
// util.log() has been removed from current Node.js releases (deprecation
// DEP0059), so this example logs through console.log() instead.
var xml = require("./lib/node-xml");

var parser = new xml.SaxParser(function(cb) {
    cb.onStartDocument(function() {

    });
    cb.onEndDocument(function() {

    });
    cb.onStartElementNS(function(elem, attrs, prefix, uri, namespaces) {
        console.log("=> Started: " + elem + " uri=" + uri + " (Attributes: " + JSON.stringify(attrs) + " )");
    });
    cb.onEndElementNS(function(elem, prefix, uri) {
        console.log("<= End: " + elem + " uri=" + uri + "\n");
        // Demonstrate pause/resume: stop after each closing tag and pick the
        // parse back up 100 ms later.
        parser.pause();
        setTimeout(function() { parser.resume(); }, 100);
    });
    cb.onCharacters(function(chars) {
        console.log(String(chars));
    });
    cb.onCdata(function(cdata) {
        console.log(String(cdata));
    });
    cb.onComment(function(msg) {
        console.log(String(msg));
    });
    cb.onWarning(function(msg) {
        console.log(String(msg));
    });
    cb.onError(function(msg) {
        console.log(String(JSON.stringify(msg)));
    });
});


//example read from file
parser.parseFile("sample.xml");

//example read from chunks
parser.parseString("");
parser.parseString("");
parser.parseString("and lots");
parser.parseString("and lots of text&am");
parser.parseString("p;some more.");
parser.parseString("");
parser.parseString("");
52 |
53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/lib/node-xml.js:
--------------------------------------------------------------------------------
1 | // node-xml
2 | // An xml parser for node.js
3 | // (C) Rob Righter (@robrighter) 2009 - 2010, Licensed under the MIT-LICENSE
4 | // Contributions from David Joham
5 |
6 |
7 | (function () {
8 |
9 | // CONSTANTS
10 | var whitespace = "\n\r\t ";
11 |
12 |
13 | //XMLP is a pull-based parser. The calling application passes an XML string
14 | //to the constructor, then repeatedly calls .next() to parse the next segment.
15 | //.next() returns a flag indicating what type of segment was found, and stores
16 | //data temporarily in a couple of member variables (name, content, array of
17 | //attributes), which can be accessed through several .get____() methods.
18 | //
19 | //Basically, XMLP is the lowest-common-denominator parser - a very simple
20 | //API which other wrappers can be built against.
21 |
22 |
// Constructs a pull parser over strXML.
//   strXML - the XML text to parse; "\r\n" and bare "\r" are rewritten to "\n"
//            before the text is stored.
var XMLP = function(strXML) {
    // Normalize line breaks: CRLF first, then any remaining lone CR.
    var normalized = SAXStrings.replace(strXML, null, null, "\r\n", "\n");
    normalized = SAXStrings.replace(normalized, null, null, "\r", "\n");

    this.m_xml = normalized;                        // text being parsed
    this.m_iP = 0;                                  // current read offset into m_xml
    this.m_iState = XMLP._STATE_PROLOG;             // document-structure state
    this.m_stack = new Stack();                     // names of currently open elements
    this._clearAttributes();                        // start with an empty attribute list
    this.m_pause = false;                           // set by pause(), cleared by resume()
    this.m_preInterruptIState = XMLP._STATE_PROLOG; // state saved by next() while paused
    this.m_namespaceList = [];                      // namespace declarations seen so far
    this.m_chunkTransitionContinuation = null;      // unparsed tail carried into the next chunk
};
39 |
40 |
41 | // CONSTANTS (these must be below the constructor)
// Event codes: the array position of each name is its numeric value (0..12).
[
    "_NONE",
    "_ELM_B",
    "_ELM_E",
    "_ELM_EMP",
    "_ATT",
    "_TEXT",
    "_ENTITY",
    "_PI",
    "_CDATA",
    "_COMMENT",
    "_DTD",
    "_ERROR",
    "_INTERRUPT"
].forEach(function(eventName, eventCode) {
    XMLP[eventName] = eventCode;
});

// Content source selectors (compared against m_cSrc by getContent()).
XMLP._CONT_XML = 0;
XMLP._CONT_ALT = 1;

// Slots inside each [name, value] attribute pair.
XMLP._ATT_NAME = 0;
XMLP._ATT_VAL = 1;

// Document-structure states stored in m_iState.
XMLP._STATE_PROLOG = 1;
XMLP._STATE_DOCUMENT = 2;
XMLP._STATE_MISC = 3;

// Error table: for every entry, XMLP.<name> becomes the numeric code and
// XMLP._errs[code] the matching message. Codes are the array positions (0..17).
XMLP._errs = [];
[
    ["ERR_CLOSE_PI",       "PI: missing closing sequence"],
    ["ERR_CLOSE_DTD",      "DTD: missing closing sequence"],
    ["ERR_CLOSE_COMMENT",  "Comment: missing closing sequence"],
    ["ERR_CLOSE_CDATA",    "CDATA: missing closing sequence"],
    ["ERR_CLOSE_ELM",      "Element: missing closing sequence"],
    ["ERR_CLOSE_ENTITY",   "Entity: missing closing sequence"],
    ["ERR_PI_TARGET",      "PI: target is required"],
    ["ERR_ELM_EMPTY",      "Element: cannot be both empty and closing"],
    ["ERR_ELM_NAME",       "Element: name must immediatly follow \"<\""],
    ["ERR_ELM_LT_NAME",    "Element: \"<\" not allowed in element names"],
    ["ERR_ATT_VALUES",     "Attribute: values are required and must be in quotes"],
    ["ERR_ATT_LT_NAME",    "Element: \"<\" not allowed in attribute names"],
    ["ERR_ATT_LT_VALUE",   "Attribute: \"<\" not allowed in attribute values"],
    ["ERR_ATT_DUP",        "Attribute: duplicate attributes not allowed"],
    ["ERR_ENTITY_UNKNOWN", "Entity: unknown entity"],
    ["ERR_INFINITELOOP",   "Infininte loop"],
    ["ERR_DOC_STRUCTURE",  "Document: only comments, processing instructions, or whitespace allowed outside of document element"],
    ["ERR_ELM_NESTING",    "Element: must be nested correctly"]
].forEach(function(entry, errorCode) {
    XMLP[entry[0]] = errorCode;
    XMLP._errs[errorCode] = entry[1];
});
85 |
86 |
87 |
// Loads the next chunk of the document into the parser.
//   strXML - the next piece of XML text.
// Any tail saved in m_chunkTransitionContinuation is put in front of the new
// chunk. The element stack and the attribute list are left untouched so that
// parsing carries on from where the previous chunk stopped.
XMLP.prototype.continueParsing = function(strXML) {
    var carriedOver = this.m_chunkTransitionContinuation;
    var chunk = carriedOver ? carriedOver + strXML : strXML;

    // Normalize line breaks: CRLF first, then any remaining lone CR.
    chunk = SAXStrings.replace(chunk, null, null, "\r\n", "\n");
    chunk = SAXStrings.replace(chunk, null, null, "\r", "\n");

    this.m_xml = chunk;
    this.m_iP = 0;
    this.m_iState = XMLP._STATE_DOCUMENT;
    this.m_pause = false;
    this.m_preInterruptIState = XMLP._STATE_PROLOG;
    this.m_chunkTransitionContinuation = null;
};
107 |
// Appends a [name, value] pair to the current attribute list.
XMLP.prototype._addAttribute = function(name, value) {
    this.m_atts.push([name, value]);
};
111 |
// Validates the event just produced by _parse() against the overall document
// structure and advances m_iState (PROLOG -> DOCUMENT -> MISC).
//   iEvent - one of the XMLP event codes.
// Returns iEvent unchanged when the event is allowed, otherwise the result of
// _setErr() for ERR_DOC_STRUCTURE or ERR_ELM_NESTING.
XMLP.prototype._checkStructure = function(iEvent) {
    var opensElement = (XMLP._ELM_B === iEvent) || (XMLP._ELM_EMP === iEvent);
    var closesElement = (XMLP._ELM_E === iEvent) || (XMLP._ELM_EMP === iEvent);
    var isCharData = (XMLP._TEXT === iEvent) || (XMLP._ENTITY === iEvent);

    if (XMLP._STATE_PROLOG === this.m_iState) {
        // Before the document element only whitespace character data is allowed.
        if (isCharData) {
            if (SAXStrings.indexOfNonWhitespace(this.getContent(), this.getContentBegin(), this.getContentEnd()) !== -1) {
                return this._setErr(XMLP.ERR_DOC_STRUCTURE);
            }
        }

        if (opensElement) {
            this.m_iState = XMLP._STATE_DOCUMENT;
            // Don't return - fall through to the DOCUMENT handling below.
        }
    }

    if (XMLP._STATE_DOCUMENT === this.m_iState) {
        if (opensElement) {
            this.m_stack.push(this.getName());
        }

        // An empty element is pushed above and popped straight away here.
        if (closesElement) {
            var strTop = this.m_stack.pop();
            if ((strTop == null) || (strTop !== this.getName())) {
                return this._setErr(XMLP.ERR_ELM_NESTING);
            }
        }

        // No open elements left: everything after this point is trailing "misc".
        if (this.m_stack.count() === 0) {
            this.m_iState = XMLP._STATE_MISC;
            return iEvent;
        }
    }

    if (XMLP._STATE_MISC === this.m_iState) {
        // The original compared against XMLP.EVT_DTD, which is never defined,
        // so a DOCTYPE after the document element slipped through unchecked.
        if (opensElement || (XMLP._ELM_E === iEvent) || (XMLP._DTD === iEvent)) {
            return this._setErr(XMLP.ERR_DOC_STRUCTURE);
        }

        if (isCharData) {
            if (SAXStrings.indexOfNonWhitespace(this.getContent(), this.getContentBegin(), this.getContentEnd()) !== -1) {
                return this._setErr(XMLP.ERR_DOC_STRUCTURE);
            }
        }
    }

    return iEvent;
};
157 |
// Replaces the attribute list with a fresh, empty array.
XMLP.prototype._clearAttributes = function() {
    this.m_atts = [];
};
161 |
// Looks up an attribute by name.
// Returns the position of the first stored pair whose name equals `name`,
// or -1 when there is none.
XMLP.prototype._findAttributeIndex = function(name) {
    var position = 0;
    while (position < this.m_atts.length) {
        if (this.m_atts[position][XMLP._ATT_NAME] == name) {
            return position;
        }
        position++;
    }
    return -1;
};
171 |
// Returns the number of stored attributes, or 0 when there is no attribute list.
XMLP.prototype.getAttributeCount = function() {
    if (!this.m_atts) {
        return 0;
    }
    return this.m_atts.length;
};
175 |
// Returns the name of the attribute at `index`, or null when `index` is out of range.
XMLP.prototype.getAttributeName = function(index) {
    var outOfRange = (index < 0) || (index >= this.m_atts.length);
    if (outOfRange) {
        return null;
    }
    return this.m_atts[index][XMLP._ATT_NAME];
};
179 |
// Returns the value of the attribute at `index` after passing it through
// __unescapeString(), or null when `index` is out of range.
XMLP.prototype.getAttributeValue = function(index) {
    var outOfRange = (index < 0) || (index >= this.m_atts.length);
    if (outOfRange) {
        return null;
    }
    var storedValue = this.m_atts[index][XMLP._ATT_VAL];
    return __unescapeString(storedValue);
};
183 |
// Returns the value of the attribute called `name`.
// An unknown name resolves to index -1, for which getAttributeValue() returns null.
XMLP.prototype.getAttributeValueByName = function(name) {
    var position = this._findAttributeIndex(name);
    return this.getAttributeValue(position);
};
187 |
// Returns the column of the current read offset, as computed by
// SAXStrings.getColumnNumber() over the current text.
XMLP.prototype.getColumnNumber = function() {
    var text = this.m_xml;
    var offset = this.m_iP;
    return SAXStrings.getColumnNumber(text, offset);
};
191 |
// Returns the string that holds the current content: the XML text itself when
// m_cSrc is _CONT_XML, otherwise the alternate buffer m_cAlt.
XMLP.prototype.getContent = function() {
    if (this.m_cSrc == XMLP._CONT_XML) {
        return this.m_xml;
    }
    return this.m_cAlt;
};
195 |
// Returns m_cB, the begin offset that callers pass together with getContent().
XMLP.prototype.getContentBegin = function() {
    var begin = this.m_cB;
    return begin;
};
199 |
// Returns m_cE, the end offset that callers pass together with getContent().
XMLP.prototype.getContentEnd = function() {
    var end = this.m_cE;
    return end;
};
203 |
// Returns the line of the current read offset, as computed by
// SAXStrings.getLineNumber() over the current text.
XMLP.prototype.getLineNumber = function() {
    var text = this.m_xml;
    var offset = this.m_iP;
    return SAXStrings.getLineNumber(text, offset);
};
207 |
// Returns m_name, the name recorded for the current event.
XMLP.prototype.getName = function() {
    var currentName = this.m_name;
    return currentName;
};
211 |
// Raises the pause flag; while it is set, next() reports _INTERRUPT instead of parsing.
XMLP.prototype.pause = function() {
    this.m_pause = true;
};
215 |
// Clears the pause flag and restores the structure state that next() saved
// when it reported the interrupt.
XMLP.prototype.resume = function() {
    var savedState = this.m_preInterruptIState;
    this.m_pause = false;
    this.m_iState = savedState;
};
220 |
// Produces the next parse event.
// While paused, the current structure state is saved and XMLP._INTERRUPT is
// returned without parsing. Otherwise the event from _parse() is returned
// after being validated by _checkStructure().
XMLP.prototype.next = function() {
    if (this.m_pause) {
        // Remember where we were so resume() can put the state back.
        this.m_preInterruptIState = this.m_iState;
        return XMLP._INTERRUPT;
    }

    var iEvent = this._parse();
    return this._checkStructure(iEvent);
};
231 |
232 | XMLP.prototype._parse = function() {
233 | if(this.m_iP == this.m_xml.length) {
234 | return XMLP._NONE;
235 | }
236 |
237 | function _indexOf(needle, haystack, start) {
238 | // This is an improvement over the native indexOf because it stops at the
239 | // end of the needle and doesn't continue to the end of the haystack looking.
240 | for(var i = 0; i < needle.length; i++) {
241 | if(needle.charAt(i) != haystack.charAt(start + i))
242 | return -1;
243 | }
244 | return start;
245 | }
246 |
247 | var fc = this.m_xml.charAt(this.m_iP);
248 | if (fc !== '<' && fc !== '&') {
249 | return this._parseText (this.m_iP);
250 | }
251 | else if(this.m_iP == _indexOf("", this.m_xml, this.m_iP)) {
252 | return this._parsePI (this.m_iP + 2);
253 | }
254 | else if(this.m_iP == _indexOf("= 0; i--){
315 | var item = this.m_namespaceList[i];
316 | if(item.prefix === ''){
317 | return item.uri;
318 | }
319 | }
320 |
321 | //still nothing, lets just return an empty string
322 | return '';
323 | }
324 |
// Drops every namespace entry whose scopetag equals the element being closed.
//   closingtagname - name of the element whose end tag was just seen.
// m_namespaceList is replaced by a new array holding the surviving entries in
// their original order.
XMLP.prototype._removeExpiredNamesapces = function (closingtagname) {
    // filter() states the intent directly; the original used map() only for
    // its side effect of pushing into a second array.
    this.m_namespaceList = this.m_namespaceList.filter(function (item) {
        return item.scopetag !== closingtagname;
    });
};
337 |
338 | ////////////////////////////////////////////////////////////////////////
339 |
340 |
// Parses a single name="value" attribute from m_xml.
//   iB - offset at which to start looking for the attribute name.
//   iE - end offset of the region that may contain the attribute.
// On success the pair is appended to the attribute list, m_iP is moved two
// characters past the closing quote and XMLP._ATT is returned.
// When no non-whitespace character is found before iE, the result of that
// search is returned unchanged. Every malformed case returns the result of
// _setErr(); a failure inside _replaceEntities() returns XMLP._ERROR.
XMLP.prototype._parseAttribute = function(iB, iE) {
    var iNB, iNE, iEq, iVB, iVE;
    var cQuote, strN, strV;
    // Declared locally: the original assigned iRet without a declaration,
    // which leaked a global and throws a ReferenceError in strict mode.
    var iRet;

    this.m_cAlt = ""; //resets the value so we don't use an old one by accident (see testAttribute7 in the test suite)

    // Start of the attribute name.
    iNB = SAXStrings.indexOfNonWhitespace(this.m_xml, iB, iE);
    if((iNB == -1) ||(iNB >= iE)) {
        return iNB;
    }

    // The "=" separating name and value must lie inside the region.
    iEq = this.m_xml.indexOf("=", iNB);
    if((iEq == -1) || (iEq > iE)) {
        return this._setErr(XMLP.ERR_ATT_VALUES);
    }

    // End of the name: last non-whitespace character before the "=".
    iNE = SAXStrings.lastIndexOfNonWhitespace(this.m_xml, iNB, iEq);

    // Opening quote of the value.
    iVB = SAXStrings.indexOfNonWhitespace(this.m_xml, iEq + 1, iE);
    if((iVB == -1) ||(iVB > iE)) {
        return this._setErr(XMLP.ERR_ATT_VALUES);
    }

    cQuote = this.m_xml.charAt(iVB);
    if(SAXStrings.QUOTES.indexOf(cQuote) == -1) {
        return this._setErr(XMLP.ERR_ATT_VALUES);
    }

    // Matching closing quote.
    iVE = this.m_xml.indexOf(cQuote, iVB + 1);
    if((iVE == -1) ||(iVE > iE)) {
        return this._setErr(XMLP.ERR_ATT_VALUES);
    }

    strN = this.m_xml.substring(iNB, iNE + 1);
    strV = this.m_xml.substring(iVB + 1, iVE);

    if(strN.indexOf("<") != -1) {
        return this._setErr(XMLP.ERR_ATT_LT_NAME);
    }

    if(strV.indexOf("<") != -1) {
        return this._setErr(XMLP.ERR_ATT_LT_VALUE);
    }

    // Newlines and tabs inside the value become single spaces.
    strV = SAXStrings.replace(strV, null, null, "\n", " ");
    strV = SAXStrings.replace(strV, null, null, "\t", " ");
    iRet = this._replaceEntities(strV);
    if(iRet == XMLP._ERROR) {
        return iRet;
    }

    // The value is taken from m_cAlt once _replaceEntities() has run.
    strV = this.m_cAlt;

    if(this._findAttributeIndex(strN) == -1) {
        this._addAttribute(strN, strV);
    }
    else {
        return this._setErr(XMLP.ERR_ATT_DUP);
    }

    this.m_iP = iVE + 2;

    return XMLP._ATT;
};
406 |
407 | XMLP.prototype._parseCDATA = function(iB) {
408 | var iE = this.m_xml.indexOf("]]>", iB);
409 | if (iE == -1) {
410 | //This item never closes, although it could be a malformed document, we will assume that we are mid-chunck, save the string and reurn as interrupted
411 | this.m_chunkTransitionContinuation = this.m_xml.slice(iB-9);//the '-", iB);
426 | if (iE == -1) {
427 | //This item never closes, although it could be a malformed document, we will assume that we are mid-chunck, save the string and reurn as interrupted
428 | this.m_chunkTransitionContinuation = this.m_xml.slice(iB-4);//the '-4' adds the '
--------------------------------------------------------------------------------