├── .npmignore ├── .travis.yml ├── index.js ├── .gitignore ├── examples ├── encoding.xml ├── collect-preserve.xml ├── collect-preserve.js ├── encoding.js └── http-stream.js ├── package.json ├── tests ├── test-collect-preserve.js ├── test-readable-stream.js └── fixtures │ └── collect-preserve.json ├── LICENSE ├── lib ├── finite-automata.js └── xml-stream.js └── README.md /.npmignore: -------------------------------------------------------------------------------- 1 | .git* 2 | .npmignore 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "0.10" 4 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | module.exports = require('./lib/xml-stream.js'); 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.swo 3 | *~ 4 | .DS_Store 5 | /assets/ 6 | /node_modules/ 7 | -------------------------------------------------------------------------------- /examples/encoding.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/assistunion/xml-stream/HEAD/examples/encoding.xml -------------------------------------------------------------------------------- /examples/collect-preserve.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 1 one 4 | 2 two 5 | 3 three 6 | 7 | 8 | 4 four 9 | 5 10 | five 11 | A 12 | B 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /examples/collect-preserve.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs') 
2 | , path = require('path') 3 | , XmlStream = require('../lib/xml-stream') 4 | ; 5 | 6 | // Create a file stream and pass it to XmlStream 7 | var stream = fs.createReadStream(path.join(__dirname, 'collect-preserve.xml')); 8 | var xml = new XmlStream(stream); 9 | 10 | xml.preserve('item', true); 11 | xml.collect('subitem'); 12 | xml.on('endElement: item', function(item) { 13 | console.log(item); 14 | }); 15 | -------------------------------------------------------------------------------- /examples/encoding.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs') 2 | , path = require('path') 3 | , XmlStream = require('../lib/xml-stream') 4 | ; 5 | 6 | // Create a file stream and pass it to XmlStream 7 | function setup(encoding) { 8 | var stream = fs.createReadStream(path.join(__dirname, 'encoding.xml')); 9 | var xml = new XmlStream(stream, encoding); 10 | xml.on('endElement: node', function(node) { 11 | console.log(node); 12 | }); 13 | xml.on('error', function(message) { 14 | console.log('Parsing as ' + (encoding || 'auto') + ' failed: ' + message); 15 | }); 16 | return xml; 17 | } 18 | 19 | var xml = setup('utf8'); // Parse as UTF-8 20 | var xml = setup('iso-8859-5'); // Parse as ISO 8859-5 21 | var xml = setup(); // Detect on the fly. 
22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "xml-stream", 3 | "description": "XML stream to JavaScript object converter based on Expat.", 4 | "keywords": [ 5 | "xml", 6 | "parser", 7 | "expat" 8 | ], 9 | "license": "MIT", 10 | "version": "0.4.5", 11 | "author": "AssistUnion ", 12 | "maintainers": [ 13 | "Anatoly Ressin ", 14 | "Dimitry Solovyov (http://100-hour.com)" 15 | ], 16 | "scripts": { 17 | "test": "mocha tests" 18 | }, 19 | "repository": { 20 | "type": "git", 21 | "url": "git://github.com/assistunion/xml-stream.git" 22 | }, 23 | "dependencies": { 24 | "iconv": "^2.1.4", 25 | "node-expat": "^2.3.7", 26 | "readable-stream": "^1.0.31" 27 | }, 28 | "directories": { 29 | "lib": "./lib" 30 | }, 31 | "main": "index", 32 | "devDependencies": { 33 | "mocha": "^1.21.4" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /tests/test-collect-preserve.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs') 2 | , path = require('path') 3 | , assert = require('assert') 4 | , XmlStream = require('../lib/xml-stream'); 5 | 6 | 7 | describe('XmlStream', function() { 8 | 9 | it('should deal nicely with preserve and collect when reading from file', function(done) { 10 | var stream = fs.createReadStream(path.resolve(__dirname, '../examples/collect-preserve.xml')); 11 | var fileExpected = fs.readFileSync(path.resolve(__dirname, 'fixtures/collect-preserve.json')); 12 | var xml = new XmlStream(stream); 13 | var results = []; 14 | 15 | xml.preserve('item', true); 16 | xml.collect('subitem'); 17 | xml.on('endElement: item', function(item) { 18 | results.push(item); 19 | }); 20 | 21 | xml.on('end', function () { 22 | 23 | var expected = JSON.parse(fileExpected); 24 | 25 | assert.deepEqual(results, expected); 26 | done(); 27 | }); 
28 | 29 | xml.on('error', function (err) { 30 | done(err); 31 | }); 32 | }); 33 | }); 34 | -------------------------------------------------------------------------------- /tests/test-readable-stream.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 'use strict'; 3 | 4 | var fs = require('fs') 5 | , assert = require('assert') 6 | , filename = require('path').resolve(__dirname, '../examples/collect-preserve.xml') 7 | , XmlStream = require('../lib/xml-stream') 8 | , Readable = require('stream').Readable || require('readable-stream'); 9 | 10 | /** 11 | * Creates a stream w/ data. 12 | */ 13 | function createStream (data) { 14 | var rs = new Readable(); 15 | rs.push(data); 16 | rs.push(null); 17 | 18 | return rs; 19 | } 20 | 21 | describe('XmlStream', function() { 22 | var file = fs.readFileSync(filename, {encoding: 'utf8'}); 23 | 24 | it('should deal with fake streams', function(done) { 25 | var stream = createStream(file); 26 | var results = []; 27 | var xml = new XmlStream(stream); 28 | 29 | xml.preserve('item', true); 30 | xml.collect('subitem'); 31 | xml.on('endElement: item', function(item) { 32 | results.push(item); 33 | }); 34 | 35 | xml.on('end', function () { 36 | assert(results.length); 37 | done(); 38 | }); 39 | 40 | xml.on('error', function (err) { 41 | done(err); 42 | }); 43 | }); 44 | }); 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2011 Anatoly Ressin, Dimitry Solovyov, Kirill Korolyov 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to 
permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /examples/http-stream.js: -------------------------------------------------------------------------------- 1 | var http = require('http'); 2 | var XmlStream = require('../lib/xml-stream'); 3 | 4 | // Request an RSS for a Twitter stream 5 | var request = http.get({ 6 | host: 'api.twitter.com', 7 | path: '/1/statuses/user_timeline/dimituri.rss' 8 | }).on('response', function(response) { 9 | // Pass the response as UTF-8 to XmlStream 10 | response.setEncoding('utf8'); 11 | var xml = new XmlStream(response); 12 | 13 | // When each item node is completely parsed, buffer its contents 14 | xml.on('updateElement: item', function(item) { 15 | // Change child to a new value, composed of its previous value 16 | // and the value of <pubDate> child. 
17 | item.title = item.title.match(/^[^:]+/)[0] + ' on ' + 18 | item.pubDate.replace(/ \+[0-9]{4}/, ''); 19 | }); 20 | 21 | // When <item>'s <description> descendant text is completely parsed, 22 | // buffer it and pass the containing node 23 | xml.on('text: item > description', function(element) { 24 | // Modify the <description> text to make it more readable, 25 | // highlight Twitter-specific and other links 26 | var url = /\b[a-zA-Z][a-zA-Z0-9\+\.\-]+:[^\s]+/g; 27 | var hashtag = /\b#[\w]+/g; 28 | var username = /\b@([\w]+)/g; 29 | element.$text = element.$text 30 | .replace(/^[^:]+:\s+/, '') //strip username prefix from tweet 31 | .replace(url, '<a href="$0">$0</a>') 32 | .replace(hashtag, '<a href="https://twitter.com/search/$0">$0</a>') 33 | .replace(username, '<a href="https://twitter.com/$1">$0</a>'); 34 | }); 35 | 36 | // When each chunk of unselected on unbuffered data is returned, 37 | // pass it to stdout 38 | xml.on('data', function(data) { 39 | process.stdout.write(data); 40 | }); 41 | }); 42 | -------------------------------------------------------------------------------- /lib/finite-automata.js: -------------------------------------------------------------------------------- 1 | module.exports = FiniteAutomata; 2 | function FiniteAutomata() { 3 | this._symbols = {}; 4 | this._states = {}; 5 | this._deterministic = true; 6 | this._state = {}; 7 | this._callbacks = { 8 | enter: {}, 9 | leave: {}, 10 | state: {}, 11 | flag: {} 12 | }; 13 | this._stack = []; 14 | this._stackPtr = -1; 15 | } 16 | 17 | var __own = Object.prototype.hasOwnProperty; 18 | 19 | function extend(target, source) { 20 | for (var key in source) if (__own.call(source, key)) { 21 | target[key] = source[key]; 22 | } 23 | } 24 | 25 | function run(type, args) { 26 | var cbs = this._callbacks[type]; 27 | for (var cb in this._state) if (__own.call(this._state, cb)) { 28 | if (__own.call(cbs, cb)) { 29 | var length = cbs[cb].length; 30 | var cbList = cbs[cb]; 31 | for (var i = 0; i < 
length; i++) { 32 | cbList[i].apply(global, args); 33 | } 34 | } 35 | } 36 | } 37 | 38 | FiniteAutomata.prototype.isDeterministic = function() { 39 | return this._deterministic; 40 | }; 41 | 42 | FiniteAutomata.prototype.on = function(type, state, cb) { 43 | if (!__own.call(this._callbacks, type)) { 44 | this._callbacks[type] = {}; 45 | } 46 | var typeCbs = this._callbacks[type]; 47 | if (!__own.call(typeCbs, state)) { 48 | typeCbs[state] = []; 49 | } 50 | typeCbs[state].push(cb); 51 | return this; 52 | }; 53 | 54 | FiniteAutomata.prototype.setState = function(state, args) { 55 | this._state = state; 56 | run.call(this, 'enter', args); 57 | run.call(this, 'state', args); 58 | return this; 59 | }; 60 | 61 | FiniteAutomata.prototype.nextState = function(symbol) { 62 | var newState = {}; 63 | for (var st in this._state) if (__own.call(this._state, st)) { 64 | if (__own.call(this._states, st)) { 65 | var next = this._states[st]; 66 | if (__own.call(next, symbol)) { 67 | extend(newState, next[symbol]); 68 | } 69 | if (__own.call(next, '')) { 70 | extend(newState, (next[''])); 71 | } 72 | } 73 | } 74 | return newState; 75 | }; 76 | 77 | FiniteAutomata.prototype.go = function(symbol, args) { 78 | var next = this.nextState(symbol) 79 | this.setState(next, args); 80 | return this; 81 | }; 82 | 83 | FiniteAutomata.prototype.leave = function(args) { 84 | this._stack[this._stackPtr] = undefined; 85 | run.call(this, 'leave', args); 86 | this._state = this._stack[--this._stackPtr]; 87 | return this; 88 | }; 89 | 90 | FiniteAutomata.prototype.enter = function(symbol, args) { 91 | if (args == null) { 92 | args = []; 93 | } 94 | var next = this.nextState(symbol); 95 | this._stack[++this._stackPtr] = next; 96 | this._state = next; 97 | run.call(this, 'flag'); 98 | run.call(this, 'enter', args); 99 | return this; 100 | }; 101 | 102 | FiniteAutomata.prototype.run = function(state, args) { 103 | run.call(this, state, args); 104 | }; 105 | 106 | FiniteAutomata.prototype.transition = 
function(stateFrom, symbol, stateTo) { 107 | this._symbols[symbol] = true; 108 | var s; 109 | if (__own.call(this._states, stateFrom)) { 110 | s = this._states[stateFrom]; 111 | } else { 112 | s = this._states[stateFrom] = {}; 113 | } 114 | var exists = __own.call(s, symbol); 115 | if (exists) { 116 | s = s[symbol]; 117 | } else { 118 | s = s[symbol] = {}; 119 | } 120 | if (!__own.call(s, stateTo)) { 121 | s[stateTo] = true; 122 | this._deterministic &= !exists; 123 | } 124 | return this; 125 | }; 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # XmlStream 2 | 3 | XmlStream is a Node.js XML stream parser and editor, based on 4 | [node-expat](https://github.com/astro/node-expat) (libexpat SAX-like parser 5 | binding). 6 | 7 | $ npm install xml-stream 8 | 9 | ## Rationale 10 | 11 | When working with large XML files, it is probably a bad idea to use an XML to 12 | JavaScript object converter, or simply buffer the whole document in memory. 13 | Then again, a typical SAX parser might be too low-level for some tasks (and 14 | often a real pain). 15 | 16 | This is why we've rolled our own stream parser that tries to address these 17 | shortcomings. It processes an XML stream chunk by chunk and fires events only 18 | for nodes of interest, matching them with CSS-like selectors. 19 | 20 | ## Events 21 | 22 | Supported events: 23 | 24 | * `data` on outgoing data chunk, 25 | * `end` when parsing has ended, 26 | * `startElement[: selector]` on opening tag for selector match, 27 | * `updateElement[: selector]` on finished node for selector match 28 | with its contents buffered, 29 | * `endElement[: selector]` on closing tag for selector match, 30 | * `text[: selector]` on tag text for selector match. 

When adding listeners for `startElement`, `updateElement`, and `text` the
callback can modify the provided node, before it is sent to the consumer.

Selector syntax is CSS-like and currently supports:

* `ancestor descendant`
* `parent > child`

Take a look at the examples for more information.

## Element Node

Each of the four node events has a callback with one argument. When parsing,
this argument is set to the current matched node. Having a chunk of XML like
this:

```xml
<item id="123" type="common">
  <title>Item Title</title>
  <description>Description of this item.</description>
  (text)
</item>
```

The structure of the **item** element node would be:

```javascript
{
  title: 'Item Title',
  description: 'Description of this item.',
  '$': {
    'id': '123',
    'type': 'common'
  },
  '$name': 'item',
  '$text': '(text)'
}
```

Naturally, element text and child elements wouldn't be known until discovered
in the stream, so the structure may differ across events. The complete
structure as displayed should be available on **updateElement**. The **$name**
is not available on **endElement**.

# Collecting Children

It is sometimes required to select elements that have many children with
one and the same name. Like this XML:

```xml
<item>
  <subitem>one</subitem>
  <subitem>two</subitem>
</item>
<item>
  <subitem>three</subitem>
  <subitem>four</subitem>
  <subitem>five</subitem>
</item>
```

By default, a parsed element node contains its children as properties. In the
case of several children with the same name, the last one would overwrite the
others.
To collect all of the *subitem* elements in an array use **collect**:

```javascript
xml.collect('subitem');
xml.on('endElement: item', function(item) {
  console.log(item);
})
```

# Preserving Elements and Text

By default, element text is returned as one concatenated string. In this XML:

```xml
<item>
  one <a>1</a>
  two <b>2</b>
</item>
```

The value of **$text** for *item* would be: `one 1 two 2` without any
indication of the order of element *a*, element *b*, and text parts.
To preserve this order:

```javascript
xml.preserve('item');
xml.on('endElement: item', function(item) {
  console.log(item);
})
```

# Pause and resume parsing

If you want parsing to pause (for example, until some asynchronous operation
of yours is finished), you can pause and resume XML parsing:

```javascript
xml.pause();
myAsyncFunction( function() {
  xml.resume();
});
```

Beware that resume() **must not** be called from within a handler callback.
137 | 138 | -------------------------------------------------------------------------------- /tests/fixtures/collect-preserve.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "$children": [ 4 | "\n", 5 | " 1 ", 6 | { 7 | "$children": [ 8 | "one" 9 | ], 10 | "$name": "subitem", 11 | "$text": "one" 12 | }, 13 | "\n", 14 | " 2 ", 15 | { 16 | "$children": [ 17 | "two" 18 | ], 19 | "$name": "subitem", 20 | "$text": "two" 21 | }, 22 | "\n", 23 | " 3 ", 24 | { 25 | "$children": [ 26 | "three" 27 | ], 28 | "$name": "subitem", 29 | "$text": "three" 30 | }, 31 | "\n", 32 | " " 33 | ], 34 | "$name": "item", 35 | "$text": "\n 1 \n 2 \n 3 \n ", 36 | "subitem": [ 37 | { 38 | "$children": [ 39 | "one" 40 | ], 41 | "$name": "subitem", 42 | "$text": "one" 43 | }, 44 | { 45 | "$children": [ 46 | "two" 47 | ], 48 | "$name": "subitem", 49 | "$text": "two" 50 | }, 51 | { 52 | "$children": [ 53 | "three" 54 | ], 55 | "$name": "subitem", 56 | "$text": "three" 57 | } 58 | ] 59 | }, 60 | { 61 | "$children": [ 62 | "\n", 63 | " 4 ", 64 | { 65 | "$children": [ 66 | "four" 67 | ], 68 | "$name": "subitem", 69 | "$text": "four" 70 | }, 71 | "\n", 72 | " 5 ", 73 | { 74 | "$children": [ 75 | "\n", 76 | " five", 77 | "\n", 78 | " ", 79 | { 80 | "$children": [ 81 | "A" 82 | ], 83 | "$name": "letter", 84 | "$text": "A" 85 | }, 86 | "\n", 87 | " ", 88 | { 89 | "$children": [ 90 | "B" 91 | ], 92 | "$name": "letter", 93 | "$text": "B" 94 | }, 95 | "\n", 96 | " " 97 | ], 98 | "$name": "subitem", 99 | "$text": "\n five\n \n \n ", 100 | "letter": { 101 | "$children": [ 102 | "B" 103 | ], 104 | "$name": "letter", 105 | "$text": "B" 106 | } 107 | }, 108 | "\n", 109 | " " 110 | ], 111 | "$name": "item", 112 | "$text": "\n 4 \n 5 \n ", 113 | "subitem": [ 114 | { 115 | "$children": [ 116 | "four" 117 | ], 118 | "$name": "subitem", 119 | "$text": "four" 120 | }, 121 | { 122 | "$children": [ 123 | "\n", 124 | " five", 125 | "\n", 126 | " ", 127 | { 128 | 
"$children": [ 129 | "A" 130 | ], 131 | "$name": "letter", 132 | "$text": "A" 133 | }, 134 | "\n", 135 | " ", 136 | { 137 | "$children": [ 138 | "B" 139 | ], 140 | "$name": "letter", 141 | "$text": "B" 142 | }, 143 | "\n", 144 | " " 145 | ], 146 | "$name": "subitem", 147 | "$text": "\n five\n \n \n ", 148 | "letter": { 149 | "$children": [ 150 | "B" 151 | ], 152 | "$name": "letter", 153 | "$text": "B" 154 | } 155 | } 156 | ] 157 | } 158 | ] 159 | -------------------------------------------------------------------------------- /lib/xml-stream.js: -------------------------------------------------------------------------------- 1 | var events = require('events') 2 | , expat = require('node-expat') 3 | , FiniteAutomata = require('./finite-automata') 4 | , Iconv = require('iconv').Iconv 5 | ; 6 | 7 | // Retains link to hasOwnProperty. 8 | var __own = Object.prototype.hasOwnProperty; 9 | 10 | // Tests if object is empty (has no own properties). 11 | function isEmpty(obj) { 12 | for (var key in obj) if (__own.call(obj, key)) { 13 | return false; 14 | } 15 | return true; 16 | } 17 | 18 | // XML entities. 19 | var entities = { 20 | '"': '"', 21 | '&': '&', 22 | '\'': ''', 23 | '<': '<', 24 | '>': '>' 25 | }; 26 | 27 | // Escapes text for XML. 28 | function escape(value) { 29 | return value.replace(/"|&|'|<|>/g, function(entity) { 30 | return entities[entity]; 31 | }); 32 | } 33 | 34 | // Parser events to finite automata events mapping. 35 | var faModes = { 36 | 'startElement': 'enter', 37 | 'endElement': 'leave', 38 | 'text': 'state' 39 | }; 40 | 41 | // I accidentally the whole class. 42 | module.exports = XmlStream; 43 | 44 | // **XmlStream** is an XML stream filter based on Expat. 45 | // It traverses a given stream and emits events for predefined selectors. 46 | // Event listeners receive selected elements, context, and trace from root. 
function XmlStream(stream, encoding) {
  events.EventEmitter.call(this);

  // Source stream and the selector automaton built up by on(), collect()
  // and preserve().
  this._stream = stream;
  this._fa = new FiniteAutomata();
  this._lastState = 0;
  this._startState = {};
  this._finalStates = {};

  // Output and buffering bookkeeping, updated while parsing.
  this._emitData = false;
  this._bufferLevel = 0;
  this._preserveLevel = 0;
  this._preserveWhitespace = 0;
  this._preserveAll = false;
  this._collect = false;
  this._parser = undefined;

  // The working encoding is UTF-8. An iconv converter is created only when
  // an explicit encoding other than UTF-8 was requested; without an
  // explicit encoding both fields stay null.
  this._encoding = encoding || null;
  this._encoder = makeEncoder(this._encoding);

  // Parsing starts on the next tick, after the code that constructed the
  // instance has finished running.
  process.nextTick(parse.bind(this));
}

// Returns an iconv converter from `encoding` to UTF-8, or null when no
// conversion is needed (no encoding given, or the encoding is UTF-8).
function makeEncoder(encoding) {
  if (!encoding || /^utf-?8$/i.test(encoding)) {
    return null;
  }
  return new Iconv(encoding, 'utf8');
}

// Inherit events.EventEmitter.
XmlStream.super_ = events.EventEmitter;
XmlStream.prototype = Object.create(events.EventEmitter.prototype);
Object.defineProperty(XmlStream.prototype, 'constructor', {
  value: XmlStream,
  enumerable: false
});

// Adds a listener for the specified event.
//
// Supported events:
//
// * `data` on outgoing data chunk,
// * `end` when parsing has ended,
// * `startElement[: selector]` on opening tag for selector match,
// * `updateElement[: selector]` on finished node for selector match
//   with its contents buffered,
// * `endElement[: selector]` on closing tag for selector match,
// * `text[: selector]` on tag text for selector match.
//
// When adding listeners for `startElement`, `updateElement`, and `text` the
// callback can modify the provided node, before it is sent to the consumer.
//
// Selector syntax is CSS-like and currently supports:
//
// * `ancestor descendant`
// * `parent > child`
//
// Returns the stream itself, so calls can be chained.
XmlStream.prototype.on = function(eventName, listener) {
  var event = parseEvent(eventName);

  if (event === null) {
    // A non-selector event. Listening for `data` switches XML output on.
    if (eventName === 'data') {
      this._emitData = true;
    }
    XmlStream.super_.prototype.on.call(this, eventName, listener);
    return this;
  }

  // A selector event: the listener is stored under the normalized name.
  XmlStream.super_.prototype.on.call(this, event.name, listener);

  // The automaton hook that re-emits the event is installed only once per
  // event name. The hook emits to every listener of that name, so a hook
  // per listener would make N listeners fire N times each per match.
  var hooked = this._hookedEvents || (this._hookedEvents = {});
  if (__own.call(hooked, event.name)) {
    return this;
  }
  hooked[event.name] = true;

  var finalState = getFinalState.call(this, event.selector);
  var self = this;
  if (event.type === 'updateElement') {
    // While inside a matched element, output is held back (buffered) ...
    this._fa.on('enter', finalState, function() {
      self._bufferLevel++;
    });
    // ... and written out after the listeners had the chance to modify the
    // element, once the outermost buffered element is left.
    this._fa.on('leave', finalState, function(element, context, trace) {
      self.emit(event.name, element, context, trace);
      if (!--self._bufferLevel && self._emitData) {
        emitElement.call(self, element, self._name, true);
      }
    });
  } else {
    this._fa.on(faModes[event.type], finalState, function(element, context, trace) {
      self.emit(event.name, element, context, trace);
    });
  }

  return this;
};

// Collects elements with identical names, specified by a selector.
// They will reside in the parent element as an array.
// Returns the stream itself, so calls can be chained.
XmlStream.prototype.collect = function(selector) {
  selector = normalizeSelector(selector);
  var finalState = getFinalState.call(this, selector);
  var self = this;
  // The flag is raised whenever an element matching the selector is entered.
  this._fa.on('flag', finalState, function() {
    self._collect = true;
  });

  return this;
};

// Preserves the order of element and text nodes inside elements
// that match the selector. Optionally, preserves whitespace.
// `whitespace` is tested for truthiness. Returns the stream for chaining.
XmlStream.prototype.preserve = function(selector, whitespace) {
  selector = normalizeSelector(selector);
  var finalState = getFinalState.call(this, selector);
  var self = this;
  // The counters are raised while inside a matching element and lowered
  // again when it is left, so nested matches are handled.
  this._fa.on('enter', finalState, function() {
    self._preserveLevel++;
    if (whitespace) {
      self._preserveWhitespace++;
    }
  });
  this._fa.on('leave', finalState, function() {
    self._preserveLevel--;
    if (whitespace) {
      self._preserveWhitespace--;
    }
  });

  return this;
};

// pause expat
// Pauses the source stream first, then the parser. Throws an Error when
// the parser reports that it could not be paused.
XmlStream.prototype.pause = function() {
  this._stream.pause();
  this._suspended = true;
  if( !this._parser.pause() ) {
    throw(new Error("Cannot pause parser: "+this._parser.getError()));
  }

  return this;
};

// resume expat
// Resumes the parser first, then the source stream. Throws an Error when
// the parser reports that it could not be resumed.
XmlStream.prototype.resume = function() {
  this._suspended = false;

  if( !this._parser.resume() ) {
    throw(new Error("Cannot resume parser: "+this._parser.getError()));
  }

  // resume stream only if parser hasn't been paused again
  if( !this._suspended ) {
    this._stream.resume();
  }

  return this;
};

// Normalizes the selector and returns the new version and its parts.
// The parts are the names and `>` combinators found in the selector; the
// normalized form is the parts joined by single spaces ('' if there are
// none).
function normalizeSelector(selector) {
  var parts = selector.match(/[^\s>]+|>/g);
  selector = (parts) ? parts.join(' ') : '';
  return {
    normalized: selector,
    parts: parts || []
  };
}

// Parses the selector event string and returns event information:
// `type` (startElement, endElement, updateElement or text), the normalized
// `selector` object and the `name` under which listeners are stored.
// Returns null for anything that is not a selector event.
function parseEvent(event) {
  var eventParts = event.match(/^((?:start|end|update)Element|text):?(.*)/);
  if (eventParts === null) {
    return null;
  }
  var eventType = eventParts[1];
  var selector = normalizeSelector(eventParts[2]);
  return {
    selector: selector,
    type: eventType,
    name: (eventParts[2]) ? eventType + ': ' + selector.normalized
                          : eventType
  };
}

// Compiles a given selector object to a finite automata
// and returns its last state. Must be called with the XmlStream as `this`.
// A selector that was compiled before yields the same state again.
function getFinalState(selector) {
  if (__own.call(this._finalStates, selector.normalized)) {
    return this._finalStates[selector.normalized];
  }

  var immediate = false;
  this._startState[this._lastState] = true;
  for (var i = 0; i < selector.parts.length; i++) {
    var part = selector.parts[i];
    if (part === '>') {
      // The next name has to be a direct child.
      immediate = true;
      continue;
    }
    if (!immediate) {
      // Descendant combinator: the state loops on itself for any element,
      // which allows an arbitrary number of levels in between.
      this._fa.transition(this._lastState, '', this._lastState);
    }
    var from = this._lastState;
    var to = ++this._lastState;
    this._fa.transition(from, part, to);
    immediate = false;
  }

  // The state after the final state is skipped, so that the next selector
  // starts from a fresh one.
  var finalState = this._lastState++;
  this._finalStates[selector.normalized] = finalState;
  return finalState;
}

// Emits XML for element opening tag.
// `attrs` maps attribute names to string values and may be undefined.
function emitStart(name, attrs) {
  this.emit('data', '<' + name);
  for (var attr in attrs) if (__own.call(attrs, attr)) {
    this.emit('data', ' ' + attr + '="' + escape(attrs[attr]) + '"');
  }
  this.emit('data', '>');
}

// Emits XML for element closing tag.
function emitEnd(name) {
  this.emit('data', '</' + name + '>');
}

// Emits XML for element text.
function emitText(text) {
  this.emit('data', escape(text));
}

// Emits a single element and its descendants, or an array of elements.
// Must be called with the XmlStream as `this`. When `onLeave` is truthy,
// the closing tag of the (last) element is left out.
function emitElement(element, name, onLeave) {
  if (!Array.isArray(element)) {
    emitOneElement.call(this, element, name, onLeave);
    return;
  }
  // An empty collection has nothing to emit. Without this guard the code
  // below would read element[0] (undefined) and fail on `undefined.$`.
  if (element.length === 0) {
    return;
  }
  var last = element.length - 1;
  for (var i = 0; i < last; i++) {
    emitOneElement.call(this, element[i], name);
  }
  // Only the last element of a collection honours `onLeave`.
  emitOneElement.call(this, element[last], name, onLeave);
}

// Emits child element collection and their descendants.
// Works only with preserved nodes: object entries are expected to carry
// `$name` and `$children`; every other entry is emitted as text.
function emitChildren(elements) {
  for (var i = 0; i < elements.length; i++) {
    var element = elements[i];
    if (typeof element === 'object') {
      emitStart.call(this, element.$name, element.$);
      emitChildren.call(this, element.$children);
      emitEnd.call(this, element.$name);
    } else {
      emitText.call(this, element);
    }
  }
}

// Recursively emits a given element and its descendants.
// `element` is either an element node (an object) or the plain text of an
// element without attributes and children. The closing tag is left out
// when `onLeave` is truthy.
function emitOneElement(element, name, onLeave) {
  if (typeof element === 'object') {
    emitStart.call(this, name, element.$);
    if (__own.call(element, '$children')) {
      // Preserved node: children are emitted in document order.
      emitChildren.call(this, element.$children);
    } else {
      // Regular node: child elements first, the concatenated text last.
      var hasText = false;
      for (var child in element) {
        if (__own.call(element, child) && child !== '$' && child !== '$name') {
          if (child === '$text') {
            hasText = true;
          } else {
            emitElement.call(this, element[child], child);
          }
        }
      }
      if (hasText) {
        emitText.call(this, element.$text);
      }
    }
  } else {
    // For a primitive `element.$` is undefined, so no attributes are
    // written.
    emitStart.call(this, name, element.$);
    emitText.call(this, element);
  }
  if (!onLeave) {
    emitEnd.call(this, name);
  }
}

// Starts parsing the source stream and emitting various events.
// The Expat parser is assigned several listeners for this purpose.
function parse() {
  // Called with the XmlStream instance as `this`. Creates the Expat parser
  // (stored in `this._parser`), attaches the element/text listeners that
  // build node objects and drive the finite automaton (`this._fa`), and
  // feeds the parser from `this._stream`, detecting the source encoding
  // from the XML declaration first when `this._encoding` is not set.
  var self = this;
  var xml = new expat.Parser('utf-8');
  this._parser = xml;
  this._suspended = false;
  // Frames of the enclosing elements; `curr` is the frame being parsed.
  var stack = [];
  // Maps an element path ('/a/b') to the element object at that path.
  var trace = {};
  var curr = {
    element: {},
    collect: this._collect,
    fullText: '',
    space: 0,
    path: '',
    context: {}
  };
  var fa = this._fa;
  fa.setState(this._startState);

  // Emits an *error* event describing the parser's current failure.
  var reportParseError = function() {
    self.emit('error', new Error(xml.getError()+" in line "+xml.getCurrentLineNumber()));
  };

  // A listener is assigned on opening tag encounter.
  // Here we traverse the configured finite automaton and use the stack
  // to form the context and trace for selector event emission.
  xml.on('startElement', function(name, attr) {
    self.emit('startElement', name, attr);
    stack.push(curr);
    trace[curr.path] = curr.element;
    // Child context inherits from the parent context via the prototype.
    var context = Object.create(curr.context);
    var element = {
      $: attr,
      $name: name,
      $text: ''
    };
    var parent = curr.element;
    curr = {
      element: element,
      collect: false,
      fullText: '',
      space: 0,
      path: curr.path + '/' + name,
      context: context
    };
    // Reset before entering; presumably a listener run by fa.enter sets
    // self._collect when this element must be collected into an array.
    self._collect = false;
    fa.enter(name, [element, context, trace]);
    if (self._preserveLevel > 0) {
      element.$children = [];
    }
    // Re-read the name: the element object was handed to fa.enter.
    name = element.$name;
    curr.collect = self._collect;
    if (curr.collect) {
      var container;
      if (__own.call(parent, name)) {
        container = parent[name];
        container.push(element);
      } else {
        container = [element];
        parent[name] = container;
      }
    } else {
      parent[name] = element;
      context[name] = element;
    }
    if (self._bufferLevel === 0 && self._emitData) {
      emitStart.call(self, name, element.$);
    }
  });

  // A listener is assigned on closing tag encounter.
  // Current node structure object is finalized. A selector listener is
  // invoked with current node, context, and trace; these arguments are
  // removed from the stack afterwards.
  xml.on('endElement', function(name) {
    self.emit('endElement', name);
    var prev = stack.pop();
    var element = curr.element;
    var text = curr.fullText;
    var attr = element.$;
    if (typeof attr !== 'object') {
      attr = {};
    }
    name = element.$name;
    self._name = name;
    // Strip bookkeeping keys so isEmpty() only sees child elements.
    delete element.$;
    delete element.$text;
    delete element.$name;
    var val = element;
    if (isEmpty(element) && isEmpty(attr)) {
      // No children and no attributes: the node collapses to its text.
      val = text;
    } else if (!isEmpty(attr)) {
      element.$ = attr;
    }
    if (text !== '') {
      element.$text = text;
    }
    if (self._bufferLevel > 0 || self._preserveLevel > 0) {
      element.$name = name;
    }
    curr.context[name] = val;
    if (curr.collect) {
      // Replace the placeholder pushed by the startElement listener.
      var container = prev.element[name];
      container[container.length - 1] = val;
    } else {
      prev.element[name] = val;
    }
    fa.leave([element, curr.context, trace]);
    if (self._preserveLevel > 0) {
      prev.element.$children.push(val);
    }
    if (self._bufferLevel === 0 && self._emitData) {
      emitEnd.call(self, name);
    }
    curr = prev;
    // `this` is the Expat parser inside this listener, so the flag has to
    // be restored on the stream object through `self`.
    self._collect = curr.collect;
  });

  // Collect node text part by part
  // (and trim leading and trailing whitespace).
  xml.on('text', function(text) {
    curr.element.$text = text;
    fa.run('state', [curr.element, curr.context, trace]);
    if (self._bufferLevel === 0 && self._emitData) {
      emitText.call(self, text);
    }
    if (!self._preserveAll) {
      var trimmed = curr.element.$text.trim();
      // Does this part start / end with whitespace?
      var spaced = curr.element.$text.substr(0, 1);
      spaced = (spaced !== '') && (spaced.trim() === '');
      var after = curr.element.$text.substr(-1, 1);
      after = (after !== '') && (after.trim() === '');
      switch (curr.space) {
        // No words yet (pass through spaces).
        case 0:
          if (trimmed !== '') {
            curr.space = after ? 2 : 1;
          }
          break;

        // Immediately after text or entity.
        case 1:
          if (trimmed === '') {
            curr.space = 2;
          } else {
            if (spaced) {
              curr.fullText += ' ';
            }
            if (after) {
              curr.space = 2;
            }
          }
          break;

        // Some words were emitted, pass through spaces again.
        // Emit spaces only when a word is encountered afterwards.
        case 2:
          if (trimmed !== '') {
            curr.fullText += ' ';
            curr.space = 1;
          }
          break;
      }
      text = self._preserveWhitespace > 0 ? text : trimmed;
      if (self._preserveLevel > 0) {
        if (text !== '') {
          curr.element.$children.push(text);
        }
      }
    }
    curr.fullText += text;
  });


  // This prelude array and string are used during encoding detection.
  // Incoming buffers are collected and parsing is postponed,
  // but only until the first tag.
  var prelude = '';
  var preludeBuffers = [];

  // Parse incoming chunk.
  // Convert to UTF-8 or emit errors when appropriate.
  var parseChunk = function(data) {
    if (self._encoder) {
      data = self._encoder.convert(data);
    }
    if (!xml.parse(data, false)) {
      reportParseError();
    }
  };

  // Fixes the source encoding and replays every chunk buffered so far.
  // The buffers are released afterwards, since later chunks go straight
  // to the parser.
  var startParsing = function(encoding) {
    self._encoding = encoding;
    self._encoder = makeEncoder(self._encoding);
    var buffered = preludeBuffers;
    preludeBuffers = [];
    prelude = '';
    for (var i = 0, n = buffered.length; i < n; i++) {
      parseChunk(buffered[i]);
    }
  };

  // Pass data from stream to parser.
  this._stream.on('data', function(data) {
    if (self._encoding) {
      parseChunk(data);
      return;
    }
    // We can't parse when the encoding is unknown, so we'll look into
    // the XML declaration, if there is one. For this, we need to buffer
    // incoming data until a full tag is received.
    preludeBuffers.push(data);
    prelude += data.toString();
    if (/^\s*<[^>]+>/.test(prelude)) {
      // The encoding value may be quoted with either " or ', and the
      // equals sign may be surrounded by whitespace.
      var matches = prelude.match(/^\s*<\?xml[^>]+encoding\s*=\s*(["'])(.+?)\1[^>]*\?>/);
      startParsing(matches ? matches[2] : 'utf8');
    }
  });

  // End parsing on stream EOF and emit an *end* event ourselves.
  this._stream.on('end', function() {
    // The stream ended before a complete first tag was seen: hand the
    // buffered data to the parser anyway (assuming the default encoding),
    // so it is parsed or reported as an error instead of being dropped.
    if (!self._encoding && preludeBuffers.length > 0) {
      startParsing('utf8');
    }
    if (!xml.parse('', true)) {
      reportParseError();
    }
    self.emit('end');
  });
}