├── .npmignore
├── .travis.yml
├── index.js
├── .gitignore
├── examples
├── encoding.xml
├── collect-preserve.xml
├── collect-preserve.js
├── encoding.js
└── http-stream.js
├── package.json
├── tests
├── test-collect-preserve.js
├── test-readable-stream.js
└── fixtures
│ └── collect-preserve.json
├── LICENSE
├── lib
├── finite-automata.js
└── xml-stream.js
└── README.md
/.npmignore:
--------------------------------------------------------------------------------
1 | .git*
2 | .npmignore
3 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: node_js
2 | node_js:
3 | - "0.10"
4 |
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
1 | module.exports = require('./lib/xml-stream.js');
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.swo
3 | *~
4 | .DS_Store
5 | /assets/
6 | /node_modules/
7 |
--------------------------------------------------------------------------------
/examples/encoding.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/assistunion/xml-stream/HEAD/examples/encoding.xml
--------------------------------------------------------------------------------
/examples/collect-preserve.xml:
--------------------------------------------------------------------------------
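1 | <root>
2 |   <item>
3 |     1 <subitem>one</subitem>
4 |     2 <subitem>two</subitem>
5 |     3 <subitem>three</subitem>
6 |   </item>
7 |   <item>
8 |     4 <subitem>four</subitem>
9 |     5 <subitem>
10 |       five
11 |       <letter>A</letter>
12 |       <letter>B</letter>
13 |     </subitem>
14 |   </item>
15 | </root>
16 |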
--------------------------------------------------------------------------------
/examples/collect-preserve.js:
--------------------------------------------------------------------------------
1 | var fs = require('fs')
2 | , path = require('path')
3 | , XmlStream = require('../lib/xml-stream')
4 | ;
5 |
// Feed the sample document into XmlStream as a file stream.
var stream = fs.createReadStream(path.join(__dirname, 'collect-preserve.xml'));
var xml = new XmlStream(stream);

// Keep the order of child nodes (and whitespace) inside every `item`,
// gather all `subitem` children into an array, and print each `item`
// as soon as its closing tag has been read.
xml
  .preserve('item', true)
  .collect('subitem')
  .on('endElement: item', function(item) {
    console.log(item);
  });
15 |
--------------------------------------------------------------------------------
/examples/encoding.js:
--------------------------------------------------------------------------------
1 | var fs = require('fs')
2 | , path = require('path')
3 | , XmlStream = require('../lib/xml-stream')
4 | ;
5 |
// Builds an XmlStream over `encoding.xml`, read with the given input
// encoding (or with no explicit encoding when the argument is omitted).
// Every parsed `node` element is printed; parse errors are reported together
// with the encoding that was attempted.
function setup(encoding) {
  var label = encoding || 'auto';
  var source = fs.createReadStream(path.join(__dirname, 'encoding.xml'));
  var parser = new XmlStream(source, encoding);

  parser.on('endElement: node', function(node) {
    console.log(node);
  });
  parser.on('error', function(message) {
    console.log('Parsing as ' + label + ' failed: ' + message);
  });

  return parser;
}

var xml;
xml = setup('utf8'); // Parse as UTF-8
xml = setup('iso-8859-5'); // Parse as ISO 8859-5
xml = setup(); // Detect on the fly.
22 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "xml-stream",
3 | "description": "XML stream to JavaScript object converter based on Expat.",
4 | "keywords": [
5 | "xml",
6 | "parser",
7 | "expat"
8 | ],
9 | "license": "MIT",
10 | "version": "0.4.5",
11 | "author": "AssistUnion ",
12 | "maintainers": [
13 | "Anatoly Ressin ",
14 | "Dimitry Solovyov (http://100-hour.com)"
15 | ],
16 | "scripts": {
17 | "test": "mocha tests"
18 | },
19 | "repository": {
20 | "type": "git",
21 | "url": "git://github.com/assistunion/xml-stream.git"
22 | },
23 | "dependencies": {
24 | "iconv": "^2.1.4",
25 | "node-expat": "^2.3.7",
26 | "readable-stream": "^1.0.31"
27 | },
28 | "directories": {
29 | "lib": "./lib"
30 | },
31 | "main": "index",
32 | "devDependencies": {
33 | "mocha": "^1.21.4"
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/tests/test-collect-preserve.js:
--------------------------------------------------------------------------------
1 | var fs = require('fs')
2 | , path = require('path')
3 | , assert = require('assert')
4 | , XmlStream = require('../lib/xml-stream');
5 |
6 |
describe('XmlStream', function() {

  // Parses the sample XML from disk with `preserve` + `collect` enabled and
  // compares every emitted `item` against the stored JSON fixture.
  it('should deal nicely with preserve and collect when reading from file', function(done) {
    var xmlPath = path.resolve(__dirname, '../examples/collect-preserve.xml');
    var fixturePath = path.resolve(__dirname, 'fixtures/collect-preserve.json');

    var stream = fs.createReadStream(xmlPath);
    var fileExpected = fs.readFileSync(fixturePath);
    var xml = new XmlStream(stream);
    var results = [];

    xml
      .preserve('item', true)
      .collect('subitem')
      .on('endElement: item', function(item) {
        results.push(item);
      })
      .on('end', function () {
        // The fixture is decoded only once parsing has finished.
        assert.deepEqual(results, JSON.parse(fileExpected));
        done();
      })
      .on('error', function (err) {
        done(err);
      });
  });
});
34 |
--------------------------------------------------------------------------------
/tests/test-readable-stream.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | 'use strict';
3 |
4 | var fs = require('fs')
5 | , assert = require('assert')
6 | , filename = require('path').resolve(__dirname, '../examples/collect-preserve.xml')
7 | , XmlStream = require('../lib/xml-stream')
8 | , Readable = require('stream').Readable || require('readable-stream');
9 |
/**
 * Builds a Readable that yields `data` as its only chunk and then ends.
 */
function createStream (data) {
  var readable = new Readable();

  // Queue the payload, followed by the end-of-stream marker (null).
  [data, null].forEach(function (chunk) {
    readable.push(chunk);
  });

  return readable;
}
20 |
describe('XmlStream', function() {
  // The sample document is loaded once, as a UTF-8 string.
  var file = fs.readFileSync(filename, {encoding: 'utf8'});

  // Feeds the document through an in-memory Readable instead of a file
  // stream and expects at least one `item` to have been emitted by `end`.
  it('should deal with fake streams', function(done) {
    var results = [];
    var xml = new XmlStream(createStream(file));

    xml
      .preserve('item', true)
      .collect('subitem')
      .on('endElement: item', function(item) {
        results.push(item);
      })
      .on('end', function () {
        assert(results.length);
        done();
      })
      .on('error', function (err) {
        done(err);
      });
  });
});
45 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2010-2011 Anatoly Ressin, Dimitry Solovyov, Kirill Korolyov
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/examples/http-stream.js:
--------------------------------------------------------------------------------
1 | var http = require('http');
2 | var XmlStream = require('../lib/xml-stream');
3 |
// Request an RSS for a Twitter stream
var request = http.get({
  host: 'api.twitter.com',
  path: '/1/statuses/user_timeline/dimituri.rss'
}).on('response', function(response) {
  // Pass the response as UTF-8 to XmlStream
  response.setEncoding('utf8');
  var xml = new XmlStream(response);

  // When each item node is completely parsed, buffer its contents
  xml.on('updateElement: item', function(item) {
    // Change the `title` child to a new value, composed of its previous
    // value (up to the first colon) and the value of the `pubDate` child.
    // Items lacking either child, or with an empty title, are left untouched
    // instead of throwing on a null match.
    if (typeof item.title !== 'string' || typeof item.pubDate !== 'string') {
      return;
    }
    var prefix = item.title.match(/^[^:]+/);
    if (prefix === null) {
      return;
    }
    item.title = prefix[0] + ' on ' +
      item.pubDate.replace(/ \+[0-9]{4}/, '');
  });

  // When the text of an item's `description` descendant is completely
  // parsed, buffer it and pass the containing node
  xml.on('text: item > description', function(element) {
    // Modify the text to make it more readable,
    // highlight Twitter-specific and other links.
    // URLs, #hashtags and @usernames are matched in a single pass so that a
    // '#' or '@' inside an already matched URL is not linked a second time.
    // `\B` (not `\b`) is required in front of '#' and '@': both are non-word
    // characters, so `\b` would only match them when glued to a word.
    var link = /\b[a-zA-Z][a-zA-Z0-9\+\.\-]+:[^\s]+|\B[#@]\w+/g;
    element.$text = element.$text
      .replace(/^[^:]+:\s+/, '') //strip username prefix from tweet
      .replace(link, function(match) {
        var href;
        if (match.charAt(0) === '#') {
          href = 'https://twitter.com/search?q=' + encodeURIComponent(match);
        } else if (match.charAt(0) === '@') {
          href = 'https://twitter.com/' + match.slice(1);
        } else {
          href = match;
        }
        // A replacement *function* is used because '$0' is not a
        // substitution pattern in JavaScript (the whole match is '$&').
        return '<a href="' + href + '">' + match + '</a>';
      });
  });

  // When each chunk of unselected or unbuffered data is returned,
  // pass it to stdout
  xml.on('data', function(data) {
    process.stdout.write(data);
  });
});
42 |
--------------------------------------------------------------------------------
/lib/finite-automata.js:
--------------------------------------------------------------------------------
1 | module.exports = FiniteAutomata;
// **FiniteAutomata** is a small, possibly non-deterministic state machine.
// The current state is a *set* of state ids (the own keys of an object).
// Transitions labelled with the empty symbol '' are followed on every input
// symbol. A stack of state sets supports nested enter/leave traversal.
function FiniteAutomata() {
  this._symbols = {};          // Every symbol used in a transition so far.
  this._states = {};           // stateFrom -> symbol -> set of target states.
  this._deterministic = true;  // Cleared once a symbol gets a second target.
  this._state = {};            // Currently active set of states.
  this._callbacks = {          // type -> state -> list of callbacks.
    enter: {},
    leave: {},
    state: {},
    flag: {}
  };
  this._stack = [];            // State sets pushed by enter().
  this._stackPtr = -1;         // Index of the top of the stack (-1: empty).
}

// Retains link to hasOwnProperty.
var __own = Object.prototype.hasOwnProperty;

// Copies all own enumerable properties of `source` onto `target`.
function extend(target, source) {
  for (var key in source) if (__own.call(source, key)) {
    target[key] = source[key];
  }
}

// Invokes, for every currently active state, the callbacks registered for
// that state under the given callback `type`, passing `args` as arguments.
// Must be called with the automaton as `this`.
function run(type, args) {
  var cbs = this._callbacks[type];
  if (!cbs) {
    // Nothing was ever registered for this type.
    return;
  }
  for (var cb in this._state) if (__own.call(this._state, cb)) {
    if (__own.call(cbs, cb)) {
      var cbList = cbs[cb];
      // The length is captured up front, so callbacks appended while
      // dispatching are not invoked during this run.
      var length = cbList.length;
      for (var i = 0; i < length; i++) {
        cbList[i].apply(global, args);
      }
    }
  }
}

// Returns `true` while no (state, symbol) pair has more than one target.
FiniteAutomata.prototype.isDeterministic = function() {
  return this._deterministic;
};

// Registers callback `cb` for `state` under callback `type`
// ('enter', 'leave', 'state', 'flag', or any custom type). Chainable.
FiniteAutomata.prototype.on = function(type, state, cb) {
  if (!__own.call(this._callbacks, type)) {
    this._callbacks[type] = {};
  }
  var typeCbs = this._callbacks[type];
  if (!__own.call(typeCbs, state)) {
    typeCbs[state] = [];
  }
  typeCbs[state].push(cb);
  return this;
};

// Replaces the active state set (without touching the stack) and runs the
// 'enter' and then the 'state' callbacks with `args`. Chainable.
FiniteAutomata.prototype.setState = function(state, args) {
  this._state = state;
  run.call(this, 'enter', args);
  run.call(this, 'state', args);
  return this;
};

// Computes the state set reached from the active set on `symbol`,
// also following empty-symbol ('') transitions. Does not change the automaton.
FiniteAutomata.prototype.nextState = function(symbol) {
  var newState = {};
  for (var st in this._state) if (__own.call(this._state, st)) {
    if (__own.call(this._states, st)) {
      var next = this._states[st];
      if (__own.call(next, symbol)) {
        extend(newState, next[symbol]);
      }
      if (__own.call(next, '')) {
        extend(newState, (next['']));
      }
    }
  }
  return newState;
};

// Moves to the state set reached on `symbol` via setState(). Chainable.
FiniteAutomata.prototype.go = function(symbol, args) {
  var next = this.nextState(symbol);
  this.setState(next, args);
  return this;
};

// Runs the 'leave' callbacks for the active state set with `args`, then pops
// the stack and reactivates the state set below it. Chainable.
FiniteAutomata.prototype.leave = function(args) {
  this._stack[this._stackPtr] = undefined;
  run.call(this, 'leave', args);
  this._state = this._stack[--this._stackPtr];
  return this;
};

// Pushes the state set reached on `symbol`, makes it active and runs the
// 'flag' callbacks (without arguments) followed by the 'enter' callbacks
// (with `args`, defaulting to an empty list). Chainable.
FiniteAutomata.prototype.enter = function(symbol, args) {
  if (args == null) {
    args = [];
  }
  var next = this.nextState(symbol);
  this._stack[++this._stackPtr] = next;
  this._state = next;
  run.call(this, 'flag');
  run.call(this, 'enter', args);
  return this;
};

// Runs the callbacks of an arbitrary type for the active state set.
FiniteAutomata.prototype.run = function(state, args) {
  run.call(this, state, args);
};

// Adds the transition `stateFrom` --symbol--> `stateTo`. Adding a second,
// different target for an existing (stateFrom, symbol) pair marks the
// automaton as non-deterministic. Chainable.
FiniteAutomata.prototype.transition = function(stateFrom, symbol, stateTo) {
  this._symbols[symbol] = true;
  var s;
  if (__own.call(this._states, stateFrom)) {
    s = this._states[stateFrom];
  } else {
    s = this._states[stateFrom] = {};
  }
  var exists = __own.call(s, symbol);
  if (exists) {
    s = s[symbol];
  } else {
    s = s[symbol] = {};
  }
  if (!__own.call(s, stateTo)) {
    s[stateTo] = true;
    // Logical (not bitwise) AND keeps the flag a real boolean.
    this._deterministic = this._deterministic && !exists;
  }
  return this;
};
126 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # XmlStream
2 |
3 | XmlStream is a Node.js XML stream parser and editor, based on
4 | [node-expat](https://github.com/astro/node-expat) (libexpat SAX-like parser
5 | binding).
6 |
7 | $ npm install xml-stream
8 |
9 | ## Rationale
10 |
11 | When working with large XML files, it is probably a bad idea to use an XML to
12 | JavaScript object converter, or simply buffer the whole document in memory.
13 | Then again, a typical SAX parser might be too low-level for some tasks (and
14 | often a real pain).
15 |
16 | This is why we've rolled our own stream parser that tries to address these
17 | shortcomings. It processes an XML stream chunk by chunk and fires events only
18 | for nodes of interest, matching them with CSS-like selectors.
19 |
20 | ## Events
21 |
22 | Supported events:
23 |
24 | * `data` on outgoing data chunk,
25 | * `end` when parsing has ended,
26 | * `startElement[: selector]` on opening tag for selector match,
27 | * `updateElement[: selector]` on finished node for selector match
28 | with its contents buffered,
29 | * `endElement[: selector]` on closing tag for selector match,
30 | * `text[: selector]` on tag text for selector match.
31 |
32 | When adding listeners for `startElement`, `updateElement`, and `text` the
33 | callback can modify the provided node, before it is sent to the consumer.
34 |
35 | Selector syntax is CSS-like and currently supports:
36 |
37 | * `ancestor descendant`
38 | * `parent > child`
39 |
40 | Take a look at the examples for more information.
41 |
42 | ## Element Node
43 |
44 | Each of the four node events has a callback with one argument. When parsing,
45 | this argument is set to the current matched node. Having a chunk of XML like
46 | this:
47 |
48 | ```xml
49 | <item id="123" type="common">
50 |   <title>Item Title</title>
51 |   <description>Description of this item.</description>
52 |   (text)
53 | </item>
54 | ```
55 |
56 | The structure of the **item** element node would be:
57 |
58 | ```javascript
59 | {
60 | title: 'Item Title',
61 | description: 'Description of this item.',
62 | '$': {
63 | 'id': '123',
64 | 'type': 'common'
65 | },
66 | '$name': 'item',
67 | '$text': '(text)'
68 | }
69 | ```
70 |
71 | Naturally, element text and child elements wouldn't be known until discovered
72 | in the stream, so the structure may differ across events. The complete
73 | structure as displayed should be available on **updateElement**. The **$name**
74 | is not available on **endElement**.
75 |
76 | # Collecting Children
77 |
78 | It is sometimes required to select elements that have many children with
79 | one and the same name. Like this XML:
80 |
81 | ```xml
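82 | <item>
83 |   <subitem>one</subitem>
84 |   <subitem>two</subitem>
85 | </item>
86 | <item>
87 |   <subitem>three</subitem>
88 |   <subitem>four</subitem>
89 |   <subitem>five</subitem>
90 | </item>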
91 | ```
92 |
93 | By default, parsed element node contains children as properties. In the case
94 | of several children with same names, the last one would overwrite others.
95 | To collect all of *subitem* elements in an array use **collect**:
96 |
97 | ```javascript
98 | xml.collect('subitem');
99 | xml.on('endElement: item', function(item) {
100 | console.log(item);
101 | })
102 | ```
103 |
104 | # Preserving Elements and Text
105 |
106 | By default, element text is returned as one concatenated string. In this XML:
107 |
108 | ```xml
109 | <item>
110 |   one <a>1</a>
111 |   two <b>2</b>
112 | </item>
113 | ```
114 |
115 | The value of **$text** for *item* would be: `one 1 two 2` without any
116 | indication of the order of element *a*, element *b*, and text parts.
117 | To preserve this order:
118 |
119 | ```javascript
120 | xml.preserve('item');
121 | xml.on('endElement: item', function(item) {
122 | console.log(item);
123 | })
124 | ```
125 |
126 | # Pause and resume parsing
127 |
128 | If you want parsing to pause (for example, until some asynchronous operation
129 | of yours is finished), you can pause and resume XML parsing:
130 | ```javascript
131 | xml.pause();
132 | myAsyncFunction( function() {
133 | xml.resume();
134 | });
135 | ```
136 | Beware that resume() **must not** be called from within a handler callback.
137 |
138 |
--------------------------------------------------------------------------------
/tests/fixtures/collect-preserve.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "$children": [
4 | "\n",
5 | " 1 ",
6 | {
7 | "$children": [
8 | "one"
9 | ],
10 | "$name": "subitem",
11 | "$text": "one"
12 | },
13 | "\n",
14 | " 2 ",
15 | {
16 | "$children": [
17 | "two"
18 | ],
19 | "$name": "subitem",
20 | "$text": "two"
21 | },
22 | "\n",
23 | " 3 ",
24 | {
25 | "$children": [
26 | "three"
27 | ],
28 | "$name": "subitem",
29 | "$text": "three"
30 | },
31 | "\n",
32 | " "
33 | ],
34 | "$name": "item",
35 | "$text": "\n 1 \n 2 \n 3 \n ",
36 | "subitem": [
37 | {
38 | "$children": [
39 | "one"
40 | ],
41 | "$name": "subitem",
42 | "$text": "one"
43 | },
44 | {
45 | "$children": [
46 | "two"
47 | ],
48 | "$name": "subitem",
49 | "$text": "two"
50 | },
51 | {
52 | "$children": [
53 | "three"
54 | ],
55 | "$name": "subitem",
56 | "$text": "three"
57 | }
58 | ]
59 | },
60 | {
61 | "$children": [
62 | "\n",
63 | " 4 ",
64 | {
65 | "$children": [
66 | "four"
67 | ],
68 | "$name": "subitem",
69 | "$text": "four"
70 | },
71 | "\n",
72 | " 5 ",
73 | {
74 | "$children": [
75 | "\n",
76 | " five",
77 | "\n",
78 | " ",
79 | {
80 | "$children": [
81 | "A"
82 | ],
83 | "$name": "letter",
84 | "$text": "A"
85 | },
86 | "\n",
87 | " ",
88 | {
89 | "$children": [
90 | "B"
91 | ],
92 | "$name": "letter",
93 | "$text": "B"
94 | },
95 | "\n",
96 | " "
97 | ],
98 | "$name": "subitem",
99 | "$text": "\n five\n \n \n ",
100 | "letter": {
101 | "$children": [
102 | "B"
103 | ],
104 | "$name": "letter",
105 | "$text": "B"
106 | }
107 | },
108 | "\n",
109 | " "
110 | ],
111 | "$name": "item",
112 | "$text": "\n 4 \n 5 \n ",
113 | "subitem": [
114 | {
115 | "$children": [
116 | "four"
117 | ],
118 | "$name": "subitem",
119 | "$text": "four"
120 | },
121 | {
122 | "$children": [
123 | "\n",
124 | " five",
125 | "\n",
126 | " ",
127 | {
128 | "$children": [
129 | "A"
130 | ],
131 | "$name": "letter",
132 | "$text": "A"
133 | },
134 | "\n",
135 | " ",
136 | {
137 | "$children": [
138 | "B"
139 | ],
140 | "$name": "letter",
141 | "$text": "B"
142 | },
143 | "\n",
144 | " "
145 | ],
146 | "$name": "subitem",
147 | "$text": "\n five\n \n \n ",
148 | "letter": {
149 | "$children": [
150 | "B"
151 | ],
152 | "$name": "letter",
153 | "$text": "B"
154 | }
155 | }
156 | ]
157 | }
158 | ]
159 |
--------------------------------------------------------------------------------
/lib/xml-stream.js:
--------------------------------------------------------------------------------
1 | var events = require('events')
2 | , expat = require('node-expat')
3 | , FiniteAutomata = require('./finite-automata')
4 | , Iconv = require('iconv').Iconv
5 | ;
6 |
7 | // Retains link to hasOwnProperty.
8 | var __own = Object.prototype.hasOwnProperty;
9 |
// Reports whether an object carries no own enumerable properties.
// Properties reachable only through the prototype chain are ignored.
function isEmpty(obj) {
  var key;
  for (key in obj) {
    if (!__own.call(obj, key)) {
      continue;
    }
    return false;
  }
  return true;
}
17 |
// XML entities: maps each character that is special in XML markup to the
// predefined entity reference that stands for it.
var entities = {
  '"': '&quot;',
  '&': '&amp;',
  '\'': '&apos;',
  '<': '&lt;',
  '>': '&gt;'
};

// Escapes text for XML.
// Every `"`, `&`, `'`, `<` and `>` in `value` (a string) is replaced with
// its entity reference; all other characters are returned unchanged.
function escape(value) {
  return value.replace(/"|&|'|<|>/g, function(entity) {
    return entities[entity];
  });
}
33 |
// Parser events to finite automata events mapping.
// Keys are selector event types; values are the FiniteAutomata callback
// types their listeners are registered under. `updateElement` is absent
// because XmlStream#on wires it separately.
var faModes = {
  'startElement': 'enter',
  'endElement': 'leave',
  'text': 'state'
};
40 |
41 | // The module exports the XmlStream constructor.
42 | module.exports = XmlStream;
43 |
// **XmlStream** is an XML stream filter based on Expat.
// It walks the given stream and emits events for the selectors that were
// registered on it. Selector listeners are called with the selected element,
// its context, and the trace from the root.
//
// `stream` is the source stream; `encoding` optionally names its character
// encoding.
function XmlStream(stream, encoding) {
  events.EventEmitter.call(this);
  var self = this;

  // Source and parser (the parser is created later, by `parse`).
  this._stream = stream;
  this._parser = undefined;

  // Selector automaton and its bookkeeping.
  this._fa = new FiniteAutomata();
  this._lastState = 0;
  this._startState = {};
  this._finalStates = {};

  // Output, buffering, preserve and collect state.
  this._emitData = false;
  this._bufferLevel = 0;
  this._preserveLevel = 0;
  this._preserveWhitespace = 0;
  this._preserveAll = false;
  this._collect = false;

  // Input encoding. An iconv converter is created only when the input is
  // declared as something other than UTF-8, the working encoding.
  this._encoding = encoding || null;
  this._encoder = makeEncoder(this._encoding);

  // Parsing starts on the next tick, after the caller has had a chance to
  // attach listeners.
  process.nextTick(function () {
    parse.call(self);
  });
}
75 |
// Returns an iconv converter from `encoding` to UTF-8, or `null` when no
// conversion is needed (no encoding given, or the encoding is UTF-8 itself).
function makeEncoder(encoding) {
  var isUtf8 = /^utf-?8$/i;
  if (!encoding || isUtf8.test(encoding)) {
    return null;
  }
  return new Iconv(encoding, 'utf8');
}
83 |
// Inherit events.EventEmitter.
// `super_` keeps a reference to the parent constructor so that overridden
// methods can still reach the EventEmitter implementation.
XmlStream.super_ = events.EventEmitter;
// The prototype is created from EventEmitter.prototype, with a
// non-enumerable `constructor` property pointing back at XmlStream.
XmlStream.prototype = Object.create(events.EventEmitter.prototype, {
  constructor: {
    value: XmlStream,
    enumerable: false
  }
});
92 |
// Adds a listener for the specified event.
//
// Supported events:
//
// * `data` on outgoing data chunk,
// * `end` when parsing has ended,
// * `startElement[: selector]` on opening tag for selector match,
// * `updateElement[: selector]` on finished node for selector match
//   with its contents buffered,
// * `endElement[: selector]` on closing tag for selector match,
// * `text[: selector]` on tag text for selector match.
//
// When adding listeners for `startElement`, `updateElement`, and `text` the
// callback can modify the provided node, before it is sent to the consumer.
//
// Selector syntax is CSS-like and currently supports:
//
// * `ancestor descendant`
// * `parent > child`
//
// Returns the stream itself, so calls can be chained.
XmlStream.prototype.on = function(eventName, listener) {
  var addListener = XmlStream.super_.prototype.on;
  var event = parseEvent(eventName);

  // Non-selector events go straight to the EventEmitter. Subscribing to
  // `data` additionally switches XML output on.
  if (event === null) {
    if (eventName === 'data') {
      this._emitData = true;
    }
    addListener.call(this, eventName, listener);
    return this;
  }

  // Selector events: the listener is stored under the normalized event name
  // and the automaton re-emits that event when the selector's final state
  // is reached.
  addListener.call(this, event.name, listener);
  var finalState = getFinalState.call(this, event.selector);
  var self = this;
  var forward = function(element, context, trace) {
    self.emit(event.name, element, context, trace);
  };

  if (event.type !== 'updateElement') {
    this._fa.on(faModes[event.type], finalState, forward);
    return this;
  }

  // `updateElement` buffers the node while it is open; once the outermost
  // buffered node is left, it is written out (when `data` is consumed).
  this._fa.on('enter', finalState, function() {
    self._bufferLevel += 1;
  });
  this._fa.on('leave', finalState, function(element, context, trace) {
    forward(element, context, trace);
    self._bufferLevel -= 1;
    if (!self._bufferLevel && self._emitData) {
      emitElement.call(self, element, self._name, true);
    }
  });
  return this;
};
146 |
// Collects elements with identical names, specified by a selector.
// Matching elements are stored in their parent element as an array.
// Returns the stream itself, so calls can be chained.
XmlStream.prototype.collect = function(selector) {
  var self = this;
  var finalState = getFinalState.call(this, normalizeSelector(selector));

  // The flag is raised whenever the automaton enters the selector's state.
  this._fa.on('flag', finalState, function() {
    self._collect = true;
  });

  return this;
};
159 |
// Preserves the order of element and text nodes inside elements
// that match the selector. Optionally, preserves whitespace.
// Returns the stream itself, so calls can be chained.
XmlStream.prototype.preserve = function(selector, whitespace) {
  var self = this;
  var finalState = getFinalState.call(this, normalizeSelector(selector));

  // Builds a callback that shifts the preserve counters by `delta`:
  // +1 when a matching element is entered, -1 when it is left.
  function shiftBy(delta) {
    return function() {
      self._preserveLevel += delta;
      if (whitespace) {
        self._preserveWhitespace += delta;
      }
    };
  }

  this._fa.on('enter', finalState, shiftBy(1));
  this._fa.on('leave', finalState, shiftBy(-1));

  return this;
};
181 |
// Pauses the source stream and the Expat parser.
// Throws an Error when the parser reports that it could not be paused.
// Returns the stream itself, so calls can be chained.
XmlStream.prototype.pause = function() {
  this._stream.pause();
  this._suspended = true;

  var paused = this._parser.pause();
  if (paused) {
    return this;
  }
  throw new Error("Cannot pause parser: " + this._parser.getError());
};
192 |
// Resumes the Expat parser and then the source stream.
// Throws an Error when the parser reports that it could not be resumed.
// Returns the stream itself, so calls can be chained.
XmlStream.prototype.resume = function() {
  this._suspended = false;

  var resumed = this._parser.resume();
  if (!resumed) {
    throw new Error("Cannot resume parser: " + this._parser.getError());
  }

  // Resuming the parser may have paused the stream again;
  // in that case the source stream stays paused.
  if (this._suspended) {
    return this;
  }
  this._stream.resume();
  return this;
};
208 |
// Splits a selector into its parts (names and `>` combinators) and returns
// them together with the normalized selector, in which the parts are
// separated by single spaces.
function normalizeSelector(selector) {
  var tokens = selector.match(/[^\s>]+|>/ig) || [];
  return {
    normalized: tokens.join(' '),
    parts: tokens
  };
}
218 |
// Parses a selector event string such as `endElement: item > title`.
// Returns `null` when the string is not a selector event; otherwise returns
// the normalized selector, the event type, and the normalized event name.
function parseEvent(event) {
  var pieces = event.match(/^((?:start|end|update)Element|text):?(.*)/);
  if (pieces === null) {
    return null;
  }

  var type = pieces[1];
  var rawSelector = pieces[2];
  var selector = normalizeSelector(rawSelector);

  // Without a selector the event name is just the event type.
  var name = type;
  if (rawSelector) {
    name = type + ': ' + selector.normalized;
  }

  return {
    selector: selector,
    type: type,
    name: name
  };
}
234 |
// Compiles a given selector object into the finite automata and returns the
// state that marks a match. Each normalized selector is compiled only once;
// later calls return the remembered state.
// Must be called with the XmlStream instance as `this`.
function getFinalState(selector) {
  var key = selector.normalized;
  if (__own.call(this._finalStates, key)) {
    return this._finalStates[key];
  }

  // Set by a `>` part: the next name must be an immediate child.
  var direct = false;
  this._startState[this._lastState] = true;

  selector.parts.forEach(function(part) {
    if (part === '>') {
      direct = true;
      return;
    }
    var from = this._lastState;
    if (!direct) {
      // Descendant combinator: an empty-symbol self-loop lets the automaton
      // stay in `from` while other elements are passed.
      this._fa.transition(from, '', from);
    }
    this._lastState = from + 1;
    this._fa.transition(from, part, this._lastState);
    direct = false;
  }, this);

  // The next selector gets a fresh start state after this final state.
  var finalState = this._lastState;
  this._lastState = finalState + 1;
  this._finalStates[key] = finalState;
  return finalState;
}
261 |
// Emits XML for an element's opening tag as a series of `data` events:
// the tag name, one chunk per own attribute (value escaped), and the
// closing bracket.
function emitStart(name, attrs) {
  var attr;
  var pair;
  this.emit('data', '<' + name);
  for (attr in attrs) {
    if (!__own.call(attrs, attr)) {
      continue;
    }
    pair = attr + '="' + escape(attrs[attr]) + '"';
    this.emit('data', ' ' + pair);
  }
  this.emit('data', '>');
}
270 |
// Emits XML for element closing tag (`</name>`) as a single `data` event.
// Must be called with the XmlStream instance as `this`.
function emitEnd(name) {
  this.emit('data', '</' + name + '>');
}
275 |
// Emits element text, escaped for XML, as a single `data` event.
function emitText(text) {
  var escaped = escape(text);
  this.emit('data', escaped);
}
280 |
// Emits a single element and its descendants, or an array of elements that
// share the same `name`.
//
// `onLeave` is forwarded only to the last (or only) element. An empty array
// emits nothing, rather than handing an undefined element to emitOneElement.
function emitElement(element, name, onLeave) {
  if (!Array.isArray(element)) {
    emitOneElement.call(this, element, name, onLeave);
    return;
  }
  if (element.length === 0) {
    return;
  }
  var last = element.length - 1;
  for (var i = 0; i < last; i++) {
    emitOneElement.call(this, element[i], name);
  }
  emitOneElement.call(this, element[last], name, onLeave);
}
293 |
// Emits an ordered list of child nodes: object entries are written as
// elements (opening tag, their own `$children`, closing tag), all other
// entries are written as text.
// Works only with preserved nodes.
function emitChildren(elements) {
  var index = 0;
  var child;
  while (index < elements.length) {
    child = elements[index];
    index += 1;
    if (typeof child !== 'object') {
      emitText.call(this, child);
      continue;
    }
    emitStart.call(this, child.$name, child.$);
    emitChildren.call(this, child.$children);
    emitEnd.call(this, child.$name);
  }
}
309 |
// Recursively emits a given element and its descendants.
// The closing tag is omitted when `onLeave` is truthy.
function emitOneElement(element, name, onLeave) {
  var isNode = typeof element === 'object';
  var child;
  var hasText = false;

  // The opening tag is written the same way for nodes and plain values.
  emitStart.call(this, name, element.$);

  if (!isNode) {
    // A plain value is the element's text.
    emitText.call(this, element);
  } else if (__own.call(element, '$children')) {
    // Preserved node: children are replayed in document order.
    emitChildren.call(this, element.$children);
  } else {
    // Regular node: child elements first, then the node's own text.
    for (child in element) {
      if (!__own.call(element, child) || child === '$' || child == '$name') {
        continue;
      }
      if (child === '$text') {
        hasText = true;
      } else {
        emitElement.call(this, element[child], child);
      }
    }
    if (hasText) {
      emitText.call(this, element.$text);
    }
  }

  if (!onLeave) {
    emitEnd.call(this, name);
  }
}
339 |
// Starts parsing the source stream and emitting various events.
// The Expat parser is assigned several listeners for this purpose.
//
// Must be called with `this` bound to the stream object. It reads
// `_stream`, `_fa`, `_startState`, `_collect`, `_encoding`, `_encoder`,
// `_bufferLevel`, `_preserveLevel`, `_preserveWhitespace`, `_preserveAll`
// and `_emitData` from it, and writes `_parser`, `_suspended`, `_collect`,
// `_name`, `_encoding` and `_encoder`.
//
// Events emitted on the stream object from here: `startElement`,
// `endElement`, `error` and `end` (plus whatever `emitStart`, `emitText`,
// `emitEnd` and the finite automaton callbacks emit).
function parse() {
  var self = this;
  var xml = new expat.Parser('utf-8');
  this._parser = xml;
  this._suspended = false;

  // `stack` holds the frames of all currently open ancestors, `trace` maps
  // an element path (e.g. '/root/item') to the element object seen there.
  var stack = [];
  var trace = {};

  // Frame describing the node currently being parsed. The initial frame
  // stands for the (virtual) document root.
  var curr = {
    element: {},
    collect: this._collect,
    fullText: '',
    space: 0,
    path: '',
    context: {}
  };
  var fa = this._fa;
  fa.setState(this._startState);

  // A listener is assigned on opening tag encounter.
  // Here we traverse the configured finite automaton and use the stack
  // to form the context and trace for selector event emission.
  xml.on('startElement', function(name, attr) {
    self.emit('startElement', name, attr);
    stack.push(curr);
    trace[curr.path] = curr.element;
    // The child context inherits from the parent context via the prototype.
    var context = Object.create(curr.context);
    var element = {
      $: attr,
      $name: name,
      $text: ''
    };
    var parent = curr.element;
    curr = {
      element: element,
      collect: false,
      fullText: '',
      space: 0,
      path: curr.path + '/' + name,
      context: context
    };
    // Reset the flag before entering the state; it is read back right after
    // `fa.enter`, so presumably the automaton callbacks may raise it.
    self._collect = false;
    fa.enter(name, [element, context, trace]);
    if (self._preserveLevel > 0) {
      element.$children = [];
    }
    // Re-read the name from the element after `fa.enter` has run.
    name = element.$name;
    curr.collect = self._collect;
    if (curr.collect) {
      // Collected elements are accumulated in an array on the parent.
      if (__own.call(parent, name)) {
        parent[name].push(element);
      } else {
        parent[name] = [element];
      }
    } else {
      parent[name] = element;
      context[name] = element;
    }
    if (self._bufferLevel === 0 && self._emitData) {
      emitStart.call(self, name, element.$);
    }
  });

  // A listener is assigned on closing tag encounter.
  // Current node structure object is finalized. A selector listener is
  // invoked with current node, context, and trace; these arguments are
  // removed from the stack afterwards.
  xml.on('endElement', function(tagName) {
    self.emit('endElement', tagName);
    var prev = stack.pop();
    var element = curr.element;
    var text = curr.fullText;
    var attr = element.$;
    if (typeof attr !== 'object') {
      attr = {};
    }
    // The name stored on the element is used from here on, not the raw
    // tag name reported by the parser.
    var name = element.$name;
    self._name = name;

    // Strip the bookkeeping keys, then put back only those that carry data.
    delete element.$;
    delete element.$text;
    delete element.$name;
    var val = element;
    if (isEmpty(element) && isEmpty(attr)) {
      // No children and no attributes: the node collapses to its text.
      val = text;
    } else if (!isEmpty(attr)) {
      element.$ = attr;
    }
    if (text !== '') {
      element.$text = text;
    }
    if (self._bufferLevel > 0 || self._preserveLevel > 0) {
      element.$name = name;
    }

    curr.context[name] = val;
    if (curr.collect) {
      // Replace the placeholder pushed by the startElement listener.
      var container = prev.element[name];
      container[container.length - 1] = val;
    } else {
      prev.element[name] = val;
    }
    fa.leave([element, curr.context, trace]);
    if (self._preserveLevel > 0) {
      prev.element.$children.push(val);
    }
    if (self._bufferLevel === 0 && self._emitData) {
      emitEnd.call(self, name);
    }
    curr = prev;
    // Restore the parent's collect flag on the stream object. Inside this
    // listener `this` is the Expat parser, so `self` has to be used.
    self._collect = curr.collect;
  });

  // Collect node text part by part
  // (and trim leading and trailing whitespace unless everything is
  // preserved).
  xml.on('text', function(text) {
    curr.element.$text = text;
    fa.run('state', [curr.element, curr.context, trace]);
    if (self._bufferLevel === 0 && self._emitData) {
      emitText.call(self, text);
    }
    if (!self._preserveAll) {
      var trimmed = curr.element.$text.trim();
      // `spaced`: the chunk starts with whitespace.
      var spaced = curr.element.$text.substr(0, 1);
      spaced = (spaced !== '') && (spaced.trim() === '');
      // `after`: the chunk ends with whitespace.
      var after = curr.element.$text.substr(-1, 1);
      after = (after !== '') && (after.trim() === '');
      switch (curr.space) {
        // No words yet (pass through spaces).
        case 0:
          if (trimmed !== '') {
            curr.space = after ? 2 : 1;
          }
          break;

        // Immediately after text or entity.
        case 1:
          if (trimmed === '') {
            curr.space = 2;
          } else {
            if (spaced) {
              curr.fullText += ' ';
            }
            if (after) {
              curr.space = 2;
            }
          }
          break;

        // Some words were emitted, pass through spaces again.
        // Emit spaces only when a word is encountered afterwards.
        case 2:
          if (trimmed !== '') {
            curr.fullText += ' ';
            curr.space = 1;
          }
          break;
      }
      text = self._preserveWhitespace > 0 ? text : trimmed;
      if (self._preserveLevel > 0) {
        if (text !== '') {
          curr.element.$children.push(text);
        }
      }
    }
    curr.fullText += text;
  });

  // This prelude array and string are used during encoding detection.
  // Incoming buffers are collected and parsing is postponed,
  // but only until the first tag.
  var prelude = '';
  var preludeBuffers = [];

  // Builds the error reported when the Expat parser rejects input.
  var parserError = function() {
    return new Error(xml.getError()+" in line "+xml.getCurrentLineNumber());
  };

  // Parse incoming chunk.
  // Convert to UTF-8 or emit errors when appropriate.
  var parseChunk = function(data) {
    if (self._encoder) {
      data = self._encoder.convert(data);
    }
    if (!xml.parse(data, false)) {
      self.emit('error', parserError());
    }
  };

  // Fixes the encoding, creates the matching encoder and replays every
  // chunk that was held back during detection. The buffers are released
  // afterwards so they do not stay referenced for the stream's lifetime.
  var flushPrelude = function(encoding) {
    self._encoding = encoding;
    self._encoder = makeEncoder(encoding);
    var pending = preludeBuffers;
    preludeBuffers = [];
    prelude = '';
    for (var i = 0, n = pending.length; i < n; i++) {
      parseChunk(pending[i]);
    }
  };

  // Pass data from stream to parser.
  this._stream.on('data', function(data) {
    if (self._encoding) {
      parseChunk(data);
    } else {
      // We can't parse when the encoding is unknown, so we'll look into
      // the XML declaration, if there is one. For this, we need to buffer
      // incoming data until a full tag is received.
      preludeBuffers.push(data);
      prelude += data.toString();
      if (/^\s*<[^>]+>/.test(prelude)) {
        // The encoding value may be quoted with either " or ', and
        // whitespace is allowed around the equals sign.
        var matches = prelude.match(
          /^\s*<\?xml[^>]+encoding\s*=\s*(["'])(.+?)\1[^>]*\?>/
        );
        flushPrelude(matches ? matches[2] : 'utf8');
      }
    }
  });

  // End parsing on stream EOF and emit an *end* event ourselves.
  this._stream.on('end', function() {
    // The stream ended before a complete first tag was seen: hand the
    // buffered data to the parser anyway so nothing is silently dropped
    // and a potential error refers to the actual input.
    if (!self._encoding && preludeBuffers.length > 0) {
      flushPrelude('utf8');
    }
    if (!xml.parse('', true)) {
      self.emit('error', parserError());
    }
    self.emit('end');
  });
}
557 |
--------------------------------------------------------------------------------