├── .gitignore ├── .gitattributes ├── test ├── Documents │ ├── Basic.html │ ├── Attributes.html │ ├── Atom_Example.xml │ ├── RSS_Example.xml │ └── RDF_Example.xml ├── 01-events.js ├── Events │ ├── 25-empty_tag_name.json │ ├── 28-cdata_in_html.json │ ├── 31-comment_false-ending.json │ ├── 06-leading-lt.json │ ├── 15-lt-whitespace.json │ ├── 23-legacy_entity_fail.json │ ├── 18-legacy_entities.json │ ├── 17-numeric_entities.json │ ├── 19-named_entities.json │ ├── 20-xml_entities.json │ ├── 29-comment_edge-cases.json │ ├── 26-not-quite-closed.json │ ├── 30-cdata_edge-cases.json │ ├── 32-script-ending-with-lessthan.json │ ├── 05-cdata-special.json │ ├── 01-simple.json │ ├── 22-double_brackets.json │ ├── 12-long-comment-end.json │ ├── 16-double_attribs.json │ ├── 03-lowercase_tags.json │ ├── 13-long-cdata-end.json │ ├── 21-entity_in_attribute.json │ ├── 10-crazy-attrib.json │ ├── 04-cdata.json │ ├── 11-script_in_script.json │ ├── 07-self-closing.json │ ├── 14-implicit-open-tags.json │ ├── 27-entities_in_attributes.json │ ├── 02-template.json │ ├── 09-attributes.json │ ├── 24-special_special.json │ └── 08-implicit-close-tags.json ├── 03-feed.js ├── unicode.js ├── Feeds │ ├── 02-atom.js │ ├── 03-rdf.js │ └── 01-rss.js ├── 02-stream.js ├── Stream │ ├── 01-basic.json │ ├── 05-Attributes.json │ ├── 03-Atom.json │ ├── 02-RSS.json │ └── 04-RDF.json ├── test-helper.js └── api.js ├── .travis.yml ├── README.md ├── lib ├── ProxyHandler.js ├── WritableStream.js ├── Stream.js ├── CollectingHandler.js ├── index.js ├── FeedHandler.js ├── Parser.js └── Tokenizer.js ├── LICENSE ├── package.json └── .eslintrc /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/** 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text eol=lf 
-------------------------------------------------------------------------------- /test/Documents/Basic.html: -------------------------------------------------------------------------------- 1 | The TitleHello world -------------------------------------------------------------------------------- /test/01-events.js: -------------------------------------------------------------------------------- 1 | var helper = require("./test-helper.js"); 2 | 3 | helper.mochaTest("Events", __dirname, function(test, cb){ 4 | helper.writeToParser( 5 | helper.getEventCollector(cb), 6 | test.options.parser, 7 | test.html 8 | ); 9 | }); -------------------------------------------------------------------------------- /test/Events/25-empty_tag_name.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Empty tag name", 3 | "options": {}, 4 | "html": "< >", 5 | "expected": [ 6 | { 7 | "event": "text", 8 | "data": [ 9 | "< >" 10 | ] 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /test/Events/28-cdata_in_html.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "CDATA in HTML", 3 | "options": {}, 4 | "html": "", 5 | "expected": [ 6 | { "event": "comment", "data": [ "[CDATA[ foo ]]" ] }, 7 | { "event": "commentend", "data": [] } 8 | ] 9 | } -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - stable 4 | - unstable 5 | - 6 6 | - 4 7 | - 0.12 8 | 9 | sudo: false 10 | 11 | matrix: 12 | fast_finish: true 13 | allow_failures: 14 | - node_js: unstable 15 | 16 | script: npm run coveralls 17 | -------------------------------------------------------------------------------- /test/Events/31-comment_false-ending.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "Comment false ending", 3 | "options": {}, 4 | "html": "", 5 | "expected": [ 6 | { "event": "comment", "data": [ " a-b-> " ] }, 7 | { "event": "commentend", "data": [] } 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /test/Events/06-leading-lt.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "leading lt", 3 | "options": { 4 | "handler": {}, 5 | "parser": {} 6 | }, 7 | "html": ">a>", 8 | "expected": [ 9 | { 10 | "event": "text", 11 | "data": [ 12 | ">a>" 13 | ] 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /test/Events/15-lt-whitespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lt followed by whitespace", 3 | "options": { 4 | "handler": {}, 5 | "parser": {} 6 | }, 7 | "html": "a < b", 8 | "expected": [ 9 | { 10 | "event": "text", 11 | "data": [ 12 | "a < b" 13 | ] 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /test/Events/23-legacy_entity_fail.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "legacy entities", 3 | "options": { 4 | "handler": {}, 5 | "parser": {"decodeEntities": true} 6 | }, 7 | "html": "M&M", 8 | "expected": [ 9 | { 10 | "event": "text", 11 | "data": [ 12 | "M&M" 13 | ] 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # htmlparser2-without-node-native 2 | 3 | [htmlparser2](https://github.com/fb55/htmlparser2) build that excludes node native modules so that you can use it in platforms like React Native. 4 | 5 | * Remove `Stream` and `WritableStream`. 
6 | * Use [eventemitter2](https://github.com/asyncly/EventEmitter2) instead of native `events`. 7 | -------------------------------------------------------------------------------- /test/Events/18-legacy_entities.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "legacy entities", 3 | "options": { 4 | "handler": {}, 5 | "parser": {"decodeEntities": true} 6 | }, 7 | "html": "&elíe&eer;s<er", 8 | "expected": [ 9 | { 10 | "event": "text", 11 | "data": [ 12 | "&el\u00EDe&eer;s&<üaجde" 13 | ] 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /test/03-feed.js: -------------------------------------------------------------------------------- 1 | //Runs tests for feeds 2 | 3 | var helper = require("./test-helper.js"), 4 | FeedHandler = require("..").RssHandler, 5 | fs = require("fs"), 6 | path = require("path"); 7 | 8 | helper.mochaTest("Feeds", __dirname, function(test, cb){ 9 | fs.readFile( 10 | path.join(__dirname, "Documents", test.file), 11 | function(err, file){ 12 | helper.writeToParser( 13 | new FeedHandler(cb), 14 | { xmlMode: true }, 15 | file.toString() 16 | ); 17 | } 18 | ); 19 | }); -------------------------------------------------------------------------------- /test/Events/29-comment_edge-cases.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Comment edge-cases", 3 | "options": {}, 4 | "html": " 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /test/unicode.js: -------------------------------------------------------------------------------- 1 | // var htmlparser2 = require(".."), 2 | // assert = require("assert"); 3 | // 4 | // describe("WritableStream", function(){ 5 | // 6 | // it("should decode fragmented unicode characters", function(){ 7 | // var processed = false; 8 | // var stream = new htmlparser2.WritableStream({ 9 
| // ontext: function(text){ 10 | // assert.equal(text, "€"); 11 | // processed = true; 12 | // } 13 | // }); 14 | // 15 | // stream.write(new Buffer([0xE2, 0x82])); 16 | // stream.write(new Buffer([0xAC])); 17 | // stream.end(); 18 | // 19 | // assert(processed); 20 | // }); 21 | // }); 22 | -------------------------------------------------------------------------------- /test/Feeds/02-atom.js: -------------------------------------------------------------------------------- 1 | exports.name = "Atom (1.0)"; 2 | exports.file = "/Atom_Example.xml"; 3 | exports.expected = { 4 | type: "atom", 5 | id: "urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6", 6 | title: "Example Feed", 7 | link: "http://example.org/feed/", 8 | description: "A subtitle.", 9 | updated: new Date("2003-12-13T18:30:02Z"), 10 | author: "johndoe@example.com", 11 | items: [{ 12 | id: "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a", 13 | title: "Atom-Powered Robots Run Amok", 14 | link: "http://example.org/2003/12/13/atom03", 15 | description: "Some content.", 16 | pubDate: new Date("2003-12-13T18:30:02Z") 17 | }] 18 | }; 19 | -------------------------------------------------------------------------------- /test/Events/26-not-quite-closed.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Not quite closed", 3 | "options": {}, 4 | "html": "", 5 | "expected": [ 6 | { 7 | "event": "opentagname", 8 | "data": [ 9 | "foo" 10 | ] 11 | }, 12 | { 13 | "event": "attribute", 14 | "data": [ 15 | "bar", 16 | "" 17 | ] 18 | }, 19 | { 20 | "event": "opentag", 21 | "data": [ 22 | "foo", 23 | { 24 | "bar": "" 25 | } 26 | ] 27 | }, 28 | { 29 | "event": "closetag", 30 | "data": [ 31 | "foo" 32 | ] 33 | } 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /test/Events/30-cdata_edge-cases.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "CDATA edge-cases", 3 | 
"options": { 4 | "parser": {"recognizeCDATA": true} 5 | }, 6 | "html": "<", 8 | "expected": [ 9 | { 10 | "event": "opentagname", 11 | "data": [ 12 | "script" 13 | ] 14 | }, 15 | { 16 | "event": "opentag", 17 | "data": [ 18 | "script", 19 | {} 20 | ] 21 | }, 22 | { 23 | "event": "text", 24 | "data": [ 25 | "<" 26 | ] 27 | }, 28 | { 29 | "event": "closetag", 30 | "data": [ 31 | "script" 32 | ] 33 | } 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /test/Feeds/03-rdf.js: -------------------------------------------------------------------------------- 1 | exports.name = "RDF test"; 2 | exports.file = "/RDF_Example.xml"; 3 | exports.expected = { 4 | "type": "rdf", 5 | "id": "", 6 | "title": "A title to parse and remember", 7 | "link": "https://github.com/fb55/htmlparser2/", 8 | "items": [ 9 | { 10 | "title": "Fast HTML Parsing", 11 | "link": "http://somefakesite/path/to/something.html", 12 | "description": "Great test content
A link: Github" 13 | }, 14 | { 15 | "title": "This space intentionally left blank", 16 | "link": "http://somefakesite/path/to/something-else.html", 17 | "description": "The early bird gets the worm" 18 | } 19 | ] 20 | }; 21 | -------------------------------------------------------------------------------- /test/Events/05-cdata-special.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "CDATA (inside special)", 3 | "options": { 4 | "handler": {}, 5 | "parser": {} 6 | }, 7 | "html": "", 8 | "expected": [ 9 | { 10 | "event": "opentagname", 11 | "data": [ 12 | "script" 13 | ] 14 | }, 15 | { 16 | "event": "opentag", 17 | "data": [ 18 | "script", 19 | {} 20 | ] 21 | }, 22 | { 23 | "event": "text", 24 | "data": [ 25 | "/*<> fo/*]]>*/" 26 | ] 27 | }, 28 | { 29 | "event": "closetag", 30 | "data": [ 31 | "script" 32 | ] 33 | } 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /lib/ProxyHandler.js: -------------------------------------------------------------------------------- 1 | module.exports = ProxyHandler; 2 | 3 | function ProxyHandler(cbs){ 4 | this._cbs = cbs || {}; 5 | } 6 | 7 | var EVENTS = require("./").EVENTS; 8 | Object.keys(EVENTS).forEach(function(name){ 9 | if(EVENTS[name] === 0){ 10 | name = "on" + name; 11 | ProxyHandler.prototype[name] = function(){ 12 | if(this._cbs[name]) this._cbs[name](); 13 | }; 14 | } else if(EVENTS[name] === 1){ 15 | name = "on" + name; 16 | ProxyHandler.prototype[name] = function(a){ 17 | if(this._cbs[name]) this._cbs[name](a); 18 | }; 19 | } else if(EVENTS[name] === 2){ 20 | name = "on" + name; 21 | ProxyHandler.prototype[name] = function(a, b){ 22 | if(this._cbs[name]) this._cbs[name](a, b); 23 | }; 24 | } else { 25 | throw Error("wrong number of arguments"); 26 | } 27 | }); -------------------------------------------------------------------------------- /test/02-stream.js: 
-------------------------------------------------------------------------------- 1 | // var helper = require("./test-helper.js"), 2 | // Stream = require("..").WritableStream, 3 | // fs = require("fs"), 4 | // path = require("path"); 5 | 6 | // helper.mochaTest("Stream", __dirname, function(test, cb){ 7 | // var filePath = path.join(__dirname, "Documents", test.file); 8 | // fs.createReadStream(filePath).pipe( 9 | // new Stream( 10 | // helper.getEventCollector(function(err, events){ 11 | // cb(err, events); 12 | // 13 | // var handler = helper.getEventCollector(cb), 14 | // stream = new Stream(handler, test.options); 15 | // 16 | // fs.readFile(filePath, function(err, data){ 17 | // if(err) throw err; 18 | // else stream.end(data); 19 | // }); 20 | // } 21 | // ), test.options) 22 | // ).on("error", cb); 23 | // }); 24 | -------------------------------------------------------------------------------- /test/Events/01-simple.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "simple", 3 | "options": { 4 | "handler": {}, 5 | "parser": {} 6 | }, 7 | "html": "

adsf

", 8 | "expected": [ 9 | { 10 | "event": "opentagname", 11 | "data": [ 12 | "h1" 13 | ] 14 | }, 15 | { 16 | "event": "attribute", 17 | "data": [ 18 | "class", 19 | "test" 20 | ] 21 | }, 22 | { 23 | "event": "opentag", 24 | "data": [ 25 | "h1", 26 | { 27 | "class": "test" 28 | } 29 | ] 30 | }, 31 | { 32 | "event": "text", 33 | "data": [ 34 | "adsf" 35 | ] 36 | }, 37 | { 38 | "event": "closetag", 39 | "data": [ 40 | "h1" 41 | ] 42 | } 43 | ] 44 | } -------------------------------------------------------------------------------- /test/Events/22-double_brackets.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "double brackets", 3 | "options": { 4 | "handler": {}, 5 | "parser": {} 6 | }, 7 | "html": "<>testing", 8 | "expected": [ 9 | { 10 | "event": "text", 11 | "data": [ 12 | "<" 13 | ] 14 | }, 15 | { 16 | "event": "opentagname", 17 | "data": [ 18 | "princess-purpose" 19 | ] 20 | }, 21 | { 22 | "event": "opentag", 23 | "data": [ 24 | "princess-purpose", 25 | {} 26 | ] 27 | }, 28 | { 29 | "event": "text", 30 | "data": [ 31 | ">testing" 32 | ] 33 | }, 34 | { 35 | "event": "closetag", 36 | "data": [ 37 | "princess-purpose" 38 | ] 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /test/Events/12-long-comment-end.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Long comment ending", 3 | "options": { 4 | "handler": {}, 5 | "parser": {} 6 | }, 7 | "html": "", 8 | "expected": [ 9 | { "event": "opentagname", "data": [ "meta" ] }, 10 | { "event": "attribute", "data": [ "id", "before" ] }, 11 | { "event": "opentag", "data": [ "meta", {"id": "before"} ] }, 12 | { "event": "closetag", "data": [ "meta" ] }, 13 | { "event": "comment", "data": [ " text -" ] }, 14 | { "event": "commentend", "data": [] }, 15 | { "event": "opentagname", "data": [ "meta" ] }, 16 | { "event": "attribute", "data": [ "id", "after" ] }, 17 | 
{ "event": "opentag", "data": [ "meta", {"id": "after"} ] }, 18 | { "event": "closetag", "data": [ "meta" ] } 19 | ] 20 | } -------------------------------------------------------------------------------- /lib/WritableStream.js: -------------------------------------------------------------------------------- 1 | module.exports = Stream; 2 | 3 | var Parser = require("./Parser.js"), 4 | WritableStream = require("stream").Writable || require("readable-stream").Writable, 5 | StringDecoder = require("string_decoder").StringDecoder, 6 | Buffer = require("buffer").Buffer; 7 | 8 | function Stream(cbs, options){ 9 | var parser = this._parser = new Parser(cbs, options); 10 | var decoder = this._decoder = new StringDecoder(); 11 | 12 | WritableStream.call(this, {decodeStrings: false}); 13 | 14 | this.once("finish", function(){ 15 | parser.end(decoder.end()); 16 | }); 17 | } 18 | 19 | require("inherits")(Stream, WritableStream); 20 | 21 | WritableStream.prototype._write = function(chunk, encoding, cb){ 22 | if(chunk instanceof Buffer) chunk = this._decoder.write(chunk); 23 | this._parser.write(chunk); 24 | cb(); 25 | }; -------------------------------------------------------------------------------- /test/Events/16-double_attribs.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "double attribute", 3 | "options": { 4 | "handler": {}, 5 | "parser": {} 6 | }, 7 | "html": "

", 8 | "expected": [ 9 | { 10 | "event": "opentagname", 11 | "data": [ 12 | "h1" 13 | ] 14 | }, 15 | { 16 | "event": "attribute", 17 | "data": [ 18 | "class", 19 | "test" 20 | ] 21 | }, 22 | { 23 | "event": "attribute", 24 | "data": [ 25 | "class", 26 | "boo" 27 | ] 28 | }, 29 | { 30 | "event": "opentag", 31 | "data": [ 32 | "h1", 33 | { 34 | "class": "test" 35 | } 36 | ] 37 | }, 38 | { 39 | "event": "closetag", 40 | "data": [ 41 | "h1" 42 | ] 43 | } 44 | ] 45 | } -------------------------------------------------------------------------------- /test/Events/03-lowercase_tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Lowercase tags", 3 | "options": { 4 | "handler": {}, 5 | "parser": { 6 | "lowerCaseTags": true 7 | } 8 | }, 9 | "html": "

adsf

", 10 | "expected": [ 11 | { 12 | "event": "opentagname", 13 | "data": [ 14 | "h1" 15 | ] 16 | }, 17 | { 18 | "event": "attribute", 19 | "data": [ 20 | "class", 21 | "test" 22 | ] 23 | }, 24 | { 25 | "event": "opentag", 26 | "data": [ 27 | "h1", 28 | { 29 | "class": "test" 30 | } 31 | ] 32 | }, 33 | { 34 | "event": "text", 35 | "data": [ 36 | "adsf" 37 | ] 38 | }, 39 | { 40 | "event": "closetag", 41 | "data": [ 42 | "h1" 43 | ] 44 | } 45 | ] 46 | } -------------------------------------------------------------------------------- /lib/Stream.js: -------------------------------------------------------------------------------- 1 | module.exports = Stream; 2 | 3 | var Parser = require("./WritableStream.js"); 4 | 5 | function Stream(options){ 6 | Parser.call(this, new Cbs(this), options); 7 | } 8 | 9 | require("inherits")(Stream, Parser); 10 | 11 | Stream.prototype.readable = true; 12 | 13 | function Cbs(scope){ 14 | this.scope = scope; 15 | } 16 | 17 | var EVENTS = require("../").EVENTS; 18 | 19 | Object.keys(EVENTS).forEach(function(name){ 20 | if(EVENTS[name] === 0){ 21 | Cbs.prototype["on" + name] = function(){ 22 | this.scope.emit(name); 23 | }; 24 | } else if(EVENTS[name] === 1){ 25 | Cbs.prototype["on" + name] = function(a){ 26 | this.scope.emit(name, a); 27 | }; 28 | } else if(EVENTS[name] === 2){ 29 | Cbs.prototype["on" + name] = function(a, b){ 30 | this.scope.emit(name, a, b); 31 | }; 32 | } else { 33 | throw Error("wrong number of arguments!"); 34 | } 35 | }); -------------------------------------------------------------------------------- /test/Events/13-long-cdata-end.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Long CDATA ending", 3 | "options": { 4 | "handler": {}, 5 | "parser": {"xmlMode": true} 6 | }, 7 | "html": "", 8 | "expected": [ 9 | { "event": "opentagname", "data": [ "before" ] }, 10 | { "event": "opentag", "data": [ "before", {} ] }, 11 | { "event": "closetag", "data": [ "before" ] }, 
12 | { "event": "opentagname", "data": [ "tag" ] }, 13 | { "event": "opentag", "data": [ "tag", {} ] }, 14 | { "event": "cdatastart", "data": [] }, 15 | { "event": "text", "data": [ " text ]" ] }, 16 | { "event": "cdataend", "data": [] }, 17 | { "event": "closetag", "data": [ "tag" ] }, 18 | { "event": "opentagname", "data": [ "after" ] }, 19 | { "event": "opentag", "data": [ "after", {} ] }, 20 | { "event": "closetag", "data": [ "after" ] } 21 | ] 22 | } -------------------------------------------------------------------------------- /test/Events/21-entity_in_attribute.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "entity in attribute", 3 | "options": { 4 | "handler": {}, 5 | "parser": {"decodeEntities": true} 6 | }, 7 | "html": "", 8 | "expected": [ 9 | { 10 | "event": "opentagname", 11 | "data": [ 12 | "a" 13 | ] 14 | }, 15 | { 16 | "event": "attribute", 17 | "data": [ 18 | "href", 19 | "http://example.com/page?param=value¶m2¶m3=stuff

<> fo]]>", 8 | "expected": [ 9 | { 10 | "event": "opentagname", 11 | "data": [ 12 | "tag" 13 | ] 14 | }, 15 | { 16 | "event": "opentag", 17 | "data": [ 18 | "tag", 19 | {} 20 | ] 21 | }, 22 | { 23 | "event": "cdatastart", 24 | "data": [] 25 | }, 26 | { 27 | "event": "text", 28 | "data": [ 29 | " asdf ><> fo" 30 | ] 31 | }, 32 | { 33 | "event": "cdataend", 34 | "data": [] 35 | }, 36 | { 37 | "event": "closetag", 38 | "data": [ 39 | "tag" 40 | ] 41 | }, 42 | { 43 | "event": "processinginstruction", 44 | "data": [ 45 | "![CD", 46 | "![CD" 47 | ] 48 | } 49 | ] 50 | } -------------------------------------------------------------------------------- /test/Documents/Atom_Example.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example Feed 5 | A subtitle. 6 | 7 | 8 | urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 9 | 2003-12-13T18:30:02Z 10 | 11 | John Doe 12 | johndoe@example.com 13 | 14 | 15 | 16 | Atom-Powered Robots Run Amok 17 | 18 | 19 | 20 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 21 | 2003-12-13T18:30:02Z 22 |

Some content.

23 |
24 | 25 |
26 | -------------------------------------------------------------------------------- /test/Events/11-script_in_script.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Scripts creating other scripts", 3 | "options": { 4 | "handler": {}, 5 | "parser": {} 6 | }, 7 | "html": "

", 8 | "expected": [ 9 | { 10 | "event": "opentagname", 11 | "data": [ 12 | "p" 13 | ] 14 | }, 15 | { 16 | "event": "opentag", 17 | "data": [ 18 | "p", 19 | {} 20 | ] 21 | }, 22 | { 23 | "event": "opentagname", 24 | "data": [ 25 | "script" 26 | ] 27 | }, 28 | { 29 | "event": "opentag", 30 | "data": [ 31 | "script", 32 | {} 33 | ] 34 | }, 35 | { 36 | "event": "text", 37 | "data": [ 38 | "var str = '

", 8 | "expected": [ 9 | { 10 | "event": "opentagname", 11 | "data": [ 12 | "p" 13 | ] 14 | }, 15 | { 16 | "event": "opentag", 17 | "data": [ 18 | "p", 19 | {} 20 | ] 21 | }, 22 | { 23 | "event": "opentagname", 24 | "data": [ 25 | "script" 26 | ] 27 | }, 28 | { 29 | "event": "attribute", 30 | "data": [ 31 | "type", 32 | "text/template" 33 | ] 34 | }, 35 | { 36 | "event": "opentag", 37 | "data": [ 38 | "script", 39 | { 40 | "type": "text/template" 41 | } 42 | ] 43 | }, 44 | { 45 | "event": "text", 46 | "data": [ 47 | "

Heading1

" 48 | ] 49 | }, 50 | { 51 | "event": "closetag", 52 | "data": [ 53 | "script" 54 | ] 55 | }, 56 | { 57 | "event": "closetag", 58 | "data": [ 59 | "p" 60 | ] 61 | } 62 | ] 63 | } -------------------------------------------------------------------------------- /test/Events/09-attributes.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "attributes (no white space, no value, no quotes)", 3 | "options": { 4 | "handler": {}, 5 | "parser": {} 6 | }, 7 | "html": "", 8 | "expected": [ 9 | { 10 | "event": "opentagname", 11 | "data": [ 12 | "button" 13 | ] 14 | }, 15 | { 16 | "event": "attribute", 17 | "data": [ 18 | "class", 19 | "test0" 20 | ] 21 | }, 22 | { 23 | "event": "attribute", 24 | "data": [ 25 | "title", 26 | "test1" 27 | ] 28 | }, 29 | { 30 | "event": "attribute", 31 | "data": [ 32 | "disabled", 33 | "" 34 | ] 35 | }, 36 | { 37 | "event": "attribute", 38 | "data": [ 39 | "value", 40 | "test2" 41 | ] 42 | }, 43 | { 44 | "event": "opentag", 45 | "data": [ 46 | "button", 47 | { 48 | "class": "test0", 49 | "title": "test1", 50 | "disabled": "", 51 | "value": "test2" 52 | } 53 | ] 54 | }, 55 | { 56 | "event": "text", 57 | "data": [ 58 | "adsf" 59 | ] 60 | }, 61 | { 62 | "event": "closetag", 63 | "data": [ 64 | "button" 65 | ] 66 | } 67 | ] 68 | } -------------------------------------------------------------------------------- /test/Stream/01-basic.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Basic html", 3 | "options": {}, 4 | "file": "Basic.html", 5 | "expected": [ 6 | { 7 | "event": "processinginstruction", 8 | "data": [ 9 | "!doctype", 10 | "!DOCTYPE html" 11 | ] 12 | }, 13 | { 14 | "event": "opentagname", 15 | "data": [ 16 | "html" 17 | ] 18 | }, 19 | { 20 | "event": "opentag", 21 | "data": [ 22 | "html", 23 | {} 24 | ] 25 | }, 26 | { 27 | "event": "opentagname", 28 | "data": [ 29 | "title" 30 | ] 31 | }, 32 | { 33 | "event": "opentag", 34 | "data": 
[ 35 | "title", 36 | {} 37 | ] 38 | }, 39 | { 40 | "event": "text", 41 | "data": [ 42 | "The Title" 43 | ] 44 | }, 45 | { 46 | "event": "closetag", 47 | "data": [ 48 | "title" 49 | ] 50 | }, 51 | { 52 | "event": "opentagname", 53 | "data": [ 54 | "body" 55 | ] 56 | }, 57 | { 58 | "event": "opentag", 59 | "data": [ 60 | "body", 61 | {} 62 | ] 63 | }, 64 | { 65 | "event": "text", 66 | "data": [ 67 | "Hello world" 68 | ] 69 | }, 70 | { 71 | "event": "closetag", 72 | "data": [ 73 | "body" 74 | ] 75 | }, 76 | { 77 | "event": "closetag", 78 | "data": [ 79 | "html" 80 | ] 81 | } 82 | ] 83 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "htmlparser2-without-node-native", 3 | "description": "htmlparser2 build that excludes node native modules so that you can use it in platforms like React Native.", 4 | "version": "3.9.2", 5 | "author": "Felix Boehm ", 6 | "keywords": [ 7 | "html", 8 | "parser", 9 | "streams", 10 | "xml", 11 | "dom", 12 | "rss", 13 | "feed", 14 | "atom" 15 | ], 16 | "repository": { 17 | "type": "git", 18 | "url": "git://github.com/fb55/htmlparser2.git" 19 | }, 20 | "bugs": { 21 | "mail": "me@feedic.com", 22 | "url": "http://github.com/fb55/htmlparser2/issues" 23 | }, 24 | "directories": { 25 | "lib": "lib/" 26 | }, 27 | "main": "lib/index.js", 28 | "files": [ 29 | "lib" 30 | ], 31 | "scripts": { 32 | "lcov": "istanbul cover _mocha --report lcovonly -- -R spec", 33 | "coveralls": "npm run lint && npm run lcov && (cat coverage/lcov.info | coveralls || exit 0)", 34 | "test": "mocha && npm run lint", 35 | "lint": "eslint lib test" 36 | }, 37 | "dependencies": { 38 | "domelementtype": "^1.3.0", 39 | "domhandler": "^2.3.0", 40 | "domutils": "^1.5.1", 41 | "entities": "^1.1.1", 42 | "eventemitter2": "^1.0.0", 43 | "inherits": "^2.0.1", 44 | "readable-stream": "^2.0.2" 45 | }, 46 | "devDependencies": { 47 | 
"coveralls": "^2.11.4", 48 | "istanbul": "^0.4.3", 49 | "mocha": "^2.2.5", 50 | "eslint": "^2.12.0", 51 | "mocha-lcov-reporter": "^1.2.0" 52 | }, 53 | "browser": { 54 | "readable-stream": false 55 | }, 56 | "license": "MIT" 57 | } 58 | -------------------------------------------------------------------------------- /lib/CollectingHandler.js: -------------------------------------------------------------------------------- 1 | module.exports = CollectingHandler; 2 | 3 | function CollectingHandler(cbs){ 4 | this._cbs = cbs || {}; 5 | this.events = []; 6 | } 7 | 8 | var EVENTS = require("./").EVENTS; 9 | Object.keys(EVENTS).forEach(function(name){ 10 | if(EVENTS[name] === 0){ 11 | name = "on" + name; 12 | CollectingHandler.prototype[name] = function(){ 13 | this.events.push([name]); 14 | if(this._cbs[name]) this._cbs[name](); 15 | }; 16 | } else if(EVENTS[name] === 1){ 17 | name = "on" + name; 18 | CollectingHandler.prototype[name] = function(a){ 19 | this.events.push([name, a]); 20 | if(this._cbs[name]) this._cbs[name](a); 21 | }; 22 | } else if(EVENTS[name] === 2){ 23 | name = "on" + name; 24 | CollectingHandler.prototype[name] = function(a, b){ 25 | this.events.push([name, a, b]); 26 | if(this._cbs[name]) this._cbs[name](a, b); 27 | }; 28 | } else { 29 | throw Error("wrong number of arguments"); 30 | } 31 | }); 32 | 33 | CollectingHandler.prototype.onreset = function(){ 34 | this.events = []; 35 | if(this._cbs.onreset) this._cbs.onreset(); 36 | }; 37 | 38 | CollectingHandler.prototype.restart = function(){ 39 | if(this._cbs.onreset) this._cbs.onreset(); 40 | 41 | for(var i = 0, len = this.events.length; i < len; i++){ 42 | if(this._cbs[this.events[i][0]]){ 43 | 44 | var num = this.events[i].length; 45 | 46 | if(num === 1){ 47 | this._cbs[this.events[i][0]](); 48 | } else if(num === 2){ 49 | this._cbs[this.events[i][0]](this.events[i][1]); 50 | } else { 51 | this._cbs[this.events[i][0]](this.events[i][1], this.events[i][2]); 52 | } 53 | } 54 | } 55 | }; 56 | 
-------------------------------------------------------------------------------- /lib/index.js: -------------------------------------------------------------------------------- 1 | var Parser = require("./Parser.js"), 2 | DomHandler = require("domhandler"); 3 | 4 | function defineProp(name, value){ 5 | delete module.exports[name]; 6 | module.exports[name] = value; 7 | return value; 8 | } 9 | 10 | module.exports = { 11 | Parser: Parser, 12 | Tokenizer: require("./Tokenizer.js"), 13 | ElementType: require("domelementtype"), 14 | DomHandler: DomHandler, 15 | get FeedHandler(){ 16 | return defineProp("FeedHandler", require("./FeedHandler.js")); 17 | }, 18 | get ProxyHandler(){ 19 | return defineProp("ProxyHandler", require("./ProxyHandler.js")); 20 | }, 21 | get DomUtils(){ 22 | return defineProp("DomUtils", require("domutils")); 23 | }, 24 | get CollectingHandler(){ 25 | return defineProp("CollectingHandler", require("./CollectingHandler.js")); 26 | }, 27 | // For legacy support 28 | DefaultHandler: DomHandler, 29 | get RssHandler(){ 30 | return defineProp("RssHandler", this.FeedHandler); 31 | }, 32 | //helper methods 33 | parseDOM: function(data, options){ 34 | var handler = new DomHandler(options); 35 | new Parser(handler, options).end(data); 36 | return handler.dom; 37 | }, 38 | parseFeed: function(feed, options){ 39 | var handler = new module.exports.FeedHandler(options); 40 | new Parser(handler, options).end(feed); 41 | return handler.dom; 42 | }, 43 | createDomStream: function(cb, options, elementCb){ 44 | var handler = new DomHandler(cb, options, elementCb); 45 | return new Parser(handler, options); 46 | }, 47 | // List of all events that the parser emits 48 | EVENTS: { /* Format: eventname: number of arguments */ 49 | attribute: 2, 50 | cdatastart: 0, 51 | cdataend: 0, 52 | text: 1, 53 | processinginstruction: 2, 54 | comment: 1, 55 | commentend: 0, 56 | closetag: 1, 57 | opentag: 2, 58 | opentagname: 1, 59 | error: 1, 60 | end: 0 61 | } 62 | }; 63 | 
-------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "eslint:recommended", 3 | "env": { 4 | "node": true 5 | }, 6 | "globals": { 7 | "describe": true, 8 | "it": true 9 | }, 10 | "rules": { 11 | "eqeqeq": 2, 12 | "no-extend-native": 2, 13 | "no-use-before-define": [ 14 | 2, 15 | { 16 | "functions": false, 17 | "classes": false 18 | } 19 | ], 20 | "no-caller": 2, 21 | "no-irregular-whitespace": 2, 22 | "quotes": [ 23 | 2, 24 | "double" 25 | ], 26 | "no-undef": 2, 27 | "no-unused-vars": 2, 28 | "no-eq-null": 2, 29 | "no-proto": 2, 30 | "curly": [ 31 | 2, 32 | "multi-line" 33 | ], 34 | "no-mixed-spaces-and-tabs": [ 35 | 2, 36 | "smart-tabs" 37 | ], 38 | "space-infix-ops": 2, 39 | "keyword-spacing": [ 40 | 2, 41 | { 42 | "overrides": { 43 | "if": { 44 | "after": false 45 | }, 46 | "catch": { 47 | "after": false 48 | }, 49 | "for": { 50 | "after": false 51 | }, 52 | "while": { 53 | "after": false 54 | } 55 | } 56 | } 57 | ], 58 | "new-cap": 2, 59 | "comma-style": [ 60 | 2, 61 | "last" 62 | ], 63 | "dot-notation": 2, 64 | "wrap-iife": 2, 65 | "no-empty": 2, 66 | "space-unary-ops": [ 67 | 2, 68 | { 69 | "words": false, 70 | "nonwords": false 71 | } 72 | ], 73 | "no-with": 2, 74 | "no-multi-str": 2, 75 | "no-trailing-spaces": 2, 76 | "indent": [ 77 | 2, 78 | "tab", 79 | { 80 | "SwitchCase": 1, 81 | "VariableDeclarator": 0 82 | } 83 | ], 84 | "linebreak-style": [ 85 | 2, 86 | "unix" 87 | ], 88 | "consistent-this": [ 89 | 2, 90 | "_this" 91 | ], 92 | "no-extra-semi": 0 // https://github.com/eslint/eslint/issues/6386 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /test/Feeds/01-rss.js: -------------------------------------------------------------------------------- 1 | exports.name = "RSS (2.0)"; 2 | exports.file = "/RSS_Example.xml"; 3 | exports.expected = { 4 | type: "rss", 5 | id: 
"", 6 | title: "Liftoff News", 7 | link: "http://liftoff.msfc.nasa.gov/", 8 | description: "Liftoff to Space Exploration.", 9 | updated: new Date("Tue, 10 Jun 2003 09:41:01 GMT"), 10 | author: "editor@example.com", 11 | items: [{ 12 | id: "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573", 13 | title: "Star City", 14 | link: "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp", 15 | description: "How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href=\"http://howe.iki.rssi.ru/GCTC/gctc_e.htm\">Star City</a>.", 16 | pubDate: new Date("Tue, 03 Jun 2003 09:39:21 GMT") 17 | }, { 18 | id: "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572", 19 | description: "Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href=\"http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm\">partial eclipse of the Sun</a> on Saturday, May 31st.", 20 | pubDate: new Date("Fri, 30 May 2003 11:06:42 GMT") 21 | }, { 22 | id: "http://liftoff.msfc.nasa.gov/2003/05/27.html#item571", 23 | title: "The Engine That Does More", 24 | link: "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp", 25 | description: "Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that.", 26 | pubDate: new Date("Tue, 27 May 2003 08:37:32 GMT") 27 | }, { 28 | id: "http://liftoff.msfc.nasa.gov/2003/05/20.html#item570", 29 | title: "Astronauts' Dirty Laundry", 30 | link: "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp", 31 | description: "Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. 
Instead, astronauts have other options.", 32 | pubDate: new Date("Tue, 20 May 2003 08:56:02 GMT") 33 | }] 34 | }; -------------------------------------------------------------------------------- /test/test-helper.js: -------------------------------------------------------------------------------- 1 | var htmlparser2 = require(".."), 2 | fs = require("fs"), 3 | path = require("path"), 4 | assert = require("assert"), 5 | Parser = htmlparser2.Parser, 6 | CollectingHandler = htmlparser2.CollectingHandler; 7 | 8 | exports.writeToParser = function(handler, options, data){ 9 | var parser = new Parser(handler, options); 10 | //first, try to run the test via chunks 11 | for(var i = 0; i < data.length; i++){ 12 | parser.write(data.charAt(i)); 13 | } 14 | parser.end(); 15 | //then parse everything 16 | parser.parseComplete(data); 17 | }; 18 | 19 | //returns a tree structure 20 | exports.getEventCollector = function(cb){ 21 | var handler = new CollectingHandler({onerror: cb, onend: onend}); 22 | 23 | return handler; 24 | 25 | function onend(){ 26 | cb(null, handler.events.reduce(eventReducer, [])); 27 | } 28 | }; 29 | 30 | function eventReducer(events, arr){ 31 | if(arr[0] === "onerror" || arr[0] === "onend"); 32 | else if(arr[0] === "ontext" && events.length && events[events.length - 1].event === "text"){ 33 | events[events.length - 1].data[0] += arr[1]; 34 | } else { 35 | events.push({ 36 | event: arr[0].substr(2), 37 | data: arr.slice(1) 38 | }); 39 | } 40 | 41 | return events; 42 | } 43 | 44 | function getCallback(expected, done){ 45 | var repeated = false; 46 | 47 | return function(err, actual){ 48 | assert.ifError(err); 49 | try { 50 | assert.deepEqual(expected, actual, "didn't get expected output"); 51 | } catch(e){ 52 | e.expected = JSON.stringify(expected, null, 2); 53 | e.actual = JSON.stringify(actual, null, 2); 54 | throw e; 55 | } 56 | 57 | if(repeated) done(); 58 | else repeated = true; 59 | }; 60 | } 61 | 62 | exports.mochaTest = function(name, root, test){ 63 | 
describe(name, readDir); 64 | 65 | function readDir(){ 66 | var dir = path.join(root, name); 67 | 68 | fs 69 | .readdirSync(dir) 70 | .filter(RegExp.prototype.test, /^[^\._]/) //ignore all files with a leading dot or underscore 71 | .map(function(name){ 72 | return path.join(dir, name); 73 | }) 74 | .map(require) 75 | .forEach(runTest); 76 | } 77 | 78 | function runTest(file){ 79 | it(file.name, function(done){ 80 | test(file, getCallback(file.expected, done)); 81 | }); 82 | } 83 | }; 84 | -------------------------------------------------------------------------------- /test/Events/24-special_special.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Special special tags", 3 | "options": {}, 4 | "html": "", 5 | "expected": [ 6 | { 7 | "event": "opentagname", 8 | "data": [ 9 | "script" 10 | ] 11 | }, 12 | { 13 | "event": "opentag", 14 | "data": [ 15 | "script", 16 | {} 17 | ] 18 | }, 19 | { 20 | "event": "text", 21 | "data": [ 22 | " 2 | 3 | 4 | 5 | Liftoff News 6 | http://liftoff.msfc.nasa.gov/ 7 | Liftoff to Space Exploration. 8 | en-us 9 | Tue, 10 Jun 2003 04:00:00 GMT 10 | 11 | Tue, 10 Jun 2003 09:41:01 GMT 12 | http://blogs.law.harvard.edu/tech/rss 13 | Weblog Editor 2.0 14 | editor@example.com 15 | webmaster@example.com 16 | 17 | 18 | Star City 19 | http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp 20 | How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>. 21 | Tue, 03 Jun 2003 09:39:21 GMT 22 | http://liftoff.msfc.nasa.gov/2003/06/03.html#item573 23 | 24 | 25 | 26 | Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial eclipse of the Sun</a> on Saturday, May 31st. 
27 | Fri, 30 May 2003 11:06:42 GMT 28 | http://liftoff.msfc.nasa.gov/2003/05/30.html#item572 29 | 30 | 31 | 32 | The Engine That Does More 33 | http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp 34 | Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that. 35 | Tue, 27 May 2003 08:37:32 GMT 36 | http://liftoff.msfc.nasa.gov/2003/05/27.html#item571 37 | 38 | 39 | 40 | Astronauts' Dirty Laundry 41 | http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp 42 | Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options. 43 | Tue, 20 May 2003 08:56:02 GMT 44 | http://liftoff.msfc.nasa.gov/2003/05/20.html#item570 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /test/Documents/RDF_Example.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | A title to parse and remember 5 | https://github.com/fb55/htmlparser2/ 6 | 7 | en-us 8 | Copyright 2015 the authors 9 | webmaster@thisisafakedoma.in 10 | webmaster@thisisafakedoma.in 11 | https://github.com/fb55/htmlparser2/ 12 | A title to parse and remember 13 | Collection 14 | 2011-11-04T09:39:10-07:00 15 | 4 16 | hourly 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | <![CDATA[ Fast HTML Parsing ]]> 25 | 26 | http://somefakesite/path/to/something.html 27 | 28 | A link:
Github 30 | ]]> 31 | 2011-11-04T09:35:17-07:00 32 | en-us 33 | Copyright 2015 the authors 34 | 35 | http://somefakesite/path/to/something.html 36 | 37 | 38 | text 39 | 2011-11-04T09:35:17-07:00 40 | 41 | 42 | <![CDATA[ 43 | This space intentionally left blank 44 | ]]> 45 | 46 | http://somefakesite/path/to/something-else.html 47 | 48 | 51 | 2011-11-04T09:34:54-07:00 52 | en-us 53 | Copyright 2015 the authors 54 | 55 | http://somefakesite/path/to/something-else.html 56 | 57 | 60 | text 61 | 2011-11-04T09:34:54-07:00 62 | 63 | -------------------------------------------------------------------------------- /test/api.js: -------------------------------------------------------------------------------- 1 | var htmlparser2 = require(".."), 2 | assert = require("assert"); 3 | 4 | describe("API", function(){ 5 | 6 | it("should load all modules", function(){ 7 | // var Stream = require("../lib/Stream.js"); 8 | // assert.strictEqual(htmlparser2.Stream, Stream, "should load module"); 9 | // assert.strictEqual(htmlparser2.Stream, Stream, "should load it again (cache)"); 10 | 11 | var ProxyHandler = require("../lib/ProxyHandler.js"); 12 | assert.strictEqual(htmlparser2.ProxyHandler, ProxyHandler, "should load module"); 13 | assert.strictEqual(htmlparser2.ProxyHandler, ProxyHandler, "should load it again (cache)"); 14 | }); 15 | 16 | it("should work without callbacks", function(){ 17 | var p = new htmlparser2.Parser(null, {xmlMode: true, lowerCaseAttributeNames: true}); 18 | 19 | p.end("boohay"); 20 | p.write("foo"); 21 | 22 | //check for an error 23 | p.end(); 24 | var err = false; 25 | p._cbs.onerror = function(){ err = true; }; 26 | p.write("foo"); 27 | assert(err); 28 | err = false; 29 | p.end(); 30 | assert(err); 31 | 32 | p.reset(); 33 | 34 | //remove method 35 | p._cbs.onopentag = function(){}; 36 | p.write(""); 39 | 40 | //pause/resume 41 | var processed = false; 42 | p._cbs.ontext = function(t){ 43 | assert.equal(t, "foo"); 44 | processed = true; 45 | }; 46 | p.pause(); 
47 | p.write("foo"); 48 | assert(!processed); 49 | p.resume(); 50 | assert(processed); 51 | processed = false; 52 | p.pause(); 53 | assert(!processed); 54 | p.resume(); 55 | assert(!processed); 56 | p.pause(); 57 | p.end("foo"); 58 | assert(!processed); 59 | p.resume(); 60 | assert(processed); 61 | 62 | }); 63 | 64 | it("should update the position", function(){ 65 | var p = new htmlparser2.Parser(null); 66 | 67 | p.write("foo"); 68 | 69 | assert.equal(p.startIndex, 0); 70 | assert.equal(p.endIndex, 2); 71 | 72 | p.write(""); 73 | 74 | assert.equal(p.startIndex, 3); 75 | assert.equal(p.endIndex, 7); 76 | }); 77 | 78 | it("should update the position when a single tag is spread across multiple chunks", function(){ 79 | var p = new htmlparser2.Parser(null); 80 | 81 | p.write("
"); 83 | 84 | assert.equal(p.startIndex, 0); 85 | assert.equal(p.endIndex, 12); 86 | }); 87 | 88 | it("should support custom tokenizer", function(){ 89 | function CustomTokenizer(options, cbs){ 90 | htmlparser2.Tokenizer.call(this, options, cbs); 91 | return this; 92 | } 93 | CustomTokenizer.prototype = Object.create(htmlparser2.Tokenizer.prototype); 94 | CustomTokenizer.prototype.constructor = CustomTokenizer; 95 | 96 | var p = new htmlparser2.Parser({ 97 | onparserinit: function(parser){ 98 | assert(parser._tokenizer instanceof CustomTokenizer); 99 | } 100 | }, { Tokenizer: CustomTokenizer }); 101 | p.done(); 102 | }); 103 | }); 104 | -------------------------------------------------------------------------------- /lib/FeedHandler.js: -------------------------------------------------------------------------------- 1 | var index = require("./index.js"), 2 | DomHandler = index.DomHandler, 3 | DomUtils = index.DomUtils; 4 | 5 | //TODO: make this a streamable handler 6 | function FeedHandler(callback, options){ 7 | this.init(callback, options); 8 | } 9 | 10 | require("inherits")(FeedHandler, DomHandler); 11 | 12 | FeedHandler.prototype.init = DomHandler; 13 | 14 | function getElements(what, where){ 15 | return DomUtils.getElementsByTagName(what, where, true); 16 | } 17 | function getOneElement(what, where){ 18 | return DomUtils.getElementsByTagName(what, where, true, 1)[0]; 19 | } 20 | function fetch(what, where, recurse){ 21 | return DomUtils.getText( 22 | DomUtils.getElementsByTagName(what, where, recurse, 1) 23 | ).trim(); 24 | } 25 | 26 | function addConditionally(obj, prop, what, where, recurse){ 27 | var tmp = fetch(what, where, recurse); 28 | if(tmp) obj[prop] = tmp; 29 | } 30 | 31 | var isValidFeed = function(value){ 32 | return value === "rss" || value === "feed" || value === "rdf:RDF"; 33 | }; 34 | 35 | FeedHandler.prototype.onend = function(){ 36 | var feed = {}, 37 | feedRoot = getOneElement(isValidFeed, this.dom), 38 | tmp, childs; 39 | 40 | 
if(feedRoot){ 41 | if(feedRoot.name === "feed"){ 42 | childs = feedRoot.children; 43 | 44 | feed.type = "atom"; 45 | addConditionally(feed, "id", "id", childs); 46 | addConditionally(feed, "title", "title", childs); 47 | if((tmp = getOneElement("link", childs)) && (tmp = tmp.attribs) && (tmp = tmp.href)) feed.link = tmp; 48 | addConditionally(feed, "description", "subtitle", childs); 49 | if((tmp = fetch("updated", childs))) feed.updated = new Date(tmp); 50 | addConditionally(feed, "author", "email", childs, true); 51 | 52 | feed.items = getElements("entry", childs).map(function(item){ 53 | var entry = {}, tmp; 54 | 55 | item = item.children; 56 | 57 | addConditionally(entry, "id", "id", item); 58 | addConditionally(entry, "title", "title", item); 59 | if((tmp = getOneElement("link", item)) && (tmp = tmp.attribs) && (tmp = tmp.href)) entry.link = tmp; 60 | if((tmp = fetch("summary", item) || fetch("content", item))) entry.description = tmp; 61 | if((tmp = fetch("updated", item))) entry.pubDate = new Date(tmp); 62 | return entry; 63 | }); 64 | } else { 65 | childs = getOneElement("channel", feedRoot.children).children; 66 | 67 | feed.type = feedRoot.name.substr(0, 3); 68 | feed.id = ""; 69 | addConditionally(feed, "title", "title", childs); 70 | addConditionally(feed, "link", "link", childs); 71 | addConditionally(feed, "description", "description", childs); 72 | if((tmp = fetch("lastBuildDate", childs))) feed.updated = new Date(tmp); 73 | addConditionally(feed, "author", "managingEditor", childs, true); 74 | 75 | feed.items = getElements("item", feedRoot.children).map(function(item){ 76 | var entry = {}, tmp; 77 | 78 | item = item.children; 79 | 80 | addConditionally(entry, "id", "guid", item); 81 | addConditionally(entry, "title", "title", item); 82 | addConditionally(entry, "link", "link", item); 83 | addConditionally(entry, "description", "description", item); 84 | if((tmp = fetch("pubDate", item))) entry.pubDate = new Date(tmp); 85 | return entry; 86 | }); 87 | 
} 88 | } 89 | this.dom = feed; 90 | DomHandler.prototype._handleCallback.call( 91 | this, feedRoot ? null : Error("couldn't find root of feed") 92 | ); 93 | }; 94 | 95 | module.exports = FeedHandler; 96 | -------------------------------------------------------------------------------- /test/Events/08-implicit-close-tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Implicit close tags", 3 | "options": {}, 4 | "html": "
  1. TH

    Heading

    Div
    Div2
  2. Heading 2

Para

Heading 4

", 5 | "expected": [ 6 | { "event": "opentagname", "data": [ "ol" ] }, 7 | { "event": "opentag", "data": [ "ol", {} ] }, 8 | { "event": "opentagname", "data": [ "li" ] }, 9 | { "event": "attribute", "data": [ "class", "test" ] }, 10 | { "event": "opentag", "data": [ "li", { "class": "test" } ] }, 11 | { "event": "opentagname", "data": [ "div" ] }, 12 | { "event": "opentag", "data": [ "div", {} ] }, 13 | { "event": "opentagname", "data": [ "table" ] }, 14 | { "event": "attribute", "data": [ "style", "width:100%" ] }, 15 | { "event": "opentag", "data": [ "table", { "style": "width:100%" } ] }, 16 | { "event": "opentagname", "data": [ "tr" ] }, 17 | { "event": "opentag", "data": [ "tr", {} ] }, 18 | { "event": "opentagname", "data": [ "th" ] }, 19 | { "event": "opentag", "data": [ "th", {} ] }, 20 | { "event": "text", "data": [ "TH" ] }, 21 | { "event": "closetag", "data": [ "th" ] }, 22 | { "event": "opentagname", "data": [ "td" ] }, 23 | { "event": "attribute", "data": [ "colspan", "2" ] }, 24 | { "event": "opentag", "data": [ "td", { "colspan": "2" } ] }, 25 | { "event": "opentagname", "data": [ "h3" ] }, 26 | { "event": "opentag", "data": [ "h3", {} ] }, 27 | { "event": "text", "data": [ "Heading" ] }, 28 | { "event": "closetag", "data": [ "h3" ] }, 29 | { "event": "closetag", "data": [ "td" ] }, 30 | { "event": "closetag", "data": [ "tr" ] }, 31 | { "event": "opentagname", "data": [ "tr" ] }, 32 | { "event": "opentag", "data": [ "tr", {} ] }, 33 | { "event": "opentagname", "data": [ "td" ] }, 34 | { "event": "opentag", "data": [ "td", {} ] }, 35 | { "event": "opentagname", "data": [ "div" ] }, 36 | { "event": "opentag", "data": [ "div", {} ] }, 37 | { "event": "text", "data": [ "Div" ] }, 38 | { "event": "closetag", "data": [ "div" ] }, 39 | { "event": "closetag", "data": [ "td" ] }, 40 | { "event": "opentagname", "data": [ "td" ] }, 41 | { "event": "opentag", "data": [ "td", {} ] }, 42 | { "event": "opentagname", "data": [ "div" ] }, 43 | { "event": "opentag", 
"data": [ "div", {} ] }, 44 | { "event": "text", "data": [ "Div2" ] }, 45 | { "event": "closetag", "data": [ "div" ] }, 46 | { "event": "closetag", "data": [ "td" ] }, 47 | { "event": "closetag", "data": [ "tr" ] }, 48 | { "event": "closetag", "data": [ "table" ] }, 49 | { "event": "closetag", "data": [ "div" ] }, 50 | { "event": "closetag", "data": [ "li" ] }, 51 | { "event": "opentagname", "data": [ "li" ] }, 52 | { "event": "opentag", "data": [ "li", {} ] }, 53 | { "event": "opentagname", "data": [ "div" ] }, 54 | { "event": "opentag", "data": [ "div", {} ] }, 55 | { "event": "opentagname", "data": [ "h3" ] }, 56 | { "event": "opentag", "data": [ "h3", {} ] }, 57 | { "event": "text", "data": [ "Heading 2" ] }, 58 | { "event": "closetag", "data": [ "h3" ] }, 59 | { "event": "closetag", "data": [ "div" ] }, 60 | { "event": "closetag", "data": [ "li" ] }, 61 | { "event": "closetag", "data": [ "ol" ] }, 62 | { "event": "opentagname", "data": [ "p" ] }, 63 | { "event": "opentag", "data": [ "p", {} ] }, 64 | { "event": "text", "data": [ "Para" ] }, 65 | { "event": "closetag", "data": [ "p" ] }, 66 | { "event": "opentagname", "data": [ "h4" ] }, 67 | { "event": "opentag", "data": [ "h4", {} ] }, 68 | { "event": "text", "data": [ "Heading 4" ] }, 69 | { "event": "closetag", "data": [ "h4" ] } 70 | ] 71 | } -------------------------------------------------------------------------------- /test/Stream/05-Attributes.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Attributes", 3 | "options": {}, 4 | "file": "Attributes.html", 5 | "expected": [ 6 | { 7 | "event": "processinginstruction", 8 | "data": [ 9 | "!doctype", 10 | "!doctype html" 11 | ] 12 | }, 13 | { 14 | "event": "text", 15 | "data": [ 16 | "\n" 17 | ] 18 | }, 19 | { 20 | "event": "opentagname", 21 | "data": [ 22 | "html" 23 | ] 24 | }, 25 | { 26 | "event": "opentag", 27 | "data": [ 28 | "html", 29 | {} 30 | ] 31 | }, 32 | { 33 | "event": "text", 34 | "data": 
[ 35 | "\n" 36 | ] 37 | }, 38 | { 39 | "event": "opentagname", 40 | "data": [ 41 | "head" 42 | ] 43 | }, 44 | { 45 | "event": "opentag", 46 | "data": [ 47 | "head", 48 | {} 49 | ] 50 | }, 51 | { 52 | "event": "text", 53 | "data": [ 54 | "\n\t" 55 | ] 56 | }, 57 | { 58 | "event": "opentagname", 59 | "data": [ 60 | "title" 61 | ] 62 | }, 63 | { 64 | "event": "opentag", 65 | "data": [ 66 | "title", 67 | {} 68 | ] 69 | }, 70 | { 71 | "event": "text", 72 | "data": [ 73 | "Attributes test" 74 | ] 75 | }, 76 | { 77 | "event": "closetag", 78 | "data": [ 79 | "title" 80 | ] 81 | }, 82 | { 83 | "event": "text", 84 | "data": [ 85 | "\n" 86 | ] 87 | }, 88 | { 89 | "event": "closetag", 90 | "data": [ 91 | "head" 92 | ] 93 | }, 94 | { 95 | "event": "text", 96 | "data": [ 97 | "\n" 98 | ] 99 | }, 100 | { 101 | "event": "opentagname", 102 | "data": [ 103 | "body" 104 | ] 105 | }, 106 | { 107 | "event": "opentag", 108 | "data": [ 109 | "body", 110 | {} 111 | ] 112 | }, 113 | { 114 | "event": "text", 115 | "data": [ 116 | "\n\t" 117 | ] 118 | }, 119 | { 120 | "event": "comment", 121 | "data": [ 122 | " Normal attributes " 123 | ] 124 | }, 125 | { 126 | "event": "commentend", 127 | "data": [] 128 | }, 129 | { 130 | "event": "text", 131 | "data": [ 132 | "\n\t" 133 | ] 134 | }, 135 | { 136 | "event": "opentagname", 137 | "data": [ 138 | "button" 139 | ] 140 | }, 141 | { 142 | "event": "attribute", 143 | "data": [ 144 | "id", 145 | "test0" 146 | ] 147 | }, 148 | { 149 | "event": "attribute", 150 | "data": [ 151 | "class", 152 | "value0" 153 | ] 154 | }, 155 | { 156 | "event": "attribute", 157 | "data": [ 158 | "title", 159 | "value1" 160 | ] 161 | }, 162 | { 163 | "event": "opentag", 164 | "data": [ 165 | "button", 166 | { 167 | "id": "test0", 168 | "class": "value0", 169 | "title": "value1" 170 | } 171 | ] 172 | }, 173 | { 174 | "event": "text", 175 | "data": [ 176 | "class=\"value0\" title=\"value1\"" 177 | ] 178 | }, 179 | { 180 | "event": "closetag", 181 | "data": [ 182 | "button" 
183 | ] 184 | }, 185 | { 186 | "event": "text", 187 | "data": [ 188 | "\n\n\t" 189 | ] 190 | }, 191 | { 192 | "event": "comment", 193 | "data": [ 194 | " Attributes with no quotes or value " 195 | ] 196 | }, 197 | { 198 | "event": "commentend", 199 | "data": [] 200 | }, 201 | { 202 | "event": "text", 203 | "data": [ 204 | "\n\t" 205 | ] 206 | }, 207 | { 208 | "event": "opentagname", 209 | "data": [ 210 | "button" 211 | ] 212 | }, 213 | { 214 | "event": "attribute", 215 | "data": [ 216 | "id", 217 | "test1" 218 | ] 219 | }, 220 | { 221 | "event": "attribute", 222 | "data": [ 223 | "class", 224 | "value2" 225 | ] 226 | }, 227 | { 228 | "event": "attribute", 229 | "data": [ 230 | "disabled", 231 | "" 232 | ] 233 | }, 234 | { 235 | "event": "opentag", 236 | "data": [ 237 | "button", 238 | { 239 | "id": "test1", 240 | "class": "value2", 241 | "disabled": "" 242 | } 243 | ] 244 | }, 245 | { 246 | "event": "text", 247 | "data": [ 248 | "class=value2 disabled" 249 | ] 250 | }, 251 | { 252 | "event": "closetag", 253 | "data": [ 254 | "button" 255 | ] 256 | }, 257 | { 258 | "event": "text", 259 | "data": [ 260 | "\n\n\t" 261 | ] 262 | }, 263 | { 264 | "event": "comment", 265 | "data": [ 266 | " Attributes with no space between them. 
No valid, but accepted by the browser " 267 | ] 268 | }, 269 | { 270 | "event": "commentend", 271 | "data": [] 272 | }, 273 | { 274 | "event": "text", 275 | "data": [ 276 | "\n\t" 277 | ] 278 | }, 279 | { 280 | "event": "opentagname", 281 | "data": [ 282 | "button" 283 | ] 284 | }, 285 | { 286 | "event": "attribute", 287 | "data": [ 288 | "id", 289 | "test2" 290 | ] 291 | }, 292 | { 293 | "event": "attribute", 294 | "data": [ 295 | "class", 296 | "value4" 297 | ] 298 | }, 299 | { 300 | "event": "attribute", 301 | "data": [ 302 | "title", 303 | "value5" 304 | ] 305 | }, 306 | { 307 | "event": "opentag", 308 | "data": [ 309 | "button", 310 | { 311 | "id": "test2", 312 | "class": "value4", 313 | "title": "value5" 314 | } 315 | ] 316 | }, 317 | { 318 | "event": "text", 319 | "data": [ 320 | "class=\"value4\"title=\"value5\"" 321 | ] 322 | }, 323 | { 324 | "event": "closetag", 325 | "data": [ 326 | "button" 327 | ] 328 | }, 329 | { 330 | "event": "text", 331 | "data": [ 332 | "\n" 333 | ] 334 | }, 335 | { 336 | "event": "closetag", 337 | "data": [ 338 | "body" 339 | ] 340 | }, 341 | { 342 | "event": "text", 343 | "data": [ 344 | "\n" 345 | ] 346 | }, 347 | { 348 | "event": "closetag", 349 | "data": [ 350 | "html" 351 | ] 352 | } 353 | ] 354 | } -------------------------------------------------------------------------------- /lib/Parser.js: -------------------------------------------------------------------------------- 1 | var Tokenizer; 2 | 3 | /* 4 | Options: 5 | 6 | xmlMode: Disables the special behavior for script/style tags (false by default) 7 | lowerCaseAttributeNames: call .toLowerCase for each attribute name (true if xmlMode is `false`) 8 | lowerCaseTags: call .toLowerCase for each tag name (true if xmlMode is `false`) 9 | */ 10 | 11 | /* 12 | Callbacks: 13 | 14 | oncdataend, 15 | oncdatastart, 16 | onclosetag, 17 | oncomment, 18 | oncommentend, 19 | onerror, 20 | onopentag, 21 | onprocessinginstruction, 22 | onreset, 23 | ontext 24 | */ 25 | 26 | var formTags 
= { 27 | input: true, 28 | option: true, 29 | optgroup: true, 30 | select: true, 31 | button: true, 32 | datalist: true, 33 | textarea: true 34 | }; 35 | 36 | var openImpliesClose = { 37 | tr : { tr:true, th:true, td:true }, 38 | th : { th:true }, 39 | td : { thead:true, th:true, td:true }, 40 | body : { head:true, link:true, script:true }, 41 | li : { li:true }, 42 | p : { p:true }, 43 | h1 : { p:true }, 44 | h2 : { p:true }, 45 | h3 : { p:true }, 46 | h4 : { p:true }, 47 | h5 : { p:true }, 48 | h6 : { p:true }, 49 | select : formTags, 50 | input : formTags, 51 | output : formTags, 52 | button : formTags, 53 | datalist: formTags, 54 | textarea: formTags, 55 | option : { option:true }, 56 | optgroup: { optgroup:true } 57 | }; 58 | 59 | var voidElements = { 60 | __proto__: null, 61 | area: true, 62 | base: true, 63 | basefont: true, 64 | br: true, 65 | col: true, 66 | command: true, 67 | embed: true, 68 | frame: true, 69 | hr: true, 70 | img: true, 71 | input: true, 72 | isindex: true, 73 | keygen: true, 74 | link: true, 75 | meta: true, 76 | param: true, 77 | source: true, 78 | track: true, 79 | wbr: true, 80 | 81 | //common self closing svg elements 82 | path: true, 83 | circle: true, 84 | ellipse: true, 85 | line: true, 86 | rect: true, 87 | use: true, 88 | stop: true, 89 | polyline: true, 90 | polygon: true 91 | }; 92 | 93 | var re_nameEnd = /\s|\//; 94 | 95 | function Parser(cbs, options){ 96 | this._options = options || {}; 97 | this._cbs = cbs || {}; 98 | 99 | this._tagname = ""; 100 | this._attribname = ""; 101 | this._attribvalue = ""; 102 | this._attribs = null; 103 | this._stack = []; 104 | 105 | this.startIndex = 0; 106 | this.endIndex = null; 107 | 108 | this._lowerCaseTagNames = "lowerCaseTags" in this._options ? 109 | !!this._options.lowerCaseTags : 110 | !this._options.xmlMode; 111 | this._lowerCaseAttributeNames = "lowerCaseAttributeNames" in this._options ? 
112 | !!this._options.lowerCaseAttributeNames : 113 | !this._options.xmlMode; 114 | 115 | if(this._options.Tokenizer) { 116 | Tokenizer = this._options.Tokenizer; 117 | } else { 118 | Tokenizer = require("./Tokenizer.js"); 119 | } 120 | this._tokenizer = new Tokenizer(this._options, this); 121 | 122 | if(this._cbs.onparserinit) this._cbs.onparserinit(this); 123 | } 124 | 125 | require("inherits")(Parser, require("eventemitter2")); 126 | 127 | Parser.prototype._updatePosition = function(initialOffset){ 128 | if(this.endIndex === null){ 129 | if(this._tokenizer._sectionStart <= initialOffset){ 130 | this.startIndex = 0; 131 | } else { 132 | this.startIndex = this._tokenizer._sectionStart - initialOffset; 133 | } 134 | } 135 | else this.startIndex = this.endIndex + 1; 136 | this.endIndex = this._tokenizer.getAbsoluteIndex(); 137 | }; 138 | 139 | //Tokenizer event handlers 140 | Parser.prototype.ontext = function(data){ 141 | this._updatePosition(1); 142 | this.endIndex--; 143 | 144 | if(this._cbs.ontext) this._cbs.ontext(data); 145 | }; 146 | 147 | Parser.prototype.onopentagname = function(name){ 148 | if(this._lowerCaseTagNames){ 149 | name = name.toLowerCase(); 150 | } 151 | 152 | this._tagname = name; 153 | 154 | if(!this._options.xmlMode && name in openImpliesClose) { 155 | for( 156 | var el; 157 | (el = this._stack[this._stack.length - 1]) in openImpliesClose[name]; 158 | this.onclosetag(el) 159 | ); 160 | } 161 | 162 | if(this._options.xmlMode || !(name in voidElements)){ 163 | this._stack.push(name); 164 | } 165 | 166 | if(this._cbs.onopentagname) this._cbs.onopentagname(name); 167 | if(this._cbs.onopentag) this._attribs = {}; 168 | }; 169 | 170 | Parser.prototype.onopentagend = function(){ 171 | this._updatePosition(1); 172 | 173 | if(this._attribs){ 174 | if(this._cbs.onopentag) this._cbs.onopentag(this._tagname, this._attribs); 175 | this._attribs = null; 176 | } 177 | 178 | if(!this._options.xmlMode && this._cbs.onclosetag && this._tagname in voidElements){ 
179 | this._cbs.onclosetag(this._tagname); 180 | } 181 | 182 | this._tagname = ""; 183 | }; 184 | 185 | Parser.prototype.onclosetag = function(name){ 186 | this._updatePosition(1); 187 | 188 | if(this._lowerCaseTagNames){ 189 | name = name.toLowerCase(); 190 | } 191 | 192 | if(this._stack.length && (!(name in voidElements) || this._options.xmlMode)){ 193 | var pos = this._stack.lastIndexOf(name); 194 | if(pos !== -1){ 195 | if(this._cbs.onclosetag){ 196 | pos = this._stack.length - pos; 197 | while(pos--) this._cbs.onclosetag(this._stack.pop()); 198 | } 199 | else this._stack.length = pos; 200 | } else if(name === "p" && !this._options.xmlMode){ 201 | this.onopentagname(name); 202 | this._closeCurrentTag(); 203 | } 204 | } else if(!this._options.xmlMode && (name === "br" || name === "p")){ 205 | this.onopentagname(name); 206 | this._closeCurrentTag(); 207 | } 208 | }; 209 | 210 | Parser.prototype.onselfclosingtag = function(){ 211 | if(this._options.xmlMode || this._options.recognizeSelfClosing){ 212 | this._closeCurrentTag(); 213 | } else { 214 | this.onopentagend(); 215 | } 216 | }; 217 | 218 | Parser.prototype._closeCurrentTag = function(){ 219 | var name = this._tagname; 220 | 221 | this.onopentagend(); 222 | 223 | //self-closing tags will be on the top of the stack 224 | //(cheaper check than in onclosetag) 225 | if(this._stack[this._stack.length - 1] === name){ 226 | if(this._cbs.onclosetag){ 227 | this._cbs.onclosetag(name); 228 | } 229 | this._stack.pop(); 230 | } 231 | }; 232 | 233 | Parser.prototype.onattribname = function(name){ 234 | if(this._lowerCaseAttributeNames){ 235 | name = name.toLowerCase(); 236 | } 237 | this._attribname = name; 238 | }; 239 | 240 | Parser.prototype.onattribdata = function(value){ 241 | this._attribvalue += value; 242 | }; 243 | 244 | Parser.prototype.onattribend = function(){ 245 | if(this._cbs.onattribute) this._cbs.onattribute(this._attribname, this._attribvalue); 246 | if( 247 | this._attribs && 248 | 
!Object.prototype.hasOwnProperty.call(this._attribs, this._attribname) 249 | ){ 250 | this._attribs[this._attribname] = this._attribvalue; 251 | } 252 | this._attribname = ""; 253 | this._attribvalue = ""; 254 | }; 255 | 256 | Parser.prototype._getInstructionName = function(value){ 257 | var idx = value.search(re_nameEnd), 258 | name = idx < 0 ? value : value.substr(0, idx); 259 | 260 | if(this._lowerCaseTagNames){ 261 | name = name.toLowerCase(); 262 | } 263 | 264 | return name; 265 | }; 266 | 267 | Parser.prototype.ondeclaration = function(value){ 268 | if(this._cbs.onprocessinginstruction){ 269 | var name = this._getInstructionName(value); 270 | this._cbs.onprocessinginstruction("!" + name, "!" + value); 271 | } 272 | }; 273 | 274 | Parser.prototype.onprocessinginstruction = function(value){ 275 | if(this._cbs.onprocessinginstruction){ 276 | var name = this._getInstructionName(value); 277 | this._cbs.onprocessinginstruction("?" + name, "?" + value); 278 | } 279 | }; 280 | 281 | Parser.prototype.oncomment = function(value){ 282 | this._updatePosition(4); 283 | 284 | if(this._cbs.oncomment) this._cbs.oncomment(value); 285 | if(this._cbs.oncommentend) this._cbs.oncommentend(); 286 | }; 287 | 288 | Parser.prototype.oncdata = function(value){ 289 | this._updatePosition(1); 290 | 291 | if(this._options.xmlMode || this._options.recognizeCDATA){ 292 | if(this._cbs.oncdatastart) this._cbs.oncdatastart(); 293 | if(this._cbs.ontext) this._cbs.ontext(value); 294 | if(this._cbs.oncdataend) this._cbs.oncdataend(); 295 | } else { 296 | this.oncomment("[CDATA[" + value + "]]"); 297 | } 298 | }; 299 | 300 | Parser.prototype.onerror = function(err){ 301 | if(this._cbs.onerror) this._cbs.onerror(err); 302 | }; 303 | 304 | Parser.prototype.onend = function(){ 305 | if(this._cbs.onclosetag){ 306 | for( 307 | var i = this._stack.length; 308 | i > 0; 309 | this._cbs.onclosetag(this._stack[--i]) 310 | ); 311 | } 312 | if(this._cbs.onend) this._cbs.onend(); 313 | }; 314 | 315 | 316 | 
//Resets the parser to a blank state, ready to parse a new HTML document 317 | Parser.prototype.reset = function(){ 318 | if(this._cbs.onreset) this._cbs.onreset(); 319 | this._tokenizer.reset(); 320 | 321 | this._tagname = ""; 322 | this._attribname = ""; 323 | this._attribs = null; 324 | this._stack = []; 325 | 326 | if(this._cbs.onparserinit) this._cbs.onparserinit(this); 327 | }; 328 | 329 | //Parses a complete HTML document and pushes it to the handler 330 | Parser.prototype.parseComplete = function(data){ 331 | this.reset(); 332 | this.end(data); 333 | }; 334 | 335 | Parser.prototype.write = function(chunk){ 336 | this._tokenizer.write(chunk); 337 | }; 338 | 339 | Parser.prototype.end = function(chunk){ 340 | this._tokenizer.end(chunk); 341 | }; 342 | 343 | Parser.prototype.pause = function(){ 344 | this._tokenizer.pause(); 345 | }; 346 | 347 | Parser.prototype.resume = function(){ 348 | this._tokenizer.resume(); 349 | }; 350 | 351 | //alias for backwards compat 352 | Parser.prototype.parseChunk = Parser.prototype.write; 353 | Parser.prototype.done = Parser.prototype.end; 354 | 355 | module.exports = Parser; 356 | -------------------------------------------------------------------------------- /test/Stream/03-Atom.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Atom feed", 3 | "options": {"xmlMode": true}, 4 | "file": "Atom_Example.xml", 5 | "expected": [ 6 | { 7 | "event": "processinginstruction", 8 | "data": [ 9 | "?xml", 10 | "?xml version=\"1.0\" encoding=\"utf-8\"?" 
11 | ] 12 | }, 13 | { 14 | "event": "text", 15 | "data": [ 16 | "\n" 17 | ] 18 | }, 19 | { 20 | "event": "comment", 21 | "data": [ 22 | " http://en.wikipedia.org/wiki/Atom_%28standard%29 " 23 | ] 24 | }, 25 | { 26 | "event": "commentend", 27 | "data": [] 28 | }, 29 | { 30 | "event": "text", 31 | "data": [ 32 | "\n" 33 | ] 34 | }, 35 | { 36 | "event": "opentagname", 37 | "data": [ 38 | "feed" 39 | ] 40 | }, 41 | { 42 | "event": "attribute", 43 | "data": [ 44 | "xmlns", 45 | "http://www.w3.org/2005/Atom" 46 | ] 47 | }, 48 | { 49 | "event": "opentag", 50 | "data": [ 51 | "feed", 52 | { 53 | "xmlns": "http://www.w3.org/2005/Atom" 54 | } 55 | ] 56 | }, 57 | { 58 | "event": "text", 59 | "data": [ 60 | "\n\t" 61 | ] 62 | }, 63 | { 64 | "event": "opentagname", 65 | "data": [ 66 | "title" 67 | ] 68 | }, 69 | { 70 | "event": "opentag", 71 | "data": [ 72 | "title", 73 | {} 74 | ] 75 | }, 76 | { 77 | "event": "text", 78 | "data": [ 79 | "Example Feed" 80 | ] 81 | }, 82 | { 83 | "event": "closetag", 84 | "data": [ 85 | "title" 86 | ] 87 | }, 88 | { 89 | "event": "text", 90 | "data": [ 91 | "\n\t" 92 | ] 93 | }, 94 | { 95 | "event": "opentagname", 96 | "data": [ 97 | "subtitle" 98 | ] 99 | }, 100 | { 101 | "event": "opentag", 102 | "data": [ 103 | "subtitle", 104 | {} 105 | ] 106 | }, 107 | { 108 | "event": "text", 109 | "data": [ 110 | "A subtitle." 
111 | ] 112 | }, 113 | { 114 | "event": "closetag", 115 | "data": [ 116 | "subtitle" 117 | ] 118 | }, 119 | { 120 | "event": "text", 121 | "data": [ 122 | "\n\t" 123 | ] 124 | }, 125 | { 126 | "event": "opentagname", 127 | "data": [ 128 | "link" 129 | ] 130 | }, 131 | { 132 | "event": "attribute", 133 | "data": [ 134 | "href", 135 | "http://example.org/feed/" 136 | ] 137 | }, 138 | { 139 | "event": "attribute", 140 | "data": [ 141 | "rel", 142 | "self" 143 | ] 144 | }, 145 | { 146 | "event": "opentag", 147 | "data": [ 148 | "link", 149 | { 150 | "href": "http://example.org/feed/", 151 | "rel": "self" 152 | } 153 | ] 154 | }, 155 | { 156 | "event": "closetag", 157 | "data": [ 158 | "link" 159 | ] 160 | }, 161 | { 162 | "event": "text", 163 | "data": [ 164 | "\n\t" 165 | ] 166 | }, 167 | { 168 | "event": "opentagname", 169 | "data": [ 170 | "link" 171 | ] 172 | }, 173 | { 174 | "event": "attribute", 175 | "data": [ 176 | "href", 177 | "http://example.org/" 178 | ] 179 | }, 180 | { 181 | "event": "opentag", 182 | "data": [ 183 | "link", 184 | { 185 | "href": "http://example.org/" 186 | } 187 | ] 188 | }, 189 | { 190 | "event": "closetag", 191 | "data": [ 192 | "link" 193 | ] 194 | }, 195 | { 196 | "event": "text", 197 | "data": [ 198 | "\n\t" 199 | ] 200 | }, 201 | { 202 | "event": "opentagname", 203 | "data": [ 204 | "id" 205 | ] 206 | }, 207 | { 208 | "event": "opentag", 209 | "data": [ 210 | "id", 211 | {} 212 | ] 213 | }, 214 | { 215 | "event": "text", 216 | "data": [ 217 | "urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6" 218 | ] 219 | }, 220 | { 221 | "event": "closetag", 222 | "data": [ 223 | "id" 224 | ] 225 | }, 226 | { 227 | "event": "text", 228 | "data": [ 229 | "\n\t" 230 | ] 231 | }, 232 | { 233 | "event": "opentagname", 234 | "data": [ 235 | "updated" 236 | ] 237 | }, 238 | { 239 | "event": "opentag", 240 | "data": [ 241 | "updated", 242 | {} 243 | ] 244 | }, 245 | { 246 | "event": "text", 247 | "data": [ 248 | "2003-12-13T18:30:02Z" 249 | ] 250 | }, 251 | 
{ 252 | "event": "closetag", 253 | "data": [ 254 | "updated" 255 | ] 256 | }, 257 | { 258 | "event": "text", 259 | "data": [ 260 | "\n\t" 261 | ] 262 | }, 263 | { 264 | "event": "opentagname", 265 | "data": [ 266 | "author" 267 | ] 268 | }, 269 | { 270 | "event": "opentag", 271 | "data": [ 272 | "author", 273 | {} 274 | ] 275 | }, 276 | { 277 | "event": "text", 278 | "data": [ 279 | "\n\t\t" 280 | ] 281 | }, 282 | { 283 | "event": "opentagname", 284 | "data": [ 285 | "name" 286 | ] 287 | }, 288 | { 289 | "event": "opentag", 290 | "data": [ 291 | "name", 292 | {} 293 | ] 294 | }, 295 | { 296 | "event": "text", 297 | "data": [ 298 | "John Doe" 299 | ] 300 | }, 301 | { 302 | "event": "closetag", 303 | "data": [ 304 | "name" 305 | ] 306 | }, 307 | { 308 | "event": "text", 309 | "data": [ 310 | "\n\t\t" 311 | ] 312 | }, 313 | { 314 | "event": "opentagname", 315 | "data": [ 316 | "email" 317 | ] 318 | }, 319 | { 320 | "event": "opentag", 321 | "data": [ 322 | "email", 323 | {} 324 | ] 325 | }, 326 | { 327 | "event": "text", 328 | "data": [ 329 | "johndoe@example.com" 330 | ] 331 | }, 332 | { 333 | "event": "closetag", 334 | "data": [ 335 | "email" 336 | ] 337 | }, 338 | { 339 | "event": "text", 340 | "data": [ 341 | "\n\t" 342 | ] 343 | }, 344 | { 345 | "event": "closetag", 346 | "data": [ 347 | "author" 348 | ] 349 | }, 350 | { 351 | "event": "text", 352 | "data": [ 353 | "\n\n\t" 354 | ] 355 | }, 356 | { 357 | "event": "opentagname", 358 | "data": [ 359 | "entry" 360 | ] 361 | }, 362 | { 363 | "event": "opentag", 364 | "data": [ 365 | "entry", 366 | {} 367 | ] 368 | }, 369 | { 370 | "event": "text", 371 | "data": [ 372 | "\n\t\t" 373 | ] 374 | }, 375 | { 376 | "event": "opentagname", 377 | "data": [ 378 | "title" 379 | ] 380 | }, 381 | { 382 | "event": "opentag", 383 | "data": [ 384 | "title", 385 | {} 386 | ] 387 | }, 388 | { 389 | "event": "text", 390 | "data": [ 391 | "Atom-Powered Robots Run Amok" 392 | ] 393 | }, 394 | { 395 | "event": "closetag", 396 | "data": [ 
397 | "title" 398 | ] 399 | }, 400 | { 401 | "event": "text", 402 | "data": [ 403 | "\n\t\t" 404 | ] 405 | }, 406 | { 407 | "event": "opentagname", 408 | "data": [ 409 | "link" 410 | ] 411 | }, 412 | { 413 | "event": "attribute", 414 | "data": [ 415 | "href", 416 | "http://example.org/2003/12/13/atom03" 417 | ] 418 | }, 419 | { 420 | "event": "opentag", 421 | "data": [ 422 | "link", 423 | { 424 | "href": "http://example.org/2003/12/13/atom03" 425 | } 426 | ] 427 | }, 428 | { 429 | "event": "closetag", 430 | "data": [ 431 | "link" 432 | ] 433 | }, 434 | { 435 | "event": "text", 436 | "data": [ 437 | "\n\t\t" 438 | ] 439 | }, 440 | { 441 | "event": "opentagname", 442 | "data": [ 443 | "link" 444 | ] 445 | }, 446 | { 447 | "event": "attribute", 448 | "data": [ 449 | "rel", 450 | "alternate" 451 | ] 452 | }, 453 | { 454 | "event": "attribute", 455 | "data": [ 456 | "type", 457 | "text/html" 458 | ] 459 | }, 460 | { 461 | "event": "attribute", 462 | "data": [ 463 | "href", 464 | "http://example.org/2003/12/13/atom03.html" 465 | ] 466 | }, 467 | { 468 | "event": "opentag", 469 | "data": [ 470 | "link", 471 | { 472 | "rel": "alternate", 473 | "type": "text/html", 474 | "href": "http://example.org/2003/12/13/atom03.html" 475 | } 476 | ] 477 | }, 478 | { 479 | "event": "closetag", 480 | "data": [ 481 | "link" 482 | ] 483 | }, 484 | { 485 | "event": "text", 486 | "data": [ 487 | "\n\t\t" 488 | ] 489 | }, 490 | { 491 | "event": "opentagname", 492 | "data": [ 493 | "link" 494 | ] 495 | }, 496 | { 497 | "event": "attribute", 498 | "data": [ 499 | "rel", 500 | "edit" 501 | ] 502 | }, 503 | { 504 | "event": "attribute", 505 | "data": [ 506 | "href", 507 | "http://example.org/2003/12/13/atom03/edit" 508 | ] 509 | }, 510 | { 511 | "event": "opentag", 512 | "data": [ 513 | "link", 514 | { 515 | "rel": "edit", 516 | "href": "http://example.org/2003/12/13/atom03/edit" 517 | } 518 | ] 519 | }, 520 | { 521 | "event": "closetag", 522 | "data": [ 523 | "link" 524 | ] 525 | }, 526 | { 527 
| "event": "text", 528 | "data": [ 529 | "\n\t\t" 530 | ] 531 | }, 532 | { 533 | "event": "opentagname", 534 | "data": [ 535 | "id" 536 | ] 537 | }, 538 | { 539 | "event": "opentag", 540 | "data": [ 541 | "id", 542 | {} 543 | ] 544 | }, 545 | { 546 | "event": "text", 547 | "data": [ 548 | "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" 549 | ] 550 | }, 551 | { 552 | "event": "closetag", 553 | "data": [ 554 | "id" 555 | ] 556 | }, 557 | { 558 | "event": "text", 559 | "data": [ 560 | "\n\t\t" 561 | ] 562 | }, 563 | { 564 | "event": "opentagname", 565 | "data": [ 566 | "updated" 567 | ] 568 | }, 569 | { 570 | "event": "opentag", 571 | "data": [ 572 | "updated", 573 | {} 574 | ] 575 | }, 576 | { 577 | "event": "text", 578 | "data": [ 579 | "2003-12-13T18:30:02Z" 580 | ] 581 | }, 582 | { 583 | "event": "closetag", 584 | "data": [ 585 | "updated" 586 | ] 587 | }, 588 | { 589 | "event": "text", 590 | "data": [ 591 | "\n\t\t" 592 | ] 593 | }, 594 | { 595 | "event": "opentagname", 596 | "data": [ 597 | "content" 598 | ] 599 | }, 600 | { 601 | "event": "attribute", 602 | "data": [ 603 | "type", 604 | "html" 605 | ] 606 | }, 607 | { 608 | "event": "opentag", 609 | "data": [ 610 | "content", 611 | { 612 | "type": "html" 613 | } 614 | ] 615 | }, 616 | { 617 | "event": "opentagname", 618 | "data": [ 619 | "p" 620 | ] 621 | }, 622 | { 623 | "event": "opentag", 624 | "data": [ 625 | "p", 626 | {} 627 | ] 628 | }, 629 | { 630 | "event": "text", 631 | "data": [ 632 | "Some content." 
633 | ] 634 | }, 635 | { 636 | "event": "closetag", 637 | "data": [ 638 | "p" 639 | ] 640 | }, 641 | { 642 | "event": "closetag", 643 | "data": [ 644 | "content" 645 | ] 646 | }, 647 | { 648 | "event": "text", 649 | "data": [ 650 | "\n\t" 651 | ] 652 | }, 653 | { 654 | "event": "closetag", 655 | "data": [ 656 | "entry" 657 | ] 658 | }, 659 | { 660 | "event": "text", 661 | "data": [ 662 | "\n\n" 663 | ] 664 | }, 665 | { 666 | "event": "closetag", 667 | "data": [ 668 | "feed" 669 | ] 670 | }, 671 | { 672 | "event": "text", 673 | "data": [ 674 | "\n" 675 | ] 676 | } 677 | ] 678 | } 679 | -------------------------------------------------------------------------------- /test/Stream/02-RSS.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "RSS feed", 3 | "options": {"xmlMode": true}, 4 | "file": "RSS_Example.xml", 5 | "expected": [ 6 | { 7 | "event": "processinginstruction", 8 | "data": [ 9 | "?xml", 10 | "?xml version=\"1.0\"?" 11 | ] 12 | }, 13 | { 14 | "event": "text", 15 | "data": [ 16 | "\n" 17 | ] 18 | }, 19 | { 20 | "event": "comment", 21 | "data": [ 22 | " http://cyber.law.harvard.edu/rss/examples/rss2sample.xml " 23 | ] 24 | }, 25 | { 26 | "event": "commentend", 27 | "data": [] 28 | }, 29 | { 30 | "event": "text", 31 | "data": [ 32 | "\n" 33 | ] 34 | }, 35 | { 36 | "event": "opentagname", 37 | "data": [ 38 | "rss" 39 | ] 40 | }, 41 | { 42 | "event": "attribute", 43 | "data": [ 44 | "version", 45 | "2.0" 46 | ] 47 | }, 48 | { 49 | "event": "opentag", 50 | "data": [ 51 | "rss", 52 | { 53 | "version": "2.0" 54 | } 55 | ] 56 | }, 57 | { 58 | "event": "text", 59 | "data": [ 60 | "\n " 61 | ] 62 | }, 63 | { 64 | "event": "opentagname", 65 | "data": [ 66 | "channel" 67 | ] 68 | }, 69 | { 70 | "event": "opentag", 71 | "data": [ 72 | "channel", 73 | {} 74 | ] 75 | }, 76 | { 77 | "event": "text", 78 | "data": [ 79 | "\n " 80 | ] 81 | }, 82 | { 83 | "event": "opentagname", 84 | "data": [ 85 | "title" 86 | ] 87 | }, 88 
| { 89 | "event": "opentag", 90 | "data": [ 91 | "title", 92 | {} 93 | ] 94 | }, 95 | { 96 | "event": "text", 97 | "data": [ 98 | "Liftoff News" 99 | ] 100 | }, 101 | { 102 | "event": "closetag", 103 | "data": [ 104 | "title" 105 | ] 106 | }, 107 | { 108 | "event": "text", 109 | "data": [ 110 | "\n " 111 | ] 112 | }, 113 | { 114 | "event": "opentagname", 115 | "data": [ 116 | "link" 117 | ] 118 | }, 119 | { 120 | "event": "opentag", 121 | "data": [ 122 | "link", 123 | {} 124 | ] 125 | }, 126 | { 127 | "event": "text", 128 | "data": [ 129 | "http://liftoff.msfc.nasa.gov/" 130 | ] 131 | }, 132 | { 133 | "event": "closetag", 134 | "data": [ 135 | "link" 136 | ] 137 | }, 138 | { 139 | "event": "text", 140 | "data": [ 141 | "\n " 142 | ] 143 | }, 144 | { 145 | "event": "opentagname", 146 | "data": [ 147 | "description" 148 | ] 149 | }, 150 | { 151 | "event": "opentag", 152 | "data": [ 153 | "description", 154 | {} 155 | ] 156 | }, 157 | { 158 | "event": "text", 159 | "data": [ 160 | "Liftoff to Space Exploration." 
161 | ] 162 | }, 163 | { 164 | "event": "closetag", 165 | "data": [ 166 | "description" 167 | ] 168 | }, 169 | { 170 | "event": "text", 171 | "data": [ 172 | "\n " 173 | ] 174 | }, 175 | { 176 | "event": "opentagname", 177 | "data": [ 178 | "language" 179 | ] 180 | }, 181 | { 182 | "event": "opentag", 183 | "data": [ 184 | "language", 185 | {} 186 | ] 187 | }, 188 | { 189 | "event": "text", 190 | "data": [ 191 | "en-us" 192 | ] 193 | }, 194 | { 195 | "event": "closetag", 196 | "data": [ 197 | "language" 198 | ] 199 | }, 200 | { 201 | "event": "text", 202 | "data": [ 203 | "\n " 204 | ] 205 | }, 206 | { 207 | "event": "opentagname", 208 | "data": [ 209 | "pubDate" 210 | ] 211 | }, 212 | { 213 | "event": "opentag", 214 | "data": [ 215 | "pubDate", 216 | {} 217 | ] 218 | }, 219 | { 220 | "event": "text", 221 | "data": [ 222 | "Tue, 10 Jun 2003 04:00:00 GMT" 223 | ] 224 | }, 225 | { 226 | "event": "closetag", 227 | "data": [ 228 | "pubDate" 229 | ] 230 | }, 231 | { 232 | "event": "text", 233 | "data": [ 234 | "\n\n " 235 | ] 236 | }, 237 | { 238 | "event": "opentagname", 239 | "data": [ 240 | "lastBuildDate" 241 | ] 242 | }, 243 | { 244 | "event": "opentag", 245 | "data": [ 246 | "lastBuildDate", 247 | {} 248 | ] 249 | }, 250 | { 251 | "event": "text", 252 | "data": [ 253 | "Tue, 10 Jun 2003 09:41:01 GMT" 254 | ] 255 | }, 256 | { 257 | "event": "closetag", 258 | "data": [ 259 | "lastBuildDate" 260 | ] 261 | }, 262 | { 263 | "event": "text", 264 | "data": [ 265 | "\n " 266 | ] 267 | }, 268 | { 269 | "event": "opentagname", 270 | "data": [ 271 | "docs" 272 | ] 273 | }, 274 | { 275 | "event": "opentag", 276 | "data": [ 277 | "docs", 278 | {} 279 | ] 280 | }, 281 | { 282 | "event": "text", 283 | "data": [ 284 | "http://blogs.law.harvard.edu/tech/rss" 285 | ] 286 | }, 287 | { 288 | "event": "closetag", 289 | "data": [ 290 | "docs" 291 | ] 292 | }, 293 | { 294 | "event": "text", 295 | "data": [ 296 | "\n " 297 | ] 298 | }, 299 | { 300 | "event": "opentagname", 301 | "data": 
[ 302 | "generator" 303 | ] 304 | }, 305 | { 306 | "event": "opentag", 307 | "data": [ 308 | "generator", 309 | {} 310 | ] 311 | }, 312 | { 313 | "event": "text", 314 | "data": [ 315 | "Weblog Editor 2.0" 316 | ] 317 | }, 318 | { 319 | "event": "closetag", 320 | "data": [ 321 | "generator" 322 | ] 323 | }, 324 | { 325 | "event": "text", 326 | "data": [ 327 | "\n " 328 | ] 329 | }, 330 | { 331 | "event": "opentagname", 332 | "data": [ 333 | "managingEditor" 334 | ] 335 | }, 336 | { 337 | "event": "opentag", 338 | "data": [ 339 | "managingEditor", 340 | {} 341 | ] 342 | }, 343 | { 344 | "event": "text", 345 | "data": [ 346 | "editor@example.com" 347 | ] 348 | }, 349 | { 350 | "event": "closetag", 351 | "data": [ 352 | "managingEditor" 353 | ] 354 | }, 355 | { 356 | "event": "text", 357 | "data": [ 358 | "\n " 359 | ] 360 | }, 361 | { 362 | "event": "opentagname", 363 | "data": [ 364 | "webMaster" 365 | ] 366 | }, 367 | { 368 | "event": "opentag", 369 | "data": [ 370 | "webMaster", 371 | {} 372 | ] 373 | }, 374 | { 375 | "event": "text", 376 | "data": [ 377 | "webmaster@example.com" 378 | ] 379 | }, 380 | { 381 | "event": "closetag", 382 | "data": [ 383 | "webMaster" 384 | ] 385 | }, 386 | { 387 | "event": "text", 388 | "data": [ 389 | "\n " 390 | ] 391 | }, 392 | { 393 | "event": "opentagname", 394 | "data": [ 395 | "item" 396 | ] 397 | }, 398 | { 399 | "event": "opentag", 400 | "data": [ 401 | "item", 402 | {} 403 | ] 404 | }, 405 | { 406 | "event": "text", 407 | "data": [ 408 | "\n\n " 409 | ] 410 | }, 411 | { 412 | "event": "opentagname", 413 | "data": [ 414 | "title" 415 | ] 416 | }, 417 | { 418 | "event": "opentag", 419 | "data": [ 420 | "title", 421 | {} 422 | ] 423 | }, 424 | { 425 | "event": "text", 426 | "data": [ 427 | "Star City" 428 | ] 429 | }, 430 | { 431 | "event": "closetag", 432 | "data": [ 433 | "title" 434 | ] 435 | }, 436 | { 437 | "event": "text", 438 | "data": [ 439 | "\n " 440 | ] 441 | }, 442 | { 443 | "event": "opentagname", 444 | "data": [ 
445 | "link" 446 | ] 447 | }, 448 | { 449 | "event": "opentag", 450 | "data": [ 451 | "link", 452 | {} 453 | ] 454 | }, 455 | { 456 | "event": "text", 457 | "data": [ 458 | "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp" 459 | ] 460 | }, 461 | { 462 | "event": "closetag", 463 | "data": [ 464 | "link" 465 | ] 466 | }, 467 | { 468 | "event": "text", 469 | "data": [ 470 | "\n " 471 | ] 472 | }, 473 | { 474 | "event": "opentagname", 475 | "data": [ 476 | "description" 477 | ] 478 | }, 479 | { 480 | "event": "opentag", 481 | "data": [ 482 | "description", 483 | {} 484 | ] 485 | }, 486 | { 487 | "event": "text", 488 | "data": [ 489 | "How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href=\"http://howe.iki.rssi.ru/GCTC/gctc_e.htm\">Star City</a>." 490 | ] 491 | }, 492 | { 493 | "event": "closetag", 494 | "data": [ 495 | "description" 496 | ] 497 | }, 498 | { 499 | "event": "text", 500 | "data": [ 501 | "\n " 502 | ] 503 | }, 504 | { 505 | "event": "opentagname", 506 | "data": [ 507 | "pubDate" 508 | ] 509 | }, 510 | { 511 | "event": "opentag", 512 | "data": [ 513 | "pubDate", 514 | {} 515 | ] 516 | }, 517 | { 518 | "event": "text", 519 | "data": [ 520 | "Tue, 03 Jun 2003 09:39:21 GMT" 521 | ] 522 | }, 523 | { 524 | "event": "closetag", 525 | "data": [ 526 | "pubDate" 527 | ] 528 | }, 529 | { 530 | "event": "text", 531 | "data": [ 532 | "\n " 533 | ] 534 | }, 535 | { 536 | "event": "opentagname", 537 | "data": [ 538 | "guid" 539 | ] 540 | }, 541 | { 542 | "event": "opentag", 543 | "data": [ 544 | "guid", 545 | {} 546 | ] 547 | }, 548 | { 549 | "event": "text", 550 | "data": [ 551 | "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573" 552 | ] 553 | }, 554 | { 555 | "event": "closetag", 556 | "data": [ 557 | "guid" 558 | ] 559 | }, 560 | { 561 | "event": "text", 562 | "data": [ 563 | "\n\n " 564 | ] 565 | }, 566 | { 567 | "event": "closetag", 568 | 
"data": [ 569 | "item" 570 | ] 571 | }, 572 | { 573 | "event": "text", 574 | "data": [ 575 | "\n " 576 | ] 577 | }, 578 | { 579 | "event": "opentagname", 580 | "data": [ 581 | "item" 582 | ] 583 | }, 584 | { 585 | "event": "opentag", 586 | "data": [ 587 | "item", 588 | {} 589 | ] 590 | }, 591 | { 592 | "event": "text", 593 | "data": [ 594 | "\n " 595 | ] 596 | }, 597 | { 598 | "event": "opentagname", 599 | "data": [ 600 | "description" 601 | ] 602 | }, 603 | { 604 | "event": "opentag", 605 | "data": [ 606 | "description", 607 | {} 608 | ] 609 | }, 610 | { 611 | "event": "text", 612 | "data": [ 613 | "Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href=\"http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm\">partial eclipse of the Sun</a> on Saturday, May 31st." 614 | ] 615 | }, 616 | { 617 | "event": "closetag", 618 | "data": [ 619 | "description" 620 | ] 621 | }, 622 | { 623 | "event": "text", 624 | "data": [ 625 | "\n " 626 | ] 627 | }, 628 | { 629 | "event": "opentagname", 630 | "data": [ 631 | "pubDate" 632 | ] 633 | }, 634 | { 635 | "event": "opentag", 636 | "data": [ 637 | "pubDate", 638 | {} 639 | ] 640 | }, 641 | { 642 | "event": "text", 643 | "data": [ 644 | "Fri, 30 May 2003 11:06:42 GMT" 645 | ] 646 | }, 647 | { 648 | "event": "closetag", 649 | "data": [ 650 | "pubDate" 651 | ] 652 | }, 653 | { 654 | "event": "text", 655 | "data": [ 656 | "\n " 657 | ] 658 | }, 659 | { 660 | "event": "opentagname", 661 | "data": [ 662 | "guid" 663 | ] 664 | }, 665 | { 666 | "event": "opentag", 667 | "data": [ 668 | "guid", 669 | {} 670 | ] 671 | }, 672 | { 673 | "event": "text", 674 | "data": [ 675 | "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572" 676 | ] 677 | }, 678 | { 679 | "event": "closetag", 680 | "data": [ 681 | "guid" 682 | ] 683 | }, 684 | { 685 | "event": "text", 686 | "data": [ 687 | "\n\n " 688 | ] 689 | }, 690 | { 691 | "event": "closetag", 692 | "data": [ 693 | "item" 694 | ] 695 | }, 696 | { 697 | 
"event": "text", 698 | "data": [ 699 | "\n " 700 | ] 701 | }, 702 | { 703 | "event": "opentagname", 704 | "data": [ 705 | "item" 706 | ] 707 | }, 708 | { 709 | "event": "opentag", 710 | "data": [ 711 | "item", 712 | {} 713 | ] 714 | }, 715 | { 716 | "event": "text", 717 | "data": [ 718 | "\n " 719 | ] 720 | }, 721 | { 722 | "event": "opentagname", 723 | "data": [ 724 | "title" 725 | ] 726 | }, 727 | { 728 | "event": "opentag", 729 | "data": [ 730 | "title", 731 | {} 732 | ] 733 | }, 734 | { 735 | "event": "text", 736 | "data": [ 737 | "The Engine That Does More" 738 | ] 739 | }, 740 | { 741 | "event": "closetag", 742 | "data": [ 743 | "title" 744 | ] 745 | }, 746 | { 747 | "event": "text", 748 | "data": [ 749 | "\n " 750 | ] 751 | }, 752 | { 753 | "event": "opentagname", 754 | "data": [ 755 | "link" 756 | ] 757 | }, 758 | { 759 | "event": "opentag", 760 | "data": [ 761 | "link", 762 | {} 763 | ] 764 | }, 765 | { 766 | "event": "text", 767 | "data": [ 768 | "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp" 769 | ] 770 | }, 771 | { 772 | "event": "closetag", 773 | "data": [ 774 | "link" 775 | ] 776 | }, 777 | { 778 | "event": "text", 779 | "data": [ 780 | "\n " 781 | ] 782 | }, 783 | { 784 | "event": "opentagname", 785 | "data": [ 786 | "description" 787 | ] 788 | }, 789 | { 790 | "event": "opentag", 791 | "data": [ 792 | "description", 793 | {} 794 | ] 795 | }, 796 | { 797 | "event": "text", 798 | "data": [ 799 | "Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that." 
800 | ] 801 | }, 802 | { 803 | "event": "closetag", 804 | "data": [ 805 | "description" 806 | ] 807 | }, 808 | { 809 | "event": "text", 810 | "data": [ 811 | "\n " 812 | ] 813 | }, 814 | { 815 | "event": "opentagname", 816 | "data": [ 817 | "pubDate" 818 | ] 819 | }, 820 | { 821 | "event": "opentag", 822 | "data": [ 823 | "pubDate", 824 | {} 825 | ] 826 | }, 827 | { 828 | "event": "text", 829 | "data": [ 830 | "Tue, 27 May 2003 08:37:32 GMT" 831 | ] 832 | }, 833 | { 834 | "event": "closetag", 835 | "data": [ 836 | "pubDate" 837 | ] 838 | }, 839 | { 840 | "event": "text", 841 | "data": [ 842 | "\n " 843 | ] 844 | }, 845 | { 846 | "event": "opentagname", 847 | "data": [ 848 | "guid" 849 | ] 850 | }, 851 | { 852 | "event": "opentag", 853 | "data": [ 854 | "guid", 855 | {} 856 | ] 857 | }, 858 | { 859 | "event": "text", 860 | "data": [ 861 | "http://liftoff.msfc.nasa.gov/2003/05/27.html#item571" 862 | ] 863 | }, 864 | { 865 | "event": "closetag", 866 | "data": [ 867 | "guid" 868 | ] 869 | }, 870 | { 871 | "event": "text", 872 | "data": [ 873 | "\n\n " 874 | ] 875 | }, 876 | { 877 | "event": "closetag", 878 | "data": [ 879 | "item" 880 | ] 881 | }, 882 | { 883 | "event": "text", 884 | "data": [ 885 | "\n " 886 | ] 887 | }, 888 | { 889 | "event": "opentagname", 890 | "data": [ 891 | "item" 892 | ] 893 | }, 894 | { 895 | "event": "opentag", 896 | "data": [ 897 | "item", 898 | {} 899 | ] 900 | }, 901 | { 902 | "event": "text", 903 | "data": [ 904 | "\n " 905 | ] 906 | }, 907 | { 908 | "event": "opentagname", 909 | "data": [ 910 | "title" 911 | ] 912 | }, 913 | { 914 | "event": "opentag", 915 | "data": [ 916 | "title", 917 | {} 918 | ] 919 | }, 920 | { 921 | "event": "text", 922 | "data": [ 923 | "Astronauts' Dirty Laundry" 924 | ] 925 | }, 926 | { 927 | "event": "closetag", 928 | "data": [ 929 | "title" 930 | ] 931 | }, 932 | { 933 | "event": "text", 934 | "data": [ 935 | "\n " 936 | ] 937 | }, 938 | { 939 | "event": "opentagname", 940 | "data": [ 941 | "link" 942 | ] 943 
| }, 944 | { 945 | "event": "opentag", 946 | "data": [ 947 | "link", 948 | {} 949 | ] 950 | }, 951 | { 952 | "event": "text", 953 | "data": [ 954 | "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp" 955 | ] 956 | }, 957 | { 958 | "event": "closetag", 959 | "data": [ 960 | "link" 961 | ] 962 | }, 963 | { 964 | "event": "text", 965 | "data": [ 966 | "\n " 967 | ] 968 | }, 969 | { 970 | "event": "opentagname", 971 | "data": [ 972 | "description" 973 | ] 974 | }, 975 | { 976 | "event": "opentag", 977 | "data": [ 978 | "description", 979 | {} 980 | ] 981 | }, 982 | { 983 | "event": "text", 984 | "data": [ 985 | "Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options." 986 | ] 987 | }, 988 | { 989 | "event": "closetag", 990 | "data": [ 991 | "description" 992 | ] 993 | }, 994 | { 995 | "event": "text", 996 | "data": [ 997 | "\n " 998 | ] 999 | }, 1000 | { 1001 | "event": "opentagname", 1002 | "data": [ 1003 | "pubDate" 1004 | ] 1005 | }, 1006 | { 1007 | "event": "opentag", 1008 | "data": [ 1009 | "pubDate", 1010 | {} 1011 | ] 1012 | }, 1013 | { 1014 | "event": "text", 1015 | "data": [ 1016 | "Tue, 20 May 2003 08:56:02 GMT" 1017 | ] 1018 | }, 1019 | { 1020 | "event": "closetag", 1021 | "data": [ 1022 | "pubDate" 1023 | ] 1024 | }, 1025 | { 1026 | "event": "text", 1027 | "data": [ 1028 | "\n " 1029 | ] 1030 | }, 1031 | { 1032 | "event": "opentagname", 1033 | "data": [ 1034 | "guid" 1035 | ] 1036 | }, 1037 | { 1038 | "event": "opentag", 1039 | "data": [ 1040 | "guid", 1041 | {} 1042 | ] 1043 | }, 1044 | { 1045 | "event": "text", 1046 | "data": [ 1047 | "http://liftoff.msfc.nasa.gov/2003/05/20.html#item570" 1048 | ] 1049 | }, 1050 | { 1051 | "event": "closetag", 1052 | "data": [ 1053 | "guid" 1054 | ] 1055 | }, 1056 | { 1057 | "event": "text", 1058 | "data": [ 1059 | "\n\n " 1060 | ] 1061 | }, 1062 | { 1063 | "event": "closetag", 1064 | "data": [ 1065 | 
"item" 1066 | ] 1067 | }, 1068 | { 1069 | "event": "text", 1070 | "data": [ 1071 | "\n " 1072 | ] 1073 | }, 1074 | { 1075 | "event": "closetag", 1076 | "data": [ 1077 | "channel" 1078 | ] 1079 | }, 1080 | { 1081 | "event": "text", 1082 | "data": [ 1083 | "\n" 1084 | ] 1085 | }, 1086 | { 1087 | "event": "closetag", 1088 | "data": [ 1089 | "rss" 1090 | ] 1091 | } 1092 | ] 1093 | } -------------------------------------------------------------------------------- /test/Stream/04-RDF.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "RDF feed", 3 | "options": {"xmlMode": true}, 4 | "file": "RDF_Example.xml", 5 | "expected": [ 6 | { 7 | "event": "processinginstruction", 8 | "data": [ 9 | "?xml", 10 | "?xml version=\"1.0\" encoding=\"UTF-8\"?" 11 | ] 12 | }, 13 | { 14 | "event": "text", 15 | "data": [ 16 | "\n" 17 | ] 18 | }, 19 | { 20 | "event": "opentagname", 21 | "data": [ 22 | "rdf:RDF" 23 | ] 24 | }, 25 | { 26 | "event": "attribute", 27 | "data": [ 28 | "xmlns:rdf", 29 | "http://www.w3.org/1999/02/22-rdf-syntax-ns#" 30 | ] 31 | }, 32 | { 33 | "event": "attribute", 34 | "data": [ 35 | "xmlns", 36 | "http://purl.org/rss/1.0/" 37 | ] 38 | }, 39 | { 40 | "event": "attribute", 41 | "data": [ 42 | "xmlns:ev", 43 | "http://purl.org/rss/1.0/modules/event/" 44 | ] 45 | }, 46 | { 47 | "event": "attribute", 48 | "data": [ 49 | "xmlns:content", 50 | "http://purl.org/rss/1.0/modules/content/" 51 | ] 52 | }, 53 | { 54 | "event": "attribute", 55 | "data": [ 56 | "xmlns:taxo", 57 | "http://purl.org/rss/1.0/modules/taxonomy/" 58 | ] 59 | }, 60 | { 61 | "event": "attribute", 62 | "data": [ 63 | "xmlns:dc", 64 | "http://purl.org/dc/elements/1.1/" 65 | ] 66 | }, 67 | { 68 | "event": "attribute", 69 | "data": [ 70 | "xmlns:syn", 71 | "http://purl.org/rss/1.0/modules/syndication/" 72 | ] 73 | }, 74 | { 75 | "event": "attribute", 76 | "data": [ 77 | "xmlns:dcterms", 78 | "http://purl.org/dc/terms/" 79 | ] 80 | }, 81 | { 82 | "event": 
"attribute", 83 | "data": [ 84 | "xmlns:admin", 85 | "http://webns.net/mvcb/" 86 | ] 87 | }, 88 | { 89 | "event": "opentag", 90 | "data": [ 91 | "rdf:RDF", 92 | { 93 | "xmlns:rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", 94 | "xmlns": "http://purl.org/rss/1.0/", 95 | "xmlns:ev": "http://purl.org/rss/1.0/modules/event/", 96 | "xmlns:content": "http://purl.org/rss/1.0/modules/content/", 97 | "xmlns:taxo": "http://purl.org/rss/1.0/modules/taxonomy/", 98 | "xmlns:dc": "http://purl.org/dc/elements/1.1/", 99 | "xmlns:syn": "http://purl.org/rss/1.0/modules/syndication/", 100 | "xmlns:dcterms": "http://purl.org/dc/terms/", 101 | "xmlns:admin": "http://webns.net/mvcb/" 102 | } 103 | ] 104 | }, 105 | { 106 | "event": "text", 107 | "data": [ 108 | "\n\t" 109 | ] 110 | }, 111 | { 112 | "event": "opentagname", 113 | "data": [ 114 | "channel" 115 | ] 116 | }, 117 | { 118 | "event": "attribute", 119 | "data": [ 120 | "rdf:about", 121 | "https://github.com/fb55/htmlparser2/" 122 | ] 123 | }, 124 | { 125 | "event": "opentag", 126 | "data": [ 127 | "channel", 128 | { 129 | "rdf:about": "https://github.com/fb55/htmlparser2/" 130 | } 131 | ] 132 | }, 133 | { 134 | "event": "text", 135 | "data": [ 136 | "\n\t\t" 137 | ] 138 | }, 139 | { 140 | "event": "opentagname", 141 | "data": [ 142 | "title" 143 | ] 144 | }, 145 | { 146 | "event": "opentag", 147 | "data": [ 148 | "title", 149 | {} 150 | ] 151 | }, 152 | { 153 | "event": "text", 154 | "data": [ 155 | "A title to parse and remember" 156 | ] 157 | }, 158 | { 159 | "event": "closetag", 160 | "data": [ 161 | "title" 162 | ] 163 | }, 164 | { 165 | "event": "text", 166 | "data": [ 167 | "\n\t\t" 168 | ] 169 | }, 170 | { 171 | "event": "opentagname", 172 | "data": [ 173 | "link" 174 | ] 175 | }, 176 | { 177 | "event": "opentag", 178 | "data": [ 179 | "link", 180 | {} 181 | ] 182 | }, 183 | { 184 | "event": "text", 185 | "data": [ 186 | "https://github.com/fb55/htmlparser2/" 187 | ] 188 | }, 189 | { 190 | "event": "closetag", 191 | 
"data": [ 192 | "link" 193 | ] 194 | }, 195 | { 196 | "event": "text", 197 | "data": [ 198 | "\n\t\t" 199 | ] 200 | }, 201 | { 202 | "event": "opentagname", 203 | "data": [ 204 | "description" 205 | ] 206 | }, 207 | { 208 | "event": "opentag", 209 | "data": [ 210 | "description", 211 | {} 212 | ] 213 | }, 214 | { 215 | "event": "closetag", 216 | "data": [ 217 | "description" 218 | ] 219 | }, 220 | { 221 | "event": "text", 222 | "data": [ 223 | "\n\t\t" 224 | ] 225 | }, 226 | { 227 | "event": "opentagname", 228 | "data": [ 229 | "dc:language" 230 | ] 231 | }, 232 | { 233 | "event": "opentag", 234 | "data": [ 235 | "dc:language", 236 | {} 237 | ] 238 | }, 239 | { 240 | "event": "text", 241 | "data": [ 242 | "en-us" 243 | ] 244 | }, 245 | { 246 | "event": "closetag", 247 | "data": [ 248 | "dc:language" 249 | ] 250 | }, 251 | { 252 | "event": "text", 253 | "data": [ 254 | "\n\t\t" 255 | ] 256 | }, 257 | { 258 | "event": "opentagname", 259 | "data": [ 260 | "dc:rights" 261 | ] 262 | }, 263 | { 264 | "event": "opentag", 265 | "data": [ 266 | "dc:rights", 267 | {} 268 | ] 269 | }, 270 | { 271 | "event": "text", 272 | "data": [ 273 | "Copyright 2015 the authors" 274 | ] 275 | }, 276 | { 277 | "event": "closetag", 278 | "data": [ 279 | "dc:rights" 280 | ] 281 | }, 282 | { 283 | "event": "text", 284 | "data": [ 285 | "\n\t\t" 286 | ] 287 | }, 288 | { 289 | "event": "opentagname", 290 | "data": [ 291 | "dc:publisher" 292 | ] 293 | }, 294 | { 295 | "event": "opentag", 296 | "data": [ 297 | "dc:publisher", 298 | {} 299 | ] 300 | }, 301 | { 302 | "event": "text", 303 | "data": [ 304 | "webmaster@thisisafakedoma.in" 305 | ] 306 | }, 307 | { 308 | "event": "closetag", 309 | "data": [ 310 | "dc:publisher" 311 | ] 312 | }, 313 | { 314 | "event": "text", 315 | "data": [ 316 | "\n\t\t" 317 | ] 318 | }, 319 | { 320 | "event": "opentagname", 321 | "data": [ 322 | "dc:creator" 323 | ] 324 | }, 325 | { 326 | "event": "opentag", 327 | "data": [ 328 | "dc:creator", 329 | {} 330 | ] 331 | }, 
332 | { 333 | "event": "text", 334 | "data": [ 335 | "webmaster@thisisafakedoma.in" 336 | ] 337 | }, 338 | { 339 | "event": "closetag", 340 | "data": [ 341 | "dc:creator" 342 | ] 343 | }, 344 | { 345 | "event": "text", 346 | "data": [ 347 | "\n\t\t" 348 | ] 349 | }, 350 | { 351 | "event": "opentagname", 352 | "data": [ 353 | "dc:source" 354 | ] 355 | }, 356 | { 357 | "event": "opentag", 358 | "data": [ 359 | "dc:source", 360 | {} 361 | ] 362 | }, 363 | { 364 | "event": "text", 365 | "data": [ 366 | "https://github.com/fb55/htmlparser2/" 367 | ] 368 | }, 369 | { 370 | "event": "closetag", 371 | "data": [ 372 | "dc:source" 373 | ] 374 | }, 375 | { 376 | "event": "text", 377 | "data": [ 378 | "\n\t\t" 379 | ] 380 | }, 381 | { 382 | "event": "opentagname", 383 | "data": [ 384 | "dc:title" 385 | ] 386 | }, 387 | { 388 | "event": "opentag", 389 | "data": [ 390 | "dc:title", 391 | {} 392 | ] 393 | }, 394 | { 395 | "event": "text", 396 | "data": [ 397 | "A title to parse and remember" 398 | ] 399 | }, 400 | { 401 | "event": "closetag", 402 | "data": [ 403 | "dc:title" 404 | ] 405 | }, 406 | { 407 | "event": "text", 408 | "data": [ 409 | "\n\t\t" 410 | ] 411 | }, 412 | { 413 | "event": "opentagname", 414 | "data": [ 415 | "dc:type" 416 | ] 417 | }, 418 | { 419 | "event": "opentag", 420 | "data": [ 421 | "dc:type", 422 | {} 423 | ] 424 | }, 425 | { 426 | "event": "text", 427 | "data": [ 428 | "Collection" 429 | ] 430 | }, 431 | { 432 | "event": "closetag", 433 | "data": [ 434 | "dc:type" 435 | ] 436 | }, 437 | { 438 | "event": "text", 439 | "data": [ 440 | "\n\t\t" 441 | ] 442 | }, 443 | { 444 | "event": "opentagname", 445 | "data": [ 446 | "syn:updateBase" 447 | ] 448 | }, 449 | { 450 | "event": "opentag", 451 | "data": [ 452 | "syn:updateBase", 453 | {} 454 | ] 455 | }, 456 | { 457 | "event": "text", 458 | "data": [ 459 | "2011-11-04T09:39:10-07:00" 460 | ] 461 | }, 462 | { 463 | "event": "closetag", 464 | "data": [ 465 | "syn:updateBase" 466 | ] 467 | }, 468 | { 469 | 
"event": "text", 470 | "data": [ 471 | "\n\t\t" 472 | ] 473 | }, 474 | { 475 | "event": "opentagname", 476 | "data": [ 477 | "syn:updateFrequency" 478 | ] 479 | }, 480 | { 481 | "event": "opentag", 482 | "data": [ 483 | "syn:updateFrequency", 484 | {} 485 | ] 486 | }, 487 | { 488 | "event": "text", 489 | "data": [ 490 | "4" 491 | ] 492 | }, 493 | { 494 | "event": "closetag", 495 | "data": [ 496 | "syn:updateFrequency" 497 | ] 498 | }, 499 | { 500 | "event": "text", 501 | "data": [ 502 | "\n\t\t" 503 | ] 504 | }, 505 | { 506 | "event": "opentagname", 507 | "data": [ 508 | "syn:updatePeriod" 509 | ] 510 | }, 511 | { 512 | "event": "opentag", 513 | "data": [ 514 | "syn:updatePeriod", 515 | {} 516 | ] 517 | }, 518 | { 519 | "event": "text", 520 | "data": [ 521 | "hourly" 522 | ] 523 | }, 524 | { 525 | "event": "closetag", 526 | "data": [ 527 | "syn:updatePeriod" 528 | ] 529 | }, 530 | { 531 | "event": "text", 532 | "data": [ 533 | "\n\t\t" 534 | ] 535 | }, 536 | { 537 | "event": "opentagname", 538 | "data": [ 539 | "items" 540 | ] 541 | }, 542 | { 543 | "event": "opentag", 544 | "data": [ 545 | "items", 546 | {} 547 | ] 548 | }, 549 | { 550 | "event": "text", 551 | "data": [ 552 | "\n\t\t\t" 553 | ] 554 | }, 555 | { 556 | "event": "opentagname", 557 | "data": [ 558 | "rdf:Seq" 559 | ] 560 | }, 561 | { 562 | "event": "opentag", 563 | "data": [ 564 | "rdf:Seq", 565 | {} 566 | ] 567 | }, 568 | { 569 | "event": "text", 570 | "data": [ 571 | "\n\t\t\t\t" 572 | ] 573 | }, 574 | { 575 | "event": "opentagname", 576 | "data": [ 577 | "rdf:li" 578 | ] 579 | }, 580 | { 581 | "event": "attribute", 582 | "data": [ 583 | "rdf:resource", 584 | "http://somefakesite/path/to/something.html" 585 | ] 586 | }, 587 | { 588 | "event": "opentag", 589 | "data": [ 590 | "rdf:li", 591 | { 592 | "rdf:resource": "http://somefakesite/path/to/something.html" 593 | } 594 | ] 595 | }, 596 | { 597 | "event": "closetag", 598 | "data": [ 599 | "rdf:li" 600 | ] 601 | }, 602 | { 603 | "event": "text", 604 
| "data": [ 605 | "\n\t\t\t" 606 | ] 607 | }, 608 | { 609 | "event": "closetag", 610 | "data": [ 611 | "rdf:Seq" 612 | ] 613 | }, 614 | { 615 | "event": "text", 616 | "data": [ 617 | "\n\t\t" 618 | ] 619 | }, 620 | { 621 | "event": "closetag", 622 | "data": [ 623 | "items" 624 | ] 625 | }, 626 | { 627 | "event": "text", 628 | "data": [ 629 | "\n\t" 630 | ] 631 | }, 632 | { 633 | "event": "closetag", 634 | "data": [ 635 | "channel" 636 | ] 637 | }, 638 | { 639 | "event": "text", 640 | "data": [ 641 | "\n\t" 642 | ] 643 | }, 644 | { 645 | "event": "opentagname", 646 | "data": [ 647 | "item" 648 | ] 649 | }, 650 | { 651 | "event": "attribute", 652 | "data": [ 653 | "rdf:about", 654 | "http://somefakesite/path/to/something.html" 655 | ] 656 | }, 657 | { 658 | "event": "opentag", 659 | "data": [ 660 | "item", 661 | { 662 | "rdf:about": "http://somefakesite/path/to/something.html" 663 | } 664 | ] 665 | }, 666 | { 667 | "event": "text", 668 | "data": [ 669 | "\n\t\t" 670 | ] 671 | }, 672 | { 673 | "event": "opentagname", 674 | "data": [ 675 | "title" 676 | ] 677 | }, 678 | { 679 | "event": "opentag", 680 | "data": [ 681 | "title", 682 | {} 683 | ] 684 | }, 685 | { 686 | "event": "cdatastart", 687 | "data": [] 688 | }, 689 | { 690 | "event": "text", 691 | "data": [ 692 | " Fast HTML Parsing " 693 | ] 694 | }, 695 | { 696 | "event": "cdataend", 697 | "data": [] 698 | }, 699 | { 700 | "event": "closetag", 701 | "data": [ 702 | "title" 703 | ] 704 | }, 705 | { 706 | "event": "text", 707 | "data": [ 708 | "\n\t\t" 709 | ] 710 | }, 711 | { 712 | "event": "opentagname", 713 | "data": [ 714 | "link" 715 | ] 716 | }, 717 | { 718 | "event": "opentag", 719 | "data": [ 720 | "link", 721 | {} 722 | ] 723 | }, 724 | { 725 | "event": "text", 726 | "data": [ 727 | "\nhttp://somefakesite/path/to/something.html\n" 728 | ] 729 | }, 730 | { 731 | "event": "closetag", 732 | "data": [ 733 | "link" 734 | ] 735 | }, 736 | { 737 | "event": "text", 738 | "data": [ 739 | "\n\t\t" 740 | ] 741 | }, 
742 | { 743 | "event": "opentagname", 744 | "data": [ 745 | "description" 746 | ] 747 | }, 748 | { 749 | "event": "opentag", 750 | "data": [ 751 | "description", 752 | {} 753 | ] 754 | }, 755 | { 756 | "event": "cdatastart", 757 | "data": [] 758 | }, 759 | { 760 | "event": "text", 761 | "data": [ 762 | "\nGreat test content
A link:
Github\n" 763 | ] 764 | }, 765 | { 766 | "event": "cdataend", 767 | "data": [] 768 | }, 769 | { 770 | "event": "closetag", 771 | "data": [ 772 | "description" 773 | ] 774 | }, 775 | { 776 | "event": "text", 777 | "data": [ 778 | "\n\t\t" 779 | ] 780 | }, 781 | { 782 | "event": "opentagname", 783 | "data": [ 784 | "dc:date" 785 | ] 786 | }, 787 | { 788 | "event": "opentag", 789 | "data": [ 790 | "dc:date", 791 | {} 792 | ] 793 | }, 794 | { 795 | "event": "text", 796 | "data": [ 797 | "2011-11-04T09:35:17-07:00" 798 | ] 799 | }, 800 | { 801 | "event": "closetag", 802 | "data": [ 803 | "dc:date" 804 | ] 805 | }, 806 | { 807 | "event": "text", 808 | "data": [ 809 | "\n\t\t" 810 | ] 811 | }, 812 | { 813 | "event": "opentagname", 814 | "data": [ 815 | "dc:language" 816 | ] 817 | }, 818 | { 819 | "event": "opentag", 820 | "data": [ 821 | "dc:language", 822 | {} 823 | ] 824 | }, 825 | { 826 | "event": "text", 827 | "data": [ 828 | "en-us" 829 | ] 830 | }, 831 | { 832 | "event": "closetag", 833 | "data": [ 834 | "dc:language" 835 | ] 836 | }, 837 | { 838 | "event": "text", 839 | "data": [ 840 | "\n\t\t" 841 | ] 842 | }, 843 | { 844 | "event": "opentagname", 845 | "data": [ 846 | "dc:rights" 847 | ] 848 | }, 849 | { 850 | "event": "opentag", 851 | "data": [ 852 | "dc:rights", 853 | {} 854 | ] 855 | }, 856 | { 857 | "event": "text", 858 | "data": [ 859 | "Copyright 2015 the authors" 860 | ] 861 | }, 862 | { 863 | "event": "closetag", 864 | "data": [ 865 | "dc:rights" 866 | ] 867 | }, 868 | { 869 | "event": "text", 870 | "data": [ 871 | "\n\t\t" 872 | ] 873 | }, 874 | { 875 | "event": "opentagname", 876 | "data": [ 877 | "dc:source" 878 | ] 879 | }, 880 | { 881 | "event": "opentag", 882 | "data": [ 883 | "dc:source", 884 | {} 885 | ] 886 | }, 887 | { 888 | "event": "text", 889 | "data": [ 890 | "\nhttp://somefakesite/path/to/something.html\n" 891 | ] 892 | }, 893 | { 894 | "event": "closetag", 895 | "data": [ 896 | "dc:source" 897 | ] 898 | }, 899 | { 900 | "event": "text", 
901 | "data": [ 902 | "\n\t\t" 903 | ] 904 | }, 905 | { 906 | "event": "opentagname", 907 | "data": [ 908 | "dc:title" 909 | ] 910 | }, 911 | { 912 | "event": "opentag", 913 | "data": [ 914 | "dc:title", 915 | {} 916 | ] 917 | }, 918 | { 919 | "event": "cdatastart", 920 | "data": [] 921 | }, 922 | { 923 | "event": "text", 924 | "data": [ 925 | " Fast HTML Parsing " 926 | ] 927 | }, 928 | { 929 | "event": "cdataend", 930 | "data": [] 931 | }, 932 | { 933 | "event": "closetag", 934 | "data": [ 935 | "dc:title" 936 | ] 937 | }, 938 | { 939 | "event": "text", 940 | "data": [ 941 | "\n\t\t" 942 | ] 943 | }, 944 | { 945 | "event": "opentagname", 946 | "data": [ 947 | "dc:type" 948 | ] 949 | }, 950 | { 951 | "event": "opentag", 952 | "data": [ 953 | "dc:type", 954 | {} 955 | ] 956 | }, 957 | { 958 | "event": "text", 959 | "data": [ 960 | "text" 961 | ] 962 | }, 963 | { 964 | "event": "closetag", 965 | "data": [ 966 | "dc:type" 967 | ] 968 | }, 969 | { 970 | "event": "text", 971 | "data": [ 972 | "\n\t\t" 973 | ] 974 | }, 975 | { 976 | "event": "opentagname", 977 | "data": [ 978 | "dcterms:issued" 979 | ] 980 | }, 981 | { 982 | "event": "opentag", 983 | "data": [ 984 | "dcterms:issued", 985 | {} 986 | ] 987 | }, 988 | { 989 | "event": "text", 990 | "data": [ 991 | "2011-11-04T09:35:17-07:00" 992 | ] 993 | }, 994 | { 995 | "event": "closetag", 996 | "data": [ 997 | "dcterms:issued" 998 | ] 999 | }, 1000 | { 1001 | "event": "text", 1002 | "data": [ 1003 | "\n\t" 1004 | ] 1005 | }, 1006 | { 1007 | "event": "closetag", 1008 | "data": [ 1009 | "item" 1010 | ] 1011 | }, 1012 | { 1013 | "event": "text", 1014 | "data": [ 1015 | "\n\t" 1016 | ] 1017 | }, 1018 | { 1019 | "event": "opentagname", 1020 | "data": [ 1021 | "item" 1022 | ] 1023 | }, 1024 | { 1025 | "event": "attribute", 1026 | "data": [ 1027 | "rdf:about", 1028 | "http://somefakesite/path/to/something-else.html" 1029 | ] 1030 | }, 1031 | { 1032 | "event": "opentag", 1033 | "data": [ 1034 | "item", 1035 | { 1036 | 
"rdf:about": "http://somefakesite/path/to/something-else.html" 1037 | } 1038 | ] 1039 | }, 1040 | { 1041 | "event": "text", 1042 | "data": [ 1043 | "\n\t\t" 1044 | ] 1045 | }, 1046 | { 1047 | "event": "opentagname", 1048 | "data": [ 1049 | "title" 1050 | ] 1051 | }, 1052 | { 1053 | "event": "opentag", 1054 | "data": [ 1055 | "title", 1056 | {} 1057 | ] 1058 | }, 1059 | { 1060 | "event": "cdatastart", 1061 | "data": [] 1062 | }, 1063 | { 1064 | "event": "text", 1065 | "data": [ 1066 | "\nThis space intentionally left blank\n" 1067 | ] 1068 | }, 1069 | { 1070 | "event": "cdataend", 1071 | "data": [] 1072 | }, 1073 | { 1074 | "event": "closetag", 1075 | "data": [ 1076 | "title" 1077 | ] 1078 | }, 1079 | { 1080 | "event": "text", 1081 | "data": [ 1082 | "\n\t\t" 1083 | ] 1084 | }, 1085 | { 1086 | "event": "opentagname", 1087 | "data": [ 1088 | "link" 1089 | ] 1090 | }, 1091 | { 1092 | "event": "opentag", 1093 | "data": [ 1094 | "link", 1095 | {} 1096 | ] 1097 | }, 1098 | { 1099 | "event": "text", 1100 | "data": [ 1101 | "\nhttp://somefakesite/path/to/something-else.html\n" 1102 | ] 1103 | }, 1104 | { 1105 | "event": "closetag", 1106 | "data": [ 1107 | "link" 1108 | ] 1109 | }, 1110 | { 1111 | "event": "text", 1112 | "data": [ 1113 | "\n\t\t" 1114 | ] 1115 | }, 1116 | { 1117 | "event": "opentagname", 1118 | "data": [ 1119 | "description" 1120 | ] 1121 | }, 1122 | { 1123 | "event": "opentag", 1124 | "data": [ 1125 | "description", 1126 | {} 1127 | ] 1128 | }, 1129 | { 1130 | "event": "cdatastart", 1131 | "data": [] 1132 | }, 1133 | { 1134 | "event": "text", 1135 | "data": [ 1136 | "\nThe early bird gets the worm\n" 1137 | ] 1138 | }, 1139 | { 1140 | "event": "cdataend", 1141 | "data": [] 1142 | }, 1143 | { 1144 | "event": "closetag", 1145 | "data": [ 1146 | "description" 1147 | ] 1148 | }, 1149 | { 1150 | "event": "text", 1151 | "data": [ 1152 | "\n\t\t" 1153 | ] 1154 | }, 1155 | { 1156 | "event": "opentagname", 1157 | "data": [ 1158 | "dc:date" 1159 | ] 1160 | }, 1161 | 
{ 1162 | "event": "opentag", 1163 | "data": [ 1164 | "dc:date", 1165 | {} 1166 | ] 1167 | }, 1168 | { 1169 | "event": "text", 1170 | "data": [ 1171 | "2011-11-04T09:34:54-07:00" 1172 | ] 1173 | }, 1174 | { 1175 | "event": "closetag", 1176 | "data": [ 1177 | "dc:date" 1178 | ] 1179 | }, 1180 | { 1181 | "event": "text", 1182 | "data": [ 1183 | "\n\t\t" 1184 | ] 1185 | }, 1186 | { 1187 | "event": "opentagname", 1188 | "data": [ 1189 | "dc:language" 1190 | ] 1191 | }, 1192 | { 1193 | "event": "opentag", 1194 | "data": [ 1195 | "dc:language", 1196 | {} 1197 | ] 1198 | }, 1199 | { 1200 | "event": "text", 1201 | "data": [ 1202 | "en-us" 1203 | ] 1204 | }, 1205 | { 1206 | "event": "closetag", 1207 | "data": [ 1208 | "dc:language" 1209 | ] 1210 | }, 1211 | { 1212 | "event": "text", 1213 | "data": [ 1214 | "\n\t\t" 1215 | ] 1216 | }, 1217 | { 1218 | "event": "opentagname", 1219 | "data": [ 1220 | "dc:rights" 1221 | ] 1222 | }, 1223 | { 1224 | "event": "opentag", 1225 | "data": [ 1226 | "dc:rights", 1227 | {} 1228 | ] 1229 | }, 1230 | { 1231 | "event": "text", 1232 | "data": [ 1233 | "Copyright 2015 the authors" 1234 | ] 1235 | }, 1236 | { 1237 | "event": "closetag", 1238 | "data": [ 1239 | "dc:rights" 1240 | ] 1241 | }, 1242 | { 1243 | "event": "text", 1244 | "data": [ 1245 | "\n\t\t" 1246 | ] 1247 | }, 1248 | { 1249 | "event": "opentagname", 1250 | "data": [ 1251 | "dc:source" 1252 | ] 1253 | }, 1254 | { 1255 | "event": "opentag", 1256 | "data": [ 1257 | "dc:source", 1258 | {} 1259 | ] 1260 | }, 1261 | { 1262 | "event": "text", 1263 | "data": [ 1264 | "\nhttp://somefakesite/path/to/something-else.html\n" 1265 | ] 1266 | }, 1267 | { 1268 | "event": "closetag", 1269 | "data": [ 1270 | "dc:source" 1271 | ] 1272 | }, 1273 | { 1274 | "event": "text", 1275 | "data": [ 1276 | "\n\t\t" 1277 | ] 1278 | }, 1279 | { 1280 | "event": "opentagname", 1281 | "data": [ 1282 | "dc:title" 1283 | ] 1284 | }, 1285 | { 1286 | "event": "opentag", 1287 | "data": [ 1288 | "dc:title", 1289 | {} 1290 
| ] 1291 | }, 1292 | { 1293 | "event": "cdatastart", 1294 | "data": [] 1295 | }, 1296 | { 1297 | "event": "text", 1298 | "data": [ 1299 | "\nThis space intentionally left blank\n" 1300 | ] 1301 | }, 1302 | { 1303 | "event": "cdataend", 1304 | "data": [] 1305 | }, 1306 | { 1307 | "event": "closetag", 1308 | "data": [ 1309 | "dc:title" 1310 | ] 1311 | }, 1312 | { 1313 | "event": "text", 1314 | "data": [ 1315 | "\n\t\t" 1316 | ] 1317 | }, 1318 | { 1319 | "event": "opentagname", 1320 | "data": [ 1321 | "dc:type" 1322 | ] 1323 | }, 1324 | { 1325 | "event": "opentag", 1326 | "data": [ 1327 | "dc:type", 1328 | {} 1329 | ] 1330 | }, 1331 | { 1332 | "event": "text", 1333 | "data": [ 1334 | "text" 1335 | ] 1336 | }, 1337 | { 1338 | "event": "closetag", 1339 | "data": [ 1340 | "dc:type" 1341 | ] 1342 | }, 1343 | { 1344 | "event": "text", 1345 | "data": [ 1346 | "\n\t\t" 1347 | ] 1348 | }, 1349 | { 1350 | "event": "opentagname", 1351 | "data": [ 1352 | "dcterms:issued" 1353 | ] 1354 | }, 1355 | { 1356 | "event": "opentag", 1357 | "data": [ 1358 | "dcterms:issued", 1359 | {} 1360 | ] 1361 | }, 1362 | { 1363 | "event": "text", 1364 | "data": [ 1365 | "2011-11-04T09:34:54-07:00" 1366 | ] 1367 | }, 1368 | { 1369 | "event": "closetag", 1370 | "data": [ 1371 | "dcterms:issued" 1372 | ] 1373 | }, 1374 | { 1375 | "event": "text", 1376 | "data": [ 1377 | "\n\t" 1378 | ] 1379 | }, 1380 | { 1381 | "event": "closetag", 1382 | "data": [ 1383 | "item" 1384 | ] 1385 | }, 1386 | { 1387 | "event": "text", 1388 | "data": [ 1389 | "\n" 1390 | ] 1391 | }, 1392 | { 1393 | "event": "closetag", 1394 | "data": [ 1395 | "rdf:RDF" 1396 | ] 1397 | } 1398 | ] 1399 | } 1400 | -------------------------------------------------------------------------------- /lib/Tokenizer.js: -------------------------------------------------------------------------------- 1 | module.exports = Tokenizer; 2 | 3 | var decodeCodePoint = require("entities/lib/decode_codepoint.js"), 4 | entityMap = 
require("entities/maps/entities.json"), 5 | legacyMap = require("entities/maps/legacy.json"), 6 | xmlMap = require("entities/maps/xml.json"), 7 | 8 | i = 0, 9 | 10 | TEXT = i++, 11 | BEFORE_TAG_NAME = i++, //after < 12 | IN_TAG_NAME = i++, 13 | IN_SELF_CLOSING_TAG = i++, 14 | BEFORE_CLOSING_TAG_NAME = i++, 15 | IN_CLOSING_TAG_NAME = i++, 16 | AFTER_CLOSING_TAG_NAME = i++, 17 | 18 | //attributes 19 | BEFORE_ATTRIBUTE_NAME = i++, 20 | IN_ATTRIBUTE_NAME = i++, 21 | AFTER_ATTRIBUTE_NAME = i++, 22 | BEFORE_ATTRIBUTE_VALUE = i++, 23 | IN_ATTRIBUTE_VALUE_DQ = i++, // " 24 | IN_ATTRIBUTE_VALUE_SQ = i++, // ' 25 | IN_ATTRIBUTE_VALUE_NQ = i++, 26 | 27 | //declarations 28 | BEFORE_DECLARATION = i++, // ! 29 | IN_DECLARATION = i++, 30 | 31 | //processing instructions 32 | IN_PROCESSING_INSTRUCTION = i++, // ? 33 | 34 | //comments 35 | BEFORE_COMMENT = i++, 36 | IN_COMMENT = i++, 37 | AFTER_COMMENT_1 = i++, 38 | AFTER_COMMENT_2 = i++, 39 | 40 | //cdata 41 | BEFORE_CDATA_1 = i++, // [ 42 | BEFORE_CDATA_2 = i++, // C 43 | BEFORE_CDATA_3 = i++, // D 44 | BEFORE_CDATA_4 = i++, // A 45 | BEFORE_CDATA_5 = i++, // T 46 | BEFORE_CDATA_6 = i++, // A 47 | IN_CDATA = i++, // [ 48 | AFTER_CDATA_1 = i++, // ] 49 | AFTER_CDATA_2 = i++, // ] 50 | 51 | //special tags 52 | BEFORE_SPECIAL = i++, //S 53 | BEFORE_SPECIAL_END = i++, //S 54 | 55 | BEFORE_SCRIPT_1 = i++, //C 56 | BEFORE_SCRIPT_2 = i++, //R 57 | BEFORE_SCRIPT_3 = i++, //I 58 | BEFORE_SCRIPT_4 = i++, //P 59 | BEFORE_SCRIPT_5 = i++, //T 60 | AFTER_SCRIPT_1 = i++, //C 61 | AFTER_SCRIPT_2 = i++, //R 62 | AFTER_SCRIPT_3 = i++, //I 63 | AFTER_SCRIPT_4 = i++, //P 64 | AFTER_SCRIPT_5 = i++, //T 65 | 66 | BEFORE_STYLE_1 = i++, //T 67 | BEFORE_STYLE_2 = i++, //Y 68 | BEFORE_STYLE_3 = i++, //L 69 | BEFORE_STYLE_4 = i++, //E 70 | AFTER_STYLE_1 = i++, //T 71 | AFTER_STYLE_2 = i++, //Y 72 | AFTER_STYLE_3 = i++, //L 73 | AFTER_STYLE_4 = i++, //E 74 | 75 | BEFORE_ENTITY = i++, //& 76 | BEFORE_NUMERIC_ENTITY = i++, //# 77 | IN_NAMED_ENTITY = i++, 78 | 
IN_NUMERIC_ENTITY = i++, 79 | IN_HEX_ENTITY = i++, //X 80 | 81 | j = 0, 82 | 83 | SPECIAL_NONE = j++, 84 | SPECIAL_SCRIPT = j++, 85 | SPECIAL_STYLE = j++; 86 | 87 | function whitespace(c){ 88 | return c === " " || c === "\n" || c === "\t" || c === "\f" || c === "\r"; 89 | } 90 | 91 | function characterState(char, SUCCESS){ 92 | return function(c){ 93 | if(c === char) this._state = SUCCESS; 94 | }; 95 | } 96 | 97 | function ifElseState(upper, SUCCESS, FAILURE){ 98 | var lower = upper.toLowerCase(); 99 | 100 | if(upper === lower){ 101 | return function(c){ 102 | if(c === lower){ 103 | this._state = SUCCESS; 104 | } else { 105 | this._state = FAILURE; 106 | this._index--; 107 | } 108 | }; 109 | } else { 110 | return function(c){ 111 | if(c === lower || c === upper){ 112 | this._state = SUCCESS; 113 | } else { 114 | this._state = FAILURE; 115 | this._index--; 116 | } 117 | }; 118 | } 119 | } 120 | 121 | function consumeSpecialNameChar(upper, NEXT_STATE){ 122 | var lower = upper.toLowerCase(); 123 | 124 | return function(c){ 125 | if(c === lower || c === upper){ 126 | this._state = NEXT_STATE; 127 | } else { 128 | this._state = IN_TAG_NAME; 129 | this._index--; //consume the token again 130 | } 131 | }; 132 | } 133 | 134 | function Tokenizer(options, cbs){ 135 | this._state = TEXT; 136 | this._buffer = ""; 137 | this._sectionStart = 0; 138 | this._index = 0; 139 | this._bufferOffset = 0; //chars removed from _buffer 140 | this._baseState = TEXT; 141 | this._special = SPECIAL_NONE; 142 | this._cbs = cbs; 143 | this._running = true; 144 | this._ended = false; 145 | this._xmlMode = !!(options && options.xmlMode); 146 | this._decodeEntities = !!(options && options.decodeEntities); 147 | } 148 | 149 | Tokenizer.prototype._stateText = function(c){ 150 | if(c === "<"){ 151 | if(this._index > this._sectionStart){ 152 | this._cbs.ontext(this._getSection()); 153 | } 154 | this._state = BEFORE_TAG_NAME; 155 | this._sectionStart = this._index; 156 | } else if(this._decodeEntities 
&& this._special === SPECIAL_NONE && c === "&"){ 157 | if(this._index > this._sectionStart){ 158 | this._cbs.ontext(this._getSection()); 159 | } 160 | this._baseState = TEXT; 161 | this._state = BEFORE_ENTITY; 162 | this._sectionStart = this._index; 163 | } 164 | }; 165 | 166 | Tokenizer.prototype._stateBeforeTagName = function(c){ 167 | if(c === "/"){ 168 | this._state = BEFORE_CLOSING_TAG_NAME; 169 | } else if(c === "<"){ 170 | this._cbs.ontext(this._getSection()); 171 | this._sectionStart = this._index; 172 | } else if(c === ">" || this._special !== SPECIAL_NONE || whitespace(c)) { 173 | this._state = TEXT; 174 | } else if(c === "!"){ 175 | this._state = BEFORE_DECLARATION; 176 | this._sectionStart = this._index + 1; 177 | } else if(c === "?"){ 178 | this._state = IN_PROCESSING_INSTRUCTION; 179 | this._sectionStart = this._index + 1; 180 | } else { 181 | this._state = (!this._xmlMode && (c === "s" || c === "S")) ? 182 | BEFORE_SPECIAL : IN_TAG_NAME; 183 | this._sectionStart = this._index; 184 | } 185 | }; 186 | 187 | Tokenizer.prototype._stateInTagName = function(c){ 188 | if(c === "/" || c === ">" || whitespace(c)){ 189 | this._emitToken("onopentagname"); 190 | this._state = BEFORE_ATTRIBUTE_NAME; 191 | this._index--; 192 | } 193 | }; 194 | 195 | Tokenizer.prototype._stateBeforeCloseingTagName = function(c){ 196 | if(whitespace(c)); 197 | else if(c === ">"){ 198 | this._state = TEXT; 199 | } else if(this._special !== SPECIAL_NONE){ 200 | if(c === "s" || c === "S"){ 201 | this._state = BEFORE_SPECIAL_END; 202 | } else { 203 | this._state = TEXT; 204 | this._index--; 205 | } 206 | } else { 207 | this._state = IN_CLOSING_TAG_NAME; 208 | this._sectionStart = this._index; 209 | } 210 | }; 211 | 212 | Tokenizer.prototype._stateInCloseingTagName = function(c){ 213 | if(c === ">" || whitespace(c)){ 214 | this._emitToken("onclosetag"); 215 | this._state = AFTER_CLOSING_TAG_NAME; 216 | this._index--; 217 | } 218 | }; 219 | 220 | 
//after the name of a closing tag: ignore every character up to the
//terminating ">", then continue with text right behind it
Tokenizer.prototype._stateAfterCloseingTagName = function(c){
	if(c !== ">") return;

	this._state = TEXT;
	this._sectionStart = this._index + 1;
};

//inside an opening tag, in front of a (possible) attribute name:
//">" ends the opening tag, "/" may start a self-closing tag, any other
//non-whitespace character is the first character of an attribute name
Tokenizer.prototype._stateBeforeAttributeName = function(c){
	switch(c){
		case ">":
			this._cbs.onopentagend();
			this._state = TEXT;
			this._sectionStart = this._index + 1;
			break;
		case "/":
			this._state = IN_SELF_CLOSING_TAG;
			break;
		default:
			if(whitespace(c)) break;
			this._state = IN_ATTRIBUTE_NAME;
			this._sectionStart = this._index;
	}
};

//after a "/" inside an opening tag: ">" makes the tag self-closing,
//whitespace is skipped, anything else is handed back to the
//attribute-name handling
Tokenizer.prototype._stateInSelfClosingTag = function(c){
	if(c === ">"){
		this._cbs.onselfclosingtag();
		this._state = TEXT;
		this._sectionStart = this._index + 1;
		return;
	}

	if(whitespace(c)) return;

	this._state = BEFORE_ATTRIBUTE_NAME;
	this._index--; //reconsume the character in the new state
};

//reading an attribute name: it ends at "=", "/", ">" or whitespace
Tokenizer.prototype._stateInAttributeName = function(c){
	var nameEnded = c === "=" || c === "/" || c === ">" || whitespace(c);

	if(!nameEnded) return;

	this._cbs.onattribname(this._getSection());
	this._sectionStart = -1; //no section is open right now
	this._state = AFTER_ATTRIBUTE_NAME;
	this._index--; //reconsume the terminating character
};

//behind an attribute name: "=" announces a value, "/" and ">" end the
//(value-less) attribute, whitespace is skipped and every other character
//ends the attribute and begins the next attribute name
Tokenizer.prototype._stateAfterAttributeName = function(c){
	if(c === "="){
		this._state = BEFORE_ATTRIBUTE_VALUE;
		return;
	}

	if(c === "/" || c === ">"){
		this._cbs.onattribend();
		this._state = BEFORE_ATTRIBUTE_NAME;
		this._index--; //reconsume the character in the new state
		return;
	}

	if(whitespace(c)) return;

	this._cbs.onattribend();
	this._state = IN_ATTRIBUTE_NAME;
	this._sectionStart = this._index;
};

//behind the "=" of an attribute: pick the value state depending on the
//quoting style; leading whitespace is skipped
Tokenizer.prototype._stateBeforeAttributeValue = function(c){
	switch(c){
		case "\"":
			this._state = IN_ATTRIBUTE_VALUE_DQ;
			this._sectionStart = this._index + 1;
			return;
		case "'":
			this._state = IN_ATTRIBUTE_VALUE_SQ;
			this._sectionStart = this._index + 1;
			return;
	}

	if(whitespace(c)) return;

	//unquoted value: the current character already belongs to it
	this._state = IN_ATTRIBUTE_VALUE_NQ;
	this._sectionStart = this._index;
	this._index--; //reconsume token
};

//inside a double-quoted attribute value
Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function(c){
	if(c === "\""){
		//closing quote ends the attribute
		this._emitToken("onattribdata");
		this._cbs.onattribend();
		this._state = BEFORE_ATTRIBUTE_NAME;
		return;
	}

	if(c !== "&" || !this._decodeEntities) return;

	//entity inside the value: remember this state so that the entity
	//states can return to it
	this._emitToken("onattribdata");
	this._baseState = this._state;
	this._state = BEFORE_ENTITY;
	this._sectionStart = this._index;
};

//inside a single-quoted attribute value
Tokenizer.prototype._stateInAttributeValueSingleQuotes = function(c){
	if(c === "'"){
		//closing quote ends the attribute
		this._emitToken("onattribdata");
		this._cbs.onattribend();
		this._state = BEFORE_ATTRIBUTE_NAME;
		return;
	}

	if(c !== "&" || !this._decodeEntities) return;

	//entity inside the value: remember this state so that the entity
	//states can return to it
	this._emitToken("onattribdata");
	this._baseState = this._state;
	this._state = BEFORE_ENTITY;
	this._sectionStart = this._index;
};

//inside an unquoted attribute value: whitespace or ">" ends it
Tokenizer.prototype._stateInAttributeValueNoQuotes = function(c){
	if(whitespace(c) || c === ">"){
		this._emitToken("onattribdata");
		this._cbs.onattribend();
		this._state = BEFORE_ATTRIBUTE_NAME;
		this._index--; //reconsume the terminating character
		return;
	}

	if(c !== "&" || !this._decodeEntities) return;

	//entity inside the value: remember this state so that the entity
	//states can return to it
	this._emitToken("onattribdata");
	this._baseState = this._state;
	this._state = BEFORE_ENTITY;
	this._sectionStart = this._index;
};

//first character behind "<!": "[" may start CDATA, "-" may start a
//comment, everything else is treated as a declaration
Tokenizer.prototype._stateBeforeDeclaration = function(c){
	if(c === "["){
		this._state = BEFORE_CDATA_1;
	} else if(c === "-"){
		this._state = BEFORE_COMMENT;
	} else {
		this._state = IN_DECLARATION;
	}
};

//inside a declaration: ">" ends it and reports the collected section
Tokenizer.prototype._stateInDeclaration = function(c){
	if(c !== ">") return;

	this._cbs.ondeclaration(this._getSection());
	this._state = TEXT;
	this._sectionStart = this._index + 1;
};

//inside a processing instruction: ">" ends it and reports the collected
//section
Tokenizer.prototype._stateInProcessingInstruction = function(c){
	if(c !== ">") return;

	this._cbs.onprocessinginstruction(this._getSection());
	this._state = TEXT;
	this._sectionStart = this._index + 1;
};

//behind "<!-": a second "-" opens a comment, otherwise this is handled
//as a declaration
Tokenizer.prototype._stateBeforeComment = function(c){
	if(c !== "-"){
		this._state = IN_DECLARATION;
		return;
	}

	this._state = IN_COMMENT;
	this._sectionStart = this._index + 1;
};

//inside a comment: a "-" might be the start of the closing "-->"
Tokenizer.prototype._stateInComment = function(c){
	if(c === "-"){
		this._state = AFTER_COMMENT_1;
	}
};

//one "-" seen inside a comment: a second one might end it, anything else
//is ordinary comment content
Tokenizer.prototype._stateAfterComment1 = function(c){
	this._state = c === "-" ? AFTER_COMMENT_2 : IN_COMMENT;
};

//"--" seen inside a comment: ">" closes the comment, further dashes keep
//this state (`--->`), everything else continues the comment
Tokenizer.prototype._stateAfterComment2 = function(c){
	if(c === "-") return;

	if(c !== ">"){
		this._state = IN_COMMENT;
		return;
	}

	//the two dashes in front of ">" are not part of the comment
	var commentEnd = this._index - 2;

	this._cbs.oncomment(this._buffer.substring(this._sectionStart, commentEnd));
	this._state = TEXT;
	this._sectionStart = this._index + 1;
};

//behind "<![": match the letters of "CDATA" one by one (either case); on
//the first mismatch the character is reconsumed as part of a declaration
Tokenizer.prototype._stateBeforeCdata1 = ifElseState("C", BEFORE_CDATA_2, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata2 = ifElseState("D", BEFORE_CDATA_3, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata3 = ifElseState("A", BEFORE_CDATA_4, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata4 = ifElseState("T", BEFORE_CDATA_5, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata5 = ifElseState("A", BEFORE_CDATA_6, IN_DECLARATION);

Tokenizer.prototype._stateBeforeCdata6 = function(c){ 391 | if(c === "["){ 392 | this._state = IN_CDATA; 393 | this._sectionStart = this._index + 1; 394 | } else { 395 | this._state = IN_DECLARATION; 396 | this._index--; 397 | } 398 | }; 399 | 400 | Tokenizer.prototype._stateInCdata = function(c){ 401 | if(c === "]") this._state = AFTER_CDATA_1; 402 | }; 403 | 404 | Tokenizer.prototype._stateAfterCdata1 = characterState("]", AFTER_CDATA_2); 405 | 406 | Tokenizer.prototype._stateAfterCdata2 = function(c){ 407 | if(c === ">"){ 408 | //remove 2 trailing chars 409 | this._cbs.oncdata(this._buffer.substring(this._sectionStart, this._index - 2)); 410 | this._state = TEXT; 411 | this._sectionStart = this._index + 1; 412 | } else if(c !== "]") { 413 | this._state = IN_CDATA; 414 | } 415 | //else: stay in AFTER_CDATA_2 (`]]]>`) 416 | }; 417 | 418 | Tokenizer.prototype._stateBeforeSpecial = function(c){ 419 | if(c === "c" || c === "C"){ 420 | this._state = BEFORE_SCRIPT_1; 421 | } else if(c === "t" || c === "T"){ 422 | this._state = BEFORE_STYLE_1; 423 | } else { 424 | this._state = IN_TAG_NAME; 425 | this._index--; //consume the token again 426 | } 427 | }; 428 | 429 | Tokenizer.prototype._stateBeforeSpecialEnd = function(c){ 430 | if(this._special === SPECIAL_SCRIPT && (c === "c" || c === "C")){ 431 | this._state = AFTER_SCRIPT_1; 432 | } else if(this._special === SPECIAL_STYLE && (c === "t" || c === "T")){ 433 | this._state = AFTER_STYLE_1; 434 | } 435 | else this._state = TEXT; 436 | }; 437 | 438 | Tokenizer.prototype._stateBeforeScript1 = consumeSpecialNameChar("R", BEFORE_SCRIPT_2); 439 | Tokenizer.prototype._stateBeforeScript2 = consumeSpecialNameChar("I", BEFORE_SCRIPT_3); 440 | Tokenizer.prototype._stateBeforeScript3 = consumeSpecialNameChar("P", BEFORE_SCRIPT_4); 441 | Tokenizer.prototype._stateBeforeScript4 = consumeSpecialNameChar("T", BEFORE_SCRIPT_5); 442 | 443 | Tokenizer.prototype._stateBeforeScript5 = function(c){ 444 | if(c === "/" || c === ">" || 
whitespace(c)){ 445 | this._special = SPECIAL_SCRIPT; 446 | } 447 | this._state = IN_TAG_NAME; 448 | this._index--; //consume the token again 449 | }; 450 | 451 | Tokenizer.prototype._stateAfterScript1 = ifElseState("R", AFTER_SCRIPT_2, TEXT); 452 | Tokenizer.prototype._stateAfterScript2 = ifElseState("I", AFTER_SCRIPT_3, TEXT); 453 | Tokenizer.prototype._stateAfterScript3 = ifElseState("P", AFTER_SCRIPT_4, TEXT); 454 | Tokenizer.prototype._stateAfterScript4 = ifElseState("T", AFTER_SCRIPT_5, TEXT); 455 | 456 | Tokenizer.prototype._stateAfterScript5 = function(c){ 457 | if(c === ">" || whitespace(c)){ 458 | this._special = SPECIAL_NONE; 459 | this._state = IN_CLOSING_TAG_NAME; 460 | this._sectionStart = this._index - 6; 461 | this._index--; //reconsume the token 462 | } 463 | else this._state = TEXT; 464 | }; 465 | 466 | Tokenizer.prototype._stateBeforeStyle1 = consumeSpecialNameChar("Y", BEFORE_STYLE_2); 467 | Tokenizer.prototype._stateBeforeStyle2 = consumeSpecialNameChar("L", BEFORE_STYLE_3); 468 | Tokenizer.prototype._stateBeforeStyle3 = consumeSpecialNameChar("E", BEFORE_STYLE_4); 469 | 470 | Tokenizer.prototype._stateBeforeStyle4 = function(c){ 471 | if(c === "/" || c === ">" || whitespace(c)){ 472 | this._special = SPECIAL_STYLE; 473 | } 474 | this._state = IN_TAG_NAME; 475 | this._index--; //consume the token again 476 | }; 477 | 478 | Tokenizer.prototype._stateAfterStyle1 = ifElseState("Y", AFTER_STYLE_2, TEXT); 479 | Tokenizer.prototype._stateAfterStyle2 = ifElseState("L", AFTER_STYLE_3, TEXT); 480 | Tokenizer.prototype._stateAfterStyle3 = ifElseState("E", AFTER_STYLE_4, TEXT); 481 | 482 | Tokenizer.prototype._stateAfterStyle4 = function(c){ 483 | if(c === ">" || whitespace(c)){ 484 | this._special = SPECIAL_NONE; 485 | this._state = IN_CLOSING_TAG_NAME; 486 | this._sectionStart = this._index - 5; 487 | this._index--; //reconsume the token 488 | } 489 | else this._state = TEXT; 490 | }; 491 | 492 | Tokenizer.prototype._stateBeforeEntity = ifElseState("#", 
BEFORE_NUMERIC_ENTITY, IN_NAMED_ENTITY); 493 | Tokenizer.prototype._stateBeforeNumericEntity = ifElseState("X", IN_HEX_ENTITY, IN_NUMERIC_ENTITY); 494 | 495 | //for entities terminated with a semicolon 496 | Tokenizer.prototype._parseNamedEntityStrict = function(){ 497 | //offset = 1 498 | if(this._sectionStart + 1 < this._index){ 499 | var entity = this._buffer.substring(this._sectionStart + 1, this._index), 500 | map = this._xmlMode ? xmlMap : entityMap; 501 | 502 | if(map.hasOwnProperty(entity)){ 503 | this._emitPartial(map[entity]); 504 | this._sectionStart = this._index + 1; 505 | } 506 | } 507 | }; 508 | 509 | 510 | //parses legacy entities (without trailing semicolon) 511 | Tokenizer.prototype._parseLegacyEntity = function(){ 512 | var start = this._sectionStart + 1, 513 | limit = this._index - start; 514 | 515 | if(limit > 6) limit = 6; //the max length of legacy entities is 6 516 | 517 | while(limit >= 2){ //the min length of legacy entities is 2 518 | var entity = this._buffer.substr(start, limit); 519 | 520 | if(legacyMap.hasOwnProperty(entity)){ 521 | this._emitPartial(legacyMap[entity]); 522 | this._sectionStart += limit + 1; 523 | return; 524 | } else { 525 | limit--; 526 | } 527 | } 528 | }; 529 | 530 | Tokenizer.prototype._stateInNamedEntity = function(c){ 531 | if(c === ";"){ 532 | this._parseNamedEntityStrict(); 533 | if(this._sectionStart + 1 < this._index && !this._xmlMode){ 534 | this._parseLegacyEntity(); 535 | } 536 | this._state = this._baseState; 537 | } else if((c < "a" || c > "z") && (c < "A" || c > "Z") && (c < "0" || c > "9")){ 538 | if(this._xmlMode); 539 | else if(this._sectionStart + 1 === this._index); 540 | else if(this._baseState !== TEXT){ 541 | if(c !== "="){ 542 | this._parseNamedEntityStrict(); 543 | } 544 | } else { 545 | this._parseLegacyEntity(); 546 | } 547 | 548 | this._state = this._baseState; 549 | this._index--; 550 | } 551 | }; 552 | 553 | Tokenizer.prototype._decodeNumericEntity = function(offset, base){ 554 | var 
sectionStart = this._sectionStart + offset; 555 | 556 | if(sectionStart !== this._index){ 557 | //parse entity 558 | var entity = this._buffer.substring(sectionStart, this._index); 559 | var parsed = parseInt(entity, base); 560 | 561 | this._emitPartial(decodeCodePoint(parsed)); 562 | this._sectionStart = this._index; 563 | } else { 564 | this._sectionStart--; 565 | } 566 | 567 | this._state = this._baseState; 568 | }; 569 | 570 | Tokenizer.prototype._stateInNumericEntity = function(c){ 571 | if(c === ";"){ 572 | this._decodeNumericEntity(2, 10); 573 | this._sectionStart++; 574 | } else if(c < "0" || c > "9"){ 575 | if(!this._xmlMode){ 576 | this._decodeNumericEntity(2, 10); 577 | } else { 578 | this._state = this._baseState; 579 | } 580 | this._index--; 581 | } 582 | }; 583 | 584 | Tokenizer.prototype._stateInHexEntity = function(c){ 585 | if(c === ";"){ 586 | this._decodeNumericEntity(3, 16); 587 | this._sectionStart++; 588 | } else if((c < "a" || c > "f") && (c < "A" || c > "F") && (c < "0" || c > "9")){ 589 | if(!this._xmlMode){ 590 | this._decodeNumericEntity(3, 16); 591 | } else { 592 | this._state = this._baseState; 593 | } 594 | this._index--; 595 | } 596 | }; 597 | 598 | Tokenizer.prototype._cleanup = function (){ 599 | if(this._sectionStart < 0){ 600 | this._buffer = ""; 601 | this._bufferOffset += this._index; 602 | this._index = 0; 603 | } else if(this._running){ 604 | if(this._state === TEXT){ 605 | if(this._sectionStart !== this._index){ 606 | this._cbs.ontext(this._buffer.substr(this._sectionStart)); 607 | } 608 | this._buffer = ""; 609 | this._bufferOffset += this._index; 610 | this._index = 0; 611 | } else if(this._sectionStart === this._index){ 612 | //the section just started 613 | this._buffer = ""; 614 | this._bufferOffset += this._index; 615 | this._index = 0; 616 | } else { 617 | //remove everything unnecessary 618 | this._buffer = this._buffer.substr(this._sectionStart); 619 | this._index -= this._sectionStart; 620 | this._bufferOffset += 
this._sectionStart; 621 | } 622 | 623 | this._sectionStart = 0; 624 | } 625 | }; 626 | 627 | //TODO make events conditional 628 | Tokenizer.prototype.write = function(chunk){ 629 | if(this._ended) this._cbs.onerror(Error(".write() after done!")); 630 | 631 | this._buffer += chunk; 632 | this._parse(); 633 | }; 634 | 635 | Tokenizer.prototype._parse = function(){ 636 | while(this._index < this._buffer.length && this._running){ 637 | var c = this._buffer.charAt(this._index); 638 | if(this._state === TEXT) { 639 | this._stateText(c); 640 | } else if(this._state === BEFORE_TAG_NAME){ 641 | this._stateBeforeTagName(c); 642 | } else if(this._state === IN_TAG_NAME) { 643 | this._stateInTagName(c); 644 | } else if(this._state === BEFORE_CLOSING_TAG_NAME){ 645 | this._stateBeforeCloseingTagName(c); 646 | } else if(this._state === IN_CLOSING_TAG_NAME){ 647 | this._stateInCloseingTagName(c); 648 | } else if(this._state === AFTER_CLOSING_TAG_NAME){ 649 | this._stateAfterCloseingTagName(c); 650 | } else if(this._state === IN_SELF_CLOSING_TAG){ 651 | this._stateInSelfClosingTag(c); 652 | } 653 | 654 | /* 655 | * attributes 656 | */ 657 | else if(this._state === BEFORE_ATTRIBUTE_NAME){ 658 | this._stateBeforeAttributeName(c); 659 | } else if(this._state === IN_ATTRIBUTE_NAME){ 660 | this._stateInAttributeName(c); 661 | } else if(this._state === AFTER_ATTRIBUTE_NAME){ 662 | this._stateAfterAttributeName(c); 663 | } else if(this._state === BEFORE_ATTRIBUTE_VALUE){ 664 | this._stateBeforeAttributeValue(c); 665 | } else if(this._state === IN_ATTRIBUTE_VALUE_DQ){ 666 | this._stateInAttributeValueDoubleQuotes(c); 667 | } else if(this._state === IN_ATTRIBUTE_VALUE_SQ){ 668 | this._stateInAttributeValueSingleQuotes(c); 669 | } else if(this._state === IN_ATTRIBUTE_VALUE_NQ){ 670 | this._stateInAttributeValueNoQuotes(c); 671 | } 672 | 673 | /* 674 | * declarations 675 | */ 676 | else if(this._state === BEFORE_DECLARATION){ 677 | this._stateBeforeDeclaration(c); 678 | } else if(this._state 
=== IN_DECLARATION){ 679 | this._stateInDeclaration(c); 680 | } 681 | 682 | /* 683 | * processing instructions 684 | */ 685 | else if(this._state === IN_PROCESSING_INSTRUCTION){ 686 | this._stateInProcessingInstruction(c); 687 | } 688 | 689 | /* 690 | * comments 691 | */ 692 | else if(this._state === BEFORE_COMMENT){ 693 | this._stateBeforeComment(c); 694 | } else if(this._state === IN_COMMENT){ 695 | this._stateInComment(c); 696 | } else if(this._state === AFTER_COMMENT_1){ 697 | this._stateAfterComment1(c); 698 | } else if(this._state === AFTER_COMMENT_2){ 699 | this._stateAfterComment2(c); 700 | } 701 | 702 | /* 703 | * cdata 704 | */ 705 | else if(this._state === BEFORE_CDATA_1){ 706 | this._stateBeforeCdata1(c); 707 | } else if(this._state === BEFORE_CDATA_2){ 708 | this._stateBeforeCdata2(c); 709 | } else if(this._state === BEFORE_CDATA_3){ 710 | this._stateBeforeCdata3(c); 711 | } else if(this._state === BEFORE_CDATA_4){ 712 | this._stateBeforeCdata4(c); 713 | } else if(this._state === BEFORE_CDATA_5){ 714 | this._stateBeforeCdata5(c); 715 | } else if(this._state === BEFORE_CDATA_6){ 716 | this._stateBeforeCdata6(c); 717 | } else if(this._state === IN_CDATA){ 718 | this._stateInCdata(c); 719 | } else if(this._state === AFTER_CDATA_1){ 720 | this._stateAfterCdata1(c); 721 | } else if(this._state === AFTER_CDATA_2){ 722 | this._stateAfterCdata2(c); 723 | } 724 | 725 | /* 726 | * special tags 727 | */ 728 | else if(this._state === BEFORE_SPECIAL){ 729 | this._stateBeforeSpecial(c); 730 | } else if(this._state === BEFORE_SPECIAL_END){ 731 | this._stateBeforeSpecialEnd(c); 732 | } 733 | 734 | /* 735 | * script 736 | */ 737 | else if(this._state === BEFORE_SCRIPT_1){ 738 | this._stateBeforeScript1(c); 739 | } else if(this._state === BEFORE_SCRIPT_2){ 740 | this._stateBeforeScript2(c); 741 | } else if(this._state === BEFORE_SCRIPT_3){ 742 | this._stateBeforeScript3(c); 743 | } else if(this._state === BEFORE_SCRIPT_4){ 744 | this._stateBeforeScript4(c); 745 | } else 
if(this._state === BEFORE_SCRIPT_5){ 746 | this._stateBeforeScript5(c); 747 | } 748 | 749 | else if(this._state === AFTER_SCRIPT_1){ 750 | this._stateAfterScript1(c); 751 | } else if(this._state === AFTER_SCRIPT_2){ 752 | this._stateAfterScript2(c); 753 | } else if(this._state === AFTER_SCRIPT_3){ 754 | this._stateAfterScript3(c); 755 | } else if(this._state === AFTER_SCRIPT_4){ 756 | this._stateAfterScript4(c); 757 | } else if(this._state === AFTER_SCRIPT_5){ 758 | this._stateAfterScript5(c); 759 | } 760 | 761 | /* 762 | * style 763 | */ 764 | else if(this._state === BEFORE_STYLE_1){ 765 | this._stateBeforeStyle1(c); 766 | } else if(this._state === BEFORE_STYLE_2){ 767 | this._stateBeforeStyle2(c); 768 | } else if(this._state === BEFORE_STYLE_3){ 769 | this._stateBeforeStyle3(c); 770 | } else if(this._state === BEFORE_STYLE_4){ 771 | this._stateBeforeStyle4(c); 772 | } 773 | 774 | else if(this._state === AFTER_STYLE_1){ 775 | this._stateAfterStyle1(c); 776 | } else if(this._state === AFTER_STYLE_2){ 777 | this._stateAfterStyle2(c); 778 | } else if(this._state === AFTER_STYLE_3){ 779 | this._stateAfterStyle3(c); 780 | } else if(this._state === AFTER_STYLE_4){ 781 | this._stateAfterStyle4(c); 782 | } 783 | 784 | /* 785 | * entities 786 | */ 787 | else if(this._state === BEFORE_ENTITY){ 788 | this._stateBeforeEntity(c); 789 | } else if(this._state === BEFORE_NUMERIC_ENTITY){ 790 | this._stateBeforeNumericEntity(c); 791 | } else if(this._state === IN_NAMED_ENTITY){ 792 | this._stateInNamedEntity(c); 793 | } else if(this._state === IN_NUMERIC_ENTITY){ 794 | this._stateInNumericEntity(c); 795 | } else if(this._state === IN_HEX_ENTITY){ 796 | this._stateInHexEntity(c); 797 | } 798 | 799 | else { 800 | this._cbs.onerror(Error("unknown _state"), this._state); 801 | } 802 | 803 | this._index++; 804 | } 805 | 806 | this._cleanup(); 807 | }; 808 | 809 | Tokenizer.prototype.pause = function(){ 810 | this._running = false; 811 | }; 812 | Tokenizer.prototype.resume = function(){ 
813 | this._running = true; 814 | 815 | if(this._index < this._buffer.length){ 816 | this._parse(); 817 | } 818 | if(this._ended){ 819 | this._finish(); 820 | } 821 | }; 822 | 823 | Tokenizer.prototype.end = function(chunk){ 824 | if(this._ended) this._cbs.onerror(Error(".end() after done!")); 825 | if(chunk) this.write(chunk); 826 | 827 | this._ended = true; 828 | 829 | if(this._running) this._finish(); 830 | }; 831 | 832 | Tokenizer.prototype._finish = function(){ 833 | //if there is remaining data, emit it in a reasonable way 834 | if(this._sectionStart < this._index){ 835 | this._handleTrailingData(); 836 | } 837 | 838 | this._cbs.onend(); 839 | }; 840 | 841 | Tokenizer.prototype._handleTrailingData = function(){ 842 | var data = this._buffer.substr(this._sectionStart); 843 | 844 | if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){ 845 | this._cbs.oncdata(data); 846 | } else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){ 847 | this._cbs.oncomment(data); 848 | } else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){ 849 | this._parseLegacyEntity(); 850 | if(this._sectionStart < this._index){ 851 | this._state = this._baseState; 852 | this._handleTrailingData(); 853 | } 854 | } else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){ 855 | this._decodeNumericEntity(2, 10); 856 | if(this._sectionStart < this._index){ 857 | this._state = this._baseState; 858 | this._handleTrailingData(); 859 | } 860 | } else if(this._state === IN_HEX_ENTITY && !this._xmlMode){ 861 | this._decodeNumericEntity(3, 16); 862 | if(this._sectionStart < this._index){ 863 | this._state = this._baseState; 864 | this._handleTrailingData(); 865 | } 866 | } else if( 867 | this._state !== IN_TAG_NAME && 868 | this._state !== BEFORE_ATTRIBUTE_NAME && 869 | this._state !== BEFORE_ATTRIBUTE_VALUE && 870 | this._state !== AFTER_ATTRIBUTE_NAME && 871 | this._state !== IN_ATTRIBUTE_NAME && 872 
| this._state !== IN_ATTRIBUTE_VALUE_SQ && 873 | this._state !== IN_ATTRIBUTE_VALUE_DQ && 874 | this._state !== IN_ATTRIBUTE_VALUE_NQ && 875 | this._state !== IN_CLOSING_TAG_NAME 876 | ){ 877 | this._cbs.ontext(data); 878 | } 879 | //else, ignore remaining data 880 | //TODO add a way to remove current tag 881 | }; 882 | 883 | Tokenizer.prototype.reset = function(){ 884 | Tokenizer.call(this, {xmlMode: this._xmlMode, decodeEntities: this._decodeEntities}, this._cbs); 885 | }; 886 | 887 | Tokenizer.prototype.getAbsoluteIndex = function(){ 888 | return this._bufferOffset + this._index; 889 | }; 890 | 891 | Tokenizer.prototype._getSection = function(){ 892 | return this._buffer.substring(this._sectionStart, this._index); 893 | }; 894 | 895 | Tokenizer.prototype._emitToken = function(name){ 896 | this._cbs[name](this._getSection()); 897 | this._sectionStart = -1; 898 | }; 899 | 900 | Tokenizer.prototype._emitPartial = function(value){ 901 | if(this._baseState !== TEXT){ 902 | this._cbs.onattribdata(value); //TODO implement the new event 903 | } else { 904 | this._cbs.ontext(value); 905 | } 906 | }; 907 | --------------------------------------------------------------------------------