├── .gitignore ├── package.json ├── src ├── inherits.tsx ├── index.tsx ├── Parser.tsx └── Tokenizer.tsx ├── README.md └── tests └── index.spec.js /.gitignore: -------------------------------------------------------------------------------- 1 | *.diff 2 | *.patch 3 | *.bak 4 | .DS_Store 5 | Thumbs.db 6 | .project 7 | .*proj 8 | .svn 9 | *.swp 10 | *.swo 11 | *.pyc 12 | *.pyo 13 | .idea 14 | .prettierrc 15 | lib 16 | es 17 | yarn.lock 18 | node_modules 19 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mini-html-parser2", 3 | "version": "0.3.0", 4 | "description": "小程序富文本", 5 | "repository": { 6 | "type": "git", 7 | "url": "git@github.com:ant-mini-program/mini-html-parser.git" 8 | }, 9 | "files": [ 10 | "lib" 11 | ], 12 | "main": "lib/index.js", 13 | "scripts": { 14 | "test": "jest", 15 | "build": "rc-tools run compile", 16 | "pub": "git push origin && npm run build && npm publish" 17 | }, 18 | "keywords": [ 19 | "rich-text", 20 | "mini-program", 21 | "html-parser" 22 | ], 23 | "author": "issac.lj@alibaba-inc.com", 24 | "devDependencies": { 25 | "@types/node": "^11.9.6", 26 | "jest": "^23.6.0", 27 | "rc-tools": "^8.1.5" 28 | }, 29 | "dependencies": { 30 | "domhandler": "^2.4.2", 31 | "entities": "^1.1.1", 32 | "events": "^3.0.0" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/inherits.tsx: -------------------------------------------------------------------------------- 1 | if (typeof Object.create === 'function') { 2 | // implementation from standard node.js 'util' module 3 | module.exports = function inherits(ctor, superCtor) { 4 | ctor.super_ = superCtor 5 | ctor.prototype = Object.create(superCtor.prototype, { 6 | constructor: { 7 | value: ctor, 8 | enumerable: false, 9 | writable: true, 10 | configurable: true 11 | } 12 | }); 13 | }; 14 | } else { 15 | // old 
school shim for old browsers 16 | module.exports = function inherits(ctor, superCtor) { 17 | ctor.super_ = superCtor 18 | var TempCtor = function () {} 19 | TempCtor.prototype = superCtor.prototype 20 | ctor.prototype = new TempCtor() 21 | ctor.prototype.constructor = ctor 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/index.tsx: -------------------------------------------------------------------------------- 1 | import Handler from 'domhandler'; 2 | import Parser from './Parser'; 3 | 4 | function transformNode(node) { 5 | if (['tag', 'text'].indexOf(node.type) === -1) { 6 | throw new Error(`not supported name ${node.name} of type ${node.type}`); 7 | } 8 | if (node.type === 'text') { 9 | return { 10 | type: node.type, 11 | text: node.data, 12 | }; 13 | } 14 | return { 15 | name: node.name, 16 | children: transform(node.children), 17 | attrs: node.attribs, 18 | }; 19 | } 20 | 21 | function transform(nodes) { 22 | return nodes.map(transformNode); 23 | } 24 | 25 | export default function parse(html, done) { 26 | const handler = new Handler(function(err, children) { 27 | if (err) { 28 | console.error(err); 29 | done(err); 30 | } 31 | try { 32 | done(null, transform(children)); 33 | } catch (e) { 34 | console.error(e); 35 | done(e); 36 | } 37 | }, {}); 38 | 39 | const parser = new Parser(handler, { xmlMode: false }); 40 | 41 | parser.write(html); 42 | parser.done(); 43 | } 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mini-html-parser2 2 | 3 | > 支付宝小程序基础库在 2.8.5 及之后版本已在 组件中内置 HTML 解析能力:https://opendocs.alipay.com/mini/component/rich-text 4 | 5 | ## 安装 6 | 7 | ``` 8 | $ npm install mini-html-parser2 --save 9 | ``` 10 | 11 | ## 使用 12 | 13 | ```js 14 | // page.js 15 | const html = `
16 | test 17 |
18 | table test 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
titletitle
yyxxxxxx
35 |
36 |
` 37 | import parse from 'mini-html-parser2'; 38 | 39 | Page({ 40 | data: { 41 | nodes: [], 42 | }, 43 | onLoad() { 44 | parse(html, (err, nodes) => { 45 | if (!err) { 46 | this.setData({ 47 | nodes, 48 | }); 49 | } 50 | }) 51 | }, 52 | }) 53 | ``` 54 | 55 | ```html 56 | 57 | 58 | ``` 59 | 60 | ## 运行测试 61 | 62 | ``` 63 | $ npm run build 64 | $ npm test 65 | ``` 66 | -------------------------------------------------------------------------------- /tests/index.spec.js: -------------------------------------------------------------------------------- 1 | const parse = require('../lib'); 2 | 3 | describe('html parser', () => { 4 | it('basic', function () { 5 | const html = `test
xx yyy
`; 6 | parse(html, (err, nodes) => { 7 | expect(err).toBeFalsy(); 8 | expect(nodes).toEqual([ 9 | { type: "text", text: "test "}, 10 | { 11 | attrs: {}, 12 | name: 'div', 13 | children: [ 14 | { type: "text", text: "xx "}, 15 | { 16 | attrs: {}, 17 | name: 'span', 18 | children: [ 19 | { type: "text", text: "yyy"}, 20 | ] 21 | }, 22 | ] 23 | }, 24 | ]) 25 | }) 26 | }); 27 | it('support selfClose element', function () { 28 | const html = `` 29 | parse(html, (err, nodes) => { 30 | expect(err).toBeFalsy(); 31 | expect(nodes).toEqual([ 32 | { 33 | attrs: {}, 34 | name: 'input', 35 | children: [], 36 | }, 37 | ]); 38 | }) 39 | }); 40 | it('support attrs', function () { 41 | const html = `test`; 42 | parse(html, (err, nodes) => { 43 | expect(err).toBeFalsy(); 44 | expect(nodes).toEqual([ 45 | { 46 | attrs: { src: 'http://xxx.com/yyy.png', alt: 'test' }, 47 | name: 'img', 48 | children: [], 49 | }, 50 | ]) 51 | }) 52 | }); 53 | it('does not support directive', function () { 54 | const html = `The TitleHello world`; 55 | parse(html, (err) => { 56 | expect(err).toBeTruthy(); 57 | }) 58 | }); 59 | }); 60 | 61 | -------------------------------------------------------------------------------- /src/Parser.tsx: -------------------------------------------------------------------------------- 1 | var Tokenizer = require("./Tokenizer.js"); 2 | 3 | /* 4 | Options: 5 | 6 | xmlMode: Disables the special behavior for script/style tags (false by default) 7 | lowerCaseAttributeNames: call .toLowerCase for each attribute name (true if xmlMode is `false`) 8 | lowerCaseTags: call .toLowerCase for each tag name (true if xmlMode is `false`) 9 | */ 10 | 11 | /* 12 | Callbacks: 13 | 14 | oncdataend, 15 | oncdatastart, 16 | onclosetag, 17 | oncomment, 18 | oncommentend, 19 | onerror, 20 | onopentag, 21 | onprocessinginstruction, 22 | onreset, 23 | ontext 24 | */ 25 | 26 | var formTags = { 27 | input: true, 28 | option: true, 29 | optgroup: true, 30 | select: true, 31 | button: true, 32 | 
datalist: true, 33 | textarea: true 34 | }; 35 | 36 | var openImpliesClose = { 37 | tr: { tr: true, th: true, td: true }, 38 | th: { th: true }, 39 | td: { thead: true, th: true, td: true }, 40 | body: { head: true, link: true, script: true }, 41 | li: { li: true }, 42 | p: { p: true }, 43 | h1: { p: true }, 44 | h2: { p: true }, 45 | h3: { p: true }, 46 | h4: { p: true }, 47 | h5: { p: true }, 48 | h6: { p: true }, 49 | select: formTags, 50 | input: formTags, 51 | output: formTags, 52 | button: formTags, 53 | datalist: formTags, 54 | textarea: formTags, 55 | option: { option: true }, 56 | optgroup: { optgroup: true } 57 | }; 58 | 59 | var voidElements = { 60 | __proto__: null, 61 | area: true, 62 | base: true, 63 | basefont: true, 64 | br: true, 65 | col: true, 66 | command: true, 67 | embed: true, 68 | frame: true, 69 | hr: true, 70 | img: true, 71 | input: true, 72 | isindex: true, 73 | keygen: true, 74 | link: true, 75 | meta: true, 76 | param: true, 77 | source: true, 78 | track: true, 79 | wbr: true 80 | }; 81 | 82 | var foreignContextElements = { 83 | __proto__: null, 84 | math: true, 85 | svg: true 86 | }; 87 | var htmlIntegrationElements = { 88 | __proto__: null, 89 | mi: true, 90 | mo: true, 91 | mn: true, 92 | ms: true, 93 | mtext: true, 94 | "annotation-xml": true, 95 | foreignObject: true, 96 | desc: true, 97 | title: true 98 | }; 99 | 100 | var re_nameEnd = /\s|\//; 101 | 102 | function Parser(cbs, options) { 103 | this._options = options || {}; 104 | this._cbs = cbs || {}; 105 | 106 | this._tagname = ""; 107 | this._attribname = ""; 108 | this._attribvalue = ""; 109 | this._attribs = null; 110 | this._stack = []; 111 | this._foreignContext = []; 112 | 113 | this.startIndex = 0; 114 | this.endIndex = null; 115 | 116 | this._lowerCaseTagNames = 117 | "lowerCaseTags" in this._options 118 | ? !!this._options.lowerCaseTags 119 | : !this._options.xmlMode; 120 | this._lowerCaseAttributeNames = 121 | "lowerCaseAttributeNames" in this._options 122 | ? 
!!this._options.lowerCaseAttributeNames 123 | : !this._options.xmlMode; 124 | 125 | if (this._options.Tokenizer) { 126 | Tokenizer = this._options.Tokenizer; 127 | } 128 | this._tokenizer = new Tokenizer(this._options, this); 129 | 130 | if (this._cbs.onparserinit) this._cbs.onparserinit(this); 131 | } 132 | 133 | require("./inherits")(Parser, require("events").EventEmitter); 134 | 135 | Parser.prototype._updatePosition = function(initialOffset) { 136 | if (this.endIndex === null) { 137 | if (this._tokenizer._sectionStart <= initialOffset) { 138 | this.startIndex = 0; 139 | } else { 140 | this.startIndex = this._tokenizer._sectionStart - initialOffset; 141 | } 142 | } else this.startIndex = this.endIndex + 1; 143 | this.endIndex = this._tokenizer.getAbsoluteIndex(); 144 | }; 145 | 146 | //Tokenizer event handlers 147 | Parser.prototype.ontext = function(data) { 148 | this._updatePosition(1); 149 | this.endIndex--; 150 | 151 | if (this._cbs.ontext) this._cbs.ontext(data); 152 | }; 153 | 154 | Parser.prototype.onopentagname = function(name) { 155 | if (this._lowerCaseTagNames) { 156 | name = name.toLowerCase(); 157 | } 158 | 159 | this._tagname = name; 160 | 161 | if (!this._options.xmlMode && name in openImpliesClose) { 162 | for ( 163 | var el; 164 | (el = this._stack[this._stack.length - 1]) in 165 | openImpliesClose[name]; 166 | this.onclosetag(el) 167 | ); 168 | } 169 | 170 | if (this._options.xmlMode || !(name in voidElements)) { 171 | this._stack.push(name); 172 | if (name in foreignContextElements) this._foreignContext.push(true); 173 | else if (name in htmlIntegrationElements) 174 | this._foreignContext.push(false); 175 | } 176 | 177 | if (this._cbs.onopentagname) this._cbs.onopentagname(name); 178 | if (this._cbs.onopentag) this._attribs = {}; 179 | }; 180 | 181 | Parser.prototype.onopentagend = function() { 182 | this._updatePosition(1); 183 | 184 | if (this._attribs) { 185 | if (this._cbs.onopentag) 186 | this._cbs.onopentag(this._tagname, this._attribs); 
187 | this._attribs = null; 188 | } 189 | 190 | if ( 191 | !this._options.xmlMode && 192 | this._cbs.onclosetag && 193 | this._tagname in voidElements 194 | ) { 195 | this._cbs.onclosetag(this._tagname); 196 | } 197 | 198 | this._tagname = ""; 199 | }; 200 | 201 | Parser.prototype.onclosetag = function(name) { 202 | this._updatePosition(1); 203 | 204 | if (this._lowerCaseTagNames) { 205 | name = name.toLowerCase(); 206 | } 207 | 208 | if (name in foreignContextElements || name in htmlIntegrationElements) { 209 | this._foreignContext.pop(); 210 | } 211 | 212 | if ( 213 | this._stack.length && 214 | (!(name in voidElements) || this._options.xmlMode) 215 | ) { 216 | var pos = this._stack.lastIndexOf(name); 217 | if (pos !== -1) { 218 | if (this._cbs.onclosetag) { 219 | pos = this._stack.length - pos; 220 | while (pos--) this._cbs.onclosetag(this._stack.pop()); 221 | } else this._stack.length = pos; 222 | } else if (name === "p" && !this._options.xmlMode) { 223 | this.onopentagname(name); 224 | this._closeCurrentTag(); 225 | } 226 | } else if (!this._options.xmlMode && (name === "br" || name === "p")) { 227 | this.onopentagname(name); 228 | this._closeCurrentTag(); 229 | } 230 | }; 231 | 232 | Parser.prototype.onselfclosingtag = function() { 233 | if ( 234 | this._options.xmlMode || 235 | this._options.recognizeSelfClosing || 236 | this._foreignContext[this._foreignContext.length - 1] 237 | ) { 238 | this._closeCurrentTag(); 239 | } else { 240 | this.onopentagend(); 241 | } 242 | }; 243 | 244 | Parser.prototype._closeCurrentTag = function() { 245 | var name = this._tagname; 246 | 247 | this.onopentagend(); 248 | 249 | //self-closing tags will be on the top of the stack 250 | //(cheaper check than in onclosetag) 251 | if (this._stack[this._stack.length - 1] === name) { 252 | if (this._cbs.onclosetag) { 253 | this._cbs.onclosetag(name); 254 | } 255 | this._stack.pop(); 256 | 257 | } 258 | }; 259 | 260 | Parser.prototype.onattribname = function(name) { 261 | if 
(this._lowerCaseAttributeNames) { 262 | name = name.toLowerCase(); 263 | } 264 | this._attribname = name; 265 | }; 266 | 267 | Parser.prototype.onattribdata = function(value) { 268 | this._attribvalue += value; 269 | }; 270 | 271 | Parser.prototype.onattribend = function() { 272 | if (this._cbs.onattribute) 273 | this._cbs.onattribute(this._attribname, this._attribvalue); 274 | if ( 275 | this._attribs && 276 | !Object.prototype.hasOwnProperty.call(this._attribs, this._attribname) 277 | ) { 278 | this._attribs[this._attribname] = this._attribvalue; 279 | } 280 | this._attribname = ""; 281 | this._attribvalue = ""; 282 | }; 283 | 284 | Parser.prototype._getInstructionName = function(value) { 285 | var idx = value.search(re_nameEnd), 286 | name = idx < 0 ? value : value.substr(0, idx); 287 | 288 | if (this._lowerCaseTagNames) { 289 | name = name.toLowerCase(); 290 | } 291 | 292 | return name; 293 | }; 294 | 295 | Parser.prototype.ondeclaration = function(value) { 296 | if (this._cbs.onprocessinginstruction) { 297 | var name = this._getInstructionName(value); 298 | this._cbs.onprocessinginstruction("!" + name, "!" + value); 299 | } 300 | }; 301 | 302 | Parser.prototype.onprocessinginstruction = function(value) { 303 | if (this._cbs.onprocessinginstruction) { 304 | var name = this._getInstructionName(value); 305 | this._cbs.onprocessinginstruction("?" + name, "?" 
+ value); 306 | } 307 | }; 308 | 309 | Parser.prototype.oncomment = function(value) { 310 | this._updatePosition(4); 311 | 312 | if (this._cbs.oncomment) this._cbs.oncomment(value); 313 | if (this._cbs.oncommentend) this._cbs.oncommentend(); 314 | }; 315 | 316 | Parser.prototype.oncdata = function(value) { 317 | this._updatePosition(1); 318 | 319 | if (this._options.xmlMode || this._options.recognizeCDATA) { 320 | if (this._cbs.oncdatastart) this._cbs.oncdatastart(); 321 | if (this._cbs.ontext) this._cbs.ontext(value); 322 | if (this._cbs.oncdataend) this._cbs.oncdataend(); 323 | } else { 324 | this.oncomment("[CDATA[" + value + "]]"); 325 | } 326 | }; 327 | 328 | Parser.prototype.onerror = function(err) { 329 | if (this._cbs.onerror) this._cbs.onerror(err); 330 | }; 331 | 332 | Parser.prototype.onend = function() { 333 | if (this._cbs.onclosetag) { 334 | for ( 335 | var i = this._stack.length; 336 | i > 0; 337 | this._cbs.onclosetag(this._stack[--i]) 338 | ); 339 | } 340 | if (this._cbs.onend) this._cbs.onend(); 341 | }; 342 | 343 | //Resets the parser to a blank state, ready to parse a new HTML document 344 | Parser.prototype.reset = function() { 345 | if (this._cbs.onreset) this._cbs.onreset(); 346 | this._tokenizer.reset(); 347 | 348 | this._tagname = ""; 349 | this._attribname = ""; 350 | this._attribs = null; 351 | this._stack = []; 352 | 353 | if (this._cbs.onparserinit) this._cbs.onparserinit(this); 354 | }; 355 | 356 | //Parses a complete HTML document and pushes it to the handler 357 | Parser.prototype.parseComplete = function(data) { 358 | this.reset(); 359 | this.end(data); 360 | }; 361 | 362 | Parser.prototype.write = function(chunk) { 363 | this._tokenizer.write(chunk); 364 | }; 365 | 366 | Parser.prototype.end = function(chunk) { 367 | this._tokenizer.end(chunk); 368 | }; 369 | 370 | Parser.prototype.pause = function() { 371 | this._tokenizer.pause(); 372 | }; 373 | 374 | Parser.prototype.resume = function() { 375 | this._tokenizer.resume(); 376 | }; 
377 | 378 | //alias for backwards compat 379 | Parser.prototype.parseChunk = Parser.prototype.write; 380 | Parser.prototype.done = Parser.prototype.end; 381 | 382 | export default Parser; 383 | -------------------------------------------------------------------------------- /src/Tokenizer.tsx: -------------------------------------------------------------------------------- 1 | var decodeCodePoint = require("entities/lib/decode_codepoint.js"); 2 | var entityMap = {}; 3 | var legacyMap = require("entities/maps/legacy.json"); 4 | var xmlMap = require("entities/maps/xml.json"); 5 | 6 | var i = 0; 7 | 8 | var TEXT = i++; 9 | var BEFORE_TAG_NAME = i++; //after < 10 | var IN_TAG_NAME = i++; 11 | var IN_SELF_CLOSING_TAG = i++; 12 | var BEFORE_CLOSING_TAG_NAME = i++; 13 | var IN_CLOSING_TAG_NAME = i++; 14 | var AFTER_CLOSING_TAG_NAME = i++; 15 | 16 | //attributes 17 | var BEFORE_ATTRIBUTE_NAME = i++; 18 | var IN_ATTRIBUTE_NAME = i++; 19 | var AFTER_ATTRIBUTE_NAME = i++; 20 | var BEFORE_ATTRIBUTE_VALUE = i++; 21 | var IN_ATTRIBUTE_VALUE_DQ = i++; // " 22 | var IN_ATTRIBUTE_VALUE_SQ = i++; // ' 23 | var IN_ATTRIBUTE_VALUE_NQ = i++; 24 | 25 | //declarations 26 | var BEFORE_DECLARATION = i++; // ! 27 | var IN_DECLARATION = i++; 28 | 29 | //processing instructions 30 | var IN_PROCESSING_INSTRUCTION = i++; // ? 
// Tokenizer state ids, continued. Every constant takes the next value of the
// running counter `i`, so the numeric ids are fixed by declaration order:
// do not reorder or insert lines here.

//comments
var BEFORE_COMMENT = i++;
var IN_COMMENT = i++;
var AFTER_COMMENT_1 = i++;
var AFTER_COMMENT_2 = i++;

//cdata
var BEFORE_CDATA_1 = i++; // [
var BEFORE_CDATA_2 = i++; // C
var BEFORE_CDATA_3 = i++; // D
var BEFORE_CDATA_4 = i++; // A
var BEFORE_CDATA_5 = i++; // T
var BEFORE_CDATA_6 = i++; // A
var IN_CDATA = i++; // [
var AFTER_CDATA_1 = i++; // ]
var AFTER_CDATA_2 = i++; // ]

//special tags
var BEFORE_SPECIAL = i++; //S
var BEFORE_SPECIAL_END = i++; //S

var BEFORE_SCRIPT_1 = i++; //C
var BEFORE_SCRIPT_2 = i++; //R
var BEFORE_SCRIPT_3 = i++; //I
var BEFORE_SCRIPT_4 = i++; //P
var BEFORE_SCRIPT_5 = i++; //T
var AFTER_SCRIPT_1 = i++; //C
var AFTER_SCRIPT_2 = i++; //R
var AFTER_SCRIPT_3 = i++; //I
var AFTER_SCRIPT_4 = i++; //P
var AFTER_SCRIPT_5 = i++; //T

var BEFORE_STYLE_1 = i++; //T
var BEFORE_STYLE_2 = i++; //Y
var BEFORE_STYLE_3 = i++; //L
var BEFORE_STYLE_4 = i++; //E
var AFTER_STYLE_1 = i++; //T
var AFTER_STYLE_2 = i++; //Y
var AFTER_STYLE_3 = i++; //L
var AFTER_STYLE_4 = i++; //E

var BEFORE_ENTITY = i++; //&
var BEFORE_NUMERIC_ENTITY = i++; //#
var IN_NAMED_ENTITY = i++;
var IN_NUMERIC_ENTITY = i++;
var IN_HEX_ENTITY = i++; //X

// Second counter: which "special" tag (script/style) the tokenizer is inside.
var j = 0;

var SPECIAL_NONE = j++;
var SPECIAL_SCRIPT = j++;
var SPECIAL_STYLE = j++;

/**
 * Tells whether `c` is one of the five characters treated as whitespace
 * by the tokenizer: space, line feed, tab, form feed, carriage return.
 *
 * @param {string} c - the character to test
 * @returns {boolean} true when `c` is exactly one of those characters
 */
function whitespace(c) {
    switch (c) {
        case " ":
        case "\n":
        case "\t":
        case "\f":
        case "\r":
            return true;
        default:
            return false;
    }
}

/**
 * Builds a state handler that matches one expected character,
 * accepting both its upper- and lower-case form.
 *
 * @param {string} upper - the expected character (upper-case form)
 * @param {number} SUCCESS - state to enter when the character matches
 * @param {number} FAILURE - state to enter otherwise
 * @returns {function(string): void} handler meant to run with the tokenizer as `this`
 */
function ifElseState(upper, SUCCESS, FAILURE) {
    var lower = upper.toLowerCase();

    // When `upper` has no distinct lower-case form both comparisons are the same test.
    return function(c) {
        if (c === lower || c === upper) {
            this._state = SUCCESS;
            return;
        }
        this._state = FAILURE;
        this._index--; //step back so the FAILURE state sees this character again
    };
}

/**
 * Builds a state handler for one letter of a special tag name.
 * A mismatch falls back to the ordinary tag-name state.
 *
 * @param {string} upper - the expected letter (upper-case form)
 * @param {number} NEXT_STATE - state to enter when the letter matches
 * @returns {function(string): void} handler meant to run with the tokenizer as `this`
 */
function consumeSpecialNameChar(upper, NEXT_STATE) {
    var lower = upper.toLowerCase();

    return function(c) {
        if (c === lower || c === upper) {
            this._state = NEXT_STATE;
            return;
        }
        this._state = IN_TAG_NAME;
        this._index--; //consume the token again
    };
}

/**
 * Creates a tokenizer in the TEXT state with an empty buffer.
 *
 * @param {Object} [options] - only `xmlMode` and `decodeEntities` are read here
 * @param {Object} cbs - receiver of the tokenizer events (`ontext`, `onopentagend`, ...)
 */
function Tokenizer(options, cbs) {
    var opts = options || {};

    this._state = TEXT;
    this._buffer = "";
    this._sectionStart = 0;
    this._index = 0;
    this._bufferOffset = 0; //chars removed from _buffer
    this._baseState = TEXT;
    this._special = SPECIAL_NONE;
    this._cbs = cbs;
    this._running = true;
    this._ended = false;
    this._xmlMode = !!opts.xmlMode;
    this._decodeEntities = !!opts.decodeEntities;
}

// TEXT: a "<" opens a tag; a "&" opens an entity, but only when entity
// decoding is enabled and we are not inside a special tag.
Tokenizer.prototype._stateText = function(c) {
    var opensTag = c === "<";
    var opensEntity =
        !opensTag &&
        this._decodeEntities &&
        this._special === SPECIAL_NONE &&
        c === "&";

    if (!opensTag && !opensEntity) return;

    //report the text collected so far before switching state
    if (this._index > this._sectionStart) {
        this._cbs.ontext(this._getSection());
    }

    if (opensTag) {
        this._state = BEFORE_TAG_NAME;
    } else {
        this._baseState = TEXT;
        this._state = BEFORE_ENTITY;
    }
    this._sectionStart = this._index;
};

// Directly after "<": decide what kind of construct starts here.
Tokenizer.prototype._stateBeforeTagName = function(c) {
    if (c === "/") {
        this._state = BEFORE_CLOSING_TAG_NAME;
        return;
    }

    if (c === "<") {
        //the previous "<" is reported as text; a new section starts at this one
        this._cbs.ontext(this._getSection());
        this._sectionStart = this._index;
        return;
    }

    if (c === ">" || this._special !== SPECIAL_NONE || whitespace(c)) {
        this._state = TEXT;
        return;
    }

    if (c === "!" || c === "?") {
        this._state = c === "!" ? BEFORE_DECLARATION : IN_PROCESSING_INSTRUCTION;
        this._sectionStart = this._index + 1;
        return;
    }

    //outside xmlMode a leading "s" may start one of the special tags
    var maybeSpecial = !this._xmlMode && (c === "s" || c === "S");
    this._state = maybeSpecial ? BEFORE_SPECIAL : IN_TAG_NAME;
    this._sectionStart = this._index;
};

// Inside an opening tag name: "/", ">" or whitespace ends the name.
Tokenizer.prototype._stateInTagName = function(c) {
    if (c !== "/" && c !== ">" && !whitespace(c)) return;

    this._emitToken("onopentagname");
    this._state = BEFORE_ATTRIBUTE_NAME;
    this._index--; //let the next state handle the terminating character
};

// After "</": leading whitespace is skipped.
Tokenizer.prototype._stateBeforeCloseingTagName = function(c) {
    if (whitespace(c)) return;

    if (c === ">") {
        this._state = TEXT;
        return;
    }

    if (this._special === SPECIAL_NONE) {
        this._state = IN_CLOSING_TAG_NAME;
        this._sectionStart = this._index;
        return;
    }

    //inside a special tag only a name starting with "s" can end it
    if (c === "s" || c === "S") {
        this._state = BEFORE_SPECIAL_END;
    } else {
        this._state = TEXT;
        this._index--;
    }
};

// Inside a closing tag name: ">" or whitespace ends the name.
Tokenizer.prototype._stateInCloseingTagName = function(c) {
    if (c !== ">" && !whitespace(c)) return;

    this._emitToken("onclosetag");
    this._state = AFTER_CLOSING_TAG_NAME;
    this._index--;
};

Tokenizer.prototype._stateAfterCloseingTagName = function(c) {
    //skip everything until ">"
    if (c !== ">") return;

    this._state = TEXT;
    this._sectionStart = this._index + 1;
};

// Between attributes: ">" ends the tag, "/" may start a self-closing end,
// anything that is not whitespace starts an attribute name.
Tokenizer.prototype._stateBeforeAttributeName = function(c) {
    if (c === ">") {
        this._cbs.onopentagend();
        this._state = TEXT;
        this._sectionStart = this._index + 1;
        return;
    }

    if (c === "/") {
        this._state = IN_SELF_CLOSING_TAG;
        return;
    }

    if (whitespace(c)) return;

    this._state = IN_ATTRIBUTE_NAME;
    this._sectionStart = this._index;
};

// After "/" inside a tag: only ">" makes it a self-closing tag.
Tokenizer.prototype._stateInSelfClosingTag = function(c) {
    if (c === ">") {
        this._cbs.onselfclosingtag();
        this._state = TEXT;
        this._sectionStart = this._index + 1;
        return;
    }

    if (whitespace(c)) return;

    //not a self-closing tag after all: treat the character as attribute input
    this._state = BEFORE_ATTRIBUTE_NAME;
    this._index--;
};

// Inside an attribute name: "=", "/", ">" or whitespace ends the name.
Tokenizer.prototype._stateInAttributeName = function(c) {
    var endsName = c === "=" || c === "/" || c === ">" || whitespace(c);
    if (!endsName) return;

    this._cbs.onattribname(this._getSection());
    this._sectionStart = -1; //no open section until a value starts
    this._state = AFTER_ATTRIBUTE_NAME;
    this._index--;
};

// After an attribute name: "=" introduces a value; anything else except
// whitespace means the attribute had no value.
Tokenizer.prototype._stateAfterAttributeName = function(c) {
    if (c === "=") {
        this._state = BEFORE_ATTRIBUTE_VALUE;
        return;
    }

    if (c === "/" || c === ">") {
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
        this._index--;
        return;
    }

    if (whitespace(c)) return;

    //a new attribute name starts right here
    this._cbs.onattribend();
    this._state = IN_ATTRIBUTE_NAME;
    this._sectionStart = this._index;
};

// After "=": pick the value state from the quote character (or its absence).
Tokenizer.prototype._stateBeforeAttributeValue = function(c) {
    if (c === '"' || c === "'") {
        this._state = c === '"' ? IN_ATTRIBUTE_VALUE_DQ : IN_ATTRIBUTE_VALUE_SQ;
        this._sectionStart = this._index + 1; //value starts after the quote
        return;
    }

    if (whitespace(c)) return;

    this._state = IN_ATTRIBUTE_VALUE_NQ;
    this._sectionStart = this._index;
    this._index--; //reconsume token
};

// Double-quoted value: the closing quote ends the attribute.
Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function(c) {
    if (c === '"') {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
        return;
    }

    if (c === "&" && this._decodeEntities) {
        this._emitToken("onattribdata");
        this._baseState = this._state; //return here once the entity is done
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};

// Single-quoted value: the closing quote ends the attribute.
Tokenizer.prototype._stateInAttributeValueSingleQuotes = function(c) {
    if (c === "'") {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
        return;
    }

    if (c === "&" && this._decodeEntities) {
        this._emitToken("onattribdata");
        this._baseState = this._state; //return here once the entity is done
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};

// Unquoted value: whitespace or ">" ends the attribute.
Tokenizer.prototype._stateInAttributeValueNoQuotes = function(c) {
    if (c === ">" || whitespace(c)) {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
        this._index--; //the terminator is handled by the next state
        return;
    }

    if (c === "&" && this._decodeEntities) {
        this._emitToken("onattribdata");
        this._baseState = this._state; //return here once the entity is done
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};

// After "<!": "[" may start CDATA, "-" may start a comment,
// everything else is a declaration.
Tokenizer.prototype._stateBeforeDeclaration = function(c) {
    if (c === "[") {
        this._state = BEFORE_CDATA_1;
    } else if (c === "-") {
        this._state = BEFORE_COMMENT;
    } else {
        this._state = IN_DECLARATION;
    }
};

// Inside a declaration: ">" ends it.
Tokenizer.prototype._stateInDeclaration = function(c) {
    if (c !== ">") return;

    this._cbs.ondeclaration(this._getSection());
    this._state = TEXT;
    this._sectionStart = this._index + 1;
};

// Inside a processing instruction: ">" ends it.
Tokenizer.prototype._stateInProcessingInstruction = function(c) {
    if (c !== ">") return;

    this._cbs.onprocessinginstruction(this._getSection());
    this._state = TEXT;
    this._sectionStart = this._index + 1;
};

// After "<!-": a second "-" starts a comment, anything else is a declaration.
Tokenizer.prototype._stateBeforeComment = function(c) {
    if (c !== "-") {
        this._state = IN_DECLARATION;
        return;
    }

    this._state = IN_COMMENT;
    this._sectionStart = this._index + 1;
};

// Inside a comment: a "-" may be the start of the closing "-->".
Tokenizer.prototype._stateInComment = function(c) {
    if (c === "-") this._state = AFTER_COMMENT_1;
};

// One "-" seen inside a comment: a second one gets closer to the end,
// anything else returns to the comment body.
Tokenizer.prototype._stateAfterComment1 = function(c) {
    this._state = c === "-" ? AFTER_COMMENT_2 : IN_COMMENT;
};
// Two "-" seen inside a comment: ">" closes the comment, another "-" keeps
// waiting, any other character returns to the comment body.
Tokenizer.prototype._stateAfterComment2 = function(c) {
    if (c === "-") {
        // stay in AFTER_COMMENT_2 (`--->`)
        return;
    }

    if (c !== ">") {
        this._state = IN_COMMENT;
        return;
    }

    //remove 2 trailing chars
    var commentEnd = this._index - 2;
    this._cbs.oncomment(this._buffer.substring(this._sectionStart, commentEnd));
    this._state = TEXT;
    this._sectionStart = this._index + 1;
};

// After "<![": the letters C, D, A, T, A are matched one at a time, in either
// case; the first mismatch falls back to an ordinary declaration.
Tokenizer.prototype._stateBeforeCdata1 = ifElseState("C", BEFORE_CDATA_2, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata2 = ifElseState("D", BEFORE_CDATA_3, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata3 = ifElseState("A", BEFORE_CDATA_4, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata4 = ifElseState("T", BEFORE_CDATA_5, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata5 = ifElseState("A", BEFORE_CDATA_6, IN_DECLARATION);

// After "<![CDATA": only "[" actually opens the CDATA section.
Tokenizer.prototype._stateBeforeCdata6 = function(c) {
    if (c !== "[") {
        this._state = IN_DECLARATION;
        this._index--; //let the declaration state see this character
        return;
    }

    this._state = IN_CDATA;
    this._sectionStart = this._index + 1;
};

// Inside CDATA: a "]" may be the start of the closing "]]>".
Tokenizer.prototype._stateInCdata = function(c) {
    if (c === "]") this._state = AFTER_CDATA_1;
};

// One "]" seen: a second one gets closer to the end, anything else returns
// to the CDATA body.
Tokenizer.prototype._stateAfterCdata1 = function(c) {
    this._state = c === "]" ? AFTER_CDATA_2 : IN_CDATA;
};

// Two "]" seen: ">" closes the section, another "]" keeps waiting, any other
// character returns to the CDATA body.
Tokenizer.prototype._stateAfterCdata2 = function(c) {
    if (c === "]") {
        //stay in AFTER_CDATA_2 (`]]]>`)
        return;
    }

    if (c !== ">") {
        this._state = IN_CDATA;
        return;
    }

    //remove 2 trailing chars
    var cdataEnd = this._index - 2;
    this._cbs.oncdata(this._buffer.substring(this._sectionStart, cdataEnd));
    this._state = TEXT;
    this._sectionStart = this._index + 1;
};
// Tag name started with "s": "c" may continue "script", "t" may continue
// "style", anything else is an ordinary tag name.
Tokenizer.prototype._stateBeforeSpecial = function(c) {
    switch (c) {
        case "c":
        case "C":
            this._state = BEFORE_SCRIPT_1;
            break;
        case "t":
        case "T":
            this._state = BEFORE_STYLE_1;
            break;
        default:
            this._state = IN_TAG_NAME;
            this._index--; //consume the token again
    }
};

// Closing tag name started with "s" while inside a special tag: only the
// second letter of the tag we are currently in keeps the match going.
Tokenizer.prototype._stateBeforeSpecialEnd = function(c) {
    var next = TEXT;

    if (this._special === SPECIAL_SCRIPT) {
        if (c === "c" || c === "C") next = AFTER_SCRIPT_1;
    } else if (this._special === SPECIAL_STYLE) {
        if (c === "t" || c === "T") next = AFTER_STYLE_1;
    }

    this._state = next;
};

// Opening "script": remaining letters R, I, P, T.
Tokenizer.prototype._stateBeforeScript1 = consumeSpecialNameChar("R", BEFORE_SCRIPT_2);
Tokenizer.prototype._stateBeforeScript2 = consumeSpecialNameChar("I", BEFORE_SCRIPT_3);
Tokenizer.prototype._stateBeforeScript3 = consumeSpecialNameChar("P", BEFORE_SCRIPT_4);
Tokenizer.prototype._stateBeforeScript4 = consumeSpecialNameChar("T", BEFORE_SCRIPT_5);

// All of "script" matched: it only counts as the special tag when the name
// ends here. Either way the tag name itself is finished by IN_TAG_NAME.
Tokenizer.prototype._stateBeforeScript5 = function(c) {
    var nameEnds = c === "/" || c === ">" || whitespace(c);
    if (nameEnds) {
        this._special = SPECIAL_SCRIPT;
    }
    this._state = IN_TAG_NAME;
    this._index--; //consume the token again
};

// Closing "script": remaining letters R, I, P, T; a mismatch is plain text.
Tokenizer.prototype._stateAfterScript1 = ifElseState("R", AFTER_SCRIPT_2, TEXT);
Tokenizer.prototype._stateAfterScript2 = ifElseState("I", AFTER_SCRIPT_3, TEXT);
Tokenizer.prototype._stateAfterScript3 = ifElseState("P", AFTER_SCRIPT_4, TEXT);
Tokenizer.prototype._stateAfterScript4 = ifElseState("T", AFTER_SCRIPT_5, TEXT);

Tokenizer.prototype._stateAfterScript5 = function(c) {
    if (c !== ">" && !whitespace(c)) {
        this._state = TEXT;
        return;
    }

    this._special = SPECIAL_NONE;
    this._state = IN_CLOSING_TAG_NAME;
    this._sectionStart = this._index - 6; //rewind over the six letters of the name
    this._index--; //reconsume the token
};

// Opening "style": remaining letters Y, L, E.
Tokenizer.prototype._stateBeforeStyle1 = consumeSpecialNameChar("Y", BEFORE_STYLE_2);
Tokenizer.prototype._stateBeforeStyle2 = consumeSpecialNameChar("L", BEFORE_STYLE_3);
Tokenizer.prototype._stateBeforeStyle3 = consumeSpecialNameChar("E", BEFORE_STYLE_4);

// All of "style" matched: same rule as for "script".
Tokenizer.prototype._stateBeforeStyle4 = function(c) {
    var nameEnds = c === "/" || c === ">" || whitespace(c);
    if (nameEnds) {
        this._special = SPECIAL_STYLE;
    }
    this._state = IN_TAG_NAME;
    this._index--; //consume the token again
};

// Closing "style": remaining letters Y, L, E; a mismatch is plain text.
Tokenizer.prototype._stateAfterStyle1 = ifElseState("Y", AFTER_STYLE_2, TEXT);
Tokenizer.prototype._stateAfterStyle2 = ifElseState("L", AFTER_STYLE_3, TEXT);
Tokenizer.prototype._stateAfterStyle3 = ifElseState("E", AFTER_STYLE_4, TEXT);

Tokenizer.prototype._stateAfterStyle4 = function(c) {
    if (c !== ">" && !whitespace(c)) {
        this._state = TEXT;
        return;
    }

    this._special = SPECIAL_NONE;
    this._state = IN_CLOSING_TAG_NAME;
    this._sectionStart = this._index - 5; //rewind over the five letters of the name
    this._index--; //reconsume the token
};

// After "&": "#" starts a numeric entity, anything else a named one.
Tokenizer.prototype._stateBeforeEntity = ifElseState(
    "#",
    BEFORE_NUMERIC_ENTITY,
    IN_NAMED_ENTITY
);
// After "&#": "x"/"X" selects hexadecimal, anything else decimal.
Tokenizer.prototype._stateBeforeNumericEntity = ifElseState(
    "X",
    IN_HEX_ENTITY,
    IN_NUMERIC_ENTITY
);

//for entities terminated with a semicolon
Tokenizer.prototype._parseNamedEntityStrict = function() {
    var nameStart = this._sectionStart + 1; //offset = 1
    if (nameStart >= this._index) return; //empty name

    var entity = this._buffer.substring(nameStart, this._index);
    var map = this._xmlMode ? xmlMap : entityMap;

    if (map.hasOwnProperty(entity)) {
        this._emitPartial(map[entity]);
        this._sectionStart = this._index + 1;
    }
};

//parses legacy entities (without trailing semicolon)
Tokenizer.prototype._parseLegacyEntity = function() {
    var start = this._sectionStart + 1;
    var longest = this._index - start;

    if (longest > 6) longest = 6; //the max length of legacy entities is 6

    //try the longest candidate first; the min length of legacy entities is 2
    for (var len = longest; len >= 2; len--) {
        var entity = this._buffer.substr(start, len);

        if (legacyMap.hasOwnProperty(entity)) {
            this._emitPartial(legacyMap[entity]);
            this._sectionStart += len + 1;
            return;
        }
    }
};

// Inside a named entity: ";" terminates it; any character that is not an
// ASCII letter or digit ends it without a semicolon.
Tokenizer.prototype._stateInNamedEntity = function(c) {
    if (c === ";") {
        this._parseNamedEntityStrict();
        //nothing consumed by the strict lookup: fall back to the legacy table
        if (this._sectionStart + 1 < this._index && !this._xmlMode) {
            this._parseLegacyEntity();
        }
        this._state = this._baseState;
        return;
    }

    var isAlphaNumeric =
        (c >= "a" && c <= "z") ||
        (c >= "A" && c <= "Z") ||
        (c >= "0" && c <= "9");
    if (isAlphaNumeric) return; //still inside the name

    //xmlMode and an empty name ("&" followed directly by c) decode nothing
    if (!this._xmlMode && this._sectionStart + 1 !== this._index) {
        if (this._baseState === TEXT) {
            this._parseLegacyEntity();
        } else if (c !== "=") {
            //inside an attribute value, a name followed by "=" is left alone
            this._parseNamedEntityStrict();
        }
    }

    this._state = this._baseState;
    this._index--;
};

/**
 * Decodes the numeric entity between `_sectionStart + offset` and `_index`,
 * emits the resulting character and returns to the base state.
 *
 * @param {number} offset - characters to skip from `_sectionStart` to the first digit
 * @param {number} base - radix passed to `parseInt` (10 or 16 in this file)
 */
Tokenizer.prototype._decodeNumericEntity = function(offset, base) {
    var digitsStart = this._sectionStart + offset;

    if (digitsStart === this._index) {
        //no digits at all
        this._sectionStart--;
    } else {
        //parse entity
        var digits = this._buffer.substring(digitsStart, this._index);
        var codePoint = parseInt(digits, base);

        this._emitPartial(decodeCodePoint(codePoint));
        this._sectionStart = this._index;
    }

    this._state = this._baseState;
};

// Inside a decimal entity: offset 2 skips the "&#" in front of the digits.
Tokenizer.prototype._stateInNumericEntity = function(c) {
    if (c === ";") {
        this._decodeNumericEntity(2, 10);
        this._sectionStart++;
        return;
    }

    if (c >= "0" && c <= "9") return; //still inside the number

    if (this._xmlMode) {
        this._state = this._baseState;
    } else {
        this._decodeNumericEntity(2, 10);
    }
    this._index--;
};

// Inside a hexadecimal entity: offset 3 skips the "&#x" in front of the digits.
Tokenizer.prototype._stateInHexEntity = function(c) {
    if (c === ";") {
        this._decodeNumericEntity(3, 16);
        this._sectionStart++;
        return;
    }

    var isHexDigit =
        (c >= "a" && c <= "f") ||
        (c >= "A" && c <= "F") ||
        (c >= "0" && c <= "9");
    if (isHexDigit) return; //still inside the number

    if (this._xmlMode) {
        this._state = this._baseState;
    } else {
        this._decodeNumericEntity(3, 16);
    }
    this._index--;
};

// Drops the part of the buffer that is no longer needed and rebases
// `_index`, `_sectionStart` and `_bufferOffset` accordingly.
Tokenizer.prototype._cleanup = function() {
    var hasOpenSection = this._sectionStart >= 0;

    //a paused tokenizer keeps its buffer as long as a section is open
    if (hasOpenSection && !this._running) return;

    var keepTail = false;
    if (hasOpenSection) {
        if (this._state === TEXT) {
            //pending text is reported right away, then the buffer is dropped
            if (this._sectionStart !== this._index) {
                this._cbs.ontext(this._buffer.substr(this._sectionStart));
            }
        } else {
            //an unfinished section has to survive; one that just started does not
            keepTail = this._sectionStart !== this._index;
        }
    }

    if (keepTail) {
        //remove everything unnecessary
        this._buffer = this._buffer.substr(this._sectionStart);
        this._index -= this._sectionStart;
        this._bufferOffset += this._sectionStart;
    } else {
        this._buffer = "";
        this._bufferOffset += this._index;
        this._index = 0;
    }

    if (hasOpenSection) this._sectionStart = 0;
};

//TODO make events conditional
/**
 * Appends `chunk` to the buffer and runs the parse loop.
 * Writing after the tokenizer has ended reports an error through `onerror`
 * but the chunk is still buffered and parsed.
 *
 * @param {string} chunk - input to append
 */
Tokenizer.prototype.write = function(chunk) {
    if (this._ended) {
        this._cbs.onerror(new Error(".write() after done!"));
    }

    this._buffer += chunk;
    this._parse();
};
Tokenizer.prototype._parse = function() { 701 | while (this._index < this._buffer.length && this._running) { 702 | var c = this._buffer.charAt(this._index); 703 | if (this._state === TEXT) { 704 | this._stateText(c); 705 | } else if (this._state === BEFORE_TAG_NAME) { 706 | this._stateBeforeTagName(c); 707 | } else if (this._state === IN_TAG_NAME) { 708 | this._stateInTagName(c); 709 | } else if (this._state === BEFORE_CLOSING_TAG_NAME) { 710 | this._stateBeforeCloseingTagName(c); 711 | } else if (this._state === IN_CLOSING_TAG_NAME) { 712 | this._stateInCloseingTagName(c); 713 | } else if (this._state === AFTER_CLOSING_TAG_NAME) { 714 | this._stateAfterCloseingTagName(c); 715 | } else if (this._state === IN_SELF_CLOSING_TAG) { 716 | this._stateInSelfClosingTag(c); 717 | } else if (this._state === BEFORE_ATTRIBUTE_NAME) { 718 | 719 | /* 720 | * attributes 721 | */ 722 | this._stateBeforeAttributeName(c); 723 | } else if (this._state === IN_ATTRIBUTE_NAME) { 724 | this._stateInAttributeName(c); 725 | } else if (this._state === AFTER_ATTRIBUTE_NAME) { 726 | this._stateAfterAttributeName(c); 727 | } else if (this._state === BEFORE_ATTRIBUTE_VALUE) { 728 | this._stateBeforeAttributeValue(c); 729 | } else if (this._state === IN_ATTRIBUTE_VALUE_DQ) { 730 | this._stateInAttributeValueDoubleQuotes(c); 731 | } else if (this._state === IN_ATTRIBUTE_VALUE_SQ) { 732 | this._stateInAttributeValueSingleQuotes(c); 733 | } else if (this._state === IN_ATTRIBUTE_VALUE_NQ) { 734 | this._stateInAttributeValueNoQuotes(c); 735 | } else if (this._state === BEFORE_DECLARATION) { 736 | 737 | /* 738 | * declarations 739 | */ 740 | this._stateBeforeDeclaration(c); 741 | } else if (this._state === IN_DECLARATION) { 742 | this._stateInDeclaration(c); 743 | } else if (this._state === IN_PROCESSING_INSTRUCTION) { 744 | 745 | /* 746 | * processing instructions 747 | */ 748 | this._stateInProcessingInstruction(c); 749 | } else if (this._state === BEFORE_COMMENT) { 750 | 751 | /* 752 | * comments 
753 | */ 754 | this._stateBeforeComment(c); 755 | } else if (this._state === IN_COMMENT) { 756 | this._stateInComment(c); 757 | } else if (this._state === AFTER_COMMENT_1) { 758 | this._stateAfterComment1(c); 759 | } else if (this._state === AFTER_COMMENT_2) { 760 | this._stateAfterComment2(c); 761 | } else if (this._state === BEFORE_CDATA_1) { 762 | 763 | /* 764 | * cdata 765 | */ 766 | this._stateBeforeCdata1(c); 767 | } else if (this._state === BEFORE_CDATA_2) { 768 | this._stateBeforeCdata2(c); 769 | } else if (this._state === BEFORE_CDATA_3) { 770 | this._stateBeforeCdata3(c); 771 | } else if (this._state === BEFORE_CDATA_4) { 772 | this._stateBeforeCdata4(c); 773 | } else if (this._state === BEFORE_CDATA_5) { 774 | this._stateBeforeCdata5(c); 775 | } else if (this._state === BEFORE_CDATA_6) { 776 | this._stateBeforeCdata6(c); 777 | } else if (this._state === IN_CDATA) { 778 | this._stateInCdata(c); 779 | } else if (this._state === AFTER_CDATA_1) { 780 | this._stateAfterCdata1(c); 781 | } else if (this._state === AFTER_CDATA_2) { 782 | this._stateAfterCdata2(c); 783 | } else if (this._state === BEFORE_SPECIAL) { 784 | 785 | /* 786 | * special tags 787 | */ 788 | this._stateBeforeSpecial(c); 789 | } else if (this._state === BEFORE_SPECIAL_END) { 790 | this._stateBeforeSpecialEnd(c); 791 | } else if (this._state === BEFORE_SCRIPT_1) { 792 | 793 | /* 794 | * script 795 | */ 796 | this._stateBeforeScript1(c); 797 | } else if (this._state === BEFORE_SCRIPT_2) { 798 | this._stateBeforeScript2(c); 799 | } else if (this._state === BEFORE_SCRIPT_3) { 800 | this._stateBeforeScript3(c); 801 | } else if (this._state === BEFORE_SCRIPT_4) { 802 | this._stateBeforeScript4(c); 803 | } else if (this._state === BEFORE_SCRIPT_5) { 804 | this._stateBeforeScript5(c); 805 | } else if (this._state === AFTER_SCRIPT_1) { 806 | this._stateAfterScript1(c); 807 | } else if (this._state === AFTER_SCRIPT_2) { 808 | this._stateAfterScript2(c); 809 | } else if (this._state === 
AFTER_SCRIPT_3) { 810 | this._stateAfterScript3(c); 811 | } else if (this._state === AFTER_SCRIPT_4) { 812 | this._stateAfterScript4(c); 813 | } else if (this._state === AFTER_SCRIPT_5) { 814 | this._stateAfterScript5(c); 815 | } else if (this._state === BEFORE_STYLE_1) { 816 | 817 | /* 818 | * style 819 | */ 820 | this._stateBeforeStyle1(c); 821 | } else if (this._state === BEFORE_STYLE_2) { 822 | this._stateBeforeStyle2(c); 823 | } else if (this._state === BEFORE_STYLE_3) { 824 | this._stateBeforeStyle3(c); 825 | } else if (this._state === BEFORE_STYLE_4) { 826 | this._stateBeforeStyle4(c); 827 | } else if (this._state === AFTER_STYLE_1) { 828 | this._stateAfterStyle1(c); 829 | } else if (this._state === AFTER_STYLE_2) { 830 | this._stateAfterStyle2(c); 831 | } else if (this._state === AFTER_STYLE_3) { 832 | this._stateAfterStyle3(c); 833 | } else if (this._state === AFTER_STYLE_4) { 834 | this._stateAfterStyle4(c); 835 | } else if (this._state === BEFORE_ENTITY) { 836 | 837 | /* 838 | * entities 839 | */ 840 | this._stateBeforeEntity(c); 841 | } else if (this._state === BEFORE_NUMERIC_ENTITY) { 842 | this._stateBeforeNumericEntity(c); 843 | } else if (this._state === IN_NAMED_ENTITY) { 844 | this._stateInNamedEntity(c); 845 | } else if (this._state === IN_NUMERIC_ENTITY) { 846 | this._stateInNumericEntity(c); 847 | } else if (this._state === IN_HEX_ENTITY) { 848 | this._stateInHexEntity(c); 849 | } else { 850 | this._cbs.onerror(Error("unknown _state"), this._state); 851 | } 852 | 853 | this._index++; 854 | } 855 | 856 | this._cleanup(); 857 | }; 858 | 859 | Tokenizer.prototype.pause = function() { 860 | this._running = false; 861 | }; 862 | Tokenizer.prototype.resume = function() { 863 | this._running = true; 864 | 865 | if (this._index < this._buffer.length) { 866 | this._parse(); 867 | } 868 | if (this._ended) { 869 | this._finish(); 870 | } 871 | }; 872 | 873 | Tokenizer.prototype.end = function(chunk) { 874 | if (this._ended) this._cbs.onerror(Error(".end() 
after done!")); 875 | if (chunk) this.write(chunk); 876 | 877 | this._ended = true; 878 | 879 | if (this._running) this._finish(); 880 | }; 881 | 882 | Tokenizer.prototype._finish = function() { 883 | //if there is remaining data, emit it in a reasonable way 884 | if (this._sectionStart < this._index) { 885 | this._handleTrailingData(); 886 | } 887 | 888 | this._cbs.onend(); 889 | }; 890 | 891 | Tokenizer.prototype._handleTrailingData = function() { 892 | var data = this._buffer.substr(this._sectionStart); 893 | 894 | if ( 895 | this._state === IN_CDATA || 896 | this._state === AFTER_CDATA_1 || 897 | this._state === AFTER_CDATA_2 898 | ) { 899 | this._cbs.oncdata(data); 900 | } else if ( 901 | this._state === IN_COMMENT || 902 | this._state === AFTER_COMMENT_1 || 903 | this._state === AFTER_COMMENT_2 904 | ) { 905 | this._cbs.oncomment(data); 906 | } else if (this._state === IN_NAMED_ENTITY && !this._xmlMode) { 907 | this._parseLegacyEntity(); 908 | if (this._sectionStart < this._index) { 909 | this._state = this._baseState; 910 | this._handleTrailingData(); 911 | } 912 | } else if (this._state === IN_NUMERIC_ENTITY && !this._xmlMode) { 913 | this._decodeNumericEntity(2, 10); 914 | if (this._sectionStart < this._index) { 915 | this._state = this._baseState; 916 | this._handleTrailingData(); 917 | } 918 | } else if (this._state === IN_HEX_ENTITY && !this._xmlMode) { 919 | this._decodeNumericEntity(3, 16); 920 | if (this._sectionStart < this._index) { 921 | this._state = this._baseState; 922 | this._handleTrailingData(); 923 | } 924 | } else if ( 925 | this._state !== IN_TAG_NAME && 926 | this._state !== BEFORE_ATTRIBUTE_NAME && 927 | this._state !== BEFORE_ATTRIBUTE_VALUE && 928 | this._state !== AFTER_ATTRIBUTE_NAME && 929 | this._state !== IN_ATTRIBUTE_NAME && 930 | this._state !== IN_ATTRIBUTE_VALUE_SQ && 931 | this._state !== IN_ATTRIBUTE_VALUE_DQ && 932 | this._state !== IN_ATTRIBUTE_VALUE_NQ && 933 | this._state !== IN_CLOSING_TAG_NAME 934 | ) { 935 | 
this._cbs.ontext(data); 936 | } 937 | //else, ignore remaining data 938 | //TODO add a way to remove current tag 939 | }; 940 | 941 | Tokenizer.prototype.reset = function() { 942 | Tokenizer.call( 943 | this, 944 | { xmlMode: this._xmlMode, decodeEntities: this._decodeEntities }, 945 | this._cbs 946 | ); 947 | }; 948 | 949 | Tokenizer.prototype.getAbsoluteIndex = function() { 950 | return this._bufferOffset + this._index; 951 | }; 952 | 953 | Tokenizer.prototype._getSection = function() { 954 | return this._buffer.substring(this._sectionStart, this._index); 955 | }; 956 | 957 | Tokenizer.prototype._emitToken = function(name) { 958 | this._cbs[name](this._getSection()); 959 | this._sectionStart = -1; 960 | }; 961 | 962 | Tokenizer.prototype._emitPartial = function(value) { 963 | if (this._baseState !== TEXT) { 964 | this._cbs.onattribdata(value); //TODO implement the new event 965 | } else { 966 | this._cbs.ontext(value); 967 | } 968 | }; 969 | 970 | module.exports = Tokenizer; 971 | --------------------------------------------------------------------------------