├── .travis.yml ├── test ├── mocha.opts ├── html.js └── html │ └── incomplete-script ├── .gitignore ├── LICENSE ├── package.json ├── README.md └── index.js /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "0.11" 4 | - "0.10" 5 | -------------------------------------------------------------------------------- /test/mocha.opts: -------------------------------------------------------------------------------- 1 | --require blanket 2 | --require should 3 | --reporter spec 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | lib-cov 2 | *.seed 3 | *.log 4 | *.csv 5 | *.dat 6 | *.out 7 | *.pid 8 | *.gz 9 | 10 | pids 11 | logs 12 | results 13 | 14 | npm-debug.log 15 | node_modules 16 | bower_components 17 | 18 | .* 19 | !.gitignore 20 | 21 | *.sublime-* 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 ashi009 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "fast-html-parser", 3 | "version": "1.0.1", 4 | "description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "mocha", 8 | "posttest": "mocha -R travis-cov", 9 | "coverage": "mocha -R html-cov > coverage.html" 10 | }, 11 | "author": "Xiaoyi Shi ", 12 | "license": "MIT", 13 | "dependencies": { 14 | "apollojs": "^1.3.0", 15 | "entities": "^1.1.1" 16 | }, 17 | "devDependencies": { 18 | "mocha": "^1", 19 | "should": "*", 20 | "blanket": "*", 21 | "travis-cov": "*" 22 | }, 23 | "config": { 24 | "blanket": { 25 | "pattern": "index.js", 26 | "data-cover-never": ["node_modules"] 27 | }, 28 | "travis-cov": { 29 | "threshold": 70 30 | } 31 | }, 32 | "directories": { 33 | "test": "test" 34 | }, 35 | "repository": { 36 | "type": "git", 37 | "url": "https://github.com/ashi009/node-fast-html-parser.git" 38 | }, 39 | "bugs": { 40 | "url": "https://github.com/ashi009/node-fast-html-parser/issues" 41 | }, 42 | "homepage": "https://github.com/ashi009/node-fast-html-parser" 43 | } 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fast HTML Parser [![NPM 
version](https://badge.fury.io/js/fast-html-parser.png)](http://badge.fury.io/js/fast-html-parser) [![Build Status](https://travis-ci.org/ashi009/node-fast-html-parser.svg?branch=master)](https://travis-ci.org/ashi009/node-fast-html-parser) ![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg) 2 | 3 | Fast HTML Parser is a _very fast_ HTML parser that generates a simplified 4 | DOM tree with basic element query support. 5 | 6 | By design, it aims to parse massive HTML files at the lowest cost, so 7 | performance is the top priority. For this reason, some malformed HTML may not 8 | be parsed correctly, but most common errors are covered (e.g. HTML4 style 9 | no closing `
  • `, `` etc). 10 | 11 | ## Install 12 | 13 | ```shell 14 | npm install --save fast-html-parser 15 | ``` 16 | 17 | ## Performance 18 | 19 | Faster than htmlparser2! 20 | 21 | ```shell 22 | fast-html-parser: 2.18409 ms/file ± 1.37431 23 | high5 : 4.55435 ms/file ± 2.51132 24 | htmlparser : 27.6920 ms/file ± 171.588 25 | htmlparser2-dom : 6.22320 ms/file ± 3.48772 26 | htmlparser2 : 3.58360 ms/file ± 2.23658 27 | hubbub : 16.1774 ms/file ± 8.95079 28 | libxmljs : 7.19406 ms/file ± 7.04495 29 | parse5 : 10.7590 ms/file ± 8.09687 30 | ``` 31 | 32 | Tested with [htmlparser-benchmark](https://github.com/AndreasMadsen/htmlparser-benchmark). 33 | 34 | ## Usage 35 | 36 | ```js 37 | var HTMLParser = require('fast-html-parser'); 38 | 39 | var root = HTMLParser.parse(''); 40 | 41 | console.log(root.firstChild.structure); 42 | // ul#list 43 | // li 44 | // #text 45 | 46 | console.log(root.querySelector('#list')); 47 | // { tagName: 'ul', 48 | // rawAttrs: 'id="list"', 49 | // childNodes: 50 | // [ { tagName: 'li', 51 | // rawAttrs: '', 52 | // childNodes: [Object], 53 | // classNames: [] } ], 54 | // id: 'list', 55 | // classNames: [] } 56 | ``` 57 | 58 | ## API 59 | 60 | ### parse(data[, options]) 61 | 62 | Parse given data, and return root of the generated DOM. 
63 | 64 | - **data**, data to parse 65 | - **options**, parse options 66 | 67 | ```js 68 | { 69 | lowerCaseTagName: false, // convert tag name to lower case (hurt performance heavily) 70 | script: false, // retrieve content in '); 114 | 115 | root.firstChild.childNodes.should.be.empty; 116 | root.lastChild.childNodes.should.be.empty; 117 | 118 | }); 119 | 120 | it('should extract text in script and style when ask so', function() { 121 | 122 | var root = parseHTML('', { 123 | script: true, 124 | style: true 125 | }); 126 | 127 | root.firstChild.childNodes.should.not.be.empty; 128 | root.firstChild.childNodes.should.eql([new TextNode('1')]); 129 | root.firstChild.text.should.eql('1'); 130 | root.lastChild.childNodes.should.not.be.empty; 131 | root.lastChild.childNodes.should.eql([new TextNode('2&')]); 132 | root.lastChild.text.should.eql('2&'); 133 | root.lastChild.rawText.should.eql('2&'); 134 | }); 135 | 136 | it('should be able to parse "html/incomplete-script" file', function() { 137 | 138 | var root = parseHTML(fs.readFileSync(__dirname + '/html/incomplete-script').toString(), { 139 | script: true 140 | }); 141 | 142 | }); 143 | 144 | it('should parse "

    .." very fast', function() { 145 | 146 | for (var i = 0; i < 100; i++) 147 | parseHTML('

    '); 148 | 149 | }); 150 | 151 | it('should parse "

    .." fast', function() { 152 | 153 | for (var i = 0; i < 100; i++) 154 | parseHTML('

    ', { 155 | lowerCaseTagName: true 156 | }); 157 | 158 | }); 159 | 160 | }); 161 | 162 | describe('TextNode', function() { 163 | 164 | describe('#isWhitespace', function() { 165 | var node = new TextNode(''); 166 | node.isWhitespace.should.be.ok; 167 | node = new TextNode(' \t'); 168 | node.isWhitespace.should.be.ok; 169 | node = new TextNode(' \t  \t'); 170 | node.isWhitespace.should.be.ok; 171 | }); 172 | 173 | }); 174 | 175 | describe('HTMLElement', function() { 176 | 177 | describe('#removeWhitespace()', function() { 178 | 179 | it('should remove whitespaces while preserving nodes with content', function() { 180 | 181 | var root = parseHTML('

    \r \n \t

    123

    '); 182 | 183 | var p = new HTMLElement('p', {}, ''); 184 | p.appendChild(new HTMLElement('h5', {}, '')) 185 | .appendChild(new TextNode('123')); 186 | 187 | root.firstChild.removeWhitespace().should.eql(p); 188 | 189 | }); 190 | 191 | }); 192 | 193 | describe('#rawAttributes', function() { 194 | 195 | it('should return escaped attributes of the element', function() { 196 | 197 | var root = parseHTML('

    '); 198 | 199 | root.firstChild.rawAttributes.should.eql({ 200 | 'a': '12', 201 | 'data-id': '!$$&', 202 | 'yAz': '1' 203 | }); 204 | 205 | }); 206 | 207 | }); 208 | 209 | describe('#attributes', function() { 210 | 211 | it('should return attributes of the element', function() { 212 | 213 | var root = parseHTML('

    '); 214 | 215 | root.firstChild.attributes.should.eql({ 216 | 'a': '12', 217 | 'data-id': '!$$&', 218 | 'yAz': '1' 219 | }); 220 | 221 | }); 222 | 223 | }); 224 | 225 | describe('#querySelectorAll()', function() { 226 | 227 | it('should return correct elements in DOM tree', function() { 228 | 229 | var root = parseHTML('
    '); 230 | 231 | root.querySelectorAll('#id').should.eql([root.firstChild]); 232 | root.querySelectorAll('span.a').should.eql([root.firstChild.firstChild.firstChild]); 233 | root.querySelectorAll('span.b').should.eql([root.firstChild.firstChild.firstChild]); 234 | root.querySelectorAll('span.a.b').should.eql([root.firstChild.firstChild.firstChild]); 235 | root.querySelectorAll('#id .b').should.eql([root.firstChild.firstChild.firstChild]); 236 | root.querySelectorAll('#id span').should.eql(root.firstChild.firstChild.childNodes); 237 | 238 | }); 239 | 240 | }); 241 | 242 | describe('#structuredText', function() { 243 | 244 | it('should return correct structured text', function() { 245 | 246 | var root = parseHTML('o

    a

    b

    c
    '); 247 | root.structuredText.should.eql('o\na\nb\nc'); 248 | 249 | }); 250 | 251 | }); 252 | 253 | }); 254 | 255 | }); 256 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | require('apollojs'); 2 | 3 | var entities = require('entities'); 4 | 5 | /** 6 | * Node Class as base class for TextNode and HTMLElement. 7 | */ 8 | function Node() { 9 | 10 | } 11 | $declare(Node, { 12 | 13 | }); 14 | $defenum(Node, { 15 | ELEMENT_NODE: 1, 16 | TEXT_NODE: 3 17 | }); 18 | 19 | /** 20 | * TextNode to contain a text element in DOM tree. 21 | * @param {string} value [description] 22 | */ 23 | function TextNode(value) { 24 | this.rawText = value; 25 | } 26 | $inherit(TextNode, Node, { 27 | 28 | /** 29 | * Node Type declaration. 30 | * @type {Number} 31 | */ 32 | nodeType: Node.TEXT_NODE, 33 | 34 | /** 35 | * Get unescaped text value of current node and its children. 36 | * @return {string} text content 37 | */ 38 | get text() { 39 | return entities.decodeHTML5(this.rawText); 40 | }, 41 | 42 | /** 43 | * Detect if the node contains only white space. 44 | * @return {bool} 45 | */ 46 | get isWhitespace() { 47 | return /^(\s| )*$/.test(this.rawText); 48 | } 49 | 50 | }); 51 | 52 | var kBlockElements = { 53 | div: true, 54 | p: true, 55 | // ul: true, 56 | // ol: true, 57 | li: true, 58 | // table: true, 59 | // tr: true, 60 | td: true, 61 | section: true, 62 | br: true 63 | }; 64 | 65 | /** 66 | * HTMLElement, which contains a set of children. 67 | * Note: this is a minimalist implementation, no complete tree 68 | * structure provided (no parentNode, nextSibling, 69 | * previousSibling etc). 
70 | * @param {string} name tagName 71 | * @param {Object} keyAttrs id and class attribute 72 | * @param {string} rawAttrs raw attribute string 73 | */ 74 | function HTMLElement(name, keyAttrs, rawAttrs) { 75 | this.tagName = name; 76 | this.rawAttrs = rawAttrs || ''; 77 | // this.parentNode = null; 78 | this.childNodes = []; 79 | if (keyAttrs.id) 80 | this.id = keyAttrs.id; 81 | if (keyAttrs.class) 82 | this.classNames = keyAttrs.class.split(/\s+/); 83 | else 84 | this.classNames = []; 85 | } 86 | $inherit(HTMLElement, Node, { 87 | 88 | /** 89 | * Node Type declaration. 90 | * @type {Number} 91 | */ 92 | nodeType: Node.ELEMENT_NODE, 93 | 94 | /** 95 | * Get unescaped text value of current node and its children. 96 | * @return {string} text content 97 | */ 98 | get text() { 99 | return entities.decodeHTML5(this.rawText); 100 | }, 101 | 102 | /** 103 | * Get escaped (as-is) text value of current node and its children. 104 | * @return {string} text content 105 | */ 106 | get rawText() { 107 | var res = ''; 108 | for (var i = 0; i < this.childNodes.length; i++) 109 | res += this.childNodes[i].rawText; 110 | return res; 111 | }, 112 | 113 | /** 114 | * Get structured Text (with '\n' etc.) 
115 | * @return {string} structured text 116 | */ 117 | get structuredText() { 118 | var currentBlock = []; 119 | var blocks = [currentBlock]; 120 | function dfs(node) { 121 | if (node.nodeType === Node.ELEMENT_NODE) { 122 | if (kBlockElements[node.tagName]) { 123 | if (currentBlock.length > 0) 124 | blocks.push(currentBlock = []); 125 | node.childNodes.forEach(dfs); 126 | if (currentBlock.length > 0) 127 | blocks.push(currentBlock = []); 128 | } else { 129 | node.childNodes.forEach(dfs); 130 | } 131 | } else if (node.nodeType === Node.TEXT_NODE) { 132 | if (node.isWhitespace) { 133 | // Whitespace node, postponed output 134 | currentBlock.prependWhitespace = true; 135 | } else { 136 | var text = node.text; 137 | if (currentBlock.prependWhitespace) { 138 | text = ' ' + text; 139 | currentBlock.prependWhitespace = false; 140 | } 141 | currentBlock.push(text); 142 | } 143 | } 144 | } 145 | dfs(this); 146 | return blocks 147 | .map(function(block) { 148 | // Normalize each line's whitespace 149 | return block.join('').trim().replace(/\s{2,}/g, ' '); 150 | }) 151 | .join('\n').trimRight(); 152 | }, 153 | 154 | /** 155 | * Trim element from right (in block) after seeing pattern in a TextNode. 156 | * @param {RegExp} pattern pattern to find 157 | * @return {HTMLElement} reference to current node 158 | */ 159 | trimRight: function(pattern) { 160 | function dfs(node) { 161 | for (var i = 0; i < node.childNodes.length; i++) { 162 | var childNode = node.childNodes[i]; 163 | if (childNode.nodeType === Node.ELEMENT_NODE) { 164 | dfs(childNode); 165 | } else { 166 | var index = childNode.rawText.search(pattern); 167 | if (index > -1) { 168 | childNode.rawText = childNode.rawText.substr(0, index); 169 | // trim all following nodes. 
170 | node.childNodes.length = i+1; 171 | } 172 | } 173 | } 174 | } 175 | dfs(this); 176 | return this; 177 | }, 178 | 179 | /** 180 | * Get DOM structure 181 | * @return {string} structure 182 | */ 183 | get structure() { 184 | var res = []; 185 | var indention = 0; 186 | function write(str) { 187 | res.push(' '.repeat(indention) + str); 188 | } 189 | function dfs(node) { 190 | var idStr = node.id ? ('#' + node.id) : ''; 191 | var classStr = node.classNames.length ? ('.' + node.classNames.join('.')) : ''; 192 | write(node.tagName + idStr + classStr); 193 | indention++; 194 | for (var i = 0; i < node.childNodes.length; i++) { 195 | var childNode = node.childNodes[i]; 196 | if (childNode.nodeType === Node.ELEMENT_NODE) { 197 | dfs(childNode); 198 | } else if (childNode.nodeType === Node.TEXT_NODE) { 199 | if (!childNode.isWhitespace) 200 | write('#text'); 201 | } 202 | } 203 | indention--; 204 | } 205 | dfs(this); 206 | return res.join('\n'); 207 | }, 208 | 209 | /** 210 | * Remove whitespace in this subtree. 211 | * @return {HTMLElement} pointer to this 212 | */ 213 | removeWhitespace: function() { 214 | var i = 0, o = 0; 215 | for (; i < this.childNodes.length; i++) { 216 | var node = this.childNodes[i]; 217 | if (node.nodeType === Node.TEXT_NODE) { 218 | if (node.isWhitespace) 219 | continue; 220 | node.rawText = node.rawText.trim(); 221 | } else if (node.nodeType === Node.ELEMENT_NODE) { 222 | node.removeWhitespace(); 223 | } 224 | this.childNodes[o++] = node; 225 | } 226 | this.childNodes.length = o; 227 | return this; 228 | }, 229 | 230 | /** 231 | * Query CSS selector to find matching nodes. 
232 | * @param {string} selector Simplified CSS selector 233 | * @param {Matcher} selector A Matcher instance 234 | * @return {HTMLElement[]} matching elements 235 | */ 236 | querySelectorAll: function(selector) { 237 | var matcher; 238 | if (selector instanceof Matcher) { 239 | matcher = selector; 240 | matcher.reset(); 241 | } else { 242 | matcher = new Matcher(selector); 243 | } 244 | var res = []; 245 | var stack = []; 246 | for (var i = 0; i < this.childNodes.length; i++) { 247 | stack.push([this.childNodes[i], 0, false]); 248 | while (stack.length) { 249 | var state = stack.back; 250 | var el = state[0]; 251 | if (state[1] === 0) { 252 | // Seen for first time. 253 | if (el.nodeType !== Node.ELEMENT_NODE) { 254 | stack.pop(); 255 | continue; 256 | } 257 | if (state[2] = matcher.advance(el)) { 258 | if (matcher.matched) { 259 | res.push(el); 260 | // no need to go further. 261 | matcher.rewind(); 262 | stack.pop(); 263 | continue; 264 | } 265 | } 266 | } 267 | if (state[1] < el.childNodes.length) { 268 | stack.push([el.childNodes[state[1]++], 0, false]); 269 | } else { 270 | if (state[2]) 271 | matcher.rewind(); 272 | stack.pop(); 273 | } 274 | } 275 | } 276 | return res; 277 | }, 278 | 279 | /** 280 | * Query CSS Selector to find matching node. 281 | * @param {string} selector Simplified CSS selector 282 | * @param {Matcher} selector A Matcher instance 283 | * @return {HTMLElement} matching node 284 | */ 285 | querySelector: function(selector) { 286 | var matcher; 287 | if (selector instanceof Matcher) { 288 | matcher = selector; 289 | matcher.reset(); 290 | } else { 291 | matcher = new Matcher(selector); 292 | } 293 | var stack = []; 294 | for (var i = 0; i < this.childNodes.length; i++) { 295 | stack.push([this.childNodes[i], 0, false]); 296 | while (stack.length) { 297 | var state = stack.back; 298 | var el = state[0]; 299 | if (state[1] === 0) { 300 | // Seen for first time. 
301 | if (el.nodeType !== Node.ELEMENT_NODE) { 302 | stack.pop(); 303 | continue; 304 | } 305 | if (state[2] = matcher.advance(el)) { 306 | if (matcher.matched) { 307 | return el; 308 | } 309 | } 310 | } 311 | if (state[1] < el.childNodes.length) { 312 | stack.push([el.childNodes[state[1]++], 0, false]); 313 | } else { 314 | if (state[2]) 315 | matcher.rewind(); 316 | stack.pop(); 317 | } 318 | } 319 | } 320 | return null; 321 | }, 322 | 323 | /** 324 | * Append a child node to childNodes 325 | * @param {Node} node node to append 326 | * @return {Node} node appended 327 | */ 328 | appendChild: function(node) { 329 | // node.parentNode = this; 330 | this.childNodes.push(node); 331 | return node; 332 | }, 333 | 334 | /** 335 | * Get first child node 336 | * @return {Node} first child node 337 | */ 338 | get firstChild() { 339 | return this.childNodes.front; 340 | }, 341 | 342 | /** 343 | * Get last child node 344 | * @return {Node} last child node 345 | */ 346 | get lastChild() { 347 | return this.childNodes.back; 348 | }, 349 | 350 | /** 351 | * Get attributes 352 | * @return {Object} parsed and unescaped attributes 353 | */ 354 | get attributes() { 355 | if (this._attrs) 356 | return this._attrs; 357 | this._attrs = {}; 358 | var attrs = this.rawAttributes; 359 | for (var key in attrs) { 360 | this._attrs[key] = entities.decodeHTML5(attrs[key]); 361 | } 362 | return this._attrs; 363 | }, 364 | 365 | /** 366 | * Get escaped (as-it) attributes 367 | * @return {Object} parsed attributes 368 | */ 369 | get rawAttributes() { 370 | if (this._rawAttrs) 371 | return this._rawAttrs; 372 | var attrs = {}; 373 | if (this.rawAttrs) { 374 | var re = /\b([a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig; 375 | for (var match; match = re.exec(this.rawAttrs); ) 376 | attrs[match[1]] = match[3] || match[4] || match[5]; 377 | } 378 | this._rawAttrs = attrs; 379 | return attrs; 380 | } 381 | 382 | }); 383 | $define(HTMLElement, { 384 | __wrap: function(el) { 385 | 
el.childNodes.forEach(function(node) { 386 | if (node.rawText) { 387 | $wrap(node, TextNode); 388 | } else { 389 | $wrap(node, HTMLElement); 390 | } 391 | }); 392 | } 393 | }); 394 | 395 | /** 396 | * Cache to store generated match functions 397 | * @type {Object} 398 | */ 399 | var pMatchFunctionCache = {}; 400 | 401 | /** 402 | * Matcher class to make CSS match 403 | * @param {string} selector Selector 404 | */ 405 | function Matcher(selector) { 406 | this.matchers = selector.split(' ').map(function(matcher) { 407 | if (pMatchFunctionCache[matcher]) 408 | return pMatchFunctionCache[matcher]; 409 | var parts = matcher.split('.'); 410 | var tagName = parts[0]; 411 | var classes = parts.slice(1).sort(); 412 | var source = ''; 413 | if (tagName && tagName != '*') { 414 | if (tagName[0] == '#') 415 | source += 'if (el.id != ' + JSON.stringify(tagName.substr(1)) + ') return false;'; 416 | else 417 | source += 'if (el.tagName != ' + JSON.stringify(tagName) + ') return false;'; 418 | } 419 | if (classes.length > 0) 420 | source += 'for (var cls = ' + JSON.stringify(classes) + ', i = 0; i < cls.length; i++) if (el.classNames.indexOf(cls[i]) === -1) return false;'; 421 | source += 'return true;'; 422 | return pMatchFunctionCache[matcher] = new Function('el', source); 423 | }); 424 | this.nextMatch = 0; 425 | } 426 | $declare(Matcher, { 427 | /** 428 | * Trying to advance match pointer 429 | * @param {HTMLElement} el element to make the match 430 | * @return {bool} true when pointer advanced. 431 | */ 432 | advance: function(el) { 433 | if (this.nextMatch < this.matchers.length && 434 | this.matchers[this.nextMatch](el)) { 435 | this.nextMatch++; 436 | return true; 437 | } 438 | return false; 439 | }, 440 | /** 441 | * Rewind the match pointer 442 | */ 443 | rewind: function() { 444 | this.nextMatch--; 445 | }, 446 | /** 447 | * Trying to determine if match made. 
448 | * @return {bool} true when the match is made 449 | */ 450 | get matched() { 451 | return this.nextMatch == this.matchers.length; 452 | }, 453 | /** 454 | * Reset match pointer. 455 | * @return {[type]} [description] 456 | */ 457 | reset: function() { 458 | this.nextMatch = 0; 459 | } 460 | }); 461 | $define(Matcher, { 462 | /** 463 | * flush cache to free memory 464 | */ 465 | flushCache: function() { 466 | pMatchFunctionCache = {}; 467 | } 468 | }); 469 | 470 | var kMarkupPattern = /)-->|<(\/?)([a-z][a-z0-9]*)\s*([^>]*?)(\/?)>/ig; 471 | var kAttributePattern = /\b(id|class)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig; 472 | var kSelfClosingElements = { 473 | meta: true, 474 | img: true, 475 | link: true, 476 | input: true, 477 | area: true, 478 | br: true, 479 | hr: true 480 | }; 481 | var kElementsClosedByOpening = { 482 | li: {li: true}, 483 | p: {p: true, div: true}, 484 | td: {td: true, th: true}, 485 | th: {td: true, th: true} 486 | }; 487 | var kElementsClosedByClosing = { 488 | li: {ul: true, ol: true}, 489 | a: {div: true}, 490 | b: {div: true}, 491 | i: {div: true}, 492 | p: {div: true}, 493 | td: {tr: true, table: true}, 494 | th: {tr: true, table: true} 495 | }; 496 | var kBlockTextElements = { 497 | script: true, 498 | noscript: true, 499 | style: true, 500 | pre: true 501 | }; 502 | 503 | /** 504 | * Parses HTML and returns a root element 505 | */ 506 | module.exports = { 507 | 508 | Matcher: Matcher, 509 | Node: Node, 510 | HTMLElement: HTMLElement, 511 | TextNode: TextNode, 512 | 513 | /** 514 | * Parse a chunk of HTML source. 
515 | * @param {string} data html 516 | * @return {HTMLElement} root element 517 | */ 518 | parse: function(data, options) { 519 | 520 | var root = new HTMLElement(null, {}); 521 | var currentParent = root; 522 | var stack = [root]; 523 | var lastTextPos = -1; 524 | 525 | options = options || {}; 526 | 527 | for (var match, text; match = kMarkupPattern.exec(data); ) { 528 | if (lastTextPos > -1) { 529 | if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) { 530 | // if has content 531 | text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length); 532 | currentParent.appendChild(new TextNode(text)); 533 | } 534 | } 535 | lastTextPos = kMarkupPattern.lastIndex; 536 | if (match[0][1] == '!') { 537 | // this is a comment 538 | continue; 539 | } 540 | if (options.lowerCaseTagName) 541 | match[2] = match[2].toLowerCase(); 542 | if (!match[1]) { 543 | // not or ... 559 | var closeMarkup = ''; 560 | var index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex); 561 | if (options[match[2]]) { 562 | if (index == -1) { 563 | // there is no matching ending for the text element. 564 | text = data.substr(kMarkupPattern.lastIndex); 565 | } else { 566 | text = data.substring(kMarkupPattern.lastIndex, index); 567 | } 568 | if (text.length > 0) 569 | currentParent.appendChild(new TextNode(text)); 570 | } 571 | if (index == -1) { 572 | lastTextPos = kMarkupPattern.lastIndex = data.length + 1; 573 | } else { 574 | lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length; 575 | match[1] = true; 576 | } 577 | } 578 | } 579 | if (match[1] || match[4] || 580 | kSelfClosingElements[match[2]]) { 581 | // or
    etc. 582 | while (true) { 583 | if (currentParent.tagName == match[2]) { 584 | stack.pop(); 585 | currentParent = stack.back; 586 | break; 587 | } else { 588 | // Trying to close current tag, and move on 589 | if (kElementsClosedByClosing[currentParent.tagName]) { 590 | if (kElementsClosedByClosing[currentParent.tagName][match[2]]) { 591 | stack.pop(); 592 | currentParent = stack.back; 593 | continue; 594 | } 595 | } 596 | // Use aggressive strategy to handle unmatching markups. 597 | break; 598 | } 599 | } 600 | } 601 | } 602 | 603 | return root; 604 | 605 | } 606 | 607 | }; 608 | -------------------------------------------------------------------------------- /test/html/incomplete-script: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | Designer Backpacks for women | Leather & Textile Backpacks | SSENSE 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 75 | 85 | 101 | 102 | 103 | 104 | 105 | 106 |
    107 |
    108 | 109 | 113 | 124 | 134 | 135 |
    136 |
    137 | 138 |
    139 | 413 | 465 | 466 |
    467 | 473 |
    474 |
    477 |
    478 | 479 |
    480 | 578 | 595 |