├── .gitignore ├── .travis.yml ├── Gruntfile.js ├── LICENSE ├── README.md ├── bin ├── benchmark ├── context-dump └── state-inspector ├── bower.json ├── dist └── context-parser.js ├── package.json ├── src ├── context-parser.js └── html5-state-machine.js └── tests ├── benchmarks ├── quick.js └── simplehtml.js ├── samples └── tests │ ├── 001.hbs │ ├── 001.html │ └── 1m.html └── unit ├── run-bug-spec.js ├── run-command-spec.js ├── run-functions-spec.js ├── run-states-spec.js └── run-strict-context-parser.js /.gitignore: -------------------------------------------------------------------------------- 1 | # install 2 | *.log 3 | node_modules 4 | 5 | # test 6 | xunit.xml 7 | artifacts 8 | coverage 9 | 10 | # build 11 | docs 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - '0.12' 4 | - '0.11' 5 | - '0.10' 6 | notifications: 7 | email: 8 | recipients: 9 | - neraliu@yahoo-inc.com 10 | on_success: change 11 | on_failure: always 12 | after_success: 13 | - test $(cat $TRAVIS_BUILD_DIR/package.json | grep version | awk '{print $2}' | sed 's/"//g' | sed 's/,//g' | awk '{print "v"$1}' ) = $TRAVIS_TAG && test $(echo $TRAVIS_NODE_VERSION | awk '{print $1}' ) = '0.12' && export VALID_VERSION=true 14 | deploy: 15 | provider: npm 16 | email: neraliu@yahoo-inc.com 17 | api_key: 18 | secure: bDXKqWnIbB3i29FHZp0RDDIL7AUBbldSOwWQ+05yCZNuWeAEJWxFy/RquHOIlFl2g01XrUjUElvkgLDZz2YbQmwDoendAsTG/RNlXgopL3MaK0a0tsHFNfXJsTblhqvqp9Rckh8cLD9CDKqKY4th+kpYXX3iDfj1RVJSbsRePIQ 19 | on: 20 | condition: $VALID_VERSION = true 21 | tags: true 22 | branch: master 23 | -------------------------------------------------------------------------------- /Gruntfile.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015, Yahoo! Inc. All rights reserved. 
3 | * Copyrights licensed under the New BSD License. 4 | * See the accompanying LICENSE file for terms. 5 | */ 6 | module.exports = function(grunt) { 7 | 8 | grunt.initConfig({ 9 | pkg: grunt.file.readJSON('package.json'), 10 | jshint: { 11 | files: ['src/*.js'], 12 | options: { 13 | scripturl: true, 14 | camelcase: true 15 | } 16 | }, 17 | benchmark: { 18 | all: { 19 | src: ['tests/benchmarks/simplehtml.js'], 20 | } 21 | }, 22 | browserify: { 23 | standalone: { 24 | src: [ 'src/<%= pkg.name %>.js' ], 25 | dest: 'dist/<%= pkg.name %>.js', 26 | options: { 27 | browserifyOptions: { 28 | standalone: 'ContextParser' 29 | } 30 | } 31 | } 32 | }, 33 | mocha_istanbul: { 34 | target: { 35 | src: 'tests/unit/*.js', 36 | options: { 37 | coverage:true, 38 | check: { 39 | lines: 80, 40 | statements: 80 41 | } 42 | } 43 | } 44 | }, 45 | clean: { 46 | all: ['xunit.xml', 'artifacts', 'coverage', 'node_modules'], 47 | buildResidues: ['xunit.xml', 'artifacts', 'coverage'] 48 | } 49 | }); 50 | 51 | grunt.loadNpmTasks('grunt-benchmark'); 52 | grunt.loadNpmTasks('grunt-browserify'); 53 | grunt.loadNpmTasks('grunt-contrib-clean'); 54 | grunt.loadNpmTasks('grunt-contrib-jshint'); 55 | grunt.loadNpmTasks('grunt-mocha-istanbul'); 56 | 57 | grunt.registerTask('test', ['clean:buildResidues', 'jshint', 'mocha_istanbul']); 58 | grunt.registerTask('dist', ['browserify']) 59 | grunt.registerTask('default', ['test', 'dist']); 60 | }; 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Yahoo! Inc. All rights reserved. 2 | 3 | Redistribution and use of this software in source and binary forms, 4 | with or without modification, are permitted provided that the following 5 | conditions are met: 6 | 7 | * Redistributions of source code must retain the above 8 | copyright notice, this list of conditions and the 9 | following disclaimer. 
10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the 13 | following disclaimer in the documentation and/or other 14 | materials provided with the distribution. 15 | 16 | * Neither the name of Yahoo! Inc. nor the names of its 17 | contributors may be used to endorse or promote products 18 | derived from this software without specific prior 19 | written permission of Yahoo! Inc. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 22 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 23 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 24 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | HTML5 Context Parser 2 | ==================== 3 | HTML5 Context Parser is a robust and small footprint HTML5 context parser that parses HTML 5 web pages and reports the execution context of each character seen. 
4 | 5 | [![npm version][npm-badge]][npm] 6 | [![dependency status][dep-badge]][dep-status] 7 | 8 | [npm]: https://www.npmjs.org/package/context-parser 9 | [npm-badge]: https://img.shields.io/npm/v/context-parser.svg?style=flat-square 10 | [dep-status]: https://david-dm.org/yahoo/context-parser 11 | [dep-badge]: https://img.shields.io/david/yahoo/context-parser.svg?style=flat-square 12 | 13 | ## Overview 14 | 15 | ### Execution Context 16 | 17 | Browsers use JavaScript and CSS engines to construct the dynamic components of a page correctly. To determine which engine should be used, browsers run the HTML parsing algorithm to determine the **context** of HTML blocks (aka tokens). 18 | 19 | ### Cross Site Scripting 20 | 21 | Cross site scripting (XSS) can be prevented when input validation and filtering are performed aggressively enough to remove all characters that could trigger a change of execution context in HTML. However, this approach has often proven to be developer-unfriendly and error-prone. 22 | 23 | The other way to solve XSS is to apply the filtering at the time the output is rendered, and remove only the characters that would trigger a change of context given the current context in the HTML. 24 | 25 | ## Design Principles 26 | 27 | ### Secure 28 | 29 | The parser needs to align with the browser [specification](http://www.w3.org/TR/html5/) in order to determine context accurately. A single parsing mistake could result in a security exploit. 30 | 31 | ### Keep It Simple and Straightforward 32 | 33 | Keeping the code simple and straightforward makes code review easier. It also reduces compilation time (or JS code loading time on the browser client side). 34 | 35 | Since we are only interested in analyzing the execution context of an HTML5 page, we focused on the [tokenization process](http://www.w3.org/TR/html5/syntax.html#tokenization) and dropped the other parts that are not related to context-parsing logic. 
36 | 37 | 38 | ## Quick Start 39 | 40 | Install the context-parser package from the npm registry. 41 | ``` 42 | npm install -g context-parser 43 | ``` 44 | 45 | ### Server-side (nodejs) 46 | 47 | Analyze the execution context of an HTML5 web page on the server side. 48 | ``` 49 | /* create the context parser */ 50 | var Parser = require("context-parser").Parser; 51 | var parser = new Parser(); 52 | var fs = require("fs"); 53 | /* read the html web page */ 54 | var file = "..."; 55 | var data = fs.readFileSync(file, 'utf-8'); 56 | 57 | /* analyze the execution context */ 58 | parser.contextualize(data); 59 | 60 | ``` 61 | 62 | ### Server-side (command line) 63 | 64 | Run our parser against an HTML5 file and print out the state of each character, using the states defined in the [HTML 5 Specification](http://www.w3.org/TR/html5/syntax.html#tokenization). 65 | ``` 66 | ./bin/context-dump path/to/file.html 67 | HTML-State { statesSize: 819 } +0ms 68 | HTML-State { ch: 0, state: 1, symbol: 0 } +1ms 69 | HTML-State { ch: [0x20], state: 1, symbol: 0 } +1ms 70 | HTML-State { ch: [0x20], state: 1, symbol: 0 } +0ms 71 | HTML-State { ch: [0x20], state: 1, symbol: 0 } +0ms 72 | HTML-State { ch: [0x20], state: 1, symbol: 0 } +0ms 73 | HTML-State { ch: < [0x3c], state: 8, symbol: 7 } +0ms 74 | ... 75 | ``` 76 | 77 | It reports back the execution context of each character in the format explained below. 78 | ``` 79 | {ch: CHARACTER, state: EXECUTION_CONTEXT_NUMBER, symbol: CHARACTER_TYPE} 80 | ``` 81 | 82 | For the execution context number and character type, please refer to the state number defined in the [specification](http://www.w3.org/TR/html5/syntax.html#tokenization) and [our code](src/html5-state-machine.js). 
83 | 84 | ## Development 85 | 86 | ### How to build 87 | ``` 88 | npm install 89 | npm run-script build 90 | ``` 91 | 92 | ### How to test 93 | ``` 94 | npm test 95 | ``` 96 | 97 | ### Build 98 | [![Build Status](https://travis-ci.org/yahoo/context-parser.svg?branch=master)](https://travis-ci.org/yahoo/context-parser) 99 | 100 | ## License 101 | 102 | This software is free to use under the Yahoo Inc. BSD license. 103 | See the [LICENSE file][] for license text and copyright information. 104 | 105 | [LICENSE file]: ./LICENSE 106 | 107 | ## Related Works 108 | 109 | * [parse5](https://github.com/inikulin/parse5) is an HTML5 compliant parser implemented in native javascript. It is used by [jsdom](https://github.com/tmpvar/jsdom) as the underlying HTML parsing engine. Parse5 has a larger code base and it exposes the parsing tree instead of execution context thus it may require some patching or trimming in order to provide context parsing functionality. 110 | 111 | * [htmlparser2](https://github.com/fb55/htmlparser2) is another HTML parser implemented in native javascript. It is used by [cheerio](https://github.com/cheeriojs/cheerio) as the underlying HTML parsing engine. HTMLparser2 is not a fully compliant parser thus it is less desirable to be used for application security related work. 112 | -------------------------------------------------------------------------------- /bin/benchmark: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /* 3 | Copyright (c) 2015, Yahoo! Inc. All rights reserved. 4 | Copyrights licensed under the New BSD License. 5 | See the accompanying LICENSE file for terms. 
6 | */ 7 | 8 | var fs = require('fs'); 9 | var html = fs.readFileSync('tests/samples/tests/1m.html', "utf8"); 10 | 11 | var parsers = [ 12 | ['Context Parser (Fast)', '../src/context-parser', 'FastParser', 'contextualize'], 13 | ]; 14 | 15 | var allParsers = parsers.concat([ 16 | ['Context Parser (Full)', '../src/context-parser', 'Parser', 'contextualize'], 17 | ['Parse5', 'parse5', 'Parser', 'parse'], 18 | ['Gumbo', 'gumbo-parser'], 19 | ['High5', 'high5', '', 'end'], 20 | ['HtmlParser2', 'htmlparser2', 'Parser', 'end'], 21 | ]); 22 | 23 | console.log("Usage: benchmark [all]"); 24 | 25 | if ( process.argv.length > 2 ) { 26 | parsers = allParsers; 27 | } 28 | 29 | parsers.forEach(function(parser) { 30 | 31 | var start, end; 32 | var parsername = parser[0]; 33 | var classname = parser[1]; 34 | var name = parser[2]; 35 | var method = parser[3]; 36 | 37 | try { 38 | if ( name || method ) { 39 | var Parser = name ? require(classname)[name] : require(classname); 40 | start = +new Date(); 41 | for(var i=0; i<10; i++) { 42 | var parser = new Parser(); 43 | parser[method](html); 44 | } 45 | end = +new Date(); 46 | } else { 47 | start = +new Date(); 48 | var method = require(classname); 49 | for(var i=0; i<10; i++) { 50 | method(html); 51 | } 52 | end = +new Date(); 53 | } 54 | console.log(parsername + " runs at a speed of " + 10/((end - start)/1000) + " MB per seconds [" + (end-start)/10/1000 + " second per MB].") 55 | } catch (e) { 56 | console.log('Error running ' + parsername + '. Try running npm install ' + classname + '? [' + e + ']'); 57 | 58 | } 59 | }); -------------------------------------------------------------------------------- /bin/context-dump: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /* 3 | Copyright (c) 2015, Yahoo! Inc. All rights reserved. 4 | Copyrights licensed under the New BSD License. 5 | See the accompanying LICENSE file for terms. 
6 | */ 7 | 8 | var Debug = require("debug"), 9 | progname = 'HTML-State'; 10 | 11 | Debug.enable(progname); 12 | 13 | (function() { 14 | var Parser = require("../src/context-parser").Parser, 15 | debug = Debug(progname), 16 | fs = require('fs'), 17 | file, 18 | enableInputPreProcessing = false, enableCanonicalization = false, enableIEConditionalComments = false, enableStateTracking = true, 19 | noofargs = 0; 20 | 21 | process.argv.forEach(function(val, index) { 22 | ++noofargs; 23 | if (index === 2) { 24 | file = val; 25 | } else if (index === 3) { 26 | enableInputPreProcessing = val === "1"? true:false; 27 | } else if (index === 4) { 28 | enableCanonicalization = val === "1"? true:false; 29 | } 30 | }); 31 | 32 | var config = { 33 | enableInputPreProcessing: enableInputPreProcessing, 34 | enableCanonicalization: enableCanonicalization, 35 | enableIEConditionalComments: enableIEConditionalComments, 36 | enableStateTracking: enableStateTracking 37 | }, 38 | parser = new Parser(config); 39 | 40 | Parser.prototype.printCharWithState = function() { 41 | var len = this.states.length; 42 | debug('{ statesSize: '+len+' }'); 43 | for(var i=0;i= 3 && noofargs <= 5) { 65 | if (fs.existsSync(file)) { 66 | var data = fs.readFileSync(file, 'utf-8'); 67 | parser.contextualize(data, data.length); 68 | parser.printCharWithState(); 69 | process.exit(0); 70 | } else { 71 | console.error("[ERROR] " + file + " not exist"); 72 | process.exit(1); 73 | } 74 | } else { 75 | console.log("Usage: context-dump "); 76 | process.exit(1); 77 | } 78 | 79 | }).call(this); 80 | -------------------------------------------------------------------------------- /bin/state-inspector: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /* 3 | Copyright (c) 2015, Yahoo! Inc. All rights reserved. 4 | Copyrights licensed under the New BSD License. 5 | See the accompanying LICENSE file for terms. 
6 | */ 7 | 8 | var Debug = require("debug"), 9 | progname = 'HTML-State'; 10 | 11 | Debug.enable(progname); 12 | 13 | (function() { 14 | var CP = require("../src/context-parser"), 15 | Parser = CP.FastParser, 16 | StateMachine = CP.StateMachine, 17 | ch, 18 | symbol, 19 | state, 20 | newState, 21 | reconsume, 22 | extraLogic, 23 | noofargs = 0, 24 | parser = new Parser(); 25 | 26 | process.argv.forEach(function(val, index) { 27 | ++noofargs; 28 | if (index === 2) { 29 | ch = val; 30 | } else if (index === 3) { 31 | state = parseInt(val); 32 | } 33 | }); 34 | 35 | if (noofargs === 4) { 36 | symbol = parser.lookupChar(ch); 37 | newState = StateMachine.lookupStateFromSymbol[symbol][state]; 38 | reconsume = StateMachine.lookupReconsumeFromSymbol[symbol][state]; 39 | extraLogic = StateMachine.lookupAltLogicFromSymbol[symbol][state]; 40 | console.log( { ch: ch, symbol: symbol, newState: newState, reconsume: reconsume, extraLogic: extraLogic } ); 41 | process.exit(0); 42 | } else { 43 | console.log("Usage: state-inspector char state"); 44 | process.exit(1); 45 | } 46 | 47 | }).call(this); 48 | -------------------------------------------------------------------------------- /bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "context-parser", 3 | "version": "2.0.1", 4 | "homepage": "https://github.com/yahoo/context-parser", 5 | "description": "HTML5 Context Parser", 6 | "main": "src/context-parser.js", 7 | "keywords": [ 8 | "context", 9 | "parser", 10 | "html5" 11 | ], 12 | "authors": [ 13 | "Nera Liu ", 14 | "Adonis Fung ", 15 | "Albert Yu " 16 | ], 17 | "license": "BSD", 18 | "ignore": [ 19 | "**/.*", 20 | "bin", 21 | "node_modules", 22 | "bower_components", 23 | "test", 24 | "tests" 25 | ] 26 | } 27 | -------------------------------------------------------------------------------- /dist/context-parser.js: -------------------------------------------------------------------------------- 1 | (function(f){if(typeof 
exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.ContextParser = f()}})(function(){var define,module,exports;return (function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o 8 | Albert Yu 9 | Adonis Fung 10 | */ 11 | /*jshint -W030 */ 12 | (function() { 13 | "use strict"; 14 | 15 | var stateMachine = require('./html5-state-machine.js'), 16 | htmlState = stateMachine.State, 17 | reInputPreProcessing = /(?:\r\n?|[\x01-\x08\x0B\x0E-\x1F\x7F-\x9F\uFDD0-\uFDEF\uFFFE\uFFFF]|[\uD83F\uD87F\uD8BF\uD8FF\uD93F\uD97F\uD9BF\uD9FF\uDA3F\uDA3F\uDA7F\uDABF\uDAFF\uDB3F\uDB7F\uDBBF\uDBFF][\uDFFE\uDFFF])/g; 18 | 19 | /** 20 | * @class FastParser 21 | * @constructor FastParser 22 | */ 23 | function FastParser(config) { 24 | var self = this, k; 25 | 26 | // deep copy config to this.config 27 | self.config = {}; 28 | if (config) { 29 | for (k in config) { 30 | self.config[k] = config[k]; 31 | } 32 | } 33 | config = self.config; 34 | 35 | // config enabled by default - no conversion needed 36 | // config.enableInputPreProcessing = (config.enableInputPreProcessing !== false); 37 | 38 | self.listeners = {}; 39 | self.reset(); 40 | } 41 | 42 | /** 43 | * @function FastParser#reset 44 | * 45 | * @description 46 | * Reset all internal states, as if being created with the new operator 47 | */ 48 | FastParser.prototype.reset = function () { 49 | var self = this; 50 | 51 | self.state = stateMachine.State.STATE_DATA; /* 
Save the current status */ 52 | self.tags = ['', '']; /* Save the current tag name */ 53 | self.tagIdx = 0; 54 | self.attrName = ''; /* Save the current attribute name */ 55 | self.attributeValue = null; /* Save the current attribute value */ 56 | self.input = ''; 57 | self.inputLen = 0; 58 | 59 | return self; 60 | }; 61 | 62 | /** 63 | * @function FastParser#on 64 | * 65 | * @param {string} eventType - the event type 66 | * @param {function} listener - the event listener 67 | * @returns this 68 | * 69 | * @description 70 | *

register the given event listener to the given eventType

71 | * 72 | */ 73 | FastParser.prototype.on = function (eventType, listener) { 74 | var l = this.listeners[eventType]; /* existing listener list for this eventType, if any */ 75 | if (listener) { /* a falsy listener is ignored and nothing is registered */ 76 | if (l) { 77 | l.push(listener); /* append, so listeners are kept in registration order */ 78 | } else { 79 | this.listeners[eventType] = [listener]; /* first listener for this eventType: create the list */ 80 | } 81 | } 82 | return this; /* returned in every case, so calls can be chained */ 83 | }; 84 | 85 | /** 86 | * @function FastParser#once 87 | * 88 | * @param {string} eventType - the event type (e.g., preWalk, reWalk, postWalk, ...) 89 | * @param {function} listener - the event listener 90 | * @returns this 91 | * 92 | * @description 93 | *

register the given event listener to the given eventType, for which it will be fired only once

94 | * 95 | */ 96 | FastParser.prototype.once = function(eventType, listener) { 97 | var self = this, onceListener; 98 | if (listener) { /* a falsy listener is ignored and nothing is registered */ 99 | onceListener = function () { /* wrapper that is registered in place of the given listener */ 100 | self.off(eventType, onceListener); /* unregister the wrapper before invoking, so it fires at most once */ 101 | listener.apply(self, arguments); /* forward all event arguments, with the parser as this */ 102 | }; 103 | return this.on(eventType, onceListener); /* NOTE: the wrapper is what gets stored, so off(eventType, listener) with the original listener will not find it */ 104 | } 105 | return this; 106 | }; 107 | 108 | /** 109 | * @function FastParser#off 110 | * 111 | * @param {string} eventType - the event type (e.g., preWalk, reWalk, postWalk, ...) 112 | * @param {function} listener - the event listener 113 | * @returns this 114 | * 115 | * @description 116 | *

stop the given listener from being fired when the eventType happens

117 | * 118 | */ 119 | FastParser.prototype.off = function (eventType, listener) { 120 | if (listener) { /* without a listener there is nothing to remove */ 121 | var i, len, listeners = this.listeners[eventType]; /* len is declared but never used */ 122 | if (listeners) { 123 | for (i = 0; listeners[i]; i++) { /* scan stops at the end of the list or at the first falsy entry */ 124 | if (listeners[i] === listener) { /* matched by identity */ 125 | listeners.splice(i, 1); /* only the first matching registration is removed */ 126 | break; 127 | } 128 | } 129 | } 130 | } 131 | return this; 132 | }; 133 | 134 | /** 135 | * @function FastParser#emit 136 | * @param {function[]} listeners - the listeners to invoke (e.g., this.listeners.reWalk) 137 | * @param {Array} args - optional arguments applied to each listener; an empty list is used when omitted 138 | * @returns this 139 | * 140 | * @description 141 | *

fire the given listeners (those registered for an eventType), passing args to each of them

142 | * 143 | */ 144 | FastParser.prototype.emit = function (listeners, args) { 145 | if (listeners) { /* nothing to fire when no listener list is given */ 146 | var i = -1, len; /* i starts at -1 because the loop below pre-increments */ 147 | if ((len = listeners.length)) { 148 | while (++i < len) { 149 | listeners[i].apply(this, args || []); /* listeners run with the parser as this; args defaults to an empty list */ 150 | } 151 | } 152 | } 153 | return this; 154 | }; 155 | 156 | /* 157 | * @function FastParser#walk 158 | * 159 | * @param {integer} i - the position of the current character in the input stream 160 | * @param {string} input - the input stream 161 | * @param {*} endsWithEOF - not used by walk itself; passed as the third argument to reWalk listeners 162 | * @returns {integer} the position i as given; this implementation never advances it 163 | */ 164 | FastParser.prototype.walk = function(i, input, endsWithEOF) { 165 | 166 | var ch = input[i], 167 | symbol = this.lookupChar(ch), 168 | extraLogic = stateMachine.lookupAltLogicFromSymbol[symbol][this.state], 169 | reconsume = stateMachine.lookupReconsumeFromSymbol[symbol][this.state]; 170 | 171 | /* Set state based on the current head pointer symbol; extraLogic and reconsume above were looked up with the previous state */ 172 | this.state = stateMachine.lookupStateFromSymbol[symbol][this.state]; 173 | 174 | /* See if there is any extra logic required for this state transition */ 175 | switch (extraLogic) { 176 | case 1: this.createStartTag(ch); break; 177 | case 2: this.createEndTag(ch); break; 178 | case 3: this.appendTagName(ch); break; 179 | case 4: this.resetEndTag(ch); break; 180 | case 6: /* match end tag token with start tag token's tag name */ 181 | if(this.tags[0].toLowerCase() === this.tags[1].toLowerCase()) { 182 | reconsume = 0; /* see 12.2.4.13 - switch state for the following case, otherwise, reconsume. 
*/ 183 | this.matchEndTagWithStartTag(symbol); 184 | } 185 | break; 186 | case 8: this.matchEscapedScriptTag(ch); break; 187 | case 11: this.processTagName(ch); break; 188 | case 12: this.createAttributeNameAndValueTag(ch); break; 189 | case 13: this.appendAttributeNameTag(ch); break; 190 | case 14: this.appendAttributeValueTag(ch); break; 191 | } 192 | 193 | if (reconsume) { /* reconsume the character: notify reWalk listeners, then walk the same index again under the new state */ 194 | this.listeners.reWalk && this.emit(this.listeners.reWalk, [this.state, i, endsWithEOF]); 195 | return this.walk(i, input); /* NOTE(review): endsWithEOF is not forwarded, so a reWalk fired inside this recursive call receives undefined for it - confirm whether intended */ 196 | } 197 | 198 | return i; 199 | }; 200 | 201 | FastParser.prototype.createStartTag = function (ch) { 202 | this.tagIdx = 0; /* slot 0 of this.tags holds the start tag name */ 203 | this.tags[0] = ch; 204 | }; 205 | 206 | FastParser.prototype.createEndTag = function (ch) { 207 | this.tagIdx = 1; /* slot 1 of this.tags holds the end tag name */ 208 | this.tags[1] = ch; 209 | }; 210 | 211 | FastParser.prototype.appendTagName = function (ch) { 212 | this.tags[this.tagIdx] += ch; /* appends to whichever tag name was started last */ 213 | }; 214 | 215 | FastParser.prototype.resetEndTag = function (ch) { 216 | this.tagIdx = 1; 217 | this.tags[1] = ''; /* ch is not used here; the end tag name is simply cleared */ 218 | }; 219 | 220 | FastParser.prototype.matchEndTagWithStartTag = function (symbol) { 221 | /* Extra Logic #6 : 222 | WHITESPACE: If the current end tag token is an appropriate end tag token, then switch to the before attribute name state. 223 | Otherwise, treat it as per the 'anything else' entry below. 224 | SOLIDUS (/): If the current end tag token is an appropriate end tag token, then switch to the self-closing start tag state. 225 | Otherwise, treat it as per the 'anything else' entry below. 226 | GREATER-THAN SIGN (>): If the current end tag token is an appropriate end tag token, then switch to the data state and emit the current tag token. 227 | Otherwise, treat it as per the 'anything else' entry below. 
228 | */ 229 | this.tags[0] = ''; 230 | this.tags[1] = ''; 231 | 232 | switch (symbol) { 233 | case stateMachine.Symbol.SPACE: /** Whitespaces */ 234 | this.state = stateMachine.State.STATE_BEFORE_ATTRIBUTE_NAME; 235 | return ; 236 | case stateMachine.Symbol.SOLIDUS: /** [/] */ 237 | this.state = stateMachine.State.STATE_SELF_CLOSING_START_TAG; 238 | return ; 239 | case stateMachine.Symbol.GREATER: /** [>] */ 240 | this.state = stateMachine.State.STATE_DATA; 241 | return ; 242 | } 243 | }; 244 | 245 | FastParser.prototype.matchEscapedScriptTag = function (ch) { 246 | /* switch to the script data double escaped state if we see 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 20 | -------------------------------------------------------------------------------- /tests/unit/run-bug-spec.js: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015, Yahoo! Inc. All rights reserved. 3 | Copyrights licensed under the New BSD License. 4 | See the accompanying LICENSE file for terms. 
5 | 6 | Authors: Nera Liu 7 | Albert Yu 8 | Adonis Fung 9 | */ 10 | (function () { 11 | 12 | require("mocha"); 13 | var expect = require("expect.js"), 14 | fs = require("fs"); 15 | 16 | var config = { 17 | enableInputPreProcessing: false, 18 | enableCanonicalization: false, 19 | enableIEConditionalComments: false 20 | }; 21 | 22 | describe('HTML5 Context Parser with Buggy Subclass Prototype', function(){ 23 | 24 | it('should not print char twice in reconsume logic test', function(){ 25 | var file = "./tests/samples/tests/001.html"; 26 | var Parser = require("../../src/context-parser").Parser; 27 | var BuggyParser = function() { Parser.call(this); } 28 | BuggyParser.prototype = Object.create(Parser.prototype); 29 | BuggyParser.prototype.constructor = Parser; 30 | BuggyParser.prototype.afterWalk = function( ch, i ) { 31 | if (!this.bytes) { 32 | this.bytes = []; 33 | } 34 | this.bytes[i] = ch; 35 | }; 36 | var parser = new BuggyParser(config); 37 | var data = fs.readFileSync(file, 'utf-8'); 38 | parser.contextualize(data); 39 | o = parser.bytes.join(''); 40 | 41 | expect(o).not.to.match(/sscript/); 42 | expect(o).not.to.match(/script>>/); 43 | expect(o).not.to.match(/\/a>>/); 44 | }); 45 | 46 | it('should not crash with "beforeWalk" returning out of bound index', function() { 47 | var Parser = require("../../src/context-parser").Parser; 48 | var BuggyParser = function() { Parser.call(this); } 49 | BuggyParser.prototype = Object.create(Parser.prototype); 50 | BuggyParser.prototype.constructor = Parser; 51 | BuggyParser.prototype.beforeWalk = function( ) { 52 | return 1000; 53 | } 54 | var parser = new BuggyParser(config); 55 | parser.contextualize(''); 56 | 57 | }); 58 | 59 | it('should not crash with "walk" returning out of bound index', function() { 60 | var Parser = require("../../src/context-parser").Parser; 61 | var BuggyParser = function() { Parser.call(this); } 62 | BuggyParser.prototype = Object.create(Parser.prototype); 63 | BuggyParser.prototype.constructor 
= Parser; 64 | BuggyParser.prototype.walk = function( ) { 65 | return 1000; 66 | } 67 | var parser = new BuggyParser(config); 68 | parser.contextualize(''); 69 | 70 | }); 71 | 72 | }); 73 | 74 | }()); 75 | -------------------------------------------------------------------------------- /tests/unit/run-command-spec.js: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015, Yahoo! Inc. All rights reserved. 3 | Copyrights licensed under the New BSD License. 4 | See the accompanying LICENSE file for terms. 5 | 6 | Authors: Nera Liu 7 | Albert Yu 8 | Adonis Fung 9 | */ 10 | (function () { 11 | 12 | require("mocha"); 13 | var expect = require("expect.js"); 14 | 15 | describe('HTML5 Context Parser Command Line Utility', function(){ 16 | 17 | it("should run benchmark command without error", function(done) { 18 | var exec = require('child_process').exec, 19 | child; 20 | var child = exec('./bin/benchmark', 21 | function (error, stdout, stderr) { 22 | if (error === null) { 23 | expect(true).to.equal(true); 24 | expect(stdout).to.match(/^context-parser runs at a speed of/); 25 | } 26 | } 27 | ); 28 | setTimeout(function(f) { 29 | done(); 30 | }, 100); 31 | }); 32 | 33 | it("should run context-dump command without error", function(done) { 34 | var exec = require('child_process').exec, 35 | child; 36 | var file = "./tests/samples/tests/001.html"; 37 | child = exec('./bin/context-dump '+file, 38 | function (error, stdout, stderr) { 39 | if (error === null) { 40 | expect(true).to.equal(true); 41 | } 42 | } 43 | ); 44 | setTimeout(function(f) { 45 | done(); 46 | }, 100); 47 | }); 48 | 49 | it("should run state-inspector command without error", function(done) { 50 | var exec = require('child_process').exec, 51 | child; 52 | var child = exec('./bin/state-inspector 1 1', 53 | function (error, stdout, stderr) { 54 | if (error === null) { 55 | expect(true).to.equal(true); 56 | expect(stdout).to.match(/{ ch: '1', symbol: 12, 
newState: 1, reconsume: 0, extraLogic: 0 }/); 57 | } 58 | } 59 | ); 60 | setTimeout(function(f) { 61 | done(); 62 | }, 100); 63 | }); 64 | }); 65 | }()); 66 | -------------------------------------------------------------------------------- /tests/unit/run-functions-spec.js: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015, Yahoo! Inc. All rights reserved. 3 | Copyrights licensed under the New BSD License. 4 | See the accompanying LICENSE file for terms. 5 | 6 | Authors: Nera Liu 7 | Albert Yu 8 | Adonis Fung 9 | */ 10 | (function () { 11 | 12 | require("mocha"); 13 | var assert = require("assert"), 14 | expect = require("expect.js"), 15 | Parser = require("../../src/context-parser").Parser, 16 | FastParser = require("../../src/context-parser").FastParser; 17 | 18 | var config = { 19 | enableInputPreProcessing: false, 20 | enableCanonicalization: false, 21 | enableIEConditionalComments: false 22 | }; 23 | 24 | describe('HTML5 Context Parser Functions', function() { 25 | 26 | describe('#getStates', function(){ 27 | it('should parse ', function(){ 28 | var p1 = new Parser(config); 29 | var html = ""; 30 | p1.contextualize(html); 31 | var states = p1.getStates(); 32 | assert.equal(states.toString(), '1,8,10,10,10,10,1,8,9,10,10,10,10,1'); 33 | }); 34 | }); 35 | 36 | describe('#setCurrentState', function(){ 37 | it('should exist)', function(){ 38 | var p1 = new Parser(config); 39 | p1.setCurrentState(10); 40 | }); 41 | }); 42 | describe('#setInitState and #getInitState', function(){ 43 | 44 | it('should exist and set state', function(){ 45 | var p1 = new Parser(config); 46 | p1.setInitState(10); 47 | var state = p1.getInitState(); 48 | assert.equal(state, 10); 49 | }); 50 | 51 | 52 | it('should get state', function(){ 53 | var p1 = new Parser(config); 54 | var html = ""; 55 | p1.contextualize(html); 56 | var state = p1.getInitState(); 57 | assert.equal(state, 1); 58 | }); 59 | }); 60 | 61 | 
describe('#getLastState', function(){ 62 | 63 | it('should get last state', function(){ 64 | var p1 = new Parser(config); 65 | var html = ""; 66 | p1.contextualize(html); 67 | var state = p1.getLastState(); 68 | assert.equal(state, 1); 69 | }); 70 | }); 71 | 72 | describe('#getAttributeName', function(){ 73 | 74 | var html; 75 | it('should get attribute name following with quoted attribute value', function(){ 76 | var p1 = new Parser(config); 77 | html = "
"; 78 | p1.contextualize(html); 79 | assert.equal(p1.getAttributeName(), 'class'); 80 | }); 81 | 82 | it('should get attribute name following with double quoted attribute value', function(){ 83 | var p2 = new Parser(config); 84 | html = '
'; 85 | p2.contextualize(html); 86 | assert.equal(p2.getAttributeName(), 'class'); 87 | }); 88 | 89 | it('should get attribute name following with unquoted attribute value', function(){ 90 | var p3 = new Parser(config); 91 | html = "
"; 92 | p3.contextualize(html); 93 | assert.equal(p3.getAttributeName(), 'class'); 94 | }); 95 | 96 | it('should get second attribute name', function(){ 97 | var p1 = new Parser(config); 98 | html = "
"; 99 | p1.contextualize(html); 100 | assert.equal(p1.getAttributeName(), 'style'); 101 | }); 102 | 103 | it('should get second attribute name (double quoted attribute value)', function(){ 104 | 105 | var p2 = new Parser(config); 106 | html = "
"; 107 | p2.contextualize(html); 108 | assert.equal(p2.getAttributeName(), 'style'); 109 | }); 110 | 111 | it('should get second attribute name (unquoted attribute value)', function(){ 112 | 113 | var p3 = new Parser(config); 114 | html = "
"; 115 | p3.contextualize(html); 116 | assert.equal(p3.getAttributeName(), 'style'); 117 | }); 118 | }); 119 | describe('#getAttributeValue', function(){ 120 | 121 | it('should get attribute value (quoted)', function(){ 122 | var p1 = new Parser(config); 123 | var html = "
"; 124 | p1.contextualize(html); 125 | assert.equal(p1.getAttributeValue(), 'classname'); 126 | }); 127 | it('should get attribute value (double quoted)', function(){ 128 | var p2 = new Parser(config); 129 | var html = '
'; 130 | p2.contextualize(html); 131 | assert.equal(p2.getAttributeValue(), 'classname'); 132 | }); 133 | it('should get attribute value (unquoted)', function(){ 134 | var p3 = new Parser(config); 135 | var html = "
"; 136 | p3.contextualize(html); 137 | assert.equal(p3.getAttributeValue(), 'classname'); 138 | }); 139 | 140 | 141 | it('should get 2nd attribute value', function(){ 142 | var p1 = new Parser(config); 143 | var html = "
"; 144 | p1.contextualize(html); 145 | assert.equal(p1.getAttributeValue(), 'color:red'); 146 | }); 147 | 148 | it('should get 2nd attribute value (double quoted)', function(){ 149 | var p2 = new Parser(config); 150 | var html = '
'; 151 | p2.contextualize(html); 152 | assert.equal(p2.getAttributeValue(), 'color:red'); 153 | }); 154 | 155 | it('should get 2nd attribute value (unquoted)', function(){ 156 | var p3 = new Parser(config); 157 | var html = "
"; 158 | p3.contextualize(html); 159 | assert.equal(p3.getAttributeValue(), 'color:red'); 160 | }); 161 | }); 162 | 163 | describe('#lookupChar', function(){ 164 | it('should match symbol lookup table', function(){ 165 | var parser = new Parser(config); 166 | var r = parser.lookupChar('\t'); 167 | assert.equal(r, 0); 168 | r = parser.lookupChar('\n'); 169 | assert.equal(r, 0); 170 | r = parser.lookupChar('\f'); 171 | assert.equal(r, 0); 172 | r = parser.lookupChar(' '); 173 | assert.equal(r, 0); 174 | r = parser.lookupChar('!'); 175 | assert.equal(r, 1); 176 | r = parser.lookupChar('"'); 177 | assert.equal(r, 2); 178 | r = parser.lookupChar('&'); 179 | assert.equal(r, 3); 180 | r = parser.lookupChar('\''); 181 | assert.equal(r, 4); 182 | r = parser.lookupChar('-'); 183 | assert.equal(r, 5); 184 | r = parser.lookupChar('/'); 185 | assert.equal(r, 6); 186 | r = parser.lookupChar('<'); 187 | assert.equal(r, 7); 188 | r = parser.lookupChar('='); 189 | assert.equal(r, 8); 190 | r = parser.lookupChar('>'); 191 | assert.equal(r, 9); 192 | r = parser.lookupChar('?'); 193 | assert.equal(r, 10); 194 | r = parser.lookupChar('a'); 195 | assert.equal(r, 11); 196 | r = parser.lookupChar('z'); 197 | assert.equal(r, 11); 198 | r = parser.lookupChar('A'); 199 | assert.equal(r, 11); 200 | r = parser.lookupChar('Z'); 201 | assert.equal(r, 11); 202 | r = parser.lookupChar('1'); 203 | assert.equal(r, 12); 204 | }); 205 | }); 206 | 207 | describe('#getStartTagName', function(){ 208 | 209 | it('should return start tag name', function(){ 210 | var p1 = new Parser(config); 211 | var html = "
"; 212 | p1.contextualize(html); 213 | assert.equal(p1.getStartTagName(), 'div'); 214 | 215 | }); 216 | 217 | }); 218 | 219 | describe('#getCurrentTagIndex and #getCurrentTag', function(){ 220 | 221 | it('should return correct tag name/index', function(){ 222 | 223 | [ { html: "
", tag0: 'div', tag1: 'div', index: 1}, 224 | { html: "
", tag0: 'div', tag1: 'div', index: 1}, 225 | { html: "
", tag0: 'img', tag1: 'div', index: 0}, 226 | { html: "
7 | Albert Yu 8 | Adonis Fung 9 | */ 10 | (function () { 11 | 12 | require("mocha"); 13 | var assert = require("assert"), 14 | Parser = require("../../src/context-parser").Parser; 15 | 16 | var config = { 17 | enableInputPreProcessing: false, 18 | enableCanonicalization: false, 19 | enableIEConditionalComments: false 20 | }; 21 | 22 | describe('HTML5 Context Parser StateMachine', function() { 23 | 24 | // https://html.spec.whatwg.org/multipage/syntax.html#tokenization 25 | it('should parse {}', function(){ 26 | var p1 = new Parser(config); 27 | var html = "{}"; 28 | p1.contextualize(html); 29 | var states = p1.getStates(); 30 | assert.equal(states.toString(), '1,8,10,10,10,10,1,1,1,8,9,10,10,10,10,1'); 31 | }); 32 | 33 | it('should parse attribute name', function(){ 34 | var p1 = new Parser(config); 35 | var html = "