10 | */
11 | /*jshint -W030 */
12 | (function() {
13 | "use strict";
14 |
var stateMachine = require('./html5-state-machine.js');
var htmlState = stateMachine.State;

/*
 * Global pattern for the input pre-processing step. One match is either:
 *   - "\r\n" or a lone "\r";
 *   - one of the code units U+0001-U+0008, U+000B, U+000E-U+001F,
 *     U+007F-U+009F, U+FDD0-U+FDEF, U+FFFE or U+FFFF;
 *   - a surrogate pair made of one of the listed high surrogates followed by
 *     U+DFFE or U+DFFF, i.e. the last two code points of a supplementary
 *     plane (U+1FFFE, U+1FFFF, ... U+10FFFE, U+10FFFF).
 */
var reInputPreProcessing = /(?:\r\n?|[\x01-\x08\x0B\x0E-\x1F\x7F-\x9F\uFDD0-\uFDEF\uFFFE\uFFFF]|[\uD83F\uD87F\uD8BF\uD8FF\uD93F\uD97F\uD9BF\uD9FF\uDA3F\uDA3F\uDA7F\uDABF\uDAFF\uDB3F\uDB7F\uDBBF\uDBFF][\uDFFE\uDFFF])/g;
18 |
/**
 * @class FastParser
 * @constructor FastParser
 *
 * @param {object} [config] - optional settings; its enumerable properties are
 *                            copied one level deep onto this.config
 */
function FastParser(config) {
    var parser = this,
        ownConfig = {},
        key;

    // shallow copy, so that the caller's object is never shared with the parser
    parser.config = ownConfig;
    if (config) {
        for (key in config) {
            ownConfig[key] = config[key];
        }
    }

    // input pre-processing is enabled by default - no conversion needed here

    parser.listeners = {};
    parser.reset();
}
41 |
/**
 * @function FastParser#reset
 *
 * @returns this
 *
 * @description
 * Put every piece of parsing state back to its initial value, as if the
 * parser had just been created with the new operator. Registered listeners
 * and the config are left alone.
 */
FastParser.prototype.reset = function () {
    /* the current state of the state machine */
    this.state = stateMachine.State.STATE_DATA;

    /* the remembered tag names, and which of the two slots is being written */
    this.tags = ['', ''];
    this.tagIdx = 0;

    /* the current attribute name and value */
    this.attrName = '';
    this.attributeValue = null;

    this.input = '';
    this.inputLen = 0;

    return this;
};
61 |
/**
 * @function FastParser#on
 *
 * @param {string} eventType - the event type
 * @param {function} listener - the event listener
 * @returns this
 *
 * @description
 * Append the given listener to those registered for eventType.
 * Nothing is registered when no listener is given.
 *
 */
FastParser.prototype.on = function (eventType, listener) {
    var registry = this.listeners,
        registered = registry[eventType];

    if (!listener) {
        return this;
    }

    if (registered) {
        registered.push(listener);
    } else {
        registry[eventType] = [listener];
    }
    return this;
};
84 |
/**
 * @function FastParser#once
 *
 * @param {string} eventType - the event type (e.g., preWalk, reWalk, postWalk, ...)
 * @param {function} listener - the event listener
 * @returns this
 *
 * @description
 * Register the given listener for eventType so that it fires at most once:
 * what actually gets registered is a wrapper that unregisters itself before
 * handing its arguments over to the listener.
 *
 */
FastParser.prototype.once = function(eventType, listener) {
    var parser = this,
        fireOnce;

    if (!listener) {
        return parser;
    }

    fireOnce = function () {
        // unregister first, so the wrapper is gone before the listener runs
        parser.off(eventType, fireOnce);
        listener.apply(parser, arguments);
    };

    return parser.on(eventType, fireOnce);
};
107 |
/**
 * @function FastParser#off
 *
 * @param {string} eventType - the event type (e.g., preWalk, reWalk, postWalk, ...)
 * @param {function} listener - the event listener
 * @returns this
 *
 * @description
 * Remove the first registration of the given listener for eventType, so it
 * is no longer fired when that event happens. Unknown event types, unknown
 * listeners and a missing listener are silently ignored.
 *
 */
FastParser.prototype.off = function (eventType, listener) {
    var registered, position;

    if (listener) {
        registered = this.listeners[eventType];
        if (registered) {
            // indexOf compares with ===; only the first occurrence is dropped
            position = registered.indexOf(listener);
            if (position !== -1) {
                registered.splice(position, 1);
            }
        }
    }
    return this;
};
133 |
/**
 * @function FastParser#emit
 *
 * @param {function[]} listeners - the listeners to fire (e.g., this.listeners.reWalk)
 * @param {Array} [args] - the arguments handed to every listener (none by default)
 * @returns this
 *
 * @description
 * Fire the given listeners in order, with this parser as their context.
 * When there is more than one listener the list is copied first: a listener
 * may unregister itself while running (the wrappers created by once() do),
 * which shifts the live array and would otherwise skip the next listener and
 * run past the end of the list.
 *
 */
FastParser.prototype.emit = function (listeners, args) {
    var params = args || [],
        queue, count, i;

    if (listeners && (count = listeners.length)) {
        if (count === 1) {
            // common case: nothing can shift under a single listener, so no copy
            listeners[0].apply(this, params);
        } else {
            queue = Array.prototype.slice.call(listeners);
            for (i = 0; i < count; i++) {
                queue[i].apply(this, params);
            }
        }
    }
    return this;
};
155 |
/**
 * @function FastParser#walk
 *
 * @param {integer} i - the position of the current character in the input stream
 * @param {string} input - the input stream
 * @param {*} endsWithEOF - handed untouched to reWalk listeners as their third
 *                          argument (presumably a boolean - confirm against callers)
 * @returns {integer} the position of the current character; reconsuming
 *                    re-runs the transition on the same position
 *
 * @description
 * Perform one state transition for input[i]: look up the symbol of the
 * character, move to the next state, run the extra logic attached to the
 * transition, and repeat on the same character whenever the transition asks
 * for it to be reconsumed.
 */
FastParser.prototype.walk = function(i, input, endsWithEOF) {

    var ch = input[i],
        symbol = this.lookupChar(ch),
        extraLogic = stateMachine.lookupAltLogicFromSymbol[symbol][this.state],
        reconsume = stateMachine.lookupReconsumeFromSymbol[symbol][this.state];

    /* Set state based on the current head pointer symbol */
    this.state = stateMachine.lookupStateFromSymbol[symbol][this.state];

    /* See if there is any extra logic required for this state transition */
    switch (extraLogic) {
        case 1: this.createStartTag(ch); break;
        case 2: this.createEndTag(ch); break;
        case 3: this.appendTagName(ch); break;
        case 4: this.resetEndTag(ch); break;
        case 6: /* match end tag token with start tag token's tag name */
            if (this.tags[0].toLowerCase() === this.tags[1].toLowerCase()) {
                reconsume = 0; /* see 12.2.4.13 - switch state for the following case, otherwise, reconsume. */
                this.matchEndTagWithStartTag(symbol);
            }
            break;
        case 8: this.matchEscapedScriptTag(ch); break;
        case 11: this.processTagName(ch); break;
        case 12: this.createAttributeNameAndValueTag(ch); break;
        case 13: this.appendAttributeNameTag(ch); break;
        case 14: this.appendAttributeValueTag(ch); break;
    }

    if (reconsume) { /* reconsume the character */
        if (this.listeners.reWalk) {
            this.emit(this.listeners.reWalk, [this.state, i, endsWithEOF]);
        }
        /* keep forwarding endsWithEOF, so a further reWalk still reports it */
        return this.walk(i, input, endsWithEOF);
    }

    return i;
};
200 |
/**
 * @function FastParser#createStartTag
 *
 * @param {string} ch - the first character of the tag name
 *
 * @description
 * Begin a new start tag name: slot 0 of this.tags becomes ch and is made the
 * slot that appendTagName() writes to.
 */
FastParser.prototype.createStartTag = function (ch) {
    var START_TAG = 0;

    this.tagIdx = START_TAG;
    this.tags[START_TAG] = ch;
};
205 |
/**
 * @function FastParser#createEndTag
 *
 * @param {string} ch - the first character of the tag name
 *
 * @description
 * Begin a new end tag name: slot 1 of this.tags becomes ch and is made the
 * slot that appendTagName() writes to.
 */
FastParser.prototype.createEndTag = function (ch) {
    var END_TAG = 1;

    this.tagIdx = END_TAG;
    this.tags[END_TAG] = ch;
};
210 |
/**
 * @function FastParser#appendTagName
 *
 * @param {string} ch - the character to append
 *
 * @description
 * Append ch to the tag name currently being collected, i.e. the slot of
 * this.tags selected by this.tagIdx.
 */
FastParser.prototype.appendTagName = function (ch) {
    var slot = this.tagIdx;

    this.tags[slot] = this.tags[slot] + ch;
};
214 |
/**
 * @function FastParser#resetEndTag
 *
 * @param {string} ch - the current character (not used)
 *
 * @description
 * Empty the end tag name (slot 1 of this.tags) and make it the slot that
 * appendTagName() writes to.
 */
FastParser.prototype.resetEndTag = function (ch) {
    var END_TAG = 1;

    this.tagIdx = END_TAG;
    this.tags[END_TAG] = '';
};
219 |
/**
 * @function FastParser#matchEndTagWithStartTag
 *
 * @param {integer} symbol - the symbol of the current character, as looked up by walk()
 *
 * @description
 * Extra Logic #6, run by walk() when the current end tag token is an
 * appropriate end tag token (its name equals the start tag name, ignoring case).
 * Both remembered tag names are cleared, then the state is switched by symbol:
 *   WHITESPACE             - to the before attribute name state
 *   SOLIDUS (/)            - to the self-closing start tag state
 *   GREATER-THAN SIGN (>)  - to the data state
 * Any other symbol leaves the state as it is.
 */
FastParser.prototype.matchEndTagWithStartTag = function (symbol) {
    var symbols, states;

    this.tags[0] = '';
    this.tags[1] = '';

    symbols = stateMachine.Symbol;
    states = stateMachine.State;

    if (symbol === symbols.SPACE) {             /** Whitespaces */
        this.state = states.STATE_BEFORE_ATTRIBUTE_NAME;
    } else if (symbol === symbols.SOLIDUS) {    /** [/] */
        this.state = states.STATE_SELF_CLOSING_START_TAG;
    } else if (symbol === symbols.GREATER) {    /** [>] */
        this.state = states.STATE_DATA;
    }
};
244 |
245 | FastParser.prototype.matchEscapedScriptTag = function (ch) {
246 | /* switch to the script data double escaped state if we see
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
20 |
--------------------------------------------------------------------------------
/tests/unit/run-bug-spec.js:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2015, Yahoo! Inc. All rights reserved.
3 | Copyrights licensed under the New BSD License.
4 | See the accompanying LICENSE file for terms.
5 |
6 | Authors: Nera Liu
7 | Albert Yu
8 | Adonis Fung
9 | */
(function () {

    require("mocha");
    var expect = require("expect.js"),
        fs = require("fs");

    var config = {
        enableInputPreProcessing: false,
        enableCanonicalization: false,
        enableIEConditionalComments: false
    };

    /*
     * Build a Parser subclass whose prototype carries the given overrides.
     *
     * NOTE(review): the subclass constructor ignores its arguments, so the
     * config given to `new BuggyParser(config)` never reaches Parser, and
     * prototype.constructor points at Parser instead of the subclass. The
     * suite title suggests this sloppy wiring is the point of these tests -
     * confirm before "fixing" it.
     */
    function createBuggyParser(overrides) {
        var Parser = require("../../src/context-parser").Parser,
            BuggyParser = function () { Parser.call(this); },
            name;

        BuggyParser.prototype = Object.create(Parser.prototype);
        BuggyParser.prototype.constructor = Parser;
        for (name in overrides) {
            BuggyParser.prototype[name] = overrides[name];
        }
        return BuggyParser;
    }

    describe('HTML5 Context Parser with Buggy Subclass Prototype', function () {

        it('should not print char twice in reconsume logic test', function () {
            var file = "./tests/samples/tests/001.html",
                BuggyParser = createBuggyParser({
                    // record every character at its own position in the input
                    afterWalk: function (ch, i) {
                        if (!this.bytes) {
                            this.bytes = [];
                        }
                        this.bytes[i] = ch;
                    }
                }),
                parser = new BuggyParser(config),
                data = fs.readFileSync(file, 'utf-8'),
                o; // declared locally: it used to leak as an implicit global

            parser.contextualize(data);
            o = parser.bytes.join('');

            expect(o).not.to.match(/sscript/);
            expect(o).not.to.match(/script>>/);
            expect(o).not.to.match(/\/a>>/);
        });

        it('should not crash with "beforeWalk" returning out of bound index', function () {
            var BuggyParser = createBuggyParser({
                    beforeWalk: function () {
                        return 1000;
                    }
                }),
                parser = new BuggyParser(config);

            parser.contextualize('');
        });

        it('should not crash with "walk" returning out of bound index', function () {
            var BuggyParser = createBuggyParser({
                    walk: function () {
                        return 1000;
                    }
                }),
                parser = new BuggyParser(config);

            parser.contextualize('');
        });

    });

}());
75 |
--------------------------------------------------------------------------------
/tests/unit/run-command-spec.js:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2015, Yahoo! Inc. All rights reserved.
3 | Copyrights licensed under the New BSD License.
4 | See the accompanying LICENSE file for terms.
5 |
6 | Authors: Nera Liu
7 | Albert Yu
8 | Adonis Fung
9 | */
(function () {

    require("mocha");
    var expect = require("expect.js"),
        exec = require('child_process').exec;

    /* how long (ms) each test waits before reporting itself as finished */
    var DONE_DELAY_MS = 100;

    /*
     * Launch `command` and, when it exits without error, check its stdout
     * against `stdoutPattern` (no check when the pattern is omitted).
     *
     * NOTE(review): done() is fired by a fixed timer, not by the exec
     * callback, and an exec error is ignored. A command that fails, or that
     * needs longer than the delay, therefore cannot fail its test. That
     * looks like a deliberate best-effort smoke test and is kept as is -
     * confirm, and consider calling done(error) from the callback instead.
     */
    function smokeTest(command, stdoutPattern, done) {
        exec(command, function (error, stdout) {
            if (error === null && stdoutPattern) {
                expect(stdout).to.match(stdoutPattern);
            }
        });
        setTimeout(function () {
            done();
        }, DONE_DELAY_MS);
    }

    describe('HTML5 Context Parser Command Line Utility', function () {

        it("should run benchmark command without error", function (done) {
            smokeTest('./bin/benchmark', /^context-parser runs at a speed of/, done);
        });

        it("should run context-dump command without error", function (done) {
            var file = "./tests/samples/tests/001.html";
            smokeTest('./bin/context-dump ' + file, null, done);
        });

        it("should run state-inspector command without error", function (done) {
            smokeTest('./bin/state-inspector 1 1',
                /{ ch: '1', symbol: 12, newState: 1, reconsume: 0, extraLogic: 0 }/,
                done);
        });
    });
}());
66 |
--------------------------------------------------------------------------------
/tests/unit/run-functions-spec.js:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2015, Yahoo! Inc. All rights reserved.
3 | Copyrights licensed under the New BSD License.
4 | See the accompanying LICENSE file for terms.
5 |
6 | Authors: Nera Liu
7 | Albert Yu
8 | Adonis Fung
9 | */
10 | (function () {
11 |
12 | require("mocha");
13 | var assert = require("assert"),
14 | expect = require("expect.js"),
15 | Parser = require("../../src/context-parser").Parser,
16 | FastParser = require("../../src/context-parser").FastParser;
17 |
18 | var config = {
19 | enableInputPreProcessing: false,
20 | enableCanonicalization: false,
21 | enableIEConditionalComments: false
22 | };
23 |
24 | describe('HTML5 Context Parser Functions', function() {
25 |
describe('#getStates', function(){
    // NOTE(review): the fixture is an empty string although a full state list
    // is expected - the markup looks stripped from this copy; restore from upstream.
    it('should parse ', function(){
        var parser = new Parser(config);

        parser.contextualize("");
        assert.equal(parser.getStates().toString(), '1,8,10,10,10,10,1,8,9,10,10,10,10,1');
    });
});
35 |
describe('#setCurrentState', function(){
    // smoke test only: the call must exist and must not throw
    it('should exist)', function(){
        new Parser(config).setCurrentState(10);
    });
});
describe('#setInitState and #getInitState', function(){

    // a state set explicitly is read back unchanged
    it('should exist and set state', function(){
        var parser = new Parser(config);

        parser.setInitState(10);
        assert.equal(parser.getInitState(), 10);
    });

    // without an explicit init state, state 1 is reported after parsing
    it('should get state', function(){
        var parser = new Parser(config);

        parser.contextualize("");
        assert.equal(parser.getInitState(), 1);
    });
});
60 |
describe('#getLastState', function(){

    // state 1 is expected as the last state once the input is consumed
    it('should get last state', function(){
        var parser = new Parser(config);

        parser.contextualize("");
        assert.equal(parser.getLastState(), 1);
    });
});
71 |
describe('#getAttributeName', function(){

    // NOTE(review): every fixture below is an empty string although an
    // attribute name is expected - the markup looks stripped from this copy;
    // restore from upstream.
    var cases = [
        { title: 'should get attribute name following with quoted attribute value',
          html: "", expected: 'class' },
        { title: 'should get attribute name following with double quoted attribute value',
          html: '', expected: 'class' },
        { title: 'should get attribute name following with unquoted attribute value',
          html: "", expected: 'class' },
        { title: 'should get second attribute name',
          html: "", expected: 'style' },
        { title: 'should get second attribute name (double quoted attribute value)',
          html: "", expected: 'style' },
        { title: 'should get second attribute name (unquoted attribute value)',
          html: "", expected: 'style' }
    ];

    // one test per row, each on a fresh parser
    cases.forEach(function (testCase) {
        it(testCase.title, function(){
            var parser = new Parser(config);

            parser.contextualize(testCase.html);
            assert.equal(parser.getAttributeName(), testCase.expected);
        });
    });
});
describe('#getAttributeValue', function(){

    // NOTE(review): every fixture below is an empty string although an
    // attribute value is expected - the markup looks stripped from this copy;
    // restore from upstream.
    var cases = [
        { title: 'should get attribute value (quoted)', html: "", expected: 'classname' },
        { title: 'should get attribute value (double quoted)', html: '', expected: 'classname' },
        { title: 'should get attribute value (unquoted)', html: "", expected: 'classname' },
        { title: 'should get 2nd attribute value', html: "", expected: 'color:red' },
        { title: 'should get 2nd attribute value (double quoted)', html: '', expected: 'color:red' },
        { title: 'should get 2nd attribute value (unquoted)', html: "", expected: 'color:red' }
    ];

    // one test per row, each on a fresh parser
    cases.forEach(function (testCase) {
        it(testCase.title, function(){
            var parser = new Parser(config);

            parser.contextualize(testCase.html);
            assert.equal(parser.getAttributeValue(), testCase.expected);
        });
    });
});
162 |
describe('#lookupChar', function(){
    it('should match symbol lookup table', function(){
        // [character, expected symbol], checked in this order on one parser
        var expectations = [
                ['\t', 0], ['\n', 0], ['\f', 0], [' ', 0],
                ['!', 1],
                ['"', 2],
                ['&', 3],
                ['\'', 4],
                ['-', 5],
                ['/', 6],
                ['<', 7],
                ['=', 8],
                ['>', 9],
                ['?', 10],
                ['a', 11], ['z', 11], ['A', 11], ['Z', 11],
                ['1', 12]
            ],
            parser = new Parser(config),
            idx;

        for (idx = 0; idx < expectations.length; idx++) {
            assert.equal(parser.lookupChar(expectations[idx][0]), expectations[idx][1]);
        }
    });
});
206 |
describe('#getStartTagName', function(){

    // NOTE(review): the fixture is an empty string although the tag name
    // 'div' is expected - the markup looks stripped from this copy; restore
    // from upstream.
    it('should return start tag name', function(){
        var parser = new Parser(config);

        parser.contextualize("");
        assert.equal(parser.getStartTagName(), 'div');
    });

});
218 |
219 | describe('#getCurrentTagIndex and #getCurrentTag', function(){
220 |
221 | it('should return correct tag name/index', function(){
222 |
223 | [ { html: "", tag0: 'div', tag1: 'div', index: 1},
224 | { html: " ", tag0: 'div', tag1: 'div', index: 1},
225 | { html: "
", tag0: 'img', tag1: 'div', index: 0},
226 | { html: "![]()
![]()
1 < 2";
261 | fastParser.contextualize(html);
262 | expect(fastParser.state, 10)
263 | });
264 |
265 | });
266 | }());
267 |
--------------------------------------------------------------------------------
/tests/unit/run-states-spec.js:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2015, Yahoo! Inc. All rights reserved.
3 | Copyrights licensed under the New BSD License.
4 | See the accompanying LICENSE file for terms.
5 |
6 | Authors: Nera Liu
7 | Albert Yu
8 | Adonis Fung
9 | */
10 | (function () {
11 |
12 | require("mocha");
13 | var assert = require("assert"),
14 | Parser = require("../../src/context-parser").Parser;
15 |
16 | var config = {
17 | enableInputPreProcessing: false,
18 | enableCanonicalization: false,
19 | enableIEConditionalComments: false
20 | };
21 |
22 | describe('HTML5 Context Parser StateMachine', function() {
23 |
24 | // https://html.spec.whatwg.org/multipage/syntax.html#tokenization
25 | it('should parse {}', function(){
26 | var p1 = new Parser(config);
27 | var html = "{}";
28 | p1.contextualize(html);
29 | var states = p1.getStates();
30 | assert.equal(states.toString(), '1,8,10,10,10,10,1,1,1,8,9,10,10,10,10,1');
31 | });
32 |
33 | it('should parse attribute name', function(){
34 | var p1 = new Parser(config);
35 | var html = "