├── README.md
├── demo.html
└── htmlparser.js
/README.md:
--------------------------------------------------------------------------------
1 | # Pure JavaScript HTML5 Parser #
2 |
3 |
4 | A working demo can be seen [here](http://htmlpreview.github.io/?https://github.com/blowsie/Pure-JavaScript-HTML-Parser/blob/master/demo.html).
5 |
6 | _Credit goes to John Resig for his [code](http://ejohn.org/blog/pure-javascript-html-parser/) written back in 2008 and Erik Arvidsson for his [code](http://erik.eae.net/simplehtmlparser/simplehtmlparser.js) written prior to that._
7 |
8 | This code has been updated to work with HTML 5 to fix several problems.
9 |
10 |
11 |
12 |
13 | ## 4 Libraries in One! ##
14 |
15 | ### A SAX-style API ###
16 |
17 | Handles tag, text, and comments with callbacks. For example, let’s say you wanted to implement a simple HTML to XML serialization scheme – you could do so using the following:
18 |
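19 |     var results = "";
20 |
21 |     HTMLParser("<p id=test>hello <i>world", {
22 |       start: function( tag, attrs, unary ) {
23 |         results += "<" + tag;
24 |
25 |         for ( var i = 0; i < attrs.length; i++ )
26 |           results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';
27 |
28 |         results += (unary ? "/" : "") + ">";
29 |       },
30 |       end: function( tag ) {
31 |         results += "</" + tag + ">";
32 |       },
33 |       chars: function( text ) {
34 |         results += text;
35 |       },
36 |       comment: function( text ) {
37 |         results += "<!--" + text + "-->";
38 |       }
39 |     });
40 |
41 |     results == '<p id="test">hello <i>world</i></p>'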
42 |
43 | ### XML Serializer ###
44 |
45 | Now, there’s no need to worry about implementing the above, since it’s included directly in the library, as well. Just feed in HTML and it spits back an XML string.
46 |
47 |     var results = HTMLtoXML("<p>Data: <input disabled>")
48 |     results == '<p>Data: <input disabled="disabled"></p>'
49 |
50 |
51 | ### DOM Builder ###
52 |
53 | If you’re using the HTML parser to inject into an existing DOM document (or within an existing DOM element) then htmlparser.js provides a simple method for handling that:
54 |
55 |     // The following is appended into the document body
56 |     HTMLtoDOM("<p>Hello <b>World", document)
57 |
58 |     // The following is appended into the specified element
59 |     HTMLtoDOM("<p>Hello <b>World", document.getElementById("test"))
60 |
61 |
62 | ### DOM Document Creator ###
63 |
64 | This is a more-advanced version of the DOM builder – it includes logic for handling the overall structure of a web page, returning a new DOM document.
65 |
66 | A couple points are enforced by this method:
67 |
68 | - There will always be a html, head, body, and title element.
69 | - There will only be one html, head, body, and title element (if the user specifies more, they will be moved to the appropriate locations and merged).
70 | - link and base elements are forced into the head.
71 |
72 | You would use the method like so:
73 |
74 |     var dom = HTMLtoDOM("<p>Data: <input disabled>");
75 |     dom.getElementsByTagName("body").length == 1
76 |     dom.getElementsByTagName("p").length == 1
77 |
78 |
79 | While this library doesn’t cover the full gamut of possible weirdness that HTML provides, it does handle a lot of the most obvious stuff. All of the following are accounted for:
80 |
81 | **Unclosed Tags:**
82 |
83 |     HTMLtoXML("<p><b>Hello") == '<p><b>Hello</b></p>'
While this library doesn't cover the full gamut of possible weirdness that HTML provides, it does handle a lot of the most obvious stuff. All of the following are accounted for:
Note: It does not take into account where in the document an element should exist. Right now you can put block elements in a head or th inside a p and it'll happily accept them. It's not entirely clear how the logic should work for those, but it's something that I'm open to exploring.
55 |
56 |
57 |
58 |
59 |
60 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
/htmlparser.js:
--------------------------------------------------------------------------------
1 | /*
2 | * HTML5 Parser By Sam Blowes
3 | *
4 | * Designed for HTML5 documents
5 | *
6 | * Original code by John Resig (ejohn.org)
7 | * http://ejohn.org/blog/pure-javascript-html-parser/
8 | * Original code by Erik Arvidsson, Mozilla Public License
9 | * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
10 | *
11 | * ----------------------------------------------------------------------------
12 | * License
13 | * ----------------------------------------------------------------------------
14 | *
15 | * This code is triple licensed using Apache Software License 2.0,
16 | * Mozilla Public License or GNU Public License
17 | *
18 | * ////////////////////////////////////////////////////////////////////////////
19 | *
20 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not
21 | * use this file except in compliance with the License. You may obtain a copy
22 | * of the License at http://www.apache.org/licenses/LICENSE-2.0
23 | *
24 | * ////////////////////////////////////////////////////////////////////////////
25 | *
26 | * The contents of this file are subject to the Mozilla Public License
27 | * Version 1.1 (the "License"); you may not use this file except in
28 | * compliance with the License. You may obtain a copy of the License at
29 | * http://www.mozilla.org/MPL/
30 | *
31 | * Software distributed under the License is distributed on an "AS IS"
32 | * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
33 | * License for the specific language governing rights and limitations
34 | * under the License.
35 | *
36 | * The Original Code is Simple HTML Parser.
37 | *
38 | * The Initial Developer of the Original Code is Erik Arvidsson.
39 | * Portions created by Erik Arvidssson are Copyright (C) 2004. All Rights
40 | * Reserved.
41 | *
42 | * ////////////////////////////////////////////////////////////////////////////
43 | *
44 | * This program is free software; you can redistribute it and/or
45 | * modify it under the terms of the GNU General Public License
46 | * as published by the Free Software Foundation; either version 2
47 | * of the License, or (at your option) any later version.
48 | *
49 | * This program is distributed in the hope that it will be useful,
50 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
51 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
52 | * GNU General Public License for more details.
53 | *
54 | * You should have received a copy of the GNU General Public License
55 | * along with this program; if not, write to the Free Software
56 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
57 | *
58 | * ----------------------------------------------------------------------------
59 | * Usage
60 | * ----------------------------------------------------------------------------
61 | *
62 | * // Use like so:
63 | * HTMLParser(htmlString, {
64 | * start: function(tag, attrs, unary) {},
65 | * end: function(tag) {},
66 | * chars: function(text) {},
67 | * comment: function(text) {}
68 | * });
69 | *
70 | * // or to get an XML string:
71 | * HTMLtoXML(htmlString);
72 | *
73 | * // or to get an XML DOM Document
74 | * HTMLtoDOM(htmlString);
75 | *
76 | * // or to inject into an existing document/DOM node
77 | * HTMLtoDOM(htmlString, document);
78 | * HTMLtoDOM(htmlString, document.body);
79 | *
80 | */
81 |
(function () {

    // Object the public API (HTMLParser, HTMLtoXML, HTMLtoDOM) is attached to.
    // In sloppy-mode scripts `this` is the global object, exactly as before.
    // In strict mode / ES modules `this` is undefined, so fall back to an
    // explicitly resolved global instead of throwing on the first assignment.
    var root = this ||
        (typeof globalThis !== "undefined" ? globalThis :
         typeof self !== "undefined" ? self :
         typeof window !== "undefined" ? window :
         typeof global !== "undefined" ? global : null);

    // Regular Expressions for parsing tags and attributes
    var startTag = /^<([-A-Za-z0-9_]+)((?:\s+[a-zA-Z_:][-a-zA-Z0-9_:.]*(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,
        endTag = /^<\/([-A-Za-z0-9_]+)[^>]*>/,
        attr = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;

    // Empty Elements - HTML 5
    var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,link,meta,param,embed,command,keygen,source,track,wbr");

    // Block Elements - HTML 5
    var block = makeMap("a,address,article,applet,aside,audio,blockquote,button,canvas,center,dd,del,dir,div,dl,dt,fieldset,figcaption,figure,footer,form,frameset,h1,h2,h3,h4,h5,h6,header,hgroup,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,output,p,pre,section,script,table,tbody,td,tfoot,th,thead,tr,ul,video");

    // Inline Elements - HTML 5
    var inline = makeMap("abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");

    // Elements that you can, intentionally, leave open
    // (and which close themselves)
    // NOTE: the element is "option"; the previous entry "options" never matched.
    var closeSelf = makeMap("colgroup,dd,dt,li,option,p,td,tfoot,th,thead,tr");

    // Attributes that have their values filled in disabled="disabled"
    var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");

    // Special Elements (can contain anything)
    var special = makeMap("script,style");

    /**
     * SAX-style HTML parser: walks `html` from left to right and reports what
     * it finds through the optional callbacks on `handler`.
     *
     * @param {string} html      Markup to parse.
     * @param {Object} [handler] Callbacks, all optional:
     *   start(tagName, attrs, unary) - attrs is an array of {name, value, escaped}
     *   end(tagName)
     *   chars(text)
     *   comment(text)
     * @throws {string} "Parse Error: <remaining input>" when an iteration
     *   consumes nothing (e.g. a stray "<" that does not start a tag).
     */
    var HTMLParser = root.HTMLParser = function (html, handler) {
        var index, chars, match, text, stack = [], last = html;

        handler = handler || {};

        stack.last = function () {
            return this[this.length - 1];
        };

        while (html) {
            chars = true;

            // Make sure we're not in a script or style element
            if (!stack.last() || !special[stack.last()]) {

                // Comment
                if (html.indexOf("<!--") === 0) {
                    index = html.indexOf("-->");

                    if (index >= 0) {
                        if (handler.comment) {
                            handler.comment(html.substring(4, index));
                        }
                        html = html.substring(index + 3);
                        chars = false;
                    }

                // end tag
                } else if (html.indexOf("</") === 0) {
                    match = html.match(endTag);

                    if (match) {
                        html = html.substring(match[0].length);
                        parseEndTag(match[0], match[1]);
                        chars = false;
                    }

                // start tag
                } else if (html.indexOf("<") === 0) {
                    match = html.match(startTag);

                    if (match) {
                        html = html.substring(match[0].length);
                        parseStartTag(match[0], match[1], match[2], match[3]);
                        chars = false;
                    }
                }

                // Nothing tag-like was consumed: emit everything up to the
                // next "<" as text.
                if (chars) {
                    index = html.indexOf("<");

                    text = index < 0 ? html : html.substring(0, index);
                    html = index < 0 ? "" : html.substring(index);

                    if (handler.chars) {
                        handler.chars(text);
                    }
                }

            } else {
                // Raw-text element (script/style): take everything up to its
                // closing tag verbatim. The "i" flag is required because the
                // stack holds lower-cased names while the source may say
                // </SCRIPT>.
                html = html.replace(new RegExp("([\\s\\S]*?)<\/" + stack.last() + "[^>]*>", "i"), function (all, raw) {
                    // Unwrap comment / CDATA guards around the raw content
                    raw = raw.replace(/<!--([\s\S]*?)-->|<!\[CDATA\[([\s\S]*?)]]>/g, "$1$2");
                    if (handler.chars) {
                        handler.chars(raw);
                    }

                    return "";
                });

                parseEndTag("", stack.last());
            }

            // No progress was made in this iteration. A string is thrown
            // (not an Error) to stay compatible with existing callers.
            if (html === last) {
                throw "Parse Error: " + html;
            }
            last = html;
        }

        // Clean up any remaining tags
        parseEndTag();

        // Handle an opening tag: apply the implicit-close rules, track the
        // element on the stack and report it with its parsed attributes.
        function parseStartTag(tag, tagName, rest, unary) {
            tagName = tagName.toLowerCase();

            // A block element implicitly closes any open inline elements
            if (block[tagName]) {
                while (stack.last() && inline[stack.last()]) {
                    parseEndTag("", stack.last());
                }
            }

            // <li>a<li>b: the second <li> closes the first
            if (closeSelf[tagName] && stack.last() === tagName) {
                parseEndTag("", tagName);
            }

            unary = empty[tagName] || !!unary;

            if (!unary) {
                stack.push(tagName);
            }

            if (handler.start) {
                var attrs = [];

                rest.replace(attr, function (whole, name, doubleQuoted, singleQuoted, unquoted) {
                    var value = doubleQuoted ? doubleQuoted :
                        singleQuoted ? singleQuoted :
                        unquoted ? unquoted :
                        fillAttrs[name] ? name : "";

                    attrs.push({
                        name: name,
                        value: value,
                        escaped: value.replace(/(^|[^\\])"/g, '$1\\\"') //"
                    });
                });

                handler.start(tagName, attrs, unary);
            }
        }

        // Handle a closing tag. Without a tag name every open element is
        // closed; otherwise everything up to and including the closest open
        // element of that name is closed. Unmatched closing tags are ignored.
        function parseEndTag(tag, tagName) {
            var pos, i;

            if (tagName !== undefined) {
                tagName = tagName.toLowerCase();
            }

            if (!tagName) {
                // If no tag name is provided, clean shop
                pos = 0;
            } else {
                // Find the closest opened tag of the same type
                for (pos = stack.length - 1; pos >= 0; pos--) {
                    if (stack[pos] === tagName) {
                        break;
                    }
                }
            }

            if (pos >= 0) {
                // Close all the open elements, up the stack
                for (i = stack.length - 1; i >= pos; i--) {
                    if (handler.end) {
                        handler.end(stack[i]);
                    }
                }

                // Remove the open elements from the stack
                stack.length = pos;
            }
        }
    };

    /**
     * Serialise HTML into a well-nested markup string: unclosed elements are
     * closed, attribute values are quoted and value-less boolean attributes
     * are filled in (disabled -> disabled="disabled").
     *
     * @param {string} html Markup to normalise.
     * @returns {string} The re-serialised markup.
     */
    root.HTMLtoXML = function (html) {
        var results = "";

        HTMLParser(html, {
            start: function (tag, attrs, unary) {
                results += "<" + tag;

                for (var i = 0; i < attrs.length; i++) {
                    results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';
                }
                results += ">";
            },
            end: function (tag) {
                results += "</" + tag + ">";
            },
            chars: function (text) {
                results += text;
            },
            comment: function (text) {
                results += "<!--" + text + "-->";
            }
        });

        return results;
    };

    /**
     * Parse HTML into DOM nodes.
     *
     * @param {string} html  Markup to parse.
     * @param {Object} [doc] A document or a node. When a document is given the
     *   content is appended to its body; when an element is given the content
     *   is appended to that element. When omitted, a new document is created.
     * @returns {Object} The document the nodes were created in.
     * @throws {Error} When `doc` is omitted and no DOM implementation exists.
     */
    root.HTMLtoDOM = function (html, doc) {
        // There can be only one of these elements
        var one = makeMap("html,head,body,title");

        // Enforce a structure for the document (prototype-less so that tag
        // names such as "constructor" are not mistaken for entries)
        var structure = Object.create(null);
        structure.link = "head";
        structure.base = "head";

        // Remember what the caller handed in before it is normalised to its
        // owner document, so an element target can be honoured below.
        var target = doc;

        if (!doc) {
            if (typeof DOMDocument != "undefined")
                doc = new DOMDocument();
            else if (typeof document != "undefined" && document.implementation && document.implementation.createDocument)
                doc = document.implementation.createDocument("", "", null);
            else if (typeof ActiveXObject != "undefined")
                doc = new ActiveXObject("Msxml.DOMDocument");

        } else
            doc = doc.ownerDocument ||
                doc.getOwnerDocument && doc.getOwnerDocument() ||
                doc;

        if (!doc) {
            throw new Error("HTMLtoDOM: no document was supplied and no DOM implementation is available");
        }

        var elems = [],
            documentElement = doc.documentElement ||
                doc.getDocumentElement && doc.getDocumentElement();

        // If we're dealing with an empty document then we
        // need to pre-populate it with the HTML document structure
        if (!documentElement && doc.createElement) (function () {
            var htmlElem = doc.createElement("html");
            var head = doc.createElement("head");
            head.appendChild(doc.createElement("title"));
            htmlElem.appendChild(head);
            htmlElem.appendChild(doc.createElement("body"));
            doc.appendChild(htmlElem);
        })();

        // Find all the unique elements
        if (doc.getElementsByTagName)
            for (var key in one)
                one[key] = doc.getElementsByTagName(key)[0];

        // Node that receives top-level content: the element the caller passed
        // in, otherwise the body of the document.
        var rootParent = (target && target !== doc && target.nodeType === 1 && target.appendChild) ?
            target : one.body;

        var curParentNode = rootParent;

        HTMLParser(html, {
            start: function (tagName, attrs, unary) {
                // If it's a pre-built element, then we can ignore
                // its construction
                if (one[tagName]) {
                    curParentNode = one[tagName];
                    if (!unary) {
                        elems.push(curParentNode);
                    }
                    return;
                }

                var elem = doc.createElement(tagName);

                // attrs is an array: iterate by index, not for...in
                for (var i = 0; i < attrs.length; i++)
                    elem.setAttribute(attrs[i].name, attrs[i].value);

                var forced = structure[tagName] ? one[structure[tagName]] : undefined;

                if (forced && typeof forced != "boolean")
                    forced.appendChild(elem);

                else if (curParentNode && curParentNode.appendChild)
                    curParentNode.appendChild(elem);

                if (!unary) {
                    elems.push(elem);
                    curParentNode = elem;
                }
            },
            end: function (tag) {
                elems.pop();

                // Back to the enclosing element, or to the root target once
                // every element opened by this parse has been closed, so that
                // following siblings and text are not lost.
                curParentNode = elems.length ? elems[elems.length - 1] : rootParent;
            },
            chars: function (text) {
                if (curParentNode && curParentNode.appendChild)
                    curParentNode.appendChild(doc.createTextNode(text));
            },
            comment: function (text) {
                // Comments are not copied into the DOM.
            }
        });

        return doc;
    };

    /**
     * Turn a comma-separated list into a lookup table { item: true }.
     * The table has no prototype, so names such as "constructor" or
     * "toString" are not reported as members by accident.
     */
    function makeMap(str) {
        var obj = Object.create(null), items = str.split(",");
        for (var i = 0; i < items.length; i++)
            obj[items[i]] = true;
        return obj;
    }
})();
369 |
--------------------------------------------------------------------------------