├── .gitignore ├── CHANGELOG ├── LICENSE ├── README.md ├── TODO ├── json2.js ├── lib └── htmlparser.js ├── package.json ├── runtests.html ├── runtests.js ├── snippet.js ├── tests ├── html.js ├── parser.js ├── rss.js └── testutils.js └── utils_example.js /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .project* 3 | .settings/* 4 | old/* 5 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | v2.0.0 2 | * Brand new parser, handles edge cases old parser did not 3 | * Parser handlers renamed to builders 4 | * Builder method signature simplified 5 | * Moved element position calculation to builders for efficiency 6 | * Added case-sensitivity options for tag and attribute names 7 | * Parser output minimized (unnecessary values removed) 8 | * Element attribute list renamed from attribs to attributes 9 | * Node types consolidated; "script" and "style" moved to "tag" 10 | * An order of magnitude more tests, with many targeting the parser rather than just the builders 11 | * Tests consolidated into single files per test type (e.g. parser tests, html tests, rss tests) 12 | * Testing code rewritten (e.g. direct object comparator instead of comparison of an object's JSON) 13 | * Brand new bugs! 
(not sure what they are yet but I am sure there are at least a few) 14 | 15 | v1.7.6 16 | * Removed "os" entry from package.json 17 | 18 | v1.7.5 19 | * Fixed case sensitivity of tag names in DefaultHandler, fixed README.md formatting 20 | 21 | v1.7.4 22 | * Updated copyright dates 23 | 24 | v1.7.3 25 | * Renamed node-htmlparser.* to htmlparser.* and created shims for people still expecting node-htmlparser.* 26 | 27 | v1.7.2 28 | * Document position feature fixed to work correctly with chunked parsing 29 | 30 | v1.7.1 31 | * Document position feature disabled until it works correctly with chunked parsing 32 | 33 | v1.7.0 34 | * Empty tag checking switched to being case insensitive [fgnass] 35 | * Added feature to include document position (row, col) in element data [fgnass] 36 | * Added parser option "includeLocation" to enable document position data 37 | 38 | v1.6.4 39 | * Fixed 'prevElement' error [Swizec] 40 | 41 | v1.6.3 42 | * Updated to support being an npm package 43 | * Fixed DomUtils.testElement() 44 | 45 | v1.6.1 46 | * Optimized DomUtils by up to 2-3x 47 | 48 | v1.6.0 49 | * Added support for RSS/Atom feeds 50 | 51 | v1.5.0 52 | * Added DefaultHandler option "enforceEmptyTags" so that XML can be parsed correctly 53 | 54 | v1.4.2 55 | * Added tests for parsing XML with namespaces 56 | 57 | v1.4.1 58 | * Added minified version 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2010 - 2012, Chris Winberry . All rights reserved. 
2 | Permission is hereby granted, free of charge, to any person obtaining a copy 3 | of this software and associated documentation files (the "Software"), to 4 | deal in the Software without restriction, including without limitation the 5 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 6 | sell copies of the Software, and to permit persons to whom the Software is 7 | furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in 10 | all copies or substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 17 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 18 | IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #NodeHtmlParser 2 | A forgiving HTML/XML/RSS parser written in JS for both the browser and NodeJS (yes, despite the name it works just fine in any modern browser). The parser can handle streams (chunked data) and supports custom handlers for writing custom DOMs/output. 
3 | 4 | ##Installing 5 | 6 | npm install htmlparser 7 | 8 | ##Running Tests 9 | 10 | ###Run tests under node: 11 | node runtests.js 12 | 13 | ###Run tests in browser: 14 | View runtests.html in any browser 15 | 16 | ##Usage In Node 17 | 18 | ```javascript 19 | var htmlparser = require("htmlparser"); 20 | var rawHtml = "Xyz 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 237 | 238 | 239 | -------------------------------------------------------------------------------- /runtests.js: -------------------------------------------------------------------------------- 1 | /*********************************************** 2 | Copyright 2010 - 2012, Chris Winberry . All rights reserved. 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to 5 | deal in the Software without restriction, including without limitation the 6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | sell copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | IN THE SOFTWARE. 
20 | ***********************************************/ 21 | 22 | Object.prototype.equals = function (x) { 23 | //http://stackoverflow.com/questions/1068834/object-comparison-in-javascript 24 | var p; 25 | 26 | for (p in this) { 27 | if (typeof(x[p]) == 'undefined') { 28 | // console.log('Missing property: ', p); 29 | return false; 30 | } 31 | } 32 | 33 | for (p in x) { 34 | if (typeof(this[p]) == 'undefined') { 35 | // console.log('Extra property: ', p); 36 | return false; 37 | } 38 | } 39 | 40 | for (p in this) { 41 | if (this[p]) { 42 | switch(typeof(this[p])) { 43 | case 'object': 44 | if (!this[p].equals(x[p])) { 45 | // console.log('Mismatched property: ', p); 46 | return false; 47 | } 48 | break; 49 | case 'function': 50 | if (typeof(x[p])=='undefined' || (p != 'equals' && this[p].toString() != x[p].toString())) { 51 | // console.log('Mismatched property: ', p); 52 | return false; 53 | } 54 | break; 55 | default: 56 | if (this[p] != x[p]) { 57 | // console.log('Mismatched property: ', p); 58 | return false; 59 | } 60 | } 61 | } else { 62 | if (x[p]) { 63 | // console.log('Poop: ', p); 64 | return false; 65 | } 66 | } 67 | } 68 | 69 | return true; 70 | } 71 | 72 | var util = require("util"); 73 | var fs = require("fs"); 74 | var htmlparser = require("./lib/htmlparser"); 75 | 76 | var testUtils = require('./tests/testutils'); 77 | var htmlTests = require('./tests/html'); 78 | var rssTests = require('./tests/rss'); 79 | var parserTests = require('./tests/parser'); 80 | 81 | var testResults = {}; 82 | 83 | testUtils.runBuilderTests( 84 | htmlTests 85 | , htmlparser.Parser 86 | , htmlparser.HtmlBuilder 87 | , null 88 | , function (testName, testResult, actual, expected) { 89 | console.log("[" + testName + "]: " + (testResult ? 
"passed" : "FAILED")); 90 | }, function (elapsed, passed, failed) { 91 | testResults['HTML builder'] = { 92 | elapsed: elapsed 93 | , passed: passed 94 | , failed: failed 95 | }; 96 | }); 97 | testUtils.runBuilderTests( 98 | htmlTests 99 | , htmlparser.Parser 100 | , htmlparser.HtmlBuilder 101 | , function (test) { 102 | var newTest = {}; 103 | for (var key in test) { 104 | if (!test.hasOwnProperty(key)) { 105 | continue; 106 | } 107 | newTest[key] = (key === 'data') ? 108 | test.data.join('').split('') 109 | : 110 | test[key] 111 | ; 112 | } 113 | return newTest; 114 | } 115 | , function (testName, testResult, actual, expected) { 116 | console.log("[" + testName + "]: " + (testResult ? "passed" : "FAILED")); 117 | }, function (elapsed, passed, failed) { 118 | testResults['HTML builder (streamed)'] = { 119 | elapsed: elapsed 120 | , passed: passed 121 | , failed: failed 122 | }; 123 | }); 124 | 125 | testUtils.runBuilderTests( 126 | rssTests 127 | , htmlparser.Parser 128 | , htmlparser.RssBuilder 129 | , null 130 | , function (testName, testResult, actual, expected) { 131 | console.log("[" + testName + "]: " + (testResult ? "passed" : "FAILED")); 132 | }, function (elapsed, passed, failed) { 133 | testResults['RSS builder'] = { 134 | elapsed: elapsed 135 | , passed: passed 136 | , failed: failed 137 | }; 138 | }); 139 | testUtils.runBuilderTests( 140 | rssTests 141 | , htmlparser.Parser 142 | , htmlparser.RssBuilder 143 | , function (test) { 144 | var newTest = {}; 145 | for (var key in test) { 146 | if (!test.hasOwnProperty(key)) { 147 | continue; 148 | } 149 | newTest[key] = (key === 'data') ? 150 | test.data.join('').split('') 151 | : 152 | test[key] 153 | ; 154 | } 155 | return newTest; 156 | } 157 | , function (testName, testResult, actual, expected) { 158 | console.log("[" + testName + "]: " + (testResult ? 
"passed" : "FAILED")); 159 | }, function (elapsed, passed, failed) { 160 | testResults['RSS builder (streamed)'] = { 161 | elapsed: elapsed 162 | , passed: passed 163 | , failed: failed 164 | }; 165 | }); 166 | 167 | testUtils.runParserTests( 168 | parserTests 169 | , htmlparser.Parser 170 | , null 171 | , function (testName, testResult, actual, expected) { 172 | console.log("[" + testName + "]: " + (testResult ? "passed" : "FAILED")); 173 | }, function (elapsed, passed, failed) { 174 | testResults['Parser'] = { 175 | elapsed: elapsed 176 | , passed: passed 177 | , failed: failed 178 | }; 179 | }); 180 | testUtils.runParserTests( 181 | parserTests 182 | , htmlparser.Parser 183 | , function (test) { 184 | var newTest = {}; 185 | for (var key in test) { 186 | if (!test.hasOwnProperty(key)) { 187 | continue; 188 | } 189 | newTest[key] = (key === 'data') ? 190 | test.data.join('').split('') 191 | : 192 | test[key] 193 | ; 194 | } 195 | return newTest; 196 | } 197 | , function (testName, testResult, actual, expected) { 198 | console.log("[" + testName + "]: " + (testResult ? "passed" : "FAILED")); 199 | }, function (elapsed, passed, failed) { 200 | testResults['Parser (streamed)'] = { 201 | elapsed: elapsed 202 | , passed: passed 203 | , failed: failed 204 | }; 205 | }); 206 | 207 | testUtils.runStreamingParserTests( 208 | parserTests 209 | , htmlparser.Parser 210 | , null 211 | , function (testName, testResult, actual, expected) { 212 | console.log("[" + testName + "]: " + (testResult ? "passed" : "FAILED")); 213 | }, function (elapsed, passed, failed) { 214 | testResults['StreamingParser'] = { 215 | elapsed: elapsed 216 | , passed: passed 217 | , failed: failed 218 | }; 219 | }); 220 | 221 | testUtils.runStreamingParserTests( 222 | parserTests 223 | , htmlparser.Parser 224 | , function (test) { 225 | var newTest = {}; 226 | for (var key in test) { 227 | if (!test.hasOwnProperty(key)) { 228 | continue; 229 | } 230 | newTest[key] = (key === 'data') ? 
231 | test.data.join('').split('') 232 | : 233 | test[key] 234 | ; 235 | } 236 | return newTest; 237 | } 238 | , function (testName, testResult, actual, expected) { 239 | console.log("[" + testName + "]: " + (testResult ? "passed" : "FAILED")); 240 | }, function (elapsed, passed, failed) { 241 | testResults['StreamingParser (streamed)'] = { 242 | elapsed: elapsed 243 | , passed: passed 244 | , failed: failed 245 | }; 246 | }); 247 | 248 | console.log(''); 249 | console.log('Test Results'); 250 | console.log('------------------'); 251 | var passedTotal = 0; 252 | var failedTotal = 0; 253 | var elapsedTotal = 0; 254 | for (var testName in testResults) { 255 | if (!testResults.hasOwnProperty(testName)) { 256 | continue; 257 | } 258 | var test = testResults[testName]; 259 | passedTotal += test.passed; 260 | failedTotal += test.failed; 261 | elapsedTotal += test.elapsed; 262 | console.log(testName + ': ' + test.passed + '/' + (test.passed + test.failed) + ' (' + Math.round(test.passed / (test.passed + test.failed) * 100) + '%) [' + test.elapsed + 'ms]'); 263 | } 264 | console.log('------------------'); 265 | console.log('Total: ' + passedTotal + '/' + (passedTotal + failedTotal) + ' (' + Math.round(passedTotal / (passedTotal + failedTotal) * 100) + '%) [' + elapsedTotal + 'ms]'); 266 | -------------------------------------------------------------------------------- /snippet.js: -------------------------------------------------------------------------------- 1 | //node --prof --prof_auto profile.js 2 | //deps/v8/tools/mac-tick-processor v8.log 3 | var sys = require("sys"); 4 | var htmlparser = require("./htmlparser"); 5 | 6 | var html = "text"; 7 | 8 | var handler = new htmlparser.DefaultHandler(function(err, dom) { 9 | if (err) 10 | sys.debug("Error: " + err); 11 | else 12 | sys.debug(sys.inspect(dom, false, null)); 13 | }, { enforceEmptyTags: true }); 14 | var parser = new htmlparser.Parser(handler); 15 | parser.parseComplete(html); 16 | 
-------------------------------------------------------------------------------- /tests/html.js: -------------------------------------------------------------------------------- 1 | /*********************************************** 2 | Copyright 2010 - 2012, Chris Winberry . All rights reserved. 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to 5 | deal in the Software without restriction, including without limitation the 6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | sell copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | IN THE SOFTWARE. 
20 | ***********************************************/ 21 | 22 | (function () { 23 | 24 | var exports; 25 | if (typeof(module) !== 'undefined' && typeof(module.exports) !== 'undefined') { 26 | exports = module.exports; 27 | } else { 28 | exports = {}; 29 | if (!this.Tautologistics) { 30 | this.Tautologistics = {}; 31 | } 32 | if (!this.Tautologistics.NodeHtmlParser) { 33 | this.Tautologistics.NodeHtmlParser = {}; 34 | } 35 | if (!this.Tautologistics.NodeHtmlParser.Tests) { 36 | this.Tautologistics.NodeHtmlParser.Tests = []; 37 | } 38 | this.Tautologistics.NodeHtmlParser.Tests.Html = exports; 39 | } 40 | 41 | exports['Basic test'] = { 42 | options: { 43 | builder: {} 44 | , parser: {} 45 | }, 46 | data: ["The TitleHello world"], 47 | expected: [ 48 | { raw: 'html' 49 | , type: 'tag' 50 | , name: 'html' 51 | , children: 52 | [ { raw: 'title' 53 | , type: 'tag' 54 | , name: 'title' 55 | , children: [ { data: 'The Title', type: 'text' } ] 56 | } 57 | , { raw: 'body' 58 | , type: 'tag' 59 | , name: 'body' 60 | , children: [ { data: 'Hello world', type: 'text' } ] 61 | } 62 | ] 63 | } 64 | ] 65 | }; 66 | 67 | exports["Single Tag 1"] = { 68 | options: { 69 | builder: {} 70 | , parser: {} 71 | }, 72 | data: ["
text
"], 73 | expected: [ 74 | { raw: 'br', type: 'tag', name: 'br' } 75 | , { data: 'text', type: 'text' } 76 | ] 77 | }; 78 | 79 | exports["Single Tag 2"] = { 80 | options: { 81 | builder: {} 82 | , parser: {} 83 | }, 84 | data: ["
text
"], 85 | expected: [ 86 | { raw: 'br', type: 'tag', name: 'br' } 87 | , { data: 'text', type: 'text' } 88 | , { raw: 'br', type: 'tag', name: 'br' } 89 | ] 90 | }; 91 | 92 | exports["Unescaped chars in script"] = { 93 | options: { 94 | builder: {} 95 | , parser: {} 96 | }, 97 | data: [""], 98 | expected: [ 99 | { raw: 'head' 100 | , type: 'tag' 101 | , name: 'head' 102 | , children: 103 | [ { raw: 'script language="Javascript"' 104 | , type: 'tag' 105 | , name: 'script' 106 | , attributes: { language: 'Javascript' } 107 | , children: 108 | [ { data: 'var foo = ""; alert(2 > foo); var baz = 10 << 2; var zip = 10 >> 1; var yap = \"<<>>>><<\";' 109 | , type: 'text' 110 | } 111 | ] 112 | } 113 | ] 114 | } 115 | ] 116 | }; 117 | 118 | exports["Special char in comment"] = { 119 | options: { 120 | builder: {} 121 | , parser: {} 122 | }, 123 | data: [""], 124 | expected: [ 125 | { raw: 'head' 126 | , type: 'tag' 127 | , name: 'head' 128 | , children: 129 | [ { data: ' commented out tags Test' 130 | , type: 'comment' 131 | } 132 | ] 133 | } 134 | ] 135 | }; 136 | 137 | exports["Script source in comment"] = { 138 | options: { 139 | builder: {} 140 | , parser: {} 141 | }, 142 | data: [""], 143 | expected: [ 144 | { raw: 'script' 145 | , type: 'tag' 146 | , name: 'script' 147 | , children: 148 | [ { data: '' 149 | , type: 'text' 150 | } 151 | ] 152 | } 153 | ] 154 | }; 155 | 156 | exports["Unescaped chars in style"] = { 157 | options: { 158 | builder: {} 159 | , parser: {} 160 | }, 161 | data: [""], 162 | expected: [ 163 | { raw: 'style type="text/css"' 164 | , type: 'tag' 165 | , name: 'style' 166 | , attributes: { type: 'text/css' } 167 | , children: 168 | [ { data: "\n body > p\n { font-weight: bold; }" 169 | , type: 'text' 170 | } 171 | ] 172 | } 173 | ] 174 | }; 175 | 176 | exports["Extra spaces in tag"] = { 177 | options: { 178 | builder: {} 179 | , parser: {} 180 | }, 181 | data: ["<\n font \n size='14' \n>the text<\n / \nfont \n>"], 182 | expected: [ 183 | { raw: "\n 
font \n size='14' \n" 184 | , type: 'tag' 185 | , name: 'font' 186 | , attributes: { size: '14' } 187 | , children: 188 | [ { data: 'the text' 189 | , type: 'text' 190 | } 191 | ] 192 | } 193 | ] 194 | }; 195 | 196 | exports["Unquoted attributes"] = { 197 | options: { 198 | builder: {} 199 | , parser: {} 200 | }, 201 | data: ["the text"], 202 | expected: [ 203 | { raw: 'font size= 14' 204 | , type: 'tag' 205 | , name: 'font' 206 | , attributes: { size: '14' } 207 | , children: 208 | [ { data: 'the text' 209 | , type: 'text' 210 | } 211 | ] 212 | } 213 | ] 214 | }; 215 | 216 | exports["Singular attribute"] = { 217 | options: { 218 | builder: {} 219 | , parser: {} 220 | }, 221 | data: ["