├── .babelrc ├── .gitignore ├── LICENSE ├── README.md ├── ava.js ├── dist ├── htmlParser.es6.js ├── htmlParser.js └── htmlParser.min.js ├── package.json ├── rollup.config.js ├── src ├── index.js ├── parser │ ├── index.js │ └── nodes.js └── tokenizer │ ├── index.js │ ├── makers.js │ ├── regexp.js │ └── types.js └── test ├── index.js ├── parser.js └── tokenizer.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "env": { 3 | "build": { 4 | "presets": [ 5 | ["latest", { 6 | "es2015": { 7 | "modules": false 8 | } 9 | }] 10 | ], 11 | "plugins": ["external-helpers"] 12 | } 13 | } 14 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules/* 2 | /.DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 henryluki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # html-parser 2 | A simple HTML-to-JSON parser that uses RegExp and String.indexOf 3 | 4 | ## Install 5 | 6 | ```shell 7 | npm install htmlstr-parser 8 | 9 | ``` 10 | ## Basic usage 11 | 12 | ```javascript 13 | 14 | var html = "
<div style='height:10rpx;width: 20rpx;'>1<p>2<br/><a href='http://www.baidu.com'>3</a></p><p>2</p></div>
" 15 | htmlParser(html) 16 | 17 | ``` 18 | ### Output 19 | ```javascript 20 | 21 | { 22 | "tag": "root", 23 | "children": [{ 24 | "type": "Element", 25 | "tagName": "div", 26 | "attributes": { 27 | "style": "height:10rpx;width: 20rpx;" 28 | }, 29 | "children": [{ 30 | "type": "Text", 31 | "content": "1" 32 | }, { 33 | "type": "Element", 34 | "tagName": "p", 35 | "attributes": {}, 36 | "children": [{ 37 | "type": "Text", 38 | "content": "2" 39 | }, { 40 | "type": "Element", 41 | "tagName": "br" 42 | }, { 43 | "type": "Element", 44 | "tagName": "a", 45 | "attributes": { 46 | "href": "http://www.baidu.com" 47 | }, 48 | "children": [{ 49 | "type": "Text", 50 | "content": "3" 51 | }] 52 | }] 53 | }, { 54 | "type": "Element", 55 | "tagName": "p", 56 | "attributes": {}, 57 | "children": [{ 58 | "type": "Text", 59 | "content": "2" 60 | }] 61 | }] 62 | }] 63 | } 64 | ``` 65 | -------------------------------------------------------------------------------- /ava.js: -------------------------------------------------------------------------------- 1 | require('babel-register')({ 2 | "presets": [ 3 | ["latest", { 4 | "modules": false 5 | }] 6 | ] 7 | }); -------------------------------------------------------------------------------- /dist/htmlParser.es6.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const STARTTAG_REX = /^<([-A-Za-z0-9_]+)((?:\s+[a-zA-Z_:][-a-zA-Z0-9_:.]*(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/; 4 | const ENDTAG_REX = /^<\/([-A-Za-z0-9_]+)[^>]*>/; 5 | const ATTR_REX = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g; 6 | 7 | function makeMap(str) { 8 | return str.split(",").reduce((map, cur) => { 9 | map[cur] = true; 10 | return map 11 | }, {}) 12 | } 13 | const EMPTY_MAKER = makeMap("area,base,basefont,br,col,frame,hr,img,input,link,meta,param,embed,command,keygen,source,track,wbr"); 14 | const FILLATTRS_MAKER = 
makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected"); 15 | 16 | function isEmptyMaker(tag){ 17 | return !!EMPTY_MAKER[tag] 18 | } 19 | 20 | function isFillattrsMaker(attr){ 21 | return !!FILLATTRS_MAKER[attr] 22 | } 23 | 24 | class TagStart { 25 | constructor(name, tag){ 26 | this.name = name; 27 | this.attributes = this.getAttributes(tag); 28 | } 29 | getAttributes(str) { 30 | let attrsMap = {}; 31 | str.replace(ATTR_REX, function(match, name){ 32 | const args = Array.prototype.slice.call(arguments); 33 | const value = args[2] ? args[2] : 34 | args[3] ? args[3] : 35 | args[4] ? args[4] : 36 | isFillattrsMaker(name) ? name : ""; 37 | 38 | attrsMap[name] = value.replace(/(^|[^\\])"/g, '$1\\\"'); 39 | }); 40 | return attrsMap 41 | } 42 | } 43 | 44 | class TagEmpty extends TagStart { 45 | constructor(name, tag){ 46 | super(name, tag); 47 | } 48 | } 49 | 50 | class TagEnd { 51 | constructor(name) { 52 | this.name = name; 53 | } 54 | } 55 | 56 | class Text { 57 | constructor(text) { 58 | this.text = text; 59 | } 60 | } 61 | 62 | const ElEMENT_TYPE = "Element"; 63 | const TEXT_TYPE = "Text"; 64 | 65 | function createElement(token){ 66 | const tagName = token.name; 67 | const attributes = token.attributes; 68 | if (token instanceof TagEmpty) { 69 | return { 70 | type: ElEMENT_TYPE, 71 | tagName, 72 | attributes 73 | } 74 | } 75 | return { 76 | type: ElEMENT_TYPE, 77 | tagName, 78 | attributes, 79 | children: [] 80 | } 81 | } 82 | 83 | function createText(token){ 84 | const content = token.text; 85 | return { 86 | type: TEXT_TYPE, 87 | content 88 | } 89 | } 90 | 91 | function createNodeFactory(type, token){ 92 | switch(type){ 93 | case ElEMENT_TYPE: return createElement(token) 94 | case TEXT_TYPE: return createText(token) 95 | default: break 96 | } 97 | } 98 | 99 | function parse(tokens) { 100 | let root = { 101 | tag: "root", 102 | children: [] 103 | }; 104 | let tagArray = [root]; 105 | tagArray.last = () => 
tagArray[tagArray.length - 1]; 106 | 107 | for (let i = 0; i < tokens.length; i++) { 108 | const token = tokens[i]; 109 | if (token instanceof TagStart) { 110 | const node = createNodeFactory(ElEMENT_TYPE, token); 111 | if (node.children) { 112 | tagArray.push(node); 113 | } else { 114 | tagArray.last().children.push(node); 115 | } 116 | continue 117 | } 118 | if (token instanceof TagEnd) { 119 | let parent = tagArray[tagArray.length - 2]; 120 | let node = tagArray.pop(); 121 | parent.children.push(node); 122 | continue 123 | } 124 | if (token instanceof Text) { 125 | tagArray.last().children.push(createNodeFactory(TEXT_TYPE, token)); 126 | continue 127 | } 128 | } 129 | 130 | return root 131 | } 132 | 133 | function tokenize(html) { 134 | let string = html; 135 | let tokens = []; 136 | const maxTime = Date.now() + 1000; 137 | 138 | while (string) { 139 | if (string.indexOf("") + 3; 141 | string = string.substring(lastIndex); 142 | continue 143 | } 144 | if (string.indexOf("= maxTime) break 173 | } 174 | return tokens 175 | } 176 | 177 | function htmlParser(html) { 178 | return parse(tokenize(html)) 179 | } 180 | 181 | module.exports = htmlParser; 182 | -------------------------------------------------------------------------------- /dist/htmlParser.js: -------------------------------------------------------------------------------- 1 | (function (global, factory) { 2 | typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() : 3 | typeof define === 'function' && define.amd ? 
define(factory) : 4 | (global.htmlParser = factory()); 5 | }(this, (function () { 'use strict'; 6 | 7 | var STARTTAG_REX = /^<([-A-Za-z0-9_]+)((?:\s+[a-zA-Z_:][-a-zA-Z0-9_:.]*(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/; 8 | var ENDTAG_REX = /^<\/([-A-Za-z0-9_]+)[^>]*>/; 9 | var ATTR_REX = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g; 10 | 11 | function makeMap(str) { 12 | return str.split(",").reduce(function (map, cur) { 13 | map[cur] = true; 14 | return map; 15 | }, {}); 16 | } 17 | var EMPTY_MAKER = makeMap("area,base,basefont,br,col,frame,hr,img,input,link,meta,param,embed,command,keygen,source,track,wbr"); 18 | var FILLATTRS_MAKER = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected"); 19 | 20 | function isEmptyMaker(tag) { 21 | return !!EMPTY_MAKER[tag]; 22 | } 23 | 24 | function isFillattrsMaker(attr) { 25 | return !!FILLATTRS_MAKER[attr]; 26 | } 27 | 28 | var classCallCheck = function (instance, Constructor) { 29 | if (!(instance instanceof Constructor)) { 30 | throw new TypeError("Cannot call a class as a function"); 31 | } 32 | }; 33 | 34 | var createClass = function () { 35 | function defineProperties(target, props) { 36 | for (var i = 0; i < props.length; i++) { 37 | var descriptor = props[i]; 38 | descriptor.enumerable = descriptor.enumerable || false; 39 | descriptor.configurable = true; 40 | if ("value" in descriptor) descriptor.writable = true; 41 | Object.defineProperty(target, descriptor.key, descriptor); 42 | } 43 | } 44 | 45 | return function (Constructor, protoProps, staticProps) { 46 | if (protoProps) defineProperties(Constructor.prototype, protoProps); 47 | if (staticProps) defineProperties(Constructor, staticProps); 48 | return Constructor; 49 | }; 50 | }(); 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | var inherits = function (subClass, superClass) { 61 | if (typeof superClass !== "function" && superClass 
!== null) { 62 | throw new TypeError("Super expression must either be null or a function, not " + typeof superClass); 63 | } 64 | 65 | subClass.prototype = Object.create(superClass && superClass.prototype, { 66 | constructor: { 67 | value: subClass, 68 | enumerable: false, 69 | writable: true, 70 | configurable: true 71 | } 72 | }); 73 | if (superClass) Object.setPrototypeOf ? Object.setPrototypeOf(subClass, superClass) : subClass.__proto__ = superClass; 74 | }; 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | var possibleConstructorReturn = function (self, call) { 87 | if (!self) { 88 | throw new ReferenceError("this hasn't been initialised - super() hasn't been called"); 89 | } 90 | 91 | return call && (typeof call === "object" || typeof call === "function") ? call : self; 92 | }; 93 | 94 | var TagStart = function () { 95 | function TagStart(name, tag) { 96 | classCallCheck(this, TagStart); 97 | 98 | this.name = name; 99 | this.attributes = this.getAttributes(tag); 100 | } 101 | 102 | createClass(TagStart, [{ 103 | key: 'getAttributes', 104 | value: function getAttributes(str) { 105 | var attrsMap = {}; 106 | str.replace(ATTR_REX, function (match, name) { 107 | var args = Array.prototype.slice.call(arguments); 108 | var value = args[2] ? args[2] : args[3] ? args[3] : args[4] ? args[4] : isFillattrsMaker(name) ? 
name : ""; 109 | 110 | attrsMap[name] = value.replace(/(^|[^\\])"/g, '$1\\\"'); 111 | }); 112 | return attrsMap; 113 | } 114 | }]); 115 | return TagStart; 116 | }(); 117 | 118 | var TagEmpty = function (_TagStart) { 119 | inherits(TagEmpty, _TagStart); 120 | 121 | function TagEmpty(name, tag) { 122 | classCallCheck(this, TagEmpty); 123 | return possibleConstructorReturn(this, (TagEmpty.__proto__ || Object.getPrototypeOf(TagEmpty)).call(this, name, tag)); 124 | } 125 | 126 | return TagEmpty; 127 | }(TagStart); 128 | 129 | var TagEnd = function TagEnd(name) { 130 | classCallCheck(this, TagEnd); 131 | 132 | this.name = name; 133 | }; 134 | 135 | var Text = function Text(text) { 136 | classCallCheck(this, Text); 137 | 138 | this.text = text; 139 | }; 140 | 141 | var ElEMENT_TYPE = "Element"; 142 | var TEXT_TYPE = "Text"; 143 | 144 | function createElement(token) { 145 | var tagName = token.name; 146 | var attributes = token.attributes; 147 | if (token instanceof TagEmpty) { 148 | return { 149 | type: ElEMENT_TYPE, 150 | tagName: tagName, 151 | attributes: attributes 152 | }; 153 | } 154 | return { 155 | type: ElEMENT_TYPE, 156 | tagName: tagName, 157 | attributes: attributes, 158 | children: [] 159 | }; 160 | } 161 | 162 | function createText(token) { 163 | var content = token.text; 164 | return { 165 | type: TEXT_TYPE, 166 | content: content 167 | }; 168 | } 169 | 170 | function createNodeFactory(type, token) { 171 | switch (type) { 172 | case ElEMENT_TYPE: 173 | return createElement(token); 174 | case TEXT_TYPE: 175 | return createText(token); 176 | default: 177 | break; 178 | } 179 | } 180 | 181 | function parse(tokens) { 182 | var root = { 183 | tag: "root", 184 | children: [] 185 | }; 186 | var tagArray = [root]; 187 | tagArray.last = function () { 188 | return tagArray[tagArray.length - 1]; 189 | }; 190 | 191 | for (var i = 0; i < tokens.length; i++) { 192 | var token = tokens[i]; 193 | if (token instanceof TagStart) { 194 | var node = 
createNodeFactory(ElEMENT_TYPE, token); 195 | if (node.children) { 196 | tagArray.push(node); 197 | } else { 198 | tagArray.last().children.push(node); 199 | } 200 | continue; 201 | } 202 | if (token instanceof TagEnd) { 203 | var parent = tagArray[tagArray.length - 2]; 204 | var _node = tagArray.pop(); 205 | parent.children.push(_node); 206 | continue; 207 | } 208 | if (token instanceof Text) { 209 | tagArray.last().children.push(createNodeFactory(TEXT_TYPE, token)); 210 | continue; 211 | } 212 | } 213 | 214 | return root; 215 | } 216 | 217 | function tokenize(html) { 218 | var string = html; 219 | var tokens = []; 220 | var maxTime = Date.now() + 1000; 221 | 222 | while (string) { 223 | if (string.indexOf("") + 3; 225 | string = string.substring(lastIndex); 226 | continue; 227 | } 228 | if (string.indexOf("= maxTime) break; 257 | } 258 | return tokens; 259 | } 260 | 261 | function htmlParser(html) { 262 | return parse(tokenize(html)); 263 | } 264 | 265 | return htmlParser; 266 | 267 | }))); 268 | -------------------------------------------------------------------------------- /dist/htmlParser.min.js: -------------------------------------------------------------------------------- 1 | !function(e,t){"object"==typeof exports&&"undefined"!=typeof module?module.exports=t():"function"==typeof define&&define.amd?define(t):e.htmlParser=t()}(this,function(){"use strict";function e(e){return e.split(",").reduce(function(e,t){return e[t]=!0,e},{})}function t(e){return!!l[e]}function n(e){return!!p[e]}function r(e){var t=e.name,n=e.attributes;return e instanceof g?{type:_,tagName:t,attributes:n}:{type:_,tagName:t,attributes:n,children:[]}}function i(e){var t=e.text;return{type:x,content:t}}function o(e,t){switch(e){case _:return r(t);case x:return i(t)}}function a(e){var t={tag:"root",children:[]},n=[t];n.last=function(){return n[n.length-1]};for(var r=0;r=i)break}else{var u=n.match(c);if(!u)continue;n=n.substring(u[0].length);var s=u[1],l=u[2],p=t(s)?new g(s,l):new 
y(s,l);r.push(p)}else{var h=n.match(f);if(!h)continue;n=n.substring(h[0].length);var b=h[1];if(t(b))continue;r.push(new v(b))}else{var d=n.indexOf("--\x3e")+3;n=n.substring(d)}return r}function s(e){return a(u(e))}var c=/^<([-A-Za-z0-9_]+)((?:\s+[a-zA-Z_:][-a-zA-Z0-9_:.]*(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,f=/^<\/([-A-Za-z0-9_]+)[^>]*>/,l=e("area,base,basefont,br,col,frame,hr,img,input,link,meta,param,embed,command,keygen,source,track,wbr"),p=e("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected"),h=function(e,t){if(!(e instanceof t))throw new TypeError("Cannot call a class as a function")},b=function(){function e(e,t){for(var n=0;n\s]+)))?/g,function(e,r){var i=Array.prototype.slice.call(arguments),o=i[2]?i[2]:i[3]?i[3]:i[4]?i[4]:n(r)?r:"";t[r]=o.replace(/(^|[^\\])"/g,'$1\\"')}),t}}]),e}(),g=function(e){function t(e,n){return h(this,t),m(this,(t.__proto__||Object.getPrototypeOf(t)).call(this,e,n))}return d(t,e),t}(y),v=function e(t){h(this,e),this.name=t},w=function e(t){h(this,e),this.text=t},_="Element",x="Text";return s}); 2 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "htmlstr-parser", 3 | "version": "2.0.1", 4 | "description": "Simple HTML to JSON parser use Regexp and String.indexOf", 5 | "main": "dist/htmlParser.js", 6 | "scripts": { 7 | "build": "BABEL_ENV=build rollup -c", 8 | "build:es6": "rollup src/index.js -f cjs -o dist/htmlParser.es6.js", 9 | "watch": "BABEL_ENV=build rollup -c -w", 10 | "pro": "BABEL_ENV=build NODE_ENV=production rollup -c", 11 | "update": "npm run build && npm run build:es6 && npm run pro", 12 | "test": "ava --watch" 13 | }, 14 | "ava": { 15 | "require": [ 16 | "./ava.js" 17 | ] 18 | }, 19 | "repository": { 20 | "type": "git", 21 | "url": "git+https://github.com/henryluki/html-parser.git" 22 | }, 23 | 
"keywords": [ 24 | "html-parser", 25 | "parser", 26 | "json", 27 | "html-to-json", 28 | "ast", 29 | "Regex" 30 | ], 31 | "author": "henryluki", 32 | "license": "MIT", 33 | "bugs": { 34 | "url": "https://github.com/henryluki/html-parser/issues" 35 | }, 36 | "homepage": "https://github.com/henryluki/html-parser#readme", 37 | "devDependencies": { 38 | "ava": "^0.16.0", 39 | "babel-plugin-external-helpers": "^6.22.0", 40 | "babel-preset-latest": "^6.24.1", 41 | "babel-register": "^6.24.1", 42 | "rollup-plugin-babel": "^2.7.1", 43 | "rollup-plugin-node-resolve": "^3.0.0", 44 | "rollup-plugin-uglify": "^1.0.1", 45 | "rollup-watch": "^3.2.2" 46 | } 47 | } -------------------------------------------------------------------------------- /rollup.config.js: -------------------------------------------------------------------------------- 1 | import resolve from 'rollup-plugin-node-resolve'; 2 | import babel from 'rollup-plugin-babel'; 3 | import uglify from 'rollup-plugin-uglify'; 4 | 5 | const isProduction = process.env.NODE_ENV === 'production' 6 | 7 | export default { 8 | entry: 'src/index.js', 9 | moduleName: "htmlParser", 10 | format: 'umd', 11 | plugins: [ 12 | resolve(), 13 | babel({ 14 | exclude: 'node_modules/**' 15 | }), 16 | ( isProduction && uglify()) 17 | ], 18 | dest: isProduction ? 
'dist/htmlParser.min.js' : 'dist/htmlParser.js' 19 | }; -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | import { parse } from './parser/index' 2 | import { tokenize } from './tokenizer/index' 3 | 4 | export default function htmlParser(html) { 5 | return parse(tokenize(html)) 6 | } -------------------------------------------------------------------------------- /src/parser/index.js: -------------------------------------------------------------------------------- 1 | import { TagStart, TagEnd, Text } from '../tokenizer/types' 2 | import { ElEMENT_TYPE, TEXT_TYPE, createNodeFactory } from './nodes' 3 | 4 | export function parse(tokens) { 5 | let root = { 6 | tag: "root", 7 | children: [] 8 | } 9 | let tagArray = [root] 10 | tagArray.last = () => tagArray[tagArray.length - 1] 11 | 12 | for (let i = 0; i < tokens.length; i++) { 13 | const token = tokens[i] 14 | if (token instanceof TagStart) { 15 | const node = createNodeFactory(ElEMENT_TYPE, token) 16 | if (node.children) { 17 | tagArray.push(node) 18 | } else { 19 | tagArray.last().children.push(node) 20 | } 21 | continue 22 | } 23 | if (token instanceof TagEnd) { 24 | let parent = tagArray[tagArray.length - 2] 25 | let node = tagArray.pop() 26 | parent.children.push(node) 27 | continue 28 | } 29 | if (token instanceof Text) { 30 | tagArray.last().children.push(createNodeFactory(TEXT_TYPE, token)) 31 | continue 32 | } 33 | } 34 | 35 | return root 36 | } -------------------------------------------------------------------------------- /src/parser/nodes.js: -------------------------------------------------------------------------------- 1 | import { TagEmpty } from '../tokenizer/types' 2 | 3 | export const ElEMENT_TYPE = "Element" 4 | export const TEXT_TYPE = "Text" 5 | 6 | function createElement(token){ 7 | const tagName = token.name 8 | const attributes = token.attributes 9 | if (token 
instanceof TagEmpty) { 10 | return { 11 | type: ElEMENT_TYPE, 12 | tagName, 13 | attributes 14 | } 15 | } 16 | return { 17 | type: ElEMENT_TYPE, 18 | tagName, 19 | attributes, 20 | children: [] 21 | } 22 | } 23 | 24 | function createText(token){ 25 | const content = token.text 26 | return { 27 | type: TEXT_TYPE, 28 | content 29 | } 30 | } 31 | 32 | export function createNodeFactory(type, token){ 33 | switch(type){ 34 | case ElEMENT_TYPE: return createElement(token) 35 | case TEXT_TYPE: return createText(token) 36 | default: break 37 | } 38 | } -------------------------------------------------------------------------------- /src/tokenizer/index.js: -------------------------------------------------------------------------------- 1 | import { STARTTAG_REX, ENDTAG_REX }from './regexp' 2 | import { isEmptyMaker } from './makers' 3 | import { TagStart, TagEmpty, TagEnd, Text} from './types' 4 | 5 | export function tokenize(html) { 6 | let string = html 7 | let tokens = [] 8 | const maxTime = Date.now() + 1000 9 | 10 | while (string) { 11 | if (string.indexOf("") + 3 13 | string = string.substring(lastIndex) 14 | continue 15 | } 16 | if (string.indexOf("= maxTime) break 45 | } 46 | return tokens 47 | } 48 | -------------------------------------------------------------------------------- /src/tokenizer/makers.js: -------------------------------------------------------------------------------- 1 | function makeMap(str) { 2 | return str.split(",").reduce((map, cur) => { 3 | map[cur] = true 4 | return map 5 | }, {}) 6 | } 7 | export const EMPTY_MAKER = makeMap("area,base,basefont,br,col,frame,hr,img,input,link,meta,param,embed,command,keygen,source,track,wbr") 8 | export const FILLATTRS_MAKER = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected") 9 | 10 | export function isEmptyMaker(tag){ 11 | return !!EMPTY_MAKER[tag] 12 | } 13 | 14 | export function isFillattrsMaker(attr){ 15 | return !!FILLATTRS_MAKER[attr] 16 | 
} -------------------------------------------------------------------------------- /src/tokenizer/regexp.js: -------------------------------------------------------------------------------- 1 | export const STARTTAG_REX = /^<([-A-Za-z0-9_]+)((?:\s+[a-zA-Z_:][-a-zA-Z0-9_:.]*(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/ 2 | export const ENDTAG_REX = /^<\/([-A-Za-z0-9_]+)[^>]*>/ 3 | export const ATTR_REX = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g -------------------------------------------------------------------------------- /src/tokenizer/types.js: -------------------------------------------------------------------------------- 1 | import { ATTR_REX } from './regexp' 2 | import { isFillattrsMaker } from './makers' 3 | 4 | export class TagStart { 5 | constructor(name, tag){ 6 | this.name = name 7 | this.attributes = this.getAttributes(tag) 8 | } 9 | getAttributes(str) { 10 | let attrsMap = {} 11 | str.replace(ATTR_REX, function(match, name){ 12 | const args = Array.prototype.slice.call(arguments) 13 | const value = args[2] ? args[2] : 14 | args[3] ? args[3] : 15 | args[4] ? args[4] : 16 | isFillattrsMaker(name) ? name : "" 17 | 18 | attrsMap[name] = value.replace(/(^|[^\\])"/g, '$1\\\"') 19 | }) 20 | return attrsMap 21 | } 22 | } 23 | 24 | export class TagEmpty extends TagStart { 25 | constructor(name, tag){ 26 | super(name, tag) 27 | } 28 | } 29 | 30 | export class TagEnd { 31 | constructor(name) { 32 | this.name = name 33 | } 34 | } 35 | 36 | export class Text { 37 | constructor(text) { 38 | this.text = text 39 | } 40 | } -------------------------------------------------------------------------------- /test/index.js: -------------------------------------------------------------------------------- 1 | import htmlParser from '../src/index' 2 | import test from 'ava' 3 | 4 | test('should pass the Hello World case', t=> { 5 | const html = "
hello world
" 6 | const tree = { 7 | tag: "root", 8 | children: [{ 9 | type: "Element", 10 | tagName: "div", 11 | attributes: {}, 12 | children: [{ 13 | type: "Text", 14 | content: "hello world" 15 | }] 16 | }] 17 | } 18 | t.deepEqual(htmlParser(html), tree) 19 | }) 20 | 21 | test('should pass the attributes case', t=> { 22 | const html = "
hello world
" 23 | const attributes = { 24 | style: "width:100px" 25 | } 26 | const div = htmlParser(html).children[0] 27 | t.deepEqual(div.attributes, attributes) 28 | }) 29 | 30 | test('should pass the empty tag case', t=> { 31 | const html = "

" 32 | const div = htmlParser(html).children[0] 33 | const br = { 34 | type: "Element", 35 | tagName: "br", 36 | attributes: {} 37 | } 38 | t.deepEqual(div.children[0], br) 39 | }) 40 | 41 | test('should pass the comment case', t=> { 42 | const html = "
" 43 | const div = htmlParser(html).children[0] 44 | t.deepEqual(div.children.length, 0) 45 | }) 46 | 47 | test('should pass the nested element case', t=> { 48 | const html = "
a

bc

" 49 | const tree = { 50 | tag: "root", 51 | children: [{ 52 | type: "Element", 53 | tagName: "div", 54 | attributes: {}, 55 | children: [{ 56 | type: "Text", 57 | content: "a" 58 | }, { 59 | type: "Element", 60 | tagName: "p", 61 | attributes: {}, 62 | children: [{ 63 | type: "Text", 64 | content: "b" 65 | }, { 66 | type: "Element", 67 | tagName: "span", 68 | attributes: {}, 69 | children: [{ 70 | type: "Text", 71 | content: "c" 72 | }] 73 | }] 74 | }] 75 | }] 76 | } 77 | t.deepEqual(htmlParser(html), tree) 78 | }) -------------------------------------------------------------------------------- /test/parser.js: -------------------------------------------------------------------------------- 1 | import { parse } from '../src/parser/index' 2 | import { ElEMENT_TYPE, TEXT_TYPE, createNodeFactory } from '../src/parser/nodes' 3 | import { tokenize } from '../src/tokenizer/index' 4 | import { TagStart, TagEmpty, Text } from '../src/tokenizer/types' 5 | import test from 'ava' 6 | 7 | test('parse() should return json tree', t => { 8 | const html = "
hello world
" 9 | const tokens = tokenize(html) 10 | const tree = { 11 | tag: "root", 12 | children: [{ 13 | type: "Element", 14 | tagName: "div", 15 | attributes: {}, 16 | children: [{ 17 | type: "Text", 18 | content: "hello world" 19 | }] 20 | }] 21 | } 22 | 23 | t.deepEqual(parse(tokens), tree) 24 | }) 25 | 26 | test("createNodeFactory() should return different nodes", t => { 27 | t.deepEqual(createNodeFactory(ElEMENT_TYPE, new TagStart("div", "style='width:100px'")), { 28 | type: ElEMENT_TYPE, 29 | tagName: "div", 30 | attributes: { 31 | style: "width:100px" 32 | }, 33 | children: [] 34 | }) 35 | 36 | t.deepEqual(createNodeFactory(ElEMENT_TYPE, new TagEmpty("div", "style='width:100px'")), { 37 | type: ElEMENT_TYPE, 38 | tagName: "div", 39 | attributes: { 40 | style: "width:100px" 41 | }, 42 | }) 43 | 44 | t.deepEqual(createNodeFactory(TEXT_TYPE, new Text("aaa")), { 45 | type: TEXT_TYPE, 46 | content: "aaa" 47 | }) 48 | }) -------------------------------------------------------------------------------- /test/tokenizer.js: -------------------------------------------------------------------------------- 1 | import { tokenize } from '../src/tokenizer/index' 2 | import { TagStart, TagEmpty, TagEnd, Text } from '../src/tokenizer/types' 3 | import test from 'ava' 4 | 5 | test('tokenize() should return tokens', t => { 6 | const html = "
a
" 7 | const tokens = tokenize(html) 8 | 9 | t.true(tokens.length == 3) 10 | t.true(tokens[0] instanceof TagStart) 11 | t.true(tokens[1] instanceof Text) 12 | t.true(tokens[2] instanceof TagEnd) 13 | 14 | }) 15 | 16 | test('a instanceof TagStart should have property: name, attributes', t=> { 17 | const tag = new TagStart("div", "href='' style='width:100px'") 18 | const obj = { 19 | name: "div", 20 | attributes: { 21 | href: '', 22 | style: 'width:100px' 23 | } 24 | } 25 | 26 | t.deepEqual(tag.name, obj.name) 27 | t.deepEqual(tag.attributes, obj.attributes) 28 | }) 29 | 30 | test('a instanceof TagEmpty should have property: name, attributes', t=> { 31 | const tag = new TagEmpty("div", "href='' style='width:100px'") 32 | const obj = { 33 | name: "div", 34 | attributes: { 35 | href: '', 36 | style: 'width:100px' 37 | } 38 | } 39 | t.deepEqual(tag.name, obj.name) 40 | t.deepEqual(tag.attributes, obj.attributes) 41 | }) 42 | 43 | test('a instanceof TagEnd should have property: name', t=> { 44 | t.deepEqual(new TagEnd("div").name, "div") 45 | 46 | }) 47 | 48 | test('a instanceof Text should have property: text', t=> { 49 | t.deepEqual(new Text("aaa").text, "aaa") 50 | }) --------------------------------------------------------------------------------