├── .babelrc
├── .gitignore
├── LICENSE
├── README.md
├── ava.js
├── dist
│   ├── htmlParser.es6.js
│   ├── htmlParser.js
│   └── htmlParser.min.js
├── package.json
├── rollup.config.js
├── src
│   ├── index.js
│   ├── parser
│   │   ├── index.js
│   │   └── nodes.js
│   └── tokenizer
│       ├── index.js
│       ├── makers.js
│       ├── regexp.js
│       └── types.js
└── test
    ├── index.js
    ├── parser.js
    └── tokenizer.js
/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 | "env": {
3 | "build": {
4 | "presets": [
5 | ["latest", {
6 | "es2015": {
7 | "modules": false
8 | }
9 | }]
10 | ],
11 | "plugins": ["external-helpers"]
12 | }
13 | }
14 | }
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /node_modules/*
2 | /.DS_Store
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 henryluki
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # html-parser
2 | A simple HTML-to-JSON parser built on regular expressions and `String.indexOf`.
3 |
4 | ## Install
5 |
6 | ```shell
7 | npm install htmlstr-parser
8 |
9 | ```
10 | ## Basic usage
11 |
12 | ```javascript
13 |
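14 | var html = "<div style='height:10rpx;width: 20rpx;'>1<p>2<br/><a href='http://www.baidu.com'>3</a></p><p>2</p></div>"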
15 | htmlParser(html)
16 |
17 | ```
18 | ### Output
19 | ```javascript
20 |
21 | {
22 | "tag": "root",
23 | "children": [{
24 | "type": "Element",
25 | "tagName": "div",
26 | "attributes": {
27 | "style": "height:10rpx;width: 20rpx;"
28 | },
29 | "children": [{
30 | "type": "Text",
31 | "content": "1"
32 | }, {
33 | "type": "Element",
34 | "tagName": "p",
35 | "attributes": {},
36 | "children": [{
37 | "type": "Text",
38 | "content": "2"
39 | }, {
40 | "type": "Element",
41 | "tagName": "br"
42 | }, {
43 | "type": "Element",
44 | "tagName": "a",
45 | "attributes": {
46 | "href": "http://www.baidu.com"
47 | },
48 | "children": [{
49 | "type": "Text",
50 | "content": "3"
51 | }]
52 | }]
53 | }, {
54 | "type": "Element",
55 | "tagName": "p",
56 | "attributes": {},
57 | "children": [{
58 | "type": "Text",
59 | "content": "2"
60 | }]
61 | }]
62 | }]
63 | }
64 | ```
65 |
--------------------------------------------------------------------------------
/ava.js:
--------------------------------------------------------------------------------
// Babel options handed to babel-register, which is loaded through the
// "ava.require" entry in package.json before the test files run.
const babelRegisterOptions = {
  presets: [
    ['latest', { modules: false }]
  ]
};

require('babel-register')(babelRegisterOptions);
--------------------------------------------------------------------------------
/dist/htmlParser.es6.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
// Opening tag anchored at the start of the input.
// Groups: 1 = tag name, 2 = raw attribute text, 3 = "/" when the tag is self-closed.
const STARTTAG_REX = /^<([-A-Za-z0-9_]+)((?:\s+[a-zA-Z_:][-a-zA-Z0-9_:.]*(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/;

// Closing tag anchored at the start of the input. Group 1 = tag name.
const ENDTAG_REX = /^<\/([-A-Za-z0-9_]+)[^>]*>/;

// One attribute (global, so it can walk a whole attribute string).
// Groups: 1 = name, 2 = double-quoted value, 3 = single-quoted value, 4 = unquoted value.
const ATTR_REX = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;
6 |
/**
 * Build a membership table from a comma-separated list of names.
 *
 * The table has no prototype, so names that exist on Object.prototype
 * ("constructor", "toString", ...) are not reported as members.
 *
 * @param {string} str - comma-separated names, e.g. "br,hr,img"
 * @returns {Object<string, boolean>} table mapping every listed name to true
 */
function makeMap(str) {
  const map = Object.create(null);
  const names = str.split(",");
  for (let i = 0; i < names.length; i++) {
    map[names[i]] = true;
  }
  return map;
}
// Tag names the tokenizer emits as TagEmpty (their end tags are skipped).
const EMPTY_TAG_NAMES = "area,base,basefont,br,col,frame,hr,img,input,link,meta,param,embed,command,keygen,source,track,wbr";
// Attribute names that take their own name as value when written without one.
const FILL_ATTR_NAMES = "checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected";

const EMPTY_MAKER = makeMap(EMPTY_TAG_NAMES);
const FILLATTRS_MAKER = makeMap(FILL_ATTR_NAMES);
15 |
/**
 * Tell whether a tag name is listed in EMPTY_MAKER.
 *
 * Only own entries count; a plain lookup would also report names inherited
 * from Object.prototype (e.g. "constructor") as empty tags.
 *
 * @param {string} tag - tag name to look up
 * @returns {boolean} true when the tag is listed in EMPTY_MAKER
 */
function isEmptyMaker(tag){
  return Object.prototype.hasOwnProperty.call(EMPTY_MAKER, tag) && !!EMPTY_MAKER[tag];
}
19 |
/**
 * Tell whether an attribute name is listed in FILLATTRS_MAKER.
 *
 * Only own entries count; a plain lookup would also report names inherited
 * from Object.prototype (e.g. "constructor") as listed attributes.
 *
 * @param {string} attr - attribute name to look up
 * @returns {boolean} true when the attribute is listed in FILLATTRS_MAKER
 */
function isFillattrsMaker(attr){
  return Object.prototype.hasOwnProperty.call(FILLATTRS_MAKER, attr) && !!FILLATTRS_MAKER[attr];
}
23 |
/**
 * Token for an opening tag: the tag name plus its parsed attributes.
 */
class TagStart {
  /**
   * @param {string} name - tag name
   * @param {string} tag - raw attribute text of the tag
   */
  constructor(name, tag){
    this.name = name;
    this.attributes = this.getAttributes(tag);
  }

  /**
   * Parse raw attribute text into a name -> value map.
   *
   * A value is taken from the double-quoted, single-quoted or unquoted form,
   * in that order. When none is present (or it is empty) the attribute gets
   * its own name if isFillattrsMaker accepts it, otherwise "". Double quotes
   * not preceded by a backslash are backslash-escaped in the stored value.
   *
   * @param {string} str - raw attribute text
   * @returns {Object<string, string>} attribute map
   */
  getAttributes(str) {
    const collected = {};
    // replace() is used purely to visit every ATTR_REX match; its result is discarded.
    str.replace(ATTR_REX, (whole, name, doubleQuoted, singleQuoted, unquoted) => {
      const raw = doubleQuoted || singleQuoted || unquoted || (isFillattrsMaker(name) ? name : "");
      collected[name] = raw.replace(/(^|[^\\])"/g, '$1\\\"');
    });
    return collected;
  }
}
43 |
/**
 * Token for a tag that has no children. It is constructed exactly like
 * TagStart (name, raw attribute text) and only differs by its class, which
 * createElement checks with instanceof.
 */
class TagEmpty extends TagStart {}
49 |
/**
 * Token for a closing tag.
 */
class TagEnd {
  /**
   * @param {string} name - name of the tag being closed
   */
  constructor(name) {
    this.name = name;
  }
}
55 |
/**
 * Token for a run of text between tags.
 */
class Text {
  /**
   * @param {string} text - the text content
   */
  constructor(text) {
    this.text = text;
  }
}
61 |
// `type` value carried by element nodes. The lowercase "l" in the identifier
// is how the rest of the file spells it, so it is kept.
const ElEMENT_TYPE = 'Element';
// `type` value carried by text nodes.
const TEXT_TYPE = 'Text';
64 |
/**
 * Turn a tag token into an element node.
 *
 * @param {TagStart} token - token exposing `name` and `attributes`
 * @returns {Object} element node; it has a `children` array unless the
 *   token is a TagEmpty
 */
function createElement(token){
  const { name, attributes } = token;
  const element = { type: ElEMENT_TYPE, tagName: name, attributes };
  if (!(token instanceof TagEmpty)) {
    element.children = [];
  }
  return element;
}
82 |
/**
 * Turn a text token into a text node.
 *
 * @param {Text} token - token exposing `text`
 * @returns {{type: string, content: string}} text node
 */
function createText(token){
  return { type: TEXT_TYPE, content: token.text };
}
90 |
/**
 * Create a tree node of the requested kind from a token.
 *
 * @param {string} type - ElEMENT_TYPE or TEXT_TYPE
 * @param {Object} token - token the node is built from
 * @returns {Object|undefined} the node, or undefined for any other type
 */
function createNodeFactory(type, token){
  if (type === ElEMENT_TYPE) {
    return createElement(token);
  }
  if (type === TEXT_TYPE) {
    return createText(token);
  }
  return undefined;
}
98 |
/**
 * Build the node tree from a flat token list.
 *
 * TagStart tokens open an element (TagEmpty ones are attached immediately
 * because their node has no `children`), Text tokens are appended to the
 * innermost open element, and TagEnd tokens close the nearest open element
 * with the same name (compared case-insensitively) together with anything
 * still open inside it.
 *
 * Malformed input is tolerated: an end tag that matches no open element is
 * ignored instead of throwing, and elements still open at the end of the
 * token list are attached to their parents instead of being dropped.
 *
 * @param {Array} tokens - tokens produced by tokenize()
 * @returns {{tag: string, children: Array}} the root node
 */
function parse(tokens) {
  const root = {
    tag: "root",
    children: []
  };
  // Stack of currently open nodes; index 0 is always the root.
  const open = [root];
  const current = () => open[open.length - 1];
  // Pop the innermost open element and attach it to its parent.
  const closeCurrent = () => {
    const node = open.pop();
    current().children.push(node);
  };
  const sameTag = (a, b) => String(a).toLowerCase() === String(b).toLowerCase();

  for (let i = 0; i < tokens.length; i++) {
    const token = tokens[i];
    if (token instanceof TagStart) {
      const node = createNodeFactory(ElEMENT_TYPE, token);
      if (node.children) {
        open.push(node);
      } else {
        current().children.push(node);
      }
    } else if (token instanceof TagEnd) {
      let depth = open.length - 1;
      while (depth > 0 && !sameTag(open[depth].tagName, token.name)) {
        depth--;
      }
      // depth 0 is the root: nothing open matches, so the end tag is stray.
      if (depth > 0) {
        while (open.length > depth) {
          closeCurrent();
        }
      }
    } else if (token instanceof Text) {
      current().children.push(createNodeFactory(TEXT_TYPE, token));
    }
  }

  // Attach whatever was never closed so its content is not lost.
  while (open.length > 1) {
    closeCurrent();
  }

  return root;
}
132 |
/**
 * Split an HTML string into TagStart / TagEmpty / TagEnd / Text tokens.
 *
 * - Comments (`<!-- ... -->`) are skipped; an unterminated comment swallows
 *   the rest of the input.
 * - End tags of names accepted by isEmptyMaker produce no token.
 * - A `<` that does not begin a recognisable tag is emitted as text, so
 *   every iteration consumes input and the loop always terminates.
 * - Tokenizing stops early, returning what has been collected, when a text
 *   token is emitted more than 1000 ms after the call started.
 *
 * @param {string} html - markup to tokenize
 * @returns {Array} tokens in document order
 */
function tokenize(html) {
  let string = html;
  const tokens = [];
  const maxTime = Date.now() + 1000;

  while (string) {
    if (string.indexOf("<!--") === 0) {
      const commentEnd = string.indexOf("-->");
      string = commentEnd < 0 ? "" : string.substring(commentEnd + 3);
      continue;
    }

    if (string.indexOf("</") === 0) {
      const match = string.match(ENDTAG_REX);
      if (match) {
        string = string.substring(match[0].length);
        const name = match[1];
        if (!isEmptyMaker(name)) {
          tokens.push(new TagEnd(name));
        }
        continue;
      }
    } else if (string.indexOf("<") === 0) {
      const match = string.match(STARTTAG_REX);
      if (match) {
        string = string.substring(match[0].length);
        const name = match[1];
        const attrs = match[2];
        tokens.push(isEmptyMaker(name) ? new TagEmpty(name, attrs) : new TagStart(name, attrs));
        continue;
      }
    }

    // Text run. When the input starts with a "<" that was not a tag, search
    // from index 1 so that "<" becomes part of the text instead of stalling.
    const index = string.indexOf('<', string.charAt(0) === '<' ? 1 : 0);
    const text = index < 0 ? string : string.substring(0, index);

    string = index < 0 ? "" : string.substring(index);
    tokens.push(new Text(text));

    if (Date.now() >= maxTime) break;
  }
  return tokens;
}
176 |
/**
 * Parse an HTML string into a node tree.
 *
 * @param {string} html - markup to parse
 * @returns {Object} whatever parse() returns for the tokens of `html`
 */
function htmlParser(html) {
  const tokens = tokenize(html);
  return parse(tokens);
}
180 |
181 | module.exports = htmlParser;
182 |
--------------------------------------------------------------------------------
/dist/htmlParser.js:
--------------------------------------------------------------------------------
/**
 * UMD build of htmlParser (ES5).
 *
 * Exposes a single function, htmlParser(html), through CommonJS, AMD or a
 * global named `htmlParser`. The global fallback uses globalThis when it
 * exists, so the bundle also loads where the top-level `this` is undefined.
 */
(function (global, factory) {
  typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() :
  typeof define === 'function' && define.amd ? define(factory) :
  (global = typeof globalThis !== 'undefined' ? globalThis : global || self, global.htmlParser = factory());
}(this, (function () { 'use strict';

  // Opening tag at the start of the input: 1 = name, 2 = raw attribute text, 3 = "/" when self-closed.
  var STARTTAG_REX = /^<([-A-Za-z0-9_]+)((?:\s+[a-zA-Z_:][-a-zA-Z0-9_:.]*(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/;
  // Closing tag at the start of the input: 1 = name.
  var ENDTAG_REX = /^<\/([-A-Za-z0-9_]+)[^>]*>/;
  // One attribute: 1 = name, 2 = double-quoted, 3 = single-quoted, 4 = unquoted value.
  var ATTR_REX = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;

  // Build a prototype-less membership table so that names inherited from
  // Object.prototype ("constructor", ...) are never reported as members.
  function makeMap(str) {
    var map = Object.create(null);
    var names = str.split(",");
    for (var i = 0; i < names.length; i++) {
      map[names[i]] = true;
    }
    return map;
  }
  var EMPTY_MAKER = makeMap("area,base,basefont,br,col,frame,hr,img,input,link,meta,param,embed,command,keygen,source,track,wbr");
  var FILLATTRS_MAKER = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");

  function isEmptyMaker(tag) {
    return !!EMPTY_MAKER[tag];
  }

  function isFillattrsMaker(attr) {
    return !!FILLATTRS_MAKER[attr];
  }

  // --- Babel class helpers -------------------------------------------------

  var classCallCheck = function (instance, Constructor) {
    if (!(instance instanceof Constructor)) {
      throw new TypeError("Cannot call a class as a function");
    }
  };

  var createClass = function () {
    function defineProperties(target, props) {
      for (var i = 0; i < props.length; i++) {
        var descriptor = props[i];
        descriptor.enumerable = descriptor.enumerable || false;
        descriptor.configurable = true;
        if ("value" in descriptor) descriptor.writable = true;
        Object.defineProperty(target, descriptor.key, descriptor);
      }
    }

    return function (Constructor, protoProps, staticProps) {
      if (protoProps) defineProperties(Constructor.prototype, protoProps);
      if (staticProps) defineProperties(Constructor, staticProps);
      return Constructor;
    };
  }();

  var inherits = function (subClass, superClass) {
    if (typeof superClass !== "function" && superClass !== null) {
      throw new TypeError("Super expression must either be null or a function, not " + typeof superClass);
    }

    subClass.prototype = Object.create(superClass && superClass.prototype, {
      constructor: {
        value: subClass,
        enumerable: false,
        writable: true,
        configurable: true
      }
    });
    if (superClass) Object.setPrototypeOf ? Object.setPrototypeOf(subClass, superClass) : subClass.__proto__ = superClass;
  };

  var possibleConstructorReturn = function (self, call) {
    if (!self) {
      throw new ReferenceError("this hasn't been initialised - super() hasn't been called");
    }

    return call && (typeof call === "object" || typeof call === "function") ? call : self;
  };

  // --- Tokens --------------------------------------------------------------

  // Opening tag: name plus attributes parsed from the raw attribute text.
  var TagStart = function () {
    function TagStart(name, tag) {
      classCallCheck(this, TagStart);

      this.name = name;
      this.attributes = this.getAttributes(tag);
    }

    createClass(TagStart, [{
      key: 'getAttributes',
      value: function getAttributes(str) {
        var attrsMap = {};
        // replace() is used only to visit every attribute match.
        str.replace(ATTR_REX, function (match, name, doubleQuoted, singleQuoted, unquoted) {
          var value = doubleQuoted || singleQuoted || unquoted || (isFillattrsMaker(name) ? name : "");

          attrsMap[name] = value.replace(/(^|[^\\])"/g, '$1\\\"');
        });
        return attrsMap;
      }
    }]);
    return TagStart;
  }();

  // Tag without children; identical to TagStart apart from its class.
  var TagEmpty = function (_TagStart) {
    inherits(TagEmpty, _TagStart);

    function TagEmpty(name, tag) {
      classCallCheck(this, TagEmpty);
      return possibleConstructorReturn(this, (TagEmpty.__proto__ || Object.getPrototypeOf(TagEmpty)).call(this, name, tag));
    }

    return TagEmpty;
  }(TagStart);

  var TagEnd = function TagEnd(name) {
    classCallCheck(this, TagEnd);

    this.name = name;
  };

  var Text = function Text(text) {
    classCallCheck(this, Text);

    this.text = text;
  };

  // --- Nodes ---------------------------------------------------------------

  var ElEMENT_TYPE = "Element";
  var TEXT_TYPE = "Text";

  function createElement(token) {
    var tagName = token.name;
    var attributes = token.attributes;
    if (token instanceof TagEmpty) {
      return {
        type: ElEMENT_TYPE,
        tagName: tagName,
        attributes: attributes
      };
    }
    return {
      type: ElEMENT_TYPE,
      tagName: tagName,
      attributes: attributes,
      children: []
    };
  }

  function createText(token) {
    var content = token.text;
    return {
      type: TEXT_TYPE,
      content: content
    };
  }

  function createNodeFactory(type, token) {
    switch (type) {
      case ElEMENT_TYPE:
        return createElement(token);
      case TEXT_TYPE:
        return createText(token);
      default:
        break;
    }
  }

  // --- Parser --------------------------------------------------------------

  // Build the tree. An end tag closes the nearest open element with the same
  // name (case-insensitive); a stray end tag is ignored and elements left
  // open at the end are attached to their parents instead of being dropped.
  function parse(tokens) {
    var root = {
      tag: "root",
      children: []
    };
    // Stack of open nodes; index 0 is always the root.
    var open = [root];

    function current() {
      return open[open.length - 1];
    }

    function closeCurrent() {
      var node = open.pop();
      current().children.push(node);
    }

    function sameTag(a, b) {
      return String(a).toLowerCase() === String(b).toLowerCase();
    }

    for (var i = 0; i < tokens.length; i++) {
      var token = tokens[i];
      if (token instanceof TagStart) {
        var node = createNodeFactory(ElEMENT_TYPE, token);
        if (node.children) {
          open.push(node);
        } else {
          current().children.push(node);
        }
      } else if (token instanceof TagEnd) {
        var depth = open.length - 1;
        while (depth > 0 && !sameTag(open[depth].tagName, token.name)) {
          depth--;
        }
        // depth 0 is the root: the end tag matches nothing that is open.
        if (depth > 0) {
          while (open.length > depth) {
            closeCurrent();
          }
        }
      } else if (token instanceof Text) {
        current().children.push(createNodeFactory(TEXT_TYPE, token));
      }
    }

    while (open.length > 1) {
      closeCurrent();
    }

    return root;
  }

  // --- Tokenizer -----------------------------------------------------------

  // Split markup into tokens. Comments are skipped (an unterminated one
  // swallows the rest), and a "<" that does not start a tag is emitted as
  // text so that every iteration consumes input.
  function tokenize(html) {
    var string = html;
    var tokens = [];
    var maxTime = Date.now() + 1000;

    while (string) {
      if (string.indexOf("<!--") === 0) {
        var commentEnd = string.indexOf("-->");
        string = commentEnd < 0 ? "" : string.substring(commentEnd + 3);
        continue;
      }

      if (string.indexOf("</") === 0) {
        var endMatch = string.match(ENDTAG_REX);
        if (endMatch) {
          string = string.substring(endMatch[0].length);
          if (!isEmptyMaker(endMatch[1])) {
            tokens.push(new TagEnd(endMatch[1]));
          }
          continue;
        }
      } else if (string.indexOf("<") === 0) {
        var startMatch = string.match(STARTTAG_REX);
        if (startMatch) {
          string = string.substring(startMatch[0].length);
          var name = startMatch[1];
          var attrs = startMatch[2];
          tokens.push(isEmptyMaker(name) ? new TagEmpty(name, attrs) : new TagStart(name, attrs));
          continue;
        }
      }

      // Skip a leading "<" that was not a tag so it becomes part of the text.
      var index = string.indexOf('<', string.charAt(0) === '<' ? 1 : 0);
      var text = index < 0 ? string : string.substring(0, index);

      string = index < 0 ? "" : string.substring(index);
      tokens.push(new Text(text));

      // Give up after one second, returning what has been collected.
      if (Date.now() >= maxTime) break;
    }
    return tokens;
  }

  function htmlParser(html) {
    return parse(tokenize(html));
  }

  return htmlParser;

})));
268 |
--------------------------------------------------------------------------------
/dist/htmlParser.min.js:
--------------------------------------------------------------------------------
1 | !function(e,t){"object"==typeof exports&&"undefined"!=typeof module?module.exports=t():"function"==typeof define&&define.amd?define(t):e.htmlParser=t()}(this,function(){"use strict";function e(e){return e.split(",").reduce(function(e,t){return e[t]=!0,e},{})}function t(e){return!!l[e]}function n(e){return!!p[e]}function r(e){var t=e.name,n=e.attributes;return e instanceof g?{type:_,tagName:t,attributes:n}:{type:_,tagName:t,attributes:n,children:[]}}function i(e){var t=e.text;return{type:x,content:t}}function o(e,t){switch(e){case _:return r(t);case x:return i(t)}}function a(e){var t={tag:"root",children:[]},n=[t];n.last=function(){return n[n.length-1]};for(var r=0;r=i)break}else{var u=n.match(c);if(!u)continue;n=n.substring(u[0].length);var s=u[1],l=u[2],p=t(s)?new g(s,l):new y(s,l);r.push(p)}else{var h=n.match(f);if(!h)continue;n=n.substring(h[0].length);var b=h[1];if(t(b))continue;r.push(new v(b))}else{var d=n.indexOf("--\x3e")+3;n=n.substring(d)}return r}function s(e){return a(u(e))}var c=/^<([-A-Za-z0-9_]+)((?:\s+[a-zA-Z_:][-a-zA-Z0-9_:.]*(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,f=/^<\/([-A-Za-z0-9_]+)[^>]*>/,l=e("area,base,basefont,br,col,frame,hr,img,input,link,meta,param,embed,command,keygen,source,track,wbr"),p=e("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected"),h=function(e,t){if(!(e instanceof t))throw new TypeError("Cannot call a class as a function")},b=function(){function e(e,t){for(var n=0;n\s]+)))?/g,function(e,r){var i=Array.prototype.slice.call(arguments),o=i[2]?i[2]:i[3]?i[3]:i[4]?i[4]:n(r)?r:"";t[r]=o.replace(/(^|[^\\])"/g,'$1\\"')}),t}}]),e}(),g=function(e){function t(e,n){return h(this,t),m(this,(t.__proto__||Object.getPrototypeOf(t)).call(this,e,n))}return d(t,e),t}(y),v=function e(t){h(this,e),this.name=t},w=function e(t){h(this,e),this.text=t},_="Element",x="Text";return s});
2 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "htmlstr-parser",
3 | "version": "2.0.1",
4 | "description": "Simple HTML to JSON parser use Regexp and String.indexOf",
5 | "main": "dist/htmlParser.js",
6 | "scripts": {
7 | "build": "BABEL_ENV=build rollup -c",
8 | "build:es6": "rollup src/index.js -f cjs -o dist/htmlParser.es6.js",
9 | "watch": "BABEL_ENV=build rollup -c -w",
10 | "pro": "BABEL_ENV=build NODE_ENV=production rollup -c",
11 | "update": "npm run build && npm run build:es6 && npm run pro",
12 | "test": "ava --watch"
13 | },
14 | "ava": {
15 | "require": [
16 | "./ava.js"
17 | ]
18 | },
19 | "repository": {
20 | "type": "git",
21 | "url": "git+https://github.com/henryluki/html-parser.git"
22 | },
23 | "keywords": [
24 | "html-parser",
25 | "parser",
26 | "json",
27 | "html-to-json",
28 | "ast",
29 | "Regex"
30 | ],
31 | "author": "henryluki",
32 | "license": "MIT",
33 | "bugs": {
34 | "url": "https://github.com/henryluki/html-parser/issues"
35 | },
36 | "homepage": "https://github.com/henryluki/html-parser#readme",
37 | "devDependencies": {
38 | "ava": "^0.16.0",
39 | "babel-plugin-external-helpers": "^6.22.0",
40 | "babel-preset-latest": "^6.24.1",
41 | "babel-register": "^6.24.1",
42 | "rollup-plugin-babel": "^2.7.1",
43 | "rollup-plugin-node-resolve": "^3.0.0",
44 | "rollup-plugin-uglify": "^1.0.1",
45 | "rollup-watch": "^3.2.2"
46 | }
47 | }
--------------------------------------------------------------------------------
/rollup.config.js:
--------------------------------------------------------------------------------
1 | import resolve from 'rollup-plugin-node-resolve';
2 | import babel from 'rollup-plugin-babel';
3 | import uglify from 'rollup-plugin-uglify';
4 |
// A production build is selected with NODE_ENV=production (see the "pro" npm script).
const isProduction = process.env.NODE_ENV === 'production'

// The uglify slot holds `false` outside production builds.
const plugins = [
  resolve(),
  babel({ exclude: 'node_modules/**' }),
  isProduction && uglify()
]

// Production output goes to the minified bundle, everything else to the plain one.
const dest = isProduction ? 'dist/htmlParser.min.js' : 'dist/htmlParser.js'

export default {
  entry: 'src/index.js',
  moduleName: "htmlParser",
  format: 'umd',
  plugins,
  dest
};
--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
1 | import { parse } from './parser/index'
2 | import { tokenize } from './tokenizer/index'
3 |
/**
 * Parse an HTML string into a node tree.
 *
 * @param {string} html - markup to parse
 * @returns {Object} whatever parse() returns for the tokens of `html`
 */
export default function htmlParser(html) {
  const tokens = tokenize(html)
  return parse(tokens)
}
--------------------------------------------------------------------------------
/src/parser/index.js:
--------------------------------------------------------------------------------
1 | import { TagStart, TagEnd, Text } from '../tokenizer/types'
2 | import { ElEMENT_TYPE, TEXT_TYPE, createNodeFactory } from './nodes'
3 |
/**
 * Build the node tree from a flat token list.
 *
 * TagStart tokens open an element (TagEmpty ones are attached immediately
 * because their node has no `children`), Text tokens are appended to the
 * innermost open element, and TagEnd tokens close the nearest open element
 * with the same name (compared case-insensitively) together with anything
 * still open inside it.
 *
 * Malformed input is tolerated: an end tag that matches no open element is
 * ignored instead of throwing, and elements still open at the end of the
 * token list are attached to their parents instead of being dropped.
 *
 * @param {Array} tokens - tokens produced by tokenize()
 * @returns {{tag: string, children: Array}} the root node
 */
export function parse(tokens) {
  const root = {
    tag: "root",
    children: []
  }
  // Stack of currently open nodes; index 0 is always the root.
  const open = [root]
  const current = () => open[open.length - 1]
  // Pop the innermost open element and attach it to its parent.
  const closeCurrent = () => {
    const node = open.pop()
    current().children.push(node)
  }
  const sameTag = (a, b) => String(a).toLowerCase() === String(b).toLowerCase()

  for (let i = 0; i < tokens.length; i++) {
    const token = tokens[i]
    if (token instanceof TagStart) {
      const node = createNodeFactory(ElEMENT_TYPE, token)
      if (node.children) {
        open.push(node)
      } else {
        current().children.push(node)
      }
    } else if (token instanceof TagEnd) {
      let depth = open.length - 1
      while (depth > 0 && !sameTag(open[depth].tagName, token.name)) {
        depth--
      }
      // depth 0 is the root: nothing open matches, so the end tag is stray.
      if (depth > 0) {
        while (open.length > depth) {
          closeCurrent()
        }
      }
    } else if (token instanceof Text) {
      current().children.push(createNodeFactory(TEXT_TYPE, token))
    }
  }

  // Attach whatever was never closed so its content is not lost.
  while (open.length > 1) {
    closeCurrent()
  }

  return root
}
--------------------------------------------------------------------------------
/src/parser/nodes.js:
--------------------------------------------------------------------------------
1 | import { TagEmpty } from '../tokenizer/types'
2 |
// `type` value carried by element nodes. The lowercase "l" in the identifier
// is part of the exported name, so it is kept.
export const ElEMENT_TYPE = 'Element'
// `type` value carried by text nodes.
export const TEXT_TYPE = 'Text'
5 |
/**
 * Turn a tag token into an element node.
 *
 * @param {TagStart} token - token exposing `name` and `attributes`
 * @returns {Object} element node; it has a `children` array unless the
 *   token is a TagEmpty
 */
function createElement(token){
  const { name, attributes } = token
  const element = { type: ElEMENT_TYPE, tagName: name, attributes }
  if (!(token instanceof TagEmpty)) {
    element.children = []
  }
  return element
}
23 |
/**
 * Turn a text token into a text node.
 *
 * @param {Text} token - token exposing `text`
 * @returns {{type: string, content: string}} text node
 */
function createText(token){
  return { type: TEXT_TYPE, content: token.text }
}
31 |
/**
 * Create a tree node of the requested kind from a token.
 *
 * @param {string} type - ElEMENT_TYPE or TEXT_TYPE
 * @param {Object} token - token the node is built from
 * @returns {Object|undefined} the node, or undefined for any other type
 */
export function createNodeFactory(type, token){
  if (type === ElEMENT_TYPE) {
    return createElement(token)
  }
  if (type === TEXT_TYPE) {
    return createText(token)
  }
  return undefined
}
--------------------------------------------------------------------------------
/src/tokenizer/index.js:
--------------------------------------------------------------------------------
1 | import { STARTTAG_REX, ENDTAG_REX }from './regexp'
2 | import { isEmptyMaker } from './makers'
3 | import { TagStart, TagEmpty, TagEnd, Text} from './types'
4 |
/**
 * Split an HTML string into TagStart / TagEmpty / TagEnd / Text tokens.
 *
 * - Comments (`<!-- ... -->`) are skipped; an unterminated comment swallows
 *   the rest of the input.
 * - End tags of names accepted by isEmptyMaker produce no token.
 * - A `<` that does not begin a recognisable tag is emitted as text, so
 *   every iteration consumes input and the loop always terminates.
 * - Tokenizing stops early, returning what has been collected, when a text
 *   token is emitted more than 1000 ms after the call started.
 *
 * @param {string} html - markup to tokenize
 * @returns {Array} tokens in document order
 */
export function tokenize(html) {
  let string = html
  const tokens = []
  const maxTime = Date.now() + 1000

  while (string) {
    if (string.indexOf("<!--") === 0) {
      const commentEnd = string.indexOf("-->")
      string = commentEnd < 0 ? "" : string.substring(commentEnd + 3)
      continue
    }

    if (string.indexOf("</") === 0) {
      const match = string.match(ENDTAG_REX)
      if (match) {
        string = string.substring(match[0].length)
        const name = match[1]
        if (!isEmptyMaker(name)) {
          tokens.push(new TagEnd(name))
        }
        continue
      }
    } else if (string.indexOf("<") === 0) {
      const match = string.match(STARTTAG_REX)
      if (match) {
        string = string.substring(match[0].length)
        const name = match[1]
        const attrs = match[2]
        tokens.push(isEmptyMaker(name) ? new TagEmpty(name, attrs) : new TagStart(name, attrs))
        continue
      }
    }

    // Text run. When the input starts with a "<" that was not a tag, search
    // from index 1 so that "<" becomes part of the text instead of stalling.
    const index = string.indexOf('<', string.charAt(0) === '<' ? 1 : 0)
    const text = index < 0 ? string : string.substring(0, index)

    string = index < 0 ? "" : string.substring(index)
    tokens.push(new Text(text))

    if (Date.now() >= maxTime) break
  }
  return tokens
}
48 |
--------------------------------------------------------------------------------
/src/tokenizer/makers.js:
--------------------------------------------------------------------------------
/**
 * Build a membership table from a comma-separated list of names.
 *
 * The table has no prototype, so names that exist on Object.prototype
 * ("constructor", "toString", ...) are not reported as members.
 *
 * @param {string} str - comma-separated names, e.g. "br,hr,img"
 * @returns {Object<string, boolean>} table mapping every listed name to true
 */
function makeMap(str) {
  const map = Object.create(null)
  const names = str.split(",")
  for (let i = 0; i < names.length; i++) {
    map[names[i]] = true
  }
  return map
}
// Tag names for which isEmptyMaker() answers true.
const EMPTY_TAG_NAMES = "area,base,basefont,br,col,frame,hr,img,input,link,meta,param,embed,command,keygen,source,track,wbr"
// Attribute names for which isFillattrsMaker() answers true.
const FILL_ATTR_NAMES = "checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected"

export const EMPTY_MAKER = makeMap(EMPTY_TAG_NAMES)
export const FILLATTRS_MAKER = makeMap(FILL_ATTR_NAMES)
9 |
/**
 * Tell whether a tag name is listed in EMPTY_MAKER.
 *
 * Only own entries count; a plain lookup would also report names inherited
 * from Object.prototype (e.g. "constructor") as empty tags.
 *
 * @param {string} tag - tag name to look up
 * @returns {boolean} true when the tag is listed in EMPTY_MAKER
 */
export function isEmptyMaker(tag){
  return Object.prototype.hasOwnProperty.call(EMPTY_MAKER, tag) && !!EMPTY_MAKER[tag]
}
13 |
/**
 * Tell whether an attribute name is listed in FILLATTRS_MAKER.
 *
 * Only own entries count; a plain lookup would also report names inherited
 * from Object.prototype (e.g. "constructor") as listed attributes.
 *
 * @param {string} attr - attribute name to look up
 * @returns {boolean} true when the attribute is listed in FILLATTRS_MAKER
 */
export function isFillattrsMaker(attr){
  return Object.prototype.hasOwnProperty.call(FILLATTRS_MAKER, attr) && !!FILLATTRS_MAKER[attr]
}
--------------------------------------------------------------------------------
/src/tokenizer/regexp.js:
--------------------------------------------------------------------------------
// Opening tag anchored at the start of the input.
// Groups: 1 = tag name, 2 = raw attribute text, 3 = "/" when the tag is self-closed.
const STARTTAG_REX = /^<([-A-Za-z0-9_]+)((?:\s+[a-zA-Z_:][-a-zA-Z0-9_:.]*(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/

// Closing tag anchored at the start of the input. Group 1 = tag name.
const ENDTAG_REX = /^<\/([-A-Za-z0-9_]+)[^>]*>/

// One attribute (global, so it can walk a whole attribute string).
// Groups: 1 = name, 2 = double-quoted value, 3 = single-quoted value, 4 = unquoted value.
const ATTR_REX = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g

export { STARTTAG_REX, ENDTAG_REX, ATTR_REX }
--------------------------------------------------------------------------------
/src/tokenizer/types.js:
--------------------------------------------------------------------------------
1 | import { ATTR_REX } from './regexp'
2 | import { isFillattrsMaker } from './makers'
3 |
/**
 * Token for an opening tag: the tag name plus its parsed attributes.
 */
export class TagStart {
  /**
   * @param {string} name - tag name
   * @param {string} tag - raw attribute text of the tag
   */
  constructor(name, tag){
    this.name = name
    this.attributes = this.getAttributes(tag)
  }

  /**
   * Parse raw attribute text into a name -> value map.
   *
   * A value is taken from the double-quoted, single-quoted or unquoted form,
   * in that order. When none is present (or it is empty) the attribute gets
   * its own name if isFillattrsMaker accepts it, otherwise "". Double quotes
   * not preceded by a backslash are backslash-escaped in the stored value.
   *
   * @param {string} str - raw attribute text
   * @returns {Object<string, string>} attribute map
   */
  getAttributes(str) {
    const collected = {}
    // replace() is used purely to visit every ATTR_REX match; its result is discarded.
    str.replace(ATTR_REX, (whole, name, doubleQuoted, singleQuoted, unquoted) => {
      const raw = doubleQuoted || singleQuoted || unquoted || (isFillattrsMaker(name) ? name : "")
      collected[name] = raw.replace(/(^|[^\\])"/g, '$1\\\"')
    })
    return collected
  }
}
23 |
/**
 * Token for a tag that has no children. It is constructed exactly like
 * TagStart (name, raw attribute text) and only differs by its class, which
 * consumers can check with instanceof.
 */
export class TagEmpty extends TagStart {}
29 |
/**
 * Token for a closing tag.
 */
export class TagEnd {
  /**
   * @param {string} name - name of the tag being closed
   */
  constructor(name) {
    this.name = name
  }
}
35 |
/**
 * Token for a run of text between tags.
 */
export class Text {
  /**
   * @param {string} text - the text content
   */
  constructor(text) {
    this.text = text
  }
}
--------------------------------------------------------------------------------
/test/index.js:
--------------------------------------------------------------------------------
1 | import htmlParser from '../src/index'
2 | import test from 'ava'
3 |
// End-to-end checks of htmlParser(): markup in, node tree out.

test('should pass the Hello World case', t => {
  const html = "<div>hello world</div>"
  const tree = {
    tag: "root",
    children: [{
      type: "Element",
      tagName: "div",
      attributes: {},
      children: [{
        type: "Text",
        content: "hello world"
      }]
    }]
  }
  t.deepEqual(htmlParser(html), tree)
})

test('should pass the attributes case', t => {
  const html = "<div style='width:100px'>hello world</div>"
  const attributes = {
    style: "width:100px"
  }
  const div = htmlParser(html).children[0]
  t.deepEqual(div.attributes, attributes)
})

test('should pass the empty tag case', t => {
  const html = "<div><br/></div>"
  const div = htmlParser(html).children[0]
  // An empty tag yields a node without a `children` property.
  const br = {
    type: "Element",
    tagName: "br",
    attributes: {}
  }
  t.deepEqual(div.children[0], br)
})

test('should pass the comment case', t => {
  const html = "<div><!-- comment --></div>"
  const div = htmlParser(html).children[0]
  // Comments produce no node at all.
  t.is(div.children.length, 0)
})

test('should pass the nested element case', t => {
  const html = "<div>a<p>b<span>c</span></p></div>"
  const tree = {
    tag: "root",
    children: [{
      type: "Element",
      tagName: "div",
      attributes: {},
      children: [{
        type: "Text",
        content: "a"
      }, {
        type: "Element",
        tagName: "p",
        attributes: {},
        children: [{
          type: "Text",
          content: "b"
        }, {
          type: "Element",
          tagName: "span",
          attributes: {},
          children: [{
            type: "Text",
            content: "c"
          }]
        }]
      }]
    }]
  }
  t.deepEqual(htmlParser(html), tree)
})
--------------------------------------------------------------------------------
/test/parser.js:
--------------------------------------------------------------------------------
1 | import { parse } from '../src/parser/index'
2 | import { ElEMENT_TYPE, TEXT_TYPE, createNodeFactory } from '../src/parser/nodes'
3 | import { tokenize } from '../src/tokenizer/index'
4 | import { TagStart, TagEmpty, Text } from '../src/tokenizer/types'
5 | import test from 'ava'
6 |
// Checks for parse() and for the node factory it relies on.

test('parse() should return json tree', t => {
  const html = "<div>hello world</div>"
  const tokens = tokenize(html)
  const tree = {
    tag: "root",
    children: [{
      type: "Element",
      tagName: "div",
      attributes: {},
      children: [{
        type: "Text",
        content: "hello world"
      }]
    }]
  }

  t.deepEqual(parse(tokens), tree)
})

test("createNodeFactory() should return different nodes", t => {
  // A TagStart token yields an element with a `children` array ...
  t.deepEqual(createNodeFactory(ElEMENT_TYPE, new TagStart("div", "style='width:100px'")), {
    type: ElEMENT_TYPE,
    tagName: "div",
    attributes: {
      style: "width:100px"
    },
    children: []
  })

  // ... a TagEmpty token yields one without it ...
  t.deepEqual(createNodeFactory(ElEMENT_TYPE, new TagEmpty("div", "style='width:100px'")), {
    type: ElEMENT_TYPE,
    tagName: "div",
    attributes: {
      style: "width:100px"
    },
  })

  // ... and a Text token yields a text node.
  t.deepEqual(createNodeFactory(TEXT_TYPE, new Text("aaa")), {
    type: TEXT_TYPE,
    content: "aaa"
  })
})
--------------------------------------------------------------------------------
/test/tokenizer.js:
--------------------------------------------------------------------------------
1 | import { tokenize } from '../src/tokenizer/index'
2 | import { TagStart, TagEmpty, TagEnd, Text } from '../src/tokenizer/types'
3 | import test from 'ava'
4 |
// Checks for tokenize() and for the token classes it emits.

test('tokenize() should return tokens', t => {
  const html = "<div>a</div>"
  const tokens = tokenize(html)

  t.is(tokens.length, 3)
  t.true(tokens[0] instanceof TagStart)
  t.true(tokens[1] instanceof Text)
  t.true(tokens[2] instanceof TagEnd)
})

test('a instanceof TagStart should have property: name, attributes', t => {
  const tag = new TagStart("div", "href='' style='width:100px'")
  const obj = {
    name: "div",
    attributes: {
      href: '',
      style: 'width:100px'
    }
  }

  t.deepEqual(tag.name, obj.name)
  t.deepEqual(tag.attributes, obj.attributes)
})

test('a instanceof TagEmpty should have property: name, attributes', t => {
  const tag = new TagEmpty("div", "href='' style='width:100px'")
  const obj = {
    name: "div",
    attributes: {
      href: '',
      style: 'width:100px'
    }
  }
  t.deepEqual(tag.name, obj.name)
  t.deepEqual(tag.attributes, obj.attributes)
})

test('a instanceof TagEnd should have property: name', t => {
  t.deepEqual(new TagEnd("div").name, "div")
})

test('a instanceof Text should have property: text', t => {
  t.deepEqual(new Text("aaa").text, "aaa")
})
--------------------------------------------------------------------------------