├── .gitattributes
├── bower.json
├── .gitignore
├── component.json
├── data
├── decode-map-overrides.json
├── encode-paired-symbols.json
├── invalid-code-points.json
├── decode-legacy-named-references.json
├── decode-map-legacy.json
├── encode-lone-code-points.json
├── encode-map.json
└── decode-map.json
├── coverage
├── prettify.css
├── index.html
├── he
│ └── index.html
└── prettify.js
├── tests
└── index.html
├── LICENSE-MIT.txt
├── package.json
├── .travis.yml
├── scripts
├── export-data.js
├── process-data.js
└── scrape-spec.js
├── Gruntfile.js
├── src
└── he.js
└── README.md
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Automatically normalize line endings for all text-based files
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/bower.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "he",
3 | "version": "0.3.6",
4 | "main": "he.js",
5 | "ignore": [
6 | "bin",
7 | "coverage",
8 | "data",
9 | "man",
10 | "scripts",
11 | "src",
12 | "tests",
13 | ".*",
14 | "component.json",
15 | "Gruntfile.js",
16 | "node_modules",
17 | "package.json"
18 | ]
19 | }
20 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # JSON version of coverage report
2 | coverage/coverage.json
3 |
4 | # Installed npm modules
5 | node_modules
6 |
7 | # Folder view configuration files
8 | .DS_Store
9 | Desktop.ini
10 |
11 | # Thumbnail cache files
12 | ._*
13 | Thumbs.db
14 |
15 | # Files that might appear on external disks
16 | .Spotlight-V100
17 | .Trashes
18 |
--------------------------------------------------------------------------------
/component.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "he",
3 | "version": "0.3.6",
4 | "description": "A robust HTML entities encoder/decoder with full Unicode support.",
5 | "repo": "mathiasbynens/he",
6 | "license": "MIT",
7 | "scripts": [
8 | "he.js"
9 | ],
10 | "main": "he.js",
11 | "keywords": [
12 | "string",
13 | "entities",
14 | "entity",
15 | "html",
16 | "encode",
17 | "decode",
18 | "unicode"
19 | ]
20 | }
21 |
--------------------------------------------------------------------------------
/data/decode-map-overrides.json:
--------------------------------------------------------------------------------
1 | {
2 | "0": "\uFFFD",
3 | "128": "\u20AC",
4 | "130": "\u201A",
5 | "131": "\u0192",
6 | "132": "\u201E",
7 | "133": "\u2026",
8 | "134": "\u2020",
9 | "135": "\u2021",
10 | "136": "\u02C6",
11 | "137": "\u2030",
12 | "138": "\u0160",
13 | "139": "\u2039",
14 | "140": "\u0152",
15 | "142": "\u017D",
16 | "145": "\u2018",
17 | "146": "\u2019",
18 | "147": "\u201C",
19 | "148": "\u201D",
20 | "149": "\u2022",
21 | "150": "\u2013",
22 | "151": "\u2014",
23 | "152": "\u02DC",
24 | "153": "\u2122",
25 | "154": "\u0161",
26 | "155": "\u203A",
27 | "156": "\u0153",
28 | "158": "\u017E",
29 | "159": "\u0178"
30 | }
31 |
--------------------------------------------------------------------------------
/coverage/prettify.css:
--------------------------------------------------------------------------------
1 | .pln{color:#000}@media screen{.str{color:#080}.kwd{color:#008}.com{color:#800}.typ{color:#606}.lit{color:#066}.pun,.opn,.clo{color:#660}.tag{color:#008}.atn{color:#606}.atv{color:#080}.dec,.var{color:#606}.fun{color:red}}@media print,projection{.str{color:#060}.kwd{color:#006;font-weight:bold}.com{color:#600;font-style:italic}.typ{color:#404;font-weight:bold}.lit{color:#044}.pun,.opn,.clo{color:#440}.tag{color:#006;font-weight:bold}.atn{color:#404}.atv{color:#060}}pre.prettyprint{padding:2px;border:1px solid #888}ol.linenums{margin-top:0;margin-bottom:0}li.L0,li.L1,li.L2,li.L3,li.L5,li.L6,li.L7,li.L8{list-style-type:none}li.L1,li.L3,li.L5,li.L7,li.L9{background:#eee}
2 |
--------------------------------------------------------------------------------
/tests/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | he test suite
6 |
7 |
8 |
9 |
10 |
11 |
12 |
22 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/LICENSE-MIT.txt:
--------------------------------------------------------------------------------
1 | Copyright Mathias Bynens
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining
4 | a copy of this software and associated documentation files (the
5 | "Software"), to deal in the Software without restriction, including
6 | without limitation the rights to use, copy, modify, merge, publish,
7 | distribute, sublicense, and/or sell copies of the Software, and to
8 | permit persons to whom the Software is furnished to do so, subject to
9 | the following conditions:
10 |
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 |
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "he",
3 | "version": "0.3.6-patch1",
4 | "description": "A robust HTML entities encoder/decoder with full Unicode support.",
5 | "homepage": "http://mths.be/he",
6 | "main": "he.js",
7 | "keywords": [
8 | "string",
9 | "entities",
10 | "entity",
11 | "html",
12 | "encode",
13 | "decode",
14 | "unicode"
15 | ],
16 | "licenses": [
17 | {
18 | "type": "MIT",
19 | "url": "http://mths.be/mit"
20 | }
21 | ],
22 | "author": {
23 | "name": "Mathias Bynens",
24 | "url": "http://mathiasbynens.be/"
25 | },
26 | "repository": {
27 | "type": "git",
28 | "url": "https://github.com/mathiasbynens/he.git"
29 | },
30 | "bugs": {
31 | "url": "https://github.com/mathiasbynens/he/issues"
32 | },
33 | "files": [
34 | "LICENSE-MIT.txt",
35 | "he.js"
36 | ],
37 | "scripts": {
38 | "test": "node tests/tests.js"
39 | },
40 | "dependencies": {},
41 | "devDependencies": {
42 | "grunt": "~0.4.1",
43 | "grunt-shell": "~0.5.0",
44 | "grunt-template": "~0.2.1",
45 | "istanbul": "~0.1.44",
46 | "jsesc": "~0.4.2",
47 | "lodash": "~2.2.1",
48 | "qunit-clib": "~1.3.0",
49 | "qunitjs": "~1.11.0",
50 | "regenerate": "~0.5.4",
51 | "requirejs": "~2.1.9"
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: node_js
2 | node_js:
3 | - "0.10"
4 | before_script:
5 | - "npm install -g grunt-cli"
6 | # Narwhal uses a hardcoded path to openjdk v6, so use that version
7 | - "sudo apt-get update -qq"
8 | - "sudo apt-get install -qq openjdk-6-jre"
9 | - "PACKAGE=rhino1_7R3; wget http://ftp.mozilla.org/pub/mozilla.org/js/$PACKAGE.zip && sudo unzip $PACKAGE -d /opt/ && rm $PACKAGE.zip"
10 | - "PACKAGE=rhino1_7R3; echo -e '#!/bin/sh\\njava -jar /opt/'$PACKAGE'/js.jar $@' | sudo tee /usr/local/bin/rhino && sudo chmod +x /usr/local/bin/rhino"
11 | - "PACKAGE=ringojs-0.9; wget http://ringojs.org/downloads/$PACKAGE.zip && sudo unzip $PACKAGE -d /opt/ && rm $PACKAGE.zip"
12 | - "PACKAGE=ringojs-0.9; sudo ln -s /opt/$PACKAGE/bin/ringo /usr/local/bin/ringo && sudo chmod +x /usr/local/bin/ringo"
13 | - "PACKAGE=v0.3.2; wget https://github.com/280north/narwhal/archive/$PACKAGE.zip && sudo unzip $PACKAGE -d /opt/ && rm $PACKAGE.zip"
14 | - "PACKAGE=narwhal-0.3.2; sudo ln -s /opt/$PACKAGE/bin/narwhal /usr/local/bin/narwhal && sudo chmod +x /usr/local/bin/narwhal"
15 | # If the environment stores rt.jar in a different directory, find it and symlink the directory
16 | - "PREFIX=/usr/lib/jvm; if [ ! -d $PREFIX/java-6-openjdk ]; then for d in $PREFIX/java-6-openjdk-*; do if [ -e $d/jre/lib/rt.jar ]; then sudo ln -s $d $PREFIX/java-6-openjdk; break; fi; done; fi"
17 | script:
18 | "grunt ci"
19 |
--------------------------------------------------------------------------------
/data/encode-paired-symbols.json:
--------------------------------------------------------------------------------
1 | [
2 | "<\u20D2",
3 | "=\u20E5",
4 | ">\u20D2",
5 | "fj",
6 | "\u205F\u200A",
7 | "\u219D\u0338",
8 | "\u2202\u0338",
9 | "\u2220\u20D2",
10 | "\u2229\uFE00",
11 | "\u222A\uFE00",
12 | "\u223C\u20D2",
13 | "\u223D\u0331",
14 | "\u223E\u0333",
15 | "\u2242\u0338",
16 | "\u224B\u0338",
17 | "\u224D\u20D2",
18 | "\u224E\u0338",
19 | "\u224F\u0338",
20 | "\u2250\u0338",
21 | "\u2261\u20E5",
22 | "\u2264\u20D2",
23 | "\u2265\u20D2",
24 | "\u2266\u0338",
25 | "\u2267\u0338",
26 | "\u2268\uFE00",
27 | "\u2269\uFE00",
28 | "\u226A\u0338",
29 | "\u226A\u20D2",
30 | "\u226B\u0338",
31 | "\u226B\u20D2",
32 | "\u227F\u0338",
33 | "\u2282\u20D2",
34 | "\u2283\u20D2",
35 | "\u228A\uFE00",
36 | "\u228B\uFE00",
37 | "\u228F\u0338",
38 | "\u2290\u0338",
39 | "\u2293\uFE00",
40 | "\u2294\uFE00",
41 | "\u22B4\u20D2",
42 | "\u22B5\u20D2",
43 | "\u22D8\u0338",
44 | "\u22D9\u0338",
45 | "\u22DA\uFE00",
46 | "\u22DB\uFE00",
47 | "\u22F5\u0338",
48 | "\u22F9\u0338",
49 | "\u2933\u0338",
50 | "\u29CF\u0338",
51 | "\u29D0\u0338",
52 | "\u2A6D\u0338",
53 | "\u2A70\u0338",
54 | "\u2A7D\u0338",
55 | "\u2A7E\u0338",
56 | "\u2AA1\u0338",
57 | "\u2AA2\u0338",
58 | "\u2AAC\uFE00",
59 | "\u2AAD\uFE00",
60 | "\u2AAF\u0338",
61 | "\u2AB0\u0338",
62 | "\u2AC5\u0338",
63 | "\u2AC6\u0338",
64 | "\u2ACB\uFE00",
65 | "\u2ACC\uFE00",
66 | "\u2AFD\u20E5"
67 | ]
68 |
--------------------------------------------------------------------------------
/data/invalid-code-points.json:
--------------------------------------------------------------------------------
1 | [
2 | 1,
3 | 2,
4 | 3,
5 | 4,
6 | 5,
7 | 6,
8 | 7,
9 | 8,
10 | 11,
11 | 13,
12 | 14,
13 | 15,
14 | 16,
15 | 17,
16 | 18,
17 | 19,
18 | 20,
19 | 21,
20 | 22,
21 | 23,
22 | 24,
23 | 25,
24 | 26,
25 | 27,
26 | 28,
27 | 29,
28 | 30,
29 | 31,
30 | 127,
31 | 128,
32 | 129,
33 | 130,
34 | 131,
35 | 132,
36 | 133,
37 | 134,
38 | 135,
39 | 136,
40 | 137,
41 | 138,
42 | 139,
43 | 140,
44 | 141,
45 | 142,
46 | 143,
47 | 144,
48 | 145,
49 | 146,
50 | 147,
51 | 148,
52 | 149,
53 | 150,
54 | 151,
55 | 152,
56 | 153,
57 | 154,
58 | 155,
59 | 156,
60 | 157,
61 | 158,
62 | 159,
63 | 64976,
64 | 64977,
65 | 64978,
66 | 64979,
67 | 64980,
68 | 64981,
69 | 64982,
70 | 64983,
71 | 64984,
72 | 64985,
73 | 64986,
74 | 64987,
75 | 64988,
76 | 64989,
77 | 64990,
78 | 64991,
79 | 64992,
80 | 64993,
81 | 64994,
82 | 64995,
83 | 64996,
84 | 64997,
85 | 64998,
86 | 64999,
87 | 65000,
88 | 65001,
89 | 65002,
90 | 65003,
91 | 65004,
92 | 65005,
93 | 65006,
94 | 65007,
95 | 65534,
96 | 65535,
97 | 131070,
98 | 131071,
99 | 196606,
100 | 196607,
101 | 262142,
102 | 262143,
103 | 327678,
104 | 327679,
105 | 393214,
106 | 393215,
107 | 458750,
108 | 458751,
109 | 524286,
110 | 524287,
111 | 589822,
112 | 589823,
113 | 655358,
114 | 655359,
115 | 720894,
116 | 720895,
117 | 786430,
118 | 786431,
119 | 851966,
120 | 851967,
121 | 917502,
122 | 917503,
123 | 983038,
124 | 983039,
125 | 1048574,
126 | 1048575,
127 | 1114110,
128 | 1114111
129 | ]
130 |
--------------------------------------------------------------------------------
/data/decode-legacy-named-references.json:
--------------------------------------------------------------------------------
1 | [
2 | "Aacute",
3 | "iacute",
4 | "Uacute",
5 | "plusmn",
6 | "otilde",
7 | "Otilde",
8 | "Agrave",
9 | "agrave",
10 | "yacute",
11 | "Yacute",
12 | "oslash",
13 | "Oslash",
14 | "Atilde",
15 | "atilde",
16 | "brvbar",
17 | "Ccedil",
18 | "ccedil",
19 | "ograve",
20 | "curren",
21 | "divide",
22 | "Eacute",
23 | "eacute",
24 | "Ograve",
25 | "oacute",
26 | "Egrave",
27 | "egrave",
28 | "ugrave",
29 | "frac12",
30 | "frac14",
31 | "frac34",
32 | "Ugrave",
33 | "Oacute",
34 | "Iacute",
35 | "ntilde",
36 | "Ntilde",
37 | "uacute",
38 | "middot",
39 | "Igrave",
40 | "igrave",
41 | "iquest",
42 | "aacute",
43 | "laquo",
44 | "THORN",
45 | "micro",
46 | "iexcl",
47 | "icirc",
48 | "Icirc",
49 | "Acirc",
50 | "ucirc",
51 | "ecirc",
52 | "Ocirc",
53 | "ocirc",
54 | "Ecirc",
55 | "Ucirc",
56 | "aring",
57 | "Aring",
58 | "aelig",
59 | "AElig",
60 | "acute",
61 | "pound",
62 | "raquo",
63 | "acirc",
64 | "times",
65 | "thorn",
66 | "szlig",
67 | "cedil",
68 | "COPY",
69 | "Auml",
70 | "ordf",
71 | "ordm",
72 | "uuml",
73 | "macr",
74 | "Uuml",
75 | "auml",
76 | "Ouml",
77 | "ouml",
78 | "para",
79 | "nbsp",
80 | "Euml",
81 | "quot",
82 | "QUOT",
83 | "euml",
84 | "yuml",
85 | "cent",
86 | "sect",
87 | "copy",
88 | "sup1",
89 | "sup2",
90 | "sup3",
91 | "Iuml",
92 | "iuml",
93 | "shy",
94 | "eth",
95 | "reg",
96 | "not",
97 | "yen",
98 | "amp",
99 | "AMP",
100 | "REG",
101 | "uml",
102 | "ETH",
103 | "deg",
104 | "gt",
105 | "GT",
106 | "LT",
107 | "lt"
108 | ]
109 |
--------------------------------------------------------------------------------
/scripts/export-data.js:
--------------------------------------------------------------------------------
1 | var fs = require('fs');
2 | var jsesc = require('jsesc');
3 | var regenerate = require('regenerate');
4 |
// Load `data/<fileName>.json` and parse it.
// Arrays are returned as-is (as JavaScript arrays). Any other parsed value is
// returned as a compact, single-quoted JavaScript source string produced by
// `jsesc`.
var readJSON = function(fileName) {
	var path = 'data/' + fileName + '.json';
	var parsed = JSON.parse(fs.readFileSync(path, 'utf-8'));
	return Array.isArray(parsed) ? parsed : jsesc(parsed, {
		'compact': true,
		'quotes': 'single'
	});
};
16 |
// Combine two regex source fragments into one alternation. The `|` separator
// is only inserted when both fragments are non-empty; otherwise the fragments
// are simply concatenated.
var joinStrings = function(a, b) {
	return a && b ? a + '|' + b : a + b;
};
23 |
24 | var loneCodePoints = readJSON('encode-lone-code-points');
25 | var arrayEncodeMultipleSymbols = readJSON('encode-paired-symbols');
26 | var arrayEncodeMultipleSymbolsASCII = arrayEncodeMultipleSymbols
27 | .filter(function(string) {
28 | return /^[\0-\x7F]+$/.test(string);
29 | });
30 |
31 | var encodeSingleSymbolsASCII = regenerate(loneCodePoints)
32 | .removeRange(0x7F + 1, 0x10FFFF).toString();
33 | var encodeSingleSymbolsNonASCII = regenerate(loneCodePoints)
34 | .removeRange(0x00, 0x7F).toString();
35 | var encodeMultipleSymbolsASCII = jsesc(
36 | arrayEncodeMultipleSymbolsASCII.join('|')
37 | );
38 | var encodeMultipleSymbolsNonASCII = jsesc(
39 | regenerate.difference(
40 | arrayEncodeMultipleSymbols,
41 | arrayEncodeMultipleSymbolsASCII
42 | ).join('|')
43 | );
44 | var encodeASCII = joinStrings(
45 | encodeMultipleSymbolsASCII,
46 | encodeSingleSymbolsASCII
47 | );
48 | var encodeNonASCII = joinStrings(
49 | encodeMultipleSymbolsNonASCII,
50 | encodeSingleSymbolsNonASCII
51 | );
52 |
53 | module.exports = {
54 | 'encodeMap': readJSON('encode-map'),
55 | 'encodeASCII': encodeASCII, // not used
56 | 'encodeNonASCII': encodeNonASCII,
57 | 'decodeOverrides': readJSON('decode-map-overrides'),
58 | 'decodeMap': readJSON('decode-map'),
59 | 'decodeMapLegacy': readJSON('decode-map-legacy'),
60 | 'astralSymbol': regenerate.fromCodePointRange(0x010000, 0x10FFFF),
61 | 'invalidCodePoints': jsesc(readJSON('invalid-code-points')),
62 | 'regexDecimalEscapeSource': '([0-9]+)(;?)',
63 | 'regexHexadecimalEscapeSource': '[xX]([a-fA-F0-9]+)(;?)',
64 | 'regexNamedReferenceSource': '&([0-9a-zA-Z]+);',
65 | 'regexLegacyReferenceSource': '&(' +
66 | readJSON('decode-legacy-named-references').join('|') + ')([=a-zA-Z0-9])?',
67 | 'version': JSON.parse(fs.readFileSync('package.json', 'utf-8')).version
68 | };
69 |
--------------------------------------------------------------------------------
/data/decode-map-legacy.json:
--------------------------------------------------------------------------------
1 | {
2 | "Aacute": "\u00C1",
3 | "aacute": "\u00E1",
4 | "Acirc": "\u00C2",
5 | "acirc": "\u00E2",
6 | "acute": "\u00B4",
7 | "AElig": "\u00C6",
8 | "aelig": "\u00E6",
9 | "Agrave": "\u00C0",
10 | "agrave": "\u00E0",
11 | "amp": "&",
12 | "AMP": "&",
13 | "Aring": "\u00C5",
14 | "aring": "\u00E5",
15 | "Atilde": "\u00C3",
16 | "atilde": "\u00E3",
17 | "Auml": "\u00C4",
18 | "auml": "\u00E4",
19 | "brvbar": "\u00A6",
20 | "Ccedil": "\u00C7",
21 | "ccedil": "\u00E7",
22 | "cedil": "\u00B8",
23 | "cent": "\u00A2",
24 | "copy": "\u00A9",
25 | "COPY": "\u00A9",
26 | "curren": "\u00A4",
27 | "deg": "\u00B0",
28 | "divide": "\u00F7",
29 | "Eacute": "\u00C9",
30 | "eacute": "\u00E9",
31 | "Ecirc": "\u00CA",
32 | "ecirc": "\u00EA",
33 | "Egrave": "\u00C8",
34 | "egrave": "\u00E8",
35 | "ETH": "\u00D0",
36 | "eth": "\u00F0",
37 | "Euml": "\u00CB",
38 | "euml": "\u00EB",
39 | "frac12": "\u00BD",
40 | "frac14": "\u00BC",
41 | "frac34": "\u00BE",
42 | "gt": ">",
43 | "GT": ">",
44 | "Iacute": "\u00CD",
45 | "iacute": "\u00ED",
46 | "Icirc": "\u00CE",
47 | "icirc": "\u00EE",
48 | "iexcl": "\u00A1",
49 | "Igrave": "\u00CC",
50 | "igrave": "\u00EC",
51 | "iquest": "\u00BF",
52 | "Iuml": "\u00CF",
53 | "iuml": "\u00EF",
54 | "laquo": "\u00AB",
55 | "lt": "<",
56 | "LT": "<",
57 | "macr": "\u00AF",
58 | "micro": "\u00B5",
59 | "middot": "\u00B7",
60 | "nbsp": "\u00A0",
61 | "not": "\u00AC",
62 | "Ntilde": "\u00D1",
63 | "ntilde": "\u00F1",
64 | "Oacute": "\u00D3",
65 | "oacute": "\u00F3",
66 | "Ocirc": "\u00D4",
67 | "ocirc": "\u00F4",
68 | "Ograve": "\u00D2",
69 | "ograve": "\u00F2",
70 | "ordf": "\u00AA",
71 | "ordm": "\u00BA",
72 | "Oslash": "\u00D8",
73 | "oslash": "\u00F8",
74 | "Otilde": "\u00D5",
75 | "otilde": "\u00F5",
76 | "Ouml": "\u00D6",
77 | "ouml": "\u00F6",
78 | "para": "\u00B6",
79 | "plusmn": "\u00B1",
80 | "pound": "\u00A3",
81 | "quot": "\"",
82 | "QUOT": "\"",
83 | "raquo": "\u00BB",
84 | "reg": "\u00AE",
85 | "REG": "\u00AE",
86 | "sect": "\u00A7",
87 | "shy": "\u00AD",
88 | "sup1": "\u00B9",
89 | "sup2": "\u00B2",
90 | "sup3": "\u00B3",
91 | "szlig": "\u00DF",
92 | "THORN": "\u00DE",
93 | "thorn": "\u00FE",
94 | "times": "\u00D7",
95 | "Uacute": "\u00DA",
96 | "uacute": "\u00FA",
97 | "Ucirc": "\u00DB",
98 | "ucirc": "\u00FB",
99 | "Ugrave": "\u00D9",
100 | "ugrave": "\u00F9",
101 | "uml": "\u00A8",
102 | "Uuml": "\u00DC",
103 | "uuml": "\u00FC",
104 | "Yacute": "\u00DD",
105 | "yacute": "\u00FD",
106 | "yen": "\u00A5",
107 | "yuml": "\u00FF"
108 | }
109 |
--------------------------------------------------------------------------------
/scripts/process-data.js:
--------------------------------------------------------------------------------
1 | var fs = require('fs');
2 | var jsesc = require('jsesc');
3 | var _ = require('lodash');
4 |
5 | // http://www.whatwg.org/specs/web-apps/current-work/multipage/entities.json
6 | var data = JSON.parse(fs.readFileSync('data/entities.json', 'utf8'));
7 |
8 | var encodeMap = {};
9 | var encodeMultipleSymbols = [];
10 | var encodeSingleCodePoints = [];
11 | var decodeMap = {};
12 | var decodeMapLegacy = {};
13 |
// Walk every entry of `entities.json` and populate the encode/decode tables.
// Keys look like `&name;` (or `&name` for legacy references without a
// trailing semicolon); values carry `characters` and `codepoints`.
_.forOwn(data, function(entity, reference) {
	var name = reference.replace(/^&/, '');
	var symbols = entity.characters;

	if (!/;$/.test(name)) {
		// No trailing semicolon: this reference is only usable for legacy decoding
		decodeMapLegacy[name] = symbols;
		return;
	}

	var bareName = name.replace(/;$/, '');
	var current = encodeMap[symbols];
	// Prefer short named character references with as few uppercase letters as
	// possible: replace the current entry when there is none yet, when this
	// reference is shorter, or when it has the same length but fewer uppercase
	// letters.
	var isBetter = !current ||
		current.length > bareName.length ||
		(
			current.length == bareName.length &&
			(bareName.match(/[A-Z]/g) || []).length <
				(current.match(/[A-Z]/g) || []).length
		);
	if (isBetter) {
		encodeMap[symbols] = bareName;
	}

	// Track which symbols can be encoded, split by how many code points they span
	if (entity.codepoints.length == 1) {
		encodeSingleCodePoints.push(entity.codepoints[0]);
	} else {
		encodeMultipleSymbols.push(symbols);
	}

	decodeMap[bareName] = symbols;
});
52 |
53 | encodeMultipleSymbols = _.uniq(
54 | encodeMultipleSymbols.sort(), // sort strings by code point value
55 | true
56 | );
57 |
58 | encodeSingleCodePoints = _.uniq(
59 | _.sortBy(encodeSingleCodePoints), // numeric sort
60 | true
61 | );
62 |
// Names of the legacy (semicolon-less) references, longest first. Names of
// equal length are ordered by ascending code unit value so that the generated
// file is deterministic. (Subtracting two strings yields `NaN`, which `sort`
// treats as "equal", so a `a - b` tie-breaker would leave the order up to the
// engine.)
var legacyReferences = _.keys(decodeMapLegacy).sort(function(a, b) {
	if (a.length != b.length) {
		// Longer names sort before shorter ones
		return b.length - a.length;
	}
	if (a < b) {
		return -1;
	}
	if (a > b) {
		return 1;
	}
	return 0;
});
73 |
// Serialize `object` as non-compact JSON via `jsesc` and write it, followed
// by a trailing newline, to `fileName`.
var writeJSON = function(fileName, object) {
	var options = {
		'compact': false,
		'json': true
	};
	fs.writeFileSync(fileName, jsesc(object, options) + '\n');
};
81 |
82 | writeJSON('data/decode-map.json', decodeMap);
83 | writeJSON('data/decode-map-legacy.json', decodeMapLegacy);
84 | writeJSON('data/decode-legacy-named-references.json', legacyReferences);
85 | writeJSON('data/encode-map.json', encodeMap);
86 | writeJSON('data/encode-paired-symbols.json', encodeMultipleSymbols);
87 | writeJSON('data/encode-lone-code-points.json', encodeSingleCodePoints);
88 |
--------------------------------------------------------------------------------
/Gruntfile.js:
--------------------------------------------------------------------------------
1 | module.exports = function(grunt) {
2 |
3 | grunt.initConfig({
4 | 'shell': {
5 | 'options': {
6 | 'stdout': true,
7 | 'stderr': true,
8 | 'failOnError': true
9 | },
10 | 'cover': {
11 | 'command': 'istanbul cover --report "html" --verbose --dir "coverage" "tests/tests.js"'
12 | },
13 | 'fetch-entities': {
14 | 'command': 'curl http://www.whatwg.org/specs/web-apps/current-work/entities.json | sed "s/ /\t/g" > data/entities.json'
15 | },
16 | 'fetch-and-scrape-spec': {
17 | 'command': 'phantomjs --load-images=no scripts/scrape-spec.js'
18 | },
19 | 'process-data': {
20 | 'command': 'node scripts/process-data.js'
21 | },
22 | 'test-narwhal': {
23 | 'command': 'echo "Testing in Narwhal..."; export NARWHAL_OPTIMIZATION=-1; narwhal "tests/tests.js"'
24 | },
25 | 'test-phantomjs': {
26 | 'command': 'echo "Testing in PhantomJS..."; phantomjs "tests/tests.js"'
27 | },
28 | // Rhino 1.7R4 has a bug that makes it impossible to test he.
29 | // https://bugzilla.mozilla.org/show_bug.cgi?id=775566
30 | // To test, use Rhino 1.7R3, or wait (heh) for the 1.7R5 release.
31 | 'test-rhino': {
32 | 'command': 'echo "Testing in Rhino..."; rhino -opt -1 "tests.js"',
33 | 'options': {
34 | 'execOptions': {
35 | 'cwd': 'tests'
36 | }
37 | }
38 | },
39 | 'test-ringo': {
40 | 'command': 'echo "Testing in Ringo..."; ringo -o -1 "tests/tests.js"'
41 | },
42 | 'test-node': {
43 | 'command': 'echo "Testing in Node..."; node "tests/tests.js"'
44 | },
45 | 'test-browser': {
46 | 'command': 'echo "Testing in a browser..."; open "tests/index.html"'
47 | }
48 | },
49 | 'template': {
50 | 'build-he': {
51 | 'options': {
52 | 'data': function() {
53 | return require('./scripts/export-data.js');
54 | }
55 | },
56 | 'files': {
57 | 'he.js': ['src/he.js']
58 | }
59 | },
60 | 'build-tests': {
61 | 'options': {
62 | 'data': function() {
63 | return {
64 | 'testData': require('fs')
65 | .readFileSync('data/entities.json', 'utf-8').trim()
66 | }
67 | }
68 | },
69 | 'files': {
70 | 'tests/tests.js': ['tests/tests.src.js']
71 | }
72 | }
73 | }
74 | });
75 |
76 | grunt.loadNpmTasks('grunt-template');
77 | grunt.loadNpmTasks('grunt-shell');
78 |
79 | grunt.registerTask('cover', 'shell:cover');
80 | grunt.registerTask('ci', [
81 | 'shell:test-narwhal',
82 | 'shell:test-phantomjs',
83 | 'shell:test-rhino',
84 | 'shell:test-ringo',
85 | 'shell:test-node',
86 | ]);
87 | grunt.registerTask('test', [
88 | 'ci',
89 | 'shell:test-browser'
90 | ]);
91 |
92 | grunt.registerTask('default', [
93 | 'template',
94 | 'shell:test-node'
95 | ]);
96 |
97 | grunt.registerTask('build', [
98 | 'shell:process-data',
99 | 'default'
100 | ]);
101 |
102 | grunt.registerTask('fetch', [
103 | 'shell:fetch-entities',
104 | 'shell:fetch-and-scrape-spec',
105 | 'build'
106 | ]);
107 |
108 | };
109 |
--------------------------------------------------------------------------------
/scripts/scrape-spec.js:
--------------------------------------------------------------------------------
1 | var page = require('webpage').create();
2 | var fs = require('fs');
3 | var jsesc = require('jsesc');
4 |
// Load `url` in the PhantomJS page and invoke `callback` once it has loaded.
// If the page cannot be loaded, report the failure and exit with a non-zero
// status so that callers (e.g. a task runner) can detect the error instead of
// continuing with stale data.
var open = function(url, callback) {
	page.open(url, function(status) {
		if (status != 'success') {
			console.log('Failed to load ' + url + ' (status: ' + status + ').');
			return phantom.exit(1);
		}
		callback();
	});
};
13 |
// Write `contents` plus a trailing newline to `fileName` (overwriting it),
// then log a confirmation message.
var writeJSON = function(fileName, contents) {
	var payload = contents + '\n';
	fs.write(fileName, payload, 'w');
	console.log(fileName + ' created successfully.');
};
18 |
19 | open('http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#table-charref-overrides', function() {
20 | var result = page.evaluate(function() {
21 |
// Modified version of `ucs2encode`; see http://mths.be/punycode
// Converts a single code point into a string: one UTF-16 code unit for code
// points up to 0xFFFF, a high/low surrogate pair for anything above.
var stringFromCharCode = String.fromCharCode;
var codePointToSymbol = function(codePoint) {
	if (codePoint > 0xFFFF) {
		var offset = codePoint - 0x10000;
		var highSurrogate = (offset >>> 10 & 0x3FF) | 0xD800;
		var lowSurrogate = 0xDC00 | (offset & 0x3FF);
		return stringFromCharCode(highSurrogate) +
			stringFromCharCode(lowSurrogate);
	}
	return '' + stringFromCharCode(codePoint);
};
34 |
// Build an array of consecutive integers from `start` to `stop`, inclusive.
// Returns an empty array when `start` is greater than `stop`.
var range = function(start, stop) {
	var result = [];
	while (start <= stop) {
		result.push(start);
		start++;
	}
	return result;
};
39 |
40 | var table = document.querySelector('#table-charref-overrides');
41 |
42 | // Code points that cause parse errors
43 | var siblings = table.parentNode.children;
44 | var max = siblings.length - 1;
45 | var text = siblings[max].innerText;
46 | var codePoints = [];
47 | text.replace(/0x([a-fA-F0-9]+)\s+to\s+0x([a-fA-F0-9]+)/g, function($0, $1, $2) {
48 | var start = parseInt($1, 16);
49 | var end = parseInt($2, 16);
50 | codePoints = codePoints.concat(range(start, end));
51 | return '';
52 | }).replace(/0x([a-fA-F0-9]+)/g, function($0, $1) {
53 | var codePoint = parseInt($1, 16);
54 | codePoints.push(codePoint);
55 | return '';
56 | });
57 |
58 | // Character reference overrides
59 | var cells = table.querySelectorAll('td');
60 | var keys = [].filter.call(cells, function(cell, index) {
61 | return index % 3 == 0;
62 | }).map(function(cell) {
63 | return Number(cell.innerText.trim());
64 | });
65 | var values = [].filter.call(cells, function(cell, index) {
66 | return index % 3 == 1;
67 | }).map(function(cell) {
68 | var hex = cell.innerText.trim().replace('U+', '');
69 | var codePoint = parseInt(hex, 16);
70 | return codePointToSymbol(codePoint);
71 | });
72 |
// Build the override map from the scraped table rows. A row whose replacement
// differs from the symbol for its own code point is a real override. A row
// that maps a code point to itself is only recorded as an invalid code point
// (unless it is already in that list).
var overrides = {};
// Note: `forEach` returns `undefined`, so its result must not be assigned
// back to `keys`.
keys.forEach(function(codePoint, index) {
	var correspondingValue = values[index];
	var mapsToItself = codePointToSymbol(codePoint) == correspondingValue;
	if (!mapsToItself) {
		overrides[codePoint] = correspondingValue;
		return;
	}
	if (codePoints.indexOf(codePoint) == -1) {
		codePoints.push(codePoint);
	}
});
87 |
88 | // Pass everything back to PhantomJS
89 | return {
90 | 'overrides': overrides,
91 | // When passed as an array, it comes out as an object, so pass it as a
92 | // comma-separated string instead
93 | 'codePoints': codePoints.join(',')
94 | };
95 |
96 | });
97 |
98 | writeJSON('data/decode-map-overrides.json', jsesc(result.overrides, {
99 | 'json': true,
100 | 'compact': false
101 | }));
102 |
103 | var codePoints = result.codePoints.split(',').map(function(string) {
104 | return parseInt(string, 10);
105 | }).sort(function(a, b) {
106 | return a - b;
107 | });
108 | writeJSON('data/invalid-code-points.json', jsesc(codePoints, {
109 | 'json': true,
110 | 'compact': false
111 | }));
112 |
113 | phantom.exit();
114 | });
115 |
--------------------------------------------------------------------------------
/src/he.js:
--------------------------------------------------------------------------------
1 | /*! http://mths.be/he v<%= version %> by @mathias | MIT license */
2 | ;(function(root) {
3 |
4 | // Detect free variables `exports`
5 | var freeExports = typeof exports == 'object' && exports;
6 |
7 | // Detect free variable `module`
8 | var freeModule = typeof module == 'object' && module &&
9 | module.exports == freeExports && module;
10 |
11 | // Detect free variable `global`, from Node.js or Browserified code,
12 | // and use it as `root`
13 | var freeGlobal = typeof global == 'object' && global;
14 | if (freeGlobal.global === freeGlobal || freeGlobal.window === freeGlobal) {
15 | root = freeGlobal;
16 | }
17 |
18 | /*--------------------------------------------------------------------------*/
19 |
20 | var regexAstralSymbols = /<%= astralSymbol %>/g;
21 | var regexASCII = /[\0-\x7F]/g;
22 | var regexNonASCII = /[^\0-\x7F]/g;
23 |
24 | var regexEncodeNonASCII = /<%= encodeNonASCII %>/g;
25 | var encodeMap = <%= encodeMap %>;
26 |
// Symbols that are escaped, and the character references they map to.
var regexEscape = /[&<>"']/g;
var escapeMap = {
	'&': '&amp;',
	'<': '&lt;',
	'"': '&quot;',
	'\'': '&#x27;',
	// See http://mathiasbynens.be/notes/ambiguous-ampersands: in HTML, the
	// following is not strictly necessary unless it’s part of a tag or an
	// unquoted attribute value. We’re only escaping it for XML support, and to
	// match existing `htmlEscape` implementations.
	'>': '&gt;'
};
39 |
// Matches a malformed numeric character reference: `&#` followed by something
// that is not a digit or `x`/`X`, or `&#x`/`&#X` followed by a non-hex-digit.
// The `&#` prefix is required; without it nearly any text would match.
var regexInvalidEntity = /&#(?:[xX][^a-fA-F0-9]|[^0-9xX])/;
41 | var regexDecode = /<%=
42 | regexDecimalEscapeSource
43 | %>|<%=
44 | regexHexadecimalEscapeSource
45 | %>|<%=
46 | regexNamedReferenceSource
47 | %>|<%=
48 | regexLegacyReferenceSource
49 | %>/g;
50 | var decodeMap = <%= decodeMap %>;
51 | var decodeMapLegacy = <%= decodeMapLegacy %>;
52 | var decodeMapNumeric = <%= decodeOverrides %>;
53 | var invalidCodePoints = <%= invalidCodePoints %>;
54 |
55 | /*--------------------------------------------------------------------------*/
56 |
57 | var stringFromCharCode = String.fromCharCode;
58 |
var object = {},
	hasOwnProperty = object.hasOwnProperty;
// Report whether `propertyName` is an own (non-inherited) property of
// `target`. Uses the borrowed `hasOwnProperty`, so it also works when the
// target shadows or lacks that method.
var has = function(target, propertyName) {
	return hasOwnProperty.call(target, propertyName);
};
64 |
// Linear search: returns `true` if any element of `array` loosely equals
// (`==`) `value`, `false` otherwise.
var contains = function(array, value) {
	var length = array.length;
	for (var index = 0; index < length; index++) {
		if (array[index] == value) {
			return true;
		}
	}
	return false;
};
75 |
// Combine user-supplied `options` with `defaults`. When `options` is falsy the
// `defaults` object itself is returned. Otherwise a new object is built that
// has exactly the keys enumerated from `defaults`, taking each value from
// `options` when it is an own property there and from `defaults` otherwise.
var merge = function(options, defaults) {
	if (!options) {
		return defaults;
	}
	var result = {};
	var key;
	// `hasOwnProperty` check is not needed for the loop itself, since only
	// recognized option names are used
	for (key in defaults) {
		if (has(options, key)) {
			result[key] = options[key];
		} else {
			result[key] = defaults[key];
		}
	}
	return result;
};
89 |
// Modified version of `ucs2encode`; see http://mths.be/punycode
// Turn the code point of a numeric character reference into the string it
// decodes to. When `strict` is truthy, disallowed references raise a parse
// error via `parseError` instead of being silently replaced.
var codePointToSymbol = function(codePoint, strict) {
	var isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
	if (isSurrogate || codePoint > 0x10FFFF) {
		// See issue #4:
		// “Otherwise, if the number is in the range 0xD800 to 0xDFFF or is
		// greater than 0x10FFFF, then this is a parse error. Return a U+FFFD
		// REPLACEMENT CHARACTER.”
		if (strict) {
			parseError('character reference outside the permissible Unicode range');
		}
		return '\uFFFD';
	}
	// Code points listed in the override table decode to their replacement
	if (has(decodeMapNumeric, codePoint)) {
		if (strict) {
			parseError('disallowed character reference');
		}
		return decodeMapNumeric[codePoint];
	}
	if (strict && contains(invalidCodePoints, codePoint)) {
		parseError('disallowed character reference');
	}
	if (codePoint > 0xFFFF) {
		// Astral code point: emit a high/low surrogate pair
		var offset = codePoint - 0x10000;
		var highSurrogate = (offset >>> 10 & 0x3FF) | 0xD800;
		var lowSurrogate = 0xDC00 | (offset & 0x3FF);
		return stringFromCharCode(highSurrogate) +
			stringFromCharCode(lowSurrogate);
	}
	return '' + stringFromCharCode(codePoint);
};
120 |
/**
 * Encodes the first UTF-16 code unit of `symbol` as a hexadecimal character
 * reference, e.g. `'©'` becomes `'&#xA9;'`.
 *
 * @param {string} symbol A string whose first code unit is to be escaped.
 * @returns {string} The reference, in the form `&#x` + uppercase hex + `;`.
 */
var hexEscape = function(symbol) {
	// The `&#x` prefix is what makes this a character reference; without it
	// the output is just hex digits followed by a semicolon.
	return '&#x' + symbol.charCodeAt(0).toString(16).toUpperCase() + ';';
};
124 |
/**
 * Reports a parse error by throwing.
 *
 * @param {string} message Description of the problem.
 * @throws {Error} Always; the message is prefixed with `Parse error: `.
 */
var parseError = function(message) {
	var description = 'Parse error: ' + message;
	throw new Error(description);
};
128 |
129 | /*--------------------------------------------------------------------------*/
130 |
/**
 * Encodes symbols in `string` as character references.
 *
 * @param {string} string The text to encode.
 * @param {Object} [options] Overrides for `encode.options`:
 *   `useNamedReferences` — use named references where one exists;
 *   `encodeEverything` — also encode ASCII symbols.
 * @returns {string} The encoded text.
 */
var encode = function(string, options) {
	options = merge(options, encode.options);
	var encodeEverything = options.encodeEverything;
	var useNamedReferences = options.useNamedReferences;
	// Callers only pass symbols matched by a regex whose alternatives are all
	// keys of `encodeMap`, so no `has()` check is needed here.
	var toNamedReference = function(symbol) {
		return '&' + encodeMap[symbol] + ';';
	};
	if (encodeEverything) {
		// Encode ASCII symbols
		string = string.replace(regexASCII, function(symbol) {
			// Use named references if requested & possible
			if (useNamedReferences && has(encodeMap, symbol)) {
				return '&' + encodeMap[symbol] + ';';
			}
			return hexEscape(symbol);
		});
		if (useNamedReferences) {
			// Shorten a few escapes that represent two symbols, of which at least
			// one is within the ASCII range. The ASCII half has already been
			// escaped above, so the patterns match the escaped form.
			string = string
				.replace(/&gt;\u20D2/g, '&nvgt;')
				.replace(/&lt;\u20D2/g, '&nvlt;')
				.replace(/&#x66;&#x6A;/g, '&fjlig;');
			// Encode non-ASCII symbols that can be replaced with a named reference
			string = string.replace(regexEncodeNonASCII, toNamedReference);
		}
		// Note: any remaining non-ASCII symbols are handled outside of the `if`
	} else if (useNamedReferences) {
		// Encode `<>"'&` using named character references
		string = string.replace(regexEscape, toNamedReference);
		// Shorten escapes that represent two symbols, of which at least one is
		// `<>"'&`
		string = string
			.replace(/&gt;\u20D2/g, '&nvgt;')
			.replace(/&lt;\u20D2/g, '&nvlt;');
		// Encode non-ASCII symbols that can be replaced with a named reference
		string = string.replace(regexEncodeNonASCII, toNamedReference);
	} else {
		// Encode `<>"'&` using hexadecimal escapes, now that they’re not handled
		// using named character references
		string = string.replace(regexEscape, hexEscape);
	}
	return string
		// Encode astral symbols
		.replace(regexAstralSymbols, function($0) {
			// http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
			var high = $0.charCodeAt(0);
			var low = $0.charCodeAt(1);
			var codePoint = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000;
			return '&#x' + codePoint.toString(16).toUpperCase() + ';';
		})
		// Encode any remaining non-ASCII symbols using a hexadecimal escape
		.replace(regexNonASCII, hexEscape);
};
// Expose default options (so they can be overridden globally)
encode.options = {
	'useNamedReferences': false,
	'encodeEverything': false
};
197 |
/**
 * Decodes the named and numeric character references in `html`.
 *
 * @param {string} html The text to decode.
 * @param {Object} [options] Overrides for `decode.options`:
 *   `isAttributeValue` — apply the attribute-value rules for references that
 *   lack a trailing semicolon;
 *   `strict` — throw (via `parseError`) on parse errors instead of recovering.
 * @returns {string} The decoded text.
 */
var decode = function(html, options) {
	options = merge(options, decode.options);
	var strict = options.strict;
	if (strict && regexInvalidEntity.test(html)) {
		parseError('malformed character reference');
	}
	// Capture groups: $1/$2 decimal digits and optional `;`, $3/$4 hexadecimal
	// digits and optional `;`, $5 named reference followed by `;`, $6/$7
	// legacy reference and the symbol that follows it (if any).
	return html.replace(regexDecode, function($0, $1, $2, $3, $4, $5, $6, $7) {
		var codePoint;
		var semicolon;
		var decDigits;
		var hexDigits;
		var reference;
		var next;
		if ($1) {
			// Decode decimal escapes, e.g. `&#119558;`
			decDigits = $1;
			semicolon = $2;
			if (strict && !semicolon) {
				parseError('character reference was not terminated by a semicolon');
			}
			// Convert to a number first: passing the raw digit string on would
			// make references with leading zeros (e.g. `&#0128;`) miss the
			// numeric override table, which is keyed by the canonical number.
			codePoint = parseInt(decDigits, 10);
			return codePointToSymbol(codePoint, strict);
		}
		if ($3) {
			// Decode hexadecimal escapes, e.g. `&#x1D306;`
			hexDigits = $3;
			semicolon = $4;
			if (strict && !semicolon) {
				parseError('character reference was not terminated by a semicolon');
			}
			codePoint = parseInt(hexDigits, 16);
			return codePointToSymbol(codePoint, strict);
		}
		if ($5) {
			// Decode named character references with trailing `;`, e.g. `&copy;`
			reference = $5;
			if (has(decodeMap, reference)) {
				return decodeMap[reference];
			}
			// ambiguous ampersand; see http://mths.be/notes/ambiguous-ampersands
			if (strict) {
				parseError(
					'named character reference was not terminated by a semicolon'
				);
			}
			return $0;
		}
		// If we’re still here, it’s a legacy reference for sure. No need for an
		// extra `if` check.
		// Decode named character references without trailing `;`, e.g. `&amp`
		// This is only a parse error if it gets converted to `&`, or if it is
		// followed by `=` in an attribute context.
		reference = $6;
		next = $7;
		if (next && options.isAttributeValue) {
			if (strict && next == '=') {
				parseError('`&` did not start a character reference');
			}
			return $0;
		}
		if (strict) {
			parseError(
				'named character reference was not terminated by a semicolon'
			);
		}
		// no need to check `has()` here
		return decodeMapLegacy[reference] + (next || '');
	});
};
// Expose default options (so they can be overridden globally)
decode.options = {
	'isAttributeValue': false,
	'strict': false
};
272 |
/**
 * Replaces every symbol matched by `regexEscape` with its entry in
 * `escapeMap`.
 *
 * @param {string} string The text to escape.
 * @returns {string} The escaped text.
 */
var escape = function(string) {
	// Every symbol `regexEscape` matches is a key of `escapeMap`, so no
	// `has()` check is needed.
	var toEscaped = function(symbol) {
		return escapeMap[symbol];
	};
	return string.replace(regexEscape, toEscaped);
};
278 |
279 | /*--------------------------------------------------------------------------*/
280 |
// The public API object. `unescape` is an alias for `decode`.
var he = {
	'version': '<%= version %>',
	'encode': encode,
	'decode': decode,
	'escape': escape,
	'unescape': decode
};

// Some AMD build optimizers, like r.js, check for specific condition patterns
// like the following:
if (
	typeof define == 'function' &&
	typeof define.amd == 'object' &&
	define.amd
) {
	define(function() {
		return he;
	});
} else if (freeExports && !freeExports.nodeType) {
	if (freeModule) {
		// in Node.js or RingoJS v0.8.0+
		freeModule.exports = he;
	} else {
		// in Narwhal or RingoJS v0.7.0-
		// No `module` object to replace, so copy each API member onto `exports`.
		for (var key in he) {
			if (has(he, key)) {
				freeExports[key] = he[key];
			}
		}
	}
} else {
	// in Rhino or a web browser
	root.he = he;
}
310 |
311 | }(this));
312 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # he [Build status](https://travis-ci.org/mathiasbynens/he) [Dependency status](https://gemnasium.com/mathiasbynens/he)
2 |
3 | _he_ (for “HTML entities”) is a robust HTML entity encoder/decoder written in JavaScript. It supports [all standardized named character references as per HTML](http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html), handles [ambiguous ampersands](http://mathiasbynens.be/notes/ambiguous-ampersands) and other edge cases [just like a browser would](http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tokenizing-character-references), has an extensive test suite, and — contrary to many other JavaScript solutions — _he_ handles astral Unicode symbols just fine. [An online demo is available.](http://mothereff.in/html-entities)
4 |
5 | ## Installation
6 |
7 | Via [npm](http://npmjs.org/):
8 |
9 | ```bash
10 | npm install he
11 | ```
12 |
13 | Via [Bower](http://bower.io/):
14 |
15 | ```bash
16 | bower install he
17 | ```
18 |
19 | Via [Component](https://github.com/component/component):
20 |
21 | ```bash
22 | component install mathiasbynens/he
23 | ```
24 |
25 | In a browser:
26 |
27 | ```html
28 | <script src="he.js"></script>
29 | ```
30 |
31 | In [Narwhal](http://narwhaljs.org/), [Node.js](http://nodejs.org/), and [RingoJS](http://ringojs.org/):
32 |
33 | ```js
34 | var he = require('he');
35 | ```
36 |
37 | In [Rhino](http://www.mozilla.org/rhino/):
38 |
39 | ```js
40 | load('he.js');
41 | ```
42 |
43 | Using an AMD loader like [RequireJS](http://requirejs.org/):
44 |
45 | ```js
46 | require(
47 | {
48 | 'paths': {
49 | 'he': 'path/to/he'
50 | }
51 | },
52 | ['he'],
53 | function(he) {
54 | console.log(he);
55 | }
56 | );
57 | ```
58 |
59 | ## API
60 |
61 | ### `he.version`
62 |
63 | A string representing the semantic version number.
64 |
65 | ### `he.encode(text, options)`
66 |
67 | This function takes a string of text and encodes (by default) any symbols that aren’t printable ASCII symbols, replacing them with character references. As long as the input string contains allowed code points only, the return value of this function is always valid HTML.
68 |
69 | ```js
70 | he.encode('foo © bar ≠ baz 𝌆 qux');
71 | // → 'foo &#xA9; bar &#x2260; baz &#x1D306; qux'
72 | ```
73 |
74 | The `options` object is optional. It recognizes the following properties:
75 |
76 | #### `useNamedReferences`
77 |
78 | The default value for the `useNamedReferences` option is `false`. This means that `encode()` will not use any named character references (e.g. `&copy;`) in the output — hexadecimal escapes (e.g. `&#xA9;`) will be used instead. Set it to `true` to enable the use of named references.
79 |
80 | **Note that if compatibility with older browsers is a concern, this option should remain disabled.**
81 |
82 | ```js
83 | // Using the global default setting (defaults to `false`):
84 | he.encode('foo © bar ≠ baz 𝌆 qux');
85 | // → 'foo &#xA9; bar &#x2260; baz &#x1D306; qux'
86 |
87 | // Passing an `options` object to `encode`, to explicitly disallow named references:
88 | he.encode('foo © bar ≠ baz 𝌆 qux', {
89 | 'useNamedReferences': false
90 | });
91 | // → 'foo &#xA9; bar &#x2260; baz &#x1D306; qux'
92 |
93 | // Passing an `options` object to `encode`, to explicitly allow named references:
94 | he.encode('foo © bar ≠ baz 𝌆 qux', {
95 | 'useNamedReferences': true
96 | });
97 | // → 'foo &copy; bar &ne; baz &#x1D306; qux'
98 | ```
99 |
100 | #### `encodeEverything`
101 |
102 | The default value for the `encodeEverything` option is `false`. This means that `encode()` will not use any character references for printable ASCII symbols that don’t need escaping. Set it to `true` to encode every symbol in the input string.
103 |
104 | ```js
105 | // Using the global default setting (defaults to `false`):
106 | he.encode('foo © bar ≠ baz 𝌆 qux');
107 | // → 'foo &#xA9; bar &#x2260; baz &#x1D306; qux'
108 |
109 | // Passing an `options` object to `encode`, to explicitly encode all symbols:
110 | he.encode('foo © bar ≠ baz 𝌆 qux', {
111 | 'encodeEverything': true
112 | });
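113 | // → '&#x66;&#x6F;&#x6F;&#x20;&#xA9;&#x20;&#x62;&#x61;&#x72;&#x20;&#x2260;&#x20;&#x62;&#x61;&#x7A;&#x20;&#x1D306;&#x20;&#x71;&#x75;&#x78;'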
114 |
115 | // This setting can be combined with the `useNamedReferences` option:
116 | he.encode('foo © bar ≠ baz 𝌆 qux', {
117 | 'encodeEverything': true,
118 | 'useNamedReferences': true
119 | });
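120 | // → '&#x66;&#x6F;&#x6F;&#x20;&copy;&#x20;&#x62;&#x61;&#x72;&#x20;&ne;&#x20;&#x62;&#x61;&#x7A;&#x20;&#x1D306;&#x20;&#x71;&#x75;&#x78;'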
121 | ```
122 |
123 | #### Overriding default `encode` options globally
124 |
125 | The global default setting can be overridden by modifying the `he.encode.options` object. This saves you from passing in an `options` object for every call to `encode` if you want to use the non-default setting.
126 |
127 | ```js
128 | // Read the global default setting:
129 | he.encode.options.useNamedReferences;
130 | // → `false` by default
131 |
132 | // Override the global default setting:
133 | he.encode.options.useNamedReferences = true;
134 |
135 | // Using the global default setting, which is now `true`:
136 | he.encode('foo © bar ≠ baz 𝌆 qux');
137 | // → 'foo &copy; bar &ne; baz &#x1D306; qux'
138 | ```
139 |
140 | ### `he.decode(html, options)`
141 |
142 | This function takes a string of HTML and decodes any named and numerical character references in it using [the algorithm described in section 12.2.4.69 of the HTML spec](http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tokenizing-character-references).
143 |
144 | ```js
145 | he.decode('foo &copy; bar &ne; baz &#x1D306; qux');
146 | // → 'foo © bar ≠ baz 𝌆 qux'
147 | ```
148 |
149 | The `options` object is optional. It recognizes the following properties:
150 |
151 | #### `isAttributeValue`
152 |
153 | The default value for the `isAttributeValue` option is `false`. This means that `decode()` will decode the string as if it were used in a text context in an HTML document. HTML has different rules for parsing character references in attribute values — set this option to `true` to treat the input string as if it were used as an attribute value.
154 |
155 | ```js
156 | // Using the global default setting (defaults to `false`, i.e. HTML text context):
157 | he.decode('foo&ampbar');
158 | // → 'foo&bar'
159 |
160 | // Passing an `options` object to `decode`, to explicitly assume an HTML text context:
161 | he.decode('foo&ampbar', {
162 | 'isAttributeValue': false
163 | });
164 | // → 'foo&bar'
165 |
166 | // Passing an `options` object to `decode`, to explicitly assume an HTML attribute value context:
167 | he.decode('foo&ampbar', {
168 | 'isAttributeValue': true
169 | });
170 | // → 'foo&ampbar'
171 | ```
172 |
173 | #### `strict`
174 |
175 | The default value for the `strict` option is `false`. This means that `decode()` will decode any HTML text content you feed it, even if it contains any entities that cause [parse errors](http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tokenizing-character-references). To throw an error when such invalid HTML is encountered, set the `strict` option to `true`. This option makes it possible to use _he_ as part of HTML parsers and HTML validators.
176 |
177 | ```js
178 | // Using the global default setting (defaults to `false`, i.e. error-tolerant mode):
179 | he.decode('foo&ampbar');
180 | // → 'foo&bar'
181 |
182 | // Passing an `options` object to `decode`, to explicitly enable error-tolerant mode:
183 | he.decode('foo&ampbar', {
184 | 'strict': false
185 | });
186 | // → 'foo&bar'
187 |
188 | // Passing an `options` object to `decode`, to explicitly enable strict mode:
189 | he.decode('foo&ampbar', {
190 | 'strict': true
191 | });
192 | // → Parse error
193 | ```
194 |
195 | #### Overriding default `decode` options globally
196 |
197 | The global default settings for the `decode` function can be overridden by modifying the `he.decode.options` object. This saves you from passing in an `options` object for every call to `decode` if you want to use a non-default setting.
198 |
199 | ```js
200 | // Read the global default setting:
201 | he.decode.options.isAttributeValue;
202 | // → `false` by default
203 |
204 | // Override the global default setting:
205 | he.decode.options.isAttributeValue = true;
206 |
207 | // Using the global default setting, which is now `true`:
208 | he.decode('foo&ampbar');
209 | // → 'foo&ampbar'
210 | ```
211 |
212 | ### `he.escape(text)`
213 |
214 | This function takes a string of text and escapes it for use in text contexts in XML or HTML documents. Only the following characters are escaped: `&`, `<`, `>`, `"`, and `'`.
215 |
216 | ```js
217 | he.escape('<img src=\'x\' onerror="prompt(1)">');
218 | // → '&lt;img src=&#x27;x&#x27; onerror=&quot;prompt(1)&quot;&gt;'
219 | ```
220 |
221 | ### `he.unescape(html, options)`
222 |
223 | `he.unescape` is an alias for `he.decode`. It takes a string of HTML and decodes any named and numerical character references in it.
224 |
225 | ### Using the `he` binary
226 |
227 | To use the `he` binary in your shell, simply install _he_ globally using npm:
228 |
229 | ```bash
230 | npm install -g he
231 | ```
232 |
233 | After that you will be able to encode/decode HTML entities from the command line:
234 |
235 | ```bash
236 | $ he --encode 'föo ♥ bår 𝌆 baz'
237 | f&#xF6;o &#x2665; b&#xE5;r &#x1D306; baz
238 |
239 | $ he --encode --use-named-refs 'föo ♥ bår 𝌆 baz'
240 | f&ouml;o &hearts; b&aring;r &#x1D306; baz
241 |
242 | $ he --decode 'f&ouml;o &hearts; b&aring;r &#x1D306; baz'
243 | föo ♥ bår 𝌆 baz
244 | ```
245 |
246 | Read a local text file, encode it for use in an HTML text context, and save the result to a new file:
247 |
248 | ```bash
249 | $ he --encode < foo.txt > foo-escaped.html
250 | ```
251 |
252 | Or do the same with an online text file:
253 |
254 | ```bash
255 | $ curl -sL "http://git.io/HnfEaw" | he --encode > escaped.html
256 | ```
257 |
258 | Or, the opposite — read a local file containing a snippet of HTML in a text context, decode it back to plain text, and save the result to a new file:
259 |
260 | ```bash
261 | $ he --decode < foo-escaped.html > foo.txt
262 | ```
263 |
264 | Or do the same with an online HTML snippet:
265 |
266 | ```bash
267 | $ curl -sL "http://git.io/HnfEaw" | he --decode > decoded.txt
268 | ```
269 |
270 | See `he --help` for the full list of options.
271 |
272 | ## Support
273 |
274 | he has been tested in at least Chrome 27-29, Firefox 3-22, Safari 4-6, Opera 10-12, IE 6-10, Node.js v0.10.0, Narwhal 0.3.2, RingoJS 0.8-0.9, PhantomJS 1.9.0, and Rhino 1.7RC4.
275 |
276 | ## Unit tests & code coverage
277 |
278 | After cloning this repository, run `npm install` to install the dependencies needed for he development and testing. You may want to install Istanbul _globally_ using `npm install istanbul -g`.
279 |
280 | Once that’s done, you can run the unit tests in Node using `npm test` or `node tests/tests.js`. To run the tests in Rhino, Ringo, Narwhal, and web browsers as well, use `grunt test`.
281 |
282 | To generate [the code coverage report](http://rawgithub.com/mathiasbynens/he/master/coverage/he/he.js.html), use `grunt cover`.
283 |
284 | ## Acknowledgements
285 |
286 | Thanks to [Simon Pieters](http://simon.html5.org/) ([@zcorpan](https://twitter.com/zcorpan)) for the many suggestions.
287 |
288 | ## Author
289 |
290 | | [@mathias](http://twitter.com/mathias "Follow @mathias on Twitter") |
291 | |---|
292 | | [Mathias Bynens](http://mathiasbynens.be/) |
293 |
294 | ## License
295 |
296 | _he_ is available under the [MIT](http://mths.be/mit) license.
297 |
--------------------------------------------------------------------------------
/coverage/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Code coverage report for All files
5 |
6 |
7 |
8 |
9 |
180 |
181 |
182 |
200 |
201 |
202 |
203 |
204 |
205 | | File |
206 | |
207 | Statements |
208 | |
209 | Branches |
210 | |
211 | Functions |
212 | |
213 | Lines |
214 | |
215 |
216 |
217 |
218 | | he/ |
219 | |
220 | 96.48% |
221 | (137 / 142) |
222 | 90.43% |
223 | (85 / 94) |
224 | 94.44% |
225 | (17 / 18) |
226 | 96.48% |
227 | (137 / 142) |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
237 |
238 |
239 |
240 |
241 |
332 |
333 |
334 |
--------------------------------------------------------------------------------
/coverage/he/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Code coverage report for he/
5 |
6 |
7 |
8 |
9 |
180 |
181 |
182 |
200 |
201 |
202 |
203 |
204 |
205 | | File |
206 | |
207 | Statements |
208 | |
209 | Branches |
210 | |
211 | Functions |
212 | |
213 | Lines |
214 | |
215 |
216 |
217 |
218 | | he.js |
219 | |
220 | 96.48% |
221 | (137 / 142) |
222 | 90.43% |
223 | (85 / 94) |
224 | 94.44% |
225 | (17 / 18) |
226 | 96.48% |
227 | (137 / 142) |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
237 |
238 |
239 |
240 |
241 |
332 |
333 |
334 |
--------------------------------------------------------------------------------
/coverage/prettify.js:
--------------------------------------------------------------------------------
1 | window.PR_SHOULD_USE_CONTINUATION=true;(function(){var h=["break,continue,do,else,for,if,return,while"];var u=[h,"auto,case,char,const,default,double,enum,extern,float,goto,int,long,register,short,signed,sizeof,static,struct,switch,typedef,union,unsigned,void,volatile"];var p=[u,"catch,class,delete,false,import,new,operator,private,protected,public,this,throw,true,try,typeof"];var l=[p,"alignof,align_union,asm,axiom,bool,concept,concept_map,const_cast,constexpr,decltype,dynamic_cast,explicit,export,friend,inline,late_check,mutable,namespace,nullptr,reinterpret_cast,static_assert,static_cast,template,typeid,typename,using,virtual,where"];var x=[p,"abstract,boolean,byte,extends,final,finally,implements,import,instanceof,null,native,package,strictfp,super,synchronized,throws,transient"];var R=[x,"as,base,by,checked,decimal,delegate,descending,dynamic,event,fixed,foreach,from,group,implicit,in,interface,internal,into,is,lock,object,out,override,orderby,params,partial,readonly,ref,sbyte,sealed,stackalloc,string,select,uint,ulong,unchecked,unsafe,ushort,var"];var r="all,and,by,catch,class,else,extends,false,finally,for,if,in,is,isnt,loop,new,no,not,null,of,off,on,or,return,super,then,true,try,unless,until,when,while,yes";var w=[p,"debugger,eval,export,function,get,null,set,undefined,var,with,Infinity,NaN"];var s="caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END";var I=[h,"and,as,assert,class,def,del,elif,except,exec,finally,from,global,import,in,is,lambda,nonlocal,not,or,pass,print,raise,try,with,yield,False,True,None"];var f=[h,"alias,and,begin,case,class,def,defined,elsif,end,ensure,false,in,module,next,nil,not,or,redo,rescue,retry,self,super,then,true,undef,unless,until,when,yield,BEGIN,END"];var H=[h,"case,done,elif,esac,eval,fi,function,in,local,set,then,until"];var A=[l,R,w,s+I,f,H];var 
e=/^(DIR|FILE|vector|(de|priority_)?queue|list|stack|(const_)?iterator|(multi)?(set|map)|bitset|u?(int|float)\d*)/;var C="str";var z="kwd";var j="com";var O="typ";var G="lit";var L="pun";var F="pln";var m="tag";var E="dec";var J="src";var P="atn";var n="atv";var N="nocode";var M="(?:^^\\.?|[+-]|\\!|\\!=|\\!==|\\#|\\%|\\%=|&|&&|&&=|&=|\\(|\\*|\\*=|\\+=|\\,|\\-=|\\->|\\/|\\/=|:|::|\\;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|\\?|\\@|\\[|\\^|\\^=|\\^\\^|\\^\\^=|\\{|\\||\\|=|\\|\\||\\|\\|=|\\~|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*";function k(Z){var ad=0;var S=false;var ac=false;for(var V=0,U=Z.length;V122)){if(!(al<65||ag>90)){af.push([Math.max(65,ag)|32,Math.min(al,90)|32])}if(!(al<97||ag>122)){af.push([Math.max(97,ag)&~32,Math.min(al,122)&~32])}}}}af.sort(function(av,au){return(av[0]-au[0])||(au[1]-av[1])});var ai=[];var ap=[NaN,NaN];for(var ar=0;arat[0]){if(at[1]+1>at[0]){an.push("-")}an.push(T(at[1]))}}an.push("]");return an.join("")}function W(al){var aj=al.source.match(new RegExp("(?:\\[(?:[^\\x5C\\x5D]|\\\\[\\s\\S])*\\]|\\\\u[A-Fa-f0-9]{4}|\\\\x[A-Fa-f0-9]{2}|\\\\[0-9]+|\\\\[^ux0-9]|\\(\\?[:!=]|[\\(\\)\\^]|[^\\x5B\\x5C\\(\\)\\^]+)","g"));var ah=aj.length;var an=[];for(var ak=0,am=0;ak=2&&ai==="["){aj[ak]=X(ag)}else{if(ai!=="\\"){aj[ak]=ag.replace(/[a-zA-Z]/g,function(ao){var ap=ao.charCodeAt(0);return"["+String.fromCharCode(ap&~32,ap|32)+"]"})}}}}return aj.join("")}var aa=[];for(var V=0,U=Z.length;V=0;){S[ac.charAt(ae)]=Y}}var af=Y[1];var aa=""+af;if(!ag.hasOwnProperty(aa)){ah.push(af);ag[aa]=null}}ah.push(/[\0-\uffff]/);V=k(ah)})();var X=T.length;var W=function(ah){var Z=ah.sourceCode,Y=ah.basePos;var ad=[Y,F];var af=0;var an=Z.match(V)||[];var aj={};for(var ae=0,aq=an.length;ae=5&&"lang-"===ap.substring(0,5);if(am&&!(ai&&typeof ai[1]==="string")){am=false;ap=J}if(!am){aj[ag]=ap}}var ab=af;af+=ag.length;if(!am){ad.push(Y+ab,ap)}else{var al=ai[1];var ak=ag.indexOf(al);var 
ac=ak+al.length;if(ai[2]){ac=ag.length-ai[2].length;ak=ac-al.length}var ar=ap.substring(5);B(Y+ab,ag.substring(0,ak),W,ad);B(Y+ab+ak,al,q(ar,al),ad);B(Y+ab+ac,ag.substring(ac),W,ad)}}ah.decorations=ad};return W}function i(T){var W=[],S=[];if(T.tripleQuotedStrings){W.push([C,/^(?:\'\'\'(?:[^\'\\]|\\[\s\S]|\'{1,2}(?=[^\']))*(?:\'\'\'|$)|\"\"\"(?:[^\"\\]|\\[\s\S]|\"{1,2}(?=[^\"]))*(?:\"\"\"|$)|\'(?:[^\\\']|\\[\s\S])*(?:\'|$)|\"(?:[^\\\"]|\\[\s\S])*(?:\"|$))/,null,"'\""])}else{if(T.multiLineStrings){W.push([C,/^(?:\'(?:[^\\\']|\\[\s\S])*(?:\'|$)|\"(?:[^\\\"]|\\[\s\S])*(?:\"|$)|\`(?:[^\\\`]|\\[\s\S])*(?:\`|$))/,null,"'\"`"])}else{W.push([C,/^(?:\'(?:[^\\\'\r\n]|\\.)*(?:\'|$)|\"(?:[^\\\"\r\n]|\\.)*(?:\"|$))/,null,"\"'"])}}if(T.verbatimStrings){S.push([C,/^@\"(?:[^\"]|\"\")*(?:\"|$)/,null])}var Y=T.hashComments;if(Y){if(T.cStyleComments){if(Y>1){W.push([j,/^#(?:##(?:[^#]|#(?!##))*(?:###|$)|.*)/,null,"#"])}else{W.push([j,/^#(?:(?:define|elif|else|endif|error|ifdef|include|ifndef|line|pragma|undef|warning)\b|[^\r\n]*)/,null,"#"])}S.push([C,/^<(?:(?:(?:\.\.\/)*|\/?)(?:[\w-]+(?:\/[\w-]+)+)?[\w-]+\.h|[a-z]\w*)>/,null])}else{W.push([j,/^#[^\r\n]*/,null,"#"])}}if(T.cStyleComments){S.push([j,/^\/\/[^\r\n]*/,null]);S.push([j,/^\/\*[\s\S]*?(?:\*\/|$)/,null])}if(T.regexLiterals){var X=("/(?=[^/*])(?:[^/\\x5B\\x5C]|\\x5C[\\s\\S]|\\x5B(?:[^\\x5C\\x5D]|\\x5C[\\s\\S])*(?:\\x5D|$))+/");S.push(["lang-regex",new RegExp("^"+M+"("+X+")")])}var V=T.types;if(V){S.push([O,V])}var U=(""+T.keywords).replace(/^ | $/g,"");if(U.length){S.push([z,new RegExp("^(?:"+U.replace(/[\s,]+/g,"|")+")\\b"),null])}W.push([F,/^\s+/,null," \r\n\t\xA0"]);S.push([G,/^@[a-z_$][a-z_$@0-9]*/i,null],[O,/^(?:[@_]?[A-Z]+[a-z][A-Za-z_$@0-9]*|\w+_t\b)/,null],[F,/^[a-z_$][a-z_$@0-9]*/i,null],[G,new RegExp("^(?:0x[a-f0-9]+|(?:\\d(?:_\\d+)*\\d*(?:\\.\\d*)?|\\.\\d\\+)(?:e[+\\-]?\\d+)?)[a-z]*","i"),null,"0123456789"],[F,/^\\[\s\S]?/,null],[L,/^.[^\s\w\.$@\'\"\`\/\#\\]*/,null]);return g(W,S)}var 
K=i({keywords:A,hashComments:true,cStyleComments:true,multiLineStrings:true,regexLiterals:true});function Q(V,ag){var U=/(?:^|\s)nocode(?:\s|$)/;var ab=/\r\n?|\n/;var ac=V.ownerDocument;var S;if(V.currentStyle){S=V.currentStyle.whiteSpace}else{if(window.getComputedStyle){S=ac.defaultView.getComputedStyle(V,null).getPropertyValue("white-space")}}var Z=S&&"pre"===S.substring(0,3);var af=ac.createElement("LI");while(V.firstChild){af.appendChild(V.firstChild)}var W=[af];function ae(al){switch(al.nodeType){case 1:if(U.test(al.className)){break}if("BR"===al.nodeName){ad(al);if(al.parentNode){al.parentNode.removeChild(al)}}else{for(var an=al.firstChild;an;an=an.nextSibling){ae(an)}}break;case 3:case 4:if(Z){var am=al.nodeValue;var aj=am.match(ab);if(aj){var ai=am.substring(0,aj.index);al.nodeValue=ai;var ah=am.substring(aj.index+aj[0].length);if(ah){var ak=al.parentNode;ak.insertBefore(ac.createTextNode(ah),al.nextSibling)}ad(al);if(!ai){al.parentNode.removeChild(al)}}}break}}function ad(ak){while(!ak.nextSibling){ak=ak.parentNode;if(!ak){return}}function ai(al,ar){var aq=ar?al.cloneNode(false):al;var ao=al.parentNode;if(ao){var ap=ai(ao,1);var an=al.nextSibling;ap.appendChild(aq);for(var am=an;am;am=an){an=am.nextSibling;ap.appendChild(am)}}return aq}var ah=ai(ak.nextSibling,0);for(var aj;(aj=ah.parentNode)&&aj.nodeType===1;){ah=aj}W.push(ah)}for(var Y=0;Y=S){ah+=2}if(V>=ap){Z+=2}}}var t={};function c(U,V){for(var S=V.length;--S>=0;){var T=V[S];if(!t.hasOwnProperty(T)){t[T]=U}else{if(window.console){console.warn("cannot override language handler %s",T)}}}}function q(T,S){if(!(T&&t.hasOwnProperty(T))){T=/^\s*]*(?:>|$)/],[j,/^<\!--[\s\S]*?(?:-\->|$)/],["lang-",/^<\?([\s\S]+?)(?:\?>|$)/],["lang-",/^<%([\s\S]+?)(?:%>|$)/],[L,/^(?:<[%?]|[%?]>)/],["lang-",/^]*>([\s\S]+?)<\/xmp\b[^>]*>/i],["lang-js",/^