├── .github └── dependabot.yml ├── .gitignore ├── .npmignore ├── .travis.yml ├── Changelog.md ├── LICENSE ├── README.md ├── encodings ├── dbcs-codec.js ├── dbcs-data.js ├── index.js ├── internal.js ├── sbcs-codec.js ├── sbcs-data-generated.js ├── sbcs-data.js ├── tables │ ├── big5-added.json │ ├── cp936.json │ ├── cp949.json │ ├── cp950.json │ ├── eucjp.json │ ├── gb18030-ranges.json │ ├── gbk-added.json │ └── shiftjis.json ├── utf16.js ├── utf32.js └── utf7.js ├── generation ├── gen-dbcs.js ├── gen-sbcs.js ├── research │ ├── complex-encodings-iconv.md │ ├── gen-normalization.js │ ├── get-iconv-encodings.js │ ├── normalization.md │ └── notes.md └── utils.js ├── lib ├── bom-handling.js ├── index.d.ts ├── index.js └── streams.js ├── package.json └── test ├── big5-test.js ├── bom-test.js ├── cesu8-test.js ├── cyrillic-test.js ├── dbcs-test.js ├── gbk-test.js ├── gbkFile.txt ├── greek-test.js ├── main-test.js ├── mocha.opts ├── performance.js ├── sbcs-test.js ├── shiftjis-test.js ├── streams-test.js ├── turkish-test.js ├── utf16-test.js ├── utf32-test.js ├── utf7-test.js └── webpack ├── basic-test.js ├── index.js ├── karma.conf.js ├── package.json └── stream-test.js /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Please see the documentation for all configuration options: 2 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 3 | 4 | version: 2 5 | updates: 6 | - package-ecosystem: "npm" 7 | directory: "/" 8 | schedule: 9 | interval: "daily" 10 | allow: 11 | - dependency-type: production 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Node.js stuff 2 | node_modules 3 | package-lock.json 4 | 5 | # Editors 6 | *~ 7 | *sublime-* 8 | /.idea 9 | 10 | # Development environment 11 | /coverage 12 | /benchmarks/node_envs 13 | 
/generation/source-data 14 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *sublime-* 3 | generation 4 | test 5 | wiki 6 | coverage 7 | .github 8 | .idea 9 | .travis.yml 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "0.10" 4 | - "0.11" 5 | - "0.12" 6 | - "iojs" 7 | - "4" 8 | - "6" 9 | - "8" 10 | - "10" 11 | - "12" 12 | - "node" 13 | 14 | jobs: 15 | include: 16 | - name: webpack 17 | node_js: "12" 18 | install: cd test/webpack; npm install 19 | script: npm test -------------------------------------------------------------------------------- /Changelog.md: -------------------------------------------------------------------------------- 1 | ## 0.6.3 / 2021-05-23 2 | * Fix HKSCS encoding to prefer Big5 codes if both Big5 and HKSCS codes are possible (#264) 3 | 4 | 5 | ## 0.6.2 / 2020-07-08 6 | * Support Uint8Array-s decoding without conversion to Buffers, plus fix an edge case. 7 | 8 | 9 | ## 0.6.1 / 2020-06-28 10 | * Support Uint8Array-s directly when decoding (#246, by @gyzerok) 11 | * Unify package.json version ranges to be strictly semver-compatible (#241) 12 | * Fix minor issue in UTF-32 decoder's endianness detection code. 13 | 14 | 15 | ## 0.6.0 / 2020-06-08 16 | * Updated 'gb18030' encoding to :2005 edition (see https://github.com/whatwg/encoding/issues/22). 17 | * Removed `iconv.extendNodeEncodings()` mechanism. It was deprecated 5 years ago and didn't work 18 | in recent Node versions. 19 | * Reworked Streaming API behavior in browser environments to fix #204. Streaming API will be 20 | excluded by default in browser packs, saving ~100Kb bundle size, unless enabled explicitly using 21 | `iconv.enableStreamingAPI(require('stream'))`. 
22 | * Updates to development environment & tests: 23 | * Added ./test/webpack private package to test complex new use cases that need custom environment. 24 | It's tested as a separate job in Travis CI. 25 | * Updated generation code for the new EUC-KR index file format from Encoding Standard. 26 | * Removed Buffer() constructor in tests (#197 by @gabrielschulhof). 27 | 28 | 29 | ## 0.5.2 / 2020-06-08 30 | * Added `iconv.getEncoder()` and `iconv.getDecoder()` methods to typescript definitions (#229). 31 | * Fixed semver version to 6.1.2 to support Node 8.x (by @tanandara). 32 | * Capped iconv version to 2.x as 3.x has dropped support for older Node versions. 33 | * Switched from istanbul to c8 for code coverage. 34 | 35 | 36 | ## 0.5.1 / 2020-01-18 37 | 38 | * Added cp720 encoding (#221, by @kr-deps) 39 | * (minor) Changed Changelog.md formatting to use h2. 40 | 41 | 42 | ## 0.5.0 / 2019-06-26 43 | 44 | * Added UTF-32 encoding, both little-endian and big-endian variants (UTF-32LE, UTF-32BE). If endianness 45 | is not provided for decoding, it's deduced automatically from the stream using a heuristic similar to 46 | what we use in UTF-16. (great work in #216 by @kshetline) 47 | * Several minor updates to README (#217 by @oldj, plus some more) 48 | * Added Node versions 10 and 12 to Travis test harness. 
49 | 50 | 51 | ## 0.4.24 / 2018-08-22 52 | 53 | * Added MIK encoding (#196, by @Ivan-Kalatchev) 54 | 55 | 56 | ## 0.4.23 / 2018-05-07 57 | 58 | * Fix deprecation warning in Node v10 due to the last usage of `new Buffer` (#185, by @felixbuenemann) 59 | * Switched from NodeBuffer to Buffer in typings (#155 by @felixfbecker, #186 by @larssn) 60 | 61 | 62 | ## 0.4.22 / 2018-05-05 63 | 64 | * Use older semver style for dependencies to be compatible with Node version 0.10 (#182, by @dougwilson) 65 | * Fix tests to accommodate fixes in Node v10 (#182, by @dougwilson) 66 | 67 | 68 | ## 0.4.21 / 2018-04-06 69 | 70 | * Fix encoding canonicalization (#156) 71 | * Fix the paths in the "browser" field in package.json (#174 by @LMLB) 72 | * Removed "contributors" section in package.json - see Git history instead. 73 | 74 | 75 | ## 0.4.20 / 2018-04-06 76 | 77 | * Updated `new Buffer()` usages with recommended replacements as it's being deprecated in Node v10 (#176, #178 by @ChALkeR) 78 | 79 | 80 | ## 0.4.19 / 2017-09-09 81 | 82 | * Fixed iso8859-1 codec regression in handling untranslatable characters (#162, caused by #147) 83 | * Re-generated windows1255 codec, because it was updated in iconv project 84 | * Fixed grammar in error message when iconv-lite is loaded with encoding other than utf8 85 | 86 | 87 | ## 0.4.18 / 2017-06-13 88 | 89 | * Fixed CESU-8 regression in Node v8. 
90 | 91 | 92 | ## 0.4.17 / 2017-04-22 93 | 94 | * Updated typescript definition file to support Angular 2 AoT mode (#153 by @larssn) 95 | 96 | 97 | ## 0.4.16 / 2017-04-22 98 | 99 | * Added support for React Native (#150) 100 | * Changed iso8859-1 encoding to use internal 'binary' encoding, as it's the same thing (#147 by @mscdex) 101 | * Fixed typo in Readme (#138 by @jiangzhuo) 102 | * Fixed build for Node v6.10+ by making correct version comparison 103 | * Added a warning if iconv-lite is loaded not as utf-8 (see #142) 104 | 105 | 106 | ## 0.4.15 / 2016-11-21 107 | 108 | * Fixed typescript type definition (#137) 109 | 110 | 111 | ## 0.4.14 / 2016-11-20 112 | 113 | * Preparation for v1.0 114 | * Added Node v6 and latest Node versions to Travis CI test rig 115 | * Deprecated Node v0.8 support 116 | * Typescript typings (@larssn) 117 | * Fix encoding of Euro character in GB 18030 (inspired by @lygstate) 118 | * Add ms prefix to dbcs windows encodings (@rokoroku) 119 | 120 | 121 | ## 0.4.13 / 2015-10-01 122 | 123 | * Fix silly mistake in deprecation notice. 124 | 125 | 126 | ## 0.4.12 / 2015-09-26 127 | 128 | * Node v4 support: 129 | * Added CESU-8 decoding (#106) 130 | * Added deprecation notice for `extendNodeEncodings` 131 | * Added Travis tests for Node v4 and io.js latest (#105 by @Mithgol) 132 | 133 | 134 | ## 0.4.11 / 2015-07-03 135 | 136 | * Added CESU-8 encoding. 137 | 138 | 139 | ## 0.4.10 / 2015-05-26 140 | 141 | * Changed UTF-16 endianness heuristic to take into account any ASCII chars, not 142 | just spaces. This should minimize the importance of "default" endianness. 143 | 144 | 145 | ## 0.4.9 / 2015-05-24 146 | 147 | * Streamlined BOM handling: strip BOM by default, add BOM when encoding if 148 | addBOM: true. Added docs to Readme. 149 | * UTF16 now uses UTF16-LE by default. 150 | * Fixed minor issue with big5 encoding. 151 | * Added io.js testing on Travis; updated node-iconv version to test against. 
152 | Now we just skip testing SBCS encodings that node-iconv doesn't support. 153 | * (internal refactoring) Updated codec interface to use classes. 154 | * Use strict mode in all files. 155 | 156 | 157 | ## 0.4.8 / 2015-04-14 158 | 159 | * added alias UNICODE-1-1-UTF-7 for UTF-7 encoding (#94) 160 | 161 | 162 | ## 0.4.7 / 2015-02-05 163 | 164 | * stop official support of Node.js v0.8. Should still work, but no guarantees. 165 | reason: Packages needed for testing are hard to get on Travis CI. 166 | * work in environment where Object.prototype is monkey patched with enumerable 167 | props (#89). 168 | 169 | 170 | ## 0.4.6 / 2015-01-12 171 | 172 | * fix rare aliases of single-byte encodings (thanks @mscdex) 173 | * double the timeout for dbcs tests to make them less flaky on travis 174 | 175 | 176 | ## 0.4.5 / 2014-11-20 177 | 178 | * fix windows-31j and x-sjis encoding support (@nleush) 179 | * minor fix: undefined variable reference when internal error happens 180 | 181 | 182 | ## 0.4.4 / 2014-07-16 183 | 184 | * added encodings UTF-7 (RFC2152) and UTF-7-IMAP (RFC3501 Section 5.1.3) 185 | * fixed streaming base64 encoding 186 | 187 | 188 | ## 0.4.3 / 2014-06-14 189 | 190 | * added encodings UTF-16BE and UTF-16 with BOM 191 | 192 | 193 | ## 0.4.2 / 2014-06-12 194 | 195 | * don't throw exception if `extendNodeEncodings()` is called more than once 196 | 197 | 198 | ## 0.4.1 / 2014-06-11 199 | 200 | * codepage 808 added 201 | 202 | 203 | ## 0.4.0 / 2014-06-10 204 | 205 | * code is rewritten from scratch 206 | * all widespread encodings are supported 207 | * streaming interface added 208 | * browserify compatibility added 209 | * (optional) extend core primitive encodings to make usage even simpler 210 | * moved from vows to mocha as the testing framework 211 | 212 | 213 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 
Alexander Shtuchkin 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## iconv-lite: Pure JS character encoding conversion 2 | 3 | * No need for native code compilation. Quick to install, works on Windows, Web, and in sandboxed environments. 4 | * Used in popular projects like [Express.js (body_parser)](https://github.com/expressjs/body-parser), 5 | [Grunt](http://gruntjs.com/), [Nodemailer](http://www.nodemailer.com/), [Yeoman](http://yeoman.io/) and others. 6 | * Faster than [node-iconv](https://github.com/bnoordhuis/node-iconv) (see below for performance comparison). 7 | * Intuitive encode/decode API, including Streaming support. 
8 | * In-browser usage via [browserify](https://github.com/substack/node-browserify) or [webpack](https://webpack.js.org/) (~180kb gzip compressed with Buffer shim included). 9 | * Typescript [type definition file](https://github.com/ashtuchkin/iconv-lite/blob/master/lib/index.d.ts) included. 10 | * React Native is supported (need to install `stream` module to enable Streaming API). 11 | * License: MIT. 12 | 13 | [![NPM Stats](https://nodei.co/npm/iconv-lite.png)](https://npmjs.org/package/iconv-lite/) 14 | [![Build Status](https://travis-ci.org/ashtuchkin/iconv-lite.svg?branch=master)](https://travis-ci.org/ashtuchkin/iconv-lite) 15 | [![npm](https://img.shields.io/npm/v/iconv-lite.svg)](https://npmjs.org/package/iconv-lite/) 16 | [![npm downloads](https://img.shields.io/npm/dm/iconv-lite.svg)](https://npmjs.org/package/iconv-lite/) 17 | [![npm bundle size](https://img.shields.io/bundlephobia/min/iconv-lite.svg)](https://npmjs.org/package/iconv-lite/) 18 | 19 | ## Usage 20 | ### Basic API 21 | ```javascript 22 | var iconv = require('iconv-lite'); 23 | 24 | // Convert from an encoded buffer to a js string. 25 | str = iconv.decode(Buffer.from([0x68, 0x65, 0x6c, 0x6c, 0x6f]), 'win1251'); 26 | 27 | // Convert from a js string to an encoded buffer. 28 | buf = iconv.encode("Sample input string", 'win1251'); 29 | 30 | // Check if encoding is supported 31 | iconv.encodingExists("us-ascii") 32 | ``` 33 | 34 | ### Streaming API 35 | ```javascript 36 | 37 | // Decode stream (from binary data stream to js strings) 38 | http.createServer(function(req, res) { 39 | var converterStream = iconv.decodeStream('win1251'); 40 | req.pipe(converterStream); 41 | 42 | converterStream.on('data', function(str) { 43 | console.log(str); // Do something with decoded strings, chunk-by-chunk. 
44 | }); 45 | }); 46 | 47 | // Convert encoding streaming example 48 | fs.createReadStream('file-in-win1251.txt') 49 | .pipe(iconv.decodeStream('win1251')) 50 | .pipe(iconv.encodeStream('ucs2')) 51 | .pipe(fs.createWriteStream('file-in-ucs2.txt')); 52 | 53 | // Sugar: all encode/decode streams have .collect(cb) method to accumulate data. 54 | http.createServer(function(req, res) { 55 | req.pipe(iconv.decodeStream('win1251')).collect(function(err, body) { 56 | assert(typeof body == 'string'); 57 | console.log(body); // full request body string 58 | }); 59 | }); 60 | ``` 61 | 62 | ## Supported encodings 63 | 64 | * All node.js native encodings: utf8, ucs2 / utf16-le, ascii, binary, base64, hex. 65 | * Additional unicode encodings: utf16, utf16-be, utf-7, utf-7-imap, utf32, utf32-le, and utf32-be. 66 | * All widespread singlebyte encodings: Windows 125x family, ISO-8859 family, 67 | IBM/DOS codepages, Macintosh family, KOI8 family, all others supported by iconv library. 68 | Aliases like 'latin1', 'us-ascii' also supported. 69 | * All widespread multibyte encodings: CP932, CP936, CP949, CP950, GB2312, GBK, GB18030, Big5, Shift_JIS, EUC-JP. 70 | 71 | See [all supported encodings on wiki](https://github.com/ashtuchkin/iconv-lite/wiki/Supported-Encodings). 72 | 73 | Most singlebyte encodings are generated automatically from [node-iconv](https://github.com/bnoordhuis/node-iconv). Thank you Ben Noordhuis and libiconv authors! 74 | 75 | Multibyte encodings are generated from [Unicode.org mappings](http://www.unicode.org/Public/MAPPINGS/) and [WHATWG Encoding Standard mappings](http://encoding.spec.whatwg.org/). Thank you, respective authors! 76 | 77 | 78 | ## Encoding/decoding speed 79 | 80 | Comparison with node-iconv module (1000x256kb, on MacBook Pro, Core i5/2.6 GHz, Node v0.12.0). 81 | Note: your results may vary, so please always check on your hardware. 
82 | 83 | operation iconv@2.1.4 iconv-lite@0.4.7 84 | ---------------------------------------------------------- 85 | encode('win1251') ~96 Mb/s ~320 Mb/s 86 | decode('win1251') ~95 Mb/s ~246 Mb/s 87 | 88 | ## BOM handling 89 | 90 | * Decoding: BOM is stripped by default, unless overridden by passing `stripBOM: false` in options 91 | (f.ex. `iconv.decode(buf, enc, {stripBOM: false})`). 92 | A callback might also be given as a `stripBOM` parameter - it'll be called if BOM character was actually found. 93 | * If you want to detect UTF-8 BOM when decoding other encodings, use [node-autodetect-decoder-stream](https://github.com/danielgindi/node-autodetect-decoder-stream) module. 94 | * Encoding: No BOM added, unless overridden by `addBOM: true` option. 95 | 96 | ## UTF-16 Encodings 97 | 98 | This library supports UTF-16LE, UTF-16BE and UTF-16 encodings. First two are straightforward, but UTF-16 is trying to be 99 | smart about endianness in the following ways: 100 | * Decoding: uses BOM and 'spaces heuristic' to determine input endianness. Default is UTF-16LE, but can be 101 | overridden with `defaultEncoding: 'utf-16be'` option. Strips BOM unless `stripBOM: false`. 102 | * Encoding: uses UTF-16LE and writes BOM by default. Use `addBOM: false` to override. 103 | 104 | ## UTF-32 Encodings 105 | 106 | This library supports UTF-32LE, UTF-32BE and UTF-32 encodings. Like the UTF-16 encoding above, UTF-32 defaults to UTF-32LE, but uses BOM and 'spaces heuristics' to determine input endianness. 107 | * The default of UTF-32LE can be overridden with the `defaultEncoding: 'utf-32be'` option. Strips BOM unless `stripBOM: false`. 108 | * Encoding: uses UTF-32LE and writes BOM by default. Use `addBOM: false` to override. (`defaultEncoding: 'utf-32be'` can also be used here to change encoding.) 
// Description of supported double byte encodings and aliases.
// Tables are not require()-d until they are needed to speed up library load.
// require()-s are direct to support Browserify.
//
// Shape of the exported map, as visible in this file:
//  * a string value is an alias: it names another key of this same map;
//  * an object value is an encoding definition with `type: '_dbcs'` and a `table` function
//    that require()-s the mapping JSON only when called. Some definitions additionally carry
//    `encodeAdd`, `encodeSkipVals` and (for gb18030) a lazy `gb18030` ranges loader.
// NOTE(review): those options are presumably interpreted by the '_dbcs' codec in
// ./dbcs-codec.js, which is not shown here - confirm their exact semantics there.

module.exports = {
    
    // == Japanese/ShiftJIS ====================================================
    // All japanese encodings are based on JIS X set of standards:
    // JIS X 0201 - Single-byte encoding of ASCII + ¥ + Kana chars at 0xA1-0xDF.
    // JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes. 
    //              Has several variations in 1978, 1983, 1990 and 1997.
    // JIS X 0212 - Supplementary plane of 6067 chars in 94x94 plane. 1990. Effectively dead.
    // JIS X 0213 - Extension and modern replacement of 0208 and 0212. Total chars: 11233.
    //              2 planes, first is superset of 0208, second - revised 0212.
    //              Introduced in 2000, revised 2004. Some characters are in Unicode Plane 2 (0x2xxxx)

    // Byte encodings are:
    //  * Shift_JIS: Compatible with 0201, uses not defined chars in top half as lead bytes for double-byte
    //               encoding of 0208. Lead byte ranges: 0x81-0x9F, 0xE0-0xEF; Trail byte ranges: 0x40-0x7E, 0x80-0x9E, 0x9F-0xFC.
    //               Windows CP932 is a superset of Shift_JIS. Some companies added more chars, notably KDDI.
    //  * EUC-JP:    Up to 3 bytes per character. Used mostly on *nixes.
    //               0x00-0x7F       - lower part of 0201
    //               0x8E, 0xA1-0xDF - upper part of 0201
    //               (0xA1-0xFE)x2   - 0208 plane (94x94).
    //               0x8F, (0xA1-0xFE)x2 - 0212 plane (94x94).
    //  * JIS X 208: 7-bit, direct encoding of 0208. Byte ranges: 0x21-0x7E (94 values). Uncommon.
    //               Used as-is in ISO2022 family.
    //  * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII, 
    //                0201-1976 Roman, 0208-1978, 0208-1983.
    //  * ISO2022-JP-1: Adds esc seq for 0212-1990.
    //  * ISO2022-JP-2: Adds esc seq for GB2312-1980, KSX1001-1992, ISO8859-1, ISO8859-7.
    //  * ISO2022-JP-3: Adds esc seq for 0201-1976 Kana set, 0213-2000 Planes 1, 2.
    //  * ISO2022-JP-2004: Adds 0213-2004 Plane 1.
    //
    // After JIS X 0213 appeared, Shift_JIS-2004, EUC-JISX0213 and ISO2022-JP-2004 followed, with just changing the planes.
    //
    // Overall, it seems that it's a mess :( http://www8.plala.or.jp/tkubota1/unicode-symbols-map2.html

    'shiftjis': {
        type: '_dbcs',
        table: function() { return require('./tables/shiftjis.json') },
        encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E},
        encodeSkipVals: [{from: 0xED40, to: 0xF940}],
    },
    'csshiftjis': 'shiftjis',
    'mskanji': 'shiftjis',
    'sjis': 'shiftjis',
    'windows31j': 'shiftjis',
    'ms31j': 'shiftjis',
    'xsjis': 'shiftjis',
    'windows932': 'shiftjis',
    'ms932': 'shiftjis',
    '932': 'shiftjis',
    'cp932': 'shiftjis',

    'eucjp': {
        type: '_dbcs',
        table: function() { return require('./tables/eucjp.json') },
        encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E},
    },

    // TODO: KDDI extension to Shift_JIS
    // TODO: IBM CCSID 942 = CP932, but F0-F9 custom chars and other char changes.
    // TODO: IBM CCSID 943 = Shift_JIS = CP932 with original Shift_JIS lower 128 chars.


    // == Chinese/GBK ==========================================================
    // http://en.wikipedia.org/wiki/GBK
    // We mostly implement W3C recommendation: https://www.w3.org/TR/encoding/#gbk-encoder

    // Oldest GB2312 (1981, ~7600 chars) is a subset of CP936
    'gb2312': 'cp936',
    'gb231280': 'cp936',
    'gb23121980': 'cp936',
    'csgb2312': 'cp936',
    'csiso58gb231280': 'cp936',
    'euccn': 'cp936',

    // Microsoft's CP936 is a subset and approximation of GBK.
    'windows936': 'cp936',
    'ms936': 'cp936',
    '936': 'cp936',
    'cp936': {
        type: '_dbcs',
        table: function() { return require('./tables/cp936.json') },
    },

    // GBK (~22000 chars) is an extension of CP936 that added user-mapped chars and some other.
    // Its table is the cp936 table with the gbk-added table concatenated after it.
    'gbk': {
        type: '_dbcs',
        table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
    },
    'xgbk': 'gbk',
    'isoir58': 'gbk',

    // GB18030 is an algorithmic extension of GBK.
    // Main source: https://www.w3.org/TR/encoding/#gbk-encoder
    // http://icu-project.org/docs/papers/gb18030.html
    // http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
    // http://www.khngai.com/chinese/charmap/tblgbk.php?page=0
    // Uses the same concatenated table as 'gbk', plus a lazily loaded ranges file.
    'gb18030': {
        type: '_dbcs',
        table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
        gb18030: function() { return require('./tables/gb18030-ranges.json') },
        encodeSkipVals: [0x80],
        encodeAdd: {'€': 0xA2E3},
    },

    'chinese': 'gb18030',


    // == Korean ===============================================================
    // EUC-KR, KS_C_5601 and KS X 1001 are exactly the same.
    'windows949': 'cp949',
    'ms949': 'cp949',
    '949': 'cp949',
    'cp949': {
        type: '_dbcs',
        table: function() { return require('./tables/cp949.json') },
    },

    'cseuckr': 'cp949',
    'csksc56011987': 'cp949',
    'euckr': 'cp949',
    'isoir149': 'cp949',
    'korean': 'cp949',
    'ksc56011987': 'cp949',
    'ksc56011989': 'cp949',
    'ksc5601': 'cp949',


    // == Big5/Taiwan/Hong Kong ================================================
    // There are lots of tables for Big5 and cp950. Please see the following links for history:
    // http://moztw.org/docs/big5/  http://www.haible.de/bruno/charsets/conversion-tables/Big5.html
    // Variations, in roughly number of defined chars:
    //  * Windows CP 950: Microsoft variant of Big5. Canonical: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT
    //  * Windows CP 951: Microsoft variant of Big5-HKSCS-2001. Seems to be never public. http://me.abelcheung.org/articles/research/what-is-cp951/
    //  * Big5-2003 (Taiwan standard) almost superset of cp950.
    //  * Unicode-at-on (UAO) / Mozilla 1.8. Falling out of use on the Web. Not supported by other browsers.
    //  * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard. 
    //    many unicode code points moved from PUA to Supplementary plane (U+2XXXX) over the years.
    //    Plus, it has 4 combining sequences.
    //    Seems that Mozilla refused to support it for 10 yrs. https://bugzilla.mozilla.org/show_bug.cgi?id=162431 https://bugzilla.mozilla.org/show_bug.cgi?id=310299
    //    because big5-hkscs is the only encoding to include astral characters in non-algorithmic way.
    //    Implementations are not consistent within browsers; sometimes labeled as just big5.
    //    MS Internet Explorer switches from big5 to big5-hkscs when a patch applied.
    //    Great discussion & recap of what's going on https://bugzilla.mozilla.org/show_bug.cgi?id=912470#c31
    //    In the encoder, it might make sense to support encoding old PUA mappings to Big5 bytes seq-s.
    //    Official spec: http://www.ogcio.gov.hk/en/business/tech_promotion/ccli/terms/doc/2003cmp_2008.txt
    //                   http://www.ogcio.gov.hk/tc/business/tech_promotion/ccli/terms/doc/hkscs-2008-big5-iso.txt
    // 
    // Current understanding of how to deal with Big5(-HKSCS) is in the Encoding Standard, http://encoding.spec.whatwg.org/#big5-encoder
    // Unicode mapping (http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT) is said to be wrong.

    'windows950': 'cp950',
    'ms950': 'cp950',
    '950': 'cp950',
    'cp950': {
        type: '_dbcs',
        table: function() { return require('./tables/cp950.json') },
    },

    // Big5 has many variations and is an extension of cp950. We use Encoding Standard's as a consensus.
    // Note that plain 'big5' is an alias of 'big5hkscs' here.
    'big5': 'big5hkscs',
    'big5hkscs': {
        type: '_dbcs',
        table: function() { return require('./tables/cp950.json').concat(require('./tables/big5-added.json')) },
        encodeSkipVals: [
            // Although Encoding Standard says we should avoid encoding to HKSCS area (See Step 1 of
            // https://encoding.spec.whatwg.org/#index-big5-pointer), we still do it to increase compatibility with ICU.
            // But if a single unicode point can be encoded both as HKSCS and regular Big5, we prefer the latter.
            0x8e69, 0x8e6f, 0x8e7e, 0x8eab, 0x8eb4, 0x8ecd, 0x8ed0, 0x8f57, 0x8f69, 0x8f6e, 0x8fcb, 0x8ffe,
            0x906d, 0x907a, 0x90c4, 0x90dc, 0x90f1, 0x91bf, 0x92af, 0x92b0, 0x92b1, 0x92b2, 0x92d1, 0x9447, 0x94ca,
            0x95d9, 0x96fc, 0x9975, 0x9b76, 0x9b78, 0x9b7b, 0x9bc6, 0x9bde, 0x9bec, 0x9bf6, 0x9c42, 0x9c53, 0x9c62,
            0x9c68, 0x9c6b, 0x9c77, 0x9cbc, 0x9cbd, 0x9cd0, 0x9d57, 0x9d5a, 0x9dc4, 0x9def, 0x9dfb, 0x9ea9, 0x9eef,
            0x9efd, 0x9f60, 0x9fcb, 0xa077, 0xa0dc, 0xa0df, 0x8fcc, 0x92c8, 0x9644, 0x96ed,

            // Step 2 of https://encoding.spec.whatwg.org/#index-big5-pointer: Use last pointer for U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345
            0xa2a4, 0xa2a5, 0xa2a7, 0xa2a6, 0xa2cc, 0xa2ce,
        ],
    },

    'cnbig5': 'big5hkscs',
    'csbig5': 'big5hkscs',
    'xxbig5': 'big5hkscs',
};
// Ordered list of the sibling files that contribute encodings, aliases and codecs.
// Each one is required explicitly (no directory scan) so bundlers can resolve them statically.
var modules = [
    require("./internal"),
    require("./utf32"),
    require("./utf16"),
    require("./utf7"),
    require("./sbcs-codec"),
    require("./sbcs-data"),
    require("./sbcs-data-generated"),
    require("./dbcs-codec"),
    require("./dbcs-data"),
];

// Put all encoding/alias/codec definitions to single object and export it.
// Keys are copied in list order, so on a name clash the module listed later wins.
for (var i = 0; i < modules.length; i++) {
    // Deliberately not called `module`: a `var module` here would overwrite this file's
    // CommonJS `module` binding, leaving `module.exports` pointing at the wrong object
    // for any code added below the loop.
    var encodingModule = modules[i];
    for (var enc in encodingModule)
        // Own-property guard keeps enumerable Object.prototype additions out of the export.
        if (Object.prototype.hasOwnProperty.call(encodingModule, enc))
            exports[enc] = encodingModule[enc];
}
/**
 * Decoder for the encodings Node.js handles natively; all work is delegated to a
 * StringDecoder created for `codec.enc`.
 *
 * @param {Object} options - accepted for signature parity with other decoders; not read here.
 * @param {Object} codec - codec instance; only its `enc` property is used.
 */
function InternalDecoder(options, codec) {
    this.decoder = new StringDecoder(codec.enc);
}

// Decodes one chunk. Input that is not already a Buffer is passed through Buffer.from() first.
InternalDecoder.prototype.write = function(buf) {
    var chunk = Buffer.isBuffer(buf) ? buf : Buffer.from(buf);
    return this.decoder.write(chunk);
};

// Finishes decoding and returns whatever the underlying StringDecoder#end() returns.
InternalDecoder.prototype.end = function() {
    return this.decoder.end();
};


//------------------------------------------------------------------------------
// The generic encoder keeps no state between chunks: each string is converted on its own.

/**
 * @param {Object} options - not read here.
 * @param {Object} codec - codec instance; its `enc` is the encoding name handed to Buffer.from().
 */
function InternalEncoder(options, codec) {
    this.enc = codec.enc;
}

// Converts one string chunk to a Buffer in the configured encoding.
InternalEncoder.prototype.write = function(str) {
    var encodingName = this.enc;
    return Buffer.from(str, encodingName);
};

// Nothing is buffered, so there is nothing to flush.
InternalEncoder.prototype.end = function() {
    return undefined;
};
// Streaming "encoder" for base64: turns base64 text into the bytes it represents.
// Base64 text is only decodable in groups of 4 characters, so the characters of
// an incomplete trailing group are carried over to the next write().
function InternalEncoderBase64(options, codec) {
    this.prevStr = '';
}

// Decodes all complete 4-character groups of (carry-over + str) and keeps the
// remaining 0-3 characters for the next call.
InternalEncoderBase64.prototype.write = function(str) {
    str = this.prevStr + str;
    var completeQuads = str.length - (str.length % 4);
    this.prevStr = str.slice(completeQuads);
    str = str.slice(0, completeQuads);

    return Buffer.from(str, "base64");
};

// Decodes whatever is left in the carry-over (an empty Buffer if nothing is left).
InternalEncoderBase64.prototype.end = function() {
    return Buffer.from(this.prevStr, "base64");
};


//------------------------------------------------------------------------------
// CESU-8 encoder is also special.

function InternalEncoderCesu8(options, codec) {
}

// Encodes every UTF-16 code unit of str independently as a 1-3 byte sequence.
// Surrogates are therefore written as two separate 3-byte sequences.
InternalEncoderCesu8.prototype.write = function(str) {
    // Worst case is 3 bytes per code unit; the result is trimmed to the bytes written.
    var buf = Buffer.alloc(str.length * 3), bufIdx = 0;
    for (var i = 0; i < str.length; i++) {
        var charCode = str.charCodeAt(i);
        // Naive implementation, but it works because CESU-8 is especially easy
        // to convert from UTF-16 (which all JS strings are encoded in).
        if (charCode < 0x80)
            buf[bufIdx++] = charCode;
        else if (charCode < 0x800) {
            buf[bufIdx++] = 0xC0 + (charCode >>> 6);
            buf[bufIdx++] = 0x80 + (charCode & 0x3f);
        }
        else { // charCode will always be < 0x10000 in javascript.
            buf[bufIdx++] = 0xE0 + (charCode >>> 12);
            buf[bufIdx++] = 0x80 + ((charCode >>> 6) & 0x3f);
            buf[bufIdx++] = 0x80 + (charCode & 0x3f);
        }
    }
    return buf.slice(0, bufIdx);
};

// No state is kept between chunks, so there is nothing to flush.
InternalEncoderCesu8.prototype.end = function() {
};

//------------------------------------------------------------------------------
// CESU-8 decoder is not implemented in Node v4.0+

// Streaming CESU-8 decoder. State carried between write() calls:
//   acc       - bits accumulated so far for the sequence being read
//   contBytes - continuation bytes still expected for that sequence
//   accBytes  - bytes consumed so far for that sequence
// codec.defaultCharUnicode is emitted in place of every invalid sequence.
function InternalDecoderCesu8(options, codec) {
    this.acc = 0;
    this.contBytes = 0;
    this.accBytes = 0;
    this.defaultCharUnicode = codec.defaultCharUnicode;
}

// Decodes one chunk of bytes and returns the characters completed by it.
// A sequence split across chunks is finished by a later write().
InternalDecoderCesu8.prototype.write = function(buf) {
    var acc = this.acc, contBytes = this.contBytes, accBytes = this.accBytes,
        res = '';
    for (var i = 0; i < buf.length; i++) {
        var curByte = buf[i];
        if ((curByte & 0xC0) !== 0x80) { // Leading byte
            if (contBytes > 0) { // Previous code is invalid
                res += this.defaultCharUnicode;
                contBytes = 0;
            }

            if (curByte < 0x80) { // Single-byte code
                res += String.fromCharCode(curByte);
            } else if (curByte < 0xE0) { // Two-byte code
                acc = curByte & 0x1F;
                contBytes = 1; accBytes = 1;
            } else if (curByte < 0xF0) { // Three-byte code
                acc = curByte & 0x0F;
                contBytes = 2; accBytes = 1;
            } else { // Four or more are not supported for CESU-8.
                res += this.defaultCharUnicode;
            }
        } else { // Continuation byte
            if (contBytes > 0) { // We're waiting for it.
                acc = (acc << 6) | (curByte & 0x3f);
                contBytes--; accBytes++;
                if (contBytes === 0) {
                    // Check for overlong encoding, but support Modified UTF-8 (encoding NULL as C0 80)
                    if (accBytes === 2 && acc < 0x80 && acc > 0)
                        res += this.defaultCharUnicode;
                    else if (accBytes === 3 && acc < 0x800)
                        res += this.defaultCharUnicode;
                    else
                        // Actually add character.
                        res += String.fromCharCode(acc);
                }
            } else { // Unexpected continuation byte
                res += this.defaultCharUnicode;
            }
        }
    }
    this.acc = acc; this.contBytes = contBytes; this.accBytes = accBytes;
    return res;
};

// Flushes the decoder: returns the replacement character if the input stopped in
// the middle of a multi-byte sequence, otherwise an empty string.
InternalDecoderCesu8.prototype.end = function() {
    // Must start as a string: starting from the number 0 made a truncated
    // sequence come out as "0" followed by the replacement character.
    var res = '';
    if (this.contBytes > 0)
        res += this.defaultCharUnicode;
    return res;
};

//------------------------------------------------------------------------------
// check the chunk boundaries for surrogate pair

// Streaming UTF-8 encoder. A high surrogate at the very end of a chunk is held
// back so that it can be encoded together with the low surrogate that is
// expected at the start of the next chunk.
function InternalEncoderUtf8(options, codec) {
    this.highSurrogate = '';
}

// Encodes (held-back surrogate + str), holding back a trailing high surrogate.
InternalEncoderUtf8.prototype.write = function (str) {
    if (this.highSurrogate) {
        str = this.highSurrogate + str;
        this.highSurrogate = '';
    }

    if (str.length > 0) {
        var charCode = str.charCodeAt(str.length - 1);
        if (0xd800 <= charCode && charCode < 0xdc00) {
            this.highSurrogate = str[str.length - 1];
            str = str.slice(0, str.length - 1);
        }
    }

    // The encoding is spelled out: this object never gets an `enc` property, so
    // the previous `this.enc` argument was always undefined.
    return Buffer.from(str, "utf8");
};

// Encodes a high surrogate that never received its partner; returns undefined
// when nothing is held back.
InternalEncoderUtf8.prototype.end = function () {
    if (this.highSurrogate) {
        var str = this.highSurrogate;
        this.highSurrogate = '';
        return Buffer.from(str, "utf8");
    }
};

// --------------------------------------------------------------------------------
// /encodings/sbcs-codec.js:
// --------------------------------------------------------------------------------
// (Start of the next file in this listing; its first lines are kept verbatim
// below, the rest of the file follows on the next lines.)
// 1 | "use strict";
// 2 | var Buffer = require("safer-buffer").Buffer;
// 3 |
// 4 | // Single-byte codec. Needs a 'chars' string parameter that contains 256 or 128 chars that
// 5 | // correspond to encoded bytes (if 128 - then lower half is ASCII).
6 | 7 | exports._sbcs = SBCSCodec; 8 | function SBCSCodec(codecOptions, iconv) { 9 | if (!codecOptions) 10 | throw new Error("SBCS codec is called without the data.") 11 | 12 | // Prepare char buffer for decoding. 13 | if (!codecOptions.chars || (codecOptions.chars.length !== 128 && codecOptions.chars.length !== 256)) 14 | throw new Error("Encoding '"+codecOptions.type+"' has incorrect 'chars' (must be of len 128 or 256)"); 15 | 16 | if (codecOptions.chars.length === 128) { 17 | var asciiString = ""; 18 | for (var i = 0; i < 128; i++) 19 | asciiString += String.fromCharCode(i); 20 | codecOptions.chars = asciiString + codecOptions.chars; 21 | } 22 | 23 | this.decodeBuf = Buffer.from(codecOptions.chars, 'ucs2'); 24 | 25 | // Encoding buffer. 26 | var encodeBuf = Buffer.alloc(65536, iconv.defaultCharSingleByte.charCodeAt(0)); 27 | 28 | for (var i = 0; i < codecOptions.chars.length; i++) 29 | encodeBuf[codecOptions.chars.charCodeAt(i)] = i; 30 | 31 | this.encodeBuf = encodeBuf; 32 | } 33 | 34 | SBCSCodec.prototype.encoder = SBCSEncoder; 35 | SBCSCodec.prototype.decoder = SBCSDecoder; 36 | 37 | 38 | function SBCSEncoder(options, codec) { 39 | this.encodeBuf = codec.encodeBuf; 40 | } 41 | 42 | SBCSEncoder.prototype.write = function(str) { 43 | var buf = Buffer.alloc(str.length); 44 | for (var i = 0; i < str.length; i++) 45 | buf[i] = this.encodeBuf[str.charCodeAt(i)]; 46 | 47 | return buf; 48 | } 49 | 50 | SBCSEncoder.prototype.end = function() { 51 | } 52 | 53 | 54 | function SBCSDecoder(options, codec) { 55 | this.decodeBuf = codec.decodeBuf; 56 | } 57 | 58 | SBCSDecoder.prototype.write = function(buf) { 59 | // Strings are immutable in JS -> we use ucs2 buffer to speed up computations. 
60 | var decodeBuf = this.decodeBuf; 61 | var newBuf = Buffer.alloc(buf.length*2); 62 | var idx1 = 0, idx2 = 0; 63 | for (var i = 0; i < buf.length; i++) { 64 | idx1 = buf[i]*2; idx2 = i*2; 65 | newBuf[idx2] = decodeBuf[idx1]; 66 | newBuf[idx2+1] = decodeBuf[idx1+1]; 67 | } 68 | return newBuf.toString('ucs2'); 69 | } 70 | 71 | SBCSDecoder.prototype.end = function() { 72 | } 73 | -------------------------------------------------------------------------------- /encodings/sbcs-data.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | 3 | // Manually added data to be used by sbcs codec in addition to generated one. 4 | 5 | module.exports = { 6 | // Not supported by iconv, not sure why. 7 | "10029": "maccenteuro", 8 | "maccenteuro": { 9 | "type": "_sbcs", 10 | "chars": "ÄĀāÉĄÖÜáąČäčĆć鏟ĎíďĒēĖóėôöõúĚěü†°Ę£§•¶ß®©™ę¨≠ģĮįĪ≤≥īĶ∂∑łĻļĽľĹĺŅņѬ√ńŇ∆«»… ňŐÕőŌ–—“”‘’÷◊ōŔŕŘ‹›řŖŗŠ‚„šŚśÁŤťÍŽžŪÓÔūŮÚůŰűŲųÝýķŻŁżĢˇ" 11 | }, 12 | 13 | "808": "cp808", 14 | "ibm808": "cp808", 15 | "cp808": { 16 | "type": "_sbcs", 17 | "chars": "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмноп░▒▓│┤╡╢╖╕╣║╗╝╜╛┐└┴┬├─┼╞╟╚╔╩╦╠═╬╧╨╤╥╙╘╒╓╫╪┘┌█▄▌▐▀рстуфхцчшщъыьэюяЁёЄєЇїЎў°∙·√№€■ " 18 | }, 19 | 20 | "mik": { 21 | "type": "_sbcs", 22 | "chars": "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя└┴┬├─┼╣║╚╔╩╦╠═╬┐░▒▓│┤№§╗╝┘┌█▄▌▐▀αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²■ " 23 | }, 24 | 25 | "cp720": { 26 | "type": "_sbcs", 27 | "chars": "\x80\x81éâ\x84à\x86çêëèïî\x8d\x8e\x8f\x90\u0651\u0652ô¤ـûùءآأؤ£إئابةتثجحخدذرزسشص«»░▒▓│┤╡╢╖╕╣║╗╝╜╛┐└┴┬├─┼╞╟╚╔╩╦╠═╬╧╨╤╥╙╘╒╓╫╪┘┌█▄▌▐▀ضطظعغفµقكلمنهوىي≡\u064b\u064c\u064d\u064e\u064f\u0650≈°∙·√ⁿ²■\u00a0" 28 | }, 29 | 30 | // Aliases of generated encodings. 
31 | "ascii8bit": "ascii", 32 | "usascii": "ascii", 33 | "ansix34": "ascii", 34 | "ansix341968": "ascii", 35 | "ansix341986": "ascii", 36 | "csascii": "ascii", 37 | "cp367": "ascii", 38 | "ibm367": "ascii", 39 | "isoir6": "ascii", 40 | "iso646us": "ascii", 41 | "iso646irv": "ascii", 42 | "us": "ascii", 43 | 44 | "latin1": "iso88591", 45 | "latin2": "iso88592", 46 | "latin3": "iso88593", 47 | "latin4": "iso88594", 48 | "latin5": "iso88599", 49 | "latin6": "iso885910", 50 | "latin7": "iso885913", 51 | "latin8": "iso885914", 52 | "latin9": "iso885915", 53 | "latin10": "iso885916", 54 | 55 | "csisolatin1": "iso88591", 56 | "csisolatin2": "iso88592", 57 | "csisolatin3": "iso88593", 58 | "csisolatin4": "iso88594", 59 | "csisolatincyrillic": "iso88595", 60 | "csisolatinarabic": "iso88596", 61 | "csisolatingreek" : "iso88597", 62 | "csisolatinhebrew": "iso88598", 63 | "csisolatin5": "iso88599", 64 | "csisolatin6": "iso885910", 65 | 66 | "l1": "iso88591", 67 | "l2": "iso88592", 68 | "l3": "iso88593", 69 | "l4": "iso88594", 70 | "l5": "iso88599", 71 | "l6": "iso885910", 72 | "l7": "iso885913", 73 | "l8": "iso885914", 74 | "l9": "iso885915", 75 | "l10": "iso885916", 76 | 77 | "isoir14": "iso646jp", 78 | "isoir57": "iso646cn", 79 | "isoir100": "iso88591", 80 | "isoir101": "iso88592", 81 | "isoir109": "iso88593", 82 | "isoir110": "iso88594", 83 | "isoir144": "iso88595", 84 | "isoir127": "iso88596", 85 | "isoir126": "iso88597", 86 | "isoir138": "iso88598", 87 | "isoir148": "iso88599", 88 | "isoir157": "iso885910", 89 | "isoir166": "tis620", 90 | "isoir179": "iso885913", 91 | "isoir199": "iso885914", 92 | "isoir203": "iso885915", 93 | "isoir226": "iso885916", 94 | 95 | "cp819": "iso88591", 96 | "ibm819": "iso88591", 97 | 98 | "cyrillic": "iso88595", 99 | 100 | "arabic": "iso88596", 101 | "arabic8": "iso88596", 102 | "ecma114": "iso88596", 103 | "asmo708": "iso88596", 104 | 105 | "greek" : "iso88597", 106 | "greek8" : "iso88597", 107 | "ecma118" : "iso88597", 108 | "elot928" : 
"iso88597", 109 | 110 | "hebrew": "iso88598", 111 | "hebrew8": "iso88598", 112 | 113 | "turkish": "iso88599", 114 | "turkish8": "iso88599", 115 | 116 | "thai": "iso885911", 117 | "thai8": "iso885911", 118 | 119 | "celtic": "iso885914", 120 | "celtic8": "iso885914", 121 | "isoceltic": "iso885914", 122 | 123 | "tis6200": "tis620", 124 | "tis62025291": "tis620", 125 | "tis62025330": "tis620", 126 | 127 | "10000": "macroman", 128 | "10006": "macgreek", 129 | "10007": "maccyrillic", 130 | "10079": "maciceland", 131 | "10081": "macturkish", 132 | 133 | "cspc8codepage437": "cp437", 134 | "cspc775baltic": "cp775", 135 | "cspc850multilingual": "cp850", 136 | "cspcp852": "cp852", 137 | "cspc862latinhebrew": "cp862", 138 | "cpgr": "cp869", 139 | 140 | "msee": "cp1250", 141 | "mscyrl": "cp1251", 142 | "msansi": "cp1252", 143 | "msgreek": "cp1253", 144 | "msturk": "cp1254", 145 | "mshebr": "cp1255", 146 | "msarab": "cp1256", 147 | "winbaltrim": "cp1257", 148 | 149 | "cp20866": "koi8r", 150 | "20866": "koi8r", 151 | "ibm878": "koi8r", 152 | "cskoi8r": "koi8r", 153 | 154 | "cp21866": "koi8u", 155 | "21866": "koi8u", 156 | "ibm1168": "koi8u", 157 | 158 | "strk10482002": "rk1048", 159 | 160 | "tcvn5712": "tcvn", 161 | "tcvn57121": "tcvn", 162 | 163 | "gb198880": "iso646cn", 164 | "cn": "iso646cn", 165 | 166 | "csiso14jisc6220ro": "iso646jp", 167 | "jisc62201969ro": "iso646jp", 168 | "jp": "iso646jp", 169 | 170 | "cshproman8": "hproman8", 171 | "r8": "hproman8", 172 | "roman8": "hproman8", 173 | "xroman8": "hproman8", 174 | "ibm1051": "hproman8", 175 | 176 | "mac": "macintosh", 177 | "csmacintosh": "macintosh", 178 | }; 179 | 180 | -------------------------------------------------------------------------------- /encodings/tables/big5-added.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["8740","䏰䰲䘃䖦䕸𧉧䵷䖳𧲱䳢𧳅㮕䜶䝄䱇䱀𤊿𣘗𧍒𦺋𧃒䱗𪍑䝏䗚䲅𧱬䴇䪤䚡𦬣爥𥩔𡩣𣸆𣽡晍囻"], 3 | ["8767","綕夝𨮹㷴霴𧯯寛𡵞媤㘥𩺰嫑宷峼杮薓𩥅瑡璝㡵𡵓𣚞𦀡㻬"], 4 | 
["87a1","𥣞㫵竼龗𤅡𨤍𣇪𠪊𣉞䌊蒄龖鐯䤰蘓墖靊鈘秐稲晠権袝瑌篅枂稬剏遆㓦珄𥶹瓆鿇垳䤯呌䄱𣚎堘穲𧭥讏䚮𦺈䆁𥶙箮𢒼鿈𢓁𢓉𢓌鿉蔄𣖻䂴鿊䓡𪷿拁灮鿋"], 5 | ["8840","㇀",4,"𠄌㇅𠃑𠃍㇆㇇𠃋𡿨㇈𠃊㇉㇊㇋㇌𠄎㇍㇎ĀÁǍÀĒÉĚÈŌÓǑÒ࿿Ê̄Ế࿿Ê̌ỀÊāáǎàɑēéěèīíǐìōóǒòūúǔùǖǘǚ"], 6 | ["88a1","ǜü࿿ê̄ế࿿ê̌ềêɡ⏚⏛"], 7 | ["8940","𪎩𡅅"], 8 | ["8943","攊"], 9 | ["8946","丽滝鵎釟"], 10 | ["894c","𧜵撑会伨侨兖兴农凤务动医华发变团声处备夲头学实実岚庆总斉柾栄桥济炼电纤纬纺织经统缆缷艺苏药视设询车轧轮"], 11 | ["89a1","琑糼緍楆竉刧"], 12 | ["89ab","醌碸酞肼"], 13 | ["89b0","贋胶𠧧"], 14 | ["89b5","肟黇䳍鷉鸌䰾𩷶𧀎鸊𪄳㗁"], 15 | ["89c1","溚舾甙"], 16 | ["89c5","䤑马骏龙禇𨑬𡷊𠗐𢫦两亁亀亇亿仫伷㑌侽㹈倃傈㑽㒓㒥円夅凛凼刅争剹劐匧㗇厩㕑厰㕓参吣㕭㕲㚁咓咣咴咹哐哯唘唣唨㖘唿㖥㖿嗗㗅"], 17 | ["8a40","𧶄唥"], 18 | ["8a43","𠱂𠴕𥄫喐𢳆㧬𠍁蹆𤶸𩓥䁓𨂾睺𢰸㨴䟕𨅝𦧲𤷪擝𠵼𠾴𠳕𡃴撍蹾𠺖𠰋𠽤𢲩𨉖𤓓"], 19 | ["8a64","𠵆𩩍𨃩䟴𤺧𢳂骲㩧𩗴㿭㔆𥋇𩟔𧣈𢵄鵮頕"], 20 | ["8a76","䏙𦂥撴哣𢵌𢯊𡁷㧻𡁯"], 21 | ["8aa1","𦛚𦜖𧦠擪𥁒𠱃蹨𢆡𨭌𠜱"], 22 | ["8aac","䠋𠆩㿺塳𢶍"], 23 | ["8ab2","𤗈𠓼𦂗𠽌𠶖啹䂻䎺"], 24 | ["8abb","䪴𢩦𡂝膪飵𠶜捹㧾𢝵跀嚡摼㹃"], 25 | ["8ac9","𪘁𠸉𢫏𢳉"], 26 | ["8ace","𡃈𣧂㦒㨆𨊛㕸𥹉𢃇噒𠼱𢲲𩜠㒼氽𤸻"], 27 | ["8adf","𧕴𢺋𢈈𪙛𨳍𠹺𠰴𦠜羓𡃏𢠃𢤹㗻𥇣𠺌𠾍𠺪㾓𠼰𠵇𡅏𠹌"], 28 | ["8af6","𠺫𠮩𠵈𡃀𡄽㿹𢚖搲𠾭"], 29 | ["8b40","𣏴𧘹𢯎𠵾𠵿𢱑𢱕㨘𠺘𡃇𠼮𪘲𦭐𨳒𨶙𨳊閪哌苄喹"], 30 | ["8b55","𩻃鰦骶𧝞𢷮煀腭胬尜𦕲脴㞗卟𨂽醶𠻺𠸏𠹷𠻻㗝𤷫㘉𠳖嚯𢞵𡃉𠸐𠹸𡁸𡅈𨈇𡑕𠹹𤹐𢶤婔𡀝𡀞𡃵𡃶垜𠸑"], 31 | ["8ba1","𧚔𨋍𠾵𠹻𥅾㜃𠾶𡆀𥋘𪊽𤧚𡠺𤅷𨉼墙剨㘚𥜽箲孨䠀䬬鼧䧧鰟鮍𥭴𣄽嗻㗲嚉丨夂𡯁屮靑𠂆乛亻㔾尣彑忄㣺扌攵歺氵氺灬爫丬犭𤣩罒礻糹罓𦉪㓁"], 32 | ["8bde","𦍋耂肀𦘒𦥑卝衤见𧢲讠贝钅镸长门𨸏韦页风飞饣𩠐鱼鸟黄歯龜丷𠂇阝户钢"], 33 | ["8c40","倻淾𩱳龦㷉袏𤅎灷峵䬠𥇍㕙𥴰愢𨨲辧釶熑朙玺𣊁𪄇㲋𡦀䬐磤琂冮𨜏䀉橣𪊺䈣蘏𠩯稪𩥇𨫪靕灍匤𢁾鏴盙𨧣龧矝亣俰傼丯众龨吴綋墒壐𡶶庒庙忂𢜒斋"], 34 | ["8ca1","𣏹椙橃𣱣泿"], 35 | ["8ca7","爀𤔅玌㻛𤨓嬕璹讃𥲤𥚕窓篬糃繬苸薗龩袐龪躹龫迏蕟駠鈡龬𨶹𡐿䁱䊢娚"], 36 | ["8cc9","顨杫䉶圽"], 37 | ["8cce","藖𤥻芿𧄍䲁𦵴嵻𦬕𦾾龭龮宖龯曧繛湗秊㶈䓃𣉖𢞖䎚䔶"], 38 | ["8ce6","峕𣬚諹屸㴒𣕑嵸龲煗䕘𤃬𡸣䱷㥸㑊𠆤𦱁諌侴𠈹妿腬顖𩣺弻"], 39 | ["8d40","𠮟"], 40 | ["8d42","𢇁𨥭䄂䚻𩁹㼇龳𪆵䃸㟖䛷𦱆䅼𨚲𧏿䕭㣔𥒚䕡䔛䶉䱻䵶䗪㿈𤬏㙡䓞䒽䇭崾嵈嵖㷼㠏嶤嶹㠠㠸幂庽弥徃㤈㤔㤿㥍惗愽峥㦉憷憹懏㦸戬抐拥挘㧸嚱"], 41 | ["8da1","㨃揢揻搇摚㩋擀崕嘡龟㪗斆㪽旿晓㫲暒㬢朖㭂枤栀㭘桊梄㭲㭱㭻椉楃牜楤榟榅㮼槖㯝橥橴橱檂㯬檙㯲檫檵櫔櫶殁毁毪汵沪㳋洂洆洦涁㳯涤涱渕渘温溆𨧀溻滢滚齿滨滩漤漴㵆𣽁澁澾㵪㵵熷岙㶊瀬㶑灐灔灯灿炉𠌥䏁㗱𠻘"], 42 | ["8e40","𣻗垾𦻓焾𥟠㙎榢𨯩孴穉𥣡𩓙穥穽𥦬窻窰竂竃燑𦒍䇊竚竝竪䇯咲𥰁笋筕笩𥌎𥳾箢筯莜𥮴𦱿篐萡箒箸𥴠㶭𥱥蒒篺簆簵𥳁籄粃𤢂粦晽𤕸糉糇糦籴糳糵糎"], 43 | ["8ea1","繧䔝𦹄絝𦻖璍綉綫焵綳緒𤁗𦀩緤㴓緵𡟹緥𨍭縝𦄡𦅚繮纒䌫鑬縧罀罁罇礶𦋐駡羗𦍑羣𡙡𠁨䕜𣝦䔃𨌺翺𦒉者耈耝耨耯𪂇𦳃耻耼聡𢜔䦉𦘦𣷣𦛨朥肧𨩈脇脚墰𢛶汿𦒘𤾸擧𡒊舘𡡞橓𤩥𤪕䑺舩𠬍𦩒𣵾俹𡓽蓢荢𦬊𤦧𣔰𡝳𣷸芪椛芳䇛"], 44 | ["8f40","蕋苐茚𠸖𡞴㛁𣅽𣕚艻苢茘𣺋𦶣𦬅𦮗𣗎㶿茝嗬莅䔋𦶥莬菁菓㑾𦻔橗蕚㒖𦹂𢻯葘𥯤葱㷓䓤檧葊𣲵祘蒨𦮖𦹷𦹃蓞萏莑䒠蒓蓤𥲑䉀𥳀䕃蔴嫲𦺙䔧蕳䔖枿蘖"], 45 | ["8fa1","𨘥𨘻藁𧂈蘂𡖂𧃍䕫䕪蘨㙈𡢢号𧎚虾蝱𪃸蟮𢰧螱蟚蠏噡虬桖䘏衅衆𧗠𣶹𧗤衞袜䙛袴袵揁装睷𧜏覇覊覦覩覧覼𨨥觧𧤤𧪽誜瞓釾誐𧩙竩𧬺𣾏䜓𧬸煼謌謟𥐰𥕥謿譌譍誩𤩺讐讛誯𡛟䘕衏貛𧵔𧶏貫㜥𧵓賖𧶘𧶽贒贃𡤐賛灜贑𤳉㻐起"], 46 | 
["9040","趩𨀂𡀔𤦊㭼𨆼𧄌竧躭躶軃鋔輙輭𨍥𨐒辥錃𪊟𠩐辳䤪𨧞𨔽𣶻廸𣉢迹𪀔𨚼𨔁𢌥㦀𦻗逷𨔼𧪾遡𨕬𨘋邨𨜓郄𨛦邮都酧㫰醩釄粬𨤳𡺉鈎沟鉁鉢𥖹銹𨫆𣲛𨬌𥗛"], 47 | ["90a1","𠴱錬鍫𨫡𨯫炏嫃𨫢𨫥䥥鉄𨯬𨰹𨯿鍳鑛躼閅閦鐦閠濶䊹𢙺𨛘𡉼𣸮䧟氜陻隖䅬隣𦻕懚隶磵𨫠隽双䦡𦲸𠉴𦐐𩂯𩃥𤫑𡤕𣌊霱虂霶䨏䔽䖅𤫩灵孁霛靜𩇕靗孊𩇫靟鐥僐𣂷𣂼鞉鞟鞱鞾韀韒韠𥑬韮琜𩐳響韵𩐝𧥺䫑頴頳顋顦㬎𧅵㵑𠘰𤅜"], 48 | ["9140","𥜆飊颷飈飇䫿𦴧𡛓喰飡飦飬鍸餹𤨩䭲𩡗𩤅駵騌騻騐驘𥜥㛄𩂱𩯕髠髢𩬅髴䰎鬔鬭𨘀倴鬴𦦨㣃𣁽魐魀𩴾婅𡡣鮎𤉋鰂鯿鰌𩹨鷔𩾷𪆒𪆫𪃡𪄣𪇟鵾鶃𪄴鸎梈"], 49 | ["91a1","鷄𢅛𪆓𪈠𡤻𪈳鴹𪂹𪊴麐麕麞麢䴴麪麯𤍤黁㭠㧥㴝伲㞾𨰫鼂鼈䮖鐤𦶢鼗鼖鼹嚟嚊齅馸𩂋韲葿齢齩竜龎爖䮾𤥵𤦻煷𤧸𤍈𤩑玞𨯚𡣺禟𨥾𨸶鍩鏳𨩄鋬鎁鏋𨥬𤒹爗㻫睲穃烐𤑳𤏸煾𡟯炣𡢾𣖙㻇𡢅𥐯𡟸㜢𡛻𡠹㛡𡝴𡣑𥽋㜣𡛀坛𤨥𡏾𡊨"], 50 | ["9240","𡏆𡒶蔃𣚦蔃葕𤦔𧅥𣸱𥕜𣻻𧁒䓴𣛮𩦝𦼦柹㜳㰕㷧塬𡤢栐䁗𣜿𤃡𤂋𤄏𦰡哋嚞𦚱嚒𠿟𠮨𠸍鏆𨬓鎜仸儫㠙𤐶亼𠑥𠍿佋侊𥙑婨𠆫𠏋㦙𠌊𠐔㐵伩𠋀𨺳𠉵諚𠈌亘"], 51 | ["92a1","働儍侢伃𤨎𣺊佂倮偬傁俌俥偘僼兙兛兝兞湶𣖕𣸹𣺿浲𡢄𣺉冨凃𠗠䓝𠒣𠒒𠒑赺𨪜𠜎剙劤𠡳勡鍮䙺熌𤎌𠰠𤦬𡃤槑𠸝瑹㻞璙琔瑖玘䮎𤪼𤂍叐㖄爏𤃉喴𠍅响𠯆圝鉝雴鍦埝垍坿㘾壋媙𨩆𡛺𡝯𡜐娬妸銏婾嫏娒𥥆𡧳𡡡𤊕㛵洅瑃娡𥺃"], 52 | ["9340","媁𨯗𠐓鏠璌𡌃焅䥲鐈𨧻鎽㞠尞岞幞幈𡦖𡥼𣫮廍孏𡤃𡤄㜁𡢠㛝𡛾㛓脪𨩇𡶺𣑲𨦨弌弎𡤧𡞫婫𡜻孄蘔𧗽衠恾𢡠𢘫忛㺸𢖯𢖾𩂈𦽳懀𠀾𠁆𢘛憙憘恵𢲛𢴇𤛔𩅍"], 53 | ["93a1","摱𤙥𢭪㨩𢬢𣑐𩣪𢹸挷𪑛撶挱揑𤧣𢵧护𢲡搻敫楲㯴𣂎𣊭𤦉𣊫唍𣋠𡣙𩐿曎𣊉𣆳㫠䆐𥖄𨬢𥖏𡛼𥕛𥐥磮𣄃𡠪𣈴㑤𣈏𣆂𤋉暎𦴤晫䮓昰𧡰𡷫晣𣋒𣋡昞𥡲㣑𣠺𣞼㮙𣞢𣏾瓐㮖枏𤘪梶栞㯄檾㡣𣟕𤒇樳橒櫉欅𡤒攑梘橌㯗橺歗𣿀𣲚鎠鋲𨯪𨫋"], 54 | ["9440","銉𨀞𨧜鑧涥漋𤧬浧𣽿㶏渄𤀼娽渊塇洤硂焻𤌚𤉶烱牐犇犔𤞏𤜥兹𤪤𠗫瑺𣻸𣙟𤩊𤤗𥿡㼆㺱𤫟𨰣𣼵悧㻳瓌琼鎇琷䒟𦷪䕑疃㽣𤳙𤴆㽘畕癳𪗆㬙瑨𨫌𤦫𤦎㫻"], 55 | ["94a1","㷍𤩎㻿𤧅𤣳釺圲鍂𨫣𡡤僟𥈡𥇧睸𣈲眎眏睻𤚗𣞁㩞𤣰琸璛㺿𤪺𤫇䃈𤪖𦆮錇𥖁砞碍碈磒珐祙𧝁𥛣䄎禛蒖禥樭𣻺稺秴䅮𡛦䄲鈵秱𠵌𤦌𠊙𣶺𡝮㖗啫㕰㚪𠇔𠰍竢婙𢛵𥪯𥪜娍𠉛磰娪𥯆竾䇹籝籭䈑𥮳𥺼𥺦糍𤧹𡞰粎籼粮檲緜縇緓罎𦉡"], 56 | ["9540","𦅜𧭈綗𥺂䉪𦭵𠤖柖𠁎𣗏埄𦐒𦏸𤥢翝笧𠠬𥫩𥵃笌𥸎駦虅驣樜𣐿㧢𤧷𦖭騟𦖠蒀𧄧𦳑䓪脷䐂胆脉腂𦞴飃𦩂艢艥𦩑葓𦶧蘐𧈛媆䅿𡡀嬫𡢡嫤𡣘蚠蜨𣶏蠭𧐢娂"], 57 | ["95a1","衮佅袇袿裦襥襍𥚃襔𧞅𧞄𨯵𨯙𨮜𨧹㺭蒣䛵䛏㟲訽訜𩑈彍鈫𤊄旔焩烄𡡅鵭貟賩𧷜妚矃姰䍮㛔踪躧𤰉輰轊䋴汘澻𢌡䢛潹溋𡟚鯩㚵𤤯邻邗啱䤆醻鐄𨩋䁢𨫼鐧𨰝𨰻蓥訫閙閧閗閖𨴴瑅㻂𤣿𤩂𤏪㻧𣈥随𨻧𨹦𨹥㻌𤧭𤩸𣿮琒瑫㻼靁𩂰"], 58 | ["9640","桇䨝𩂓𥟟靝鍨𨦉𨰦𨬯𦎾銺嬑譩䤼珹𤈛鞛靱餸𠼦巁𨯅𤪲頟𩓚鋶𩗗釥䓀𨭐𤩧𨭤飜𨩅㼀鈪䤥萔餻饍𧬆㷽馛䭯馪驜𨭥𥣈檏騡嫾騯𩣱䮐𩥈馼䮽䮗鍽塲𡌂堢𤦸"], 59 | ["96a1","𡓨硄𢜟𣶸棅㵽鑘㤧慐𢞁𢥫愇鱏鱓鱻鰵鰐魿鯏𩸭鮟𪇵𪃾鴡䲮𤄄鸘䲰鴌𪆴𪃭𪃳𩤯鶥蒽𦸒𦿟𦮂藼䔳𦶤𦺄𦷰萠藮𦸀𣟗𦁤秢𣖜𣙀䤭𤧞㵢鏛銾鍈𠊿碹鉷鑍俤㑀遤𥕝砽硔碶硋𡝗𣇉𤥁㚚佲濚濙瀞瀞吔𤆵垻壳垊鴖埗焴㒯𤆬燫𦱀𤾗嬨𡞵𨩉"], 60 | ["9740","愌嫎娋䊼𤒈㜬䭻𨧼鎻鎸𡣖𠼝葲𦳀𡐓𤋺𢰦𤏁妔𣶷𦝁綨𦅛𦂤𤦹𤦋𨧺鋥珢㻩璴𨭣𡢟㻡𤪳櫘珳珻㻖𤨾𤪔𡟙𤩦𠎧𡐤𤧥瑈𤤖炥𤥶銄珦鍟𠓾錱𨫎𨨖鎆𨯧𥗕䤵𨪂煫"], 61 | ["97a1","𤥃𠳿嚤𠘚𠯫𠲸唂秄𡟺緾𡛂𤩐𡡒䔮鐁㜊𨫀𤦭妰𡢿𡢃𧒄媡㛢𣵛㚰鉟婹𨪁𡡢鍴㳍𠪴䪖㦊僴㵩㵌𡎜煵䋻𨈘渏𩃤䓫浗𧹏灧沯㳖𣿭𣸭渂漌㵯𠏵畑㚼㓈䚀㻚䡱姄鉮䤾轁𨰜𦯀堒埈㛖𡑒烾𤍢𤩱𢿣𡊰𢎽梹楧𡎘𣓥𧯴𣛟𨪃𣟖𣏺𤲟樚𣚭𦲷萾䓟䓎"], 62 | ["9840","𦴦𦵑𦲂𦿞漗𧄉茽𡜺菭𦲀𧁓𡟛妉媂𡞳婡婱𡤅𤇼㜭姯𡜼㛇熎鎐暚𤊥婮娫𤊓樫𣻹𧜶𤑛𤋊焝𤉙𨧡侰𦴨峂𤓎𧹍𤎽樌𤉖𡌄炦焳𤏩㶥泟勇𤩏繥姫崯㷳彜𤩝𡟟綤萦"], 63 | ["98a1","咅𣫺𣌀𠈔坾𠣕𠘙㿥𡾞𪊶瀃𩅛嵰玏糓𨩙𩐠俈翧狍猐𧫴猸猹𥛶獁獈㺩𧬘遬燵𤣲珡臶㻊県㻑沢国琙琞琟㻢㻰㻴㻺瓓㼎㽓畂畭畲疍㽼痈痜㿀癍㿗癴㿜発𤽜熈嘣覀塩䀝睃䀹条䁅㗛瞘䁪䁯属瞾矋売砘点砜䂨砹硇硑硦葈𥔵礳栃礲䄃"], 64 | ["9940","䄉禑禙辻稆込䅧窑䆲窼艹䇄竏竛䇏両筢筬筻簒簛䉠䉺类粜䊌粸䊔糭输烀𠳏総緔緐緽羮羴犟䎗耠耥笹耮耱联㷌垴炠肷胩䏭脌猪脎脒畠脔䐁㬹腖腙腚"], 65 | ["99a1","䐓堺腼膄䐥膓䐭膥埯臁臤艔䒏芦艶苊苘苿䒰荗险榊萅烵葤惣蒈䔄蒾蓡蓸蔐蔸蕒䔻蕯蕰藠䕷虲蚒蚲蛯际螋䘆䘗袮裿褤襇覑𧥧訩訸誔誴豑賔賲贜䞘塟跃䟭仮踺嗘坔蹱嗵躰䠷軎転軤軭軲辷迁迊迌逳駄䢭飠鈓䤞鈨鉘鉫銱銮銿"], 66 | 
["9a40","鋣鋫鋳鋴鋽鍃鎄鎭䥅䥑麿鐗匁鐝鐭鐾䥪鑔鑹锭関䦧间阳䧥枠䨤靀䨵鞲韂噔䫤惨颹䬙飱塄餎餙冴餜餷饂饝饢䭰駅䮝騼鬏窃魩鮁鯝鯱鯴䱭鰠㝯𡯂鵉鰺"], 67 | ["9aa1","黾噐鶓鶽鷀鷼银辶鹻麬麱麽黆铜黢黱黸竈齄𠂔𠊷𠎠椚铃妬𠓗塀铁㞹𠗕𠘕𠙶𡚺块煳𠫂𠫍𠮿呪吆𠯋咞𠯻𠰻𠱓𠱥𠱼惧𠲍噺𠲵𠳝𠳭𠵯𠶲𠷈楕鰯螥𠸄𠸎𠻗𠾐𠼭𠹳尠𠾼帋𡁜𡁏𡁶朞𡁻𡂈𡂖㙇𡂿𡃓𡄯𡄻卤蒭𡋣𡍵𡌶讁𡕷𡘙𡟃𡟇乸炻𡠭𡥪"], 68 | ["9b40","𡨭𡩅𡰪𡱰𡲬𡻈拃𡻕𡼕熘桕𢁅槩㛈𢉼𢏗𢏺𢜪𢡱𢥏苽𢥧𢦓𢫕覥𢫨辠𢬎鞸𢬿顇骽𢱌"], 69 | ["9b62","𢲈𢲷𥯨𢴈𢴒𢶷𢶕𢹂𢽴𢿌𣀳𣁦𣌟𣏞徱晈暿𧩹𣕧𣗳爁𤦺矗𣘚𣜖纇𠍆墵朎"], 70 | ["9ba1","椘𣪧𧙗𥿢𣸑𣺹𧗾𢂚䣐䪸𤄙𨪚𤋮𤌍𤀻𤌴𤎖𤩅𠗊凒𠘑妟𡺨㮾𣳿𤐄𤓖垈𤙴㦛𤜯𨗨𩧉㝢𢇃譞𨭎駖𤠒𤣻𤨕爉𤫀𠱸奥𤺥𤾆𠝹軚𥀬劏圿煱𥊙𥐙𣽊𤪧喼𥑆𥑮𦭒釔㑳𥔿𧘲𥕞䜘𥕢𥕦𥟇𤤿𥡝偦㓻𣏌惞𥤃䝼𨥈𥪮𥮉𥰆𡶐垡煑澶𦄂𧰒遖𦆲𤾚譢𦐂𦑊"], 71 | ["9c40","嵛𦯷輶𦒄𡤜諪𤧶𦒈𣿯𦔒䯀𦖿𦚵𢜛鑥𥟡憕娧晉侻嚹𤔡𦛼乪𤤴陖涏𦲽㘘襷𦞙𦡮𦐑𦡞營𦣇筂𩃀𠨑𦤦鄄𦤹穅鷰𦧺騦𦨭㙟𦑩𠀡禃𦨴𦭛崬𣔙菏𦮝䛐𦲤画补𦶮墶"], 72 | ["9ca1","㜜𢖍𧁋𧇍㱔𧊀𧊅銁𢅺𧊋錰𧋦𤧐氹钟𧑐𠻸蠧裵𢤦𨑳𡞱溸𤨪𡠠㦤㚹尐秣䔿暶𩲭𩢤襃𧟌𧡘囖䃟𡘊㦡𣜯𨃨𡏅熭荦𧧝𩆨婧䲷𧂯𨦫𧧽𧨊𧬋𧵦𤅺筃祾𨀉澵𪋟樃𨌘厢𦸇鎿栶靝𨅯𨀣𦦵𡏭𣈯𨁈嶅𨰰𨂃圕頣𨥉嶫𤦈斾槕叒𤪥𣾁㰑朶𨂐𨃴𨄮𡾡𨅏"], 73 | ["9d40","𨆉𨆯𨈚𨌆𨌯𨎊㗊𨑨𨚪䣺揦𨥖砈鉕𨦸䏲𨧧䏟𨧨𨭆𨯔姸𨰉輋𨿅𩃬筑𩄐𩄼㷷𩅞𤫊运犏嚋𩓧𩗩𩖰𩖸𩜲𩣑𩥉𩥪𩧃𩨨𩬎𩵚𩶛纟𩻸𩼣䲤镇𪊓熢𪋿䶑递𪗋䶜𠲜达嗁"], 74 | ["9da1","辺𢒰边𤪓䔉繿潖檱仪㓤𨬬𧢝㜺躀𡟵𨀤𨭬𨮙𧨾𦚯㷫𧙕𣲷𥘵𥥖亚𥺁𦉘嚿𠹭踎孭𣺈𤲞揞拐𡟶𡡻攰嘭𥱊吚𥌑㷆𩶘䱽嘢嘞罉𥻘奵𣵀蝰东𠿪𠵉𣚺脗鵞贘瘻鱅癎瞹鍅吲腈苷嘥脲萘肽嗪祢噃吖𠺝㗎嘅嗱曱𨋢㘭甴嗰喺咗啲𠱁𠲖廐𥅈𠹶𢱢"], 75 | ["9e40","𠺢麫絚嗞𡁵抝靭咔賍燶酶揼掹揾啩𢭃鱲𢺳冚㓟𠶧冧呍唞唓癦踭𦢊疱肶蠄螆裇膶萜𡃁䓬猄𤜆宐茋𦢓噻𢛴𧴯𤆣𧵳𦻐𧊶酰𡇙鈈𣳼𪚩𠺬𠻹牦𡲢䝎𤿂𧿹𠿫䃺"], 76 | ["9ea1","鱝攟𢶠䣳𤟠𩵼𠿬𠸊恢𧖣𠿭"], 77 | ["9ead","𦁈𡆇熣纎鵐业丄㕷嬍沲卧㚬㧜卽㚥𤘘墚𤭮舭呋垪𥪕𠥹"], 78 | ["9ec5","㩒𢑥獴𩺬䴉鯭𣳾𩼰䱛𤾩𩖞𩿞葜𣶶𧊲𦞳𣜠挮紥𣻷𣸬㨪逈勌㹴㙺䗩𠒎癀嫰𠺶硺𧼮墧䂿噼鮋嵴癔𪐴麅䳡痹㟻愙𣃚𤏲"], 79 | ["9ef5","噝𡊩垧𤥣𩸆刴𧂮㖭汊鵼"], 80 | ["9f40","籖鬹埞𡝬屓擓𩓐𦌵𧅤蚭𠴨𦴢𤫢𠵱"], 81 | ["9f4f","凾𡼏嶎霃𡷑麁遌笟鬂峑箣扨挵髿篏鬪籾鬮籂粆鰕篼鬉鼗鰛𤤾齚啳寃俽麘俲剠㸆勑坧偖妷帒韈鶫轜呩鞴饀鞺匬愰"], 82 | ["9fa1","椬叚鰊鴂䰻陁榀傦畆𡝭駚剳"], 83 | ["9fae","酙隁酜"], 84 | ["9fb2","酑𨺗捿𦴣櫊嘑醎畺抅𠏼獏籰𥰡𣳽"], 85 | ["9fc1","𤤙盖鮝个𠳔莾衂"], 86 | ["9fc9","届槀僭坺刟巵从氱𠇲伹咜哚劚趂㗾弌㗳"], 87 | ["9fdb","歒酼龥鮗頮颴骺麨麄煺笔"], 88 | ["9fe7","毺蠘罸"], 89 | ["9feb","嘠𪙊蹷齓"], 90 | ["9ff0","跔蹏鸜踁抂𨍽踨蹵竓𤩷稾磘泪詧瘇"], 91 | ["a040","𨩚鼦泎蟖痃𪊲硓咢贌狢獱謭猂瓱賫𤪻蘯徺袠䒷"], 92 | ["a055","𡠻𦸅"], 93 | ["a058","詾𢔛"], 94 | ["a05b","惽癧髗鵄鍮鮏蟵"], 95 | ["a063","蠏賷猬霡鮰㗖犲䰇籑饊𦅙慙䰄麖慽"], 96 | ["a073","坟慯抦戹拎㩜懢厪𣏵捤栂㗒"], 97 | ["a0a1","嵗𨯂迚𨸹"], 98 | ["a0a6","僙𡵆礆匲阸𠼻䁥"], 99 | ["a0ae","矾"], 100 | ["a0b0","糂𥼚糚稭聦聣絍甅瓲覔舚朌聢𧒆聛瓰脃眤覉𦟌畓𦻑螩蟎臈螌詉貭譃眫瓸蓚㘵榲趦"], 101 | ["a0d4","覩瑨涹蟁𤀑瓧㷛煶悤憜㳑煢恷"], 102 | ["a0e2","罱𨬭牐惩䭾删㰘𣳇𥻗𧙖𥔱𡥄𡋾𩤃𦷜𧂭峁𦆭𨨏𣙷𠃮𦡆𤼎䕢嬟𦍌齐麦𦉫"], 103 | ["a3c0","␀",31,"␡"], 104 | ["c6a1","①",9,"⑴",9,"ⅰ",9,"丶丿亅亠冂冖冫勹匸卩厶夊宀巛⼳广廴彐彡攴无疒癶辵隶¨ˆヽヾゝゞ〃仝々〆〇ー[]✽ぁ",23], 105 | ["c740","す",58,"ァアィイ"], 106 | ["c7a1","ゥ",81,"А",5,"ЁЖ",4], 107 | ["c840","Л",26,"ёж",25,"⇧↸↹㇏𠃌乚𠂊刂䒑"], 108 | ["c8a1","龰冈龱𧘇"], 109 | 
["c8cd","¬¦'"㈱№℡゛゜⺀⺄⺆⺇⺈⺊⺌⺍⺕⺜⺝⺥⺧⺪⺬⺮⺶⺼⺾⻆⻊⻌⻍⻏⻖⻗⻞⻣"], 110 | ["c8f5","ʃɐɛɔɵœøŋʊɪ"], 111 | ["f9fe","■"], 112 | ["fa40","𠕇鋛𠗟𣿅蕌䊵珯况㙉𤥂𨧤鍄𡧛苮𣳈砼杄拟𤤳𨦪𠊠𦮳𡌅侫𢓭倈𦴩𧪄𣘀𤪱𢔓倩𠍾徤𠎀𠍇滛𠐟偽儁㑺儎顬㝃萖𤦤𠒇兠𣎴兪𠯿𢃼𠋥𢔰𠖎𣈳𡦃宂蝽𠖳𣲙冲冸"], 113 | ["faa1","鴴凉减凑㳜凓𤪦决凢卂凭菍椾𣜭彻刋刦刼劵剗劔効勅簕蕂勠蘍𦬓包𨫞啉滙𣾀𠥔𣿬匳卄𠯢泋𡜦栛珕恊㺪㣌𡛨燝䒢卭却𨚫卾卿𡖖𡘓矦厓𨪛厠厫厮玧𥝲㽙玜叁叅汉义埾叙㪫𠮏叠𣿫𢶣叶𠱷吓灹唫晗浛呭𦭓𠵴啝咏咤䞦𡜍𠻝㶴𠵍"], 114 | ["fb40","𨦼𢚘啇䳭启琗喆喩嘅𡣗𤀺䕒𤐵暳𡂴嘷曍𣊊暤暭噍噏磱囱鞇叾圀囯园𨭦㘣𡉏坆𤆥汮炋坂㚱𦱾埦𡐖堃𡑔𤍣堦𤯵塜墪㕡壠壜𡈼壻寿坃𪅐𤉸鏓㖡够梦㛃湙"], 115 | ["fba1","𡘾娤啓𡚒蔅姉𠵎𦲁𦴪𡟜姙𡟻𡞲𦶦浱𡠨𡛕姹𦹅媫婣㛦𤦩婷㜈媖瑥嫓𦾡𢕔㶅𡤑㜲𡚸広勐孶斈孼𧨎䀄䡝𠈄寕慠𡨴𥧌𠖥寳宝䴐尅𡭄尓珎尔𡲥𦬨屉䣝岅峩峯嶋𡷹𡸷崐崘嵆𡺤岺巗苼㠭𤤁𢁉𢅳芇㠶㯂帮檊幵幺𤒼𠳓厦亷廐厨𡝱帉廴𨒂"], 116 | ["fc40","廹廻㢠廼栾鐛弍𠇁弢㫞䢮𡌺强𦢈𢏐彘𢑱彣鞽𦹮彲鍀𨨶徧嶶㵟𥉐𡽪𧃸𢙨釖𠊞𨨩怱暅𡡷㥣㷇㘹垐𢞴祱㹀悞悤悳𤦂𤦏𧩓璤僡媠慤萤慂慈𦻒憁凴𠙖憇宪𣾷"], 117 | ["fca1","𢡟懓𨮝𩥝懐㤲𢦀𢣁怣慜攞掋𠄘担𡝰拕𢸍捬𤧟㨗搸揸𡎎𡟼撐澊𢸶頔𤂌𥜝擡擥鑻㩦携㩗敍漖𤨨𤨣斅敭敟𣁾斵𤥀䬷旑䃘𡠩无旣忟𣐀昘𣇷𣇸晄𣆤𣆥晋𠹵晧𥇦晳晴𡸽𣈱𨗴𣇈𥌓矅𢣷馤朂𤎜𤨡㬫槺𣟂杞杧杢𤇍𩃭柗䓩栢湐鈼栁𣏦𦶠桝"], 118 | ["fd40","𣑯槡樋𨫟楳棃𣗍椁椀㴲㨁𣘼㮀枬楡𨩊䋼椶榘㮡𠏉荣傐槹𣙙𢄪橅𣜃檝㯳枱櫈𩆜㰍欝𠤣惞欵歴𢟍溵𣫛𠎵𡥘㝀吡𣭚毡𣻼毜氷𢒋𤣱𦭑汚舦汹𣶼䓅𣶽𤆤𤤌𤤀"], 119 | ["fda1","𣳉㛥㳫𠴲鮃𣇹𢒑羏样𦴥𦶡𦷫涖浜湼漄𤥿𤂅𦹲蔳𦽴凇沜渝萮𨬡港𣸯瑓𣾂秌湏媑𣁋濸㜍澝𣸰滺𡒗𤀽䕕鏰潄潜㵎潴𩅰㴻澟𤅄濓𤂑𤅕𤀹𣿰𣾴𤄿凟𤅖𤅗𤅀𦇝灋灾炧炁烌烕烖烟䄄㷨熴熖𤉷焫煅媈煊煮岜𤍥煏鍢𤋁焬𤑚𤨧𤨢熺𨯨炽爎"], 120 | ["fe40","鑂爕夑鑃爤鍁𥘅爮牀𤥴梽牕牗㹕𣁄栍漽犂猪猫𤠣𨠫䣭𨠄猨献珏玪𠰺𦨮珉瑉𤇢𡛧𤨤昣㛅𤦷𤦍𤧻珷琕椃𤨦琹𠗃㻗瑜𢢭瑠𨺲瑇珤瑶莹瑬㜰瑴鏱樬璂䥓𤪌"], 121 | ["fea1","𤅟𤩹𨮏孆𨰃𡢞瓈𡦈甎瓩甞𨻙𡩋寗𨺬鎅畍畊畧畮𤾂㼄𤴓疎瑝疞疴瘂瘬癑癏癯癶𦏵皐臯㟸𦤑𦤎皡皥皷盌𦾟葢𥂝𥅽𡸜眞眦着撯𥈠睘𣊬瞯𨥤𨥨𡛁矴砉𡍶𤨒棊碯磇磓隥礮𥗠磗礴碱𧘌辸袄𨬫𦂃𢘜禆褀椂禀𥡗禝𧬹礼禩渪𧄦㺨秆𩄍秔"] 122 | ] 123 | -------------------------------------------------------------------------------- /encodings/tables/gb18030-ranges.json: -------------------------------------------------------------------------------- 1 | 
{"uChars":[128,165,169,178,184,216,226,235,238,244,248,251,253,258,276,284,300,325,329,334,364,463,465,467,469,471,473,475,477,506,594,610,712,716,730,930,938,962,970,1026,1104,1106,8209,8215,8218,8222,8231,8241,8244,8246,8252,8365,8452,8454,8458,8471,8482,8556,8570,8596,8602,8713,8720,8722,8726,8731,8737,8740,8742,8748,8751,8760,8766,8777,8781,8787,8802,8808,8816,8854,8858,8870,8896,8979,9322,9372,9548,9588,9616,9622,9634,9652,9662,9672,9676,9680,9702,9735,9738,9793,9795,11906,11909,11913,11917,11928,11944,11947,11951,11956,11960,11964,11979,12284,12292,12312,12319,12330,12351,12436,12447,12535,12543,12586,12842,12850,12964,13200,13215,13218,13253,13263,13267,13270,13384,13428,13727,13839,13851,14617,14703,14801,14816,14964,15183,15471,15585,16471,16736,17208,17325,17330,17374,17623,17997,18018,18212,18218,18301,18318,18760,18811,18814,18820,18823,18844,18848,18872,19576,19620,19738,19887,40870,59244,59336,59367,59413,59417,59423,59431,59437,59443,59452,59460,59478,59493,63789,63866,63894,63976,63986,64016,64018,64021,64025,64034,64037,64042,65074,65093,65107,65112,65127,65132,65375,65510,65536],"gbChars":[0,36,38,45,50,81,89,95,96,100,103,104,105,109,126,133,148,172,175,179,208,306,307,308,309,310,311,312,313,341,428,443,544,545,558,741,742,749,750,805,819,820,7922,7924,7925,7927,7934,7943,7944,7945,7950,8062,8148,8149,8152,8164,8174,8236,8240,8262,8264,8374,8380,8381,8384,8388,8390,8392,8393,8394,8396,8401,8406,8416,8419,8424,8437,8439,8445,8482,8485,8496,8521,8603,8936,8946,9046,9050,9063,9066,9076,9092,9100,9108,9111,9113,9131,9162,9164,9218,9219,11329,11331,11334,11336,11346,11361,11363,11366,11370,11372,11375,11389,11682,11686,11687,11692,11694,11714,11716,11723,11725,11730,11736,11982,11989,12102,12336,12348,12350,12384,12393,12395,12397,12510,12553,12851,12962,12973,13738,13823,13919,13933,14080,14298,14585,14698,15583,15847,16318,16434,16438,16481,16729,17102,17122,17315,17320,17402,17418,17859,17909,17911,17915,17916,17936,17939,17961,18664,18703,18814,18
962,19043,33469,33470,33471,33484,33485,33490,33497,33501,33505,33513,33520,33536,33550,37845,37921,37948,38029,38038,38064,38065,38066,38069,38075,38076,38078,39108,39109,39113,39114,39115,39116,39265,39394,189000]} -------------------------------------------------------------------------------- /encodings/tables/gbk-added.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["a140","",62], 3 | ["a180","",32], 4 | ["a240","",62], 5 | ["a280","",32], 6 | ["a2ab","",5], 7 | ["a2e3","€"], 8 | ["a2ef",""], 9 | ["a2fd",""], 10 | ["a340","",62], 11 | ["a380","",31," "], 12 | ["a440","",62], 13 | ["a480","",32], 14 | ["a4f4","",10], 15 | ["a540","",62], 16 | ["a580","",32], 17 | ["a5f7","",7], 18 | ["a640","",62], 19 | ["a680","",32], 20 | ["a6b9","",7], 21 | ["a6d9","",6], 22 | ["a6ec",""], 23 | ["a6f3",""], 24 | ["a6f6","",8], 25 | ["a740","",62], 26 | ["a780","",32], 27 | ["a7c2","",14], 28 | ["a7f2","",12], 29 | ["a896","",10], 30 | ["a8bc","ḿ"], 31 | ["a8bf","ǹ"], 32 | ["a8c1",""], 33 | ["a8ea","",20], 34 | ["a958",""], 35 | ["a95b",""], 36 | ["a95d",""], 37 | ["a989","〾⿰",11], 38 | ["a997","",12], 39 | ["a9f0","",14], 40 | ["aaa1","",93], 41 | ["aba1","",93], 42 | ["aca1","",93], 43 | ["ada1","",93], 44 | ["aea1","",93], 45 | ["afa1","",93], 46 | ["d7fa","",4], 47 | ["f8a1","",93], 48 | ["f9a1","",93], 49 | ["faa1","",93], 50 | ["fba1","",93], 51 | ["fca1","",93], 52 | ["fda1","",93], 53 | ["fe50","⺁⺄㑳㑇⺈⺋㖞㘚㘎⺌⺗㥮㤘㧏㧟㩳㧐㭎㱮㳠⺧⺪䁖䅟⺮䌷⺳⺶⺷䎱䎬⺻䏝䓖䙡䙌"], 54 | ["fe80","䜣䜩䝼䞍⻊䥇䥺䥽䦂䦃䦅䦆䦟䦛䦷䦶䲣䲟䲠䲡䱷䲢䴓",6,"䶮",93], 55 | ["8135f437",""] 56 | ] 57 | -------------------------------------------------------------------------------- /encodings/tables/shiftjis.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["0","\u0000",128], 3 | ["a1","。",62], 4 | ["8140"," 、。,.・:;?!゛゜´`¨^ ̄_ヽヾゝゞ〃仝々〆〇ー―‐/\~∥|…‥‘’“”()〔〕[]{}〈",9,"+-±×"], 5 | 
["8180","÷=≠<>≦≧∞∴♂♀°′″℃¥$¢£%#&*@§☆★○●◎◇◆□■△▲▽▼※〒→←↑↓〓"], 6 | ["81b8","∈∋⊆⊇⊂⊃∪∩"], 7 | ["81c8","∧∨¬⇒⇔∀∃"], 8 | ["81da","∠⊥⌒∂∇≡≒≪≫√∽∝∵∫∬"], 9 | ["81f0","ʼn♯♭♪†‡¶"], 10 | ["81fc","◯"], 11 | ["824f","0",9], 12 | ["8260","A",25], 13 | ["8281","a",25], 14 | ["829f","ぁ",82], 15 | ["8340","ァ",62], 16 | ["8380","ム",22], 17 | ["839f","Α",16,"Σ",6], 18 | ["83bf","α",16,"σ",6], 19 | ["8440","А",5,"ЁЖ",25], 20 | ["8470","а",5,"ёж",7], 21 | ["8480","о",17], 22 | ["849f","─│┌┐┘└├┬┤┴┼━┃┏┓┛┗┣┳┫┻╋┠┯┨┷┿┝┰┥┸╂"], 23 | ["8740","①",19,"Ⅰ",9], 24 | ["875f","㍉㌔㌢㍍㌘㌧㌃㌶㍑㍗㌍㌦㌣㌫㍊㌻㎜㎝㎞㎎㎏㏄㎡"], 25 | ["877e","㍻"], 26 | ["8780","〝〟№㏍℡㊤",4,"㈱㈲㈹㍾㍽㍼≒≡∫∮∑√⊥∠∟⊿∵∩∪"], 27 | ["889f","亜唖娃阿哀愛挨姶逢葵茜穐悪握渥旭葦芦鯵梓圧斡扱宛姐虻飴絢綾鮎或粟袷安庵按暗案闇鞍杏以伊位依偉囲夷委威尉惟意慰易椅為畏異移維緯胃萎衣謂違遺医井亥域育郁磯一壱溢逸稲茨芋鰯允印咽員因姻引飲淫胤蔭"], 28 | ["8940","院陰隠韻吋右宇烏羽迂雨卯鵜窺丑碓臼渦嘘唄欝蔚鰻姥厩浦瓜閏噂云運雲荏餌叡営嬰影映曳栄永泳洩瑛盈穎頴英衛詠鋭液疫益駅悦謁越閲榎厭円"], 29 | ["8980","園堰奄宴延怨掩援沿演炎焔煙燕猿縁艶苑薗遠鉛鴛塩於汚甥凹央奥往応押旺横欧殴王翁襖鴬鴎黄岡沖荻億屋憶臆桶牡乙俺卸恩温穏音下化仮何伽価佳加可嘉夏嫁家寡科暇果架歌河火珂禍禾稼箇花苛茄荷華菓蝦課嘩貨迦過霞蚊俄峨我牙画臥芽蛾賀雅餓駕介会解回塊壊廻快怪悔恢懐戒拐改"], 30 | ["8a40","魁晦械海灰界皆絵芥蟹開階貝凱劾外咳害崖慨概涯碍蓋街該鎧骸浬馨蛙垣柿蛎鈎劃嚇各廓拡撹格核殻獲確穫覚角赫較郭閣隔革学岳楽額顎掛笠樫"], 31 | ["8a80","橿梶鰍潟割喝恰括活渇滑葛褐轄且鰹叶椛樺鞄株兜竃蒲釜鎌噛鴨栢茅萱粥刈苅瓦乾侃冠寒刊勘勧巻喚堪姦完官寛干幹患感慣憾換敢柑桓棺款歓汗漢澗潅環甘監看竿管簡緩缶翰肝艦莞観諌貫還鑑間閑関陥韓館舘丸含岸巌玩癌眼岩翫贋雁頑顔願企伎危喜器基奇嬉寄岐希幾忌揮机旗既期棋棄"], 32 | ["8b40","機帰毅気汽畿祈季稀紀徽規記貴起軌輝飢騎鬼亀偽儀妓宜戯技擬欺犠疑祇義蟻誼議掬菊鞠吉吃喫桔橘詰砧杵黍却客脚虐逆丘久仇休及吸宮弓急救"], 33 | ["8b80","朽求汲泣灸球究窮笈級糾給旧牛去居巨拒拠挙渠虚許距鋸漁禦魚亨享京供侠僑兇競共凶協匡卿叫喬境峡強彊怯恐恭挟教橋況狂狭矯胸脅興蕎郷鏡響饗驚仰凝尭暁業局曲極玉桐粁僅勤均巾錦斤欣欽琴禁禽筋緊芹菌衿襟謹近金吟銀九倶句区狗玖矩苦躯駆駈駒具愚虞喰空偶寓遇隅串櫛釧屑屈"], 34 | ["8c40","掘窟沓靴轡窪熊隈粂栗繰桑鍬勲君薫訓群軍郡卦袈祁係傾刑兄啓圭珪型契形径恵慶慧憩掲携敬景桂渓畦稽系経継繋罫茎荊蛍計詣警軽頚鶏芸迎鯨"], 35 | ["8c80","劇戟撃激隙桁傑欠決潔穴結血訣月件倹倦健兼券剣喧圏堅嫌建憲懸拳捲検権牽犬献研硯絹県肩見謙賢軒遣鍵険顕験鹸元原厳幻弦減源玄現絃舷言諺限乎個古呼固姑孤己庫弧戸故枯湖狐糊袴股胡菰虎誇跨鈷雇顧鼓五互伍午呉吾娯後御悟梧檎瑚碁語誤護醐乞鯉交佼侯候倖光公功効勾厚口向"], 36 | ["8d40","后喉坑垢好孔孝宏工巧巷幸広庚康弘恒慌抗拘控攻昂晃更杭校梗構江洪浩港溝甲皇硬稿糠紅紘絞綱耕考肯肱腔膏航荒行衡講貢購郊酵鉱砿鋼閤降"], 37 | ["8d80","項香高鴻剛劫号合壕拷濠豪轟麹克刻告国穀酷鵠黒獄漉腰甑忽惚骨狛込此頃今困坤墾婚恨懇昏昆根梱混痕紺艮魂些佐叉唆嵯左差査沙瑳砂詐鎖裟坐座挫債催再最哉塞妻宰彩才採栽歳済災采犀砕砦祭斎細菜裁載際剤在材罪財冴坂阪堺榊肴咲崎埼碕鷺作削咋搾昨朔柵窄策索錯桜鮭笹匙冊刷"], 38 | ["8e40","察拶撮擦札殺薩雑皐鯖捌錆鮫皿晒三傘参山惨撒散桟燦珊産算纂蚕讃賛酸餐斬暫残仕仔伺使刺司史嗣四士始姉姿子屍市師志思指支孜斯施旨枝止"], 39 | 
["8e80","死氏獅祉私糸紙紫肢脂至視詞詩試誌諮資賜雌飼歯事似侍児字寺慈持時次滋治爾璽痔磁示而耳自蒔辞汐鹿式識鴫竺軸宍雫七叱執失嫉室悉湿漆疾質実蔀篠偲柴芝屡蕊縞舎写射捨赦斜煮社紗者謝車遮蛇邪借勺尺杓灼爵酌釈錫若寂弱惹主取守手朱殊狩珠種腫趣酒首儒受呪寿授樹綬需囚収周"], 40 | ["8f40","宗就州修愁拾洲秀秋終繍習臭舟蒐衆襲讐蹴輯週酋酬集醜什住充十従戎柔汁渋獣縦重銃叔夙宿淑祝縮粛塾熟出術述俊峻春瞬竣舜駿准循旬楯殉淳"], 41 | ["8f80","準潤盾純巡遵醇順処初所暑曙渚庶緒署書薯藷諸助叙女序徐恕鋤除傷償勝匠升召哨商唱嘗奨妾娼宵将小少尚庄床廠彰承抄招掌捷昇昌昭晶松梢樟樵沼消渉湘焼焦照症省硝礁祥称章笑粧紹肖菖蒋蕉衝裳訟証詔詳象賞醤鉦鍾鐘障鞘上丈丞乗冗剰城場壌嬢常情擾条杖浄状畳穣蒸譲醸錠嘱埴飾"], 42 | ["9040","拭植殖燭織職色触食蝕辱尻伸信侵唇娠寝審心慎振新晋森榛浸深申疹真神秦紳臣芯薪親診身辛進針震人仁刃塵壬尋甚尽腎訊迅陣靭笥諏須酢図厨"], 43 | ["9080","逗吹垂帥推水炊睡粋翠衰遂酔錐錘随瑞髄崇嵩数枢趨雛据杉椙菅頗雀裾澄摺寸世瀬畝是凄制勢姓征性成政整星晴棲栖正清牲生盛精聖声製西誠誓請逝醒青静斉税脆隻席惜戚斥昔析石積籍績脊責赤跡蹟碩切拙接摂折設窃節説雪絶舌蝉仙先千占宣専尖川戦扇撰栓栴泉浅洗染潜煎煽旋穿箭線"], 44 | ["9140","繊羨腺舛船薦詮賎践選遷銭銑閃鮮前善漸然全禅繕膳糎噌塑岨措曾曽楚狙疏疎礎祖租粗素組蘇訴阻遡鼠僧創双叢倉喪壮奏爽宋層匝惣想捜掃挿掻"], 45 | ["9180","操早曹巣槍槽漕燥争痩相窓糟総綜聡草荘葬蒼藻装走送遭鎗霜騒像増憎臓蔵贈造促側則即息捉束測足速俗属賊族続卒袖其揃存孫尊損村遜他多太汰詑唾堕妥惰打柁舵楕陀駄騨体堆対耐岱帯待怠態戴替泰滞胎腿苔袋貸退逮隊黛鯛代台大第醍題鷹滝瀧卓啄宅托択拓沢濯琢託鐸濁諾茸凧蛸只"], 46 | ["9240","叩但達辰奪脱巽竪辿棚谷狸鱈樽誰丹単嘆坦担探旦歎淡湛炭短端箪綻耽胆蛋誕鍛団壇弾断暖檀段男談値知地弛恥智池痴稚置致蜘遅馳築畜竹筑蓄"], 47 | ["9280","逐秩窒茶嫡着中仲宙忠抽昼柱注虫衷註酎鋳駐樗瀦猪苧著貯丁兆凋喋寵帖帳庁弔張彫徴懲挑暢朝潮牒町眺聴脹腸蝶調諜超跳銚長頂鳥勅捗直朕沈珍賃鎮陳津墜椎槌追鎚痛通塚栂掴槻佃漬柘辻蔦綴鍔椿潰坪壷嬬紬爪吊釣鶴亭低停偵剃貞呈堤定帝底庭廷弟悌抵挺提梯汀碇禎程締艇訂諦蹄逓"], 48 | ["9340","邸鄭釘鼎泥摘擢敵滴的笛適鏑溺哲徹撤轍迭鉄典填天展店添纏甜貼転顛点伝殿澱田電兎吐堵塗妬屠徒斗杜渡登菟賭途都鍍砥砺努度土奴怒倒党冬"], 49 | ["9380","凍刀唐塔塘套宕島嶋悼投搭東桃梼棟盗淘湯涛灯燈当痘祷等答筒糖統到董蕩藤討謄豆踏逃透鐙陶頭騰闘働動同堂導憧撞洞瞳童胴萄道銅峠鴇匿得徳涜特督禿篤毒独読栃橡凸突椴届鳶苫寅酉瀞噸屯惇敦沌豚遁頓呑曇鈍奈那内乍凪薙謎灘捺鍋楢馴縄畷南楠軟難汝二尼弐迩匂賑肉虹廿日乳入"], 50 | ["9440","如尿韮任妊忍認濡禰祢寧葱猫熱年念捻撚燃粘乃廼之埜嚢悩濃納能脳膿農覗蚤巴把播覇杷波派琶破婆罵芭馬俳廃拝排敗杯盃牌背肺輩配倍培媒梅"], 51 | ["9480","楳煤狽買売賠陪這蝿秤矧萩伯剥博拍柏泊白箔粕舶薄迫曝漠爆縛莫駁麦函箱硲箸肇筈櫨幡肌畑畠八鉢溌発醗髪伐罰抜筏閥鳩噺塙蛤隼伴判半反叛帆搬斑板氾汎版犯班畔繁般藩販範釆煩頒飯挽晩番盤磐蕃蛮匪卑否妃庇彼悲扉批披斐比泌疲皮碑秘緋罷肥被誹費避非飛樋簸備尾微枇毘琵眉美"], 52 | ["9540","鼻柊稗匹疋髭彦膝菱肘弼必畢筆逼桧姫媛紐百謬俵彪標氷漂瓢票表評豹廟描病秒苗錨鋲蒜蛭鰭品彬斌浜瀕貧賓頻敏瓶不付埠夫婦富冨布府怖扶敷"], 53 | ["9580","斧普浮父符腐膚芙譜負賦赴阜附侮撫武舞葡蕪部封楓風葺蕗伏副復幅服福腹複覆淵弗払沸仏物鮒分吻噴墳憤扮焚奮粉糞紛雰文聞丙併兵塀幣平弊柄並蔽閉陛米頁僻壁癖碧別瞥蔑箆偏変片篇編辺返遍便勉娩弁鞭保舗鋪圃捕歩甫補輔穂募墓慕戊暮母簿菩倣俸包呆報奉宝峰峯崩庖抱捧放方朋"], 54 | ["9640","法泡烹砲縫胞芳萌蓬蜂褒訪豊邦鋒飽鳳鵬乏亡傍剖坊妨帽忘忙房暴望某棒冒紡肪膨謀貌貿鉾防吠頬北僕卜墨撲朴牧睦穆釦勃没殆堀幌奔本翻凡盆"], 55 | ["9680","摩磨魔麻埋妹昧枚毎哩槙幕膜枕鮪柾鱒桝亦俣又抹末沫迄侭繭麿万慢満漫蔓味未魅巳箕岬密蜜湊蓑稔脈妙粍民眠務夢無牟矛霧鵡椋婿娘冥名命明盟迷銘鳴姪牝滅免棉綿緬面麺摸模茂妄孟毛猛盲網耗蒙儲木黙目杢勿餅尤戻籾貰問悶紋門匁也冶夜爺耶野弥矢厄役約薬訳躍靖柳薮鑓愉愈油癒"], 56 | 
["9740","諭輸唯佑優勇友宥幽悠憂揖有柚湧涌猶猷由祐裕誘遊邑郵雄融夕予余与誉輿預傭幼妖容庸揚揺擁曜楊様洋溶熔用窯羊耀葉蓉要謡踊遥陽養慾抑欲"], 57 | ["9780","沃浴翌翼淀羅螺裸来莱頼雷洛絡落酪乱卵嵐欄濫藍蘭覧利吏履李梨理璃痢裏裡里離陸律率立葎掠略劉流溜琉留硫粒隆竜龍侶慮旅虜了亮僚両凌寮料梁涼猟療瞭稜糧良諒遼量陵領力緑倫厘林淋燐琳臨輪隣鱗麟瑠塁涙累類令伶例冷励嶺怜玲礼苓鈴隷零霊麗齢暦歴列劣烈裂廉恋憐漣煉簾練聯"], 58 | ["9840","蓮連錬呂魯櫓炉賂路露労婁廊弄朗楼榔浪漏牢狼篭老聾蝋郎六麓禄肋録論倭和話歪賄脇惑枠鷲亙亘鰐詫藁蕨椀湾碗腕"], 59 | ["989f","弌丐丕个丱丶丼丿乂乖乘亂亅豫亊舒弍于亞亟亠亢亰亳亶从仍仄仆仂仗仞仭仟价伉佚估佛佝佗佇佶侈侏侘佻佩佰侑佯來侖儘俔俟俎俘俛俑俚俐俤俥倚倨倔倪倥倅伜俶倡倩倬俾俯們倆偃假會偕偐偈做偖偬偸傀傚傅傴傲"], 60 | ["9940","僉僊傳僂僖僞僥僭僣僮價僵儉儁儂儖儕儔儚儡儺儷儼儻儿兀兒兌兔兢竸兩兪兮冀冂囘册冉冏冑冓冕冖冤冦冢冩冪冫决冱冲冰况冽凅凉凛几處凩凭"], 61 | ["9980","凰凵凾刄刋刔刎刧刪刮刳刹剏剄剋剌剞剔剪剴剩剳剿剽劍劔劒剱劈劑辨辧劬劭劼劵勁勍勗勞勣勦飭勠勳勵勸勹匆匈甸匍匐匏匕匚匣匯匱匳匸區卆卅丗卉卍凖卞卩卮夘卻卷厂厖厠厦厥厮厰厶參簒雙叟曼燮叮叨叭叺吁吽呀听吭吼吮吶吩吝呎咏呵咎呟呱呷呰咒呻咀呶咄咐咆哇咢咸咥咬哄哈咨"], 62 | ["9a40","咫哂咤咾咼哘哥哦唏唔哽哮哭哺哢唹啀啣啌售啜啅啖啗唸唳啝喙喀咯喊喟啻啾喘喞單啼喃喩喇喨嗚嗅嗟嗄嗜嗤嗔嘔嗷嘖嗾嗽嘛嗹噎噐營嘴嘶嘲嘸"], 63 | ["9a80","噫噤嘯噬噪嚆嚀嚊嚠嚔嚏嚥嚮嚶嚴囂嚼囁囃囀囈囎囑囓囗囮囹圀囿圄圉圈國圍圓團圖嗇圜圦圷圸坎圻址坏坩埀垈坡坿垉垓垠垳垤垪垰埃埆埔埒埓堊埖埣堋堙堝塲堡塢塋塰毀塒堽塹墅墹墟墫墺壞墻墸墮壅壓壑壗壙壘壥壜壤壟壯壺壹壻壼壽夂夊夐夛梦夥夬夭夲夸夾竒奕奐奎奚奘奢奠奧奬奩"], 64 | ["9b40","奸妁妝佞侫妣妲姆姨姜妍姙姚娥娟娑娜娉娚婀婬婉娵娶婢婪媚媼媾嫋嫂媽嫣嫗嫦嫩嫖嫺嫻嬌嬋嬖嬲嫐嬪嬶嬾孃孅孀孑孕孚孛孥孩孰孳孵學斈孺宀"], 65 | ["9b80","它宦宸寃寇寉寔寐寤實寢寞寥寫寰寶寳尅將專對尓尠尢尨尸尹屁屆屎屓屐屏孱屬屮乢屶屹岌岑岔妛岫岻岶岼岷峅岾峇峙峩峽峺峭嶌峪崋崕崗嵜崟崛崑崔崢崚崙崘嵌嵒嵎嵋嵬嵳嵶嶇嶄嶂嶢嶝嶬嶮嶽嶐嶷嶼巉巍巓巒巖巛巫已巵帋帚帙帑帛帶帷幄幃幀幎幗幔幟幢幤幇幵并幺麼广庠廁廂廈廐廏"], 66 | ["9c40","廖廣廝廚廛廢廡廨廩廬廱廳廰廴廸廾弃弉彝彜弋弑弖弩弭弸彁彈彌彎弯彑彖彗彙彡彭彳彷徃徂彿徊很徑徇從徙徘徠徨徭徼忖忻忤忸忱忝悳忿怡恠"], 67 | ["9c80","怙怐怩怎怱怛怕怫怦怏怺恚恁恪恷恟恊恆恍恣恃恤恂恬恫恙悁悍惧悃悚悄悛悖悗悒悧悋惡悸惠惓悴忰悽惆悵惘慍愕愆惶惷愀惴惺愃愡惻惱愍愎慇愾愨愧慊愿愼愬愴愽慂慄慳慷慘慙慚慫慴慯慥慱慟慝慓慵憙憖憇憬憔憚憊憑憫憮懌懊應懷懈懃懆憺懋罹懍懦懣懶懺懴懿懽懼懾戀戈戉戍戌戔戛"], 68 | ["9d40","戞戡截戮戰戲戳扁扎扞扣扛扠扨扼抂抉找抒抓抖拔抃抔拗拑抻拏拿拆擔拈拜拌拊拂拇抛拉挌拮拱挧挂挈拯拵捐挾捍搜捏掖掎掀掫捶掣掏掉掟掵捫"], 69 | ["9d80","捩掾揩揀揆揣揉插揶揄搖搴搆搓搦搶攝搗搨搏摧摯摶摎攪撕撓撥撩撈撼據擒擅擇撻擘擂擱擧舉擠擡抬擣擯攬擶擴擲擺攀擽攘攜攅攤攣攫攴攵攷收攸畋效敖敕敍敘敞敝敲數斂斃變斛斟斫斷旃旆旁旄旌旒旛旙无旡旱杲昊昃旻杳昵昶昴昜晏晄晉晁晞晝晤晧晨晟晢晰暃暈暎暉暄暘暝曁暹曉暾暼"], 70 | ["9e40","曄暸曖曚曠昿曦曩曰曵曷朏朖朞朦朧霸朮朿朶杁朸朷杆杞杠杙杣杤枉杰枩杼杪枌枋枦枡枅枷柯枴柬枳柩枸柤柞柝柢柮枹柎柆柧檜栞框栩桀桍栲桎"], 71 | ["9e80","梳栫桙档桷桿梟梏梭梔條梛梃檮梹桴梵梠梺椏梍桾椁棊椈棘椢椦棡椌棍棔棧棕椶椒椄棗棣椥棹棠棯椨椪椚椣椡棆楹楷楜楸楫楔楾楮椹楴椽楙椰楡楞楝榁楪榲榮槐榿槁槓榾槎寨槊槝榻槃榧樮榑榠榜榕榴槞槨樂樛槿權槹槲槧樅榱樞槭樔槫樊樒櫁樣樓橄樌橲樶橸橇橢橙橦橈樸樢檐檍檠檄檢檣"], 72 | ["9f40","檗蘗檻櫃櫂檸檳檬櫞櫑櫟檪櫚櫪櫻欅蘖櫺欒欖鬱欟欸欷盜欹飮歇歃歉歐歙歔歛歟歡歸歹歿殀殄殃殍殘殕殞殤殪殫殯殲殱殳殷殼毆毋毓毟毬毫毳毯"], 73 | ["9f80","麾氈氓气氛氤氣汞汕汢汪沂沍沚沁沛汾汨汳沒沐泄泱泓沽泗泅泝沮沱沾沺泛泯泙泪洟衍洶洫洽洸洙洵洳洒洌浣涓浤浚浹浙涎涕濤涅淹渕渊涵淇淦涸淆淬淞淌淨淒淅淺淙淤淕淪淮渭湮渮渙湲湟渾渣湫渫湶湍渟湃渺湎渤滿渝游溂溪溘滉溷滓溽溯滄溲滔滕溏溥滂溟潁漑灌滬滸滾漿滲漱滯漲滌"], 74 | 
["e040","漾漓滷澆潺潸澁澀潯潛濳潭澂潼潘澎澑濂潦澳澣澡澤澹濆澪濟濕濬濔濘濱濮濛瀉瀋濺瀑瀁瀏濾瀛瀚潴瀝瀘瀟瀰瀾瀲灑灣炙炒炯烱炬炸炳炮烟烋烝"], 75 | ["e080","烙焉烽焜焙煥煕熈煦煢煌煖煬熏燻熄熕熨熬燗熹熾燒燉燔燎燠燬燧燵燼燹燿爍爐爛爨爭爬爰爲爻爼爿牀牆牋牘牴牾犂犁犇犒犖犢犧犹犲狃狆狄狎狒狢狠狡狹狷倏猗猊猜猖猝猴猯猩猥猾獎獏默獗獪獨獰獸獵獻獺珈玳珎玻珀珥珮珞璢琅瑯琥珸琲琺瑕琿瑟瑙瑁瑜瑩瑰瑣瑪瑶瑾璋璞璧瓊瓏瓔珱"], 76 | ["e140","瓠瓣瓧瓩瓮瓲瓰瓱瓸瓷甄甃甅甌甎甍甕甓甞甦甬甼畄畍畊畉畛畆畚畩畤畧畫畭畸當疆疇畴疊疉疂疔疚疝疥疣痂疳痃疵疽疸疼疱痍痊痒痙痣痞痾痿"], 77 | ["e180","痼瘁痰痺痲痳瘋瘍瘉瘟瘧瘠瘡瘢瘤瘴瘰瘻癇癈癆癜癘癡癢癨癩癪癧癬癰癲癶癸發皀皃皈皋皎皖皓皙皚皰皴皸皹皺盂盍盖盒盞盡盥盧盪蘯盻眈眇眄眩眤眞眥眦眛眷眸睇睚睨睫睛睥睿睾睹瞎瞋瞑瞠瞞瞰瞶瞹瞿瞼瞽瞻矇矍矗矚矜矣矮矼砌砒礦砠礪硅碎硴碆硼碚碌碣碵碪碯磑磆磋磔碾碼磅磊磬"], 78 | ["e240","磧磚磽磴礇礒礑礙礬礫祀祠祗祟祚祕祓祺祿禊禝禧齋禪禮禳禹禺秉秕秧秬秡秣稈稍稘稙稠稟禀稱稻稾稷穃穗穉穡穢穩龝穰穹穽窈窗窕窘窖窩竈窰"], 79 | ["e280","窶竅竄窿邃竇竊竍竏竕竓站竚竝竡竢竦竭竰笂笏笊笆笳笘笙笞笵笨笶筐筺笄筍笋筌筅筵筥筴筧筰筱筬筮箝箘箟箍箜箚箋箒箏筝箙篋篁篌篏箴篆篝篩簑簔篦篥籠簀簇簓篳篷簗簍篶簣簧簪簟簷簫簽籌籃籔籏籀籐籘籟籤籖籥籬籵粃粐粤粭粢粫粡粨粳粲粱粮粹粽糀糅糂糘糒糜糢鬻糯糲糴糶糺紆"], 80 | ["e340","紂紜紕紊絅絋紮紲紿紵絆絳絖絎絲絨絮絏絣經綉絛綏絽綛綺綮綣綵緇綽綫總綢綯緜綸綟綰緘緝緤緞緻緲緡縅縊縣縡縒縱縟縉縋縢繆繦縻縵縹繃縷"], 81 | ["e380","縲縺繧繝繖繞繙繚繹繪繩繼繻纃緕繽辮繿纈纉續纒纐纓纔纖纎纛纜缸缺罅罌罍罎罐网罕罔罘罟罠罨罩罧罸羂羆羃羈羇羌羔羞羝羚羣羯羲羹羮羶羸譱翅翆翊翕翔翡翦翩翳翹飜耆耄耋耒耘耙耜耡耨耿耻聊聆聒聘聚聟聢聨聳聲聰聶聹聽聿肄肆肅肛肓肚肭冐肬胛胥胙胝胄胚胖脉胯胱脛脩脣脯腋"], 82 | ["e440","隋腆脾腓腑胼腱腮腥腦腴膃膈膊膀膂膠膕膤膣腟膓膩膰膵膾膸膽臀臂膺臉臍臑臙臘臈臚臟臠臧臺臻臾舁舂舅與舊舍舐舖舩舫舸舳艀艙艘艝艚艟艤"], 83 | ["e480","艢艨艪艫舮艱艷艸艾芍芒芫芟芻芬苡苣苟苒苴苳苺莓范苻苹苞茆苜茉苙茵茴茖茲茱荀茹荐荅茯茫茗茘莅莚莪莟莢莖茣莎莇莊荼莵荳荵莠莉莨菴萓菫菎菽萃菘萋菁菷萇菠菲萍萢萠莽萸蔆菻葭萪萼蕚蒄葷葫蒭葮蒂葩葆萬葯葹萵蓊葢蒹蒿蒟蓙蓍蒻蓚蓐蓁蓆蓖蒡蔡蓿蓴蔗蔘蔬蔟蔕蔔蓼蕀蕣蕘蕈"], 84 | ["e540","蕁蘂蕋蕕薀薤薈薑薊薨蕭薔薛藪薇薜蕷蕾薐藉薺藏薹藐藕藝藥藜藹蘊蘓蘋藾藺蘆蘢蘚蘰蘿虍乕虔號虧虱蚓蚣蚩蚪蚋蚌蚶蚯蛄蛆蚰蛉蠣蚫蛔蛞蛩蛬"], 85 | ["e580","蛟蛛蛯蜒蜆蜈蜀蜃蛻蜑蜉蜍蛹蜊蜴蜿蜷蜻蜥蜩蜚蝠蝟蝸蝌蝎蝴蝗蝨蝮蝙蝓蝣蝪蠅螢螟螂螯蟋螽蟀蟐雖螫蟄螳蟇蟆螻蟯蟲蟠蠏蠍蟾蟶蟷蠎蟒蠑蠖蠕蠢蠡蠱蠶蠹蠧蠻衄衂衒衙衞衢衫袁衾袞衵衽袵衲袂袗袒袮袙袢袍袤袰袿袱裃裄裔裘裙裝裹褂裼裴裨裲褄褌褊褓襃褞褥褪褫襁襄褻褶褸襌褝襠襞"], 86 | ["e640","襦襤襭襪襯襴襷襾覃覈覊覓覘覡覩覦覬覯覲覺覽覿觀觚觜觝觧觴觸訃訖訐訌訛訝訥訶詁詛詒詆詈詼詭詬詢誅誂誄誨誡誑誥誦誚誣諄諍諂諚諫諳諧"], 87 | ["e680","諤諱謔諠諢諷諞諛謌謇謚諡謖謐謗謠謳鞫謦謫謾謨譁譌譏譎證譖譛譚譫譟譬譯譴譽讀讌讎讒讓讖讙讚谺豁谿豈豌豎豐豕豢豬豸豺貂貉貅貊貍貎貔豼貘戝貭貪貽貲貳貮貶賈賁賤賣賚賽賺賻贄贅贊贇贏贍贐齎贓賍贔贖赧赭赱赳趁趙跂趾趺跏跚跖跌跛跋跪跫跟跣跼踈踉跿踝踞踐踟蹂踵踰踴蹊"], 88 | ["e740","蹇蹉蹌蹐蹈蹙蹤蹠踪蹣蹕蹶蹲蹼躁躇躅躄躋躊躓躑躔躙躪躡躬躰軆躱躾軅軈軋軛軣軼軻軫軾輊輅輕輒輙輓輜輟輛輌輦輳輻輹轅轂輾轌轉轆轎轗轜"], 89 | ["e780","轢轣轤辜辟辣辭辯辷迚迥迢迪迯邇迴逅迹迺逑逕逡逍逞逖逋逧逶逵逹迸遏遐遑遒逎遉逾遖遘遞遨遯遶隨遲邂遽邁邀邊邉邏邨邯邱邵郢郤扈郛鄂鄒鄙鄲鄰酊酖酘酣酥酩酳酲醋醉醂醢醫醯醪醵醴醺釀釁釉釋釐釖釟釡釛釼釵釶鈞釿鈔鈬鈕鈑鉞鉗鉅鉉鉤鉈銕鈿鉋鉐銜銖銓銛鉚鋏銹銷鋩錏鋺鍄錮"], 90 | ["e840","錙錢錚錣錺錵錻鍜鍠鍼鍮鍖鎰鎬鎭鎔鎹鏖鏗鏨鏥鏘鏃鏝鏐鏈鏤鐚鐔鐓鐃鐇鐐鐶鐫鐵鐡鐺鑁鑒鑄鑛鑠鑢鑞鑪鈩鑰鑵鑷鑽鑚鑼鑾钁鑿閂閇閊閔閖閘閙"], 91 | 
["e880","閠閨閧閭閼閻閹閾闊濶闃闍闌闕闔闖關闡闥闢阡阨阮阯陂陌陏陋陷陜陞陝陟陦陲陬隍隘隕隗險隧隱隲隰隴隶隸隹雎雋雉雍襍雜霍雕雹霄霆霈霓霎霑霏霖霙霤霪霰霹霽霾靄靆靈靂靉靜靠靤靦靨勒靫靱靹鞅靼鞁靺鞆鞋鞏鞐鞜鞨鞦鞣鞳鞴韃韆韈韋韜韭齏韲竟韶韵頏頌頸頤頡頷頽顆顏顋顫顯顰"], 92 | ["e940","顱顴顳颪颯颱颶飄飃飆飩飫餃餉餒餔餘餡餝餞餤餠餬餮餽餾饂饉饅饐饋饑饒饌饕馗馘馥馭馮馼駟駛駝駘駑駭駮駱駲駻駸騁騏騅駢騙騫騷驅驂驀驃"], 93 | ["e980","騾驕驍驛驗驟驢驥驤驩驫驪骭骰骼髀髏髑髓體髞髟髢髣髦髯髫髮髴髱髷髻鬆鬘鬚鬟鬢鬣鬥鬧鬨鬩鬪鬮鬯鬲魄魃魏魍魎魑魘魴鮓鮃鮑鮖鮗鮟鮠鮨鮴鯀鯊鮹鯆鯏鯑鯒鯣鯢鯤鯔鯡鰺鯲鯱鯰鰕鰔鰉鰓鰌鰆鰈鰒鰊鰄鰮鰛鰥鰤鰡鰰鱇鰲鱆鰾鱚鱠鱧鱶鱸鳧鳬鳰鴉鴈鳫鴃鴆鴪鴦鶯鴣鴟鵄鴕鴒鵁鴿鴾鵆鵈"], 94 | ["ea40","鵝鵞鵤鵑鵐鵙鵲鶉鶇鶫鵯鵺鶚鶤鶩鶲鷄鷁鶻鶸鶺鷆鷏鷂鷙鷓鷸鷦鷭鷯鷽鸚鸛鸞鹵鹹鹽麁麈麋麌麒麕麑麝麥麩麸麪麭靡黌黎黏黐黔黜點黝黠黥黨黯"], 95 | ["ea80","黴黶黷黹黻黼黽鼇鼈皷鼕鼡鼬鼾齊齒齔齣齟齠齡齦齧齬齪齷齲齶龕龜龠堯槇遙瑤凜熙"], 96 | ["ed40","纊褜鍈銈蓜俉炻昱棈鋹曻彅丨仡仼伀伃伹佖侒侊侚侔俍偀倢俿倞偆偰偂傔僴僘兊兤冝冾凬刕劜劦勀勛匀匇匤卲厓厲叝﨎咜咊咩哿喆坙坥垬埈埇﨏"], 97 | ["ed80","塚增墲夋奓奛奝奣妤妺孖寀甯寘寬尞岦岺峵崧嵓﨑嵂嵭嶸嶹巐弡弴彧德忞恝悅悊惞惕愠惲愑愷愰憘戓抦揵摠撝擎敎昀昕昻昉昮昞昤晥晗晙晴晳暙暠暲暿曺朎朗杦枻桒柀栁桄棏﨓楨﨔榘槢樰橫橆橳橾櫢櫤毖氿汜沆汯泚洄涇浯涖涬淏淸淲淼渹湜渧渼溿澈澵濵瀅瀇瀨炅炫焏焄煜煆煇凞燁燾犱"], 98 | ["ee40","犾猤猪獷玽珉珖珣珒琇珵琦琪琩琮瑢璉璟甁畯皂皜皞皛皦益睆劯砡硎硤硺礰礼神祥禔福禛竑竧靖竫箞精絈絜綷綠緖繒罇羡羽茁荢荿菇菶葈蒴蕓蕙"], 99 | ["ee80","蕫﨟薰蘒﨡蠇裵訒訷詹誧誾諟諸諶譓譿賰賴贒赶﨣軏﨤逸遧郞都鄕鄧釚釗釞釭釮釤釥鈆鈐鈊鈺鉀鈼鉎鉙鉑鈹鉧銧鉷鉸鋧鋗鋙鋐﨧鋕鋠鋓錥錡鋻﨨錞鋿錝錂鍰鍗鎤鏆鏞鏸鐱鑅鑈閒隆﨩隝隯霳霻靃靍靏靑靕顗顥飯飼餧館馞驎髙髜魵魲鮏鮱鮻鰀鵰鵫鶴鸙黑"], 100 | ["eeef","ⅰ",9,"¬¦'""], 101 | ["f040","",62], 102 | ["f080","",124], 103 | ["f140","",62], 104 | ["f180","",124], 105 | ["f240","",62], 106 | ["f280","",124], 107 | ["f340","",62], 108 | ["f380","",124], 109 | ["f440","",62], 110 | ["f480","",124], 111 | ["f540","",62], 112 | ["f580","",124], 113 | ["f640","",62], 114 | ["f680","",124], 115 | ["f740","",62], 116 | ["f780","",124], 117 | ["f840","",62], 118 | ["f880","",124], 119 | ["f940",""], 120 | ["fa40","ⅰ",9,"Ⅰ",9,"¬¦'"㈱№℡∵纊褜鍈銈蓜俉炻昱棈鋹曻彅丨仡仼伀伃伹佖侒侊侚侔俍偀倢俿倞偆偰偂傔僴僘兊"], 121 | ["fa80","兤冝冾凬刕劜劦勀勛匀匇匤卲厓厲叝﨎咜咊咩哿喆坙坥垬埈埇﨏塚增墲夋奓奛奝奣妤妺孖寀甯寘寬尞岦岺峵崧嵓﨑嵂嵭嶸嶹巐弡弴彧德忞恝悅悊惞惕愠惲愑愷愰憘戓抦揵摠撝擎敎昀昕昻昉昮昞昤晥晗晙晴晳暙暠暲暿曺朎朗杦枻桒柀栁桄棏﨓楨﨔榘槢樰橫橆橳橾櫢櫤毖氿汜沆汯泚洄涇浯"], 122 | ["fb40","涖涬淏淸淲淼渹湜渧渼溿澈澵濵瀅瀇瀨炅炫焏焄煜煆煇凞燁燾犱犾猤猪獷玽珉珖珣珒琇珵琦琪琩琮瑢璉璟甁畯皂皜皞皛皦益睆劯砡硎硤硺礰礼神"], 123 | ["fb80","祥禔福禛竑竧靖竫箞精絈絜綷綠緖繒罇羡羽茁荢荿菇菶葈蒴蕓蕙蕫﨟薰蘒﨡蠇裵訒訷詹誧誾諟諸諶譓譿賰賴贒赶﨣軏﨤逸遧郞都鄕鄧釚釗釞釭釮釤釥鈆鈐鈊鈺鉀鈼鉎鉙鉑鈹鉧銧鉷鉸鋧鋗鋙鋐﨧鋕鋠鋓錥錡鋻﨨錞鋿錝錂鍰鍗鎤鏆鏞鏸鐱鑅鑈閒隆﨩隝隯霳霻靃靍靏靑靕顗顥飯飼餧館馞驎髙"], 124 | ["fc40","髜魵魲鮏鮱鮻鰀鵰鵫鶴鸙黑"] 125 | ] 126 | -------------------------------------------------------------------------------- /encodings/utf16.js: 
// --------------------------------------------------------------------------------
"use strict";
var Buffer = require("safer-buffer").Buffer;

// Note: UTF16-LE (or UCS2) codec is Node.js native. See encodings/internal.js

// == UTF16-BE codec. ==========================================================

// Codec registered as 'utf16be'. It holds no state itself: the encoder and decoder
// constructors and the bomAware flag are exposed through the prototype.
exports.utf16be = Utf16BECodec;
function Utf16BECodec() {
}

Utf16BECodec.prototype.encoder = Utf16BEEncoder;
Utf16BECodec.prototype.decoder = Utf16BEDecoder;
Utf16BECodec.prototype.bomAware = true;


// -- Encoding

// Stateless encoder: every write() call is converted independently.
function Utf16BEEncoder() {
}

// Encodes `str` as UTF-16LE ('ucs2') and then swaps the two bytes of every
// code unit in place, which yields the big-endian byte order.
Utf16BEEncoder.prototype.write = function(str) {
    var bytes = Buffer.from(str, 'ucs2');
    var pos = 0;
    while (pos < bytes.length) {
        var first = bytes[pos];
        bytes[pos] = bytes[pos + 1];
        bytes[pos + 1] = first;
        pos += 2;
    }
    return bytes;
};

// Nothing is buffered by the encoder, so there is nothing to flush.
Utf16BEEncoder.prototype.end = function() {
};


// -- Decoding

// Streaming decoder. `overflowByte` holds the first byte of a code unit whose
// second byte has not arrived yet, or -1 when no byte is pending.
function Utf16BEDecoder() {
    this.overflowByte = -1;
}

// Byte-swaps `buf` into a scratch buffer (prefixed by the pending byte, if any)
// and decodes the result as 'ucs2'. A trailing odd byte is kept for the next call.
Utf16BEDecoder.prototype.write = function(buf) {
    var inputLen = buf.length;
    if (inputLen == 0)
        return '';

    // One extra byte of room for the byte carried over from the previous chunk.
    var swapped = Buffer.alloc(inputLen + 1);
    var readPos = 0;
    var writePos = 0;

    if (this.overflowByte !== -1) {
        // Complete the split code unit: pending byte is the high byte, buf[0] the low one.
        swapped[0] = buf[0];
        swapped[1] = this.overflowByte;
        readPos = 1;
        writePos = 2;
    }

    while (readPos < inputLen - 1) {
        swapped[writePos] = buf[readPos + 1];
        swapped[writePos + 1] = buf[readPos];
        readPos += 2;
        writePos += 2;
    }

    // Exactly one unread byte left -> remember it; otherwise nothing is pending.
    this.overflowByte = (readPos == inputLen - 1) ? buf[inputLen - 1] : -1;

    return swapped.slice(0, writePos).toString('ucs2');
};

// Drops any pending half code unit; returns nothing.
Utf16BEDecoder.prototype.end = function() {
    this.overflowByte = -1;
};


// == UTF-16 codec =============================================================
// Decoder chooses automatically from UTF-16LE and UTF-16BE using BOM and space-based heuristic.
// Defaults to UTF-16LE, as it's prevalent and default in Node.
// http://en.wikipedia.org/wiki/UTF-16 and http://encoding.spec.whatwg.org/#utf-16le
// Decoder default can be changed: iconv.decode(buf, 'utf16', {defaultEncoding: 'utf-16be'});

// Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false).

exports.utf16 = Utf16Codec;
function Utf16Codec(codecOptions, iconv) {
    this.iconv = iconv;
}

Utf16Codec.prototype.encoder = Utf16Encoder;
Utf16Codec.prototype.decoder = Utf16Decoder;


// -- Encoding (pass-through)

// Wraps the 'utf-16le' encoder obtained from iconv, with addBOM defaulting to true.
function Utf16Encoder(options, codec) {
    // Apply the default to a shallow copy so the caller's options object is not
    // modified (a reused options object would otherwise carry addBOM: true along).
    var encoderOptions = {};
    for (var key in options)
        encoderOptions[key] = options[key];
    if (encoderOptions.addBOM === undefined)
        encoderOptions.addBOM = true;
    this.encoder = codec.iconv.getEncoder('utf-16le', encoderOptions);
}

Utf16Encoder.prototype.write = function(str) {
    return this.encoder.write(str);
}

Utf16Encoder.prototype.end = function() {
    return this.encoder.end();
}


// -- Decoding

// Buffers incoming chunks until enough bytes are available to pick the endianness,
// then delegates everything to the chosen decoder.
function Utf16Decoder(options, codec) {
    this.decoder = null;        // Concrete decoder, set once the encoding is detected.
    this.initialBufs = [];      // Chunks received before detection.
    this.initialBufsLen = 0;    // Total byte length of initialBufs.

    this.options = options || {};
    this.iconv = codec.iconv;
}

Utf16Decoder.prototype.write = function(buf) {
    if (!this.decoder) {
        // Codec is not chosen yet. Accumulate initial bytes.
        this.initialBufs.push(buf);
        this.initialBufsLen += buf.length;

        if (this.initialBufsLen < 16) // We need more bytes to use space heuristic (see below)
            return '';

        // We have enough bytes -> detect endianness.
        var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding);
        this.decoder = this.iconv.getDecoder(encoding, this.options);

        // Replay the buffered chunks through the chosen decoder.
        var resStr = '';
        for (var i = 0; i < this.initialBufs.length; i++)
            resStr += this.decoder.write(this.initialBufs[i]);

        this.initialBufs.length = this.initialBufsLen = 0;
        return resStr;
    }

    return this.decoder.write(buf);
}

Utf16Decoder.prototype.end = function() {
    if (!this.decoder) {
        // Stream ended before 16 bytes arrived: detect with whatever we have.
        var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding);
        this.decoder = this.iconv.getDecoder(encoding, this.options);

        var resStr = '';
        for (var i = 0; i < this.initialBufs.length; i++)
            resStr += this.decoder.write(this.initialBufs[i]);

        var trail = this.decoder.end();
        if (trail)
            resStr += trail;

        this.initialBufs.length = this.initialBufsLen = 0;
        return resStr;
    }
    return this.decoder.end();
}

// Returns 'utf-16le' or 'utf-16be' for the given list of buffers: a BOM in the first
// two bytes wins, otherwise the byte order under which more of the first 100 code
// units look like ASCII, otherwise `defaultEncoding` (or 'utf-16le').
function detectEncoding(bufs, defaultEncoding) {
    var b = [];
    var charsProcessed = 0;
    var asciiCharsLE = 0, asciiCharsBE = 0; // Number of ASCII chars when decoded as LE or BE.

    outer_loop:
    for (var i = 0; i < bufs.length; i++) {
        var buf = bufs[i];
        for (var j = 0; j < buf.length; j++) {
            b.push(buf[j]);
            if (b.length === 2) {
                if (charsProcessed === 0) {
                    // Check BOM first.
                    if (b[0] === 0xFF && b[1] === 0xFE) return 'utf-16le';
                    if (b[0] === 0xFE && b[1] === 0xFF) return 'utf-16be';
                }

                if (b[0] === 0 && b[1] !== 0) asciiCharsBE++;
                if (b[0] !== 0 && b[1] === 0) asciiCharsLE++;

                b.length = 0;
                charsProcessed++;

                if (charsProcessed >= 100) {
                    break outer_loop;
                }
            }
        }
    }

    // Make decisions.
    // Most of the time, the content has ASCII chars (U+00**), but the opposite (U+**00) is uncommon.
    // So, we count ASCII as if it was LE or BE, and decide from that.
    if (asciiCharsBE > asciiCharsLE) return 'utf-16be';
    if (asciiCharsBE < asciiCharsLE) return 'utf-16le';

    // Couldn't decide (likely all zeros or not enough data).
    return defaultEncoding || 'utf-16le';
}


// --------------------------------------------------------------------------------
// /encodings/utf32.js:
// --------------------------------------------------------------------------------
'use strict';

var Buffer = require('safer-buffer').Buffer;

// == UTF32-LE/BE codec. ==========================================================

exports._utf32 = Utf32Codec;

function Utf32Codec(codecOptions, iconv) {
    this.iconv = iconv;
    this.bomAware = true;
    this.isLE = codecOptions.isLE;
}

exports.utf32le = { type: '_utf32', isLE: true };
exports.utf32be = { type: '_utf32', isLE: false };

// Aliases
exports.ucs4le = 'utf32le';
exports.ucs4be = 'utf32be';

Utf32Codec.prototype.encoder = Utf32Encoder;
Utf32Codec.prototype.decoder = Utf32Decoder;

// -- Encoding

// Streaming encoder. `highSurrogate` holds a high surrogate that is still waiting
// for its low half (possibly across write() calls); 0 means nothing is pending.
function Utf32Encoder(options, codec) {
    this.isLE = codec.isLE;
    this.highSurrogate = 0;
}

Utf32Encoder.prototype.write = function(str) {
    var src = Buffer.from(str, 'ucs2');
    // Every UTF-16 code unit yields at most one 4-byte code point, plus at most one
    // more for a high surrogate left pending by the previous write(). Without the
    // extra 4 bytes, e.g. write('\uD83D') followed by write('a') wrote past the end
    // of the buffer and threw.
    var dst = Buffer.alloc(src.length * 2 + 4);
    var write32 = this.isLE ? dst.writeUInt32LE : dst.writeUInt32BE;
    var offset = 0;

    for (var i = 0; i < src.length; i += 2) {
        var code = src.readUInt16LE(i);
        var isHighSurrogate = (0xD800 <= code && code < 0xDC00);
        var isLowSurrogate = (0xDC00 <= code && code < 0xE000);

        if (this.highSurrogate) {
            if (isHighSurrogate || !isLowSurrogate) {
                // There shouldn't be two high surrogates in a row, nor a high surrogate which isn't followed by a low
                // surrogate. If this happens, keep the pending high surrogate as a stand-alone semi-invalid character
                // (technically wrong, but expected by some applications, like Windows file names).
                write32.call(dst, this.highSurrogate, offset);
                offset += 4;
            }
            else {
                // Create 32-bit value from high and low surrogates;
                var codepoint = (((this.highSurrogate - 0xD800) << 10) | (code - 0xDC00)) + 0x10000;

                write32.call(dst, codepoint, offset);
                offset += 4;
                this.highSurrogate = 0;

                continue;
            }
        }

        if (isHighSurrogate)
            this.highSurrogate = code;
        else {
            // Even if the current character is a low surrogate, with no previous high surrogate, we'll
            // encode it as a semi-invalid stand-alone character for the same reasons expressed above for
            // unpaired high surrogates.
            write32.call(dst, code, offset);
            offset += 4;
            this.highSurrogate = 0;
        }
    }

    if (offset < dst.length)
        dst = dst.slice(0, offset);

    return dst;
};

Utf32Encoder.prototype.end = function() {
    // Treat any leftover high surrogate as a semi-valid independent character.
    if (!this.highSurrogate)
        return;

    var buf = Buffer.alloc(4);

    if (this.isLE)
        buf.writeUInt32LE(this.highSurrogate, 0);
    else
        buf.writeUInt32BE(this.highSurrogate, 0);

    this.highSurrogate = 0;

    return buf;
};

// -- Decoding

// Streaming decoder. `overflow` collects the bytes (at most 3 between calls) of a
// code point that was split across chunks; `badChar` replaces invalid code points.
function Utf32Decoder(options, codec) {
    this.isLE = codec.isLE;
    this.badChar = codec.iconv.defaultCharUnicode.charCodeAt(0);
    this.overflow = [];
}

Utf32Decoder.prototype.write = function(src) {
    if (src.length === 0)
        return '';

    var i = 0;
    var codepoint = 0;
    // Each 4 input bytes produce at most 4 output bytes (a surrogate pair); the extra
    // 4 bytes cover the code point completed from the carried-over bytes.
    var dst = Buffer.alloc(src.length + 4);
    var offset = 0;
    var isLE = this.isLE;
    var overflow = this.overflow;
    var badChar = this.badChar;

    if (overflow.length > 0) {
        // Top the carried-over bytes up to a full code point using the head of `src`.
        for (; i < src.length && overflow.length < 4; i++)
            overflow.push(src[i]);

        if (overflow.length === 4) {
            // NOTE: codepoint is a signed int32 and can be negative.
            // NOTE: We copied this block from below to help V8 optimize it (it works with array, not buffer).
            // `overflow` is its own 4-element array, so it is indexed 0..3 here; `i` is the
            // read position in `src` and must not be used as an index into it.
            if (isLE) {
                codepoint = overflow[0] | (overflow[1] << 8) | (overflow[2] << 16) | (overflow[3] << 24);
            } else {
                codepoint = overflow[3] | (overflow[2] << 8) | (overflow[1] << 16) | (overflow[0] << 24);
            }
            overflow.length = 0;

            offset = _writeCodepoint(dst, offset, codepoint, badChar);
        }
    }

    // Main loop. Should be as optimized as possible.
    for (; i < src.length - 3; i += 4) {
        // NOTE: codepoint is a signed int32 and can be negative.
        if (isLE) {
            codepoint = src[i] | (src[i+1] << 8) | (src[i+2] << 16) | (src[i+3] << 24);
        } else {
            codepoint = src[i+3] | (src[i+2] << 8) | (src[i+1] << 16) | (src[i] << 24);
        }
        offset = _writeCodepoint(dst, offset, codepoint, badChar);
    }

    // Keep overflowing bytes.
    for (; i < src.length; i++) {
        overflow.push(src[i]);
    }

    return dst.slice(0, offset).toString('ucs2');
};

// Writes `codepoint` into `dst` at `offset` as little-endian UTF-16 (one code unit, or a
// surrogate pair for values >= 0x10000) and returns the new offset. Values outside
// 0..0x10FFFF are replaced by `badChar`.
function _writeCodepoint(dst, offset, codepoint, badChar) {
    // NOTE: codepoint is signed int32 and can be negative. We keep it that way to help V8 with optimizations.
    if (codepoint < 0 || codepoint > 0x10FFFF) {
        // Not a valid Unicode codepoint
        codepoint = badChar;
    }

    // Supplementary planes: write high surrogate.
    if (codepoint >= 0x10000) {
        codepoint -= 0x10000;

        var high = 0xD800 | (codepoint >> 10);
        dst[offset++] = high & 0xff;
        dst[offset++] = high >> 8;

        // Low surrogate is written below.
        codepoint = 0xDC00 | (codepoint & 0x3FF);
    }

    // Write BMP char or low surrogate.
    dst[offset++] = codepoint & 0xff;
    dst[offset++] = codepoint >> 8;

    return offset;
}

// Discards any incomplete trailing code point; returns nothing.
Utf32Decoder.prototype.end = function() {
    this.overflow.length = 0;
};

// == UTF-32 Auto codec =============================================================
// Decoder chooses automatically from UTF-32LE and UTF-32BE using BOM and space-based heuristic.
// Defaults to UTF-32LE. http://en.wikipedia.org/wiki/UTF-32
// Encoder/decoder default can be changed: iconv.decode(buf, 'utf32', {defaultEncoding: 'utf-32be'});

// Encoder prepends BOM (which can be overridden with {addBOM: false}).

exports.utf32 = Utf32AutoCodec;
exports.ucs4 = 'utf32';

function Utf32AutoCodec(options, iconv) {
    this.iconv = iconv;
}

Utf32AutoCodec.prototype.encoder = Utf32AutoEncoder;
Utf32AutoCodec.prototype.decoder = Utf32AutoDecoder;

// -- Encoding

// Wraps the encoder for options.defaultEncoding (or 'utf-32le'), with addBOM defaulting to true.
function Utf32AutoEncoder(options, codec) {
    // Apply the default to a shallow copy so the caller's options object is not
    // modified (a reused options object would otherwise carry addBOM: true along).
    var encoderOptions = {};
    for (var key in options)
        encoderOptions[key] = options[key];

    if (encoderOptions.addBOM === undefined)
        encoderOptions.addBOM = true;

    this.encoder = codec.iconv.getEncoder(encoderOptions.defaultEncoding || 'utf-32le', encoderOptions);
}

Utf32AutoEncoder.prototype.write = function(str) {
    return this.encoder.write(str);
};

Utf32AutoEncoder.prototype.end = function() {
    return this.encoder.end();
};

// -- Decoding

// Buffers incoming chunks until enough bytes are available to pick the endianness,
// then delegates everything to the chosen decoder.
function Utf32AutoDecoder(options, codec) {
    this.decoder = null;        // Concrete decoder, set once the encoding is detected.
    this.initialBufs = [];      // Chunks received before detection.
    this.initialBufsLen = 0;    // Total byte length of initialBufs.
    this.options = options || {};
    this.iconv = codec.iconv;
}

Utf32AutoDecoder.prototype.write = function(buf) {
    if (!this.decoder) {
        // Codec is not chosen yet. Accumulate initial bytes.
        this.initialBufs.push(buf);
        this.initialBufsLen += buf.length;

        if (this.initialBufsLen < 32) // We need more bytes to use space heuristic (see below)
            return '';

        // We have enough bytes -> detect endianness.
        var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding);
        this.decoder = this.iconv.getDecoder(encoding, this.options);

        // Replay the buffered chunks through the chosen decoder.
        var resStr = '';
        for (var i = 0; i < this.initialBufs.length; i++)
            resStr += this.decoder.write(this.initialBufs[i]);

        this.initialBufs.length = this.initialBufsLen = 0;
        return resStr;
    }

    return this.decoder.write(buf);
};

Utf32AutoDecoder.prototype.end = function() {
    if (!this.decoder) {
        // Stream ended before 32 bytes arrived: detect with whatever we have.
        var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding);
        this.decoder = this.iconv.getDecoder(encoding, this.options);

        var resStr = '';
        for (var i = 0; i < this.initialBufs.length; i++)
            resStr += this.decoder.write(this.initialBufs[i]);

        var trail = this.decoder.end();
        if (trail)
            resStr += trail;

        this.initialBufs.length = this.initialBufsLen = 0;
        return resStr;
    }

    return this.decoder.end();
};

// Returns 'utf-32le' or 'utf-32be' for the given list of buffers: a BOM in the first
// four bytes wins, otherwise the byte order that scores better (BMP-looking minus
// invalid-looking code points) over the first 100 code points, otherwise
// `defaultEncoding` (or 'utf-32le').
function detectEncoding(bufs, defaultEncoding) {
    var b = [];
    var charsProcessed = 0;
    var invalidLE = 0, invalidBE = 0;   // Number of invalid chars when decoded as LE or BE.
    var bmpCharsLE = 0, bmpCharsBE = 0; // Number of BMP chars when decoded as LE or BE.

    outer_loop:
    for (var i = 0; i < bufs.length; i++) {
        var buf = bufs[i];
        for (var j = 0; j < buf.length; j++) {
            b.push(buf[j]);
            if (b.length === 4) {
                if (charsProcessed === 0) {
                    // Check BOM first.
                    if (b[0] === 0xFF && b[1] === 0xFE && b[2] === 0 && b[3] === 0) {
                        return 'utf-32le';
                    }
                    if (b[0] === 0 && b[1] === 0 && b[2] === 0xFE && b[3] === 0xFF) {
                        return 'utf-32be';
                    }
                }

                if (b[0] !== 0 || b[1] > 0x10) invalidBE++;
                if (b[3] !== 0 || b[2] > 0x10) invalidLE++;

                if (b[0] === 0 && b[1] === 0 && (b[2] !== 0 || b[3] !== 0)) bmpCharsBE++;
                if ((b[0] !== 0 || b[1] !== 0) && b[2] === 0 && b[3] === 0) bmpCharsLE++;

                b.length = 0;
                charsProcessed++;

                if (charsProcessed >= 100) {
                    break outer_loop;
                }
            }
        }
    }

    // Make decisions.
    if (bmpCharsBE - invalidBE > bmpCharsLE - invalidLE) return 'utf-32be';
    if (bmpCharsBE - invalidBE < bmpCharsLE - invalidLE) return 'utf-32le';

    // Couldn't decide (likely all zeros or not enough data).
    return defaultEncoding || 'utf-32le';
}

// --------------------------------------------------------------------------------
// /encodings/utf7.js:
// --------------------------------------------------------------------------------
"use strict";
var Buffer = require("safer-buffer").Buffer;

// UTF-7 codec, according to https://tools.ietf.org/html/rfc2152
// See also below a UTF-7-IMAP codec, according to http://tools.ietf.org/html/rfc3501#section-5.1.3

exports.utf7 = Utf7Codec;
exports.unicode11utf7 = 'utf7'; // Alias UNICODE-1-1-UTF-7
function Utf7Codec(codecOptions, iconv) {
    this.iconv = iconv;
}

Utf7Codec.prototype.encoder = Utf7Encoder;
Utf7Codec.prototype.decoder = Utf7Decoder;
Utf7Codec.prototype.bomAware = true;


// -- Encoding

// Matches maximal runs of characters that are not written directly.
var nonDirectChars = /[^A-Za-z0-9'\(\),-\.\/:\? \n\r\t]+/g;

function Utf7Encoder(options, codec) {
    this.iconv = codec.iconv;
}

Utf7Encoder.prototype.write = function(str) {
    // Naive implementation.
    // A run of non-direct chars is encoded as "+", then the base64 of its UTF-16BE bytes
    // (padding stripped), then "-"; a single "+" char is encoded as "+-".
    return Buffer.from(str.replace(nonDirectChars, function(chunk) {
        return "+" + (chunk === '+' ? '' :
            this.iconv.encode(chunk, 'utf16-be').toString('base64').replace(/=+$/, ''))
            + "-";
    }.bind(this)));
}

Utf7Encoder.prototype.end = function() {
}


// -- Decoding

// Streaming decoder. `inBase64` tells whether the previous chunk ended inside a
// base64 section; `base64Accum` holds base64 chars of that section not yet decoded.
function Utf7Decoder(options, codec) {
    this.iconv = codec.iconv;
    this.inBase64 = false;
    this.base64Accum = '';
}

// Lookup table: base64Chars[byte] is true for bytes that belong to the base64 alphabet.
var base64Regex = /[A-Za-z0-9\/+]/;
var base64Chars = [];
for (var i = 0; i < 256; i++)
    base64Chars[i] = base64Regex.test(String.fromCharCode(i));

var plusChar = '+'.charCodeAt(0),
    minusChar = '-'.charCodeAt(0),
    andChar = '&'.charCodeAt(0);

Utf7Decoder.prototype.write = function(buf) {
    var res = "", lastI = 0,
        inBase64 = this.inBase64,
        base64Accum = this.base64Accum;

    // The decoder is more involved as we must handle chunks in stream.

    for (var i = 0; i < buf.length; i++) {
        if (!inBase64) { // We're in direct mode.
            // Write direct chars until '+'
            if (buf[i] == plusChar) {
                res += this.iconv.decode(buf.slice(lastI, i), "ascii"); // Write direct chars.
                lastI = i+1;
                inBase64 = true;
            }
        } else { // We decode base64.
            if (!base64Chars[buf[i]]) { // Base64 ended.
                // "+-" -> "+", but only when no base64 data was seen since the '+'.
                // base64Accum is non-empty whenever an earlier chunk contributed base64
                // chars to this section (see the hold-back below), so a '-' arriving at
                // the start of a chunk is not mistaken for the "+-" escape.
                if (i == lastI && base64Accum.length === 0 && buf[i] == minusChar) {
                    res += "+";
                } else {
                    var b64str = base64Accum + this.iconv.decode(buf.slice(lastI, i), "ascii");
                    res += this.iconv.decode(Buffer.from(b64str, 'base64'), "utf16-be");
                }

                if (buf[i] != minusChar) // Minus is absorbed after base64.
                    i--;

                lastI = i+1;
                inBase64 = false;
                base64Accum = '';
            }
        }
    }

    if (!inBase64) {
        res += this.iconv.decode(buf.slice(lastI), "ascii"); // Write direct chars.
    } else {
        var b64str = base64Accum + this.iconv.decode(buf.slice(lastI), "ascii");

        var canBeDecoded = b64str.length - (b64str.length % 8); // Minimal chunk: 2 quads -> 2x3 bytes -> 3 chars.
        // Hold back the last full group when everything would be decodable, so that
        // base64Accum stays non-empty while a base64 section with data is open.
        if (canBeDecoded > 0 && canBeDecoded === b64str.length)
            canBeDecoded -= 8;
        base64Accum = b64str.slice(canBeDecoded); // The rest will be decoded in future.
        b64str = b64str.slice(0, canBeDecoded);

        res += this.iconv.decode(Buffer.from(b64str, 'base64'), "utf16-be");
    }

    this.inBase64 = inBase64;
    this.base64Accum = base64Accum;

    return res;
}

// Decodes whatever base64 is still buffered and resets the decoder state.
Utf7Decoder.prototype.end = function() {
    var res = "";
    if (this.inBase64 && this.base64Accum.length > 0)
        res = this.iconv.decode(Buffer.from(this.base64Accum, 'base64'), "utf16-be");

    this.inBase64 = false;
    this.base64Accum = '';
    return res;
}


// UTF-7-IMAP codec.
// RFC3501 Sec. 5.1.3 Modified UTF-7 (http://tools.ietf.org/html/rfc3501#section-5.1.3)
// Differences:
//  * Base64 part is started by "&" instead of "+"
//  * Direct characters are 0x20-0x7E, except "&" (0x26)
//  * In Base64, "," is used instead of "/"
//  * Base64 must not be used to represent direct characters.
//  * No implicit shift back from Base64 (should always end with '-')
//  * String must end in non-shifted position.
//  * "-&" while in base64 is not allowed.
130 | 131 | 132 | exports.utf7imap = Utf7IMAPCodec; 133 | function Utf7IMAPCodec(codecOptions, iconv) { 134 | this.iconv = iconv; 135 | }; 136 | 137 | Utf7IMAPCodec.prototype.encoder = Utf7IMAPEncoder; 138 | Utf7IMAPCodec.prototype.decoder = Utf7IMAPDecoder; 139 | Utf7IMAPCodec.prototype.bomAware = true; 140 | 141 | 142 | // -- Encoding 143 | 144 | function Utf7IMAPEncoder(options, codec) { 145 | this.iconv = codec.iconv; 146 | this.inBase64 = false; 147 | this.base64Accum = Buffer.alloc(6); 148 | this.base64AccumIdx = 0; 149 | } 150 | 151 | Utf7IMAPEncoder.prototype.write = function(str) { 152 | var inBase64 = this.inBase64, 153 | base64Accum = this.base64Accum, 154 | base64AccumIdx = this.base64AccumIdx, 155 | buf = Buffer.alloc(str.length*5 + 10), bufIdx = 0; 156 | 157 | for (var i = 0; i < str.length; i++) { 158 | var uChar = str.charCodeAt(i); 159 | if (0x20 <= uChar && uChar <= 0x7E) { // Direct character or '&'. 160 | if (inBase64) { 161 | if (base64AccumIdx > 0) { 162 | bufIdx += buf.write(base64Accum.slice(0, base64AccumIdx).toString('base64').replace(/\//g, ',').replace(/=+$/, ''), bufIdx); 163 | base64AccumIdx = 0; 164 | } 165 | 166 | buf[bufIdx++] = minusChar; // Write '-', then go to direct mode. 167 | inBase64 = false; 168 | } 169 | 170 | if (!inBase64) { 171 | buf[bufIdx++] = uChar; // Write direct character 172 | 173 | if (uChar === andChar) // Ampersand -> '&-' 174 | buf[bufIdx++] = minusChar; 175 | } 176 | 177 | } else { // Non-direct character 178 | if (!inBase64) { 179 | buf[bufIdx++] = andChar; // Write '&', then go to base64 mode. 
180 | inBase64 = true; 181 | } 182 | if (inBase64) { 183 | base64Accum[base64AccumIdx++] = uChar >> 8; 184 | base64Accum[base64AccumIdx++] = uChar & 0xFF; 185 | 186 | if (base64AccumIdx == base64Accum.length) { 187 | bufIdx += buf.write(base64Accum.toString('base64').replace(/\//g, ','), bufIdx); 188 | base64AccumIdx = 0; 189 | } 190 | } 191 | } 192 | } 193 | 194 | this.inBase64 = inBase64; 195 | this.base64AccumIdx = base64AccumIdx; 196 | 197 | return buf.slice(0, bufIdx); 198 | } 199 | 200 | Utf7IMAPEncoder.prototype.end = function() { 201 | var buf = Buffer.alloc(10), bufIdx = 0; 202 | if (this.inBase64) { 203 | if (this.base64AccumIdx > 0) { 204 | bufIdx += buf.write(this.base64Accum.slice(0, this.base64AccumIdx).toString('base64').replace(/\//g, ',').replace(/=+$/, ''), bufIdx); 205 | this.base64AccumIdx = 0; 206 | } 207 | 208 | buf[bufIdx++] = minusChar; // Write '-', then go to direct mode. 209 | this.inBase64 = false; 210 | } 211 | 212 | return buf.slice(0, bufIdx); 213 | } 214 | 215 | 216 | // -- Decoding 217 | 218 | function Utf7IMAPDecoder(options, codec) { 219 | this.iconv = codec.iconv; 220 | this.inBase64 = false; 221 | this.base64Accum = ''; 222 | } 223 | 224 | var base64IMAPChars = base64Chars.slice(); 225 | base64IMAPChars[','.charCodeAt(0)] = true; 226 | 227 | Utf7IMAPDecoder.prototype.write = function(buf) { 228 | var res = "", lastI = 0, 229 | inBase64 = this.inBase64, 230 | base64Accum = this.base64Accum; 231 | 232 | // The decoder is more involved as we must handle chunks in stream. 233 | // It is forgiving, closer to standard UTF-7 (for example, '-' is optional at the end). 234 | 235 | for (var i = 0; i < buf.length; i++) { 236 | if (!inBase64) { // We're in direct mode. 237 | // Write direct chars until '&' 238 | if (buf[i] == andChar) { 239 | res += this.iconv.decode(buf.slice(lastI, i), "ascii"); // Write direct chars. 240 | lastI = i+1; 241 | inBase64 = true; 242 | } 243 | } else { // We decode base64. 
244 | if (!base64IMAPChars[buf[i]]) { // Base64 ended. 245 | if (i == lastI && buf[i] == minusChar) { // "&-" -> "&" 246 | res += "&"; 247 | } else { 248 | var b64str = base64Accum + this.iconv.decode(buf.slice(lastI, i), "ascii").replace(/,/g, '/'); 249 | res += this.iconv.decode(Buffer.from(b64str, 'base64'), "utf16-be"); 250 | } 251 | 252 | if (buf[i] != minusChar) // Minus may be absorbed after base64. 253 | i--; 254 | 255 | lastI = i+1; 256 | inBase64 = false; 257 | base64Accum = ''; 258 | } 259 | } 260 | } 261 | 262 | if (!inBase64) { 263 | res += this.iconv.decode(buf.slice(lastI), "ascii"); // Write direct chars. 264 | } else { 265 | var b64str = base64Accum + this.iconv.decode(buf.slice(lastI), "ascii").replace(/,/g, '/'); 266 | 267 | var canBeDecoded = b64str.length - (b64str.length % 8); // Minimal chunk: 2 quads -> 2x3 bytes -> 3 chars. 268 | base64Accum = b64str.slice(canBeDecoded); // The rest will be decoded in future. 269 | b64str = b64str.slice(0, canBeDecoded); 270 | 271 | res += this.iconv.decode(Buffer.from(b64str, 'base64'), "utf16-be"); 272 | } 273 | 274 | this.inBase64 = inBase64; 275 | this.base64Accum = base64Accum; 276 | 277 | return res; 278 | } 279 | 280 | Utf7IMAPDecoder.prototype.end = function() { 281 | var res = ""; 282 | if (this.inBase64 && this.base64Accum.length > 0) 283 | res = this.iconv.decode(Buffer.from(this.base64Accum, 'base64'), "utf16-be"); 284 | 285 | this.inBase64 = false; 286 | this.base64Accum = ''; 287 | return res; 288 | } 289 | 290 | 291 | -------------------------------------------------------------------------------- /generation/gen-dbcs.js: -------------------------------------------------------------------------------- 1 | var utils = require("./utils"), 2 | errTo = require("errto"), 3 | async = require("async"); 4 | 5 | async.parallel({ 6 | $big5: utils.getFile.bind(null, "http://encoding.spec.whatwg.org/index-big5.txt"), // Encodings with $ are not saved. They are used to calculate other encs. 
7 | $gbk: utils.getFile.bind(null, "http://encoding.spec.whatwg.org/index-gb18030.txt"), 8 | $gbRanges: utils.getFile.bind(null, "http://encoding.spec.whatwg.org/index-gb18030-ranges.txt"), 9 | $eucKr: utils.getFile.bind(null, "http://encoding.spec.whatwg.org/index-euc-kr.txt"), 10 | $jis0208: utils.getFile.bind(null, "http://encoding.spec.whatwg.org/index-jis0208.txt"), 11 | $jis0212: utils.getFile.bind(null, "http://encoding.spec.whatwg.org/index-jis0212.txt"), 12 | $cp932: utils.getFile.bind(null, "http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT"), 13 | cp936: utils.getFile.bind(null, "http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT"), 14 | cp949: utils.getFile.bind(null, "http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT"), 15 | cp950: utils.getFile.bind(null, "http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT"), 16 | }, errTo(console.log, function(data) { 17 | // First, parse all files. 18 | for (var enc in data) { 19 | var dbcs = {}; 20 | utils.parseText(data[enc]).map(function(a) { 21 | var dbcsCode = parseInt(a[0]); 22 | var unicode = parseInt(a[1]); 23 | if (!isNaN(unicode)) 24 | dbcs[dbcsCode] = unicode; 25 | }); 26 | data[enc] = dbcs; 27 | } 28 | 29 | // Calculate difference between big5 and cp950, and write it to a file. 30 | // See http://encoding.spec.whatwg.org/#big5 31 | var big5add = {} 32 | for (var i = 0x8100; i < 0x10000; i++) { // Lead byte is 0x81 .. 0xFE 33 | var trail = i & 0xFF; 34 | if (trail < 0x40 || (0x7E < trail && trail < 0xA1) || trail > 0xFE) continue; 35 | var lead = i >> 8; 36 | var offset = (trail < 0x7F) ? 
0x40 : 0x62; 37 | var pointer = (lead - 0x81) * 157 + (trail - offset); 38 | var cpChar = data.cp950[i]; 39 | var big5Char = data.$big5[pointer]; 40 | if (big5Char !== undefined && cpChar != big5Char) 41 | big5add[i] = big5Char; 42 | } 43 | 44 | // Calculate HKSCS codes that are duplicates of big5 codes and need to be skipped when encoding. 45 | console.log("Duplicate HKSCS codes that need to be skipped when encoded (see encodeSkipVals in big5hkscs): ") 46 | var big5codes = {}; 47 | for (var i = 0xA100; i < 0x10000; i++) { 48 | var uCharCode = (big5add[i] !== undefined) ? big5add[i] : data.cp950[i]; 49 | if (uCharCode !== undefined) { 50 | big5codes[uCharCode] = true; 51 | } 52 | } 53 | for (var i = 0x8100; i < 0xA100; i++) { 54 | var uCharCode = (big5add[i] !== undefined) ? big5add[i] : data.cp950[i]; 55 | if (uCharCode !== undefined && big5codes[uCharCode]) { 56 | console.log("0x"+i.toString(16)); 57 | } 58 | } 59 | 60 | if (big5Char !== undefined) { 61 | if (lead < 0xA1) { 62 | if (d[big5Char] !== undefined) { 63 | console.log("duplicate in first: "+ pointer + " char " + big5Char); 64 | } 65 | d[big5Char] = i; 66 | } else if (d[big5Char] !== undefined) { 67 | console.log("dup 0x"+d[big5Char].toString(16) + " -> " + i.toString(16)) 68 | } 69 | 70 | } 71 | 72 | // Add char sequences that are not in the index file (as given in http://encoding.spec.whatwg.org/#big5-decoder) 73 | function toIdx(pointer) { var trail = pointer % 157; var lead = Math.floor(pointer / 157) + 0x81; return (lead << 8) + (trail + (trail < 0x3F ? 0x40 : 0x62))} 74 | big5add[toIdx(1133)] = [0x00CA, 0x0304]; 75 | big5add[toIdx(1135)] = [0x00CA, 0x030C]; 76 | big5add[toIdx(1164)] = [0x00EA, 0x0304]; 77 | big5add[toIdx(1166)] = [0x00EA, 0x030C]; 78 | 79 | utils.writeTable("big5-added", utils.generateTable(big5add)); 80 | 81 | // Calculate difference between GB18030 encoding and cp936. 
82 | // See http://encoding.spec.whatwg.org/#gb18030-encoder 83 | var gbkadd = {} 84 | for (var i = 0x8100; i < 0x10000; i++) { // Lead byte is 0x81 .. 0xFE 85 | var trail = i & 0xFF; 86 | if (trail < 0x40 || trail === 0x7F || trail > 0xFE) continue; 87 | var lead = i >> 8; 88 | var offset = (trail < 0x7F) ? 0x40 : 0x41; 89 | var gbAddr = (lead - 0x81) * 190 + (trail - offset); 90 | var cpChar = data.cp936[i]; 91 | var gbChar = data.$gbk[gbAddr]; 92 | if ((cpChar !== undefined) && (cpChar != gbChar)) 93 | console.log("Dont match: ", i.toString(16), gbAddr.toString(16), gbChar, cpChar); 94 | 95 | if (gbChar !== undefined && cpChar != gbChar) 96 | gbkadd[i] = gbChar; 97 | } 98 | 99 | // GB18030:2005 addition 100 | gbk2005add = [['8135f437', '']]; 101 | 102 | utils.writeTable("gbk-added", utils.generateTable(gbkadd).concat(gbk2005add)); 103 | 104 | // Write GB18030 ranges 105 | var ranges = { uChars: [], gbChars: [] }; 106 | for (var k in data.$gbRanges) { 107 | ranges.uChars.push(data.$gbRanges[k]); 108 | ranges.gbChars.push(+k); 109 | } 110 | utils.writeFile("gb18030-ranges", JSON.stringify(ranges)); 111 | 112 | 113 | // Use http://encoding.spec.whatwg.org/#shift_jis-decoder 114 | var shiftjis = {}; 115 | for (var i = 0; i <= 0x80; i++) 116 | shiftjis[i] = i; 117 | for (var i = 0xA1; i <= 0xDF; i++) 118 | shiftjis[i] = 0xFF61 + i - 0xA1; 119 | 120 | for (var lead = 0x81; lead < 0xFF; lead++) 121 | if (lead < 0xA1 || lead > 0xDF) 122 | for (var byte = 0; byte < 0xFF; byte++) { 123 | var offset = (byte < 0x7F) ? 0x40 : 0x41; 124 | var leadOffset = (lead < 0xA0) ? 
0x81 : 0xC1; 125 | if ((0x40 <= byte && byte <= 0x7E) || (0x80 <= byte && byte <= 0xFC)) { 126 | var pointer = (lead - leadOffset) * 188 + byte - offset; 127 | if (data.$jis0208[pointer]) 128 | shiftjis[(lead << 8) + byte] = data.$jis0208[pointer]; 129 | else if (8836 <= pointer && pointer <= 10528) 130 | shiftjis[(lead << 8) + byte] = 0xE000 + pointer - 8836; // Interoperable legacy from Windows known as EUDC 131 | } 132 | } 133 | 134 | utils.writeTable("shiftjis", utils.generateTable(shiftjis)); 135 | 136 | // Fill out EUC-JP table according to http://encoding.spec.whatwg.org/#euc-jp 137 | var eucJp = {}; 138 | for (var i = 0; i < 0x80; i++) 139 | eucJp[i] = i; 140 | for (var i = 0xA1; i <= 0xDF; i++) 141 | eucJp[(0x8E << 8) + i] = 0xFF61 + i - 0xA1; 142 | for (var i = 0xA1; i <= 0xFE; i++) 143 | for (var j = 0xA1; j <= 0xFE; j++) { 144 | eucJp[ (i << 8) + j] = data.$jis0208[(i - 0xA1) * 94 + (j - 0xA1)]; 145 | eucJp[(0x8F << 16) + (i << 8) + j] = data.$jis0212[(i - 0xA1) * 94 + (j - 0xA1)]; 146 | } 147 | 148 | utils.writeTable("eucjp", utils.generateTable(eucJp, 3)); 149 | 150 | 151 | // Fill out EUC-KR Table and check that it is the same as cp949. 152 | var eucKr = {}; 153 | for (var i = 0; i < 0x80; i++) 154 | eucKr[i] = i; 155 | for (var i = 0x8100; i < 0xFF00; i++) { 156 | var lead = i >> 8, byte = i & 0xFF, ptr = null, t; 157 | if (0x41 <= byte && byte <= 0xFE) 158 | ptr = (lead-0x81) * 190 + (byte-0x41); 159 | if (ptr !== null) 160 | eucKr[i] = data.$eucKr[ptr]; 161 | 162 | // Compare with cp949 163 | if (data.cp949[i] !== eucKr[i]) 164 | console.log("Warning: EUC-KR from Encoding Standard doesn't match with CP949 from Unicode.com: ", i, data.cp949[i], eucKr[i]); 165 | } 166 | 167 | 168 | // Write all plain tables as-is. 
var fs = require("fs");
var path = require("path");
var Iconv = require("iconv").Iconv;
var Buffer = require("safer-buffer").Buffer;

// Module (relative to the repository root) that receives the generated tables.
// Every table is produced by running the encoding through the original iconv.
var destFileName = "encodings/sbcs-data-generated.js";


// Families of single-byte encodings to generate. An entry of `encodings` is
// either the iconv name itself (a string) or a raw id that the family's
// `convert` turns into {name: <iconv name>, aliases: [<extra keys>]}.
var encodingFamilies = [
    {
        // Windows code pages http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/ (+932, 936, 949, 950)
        encodings: [874, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258],
        convert: function(cp) {
            return {
                name: "windows-" + cp,
                aliases: ["win" + cp, "cp" + cp, "" + cp],
            };
        }
    },
    {
        // ISO-8859 code pages http://www.unicode.org/Public/MAPPINGS/ISO8859/
        encodings: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16],
        convert: function(i) {
            return {
                name: "iso-8859-" + i,
                aliases: ["cp" + (28590 + i), (28590 + i)],
            };
        }
    },
    {
        // IBM/DOS code pages http://www-01.ibm.com/software/globalization/cp/cp_cpgid.html http://download.boulder.ibm.com/ibmdl/pub/software/dw/java/cdctables.zip
        // GCGID <-> GCUID (unicode) http://www-01.ibm.com/software/globalization/gcgid/gcgid.html
        encodings: [437, 737, 775, 850, 852, 855, 856, 857, 858, 860, 861, 862, 863, 864, 865, 866, 869,
                    922, 1046, 1124, 1125, 1129, 1133, 1161, 1162, 1163],
        convert: function(cp) {
            return {
                name: "CP" + cp,
                aliases: ["ibm" + cp, "csibm" + cp, "" + cp],
            };
        }
    },
    {
        // Macintosh code pages http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/
        encodings: ["macCroatian", "macCyrillic", "macGreek",
                    "macIceland", "macRoman", "macRomania",
                    "macThai", "macTurkish", "macUkraine"],
    },
    {
        // Additional code pages http://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/ and others.
        encodings: ["KOI8-R", "KOI8-U", "KOI8-RU", "KOI8-T", "ARMSCII-8", "RK1048", "TCVN",
                    "GEORGIAN-ACADEMY", "GEORGIAN-PS", "PT154", "VISCII", "ISO646-CN", "ISO646-JP",
                    "HP-ROMAN8", "MACINTOSH", "ASCII", "TIS620"],
    },
];


// Result map: normalized encoding name -> {type, chars}, alias -> normalized name.
var encodings = {};

// Stores the generated table for one iconv encoding under its normalized name
// (dashes/underscores removed, lower-cased) and points each alias at that name.
function registerEncoding(iconvName, aliases) {
    var normalizedName = iconvName.replace(/[-_]/g, "").toLowerCase();

    encodings[normalizedName] = {
        type: "_sbcs",
        chars: generateCharsString(iconvName)
    };

    for (var a = 0; aliases && a < aliases.length; a++)
        encodings[aliases[a]] = normalizedName;
}

// Walk all families in declaration order so the output keeps a stable key order.
for (var f = 0; f < encodingFamilies.length; f++) {
    var family = encodingFamilies[f];
    for (var e = 0; e < family.encodings.length; e++) {
        var descriptor = family.convert ? family.convert(family.encodings[e]) : family.encodings[e];
        // Plain string entries have no `name`, so the string itself is the iconv name.
        registerEncoding(descriptor.name || descriptor, descriptor.aliases);
    }
}

// Write encodings.
var generatedSource =
    "\"use strict\";\n\n// Generated data for sbcs codec. Don't edit manually. Regenerate using generation/gen-sbcs.js script.\n" +
    "module.exports = " + JSON.stringify(encodings, undefined, " ");
fs.writeFileSync(path.join(__dirname, "..", destFileName), generatedSource);
// Code points reported as diacritics, as inclusive [first, last] ranges.
// This is the same set of code points the former inline object literal listed;
// presumably the characters with a non-zero combining class - confirm before
// extending. Entries above 0xFFFF can never match a single UTF-16 code unit and
// are kept only so the set stays identical.
var DIACRITIC_RANGES = [
    [768, 846], [848, 879], [1155, 1159], [1425, 1469], [1471, 1471], [1473, 1474],
    [1476, 1477], [1479, 1479], [1552, 1562], [1611, 1631], [1648, 1648], [1750, 1756],
    [1759, 1764], [1767, 1768], [1770, 1773], [1809, 1809], [1840, 1866], [2027, 2035],
    [2070, 2073], [2075, 2083], [2085, 2087], [2089, 2093], [2137, 2139], [2276, 2302],
    [2364, 2364], [2381, 2381], [2385, 2388], [2492, 2492], [2509, 2509], [2620, 2620],
    [2637, 2637], [2748, 2748], [2765, 2765], [2876, 2876], [2893, 2893], [3021, 3021],
    [3149, 3149], [3157, 3158], [3260, 3260], [3277, 3277], [3405, 3405], [3530, 3530],
    [3640, 3642], [3656, 3659], [3768, 3769], [3784, 3787], [3864, 3865], [3893, 3893],
    [3895, 3895], [3897, 3897], [3953, 3954], [3956, 3956], [3962, 3965], [3968, 3968],
    [3970, 3972], [3974, 3975], [4038, 4038], [4151, 4151], [4153, 4154], [4237, 4237],
    [4957, 4959], [5908, 5908], [5940, 5940], [6098, 6098], [6109, 6109], [6313, 6313],
    [6457, 6459], [6679, 6680], [6752, 6752], [6773, 6780], [6783, 6783], [6964, 6964],
    [6980, 6980], [7019, 7027], [7082, 7083], [7142, 7142], [7154, 7155], [7223, 7223],
    [7376, 7378], [7380, 7392], [7394, 7400], [7405, 7405], [7412, 7412], [7616, 7654],
    [7676, 7679], [8400, 8412], [8417, 8417], [8421, 8432], [11503, 11505], [11647, 11647],
    [11744, 11775], [12330, 12335], [12441, 12442], [42607, 42607], [42612, 42621],
    [42655, 42655], [42736, 42737], [43014, 43014], [43204, 43204], [43232, 43249],
    [43307, 43309], [43347, 43347], [43443, 43443], [43456, 43456], [43696, 43696],
    [43698, 43700], [43703, 43704], [43710, 43711], [43713, 43713], [43766, 43766],
    [44013, 44013], [64286, 64286], [65056, 65062], [66045, 66045], [68109, 68109],
    [68111, 68111], [68152, 68154], [68159, 68159], [69702, 69702], [69817, 69818],
    [69888, 69890], [69939, 69940], [70080, 70080], [71350, 71351], [119141, 119145],
    [119149, 119154], [119163, 119170], [119173, 119179], [119210, 119213], [119362, 119364]
];

// Lookup object (code point -> true) expanded from DIACRITIC_RANGES on first use.
var diacriticsLookup = null;

// Returns the shared diacritics lookup, building it once instead of once per byte.
function getDiacriticsLookup() {
    if (diacriticsLookup === null) {
        diacriticsLookup = {};
        DIACRITIC_RANGES.forEach(function(range) {
            for (var cp = range[0]; cp <= range[1]; cp++)
                diacriticsLookup[cp] = true;
        });
    }
    return diacriticsLookup;
}

/**
 * Builds the character string describing a single-byte encoding.
 *
 * Every byte 0x00..0xFF is converted to UTF-8 with iconv and converted back.
 * Bytes iconv rejects with EILSEQ (in either direction) are represented by
 * U+FFFD. Any diacritics found are logged for information only.
 *
 * @param {string} encoding  Encoding name as understood by iconv.
 * @returns {string} 256 characters indexed by byte value, or only the upper
 *     128 characters when the lower half is plain ASCII.
 * @throws {Error} When a byte does not map to exactly one UTF-16 code unit,
 *     when the character does not encode back to exactly one byte, or on any
 *     iconv failure other than EILSEQ.
 */
function generateCharsString(encoding) {
    console.log("Generate encoding for " + encoding);
    var iconvToUtf8 = new Iconv(encoding, "UTF-8");
    var iconvFromUtf8 = new Iconv("UTF-8", encoding);
    var diacritics = getDiacriticsLookup();
    var chars = "", containsDiacritics = [];

    for (var b = 0x0; b < 0x100; b++) {
        // Declared outside the try block because it is consumed after the catch.
        var convertedChar;
        try {
            convertedChar = iconvToUtf8.convert(Buffer.from([b])).toString();

            if (convertedChar.length != 1)
                throw new Error("Single-byte encoding error: Must return single char.");

            // Round trip: the character must be encodable as exactly one byte.
            var convertedBackBuf = iconvFromUtf8.convert(Buffer.from(convertedChar));
            if (convertedBackBuf.length != 1)
                throw new Error("Single-byte encoding error: Cannot decode back.");

            var c = convertedChar.charCodeAt(0);
            if (diacritics[c]) {
                containsDiacritics.push(c);
            }

        } catch (exception) {
            if (exception.code === "EILSEQ") {
                // Byte (or its round trip) is not valid in this encoding.
                convertedChar = "\ufffd";
            } else {
                throw exception;
            }
        }

        chars += convertedChar;
    }

    if (containsDiacritics.length > 0)
        console.log("Contains Diacritics: ", containsDiacritics.map(function(d) {return d.toString(16)})+"");

    // Check if the first half is standard and cut it if it is.
    var asciiString = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'+
              ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f';
    if (chars.slice(0, 0x80) === asciiString)
        chars = chars.slice(0x80);

    return chars;
}
maxChars: 2, valid: 23334, invalid: 9178 }, 19 | 20 | // Korean, 21 | // See http://en.wikipedia.org/wiki/KS_X_1001, http://support.microsoft.com/kb/170557 22 | // KSC-5601_1992 = Johab (rarely used), http://opensource.apple.com/source/tcl/tcl-10/tcl/tools/encoding/ksc5601.txt 23 | // Add char: http://mail.openjdk.java.net/pipermail/core-libs-dev/2013-May/017472.html 24 | { enc: [ 'EUC-KR', 'EUCKR', 'CSEUCKR' ], maxChars: 2, valid: 8355, invalid: 15965 }, 25 | { enc: [ 'CP1361', 'JOHAB' ], isASCII: false, maxChars: 2, valid: 17177, invalid: 11751 }, 26 | { enc: [ 'CP949', 'UHC' ], maxChars: 2, valid: 17364, invalid: 15148 }, 27 | 28 | 29 | ## NOT DBCS 30 | 31 | // Japanese: Shift_JIS (add MacJapanese) - includes double-chars and chars from 2nd plane (i.e. U+2131B). 32 | // http://x0213.org/codetable/sjis-0213-2004-std.txt 33 | // X0213 - superset, see http://x0213.org/codetable/index.en.html 34 | // Ibm 943 = CP 932 (with x5C -> A5, not 5C). See also Ibm 942 35 | { enc: [ 'SHIFT_JIS', 'CSSHIFTJIS', 'MS_KANJI', 'SHIFT-JIS', 'SJIS' ], isASCII: false, maxChars: 2, valid: 8950, invalid: 4618 }, 36 | { enc: [ 'CP932' ], maxChars: 2, valid: 9795, invalid: 5821 }, 37 | { enc: [ 'SHIFT_JISX0213' ], isASCII: false, maxChars: 2, valid: 11424, invalid: 4192 }, 38 | 39 | // Japanese: EUC-JP - includes double chars and chars from 2nd plane. 
40 | // http://x0213.org/codetable/euc-jis-2004-std.txt 41 | { enc: [ 'EUC-JP', 'CSEUCPKDFMTJAPANESE', 'EUCJP' ], maxChars: 3, valid: 15017, invalid: 33879 }, 42 | { enc: [ 'EUC-JISX0213' ], maxChars: 3, valid: 11424, invalid: 37472 }, 43 | 44 | // Japanese TODO: ISO-IR-87, JIS0208, JIS_C6226-1983, JIS_X0208, JIS_X0208-1983, JIS_X0208-1990, X0208, CSISO87JISX0208 45 | // ISO-IR-159, JIS_X0212, JIS_X0212-1990, JIS_X0212.1990-0, X0212, CSISO159JISX02121990 46 | // Aliases: 47 | // CHINESE, GB_2312-80 = ISO-IR-58 = CSISO58GB231280 48 | // KOREAN, ISO-IR-149 = KSC_5601 = KS_C_5601-1987 = KS_C_5601-1989 = CSKSC56011987 49 | 50 | // Taiwan, rare 51 | { enc: [ 'EUC-TW', 'EUCTW', 'CSEUCTW' ], maxChars: 4, valid: 61439, invalid: 16805889 }, 52 | 53 | // GBK extension 54 | { enc: [ 'GB18030' ], maxChars: 4, valid: 1112064, invalid: 27531008 }, 55 | 56 | // Stateful ISO-2022 encodings, with switches to different planes. 57 | { enc: [ 'ISO-2022-CN', 'CSISO2022CN', 'ISO-2022-CN-EXT' ], maxChars: 4 }, 58 | { enc: [ 'ISO-2022-KR', 'CSISO2022KR' ], maxChars: 4 }, 59 | { enc: [ 'ISO-2022-JP-1', 'ISO-2022-JP-2', 'CSISO2022JP2', 'ISO-2022-JP-3' ], maxChars: 4 }, 60 | { enc: [ 'ISO-2022-JP', 'CSISO2022JP' ], maxChars: 3 }, 61 | 62 | // Unknown 63 | { enc: [ 'CN-GB-ISOIR165', 'ISO-IR-165' ], maxChars: 2, valid: 8421, invalid: 15387 }, 64 | { enc: [ 'HZ', 'HZ-GB-2312' ], maxChars: 2 }, 65 | { enc: [ 'DEC-KANJI' ], isDBCS: true, isASCII: true, maxChars: 2, valid: 7007, invalid: 14753 }, 66 | { enc: [ 'DEC-HANYU' ], isDBCS: false, isASCII: true, maxChars: 4, valid: 20039, invalid: 70073 }, 67 | 68 | 69 | 70 | ## UNICODE 71 | 72 | { enc: [ 'UCS-2', 'CSUNICODE', 'ISO-10646-UCS-2' ], maxChars: 2 }, 73 | { enc: [ 'UCS-2BE', 'UNICODEBIG', 'UNICODE-1-1', 'CSUNICODE11', 'UCS-2-SWAPPED' ], maxChars: 2 }, 74 | { enc: [ 'UCS-2LE', 'UNICODELITTLE', 'UCS-2-INTERNAL' ], maxChars: 2 }, 75 | 76 | { enc: [ 'UCS-4', 'CSUCS4', 'ISO-10646-UCS-4' ], maxChars: 4 }, 77 | { enc: [ 'UCS-4BE', 'UCS-4-SWAPPED' ], 

// This script generates unicode normalization data.
// It is a research tool: it downloads UnicodeData.txt and
// CompositionExclusions.txt, builds a per-code-point feature table
// (decomposition, combining class, composition exclusion) and prints
// consistency diagnostics to the console.

var utils = require("../utils"),
    errTo = require("errto"),
    async = require("async");

var baseUrl = "http://www.unicode.org/Public/6.3.0/ucd/";

async.parallel({
    data: utils.getFile.bind(null, baseUrl + "UnicodeData.txt"),
    exclusions: utils.getFile.bind(null, baseUrl + "CompositionExclusions.txt")
}, errTo(console.log, function(data) {

    // features[codePoint] = {decomp: [codePoints], canonical: bool, combiningClass: int, noCompose: bool}
    var features = {};

    // UnicodeData.txt is positional and most columns are empty, so every line is
    // split on ";" directly. utils.parseText drops empty fields, which shifts the
    // columns and makes index 5 point at something other than the decomposition.
    data.data.split("\n").forEach(function(line) {
        var a = line.split(";");
        if (a.length < 6)
            return; // Blank or truncated line.

        var ch = parseInt(a[0], 16);
        if (isNaN(ch))
            return;

        var combiningClass = parseInt(a[3], 10) || 0;
        var decompStr = a[5].trim();
        var canonical, decomp;

        if (decompStr.length > 0) {
            decomp = decompStr.split(" ").map(function(s) {return parseInt(s, 16)});
            canonical = true;
            if (isNaN(decomp[0])) { // When first item is a tag (unparsable as int), this is a 'compatibility decomposition'
                canonical = false;
                decomp.shift();
            }
            //console.log(String.fromCharCode(ch), " -> ", decomp.map(function(c) { return String.fromCharCode(c)}).join(" + "), canonical ? "canonical" : "compat");
        }

        if (decomp || combiningClass) {
            features[ch] = {
                decomp: decomp,
                canonical: canonical,
                combiningClass: combiningClass,
            };
        }
    });

    // Process CompositionExclusions.txt
    utils.parseText(data.exclusions).forEach(function(a) {
        var ch = parseInt(a[0], 16);
        // An excluded code point without an entry used to crash the script here.
        if (features[ch])
            features[ch].noCompose = true;
        else
            console.log("Composition exclusion without feature data:", a[0]);
    });

    // Exclude Non-Starter Decompositions and Singleton Decompositions (CompositionExclusions.txt parts 3, 4)
    for (var ch in features) {
        var feat = features[ch];
        if (feat.canonical && (feat.decomp.length == 1 || feat.combiningClass || (features[feat.decomp[0]] || {}).combiningClass)) {
            //console.log("Excluded:", (+ch).toString(16));
            feat.noCompose = true;
        }
    }

    // Add Jamo decompositions (see part 3.12 of http://www.unicode.org/versions/Unicode6.3.0/ch03.pdf)
    var LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7, SBase = 0xAC00;
    var LCount = 19, VCount = 21, TCount = 28;

    for (var l = 0; l < LCount; l++)
        for (var v = 0; v < VCount; v++) {
            // LV syllable: leading consonant + vowel.
            var lv = l * VCount * TCount + v * TCount + SBase;
            features[lv] = {
                decomp: [l + LBase, v + VBase],
                canonical: true,
                combiningClass: 0
            }

            // LVT syllables: the LV syllable + trailing consonant.
            for (var t = 1; t < TCount; t++)
                features[lv + t] = {
                    decomp: [lv, t + TBase],
                    canonical: true,
                    combiningClass: 0
                };
        }

    // -------------------------------------------------------------------------

    // Feature record for a code point; unknown code points are plain starters.
    function f(ch) { return features[ch] || {combiningClass: 0}; }
    function hex(ch) { return (+ch).toString(16);}

    // Fully (recursively) decomposes a code point. When `canonical` is true only
    // canonical decompositions are followed, otherwise compatibility ones too.
    function decompose(ch, canonical) {
        var feat = f(ch);
        if (feat.decomp && (feat.canonical || !canonical)) {
            return [].concat.apply([], feat.decomp.map(function(c) {return decompose(c, canonical)}));
        } else return [ch];
    }
/*
    for (var ch in features) {
        [true, false].map(function(can) {
            var arr = decompose(ch, can);
            for (var i = 0; i < arr.length-1; i++)
                if (f(arr[i]).combiningClass > f(arr[i+1]).combiningClass)
                    console.log("Err", (+ch).toString(16), can, arr.map(function(ch) {return hex(ch)+"/"+f(ch).combiningClass;}));


        });
    }
*/
    // var asciiString = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'+
    //     ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f';


    // var encodings = require("../../encodings/sbcs-data-generated");
    // for (var encName in encodings) {
    //     var enc = encodings[encName];
    //     if (enc.chars) {
    //         if (enc.chars.length == 128)
    //             enc.chars = asciiString + enc.chars;

    //         var existChars = {};
    //         for (var i = 0; i < enc.chars.length; i++)
    //             existChars[enc.chars.charCodeAt(i)] = true;

    //         for (var i = 0; i < enc.chars.length; i++) {
    //             var charCode = enc.chars.charCodeAt(i);
    //             var feat = f(charCode);
    //             if (feat.decomp && feat.canonical && feat.decomp.length == 2) {
    //                 if (!existChars[feat.decomp[0]])
    //                     console.log("!!", encName, hex(enc.chars.charCodeAt(i)), "->", feat.decomp.map(hex));
    //                 if (f(feat.decomp[0]).combiningClass != 0 || f(feat.decomp[1]).combiningClass == 0)
    //                     console.log("!!2", encName, hex(enc.chars.charCodeAt(i)), "->", feat.decomp.map(hex));
    //             }

    //             var decomp = decompose(charCode, true);
    //             if (decomp.length > 2) {

    //                 console.log("!!3", encName, hex(enc.chars.charCodeAt(i)), "->", decomp.map(hex));

    //             }
    //         }


    //     }

    // }

    // Report canonical decompositions whose first element does not share the
    // combining class of the composed character.
    for (var charCode in features) {
        var feat = f(charCode);
        if (feat.decomp && feat.canonical) {
            if (feat.decomp.length == 1) {
                if (f(feat.decomp[0]).combiningClass != feat.combiningClass)
                    console.log("!!1", hex(charCode), "->", feat.decomp.map(hex));

            } else if (feat.decomp.length == 2) {
                if (f(feat.decomp[0]).combiningClass != feat.combiningClass) // || f(feat.decomp[1]).combiningClass == 0)
                    console.log("!!2", hex(charCode), "->", feat.decomp.map(hex));

            } else {
                console.log("comp - not 1 or 2", hex(charCode))
            }
        }
    }
}));
/**
 * Enumerates byte sequences for a converter and reports each one to `fn`.
 *
 * Starting from the first `len - 1` bytes already present in `origbuf`, every
 * value 0x00..0xFF is tried for byte `len`. When the converter reports the
 * sequence as incomplete (EINVAL) all one-byte-longer extensions are enumerated
 * first. `fn` is then called for the sequence itself, so incomplete prefixes
 * and illegal sequences (EILSEQ) are both reported with `valid === false`.
 *
 * @param {{convert: function(Buffer): Buffer}} converter  Object whose
 *     `convert` throws errors carrying `code` "EILSEQ" or "EINVAL".
 * @param {function(boolean, Buffer, (Buffer|undefined))} fn  Called as
 *     fn(valid, input, output). `input` is a view into `origbuf` that is
 *     overwritten on the next step, so copy it if it must be kept.
 * @param {Buffer} origbuf  Scratch buffer; its size caps the sequence length.
 * @param {number} len  Length of the sequences to enumerate at this level (>= 1).
 * @throws {Error} When a sequence would need more bytes than `origbuf` holds,
 *     or when the converter throws anything other than EILSEQ / EINVAL.
 */
function forAllChars(converter, fn, origbuf, len) {
    // Without this check an out-of-range write below is silently ignored and a
    // converter that keeps answering EINVAL recurses until the stack overflows.
    if (len > origbuf.length)
        throw new Error("forAllChars: sequence of " + len + " bytes does not fit the " +
                        origbuf.length + "-byte scratch buffer");

    // subarray() always shares memory with origbuf, so deeper levels see this prefix.
    var buf = origbuf.subarray(0, len);
    for (var i = 0; i < 0x100; i++) {
        buf[len-1] = i;
        var res = undefined;
        try {
            res = converter.convert(buf);
        } catch (e) {
            if (e.code === "EINVAL") { // Partial character sequence.
                // Recurse deeper.
                forAllChars(converter, fn, origbuf, len+1);
            }
            else if (e.code !== "EILSEQ") // EILSEQ = invalid sequence, reported below.
                throw e;
        }

        // Report this sequence: valid only when the converter produced output.
        fn(res != null, buf, res);
    }
}
27 | * Combining diacritical: 0x300-0x36F, 0x591-0x5C7, 0x610-0x61A, 0x64B-0x065F, some others. 28 | * Encodings containing: 864, 874, 1046, 1129, 1133, 1161-1163, 1255, 1256, 1258, 8859-6, 8859-11, TCVN, MacThai (mostly TCVN, 1258, 1255) 29 | * Even for single-byte encodings I need (when there are combining chars): 30 | * Composing when decoding. 31 | * Decomposing when encoding. 32 | 33 | 34 | ================================================= 35 | SBCS fast algorithm fails (see http://www.icu-project.org/docs/papers/optimized_unicode_composition_and_decomposition.html as inspiration): 36 | * If combined char is in encoding, then un-combined is also there: 37 | * CP866: Її 38 | 39 | -------------------------------------------------------------------------------- /generation/research/notes.md: -------------------------------------------------------------------------------- 1 | 2 | ### TODO: 3 | * Deprecate string input in decoding (with tests). 4 | 5 | * keep internal state in encoders/decoders 6 | * streaming support 7 | 8 | Later: 9 | * browser support through ArrayBuffer 10 | -> table support via utf-8 strings with ranges. 11 | 12 | 13 | 14 | ### Other projects 15 | http://code.google.com/p/stringencoding/ 16 | http://encoding.spec.whatwg.org 17 | 18 | ### Browser alternatives of Buffer 19 | http://www.khronos.org/registry/typedarray/specs/latest/ 20 | https://developer.mozilla.org/en-US/docs/JavaScript_typed_arrays 21 | ArrayBuffer polyfill http://www.calormen.com/polyfill/ https://bitbucket.org/lindenlab/llsd/src/7d2646cd3f9b/js/typedarray.js 22 | 23 | 24 | ### Streaming hurdles 25 | http://userguide.icu-project.org/conversion/converters 26 | 27 | * BOM in UTF-8/16 28 | * Surrogate chars 29 | UTF-16BE/16LE: Surrogate chars not streamable. 30 | CESU-8: Surrogate chars from UTF-8 31 | * Combining characters (see normalization.md): both when encoding and decoding. 
32 | 33 | 34 | ### General 35 | 36 | Everything goes to native javascript UTF16 (with surrogate pairs) and from it. 37 | We try to match what browsers do with encodings (as they have the most exposure). 38 | We ignore all non-alphanumeric chars in encoding aliases. 39 | 40 | Codec interface: 41 | encodings._codec = function() -> (cached) { 42 | 43 | encoder(options) -> (encoder obj) { 44 | , 45 | write(str) -> buf, 46 | end() -> buf, (optional) 47 | }, 48 | decoder(options) -> (decoder obj) { 49 | , 50 | write(buf) -> str, 51 | end() -> str, (optional) 52 | } 53 | } 54 | 55 | 56 | ### Edge cases 57 | 58 | * Browser support (see https://github.com/inexorabletash/text-encoding) 59 | -> Slow conversion between strings and ArrayBuffers. 60 | -> AMD/Require.js to load tables? Currently it's 221k / 135k compressed. 61 | * Codecs should be able to share tables. 62 | 63 | * BOM for UTF-8/16/32 to determine endianness. 64 | * Surrogate chars: can be in different chunks. 65 | * Combining characters in input -> different combining chars in output. 66 | * Callback to decide what to do with unconvertible chars. 67 | * Ambiguous encoding names (Shift_JIS?) 68 | * Save memory by clearing the cache / read tables? 69 | * Stateful encodings ftw 70 | * Set substitute characters when no mapping is found, in addition to defaults, f.ex. 71 | in ISO-8859-1 it's 0x1A, in Unicode it's U+FFFD. Both for encoding and decoding. 72 | In stateful encodings should be encoded every time depending on states. 73 | * When no mapping is found and we have substitution char, then it would still be better to 74 | skip it when the char has Default_Ignorable_Code_Point Unicode property, f.ex. Soft Hyphen, Combining Grapheme Joiner etc. 
75 | http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B:DI:%5D 76 | 77 | We don't deal with: 78 | * Bidirectional reordering 79 | * Arabic shaping 80 | 81 | ### Error conditions 82 | * No mapping for a char is available -> callback that can consume data, write data, throw exception 83 | * substitute character 84 | * skip 85 | * throw exception 86 | * escape - replace by e.g. `\u1234` or `&#x1234;` 87 | * Character sequence is incomplete (at the end of source data) 88 | * Illegal char sequence found 89 | 90 | ### General structure 91 | iconv.encodings = {} all encodings, aliases, tables. 92 | 93 | Encoding = 94 | 1. Name + aliases (small, manual+generated) 95 | 2. Codec (med, manual) 96 | 3. Data/Table (large, generated) 97 | 98 | /encodings/index.js - loads all needed files to fill iconv.encodings. 99 | /encodings/sbcs-codec.js, dbcs-codec.js - code to convert. 100 | /encodings/sbcs-data.js, .. - aliases/tables to use. 101 | /encodings/tables/cp950.json - (generally large) tables to be used with dbcs. 102 | 103 | -------------------------------------------------------------------------------- /generation/utils.js: -------------------------------------------------------------------------------- 1 | 2 | var request = require('request'), 3 | fs = require('fs'), 4 | path = require('path'), 5 | errTo = require('errto'); 6 | 7 | // Common utilities used in scripts. 
8 | 9 | exports.getFile = function(url, cb) { 10 | var sourceDataFolder = path.join(__dirname, "source-data"); 11 | var fullpath = path.join(sourceDataFolder, path.basename(url)); 12 | fs.readFile(fullpath, "utf8", function(err, text) { 13 | if (!err) return cb(null, text); 14 | if (err.code != "ENOENT") return cb(err); 15 | request(url, errTo(cb, function(res, buf) { 16 | fs.mkdir(sourceDataFolder, function(err) { 17 | if (err && err.code != "EEXIST") return cb(err); 18 | fs.writeFile(fullpath, buf, errTo(cb, function() { 19 | cb(null, buf.toString()); 20 | })); 21 | }); 22 | })); 23 | }); 24 | } 25 | 26 | // Returns array of arrays. 27 | exports.parseText = function(text, splitChar) { 28 | return text.split("\n").map(function(line) { 29 | return line.split("#")[0].trim(); 30 | }).filter(Boolean).map(function(line) { 31 | return line.split(splitChar || /\s+/).map(function(s) {return s.trim()}).filter(Boolean); 32 | }); 33 | } 34 | 35 | // Convert array of character codes to string. Character codes can be > 0xFFFF, 36 | // so we emit surrogates when needed. Also, some character codes are actually 37 | // sequences (arrays) - we emit them prepended with U+0FFF-(length-2). 38 | // U+0FFF was chosen because it's small and unassigned, as well as 32 chars before it 39 | function arrToStr(arr) { 40 | var s = ''; 41 | for (var i = 0; i < arr.length; i++) 42 | if (Array.isArray(arr[i])) { 43 | if (arr[i].length == 1) 44 | s += arrToStr(arr[i]); 45 | else if (arr[i].length > 1) 46 | s += String.fromCharCode(0xFFF - (arr[i].length-2)) 47 | + arrToStr(arr[i]); 48 | 49 | } else if (arr[i] > 0xFFFF) { 50 | // Surrogates 51 | s += String.fromCharCode(0xD800 + Math.floor((arr[i] - 0x10000) / 0x400)) 52 | + String.fromCharCode(0xDC00 + (arr[i] - 0x10000) % 0x400); 53 | 54 | } else { 55 | // Basic characters. 
56 | s += String.fromCharCode(arr[i]); 57 | } 58 | 59 | return s; 60 | } 61 | 62 | // Input: map -> 63 | // Resulting format: Array of chunks, each chunk is: 64 | // [0] = address of start of the chunk, hex string. 65 | // - characters of the chunk. 66 | // - increasing sequence of the length num, starting with prev character. 67 | exports.generateTable = function(dbcs, maxBytes) { 68 | var minSeqLen = 4; 69 | var table = [], range, block, seqLen; 70 | var max = 1 << ((maxBytes || 2) * 8); 71 | for (var i = 0x0000; i < max; i++) 72 | if (dbcs[i] !== undefined) { 73 | if (dbcs[i-1] === undefined) { // Range started. 74 | range = [i.toString(16)]; // Range[0] is starting address. 75 | block = []; // Current block of character codes. 76 | seqLen = 0; // Increasing sequence length at the end of the block. 77 | } 78 | else if (typeof dbcs[i-1] === 'number' && // We have arrays as elements of dbcs - check against it. 79 | typeof dbcs[i] === 'number' && 80 | dbcs[i-1] + 1 === dbcs[i]) { // Increasing sequence continues - track its length. 81 | seqLen++; 82 | } 83 | else { // Increasing sequence ended (or not started at all). 84 | if (seqLen >= minSeqLen) { 85 | // Seq is long enough: write prev segment and its length. 86 | range.push(arrToStr(block.slice(0, -seqLen)), seqLen); 87 | block = []; 88 | } 89 | seqLen = 0; 90 | } 91 | 92 | block.push(dbcs[i]); 93 | 94 | } else if (range) { // Range finished, write last segments. 
95 | if (seqLen >= minSeqLen) 96 | range.push(arrToStr(block.slice(0, -seqLen)), seqLen); 97 | else 98 | range.push(arrToStr(block)); 99 | 100 | table.push(range); 101 | range = null; 102 | } 103 | 104 | return table; 105 | } 106 | 107 | 108 | exports.writeTable = function(name, table) { 109 | this.writeFile(name, "[\n" + table.map(function(a) {return JSON.stringify(a);}).join(",\n") + "\n]\n"); 110 | } 111 | 112 | exports.writeFile = function(name, body) { 113 | fs.writeFileSync(path.join(__dirname, "../encodings/tables", name + ".json"), body); 114 | } 115 | 116 | -------------------------------------------------------------------------------- /lib/bom-handling.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | 3 | var BOMChar = '\uFEFF'; 4 | 5 | exports.PrependBOM = PrependBOMWrapper 6 | function PrependBOMWrapper(encoder, options) { 7 | this.encoder = encoder; 8 | this.addBOM = true; 9 | } 10 | 11 | PrependBOMWrapper.prototype.write = function(str) { 12 | if (this.addBOM) { 13 | str = BOMChar + str; 14 | this.addBOM = false; 15 | } 16 | 17 | return this.encoder.write(str); 18 | } 19 | 20 | PrependBOMWrapper.prototype.end = function() { 21 | return this.encoder.end(); 22 | } 23 | 24 | 25 | //------------------------------------------------------------------------------ 26 | 27 | exports.StripBOM = StripBOMWrapper; 28 | function StripBOMWrapper(decoder, options) { 29 | this.decoder = decoder; 30 | this.pass = false; 31 | this.options = options || {}; 32 | } 33 | 34 | StripBOMWrapper.prototype.write = function(buf) { 35 | var res = this.decoder.write(buf); 36 | if (this.pass || !res) 37 | return res; 38 | 39 | if (res[0] === BOMChar) { 40 | res = res.slice(1); 41 | if (typeof this.options.stripBOM === 'function') 42 | this.options.stripBOM(); 43 | } 44 | 45 | this.pass = true; 46 | return res; 47 | } 48 | 49 | StripBOMWrapper.prototype.end = function() { 50 | return this.decoder.end(); 51 | } 52 | 53 | 
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License.
 *  REQUIREMENT: This definition is dependent on the @types/node definition.
 *  Install with `npm install @types/node --save-dev`
 *--------------------------------------------------------------------------------------------*/

declare module 'iconv-lite' {
    // Basic API
    export function decode(buffer: Buffer | Uint8Array, encoding: string, options?: Options): string;

    export function encode(content: string, encoding: string, options?: Options): Buffer;

    export function encodingExists(encoding: string): boolean;

    // Stream API
    export function decodeStream(encoding: string, options?: Options): NodeJS.ReadWriteStream;

    export function encodeStream(encoding: string, options?: Options): NodeJS.ReadWriteStream;

    // Explicitly enables the Stream API by injecting a 'stream' module implementation,
    // e.g. `iconv.enableStreamingAPI(require('stream'))`.
    export function enableStreamingAPI(stream_module: any): void;

    // Low-level stream APIs
    export function getEncoder(encoding: string, options?: Options): EncoderStream;

    export function getDecoder(encoding: string, options?: Options): DecoderStream;
}

export interface Options {
    /**
     * Decoding: pass `false` to keep a leading BOM in the output. A function is also
     * accepted; it is called (without arguments) when a BOM was actually stripped.
     */
    stripBOM?: boolean | (() => void);
    /** Encoding: prepend a BOM to the output of BOM-aware codecs. */
    addBOM?: boolean;
    defaultEncoding?: string;
}

export interface EncoderStream {
    write(str: string): Buffer;
    end(): Buffer | undefined;
}

export interface DecoderStream {
    write(buf: Buffer): string;
    end(): string | undefined;
}
// Search for a codec in iconv.encodings. Cache codec data in iconv._codecDataCache.
iconv._codecDataCache = {};

// Resolves an encoding name (or alias) to its codec object.
//   encoding - name in any spelling; it is canonicalized before the lookup.
// Returns the codec (created once and then served from iconv._codecDataCache).
// Throws Error("Encoding not recognized: ...") when no definition is found.
iconv.getCodec = function getCodec(encoding) {
    if (!iconv.encodings)
        iconv.encodings = require("../encodings"); // Lazy load all encoding definitions.

    // Both lookup tables are plain objects, so a name such as "constructor" would otherwise
    // resolve to a member inherited from Object.prototype and be returned as a "codec".
    // Only own properties are treated as entries.
    // NOTE(review): assumes encoding definitions are own properties of iconv.encodings.
    var hasOwn = Object.prototype.hasOwnProperty;

    // Canonicalize encoding name: strip all non-alphanumeric chars and appended year.
    var enc = iconv._canonicalizeEncoding(encoding);

    // Traverse iconv.encodings to find actual codec.
    var codecOptions = {};
    while (true) {
        var codec = hasOwn.call(iconv._codecDataCache, enc) ? iconv._codecDataCache[enc] : undefined;
        if (codec)
            return codec;

        var codecDef = hasOwn.call(iconv.encodings, enc) ? iconv.encodings[enc] : undefined;

        switch (typeof codecDef) {
            case "string": // Direct alias to other encoding.
                enc = codecDef;
                break;

            case "object": // Alias with options. Can be layered.
                for (var key in codecDef)
                    codecOptions[key] = codecDef[key];

                // The first (outermost) alias with options gives the codec its name.
                if (!codecOptions.encodingName)
                    codecOptions.encodingName = enc;

                enc = codecDef.type;
                break;

            case "function": // Codec itself.
                if (!codecOptions.encodingName)
                    codecOptions.encodingName = enc;

                // The codec function must load all tables and return object with .encoder and .decoder methods.
                // It'll be called only once (for each different options object).
                codec = new codecDef(codecOptions, iconv);

                iconv._codecDataCache[codecOptions.encodingName] = codec; // Save it to be reused later.
                return codec;

            default:
                throw new Error("Encoding not recognized: '" + encoding + "' (searched as: '"+enc+"')");
        }
    }
}
// Builds a low-level encoder for `encoding`. When the codec is BOM-aware and
// `options.addBOM` is truthy, the encoder is wrapped so that a BOM is prepended.
iconv.getEncoder = function getEncoder(encoding, options) {
    var codec = iconv.getCodec(encoding);
    var rawEncoder = new codec.encoder(options, codec);

    var shouldAddBOM = codec.bomAware && options && options.addBOM;
    if (!shouldAddBOM)
        return rawEncoder;

    return new bomHandling.PrependBOM(rawEncoder, options);
}

// Builds a low-level decoder for `encoding`. For BOM-aware codecs the decoder is wrapped
// to strip a leading BOM, unless the caller passed `stripBOM: false`.
iconv.getDecoder = function getDecoder(encoding, options) {
    var codec = iconv.getCodec(encoding);
    var rawDecoder = new codec.decoder(options, codec);

    if (!codec.bomAware)
        return rawDecoder;

    if (options && options.stripBOM === false)
        return rawDecoder;

    return new bomHandling.StripBOM(rawDecoder, options);
}
151 | iconv.encodeStream = function encodeStream(encoding, options) { 152 | return new iconv.IconvLiteEncoderStream(iconv.getEncoder(encoding, options), options); 153 | } 154 | 155 | iconv.decodeStream = function decodeStream(encoding, options) { 156 | return new iconv.IconvLiteDecoderStream(iconv.getDecoder(encoding, options), options); 157 | } 158 | 159 | iconv.supportsStreams = true; 160 | } 161 | 162 | // Enable Streaming API automatically if 'stream' module is available and non-empty (the majority of environments). 163 | var stream_module; 164 | try { 165 | stream_module = require("stream"); 166 | } catch (e) {} 167 | 168 | if (stream_module && stream_module.Transform) { 169 | iconv.enableStreamingAPI(stream_module); 170 | 171 | } else { 172 | // In rare cases where 'stream' module is not available by default, throw a helpful exception. 173 | iconv.encodeStream = iconv.decodeStream = function() { 174 | throw new Error("iconv-lite Streaming API is not enabled. Use iconv.enableStreamingAPI(require('stream')); to enable it."); 175 | }; 176 | } 177 | 178 | if ("Ā" != "\u0100") { 179 | console.error("iconv-lite warning: js files use non-utf8 encoding. See https://github.com/ashtuchkin/iconv-lite/wiki/Javascript-source-file-encodings for more info."); 180 | } 181 | -------------------------------------------------------------------------------- /lib/streams.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | 3 | var Buffer = require("safer-buffer").Buffer; 4 | 5 | // NOTE: Due to 'stream' module being pretty large (~100Kb, significant in browser environments), 6 | // we opt to dependency-inject it instead of creating a hard dependency. 
7 | module.exports = function(stream_module) { 8 | var Transform = stream_module.Transform; 9 | 10 | // == Encoder stream ======================================================= 11 | 12 | function IconvLiteEncoderStream(conv, options) { 13 | this.conv = conv; 14 | options = options || {}; 15 | options.decodeStrings = false; // We accept only strings, so we don't need to decode them. 16 | Transform.call(this, options); 17 | } 18 | 19 | IconvLiteEncoderStream.prototype = Object.create(Transform.prototype, { 20 | constructor: { value: IconvLiteEncoderStream } 21 | }); 22 | 23 | IconvLiteEncoderStream.prototype._transform = function(chunk, encoding, done) { 24 | if (typeof chunk != 'string') 25 | return done(new Error("Iconv encoding stream needs strings as its input.")); 26 | try { 27 | var res = this.conv.write(chunk); 28 | if (res && res.length) this.push(res); 29 | done(); 30 | } 31 | catch (e) { 32 | done(e); 33 | } 34 | } 35 | 36 | IconvLiteEncoderStream.prototype._flush = function(done) { 37 | try { 38 | var res = this.conv.end(); 39 | if (res && res.length) this.push(res); 40 | done(); 41 | } 42 | catch (e) { 43 | done(e); 44 | } 45 | } 46 | 47 | IconvLiteEncoderStream.prototype.collect = function(cb) { 48 | var chunks = []; 49 | this.on('error', cb); 50 | this.on('data', function(chunk) { chunks.push(chunk); }); 51 | this.on('end', function() { 52 | cb(null, Buffer.concat(chunks)); 53 | }); 54 | return this; 55 | } 56 | 57 | 58 | // == Decoder stream ======================================================= 59 | 60 | function IconvLiteDecoderStream(conv, options) { 61 | this.conv = conv; 62 | options = options || {}; 63 | options.encoding = this.encoding = 'utf8'; // We output strings. 
64 | Transform.call(this, options); 65 | } 66 | 67 | IconvLiteDecoderStream.prototype = Object.create(Transform.prototype, { 68 | constructor: { value: IconvLiteDecoderStream } 69 | }); 70 | 71 | IconvLiteDecoderStream.prototype._transform = function(chunk, encoding, done) { 72 | if (!Buffer.isBuffer(chunk) && !(chunk instanceof Uint8Array)) 73 | return done(new Error("Iconv decoding stream needs buffers as its input.")); 74 | try { 75 | var res = this.conv.write(chunk); 76 | if (res && res.length) this.push(res, this.encoding); 77 | done(); 78 | } 79 | catch (e) { 80 | done(e); 81 | } 82 | } 83 | 84 | IconvLiteDecoderStream.prototype._flush = function(done) { 85 | try { 86 | var res = this.conv.end(); 87 | if (res && res.length) this.push(res, this.encoding); 88 | done(); 89 | } 90 | catch (e) { 91 | done(e); 92 | } 93 | } 94 | 95 | IconvLiteDecoderStream.prototype.collect = function(cb) { 96 | var res = ''; 97 | this.on('error', cb); 98 | this.on('data', function(chunk) { res += chunk; }); 99 | this.on('end', function() { 100 | cb(null, res); 101 | }); 102 | return this; 103 | } 104 | 105 | return { 106 | IconvLiteEncoderStream: IconvLiteEncoderStream, 107 | IconvLiteDecoderStream: IconvLiteDecoderStream, 108 | }; 109 | }; 110 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "iconv-lite", 3 | "description": "Convert character encodings in pure javascript.", 4 | "version": "0.6.3", 5 | "license": "MIT", 6 | "keywords": [ 7 | "iconv", 8 | "convert", 9 | "charset", 10 | "icu" 11 | ], 12 | "author": "Alexander Shtuchkin ", 13 | "main": "./lib/index.js", 14 | "typings": "./lib/index.d.ts", 15 | "homepage": "https://github.com/ashtuchkin/iconv-lite", 16 | "bugs": "https://github.com/ashtuchkin/iconv-lite/issues", 17 | "repository": { 18 | "type": "git", 19 | "url": "git://github.com/ashtuchkin/iconv-lite.git" 20 | }, 21 | 
"engines": { 22 | "node": ">=0.10.0" 23 | }, 24 | "scripts": { 25 | "coverage": "c8 _mocha --grep .", 26 | "test": "mocha --reporter spec --grep ." 27 | }, 28 | "browser": { 29 | "stream": false 30 | }, 31 | "devDependencies": { 32 | "async": "^3.2.0", 33 | "c8": "^7.2.0", 34 | "errto": "^0.2.1", 35 | "iconv": "^2.3.5", 36 | "mocha": "^3.5.3", 37 | "request": "^2.88.2", 38 | "semver": "^6.3.0", 39 | "unorm": "^1.6.0" 40 | }, 41 | "dependencies": { 42 | "safer-buffer": ">= 2.1.2 < 3.0.0" 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /test/big5-test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'), 2 | Buffer = require('safer-buffer').Buffer, 3 | iconv = require(__dirname + '/../'); 4 | 5 | var testString = "中文abc", //unicode contains Big5-code and ascii 6 | testStringBig5Buffer = Buffer.from([0xa4,0xa4,0xa4,0xe5,0x61,0x62,0x63]), 7 | testString2 = '測試', 8 | testStringBig5Buffer2 = Buffer.from([0xb4, 0xfa, 0xb8, 0xd5]); 9 | 10 | describe("Big5 tests", function() { 11 | it("Big5 correctly encoded/decoded", function() { 12 | assert.strictEqual(iconv.encode(testString, "big5").toString('hex'), testStringBig5Buffer.toString('hex')); 13 | assert.strictEqual(iconv.decode(testStringBig5Buffer, "big5"), testString); 14 | assert.strictEqual(iconv.encode(testString2, 'big5').toString('hex'), testStringBig5Buffer2.toString('hex')); 15 | assert.strictEqual(iconv.decode(testStringBig5Buffer2, 'big5'), testString2); 16 | }); 17 | 18 | it("cp950 correctly encoded/decoded", function() { 19 | assert.strictEqual(iconv.encode(testString, "cp950").toString('hex'), testStringBig5Buffer.toString('hex')); 20 | assert.strictEqual(iconv.decode(testStringBig5Buffer, "cp950"), testString); 21 | }); 22 | 23 | it("Big5 file read decoded,compare with iconv result", function() { 24 | var contentBuffer = 
Buffer.from('PEhUTUw+DQo8SEVBRD4gICAgDQoJPFRJVExFPiBtZXRhILzQxdKquqjPpc6hR6SkpOW69K22IDwvVElUTEU+DQoJPG1ldGEgSFRUUC1FUVVJVj0iQ29udGVudC1UeXBlIiBDT05URU5UPSJ0ZXh0L2h0bWw7IGNoYXJzZXQ9YmlnNSI+DQo8L0hFQUQ+DQo8Qk9EWT4NCg0Ks2+sT6RArdPBY8XppKSk5br0rbahSTxicj4NCihUaGlzIHBhZ2UgdXNlcyBiaWc1IGNoYXJhY3RlciBzZXQuKTxicj4NCmNoYXJzZXQ9YmlnNQ0KDQo8L0JPRFk+DQo8L0hUTUw+', 'base64'); 25 | var str = iconv.decode(contentBuffer, "big5"); 26 | var iconvc = new (require('iconv').Iconv)('big5','utf8'); 27 | assert.strictEqual(iconvc.convert(contentBuffer).toString(), str); 28 | }); 29 | 30 | it("Big5 correctly decodes and encodes characters · and ×", function() { 31 | // https://github.com/ashtuchkin/iconv-lite/issues/13 32 | // Reference: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT 33 | var chars = "·×"; 34 | var big5Chars = Buffer.from([0xA1, 0x50, 0xA1, 0xD1]); 35 | assert.strictEqual(iconv.encode(chars, "big5").toString('hex'), big5Chars.toString('hex')); 36 | assert.strictEqual(iconv.decode(big5Chars, "big5"), chars) 37 | }); 38 | 39 | it("Big5 correctly encodes & decodes sequences", function() { 40 | assert.strictEqual(iconv.encode("\u00CA\u0304", "big5").toString('hex'), "8862"); 41 | assert.strictEqual(iconv.encode("\u00EA\u030C", "big5").toString('hex'), "88a5"); 42 | assert.strictEqual(iconv.encode("\u00CA", "big5").toString('hex'), "8866"); 43 | assert.strictEqual(iconv.encode("\u00CA\u00CA", "big5").toString('hex'), "88668866"); 44 | 45 | assert.strictEqual(iconv.encode("\u00CA\uD800", "big5").toString('hex'), "88663f"); // Unfinished surrogate. 46 | assert.strictEqual(iconv.encode("\u00CA\uD841\uDD47", "big5").toString('hex'), "8866fa40"); // Finished surrogate ('𠕇'). 47 | assert.strictEqual(iconv.encode("\u00CA𠕇", "big5").toString('hex'), "8866fa40"); // Finished surrogate ('𠕇'). 
var assert = require('assert'),
    Buffer = require('safer-buffer').Buffer,
    iconv = require(__dirname+'/../');

// Shared fixtures. They are declared in a single `var` statement: a stray `;` after the
// first one used to turn the remaining four into implicit globals.
var sampleStr = '\n<俄语>данные',
    strBOM = '\ufeff',
    utf8BOM = Buffer.from([0xEF, 0xBB, 0xBF]),
    utf16beBOM = Buffer.from([0xFE, 0xFF]),
    utf16leBOM = Buffer.from([0xFF, 0xFE]);

describe("BOM Handling", function() {
    it("strips UTF-8 BOM", function() {
        var body = Buffer.concat([utf8BOM, Buffer.from(sampleStr)]);
        assert.equal(iconv.decode(body, 'utf8'), sampleStr);
    });

    it("strips UTF-16 BOM", function() {
        var body = Buffer.concat([utf16leBOM, iconv.encode(sampleStr, 'utf16le')]);
        assert.equal(iconv.decode(body, 'utf16'), sampleStr);
        assert.equal(iconv.decode(body, 'utf16le'), sampleStr);

        var body = Buffer.concat([utf16beBOM, iconv.encode(sampleStr, 'utf16be')]);
        assert.equal(iconv.decode(body, 'utf16'), sampleStr);
        assert.equal(iconv.decode(body, 'utf16be'), sampleStr);
    });

    it("doesn't strip BOMs when stripBOM=false", function() {
        var body = Buffer.concat([utf8BOM, Buffer.from(sampleStr)]);
        assert.equal(iconv.decode(body, 'utf8', {stripBOM: false}), strBOM + sampleStr);

        var body = Buffer.concat([utf16leBOM, iconv.encode(sampleStr, 'utf16le')]);
        assert.equal(iconv.decode(body, 'utf16', {stripBOM: false}), strBOM + sampleStr);
        assert.equal(iconv.decode(body, 'utf16le', {stripBOM: false}), strBOM + sampleStr);

        var body = Buffer.concat([utf16beBOM, iconv.encode(sampleStr, 'utf16be')]);
        assert.equal(iconv.decode(body, 'utf16', {stripBOM: false}), strBOM + sampleStr);
        assert.equal(iconv.decode(body, 'utf16be', {stripBOM: false}), strBOM + sampleStr);
    });

    it("adds/strips UTF-7 BOM", function() {
        var bodyWithBOM = iconv.encode(sampleStr, 'utf7', {addBOM: true});
        var body = iconv.encode(sampleStr, 'utf7');
        assert.notEqual(body.toString('hex'), bodyWithBOM.toString('hex'));
        assert.equal(iconv.decode(body, 'utf7'), sampleStr);
    });

    it("adds UTF-8 BOM when addBOM=true", function() {
        var body = Buffer.concat([utf8BOM, Buffer.from(sampleStr)]).toString('hex');
        assert.equal(iconv.encode(sampleStr, 'utf8', {addBOM: true}).toString('hex'), body);
    });

    it("adds UTF-16 BOMs when addBOM=true", function() {
        var body = Buffer.concat([utf16leBOM, iconv.encode(sampleStr, 'utf16le')]).toString('hex');
        assert.equal(iconv.encode(sampleStr, 'utf16le', {addBOM: true}).toString('hex'), body);

        var body = Buffer.concat([utf16beBOM, iconv.encode(sampleStr, 'utf16be')]).toString('hex');
        assert.equal(iconv.encode(sampleStr, 'utf16be', {addBOM: true}).toString('hex'), body);
    });

    it("'UTF-16' encoding adds BOM by default, but can be overridden with addBOM=false", function() {
        var body = Buffer.concat([utf16leBOM, iconv.encode(sampleStr, 'utf16le')]).toString('hex');
        assert.equal(iconv.encode(sampleStr, 'utf16').toString('hex'), body);

        var body = Buffer.concat([iconv.encode(sampleStr, 'utf16le')]).toString('hex');
        assert.equal(iconv.encode(sampleStr, 'utf16', {addBOM: false}).toString('hex'), body);
    });


    it("when stripping BOM, calls callback 'stripBOM' if provided", function() {
        var bomStripped = false;
        var stripBOM = function() { bomStripped = true; }

        var body = Buffer.concat([utf8BOM, Buffer.from(sampleStr)]);
        assert.equal(iconv.decode(body, 'utf8', {stripBOM: stripBOM}), sampleStr);
        assert(bomStripped);

        bomStripped = false;

        body = Buffer.from(sampleStr);
        assert.equal(iconv.decode(body, 'utf8', {stripBOM: stripBOM}), sampleStr);
        assert(!bomStripped);
    });

});
"cesu8"), "𐐀"); 22 | assert.equal(iconv.decode(Buffer.from("eda0bdedb8b1", 'hex'), "cesu8"), "😱"); 23 | }); 24 | }); 25 | -------------------------------------------------------------------------------- /test/cyrillic-test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'), 2 | Buffer = require('safer-buffer').Buffer, 3 | iconv = require(__dirname+'/../'); 4 | 5 | var baseStrings = { 6 | empty: "", 7 | hi: "Привет!", 8 | ascii: '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'+ 9 | ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f', 10 | rus: "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", 11 | additional1: "ЂЃ‚ѓ„…†‡€‰Љ‹ЊЌЋЏђ‘’“”•–—™љ›њќћџ ЎўЈ¤Ґ¦§Ё©Є«¬\xAD®Ї°±Ііґµ¶·ё№є»јЅѕї", 12 | additional2: "─│┌┐└┘├┤┬┴┼▀▄█▌▐░▒▓⌠■∙√≈≤≥ ⌡°²·÷═║╒ё╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡Ё╢╣╤╥╦╧╨╩╪╫╬©", 13 | additional3: " ЁЂЃЄЅІЇЈЉЊЋЌ­ЎЏ№ёђѓєѕіїјљњћќ§ўџ", 14 | untranslatable: "£Åçþÿ¿", 15 | }; 16 | 17 | var encodings = [{ 18 | name: "Win-1251", 19 | variations: ['win1251', 'Windows-1251', 'windows1251', 'CP1251', 1251], 20 | encodedStrings: { 21 | empty: Buffer.from(''), 22 | hi: Buffer.from('\xcf\xf0\xe8\xe2\xe5\xf2!', 'binary'), 23 | ascii: Buffer.from(baseStrings.ascii, 'binary'), 24 | rus: Buffer.from('\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff', 'binary'), 25 | additional1: Buffer.from('\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf','binary'), 26 | } 27 | }, { 28 | name: 
"Koi8-R", 29 | variations: ['koi8r', 'KOI8-R', 'cp20866', 20866], 30 | encodedStrings: { 31 | empty: Buffer.from(''), 32 | hi: Buffer.from('\xf0\xd2\xc9\xd7\xc5\xd4!', 'binary'), 33 | ascii: Buffer.from(baseStrings.ascii, 'binary'), 34 | rus: Buffer.from('\xe1\xe2\xf7\xe7\xe4\xe5\xf6\xfa\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf2\xf3\xf4\xf5\xe6\xe8\xe3\xfe\xfb\xfd\xff\xf9\xf8\xfc\xe0\xf1\xc1\xc2\xd7\xc7\xc4\xc5\xd6\xda\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd2\xd3\xd4\xd5\xc6\xc8\xc3\xde\xdb\xdd\xdf\xd9\xd8\xdc\xc0\xd1', 'binary'), 35 | additional2: Buffer.from('\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf', 'binary'), 36 | } 37 | }, { 38 | name: "ISO 8859-5", 39 | variations: ['iso88595', 'ISO-8859-5', 'ISO 8859-5', 'cp28595', 28595], 40 | encodedStrings: { 41 | empty: Buffer.from(''), 42 | hi: Buffer.from('\xbf\xe0\xd8\xd2\xd5\xe2!', 'binary'), 43 | ascii: Buffer.from(baseStrings.ascii, 'binary'), 44 | rus: Buffer.from('\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef', 'binary'), 45 | additional3: Buffer.from('\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff', 'binary'), 46 | } 47 | }]; 48 | 49 | describe("Test Cyrillic encodings", function() { 50 | encodings.forEach(function(encoding) { 51 | var enc = encoding.variations[0]; 52 | var key = "hi"; 53 | describe(encoding.name+":", function() { 54 | 55 | it("Convert from buffer", function() { 56 | for (var key in encoding.encodedStrings) 57 | assert.strictEqual(iconv.decode(encoding.encodedStrings[key], enc), 58 | baseStrings[key]); 59 
| }); 60 | 61 | it("Convert to buffer", function() { 62 | for (var key in encoding.encodedStrings) 63 | assert.strictEqual(iconv.encode(baseStrings[key], enc).toString('binary'), 64 | encoding.encodedStrings[key].toString('binary')); 65 | }); 66 | 67 | it("Try different variations of encoding", function() { 68 | encoding.variations.forEach(function(enc) { 69 | assert.strictEqual(iconv.decode(encoding.encodedStrings[key], enc), baseStrings[key]); 70 | assert.strictEqual(iconv.encode(baseStrings[key], enc).toString('binary'), encoding.encodedStrings[key].toString('binary')); 71 | }); 72 | }); 73 | 74 | it("Untranslatable chars are converted to defaultCharSingleByte", function() { 75 | var expected = baseStrings.untranslatable.split('').map(function(c) {return iconv.defaultCharSingleByte; }).join(''); 76 | assert.strictEqual(iconv.encode(baseStrings.untranslatable, enc).toString('binary'), expected); // Only '?' characters. 77 | }); 78 | }); 79 | }); 80 | }); 81 | 82 | -------------------------------------------------------------------------------- /test/gbk-test.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'), 2 | assert = require('assert'), 3 | Buffer = require('safer-buffer').Buffer, 4 | iconv = require(__dirname+'/../'); 5 | 6 | var testString = "中国abc",//unicode contains GBK-code and ascii 7 | testStringGBKBuffer = Buffer.from([0xd6,0xd0,0xb9,0xfa,0x61,0x62,0x63]); 8 | 9 | describe("GBK tests", function() { 10 | it("GBK correctly encoded/decoded", function() { 11 | assert.strictEqual(iconv.encode(testString, "GBK").toString('binary'), testStringGBKBuffer.toString('binary')); 12 | assert.strictEqual(iconv.decode(testStringGBKBuffer, "GBK"), testString); 13 | }); 14 | 15 | it("GB2312 correctly encoded/decoded", function() { 16 | assert.strictEqual(iconv.encode(testString, "GB2312").toString('binary'), testStringGBKBuffer.toString('binary')); 17 | assert.strictEqual(iconv.decode(testStringGBKBuffer, 
"GB2312"), testString); 18 | }); 19 | 20 | it("GBK file read decoded,compare with iconv result", function() { 21 | var contentBuffer = fs.readFileSync(__dirname+"/gbkFile.txt"); 22 | var str = iconv.decode(contentBuffer, "GBK"); 23 | var iconvc = new (require('iconv').Iconv)('GBK','utf8'); 24 | assert.strictEqual(iconvc.convert(contentBuffer).toString(), str); 25 | }); 26 | 27 | it("GBK correctly decodes and encodes characters · and ×", function() { 28 | // https://github.com/ashtuchkin/iconv-lite/issues/13 29 | // Reference: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT 30 | var chars = "·×"; 31 | var gbkChars = Buffer.from([0xA1, 0xA4, 0xA1, 0xC1]); 32 | assert.strictEqual(iconv.encode(chars, "GBK").toString('binary'), gbkChars.toString('binary')); 33 | assert.strictEqual(iconv.decode(gbkChars, "GBK"), chars) 34 | }); 35 | 36 | it("GBK and GB18030 correctly decodes and encodes Euro character", function() { 37 | // Euro character (U+20AC) has two encodings in GBK family: 0x80 and 0xA2 0xE3 38 | // According to W3C's technical recommendation (https://www.w3.org/TR/encoding/#gbk-encoder), 39 | // Both GBK and GB18030 decoders should accept both encodings. 40 | var gbkEuroEncoding1 = Buffer.from([0x80]), 41 | gbkEuroEncoding2 = Buffer.from([0xA2, 0xE3]), 42 | strEuro = "€"; 43 | 44 | assert.strictEqual(iconv.decode(gbkEuroEncoding1, "GBK"), strEuro); 45 | assert.strictEqual(iconv.decode(gbkEuroEncoding2, "GBK"), strEuro); 46 | assert.strictEqual(iconv.decode(gbkEuroEncoding1, "GB18030"), strEuro); 47 | assert.strictEqual(iconv.decode(gbkEuroEncoding2, "GB18030"), strEuro); 48 | 49 | // But when encoding, GBK should produce 0x80, but GB18030 - 0xA2 0xE3. 
50 | assert.strictEqual(iconv.encode(strEuro, "GBK").toString('hex'), gbkEuroEncoding1.toString('hex')); 51 | assert.strictEqual(iconv.encode(strEuro, "GB18030").toString('hex'), gbkEuroEncoding2.toString('hex')); 52 | }); 53 | 54 | it("GB18030 findIdx works correctly", function() { 55 | function findIdxAlternative(table, val) { 56 | for (var i = 0; i < table.length; i++) 57 | if (table[i] > val) 58 | return i-1; 59 | return table.length - 1; 60 | } 61 | 62 | var codec = iconv.getEncoder('gb18030'); 63 | 64 | for (var i = 0; i < 0x100; i++) 65 | assert.strictEqual(codec.findIdx(codec.gb18030.uChars, i), findIdxAlternative(codec.gb18030.uChars, i), i); 66 | 67 | var tests = [0xFFFF, 0x10000, 0x10001, 0x30000]; 68 | for (var i = 0; i < tests.length; i++) 69 | assert.strictEqual(codec.findIdx(codec.gb18030.uChars, tests[i]), findIdxAlternative(codec.gb18030.uChars, tests[i]), tests[i]); 70 | }); 71 | 72 | function swapBytes(buf) { for (var i = 0; i < buf.length; i+=2) buf.writeUInt16LE(buf.readUInt16BE(i), i); return buf; } 73 | function spacify4(str) { return str.replace(/(....)/g, "$1 ").trim(); } 74 | function strToHex(str) { return spacify4(swapBytes(Buffer.from(str, 'ucs2')).toString('hex')); } 75 | 76 | it("GB18030 encodes/decodes 4 byte sequences", function() { 77 | var chars = { 78 | "\u0080": Buffer.from([0x81, 0x30, 0x81, 0x30]), 79 | "\u0081": Buffer.from([0x81, 0x30, 0x81, 0x31]), 80 | "\u008b": Buffer.from([0x81, 0x30, 0x82, 0x31]), 81 | "\u0615": Buffer.from([0x81, 0x31, 0x82, 0x31]), 82 | "\u399f": Buffer.from([0x82, 0x31, 0x82, 0x31]), 83 | "\udbd9\ude77": Buffer.from([0xE0, 0x31, 0x82, 0x31]), 84 | }; 85 | for (var uChar in chars) { 86 | var gbkBuf = chars[uChar]; 87 | assert.strictEqual(iconv.encode(uChar, "GB18030").toString('hex'), gbkBuf.toString('hex')); 88 | assert.strictEqual(strToHex(iconv.decode(gbkBuf, "GB18030")), strToHex(uChar)); 89 | } 90 | }); 91 | 92 | it("GB18030 correctly decodes incomplete 4 byte sequences", function() { 93 | var 
chars = { 94 | "�": Buffer.from([0x82]), 95 | "�1": Buffer.from([0x82, 0x31]), 96 | "�1�": Buffer.from([0x82, 0x31, 0x82]), 97 | "\u399f": Buffer.from([0x82, 0x31, 0x82, 0x31]), 98 | "� ": Buffer.from([0x82, 0x20]), 99 | "�1 ": Buffer.from([0x82, 0x31, 0x20]), 100 | "�1� ": Buffer.from([0x82, 0x31, 0x82, 0x20]), 101 | "\u399f ": Buffer.from([0x82, 0x31, 0x82, 0x31, 0x20]), 102 | "�1\u4fdb": Buffer.from([0x82, 0x31, 0x82, 0x61]), 103 | "�1\u5010\u0061": Buffer.from([0x82, 0x31, 0x82, 0x82, 0x61]), 104 | "\u399f\u4fdb": Buffer.from([0x82, 0x31, 0x82, 0x31, 0x82, 0x61]), 105 | "�1\u50101�1": Buffer.from([0x82, 0x31, 0x82, 0x82, 0x31, 0x82, 0x31]), 106 | }; 107 | for (var uChar in chars) { 108 | var gbkBuf = chars[uChar]; 109 | assert.strictEqual(strToHex(iconv.decode(gbkBuf, "GB18030")), strToHex(uChar)); 110 | } 111 | }); 112 | 113 | it("GB18030:2005 changes are applied", function() { 114 | // See https://github.com/whatwg/encoding/issues/22 115 | var chars = "\u1E3F\u0000\uE7C7"; // Use \u0000 as separator 116 | var gbkChars = Buffer.from([0xA8, 0xBC, 0x00, 0x81, 0x35, 0xF4, 0x37]); 117 | assert.strictEqual(iconv.decode(gbkChars, "GB18030"), chars); 118 | assert.strictEqual(iconv.encode(chars, "GB18030").toString('hex'), gbkChars.toString('hex')); 119 | }); 120 | }); 121 | -------------------------------------------------------------------------------- /test/gbkFile.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashtuchkin/iconv-lite/928f7c68e1be51c1391c70dbee244fd32623f121/test/gbkFile.txt -------------------------------------------------------------------------------- /test/greek-test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'), 2 | Buffer = require('safer-buffer').Buffer, 3 | iconv = require(__dirname+'/../'); 4 | 5 | var baseStrings = { 6 | empty: "", 7 | hi: "Γειά!", 8 | ascii: 
'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'+ 9 | ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f', 10 | greek: "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίόύώΆΈΉΊΌΎΏϊϋΪΫ", 11 | untranslatable: "Åçþÿ¿" 12 | }; 13 | 14 | var encodings = [{ 15 | name: "windows1253", 16 | variations: ['windows-1253', 'win-1253', 'win1253', 'cp1253', 'cp-1253', 1253], 17 | encodedStrings: { 18 | empty: Buffer.from(''), 19 | hi: Buffer.from('\xc3\xe5\xe9\xdc!', 'binary'), 20 | ascii: Buffer.from(baseStrings.ascii, 'binary'), 21 | greek: Buffer.from('\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xdc\xdd\xde\xdf\xfc\xfd\xfe\xa2\xb8\xb9\xba\xbc\xbe\xbf\xfa\xfb\xda\xdb', 'binary'), 22 | } 23 | }, { 24 | name: "iso88597", 25 | variations: ['iso-8859-7', 'greek', 'greek8', 'cp28597', 'cp-28597', 28597], 26 | encodedStrings: { 27 | empty: Buffer.from(''), 28 | hi: Buffer.from('\xc3\xe5\xe9\xdc!', 'binary'), 29 | ascii: Buffer.from(baseStrings.ascii, 'binary'), 30 | greek: Buffer.from('\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xdc\xdd\xde\xdf\xfc\xfd\xfe\xb6\xb8\xb9\xba\xbc\xbe\xbf\xfa\xfb\xda\xdb', 'binary'), 31 | } 32 | }, { 33 | name: "cp737", 34 | variations: ['cp-737', 737], 35 | encodedStrings: { 36 | empty: Buffer.from(''), 37 | hi: Buffer.from('\x82\x9c\xa0\xe1!', 'binary'), 38 | ascii: Buffer.from(baseStrings.ascii, 'binary'), 39 | greek: 
Buffer.from('\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xab\xac\xad\xae\xaf\xe0\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\xe1\xe2\xe3\xe5\xe6\xe7\xe9\xea\xeb\xec\xed\xee\xef\xf0\xe4\xe8\xf4\xf5', 'binary'), 40 | } 41 | }]; 42 | 43 | describe("Test Greek encodings", function() { 44 | encodings.forEach(function(encoding) { 45 | var enc = encoding.variations[0]; 46 | var key = "hi"; 47 | describe(encoding.name+":", function() { 48 | it("Convert from buffer", function() { 49 | for (var key in encoding.encodedStrings) 50 | assert.strictEqual(iconv.decode(encoding.encodedStrings[key], enc), 51 | baseStrings[key]); 52 | }); 53 | 54 | it("Convert to buffer", function() { 55 | for (var key in encoding.encodedStrings) 56 | assert.strictEqual(iconv.encode(baseStrings[key], enc).toString('binary'), 57 | encoding.encodedStrings[key].toString('binary')); 58 | }); 59 | 60 | it("Try different variations of encoding", function() { 61 | encoding.variations.forEach(function(enc) { 62 | assert.strictEqual(iconv.decode(encoding.encodedStrings[key], enc), baseStrings[key]); 63 | assert.strictEqual(iconv.encode(baseStrings[key], enc).toString('binary'), encoding.encodedStrings[key].toString('binary')); 64 | }); 65 | }); 66 | 67 | it("Untranslatable chars are converted to defaultCharSingleByte", function() { 68 | var expected = baseStrings.untranslatable.split('').map(function(c) {return iconv.defaultCharSingleByte; }).join(''); 69 | assert.strictEqual(iconv.encode(baseStrings.untranslatable, enc).toString('binary'), expected); // Only '?' characters. 
70 | }); 71 | }) 72 | }); 73 | }); 74 | -------------------------------------------------------------------------------- /test/main-test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'), 2 | Buffer = require('safer-buffer').Buffer, 3 | iconv = require(__dirname+'/../'); 4 | 5 | var testString = "Hello123!"; 6 | var testStringLatin1 = "Hello123!£Å÷×çþÿ¿®"; 7 | var testStringBase64 = "SGVsbG8xMjMh"; 8 | var testStringHex = "48656c6c6f31323321"; 9 | 10 | describe("Generic UTF8-UCS2 tests", function() { 11 | 12 | it("Return values are of correct types", function() { 13 | assert.ok(Buffer.isBuffer(iconv.encode(testString, "utf8"))); 14 | 15 | var s = iconv.decode(Buffer.from(testString), "utf8"); 16 | assert.strictEqual(Object.prototype.toString.call(s), "[object String]"); 17 | }); 18 | 19 | it("Internal encodings all correctly encoded/decoded", function() { 20 | ['utf8', "UTF-8", "UCS2", "binary"].forEach(function(enc) { 21 | assert.strictEqual(iconv.encode(testStringLatin1, enc).toString(enc), testStringLatin1); 22 | assert.strictEqual(iconv.decode(Buffer.from(testStringLatin1, enc), enc), testStringLatin1); 23 | }); 24 | }); 25 | 26 | it("Base64 correctly encoded/decoded", function() { 27 | assert.strictEqual(iconv.encode(testStringBase64, "base64").toString("binary"), testString); 28 | assert.strictEqual(iconv.decode(Buffer.from(testString, "binary"), "base64"), testStringBase64); 29 | }); 30 | 31 | it("Hex correctly encoded/decoded", function() { 32 | assert.strictEqual(iconv.encode(testStringHex, "hex").toString("binary"), testString); 33 | assert.strictEqual(iconv.decode(Buffer.from(testString, "binary"), "hex"), testStringHex); 34 | }); 35 | 36 | it("Latin1 correctly encoded/decoded", function() { 37 | assert.strictEqual(iconv.encode(testStringLatin1, "latin1").toString("binary"), testStringLatin1); 38 | assert.strictEqual(iconv.decode(Buffer.from(testStringLatin1, "binary"), "latin1"), 
testStringLatin1); 39 | }); 40 | 41 | it("Convert to string, not buffer (utf8 used)", function() { 42 | var res = iconv.encode(Buffer.from(testStringLatin1, "utf8"), "utf8"); 43 | assert.ok(Buffer.isBuffer(res)); 44 | assert.strictEqual(res.toString("utf8"), testStringLatin1); 45 | }); 46 | 47 | it("Throws on unknown encodings", function() { 48 | assert.throws(function() { iconv.encode("a", "xxx"); }); 49 | assert.throws(function() { iconv.decode(Buffer.from("a"), "xxx"); }); 50 | }); 51 | 52 | it("Convert non-strings and non-buffers", function() { 53 | assert.strictEqual(iconv.encode({}, "utf8").toString(), "[object Object]"); 54 | assert.strictEqual(iconv.encode(10, "utf8").toString(), "10"); 55 | assert.strictEqual(iconv.encode(undefined, "utf8").toString(), ""); 56 | }); 57 | 58 | it("Aliases toEncoding and fromEncoding work the same as encode and decode", function() { 59 | assert.strictEqual(iconv.toEncoding(testString, "latin1").toString("binary"), iconv.encode(testString, "latin1").toString("binary")); 60 | assert.strictEqual(iconv.fromEncoding(Buffer.from(testStringLatin1), "latin1"), iconv.decode(Buffer.from(testStringLatin1), "latin1")); 61 | }); 62 | 63 | it("handles Object & Array prototypes monkey patching", function() { 64 | Object.prototype.permits = function() {}; 65 | Array.prototype.sample2 = function() {}; 66 | 67 | iconv._codecDataCache = {}; // Clean up cache so that all encodings are loaded. 
68 | 69 | assert.strictEqual(iconv.decode(Buffer.from("abc"), "gbk"), "abc"); 70 | assert.strictEqual(iconv.decode(Buffer.from("abc"), "win1251"), "abc"); 71 | assert.strictEqual(iconv.decode(Buffer.from("abc"), "utf7"), "abc"); 72 | assert.strictEqual(iconv.decode(Buffer.from("abc"), "utf8"), "abc"); 73 | 74 | assert.strictEqual(iconv.encode("abc", "gbk").toString(), "abc"); 75 | assert.strictEqual(iconv.encode("abc", "win1251").toString(), "abc"); 76 | assert.strictEqual(iconv.encode("abc", "utf7").toString(), "abc"); 77 | assert.strictEqual(iconv.encode("abc", "utf8").toString(), "abc"); 78 | 79 | delete Object.prototype.permits; 80 | delete Array.prototype.sample2; 81 | }); 82 | 83 | it("handles encoding untranslatable characters correctly", function() { 84 | // Regression #162 85 | assert.strictEqual(iconv.encode("外国人", "latin1").toString(), "???"); 86 | }); 87 | }); 88 | 89 | describe("Canonicalize encoding function", function() { 90 | it("works with numbers directly", function() { 91 | assert.equal(iconv._canonicalizeEncoding(955), "955"); 92 | }); 93 | 94 | it("correctly strips year and non-alpha chars", function() { 95 | assert.equal(iconv._canonicalizeEncoding("ISO_8859-5:1988"), "iso88595"); 96 | }); 97 | }); 98 | -------------------------------------------------------------------------------- /test/mocha.opts: -------------------------------------------------------------------------------- 1 | --check-leaks 2 | --grep ^(?!Full). -------------------------------------------------------------------------------- /test/performance.js: -------------------------------------------------------------------------------- 1 | 2 | if (module.parent) // Skip this file from testing. 
3 | return; 4 | 5 | var iconv = require('iconv'); 6 | var iconv_lite = require("../"); 7 | 8 | var encoding = process.argv[2] || "windows-1251"; 9 | var convertTimes = 10000; 10 | 11 | var encodingStrings = { 12 | 'windows-1251': 'This is a test string 32 chars..', 13 | 'gbk': '这是中文字符测试。。!@¥%12', 14 | 'utf8': '这是中文字符测试。。!@¥%12This is a test string 48 chars..', 15 | }; 16 | // Test encoding. 17 | var str = encodingStrings[encoding]; 18 | if (!str) { 19 | throw new Error('Don\'t support ' + encoding + ' performance test.'); 20 | } 21 | for (var i = 0; i < 13; i++) { 22 | str = str + str; 23 | } 24 | 25 | console.log('\n' + encoding + ' charset performance test:'); 26 | console.log("\nEncoding "+str.length+" chars "+convertTimes+" times:"); 27 | 28 | var start = Date.now(); 29 | var converter = new iconv.Iconv("utf8", encoding); 30 | for (var i = 0; i < convertTimes; i++) { 31 | var b = converter.convert(str); 32 | } 33 | var duration = Date.now() - start; 34 | var mbs = convertTimes*b.length/duration/1024; 35 | 36 | console.log("iconv: "+duration+"ms, "+mbs.toFixed(2)+" Mb/s."); 37 | 38 | var start = Date.now(); 39 | for (var i = 0; i < convertTimes; i++) { 40 | var b = iconv_lite.encode(str, encoding); 41 | } 42 | var duration = Date.now() - start; 43 | var mbs = convertTimes*b.length/duration/1024; 44 | 45 | console.log("iconv-lite: "+duration+"ms, "+mbs.toFixed(2)+" Mb/s."); 46 | 47 | 48 | // Test decoding. 
49 | var buf = iconv_lite.encode(str, encoding); 50 | console.log("\nDecoding "+buf.length+" bytes "+convertTimes+" times:"); 51 | 52 | var start = Date.now(); 53 | var converter = new iconv.Iconv(encoding, "utf8"); 54 | for (var i = 0; i < convertTimes; i++) { 55 | var s = converter.convert(buf).toString(); 56 | } 57 | var duration = Date.now() - start; 58 | var mbs = convertTimes*buf.length/duration/1024; 59 | 60 | console.log("iconv: "+duration+"ms, "+mbs.toFixed(2)+" Mb/s."); 61 | 62 | var start = Date.now(); 63 | for (var i = 0; i < convertTimes; i++) { 64 | var s = iconv_lite.decode(buf, encoding); 65 | } 66 | var duration = Date.now() - start; 67 | var mbs = convertTimes*buf.length/duration/1024; 68 | 69 | console.log("iconv-lite: "+duration+"ms, "+mbs.toFixed(2)+" Mb/s."); 70 | 71 | -------------------------------------------------------------------------------- /test/sbcs-test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'), 2 | unorm = require('unorm'), 3 | Buffer = require('safer-buffer').Buffer, 4 | iconv = require(__dirname+'/../'), 5 | Iconv = require('iconv').Iconv; 6 | 7 | function convertWithDefault(converter, buf, def) { 8 | var res = converter.convert(buf); 9 | return res.length > 0 ? 
res : def; 10 | } 11 | 12 | var aliases = { 13 | armscii8: "ARMSCII-8", 14 | georgianacademy: "GEORGIAN-ACADEMY", 15 | georgianps: "GEORGIAN-PS", 16 | iso646cn: "ISO646-CN", 17 | iso646jp: "ISO646-JP", 18 | hproman8: "HP-ROMAN8", 19 | } 20 | 21 | function iconvAlias(enc) { 22 | var r; 23 | if (r = /windows(\d+)/.exec(enc)) 24 | return "WINDOWS-"+r[1]; 25 | if (r = /iso8859(\d+)/.exec(enc)) 26 | return "ISO8859-"+r[1]; 27 | if (r = /koi8(\w+)/.exec(enc)) 28 | return "KOI8-"+r[1]; 29 | if (aliases[enc]) 30 | return aliases[enc]; 31 | return enc; 32 | } 33 | 34 | var normalizedEncodings = { windows1255: true, windows1258: true, tcvn: true }; 35 | 36 | var combClass = {'\u0327': 202, '\u0323': 220, '\u031B': 216}; // Combining class of unicode characters. 37 | for (var i = 0x300; i < 0x315; i++) combClass[String.fromCharCode(i)] = 230; 38 | 39 | var iconvEquivChars = { 40 | cp1163: {'\u00D0': '\u0110', '\u203E': '\u00AF'}, 41 | } 42 | 43 | 44 | function swapBytes(buf) { for (var i = 0; i < buf.length; i+=2) buf.writeUInt16LE(buf.readUInt16BE(i), i); return buf; } 45 | function spacify2(str) { return str.replace(/(..)/g, "$1 ").trim(); } 46 | function spacify4(str) { return str.replace(/(....)/g, "$1 ").trim(); } 47 | function strToHex(str) { return spacify4(swapBytes(Buffer.from(str, 'ucs2')).toString('hex')); } 48 | 49 | // Generate tests for all SBCS encodings. 50 | iconv.encode('', 'utf8'); // Load all encodings. 51 | 52 | 53 | var sbcsEncodingTests = {}; 54 | describe("Full SBCS encoding tests", function() { 55 | this.timeout(10000); 56 | 57 | for (var enc in iconv.encodings) 58 | if (iconv.encodings[enc].type === '_sbcs') (function(enc) { 59 | var iconvName = iconvAlias(enc), 60 | testEncName = enc + ((enc !== iconvName) ? 
" (" + iconvName + ")" : ""); 61 | 62 | it("Decode SBCS encoding " + testEncName, function() { 63 | try { 64 | var conv = new Iconv(iconvName, "utf-8//IGNORE"); 65 | } catch (e) { 66 | this.skip(); 67 | } 68 | var errors = []; 69 | for (var i = 0; i < 0x100; i++) { 70 | var buf = Buffer.from([i]); 71 | var strActual = iconv.decode(buf, enc); 72 | var strExpected = convertWithDefault(conv, buf, iconv.defaultCharUnicode).toString(); 73 | 74 | if (strActual != strExpected) 75 | errors.push({input: buf.toString('hex'), strExpected: strExpected, strActual: strActual}); 76 | } 77 | if (errors.length > 0) 78 | assert.fail(null, null, "Decoding mismatch: | | | | \n" 79 | + errors.map(function(err) { 80 | return " " + spacify2(err.input) + " | " + strToHex(err.strExpected) + " | " + strToHex(err.strActual) + " | " + 81 | err.strExpected + " | " + err.strActual; 82 | }).join("\n") + "\n "); 83 | }); 84 | 85 | it("Encode SBCS encoding " + testEncName, function() { 86 | try { 87 | var conv = new Iconv("utf-8", iconvName + "//IGNORE"); 88 | } catch (e) { 89 | this.skip(); 90 | } 91 | var errors = []; 92 | 93 | for (var i = 0; i < 0xFFF0; i++) { 94 | if (i == 0xD800) i = 0xF900; // Skip surrogates & private use 95 | 96 | var str = String.fromCharCode(i); 97 | var strExpected = convertWithDefault(conv, str, Buffer.from(iconv.defaultCharSingleByte)).toString('hex'); 98 | var strActual = iconv.encode(str, enc).toString('hex'); 99 | 100 | if (strExpected == strActual) 101 | continue; 102 | 103 | // We are not supporting unicode normalization/decomposition of input, so skip it. 
104 | // (when single unicode char results in >1 encoded chars because of diacritics) 105 | if (normalizedEncodings[enc] && strActual == iconv.defaultCharSingleByte.charCodeAt(0).toString(16)) { 106 | var strDenormStrict = unorm.nfd(str); // Strict decomposition 107 | if (strExpected == iconv.encode(strDenormStrict, enc).toString('hex')) 108 | continue; 109 | 110 | var strDenorm = unorm.nfkd(str); // Check also compat decomposition. 111 | if (strExpected == iconv.encode(strDenorm, enc).toString('hex')) 112 | continue; 113 | 114 | // Try semicomposition if we have 2 combining characters. 115 | if (strDenorm.length == 3 && !combClass[strDenorm[0]] && combClass[strDenorm[1]] && combClass[strDenorm[2]]) { 116 | // Semicompose without swapping. 117 | var strDenorm2 = unorm.nfc(strDenorm[0] + strDenorm[1]) + strDenorm[2]; 118 | if (strExpected == iconv.encode(strDenorm2, enc).toString('hex')) 119 | continue; 120 | 121 | // Swap combining characters if they have different combining classes, making swap unicode-equivalent. 122 | var strDenorm3 = unorm.nfc(strDenorm[0] + strDenorm[2]) + strDenorm[1]; 123 | if (strExpected == iconv.encode(strDenorm3, enc).toString('hex')) 124 | if (combClass[strDenorm[1]] != combClass[strDenorm[2]]) 125 | continue; 126 | else 127 | // In theory, if combining classes are the same, we can not swap them. But iconv thinks otherwise. 128 | // So we skip this too. 129 | continue; 130 | } 131 | } 132 | 133 | // Iconv sometimes treats some characters as equivalent. Check it and skip. 
134 | if (iconvEquivChars[enc] && iconvEquivChars[enc][str] && 135 | strExpected == iconv.encode(iconvEquivChars[enc][str], enc).toString('hex')) 136 | continue; 137 | 138 | errors.push({input: strToHex(str), inputChar: str, strExpected: strExpected, strActual: strActual}); 139 | } 140 | 141 | if (errors.length > 0) 142 | assert.fail(null, null, "Encoding mismatch: | | | \n" 143 | + errors.map(function(err) { 144 | return " " + err.input + " | " + err.inputChar + " | " + spacify2(err.strExpected) + " | " + spacify2(err.strActual); 145 | }).join("\n") + "\n "); 146 | }); 147 | 148 | /* 149 | // TODO: Implement unicode composition. After that, this test will be meaningful. 150 | 151 | // Create a large random text. 152 | var buf2 = Buffer.alloc(100); 153 | for (var i = 0; i < buf2.length; i++) 154 | buf2[i] = buf[(Math.random()*buf.length) | 0]; 155 | 156 | // Check both encoding and decoding. 157 | assert.strictEqual(JSON.stringify(iconv.decode(buf2, enc)), JSON.stringify(str = conv.convert(buf2).toString())); 158 | assert.strictEqual(iconv.encode(str, enc).toString('hex'), convBack.convert(Buffer.from(str)).toString('hex')); 159 | */ 160 | })(enc); 161 | }); 162 | 163 | -------------------------------------------------------------------------------- /test/shiftjis-test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'), 2 | Buffer = require('safer-buffer').Buffer, 3 | iconv = require(__dirname + '/../'); 4 | 5 | describe("ShiftJIS tests", function() { 6 | it("ShiftJIS correctly encoded/decoded", function() { 7 | var testString = "中文abc", //unicode contains ShiftJIS-code and ascii 8 | testStringBig5Buffer = Buffer.from([0x92, 0x86, 0x95, 0xb6, 0x61, 0x62, 0x63]), 9 | testString2 = '測試', 10 | testStringBig5Buffer2 = Buffer.from([0x91, 0xaa, 0x8e, 0x8e]); 11 | 12 | assert.strictEqual(iconv.encode(testString, "shiftjis").toString('hex'), testStringBig5Buffer.toString('hex')); 13 | 
assert.strictEqual(iconv.decode(testStringBig5Buffer, "shiftjis"), testString); 14 | assert.strictEqual(iconv.encode(testString2, 'shiftjis').toString('hex'), testStringBig5Buffer2.toString('hex')); 15 | assert.strictEqual(iconv.decode(testStringBig5Buffer2, 'shiftjis'), testString2); 16 | }); 17 | 18 | it("ShiftJIS extended chars are decoded, but not encoded", function() { 19 | var buf = Buffer.from('ed40eefceeef', 'hex'), str = "纊"ⅰ", res = "fa5cfa57fa40", // repeated block (these same chars are repeated in the different place) 20 | buf2 = Buffer.from('f040f2fcf940', 'hex'), str2 = "", res2 = "3f3f3f"; // non-repeated, UA block. 21 | 22 | assert.strictEqual(iconv.decode(buf, "shiftjis"), str); 23 | assert.strictEqual(iconv.decode(buf2, "shiftjis"), str2); 24 | 25 | assert.strictEqual(iconv.encode(str, "shiftjis").toString('hex'), res); 26 | assert.strictEqual(iconv.encode(str2, "shiftjis").toString('hex'), res2); 27 | }); 28 | 29 | it("ShiftJIS includes extensions", function() { 30 | assert.strictEqual(iconv.decode(Buffer.from('8740', 'hex'), 'shiftjis'), '①'); 31 | assert.strictEqual(iconv.encode('①', 'shiftjis').toString('hex'), '8740'); 32 | }); 33 | }); 34 | -------------------------------------------------------------------------------- /test/turkish-test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'), 2 | Buffer = require('safer-buffer').Buffer, 3 | iconv = require(__dirname+'/../'); 4 | 5 | var ascii = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'+ 6 | ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f'; 7 | 8 | var encodings = [{ 9 | name: "windows1254", 10 | variations: ['windows-1254', 'win-1254', 'win1254', 'cp1254', 'cp-1254', 1254], 11 | strings: { 12 | empty: "", 13 | ascii: ascii, 14 | turkish: 
"€‚ƒ„…†‡ˆ‰Š‹Œ‘’“”•–—˜™š›œŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏĞÑÒÓÔÕÖרÙÚÛÜİŞßàáâãäåæçèéêëìíîïğñòóôõö÷øùúûüışÿ", 15 | untranslatable: "\x81\x8d\x8e\x8f\x90\x9d\x9e" 16 | }, 17 | encodedStrings: { 18 | empty: Buffer.from(''), 19 | ascii: Buffer.from(ascii, 'binary'), 20 | turkish: Buffer.from( 21 | '\x80\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c' + 22 | '\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9f' + 23 | '\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xae\xaf' + 24 | '\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf' + 25 | '\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf' + 26 | '\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf' + 27 | '\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef' + 28 | '\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff', 29 | 'binary'), 30 | } 31 | }, { 32 | name: "iso88599", 33 | variations: ['iso-8859-9', 'turkish', 'turkish8', 'cp28599', 'cp-28599', 28599], 34 | strings: { 35 | empty: "", 36 | ascii: ascii, 37 | turkish: "\xa0¡¢£¤¥¦§¨©ª«¬\xad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏĞÑÒÓÔÕÖרÙÚÛÜİŞßàáâãäåæçèéêëìíîïğñòóôõö÷øùúûüışÿ", 38 | untranslatable: '' 39 | }, 40 | encodedStrings: { 41 | empty: Buffer.from(''), 42 | ascii: Buffer.from(ascii, 'binary'), 43 | turkish: Buffer.from( 44 | '\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf' + 45 | '\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf' + 46 | '\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf' + 47 | '\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf' + 48 | '\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef' + 49 | '\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff', 50 | 'binary') 51 | } 52 | }]; 53 | 54 | describe("Test Turkish encodings", function() { 55 | encodings.forEach(function(encoding) { 56 | var enc = encoding.variations[0]; 57 | var key = "turkish"; 58 | describe(encoding.name+":", 
function() { 59 | it("Convert from buffer", function() { 60 | for (var key in encoding.encodedStrings) 61 | assert.strictEqual(iconv.decode(encoding.encodedStrings[key], enc), 62 | encoding.strings[key]); 63 | }); 64 | 65 | it("Convert to buffer", function() { 66 | for (var key in encoding.encodedStrings) 67 | assert.strictEqual(iconv.encode(encoding.strings[key], enc).toString('binary'), 68 | encoding.encodedStrings[key].toString('binary')); 69 | }); 70 | 71 | it("Try different variations of encoding", function() { 72 | encoding.variations.forEach(function(enc) { 73 | assert.strictEqual(iconv.decode(encoding.encodedStrings[key], enc), encoding.strings[key]); 74 | assert.strictEqual(iconv.encode(encoding.strings[key], enc).toString('binary'), encoding.encodedStrings[key].toString('binary')); 75 | }); 76 | }); 77 | 78 | it("Untranslatable chars are converted to defaultCharSingleByte", function() { 79 | var expected = encoding.strings.untranslatable.split('').map(function(c) {return iconv.defaultCharSingleByte; }).join(''); 80 | assert.strictEqual(iconv.encode(encoding.strings.untranslatable, enc).toString('binary'), expected); // Only '?' characters. 
81 | }); 82 | }); 83 | }); 84 | }); 85 | -------------------------------------------------------------------------------- /test/utf16-test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'), 2 | Buffer = require('safer-buffer').Buffer, 3 | iconv = require(__dirname+'/../'); 4 | 5 | var testStr = "1aя中文☃💩"; 6 | utf16beBuf = Buffer.from([0, 0x31, 0, 0x61, 0x04, 0x4f, 0x4e, 0x2d, 0x65, 0x87, 0x26, 0x03, 0xd8, 0x3d, 0xdc, 0xa9]), 7 | utf16leBuf = Buffer.from(testStr, 'ucs2'), 8 | utf16beBOM = Buffer.from([0xFE, 0xFF]), 9 | utf16leBOM = Buffer.from([0xFF, 0xFE]), 10 | sampleStr = '\n<俄语>данные'; 11 | 12 | describe("UTF-16BE codec", function() { 13 | it("encodes basic strings correctly", function() { 14 | assert.equal(iconv.encode(testStr, 'UTF16-BE').toString('hex'), utf16beBuf.toString('hex')); 15 | }); 16 | 17 | it("decodes basic buffers correctly", function() { 18 | assert.equal(iconv.decode(utf16beBuf, 'UTF16-BE'), testStr); 19 | }); 20 | 21 | it("decodes uneven length buffers with no error", function() { 22 | assert.equal(iconv.decode(Buffer.from([0, 0x61, 0]), 'UTF16-BE'), "a"); 23 | }); 24 | }); 25 | 26 | describe("UTF-16 encoder", function() { 27 | it("uses UTF-16LE and adds BOM when encoding", function() { 28 | assert.equal(iconv.encode(testStr, "utf-16").toString('hex'), utf16leBOM.toString('hex') + utf16leBuf.toString('hex')); 29 | }); 30 | 31 | it("can use other encodings, for example UTF-16LE, with BOM", function() { 32 | assert.equal(iconv.encode(testStr, "utf-16", {use: 'UTF-16LE'}).toString('hex'), 33 | utf16leBOM.toString('hex') + Buffer.from(testStr, 'ucs2').toString('hex')); 34 | }); 35 | }); 36 | 37 | describe("UTF-16 decoder", function() { 38 | it("uses BOM to determine encoding", function() { 39 | assert.equal(iconv.decode(Buffer.concat([utf16leBOM, utf16leBuf]), "utf-16"), testStr); 40 | assert.equal(iconv.decode(Buffer.concat([utf16beBOM, utf16beBuf]), "utf-16"), testStr); 41 | 
}); 42 | 43 | it("handles very short buffers nice", function() { 44 | assert.equal(iconv.decode(Buffer.from([]), 'utf-16'), ''); 45 | assert.equal(iconv.decode(Buffer.from([0x61]), 'utf-16'), ''); 46 | }); 47 | 48 | it("uses spaces when there is no BOM to determine encoding", function() { 49 | assert.equal(iconv.decode(iconv.encode(sampleStr, 'utf-16le'), 'utf-16'), sampleStr); 50 | assert.equal(iconv.decode(iconv.encode(sampleStr, 'utf-16be'), 'utf-16'), sampleStr); 51 | }); 52 | 53 | it("uses UTF-16LE if no BOM and heuristics failed", function() { 54 | assert.equal(iconv.decode(utf16leBuf, 'utf-16'), testStr); 55 | }); 56 | 57 | it("can be given a different default encoding", function() { 58 | assert.equal(iconv.decode(utf16leBuf, 'utf-16', {default: 'utf-16le'}), testStr); 59 | }); 60 | }); 61 | -------------------------------------------------------------------------------- /test/utf32-test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'), 2 | Buffer = require('safer-buffer').Buffer, 3 | iconv = require(__dirname+'/../'), 4 | Iconv = require('iconv').Iconv; 5 | 6 | var testStr = '1aя中文☃💩', 7 | testStr2 = '❝Stray high \uD977😱 and low\uDDDD☔ surrogate values.❞', 8 | utf32leBuf = Buffer.from([0x31, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, 0x4F, 0x04, 0x00, 0x00, 9 | 0x2D, 0x4E, 0x00, 0x00, 0x87, 0x65, 0x00, 0x00, 0x03, 0x26, 0x00, 0x00, 0xA9, 0xF4, 0x01, 0x00]), 10 | utf32beBuf = Buffer.from([0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x04, 0x4F, 11 | 0x00, 0x00, 0x4E, 0x2D, 0x00, 0x00, 0x65, 0x87, 0x00, 0x00, 0x26, 0x03, 0x00, 0x01, 0xF4, 0xA9]), 12 | utf32leBOM = Buffer.from([0xFF, 0xFE, 0x00, 0x00]), 13 | utf32beBOM = Buffer.from([0x00, 0x00, 0xFE, 0xFF]), 14 | utf32leBufWithBOM = Buffer.concat([utf32leBOM, utf32leBuf]), 15 | utf32beBufWithBOM = Buffer.concat([utf32beBOM, utf32beBuf]), 16 | utf32leBufWithInvalidChar = Buffer.concat([utf32leBuf, Buffer.from([0x12, 0x34, 
0x56, 0x78])]), 17 | utf32beBufWithInvalidChar = Buffer.concat([utf32beBuf, Buffer.from([0x12, 0x34, 0x56, 0x78])]), 18 | sampleStr = '\n<俄语>данные'; 19 | 20 | var fromCodePoint = String.fromCodePoint; 21 | 22 | if (!fromCodePoint) { 23 | fromCodePoint = function(cp) { 24 | if (cp < 0x10000) 25 | return String.fromCharCode(cp); 26 | 27 | cp -= 0x10000; 28 | 29 | return String.fromCharCode(0xD800 | (cp >> 10)) + 30 | String.fromCharCode(0xDC00 + (cp & 0x3FF)); 31 | } 32 | } 33 | 34 | var allCharsStr = ''; 35 | var allCharsLEBuf = Buffer.alloc(0x10F800 * 4); 36 | var allCharsBEBuf = Buffer.alloc(0x10F800 * 4); 37 | var skip = 0; 38 | 39 | for (var i = 0; i <= 0x10F7FF; ++i) { 40 | if (i === 0xD800) 41 | skip = 0x800; 42 | 43 | var cp = i + skip; 44 | allCharsStr += fromCodePoint(cp); 45 | allCharsLEBuf.writeUInt32LE(cp, i * 4); 46 | allCharsBEBuf.writeUInt32BE(cp, i * 4); 47 | } 48 | 49 | describe('UTF-32LE codec', function() { 50 | it('encodes basic strings correctly', function() { 51 | assert.equal(iconv.encode(testStr, 'UTF32-LE').toString('hex'), utf32leBuf.toString('hex')); 52 | }); 53 | 54 | it('decodes basic buffers correctly', function() { 55 | assert.equal(iconv.decode(utf32leBuf, 'ucs4le'), testStr); 56 | }); 57 | 58 | it('decodes uneven length buffers with no error', function() { 59 | assert.equal(iconv.decode(Buffer.from([0x61, 0, 0, 0, 0]), 'UTF32-LE'), 'a'); 60 | }); 61 | 62 | it('handles invalid surrogates gracefully', function() { 63 | var encoded = iconv.encode(testStr2, 'UTF32-LE'); 64 | assert.equal(escape(iconv.decode(encoded, 'UTF32-LE')), escape(testStr2)); 65 | }); 66 | 67 | it('handles invalid Unicode codepoints gracefully', function() { 68 | assert.equal(iconv.decode(utf32leBufWithInvalidChar, 'utf-32le'), testStr + '�'); 69 | }); 70 | 71 | it('handles encoding all valid codepoints', function() { 72 | assert.deepEqual(iconv.encode(allCharsStr, 'utf-32le'), allCharsLEBuf); 73 | var nodeIconv = new Iconv('UTF-8', 'UTF-32LE'); 74 | var nodeBuf = 
nodeIconv.convert(allCharsStr); 75 | assert.deepEqual(nodeBuf, allCharsLEBuf); 76 | }); 77 | 78 | it('handles decoding all valid codepoints', function() { 79 | assert.equal(iconv.decode(allCharsLEBuf, 'utf-32le'), allCharsStr); 80 | var nodeIconv = new Iconv('UTF-32LE', 'UTF-8'); 81 | var nodeStr = nodeIconv.convert(allCharsLEBuf).toString('utf8'); 82 | assert.equal(nodeStr, allCharsStr); 83 | }); 84 | }); 85 | 86 | describe('UTF-32BE codec', function() { 87 | it('encodes basic strings correctly', function() { 88 | assert.equal(iconv.encode(testStr, 'UTF32-BE').toString('hex'), utf32beBuf.toString('hex')); 89 | }); 90 | 91 | it('decodes basic buffers correctly', function() { 92 | assert.equal(iconv.decode(utf32beBuf, 'ucs4be'), testStr); 93 | }); 94 | 95 | it('decodes uneven length buffers with no error', function() { 96 | assert.equal(iconv.decode(Buffer.from([0, 0, 0, 0x61, 0]), 'UTF32-BE'), 'a'); 97 | }); 98 | 99 | it('handles invalid surrogates gracefully', function() { 100 | var encoded = iconv.encode(testStr2, 'UTF32-BE'); 101 | assert.equal(escape(iconv.decode(encoded, 'UTF32-BE')), escape(testStr2)); 102 | }); 103 | 104 | it('handles invalid Unicode codepoints gracefully', function() { 105 | assert.equal(iconv.decode(utf32beBufWithInvalidChar, 'utf-32be'), testStr + '�'); 106 | }); 107 | 108 | it('handles encoding all valid codepoints', function() { 109 | assert.deepEqual(iconv.encode(allCharsStr, 'utf-32be'), allCharsBEBuf); 110 | var nodeIconv = new Iconv('UTF-8', 'UTF-32BE'); 111 | var nodeBuf = nodeIconv.convert(allCharsStr); 112 | assert.deepEqual(nodeBuf, allCharsBEBuf); 113 | }); 114 | 115 | it('handles decoding all valid codepoints', function() { 116 | assert.equal(iconv.decode(allCharsBEBuf, 'utf-32be'), allCharsStr); 117 | var nodeIconv = new Iconv('UTF-32BE', 'UTF-8'); 118 | var nodeStr = nodeIconv.convert(allCharsBEBuf).toString('utf8'); 119 | assert.equal(nodeStr, allCharsStr); 120 | }); 121 | }); 122 | 123 | describe('UTF-32 general codec', 
function() { 124 | it('adds BOM when encoding, defaults to UTF-32LE', function() { 125 | assert.equal(iconv.encode(testStr, 'utf-32').toString('hex'), utf32leBOM.toString('hex') + utf32leBuf.toString('hex')); 126 | }); 127 | 128 | it('doesn\'t add BOM and uses UTF-32BE when specified', function() { 129 | assert.equal(iconv.encode(testStr, 'ucs4', {addBOM: false, defaultEncoding: 'ucs4be'}).toString('hex'), utf32beBuf.toString('hex')); 130 | }); 131 | 132 | it('correctly decodes UTF-32LE using BOM', function() { 133 | assert.equal(iconv.decode(utf32leBufWithBOM, 'utf-32'), testStr); 134 | }); 135 | 136 | it('correctly decodes UTF-32LE without BOM', function() { 137 | assert.equal(iconv.decode(iconv.encode(sampleStr, 'utf-32-le'), 'utf-32'), sampleStr); 138 | }); 139 | 140 | it('correctly decodes UTF-32BE using BOM', function() { 141 | assert.equal(iconv.decode(utf32beBufWithBOM, 'utf-32', { stripBOM: false }), '\uFEFF' + testStr); 142 | }); 143 | 144 | it('correctly decodes UTF-32BE without BOM', function() { 145 | assert.equal(iconv.decode(iconv.encode(sampleStr, 'utf-32-be'), 'utf-32'), sampleStr); 146 | }); 147 | }); 148 | 149 | // Utility function to make bad matches easier to visualize. 150 | function escape(s) { 151 | var sb = []; 152 | 153 | for (var i = 0; i < s.length; ++i) { 154 | var cc = s.charCodeAt(i); 155 | 156 | if (32 <= cc && cc < 127 && cc !== 0x5C) 157 | sb.push(s.charAt(i)); 158 | else { 159 | var h = s.charCodeAt(i).toString(16).toUpperCase(); 160 | while (h.length < 4) // No String.repeat in old versions of Node! 
161 | h = '0' + h; 162 | 163 | sb.push('\\u' + h); 164 | } 165 | } 166 | 167 | return sb.join(''); 168 | } 169 | -------------------------------------------------------------------------------- /test/utf7-test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'), 2 | Buffer = require('safer-buffer').Buffer, 3 | iconv = require(__dirname+'/../'); 4 | 5 | // These tests are mostly from https://github.com/kkaefer/utf7 6 | // In case of ambiguity, we do the same as iconv. For example, we encode "optional direct" characters, but leave spaces and \n\r\t as-is. 7 | 8 | describe("UTF-7 codec", function() { 9 | it("encodes correctly", function() { 10 | // Examples from RFC 2152. 11 | assert.equal(iconv.encode('A\u2262\u0391.', 'utf-7').toString(), 'A+ImIDkQ-.'); 12 | assert.equal(iconv.encode('\u65E5\u672C\u8A9E', 'utf-7').toString(), '+ZeVnLIqe-'); 13 | 14 | assert.equal(iconv.encode('Hi Mom -\u263A-!', 'utf-7').toString(), 'Hi Mom -+Jjo--+ACE-'); 15 | 16 | assert.equal(iconv.encode('Item 3 is \u00A31.', 'utf-7').toString(), 'Item 3 is +AKM-1.'); 17 | 18 | // Custom examples that contain more than one mode shift. 19 | assert.equal(iconv.encode('Jyv\u00E4skyl\u00E4', 'utf-7').toString(), 'Jyv+AOQ-skyl+AOQ-'); 20 | assert.equal(iconv.encode('\'\u4F60\u597D\' heißt "Hallo"', 'utf-7').toString(), '\'+T2BZfQ-\' hei+AN8-t +ACI-Hallo+ACI-'); 21 | 22 | // The plus sign is represented as +-. 23 | assert.equal(iconv.encode('Hot + Spicy + Fruity', 'utf-7').toString(), 'Hot +- Spicy +- Fruity'); 24 | 25 | // Slashes in the beginning. 26 | assert.equal(iconv.encode('\uffff\uedca\u9876\u5432\u1fed', 'utf-7').toString(), '+///typh2VDIf7Q-'); 27 | 28 | // + sign around non-ASCII chars 29 | assert.equal(iconv.encode('\u00E4+\u00E4+\u00E4', 'utf-7').toString(), '+AOQAKwDkACsA5A-'); 30 | }); 31 | 32 | it("decodes correctly", function() { 33 | // Examples from RFC 2152. 
34 | assert.equal(iconv.decode(Buffer.from('A+ImIDkQ-.'), 'utf-7'), 'A\u2262\u0391.'); 35 | assert.equal(iconv.decode(Buffer.from('A+ImIDkQ.'), 'utf-7'), 'A\u2262\u0391.'); 36 | 37 | assert.equal(iconv.decode(Buffer.from('+ZeVnLIqe-'), 'utf-7'), '\u65E5\u672C\u8A9E'); 38 | assert.equal(iconv.decode(Buffer.from('+ZeVnLIqe'), 'utf-7'), '\u65E5\u672C\u8A9E'); 39 | 40 | assert.equal(iconv.decode(Buffer.from('Hi Mom -+Jjo--!'), 'utf-7'), 'Hi Mom -\u263A-!'); 41 | assert.equal(iconv.decode(Buffer.from('Hi+ACA-Mom+ACA--+Jjo--+ACE-'), 'utf-7'), 'Hi Mom -\u263A-!'); 42 | assert.equal(iconv.decode(Buffer.from('Item 3 is +AKM-1.'), 'utf-7'), 'Item 3 is \u00A31.'); 43 | assert.equal(iconv.decode(Buffer.from('Item+ACA-3+ACA-is+ACAAow-1.'), 'utf-7'), 'Item 3 is \u00A31.'); 44 | 45 | // Custom examples that contain more than one mode shift. 46 | assert.equal(iconv.decode(Buffer.from('Jyv+AOQ-skyl+AOQ-'), 'utf-7'), 'Jyv\u00E4skyl\u00E4'); 47 | assert.equal(iconv.decode(Buffer.from('Jyv+AOQ-skyl+AOQ'), 'utf-7'), 'Jyv\u00E4skyl\u00E4'); 48 | assert.equal(iconv.decode(Buffer.from('\'+T2BZfQ-\' hei+AN8-t "Hallo"'), 'utf-7'), '\'\u4F60\u597D\' heißt "Hallo"'); 49 | assert.equal(iconv.decode(Buffer.from('\'+T2BZfQ\' hei+AN8-t "Hallo"'), 'utf-7'), '\'\u4F60\u597D\' heißt "Hallo"'); 50 | assert.equal(iconv.decode(Buffer.from('\'+T2BZfQ-\'+ACA-hei+AN8-t+ACAAIg-Hallo+ACI-'), 'utf-7'), '\'\u4F60\u597D\' heißt "Hallo"'); 51 | assert.equal(iconv.decode(Buffer.from('\'+T2BZfQ-\'+ACA-hei+AN8-t+ACAAIg-Hallo+ACI'), 'utf-7'), '\'\u4F60\u597D\' heißt "Hallo"'); 52 | 53 | // The plus sign is represented by +-. 54 | assert.equal(iconv.decode(Buffer.from('Hot +- Spicy +- Fruity'), 'utf-7'), 'Hot + Spicy + Fruity'); 55 | assert.equal(iconv.decode(Buffer.from('Hot+ACAAKwAg-Spicy+ACAAKwAg-Fruity'), 'utf-7'), 'Hot + Spicy + Fruity'); 56 | 57 | // Slashes in the beginning. 
58 | assert.equal(iconv.decode(Buffer.from('+///typh2VDIf7Q-'), 'utf-7'), '\uffff\uedca\u9876\u5432\u1fed'); 59 | assert.equal(iconv.decode(Buffer.from('+///typh2VDIf7Q'), 'utf-7'), '\uffff\uedca\u9876\u5432\u1fed'); 60 | 61 | // + sign around non-ASCII chars 62 | assert.equal(iconv.decode(Buffer.from('+AOQ-+-+AOQ-+-+AOQ-'), 'utf-7'), '\u00E4+\u00E4+\u00E4'); 63 | //assert.equal(iconv.decode(Buffer.from('+AOQ++AOQ+-+AOQ'), 'utf-7'), '\u00E4+\u00E4+\u00E4'); 64 | assert.equal(iconv.decode(Buffer.from('+AOQAKwDkACsA5A-'), 'utf-7'), '\u00E4+\u00E4+\u00E4'); 65 | assert.equal(iconv.decode(Buffer.from('+AOQAKwDkACsA5A'), 'utf-7'), '\u00E4+\u00E4+\u00E4'); 66 | 67 | 68 | // Tests from https://gist.github.com/peteroupc/08c5ecc8131a76062ffe 69 | 70 | assert.equal(iconv.decode(Buffer.from("\r\n\t '!\"#'(),$-%@[]^&=<>;*_`{}./:|?"), 'utf-7'), "\r\n\t '!\"#'(),$-%@[]^&=<>;*_`{}./:|?"); 71 | assert.equal(iconv.decode(Buffer.from("x+--"), 'utf-7'), "x+-"); 72 | assert.equal(iconv.decode(Buffer.from("x+-y"), 'utf-7'), "x+y"); 73 | 74 | // UTF-16 code unit 75 | assert.equal(iconv.decode(Buffer.from("+DEE?"), 'utf-7'), "\u0c41?"); 76 | assert.equal(iconv.decode(Buffer.from("+DEE"), 'utf-7'), "\u0c41"); 77 | 78 | // Surrogate pair 79 | assert.equal(iconv.decode(Buffer.from("+2ADcAA?"), 'utf-7'), "\ud800\udc00?"); 80 | assert.equal(iconv.decode(Buffer.from("+2ADcAA"), 'utf-7'), "\ud800\udc00"); 81 | 82 | // Two UTF-16 code units 83 | assert.equal(iconv.decode(Buffer.from("+AMAA4A?"), 'utf-7'), "\u00c0\u00e0?"); 84 | assert.equal(iconv.decode(Buffer.from("+AMAA4A"), 'utf-7'), "\u00c0\u00e0"); 85 | assert.equal(iconv.decode(Buffer.from("+AMAA4A-Next"), 'utf-7'), "\u00c0\u00e0Next"); 86 | assert.equal(iconv.decode(Buffer.from("+AMAA4A!Next"), 'utf-7'), "\u00c0\u00e0!Next"); 87 | 88 | }); 89 | }); 90 | 91 | describe("UTF-7-IMAP codec", function() { 92 | it("encodes correctly", function() { 93 | // Examples from RFC 2152. 
94 | assert.equal(iconv.encode('A\u2262\u0391.', 'utf-7-imap').toString(), 'A&ImIDkQ-.'); 95 | assert.equal(iconv.encode('\u65E5\u672C\u8A9E', 'utf-7-imap').toString(), '&ZeVnLIqe-'); 96 | assert.equal(iconv.encode('Hi Mom -\u263A-!', 'utf-7-imap').toString(), 'Hi Mom -&Jjo--!'); 97 | assert.equal(iconv.encode('Item 3 is \u00A31.', 'utf-7-imap').toString(), 'Item 3 is &AKM-1.'); 98 | 99 | // Custom examples that contain more than one mode shift. 100 | assert.equal(iconv.encode('Jyv\u00E4skyl\u00E4', 'utf-7-imap').toString(), 'Jyv&AOQ-skyl&AOQ-'); 101 | assert.equal(iconv.encode('\'\u4F60\u597D\' heißt "Hallo"', 'utf-7-imap').toString(), '\'&T2BZfQ-\' hei&AN8-t "Hallo"'); 102 | 103 | // The ampersand sign is represented as &-. 104 | assert.equal(iconv.encode('Hot & Spicy & Fruity', 'utf-7-imap').toString(), 'Hot &- Spicy &- Fruity'); 105 | 106 | // Slashes are converted to commas. 107 | assert.equal(iconv.encode('\uffff\uedca\u9876\u5432\u1fed', 'utf-7-imap').toString(), '&,,,typh2VDIf7Q-'); 108 | 109 | // & sign around non-ASCII chars 110 | assert.equal(iconv.encode('\u00E4&\u00E4&\u00E4', 'utf-7-imap').toString(), '&AOQ-&-&AOQ-&-&AOQ-'); 111 | }); 112 | 113 | it("decodes correctly", function() { 114 | // Examples from RFC 2152. 115 | assert.equal(iconv.decode(Buffer.from('A&ImIDkQ-.'), 'utf-7-imap'), 'A\u2262\u0391.'); 116 | assert.equal(iconv.decode(Buffer.from('&ZeVnLIqe-'), 'utf-7-imap'), '\u65E5\u672C\u8A9E'); 117 | assert.equal(iconv.decode(Buffer.from('Hi Mom -&Jjo--!'), 'utf-7-imap'), 'Hi Mom -\u263A-!'); 118 | assert.equal(iconv.decode(Buffer.from('Item 3 is &AKM-1.'), 'utf-7-imap'), 'Item 3 is \u00A31.'); 119 | 120 | // Custom examples that contain more than one mode shift. 
121 | assert.equal(iconv.decode(Buffer.from('Jyv&AOQ-skyl&AOQ-'), 'utf-7-imap'), 'Jyv\u00E4skyl\u00E4'); 122 | assert.equal(iconv.decode(Buffer.from('\'&T2BZfQ-\' hei&AN8-t "Hallo"'), 'utf-7-imap'), '\'\u4F60\u597D\' heißt "Hallo"'); 123 | 124 | // The ampersand sign is represented by &-. 125 | assert.equal(iconv.decode(Buffer.from('Hot &- Spicy &- Fruity'), 'utf-7-imap'), 'Hot & Spicy & Fruity'); 126 | 127 | // Slashes are converted to commas. 128 | assert.equal(iconv.decode(Buffer.from('&,,,typh2VDIf7Q-'), 'utf-7-imap'), '\uffff\uedca\u9876\u5432\u1fed'); 129 | 130 | // & sign around non-ASCII chars 131 | assert.equal(iconv.decode(Buffer.from('&AOQ-&-&AOQ-&-&AOQ-'), 'utf-7-imap'), '\u00E4&\u00E4&\u00E4'); 132 | }); 133 | }); 134 | -------------------------------------------------------------------------------- /test/webpack/basic-test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert').strict; 2 | 3 | describe("iconv-lite", function() { 4 | var iconv; 5 | 6 | it("can be require-d successfully", function() { 7 | // Emulate more complex environments that are both web- and node.js-compatible (e.g. Electron renderer process). 8 | // See https://github.com/ashtuchkin/iconv-lite/issues/204 for details. 
9 | process.versions.node = "12.0.0"; 10 | 11 | iconv = require(".").iconv; 12 | }); 13 | 14 | it("does not support streams by default", function() { 15 | assert(!iconv.supportsStreams); 16 | 17 | assert.throws(function() { 18 | iconv.encodeStream() 19 | }, /Streaming API is not enabled/); 20 | }); 21 | 22 | it("can encode/decode sbcs encodings", function() { 23 | var buf = iconv.encode("abc", "win1251"); 24 | var str = iconv.decode(buf, "win1251"); 25 | assert.equal(str, "abc"); 26 | }); 27 | 28 | it("can encode/decode dbcs encodings", function() { 29 | var buf = iconv.encode("abc", "shiftjis"); 30 | var str = iconv.decode(buf, "shiftjis"); 31 | assert.equal(str, "abc"); 32 | }); 33 | 34 | it("can encode/decode internal encodings", function() { 35 | var buf = iconv.encode("💩", "utf8"); 36 | var str = iconv.decode(buf, "utf8"); 37 | assert.equal(str, "💩"); 38 | }); 39 | 40 | it("supports passing Uint8Array to decode for all encodings", function() { 41 | iconv.encode('', 'utf8'); // Load all encodings. 
42 | 43 | var encodings = Object.keys(iconv.encodings) 44 | encodings 45 | .filter(encoding => 46 | !encoding.startsWith('_') 47 | // https://github.com/ashtuchkin/iconv-lite/issues/231 48 | && encoding !== 'base64' && encoding !== 'hex' 49 | ) 50 | .forEach(function(encoding) { 51 | var expected = 'Lorem ipsum'; 52 | 53 | var encoded = iconv.encode(expected, encoding); 54 | var uint8Array = Uint8Array.from(encoded); 55 | 56 | var actual = iconv.decode(uint8Array, encoding); 57 | assert.equal(actual, expected, encoding); 58 | }) 59 | }); 60 | }); 61 | 62 | describe("stream module", function() { 63 | it("is not included in the bundle", function() { 64 | var stream_module_name = "stream"; 65 | assert.throws(function() { return require(stream_module_name) }, /Cannot find module 'stream'/); 66 | }); 67 | }); -------------------------------------------------------------------------------- /test/webpack/index.js: -------------------------------------------------------------------------------- 1 | 2 | // Reexport iconv-lite for tests. 3 | exports.iconv = require('iconv-lite'); -------------------------------------------------------------------------------- /test/webpack/karma.conf.js: -------------------------------------------------------------------------------- 1 | // Karma configuration 2 | // Generated on Sat May 23 2020 18:02:48 GMT-0400 (Eastern Daylight Time) 3 | process.env.CHROME_BIN = require('puppeteer').executablePath() 4 | 5 | module.exports = function(config) { 6 | config.set({ 7 | 8 | // base path that will be used to resolve all patterns (eg. 
files, exclude) 9 | basePath: '', 10 | 11 | 12 | // frameworks to use 13 | // available frameworks: https://npmjs.org/browse/keyword/karma-adapter 14 | frameworks: ['mocha'], 15 | 16 | 17 | // list of files / patterns to load in the browser 18 | files: [ 19 | { pattern: '*test.js', watched: false }, 20 | ], 21 | 22 | // preprocess matching files before serving them to the browser 23 | // available preprocessors: https://npmjs.org/browse/keyword/karma-preprocessor 24 | preprocessors: { 25 | '*test.js': ['webpack'] 26 | }, 27 | 28 | webpack: { 29 | "mode": "development", 30 | // karma watches the test entry points 31 | // (you don't need to specify the entry option) 32 | // webpack watches dependencies 33 | // webpack configuration 34 | }, 35 | 36 | webpackMiddleware: { 37 | // Don't watch. 38 | "watchOptions": { 39 | ignored: ["**/*"], 40 | }, 41 | }, 42 | 43 | // test results reporter to use 44 | // possible values: 'dots', 'progress' 45 | // available reporters: https://npmjs.org/browse/keyword/karma-reporter 46 | reporters: ['progress'], 47 | 48 | 49 | // web server port 50 | port: 9876, 51 | 52 | 53 | // enable / disable colors in the output (reporters and logs) 54 | colors: true, 55 | 56 | 57 | // level of logging 58 | // possible values: config.LOG_DISABLE || config.LOG_ERROR || config.LOG_WARN || config.LOG_INFO || config.LOG_DEBUG 59 | logLevel: config.LOG_INFO, 60 | 61 | 62 | // enable / disable watching file and executing tests whenever any file changes 63 | autoWatch: false, 64 | 65 | 66 | // start these browsers 67 | // available browser launchers: https://npmjs.org/browse/keyword/karma-launcher 68 | browsers: ['ChromeHeadless'], 69 | 70 | 71 | // Continuous Integration mode 72 | // if true, Karma captures browsers, runs the tests and exits 73 | singleRun: true, 74 | 75 | // Concurrency level 76 | // how many browsers should be started simultaneously 77 | concurrency: Infinity 78 | }) 79 | } 80 | 
-------------------------------------------------------------------------------- /test/webpack/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "webpack-test", 3 | "private": true, 4 | "version": "1.0.0", 5 | "scripts": { 6 | "!note1": "NOTE: We do `npm pack` of the main iconv-lite package followed by installing it to create a copy of the package (not symlink).", 7 | "!note2": "This is needed because webpack4/watchpack1.7 crashes when trying to enumerate circular symlink.", 8 | "preinstall": "mv $(npm pack -pq ../../) iconv-lite.tgz", 9 | "postinstall": "rm iconv-lite.tgz", 10 | "test": "karma start" 11 | }, 12 | "devDependencies": { 13 | "karma": "^5.0.9", 14 | "karma-chrome-launcher": "^3.1.0", 15 | "karma-mocha": "^2.0.1", 16 | "karma-webpack": "^4.0.2", 17 | "mocha": "^7.2.0", 18 | "puppeteer": "^4.0.0", 19 | "webpack": "^4.43.0" 20 | }, 21 | "dependencies": { 22 | "iconv-lite": "file:iconv-lite.tgz" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /test/webpack/stream-test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert').strict; 2 | 3 | describe("iconv-lite with streams", function() { 4 | var iconv = require(".").iconv; 5 | 6 | it("supports streams when explicitly enabled", function() { 7 | iconv.enableStreamingAPI(require('stream')); 8 | assert(iconv.supportsStreams); 9 | }); 10 | 11 | it("can encode/decode in streaming mode", function(done) { 12 | var stream1 = iconv.encodeStream("win1251"); 13 | var stream2 = iconv.decodeStream("win1251"); 14 | stream1.pipe(stream2); 15 | 16 | stream1.end("abc"); 17 | stream2.collect(function(err, str) { 18 | if (err) 19 | return done(err); 20 | 21 | assert.equal(str, "abc"); 22 | done(null); 23 | }); 24 | }); 25 | }); 26 | --------------------------------------------------------------------------------