├── .gitignore ├── LICENSE ├── README.md ├── dist ├── README.md ├── utfx-embeddable.js ├── utfx.js └── utfx.min.js ├── index.js ├── package.json ├── src ├── README.md ├── header.txt ├── utfx.js ├── wrap_embeddable.js └── wrap_standalone.js ├── tests ├── bench.js └── suite.js └── utfx.png /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | npm-debug.log 3 | .idea/ 4 | raw/ 5 | doco/ 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![utfx - A compact library to encode, decode and convert UTF8 / UTF16 in JavaScript.](https://raw.github.com/dcodeIO/utfx/master/utfx.png) 2 | ==== 3 | **utfx** is a compact library to encode, decode and convert UTF8 / UTF16 in JavaScript using arbitrary sources and 4 | destinations through the use of successively called functions, basically eliminating [memory overhead](https://github.com/dcodeIO/utfx/wiki#faq). 5 | 6 | The standalone library is also capable of using binary strings and arrays (with the usual overhead) and provides 7 | polyfills for `String.fromCodePoint` and `String#codePointAt`. 8 | 9 | API 10 | --- 11 | 12 | ### encodeUTF8(src, dst) 13 | 14 | Encodes UTF8 code points to UTF8 bytes. 15 | 16 | | Parameter | Type | Description 17 | |-----------------|-----------------|--------------- 18 | | src | *function():(number | null) | number* | Code points source, either as a function returning the next code point respectively `null` if there are no more code points left or a single numeric code point. 19 | | dst | *function(number)* | Bytes destination as a function successively called with the next byte 20 | 21 | ### decodeUTF8(src, dst) 22 | 23 | Decodes UTF8 bytes to UTF8 code points. 24 | 25 | | Parameter | Type | Description 26 | |-----------------|-----------------|--------------- 27 | | src | *function():(number | null)* | Bytes source as a function returning the next byte respectively `null` if there are no more bytes left. 28 | | dst | *function(number)* | Code points destination as a function successively called with each decoded code point. 29 | | **@throws** | *RangeError* | If a starting byte is invalid in UTF8 30 | | **@throws** | *Error* | If the last sequence is truncated. Has an array property `bytes` holding the remaining bytes. 31 | 32 | ### UTF16toUTF8(src, dst) 33 | 34 | Converts UTF16 characters to UTF8 code points. 35 | 36 | | Parameter | Type | Description 37 | |-----------------|-----------------|--------------- 38 | | src | *function():(number | null)* | Characters source as a function returning the next char code respectively `null` if there are no more characters left. 39 | | dst | *function(number)* | Code points destination as a function successively called with each converted code point. 40 | 41 | ### UTF8toUTF16(src, dst) 42 | 43 | Converts UTF8 code points to UTF16 characters. 44 | 45 | | Parameter | Type | Description 46 | |-----------------|-----------------|--------------- 47 | | src | *function():(number | null) | number* | Code points source, either as a function returning the next code point respectively `null` if there are no more code points left or a single numeric code point. 48 | | dst | *function(number)* | Characters destination as a function successively called with each converted char code. 49 | | **@throws** | *RangeError* | If a code point is out of range 50 | 51 | ### encodeUTF16toUTF8(src, dst) 52 | 53 | Converts and encodes UTF16 characters to UTF8 bytes. 54 | 55 | | Parameter | Type | Description 56 | |-----------------|-----------------|--------------- 57 | | src | *function():(number | null)* | Characters source as a function returning the next char code respectively `null` if there are no more characters left. 58 | | dst | *function(number)* | Bytes destination as a function successively called with the next byte. 59 | 60 | ### decodeUTF8toUTF16(src, dst) 61 | 62 | Decodes and converts UTF8 bytes to UTF16 characters. 63 | 64 | | Parameter | Type | Description 65 | |-----------------|-----------------|--------------- 66 | | src | *function():(number | null)* | Bytes source as a function returning the next byte respectively `null` if there are no more bytes left. 67 | | dst | *function(number)* | Characters destination as a function successively called with each converted char code. 68 | | **@throws** | *RangeError* | If a starting byte is invalid in UTF8 69 | | **@throws** | *Error* | If the last sequence is truncated. Has an array property `bytes` holding the remaining bytes. 70 | 71 | ### assertByte(b) 72 | 73 | Asserts a byte value. 74 | 75 | | Parameter | Type | Description 76 | |-----------------|-----------------|--------------- 77 | | b | *number* | 8bit byte value 78 | | **@returns** | *number* | Valid byte value 79 | | **@throws** | *TypeError* | If the byte value is invalid 80 | | **@throws** | *RangeError* | If the byte value is out of range 81 | 82 | ### assertCharCode(c) 83 | 84 | Asserts an UTF16 char code. 85 | 86 | | Parameter | Type | Description 87 | |-----------------|-----------------|--------------- 88 | | c | *number* | UTF16 char code 89 | | **@returns** | *number* | Valid char code 90 | | **@throws** | *TypeError* | If the char code is invalid 91 | | **@throws** | *RangeError* | If the char code is out of range 92 | 93 | ### assertCodePoint(cp) 94 | 95 | Asserts an UTF8 code point. 96 | 97 | | Parameter | Type | Description 98 | |-----------------|-----------------|--------------- 99 | | cp | *number* | UTF8 code point 100 | | **@returns** | *number* | Valid code point 101 | | **@throws** | *TypeError* | If the code point is invalid 102 | | **@throws** | *RangeError* | If the code point is out of range 103 | 104 | ### calculateCodePoint(cp) 105 | 106 | Calculates the byte length of an UTF8 code point. 107 | 108 | | Parameter | Type | Description 109 | |-----------------|-----------------|--------------- 110 | | cp | *number* | UTF8 code point 111 | | **@returns** | *number* | Byte length 112 | 113 | ### calculateUTF8(src) 114 | 115 | Calculates the number of UTF8 bytes required to store UTF8 code points. 116 | 117 | | Parameter | Type | Description 118 | |-----------------|-----------------|--------------- 119 | | src | *function():(number | null)* | Code points source as a function returning the next code point respectively `null` if there are no more code points left. 120 | | **@returns** | *number* | The number of UTF8 bytes required 121 | 122 | ### calculateUTF16asUTF8(src) 123 | 124 | Calculates the number of UTF8 code points respectively UTF8 bytes required to store UTF16 char codes. 125 | 126 | | Parameter | Type | Description 127 | |-----------------|-----------------|--------------- 128 | | src | *function():(number | null)* | Characters source as a function returning the next char code respectively `null` if there are no more characters left. 129 | | **@returns** | *!Array.<number>* | The number of UTF8 code points at index 0 and the number of UTF8 bytes required at index 1. 130 | 131 | ### arraySource(a) 132 | 133 | Creates a source function for an array. 134 | 135 | | Parameter | Type | Description 136 | |-----------------|-----------------|--------------- 137 | | a | *!Array.<number>* | Array to read from 138 | | **@returns** | *function():(number | null)* | Source function returning the next value respectively `null` if there are no more values left. 139 | | **@throws** | *TypeError* | If the argument is invalid 140 | 141 | ### arrayDestination(a) 142 | 143 | Creates a destination function for an array. 144 | 145 | | Parameter | Type | Description 146 | |-----------------|-----------------|--------------- 147 | | a | *!Array.<number>* | Array to write to 148 | | **@returns** | *function(number)* | Destination function successively called with the next value. 149 | | **@throws** | *TypeError* | If the argument is invalid 150 | 151 | ### stringSource(s) 152 | 153 | Creates a source function for a string. 154 | 155 | | Parameter | Type | Description 156 | |-----------------|-----------------|--------------- 157 | | s | *string* | String to read from 158 | | **@returns** | *function():(number | null)* | Source function returning the next char code respectively `null` if there are no more characters left. 159 | | **@throws** | *TypeError* | If the argument is invalid 160 | 161 | ### stringDestination() 162 | 163 | Creates a destination function for a string. 164 | 165 | | Parameter | Type | Description 166 | |-----------------|-----------------|--------------- 167 | | **@returns** | *function(number=):(undefined | string)* | Destination function successively called with the next char code. Returns the final string when called without arguments. 168 | 169 | ### fromCodePoint(var_args) 170 | 171 | A polyfill for `String.fromCodePoint`. 172 | 173 | | Parameter | Type | Description 174 | |-----------------|-----------------|--------------- 175 | | var_args | *...number* | Code points 176 | | **@returns** | *string* | JavaScript string 177 | | **@throws** | *TypeError* | If arguments are invalid or a code point is invalid 178 | | **@throws** | *RangeError* | If a code point is out of range 179 | 180 | ### codePointAt(s, i) 181 | 182 | A polyfill for `String#codePointAt`. 183 | 184 | | Parameter | Type | Description 185 | |-----------------|-----------------|--------------- 186 | | s | *string* | JavaScript string 187 | | i | *number* | Index 188 | | **@returns** | *number | undefined* | Code point or `undefined` if `i` is out of range 189 | | **@throws** | *TypeError* | If arguments are invalid 190 | 191 | ### polyfill(override=) 192 | 193 | Installs utfx as a polyfill for `String.fromCodePoint` and `String#codePointAt` if not implemented. 194 | 195 | | Parameter | Type | Description 196 | |-----------------|-----------------|--------------- 197 | | override | *boolean* | Overrides an existing implementation if `true`, defaults to `false` 198 | | **@returns** | *!Object.<string,*>* | utfx namespace 199 | 200 | Usage 201 | ----- 202 | * **node.js**: `npm install utfx` 203 | 204 | ```js 205 | var utfx = require("utfx"); 206 | ... 207 | ``` 208 | 209 | * **Browser**: `` 210 | 211 | ```js 212 | var utfx = dcodeIO.utfx; 213 | ... 214 | ``` 215 | 216 | * **Require.js/AMD** 217 | 218 | ```js 219 | require.config({ 220 | "paths": { 221 | "utfx": "/path/to/utfx.min.js" 222 | } 223 | }); 224 | require(["utfx"], function(utfx) { 225 | ... 226 | } 227 | ``` 228 | 229 | Downloads 230 | --------- 231 | * [Distributions](https://github.com/dcodeIO/utfx/tree/master/dist) 232 | 233 | FAQ and examples 234 | ---------------- 235 | * [Wiki](https://github.com/dcodeIO/utfx/wiki) 236 | 237 | License 238 | ------- 239 | Apache License, Version 2.0 240 | -------------------------------------------------------------------------------- /dist/README.md: -------------------------------------------------------------------------------- 1 | Distributions 2 | ------------- 3 | * **[utfx.js](https://raw.githubusercontent.com/dcodeIO/utfx/master/dist/utfx.js)** 4 | is the standalone library. 5 | * **[utfx.min.js](https://raw.githubusercontent.com/dcodeIO/utfx/master/dist/utfx.min.js)** 6 | has been compiled with Closure Compiler using advanced optimizations. 7 | 8 | * **[utfx-embeddable.js](https://raw.githubusercontent.com/dcodeIO/utfx/master/dist/utfx-embeddable.js)** 9 | is an embeddable library containing just the core functionality. 10 | -------------------------------------------------------------------------------- /dist/utfx-embeddable.js: -------------------------------------------------------------------------------- 1 | /** 2 | * utfx-embeddable (c) 2014 Daniel Wirtz 3 | * Released under the Apache License, Version 2.0 4 | * see: https://github.com/dcodeIO/utfx for details 5 | */ 6 | var utfx = function() { 7 | "use strict"; 8 | 9 | /** 10 | * utfx namespace. 11 | * @inner 12 | * @type {!Object.} 13 | */ 14 | var utfx = {}; 15 | 16 | /** 17 | * Maximum valid code point. 18 | * @type {number} 19 | * @const 20 | */ 21 | utfx.MAX_CODEPOINT = 0x10FFFF; 22 | 23 | /** 24 | * Encodes UTF8 code points to UTF8 bytes. 25 | * @param {(!function():number|null) | number} src Code points source, either as a function returning the next code point 26 | * respectively `null` if there are no more code points left or a single numeric code point. 27 | * @param {!function(number)} dst Bytes destination as a function successively called with the next byte 28 | */ 29 | utfx.encodeUTF8 = function(src, dst) { 30 | var cp = null; 31 | if (typeof src === 'number') 32 | cp = src, 33 | src = function() { return null; }; 34 | while (cp !== null || (cp = src()) !== null) { 35 | if (cp < 0x80) 36 | dst(cp&0x7F); 37 | else if (cp < 0x800) 38 | dst(((cp>>6)&0x1F)|0xC0), 39 | dst((cp&0x3F)|0x80); 40 | else if (cp < 0x10000) 41 | dst(((cp>>12)&0x0F)|0xE0), 42 | dst(((cp>>6)&0x3F)|0x80), 43 | dst((cp&0x3F)|0x80); 44 | else 45 | dst(((cp>>18)&0x07)|0xF0), 46 | dst(((cp>>12)&0x3F)|0x80), 47 | dst(((cp>>6)&0x3F)|0x80), 48 | dst((cp&0x3F)|0x80); 49 | cp = null; 50 | } 51 | }; 52 | 53 | /** 54 | * Decodes UTF8 bytes to UTF8 code points. 55 | * @param {!function():number|null} src Bytes source as a function returning the next byte respectively `null` if there 56 | * are no more bytes left. 57 | * @param {!function(number)} dst Code points destination as a function successively called with each decoded code point. 58 | * @throws {RangeError} If a starting byte is invalid in UTF8 59 | * @throws {Error} If the last sequence is truncated. Has an array property `bytes` holding the 60 | * remaining bytes. 61 | */ 62 | utfx.decodeUTF8 = function(src, dst) { 63 | var a, b, c, d, fail = function(b) { 64 | b = b.slice(0, b.indexOf(null)); 65 | var err = Error(b.toString()); 66 | err.name = "TruncatedError"; 67 | err['bytes'] = b; 68 | throw err; 69 | }; 70 | while ((a = src()) !== null) { 71 | if ((a&0x80) === 0) 72 | dst(a); 73 | else if ((a&0xE0) === 0xC0) 74 | ((b = src()) === null) && fail([a, b]), 75 | dst(((a&0x1F)<<6) | (b&0x3F)); 76 | else if ((a&0xF0) === 0xE0) 77 | ((b=src()) === null || (c=src()) === null) && fail([a, b, c]), 78 | dst(((a&0x0F)<<12) | ((b&0x3F)<<6) | (c&0x3F)); 79 | else if ((a&0xF8) === 0xF0) 80 | ((b=src()) === null || (c=src()) === null || (d=src()) === null) && fail([a, b, c ,d]), 81 | dst(((a&0x07)<<18) | ((b&0x3F)<<12) | ((c&0x3F)<<6) | (d&0x3F)); 82 | else throw RangeError("Illegal starting byte: "+a); 83 | } 84 | }; 85 | 86 | /** 87 | * Converts UTF16 characters to UTF8 code points. 88 | * @param {!function():number|null} src Characters source as a function returning the next char code respectively 89 | * `null` if there are no more characters left. 90 | * @param {!function(number)} dst Code points destination as a function successively called with each converted code 91 | * point. 92 | */ 93 | utfx.UTF16toUTF8 = function(src, dst) { 94 | var c1, c2 = null; 95 | while (true) { 96 | if ((c1 = c2 !== null ? c2 : src()) === null) 97 | break; 98 | if (c1 >= 0xD800 && c1 <= 0xDFFF) { 99 | if ((c2 = src()) !== null) { 100 | if (c2 >= 0xDC00 && c2 <= 0xDFFF) { 101 | dst((c1-0xD800)*0x400+c2-0xDC00+0x10000); 102 | c2 = null; continue; 103 | } 104 | } 105 | } 106 | dst(c1); 107 | } 108 | if (c2 !== null) dst(c2); 109 | }; 110 | 111 | /** 112 | * Converts UTF8 code points to UTF16 characters. 113 | * @param {(!function():number|null) | number} src Code points source, either as a function returning the next code point 114 | * respectively `null` if there are no more code points left or a single numeric code point. 115 | * @param {!function(number)} dst Characters destination as a function successively called with each converted char code. 116 | * @throws {RangeError} If a code point is out of range 117 | */ 118 | utfx.UTF8toUTF16 = function(src, dst) { 119 | var cp = null; 120 | if (typeof src === 'number') 121 | cp = src, src = function() { return null; }; 122 | while (cp !== null || (cp = src()) !== null) { 123 | if (cp <= 0xFFFF) 124 | dst(cp); 125 | else 126 | cp -= 0x10000, 127 | dst((cp>>10)+0xD800), 128 | dst((cp%0x400)+0xDC00); 129 | cp = null; 130 | } 131 | }; 132 | 133 | /** 134 | * Converts and encodes UTF16 characters to UTF8 bytes. 135 | * @param {!function():number|null} src Characters source as a function returning the next char code respectively `null` 136 | * if there are no more characters left. 137 | * @param {!function(number)} dst Bytes destination as a function successively called with the next byte. 138 | */ 139 | utfx.encodeUTF16toUTF8 = function(src, dst) { 140 | utfx.UTF16toUTF8(src, function(cp) { 141 | utfx.encodeUTF8(cp, dst); 142 | }); 143 | }; 144 | 145 | /** 146 | * Decodes and converts UTF8 bytes to UTF16 characters. 147 | * @param {!function():number|null} src Bytes source as a function returning the next byte respectively `null` if there 148 | * are no more bytes left. 149 | * @param {!function(number)} dst Characters destination as a function successively called with each converted char code. 150 | * @throws {RangeError} If a starting byte is invalid in UTF8 151 | * @throws {Error} If the last sequence is truncated. Has an array property `bytes` holding the remaining bytes. 152 | */ 153 | utfx.decodeUTF8toUTF16 = function(src, dst) { 154 | utfx.decodeUTF8(src, function(cp) { 155 | utfx.UTF8toUTF16(cp, dst); 156 | }); 157 | }; 158 | 159 | /** 160 | * Calculates the byte length of an UTF8 code point. 161 | * @param {number} cp UTF8 code point 162 | * @returns {number} Byte length 163 | */ 164 | utfx.calculateCodePoint = function(cp) { 165 | return (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4; 166 | }; 167 | 168 | /** 169 | * Calculates the number of UTF8 bytes required to store UTF8 code points. 170 | * @param {(!function():number|null)} src Code points source as a function returning the next code point respectively 171 | * `null` if there are no more code points left. 172 | * @returns {number} The number of UTF8 bytes required 173 | */ 174 | utfx.calculateUTF8 = function(src) { 175 | var cp, l=0; 176 | while ((cp = src()) !== null) 177 | l += (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4; 178 | return l; 179 | }; 180 | 181 | /** 182 | * Calculates the number of UTF8 code points respectively UTF8 bytes required to store UTF16 char codes. 183 | * @param {(!function():number|null)} src Characters source as a function returning the next char code respectively 184 | * `null` if there are no more characters left. 185 | * @returns {!Array.} The number of UTF8 code points at index 0 and the number of UTF8 bytes required at index 1. 186 | */ 187 | utfx.calculateUTF16asUTF8 = function(src) { 188 | var n=0, l=0; 189 | utfx.UTF16toUTF8(src, function(cp) { 190 | ++n; l += (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4; 191 | }); 192 | return [n,l]; 193 | }; 194 | 195 | return utfx; 196 | }(); 197 | -------------------------------------------------------------------------------- /dist/utfx.js: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Daniel Wirtz 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | /** 18 | * @license utfx (c) 2014 Daniel Wirtz 19 | * Released under the Apache License, Version 2.0 20 | * see: https://github.com/dcodeIO/utfx for details 21 | */ 22 | (function(global, factory) { 23 | 24 | /* AMD */ if (typeof define === 'function' && define['amd']) 25 | define(factory); 26 | /* CommonJS */ else if (typeof require === "function" && typeof module === 'object' && module && module['exports']) 27 | module['exports'] = factory(); 28 | /* Global */ else 29 | (global["dcodeIO"] = global["dcodeIO"] || {})["utfx"] = factory(); 30 | 31 | })(this, function() { 32 | "use strict"; 33 | 34 | if (!Array.isArray) 35 | Array.isArray = function (v) { 36 | return Object.prototype.toString.call(v) === "[object Array]"; 37 | }; 38 | 39 | /** 40 | * utfx namespace. 41 | * @exports utfx 42 | * @type {!Object.} 43 | */ 44 | var utfx = {}; 45 | 46 | /** 47 | * Maximum valid code point. 48 | * @type {number} 49 | * @const 50 | * @expose 51 | */ 52 | utfx.MAX_CODEPOINT = 0x10FFFF; 53 | 54 | /** 55 | * Encodes UTF8 code points to UTF8 bytes. 56 | * @param {(!function():number|null) | number} src Code points source, either as a function returning the next code point 57 | * respectively `null` if there are no more code points left or a single numeric code point. 58 | * @param {!function(number)} dst Bytes destination as a function successively called with the next byte 59 | * @expose 60 | */ 61 | utfx.encodeUTF8 = function(src, dst) { 62 | var cp = null; 63 | if (typeof src === 'number') 64 | cp = src, 65 | src = function() { return null; }; 66 | while (cp !== null || (cp = src()) !== null) { 67 | if (cp < 0x80) 68 | dst(cp&0x7F); 69 | else if (cp < 0x800) 70 | dst(((cp>>6)&0x1F)|0xC0), 71 | dst((cp&0x3F)|0x80); 72 | else if (cp < 0x10000) 73 | dst(((cp>>12)&0x0F)|0xE0), 74 | dst(((cp>>6)&0x3F)|0x80), 75 | dst((cp&0x3F)|0x80); 76 | else 77 | dst(((cp>>18)&0x07)|0xF0), 78 | dst(((cp>>12)&0x3F)|0x80), 79 | dst(((cp>>6)&0x3F)|0x80), 80 | dst((cp&0x3F)|0x80); 81 | cp = null; 82 | } 83 | }; 84 | 85 | /** 86 | * Decodes UTF8 bytes to UTF8 code points. 87 | * @param {!function():number|null} src Bytes source as a function returning the next byte respectively `null` if there 88 | * are no more bytes left. 89 | * @param {!function(number)} dst Code points destination as a function successively called with each decoded code point. 90 | * @throws {RangeError} If a starting byte is invalid in UTF8 91 | * @throws {Error} If the last sequence is truncated. Has an array property `bytes` holding the 92 | * remaining bytes. 93 | * @expose 94 | */ 95 | utfx.decodeUTF8 = function(src, dst) { 96 | var a, b, c, d, fail = function(b) { 97 | b = b.slice(0, b.indexOf(null)); 98 | var err = Error(b.toString()); 99 | err.name = "TruncatedError"; 100 | err['bytes'] = b; 101 | throw err; 102 | }; 103 | while ((a = src()) !== null) { 104 | if ((a&0x80) === 0) 105 | dst(a); 106 | else if ((a&0xE0) === 0xC0) 107 | ((b = src()) === null) && fail([a, b]), 108 | dst(((a&0x1F)<<6) | (b&0x3F)); 109 | else if ((a&0xF0) === 0xE0) 110 | ((b=src()) === null || (c=src()) === null) && fail([a, b, c]), 111 | dst(((a&0x0F)<<12) | ((b&0x3F)<<6) | (c&0x3F)); 112 | else if ((a&0xF8) === 0xF0) 113 | ((b=src()) === null || (c=src()) === null || (d=src()) === null) && fail([a, b, c ,d]), 114 | dst(((a&0x07)<<18) | ((b&0x3F)<<12) | ((c&0x3F)<<6) | (d&0x3F)); 115 | else throw RangeError("Illegal starting byte: "+a); 116 | } 117 | }; 118 | 119 | /** 120 | * Converts UTF16 characters to UTF8 code points. 121 | * @param {!function():number|null} src Characters source as a function returning the next char code respectively 122 | * `null` if there are no more characters left. 123 | * @param {!function(number)} dst Code points destination as a function successively called with each converted code 124 | * point. 125 | * @expose 126 | */ 127 | utfx.UTF16toUTF8 = function(src, dst) { 128 | var c1, c2 = null; 129 | while (true) { 130 | if ((c1 = c2 !== null ? c2 : src()) === null) 131 | break; 132 | if (c1 >= 0xD800 && c1 <= 0xDFFF) { 133 | if ((c2 = src()) !== null) { 134 | if (c2 >= 0xDC00 && c2 <= 0xDFFF) { 135 | dst((c1-0xD800)*0x400+c2-0xDC00+0x10000); 136 | c2 = null; continue; 137 | } 138 | } 139 | } 140 | dst(c1); 141 | } 142 | if (c2 !== null) dst(c2); 143 | }; 144 | 145 | /** 146 | * Converts UTF8 code points to UTF16 characters. 147 | * @param {(!function():number|null) | number} src Code points source, either as a function returning the next code point 148 | * respectively `null` if there are no more code points left or a single numeric code point. 149 | * @param {!function(number)} dst Characters destination as a function successively called with each converted char code. 150 | * @throws {RangeError} If a code point is out of range 151 | * @expose 152 | */ 153 | utfx.UTF8toUTF16 = function(src, dst) { 154 | var cp = null; 155 | if (typeof src === 'number') 156 | cp = src, src = function() { return null; }; 157 | while (cp !== null || (cp = src()) !== null) { 158 | if (cp <= 0xFFFF) 159 | dst(cp); 160 | else 161 | cp -= 0x10000, 162 | dst((cp>>10)+0xD800), 163 | dst((cp%0x400)+0xDC00); 164 | cp = null; 165 | } 166 | }; 167 | 168 | /** 169 | * Converts and encodes UTF16 characters to UTF8 bytes. 170 | * @param {!function():number|null} src Characters source as a function returning the next char code respectively `null` 171 | * if there are no more characters left. 172 | * @param {!function(number)} dst Bytes destination as a function successively called with the next byte. 173 | * @expose 174 | */ 175 | utfx.encodeUTF16toUTF8 = function(src, dst) { 176 | utfx.UTF16toUTF8(src, function(cp) { 177 | utfx.encodeUTF8(cp, dst); 178 | }); 179 | }; 180 | 181 | /** 182 | * Decodes and converts UTF8 bytes to UTF16 characters. 183 | * @param {!function():number|null} src Bytes source as a function returning the next byte respectively `null` if there 184 | * are no more bytes left. 185 | * @param {!function(number)} dst Characters destination as a function successively called with each converted char code. 186 | * @throws {RangeError} If a starting byte is invalid in UTF8 187 | * @throws {Error} If the last sequence is truncated. Has an array property `bytes` holding the remaining bytes. 188 | * @expose 189 | */ 190 | utfx.decodeUTF8toUTF16 = function(src, dst) { 191 | utfx.decodeUTF8(src, function(cp) { 192 | utfx.UTF8toUTF16(cp, dst); 193 | }); 194 | }; 195 | 196 | /** 197 | * Calculates the byte length of an UTF8 code point. 198 | * @param {number} cp UTF8 code point 199 | * @returns {number} Byte length 200 | * @expose 201 | */ 202 | utfx.calculateCodePoint = function(cp) { 203 | return (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4; 204 | }; 205 | 206 | /** 207 | * Calculates the number of UTF8 bytes required to store UTF8 code points. 208 | * @param {(!function():number|null)} src Code points source as a function returning the next code point respectively 209 | * `null` if there are no more code points left. 210 | * @returns {number} The number of UTF8 bytes required 211 | * @expose 212 | */ 213 | utfx.calculateUTF8 = function(src) { 214 | var cp, l=0; 215 | while ((cp = src()) !== null) 216 | l += (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4; 217 | return l; 218 | }; 219 | 220 | /** 221 | * Calculates the number of UTF8 code points respectively UTF8 bytes required to store UTF16 char codes. 222 | * @param {(!function():number|null)} src Characters source as a function returning the next char code respectively 223 | * `null` if there are no more characters left. 224 | * @returns {!Array.} The number of UTF8 code points at index 0 and the number of UTF8 bytes required at index 1. 225 | * @expose 226 | */ 227 | utfx.calculateUTF16asUTF8 = function(src) { 228 | var n=0, l=0; 229 | utfx.UTF16toUTF8(src, function(cp) { 230 | ++n; l += (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4; 231 | }); 232 | return [n,l]; 233 | }; 234 | 235 | /** 236 | * String.fromCharCode reference for compile time renaming. 237 | * @type {!function(...[number]):string} 238 | * @inner 239 | */ 240 | var stringFromCharCode = String.fromCharCode; 241 | 242 | /** 243 | * Creates a source function for an array. 244 | * @param {!Array.} a Array to read from 245 | * @returns {!function():number|null} Source function returning the next value respectively `null` if there are no 246 | * more values left. 247 | * @throws {TypeError} If the argument is invalid 248 | * @expose 249 | */ 250 | utfx.arraySource = function(a) { 251 | if (!Array.isArray(a)) 252 | throw TypeError("Illegal argument: "+(typeof a)); 253 | var i=0; return function() { 254 | return i >= a.length ? null : a[i++]; 255 | }; 256 | }; 257 | 258 | /** 259 | * Creates a destination function for an array. 260 | * @param {!Array.} a Array to write to 261 | * @returns {!function(number)} Destination function successively called with the next value. 262 | * @throws {TypeError} If the argument is invalid 263 | * @expose 264 | */ 265 | utfx.arrayDestination = function(a) { 266 | if (!Array.isArray(a)) 267 | throw TypeError("Illegal argument: "+(typeof a)); 268 | return Array.prototype.push.bind(a); 269 | }; 270 | 271 | /** 272 | * Creates a source function for a string. 273 | * @param {string} s String to read from 274 | * @returns {!function():number|null} Source function returning the next char code respectively `null` if there are 275 | * no more characters left. 276 | * @throws {TypeError} If the argument is invalid 277 | * @expose 278 | */ 279 | utfx.stringSource = function(s) { 280 | if (typeof s !== 'string') 281 | throw TypeError("Illegal argument: "+(typeof s)); 282 | var i=0; return function() { 283 | return i >= s.length ? null : s.charCodeAt(i++); 284 | }; 285 | }; 286 | 287 | /** 288 | * Creates a destination function for a string. 289 | * @returns {function(number=):undefined|string} Destination function successively called with the next char code. 290 | * Returns the final string when called without arguments. 291 | * @expose 292 | */ 293 | utfx.stringDestination = function() { 294 | var cs = [], ps = []; return function() { 295 | if (arguments.length === 0) 296 | return ps.join('')+stringFromCharCode.apply(String, cs); 297 | if (cs.length + arguments.length > 1024) 298 | ps.push(stringFromCharCode.apply(String, cs)), 299 | cs.length = 0; 300 | Array.prototype.push.apply(cs, arguments); 301 | }; 302 | }; 303 | 304 | /** 305 | * Asserts an UTF16 char code. 306 | * @param {number} c UTF16 char code 307 | * @returns {number} Valid char code 308 | * @throws {TypeError} If the char code is invalid 309 | * @throws {RangeError} If the char code is out of range 310 | * @expose 311 | */ 312 | utfx.assertCharCode = function(c) { 313 | if (typeof c !== 'number' || c !== c) 314 | throw TypeError("Illegal char code: "+(typeof c)); 315 | if (c < 0 || c > 0xFFFF) 316 | throw RangeError("Illegal char code: "+c); 317 | return c; 318 | }; 319 | 320 | /** 321 | * Asserts an UTF8 code point. 322 | * @param {number} cp UTF8 code point 323 | * @returns {number} Valid code point 324 | * @throws {TypeError} If the code point is invalid 325 | * @throws {RangeError} If the code point is out of range 326 | * @expose 327 | */ 328 | utfx.assertCodePoint = function(cp) { 329 | if (typeof cp !== 'number' || cp !== cp) 330 | throw TypeError("Illegal code point: "+(typeof cp)); 331 | if (cp < 0 || cp > utfx.MAX_CODEPOINT) 332 | throw RangeError("Illegal code point: "+cp); 333 | return cp; 334 | }; 335 | 336 | /** 337 | * A polyfill for `String.fromCodePoint`. 338 | * @param {...number} var_args Code points 339 | * @returns {string} JavaScript string 340 | * @throws {TypeError} If arguments are invalid or a code point is invalid 341 | * @throws {RangeError} If a code point is out of range 342 | * @expose 343 | */ 344 | utfx.fromCodePoint = function(var_args) { 345 | var sd, i=0, cps=arguments, k=cps.length; 346 | utfx.UTF8toUTF16(function() { 347 | return i < k ? utfx.assertCodePoint(cps[i++]) : null; 348 | }, sd = utfx.stringDestination()); 349 | return sd(); 350 | }; 351 | 352 | /** 353 | * A polyfill for `String#codePointAt`. 354 | * @param {string} s JavaScript string 355 | * @param {number} i Index 356 | * @returns {number|undefined} Code point or `undefined` if `i` is out of range 357 | * @throws {TypeError} If arguments are invalid 358 | * @expose 359 | */ 360 | utfx.codePointAt = function(s, i) { 361 | if ((typeof s !== 'string' && !(s && s instanceof String)) || typeof i !== 'number') 362 | throw TypeError("Illegal arguments: "+(typeof s)+", "+(typeof i)); 363 | var k, cp; 364 | if (i < 0 || i >= (k=s.length)) 365 | return; 366 | utfx.UTF16toUTF8(function() { 367 | return typeof cp === 'undefined' && i < k ? s.charCodeAt(i++) : null; 368 | }, function(icp) { 369 | cp = icp; 370 | }); 371 | return cp; 372 | }; 373 | 374 | /** 375 | * Installs utfx as a polyfill for `String.fromCodePoint` and `String#codePointAt` if not implemented. 376 | * @param {boolean=} override Overrides an existing implementation if `true`, defaults to `false` 377 | * @returns {!Object.} utfx namespace 378 | * @expose 379 | */ 380 | utfx.polyfill = function(override) { 381 | if (!String['fromCodePoint'] || override) 382 | String['fromCodePoint'] = utfx.fromCodePoint; 383 | if (!String.prototype['codePointAt'] || override) 384 | String.prototype['codePointAt'] = function(i) { return utfx.codePointAt(this, i); }; 385 | return utfx; 386 | }; 387 | 388 | return utfx; 389 | 390 | }); 391 | -------------------------------------------------------------------------------- /dist/utfx.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | utfx (c) 2014 Daniel Wirtz 3 | Released under the Apache License, Version 2.0 4 | see: https://github.com/dcodeIO/utfx for details 5 | */ 6 | function g(){Array.isArray||(Array.isArray=function(a){return"[object Array]"===Object.prototype.toString.call(a)});var d={MAX_CODEPOINT:1114111,encodeUTF8:function(a,b){var c=null;"number"===typeof a&&(c=a,a=function(){return null});for(;null!==c||null!==(c=a());)128>c?b(c&127):(2048>c?b(c>>6&31|192):(65536>c?b(c>>12&15|224):(b(c>>18&7|240),b(c>>12&63|128)),b(c>>6&63|128)),b(c&63|128)),c=null},decodeUTF8:function(a,b){function c(a){a=a.slice(0,a.indexOf(null));var b=Error(a.toString());b.name="TruncatedError"; 7 | b.bytes=a;throw b;}for(var e,d,f,h;null!==(e=a());)if(0===(e&128))b(e);else if(192===(e&224))null===(d=a())&&c([e,d]),b((e&31)<<6|d&63);else if(224===(e&240))null!==(d=a())&&null!==(f=a())||c([e,d,f]),b((e&15)<<12|(d&63)<<6|f&63);else if(240===(e&248))null!==(d=a())&&null!==(f=a())&&null!==(h=a())||c([e,d,f,h]),b((e&7)<<18|(d&63)<<12|(f&63)<<6|h&63);else throw RangeError("Illegal starting byte: "+e);},UTF16toUTF8:function(a,b){for(var c,e=null;null!==(c=null!==e?e:a());)55296<=c&&57343>=c&&null!== 8 | (e=a())&&56320<=e&&57343>=e?(b(1024*(c-55296)+e-56320+65536),e=null):b(c);null!==e&&b(e)},UTF8toUTF16:function(a,b){var c=null;"number"===typeof a&&(c=a,a=function(){return null});for(;null!==c||null!==(c=a());)65535>=c?b(c):(c-=65536,b((c>>10)+55296),b(c%1024+56320)),c=null},encodeUTF16toUTF8:function(a,b){d.UTF16toUTF8(a,function(a){d.encodeUTF8(a,b)})},decodeUTF8toUTF16:function(a,b){d.decodeUTF8(a,function(a){d.UTF8toUTF16(a,b)})},calculateCodePoint:function(a){return 128>a?1:2048>a?2:65536>a? 9 | 3:4},calculateUTF8:function(a){for(var b,c=0;null!==(b=a());)c+=128>b?1:2048>b?2:65536>b?3:4;return c},calculateUTF16asUTF8:function(a){var b=0,c=0;d.UTF16toUTF8(a,function(a){++b;c+=128>a?1:2048>a?2:65536>a?3:4});return[b,c]}},k=String.fromCharCode;d.arraySource=function(a){if(!Array.isArray(a))throw TypeError("Illegal argument: "+typeof a);var b=0;return function(){return b>=a.length?null:a[b++]}};d.arrayDestination=function(a){if(!Array.isArray(a))throw TypeError("Illegal argument: "+typeof a); 10 | return Array.prototype.push.bind(a)};d.stringSource=function(a){if("string"!==typeof a)throw TypeError("Illegal argument: "+typeof a);var b=0;return function(){return b>=a.length?null:a.charCodeAt(b++)}};d.stringDestination=function(){var a=[],b=[];return function(){if(0===arguments.length)return b.join("")+k.apply(String,a);1024a||65535a||a>d.MAX_CODEPOINT)throw RangeError("Illegal code point: "+a);return a};d.fromCodePoint=function(a){var b,c=0,e=arguments,l=e.length;d.UTF8toUTF16(function(){return cb||b>=(c=a.length)))return d.UTF16toUTF8(function(){return"undefined"===typeof e&&b 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | module.exports = require("./dist/utfx.js"); 18 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "utfx", 3 | "author": "Daniel Wirtz ", 4 | "version": "1.0.1", 5 | "description": "A compact library to encode, decode and convert UTF8 / UTF16 in JavaScript.", 6 | "main": "index.js", 7 | "repository": { 8 | "type": "git", 9 | "url": "https://github.com/dcodeIO/utfx.git" 10 | }, 11 | "bugs": { 12 | "url": "https://github.com/dcodeIO/utfx/issues" 13 | }, 14 | "keywords": [ 15 | "charset", 16 | "encoding", 17 | "unicode", 18 | "utf8", 19 | "utf16" 20 | ], 21 | "dependencies": {}, 22 | "devDependencies": { 23 | "closurecompiler": "~1", 24 | "metascript": "~0", 25 | "pretty-hrtime": "^1.0.0", 26 | "testjs": "~1", 27 | "utf8": "^2.1.0" 28 | }, 29 | "license": "Apache-2.0", 30 | "engines": { 31 | "node": ">=0.8" 32 | }, 33 | "scripts": { 34 | "build": "node node_modules/metascript/bin/metascript src/wrap_standalone.js > dist/utfx.js && node node_modules/metascript/bin/metascript src/wrap_embeddable.js > dist/utfx-embeddable.js", 35 | "compile": "node node_modules/closurecompiler/bin/ccjs dist/utfx.js --compilation_level=ADVANCED_OPTIMIZATIONS > dist/utfx.min.js", 36 | "test": "node tests/suite.js", 37 | "make": "npm run-script build && npm run-script compile && npm test" 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | Sources 2 | ------- 3 | * **[wrap_standalone.js](https://github.com/dcodeIO/utfx/blob/master/src/wrap_standalone.js)** 4 | is the wrapper for the standalone library. 5 | 6 | * **[wrap_embeddable.js](https://github.com/dcodeIO/utfx/blob/master/src/wrap_embeddable.js)** 7 | is the wrapper for the embeddable library. 8 | 9 | * **[utfx.js](https://github.com/dcodeIO/utfx/blob/master/src/utfx.js)** 10 | is the core functionality. 11 | -------------------------------------------------------------------------------- /src/header.txt: -------------------------------------------------------------------------------- 1 | /*! utfx-embeddable | (c) 2014 Daniel Wirtz | http://www.apache.org/licenses/LICENSE-2.0.html */ 2 | -------------------------------------------------------------------------------- /src/utfx.js: -------------------------------------------------------------------------------- 1 | /** 2 | * utfx namespace. 3 | //? if (UTFX_STANDALONE) 4 | * @exports utfx 5 | //? else 6 | * @inner 7 | * @type {!Object.} 8 | */ 9 | var utfx = {}; 10 | 11 | /** 12 | * Maximum valid code point. 13 | * @type {number} 14 | * @const 15 | //? if (UTFX_STANDALONE) 16 | * @expose 17 | */ 18 | utfx.MAX_CODEPOINT = 0x10FFFF; 19 | 20 | /** 21 | * Encodes UTF8 code points to UTF8 bytes. 22 | * @param {(!function():number|null) | number} src Code points source, either as a function returning the next code point 23 | * respectively `null` if there are no more code points left or a single numeric code point. 24 | * @param {!function(number)} dst Bytes destination as a function successively called with the next byte 25 | //? if (UTFX_STANDALONE) 26 | * @expose 27 | */ 28 | utfx.encodeUTF8 = function(src, dst) { 29 | var cp = null; 30 | if (typeof src === 'number') 31 | cp = src, 32 | src = function() { return null; }; 33 | while (cp !== null || (cp = src()) !== null) { 34 | if (cp < 0x80) 35 | dst(cp&0x7F); 36 | else if (cp < 0x800) 37 | dst(((cp>>6)&0x1F)|0xC0), 38 | dst((cp&0x3F)|0x80); 39 | else if (cp < 0x10000) 40 | dst(((cp>>12)&0x0F)|0xE0), 41 | dst(((cp>>6)&0x3F)|0x80), 42 | dst((cp&0x3F)|0x80); 43 | else 44 | dst(((cp>>18)&0x07)|0xF0), 45 | dst(((cp>>12)&0x3F)|0x80), 46 | dst(((cp>>6)&0x3F)|0x80), 47 | dst((cp&0x3F)|0x80); 48 | cp = null; 49 | } 50 | }; 51 | 52 | /** 53 | * Decodes UTF8 bytes to UTF8 code points. 54 | * @param {!function():number|null} src Bytes source as a function returning the next byte respectively `null` if there 55 | * are no more bytes left. 56 | * @param {!function(number)} dst Code points destination as a function successively called with each decoded code point. 57 | * @throws {RangeError} If a starting byte is invalid in UTF8 58 | * @throws {Error} If the last sequence is truncated. Has an array property `bytes` holding the 59 | * remaining bytes. 60 | //? if (UTFX_STANDALONE) 61 | * @expose 62 | */ 63 | utfx.decodeUTF8 = function(src, dst) { 64 | var a, b, c, d, fail = function(b) { 65 | b = b.slice(0, b.indexOf(null)); 66 | var err = Error(b.toString()); 67 | err.name = "TruncatedError"; 68 | err['bytes'] = b; 69 | throw err; 70 | }; 71 | while ((a = src()) !== null) { 72 | if ((a&0x80) === 0) 73 | dst(a); 74 | else if ((a&0xE0) === 0xC0) 75 | ((b = src()) === null) && fail([a, b]), 76 | dst(((a&0x1F)<<6) | (b&0x3F)); 77 | else if ((a&0xF0) === 0xE0) 78 | ((b=src()) === null || (c=src()) === null) && fail([a, b, c]), 79 | dst(((a&0x0F)<<12) | ((b&0x3F)<<6) | (c&0x3F)); 80 | else if ((a&0xF8) === 0xF0) 81 | ((b=src()) === null || (c=src()) === null || (d=src()) === null) && fail([a, b, c ,d]), 82 | dst(((a&0x07)<<18) | ((b&0x3F)<<12) | ((c&0x3F)<<6) | (d&0x3F)); 83 | else throw RangeError("Illegal starting byte: "+a); 84 | } 85 | }; 86 | 87 | /** 88 | * Converts UTF16 characters to UTF8 code points. 89 | * @param {!function():number|null} src Characters source as a function returning the next char code respectively 90 | * `null` if there are no more characters left. 91 | * @param {!function(number)} dst Code points destination as a function successively called with each converted code 92 | * point. 93 | //? if (UTFX_STANDALONE) 94 | * @expose 95 | */ 96 | utfx.UTF16toUTF8 = function(src, dst) { 97 | var c1, c2 = null; 98 | while (true) { 99 | if ((c1 = c2 !== null ? c2 : src()) === null) 100 | break; 101 | if (c1 >= 0xD800 && c1 <= 0xDFFF) { 102 | if ((c2 = src()) !== null) { 103 | if (c2 >= 0xDC00 && c2 <= 0xDFFF) { 104 | dst((c1-0xD800)*0x400+c2-0xDC00+0x10000); 105 | c2 = null; continue; 106 | } 107 | } 108 | } 109 | dst(c1); 110 | } 111 | if (c2 !== null) dst(c2); 112 | }; 113 | 114 | /** 115 | * Converts UTF8 code points to UTF16 characters. 116 | * @param {(!function():number|null) | number} src Code points source, either as a function returning the next code point 117 | * respectively `null` if there are no more code points left or a single numeric code point. 118 | * @param {!function(number)} dst Characters destination as a function successively called with each converted char code. 119 | * @throws {RangeError} If a code point is out of range 120 | //? if (UTFX_STANDALONE) 121 | * @expose 122 | */ 123 | utfx.UTF8toUTF16 = function(src, dst) { 124 | var cp = null; 125 | if (typeof src === 'number') 126 | cp = src, src = function() { return null; }; 127 | while (cp !== null || (cp = src()) !== null) { 128 | if (cp <= 0xFFFF) 129 | dst(cp); 130 | else 131 | cp -= 0x10000, 132 | dst((cp>>10)+0xD800), 133 | dst((cp%0x400)+0xDC00); 134 | cp = null; 135 | } 136 | }; 137 | 138 | /** 139 | * Converts and encodes UTF16 characters to UTF8 bytes. 140 | * @param {!function():number|null} src Characters source as a function returning the next char code respectively `null` 141 | * if there are no more characters left. 142 | * @param {!function(number)} dst Bytes destination as a function successively called with the next byte. 143 | //? if (UTFX_STANDALONE) 144 | * @expose 145 | */ 146 | utfx.encodeUTF16toUTF8 = function(src, dst) { 147 | utfx.UTF16toUTF8(src, function(cp) { 148 | utfx.encodeUTF8(cp, dst); 149 | }); 150 | }; 151 | 152 | /** 153 | * Decodes and converts UTF8 bytes to UTF16 characters. 154 | * @param {!function():number|null} src Bytes source as a function returning the next byte respectively `null` if there 155 | * are no more bytes left. 156 | * @param {!function(number)} dst Characters destination as a function successively called with each converted char code. 157 | * @throws {RangeError} If a starting byte is invalid in UTF8 158 | * @throws {Error} If the last sequence is truncated. Has an array property `bytes` holding the remaining bytes. 159 | //? if (UTFX_STANDALONE) 160 | * @expose 161 | */ 162 | utfx.decodeUTF8toUTF16 = function(src, dst) { 163 | utfx.decodeUTF8(src, function(cp) { 164 | utfx.UTF8toUTF16(cp, dst); 165 | }); 166 | }; 167 | 168 | /** 169 | * Calculates the byte length of an UTF8 code point. 170 | * @param {number} cp UTF8 code point 171 | * @returns {number} Byte length 172 | //? if (UTFX_STANDALONE) 173 | * @expose 174 | */ 175 | utfx.calculateCodePoint = function(cp) { 176 | return (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4; 177 | }; 178 | 179 | /** 180 | * Calculates the number of UTF8 bytes required to store UTF8 code points. 181 | * @param {(!function():number|null)} src Code points source as a function returning the next code point respectively 182 | * `null` if there are no more code points left. 183 | * @returns {number} The number of UTF8 bytes required 184 | //? if (UTFX_STANDALONE) 185 | * @expose 186 | */ 187 | utfx.calculateUTF8 = function(src) { 188 | var cp, l=0; 189 | while ((cp = src()) !== null) 190 | l += (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4; 191 | return l; 192 | }; 193 | 194 | /** 195 | * Calculates the number of UTF8 code points respectively UTF8 bytes required to store UTF16 char codes. 196 | * @param {(!function():number|null)} src Characters source as a function returning the next char code respectively 197 | * `null` if there are no more characters left. 198 | * @returns {!Array.} The number of UTF8 code points at index 0 and the number of UTF8 bytes required at index 1. 199 | //? if (UTFX_STANDALONE) 200 | * @expose 201 | */ 202 | utfx.calculateUTF16asUTF8 = function(src) { 203 | var n=0, l=0; 204 | utfx.UTF16toUTF8(src, function(cp) { 205 | ++n; l += (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4; 206 | }); 207 | return [n,l]; 208 | }; 209 | -------------------------------------------------------------------------------- /src/wrap_embeddable.js: -------------------------------------------------------------------------------- 1 | //? UTFX_STANDALONE = false; 2 | /** 3 | * utfx-embeddable (c) 2014 Daniel Wirtz 4 | * Released under the Apache License, Version 2.0 5 | * see: https://github.com/dcodeIO/utfx for details 6 | */ 7 | var utfx = function() { 8 | "use strict"; 9 | 10 | //? include("utfx.js"); 11 | 12 | return utfx; 13 | }(); 14 | -------------------------------------------------------------------------------- /src/wrap_standalone.js: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Daniel Wirtz 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | //? UTFX_STANDALONE = true; 17 | 18 | /** 19 | * @license utfx (c) 2014 Daniel Wirtz 20 | * Released under the Apache License, Version 2.0 21 | * see: https://github.com/dcodeIO/utfx for details 22 | */ 23 | (function(global, factory) { 24 | 25 | /* AMD */ if (typeof define === 'function' && define['amd']) 26 | define(factory); 27 | /* CommonJS */ else if (typeof require === "function" && typeof module === 'object' && module && module['exports']) 28 | module['exports'] = factory(); 29 | /* Global */ else 30 | (global["dcodeIO"] = global["dcodeIO"] || {})["utfx"] = factory(); 31 | 32 | })(this, function() { 33 | "use strict"; 34 | 35 | if (!Array.isArray) 36 | Array.isArray = function (v) { 37 | return Object.prototype.toString.call(v) === "[object Array]"; 38 | }; 39 | 40 | //? include("utfx.js"); 41 | 42 | /** 43 | * String.fromCharCode reference for compile time renaming. 44 | * @type {!function(...[number]):string} 45 | * @inner 46 | */ 47 | var stringFromCharCode = String.fromCharCode; 48 | 49 | /** 50 | * Creates a source function for an array. 51 | * @param {!Array.} a Array to read from 52 | * @returns {!function():number|null} Source function returning the next value respectively `null` if there are no 53 | * more values left. 54 | * @throws {TypeError} If the argument is invalid 55 | //? if (UTFX_STANDALONE) 56 | * @expose 57 | */ 58 | utfx.arraySource = function(a) { 59 | if (!Array.isArray(a)) 60 | throw TypeError("Illegal argument: "+(typeof a)); 61 | var i=0; return function() { 62 | return i >= a.length ? null : a[i++]; 63 | }; 64 | }; 65 | 66 | /** 67 | * Creates a destination function for an array. 68 | * @param {!Array.} a Array to write to 69 | * @returns {!function(number)} Destination function successively called with the next value. 70 | * @throws {TypeError} If the argument is invalid 71 | //? if (UTFX_STANDALONE) 72 | * @expose 73 | */ 74 | utfx.arrayDestination = function(a) { 75 | if (!Array.isArray(a)) 76 | throw TypeError("Illegal argument: "+(typeof a)); 77 | return Array.prototype.push.bind(a); 78 | }; 79 | 80 | /** 81 | * Creates a source function for a string. 82 | * @param {string} s String to read from 83 | * @returns {!function():number|null} Source function returning the next char code respectively `null` if there are 84 | * no more characters left. 85 | * @throws {TypeError} If the argument is invalid 86 | //? if (UTFX_STANDALONE) 87 | * @expose 88 | */ 89 | utfx.stringSource = function(s) { 90 | if (typeof s !== 'string') 91 | throw TypeError("Illegal argument: "+(typeof s)); 92 | var i=0; return function() { 93 | return i >= s.length ? null : s.charCodeAt(i++); 94 | }; 95 | }; 96 | 97 | /** 98 | * Creates a destination function for a string. 99 | * @returns {function(number=):undefined|string} Destination function successively called with the next char code. 100 | * Returns the final string when called without arguments. 101 | //? if (UTFX_STANDALONE) 102 | * @expose 103 | */ 104 | utfx.stringDestination = function() { 105 | var cs = [], ps = []; return function() { 106 | if (arguments.length === 0) 107 | return ps.join('')+stringFromCharCode.apply(String, cs); 108 | if (cs.length + arguments.length > 1024) 109 | ps.push(stringFromCharCode.apply(String, cs)), 110 | cs.length = 0; 111 | Array.prototype.push.apply(cs, arguments); 112 | }; 113 | }; 114 | 115 | /** 116 | * Asserts an UTF16 char code. 117 | * @param {number} c UTF16 char code 118 | * @returns {number} Valid char code 119 | * @throws {TypeError} If the char code is invalid 120 | * @throws {RangeError} If the char code is out of range 121 | //? if (UTFX_STANDALONE) 122 | * @expose 123 | */ 124 | utfx.assertCharCode = function(c) { 125 | if (typeof c !== 'number' || c !== c) 126 | throw TypeError("Illegal char code: "+(typeof c)); 127 | if (c < 0 || c > 0xFFFF) 128 | throw RangeError("Illegal char code: "+c); 129 | return c; 130 | }; 131 | 132 | /** 133 | * Asserts an UTF8 code point. 134 | * @param {number} cp UTF8 code point 135 | * @returns {number} Valid code point 136 | * @throws {TypeError} If the code point is invalid 137 | * @throws {RangeError} If the code point is out of range 138 | //? if (UTFX_STANDALONE) 139 | * @expose 140 | */ 141 | utfx.assertCodePoint = function(cp) { 142 | if (typeof cp !== 'number' || cp !== cp) 143 | throw TypeError("Illegal code point: "+(typeof cp)); 144 | if (cp < 0 || cp > utfx.MAX_CODEPOINT) 145 | throw RangeError("Illegal code point: "+cp); 146 | return cp; 147 | }; 148 | 149 | /** 150 | * A polyfill for `String.fromCodePoint`. 151 | * @param {...number} var_args Code points 152 | * @returns {string} JavaScript string 153 | * @throws {TypeError} If arguments are invalid or a code point is invalid 154 | * @throws {RangeError} If a code point is out of range 155 | * @expose 156 | */ 157 | utfx.fromCodePoint = function(var_args) { 158 | var sd, i=0, cps=arguments, k=cps.length; 159 | utfx.UTF8toUTF16(function() { 160 | return i < k ? utfx.assertCodePoint(cps[i++]) : null; 161 | }, sd = utfx.stringDestination()); 162 | return sd(); 163 | }; 164 | 165 | /** 166 | * A polyfill for `String#codePointAt`. 167 | * @param {string} s JavaScript string 168 | * @param {number} i Index 169 | * @returns {number|undefined} Code point or `undefined` if `i` is out of range 170 | * @throws {TypeError} If arguments are invalid 171 | * @expose 172 | */ 173 | utfx.codePointAt = function(s, i) { 174 | if ((typeof s !== 'string' && !(s && s instanceof String)) || typeof i !== 'number') 175 | throw TypeError("Illegal arguments: "+(typeof s)+", "+(typeof i)); 176 | var k, cp; 177 | if (i < 0 || i >= (k=s.length)) 178 | return; 179 | utfx.UTF16toUTF8(function() { 180 | return typeof cp === 'undefined' && i < k ? s.charCodeAt(i++) : null; 181 | }, function(icp) { 182 | cp = icp; 183 | }); 184 | return cp; 185 | }; 186 | 187 | /** 188 | * Installs utfx as a polyfill for `String.fromCodePoint` and `String#codePointAt` if not implemented. 189 | * @param {boolean=} override Overrides an existing implementation if `true`, defaults to `false` 190 | * @returns {!Object.} utfx namespace 191 | * @expose 192 | */ 193 | utfx.polyfill = function(override) { 194 | if (!String['fromCodePoint'] || override) 195 | String['fromCodePoint'] = utfx.fromCodePoint; 196 | if (!String.prototype['codePointAt'] || override) 197 | String.prototype['codePointAt'] = function(i) { return utfx.codePointAt(this, i); }; 198 | return utfx; 199 | }; 200 | 201 | return utfx; 202 | 203 | }); 204 | -------------------------------------------------------------------------------- /tests/bench.js: -------------------------------------------------------------------------------- 1 | var utfx = require("../index.js"), 2 | utf8 = require("utf8"), 3 | prettyHrTime = require("pretty-hrtime"); 4 | 5 | var bench = {}; 6 | var impls = ["node", "utfx", "utf8", "binary"]; 7 | var str = "Hello world! ä☺𠜎️☁ Hello world! ä☺𠜎️☁ Hello world! ä☺𠜎️☁"; 8 | 9 | var encodeUTF16toUTF8_Buffer = new Buffer(Buffer.byteLength(str)); 10 | var encodeUTF16toUTF8_Array = new Uint8Array(encodeUTF16toUTF8_Buffer.length); 11 | 12 | bench["encodeUTF16toUTF8"] = function(type, n) { 13 | n = n || 1000000; 14 | switch (type) { 15 | case "node": 16 | for (var i=0; i