├── .gitignore
├── LICENSE
├── README.md
├── dist
├── README.md
├── utfx-embeddable.js
├── utfx.js
└── utfx.min.js
├── index.js
├── package.json
├── src
├── README.md
├── header.txt
├── utfx.js
├── wrap_embeddable.js
└── wrap_standalone.js
├── tests
├── bench.js
└── suite.js
└── utfx.png
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | npm-debug.log
3 | .idea/
4 | raw/
5 | doco/
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![utfx](utfx.png)
2 | ====
3 | **utfx** is a compact library to encode, decode and convert UTF8 / UTF16 in JavaScript using arbitrary sources and
4 | destinations through the use of successively called functions, basically eliminating [memory overhead](https://github.com/dcodeIO/utfx/wiki#faq).
5 |
6 | The standalone library is also capable of using binary strings and arrays (with the usual overhead) and provides
7 | polyfills for `String.fromCodePoint` and `String#codePointAt`.
8 |
9 | API
10 | ---
11 |
12 | ### encodeUTF8(src, dst)
13 |
14 | Encodes UTF8 code points to UTF8 bytes.
15 |
16 | | Parameter | Type | Description
17 | |-----------------|-----------------|---------------
18 | | src | *function():(number | null) | number* | Code points source, either as a function returning the next code point respectively `null` if there are no more code points left or a single numeric code point.
19 | | dst | *function(number)* | Bytes destination as a function successively called with the next byte
20 |
21 | ### decodeUTF8(src, dst)
22 |
23 | Decodes UTF8 bytes to UTF8 code points.
24 |
25 | | Parameter | Type | Description
26 | |-----------------|-----------------|---------------
27 | | src | *function():(number | null)* | Bytes source as a function returning the next byte respectively `null` if there are no more bytes left.
28 | | dst | *function(number)* | Code points destination as a function successively called with each decoded code point.
29 | | **@throws** | *RangeError* | If a starting byte is invalid in UTF8
30 | | **@throws** | *Error* | If the last sequence is truncated. Has an array property `bytes` holding the remaining bytes.
31 |
32 | ### UTF16toUTF8(src, dst)
33 |
34 | Converts UTF16 characters to UTF8 code points.
35 |
36 | | Parameter | Type | Description
37 | |-----------------|-----------------|---------------
38 | | src | *function():(number | null)* | Characters source as a function returning the next char code respectively `null` if there are no more characters left.
39 | | dst | *function(number)* | Code points destination as a function successively called with each converted code point.
40 |
41 | ### UTF8toUTF16(src, dst)
42 |
43 | Converts UTF8 code points to UTF16 characters.
44 |
45 | | Parameter | Type | Description
46 | |-----------------|-----------------|---------------
47 | | src | *function():(number | null) | number* | Code points source, either as a function returning the next code point respectively `null` if there are no more code points left or a single numeric code point.
48 | | dst | *function(number)* | Characters destination as a function successively called with each converted char code.
49 | | **@throws** | *RangeError* | If a code point is out of range
50 |
51 | ### encodeUTF16toUTF8(src, dst)
52 |
53 | Converts and encodes UTF16 characters to UTF8 bytes.
54 |
55 | | Parameter | Type | Description
56 | |-----------------|-----------------|---------------
57 | | src | *function():(number | null)* | Characters source as a function returning the next char code respectively `null` if there are no more characters left.
58 | | dst | *function(number)* | Bytes destination as a function successively called with the next byte.
59 |
60 | ### decodeUTF8toUTF16(src, dst)
61 |
62 | Decodes and converts UTF8 bytes to UTF16 characters.
63 |
64 | | Parameter | Type | Description
65 | |-----------------|-----------------|---------------
66 | | src | *function():(number | null)* | Bytes source as a function returning the next byte respectively `null` if there are no more bytes left.
67 | | dst | *function(number)* | Characters destination as a function successively called with each converted char code.
68 | | **@throws** | *RangeError* | If a starting byte is invalid in UTF8
69 | | **@throws** | *Error* | If the last sequence is truncated. Has an array property `bytes` holding the remaining bytes.
70 |
71 | ### assertByte(b)
72 |
73 | Asserts a byte value.
74 |
75 | | Parameter | Type | Description
76 | |-----------------|-----------------|---------------
77 | | b | *number* | 8bit byte value
78 | | **@returns** | *number* | Valid byte value
79 | | **@throws** | *TypeError* | If the byte value is invalid
80 | | **@throws** | *RangeError* | If the byte value is out of range
81 |
82 | ### assertCharCode(c)
83 |
84 | Asserts a UTF16 char code.
85 |
86 | | Parameter | Type | Description
87 | |-----------------|-----------------|---------------
88 | | c | *number* | UTF16 char code
89 | | **@returns** | *number* | Valid char code
90 | | **@throws** | *TypeError* | If the char code is invalid
91 | | **@throws** | *RangeError* | If the char code is out of range
92 |
93 | ### assertCodePoint(cp)
94 |
95 | Asserts a UTF8 code point.
96 |
97 | | Parameter | Type | Description
98 | |-----------------|-----------------|---------------
99 | | cp | *number* | UTF8 code point
100 | | **@returns** | *number* | Valid code point
101 | | **@throws** | *TypeError* | If the code point is invalid
102 | | **@throws** | *RangeError* | If the code point is out of range
103 |
104 | ### calculateCodePoint(cp)
105 |
106 | Calculates the byte length of a UTF8 code point.
107 |
108 | | Parameter | Type | Description
109 | |-----------------|-----------------|---------------
110 | | cp | *number* | UTF8 code point
111 | | **@returns** | *number* | Byte length
112 |
113 | ### calculateUTF8(src)
114 |
115 | Calculates the number of UTF8 bytes required to store UTF8 code points.
116 |
117 | | Parameter | Type | Description
118 | |-----------------|-----------------|---------------
119 | | src | *function():(number | null)* | Code points source as a function returning the next code point respectively `null` if there are no more code points left.
120 | | **@returns** | *number* | The number of UTF8 bytes required
121 |
122 | ### calculateUTF16asUTF8(src)
123 |
124 | Calculates both the number of UTF8 code points and the number of UTF8 bytes required to store UTF16 char codes.
125 |
126 | | Parameter | Type | Description
127 | |-----------------|-----------------|---------------
128 | | src | *function():(number | null)* | Characters source as a function returning the next char code respectively `null` if there are no more characters left.
129 | | **@returns** | *!Array.<number>* | The number of UTF8 code points at index 0 and the number of UTF8 bytes required at index 1.
130 |
131 | ### arraySource(a)
132 |
133 | Creates a source function for an array.
134 |
135 | | Parameter | Type | Description
136 | |-----------------|-----------------|---------------
137 | | a | *!Array.<number>* | Array to read from
138 | | **@returns** | *function():(number | null)* | Source function returning the next value respectively `null` if there are no more values left.
139 | | **@throws** | *TypeError* | If the argument is invalid
140 |
141 | ### arrayDestination(a)
142 |
143 | Creates a destination function for an array.
144 |
145 | | Parameter | Type | Description
146 | |-----------------|-----------------|---------------
147 | | a | *!Array.<number>* | Array to write to
148 | | **@returns** | *function(number)* | Destination function successively called with the next value.
149 | | **@throws** | *TypeError* | If the argument is invalid
150 |
151 | ### stringSource(s)
152 |
153 | Creates a source function for a string.
154 |
155 | | Parameter | Type | Description
156 | |-----------------|-----------------|---------------
157 | | s | *string* | String to read from
158 | | **@returns** | *function():(number | null)* | Source function returning the next char code respectively `null` if there are no more characters left.
159 | | **@throws** | *TypeError* | If the argument is invalid
160 |
161 | ### stringDestination()
162 |
163 | Creates a destination function for a string.
164 |
165 | | Parameter | Type | Description
166 | |-----------------|-----------------|---------------
167 | | **@returns** | *function(number=):(undefined | string)* | Destination function successively called with the next char code. Returns the final string when called without arguments.
168 |
169 | ### fromCodePoint(var_args)
170 |
171 | A polyfill for `String.fromCodePoint`.
172 |
173 | | Parameter | Type | Description
174 | |-----------------|-----------------|---------------
175 | | var_args | *...number* | Code points
176 | | **@returns** | *string* | JavaScript string
177 | | **@throws** | *TypeError* | If arguments are invalid or a code point is invalid
178 | | **@throws** | *RangeError* | If a code point is out of range
179 |
180 | ### codePointAt(s, i)
181 |
182 | A polyfill for `String#codePointAt`.
183 |
184 | | Parameter | Type | Description
185 | |-----------------|-----------------|---------------
186 | | s | *string* | JavaScript string
187 | | i | *number* | Index
188 | | **@returns** | *number | undefined* | Code point or `undefined` if `i` is out of range
189 | | **@throws** | *TypeError* | If arguments are invalid
190 |
191 | ### polyfill(override=)
192 |
193 | Installs utfx as a polyfill for `String.fromCodePoint` and `String#codePointAt` if not implemented.
194 |
195 | | Parameter | Type | Description
196 | |-----------------|-----------------|---------------
197 | | override | *boolean* | Overrides an existing implementation if `true`, defaults to `false`
198 | | **@returns** | *!Object.<string,*>* | utfx namespace
199 |
200 | Usage
201 | -----
202 | * **node.js**: `npm install utfx`
203 |
204 | ```js
205 | var utfx = require("utfx");
206 | ...
207 | ```
208 |
209 | * **Browser**: `<script src="utfx.min.js"></script>`
210 |
211 | ```js
212 | var utfx = dcodeIO.utfx;
213 | ...
214 | ```
215 |
216 | * **Require.js/AMD**
217 |
218 | ```js
219 | require.config({
220 | "paths": {
221 | "utfx": "/path/to/utfx.min.js"
222 | }
223 | });
224 | require(["utfx"], function(utfx) {
225 | ...
226 | });
227 | ```
228 |
229 | Downloads
230 | ---------
231 | * [Distributions](https://github.com/dcodeIO/utfx/tree/master/dist)
232 |
233 | FAQ and examples
234 | ----------------
235 | * [Wiki](https://github.com/dcodeIO/utfx/wiki)
236 |
237 | License
238 | -------
239 | Apache License, Version 2.0
240 |
--------------------------------------------------------------------------------
/dist/README.md:
--------------------------------------------------------------------------------
1 | Distributions
2 | -------------
3 | * **[utfx.js](https://raw.githubusercontent.com/dcodeIO/utfx/master/dist/utfx.js)**
4 | is the standalone library.
5 | * **[utfx.min.js](https://raw.githubusercontent.com/dcodeIO/utfx/master/dist/utfx.min.js)**
6 | has been compiled with Closure Compiler using advanced optimizations.
7 |
8 | * **[utfx-embeddable.js](https://raw.githubusercontent.com/dcodeIO/utfx/master/dist/utfx-embeddable.js)**
9 | is an embeddable library containing just the core functionality.
10 |
--------------------------------------------------------------------------------
/dist/utfx-embeddable.js:
--------------------------------------------------------------------------------
1 | /**
2 | * utfx-embeddable (c) 2014 Daniel Wirtz
3 | * Released under the Apache License, Version 2.0
4 | * see: https://github.com/dcodeIO/utfx for details
5 | */
var utfx = function() {
    "use strict";

    /**
     * utfx namespace.
     * @inner
     * @type {!Object.<string,*>}
     */
    var utfx = {};

    /**
     * Maximum valid code point.
     * @type {number}
     * @const
     */
    utfx.MAX_CODEPOINT = 0x10FFFF;

    /**
     * Encodes UTF8 code points to UTF8 bytes.
     * @param {(!function():number|null) | number} src Code points source, either as a function returning the next code
     *  point or `null` if there are no more code points left, or a single numeric code point.
     * @param {!function(number)} dst Bytes destination as a function successively called with the next byte
     */
    utfx.encodeUTF8 = function(src, dst) {
        var cp = null;
        if (typeof src === 'number') {
            // Single code point: process it once, then behave like an exhausted source.
            cp = src;
            src = function() { return null; };
        }
        while (cp !== null || (cp = src()) !== null) {
            if (cp < 0x80) {
                dst(cp&0x7F);
            } else if (cp < 0x800) {
                dst(((cp>>6)&0x1F)|0xC0);
                dst((cp&0x3F)|0x80);
            } else if (cp < 0x10000) {
                dst(((cp>>12)&0x0F)|0xE0);
                dst(((cp>>6)&0x3F)|0x80);
                dst((cp&0x3F)|0x80);
            } else {
                dst(((cp>>18)&0x07)|0xF0);
                dst(((cp>>12)&0x3F)|0x80);
                dst(((cp>>6)&0x3F)|0x80);
                dst((cp&0x3F)|0x80);
            }
            cp = null;
        }
    };

    /**
     * Decodes UTF8 bytes to UTF8 code points.
     * @param {!function():number|null} src Bytes source as a function returning the next byte or `null` if there are
     *  no more bytes left.
     * @param {!function(number)} dst Code points destination as a function successively called with each decoded code
     *  point.
     * @throws {RangeError} If a starting byte is invalid in UTF8
     * @throws {Error} If the last sequence is truncated. The error is named `TruncatedError` and has an array
     *  property `bytes` holding the bytes of the incomplete sequence that were read.
     */
    utfx.decodeUTF8 = function(src, dst) {
        var a, b, c, d;

        // Throws the truncation error for the bytes of the incomplete sequence read so far.
        function fail(bytes) {
            var err = Error(bytes.toString());
            err.name = "TruncatedError";
            err['bytes'] = bytes;
            throw err;
        }

        while ((a = src()) !== null) {
            if ((a&0x80) === 0) {
                dst(a);
            } else if ((a&0xE0) === 0xC0) {
                if ((b = src()) === null) fail([a]);
                dst(((a&0x1F)<<6) | (b&0x3F));
            } else if ((a&0xF0) === 0xE0) {
                if ((b = src()) === null) fail([a]);
                if ((c = src()) === null) fail([a, b]);
                dst(((a&0x0F)<<12) | ((b&0x3F)<<6) | (c&0x3F));
            } else if ((a&0xF8) === 0xF0) {
                if ((b = src()) === null) fail([a]);
                if ((c = src()) === null) fail([a, b]);
                if ((d = src()) === null) fail([a, b, c]);
                dst(((a&0x07)<<18) | ((b&0x3F)<<12) | ((c&0x3F)<<6) | (d&0x3F));
            } else {
                throw RangeError("Illegal starting byte: "+a);
            }
        }
    };

    /**
     * Converts UTF16 characters to UTF8 code points. A high surrogate (0xD800-0xDBFF) directly followed by a low
     * surrogate (0xDC00-0xDFFF) is combined into one code point; any other char code, including an unpaired
     * surrogate, is passed through unchanged.
     * @param {!function():number|null} src Characters source as a function returning the next char code or `null` if
     *  there are no more characters left.
     * @param {!function(number)} dst Code points destination as a function successively called with each converted code
     *  point.
     */
    utfx.UTF16toUTF8 = function(src, dst) {
        var c1, c2 = null;
        while (true) {
            if (c2 !== null) {
                // Re-process the unit that was read ahead but did not complete a pair. It must be
                // cleared here, otherwise the same unit would be emitted forever.
                c1 = c2;
                c2 = null;
            } else if ((c1 = src()) === null) {
                break;
            }
            // Only a high surrogate can start a pair.
            if (c1 >= 0xD800 && c1 <= 0xDBFF) {
                c2 = src();
                if (c2 === null) {
                    // Source ended after an unpaired high surrogate.
                    dst(c1);
                    break;
                }
                if (c2 >= 0xDC00 && c2 <= 0xDFFF) {
                    dst((c1-0xD800)*0x400+c2-0xDC00+0x10000);
                    c2 = null;
                    continue;
                }
            }
            dst(c1);
        }
    };

    /**
     * Converts UTF8 code points to UTF16 characters. Code points above 0xFFFF are split into a surrogate pair.
     * The code points are not range-checked by this function.
     * @param {(!function():number|null) | number} src Code points source, either as a function returning the next code
     *  point or `null` if there are no more code points left, or a single numeric code point.
     * @param {!function(number)} dst Characters destination as a function successively called with each converted char
     *  code.
     */
    utfx.UTF8toUTF16 = function(src, dst) {
        var cp = null;
        if (typeof src === 'number') {
            cp = src;
            src = function() { return null; };
        }
        while (cp !== null || (cp = src()) !== null) {
            if (cp <= 0xFFFF) {
                dst(cp);
            } else {
                cp -= 0x10000;
                dst((cp>>10)+0xD800);
                dst((cp%0x400)+0xDC00);
            }
            cp = null;
        }
    };

    /**
     * Converts and encodes UTF16 characters to UTF8 bytes.
     * @param {!function():number|null} src Characters source as a function returning the next char code or `null`
     *  if there are no more characters left.
     * @param {!function(number)} dst Bytes destination as a function successively called with the next byte.
     */
    utfx.encodeUTF16toUTF8 = function(src, dst) {
        utfx.UTF16toUTF8(src, function(cp) {
            utfx.encodeUTF8(cp, dst);
        });
    };

    /**
     * Decodes and converts UTF8 bytes to UTF16 characters.
     * @param {!function():number|null} src Bytes source as a function returning the next byte or `null` if there are
     *  no more bytes left.
     * @param {!function(number)} dst Characters destination as a function successively called with each converted char
     *  code.
     * @throws {RangeError} If a starting byte is invalid in UTF8
     * @throws {Error} If the last sequence is truncated. Has an array property `bytes` holding the remaining bytes.
     */
    utfx.decodeUTF8toUTF16 = function(src, dst) {
        utfx.decodeUTF8(src, function(cp) {
            utfx.UTF8toUTF16(cp, dst);
        });
    };

    /**
     * Calculates the byte length of a UTF8 code point.
     * @param {number} cp UTF8 code point
     * @returns {number} Byte length
     */
    utfx.calculateCodePoint = function(cp) {
        return (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4;
    };

    /**
     * Calculates the number of UTF8 bytes required to store UTF8 code points.
     * @param {(!function():number|null)} src Code points source as a function returning the next code point or `null`
     *  if there are no more code points left.
     * @returns {number} The number of UTF8 bytes required
     */
    utfx.calculateUTF8 = function(src) {
        var cp, l = 0;
        while ((cp = src()) !== null) {
            l += utfx.calculateCodePoint(cp);
        }
        return l;
    };

    /**
     * Calculates the number of UTF8 code points and the number of UTF8 bytes required to store UTF16 char codes.
     * @param {(!function():number|null)} src Characters source as a function returning the next char code or `null`
     *  if there are no more characters left.
     * @returns {!Array.<number>} The number of UTF8 code points at index 0 and the number of UTF8 bytes required at
     *  index 1.
     */
    utfx.calculateUTF16asUTF8 = function(src) {
        var n = 0, l = 0;
        utfx.UTF16toUTF8(src, function(cp) {
            ++n;
            l += utfx.calculateCodePoint(cp);
        });
        return [n, l];
    };

    return utfx;
}();
197 |
--------------------------------------------------------------------------------
/dist/utfx.js:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2014 Daniel Wirtz
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | /**
18 | * @license utfx (c) 2014 Daniel Wirtz
19 | * Released under the Apache License, Version 2.0
20 | * see: https://github.com/dcodeIO/utfx for details
21 | */
22 | (function(global, factory) {
23 |
24 | /* AMD */ if (typeof define === 'function' && define['amd'])
25 | define(factory);
26 | /* CommonJS */ else if (typeof require === "function" && typeof module === 'object' && module && module['exports'])
27 | module['exports'] = factory();
28 | /* Global */ else
29 | (global["dcodeIO"] = global["dcodeIO"] || {})["utfx"] = factory();
30 |
31 | })(this, function() {
32 | "use strict";
33 |
// Install a fallback for Array.isArray on engines that do not provide one.
if (!Array.isArray) {
    Array.isArray = function (value) {
        var tag = Object.prototype.toString.call(value);
        return tag === "[object Array]";
    };
}

/**
 * The utfx namespace object that all library functions are attached to.
 * @exports utfx
 * @type {!Object.<string,*>}
 */
var utfx = {};

/**
 * Largest code point value considered valid (0x10FFFF).
 * @type {number}
 * @const
 * @expose
 */
utfx.MAX_CODEPOINT = 0x10FFFF;
53 |
54 | /**
55 | * Encodes UTF8 code points to UTF8 bytes.
56 | * @param {(!function():number|null) | number} src Code points source, either as a function returning the next code point
57 | * respectively `null` if there are no more code points left or a single numeric code point.
58 | * @param {!function(number)} dst Bytes destination as a function successively called with the next byte
59 | * @expose
60 | */
61 | utfx.encodeUTF8 = function(src, dst) {
62 | var cp = null;
63 | if (typeof src === 'number')
64 | cp = src,
65 | src = function() { return null; };
66 | while (cp !== null || (cp = src()) !== null) {
67 | if (cp < 0x80)
68 | dst(cp&0x7F);
69 | else if (cp < 0x800)
70 | dst(((cp>>6)&0x1F)|0xC0),
71 | dst((cp&0x3F)|0x80);
72 | else if (cp < 0x10000)
73 | dst(((cp>>12)&0x0F)|0xE0),
74 | dst(((cp>>6)&0x3F)|0x80),
75 | dst((cp&0x3F)|0x80);
76 | else
77 | dst(((cp>>18)&0x07)|0xF0),
78 | dst(((cp>>12)&0x3F)|0x80),
79 | dst(((cp>>6)&0x3F)|0x80),
80 | dst((cp&0x3F)|0x80);
81 | cp = null;
82 | }
83 | };
84 |
85 | /**
86 | * Decodes UTF8 bytes to UTF8 code points.
87 | * @param {!function():number|null} src Bytes source as a function returning the next byte respectively `null` if there
88 | * are no more bytes left.
89 | * @param {!function(number)} dst Code points destination as a function successively called with each decoded code point.
90 | * @throws {RangeError} If a starting byte is invalid in UTF8
91 | * @throws {Error} If the last sequence is truncated. Has an array property `bytes` holding the
92 | * remaining bytes.
93 | * @expose
94 | */
95 | utfx.decodeUTF8 = function(src, dst) {
96 | var a, b, c, d, fail = function(b) {
97 | b = b.slice(0, b.indexOf(null));
98 | var err = Error(b.toString());
99 | err.name = "TruncatedError";
100 | err['bytes'] = b;
101 | throw err;
102 | };
103 | while ((a = src()) !== null) {
104 | if ((a&0x80) === 0)
105 | dst(a);
106 | else if ((a&0xE0) === 0xC0)
107 | ((b = src()) === null) && fail([a, b]),
108 | dst(((a&0x1F)<<6) | (b&0x3F));
109 | else if ((a&0xF0) === 0xE0)
110 | ((b=src()) === null || (c=src()) === null) && fail([a, b, c]),
111 | dst(((a&0x0F)<<12) | ((b&0x3F)<<6) | (c&0x3F));
112 | else if ((a&0xF8) === 0xF0)
113 | ((b=src()) === null || (c=src()) === null || (d=src()) === null) && fail([a, b, c ,d]),
114 | dst(((a&0x07)<<18) | ((b&0x3F)<<12) | ((c&0x3F)<<6) | (d&0x3F));
115 | else throw RangeError("Illegal starting byte: "+a);
116 | }
117 | };
118 |
119 | /**
120 | * Converts UTF16 characters to UTF8 code points.
121 | * @param {!function():number|null} src Characters source as a function returning the next char code respectively
122 | * `null` if there are no more characters left.
123 | * @param {!function(number)} dst Code points destination as a function successively called with each converted code
124 | * point.
125 | * @expose
126 | */
127 | utfx.UTF16toUTF8 = function(src, dst) {
128 | var c1, c2 = null;
129 | while (true) {
130 | if ((c1 = c2 !== null ? c2 : src()) === null)
131 | break;
132 | if (c1 >= 0xD800 && c1 <= 0xDFFF) {
133 | if ((c2 = src()) !== null) {
134 | if (c2 >= 0xDC00 && c2 <= 0xDFFF) {
135 | dst((c1-0xD800)*0x400+c2-0xDC00+0x10000);
136 | c2 = null; continue;
137 | }
138 | }
139 | }
140 | dst(c1);
141 | }
142 | if (c2 !== null) dst(c2);
143 | };
144 |
145 | /**
146 | * Converts UTF8 code points to UTF16 characters.
147 | * @param {(!function():number|null) | number} src Code points source, either as a function returning the next code point
148 | * respectively `null` if there are no more code points left or a single numeric code point.
149 | * @param {!function(number)} dst Characters destination as a function successively called with each converted char code.
150 | * @throws {RangeError} If a code point is out of range
151 | * @expose
152 | */
153 | utfx.UTF8toUTF16 = function(src, dst) {
154 | var cp = null;
155 | if (typeof src === 'number')
156 | cp = src, src = function() { return null; };
157 | while (cp !== null || (cp = src()) !== null) {
158 | if (cp <= 0xFFFF)
159 | dst(cp);
160 | else
161 | cp -= 0x10000,
162 | dst((cp>>10)+0xD800),
163 | dst((cp%0x400)+0xDC00);
164 | cp = null;
165 | }
166 | };
167 |
168 | /**
169 | * Converts and encodes UTF16 characters to UTF8 bytes.
170 | * @param {!function():number|null} src Characters source as a function returning the next char code respectively `null`
171 | * if there are no more characters left.
172 | * @param {!function(number)} dst Bytes destination as a function successively called with the next byte.
173 | * @expose
174 | */
175 | utfx.encodeUTF16toUTF8 = function(src, dst) {
176 | utfx.UTF16toUTF8(src, function(cp) {
177 | utfx.encodeUTF8(cp, dst);
178 | });
179 | };
180 |
181 | /**
182 | * Decodes and converts UTF8 bytes to UTF16 characters.
183 | * @param {!function():number|null} src Bytes source as a function returning the next byte respectively `null` if there
184 | * are no more bytes left.
185 | * @param {!function(number)} dst Characters destination as a function successively called with each converted char code.
186 | * @throws {RangeError} If a starting byte is invalid in UTF8
187 | * @throws {Error} If the last sequence is truncated. Has an array property `bytes` holding the remaining bytes.
188 | * @expose
189 | */
190 | utfx.decodeUTF8toUTF16 = function(src, dst) {
191 | utfx.decodeUTF8(src, function(cp) {
192 | utfx.UTF8toUTF16(cp, dst);
193 | });
194 | };
195 |
196 | /**
197 | * Calculates the byte length of an UTF8 code point.
198 | * @param {number} cp UTF8 code point
199 | * @returns {number} Byte length
200 | * @expose
201 | */
202 | utfx.calculateCodePoint = function(cp) {
203 | return (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4;
204 | };
205 |
206 | /**
207 | * Calculates the number of UTF8 bytes required to store UTF8 code points.
208 | * @param {(!function():number|null)} src Code points source as a function returning the next code point respectively
209 | * `null` if there are no more code points left.
210 | * @returns {number} The number of UTF8 bytes required
211 | * @expose
212 | */
213 | utfx.calculateUTF8 = function(src) {
214 | var cp, l=0;
215 | while ((cp = src()) !== null)
216 | l += (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4;
217 | return l;
218 | };
219 |
220 | /**
221 | * Calculates the number of UTF8 code points respectively UTF8 bytes required to store UTF16 char codes.
222 | * @param {(!function():number|null)} src Characters source as a function returning the next char code respectively
223 | * `null` if there are no more characters left.
224 | * @returns {!Array.} The number of UTF8 code points at index 0 and the number of UTF8 bytes required at index 1.
225 | * @expose
226 | */
227 | utfx.calculateUTF16asUTF8 = function(src) {
228 | var n=0, l=0;
229 | utfx.UTF16toUTF8(src, function(cp) {
230 | ++n; l += (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4;
231 | });
232 | return [n,l];
233 | };
234 |
235 | /**
236 | * String.fromCharCode reference for compile time renaming.
237 | * @type {!function(...[number]):string}
238 | * @inner
239 | */
240 | var stringFromCharCode = String.fromCharCode;
241 |
242 | /**
243 | * Creates a source function for an array.
244 | * @param {!Array.} a Array to read from
245 | * @returns {!function():number|null} Source function returning the next value respectively `null` if there are no
246 | * more values left.
247 | * @throws {TypeError} If the argument is invalid
248 | * @expose
249 | */
250 | utfx.arraySource = function(a) {
251 | if (!Array.isArray(a))
252 | throw TypeError("Illegal argument: "+(typeof a));
253 | var i=0; return function() {
254 | return i >= a.length ? null : a[i++];
255 | };
256 | };
257 |
258 | /**
259 | * Creates a destination function for an array.
260 | * @param {!Array.} a Array to write to
261 | * @returns {!function(number)} Destination function successively called with the next value.
262 | * @throws {TypeError} If the argument is invalid
263 | * @expose
264 | */
265 | utfx.arrayDestination = function(a) {
266 | if (!Array.isArray(a))
267 | throw TypeError("Illegal argument: "+(typeof a));
268 | return Array.prototype.push.bind(a);
269 | };
270 |
271 | /**
272 | * Creates a source function for a string.
273 | * @param {string} s String to read from
274 | * @returns {!function():number|null} Source function returning the next char code respectively `null` if there are
275 | * no more characters left.
276 | * @throws {TypeError} If the argument is invalid
277 | * @expose
278 | */
279 | utfx.stringSource = function(s) {
280 | if (typeof s !== 'string')
281 | throw TypeError("Illegal argument: "+(typeof s));
282 | var i=0; return function() {
283 | return i >= s.length ? null : s.charCodeAt(i++);
284 | };
285 | };
286 |
287 | /**
288 | * Creates a destination function for a string.
289 | * @returns {function(number=):undefined|string} Destination function successively called with the next char code.
290 | * Returns the final string when called without arguments.
291 | * @expose
292 | */
293 | utfx.stringDestination = function() {
294 | var cs = [], ps = []; return function() {
295 | if (arguments.length === 0)
296 | return ps.join('')+stringFromCharCode.apply(String, cs);
297 | if (cs.length + arguments.length > 1024)
298 | ps.push(stringFromCharCode.apply(String, cs)),
299 | cs.length = 0;
300 | Array.prototype.push.apply(cs, arguments);
301 | };
302 | };
303 |
304 | /**
305 | * Asserts an UTF16 char code.
306 | * @param {number} c UTF16 char code
307 | * @returns {number} Valid char code
308 | * @throws {TypeError} If the char code is invalid
309 | * @throws {RangeError} If the char code is out of range
310 | * @expose
311 | */
312 | utfx.assertCharCode = function(c) {
313 | if (typeof c !== 'number' || c !== c)
314 | throw TypeError("Illegal char code: "+(typeof c));
315 | if (c < 0 || c > 0xFFFF)
316 | throw RangeError("Illegal char code: "+c);
317 | return c;
318 | };
319 |
320 | /**
321 | * Asserts an UTF8 code point.
322 | * @param {number} cp UTF8 code point
323 | * @returns {number} Valid code point
324 | * @throws {TypeError} If the code point is invalid
325 | * @throws {RangeError} If the code point is out of range
326 | * @expose
327 | */
328 | utfx.assertCodePoint = function(cp) {
329 | if (typeof cp !== 'number' || cp !== cp)
330 | throw TypeError("Illegal code point: "+(typeof cp));
331 | if (cp < 0 || cp > utfx.MAX_CODEPOINT)
332 | throw RangeError("Illegal code point: "+cp);
333 | return cp;
334 | };
335 |
336 | /**
337 | * A polyfill for `String.fromCodePoint`.
338 | * @param {...number} var_args Code points
339 | * @returns {string} JavaScript string
340 | * @throws {TypeError} If arguments are invalid or a code point is invalid
341 | * @throws {RangeError} If a code point is out of range
342 | * @expose
343 | */
344 | utfx.fromCodePoint = function(var_args) {
345 | var sd, i=0, cps=arguments, k=cps.length;
346 | utfx.UTF8toUTF16(function() {
347 | return i < k ? utfx.assertCodePoint(cps[i++]) : null;
348 | }, sd = utfx.stringDestination());
349 | return sd();
350 | };
351 |
352 | /**
353 | * A polyfill for `String#codePointAt`.
354 | * @param {string} s JavaScript string
355 | * @param {number} i Index
356 | * @returns {number|undefined} Code point or `undefined` if `i` is out of range
357 | * @throws {TypeError} If arguments are invalid
358 | * @expose
359 | */
360 | utfx.codePointAt = function(s, i) {
361 | if ((typeof s !== 'string' && !(s && s instanceof String)) || typeof i !== 'number')
362 | throw TypeError("Illegal arguments: "+(typeof s)+", "+(typeof i));
363 | var k, cp;
364 | if (i < 0 || i >= (k=s.length))
365 | return;
366 | utfx.UTF16toUTF8(function() {
367 | return typeof cp === 'undefined' && i < k ? s.charCodeAt(i++) : null;
368 | }, function(icp) {
369 | cp = icp;
370 | });
371 | return cp;
372 | };
373 |
374 | /**
375 | * Installs utfx as a polyfill for `String.fromCodePoint` and `String#codePointAt` if not implemented.
376 | * @param {boolean=} override Overrides an existing implementation if `true`, defaults to `false`
377 | * @returns {!Object.} utfx namespace
378 | * @expose
379 | */
380 | utfx.polyfill = function(override) {
381 | if (!String['fromCodePoint'] || override)
382 | String['fromCodePoint'] = utfx.fromCodePoint;
383 | if (!String.prototype['codePointAt'] || override)
384 | String.prototype['codePointAt'] = function(i) { return utfx.codePointAt(this, i); };
385 | return utfx;
386 | };
387 |
388 | return utfx;
389 |
390 | });
391 |
--------------------------------------------------------------------------------
/dist/utfx.min.js:
--------------------------------------------------------------------------------
1 | /*
2 | utfx (c) 2014 Daniel Wirtz
3 | Released under the Apache License, Version 2.0
4 | see: https://github.com/dcodeIO/utfx for details
5 | */
6 | function g(){Array.isArray||(Array.isArray=function(a){return"[object Array]"===Object.prototype.toString.call(a)});var d={MAX_CODEPOINT:1114111,encodeUTF8:function(a,b){var c=null;"number"===typeof a&&(c=a,a=function(){return null});for(;null!==c||null!==(c=a());)128>c?b(c&127):(2048>c?b(c>>6&31|192):(65536>c?b(c>>12&15|224):(b(c>>18&7|240),b(c>>12&63|128)),b(c>>6&63|128)),b(c&63|128)),c=null},decodeUTF8:function(a,b){function c(a){a=a.slice(0,a.indexOf(null));var b=Error(a.toString());b.name="TruncatedError";
7 | b.bytes=a;throw b;}for(var e,d,f,h;null!==(e=a());)if(0===(e&128))b(e);else if(192===(e&224))null===(d=a())&&c([e,d]),b((e&31)<<6|d&63);else if(224===(e&240))null!==(d=a())&&null!==(f=a())||c([e,d,f]),b((e&15)<<12|(d&63)<<6|f&63);else if(240===(e&248))null!==(d=a())&&null!==(f=a())&&null!==(h=a())||c([e,d,f,h]),b((e&7)<<18|(d&63)<<12|(f&63)<<6|h&63);else throw RangeError("Illegal starting byte: "+e);},UTF16toUTF8:function(a,b){for(var c,e=null;null!==(c=null!==e?e:a());)55296<=c&&57343>=c&&null!==
8 | (e=a())&&56320<=e&&57343>=e?(b(1024*(c-55296)+e-56320+65536),e=null):b(c);null!==e&&b(e)},UTF8toUTF16:function(a,b){var c=null;"number"===typeof a&&(c=a,a=function(){return null});for(;null!==c||null!==(c=a());)65535>=c?b(c):(c-=65536,b((c>>10)+55296),b(c%1024+56320)),c=null},encodeUTF16toUTF8:function(a,b){d.UTF16toUTF8(a,function(a){d.encodeUTF8(a,b)})},decodeUTF8toUTF16:function(a,b){d.decodeUTF8(a,function(a){d.UTF8toUTF16(a,b)})},calculateCodePoint:function(a){return 128>a?1:2048>a?2:65536>a?
9 | 3:4},calculateUTF8:function(a){for(var b,c=0;null!==(b=a());)c+=128>b?1:2048>b?2:65536>b?3:4;return c},calculateUTF16asUTF8:function(a){var b=0,c=0;d.UTF16toUTF8(a,function(a){++b;c+=128>a?1:2048>a?2:65536>a?3:4});return[b,c]}},k=String.fromCharCode;d.arraySource=function(a){if(!Array.isArray(a))throw TypeError("Illegal argument: "+typeof a);var b=0;return function(){return b>=a.length?null:a[b++]}};d.arrayDestination=function(a){if(!Array.isArray(a))throw TypeError("Illegal argument: "+typeof a);
10 | return Array.prototype.push.bind(a)};d.stringSource=function(a){if("string"!==typeof a)throw TypeError("Illegal argument: "+typeof a);var b=0;return function(){return b>=a.length?null:a.charCodeAt(b++)}};d.stringDestination=function(){var a=[],b=[];return function(){if(0===arguments.length)return b.join("")+k.apply(String,a);1024a||65535a||a>d.MAX_CODEPOINT)throw RangeError("Illegal code point: "+a);return a};d.fromCodePoint=function(a){var b,c=0,e=arguments,l=e.length;d.UTF8toUTF16(function(){return cb||b>=(c=a.length)))return d.UTF16toUTF8(function(){return"undefined"===typeof e&&b
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
// Node.js entry point: expose the standalone distribution build as this package's module.
var utfx = require("./dist/utfx.js");
module.exports = utfx;
18 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "utfx",
3 | "author": "Daniel Wirtz ",
4 | "version": "1.0.1",
5 | "description": "A compact library to encode, decode and convert UTF8 / UTF16 in JavaScript.",
6 | "main": "index.js",
7 | "repository": {
8 | "type": "git",
9 | "url": "https://github.com/dcodeIO/utfx.git"
10 | },
11 | "bugs": {
12 | "url": "https://github.com/dcodeIO/utfx/issues"
13 | },
14 | "keywords": [
15 | "charset",
16 | "encoding",
17 | "unicode",
18 | "utf8",
19 | "utf16"
20 | ],
21 | "dependencies": {},
22 | "devDependencies": {
23 | "closurecompiler": "~1",
24 | "metascript": "~0",
25 | "pretty-hrtime": "^1.0.0",
26 | "testjs": "~1",
27 | "utf8": "^2.1.0"
28 | },
29 | "license": "Apache-2.0",
30 | "engines": {
31 | "node": ">=0.8"
32 | },
33 | "scripts": {
34 | "build": "node node_modules/metascript/bin/metascript src/wrap_standalone.js > dist/utfx.js && node node_modules/metascript/bin/metascript src/wrap_embeddable.js > dist/utfx-embeddable.js",
35 | "compile": "node node_modules/closurecompiler/bin/ccjs dist/utfx.js --compilation_level=ADVANCED_OPTIMIZATIONS > dist/utfx.min.js",
36 | "test": "node tests/suite.js",
37 | "make": "npm run-script build && npm run-script compile && npm test"
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/README.md:
--------------------------------------------------------------------------------
1 | Sources
2 | -------
3 | * **[wrap_standalone.js](https://github.com/dcodeIO/utfx/blob/master/src/wrap_standalone.js)**
4 | is the wrapper for the standalone library.
5 |
6 | * **[wrap_embeddable.js](https://github.com/dcodeIO/utfx/blob/master/src/wrap_embeddable.js)**
7 | is the wrapper for the embeddable library.
8 |
9 | * **[utfx.js](https://github.com/dcodeIO/utfx/blob/master/src/utfx.js)**
10 | is the core functionality.
11 |
--------------------------------------------------------------------------------
/src/header.txt:
--------------------------------------------------------------------------------
1 | /*! utfx-embeddable | (c) 2014 Daniel Wirtz | http://www.apache.org/licenses/LICENSE-2.0.html */
2 |
--------------------------------------------------------------------------------
/src/utfx.js:
--------------------------------------------------------------------------------
/**
 * utfx namespace.
//? if (UTFX_STANDALONE)
 * @exports utfx
//? else
 * @inner
 * @type {!Object.<string,*>}
 */
var utfx = {};

/**
 * Maximum valid code point.
 * @type {number}
 * @const
//? if (UTFX_STANDALONE)
 * @expose
 */
utfx.MAX_CODEPOINT = 0x10FFFF;

/**
 * Encodes UTF8 code points to UTF8 bytes.
 * @param {(!function():number|null) | number} src Code points source, either as a function returning the next code point
 *  respectively `null` if there are no more code points left or a single numeric code point.
 * @param {!function(number)} dst Bytes destination as a function successively called with the next byte
//? if (UTFX_STANDALONE)
 * @expose
 */
utfx.encodeUTF8 = function(src, dst) {
    var cp = null;
    if (typeof src === 'number') {
        // Single code point: encode it once, then behave like an exhausted source.
        cp = src;
        src = function() { return null; };
    }
    while (cp !== null || (cp = src()) !== null) {
        if (cp < 0x80) {
            dst(cp&0x7F);
        } else if (cp < 0x800) {
            dst(((cp>>6)&0x1F)|0xC0);
            dst((cp&0x3F)|0x80);
        } else if (cp < 0x10000) {
            dst(((cp>>12)&0x0F)|0xE0);
            dst(((cp>>6)&0x3F)|0x80);
            dst((cp&0x3F)|0x80);
        } else {
            dst(((cp>>18)&0x07)|0xF0);
            dst(((cp>>12)&0x3F)|0x80);
            dst(((cp>>6)&0x3F)|0x80);
            dst((cp&0x3F)|0x80);
        }
        cp = null;
    }
};

/**
 * Decodes UTF8 bytes to UTF8 code points. Only the starting byte of a sequence is inspected to determine its
 *  length; continuation bytes are masked but not validated.
 * @param {!function():number|null} src Bytes source as a function returning the next byte respectively `null` if there
 *  are no more bytes left.
 * @param {!function(number)} dst Code points destination as a function successively called with each decoded code point.
 * @throws {RangeError} If a starting byte is invalid in UTF8
 * @throws {Error} If the last sequence is truncated. Has the name `TruncatedError` and an array property `bytes`
 *  holding the remaining bytes.
//? if (UTFX_STANDALONE)
 * @expose
 */
utfx.decodeUTF8 = function(src, dst) {
    var a, b, c, d, fail = function(b) {
        // Everything before the first `null` is what was actually read of the incomplete sequence.
        b = b.slice(0, b.indexOf(null));
        var err = Error(b.toString());
        err.name = "TruncatedError";
        err['bytes'] = b;
        throw err;
    };
    while ((a = src()) !== null) {
        if ((a&0x80) === 0) {
            dst(a);
        } else if ((a&0xE0) === 0xC0) {
            if ((b = src()) === null)
                fail([a, b]);
            dst(((a&0x1F)<<6) | (b&0x3F));
        } else if ((a&0xF0) === 0xE0) {
            if ((b=src()) === null || (c=src()) === null)
                fail([a, b, c]);
            dst(((a&0x0F)<<12) | ((b&0x3F)<<6) | (c&0x3F));
        } else if ((a&0xF8) === 0xF0) {
            if ((b=src()) === null || (c=src()) === null || (d=src()) === null)
                fail([a, b, c ,d]);
            dst(((a&0x07)<<18) | ((b&0x3F)<<12) | ((c&0x3F)<<6) | (d&0x3F));
        } else throw RangeError("Illegal starting byte: "+a);
    }
};

/**
 * Converts UTF16 characters to UTF8 code points. A high surrogate (0xD800-0xDBFF) immediately followed by a low
 *  surrogate (0xDC00-0xDFFF) is combined into one supplementary code point; any other char code, including an
 *  unpaired surrogate, is passed through unchanged.
 * @param {!function():number|null} src Characters source as a function returning the next char code respectively
 *  `null` if there are no more characters left.
 * @param {!function(number)} dst Code points destination as a function successively called with each converted code
 *  point.
//? if (UTFX_STANDALONE)
 * @expose
 */
utfx.UTF16toUTF8 = function(src, dst) {
    var c1, c2 = null;
    while (true) {
        if (c2 !== null) {
            // Re-process the look-ahead unit that did not complete a pair. Clearing it here guarantees
            // forward progress (it used to be re-emitted forever after an unpaired surrogate).
            c1 = c2;
            c2 = null;
        } else if ((c1 = src()) === null)
            break;
        // Only a high surrogate can lead a pair; a low surrogate in lead position is unpaired.
        if (c1 >= 0xD800 && c1 <= 0xDBFF) {
            if ((c2 = src()) !== null && c2 >= 0xDC00 && c2 <= 0xDFFF) {
                dst((c1-0xD800)*0x400+c2-0xDC00+0x10000);
                c2 = null;
                continue;
            }
        }
        dst(c1);
    }
};

/**
 * Converts UTF8 code points to UTF16 characters. Code points are not validated here: values up to 0xFFFF are
 *  emitted as is and larger values are split into two units, which is only meaningful up to
 *  {@link utfx.MAX_CODEPOINT}.
 * @param {(!function():number|null) | number} src Code points source, either as a function returning the next code point
 *  respectively `null` if there are no more code points left or a single numeric code point.
 * @param {!function(number)} dst Characters destination as a function successively called with each converted char code.
//? if (UTFX_STANDALONE)
 * @expose
 */
utfx.UTF8toUTF16 = function(src, dst) {
    var cp = null;
    if (typeof src === 'number') {
        cp = src;
        src = function() { return null; };
    }
    while (cp !== null || (cp = src()) !== null) {
        if (cp <= 0xFFFF) {
            dst(cp);
        } else {
            cp -= 0x10000;
            dst((cp>>10)+0xD800);
            dst((cp%0x400)+0xDC00);
        }
        cp = null;
    }
};

/**
 * Converts and encodes UTF16 characters to UTF8 bytes.
 * @param {!function():number|null} src Characters source as a function returning the next char code respectively `null`
 *  if there are no more characters left.
 * @param {!function(number)} dst Bytes destination as a function successively called with the next byte.
//? if (UTFX_STANDALONE)
 * @expose
 */
utfx.encodeUTF16toUTF8 = function(src, dst) {
    utfx.UTF16toUTF8(src, function(cp) {
        utfx.encodeUTF8(cp, dst);
    });
};

/**
 * Decodes and converts UTF8 bytes to UTF16 characters.
 * @param {!function():number|null} src Bytes source as a function returning the next byte respectively `null` if there
 *  are no more bytes left.
 * @param {!function(number)} dst Characters destination as a function successively called with each converted char code.
 * @throws {RangeError} If a starting byte is invalid in UTF8
 * @throws {Error} If the last sequence is truncated. Has an array property `bytes` holding the remaining bytes.
//? if (UTFX_STANDALONE)
 * @expose
 */
utfx.decodeUTF8toUTF16 = function(src, dst) {
    utfx.decodeUTF8(src, function(cp) {
        utfx.UTF8toUTF16(cp, dst);
    });
};

/**
 * Calculates the byte length of an UTF8 code point.
 * @param {number} cp UTF8 code point
 * @returns {number} Byte length
//? if (UTFX_STANDALONE)
 * @expose
 */
utfx.calculateCodePoint = function(cp) {
    return (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4;
};

/**
 * Calculates the number of UTF8 bytes required to store UTF8 code points.
 * @param {(!function():number|null)} src Code points source as a function returning the next code point respectively
 *  `null` if there are no more code points left.
 * @returns {number} The number of UTF8 bytes required
//? if (UTFX_STANDALONE)
 * @expose
 */
utfx.calculateUTF8 = function(src) {
    var cp, l=0;
    while ((cp = src()) !== null)
        l += (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4;
    return l;
};

/**
 * Calculates the number of UTF8 code points respectively UTF8 bytes required to store UTF16 char codes.
 * @param {(!function():number|null)} src Characters source as a function returning the next char code respectively
 *  `null` if there are no more characters left.
 * @returns {!Array.<number>} The number of UTF8 code points at index 0 and the number of UTF8 bytes required at index 1.
//? if (UTFX_STANDALONE)
 * @expose
 */
utfx.calculateUTF16asUTF8 = function(src) {
    var n=0, l=0;
    utfx.UTF16toUTF8(src, function(cp) {
        ++n; l += (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4;
    });
    return [n,l];
};
209 |
--------------------------------------------------------------------------------
/src/wrap_embeddable.js:
--------------------------------------------------------------------------------
1 | //? UTFX_STANDALONE = false;
2 | /**
3 | * utfx-embeddable (c) 2014 Daniel Wirtz
4 | * Released under the Apache License, Version 2.0
5 | * see: https://github.com/dcodeIO/utfx for details
6 | */
var utfx = (function() {
    "use strict";

    // The build step expands the directive below into the core library, which declares the
    // inner `utfx` object that is returned from this closure.
    //? include("utfx.js");

    return utfx;
}());
14 |
--------------------------------------------------------------------------------
/src/wrap_standalone.js:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2014 Daniel Wirtz
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | //? UTFX_STANDALONE = true;
17 |
18 | /**
19 | * @license utfx (c) 2014 Daniel Wirtz
20 | * Released under the Apache License, Version 2.0
21 | * see: https://github.com/dcodeIO/utfx for details
22 | */
23 | (function(global, factory) {
24 |
25 | /* AMD */ if (typeof define === 'function' && define['amd'])
26 | define(factory);
27 | /* CommonJS */ else if (typeof require === "function" && typeof module === 'object' && module && module['exports'])
28 | module['exports'] = factory();
29 | /* Global */ else
30 | (global["dcodeIO"] = global["dcodeIO"] || {})["utfx"] = factory();
31 |
32 | })(this, function() {
33 | "use strict";
34 |
// Fallback for engines without the ES5 Array.isArray: identify arrays by their
// internal class tag. Left untouched when a native implementation exists.
if (!Array.isArray) {
    Array.isArray = function (value) {
        var tag = Object.prototype.toString.call(value);
        return tag === "[object Array]";
    };
}
39 |
40 | //? include("utfx.js");
41 |
42 | /**
43 | * String.fromCharCode reference for compile time renaming.
44 | * @type {!function(...[number]):string}
45 | * @inner
46 | */
47 | var stringFromCharCode = String.fromCharCode;
48 |
49 | /**
50 | * Creates a source function for an array.
51 | * @param {!Array.<number>} a Array to read from
52 | * @returns {!function():number|null} Source function returning the next value respectively `null` if there are no
53 | * more values left.
54 | * @throws {TypeError} If the argument is invalid
55 | //? if (UTFX_STANDALONE)
56 | * @expose
57 | */
utfx.arraySource = function(a) {
    if (!Array.isArray(a)) {
        throw TypeError("Illegal argument: "+(typeof a));
    }
    // Read cursor captured by the returned closure.
    var pos = 0;
    return function() {
        if (pos < a.length) {
            return a[pos++];
        }
        // Exhausted: signal the end of the source.
        return null;
    };
};
65 |
66 | /**
67 | * Creates a destination function for an array.
68 | * @param {!Array.<number>} a Array to write to
69 | * @returns {!function(number)} Destination function successively called with the next value.
70 | * @throws {TypeError} If the argument is invalid
71 | //? if (UTFX_STANDALONE)
72 | * @expose
73 | */
utfx.arrayDestination = function(a) {
    if (!Array.isArray(a)) {
        throw TypeError("Illegal argument: "+(typeof a));
    }
    // The destination is simply `push` permanently bound to the target array.
    var push = Array.prototype.push;
    return push.bind(a);
};
79 |
80 | /**
81 | * Creates a source function for a string.
82 | * @param {string} s String to read from
83 | * @returns {!function():number|null} Source function returning the next char code respectively `null` if there are
84 | * no more characters left.
85 | * @throws {TypeError} If the argument is invalid
86 | //? if (UTFX_STANDALONE)
87 | * @expose
88 | */
utfx.stringSource = function(s) {
    if (typeof s !== 'string') {
        throw TypeError("Illegal argument: "+(typeof s));
    }
    // Index of the next UTF16 char code to hand out.
    var pos = 0;
    return function() {
        if (pos < s.length) {
            return s.charCodeAt(pos++);
        }
        // Exhausted: signal the end of the source.
        return null;
    };
};
96 |
97 | /**
98 | * Creates a destination function for a string.
99 | * @returns {function(number=):undefined|string} Destination function successively called with the next char code.
100 | * Returns the final string when called without arguments.
101 | //? if (UTFX_STANDALONE)
102 | * @expose
103 | */
utfx.stringDestination = function() {
    var pending = [], // char codes received but not yet converted
        parts = [];   // string chunks that have already been converted
    return function() {
        var argc = arguments.length;
        if (argc === 0) {
            // Called without arguments: finish and return the assembled string.
            return parts.join('')+stringFromCharCode.apply(String, pending);
        }
        // Convert the buffer in chunks so that `apply` never gets an oversized argument list.
        if (pending.length + argc > 1024) {
            parts.push(stringFromCharCode.apply(String, pending));
            pending.length = 0;
        }
        Array.prototype.push.apply(pending, arguments);
    };
};
114 |
115 | /**
116 | * Asserts an UTF16 char code.
117 | * @param {number} c UTF16 char code
118 | * @returns {number} Valid char code
119 | * @throws {TypeError} If the char code is invalid
120 | * @throws {RangeError} If the char code is out of range
121 | //? if (UTFX_STANDALONE)
122 | * @expose
123 | */
utfx.assertCharCode = function(c) {
    // `c === c` is false only for NaN, which is a number by type but not a usable char code.
    var isUsableNumber = typeof c === 'number' && c === c;
    if (!isUsableNumber) {
        throw TypeError("Illegal char code: "+(typeof c));
    }
    if (!(c >= 0 && c <= 0xFFFF)) {
        throw RangeError("Illegal char code: "+c);
    }
    return c;
};
131 |
132 | /**
133 | * Asserts an UTF8 code point.
134 | * @param {number} cp UTF8 code point
135 | * @returns {number} Valid code point
136 | * @throws {TypeError} If the code point is invalid
137 | * @throws {RangeError} If the code point is out of range
138 | //? if (UTFX_STANDALONE)
139 | * @expose
140 | */
utfx.assertCodePoint = function(cp) {
    // `cp === cp` is false only for NaN, which is a number by type but not a usable code point.
    var isUsableNumber = typeof cp === 'number' && cp === cp;
    if (!isUsableNumber) {
        throw TypeError("Illegal code point: "+(typeof cp));
    }
    if (!(cp >= 0 && cp <= utfx.MAX_CODEPOINT)) {
        throw RangeError("Illegal code point: "+cp);
    }
    return cp;
};
148 |
/**
 * A polyfill for `String.fromCodePoint`.
 * @param {...number} var_args Code points to convert
 * @returns {string} JavaScript string built from the given code points
 * @throws {TypeError} If arguments are invalid or a code point is invalid
 * @throws {RangeError} If a code point is out of range
 * @expose
 */
utfx.fromCodePoint = function(var_args) {
    var codePoints = arguments,
        total = codePoints.length,
        next = 0,
        destination = utfx.stringDestination();
    // Source: hands out one asserted code point per call and `null` once all arguments are consumed.
    function nextCodePoint() {
        if (next < total) {
            return utfx.assertCodePoint(codePoints[next++]);
        }
        return null;
    }
    utfx.UTF8toUTF16(nextCodePoint, destination);
    // Calling the destination without arguments yields the assembled string.
    return destination();
};
164 |
/**
 * A polyfill for `String#codePointAt`.
 * @param {string} s JavaScript string (a `String` object is accepted as well)
 * @param {number} i Index to read the code point at
 * @returns {number|undefined} Code point or `undefined` if `i` is out of range
 * @throws {TypeError} If arguments are invalid
 * @expose
 */
utfx.codePointAt = function(s, i) {
    var isStringLike = typeof s === 'string' || (s && s instanceof String);
    if (!isStringLike || typeof i !== 'number') {
        throw TypeError("Illegal arguments: "+(typeof s)+", "+(typeof i));
    }
    var length = s.length,
        position = i,
        result;
    if (position < 0 || position >= length) {
        return;
    }
    // Source: supplies char codes only for as long as no code point has been received.
    function nextCharCode() {
        if (typeof result === 'undefined' && position < length) {
            return s.charCodeAt(position++);
        }
        return null;
    }
    // Destination: stores the code point handed over by the converter.
    function receive(codePoint) {
        result = codePoint;
    }
    utfx.UTF16toUTF8(nextCharCode, receive);
    return result;
};
186 |
/**
 * Installs utfx as a polyfill for `String.fromCodePoint` and `String#codePointAt` where these are not implemented.
 * @param {boolean=} override Replaces existing implementations as well if `true`, defaults to `false`
 * @returns {!Object.<string,*>} utfx namespace
 * @expose
 */
utfx.polyfill = function(override) {
    if (override || !String['fromCodePoint']) {
        String['fromCodePoint'] = utfx.fromCodePoint;
    }
    if (override || !String.prototype['codePointAt']) {
        // Thin method wrapper: the string the method is invoked on becomes the first argument.
        String.prototype['codePointAt'] = function(i) {
            return utfx.codePointAt(this, i);
        };
    }
    return utfx;
};
200 |
201 | return utfx;
202 |
203 | });
204 |
--------------------------------------------------------------------------------
/tests/bench.js:
--------------------------------------------------------------------------------
1 | var utfx = require("../index.js"),
2 | utf8 = require("utf8"),
3 | prettyHrTime = require("pretty-hrtime");
4 |
5 | var bench = {};
6 | var impls = ["node", "utfx", "utf8", "binary"];
7 | var str = "Hello world! ä☺𠜎️☁ Hello world! ä☺𠜎️☁ Hello world! ä☺𠜎️☁";
8 |
// Both containers are sized to the byte length `Buffer.byteLength` reports for `str`.
// `Buffer.alloc` replaces the deprecated `new Buffer(size)` constructor and returns zero-filled memory
// instead of uninitialized memory.
var encodeUTF16toUTF8_Buffer = Buffer.alloc(Buffer.byteLength(str));
var encodeUTF16toUTF8_Array = new Uint8Array(encodeUTF16toUTF8_Buffer.length);
11 |
12 | bench["encodeUTF16toUTF8"] = function(type, n) {
13 | n = n || 1000000;
14 | switch (type) {
15 | case "node":
16 | for (var i=0; i