├── test
│   ├── mocha.opts
│   └── test.js
├── .gitignore
├── swap.js
├── package.json
├── LICENSE
├── README.md
└── index.js

/test/mocha.opts:
--------------------------------------------------------------------------------
--reporter spec
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
node_modules/
coverage.html
--------------------------------------------------------------------------------
/swap.js:
--------------------------------------------------------------------------------
const isBigEndian = (new Uint8Array(new Uint32Array([0x12345678]).buffer)[0] === 0x12);

const swap = (b, n, m) => {
  let i = b[n];
  b[n] = b[m];
  b[m] = i;
};

const swap32 = array => {
  const len = array.length;
  for (let i = 0; i < len; i += 4) {
    swap(array, i, i + 3);
    swap(array, i + 1, i + 2);
  }
};

const swap32LE = array => {
  if (isBigEndian) {
    swap32(array);
  }
};

module.exports = {
  swap32LE: swap32LE
};
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "unicode-trie",
  "version": "2.0.0",
  "description": "Unicode Trie data structure for fast character metadata lookup, ported from ICU",
  "devDependencies": {
    "mocha": "^6.1.4",
    "nyc": "^14.1.1"
  },
  "scripts": {
    "test": "mocha",
    "coverage": "nyc mocha"
  },
  "repository": {
    "type": "git",
    "url": "git://github.com/devongovett/unicode-trie.git"
  },
  "author": "Devon Govett",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/devongovett/unicode-trie/issues"
  },
  "homepage": "https://github.com/devongovett/unicode-trie",
  "dependencies": {
    "pako": "^0.2.5",
    "tiny-inflate": "^1.0.0"
  }
}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2018

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# unicode-trie
A data structure for fast Unicode character metadata lookup, ported from ICU

## Background

When implementing many Unicode algorithms such as text segmentation,
normalization, bidi processing, etc., fast access to character metadata
is crucial to good performance. There are over a million code points in the
Unicode standard, many of which produce the same result when looked up,
so an array or hash table is not appropriate - those data structures are
fast but would require a lot of memory. The data is generally
grouped in ranges, so you could do a binary search, but that is not
fast enough for some applications.

The [International Components for Unicode](http://site.icu-project.org) (ICU) project
came up with a data structure based on a [Trie](http://en.wikipedia.org/wiki/Trie) that provides fast access
to Unicode metadata. The range data is precompiled to a serialized
and flattened trie, which is then used at runtime to look up the necessary
data. According to my own tests, this is generally at least 50% faster
than binary search, with not too much additional memory required.

## Installation

    npm install unicode-trie

## Building a Trie

Unicode Tries are generally precompiled from data in the Unicode database
for faster runtime performance. To build a Unicode Trie, use the
`UnicodeTrieBuilder` class.

```js
const UnicodeTrieBuilder = require('unicode-trie/builder');
const fs = require('fs');

// create a trie
let t = new UnicodeTrieBuilder();

// optional parameters for default value, and error value
// if not provided, both are set to 0
t = new UnicodeTrieBuilder(10, 999);

// set individual values and ranges
t.set(0x4567, 99);
t.setRange(0x40, 0xe7, 0x1234);

// you can look up a value if you like
t.get(0x4567); // => 99

// get a compiled trie (returns a UnicodeTrie object)
const trie = t.freeze();

// write compressed trie to a binary file
fs.writeFileSync('data.trie', t.toBuffer());
```

## Using a precompiled Trie

Once you've built a precompiled trie, you can load it into the
`UnicodeTrie` class, which is a read-only representation of the
trie. From there, you can look up values.

```js
const UnicodeTrie = require('unicode-trie');
const fs = require('fs');

// load serialized trie from binary file
const data = fs.readFileSync('data.trie');
const trie = new UnicodeTrie(data);

// look up a value
trie.get(0x4567); // => 99
```

## License

MIT
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
const inflate = require('tiny-inflate');
const { swap32LE } = require('./swap');

// Shift size for getting the index-1 table offset.
const SHIFT_1 = 6 + 5;

// Shift size for getting the index-2 table offset.
const SHIFT_2 = 5;
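
// For orientation (illustrative arithmetic derived from the two shifts above):
// with SHIFT_1 = 11, each index-1 entry covers 1 << 11 = 0x800 = 2048 code
// points, and with SHIFT_2 = 5, each data block holds 1 << 5 = 32 values.
// The BMP (0x10000 code points) therefore needs 0x10000 >> 5 = 0x800 index-2
// entries and 0x10000 >> 11 = 0x20 index-1 entries, which is where several of
// the lengths defined below come from.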

// Difference between the two shift sizes,
// for getting an index-1 offset from an index-2 offset. 6=11-5
const SHIFT_1_2 = SHIFT_1 - SHIFT_2;

// Number of index-1 entries for the BMP. 32=0x20
// This part of the index-1 table is omitted from the serialized form.
const OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> SHIFT_1;

// Number of entries in an index-2 block. 64=0x40
const INDEX_2_BLOCK_LENGTH = 1 << SHIFT_1_2;

// Mask for getting the lower bits for the in-index-2-block offset.
const INDEX_2_MASK = INDEX_2_BLOCK_LENGTH - 1;

// Shift size for shifting left the index array values.
// Increases possible data size with 16-bit index values at the cost
// of compactability.
// This requires data blocks to be aligned by DATA_GRANULARITY.
const INDEX_SHIFT = 2;

// Number of entries in a data block. 32=0x20
const DATA_BLOCK_LENGTH = 1 << SHIFT_2;

// Mask for getting the lower bits for the in-data-block offset.
const DATA_MASK = DATA_BLOCK_LENGTH - 1;

// The part of the index-2 table for U+D800..U+DBFF stores values for
// lead surrogate code _units_ not code _points_.
// Values for lead surrogate code _points_ are indexed with this portion of the table.
// Length=32=0x20=0x400>>SHIFT_2. (There are 1024=0x400 lead surrogates.)
const LSCP_INDEX_2_OFFSET = 0x10000 >> SHIFT_2;
const LSCP_INDEX_2_LENGTH = 0x400 >> SHIFT_2;

// Count the lengths of both BMP pieces. 2080=0x820
const INDEX_2_BMP_LENGTH = LSCP_INDEX_2_OFFSET + LSCP_INDEX_2_LENGTH;

// The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820.
// Length 32=0x20 for lead bytes C0..DF, regardless of SHIFT_2.
const UTF8_2B_INDEX_2_OFFSET = INDEX_2_BMP_LENGTH;
const UTF8_2B_INDEX_2_LENGTH = 0x800 >> 6; // U+0800 is the first code point after 2-byte UTF-8

// The index-1 table, only used for supplementary code points, at offset 2112=0x840.
// Variable length, for code points up to highStart, where the last single-value range starts.
// Maximum length 512=0x200=0x100000>>SHIFT_1.
// (For 0x100000 supplementary code points U+10000..U+10ffff.)
//
// The part of the index-2 table for supplementary code points starts
// after this index-1 table.
//
// Both the index-1 table and the following part of the index-2 table
// are omitted completely if there is only BMP data.
const INDEX_1_OFFSET = UTF8_2B_INDEX_2_OFFSET + UTF8_2B_INDEX_2_LENGTH;
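
// For orientation, the offsets above lay out the index portion of the
// serialized array roughly as follows (a sketch derived from the constants,
// not an authoritative map):
//
//   [0x000 .. 0x7ff]  index-2 entries for BMP code points (single-level lookup)
//   [0x800 .. 0x81f]  index-2 entries for lead surrogate code points (LSCP)
//   [0x820 .. 0x83f]  index-2 entries for 2-byte UTF-8 lead bytes C0..DF
//   [0x840 .. ]       index-1 table for supplementary code points, followed by
//                     their index-2 blocks (both omitted if there is only BMP data)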

// The alignment size of a data block. Also the granularity for compaction.
const DATA_GRANULARITY = 1 << INDEX_SHIFT;

class UnicodeTrie {
  constructor(data) {
    const isBuffer = (typeof data.readUInt32BE === 'function') && (typeof data.slice === 'function');

    if (isBuffer || data instanceof Uint8Array) {
      // read binary format
      let uncompressedLength;
      if (isBuffer) {
        this.highStart = data.readUInt32LE(0);
        this.errorValue = data.readUInt32LE(4);
        uncompressedLength = data.readUInt32LE(8);
        data = data.slice(12);
      } else {
        const view = new DataView(data.buffer);
        this.highStart = view.getUint32(0, true);
        this.errorValue = view.getUint32(4, true);
        uncompressedLength = view.getUint32(8, true);
        data = data.subarray(12);
      }

      // double inflate the actual trie data
      data = inflate(data, new Uint8Array(uncompressedLength));
      data = inflate(data, new Uint8Array(uncompressedLength));

      // swap bytes from little-endian
      swap32LE(data);

      this.data = new Uint32Array(data.buffer);

    } else {
      // pre-parsed data
      ({ data: this.data, highStart: this.highStart, errorValue: this.errorValue } = data);
    }
  }

  get(codePoint) {
    let index;
    if ((codePoint < 0) || (codePoint > 0x10ffff)) {
      return this.errorValue;
    }

    if ((codePoint < 0xd800) || ((codePoint > 0xdbff) && (codePoint <= 0xffff))) {
      // Ordinary BMP code point, excluding lead surrogates.
      // BMP uses a single level lookup. BMP index starts at offset 0 in the index.
      // data is stored in the index array itself.
      index = (this.data[codePoint >> SHIFT_2] << INDEX_SHIFT) + (codePoint & DATA_MASK);
      return this.data[index];
    }

    if (codePoint <= 0xffff) {
      // Lead surrogate code point. A separate index section is stored for
      // lead surrogate code units and code points.
      // The main index has the code unit data.
      // For this function, we need the code point data.
      index = (this.data[LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> SHIFT_2)] << INDEX_SHIFT) + (codePoint & DATA_MASK);
      return this.data[index];
    }
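
    // The supplementary-plane branch below walks two index levels. As an
    // illustration of the offset arithmetic only (the values read from
    // this.data depend on the trie being queried), take codePoint = 0x10400:
    //   1. index-1:  this.data[(INDEX_1_OFFSET - OMITTED_BMP_INDEX_1_LENGTH) + (0x10400 >> SHIFT_1)]
    //                = this.data[0x820 + 0x20] = this.data[0x840], the start of an index-2 block;
    //   2. index-2:  that block is indexed with (0x10400 >> SHIFT_2) & INDEX_2_MASK = 0x20;
    //   3. data:     the index-2 value << INDEX_SHIFT, plus (0x10400 & DATA_MASK) = 0,
    //                addresses the final data word.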

    if (codePoint < this.highStart) {
      // Supplemental code point, use two-level lookup.
      index = this.data[(INDEX_1_OFFSET - OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> SHIFT_1)];
      index = this.data[index + ((codePoint >> SHIFT_2) & INDEX_2_MASK)];
      index = (index << INDEX_SHIFT) + (codePoint & DATA_MASK);
      return this.data[index];
    }

    return this.data[this.data.length - DATA_GRANULARITY];
  }
}

module.exports = UnicodeTrie;
--------------------------------------------------------------------------------
/test/test.js:
--------------------------------------------------------------------------------
const assert = require('assert');
const UnicodeTrieBuilder = require('../builder');
const UnicodeTrie = require('../');

describe('unicode trie', () => {
  it('set', () => {
    const trie = new UnicodeTrieBuilder(10, 666);
    trie.set(0x4567, 99);
    assert.equal(trie.get(0x4566), 10);
    assert.equal(trie.get(0x4567), 99);
    assert.equal(trie.get(-1), 666);
    assert.equal(trie.get(0x110000), 666);
  });

  it('set -> compacted trie', () => {
    const t = new UnicodeTrieBuilder(10, 666);
    t.set(0x4567, 99);

    const trie = t.freeze();
    assert.equal(trie.get(0x4566), 10);
    assert.equal(trie.get(0x4567), 99);
    assert.equal(trie.get(-1), 666);
    assert.equal(trie.get(0x110000), 666);
  });

  it('setRange', () => {
    const trie = new UnicodeTrieBuilder(10, 666);
    trie.setRange(13, 6666, 7788, false);
    trie.setRange(6000, 7000, 9900, true);

    assert.equal(trie.get(12), 10);
    assert.equal(trie.get(13), 7788);
    assert.equal(trie.get(5999), 7788);
    assert.equal(trie.get(6000), 9900);
    assert.equal(trie.get(7000), 9900);
    assert.equal(trie.get(7001), 10);
    assert.equal(trie.get(0x110000), 666);
  });

  it('setRange -> compacted trie', () => {
    const t = new UnicodeTrieBuilder(10, 666);
    t.setRange(13, 6666, 7788, false);
    t.setRange(6000, 7000, 9900, true);

    const trie = t.freeze();
    assert.equal(trie.get(12), 10);
    assert.equal(trie.get(13), 7788);
    assert.equal(trie.get(5999), 7788);
    assert.equal(trie.get(6000), 9900);
    assert.equal(trie.get(7000), 9900);
    assert.equal(trie.get(7001), 10);
    assert.equal(trie.get(0x110000), 666);
  });

  it('toBuffer written in little-endian', () => {
    const trie = new UnicodeTrieBuilder();
    trie.set(0x4567, 99);

    const buf = trie.toBuffer();
    const bufferExpected = Buffer.from([0, 72, 0, 0, 0, 0, 0, 0, 128, 36, 0, 0, 123, 123, 206, 144, 235, 128, 2, 143, 67, 96, 225, 171, 23, 55, 54, 38, 231, 47, 44, 127, 233, 90, 109, 194, 92, 246, 126, 197, 131, 223, 31, 56, 102, 78, 154, 20, 108, 117, 88, 244, 93, 192, 190, 218, 229, 156, 12, 107, 86, 235, 125, 96, 102, 0, 129, 15, 239, 109, 219, 204, 58, 151, 92, 52, 126, 152, 198, 14, 0]);
    assert.equal(buf.toString('hex'), bufferExpected.toString('hex'));
  });

  it('should work with compressed serialization format', () => {
    const t = new UnicodeTrieBuilder(10, 666);
    t.setRange(13, 6666, 7788, false);
    t.setRange(6000, 7000, 9900, true);

    const buf = t.toBuffer();
    const trie = new UnicodeTrie(buf);
    assert.equal(trie.get(12), 10);
    assert.equal(trie.get(13), 7788);
    assert.equal(trie.get(5999), 7788);
    assert.equal(trie.get(6000), 9900);
    assert.equal(trie.get(7000), 9900);
    assert.equal(trie.get(7001), 10);
    assert.equal(trie.get(0x110000), 666);
  });

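  // Each entry of rangeTests below describes one trie to build and verify.
  // Rows of `ranges` are [start, limit, value, overwrite]: an optional leading
  // row with a negative limit supplies the error value, the next row supplies
  // the initial value, and the remaining rows are applied with
  // setRange(start, limit - 1, value, overwrite). Rows of `check` are
  // [limit, expected]: `expected` is asserted for every code point from the
  // previous limit up to (but not including) `limit`.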
  const rangeTests = [
    {
      ranges: [
        [ 0, 0, 0, 0 ],
        [ 0, 0x40, 0, 0 ],
        [ 0x40, 0xe7, 0x1234, 0 ],
        [ 0xe7, 0x3400, 0, 0 ],
        [ 0x3400, 0x9fa6, 0x6162, 0 ],
        [ 0x9fa6, 0xda9e, 0x3132, 0 ],
        [ 0xdada, 0xeeee, 0x87ff, 0 ],
        [ 0xeeee, 0x11111, 1, 0 ],
        [ 0x11111, 0x44444, 0x6162, 0 ],
        [ 0x44444, 0x60003, 0, 0 ],
        [ 0xf0003, 0xf0004, 0xf, 0 ],
        [ 0xf0004, 0xf0006, 0x10, 0 ],
        [ 0xf0006, 0xf0007, 0x11, 0 ],
        [ 0xf0007, 0xf0040, 0x12, 0 ],
        [ 0xf0040, 0x110000, 0, 0 ]
      ],

      check: [
        [ 0, 0 ],
        [ 0x40, 0 ],
        [ 0xe7, 0x1234 ],
        [ 0x3400, 0 ],
        [ 0x9fa6, 0x6162 ],
        [ 0xda9e, 0x3132 ],
        [ 0xdada, 0 ],
        [ 0xeeee, 0x87ff ],
        [ 0x11111, 1 ],
        [ 0x44444, 0x6162 ],
        [ 0xf0003, 0 ],
        [ 0xf0004, 0xf ],
        [ 0xf0006, 0x10 ],
        [ 0xf0007, 0x11 ],
        [ 0xf0040, 0x12 ],
        [ 0x110000, 0 ]
      ]
    },
    {
      // set some interesting overlapping ranges
      ranges: [
        [ 0, 0, 0, 0 ],
        [ 0x21, 0x7f, 0x5555, 1 ],
        [ 0x2f800, 0x2fedc, 0x7a, 1 ],
        [ 0x72, 0xdd, 3, 1 ],
        [ 0xdd, 0xde, 4, 0 ],
        [ 0x201, 0x240, 6, 1 ], // 3 consecutive blocks with the same pattern but
        [ 0x241, 0x280, 6, 1 ], // discontiguous value ranges, testing utrie2_enum()
        [ 0x281, 0x2c0, 6, 1 ],
        [ 0x2f987, 0x2fa98, 5, 1 ],
        [ 0x2f777, 0x2f883, 0, 1 ],
        [ 0x2f900, 0x2ffaa, 1, 0 ],
        [ 0x2ffaa, 0x2ffab, 2, 1 ],
        [ 0x2ffbb, 0x2ffc0, 7, 1 ]
      ],

      check: [
        [ 0, 0 ],
        [ 0x21, 0 ],
        [ 0x72, 0x5555 ],
        [ 0xdd, 3 ],
        [ 0xde, 4 ],
        [ 0x201, 0 ],
        [ 0x240, 6 ],
        [ 0x241, 0 ],
        [ 0x280, 6 ],
        [ 0x281, 0 ],
        [ 0x2c0, 6 ],
        [ 0x2f883, 0 ],
        [ 0x2f987, 0x7a ],
        [ 0x2fa98, 5 ],
        [ 0x2fedc, 0x7a ],
        [ 0x2ffaa, 1 ],
        [ 0x2ffab, 2 ],
        [ 0x2ffbb, 0 ],
        [ 0x2ffc0, 7 ],
        [ 0x110000, 0 ]
      ]
    },
    {
      // use a non-zero initial value
      ranges: [
        [ 0, 0, 9, 0 ], // non-zero initial value.
        [ 0x31, 0xa4, 1, 0 ],
        [ 0x3400, 0x6789, 2, 0 ],
        [ 0x8000, 0x89ab, 9, 1 ],
        [ 0x9000, 0xa000, 4, 1 ],
        [ 0xabcd, 0xbcde, 3, 1 ],
        [ 0x55555, 0x110000, 6, 1 ], // highStart
        // ... (lines 170-211 of test.js are missing here) ...
    const result = [];
    for (let test of rangeTests) {
      let initialValue = 0;
      let errorValue = 0x0bad;
      let i = 0;
      if (test.ranges[i][1] < 0) {
        errorValue = test.ranges[i][2];
        i++;
      }

      initialValue = test.ranges[i++][2];
      var trie = new UnicodeTrieBuilder(initialValue, errorValue);

      for (let range of test.ranges.slice(i)) {
        trie.setRange(range[0], range[1] - 1, range[2], range[3] !== 0);
      }

      var frozen = trie.freeze();

      var start = 0;
      result.push(test.check.map((check) => {
        let end;
        const result1 = [];
        for (start = start, end = check[0]; start < end; start++) {
          assert.equal(trie.get(start), check[1]);
          result1.push(assert.equal(frozen.get(start), check[1]));
        }
        return result1;
      }));
    }
  });
});
--------------------------------------------------------------------------------