├── test
│   ├── mocha.opts
│   └── test.js
├── .gitignore
├── swap.js
├── package.json
├── LICENSE
├── README.md
└── index.js

/test/mocha.opts:
--------------------------------------------------------------------------------
--reporter spec
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
node_modules/
coverage.html
--------------------------------------------------------------------------------
/swap.js:
--------------------------------------------------------------------------------
const isBigEndian = (new Uint8Array(new Uint32Array([0x12345678]).buffer)[0] === 0x12);

const swap = (b, n, m) => {
  let i = b[n];
  b[n] = b[m];
  b[m] = i;
};

const swap32 = array => {
  const len = array.length;
  for (let i = 0; i < len; i += 4) {
    swap(array, i, i + 3);
    swap(array, i + 1, i + 2);
  }
};

const swap32LE = array => {
  if (isBigEndian) {
    swap32(array);
  }
};

module.exports = {
  swap32LE: swap32LE
};
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "unicode-trie",
  "version": "2.0.0",
  "description": "Unicode Trie data structure for fast character metadata lookup, ported from ICU",
  "devDependencies": {
    "mocha": "^6.1.4",
    "nyc": "^14.1.1"
  },
  "scripts": {
    "test": "mocha",
    "coverage": "nyc mocha"
  },
  "repository": {
    "type": "git",
    "url": "git://github.com/devongovett/unicode-trie.git"
  },
  "author": "Devon Govett",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/devongovett/unicode-trie/issues"
  },
  "homepage": "https://github.com/devongovett/unicode-trie",
  "dependencies": {
    "pako": "^0.2.5",
    "tiny-inflate": "^1.0.0"
  }
}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2018

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# unicode-trie
A data structure for fast Unicode character metadata lookup, ported from ICU

## Background

When implementing many Unicode algorithms such as text segmentation,
normalization, bidi processing, etc., fast access to character metadata
is crucial to good performance. There are over a million code points in the
Unicode standard, many of which produce the same result when looked up,
so an array or hash table is not appropriate - those data structures are
fast but would require a lot of memory. The data is generally
grouped in ranges, so you could do a binary search, but that is not
fast enough for some applications.

The [International Components for Unicode](http://site.icu-project.org) (ICU) project
came up with a data structure based on a [Trie](http://en.wikipedia.org/wiki/Trie) that provides fast access
to Unicode metadata. The range data is precompiled to a serialized
and flattened trie, which is then used at runtime to look up the necessary
data. According to my own tests, this is generally at least 50% faster
than binary search, with not too much additional memory required.

## Installation

    npm install unicode-trie

## Building a Trie

Unicode Tries are generally precompiled from data in the Unicode database
for faster runtime performance. To build a Unicode Trie, use the
`UnicodeTrieBuilder` class.

```js
const UnicodeTrieBuilder = require('unicode-trie/builder');
const fs = require('fs');

// create a trie
let t = new UnicodeTrieBuilder();

// optional parameters for default value, and error value
// if not provided, both are set to 0
t = new UnicodeTrieBuilder(10, 999);

// set individual values and ranges
t.set(0x4567, 99);
t.setRange(0x40, 0xe7, 0x1234);

// you can look up a value if you like
t.get(0x4567); // => 99

// get a compiled trie (returns a UnicodeTrie object)
const trie = t.freeze();

// write compressed trie to a binary file
fs.writeFileSync('data.trie', t.toBuffer());
```

## Using a precompiled Trie

Once you've built a precompiled trie, you can load it into the
`UnicodeTrie` class, which is a read-only representation of the
trie. From there, you can look up values.

```js
const UnicodeTrie = require('unicode-trie');
const fs = require('fs');

// load serialized trie from binary file
const data = fs.readFileSync('data.trie');
const trie = new UnicodeTrie(data);

// look up a value
trie.get(0x4567); // => 99
```

## License

MIT
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
const inflate = require('tiny-inflate');
const { swap32LE } = require('./swap');

// Shift size for getting the index-1 table offset.
const SHIFT_1 = 6 + 5;

// Shift size for getting the index-2 table offset.
const SHIFT_2 = 5;
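
// For orientation (illustrative arithmetic derived from the two shifts above):
// with SHIFT_1 = 11, each index-1 entry covers 1 << 11 = 0x800 = 2048 code
// points, and with SHIFT_2 = 5, each data block holds 1 << 5 = 32 values.
// The BMP (0x10000 code points) therefore needs 0x10000 >> 5 = 0x800 index-2
// entries and 0x10000 >> 11 = 0x20 index-1 entries, which is where several of
// the lengths defined below come from.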

// Difference between the two shift sizes,
// for getting an index-1 offset from an index-2 offset. 6=11-5
const SHIFT_1_2 = SHIFT_1 - SHIFT_2;

// Number of index-1 entries for the BMP. 32=0x20
// This part of the index-1 table is omitted from the serialized form.
const OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> SHIFT_1;

// Number of entries in an index-2 block. 64=0x40
const INDEX_2_BLOCK_LENGTH = 1 << SHIFT_1_2;

// Mask for getting the lower bits for the in-index-2-block offset.
const INDEX_2_MASK = INDEX_2_BLOCK_LENGTH - 1;

// Shift size for shifting left the index array values.
// Increases possible data size with 16-bit index values at the cost
// of compactability.
// This requires data blocks to be aligned by DATA_GRANULARITY.
const INDEX_SHIFT = 2;

// Number of entries in a data block. 32=0x20
const DATA_BLOCK_LENGTH = 1 << SHIFT_2;

// Mask for getting the lower bits for the in-data-block offset.
const DATA_MASK = DATA_BLOCK_LENGTH - 1;

// The part of the index-2 table for U+D800..U+DBFF stores values for
// lead surrogate code _units_ not code _points_.
// Values for lead surrogate code _points_ are indexed with this portion of the table.
// Length=32=0x20=0x400>>SHIFT_2. (There are 1024=0x400 lead surrogates.)
const LSCP_INDEX_2_OFFSET = 0x10000 >> SHIFT_2;
const LSCP_INDEX_2_LENGTH = 0x400 >> SHIFT_2;

// Count the lengths of both BMP pieces. 2080=0x820
const INDEX_2_BMP_LENGTH = LSCP_INDEX_2_OFFSET + LSCP_INDEX_2_LENGTH;

// The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820.
// Length 32=0x20 for lead bytes C0..DF, regardless of SHIFT_2.
const UTF8_2B_INDEX_2_OFFSET = INDEX_2_BMP_LENGTH;
const UTF8_2B_INDEX_2_LENGTH = 0x800 >> 6; // U+0800 is the first code point after 2-byte UTF-8

// The index-1 table, only used for supplementary code points, at offset 2112=0x840.
// Variable length, for code points up to highStart, where the last single-value range starts.
// Maximum length 512=0x200=0x100000>>SHIFT_1.
// (For 0x100000 supplementary code points U+10000..U+10ffff.)
//
// The part of the index-2 table for supplementary code points starts
// after this index-1 table.
//
// Both the index-1 table and the following part of the index-2 table
// are omitted completely if there is only BMP data.
const INDEX_1_OFFSET = UTF8_2B_INDEX_2_OFFSET + UTF8_2B_INDEX_2_LENGTH;
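
// For orientation, the offsets above lay out the index portion of the
// serialized array roughly as follows (a sketch derived from the constants,
// not an authoritative map):
//
//   [0x000 .. 0x7ff]  index-2 entries for BMP code points (single-level lookup)
//   [0x800 .. 0x81f]  index-2 entries for lead surrogate code points (LSCP)
//   [0x820 .. 0x83f]  index-2 entries for 2-byte UTF-8 lead bytes C0..DF
//   [0x840 .. ]       index-1 table for supplementary code points, followed by
//                     their index-2 blocks (both omitted if there is only BMP data)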

// The alignment size of a data block. Also the granularity for compaction.
const DATA_GRANULARITY = 1 << INDEX_SHIFT;

class UnicodeTrie {
  constructor(data) {
    const isBuffer = (typeof data.readUInt32BE === 'function') && (typeof data.slice === 'function');

    if (isBuffer || data instanceof Uint8Array) {
      // read binary format
      let uncompressedLength;
      if (isBuffer) {
        this.highStart = data.readUInt32LE(0);
        this.errorValue = data.readUInt32LE(4);
        uncompressedLength = data.readUInt32LE(8);
        data = data.slice(12);
      } else {
        const view = new DataView(data.buffer);
        this.highStart = view.getUint32(0, true);
        this.errorValue = view.getUint32(4, true);
        uncompressedLength = view.getUint32(8, true);
        data = data.subarray(12);
      }

      // double inflate the actual trie data
      data = inflate(data, new Uint8Array(uncompressedLength));
      data = inflate(data, new Uint8Array(uncompressedLength));

      // swap bytes from little-endian
      swap32LE(data);

      this.data = new Uint32Array(data.buffer);

    } else {
      // pre-parsed data
      ({ data: this.data, highStart: this.highStart, errorValue: this.errorValue } = data);
    }
  }

  get(codePoint) {
    let index;
    if ((codePoint < 0) || (codePoint > 0x10ffff)) {
      return this.errorValue;
    }

    if ((codePoint < 0xd800) || ((codePoint > 0xdbff) && (codePoint <= 0xffff))) {
      // Ordinary BMP code point, excluding lead surrogates.
      // BMP uses a single level lookup. BMP index starts at offset 0 in the index.
      // data is stored in the index array itself.
      index = (this.data[codePoint >> SHIFT_2] << INDEX_SHIFT) + (codePoint & DATA_MASK);
      return this.data[index];
    }

    if (codePoint <= 0xffff) {
      // Lead surrogate code point. A separate index section is stored for
      // lead surrogate code units and code points.
      // The main index has the code unit data.
      // For this function, we need the code point data.
      index = (this.data[LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> SHIFT_2)] << INDEX_SHIFT) + (codePoint & DATA_MASK);
      return this.data[index];
    }
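
    // The supplementary-plane branch below walks two index levels. As an
    // illustration of the offset arithmetic only (the values read from
    // this.data depend on the trie being queried), take codePoint = 0x10400:
    //   1. index-1:  this.data[(INDEX_1_OFFSET - OMITTED_BMP_INDEX_1_LENGTH) + (0x10400 >> SHIFT_1)]
    //                = this.data[0x820 + 0x20] = this.data[0x840], the start of an index-2 block;
    //   2. index-2:  that block is indexed with (0x10400 >> SHIFT_2) & INDEX_2_MASK = 0x20;
    //   3. data:     the index-2 value << INDEX_SHIFT, plus (0x10400 & DATA_MASK) = 0,
    //                addresses the final data word.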

    if (codePoint < this.highStart) {
      // Supplemental code point, use two-level lookup.
      index = this.data[(INDEX_1_OFFSET - OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> SHIFT_1)];
      index = this.data[index + ((codePoint >> SHIFT_2) & INDEX_2_MASK)];
      index = (index << INDEX_SHIFT) + (codePoint & DATA_MASK);
      return this.data[index];
    }

    return this.data[this.data.length - DATA_GRANULARITY];
  }
}

module.exports = UnicodeTrie;
--------------------------------------------------------------------------------
/test/test.js:
--------------------------------------------------------------------------------
const assert = require('assert');
const UnicodeTrieBuilder = require('../builder');
const UnicodeTrie = require('../');

describe('unicode trie', () => {
  it('set', () => {
    const trie = new UnicodeTrieBuilder(10, 666);
    trie.set(0x4567, 99);
    assert.equal(trie.get(0x4566), 10);
    assert.equal(trie.get(0x4567), 99);
    assert.equal(trie.get(-1), 666);
    assert.equal(trie.get(0x110000), 666);
  });

  it('set -> compacted trie', () => {
    const t = new UnicodeTrieBuilder(10, 666);
    t.set(0x4567, 99);

    const trie = t.freeze();
    assert.equal(trie.get(0x4566), 10);
    assert.equal(trie.get(0x4567), 99);
    assert.equal(trie.get(-1), 666);
    assert.equal(trie.get(0x110000), 666);
  });

  it('setRange', () => {
    const trie = new UnicodeTrieBuilder(10, 666);
    trie.setRange(13, 6666, 7788, false);
    trie.setRange(6000, 7000, 9900, true);

    assert.equal(trie.get(12), 10);
    assert.equal(trie.get(13), 7788);
    assert.equal(trie.get(5999), 7788);
    assert.equal(trie.get(6000), 9900);
    assert.equal(trie.get(7000), 9900);
    assert.equal(trie.get(7001), 10);
    assert.equal(trie.get(0x110000), 666);
  });

  it('setRange -> compacted trie', () => {
    const t = new UnicodeTrieBuilder(10, 666);
    t.setRange(13, 6666, 7788, false);
    t.setRange(6000, 7000, 9900, true);

    const trie = t.freeze();
    assert.equal(trie.get(12), 10);
    assert.equal(trie.get(13), 7788);
    assert.equal(trie.get(5999), 7788);
    assert.equal(trie.get(6000), 9900);
    assert.equal(trie.get(7000), 9900);
    assert.equal(trie.get(7001), 10);
    assert.equal(trie.get(0x110000), 666);
  });

  it('toBuffer written in little-endian', () => {
    const trie = new UnicodeTrieBuilder();
    trie.set(0x4567, 99);

    const buf = trie.toBuffer();
    const bufferExpected = Buffer.from([0, 72, 0, 0, 0, 0, 0, 0, 128, 36, 0, 0, 123, 123, 206, 144, 235, 128, 2, 143, 67, 96, 225, 171, 23, 55, 54, 38, 231, 47, 44, 127, 233, 90, 109, 194, 92, 246, 126, 197, 131, 223, 31, 56, 102, 78, 154, 20, 108, 117, 88, 244, 93, 192, 190, 218, 229, 156, 12, 107, 86, 235, 125, 96, 102, 0, 129, 15, 239, 109, 219, 204, 58, 151, 92, 52, 126, 152, 198, 14, 0]);
    assert.equal(buf.toString('hex'), bufferExpected.toString('hex'));
  });

  it('should work with compressed serialization format', () => {
    const t = new UnicodeTrieBuilder(10, 666);
    t.setRange(13, 6666, 7788, false);
    t.setRange(6000, 7000, 9900, true);

    const buf = t.toBuffer();
    const trie = new UnicodeTrie(buf);
    assert.equal(trie.get(12), 10);
    assert.equal(trie.get(13), 7788);
    assert.equal(trie.get(5999), 7788);
    assert.equal(trie.get(6000), 9900);
    assert.equal(trie.get(7000), 9900);
    assert.equal(trie.get(7001), 10);
    assert.equal(trie.get(0x110000), 666);
  });

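  // Each entry of rangeTests below describes one trie to build and verify.
  // Rows of `ranges` are [start, limit, value, overwrite]: an optional leading
  // row with a negative limit supplies the error value, the next row supplies
  // the initial value, and the remaining rows are applied with
  // setRange(start, limit - 1, value, overwrite). Rows of `check` are
  // [limit, expected]: `expected` is asserted for every code point from the
  // previous limit up to (but not including) `limit`.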
  const rangeTests = [
    {
      ranges: [
        [ 0, 0, 0, 0 ],
        [ 0, 0x40, 0, 0 ],
        [ 0x40, 0xe7, 0x1234, 0 ],
        [ 0xe7, 0x3400, 0, 0 ],
        [ 0x3400, 0x9fa6, 0x6162, 0 ],
        [ 0x9fa6, 0xda9e, 0x3132, 0 ],
        [ 0xdada, 0xeeee, 0x87ff, 0 ],
        [ 0xeeee, 0x11111, 1, 0 ],
        [ 0x11111, 0x44444, 0x6162, 0 ],
        [ 0x44444, 0x60003, 0, 0 ],
        [ 0xf0003, 0xf0004, 0xf, 0 ],
        [ 0xf0004, 0xf0006, 0x10, 0 ],
        [ 0xf0006, 0xf0007, 0x11, 0 ],
        [ 0xf0007, 0xf0040, 0x12, 0 ],
        [ 0xf0040, 0x110000, 0, 0 ]
      ],

      check: [
        [ 0, 0 ],
        [ 0x40, 0 ],
        [ 0xe7, 0x1234 ],
        [ 0x3400, 0 ],
        [ 0x9fa6, 0x6162 ],
        [ 0xda9e, 0x3132 ],
        [ 0xdada, 0 ],
        [ 0xeeee, 0x87ff ],
        [ 0x11111, 1 ],
        [ 0x44444, 0x6162 ],
        [ 0xf0003, 0 ],
        [ 0xf0004, 0xf ],
        [ 0xf0006, 0x10 ],
        [ 0xf0007, 0x11 ],
        [ 0xf0040, 0x12 ],
        [ 0x110000, 0 ]
      ]
    },
    {
      // set some interesting overlapping ranges
      ranges: [
        [ 0, 0, 0, 0 ],
        [ 0x21, 0x7f, 0x5555, 1 ],
        [ 0x2f800, 0x2fedc, 0x7a, 1 ],
        [ 0x72, 0xdd, 3, 1 ],
        [ 0xdd, 0xde, 4, 0 ],
        [ 0x201, 0x240, 6, 1 ], // 3 consecutive blocks with the same pattern but
        [ 0x241, 0x280, 6, 1 ], // discontiguous value ranges, testing utrie2_enum()
        [ 0x281, 0x2c0, 6, 1 ],
        [ 0x2f987, 0x2fa98, 5, 1 ],
        [ 0x2f777, 0x2f883, 0, 1 ],
        [ 0x2f900, 0x2ffaa, 1, 0 ],
        [ 0x2ffaa, 0x2ffab, 2, 1 ],
        [ 0x2ffbb, 0x2ffc0, 7, 1 ]
      ],

      check: [
        [ 0, 0 ],
        [ 0x21, 0 ],
        [ 0x72, 0x5555 ],
        [ 0xdd, 3 ],
        [ 0xde, 4 ],
        [ 0x201, 0 ],
        [ 0x240, 6 ],
        [ 0x241, 0 ],
        [ 0x280, 6 ],
        [ 0x281, 0 ],
        [ 0x2c0, 6 ],
        [ 0x2f883, 0 ],
        [ 0x2f987, 0x7a ],
        [ 0x2fa98, 5 ],
        [ 0x2fedc, 0x7a ],
        [ 0x2ffaa, 1 ],
        [ 0x2ffab, 2 ],
        [ 0x2ffbb, 0 ],
        [ 0x2ffc0, 7 ],
        [ 0x110000, 0 ]
      ]
    },
    {
      // use a non-zero initial value
      ranges: [
        [ 0, 0, 9, 0 ], // non-zero initial value.
        [ 0x31, 0xa4, 1, 0 ],
        [ 0x3400, 0x6789, 2, 0 ],
        [ 0x8000, 0x89ab, 9, 1 ],
        [ 0x9000, 0xa000, 4, 1 ],
        [ 0xabcd, 0xbcde, 3, 1 ],
        [ 0x55555, 0x110000, 6, 1 ], // highStart
        // ... (lines 170-211 of test.js are missing here) ...
    const result = [];
    for (let test of rangeTests) {
      let initialValue = 0;
      let errorValue = 0x0bad;
      let i = 0;
      if (test.ranges[i][1] < 0) {
        errorValue = test.ranges[i][2];
        i++;
      }

      initialValue = test.ranges[i++][2];
      var trie = new UnicodeTrieBuilder(initialValue, errorValue);

      for (let range of test.ranges.slice(i)) {
        trie.setRange(range[0], range[1] - 1, range[2], range[3] !== 0);
      }

      var frozen = trie.freeze();

      var start = 0;
      result.push(test.check.map((check) => {
        let end;
        const result1 = [];
        for (start = start, end = check[0]; start < end; start++) {
          assert.equal(trie.get(start), check[1]);
          result1.push(assert.equal(frozen.get(start), check[1]));
        }
        return result1;
      }));
    }
  });
});
--------------------------------------------------------------------------------