├── .gitignore ├── README.md ├── index.js ├── package.json └── test └── index.js /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | spliddit 2 | ======== 3 | 4 | Spliddit - Unicode-aware JS string splitting 5 | 6 | Split a string into its constituent characters, without munging emoji and other non-BMP code points. 7 | 8 | ## Why? 9 | 10 | The native `String#split` implementation does not pay attention to [surrogate pairs](http://en.wikipedia.org/wiki/UTF-16). When the code units of a surrogate pair are split apart, they are not intelligible on their own. Unless they are put back together in the correct order, individual code units will cause problems in code that handles strings. 11 | 12 | Consider: 13 | 14 | ```javascript 15 | var emojiMessage = 'Hello 😤' 16 | 17 | emojiMessage.split('').reverse().join('') 18 | // => String with messed-up emoji 19 | ``` 20 | 21 | `spliddit` will correctly split the string into strings consisting of legible characters: 22 | 23 | ```javascript 24 | var spliddit = require('spliddit') 25 | var emojiMessage = 'Hello 😤' 26 | 27 | spliddit(emojiMessage).reverse().join('') 28 | // => '😤 olleH' 29 | ``` 30 | 31 | Also, since a surrogate pair takes up two code units in a JavaScript string to represent a single character, `spliddit` can help you correctly count the number of code points (characters) in the string: 32 | 33 | ```javascript 34 | var spliddit = require('spliddit') 35 | var emoji = '🍔' 36 | var han = '𠬠典' 37 | 38 | emoji.length 39 | // => 2 40 | han.length 41 | // => 3 42 | 43 | spliddit(emoji).length 44 | // => 1 45 | spliddit(han).length 46 | // => 2 47 | ``` 48 | 49 | Alternatively, you can pass `spliddit` an array that potentially has broken-apart surrogate pairs, and `spliddit` 
will return an array that has them put back together: 50 | 51 | ```javascript 52 | var myCoolString = '😎 Fooool' 53 | 54 | // Messed-up array beginning with a split-apart surrogate pair :( 55 | var myBustedArray = myCoolString.split('') 56 | 57 | // Aww yeah cool guy is back 58 | var myCoolFixedArray = spliddit(myBustedArray) 59 | ``` 60 | 61 | ### Delimiter 62 | 63 | You can also pass `spliddit` a second argument, a string or `RegExp` representing the delimiter to split by. The native String#split implementation does this correctly, so `spliddit` just passes through to String#split in this case. 64 | 65 | ```javascript 66 | spliddit('hi🍔hi', '🍔') 67 | // => ['hi', 'hi'] 68 | 69 | spliddit('123a456', 'a') 70 | // => ['123', '456'] 71 | ``` 72 | 73 | ### Country Flags 74 | 75 | Country flags like 🇦🇴 are composed of two *regional indicator* Unicode characters. Each regional indicator character is represented as a surrogate pair in JavaScript strings, so country flags take up 4 code units. There is one regional indicator symbol for each letter of the alphabet, and the two regional indicators in a flag spell out the country's two-letter code. 76 | 77 | (For example, 🇮🇹 , Italy's flag, is [`U+1F1EE 'REGIONAL INDICATOR SYMBOL LETTER I'`](http://www.fileformat.info/info/unicode/char/1F1EE/index.htm) followed by [`U+1F1F9 'REGIONAL INDICATOR SYMBOL LETTER T'`](http://www.fileformat.info/info/unicode/char/1F1F9/index.htm).) 78 | 79 | `spliddit` will keep each pair of regional indicator characters (4 total code units) together as one character even though the pair consists of two Unicode code points. 
80 | 81 | ### Skin tone emoji 82 | 83 | Skin tone emojis (👩🏾) are composed of a color-neutral emoji that depicts humans (👩), followed by one of the 5 Unicode skin tone modifier characters ([🏻](http://www.fileformat.info/info/unicode/char/1F3FB/index.htm), [🏼](http://www.fileformat.info/info/unicode/char/1F3FC/index.htm), [🏽](http://www.fileformat.info/info/unicode/char/1F3FD/index.htm), [🏾](http://www.fileformat.info/info/unicode/char/1F3FE/index.htm), [🏿](http://www.fileformat.info/info/unicode/char/1F3FF/index.htm)). The emoji character and the skin tone modifier are each represented as a surrogate pair in JavaScript strings. 84 | 85 | `spliddit` will keep each of these sequences (4 total code units) together as one character even though it consists of two Unicode code points. 86 | 87 | ## Other functions 88 | 89 | ### spliddit.hasPair(s) 90 | Tells if `s` contains a surrogate pair. 91 | 92 | ```javascript 93 | spliddit.hasPair('Look 👀 wow') 94 | // => true 95 | spliddit.hasPair('abcdef') 96 | // => false 97 | ``` 98 | 99 | ### spliddit.isFirstOfPair(c) 100 | Tells if `c[0]` (the first item in `c`) is the first code unit of a surrogate pair. 
(Character codes 0xD800 through 0xDFFF) 101 | 102 | ```javascript 103 | var s = '👴' 104 | var sFirst = s[0] 105 | var sArr = s.split('') 106 | 107 | spliddit.isFirstOfPair(s) 108 | // => true 109 | 110 | spliddit.isFirstOfPair(sFirst) 111 | // => true 112 | 113 | spliddit.isFirstOfPair(sArr) 114 | // => true 115 | 116 | spliddit.isFirstOfPair(sArr[0]) 117 | // => true 118 | 119 | spliddit.isFirstOfPair('a') 120 | // => false 121 | ``` 122 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | var HIGH_SURROGATE_START = 0xD800 2 | var HIGH_SURROGATE_END = 0xDBFF 3 | 4 | var LOW_SURROGATE_START = 0xDC00 5 | 6 | var REGIONAL_INDICATOR_START = 0x1F1E6 7 | var REGIONAL_INDICATOR_END = 0x1F1FF 8 | 9 | var FITZPATRICK_MODIFIER_START = 0x1f3fb 10 | var FITZPATRICK_MODIFIER_END = 0x1f3ff 11 | 12 | function spliddit (s, delimiter) { 13 | if (s === void 0 || s === null) { 14 | throw new Error('s cannot be undefined or null') 15 | } 16 | 17 | if (Array.isArray(s)) { 18 | s = s.join('') 19 | } 20 | 21 | if (delimiter instanceof RegExp || 22 | (typeof delimiter === 'string' && delimiter.length) 23 | ) { 24 | return s.split(delimiter) 25 | } 26 | 27 | return split_into_chars(s) 28 | } 29 | 30 | function split_into_chars (s) { 31 | var i = 0 32 | var increment 33 | var result = [] 34 | 35 | while (i < s.length) { 36 | increment = take_how_many(i, s) 37 | result.push(s.substring(i, i + increment)) 38 | i += increment 39 | } 40 | 41 | return result 42 | } 43 | 44 | // Decide how many code units make up the current character. 
45 | // BMP characters: 1 code unit 46 | // Non-BMP characters (represented by surrogate pairs): 2 code units 47 | // Emoji with skin-tone modifiers: 4 code units (2 code points) 48 | // Country flags: 4 code units (2 code points) 49 | function take_how_many (i, s) { 50 | var last_index = s.length - 1 51 | var current = s[i] 52 | var current_pair 53 | var next_pair 54 | 55 | // If we don't have a value that is part of a surrogate pair, or we're at 56 | // the end, only take the value at i 57 | if (!is_first_of_surrogate_pair(current) || i === last_index) { 58 | return 1 59 | } 60 | 61 | // If the array isn't long enough to take another pair after this one, we 62 | // can only take the current pair 63 | if ((i + 3) > last_index) { 64 | return 2 65 | } 66 | 67 | current_pair = current + s[i + 1] 68 | next_pair = s.substring(i + 2, i + 5) 69 | 70 | // Country flags are comprised of two regional indicator symbols, 71 | // each represented by a surrogate pair. 72 | // See http://emojipedia.org/flags/ 73 | // If both pairs are regional indicator symbols, take 4 74 | if (is_regional_indicator_symbol(current_pair) && 75 | is_regional_indicator_symbol(next_pair)) { 76 | return 4 77 | } 78 | 79 | // If the next pair make a Fitzpatrick skin tone 80 | // modifier, take 4 81 | // See http://emojipedia.org/modifiers/ 82 | // Technically, only some code points are meant to be 83 | // combined with the skin tone modifiers. This function 84 | // does not check the current pair to see if it is 85 | // one of them. 
86 | if (is_fitzpatrick_modifier(next_pair)) { 87 | return 4 88 | } 89 | 90 | return 2 91 | } 92 | 93 | function is_first_of_surrogate_pair (c) { 94 | if (c === void 0 || c === null || !c.hasOwnProperty(0)) { 95 | return false 96 | } 97 | 98 | return between_inclusive( 99 | c[0].charCodeAt(0), HIGH_SURROGATE_START, HIGH_SURROGATE_END 100 | ) 101 | } 102 | 103 | function has_pair (s) { 104 | if (typeof s !== 'string') { 105 | return false 106 | } 107 | 108 | return s.split('').some(is_first_of_surrogate_pair) 109 | } 110 | 111 | function is_regional_indicator_symbol (s) { 112 | var code_point = code_point_from_surrogate_pair(s) 113 | 114 | return between_inclusive( 115 | code_point, REGIONAL_INDICATOR_START, REGIONAL_INDICATOR_END 116 | ) 117 | } 118 | 119 | function is_fitzpatrick_modifier (s) { 120 | var code_point = code_point_from_surrogate_pair(s) 121 | 122 | return between_inclusive( 123 | code_point, FITZPATRICK_MODIFIER_START, FITZPATRICK_MODIFIER_END 124 | ) 125 | } 126 | 127 | // Turn two code units (surrogate pair) into 128 | // the code point they represent. 
129 | function code_point_from_surrogate_pair (s) { 130 | var high_offset = s.charCodeAt(0) - HIGH_SURROGATE_START 131 | var low_offset = s.charCodeAt(1) - LOW_SURROGATE_START 132 | 133 | return (high_offset << 10) + low_offset + 0x10000 134 | } 135 | 136 | function between_inclusive (value, lower_bound, upper_bound) { 137 | return value >= lower_bound && value <= upper_bound 138 | } 139 | 140 | module.exports = spliddit 141 | module.exports.isFirstOfPair = is_first_of_surrogate_pair 142 | module.exports.hasPair = has_pair 143 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "spliddit", 3 | "version": "2.1.1", 4 | "description": "unicode-aware string splitting", 5 | "main": "index.js", 6 | "dependencies": {}, 7 | "scripts": { 8 | "test": "node test/index.js" 9 | }, 10 | "author": "sdot (Justin Sippel)", 11 | "keywords": [ 12 | "unicode", 13 | "string", 14 | "split", 15 | "spliddit" 16 | ], 17 | "license": "MIT", 18 | "repository": "https://github.com/essdot/spliddit.git", 19 | "devDependencies": { 20 | "tape": "^4.2.0" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /test/index.js: -------------------------------------------------------------------------------- 1 | var test = require('tape') 2 | 3 | var spliddit = require('../index') 4 | 5 | test('emoji in middle', function (t) { 6 | var result = spliddit('abc😤def') 7 | 8 | t.deepEqual(result, ['a', 'b', 'c', '😤', 'd', 'e', 'f']) 9 | t.end() 10 | }) 11 | 12 | test('emoji start', function (t) { 13 | var s = '🍕abd' 14 | 15 | t.deepEqual(spliddit(s), ['🍕', 'a', 'b', 'd']) 16 | t.end() 17 | }) 18 | 19 | test('emoji end', function (t) { 20 | var s = '123🍥' 21 | 22 | t.deepEqual(spliddit(s), ['1', '2', '3', '🍥']) 23 | t.end() 24 | }) 25 | 26 | test('emoji party', function (t) { 27 | var result = 
spliddit('🍕⚽⛵✨⏳☕⏰🇯🇲😍👍💅😋👭👯✊👸🏽') 28 | 29 | t.deepEqual(result, [ 30 | '🍕', '⚽', '⛵', '✨', '⏳', '☕', '⏰', '🇯🇲', 31 | '😍', '👍', '💅', '😋', '👭', '👯', '✊', '👸🏽' 32 | ]) 33 | 34 | t.end() 35 | }) 36 | 37 | test('check', function (t) { 38 | var result = spliddit('123🍕✓') 39 | 40 | t.deepEqual(result, ['1', '2', '3', '🍕', '✓']) 41 | t.end() 42 | }) 43 | 44 | test('reverse string', function (t) { 45 | var s = '123🍕✓' 46 | 47 | var sReverse = spliddit(s).reverse().join('') 48 | var sReverse2 = spliddit(sReverse).reverse().join('') 49 | 50 | t.equal('✓🍕321', sReverse) 51 | t.equal(s, sReverse2) 52 | t.end() 53 | }) 54 | 55 | test('single char', function (t) { 56 | var s = 'a' 57 | 58 | t.deepEqual(spliddit(s), ['a']) 59 | t.end() 60 | }) 61 | 62 | test('regular string', function (t) { 63 | var s = 'Hello how are you' 64 | var arr = spliddit(s) 65 | 66 | t.equal(arr.length, 17) 67 | t.equal(arr[0], 'H') 68 | t.equal(arr[16], 'u') 69 | t.end() 70 | }) 71 | 72 | test('chinese', function (t) { 73 | var s = '𨭎", "𠬠", and "𩷶"' 74 | var result = spliddit(s) 75 | 76 | t.equal(result.length, 16) 77 | t.equal(result[0], '𨭎') 78 | t.equal(result[1], '"') 79 | t.equal(result[5], '𠬠') 80 | t.equal(result[6], '"') 81 | t.equal(result[14], '𩷶') 82 | t.equal(result[15], '"') 83 | t.end() 84 | }) 85 | 86 | test('en dash', function (t) { 87 | var s = 'and then–boom' 88 | var result = spliddit(s) 89 | 90 | t.equal(result.length, 13) 91 | t.equal(result[8], '–') 92 | 93 | s = 'ab–c' 94 | result = spliddit(s) 95 | t.deepEqual(result, ['a', 'b', '–', 'c']) 96 | t.end() 97 | }) 98 | 99 | test('math script', function (t) { 100 | var s = '𝒞𝒯𝒮𝒟' 101 | 102 | t.deepEqual(spliddit(s), ['𝒞', '𝒯', '𝒮', '𝒟']) 103 | t.end() 104 | }) 105 | 106 | test('fraktur', function (t) { 107 | var s = '𝔅𝔎' 108 | 109 | t.deepEqual(spliddit(s), ['𝔅', '𝔎']) 110 | t.end() 111 | }) 112 | 113 | test('acrophonic', function (t) { 114 | var s = '𐅧, 𐅨, and 𐅩' 115 | var result = spliddit(s) 116 | 117 | t.equal(result.length, 11) 118 
| t.equal(result[0], '𐅧') 119 | t.equal(result[1], ',') 120 | t.equal(result[3], '𐅨') 121 | t.equal(result[4], ',') 122 | t.equal(result[10], '𐅩') 123 | t.end() 124 | }) 125 | 126 | test('pass in munged array', function (t) { 127 | var emojiString = 'No 🙅' 128 | var arr = emojiString.split('') 129 | 130 | t.deepEqual(spliddit(arr), spliddit(emojiString)) 131 | t.deepEqual(spliddit(arr), ['N', 'o', ' ', '🙅']) 132 | t.end() 133 | }) 134 | 135 | test('throws for null and undefined', function (t) { 136 | var undefinedFunction = function () { spliddit(void 0) } 137 | var nullFunction = function () { spliddit(null) } 138 | 139 | t.throws(undefinedFunction) 140 | t.throws(nullFunction) 141 | t.end() 142 | }) 143 | 144 | test('arabic', function (t) { 145 | var s = 'ځڂڃڄڅچڇڈ' 146 | 147 | t.deepEqual(spliddit(s), ['ځ', 'ڂ', 'ڃ', 'ڄ', 'څ', 'چ', 'ڇ', 'ڈ']) 148 | t.end() 149 | }) 150 | 151 | test('country flags/regional indicator characters', function (t) { 152 | var s = '🇦🇸' // American Samoa flag 153 | var flagInMiddle = 'Sup 🇮🇹 Italy' // Italian flag in middle 154 | 155 | t.deepEqual(spliddit(s), [s]) 156 | t.equal(spliddit(s).join(''), s) 157 | 158 | t.equal(spliddit(flagInMiddle).length, 11) 159 | t.equal(spliddit(flagInMiddle).join(''), flagInMiddle) 160 | t.equal(spliddit(flagInMiddle).reverse().join(''), 'ylatI 🇮🇹 puS') 161 | t.end() 162 | }) 163 | 164 | test('emoji with skin tone indicators', function (t) { 165 | var s = '🎅🏻🎅🏼🎅🏽🎅🏾🎅🏿' 166 | var s2 = 'hi santa 🎅🏾 lol' 167 | 168 | t.deepEqual(spliddit(s), ['🎅🏻', '🎅🏼', '🎅🏽', '🎅🏾', '🎅🏿']) 169 | t.equal(spliddit(s).join(''), s) 170 | t.equal(spliddit(s2).length, 14) 171 | t.equal(spliddit(s2).join(''), s2) 172 | t.end() 173 | }) 174 | 175 | test('has pair', function (t) { 176 | t.ok(spliddit.hasPair("hello 𝔎 what's up")) 177 | t.ok(spliddit.hasPair('👔')) 178 | t.ok(spliddit.hasPair('𐅕')) 179 | t.ok(spliddit.hasPair('🏼')) 180 | 181 | t.notOk(spliddit.hasPair('hello')) 182 | t.notOk(spliddit.hasPair('ڃ')) 183 | 
t.notOk(spliddit.hasPair('–')) 184 | t.end() 185 | }) 186 | 187 | test('first of pair', function (t) { 188 | t.ok(spliddit.isFirstOfPair('🐳')) 189 | t.ok(spliddit.isFirstOfPair(['🐣'])) 190 | t.ok(spliddit.isFirstOfPair('🚯'.charAt(0))) 191 | t.ok(spliddit.isFirstOfPair(['🔫'.charAt(0)])) 192 | t.ok(spliddit.isFirstOfPair(String.fromCharCode(0xD801))) 193 | 194 | t.notOk(spliddit.isFirstOfPair('a')) 195 | t.notOk(spliddit.isFirstOfPair('Hello')) 196 | t.notOk(spliddit.isFirstOfPair('–')) 197 | t.end() 198 | }) 199 | 200 | test('split by delimiter', function (t) { 201 | t.deepEqual(spliddit('abc', 'b'), ['a', 'c']) 202 | t.deepEqual(spliddit('abcd', 'e'), ['abcd']) 203 | t.deepEqual(spliddit('abcd', ''), ['a', 'b', 'c', 'd']) 204 | t.deepEqual(spliddit('1-800-867-5309', '-'), ['1', '800', '867', '5309']) 205 | t.deepEqual(spliddit('🐣🐳🐣', '🐳'), ['🐣', '🐣']) 206 | t.deepEqual(spliddit('abcddddabc', 'dddd'), ['abc', 'abc']) 207 | t.deepEqual(spliddit('ځڂڃڄڅچڇڈ', 'ڄڅ'), ['ځڂڃ', 'چڇڈ']) 208 | t.deepEqual(spliddit('ab🇦🇸cd', '🇦🇸'), ['ab', 'cd']) 209 | t.deepEqual(spliddit('⚽⛵✨⏳', 'a'), ['⚽⛵✨⏳']) 210 | t.deepEqual(spliddit('yuj390', '⚽'), ['yuj390']) 211 | t.deepEqual(spliddit('wow✨wow', '✨'), ['wow', 'wow']) 212 | t.deepEqual(spliddit('✨wow✨', 'wow'), ['✨', '✨']) 213 | t.deepEqual(spliddit('d✨wow✨d', '✨wow✨'), ['d', 'd']) 214 | t.deepEqual(spliddit('wow', 'wow'), ['', '']) 215 | t.deepEqual(spliddit('wow✨', '✨'), ['wow', '']) 216 | t.deepEqual(spliddit('✨wow', '✨'), ['', 'wow']) 217 | t.deepEqual(spliddit('abcd', /b/), ['a', 'cd']) 218 | 219 | t.end() 220 | }) 221 | --------------------------------------------------------------------------------