├── .gitignore ├── .babelrc ├── test ├── helper.js ├── gujarati.js ├── kannada.js ├── malayalam.js ├── bengali.js ├── myanmar.js ├── tibetan.js ├── devanagari.js ├── tamil.js ├── lao.js ├── telugu.js ├── thai.js ├── khmer.js ├── japanese-kana.js ├── hebrew.js ├── arabic.js ├── index.js └── emoji.js ├── src ├── thai.js ├── lao.js ├── khmer.js ├── tibetan.js ├── hebrew.js ├── tamil.js ├── telugu.js ├── bengali.js ├── devanagari.js ├── kannada.js ├── malayalam.js ├── gujarati.js ├── japanese-kana.js ├── arabic.js ├── myanmar.js ├── emoji.js └── index.js ├── .github └── workflows │ └── test-node.yml ├── package.json └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules/ 3 | 4 | lib/ 5 | -------------------------------------------------------------------------------- /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | "@babel/preset-env" 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /test/helper.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { assert } from 'chai' 4 | 5 | export function testBreak (regExp, source, results) { 6 | const res = source.match(regExp) 7 | assert.deepEqual(res, results) 8 | } 9 | -------------------------------------------------------------------------------- /src/thai.js: -------------------------------------------------------------------------------- 1 | // spec: https://www.unicode.org/charts/PDF/U0E00.pdf 2 | 3 | const letter = '[\\u0E00-\\u0E7F]' 4 | const trailingLetter = '[\\u0E31\\u0E33-\\u0E3A\\u0E47-\\u0E4E]' 5 | export const thai = `${letter}${trailingLetter}*` 6 | -------------------------------------------------------------------------------- /src/lao.js: -------------------------------------------------------------------------------- 1 | // spec: 
https://www.unicode.org/charts/PDF/U0E80.pdf 2 | 3 | const letter = '[\\u{0E80}-\\u{0EFF}]' 4 | const trailingLetter = '[\\u{0EB1}\\u{0EB4}-\\u{0EBC}\\u{0EC8}-\\u{0ECD}]' 5 | export const lao = `${letter}${trailingLetter}*` 6 | -------------------------------------------------------------------------------- /src/khmer.js: -------------------------------------------------------------------------------- 1 | // spec: https://www.unicode.org/charts/PDF/U1780.pdf 2 | 3 | const letter = '[\\u{1780}-\\u{17FF}]' 4 | const trailingLetter = '[\\u{17B6}-\\u{17D1}\\u{17D3}\\u{17DD}]' 5 | const control = '\\u{17D2}' 6 | export const khmer = `${letter}(${control}${letter}|${trailingLetter})*` 7 | -------------------------------------------------------------------------------- /src/tibetan.js: -------------------------------------------------------------------------------- 1 | // spec: https://www.unicode.org/charts/PDF/U0F00.pdf 2 | 3 | const letter = '[\\u{0F00}-\\u{0FFF}]' 4 | const trailingLetter = '[\\0F18\\0F19\\0F35\\0F37\\0F39\\0F3E\\0F3F\\u{0F71}-\\u{0F87}\\u{0F8D}-\\u{0FBC}\\u{0FC6}]' 5 | export const tibetan = `${letter}${trailingLetter}*` 6 | -------------------------------------------------------------------------------- /src/hebrew.js: -------------------------------------------------------------------------------- 1 | // Hebrew 2 | // https://www.unicode.org/charts/PDF/U0590.pdf 3 | // https://unicode-table.com/blocks/hebrew/ 4 | 5 | const letter = '[\u05D0-\u05EA]' 6 | const combiningMark = '[\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7]' 7 | 8 | export const hebrew = `${letter}${combiningMark}*` 9 | -------------------------------------------------------------------------------- /src/tamil.js: -------------------------------------------------------------------------------- 1 | // spec: https://www.unicode.org/charts/PDF/U0B80.pdf 2 | 3 | const letter = '[\\u{0B80}-\\u{0BFF}]' 4 | const trailingLetter = 
'[\\u{0B82}-\\u{0B83}\\u{0BBE}-\\u{0BD7}\\u{0962}\\u{0963}]' 5 | // tamil's virama does not combine the following consonant 6 | export const tamil = `${letter}${trailingLetter}*` 7 | -------------------------------------------------------------------------------- /src/telugu.js: -------------------------------------------------------------------------------- 1 | // spec: https://www.unicode.org/charts/PDF/U0C00.pdf 2 | 3 | const letter = '[\\u{0C00}-\\u{0C7F}]' 4 | const trailingLetter = '[\\u{0C00}-\\u{0C04}\\u{0C3E}-\\u{0C56}\\u{0C62}\\u{0C63}]' 5 | const control = '\\u{0C4D}' // Virama 6 | export const telugu = `${letter}(${control}${letter}|${trailingLetter})*` 7 | -------------------------------------------------------------------------------- /src/bengali.js: -------------------------------------------------------------------------------- 1 | // spec: https://www.unicode.org/charts/PDF/U0980.pdf 2 | 3 | const letter = '[\\u{0980}-\\u{09FF}]' 4 | const trailingLetter = '[\\u{0980}-\\u{0983}\\u{09BC}-\\u{09D7}\\u{09E2}\\u{09E3}\\u{09FE}]' 5 | const control = '\\u{09CD}' // Virama 6 | export const bengali = `${letter}(${control}${letter}|${trailingLetter})*` 7 | -------------------------------------------------------------------------------- /src/devanagari.js: -------------------------------------------------------------------------------- 1 | // spec: https://www.unicode.org/charts/PDF/U0900.pdf 2 | 3 | const letter = '[\\u{0900}-\\u{097F}]' 4 | const trailingLetter = '[\\u{0900}-\\u{0903}\\u{093A}-\\u{0957}\\u{0962}\\u{0963}]' 5 | const control = '\\u{094D}' // Virama 6 | export const devanagari = `${letter}(${control}${letter}|${trailingLetter})*` 7 | -------------------------------------------------------------------------------- /src/kannada.js: -------------------------------------------------------------------------------- 1 | // spec: https://www.unicode.org/charts/PDF/U0C80.pdf 2 | 3 | const letter = '[\\u{0C80}-\\u{0CFF}]' 4 | const trailingLetter = 
'[\\u{0C81}-\\u{0C83}\\u{0CBC}\\u{0CBE}-\\u{0CCD}\\u{0CD5}\\u{0CD6}\\u{0CE2}\\u{0CE3}]' 5 | const control = '\\u{0CCD}' // Virama 6 | export const kannada = `${letter}(${control}${letter}|${trailingLetter})*` 7 | -------------------------------------------------------------------------------- /src/malayalam.js: -------------------------------------------------------------------------------- 1 | // spec: https://unicode.org/charts/PDF/U0D00.pdf 2 | 3 | const letter = '[\\u{0D00}-\\u{0D7F}]' 4 | const trailingLetter = '[\\u{0D00}-\\u{0D03}\\u{0D3B}\\u{0D3C}\\u{0D3E}-\\u{0D4D}\\u{0D57}\\u{0D62}-\\u{0D63}]' 5 | const control = '\\u{0D4D}' // Virama 6 | export const malayalam = `${letter}(${control}${letter}|${trailingLetter})*` 7 | -------------------------------------------------------------------------------- /src/gujarati.js: -------------------------------------------------------------------------------- 1 | // spec: https://www.unicode.org/charts/PDF/U0A80.pdf 2 | 3 | const letter = '[\\u{0A80}-\\u{0AFF}]' 4 | const trailingLetter = '[\\u{0A81}-\\u{0A83}\\u{0ABC}\\u{0ABE}-\\u{0ACD}\\u{0AE2}\\u{0AE3}\\u{0AFA}-\\u{0AFF}]' 5 | const control = '\\u{0ACD}' // Virama 6 | export const gujarati = `${letter}(${control}${letter}|${trailingLetter})*` 7 | -------------------------------------------------------------------------------- /.github/workflows/test-node.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v2 11 | - name: Use Node.js 14.x 12 | uses: actions/setup-node@v1 13 | with: 14 | node-version: '14.x' 15 | - run: npm ci 16 | - run: npm test 17 | -------------------------------------------------------------------------------- /test/gujarati.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { gujarati } from '../lib/gujarati' 
4 | import { testBreak } from './helper' 5 | 6 | describe('WordBreakGujarati', function () { 7 | it('break correctly', function () { 8 | const regExp = new RegExp(gujarati, 'gu') 9 | testBreak(regExp, 'ગુજરાતી', ['ગુ', 'જ', 'રા', 'તી']) 10 | }) 11 | }) 12 | -------------------------------------------------------------------------------- /test/kannada.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { kannada } from '../lib/kannada' 4 | import { testBreak } from './helper' 5 | 6 | describe('WordBreakKannada', function () { 7 | it('break correctly', function () { 8 | const regExp = new RegExp(kannada, 'gu') 9 | testBreak(regExp, 'ನಮಸ್ಕಾರ', ['ನ', 'ಮ', 'ಸ್ಕಾ', 'ರ']) 10 | testBreak(regExp, 'ನನ್ನ', ['ನ', 'ನ್ನ']) 11 | }) 12 | }) 13 | -------------------------------------------------------------------------------- /test/malayalam.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { malayalam } from '../lib/malayalam' 4 | import { testBreak } from './helper' 5 | 6 | describe('WordBreakMalayalam', function () { 7 | it('break correctly', function () { 8 | const regExp = new RegExp(malayalam, 'gu') 9 | testBreak(regExp, 'പറ്റി', ['പ', 'റ്റി']) 10 | testBreak(regExp, 'വളരെ', ['വ', 'ള', 'രെ']) 11 | }) 12 | }) 13 | -------------------------------------------------------------------------------- /test/bengali.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { bengali } from '../lib/bengali' 4 | import { testBreak } from './helper' 5 | 6 | describe('WordBreakBengali', function () { 7 | it('break correctly', function () { 8 | const regExp = new RegExp(bengali, 'gu') 9 | testBreak(regExp, 'দ্ধ', ['দ্ধ']) 10 | testBreak(regExp, 'স্ত্র', ['স্ত্র']) 11 | testBreak(regExp, 'নমস্কার', ['ন', 'ম', 'স্কা', 'র']) 12 | }) 13 | }) 14 | 
-------------------------------------------------------------------------------- /test/myanmar.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { myanmar } from '../lib/myanmar' 4 | import { testBreak } from './helper' 5 | 6 | describe('WordBreakMyanmar', function () { 7 | it('break correctly', function () { 8 | const regExp = new RegExp(myanmar, 'gu') 9 | testBreak(regExp, 'နံနက်', ['နံ', 'န', 'က်']) 10 | testBreak(regExp, 'မင်္ဂလာနံနက်ခင်းပါ', ['မ', 'င်္ဂ', 'လာ', 'နံ', 'န', 'က်', 'ခ', 'င်း', 'ပါ']) 11 | }) 12 | }) 13 | -------------------------------------------------------------------------------- /test/tibetan.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { tibetan } from '../lib/tibetan' 4 | import { testBreak } from './helper' 5 | 6 | describe('WordBreakTibetan', function () { 7 | it('break correctly', function () { 8 | const regExp = new RegExp(tibetan, 'gu') 9 | testBreak(regExp, 'གཟིགས་དང་', ['ག', 'ཟི', 'ག', 'ས', '་', 'ད', 'ང', '་']) 10 | testBreak(regExp, 'ལགས་མིན་', ['ལ', 'ག', 'ས', '་', 'མི', 'ན', '་']) 11 | }) 12 | }) 13 | -------------------------------------------------------------------------------- /test/devanagari.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { devanagari } from '../lib/devanagari' 4 | import { testBreak } from './helper' 5 | 6 | describe('WordBreakDevanagari', function () { 7 | it('break correctly', function () { 8 | const regExp = new RegExp(devanagari, 'gu') 9 | testBreak(regExp, 'वाह', ['वा', 'ह']) 10 | testBreak(regExp, 'शुभ', ['शु', 'भ']) 11 | testBreak(regExp, 'रात्रि', ['रा', 'त्रि']) 12 | }) 13 | }) 14 | -------------------------------------------------------------------------------- /test/tamil.js: -------------------------------------------------------------------------------- 1 | /* 
eslint-env mocha */ 2 | 3 | import { tamil } from '../lib/tamil' 4 | import { testBreak } from './helper' 5 | 6 | describe('WordBreakTamil', function () { 7 | it('break correctly', function () { 8 | const regExp = new RegExp(tamil, 'gu') 9 | testBreak(regExp, 'ஹலோ', ['ஹ', 'லோ']) 10 | testBreak(regExp, 'குடீவ்னிங்', ['கு', 'டீ', 'வ்', 'னி', 'ங்']) 11 | testBreak(regExp, 'ஓக்கே', ['ஓ', 'க்', 'கே']) 12 | }) 13 | }) 14 | -------------------------------------------------------------------------------- /test/lao.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { lao } from '../lib/lao' 4 | import { testBreak } from './helper' 5 | 6 | describe('WordBreakLao', function () { 7 | it('break correctly', function () { 8 | const regExp = new RegExp(lao, 'gu') 9 | testBreak(regExp, 'ສະບາຍດີ', ['ສ', 'ະ', 'ບ', 'າ', 'ຍ', 'ດີ']) 10 | testBreak(regExp, 'ກະລຸນາຮູ້ສຶກບໍ່ເສຍຄ່າ', ['ກ', 'ະ', 'ລຸ', 'ນ', 'າ', 'ຮູ້', 'ສຶ', 'ກ', 'ບໍ່', 'ເ', 'ສ', 'ຍ', 'ຄ່', 'າ']) 11 | }) 12 | }) 13 | -------------------------------------------------------------------------------- /test/telugu.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { telugu } from '../lib/telugu' 4 | import { testBreak } from './helper' 5 | 6 | describe('WordBreakTelugu', function () { 7 | it('break correctly', function () { 8 | const regExp = new RegExp(telugu, 'gu') 9 | testBreak(regExp, 'నమస్కారం', ['న', 'మ', 'స్కా', 'రం']) 10 | testBreak(regExp, 'పుట్టపర్తి', ['పు', 'ట్ట', 'ప', 'ర్తి']) 11 | testBreak(regExp, 'ఉన్నారు', ['ఉ', 'న్నా', 'రు']) 12 | }) 13 | }) 14 | -------------------------------------------------------------------------------- /src/japanese-kana.js: -------------------------------------------------------------------------------- 1 | // spec 2 | // Hiragana https://www.unicode.org/charts/PDF/U3040.pdf 3 | // Katakana https://www.unicode.org/charts/PDF/U30A0.pdf 4 | 
5 | const hiragana = '[\\u{3041}-\\u{3096}\\u{309D}-\\u{309F}]' 6 | const katakana = '[\\u{30A0}-\\u{30FF}]' 7 | const halfSizeSoundMark = '[\\u{3099}-\\u{309A}]' // soundmark NFD 8 | const fullSizeSoundMark = '[\\u{309B}-\\u{309C}]' 9 | 10 | export const japaneseKana = `((${katakana}|${hiragana})${halfSizeSoundMark}?|${fullSizeSoundMark})` 11 | -------------------------------------------------------------------------------- /test/thai.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { thai } from '../lib/thai' 4 | import { testBreak } from './helper' 5 | 6 | describe('WordBreakThai', function () { 7 | it('break correctly', function () { 8 | const regExp = new RegExp(thai, 'gu') 9 | testBreak(regExp, 'ไม้ทัณฑฆาต', ['ไ', 'ม้', 'ทั', 'ณ', 'ฑ', 'ฆ', 'า', 'ต']) 10 | testBreak(regExp, 'ข้าวมันไก่', ['ข้', 'า', 'ว', 'มั', 'น', 'ไ', 'ก่']) 11 | testBreak(regExp, 'ลำแสง', ['ลำ', 'แ', 'ส', 'ง']) 12 | }) 13 | }) 14 | -------------------------------------------------------------------------------- /test/khmer.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { khmer } from '../lib/khmer' 4 | import { testBreak } from './helper' 5 | 6 | describe('WordBreakKhmer', function () { 7 | it('break correctly', function () { 8 | const regExp = new RegExp(khmer, 'gu') 9 | testBreak(regExp, 'ស្រីៗ', ['ស្រី', 'ៗ']) 10 | testBreak(regExp, 'ច្រើនឡើងៗ', ['ច្រើ', 'ន', 'ឡើ', 'ង', 'ៗ']) 11 | testBreak(regExp, 'ប៉ុស្ដិ៍', ['ប៉ុ', 'ស្ដិ៍']) 12 | testBreak(regExp, 'ផម្រាបសួរ', ['ផ', 'ម្រា', 'ប', 'សួ', 'រ']) 13 | }) 14 | }) 15 | -------------------------------------------------------------------------------- /src/arabic.js: -------------------------------------------------------------------------------- 1 | // Arabic 2 | // https://www.unicode.org/charts/PDF/U0600.pdf 3 | // https://unicode-table.com/blocks/arabic/ 4 | // Arabic supplement 5 | // 
https://www.unicode.org/charts/PDF/U0750.pdf 6 | // https://unicode-table.com/blocks/arabic-supplement/ 7 | 8 | const arabicLetter = '\u0620-\u064A\u066E-\u066F\u0671-\u06D5\u06EE-\u06EF\u06FA-\u06FF' 9 | const arabicSupplementLetter = '\u0750-\u077F' 10 | 11 | const letter = `[${arabicLetter}${arabicSupplementLetter}]` 12 | const combiningMark = '[\u064B-\u065F\u0670]' 13 | 14 | export const arabic = `${letter}${combiningMark}*` 15 | -------------------------------------------------------------------------------- /src/myanmar.js: -------------------------------------------------------------------------------- 1 | // spec: https://www.unicode.org/charts/PDF/U1000.pdf 2 | 3 | const letter = '[\\u{1000}-\\u{109F}]' 4 | const trailingLetterRange = [ 5 | '\\u{102B}-\\u{1038}', 6 | '\\u{103A}-\\u{103E}', 7 | '\\u{1056}-\\u{1059}', 8 | '\\u{105E}-\\u{1060}', 9 | '\\u{1062}-\\u{1064}', 10 | '\\u{1067}-\\u{106D}', 11 | '\\u{1071}-\\u{1074}', 12 | '\\u{1082}-\\u{108D}', 13 | '\\u{108F}', 14 | '\\u{109A}-\\u{109D}' 15 | ] 16 | const trailingLetter = `[${trailingLetterRange.join('')}]` 17 | const control = '\\u{1039}' 18 | export const myanmar = `${letter}(${control}${letter}|${trailingLetter})*` 19 | -------------------------------------------------------------------------------- /test/japanese-kana.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { japaneseKana } from '../lib/japanese-kana' 4 | import { testBreak } from './helper' 5 | 6 | describe('WordBreakJapaneseHiragana', function () { 7 | const regExp = new RegExp(japaneseKana, 'gu') 8 | 9 | it('breaks plain kana', function () { 10 | testBreak(regExp, 'こんにちは', ['こ', 'ん', 'に', 'ち', 'は']) 11 | testBreak(regExp, 'ハヒフヘホ', ['ハ', 'ヒ', 'フ', 'ヘ', 'ホ']) 12 | }) 13 | 14 | it('breaks kana with soundmark', function () { 15 | testBreak(regExp, 'ごん゙に゙ぢば', ['ご', 'ん゙', 'に゙', 'ぢ', 'ば']) 16 | testBreak(regExp, 'パピプペポ', ['パ', 'ピ', 'プ', 'ペ', 'ポ']) 17 | }) 18 | 
19 | it('does not break fullsize soundmark', function () { 20 | testBreak(regExp, 'こ゛ん゛', ['こ', '゛', 'ん', '゛']) 21 | testBreak(regExp, 'ハ゜ヒ゜', ['ハ', '゜', 'ヒ', '゜']) 22 | }) 23 | }) 24 | -------------------------------------------------------------------------------- /src/emoji.js: -------------------------------------------------------------------------------- 1 | // spec: 2 | // https://en.wikipedia.org/wiki/Unicode_block 3 | 4 | export const countryFlag = '[\\u{1F1E6}-\\u{1F1FF}]{2}' 5 | 6 | export const keyCap = '[0-9#\\*][\\u{FE0F}]?\\u{20E3}' 7 | 8 | const emojiRange = [ 9 | '[\\u{2600}-\\u{26FF}]', // Miscellaneous Symbols 10 | '[\\u{2700}-\\u{27BF}]', // Dingbats 11 | '[\\u{1F300}-\\u{1F5FF}]', // Miscellaneous Symbols and Pictographs 12 | '[\\u{1F600}-\\u{1F64F}]', // Emoticons 13 | '[\\u{1F680}-\\u{1F6FF}]', // Transport and Map Symbols 14 | '[\\u{1F700}-\\u{1F77F}]', // Alchemical Symbols 15 | '[\\u{1F900}-\\u{1F9FF}]' // Supplemental Symbols and Pictographs 16 | ] 17 | const emoji = `(${emojiRange.join('|')})` 18 | const zeroWidthJoinder = '\\u{200D}' 19 | const variationSeletor = '[\\u{FE0E}\\u{FE0F}]' 20 | const skinTone = '[\\u{1F3FB}-\\u{1F3FF}]' 21 | 22 | export const emojiVariation = `${emoji}(${zeroWidthJoinder}${emoji}|${skinTone}|${variationSeletor})*` 23 | -------------------------------------------------------------------------------- /test/hebrew.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { assert } from 'chai' 4 | import { hebrew } from '../lib/hebrew' 5 | import { testBreak } from './helper' 6 | 7 | describe('WordBreakHebrew', function () { 8 | const regExp = new RegExp(hebrew, 'gu') 9 | 10 | it('breaks single letters', function () { 11 | const israel = 'ישראל' 12 | assert.lengthOf(israel, 5) 13 | testBreak(regExp, israel, [ 14 | 'י', 15 | 'ש', 16 | 'ר', 17 | 'א', 18 | 'ל' 19 | ]) 20 | }) 21 | 22 | it('breaks letter + combining mark', function () { 23 | 
const japan = 'יָפּן' 24 | assert.lengthOf(japan, 5) 25 | testBreak(regExp, japan, [ 26 | 'יָ', 27 | 'פּ', 28 | 'ן' 29 | ]) 30 | }) 31 | 32 | it('breaks single letter + multiple combining marks', function () { 33 | const israel = 'יִשְׂרָאֵל' 34 | assert.lengthOf(israel, 10) 35 | testBreak(regExp, israel, [ 36 | 'יִ', 37 | 'שְׂ', 38 | 'רָ', 39 | 'אֵ', 40 | 'ל' 41 | ]) 42 | }) 43 | }) 44 | -------------------------------------------------------------------------------- /test/arabic.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { assert } from 'chai' 4 | import { arabic } from '../lib/arabic' 5 | import { testBreak } from './helper' 6 | 7 | describe('WordBreakArabic', function () { 8 | const regExp = new RegExp(arabic, 'gu') 9 | 10 | it('breaks single letters', function () { 11 | const hello = 'مرحبا' 12 | assert.lengthOf(hello, 5) 13 | testBreak(regExp, hello, [ 14 | 'م', 15 | 'ر', 16 | 'ح', 17 | 'ب', 18 | 'ا' 19 | ]) 20 | }) 21 | 22 | it('breaks letter + combining mark', function () { 23 | const note = 'نِيهُون' 24 | assert.lengthOf(note, 7) 25 | testBreak(regExp, note, [ 26 | 'نِ', 27 | 'ي', 28 | 'هُ', 29 | 'و', 30 | 'ن' 31 | ]) 32 | }) 33 | 34 | it('breaks single letter + multiple combining marks', function () { 35 | const nippon = 'نِيپُّونْ' 36 | assert.lengthOf(nippon, 9) 37 | testBreak(regExp, nippon, [ 38 | 'نِ', 39 | 'ي', 40 | 'پُّ', 41 | 'و', 42 | 'نْ' 43 | ]) 44 | }) 45 | }) 46 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | import { arabic } from './arabic' 2 | import { bengali } from './bengali' 3 | import { devanagari } from './devanagari' 4 | import { gujarati } from './gujarati' 5 | import { hebrew } from './hebrew' 6 | import { japaneseKana } from './japanese-kana' 7 | import { kannada } from './kannada' 8 | import { khmer } from 
'./khmer' 9 | import { lao } from './lao' 10 | import { malayalam } from './malayalam' 11 | import { myanmar } from './myanmar' 12 | import { tamil } from './tamil' 13 | import { telugu } from './telugu' 14 | import { thai } from './thai' 15 | import { tibetan } from './tibetan' 16 | import { countryFlag, keyCap, emojiVariation } from './emoji' 17 | 18 | const patterns = [ 19 | countryFlag, keyCap, emojiVariation, 20 | arabic, 21 | bengali, devanagari, gujarati, hebrew, 22 | japaneseKana, 23 | kannada, khmer, lao, malayalam, myanmar, 24 | tamil, telugu, thai, tibetan, 25 | '.' 26 | ] 27 | 28 | const splitter = new RegExp(`(${patterns.join('|')})`, 'gu') 29 | 30 | function splitGraphemes (str) { 31 | return str.match(splitter) || [] 32 | } 33 | 34 | module.exports = { splitGraphemes } 35 | -------------------------------------------------------------------------------- /test/index.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { assert } from 'chai' 4 | import { splitGraphemes } from '../' 5 | 6 | function testBreak (source, results) { 7 | const res = splitGraphemes(source) 8 | assert.deepEqual(res, results) 9 | } 10 | 11 | describe('WordBreak', function () { 12 | it('break correctly', function () { 13 | testBreak('✌🏽Hello 🌏✌️', ['✌🏽', 'H', 'e', 'l', 'l', 'o', ' ', '🌏', '✌️']) 14 | testBreak('more and ច្រើនឡើងៗ!มีความสุข', 15 | ['m', 'o', 'r', 'e', ' ', 'a', 'n', 'd', ' ', 'ច្រើ', 'ន', 'ឡើ', 'ង', 'ៗ', '!', 'มี', 'ค', 'ว', 'า', 'ม', 'สุ', 'ข']) 16 | testBreak('[អ៊ីនធឺណិត] [* អ៊ីនធឺណិត]', 17 | ['[', 'អ៊ី', 'ន', 'ធឺ', 'ណិ', 'ត', ']', ' ', '[', '*', ' ', 'អ៊ី', 'ន', 'ធឺ', 'ណិ', 'ត', ']']) 18 | }) 19 | 20 | it('break arabic', function () { 21 | testBreak('نِيپُّونْ', [ 22 | 'نِ', 23 | 'ي', 24 | 'پُّ', 25 | 'و', 26 | 'نْ' 27 | ]) 28 | }) 29 | 30 | it('break hebrew', function () { 31 | testBreak('יִשְׂרָאֵל', [ 32 | 'יִ', 33 | 'שְׂ', 34 | 'רָ', 35 | 'אֵ', 36 | 'ל' 37 | ]) 38 | }) 39 | }) 40 | 
-------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "split-graphemes", 3 | "version": "0.5.0", 4 | "description": "Divide the string into graphemes.", 5 | "main": "lib/index.js", 6 | "scripts": { 7 | "build": "run-s build:*", 8 | "build:clean": "if test -d ./lib; then rm -r ./lib; fi", 9 | "build:babel": "babel src/ --out-dir lib/", 10 | "pretest": "npm run build", 11 | "test": "run-s test:*", 12 | "test:standard": "standard", 13 | "test:mocha": "mocha test/*.js -r @babel/register -r @babel/polyfill --exit", 14 | "prepublishOnly": "npm test", 15 | "postpublish": "git push origin --tags && git push origin master" 16 | }, 17 | "keywords": [ 18 | "grapheme", 19 | "emoji", 20 | "abugida", 21 | "arabic", 22 | "brahmic", 23 | "bengali", 24 | "devanagari", 25 | "gujarati", 26 | "hebrew", 27 | "japanese", 28 | "kannada", 29 | "khmer", 30 | "lao", 31 | "malayalam", 32 | "myanmar", 33 | "NFD", 34 | "tamil", 35 | "telugu", 36 | "thai", 37 | "tibetan" 38 | ], 39 | "files": [ 40 | "README.md", 41 | "package.json", 42 | "lib" 43 | ], 44 | "author": "Daiki Iizuka ", 45 | "license": "MIT", 46 | "devDependencies": { 47 | "@babel/cli": "^7.4.3", 48 | "@babel/core": "^7.4.3", 49 | "@babel/polyfill": "^7.4.3", 50 | "@babel/preset-env": "^7.4.3", 51 | "@babel/register": "^7.4.0", 52 | "chai": "^4.2.0", 53 | "mocha": "^6.1.3", 54 | "npm-run-all": "^4.1.5", 55 | "standard": "^12.0.1" 56 | }, 57 | "standard": { 58 | "ignore": [ 59 | "/lib" 60 | ] 61 | }, 62 | "repository": { 63 | "type": "git", 64 | "url": "https://github.com/nota/split-graphemes.git" 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /test/emoji.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | 3 | import { emojiVariation, keyCap, countryFlag } from '../lib/emoji' 4 | 
import { testBreak } from './helper' 5 | 6 | describe('WordBreakEmoji', function () { 7 | it('break correctly keyCap', function () { 8 | const regExp = new RegExp(keyCap, 'gu') 9 | testBreak(regExp, '8️⃣9️⃣', ['8️⃣', '9️⃣']) 10 | testBreak(regExp, '#️⃣*️⃣', ['#️⃣', '*️⃣']) 11 | }) 12 | 13 | it('break correctly countryFlag', function () { 14 | const regExp = new RegExp(countryFlag, 'gu') 15 | testBreak(regExp, '🇯🇵🇨🇦', ['🇯🇵', '🇨🇦']) 16 | }) 17 | 18 | describe('break correctly emojiVariation', function () { 19 | const regExp = new RegExp(emojiVariation, 'gu') 20 | 21 | // https://lets-emoji.com/emojilist/ 22 | it('smile', function () { 23 | testBreak(regExp, '😀', ['😀']) 24 | }) 25 | 26 | it('skin-tone', function () { 27 | testBreak(regExp, '🤛', ['🤛']) 28 | testBreak(regExp, '👧🏽', ['👧🏽']) 29 | }) 30 | 31 | it('family', function () { 32 | testBreak(regExp, '👨‍👩‍👦‍👦', ['👨‍👩‍👦‍👦']) 33 | }) 34 | 35 | it('person-role', function () { 36 | testBreak(regExp, '👨‍🏫', ['👨‍🏫']) 37 | testBreak(regExp, '👩🏾‍⚖️', ['👩🏾‍⚖️']) 38 | testBreak(regExp, '👩🏾‍✈️', ['👩🏾‍✈️']) 39 | }) 40 | 41 | it('person-fantasy', function () { 42 | testBreak(regExp, '👼', ['👼']) 43 | testBreak(regExp, '👼🏽', ['👼🏽']) 44 | }) 45 | 46 | it('flag', function () { 47 | testBreak(regExp, '🏳️‍🌈', ['🏳️‍🌈']) 48 | testBreak(regExp, '🏴‍☠️', ['🏴‍☠️']) 49 | }) 50 | 51 | it('variation selector', function () { 52 | // u260E uFE0E 53 | testBreak(regExp, '☎', ['☎']) 54 | // u260E uFE0F 55 | testBreak(regExp, '☎️', ['☎️']) 56 | }) 57 | 58 | it('multiple of emojis', function () { 59 | testBreak(regExp, '👨‍👩‍👦‍👦👨‍👩‍👧', ['👨‍👩‍👦‍👦', '👨‍👩‍👧']) 60 | testBreak(regExp, '☎☎️', ['☎', '☎️']) 61 | testBreak(regExp, '🤜🏾🤘🏾', ['🤜🏾', '🤘🏾']) 62 | }) 63 | }) 64 | }) 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # split-graphemes 2 | 3 | Divide ligature letters such as Thai, Khmer letters and complex emoji into 
an array of [graphemes](https://en.wikipedia.org/wiki/Grapheme). 4 | You can simply use this library instead of `Array.from` to get graphemes. 5 | 6 | [![Tests](https://github.com/nota/split-graphemes/actions/workflows/test-node.yml/badge.svg?branch=master)](https://github.com/nota/split-graphemes/actions/workflows/test-node.yml) 7 | 8 | ## Installation 9 | ``` 10 | $ npm install split-graphemes 11 | ``` 12 | 13 | ## Examples 14 | ### Emoji 15 | 16 | ```js 17 | // The emoji '👨‍👩‍👦‍👦' consists of four person emoji joined by Zero Width Joiners (ZWJ). 18 | const chars = Array.from('👨‍👩‍👦‍👦') // ['👨', ZWJ, '👩', ZWJ, '👦', ZWJ, '👦'] 19 | ``` 20 | 21 | ```js 22 | // It is interpreted as exactly one grapheme! 23 | const chars = splitGraphemes('👨‍👩‍👦‍👦') // ['👨‍👩‍👦‍👦'] 24 | ``` 25 | 26 | ### Khmer characters 27 | 28 | ```js 29 | Array.from('ប៉ុស្ដិ៍') // ['ប', '៉', 'ុ', 'ស', '្', 'ដ', 'ិ', '៍'] 30 | ``` 31 | 32 | ```js 33 | splitGraphemes('ប៉ុស្ដិ៍') // ['ប៉ុ', 'ស្ដិ៍'] 34 | ``` 35 | 36 | ### Japanese NFD 37 | ```js 38 | splitGraphemes('ごん゙に゙ぢば') // ['ご', 'ん゙', 'に゙', 'ぢ', 'ば'] 39 | splitGraphemes('パピプペポ') // ['パ', 'ピ', 'プ', 'ペ', 'ポ'] 40 | ``` 41 | 42 | ### English 43 | ```js 44 | splitGraphemes('Hello') // ['H', 'e', 'l', 'l', 'o'] 45 | ``` 46 | 47 | ## Supported ligature characters 48 | The supported character ranges are defined [here](https://github.com/nota/split-graphemes/tree/master/src). 
49 | - [Emoji](https://en.wikipedia.org/wiki/Unicode_block) 50 | - [Arabic](https://www.unicode.org/charts/PDF/U0600.pdf) and [Arabic supplement](https://www.unicode.org/charts/PDF/U0750.pdf) 51 | - [Bengali](https://www.unicode.org/charts/PDF/U0980.pdf) 52 | - [Devanagari](https://www.unicode.org/charts/PDF/U0900.pdf) 53 | - [Gujarati](https://www.unicode.org/charts/PDF/U0A80.pdf) 54 | - [Hebrew](https://www.unicode.org/charts/PDF/U0590.pdf) 55 | - [Japanese Hiragana](https://www.unicode.org/charts/PDF/U3040.pdf) and [Katakana](https://www.unicode.org/charts/PDF/U30A0.pdf) NFD 56 | - [Kannada](https://www.unicode.org/charts/PDF/U0C80.pdf) 57 | - [Khmer](https://www.unicode.org/charts/PDF/U1780.pdf) 58 | - [Lao](https://www.unicode.org/charts/PDF/U0E80.pdf) 59 | - [Malayalam](https://unicode.org/charts/PDF/U0D00.pdf) 60 | - [Myanmar](https://www.unicode.org/charts/PDF/U1000.pdf) 61 | - [Tamil](https://www.unicode.org/charts/PDF/U0B80.pdf) 62 | - [Telugu](https://www.unicode.org/charts/PDF/U0C00.pdf) 63 | - [Thai](https://www.unicode.org/charts/PDF/U0E00.pdf) 64 | - [Tibetan](https://www.unicode.org/charts/PDF/U0F00.pdf) 65 | --------------------------------------------------------------------------------