├── .gitignore
├── .babelrc
├── test
    ├── helper.js
    ├── gujarati.js
    ├── kannada.js
    ├── malayalam.js
    ├── bengali.js
    ├── myanmar.js
    ├── tibetan.js
    ├── devanagari.js
    ├── tamil.js
    ├── lao.js
    ├── telugu.js
    ├── thai.js
    ├── khmer.js
    ├── japanese-kana.js
    ├── hebrew.js
    ├── arabic.js
    ├── index.js
    └── emoji.js
├── src
    ├── thai.js
    ├── lao.js
    ├── khmer.js
    ├── tibetan.js
    ├── hebrew.js
    ├── tamil.js
    ├── telugu.js
    ├── bengali.js
    ├── devanagari.js
    ├── kannada.js
    ├── malayalam.js
    ├── gujarati.js
    ├── japanese-kana.js
    ├── arabic.js
    ├── myanmar.js
    ├── emoji.js
    └── index.js
├── .github
    └── workflows
    │   └── test-node.yml
├── package.json
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | node_modules/
3 | 
4 | lib/
5 | 


--------------------------------------------------------------------------------
/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 |   "presets": [
3 |     "@babel/preset-env"
4 |   ]
5 | }
6 | 


--------------------------------------------------------------------------------
/test/helper.js:
--------------------------------------------------------------------------------
1 | /* eslint-env mocha */
2 | 
3 | import { assert } from 'chai'
4 | 
5 | export function testBreak (regExp, source, results) {
6 |   const res = source.match(regExp)
7 |   assert.deepEqual(res, results)
8 | }
9 | 


--------------------------------------------------------------------------------
/src/thai.js:
--------------------------------------------------------------------------------
1 | // spec: https://www.unicode.org/charts/PDF/U0E00.pdf
2 | 
3 | const letter = '[\\u0E00-\\u0E7F]'
4 | const trailingLetter = '[\\u0E31\\u0E33-\\u0E3A\\u0E47-\\u0E4E]'
5 | export const thai = `${letter}${trailingLetter}*`
6 | 


--------------------------------------------------------------------------------
/src/lao.js:
--------------------------------------------------------------------------------
1 | // spec: https://www.unicode.org/charts/PDF/U0E80.pdf
2 | 
3 | const letter = '[\\u{0E80}-\\u{0EFF}]'
4 | const trailingLetter = '[\\u{0EB1}\\u{0EB4}-\\u{0EBC}\\u{0EC8}-\\u{0ECD}]'
5 | export const lao = `${letter}${trailingLetter}*`
6 | 


--------------------------------------------------------------------------------
/src/khmer.js:
--------------------------------------------------------------------------------
1 | // spec: https://www.unicode.org/charts/PDF/U1780.pdf
2 | 
3 | const letter = '[\\u{1780}-\\u{17FF}]'
4 | const trailingLetter = '[\\u{17B6}-\\u{17D1}\\u{17D3}\\u{17DD}]'
5 | const control = '\\u{17D2}'
6 | export const khmer = `${letter}(${control}${letter}|${trailingLetter})*`
7 | 


--------------------------------------------------------------------------------
/src/tibetan.js:
--------------------------------------------------------------------------------
1 | // spec: https://www.unicode.org/charts/PDF/U0F00.pdf
2 | 
3 | const letter = '[\\u{0F00}-\\u{0FFF}]'
4 | const trailingLetter = '[\\0F18\\0F19\\0F35\\0F37\\0F39\\0F3E\\0F3F\\u{0F71}-\\u{0F87}\\u{0F8D}-\\u{0FBC}\\u{0FC6}]'
5 | export const tibetan = `${letter}${trailingLetter}*`
6 | 


--------------------------------------------------------------------------------
/src/hebrew.js:
--------------------------------------------------------------------------------
1 | // Hebrew
2 | //  https://www.unicode.org/charts/PDF/U0590.pdf
3 | //  https://unicode-table.com/blocks/hebrew/
4 | 
5 | const letter = '[\u05D0-\u05EA]'
6 | const combiningMark = '[\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7]'
7 | 
8 | export const hebrew = `${letter}${combiningMark}*`
9 | 


--------------------------------------------------------------------------------
/src/tamil.js:
--------------------------------------------------------------------------------
1 | // spec: https://www.unicode.org/charts/PDF/U0B80.pdf
2 | 
3 | const letter = '[\\u{0B80}-\\u{0BFF}]'
4 | const trailingLetter = '[\\u{0B82}-\\u{0B83}\\u{0BBE}-\\u{0BD7}\\u{0962}\\u{0963}]'
5 | // tamil's virama does not combine the following consonant
6 | export const tamil = `${letter}${trailingLetter}*`
7 | 


--------------------------------------------------------------------------------
/src/telugu.js:
--------------------------------------------------------------------------------
1 | // spec: https://www.unicode.org/charts/PDF/U0C00.pdf
2 | 
3 | const letter = '[\\u{0C00}-\\u{0C7F}]'
4 | const trailingLetter = '[\\u{0C00}-\\u{0C04}\\u{0C3E}-\\u{0C56}\\u{0C62}\\u{0C63}]'
5 | const control = '\\u{0C4D}' // Virama
6 | export const telugu = `${letter}(${control}${letter}|${trailingLetter})*`
7 | 


--------------------------------------------------------------------------------
/src/bengali.js:
--------------------------------------------------------------------------------
1 | // spec: https://www.unicode.org/charts/PDF/U0980.pdf
2 | 
3 | const letter = '[\\u{0980}-\\u{09FF}]'
4 | const trailingLetter = '[\\u{0980}-\\u{0983}\\u{09BC}-\\u{09D7}\\u{09E2}\\u{09E3}\\u{09FE}]'
5 | const control = '\\u{09CD}' // Virama
6 | export const bengali = `${letter}(${control}${letter}|${trailingLetter})*`
7 | 


--------------------------------------------------------------------------------
/src/devanagari.js:
--------------------------------------------------------------------------------
1 | // spec: https://www.unicode.org/charts/PDF/U0900.pdf
2 | 
3 | const letter = '[\\u{0900}-\\u{097F}]'
4 | const trailingLetter = '[\\u{0900}-\\u{0903}\\u{093A}-\\u{0957}\\u{0962}\\u{0963}]'
5 | const control = '\\u{094D}' // Virama
6 | export const devanagari = `${letter}(${control}${letter}|${trailingLetter})*`
7 | 


--------------------------------------------------------------------------------
/src/kannada.js:
--------------------------------------------------------------------------------
1 | // spec: https://www.unicode.org/charts/PDF/U0C80.pdf
2 | 
3 | const letter = '[\\u{0C80}-\\u{0CFF}]'
4 | const trailingLetter = '[\\u{0C81}-\\u{0C83}\\u{0CBC}\\u{0CBE}-\\u{0CCD}\\u{0CD5}\\u{0CD6}\\u{0CE2}\\u{0CE3}]'
5 | const control = '\\u{0CCD}' // Virama
6 | export const kannada = `${letter}(${control}${letter}|${trailingLetter})*`
7 | 


--------------------------------------------------------------------------------
/src/malayalam.js:
--------------------------------------------------------------------------------
1 | // spec: https://unicode.org/charts/PDF/U0D00.pdf
2 | 
3 | const letter = '[\\u{0D00}-\\u{0D7F}]'
4 | const trailingLetter = '[\\u{0D00}-\\u{0D03}\\u{0D3B}\\u{0D3C}\\u{0D3E}-\\u{0D4D}\\u{0D57}\\u{0D62}-\\u{0D63}]'
5 | const control = '\\u{0D4D}' // Virama
6 | export const malayalam = `${letter}(${control}${letter}|${trailingLetter})*`
7 | 


--------------------------------------------------------------------------------
/src/gujarati.js:
--------------------------------------------------------------------------------
1 | // spec: https://www.unicode.org/charts/PDF/U0A80.pdf
2 | 
3 | const letter = '[\\u{0A80}-\\u{0AFF}]'
4 | const trailingLetter = '[\\u{0A81}-\\u{0A83}\\u{0ABC}\\u{0ABE}-\\u{0ACD}\\u{0AE2}\\u{0AE3}\\u{0AFA}-\\u{0AFF}]'
5 | const control = '\\u{0ACD}' // Virama
6 | export const gujarati = `${letter}(${control}${letter}|${trailingLetter})*`
7 | 


--------------------------------------------------------------------------------
/.github/workflows/test-node.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on: [push]
 4 | 
 5 | jobs:
 6 |   build:
 7 |     runs-on: ubuntu-latest
 8 | 
 9 |     steps:
10 |     - uses: actions/checkout@v2
11 |     - name: Use Node.js 14.x
12 |       uses: actions/setup-node@v1
13 |       with:
14 |         node-version: '14.x'
15 |     - run: npm ci
16 |     - run: npm test
17 | 


--------------------------------------------------------------------------------
/test/gujarati.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { gujarati } from '../lib/gujarati'
 4 | import { testBreak } from './helper'
 5 | // Checks that the `gujarati` pattern, compiled with the `g` and `u` flags, splits the sample into the expected pieces.
 6 | describe('WordBreakGujarati', function () {
 7 |   it('break correctly', function () {
 8 |     const regExp = new RegExp(gujarati, 'gu')
 9 |     testBreak(regExp, 'ગુજરાતી', ['ગુ', 'જ', 'રા', 'તી'])
10 |   })
11 | })
12 | 


--------------------------------------------------------------------------------
/test/kannada.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { kannada } from '../lib/kannada'
 4 | import { testBreak } from './helper'
 5 | // Checks that the `kannada` pattern, compiled with the `g` and `u` flags, splits each sample into the expected pieces.
 6 | describe('WordBreakKannada', function () {
 7 |   it('break correctly', function () {
 8 |     const regExp = new RegExp(kannada, 'gu')
 9 |     testBreak(regExp, 'ನಮಸ್ಕಾರ', ['ನ', 'ಮ', 'ಸ್ಕಾ', 'ರ'])
10 |     testBreak(regExp, 'ನನ್ನ', ['ನ', 'ನ್ನ'])
11 |   })
12 | })
13 | 


--------------------------------------------------------------------------------
/test/malayalam.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { malayalam } from '../lib/malayalam'
 4 | import { testBreak } from './helper'
 5 | // Checks that the `malayalam` pattern, compiled with the `g` and `u` flags, splits each sample into the expected pieces.
 6 | describe('WordBreakMalayalam', function () {
 7 |   it('break correctly', function () {
 8 |     const regExp = new RegExp(malayalam, 'gu')
 9 |     testBreak(regExp, 'പറ്റി', ['പ', 'റ്റി'])
10 |     testBreak(regExp, 'വളരെ', ['വ', 'ള', 'രെ'])
11 |   })
12 | })
13 | 


--------------------------------------------------------------------------------
/test/bengali.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { bengali } from '../lib/bengali'
 4 | import { testBreak } from './helper'
 5 | // Checks that the `bengali` pattern, compiled with the `g` and `u` flags, splits each sample into the expected pieces.
 6 | describe('WordBreakBengali', function () {
 7 |   it('break correctly', function () {
 8 |     const regExp = new RegExp(bengali, 'gu')
 9 |     testBreak(regExp, 'দ্ধ', ['দ্ধ'])
10 |     testBreak(regExp, 'স্ত্র', ['স্ত্র'])
11 |     testBreak(regExp, 'নমস্কার', ['ন', 'ম', 'স্কা', 'র'])
12 |   })
13 | })
14 | 


--------------------------------------------------------------------------------
/test/myanmar.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { myanmar } from '../lib/myanmar'
 4 | import { testBreak } from './helper'
 5 | // Checks that the `myanmar` pattern, compiled with the `g` and `u` flags, splits each sample into the expected pieces.
 6 | describe('WordBreakMyanmar', function () {
 7 |   it('break correctly', function () {
 8 |     const regExp = new RegExp(myanmar, 'gu')
 9 |     testBreak(regExp, 'နံနက်', ['နံ', 'န', 'က်'])
10 |     testBreak(regExp, 'မင်္ဂလာနံနက်ခင်းပါ', ['မ', 'င်္ဂ', 'လာ', 'နံ', 'န', 'က်', 'ခ', 'င်း', 'ပါ'])
11 |   })
12 | })
13 | 


--------------------------------------------------------------------------------
/test/tibetan.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { tibetan } from '../lib/tibetan'
 4 | import { testBreak } from './helper'
 5 | // Checks that the `tibetan` pattern, compiled with the `g` and `u` flags, splits each sample into the expected pieces.
 6 | describe('WordBreakTibetan', function () {
 7 |   it('break correctly', function () {
 8 |     const regExp = new RegExp(tibetan, 'gu')
 9 |     testBreak(regExp, 'གཟིགས་དང་', ['ག', 'ཟི', 'ག', 'ས', '་', 'ད', 'ང', '་'])
10 |     testBreak(regExp, 'ལགས་མིན་', ['ལ', 'ག', 'ས', '་', 'མི', 'ན', '་'])
11 |   })
12 | })
13 | 


--------------------------------------------------------------------------------
/test/devanagari.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { devanagari } from '../lib/devanagari'
 4 | import { testBreak } from './helper'
 5 | // Checks that the `devanagari` pattern, compiled with the `g` and `u` flags, splits each sample into the expected pieces.
 6 | describe('WordBreakDevanagari', function () {
 7 |   it('break correctly', function () {
 8 |     const regExp = new RegExp(devanagari, 'gu')
 9 |     testBreak(regExp, 'वाह', ['वा', 'ह'])
10 |     testBreak(regExp, 'शुभ', ['शु', 'भ'])
11 |     testBreak(regExp, 'रात्रि', ['रा', 'त्रि'])
12 |   })
13 | })
14 | 


--------------------------------------------------------------------------------
/test/tamil.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { tamil } from '../lib/tamil'
 4 | import { testBreak } from './helper'
 5 | // Checks that the `tamil` pattern, compiled with the `g` and `u` flags, splits each sample into the expected pieces.
 6 | describe('WordBreakTamil', function () {
 7 |   it('break correctly', function () {
 8 |     const regExp = new RegExp(tamil, 'gu')
 9 |     testBreak(regExp, 'ஹலோ', ['ஹ', 'லோ'])
10 |     testBreak(regExp, 'குடீவ்னிங்', ['கு', 'டீ', 'வ்', 'னி', 'ங்'])
11 |     testBreak(regExp, 'ஓக்கே', ['ஓ', 'க்', 'கே'])
12 |   })
13 | })
14 | 


--------------------------------------------------------------------------------
/test/lao.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { lao } from '../lib/lao'
 4 | import { testBreak } from './helper'
 5 | // Checks that the `lao` pattern, compiled with the `g` and `u` flags, splits each sample into the expected pieces.
 6 | describe('WordBreakLao', function () {
 7 |   it('break correctly', function () {
 8 |     const regExp = new RegExp(lao, 'gu')
 9 |     testBreak(regExp, 'ສະບາຍດີ', ['ສ', 'ະ', 'ບ', 'າ', 'ຍ', 'ດີ'])
10 |     testBreak(regExp, 'ກະລຸນາຮູ້ສຶກບໍ່ເສຍຄ່າ', ['ກ', 'ະ', 'ລຸ', 'ນ', 'າ', 'ຮູ້', 'ສຶ', 'ກ', 'ບໍ່', 'ເ', 'ສ', 'ຍ', 'ຄ່', 'າ'])
11 |   })
12 | })
13 | 


--------------------------------------------------------------------------------
/test/telugu.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { telugu } from '../lib/telugu'
 4 | import { testBreak } from './helper'
 5 | // Checks that the `telugu` pattern, compiled with the `g` and `u` flags, splits each sample into the expected pieces.
 6 | describe('WordBreakTelugu', function () {
 7 |   it('break correctly', function () {
 8 |     const regExp = new RegExp(telugu, 'gu')
 9 |     testBreak(regExp, 'నమస్కారం', ['న', 'మ', 'స్కా', 'రం'])
10 |     testBreak(regExp, 'పుట్టపర్తి', ['పు', 'ట్ట', 'ప', 'ర్తి'])
11 |     testBreak(regExp, 'ఉన్నారు', ['ఉ', 'న్నా', 'రు'])
12 |   })
13 | })
14 | 


--------------------------------------------------------------------------------
/src/japanese-kana.js:
--------------------------------------------------------------------------------
 1 | // spec
 2 | // Hiragana https://www.unicode.org/charts/PDF/U3040.pdf
 3 | // Katakana https://www.unicode.org/charts/PDF/U30A0.pdf
 4 | 
 5 | const hiragana = '[\\u{3041}-\\u{3096}\\u{309D}-\\u{309F}]'
 6 | const katakana = '[\\u{30A0}-\\u{30FF}]'
 7 | const halfSizeSoundMark = '[\\u{3099}-\\u{309A}]' // soundmark NFD
 8 | const fullSizeSoundMark = '[\\u{309B}-\\u{309C}]'
 9 | 
10 | export const japaneseKana = `((${katakana}|${hiragana})${halfSizeSoundMark}?|${fullSizeSoundMark})`
11 | 


--------------------------------------------------------------------------------
/test/thai.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { thai } from '../lib/thai'
 4 | import { testBreak } from './helper'
 5 | // Checks that the `thai` pattern, compiled with the `g` and `u` flags, splits each sample into the expected pieces.
 6 | describe('WordBreakThai', function () {
 7 |   it('break correctly', function () {
 8 |     const regExp = new RegExp(thai, 'gu')
 9 |     testBreak(regExp, 'ไม้ทัณฑฆาต', ['ไ', 'ม้', 'ทั', 'ณ', 'ฑ', 'ฆ', 'า', 'ต'])
10 |     testBreak(regExp, 'ข้าวมันไก่', ['ข้', 'า', 'ว', 'มั', 'น', 'ไ', 'ก่'])
11 |     testBreak(regExp, 'ลำแสง', ['ลำ', 'แ', 'ส', 'ง'])
12 |   })
13 | })
14 | 


--------------------------------------------------------------------------------
/test/khmer.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { khmer } from '../lib/khmer'
 4 | import { testBreak } from './helper'
 5 | // Checks that the `khmer` pattern, compiled with the `g` and `u` flags, splits each sample into the expected pieces.
 6 | describe('WordBreakKhmer', function () {
 7 |   it('break correctly', function () {
 8 |     const regExp = new RegExp(khmer, 'gu')
 9 |     testBreak(regExp, 'ស្រីៗ', ['ស្រី', 'ៗ'])
10 |     testBreak(regExp, 'ច្រើនឡើងៗ', ['ច្រើ', 'ន', 'ឡើ', 'ង', 'ៗ'])
11 |     testBreak(regExp, 'ប៉ុស្ដិ៍', ['ប៉ុ', 'ស្ដិ៍'])
12 |     testBreak(regExp, 'ផម្រាបសួរ', ['ផ', 'ម្រា', 'ប', 'សួ', 'រ'])
13 |   })
14 | })
15 | 


--------------------------------------------------------------------------------
/src/arabic.js:
--------------------------------------------------------------------------------
 1 | // Arabic
 2 | //  https://www.unicode.org/charts/PDF/U0600.pdf
 3 | //  https://unicode-table.com/blocks/arabic/
 4 | // Arabic supplement
 5 | //  https://www.unicode.org/charts/PDF/U0750.pdf
 6 | //  https://unicode-table.com/blocks/arabic-supplement/
 7 | 
 8 | const arabicLetter = '\u0620-\u064A\u066E-\u066F\u0671-\u06D5\u06EE-\u06EF\u06FA-\u06FF'
 9 | const arabicSupplementLetter = '\u0750-\u077F'
10 | 
11 | const letter = `[${arabicLetter}${arabicSupplementLetter}]`
12 | const combiningMark = '[\u064B-\u065F\u0670]'
13 | 
14 | export const arabic = `${letter}${combiningMark}*`
15 | 


--------------------------------------------------------------------------------
/src/myanmar.js:
--------------------------------------------------------------------------------
 1 | // spec: https://www.unicode.org/charts/PDF/U1000.pdf
 2 | 
 3 | const letter = '[\\u{1000}-\\u{109F}]'
 4 | const trailingLetterRange = [
 5 |   '\\u{102B}-\\u{1038}',
 6 |   '\\u{103A}-\\u{103E}',
 7 |   '\\u{1056}-\\u{1059}',
 8 |   '\\u{105E}-\\u{1060}',
 9 |   '\\u{1062}-\\u{1064}',
10 |   '\\u{1067}-\\u{106D}',
11 |   '\\u{1071}-\\u{1074}',
12 |   '\\u{1082}-\\u{108D}',
13 |   '\\u{108F}',
14 |   '\\u{109A}-\\u{109D}'
15 | ]
16 | const trailingLetter = `[${trailingLetterRange.join('')}]`
17 | const control = '\\u{1039}'
18 | export const myanmar = `${letter}(${control}${letter}|${trailingLetter})*`
19 | 


--------------------------------------------------------------------------------
/test/japanese-kana.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { japaneseKana } from '../lib/japanese-kana'
 4 | import { testBreak } from './helper'
 5 | 
 6 | describe('WordBreakJapaneseHiragana', function () {
 7 |   const regExp = new RegExp(japaneseKana, 'gu')
 8 | 
 9 |   it('breaks plain kana', function () {
10 |     testBreak(regExp, 'こんにちは', ['こ', 'ん', 'に', 'ち', 'は'])
11 |     testBreak(regExp, 'ハヒフヘホ', ['ハ', 'ヒ', 'フ', 'ヘ', 'ホ'])
12 |   })
13 | 
14 |   it('breaks kana with soundmark', function () {
15 |     testBreak(regExp, 'ごん゙に゙ぢば', ['ご', 'ん゙', 'に゙', 'ぢ', 'ば'])
16 |     testBreak(regExp, 'パピプペポ', ['パ', 'ピ', 'プ', 'ペ', 'ポ'])
17 |   })
18 | 
19 |   it('does not break fullsize soundmark', function () {
20 |     testBreak(regExp, 'こ゛ん゛', ['こ', '゛', 'ん', '゛'])
21 |     testBreak(regExp, 'ハ゜ヒ゜', ['ハ', '゜', 'ヒ', '゜'])
22 |   })
23 | })
24 | 


--------------------------------------------------------------------------------
/src/emoji.js:
--------------------------------------------------------------------------------
 1 | // spec:
 2 | //   https://en.wikipedia.org/wiki/Unicode_block
 3 | 
 4 | export const countryFlag = '[\\u{1F1E6}-\\u{1F1FF}]{2}'
 5 | 
 6 | export const keyCap = '[0-9#\\*][\\u{FE0F}]?\\u{20E3}'
 7 | 
 8 | const emojiRange = [
 9 |   '[\\u{2600}-\\u{26FF}]', // Miscellaneous Symbols
10 |   '[\\u{2700}-\\u{27BF}]', // Dingbats
11 |   '[\\u{1F300}-\\u{1F5FF}]', // Miscellaneous Symbols and Pictographs
12 |   '[\\u{1F600}-\\u{1F64F}]', // Emoticons
13 |   '[\\u{1F680}-\\u{1F6FF}]', // Transport and Map Symbols
14 |   '[\\u{1F700}-\\u{1F77F}]', // Alchemical Symbols
15 |   '[\\u{1F900}-\\u{1F9FF}]' // Supplemental Symbols and Pictographs
16 | ]
17 | const emoji = `(${emojiRange.join('|')})`
18 | const zeroWidthJoinder = '\\u{200D}'
19 | const variationSeletor = '[\\u{FE0E}\\u{FE0F}]'
20 | const skinTone = '[\\u{1F3FB}-\\u{1F3FF}]'
21 | 
22 | export const emojiVariation = `${emoji}(${zeroWidthJoinder}${emoji}|${skinTone}|${variationSeletor})*`
23 | 


--------------------------------------------------------------------------------
/test/hebrew.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { assert } from 'chai'
 4 | import { hebrew } from '../lib/hebrew'
 5 | import { testBreak } from './helper'
 6 | 
 7 | describe('WordBreakHebrew', function () {
 8 |   const regExp = new RegExp(hebrew, 'gu')
 9 | 
10 |   it('breaks single letters', function () {
11 |     const israel = 'ישראל'
12 |     assert.lengthOf(israel, 5)
13 |     testBreak(regExp, israel, [
14 |       'י',
15 |       'ש',
16 |       'ר',
17 |       'א',
18 |       'ל'
19 |     ])
20 |   })
21 | 
22 |   it('breaks letter + combining mark', function () {
23 |     const japan = 'יָפּן'
24 |     assert.lengthOf(japan, 5)
25 |     testBreak(regExp, japan, [
26 |       'יָ',
27 |       'פּ',
28 |       'ן'
29 |     ])
30 |   })
31 | 
32 |   it('breaks single letter + multiple combining marks', function () {
33 |     const israel = 'יִשְׂרָאֵל'
34 |     assert.lengthOf(israel, 10)
35 |     testBreak(regExp, israel, [
36 |       'יִ',
37 |       'שְׂ',
38 |       'רָ',
39 |       'אֵ',
40 |       'ל'
41 |     ])
42 |   })
43 | })
44 | 


--------------------------------------------------------------------------------
/test/arabic.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { assert } from 'chai'
 4 | import { arabic } from '../lib/arabic'
 5 | import { testBreak } from './helper'
 6 | 
 7 | describe('WordBreakArabic', function () {
 8 |   const regExp = new RegExp(arabic, 'gu')
 9 | 
10 |   it('breaks single letters', function () {
11 |     const hello = 'مرحبا'
12 |     assert.lengthOf(hello, 5)
13 |     testBreak(regExp, hello, [
14 |       'م',
15 |       'ر',
16 |       'ح',
17 |       'ب',
18 |       'ا'
19 |     ])
20 |   })
21 | 
22 |   it('breaks letter + combining mark', function () {
23 |     const note = 'نِيهُون'
24 |     assert.lengthOf(note, 7)
25 |     testBreak(regExp, note, [
26 |       'نِ',
27 |       'ي',
28 |       'هُ',
29 |       'و',
30 |       'ن'
31 |     ])
32 |   })
33 | 
34 |   it('breaks single letter + multiple combining marks', function () {
35 |     const nippon = 'نِيپُّونْ'
36 |     assert.lengthOf(nippon, 9)
37 |     testBreak(regExp, nippon, [
38 |       'نِ',
39 |       'ي',
40 |       'پُّ',
41 |       'و',
42 |       'نْ'
43 |     ])
44 |   })
45 | })
46 | 


--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
 1 | import { arabic } from './arabic'
 2 | import { bengali } from './bengali'
 3 | import { devanagari } from './devanagari'
 4 | import { gujarati } from './gujarati'
 5 | import { hebrew } from './hebrew'
 6 | import { japaneseKana } from './japanese-kana'
 7 | import { kannada } from './kannada'
 8 | import { khmer } from './khmer'
 9 | import { lao } from './lao'
10 | import { malayalam } from './malayalam'
11 | import { myanmar } from './myanmar'
12 | import { tamil } from './tamil'
13 | import { telugu } from './telugu'
14 | import { thai } from './thai'
15 | import { tibetan } from './tibetan'
16 | import { countryFlag, keyCap, emojiVariation } from './emoji'
17 | 
18 | const patterns = [
19 |   countryFlag, keyCap, emojiVariation,
20 |   arabic,
21 |   bengali, devanagari, gujarati, hebrew,
22 |   japaneseKana,
23 |   kannada, khmer, lao, malayalam, myanmar,
24 |   tamil, telugu, thai, tibetan,
25 |   '.'
26 | ]
27 | 
28 | const splitter = new RegExp(`(${patterns.join('|')})`, 'gu')
29 | 
30 | function splitGraphemes (str) {
31 |   return str.match(splitter) || []
32 | }
33 | 
34 | module.exports = { splitGraphemes }
35 | 


--------------------------------------------------------------------------------
/test/index.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { assert } from 'chai'
 4 | import { splitGraphemes } from '../'
 5 | 
 6 | function testBreak (source, results) {
 7 |   const res = splitGraphemes(source)
 8 |   assert.deepEqual(res, results)
 9 | }
10 | 
11 | describe('WordBreak', function () {
12 |   it('break correctly', function () {
13 |     testBreak('✌🏽Hello 🌏✌️', ['✌🏽', 'H', 'e', 'l', 'l', 'o', ' ', '🌏', '✌️'])
14 |     testBreak('more and ច្រើនឡើងៗ!มีความสุข',
15 |       ['m', 'o', 'r', 'e', ' ', 'a', 'n', 'd', ' ', 'ច្រើ', 'ន', 'ឡើ', 'ង', 'ៗ', '!', 'มี', 'ค', 'ว', 'า', 'ม', 'สุ', 'ข'])
16 |     testBreak('[អ៊ីនធឺណិត] [* អ៊ីនធឺណិត]',
17 |       ['[', 'អ៊ី', 'ន', 'ធឺ', 'ណិ', 'ត', ']', ' ', '[', '*', ' ', 'អ៊ី', 'ន', 'ធឺ', 'ណិ', 'ត', ']'])
18 |   })
19 | 
20 |   it('break arabic', function () {
21 |     testBreak('نِيپُّونْ', [
22 |       'نِ',
23 |       'ي',
24 |       'پُّ',
25 |       'و',
26 |       'نْ'
27 |     ])
28 |   })
29 | 
30 |   it('break hebrew', function () {
31 |     testBreak('יִשְׂרָאֵל', [
32 |       'יִ',
33 |       'שְׂ',
34 |       'רָ',
35 |       'אֵ',
36 |       'ל'
37 |     ])
38 |   })
39 | })
40 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "split-graphemes",
 3 |   "version": "0.5.0",
 4 |   "description": "Divide the string into graphemes.",
 5 |   "main": "lib/index.js",
 6 |   "scripts": {
 7 |     "build": "run-s build:*",
 8 |     "build:clean": "if test -d ./lib; then rm -r ./lib; fi",
 9 |     "build:babel": "babel src/ --out-dir lib/",
10 |     "pretest": "npm run build",
11 |     "test": "run-s test:*",
12 |     "test:standard": "standard",
13 |     "test:mocha": "mocha test/*.js -r @babel/register -r @babel/polyfill --exit",
14 |     "prepublishOnly": "npm test",
15 |     "postpublish": "git push origin --tags && git push origin master"
16 |   },
17 |   "keywords": [
18 |     "grapheme",
19 |     "emoji",
20 |     "abugida",
21 |     "arabic",
22 |     "brahmic",
23 |     "bengali",
24 |     "devanagari",
25 |     "gujarati",
26 |     "hebrew",
27 |     "japanese",
28 |     "kannada",
29 |     "khmer",
30 |     "lao",
31 |     "malayalam",
32 |     "myanmar",
33 |     "NFD",
34 |     "tamil",
35 |     "telugu",
36 |     "thai",
37 |     "tibetan"
38 |   ],
39 |   "files": [
40 |     "README.md",
41 |     "package.json",
42 |     "lib"
43 |   ],
44 |   "author": "Daiki Iizuka <daiiz@notainc.com>",
45 |   "license": "MIT",
46 |   "devDependencies": {
47 |     "@babel/cli": "^7.4.3",
48 |     "@babel/core": "^7.4.3",
49 |     "@babel/polyfill": "^7.4.3",
50 |     "@babel/preset-env": "^7.4.3",
51 |     "@babel/register": "^7.4.0",
52 |     "chai": "^4.2.0",
53 |     "mocha": "^6.1.3",
54 |     "npm-run-all": "^4.1.5",
55 |     "standard": "^12.0.1"
56 |   },
57 |   "standard": {
58 |     "ignore": [
59 |       "/lib"
60 |     ]
61 |   },
62 |   "repository": {
63 |     "type": "git",
64 |     "url": "https://github.com/nota/split-graphemes.git"
65 |   }
66 | }
67 | 


--------------------------------------------------------------------------------
/test/emoji.js:
--------------------------------------------------------------------------------
 1 | /* eslint-env mocha */
 2 | 
 3 | import { emojiVariation, keyCap, countryFlag } from '../lib/emoji'
 4 | import { testBreak } from './helper'
 5 | 
 6 | describe('WordBreakEmoji', function () {
 7 |   it('break correctly keyCap', function () {
 8 |     const regExp = new RegExp(keyCap, 'gu')
 9 |     testBreak(regExp, '8️⃣9️⃣', ['8️⃣', '9️⃣'])
10 |     testBreak(regExp, '#️⃣*️⃣', ['#️⃣', '*️⃣'])
11 |   })
12 | 
13 |   it('break correctly countryFlag', function () {
14 |     const regExp = new RegExp(countryFlag, 'gu')
15 |     testBreak(regExp, '🇯🇵🇨🇦', ['🇯🇵', '🇨🇦'])
16 |   })
17 | 
18 |   describe('break correctly emojiVariation', function () {
19 |     const regExp = new RegExp(emojiVariation, 'gu')
20 | 
21 |     // https://lets-emoji.com/emojilist/
22 |     it('smile', function () {
23 |       testBreak(regExp, '😀', ['😀'])
24 |     })
25 | 
26 |     it('skin-tone', function () {
27 |       testBreak(regExp, '🤛', ['🤛'])
28 |       testBreak(regExp, '👧🏽', ['👧🏽'])
29 |     })
30 | 
31 |     it('family', function () {
32 |       testBreak(regExp, '👨‍👩‍👦‍👦', ['👨‍👩‍👦‍👦'])
33 |     })
34 | 
35 |     it('person-role', function () {
36 |       testBreak(regExp, '👨‍🏫', ['👨‍🏫'])
37 |       testBreak(regExp, '👩🏾‍⚖️', ['👩🏾‍⚖️'])
38 |       testBreak(regExp, '👩🏾‍✈️', ['👩🏾‍✈️'])
39 |     })
40 | 
41 |     it('person-fantasy', function () {
42 |       testBreak(regExp, '👼', ['👼'])
43 |       testBreak(regExp, '👼🏽', ['👼🏽'])
44 |     })
45 | 
46 |     it('flag', function () {
47 |       testBreak(regExp, '🏳️‍🌈', ['🏳️‍🌈'])
48 |       testBreak(regExp, '🏴‍☠️', ['🏴‍☠️'])
49 |     })
50 | 
51 |     it('variation selector', function () {
52 |       // u260E uFE0E
53 |       testBreak(regExp, '☎', ['☎'])
54 |       // u260E uFE0F
55 |       testBreak(regExp, '☎️', ['☎️'])
56 |     })
57 | 
58 |     it('multiple of emojis', function () {
59 |       testBreak(regExp, '👨‍👩‍👦‍👦👨‍👩‍👧', ['👨‍👩‍👦‍👦', '👨‍👩‍👧'])
60 |       testBreak(regExp, '☎☎️', ['☎', '☎️'])
61 |       testBreak(regExp, '🤜🏾🤘🏾', ['🤜🏾', '🤘🏾'])
62 |     })
63 |   })
64 | })
65 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # split-graphemes
 2 | 
 3 | Divide strings containing ligature letters (such as Thai and Khmer letters) and complex emoji into an array of [graphemes](https://en.wikipedia.org/wiki/Grapheme).
 4 | You can simply use this library instead of `Array.from` to get graphemes.
 5 | 
 6 | [![Tests](https://github.com/nota/split-graphemes/actions/workflows/test-node.yml/badge.svg?branch=master)](https://github.com/nota/split-graphemes/actions/workflows/test-node.yml)
 7 | 
 8 | ## Installation
 9 | ```
10 | $ npm install split-graphemes
11 | ```
12 | 
13 | ## Examples
14 | ### Emoji
15 | 
16 | ```js
17 | // An emoji '👨‍👩‍👦‍👦' consists of 4 people face emoji joined by Zero Width Joiners (ZWJ).
18 | const chars = Array.from('👨‍👩‍👦‍👦') // ['👨', ZWJ, '👩', ZWJ, '👦', ZWJ, '👦']
19 | ```
20 | 
21 | ```js
22 | // It is interpreted exactly as one character!
23 | const chars = splitGraphemes('👨‍👩‍👦‍👦') // ['👨‍👩‍👦‍👦']
24 | ```
25 | 
26 | ### Khmer characters
27 | 
28 | ```js
29 | Array.from('ប៉ុស្ដិ៍') // ['ប', '៉', 'ុ', 'ស', '្', 'ដ', 'ិ', '៍']
30 | ```
31 | 
32 | ```js
33 | splitGraphemes('ប៉ុស្ដិ៍') // ['ប៉ុ', 'ស្ដិ៍']
34 | ```
35 | 
36 | ### Japanese NFD
37 | ```js
38 | splitGraphemes('ごん゙に゙ぢば') // ['ご', 'ん゙', 'に゙', 'ぢ', 'ば']
39 | splitGraphemes('パピプペポ') // ['パ', 'ピ', 'プ', 'ペ', 'ポ']
40 | ```
41 | 
42 | ### English
43 | ```js
44 | splitGraphemes('Hello') // ['H', 'e', 'l', 'l', 'o']
45 | ```
46 | 
47 | ## Supported ligature characters
48 | The supported character ranges are defined [here](https://github.com/nota/split-graphemes/tree/master/src).
49 | - [Emoji](https://en.wikipedia.org/wiki/Unicode_block)
50 | - [Arabic](https://www.unicode.org/charts/PDF/U0600.pdf) and [Arabic supplement](https://www.unicode.org/charts/PDF/U0750.pdf)
51 | - [Bengali](https://www.unicode.org/charts/PDF/U0980.pdf)
52 | - [Devanagari](https://www.unicode.org/charts/PDF/U0900.pdf)
53 | - [Gujarati](https://www.unicode.org/charts/PDF/U0A80.pdf)
54 | - [Hebrew](https://www.unicode.org/charts/PDF/U0590.pdf)
55 | - [Japanese Hiragana](https://www.unicode.org/charts/PDF/U3040.pdf) and [Katakana](https://www.unicode.org/charts/PDF/U30A0.pdf) NFD
56 | - [Kannada](https://www.unicode.org/charts/PDF/U0C80.pdf)
57 | - [Khmer](https://www.unicode.org/charts/PDF/U1780.pdf)
58 | - [Lao](https://www.unicode.org/charts/PDF/U0E80.pdf)
59 | - [Malayalam](https://unicode.org/charts/PDF/U0D00.pdf)
60 | - [Myanmar](https://www.unicode.org/charts/PDF/U1000.pdf)
61 | - [Tamil](https://www.unicode.org/charts/PDF/U0B80.pdf)
62 | - [Telugu](https://www.unicode.org/charts/PDF/U0C00.pdf)
63 | - [Thai](https://www.unicode.org/charts/PDF/U0E00.pdf)
64 | - [Tibetan](https://www.unicode.org/charts/PDF/U0F00.pdf)
65 | 


--------------------------------------------------------------------------------