├── .npmrc ├── .gitattributes ├── .husky ├── pre-commit ├── post-commit └── _ │ └── husky.sh ├── .prettierrc.json ├── .gitignore ├── renovate.json ├── .editorconfig ├── eslint.config.mjs ├── tsconfig.json ├── jest.config.mjs ├── license ├── src ├── index.test.ts └── index.ts ├── benchmark.ts ├── package.json └── readme.md /.npmrc: -------------------------------------------------------------------------------- 1 | package-lock=false 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | *.js text eol=lf 3 | -------------------------------------------------------------------------------- /.husky/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | . "$(dirname "$0")/_/husky.sh" 3 | yarn lint-staged 4 | -------------------------------------------------------------------------------- /.husky/post-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | . 
"$(dirname "$0")/_/husky.sh" 3 | git update-index --again 4 | -------------------------------------------------------------------------------- /.prettierrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "printWidth": 100, 3 | "singleQuote": true, 4 | "bracketSpacing": false 5 | } 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | coverage/ 3 | node_modules/ 4 | lib/ 5 | temp 6 | yarn.lock 7 | *.log 8 | !.husky/_/husky.sh 9 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["github>shelfio/renovate-config-public"], 3 | "labels": ["backend"], 4 | "ignoreDeps": [ 5 | "cimg/node" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | charset = utf-8 6 | trim_trailing_whitespace = true 7 | insert_final_newline = true 8 | indent_style = space 9 | indent_size = 2 10 | -------------------------------------------------------------------------------- /eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import rules from '@shelf/eslint-config/typescript.js'; 2 | 3 | export default [ 4 | ...rules, 5 | {files: ['**/*.js', '**/*.jsx', '**/*.ts', '**/*.tsx', '**/*.json']}, 6 | { 7 | ignores: ['**/node_modules/', '**/coverage/', '**/lib/', 'renovate.json', 'tsconfig.json'], 8 | }, 9 | ]; 10 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": 
"@shelf/tsconfig/backend", 3 | "compilerOptions": { 4 | "strict": true, 5 | "module": "ESNext", 6 | "target": "ESNext", 7 | "moduleResolution": "bundler", 8 | "declaration": true, 9 | "resolveJsonModule": true, 10 | "declarationMap": true, 11 | "outDir": "lib" 12 | }, 13 | "exclude": ["node_modules", "**/*.test.*", "**/mocks.ts"], 14 | "include": ["src"] 15 | } 16 | -------------------------------------------------------------------------------- /jest.config.mjs: -------------------------------------------------------------------------------- 1 | const ES_PACKAGES_TO_TRANSFORM = []; 2 | 3 | /** @type {import('jest').Config} */ 4 | const config = { 5 | collectCoverageFrom: ['src/**/*.ts', '!src/**/types.ts'], 6 | reporters: ['default'], 7 | transform: { 8 | '^.+\\.(t|j)sx?$': '@swc/jest', 9 | }, 10 | resolver: 'ts-jest-resolver', 11 | transformIgnorePatterns: [ 12 | `node_modules/(?!(${ES_PACKAGES_TO_TRANSFORM.join('|')}))/node_modules/.+\\.js`, 13 | ], 14 | }; 15 | 16 | export default config; 17 | -------------------------------------------------------------------------------- /.husky/_/husky.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ -z "$husky_skip_init" ]; then 3 | debug () { 4 | if [ "$HUSKY_DEBUG" = "1" ]; then 5 | echo "husky (debug) - $1" 6 | fi 7 | } 8 | 9 | readonly hook_name="$(basename "$0")" 10 | debug "starting $hook_name..." 11 | 12 | if [ "$HUSKY" = "0" ]; then 13 | debug "HUSKY env variable is set to 0, skipping hook" 14 | exit 0 15 | fi 16 | 17 | if [ -f ~/.huskyrc ]; then 18 | debug "sourcing ~/.huskyrc" 19 | . ~/.huskyrc 20 | fi 21 | 22 | export readonly husky_skip_init=1 23 | sh -e "$0" "$@" 24 | exitCode="$?" 
25 | 26 | if [ $exitCode != 0 ]; then 27 | echo "husky - $hook_name hook exited with code $exitCode (error)" 28 | fi 29 | 30 | exit $exitCode 31 | fi 32 | -------------------------------------------------------------------------------- /license: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Gemshelf Inc. (shelf.io) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 |
--------------------------------------------------------------------------------
/src/index.test.ts:
--------------------------------------------------------------------------------
1 | import fastChunkString from './index.js';
2 |
3 | const text = 'hello my dear world';
4 |
5 | it('should export a module', () => {
6 |   expect(fastChunkString).toBeInstanceOf(Function);
7 | });
8 |
9 | it('should handle empty string', () => {
10 |   expect(fastChunkString('', {size: 4})).toEqual([]);
11 | });
12 |
13 | it('should handle non string', () => {
14 |   expect(fastChunkString(null as never, {size: 4})).toEqual([]);
15 | });
16 |
17 | it('should split string into chunks of even number of chars', () => {
18 |   expect(fastChunkString(text, {size: 4})).toEqual(['hell', 'o my', ' dea', 'r wo', 'rld']);
19 | });
20 |
21 | it('should split string even if chunk size is larger', () => {
22 |   expect(fastChunkString(text, {size: 4000})).toEqual([text]);
23 | });
24 |
25 | it('should split string into chunks of odd number of chars', () => {
26 |   expect(fastChunkString(text, {size: 3})).toEqual(['hel', 'lo ', 'my ', 'dea', 'r w', 'orl', 'd']);
27 | });
28 |
29 | // Without unicodeAware, sizes count UTF-16 code units, and each of these emoji takes two.
30 | it('should split emojis to 1 even if asked for 2', () => {
31 |   expect(fastChunkString('😀😃😄😁', {size: 2})).toEqual(['😀', '😃', '😄', '😁']);
32 | });
33 |
34 | it('should split emojis correctly w/ unicodeAware option', () => {
35 |   expect(fastChunkString('😀😃😄😁', {size: 2, unicodeAware: true})).toEqual(['😀😃', '😄😁']);
36 | });
37 |
38 | it('should split emojis correctly w/ unicodeAware option for odd chunk length', () => {
39 |   expect(fastChunkString('😀😃😄', {size: 2, unicodeAware: true})).toEqual(['😀😃', '😄']);
40 | });
41 |
42 | it('should coerce fractional unicode chunk sizes like slice does', () => {
43 |   expect(fastChunkString('abcdef', {size: 2.5, unicodeAware: true})).toEqual(['ab', 'cde', 'f']);
44 | });
45 |
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
1 | import runes from 'runes';
2 |
3 | /**
4 |  * Split a string into consecutive chunks of at most `size` characters.
5 |  *
6 |  * By default chunks are measured in UTF-16 code units: the fastest path, but it
7 |  * can cut a surrogate pair (e.g. an emoji) in half. With `unicodeAware` the
8 |  * string is first split with `runes()` and chunks are measured in runes.
9 |  *
10 |  * A fractional `size` is handled the same way in both modes: chunk boundaries
11 |  * are truncated to integers, like `String.prototype.slice` does.
12 |  *
13 |  * @param original String to split; falsy values are treated as an empty string.
14 |  * @param options.size Maximum chunk length, must be greater than zero.
15 |  * @param options.unicodeAware Measure chunks in runes instead of code units.
16 |  * @returns Chunks in source order; an empty array for empty input.
17 |  * @throws {RangeError} When the input is non-empty and `size` is not a positive number.
18 |  */
19 | function fastChunkString(
20 |   original: string,
21 |   {
22 |     size,
23 |     unicodeAware = false,
24 |   }: {
25 |     size: number;
26 |     unicodeAware?: boolean;
27 |   },
28 | ): string[] {
29 |   // Untyped callers may pass null/undefined; treat those as "nothing to chunk".
30 |   const str = original || '';
31 |
32 |   if (str.length === 0) {
33 |     return [];
34 |   }
35 |
36 |   // Reject zero, negative and NaN sizes up front: they would otherwise reach
37 |   // `new Array()` as an invalid or meaningless chunk count.
38 |   if (!(size > 0)) {
39 |     throw new RangeError(`Expected "size" to be a positive number, got ${size}`);
40 |   }
41 |
42 |   return unicodeAware ? getChunksUnicode(str, size) : getChunks(str, size);
43 | }
44 |
45 | /**
46 |  * Chunk `str` by UTF-16 code units.
47 |  */
48 | function getChunks(str: string, size: number): string[] {
49 |   const strLength = str.length;
50 |
51 |   if (strLength === 0) {
52 |     return [];
53 |   }
54 |
55 |   if (size >= strLength) {
56 |     return [str];
57 |   }
58 |
59 |   const numChunks = Math.ceil(strLength / size);
60 |   const chunks = new Array<string>(numChunks);
61 |
62 |   for (let index = 0, offset = 0; index < numChunks; index += 1, offset += size) {
63 |     // `slice` truncates both bounds, so one chunk ends exactly where the next
64 |     // begins even for fractional sizes. The deprecated `substr(offset, size)`
65 |     // truncated the length instead and silently dropped characters.
66 |     chunks[index] = str.slice(offset, offset + size);
67 |   }
68 |
69 |   return chunks;
70 | }
71 |
72 | /**
73 |  * Chunk `str` by runes (as returned by `runes()`), so a chunk never ends in the
74 |  * middle of one.
75 |  */
76 | function getChunksUnicode(str: string, size: number): string[] {
77 |   const runeChars = runes(str);
78 |   const runeCount = runeChars.length;
79 |
80 |   if (runeCount === 0) {
81 |     return [];
82 |   }
83 |
84 |   if (size >= runeCount) {
85 |     return [str];
86 |   }
87 |
88 |   const numChunks = Math.ceil(runeCount / size);
89 |   const chunks = new Array<string>(numChunks);
90 |
91 |   for (let index = 0, offset = 0; index < numChunks; index += 1, offset += size) {
92 |     // Floor both bounds so fractional sizes tile the rune list without gaps.
93 |     const start = Math.min(Math.floor(offset), runeCount);
94 |     const end = Math.min(Math.floor(offset + size), runeCount);
95 |
96 |     let chunk = '';
97 |
98 |     for (let i = start; i < end; i += 1) {
99 |       chunk += runeChars[i];
100 |     }
101 |
102 |     chunks[index] = chunk;
103 |   }
104 |
105 |   return chunks;
106 | }
107 |
108 | export default fastChunkString;
109 |
--------------------------------------------------------------------------------
/benchmark.ts:
--------------------------------------------------------------------------------
1 | import
benny from 'benny'; 2 | import fLI from 'fast-lorem-ipsum'; 3 | import fastChunkString from './lib'; 4 | 5 | const words5000 = fLI(5000, 'w'); 6 | const words50000 = fLI(50000, 'w'); 7 | const words500000 = fLI(500000, 'w'); 8 | const words5000000 = fLI(5000000, 'w'); 9 | 10 | benny.suite( 11 | 'Without Unicode', 12 | benny.add('~33 kb split by 2 kb', () => { 13 | fastChunkString(words5000, {size: 2 * 1024}); 14 | }), 15 | benny.add('~33 kb split by 1 mb', () => { 16 | fastChunkString(words5000, {size: 1024 * 1024}); 17 | }), 18 | benny.add('~330 kb split by 2 kb', () => { 19 | fastChunkString(words50000, {size: 2 * 1024}); 20 | }), 21 | benny.add('~330 kb split by 1 mb', () => { 22 | fastChunkString(words50000, {size: 1024 * 1024}); 23 | }), 24 | benny.add('~3.3 mb split by 2 kb', () => { 25 | fastChunkString(words500000, {size: 2 * 1024}); 26 | }), 27 | benny.add('~3.3 mb split by 1 mb', () => { 28 | fastChunkString(words500000, {size: 1024 * 1024}); 29 | }), 30 | benny.add('~33 mb split by 2 kb', () => { 31 | fastChunkString(words5000000, {size: 2 * 1024}); 32 | }), 33 | benny.add('~33 mb split by 1 mb', () => { 34 | fastChunkString(words5000000, {size: 1024 * 1024}); 35 | }), 36 | benny.cycle(), 37 | benny.complete(), 38 | ); 39 | 40 | benny.suite( 41 | 'Unicode Aware', 42 | benny.add('~33 kb split by 2 kb with unicodeAware', () => { 43 | fastChunkString(words5000, {size: 2 * 1024, unicodeAware: true}); 44 | }), 45 | benny.add('~33 kb split by 1 mb with unicodeAware', () => { 46 | fastChunkString(words5000, {size: 1024 * 1024, unicodeAware: true}); 47 | }), 48 | benny.add('~330 kb split by 2 kb with unicodeAware', () => { 49 | fastChunkString(words50000, {size: 2 * 1024, unicodeAware: true}); 50 | }), 51 | benny.add('~330 kb split by 1 mb with unicodeAware', () => { 52 | fastChunkString(words50000, {size: 1024 * 1024, unicodeAware: true}); 53 | }), 54 | benny.cycle(), 55 | benny.complete(), 56 | ); 57 | 
-------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@shelf/fast-chunk-string", 3 | "version": "4.1.0", 4 | "description": "Chunk string into equal substrings with unicode support", 5 | "keywords": [ 6 | "string", 7 | "split", 8 | "chunk", 9 | "unicode", 10 | "utf", 11 | "utf8", 12 | "substring" 13 | ], 14 | "repository": "shelfio/fast-chunk-string", 15 | "license": "MIT", 16 | "author": { 17 | "name": "Vlad Holubiev", 18 | "email": "vlad@shelf.io", 19 | "url": "https://shelf.io" 20 | }, 21 | "sideEffects": false, 22 | "type": "module", 23 | "exports": "./lib/index.js", 24 | "module": "./lib/index.js", 25 | "types": "lib/index.d.ts", 26 | "files": [ 27 | "lib" 28 | ], 29 | "scripts": { 30 | "benchmark": "tsx benchmark.ts", 31 | "build": "rm -rf lib/ && tsc", 32 | "coverage": "jest --coverage", 33 | "lint": "yarn lint:ci --fix", 34 | "lint:ci": "eslint . 
--quiet", 35 | "test": "jest src", 36 | "type-check": "tsc --noEmit", 37 | "type-check:watch": "npm run type-check -- --watch" 38 | }, 39 | "lint-staged": { 40 | "*.{html,json,md,yml}": [ 41 | "prettier --write --ignore-path=./.eslintignore" 42 | ], 43 | "*.{ts,js}": [ 44 | "eslint --fix" 45 | ] 46 | }, 47 | "resolutions": { 48 | "strip-ansi": "6.0.1", 49 | "wrap-ansi": "7.0.0", 50 | "string-width": "4.2.3" 51 | }, 52 | "dependencies": { 53 | "runes": "0.4.3" 54 | }, 55 | "devDependencies": { 56 | "@shelf/eslint-config": "5.2.3", 57 | "@shelf/tsconfig": "0.1.0", 58 | "@swc/core": "1.13.5", 59 | "@swc/jest": "0.2.39", 60 | "@types/benchmark": "2.1.5", 61 | "@types/jest": "30.0.0", 62 | "@types/node": "22.18.6", 63 | "@types/runes": "0.4.3", 64 | "benny": "3.7.1", 65 | "eslint": "9.39.2", 66 | "fast-lorem-ipsum": "1.2.0", 67 | "husky": "9.1.7", 68 | "jest": "30.2.0", 69 | "lint-staged": "16.2.7", 70 | "prettier": "3.7.4", 71 | "ts-jest-resolver": "2.0.1", 72 | "tsx": "4.20.5", 73 | "typescript": "5.9.3" 74 | }, 75 | "engines": { 76 | "node": ">=20" 77 | }, 78 | "publishConfig": { 79 | "access": "public" 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # fast-chunk-string [![CircleCI](https://img.shields.io/circleci/project/shelfio/fast-chunk-string.svg)](https://circleci.com/gh/shelfio/fast-chunk-string) 2 | 3 | > Chunk string into equal substrings with unicode support 4 | 5 | Credits to [stackoverflow.com/a/29202760/2727317](https://stackoverflow.com/a/29202760/2727317) 6 | 7 | ## Install 8 | 9 | ``` 10 | $ yarn add @shelf/fast-chunk-string 11 | ``` 12 | 13 | ## Usage 14 | 15 | ```js 16 | import fastChunkString from '@shelf/fast-chunk-string'; 17 | 18 | // the fastest way 19 | fastChunkString('unicorns', {size: 2, unicodeAware: false}); 20 | // => ['un', 'ic', 'or', 'ns'] 21 | 22 | // ignore unicode, still fast but 
inaccurate 23 | fastChunkString('😀😃😄😁', {size: 2, unicodeAware: false}); 24 | // => ['😀', '😃', '😄', '😁'] 25 | 26 | // respect unicode, slow but accurate 27 | fastChunkString('😀😃😄😁', {size: 2, unicodeAware: true}); 28 | // => ['😀😃', '😄😁'] 29 | ``` 30 | 31 | ## Benchmarks 32 | 33 | Run via `yarn benchmark`. Measured on M2 Max. 34 | 35 | ``` 36 | Running "Without Unicode" suite... 37 | Progress: 100% 38 | 39 | ~33 kb split by 2 kb: 40 | 14 106 903 ops/s, ±1.71% | 86.19% slower 41 | 42 | ~33 kb split by 1 mb: 43 | 100 461 043 ops/s, ±1.45% | 1.63% slower 44 | 45 | ~330 kb split by 2 kb: 46 | 1 600 485 ops/s, ±0.63% | 98.43% slower 47 | 48 | ~330 kb split by 1 mb: 49 | 102 125 168 ops/s, ±1.50% | fastest 50 | 51 | ~3.3 mb split by 2 kb: 52 | 161 507 ops/s, ±1.19% | 99.84% slower 53 | 54 | ~3.3 mb split by 1 mb: 55 | 41 773 807 ops/s, ±1.54% | 59.1% slower 56 | 57 | ~33 mb split by 2 kb: 58 | 11 098 ops/s, ±0.25% | slowest, 99.99% slower 59 | 60 | ~33 mb split by 1 mb: 61 | 5 506 349 ops/s, ±0.58% | 94.61% slower 62 | 63 | Finished 8 cases! 64 | Fastest: ~330 kb split by 1 mb 65 | Slowest: ~33 mb split by 2 kb 66 | Running "Unicode Aware" suite... 67 | Progress: 100% 68 | 69 | ~33 kb split by 2 kb with unicodeAware: 70 | 847 ops/s, ±0.99% | 12.14% slower 71 | 72 | ~33 kb split by 1 mb with unicodeAware: 73 | 964 ops/s, ±0.25% | fastest 74 | 75 | ~330 kb split by 2 kb with unicodeAware: 76 | 71 ops/s, ±0.76% | slowest, 92.63% slower 77 | 78 | ~330 kb split by 1 mb with unicodeAware: 79 | 90 ops/s, ±0.94% | 90.66% slower 80 | 81 | Finished 4 cases! 82 | Fastest: ~33 kb split by 1 mb with unicodeAware 83 | Slowest: ~330 kb split by 2 kb with unicodeAware 84 | ``` 85 | 86 | ## Recent optimizations — September 2025 87 | 88 | September 2025 improvements were delivered autonomously by the gpt-5-codex model. 
We treated the hot paths like any latency-sensitive service and tuned the slowest sections: 89 | 90 | - Single-pass unicode chunking – length and slicing now come from the same `runes()` walk, eliminating the extra `string-length` scan and keeping multicodepoint graphemes intact. 91 | - Consolidated ASCII loop – collapsed the fast path into one traversal with early exits for empty inputs and oversized chunk sizes to trim per-call overhead. 92 | - Fractional-size parity – restored the legacy `slice` coercion semantics so non-integer chunk sizes behave exactly as before, backed by new regression tests. 93 | 94 | The result is steadier throughput in the ASCII suite (for example ~33 kb split by 1 mb climbs from 85.6M to 100.5M ops/s) and a 9–10× lift in the unicode-aware scenarios (e.g. 33 kb splits rise from ~101 ops/s to ~964 ops/s) while preserving behaviour for combining marks and emoji ligatures. 95 | 96 | ## See Also 97 | 98 | - [fast-normalize-spaces](https://github.com/shelfio/fast-normalize-spaces) 99 | - [fast-natural-order-by](https://github.com/shelfio/fast-natural-order-by) 100 | - [fast-uslug](https://github.com/shelfio/fast-uslug) 101 | 102 | ## Publish 103 | 104 | ```sh 105 | $ git checkout master 106 | $ yarn version 107 | $ yarn publish 108 | $ git push origin master --tags 109 | ``` 110 | 111 | ## License 112 | 113 | MIT © [Shelf](https://shelf.io) 114 | --------------------------------------------------------------------------------