├── pnpm-workspace.yaml
├── packages
│   ├── cloud
│   │   ├── .tsc-aliasrc.json
│   │   ├── vitest.config.ts
│   │   ├── tsconfig.json
│   │   ├── tests
│   │   │   ├── overlap-refinery.test.ts
│   │   │   ├── embeddings-refinery.test.ts
│   │   │   ├── token.test.ts
│   │   │   ├── recursive.test.ts
│   │   │   ├── sentence.test.ts
│   │   │   ├── neural.test.ts
│   │   │   ├── semantic.test.ts
│   │   │   ├── code.test.ts
│   │   │   ├── fixtures
│   │   │   │   ├── test-code.js
│   │   │   │   └── test-code.ts
│   │   │   └── pipeline.test.ts
│   │   ├── CHANGELOG.md
│   │   ├── examples
│   │   │   ├── sentence.example.ts
│   │   │   ├── neural.example.ts
│   │   │   ├── recursive.example.ts
│   │   │   ├── semantic.example.ts
│   │   │   ├── token.example.ts
│   │   │   ├── embeddings-refinery.example.ts
│   │   │   └── overlap-refinery.example.ts
│   │   ├── package.json
│   │   ├── src
│   │   │   ├── index.ts
│   │   │   ├── refineries
│   │   │   │   ├── embeddings.ts
│   │   │   │   └── overlap.ts
│   │   │   ├── chunkers
│   │   │   │   ├── neural.ts
│   │   │   │   ├── code.ts
│   │   │   │   ├── token.ts
│   │   │   │   ├── late.ts
│   │   │   │   ├── recursive.ts
│   │   │   │   ├── sentence.ts
│   │   │   │   └── semantic.ts
│   │   │   ├── utils.ts
│   │   │   └── base.ts
│   │   └── README.md
│   ├── core
│   │   ├── .tsc-aliasrc.json
│   │   ├── vitest.config.ts
│   │   ├── CHANGELOG.md
│   │   ├── src
│   │   │   ├── index.ts
│   │   │   ├── tokenizer.ts
│   │   │   ├── token.ts
│   │   │   └── types.ts
│   │   ├── tsconfig.json
│   │   ├── package.json
│   │   ├── examples
│   │   │   ├── token.example.ts
│   │   │   ├── with-huggingface.example.ts
│   │   │   └── recursive.example.ts
│   │   └── README.md
│   └── token
│       ├── src
│       │   ├── index.ts
│       │   └── huggingface.ts
│       ├── .tsc-aliasrc.json
│       ├── CHANGELOG.md
│       ├── tsconfig.json
│       ├── package.json
│       └── README.md
├── assets
│   └── chonkie_logo_br_transparent_bg.png
├── legacy
│   └── chonkie
│       ├── types
│       │   ├── wasm.d.ts
│       │   ├── index.ts
│       │   ├── code.ts
│       │   ├── late.ts
│       │   ├── semantic.ts
│       │   ├── base.ts
│       │   └── sentence.ts
│       ├── friends
│       │   ├── index.ts
│       │   ├── base.ts
│       │   ├── utils.ts
│       │   └── chroma.ts
│       ├── chunker
│       │   ├── index.ts
│       │   └── base.ts
│       ├── cloud
│       │   ├── index.ts
│       │   ├── embeddings_refinery.ts
│       │   ├── base.ts
│       │   ├── neural.ts
│       │   ├── token.ts
│       │   ├── overlap_refinery.ts
│       │   ├── code.ts
│       │   ├── late.ts
│       │   ├── slumber.ts
│       │   ├── recursive.ts
│       │   ├── sentence.ts
│       │   ├── semantic.ts
│       │   └── sdpm.ts
│       ├── index.ts
│       └── utils
│           └── hub.ts
├── .changeset
│   ├── config.json
│   └── README.md
├── .github
│   └── workflows
│       ├── auto-assign.yml
│       └── test.yml
├── package.json
├── LICENSE
├── .gitignore
├── CONTRIBUTING.md
└── README.md

/pnpm-workspace.yaml:
--------------------------------------------------------------------------------
1 | packages:
2 |   - 'packages/*'
3 | 
--------------------------------------------------------------------------------
/packages/cloud/.tsc-aliasrc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "resolveFullPaths": true,
3 |   "output": {
4 |     "fileExtension": ".js"
5 |   }
6 | }
7 | 
--------------------------------------------------------------------------------
/packages/core/.tsc-aliasrc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "resolveFullPaths": true,
3 |   "output": {
4 |     "fileExtension": ".js"
5 |   }
6 | }
7 | 
--------------------------------------------------------------------------------
/assets/chonkie_logo_br_transparent_bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonkie-inc/chonkiejs/HEAD/assets/chonkie_logo_br_transparent_bg.png

--------------------------------------------------------------------------------
/packages/token/src/index.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @chonkiejs/token
3 |  * HuggingFace tokenizer support for Chonkie
4 |  */
5 | 
6 | export { HuggingFaceTokenizer } from './huggingface';
7 | 
--------------------------------------------------------------------------------
/legacy/chonkie/types/wasm.d.ts:
--------------------------------------------------------------------------------
1 | declare module '*.wasm' {
2 |   const value: string; // The path to the wasm file or its content depending on the bundler
3 |   export default value;
4 | }
--------------------------------------------------------------------------------
/legacy/chonkie/friends/index.ts:
--------------------------------------------------------------------------------
1 | /** Chonkie's Friends. */
2 | 
3 | export { BaseHandshake } from "./base";
4 | export { ChromaHandshake } from "./chroma";
5 | export { generateRandomCollectionName } from "./utils";
--------------------------------------------------------------------------------
/packages/token/.tsc-aliasrc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "resolveFullPaths": true,
3 |   "output": {
4 |     "fileExtension": ".js"
5 |   },
6 |   "replacers": {
7 |     "~": {
8 |       "enabled": true
9 |     }
10 |   }
11 | }
12 | 
--------------------------------------------------------------------------------
/packages/core/vitest.config.ts:
--------------------------------------------------------------------------------
1 | import { defineConfig } from 'vitest/config';
2 | import tsconfigPaths from 'vite-tsconfig-paths';
3 | 
4 | export default defineConfig({
5 |   plugins: [tsconfigPaths()],
6 |   test: {
7 |     globals: true,
8 |     environment: 'node',
9 |   },
10 | });
11 | 
--------------------------------------------------------------------------------
/packages/token/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # @chonkiejs/token
2 | 
3 | ## 0.0.3
4 | 
5 | ### Patch Changes
6 | 
7 | - Fix: Add proper `.js` extension to the files
8 | 
9 | ## 0.0.2
10 | 
11 | ### Patch Changes
12 | 
13 | - Add Huggingface Tokenizer and TokenChunker support
14 | - Updated dependencies
15 |   - @chonkiejs/core@0.0.3
16 | 
--------------------------------------------------------------------------------
/legacy/chonkie/friends/base.ts:
--------------------------------------------------------------------------------
1 | /** Base class for Chonkie's Handshakes. */
2 | 
3 | import { Chunk } from "../types/base";
4 | 
5 | 
6 | export abstract class BaseHandshake {
7 | 
8 |   public abstract write(chunks: Chunk[]): Promise<void>;
9 |   public abstract query(query: string, nResults: number): Promise<any>;
10 | }
--------------------------------------------------------------------------------
/.changeset/config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "$schema": "https://unpkg.com/@changesets/config@3.1.1/schema.json",
3 |   "changelog": "@changesets/cli/changelog",
4 |   "commit": false,
5 |   "fixed": [],
6 |   "linked": [],
7 |   "access": "public",
8 |   "baseBranch": "main",
9 |   "updateInternalDependencies": "patch",
10 |   "ignore": []
11 | }
12 | 
--------------------------------------------------------------------------------
/packages/cloud/vitest.config.ts:
--------------------------------------------------------------------------------
1 | import { defineConfig } from 'vitest/config';
2 | import tsconfigPaths from 'vite-tsconfig-paths';
3 | 
4 | export default defineConfig({
5 |   plugins: [tsconfigPaths()],
6 |   test: {
7 |     globals: true,
8 |     environment: 'node',
9 |     testTimeout: 60000, // Longer timeout for slow API calls
10 |   },
11 | });
12 | 
--------------------------------------------------------------------------------
/legacy/chonkie/types/index.ts:
--------------------------------------------------------------------------------
1 | export { Chunk } from './base';
2 | export { SentenceData, Sentence, SentenceChunk } from './sentence';
3 | export { TreeSitterNode, CodeChunk } from './code';
4 | export { RecursiveLevel, RecursiveRules, RecursiveChunk } from './recursive';
5 | export { LateChunk } from './late';
6 | export { SemanticSentenceData, SemanticSentence, SemanticChunkData, SemanticChunk } from './semantic';
7 | 
--------------------------------------------------------------------------------
/packages/core/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # @chonkiejs/core
2 | 
3 | ## 0.0.5
4 | 
5 | ### Patch Changes
6 | 
7 | - Fix: Added full path resolution for .js files
8 | 
9 | ## 0.0.4
10 | 
11 | ### Patch Changes
12 | 
13 | - Fix: Add `embedding` to the `Chunk` for `EmbeddingsRefinery`
14 | 
15 | ## 0.0.3
16 | 
17 | ### Patch Changes
18 | 
19 | - Add Huggingface Tokenizer and TokenChunker support
20 | - Updated dependencies
21 |   - @chonkiejs/core@0.0.3
22 | 
--------------------------------------------------------------------------------
/legacy/chonkie/chunker/index.ts:
--------------------------------------------------------------------------------
1 | export { BaseChunker } from './base';
2 | export { CodeChunker, CallableCodeChunker, CodeChunkerOptions } from './code';
3 | export { RecursiveChunker, CallableRecursiveChunker, RecursiveChunkerOptions, RecursiveChunkerRecipeOptions } from './recursive';
4 | export { SentenceChunker, CallableSentenceChunker, SentenceChunkerOptions, SentenceChunkerRecipeOptions } from './sentence';
5 | export { TokenChunker, CallableTokenChunker, TokenChunkerOptions } from './token';
--------------------------------------------------------------------------------
/.changeset/README.md:
--------------------------------------------------------------------------------
1 | # Changesets
2 | 
3 | Hello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that works
4 | with multi-package repos, or single-package repos to help you version and publish your code. You can
5 | find the full documentation for it [in our repository](https://github.com/changesets/changesets)
6 | 
7 | We have a quick list of common questions to get you started engaging with this project in
8 | [our documentation](https://github.com/changesets/changesets/blob/main/docs/common-questions.md)
9 | 
--------------------------------------------------------------------------------
/packages/core/src/index.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @chonkiejs/core
3 |  * Core chunking library for Chonkie - lightweight and efficient text chunking
4 |  */
5 | 
6 | export { RecursiveChunker } from '@/recursive';
7 | export type { RecursiveChunkerOptions } from '@/recursive';
8 | 
9 | export { TokenChunker } from '@/token';
10 | export type { TokenChunkerOptions } from '@/token';
11 | 
12 | export { Tokenizer } from '@/tokenizer';
13 | 
14 | export { Chunk, RecursiveLevel, RecursiveRules } from '@/types';
15 | export type { RecursiveLevelConfig, RecursiveRulesConfig, IncludeDelim } from '@/types';
16 | 
--------------------------------------------------------------------------------
/packages/token/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "compilerOptions": {
3 |     "target": "ES2021",
4 |     "module": "ES2020",
5 |     "lib": ["ES2021"],
6 |     "moduleResolution": "node",
7 |     "outDir": "./dist",
8 |     "rootDir": "./src",
9 |     "strict": true,
10 |     "esModuleInterop": true,
11 |     "skipLibCheck": true,
12 |     "declaration": true,
13 |     "declarationMap": true,
14 |     "sourceMap": true,
15 |     "resolveJsonModule": true,
16 |     "types": ["node"]
17 |   },
18 |   "include": ["src/**/*"],
19 |   "exclude": ["node_modules", "dist", "**/*.test.ts", "**/*.spec.ts"]
20 | }
21 | 
--------------------------------------------------------------------------------
/.github/workflows/auto-assign.yml:
--------------------------------------------------------------------------------
1 | name: Auto Assign Issues
2 | 
3 | on:
4 |   issues:
5 |     types: [opened]
6 | 
7 | jobs:
8 |   assign:
9 |     runs-on: ubuntu-latest
10 |     permissions:
11 |       issues: write
12 |     steps:
13 |       - name: Assign issue to chonknick
14 |         uses: actions/github-script@v7
15 |         with:
16 |           script: |
17 |             github.rest.issues.addAssignees({
18 |               owner: context.repo.owner,
19 |               repo: context.repo.repo,
20 |               issue_number: context.issue.number,
21 |               assignees: ['chonknick']
22 |             });
23 | 
--------------------------------------------------------------------------------
/packages/cloud/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "compilerOptions": {
3 |     "target": "ES2021",
4 |     "module": "ES2020",
5 |     "lib": ["ES2021"],
6 |     "moduleResolution": "node",
7 |     "outDir": "./dist",
8 |     "rootDir": "./src",
9 |     "strict": true,
10 |     "esModuleInterop": true,
11 |     "skipLibCheck": true,
12 |     "declaration": true,
13 |     "declarationMap": true,
14 |     "sourceMap": true,
15 |     "resolveJsonModule": true,
16 |     "types": ["node"],
17 |     "baseUrl": "./src",
18 |     "paths": {
19 |       "@/*": ["./*"]
20 |     }
21 |   },
22 |   "include": ["src/**/*"],
23 |   "exclude": ["node_modules", "dist", "**/*.test.ts", "**/*.spec.ts"]
24 | }
25 | 
--------------------------------------------------------------------------------
/packages/core/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "compilerOptions": {
3 |     "target": "ES2021",
4 |     "module": "ES2020",
5 |     "lib": ["ES2021"],
6 |     "moduleResolution": "node",
"outDir": "./dist", 8 | "rootDir": "./src", 9 | "strict": true, 10 | "esModuleInterop": true, 11 | "skipLibCheck": true, 12 | "declaration": true, 13 | "declarationMap": true, 14 | "sourceMap": true, 15 | "resolveJsonModule": true, 16 | "types": ["node"], 17 | "baseUrl": "./src", 18 | "paths": { 19 | "@/*": ["./*"] 20 | } 21 | }, 22 | "include": ["src/**/*"], 23 | "exclude": ["node_modules", "dist", "**/*.test.ts", "**/*.spec.ts"] 24 | } 25 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Run Tests 2 | 3 | on: 4 | push: 5 | branches: [ '*' ] 6 | pull_request: 7 | branches: [ '*' ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | node-version: [20.x] 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: pnpm/action-setup@v2 21 | with: 22 | version: 10 23 | 24 | - name: Use Node.js ${{ matrix.node-version }} 25 | uses: actions/setup-node@v3 26 | with: 27 | node-version: ${{ matrix.node-version }} 28 | cache: 'pnpm' 29 | 30 | - name: Install dependencies 31 | run: pnpm install --frozen-lockfile 32 | 33 | - name: Run tests 34 | run: pnpm test 35 | env: 36 | CHONKIE_API_KEY: ${{ secrets.CHONKIE_API_KEY }} -------------------------------------------------------------------------------- /legacy/chonkie/cloud/index.ts: -------------------------------------------------------------------------------- 1 | /** Cloud client exports for Chonkie API. */ 2 | 3 | export { CloudClient, CloudClientConfig, ChunkerInput } from './base'; 4 | export { CodeChunker, CodeChunkerConfig } from './code'; 5 | export { LateChunker, LateChunkerConfig } from './late'; 6 | export { NeuralChunker, NeuralChunkerConfig } from './neural'; 7 | export { RecursiveChunker, RecursiveChunkerConfig } from './recursive'; 8 | export { EmbeddingsRefinery, EmbeddingsRefineryConfig } from './embeddings_refinery'; 9 | export { SDPMChunker, SDPMChunkerConfig } from './sdpm'; 10 | export { SemanticChunker, SemanticChunkerConfig } from './semantic'; 11 | export { SentenceChunker, SentenceChunkerConfig } from './sentence'; 12 | export { SlumberChunker, SlumberChunkerConfig } from './slumber'; 13 | export { TokenChunker, TokenChunkerConfig } from './token'; 14 | export { OverlapRefinery, OverlapRefineryConfig } from './overlap_refinery'; 15 | -------------------------------------------------------------------------------- /packages/cloud/tests/overlap-refinery.test.ts: -------------------------------------------------------------------------------- 1 | import { TokenChunker, OverlapRefinery } from '../src'; 2 | 3 | describe.skipIf(!process.env.CHONKIE_API_KEY)('OverlapRefinery', () => { 4 | it('should add overlap to chunks successfully', async () => { 5 | // First create some chunks 6 | const chunker = new TokenChunker({ chunkSize: 30 }); 7 | const chunks = await chunker.chunk({ text: 'This is a test for overlap refinery functionality.' 
7 |     const chunks = await chunker.chunk({ text: 'This is a test for overlap refinery functionality.' });
8 | 
9 |     // Add overlap
10 |     const refinery = new OverlapRefinery({
11 |       contextSize: 0.25,
12 |       method: 'suffix'
13 |     });
14 | 
15 |     const refinedChunks = await refinery.refine(chunks);
16 | 
17 |     expect(refinedChunks.length).toBeGreaterThan(0);
18 |     expect(refinedChunks[0]).toHaveProperty('text');
19 |     expect(refinedChunks[0]).toHaveProperty('tokenCount');
20 |     expect(refinedChunks[0]).toHaveProperty('startIndex');
21 |     expect(refinedChunks[0]).toHaveProperty('endIndex');
22 |   });
23 | });
24 | 
--------------------------------------------------------------------------------
/legacy/chonkie/index.ts:
--------------------------------------------------------------------------------
1 | // Import chunkers directly to avoid loading CodeChunker and web-tree-sitter
2 | export { TokenChunker } from './chunker/token';
3 | export { SentenceChunker } from './chunker/sentence';
4 | export { RecursiveChunker } from './chunker/recursive';
5 | // CodeChunker removed - use: import { CodeChunker } from "chonkie/chunker/code"
6 | export { Tokenizer } from './tokenizer';
7 | export { Visualizer } from './utils/viz';
8 | export { Hubbie } from './utils/hub';
9 | // ChromaHandshake removed - use: import { ChromaHandshake } from "chonkie/friends"
10 | export {
11 |   Chunk,
12 |   SentenceData,
13 |   Sentence,
14 |   SentenceChunk,
15 |   // TreeSitterNode, CodeChunk removed - use: import { TreeSitterNode, CodeChunk } from "chonkie/types"
16 |   RecursiveLevel,
17 |   RecursiveRules,
18 |   RecursiveChunk,
19 |   LateChunk,
20 |   SemanticSentenceData,
21 |   SemanticSentence,
22 |   SemanticChunkData,
23 |   SemanticChunk
24 | } from './types';
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "chonkie-main",
3 |   "version": "0.3.0",
4 |   "private": true,
5 |   "description": "Monorepo for Chonkie - lightweight and efficient text chunking library",
6 |   "repository": {
7 |     "type": "git",
8 |     "url": "git+https://github.com/chonkie-inc/chonkie-ts.git"
9 |   },
10 |   "workspaces": [
11 |     "packages/*"
12 |   ],
13 |   "scripts": {
14 |     "clean": "rimraf packages/*/dist",
15 |     "build": "npm run clean && npm run build --workspace=packages/core && npm run build --workspace=packages/cloud",
16 |     "test": "npm run build && npm run test --workspace=packages/core && npm run test --workspace=packages/cloud",
17 |     "changeset": "changeset",
18 |     "version": "changeset version",
19 |     "release": "npm run build && changeset publish"
20 |   },
21 |   "devDependencies": {
22 |     "@changesets/cli": "^2.29.7",
23 |     "@types/node": "^22.15.21",
24 |     "publint": "^0.3.14",
25 |     "rimraf": "^6.0.1",
26 |     "typescript": "^5.8.3",
27 |     "vitest": "^3.2.4"
28 |   }
29 | }
30 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 Chonkie
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/embeddings_refinery.ts:
--------------------------------------------------------------------------------
1 | /** Refinery clients for Chonkie API. */
2 | 
3 | import { CloudClient } from "./base";
4 | import { Chunk } from "../types/base";
5 | 
6 | export interface EmbeddingsRefineryConfig {
7 |   embeddingModel: string;
8 | }
9 | 
10 | export class EmbeddingsRefinery extends CloudClient {
11 |   private readonly config: Required<EmbeddingsRefineryConfig>;
12 | 
13 |   constructor(apiKey: string, config: EmbeddingsRefineryConfig) {
14 |     super({ apiKey });
15 |     if (!config.embeddingModel) {
16 |       throw new Error("Embedding model is required for embeddings refinement");
17 |     }
18 |     this.config = {
19 |       embeddingModel: config.embeddingModel,
20 |     };
21 |   }
22 | 
23 |   async refine(chunks: Chunk[]): Promise<Chunk[]> {
24 |     const response = await this.request<any[]>("/v1/refine/embeddings", {
25 |       body: {
26 |         chunks: chunks.map(chunk => chunk.toDict()),
27 |         embedding_model: this.config.embeddingModel,
28 |       },
29 |       headers: {
30 |         "Content-Type": "application/json",
31 |       },
32 |     });
33 | 
34 |     return response.map(chunk => Chunk.fromDict(chunk));
35 |   }
36 | }
--------------------------------------------------------------------------------
/packages/cloud/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # @chonkiejs/cloud
2 | 
3 | ## 0.1.0
4 | 
5 | ### Minor Changes
6 | 
7 | - Add Pipeline client for building and executing pipelines via api.chonkie.ai
8 | 
9 |   - New `Pipeline` class with fluent API for building pipelines
10 |   - Support for `chunkWith()`, `refineWith()`, and `processWith()` builder methods
11 |   - Static methods: `Pipeline.get()`, `Pipeline.list()`, `Pipeline.validate()`
12 |   - Instance methods: `run()`, `update()`, `delete()`, `reset()`
13 |   - Auto-save on first `run()` call
14 |   - File upload support via `filepath` option
15 |   - Full TypeScript types: `PipelineOptions`, `PipelineStep`, `PipelineValidationResult`
16 | 
17 | ## 0.0.6
18 | 
19 | ### Patch Changes
20 | 
21 | - Fix: Add proper `.js` extension to the files
22 | 
23 | ## 0.0.5
24 | 
25 | ### Patch Changes
26 | 
27 | - Fix: Add `embedding` to the `Chunk` for `EmbeddingsRefinery`
28 | - Updated dependencies
29 |   - @chonkiejs/core@0.0.4
30 | 
31 | ## 0.0.4
32 | 
33 | ### Patch Changes
34 | 
35 | - Add OverlapRefinery and EmbeddingsRefinery
36 | 
37 | ## 0.0.3
38 | 
39 | ### Patch Changes
40 | 
41 | - Updated dependencies
42 |   - @chonkiejs/core@0.0.3
43 | 
44 | ## 0.0.2
45 | 
46 | ### Patch Changes
47 | 
48 | - Fix: tsx alias not present error
49 | 
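A minimal usage sketch of the Pipeline fluent API described in the 0.1.0 entry above. The method names come straight from that changelog entry, but the step names and option shapes passed to `chunkWith()`/`refineWith()` and `run()` are illustrative assumptions, not confirmed against the shipped `PipelineOptions`/`PipelineStep` types:

```ts
import { Pipeline } from '@chonkiejs/cloud';

// Build a pipeline with the fluent builder methods listed in the changelog.
// The step identifiers and option objects below are assumed for illustration.
const pipeline = new Pipeline({ name: 'docs-pipeline' })
  .chunkWith('recursive', { chunkSize: 512 })
  .refineWith('embeddings', { embeddingModel: 'sentence-transformers/all-MiniLM-L6-v2' });

// Per the changelog, the pipeline auto-saves on the first run() call.
const chunks = await pipeline.run({ text: 'Some document text...' });

// Also available per the changelog: Pipeline.get(), Pipeline.list(),
// Pipeline.validate(), and instance update()/delete()/reset().
```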
--------------------------------------------------------------------------------
/packages/cloud/examples/sentence.example.ts:
--------------------------------------------------------------------------------
1 | import { SentenceChunker } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing SentenceChunker with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     const chunker = new SentenceChunker({
8 |       chunkSize: 50,
9 |       minSentencesPerChunk: 2
10 |     });
11 | 
12 |     console.log('✅ SentenceChunker created');
13 |     console.log(`Config: ${chunker.toString()}\n`);
14 | 
15 |     const text = 'This is the first sentence. Here is the second one. And a third sentence for testing. Finally, a fourth sentence to complete the example.';
16 | 
17 |     console.log(`📝 Input (${text.length} chars):`);
18 |     console.log(`"${text}"\n`);
19 | 
20 |     const chunks = await chunker.chunk({ text });
21 | 
22 |     console.log(`✅ Received ${chunks.length} chunks:\n`);
23 | 
24 |     chunks.forEach((chunk, index) => {
25 |       console.log(`Chunk ${index + 1}:`);
26 |       console.log(`  Tokens: ${chunk.tokenCount}`);
27 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
28 |       console.log(`  Text: "${chunk.text}"`);
29 |       console.log();
30 |     });
31 | 
32 |   } catch (error) {
33 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
34 |     process.exit(1);
35 |   }
36 | }
37 | 
38 | main();
39 | 
--------------------------------------------------------------------------------
/packages/core/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "@chonkiejs/core",
3 |   "version": "0.0.5",
4 |   "description": "Core chunking library for Chonkie - lightweight and efficient text chunking",
5 |   "license": "MIT",
6 |   "homepage": "https://docs.chonkie.ai",
7 |   "repository": {
8 |     "type": "git",
9 |     "url": "git+https://github.com/chonkie-inc/chonkie-ts.git",
10 |     "directory": "packages/core"
11 |   },
12 |   "author": "Bhavnick Minhas",
13 |   "type": "module",
14 |   "main": "./dist/index.js",
15 |   "types": "./dist/index.d.ts",
16 |   "exports": {
17 |     ".": {
18 |       "types": "./dist/index.d.ts",
19 |       "import": "./dist/index.js",
20 |       "default": "./dist/index.js"
21 |     }
22 |   },
23 |   "scripts": {
24 |     "clean": "rimraf dist",
25 |     "build": "npm run clean && tsc && tsc-alias -p tsconfig.json --resolve-full-paths",
26 |     "test": "vitest run"
27 |   },
28 |   "files": [
29 |     "dist"
30 |   ],
31 |   "keywords": [
32 |     "chonkie",
33 |     "chunking",
34 |     "text-processing",
35 |     "tokenization"
36 |   ],
37 |   "devDependencies": {
38 |     "rimraf": "^6.0.1",
39 |     "ts-node": "^10.9.2",
40 |     "tsc-alias": "^1.8.16",
41 |     "tsx": "^4.20.6",
42 |     "typescript": "^5.9.3",
43 |     "vite-tsconfig-paths": "^5.1.4",
44 |     "vitest": "^3.2.4"
45 |   }
46 | }
47 | 
--------------------------------------------------------------------------------
/packages/cloud/examples/neural.example.ts:
--------------------------------------------------------------------------------
1 | import { NeuralChunker } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing NeuralChunker with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     const chunker = new NeuralChunker();
8 | 
9 |     console.log('✅ NeuralChunker created');
10 |     console.log(`Config: ${chunker.toString()}\n`);
11 | 
12 |     const text = 'Neural networks are used for pattern recognition. They learn from data to make predictions. Deep learning has revolutionized AI applications. Modern architectures like transformers are very powerful.';
13 | 
14 |     console.log(`📝 Input (${text.length} chars):`);
15 |     console.log(`"${text}"\n`);
16 | 
17 |     console.log('🔄 Processing with neural model...\n');
18 | 
19 |     const chunks = await chunker.chunk({ text });
20 | 
21 |     console.log(`✅ Received ${chunks.length} chunks:\n`);
22 | 
23 |     chunks.forEach((chunk, index) => {
24 |       console.log(`Chunk ${index + 1}:`);
25 |       console.log(`  Tokens: ${chunk.tokenCount}`);
26 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
27 |       console.log(`  Text: "${chunk.text}"`);
28 |       console.log();
29 |     });
30 | 
31 |   } catch (error) {
32 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
33 |     process.exit(1);
34 |   }
35 | }
36 | 
37 | main();
38 | 
--------------------------------------------------------------------------------
/packages/cloud/examples/recursive.example.ts:
--------------------------------------------------------------------------------
1 | import { RecursiveChunker } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing RecursiveChunker with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     const chunker = new RecursiveChunker({
8 |       chunkSize: 60,
9 |       recipe: 'default',
10 |       lang: 'en'
11 |     });
12 | 
13 |     console.log('✅ RecursiveChunker created');
14 |     console.log(`Config: ${chunker.toString()}\n`);
15 | 
16 |     const text = `First paragraph with some content here.
17 | 
18 | Second paragraph with more details. It contains multiple sentences.
19 | 
20 | Third paragraph to test the recursive splitting.`;
21 | 
22 |     console.log(`📝 Input (${text.length} chars):`);
23 |     console.log(`"${text}"\n`);
24 | 
25 |     const chunks = await chunker.chunk({ text });
26 | 
27 |     console.log(`✅ Received ${chunks.length} chunks:\n`);
28 | 
29 |     chunks.forEach((chunk, index) => {
30 |       console.log(`Chunk ${index + 1}:`);
31 |       console.log(`  Tokens: ${chunk.tokenCount}`);
32 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
33 |       console.log(`  Text: "${chunk.text}"`);
34 |       console.log();
35 |     });
36 | 
37 |   } catch (error) {
38 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
39 |     process.exit(1);
40 |   }
41 | }
42 | 
43 | main();
44 | 
--------------------------------------------------------------------------------
/packages/cloud/tests/embeddings-refinery.test.ts:
--------------------------------------------------------------------------------
1 | import { TokenChunker, EmbeddingsRefinery } from '../src';
2 | 
3 | describe.skipIf(!process.env.CHONKIE_API_KEY)('EmbeddingsRefinery', () => {
4 |   it('should add embeddings to chunks successfully', async () => {
5 |     // First create some chunks
6 |     const chunker = new TokenChunker({ chunkSize: 30 });
7 |     const chunks = await chunker.chunk({ text: 'This is a test for embeddings refinery.' });
8 | 
9 |     // Verify chunks don't have embeddings initially
10 |     expect(chunks[0].embedding).toBeUndefined();
11 | 
12 |     // Add embeddings
13 |     const refinery = new EmbeddingsRefinery({
14 |       embeddingModel: 'sentence-transformers/all-MiniLM-L6-v2'
15 |     });
16 | 
17 |     const refinedChunks = await refinery.refine(chunks);
18 | 
19 |     expect(refinedChunks.length).toBeGreaterThan(0);
20 |     expect(refinedChunks[0]).toHaveProperty('text');
21 |     expect(refinedChunks[0]).toHaveProperty('tokenCount');
22 |     expect(refinedChunks[0]).toHaveProperty('startIndex');
23 |     expect(refinedChunks[0]).toHaveProperty('endIndex');
24 | 
25 |     // Verify embeddings are now present
26 |     expect(refinedChunks[0]).toHaveProperty('embedding');
27 |     expect(refinedChunks[0].embedding).toBeDefined();
28 |     expect(Array.isArray(refinedChunks[0].embedding)).toBe(true);
29 |     expect(refinedChunks[0].embedding!.length).toBeGreaterThan(0);
30 |   });
31 | });
32 | 
--------------------------------------------------------------------------------
/packages/cloud/examples/semantic.example.ts:
--------------------------------------------------------------------------------
1 | import { SemanticChunker } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing SemanticChunker with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     const chunker = new SemanticChunker({
8 |       chunkSize: 60,
9 |       threshold: 0.5
10 |     });
11 | 
12 |     console.log('✅ SemanticChunker created');
13 |     console.log(`Config: ${chunker.toString()}\n`);
14 | 
15 |     const text = 'Artificial intelligence is transforming technology. Machine learning models are becoming more powerful. Meanwhile, climate change poses significant challenges. Environmental protection is crucial for our future.';
16 | 
17 |     console.log(`📝 Input (${text.length} chars):`);
18 |     console.log(`"${text}"\n`);
19 | 
20 |     console.log('🔄 Analyzing semantic similarity...\n');
21 | 
22 |     const chunks = await chunker.chunk({ text });
23 | 
24 |     console.log(`✅ Received ${chunks.length} chunks:\n`);
25 | 
26 |     chunks.forEach((chunk, index) => {
27 |       console.log(`Chunk ${index + 1}:`);
28 |       console.log(`  Tokens: ${chunk.tokenCount}`);
29 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
30 |       console.log(`  Text: "${chunk.text}"`);
31 |       console.log();
32 |     });
33 | 
34 |   } catch (error) {
35 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
36 |     process.exit(1);
37 |   }
38 | }
39 | 
40 | main();
41 | 
--------------------------------------------------------------------------------
/packages/cloud/tests/token.test.ts:
--------------------------------------------------------------------------------
1 | import { TokenChunker } from '../src';
2 | import * as path from 'path';
3 | 
4 | describe.skipIf(!process.env.CHONKIE_API_KEY)('TokenChunker', () => {
5 |   it('should chunk text successfully', async () => {
6 |     const chunker = new TokenChunker({ chunkSize: 30 });
7 |     const text = 'This is a test. It should be chunked properly.';
8 | 
9 |     const chunks = await chunker.chunk({ text });
10 | 
11 |     expect(chunks.length).toBeGreaterThan(0);
12 |     expect(chunks[0]).toHaveProperty('text');
13 |     expect(chunks[0]).toHaveProperty('tokenCount');
14 |     expect(chunks[0]).toHaveProperty('startIndex');
15 |     expect(chunks[0]).toHaveProperty('endIndex');
16 |   });
17 | 
18 |   it('should chunk file successfully with file upload', async () => {
19 |     const chunker = new TokenChunker({ chunkSize: 150, chunkOverlap: 20 });
20 |     const testFilePath = path.join(__dirname, 'fixtures', 'test-document.md');
21 | 
22 |     const chunks = await chunker.chunk({ filepath: testFilePath });
23 | 
24 |     expect(chunks.length).toBeGreaterThan(0);
25 |     expect(chunks[0]).toHaveProperty('text');
26 |     expect(chunks[0]).toHaveProperty('tokenCount');
27 |     expect(chunks[0]).toHaveProperty('startIndex');
28 |     expect(chunks[0]).toHaveProperty('endIndex');
29 | 
30 |     // Verify chunks respect chunk size
31 |     chunks.forEach(chunk => {
32 |       expect(chunk.tokenCount).toBeLessThanOrEqual(150);
33 |     });
34 |   });
35 | });
36 | 
--------------------------------------------------------------------------------
/packages/cloud/tests/recursive.test.ts:
--------------------------------------------------------------------------------
1 | import { RecursiveChunker } from '../src';
2 | import * as path from 'path';
3 | 
4 | describe.skipIf(!process.env.CHONKIE_API_KEY)('RecursiveChunker', () => {
5 |   it('should chunk text successfully', async () => {
6 |     const chunker = new RecursiveChunker({ chunkSize: 50 });
7 |     const text = 'Paragraph one.\n\nParagraph two with more text.';
8 | 
9 |     const chunks = await chunker.chunk({ text });
10 | 
11 |     expect(chunks.length).toBeGreaterThan(0);
12 |     expect(chunks[0]).toHaveProperty('text');
13 |     expect(chunks[0]).toHaveProperty('tokenCount');
14 |     expect(chunks[0]).toHaveProperty('startIndex');
15 |     expect(chunks[0]).toHaveProperty('endIndex');
16 |   });
17 | 
18 |   it('should chunk file successfully with file upload', async () => {
19 |     const chunker = new RecursiveChunker({ chunkSize: 200, minCharactersPerChunk: 50 });
20 |     const testFilePath = path.join(__dirname, 'fixtures', 'test-document.md');
21 | 
22 |     const chunks = await chunker.chunk({ filepath: testFilePath });
23 | 
24 |     expect(chunks.length).toBeGreaterThan(0);
25 |     expect(chunks[0]).toHaveProperty('text');
26 |     expect(chunks[0]).toHaveProperty('tokenCount');
27 |     expect(chunks[0]).toHaveProperty('startIndex');
28 |     expect(chunks[0]).toHaveProperty('endIndex');
29 | 
30 |     // Verify chunks can reconstruct the file
31 |     const reconstructed = chunks.map(c => c.text).join('');
32 |     expect(reconstructed.length).toBeGreaterThan(0);
33 |   });
34 | });
35 | 
--------------------------------------------------------------------------------
/packages/cloud/tests/sentence.test.ts:
--------------------------------------------------------------------------------
1 | import { SentenceChunker } from '../src';
2 | import * as path from 'path';
3 | 
4 | describe.skipIf(!process.env.CHONKIE_API_KEY)('SentenceChunker', () => {
5 |   it('should chunk text successfully', async () => {
6 |     const chunker = new SentenceChunker({ chunkSize: 50 });
7 |     const text = 'First sentence here. Second sentence. Third one too.';
8 | 
9 |     const chunks = await chunker.chunk({ text });
10 | 
11 |     expect(chunks.length).toBeGreaterThan(0);
12 |     expect(chunks[0]).toHaveProperty('text');
13 |     expect(chunks[0]).toHaveProperty('tokenCount');
14 |     expect(chunks[0]).toHaveProperty('startIndex');
15 |     expect(chunks[0]).toHaveProperty('endIndex');
16 |   });
17 | 
18 |   it('should chunk file successfully with file upload', async () => {
19 |     const chunker = new SentenceChunker({ chunkSize: 150, minSentencesPerChunk: 2 });
20 |     const testFilePath = path.join(__dirname, 'fixtures', 'test-document.md');
21 | 
22 |     const chunks = await chunker.chunk({ filepath: testFilePath });
23 | 
24 |     expect(chunks.length).toBeGreaterThan(0);
25 |     expect(chunks[0]).toHaveProperty('text');
26 |     expect(chunks[0]).toHaveProperty('tokenCount');
27 |     expect(chunks[0]).toHaveProperty('startIndex');
28 |     expect(chunks[0]).toHaveProperty('endIndex');
29 | 
30 |     // Verify chunks can reconstruct the file
31 |     const reconstructed = chunks.map(c => c.text).join('');
32 |     expect(reconstructed.length).toBeGreaterThan(0);
33 |   });
34 | });
35 | 
--------------------------------------------------------------------------------
/packages/token/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "@chonkiejs/token",
3 |   "version": "0.0.3",
4 |   "description": "HuggingFace tokenizer support for Chonkie - extends @chonkiejs/core with real tokenization",
5 |   "license": "MIT",
6 |   "homepage": "https://docs.chonkie.ai",
7 |   "repository": {
8 |     "type": "git",
9 |     "url": "git+https://github.com/chonkie-inc/chonkie-ts.git",
10 |     "directory": "packages/token"
11 |   },
12 |   "author": "Bhavnick Minhas",
13 |   "type": "module",
14 |   "main": "./dist/index.js",
15 |   "types": "./dist/index.d.ts",
16 |   "exports": {
17 |     ".": {
18 |       "types": "./dist/index.d.ts",
19 |       "import": "./dist/index.js",
20 |       "default": "./dist/index.js"
21 |     }
22 |   },
23 |   "scripts": {
24 |     "clean": "rimraf dist",
25 |     "build": "npm run clean && tsc && tsc-alias -p tsconfig.json --resolve-full-paths",
26 |     "test": "echo 'No tests for @chonkiejs/token yet'"
27 |   },
28 |   "files": [
29 |     "dist"
30 |   ],
31 |   "keywords": [
32 |     "chonkie",
33 |     "tokenizer",
34 |     "huggingface",
35 |     "transformers",
36 |     "gpt2",
37 |     "bert"
38 |   ],
39 |   "peerDependencies": {
40 |     "@chonkiejs/core": ">=0.0.5"
41 |   },
42 |   "dependencies": {
43 |     "@huggingface/transformers": "^3.5.1"
44 |   },
45 |   "devDependencies": {
46 |     "@chonkiejs/core": "^0.0.5",
47 |     "rimraf": "^6.0.1",
48 |     "tsc-alias": "^1.8.16",
49 |     "tsx": "^4.20.6",
50 |     "typescript": "^5.9.3",
51 |     "vitest": "^3.2.4"
52 |   }
53 | }
54 | 
--------------------------------------------------------------------------------
/packages/cloud/tests/neural.test.ts:
--------------------------------------------------------------------------------
1 | import { NeuralChunker } from '../src';
2 | import * as path from 'path';
3 | 
4 | describe.skipIf(!process.env.CHONKIE_API_KEY)('NeuralChunker', () => {
5 |   it('should chunk text successfully', async () => {
6 |     const chunker = new NeuralChunker();
7 |     const text = 'Neural networks learn patterns. Deep learning is powerful. Transformers changed NLP. Modern AI is impressive.';
8 | 
9 |     const chunks = await chunker.chunk({ text });
10 | 
11 |     expect(chunks.length).toBeGreaterThan(0);
12 |     expect(chunks[0]).toHaveProperty('text');
13 |     expect(chunks[0]).toHaveProperty('tokenCount');
14 |     expect(chunks[0]).toHaveProperty('startIndex');
15 |     expect(chunks[0]).toHaveProperty('endIndex');
16 |   });
17 | 
18 |   it('should chunk file successfully with file upload', async () => {
19 |     const chunker = new NeuralChunker({ minCharactersPerChunk: 50 });
20 |     const testFilePath = path.join(__dirname, 'fixtures', 'test-document.md');
21 | 
22 |     const chunks = await chunker.chunk({ filepath: testFilePath });
23 | 
24 |     expect(chunks.length).toBeGreaterThan(0);
25 |     expect(chunks[0]).toHaveProperty('text');
26 |     expect(chunks[0]).toHaveProperty('tokenCount');
27 |     expect(chunks[0]).toHaveProperty('startIndex');
28 |     expect(chunks[0]).toHaveProperty('endIndex');
29 | 
30 |     // Verify chunks can reconstruct the file
31 |     const reconstructed = chunks.map(c => c.text).join('');
32 |     expect(reconstructed.length).toBeGreaterThan(0);
33 |   });
34 | });
35 | 
--------------------------------------------------------------------------------
/packages/cloud/tests/semantic.test.ts:
--------------------------------------------------------------------------------
1 | import { SemanticChunker } from '../src';
2 | import * as path from 'path';
3 | 
4 | describe.skipIf(!process.env.CHONKIE_API_KEY)('SemanticChunker', () => {
5 |   it('should chunk text successfully', async () => {
6 |     const chunker = new SemanticChunker({ chunkSize: 60 });
7 |     const text = 'AI is advancing rapidly. Technology continues to evolve. Climate change needs attention. Environmental issues are critical.';
8 | 
9 |     const chunks = await chunker.chunk({ text });
10 | 
11 |     expect(chunks.length).toBeGreaterThan(0);
12 |     expect(chunks[0]).toHaveProperty('text');
13 |     expect(chunks[0]).toHaveProperty('tokenCount');
14 |     expect(chunks[0]).toHaveProperty('startIndex');
15 |     expect(chunks[0]).toHaveProperty('endIndex');
16 |   });
17 | 
18 |   it('should chunk file successfully with file upload', async () => {
19 |     const chunker = new SemanticChunker({ chunkSize: 200, threshold: 0.5 });
20 |     const testFilePath = path.join(__dirname, 'fixtures', 'test-document.md');
21 | 
22 |     const chunks = await chunker.chunk({ filepath: testFilePath });
23 | 
24 |     expect(chunks.length).toBeGreaterThan(0);
25 |     expect(chunks[0]).toHaveProperty('text');
26 |     expect(chunks[0]).toHaveProperty('tokenCount');
27 |     expect(chunks[0]).toHaveProperty('startIndex');
28 |     expect(chunks[0]).toHaveProperty('endIndex');
29 | 
30 |     // Verify chunks can reconstruct the file
31 |     const reconstructed = chunks.map(c => c.text).join('');
32 |     expect(reconstructed.length).toBeGreaterThan(0);
33 |   });
34 | });
35 | 
--------------------------------------------------------------------------------
/packages/cloud/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "@chonkiejs/cloud",
3 |   "version": "0.1.0",
4 |   "description": "Cloud-based chunkers for Chonkie - semantic, neural, and AI-powered text chunking via api.chonkie.ai",
5 |   "license": "MIT",
6 |   "homepage": "https://docs.chonkie.ai",
7 |   "repository": {
8 |     "type": "git",
9 |     "url": "git+https://github.com/chonkie-inc/chonkie-ts.git",
10 |     "directory": "packages/cloud"
11 |   },
12 |   "author": "Bhavnick Minhas",
13 |   "type": "module",
14 |   "main": "./dist/index.js",
15 |   "types": "./dist/index.d.ts",
16 |   "exports": {
17 |     ".": {
"./dist/index.d.ts", 19 | "import": "./dist/index.js", 20 | "default": "./dist/index.js" 21 | } 22 | }, 23 | "scripts": { 24 | "clean": "rimraf dist", 25 | "build": "npm run clean && tsc && tsc-alias -p tsconfig.json --resolve-full-paths", 26 | "test": "vitest run" 27 | }, 28 | "files": [ 29 | "dist" 30 | ], 31 | "keywords": [ 32 | "chonkie", 33 | "chunking", 34 | "cloud", 35 | "semantic", 36 | "neural", 37 | "ai", 38 | "embeddings", 39 | "api" 40 | ], 41 | "peerDependencies": { 42 | "@chonkiejs/core": ">=0.0.5" 43 | }, 44 | "devDependencies": { 45 | "@chonkiejs/core": "^0.0.5", 46 | "@types/mime-types": "^3.0.1", 47 | "@types/node": "^22.15.21", 48 | "rimraf": "^6.0.1", 49 | "tsc-alias": "^1.8.16", 50 | "tsx": "^4.20.6", 51 | "typescript": "^5.9.3", 52 | "vite-tsconfig-paths": "^5.1.4", 53 | "vitest": "^3.2.4" 54 | }, 55 | "dependencies": { 56 | "mime-types": "^3.0.1" 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /packages/cloud/tests/code.test.ts: -------------------------------------------------------------------------------- 1 | import { CodeChunker } from '../src'; 2 | import * as path from 'path'; 3 | 4 | describe.skipIf(!process.env.CHONKIE_API_KEY)('CodeChunker', () => { 5 | it('should chunk TypeScript code successfully', async () => { 6 | const chunker = new CodeChunker({ language: 'typescript', chunkSize: 100 }); 7 | const code = ` 8 | function hello() { 9 | console.log('Hello world'); 10 | } 11 | 12 | class Example { 13 | constructor() { 14 | this.value = 42; 15 | } 16 | } 17 | `.trim(); 18 | 19 | const chunks = await chunker.chunk({ text: code }); 20 | 21 | expect(chunks.length).toBeGreaterThan(0); 22 | expect(chunks[0]).toHaveProperty('text'); 23 | expect(chunks[0]).toHaveProperty('tokenCount'); 24 | expect(chunks[0]).toHaveProperty('startIndex'); 25 | expect(chunks[0]).toHaveProperty('endIndex'); 26 | }); 27 | 28 | it('should chunk TypeScript file successfully with file upload', async () => { 29 | const chunker = new CodeChunker({ language: 'typescript', chunkSize: 200 }); 30 | const testFilePath = path.join(__dirname, 'fixtures', 'test-code.js'); 31 | 32 | const chunks = await chunker.chunk({ filepath: testFilePath }); 33 | 34 | expect(chunks.length).toBeGreaterThan(0); 35 | expect(chunks[0]).toHaveProperty('text'); 36 | expect(chunks[0]).toHaveProperty('tokenCount'); 37 | expect(chunks[0]).toHaveProperty('startIndex'); 38 | expect(chunks[0]).toHaveProperty('endIndex'); 39 | 40 | // Verify chunks can reconstruct the file 41 | const reconstructed = chunks.map(c => c.text).join(''); 42 | expect(reconstructed.length).toBeGreaterThan(0); 43 | }); 44 | }); 45 | -------------------------------------------------------------------------------- /legacy/chonkie/types/code.ts: -------------------------------------------------------------------------------- 1 | import { Chunk } from './base'; 2 | 3 | /** Interface for tree-sitter Node */ 4 | export interface TreeSitterNode { 5 | // This will be defined by tree-sitter when imported 6 | [key: string]: any; 7 | } 8 | 9 | /** Interface for CodeChunk data */ 10 | interface CodeChunkData { 11 | text: string; 12 | startIndex: number; 13 | endIndex: number; 14 | tokenCount: number; 15 | lang?: string; 16 | nodes?: TreeSitterNode[]; 17 | } 18 | 19 | /** Class to represent code chunks with metadata */ 20 | export class CodeChunk extends Chunk { 21 | /** The programming language of the code */ 22 | public lang?: string; 23 | /** The tree-sitter AST nodes in the chunk */ 24 | public nodes?: 
24 |   public nodes?: TreeSitterNode[];
25 | 
26 |   constructor(data: {
27 |     text: string;
28 |     startIndex: number;
29 |     endIndex: number;
30 |     tokenCount: number;
31 |     lang?: string;
32 |     nodes?: TreeSitterNode[];
33 |   }) {
34 |     super(data);
35 |     this.lang = data.lang;
36 |     this.nodes = data.nodes;
37 |   }
38 | 
39 |   /** Return a string representation of the CodeChunk */
40 |   public toString(): string {
41 |     return `CodeChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, lang=${this.lang}, nodes=${this.nodes})`;
42 |   }
43 | 
44 |   /** Return the CodeChunk as a dictionary-like object */
45 |   public toDict(): CodeChunkData {
46 |     const baseDict = super.toDict();
47 |     return {
48 |       ...baseDict,
49 |       lang: this.lang,
50 |       nodes: this.nodes,
51 |     };
52 |   }
53 | 
54 |   /** Create a CodeChunk object from a dictionary */
55 |   public static fromDict(data: CodeChunkData): CodeChunk {
56 |     return new CodeChunk(data);
57 |   }
58 | }
59 | 
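A brief sketch of the `toDict()`/`fromDict()` round-trip implemented above. The values are illustrative only, and the `chonkie/types` import path follows the comment in `legacy/chonkie/index.ts`:

```ts
import { CodeChunk } from 'chonkie/types';

// Build a chunk, serialize it to a plain object, and rehydrate it.
const chunk = new CodeChunk({
  text: "function hello() { return 'hi'; }",
  startIndex: 0,
  endIndex: 33,
  tokenCount: 9, // illustrative count
  lang: 'typescript',
});

const data = chunk.toDict();               // plain object, safe to JSON.stringify
const restored = CodeChunk.fromDict(data); // equivalent CodeChunk instance
console.log(restored.lang);                // "typescript"
```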
--------------------------------------------------------------------------------
/packages/cloud/src/index.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @chonkiejs/cloud
3 |  * Cloud-based chunkers and refineries for Chonkie via api.chonkie.ai
4 |  */
5 | 
6 | // Base
7 | export { CloudBaseChunker } from '@/base';
8 | export type { CloudClientConfig, ChunkerInput } from '@/base';
9 | 
10 | // Utils
11 | export { createFileReference } from '@/utils';
12 | export type { FileReference, FileUploadResponse } from '@/utils';
13 | 
14 | // Chunkers
15 | export { TokenChunker } from '@/chunkers/token';
16 | export type { TokenChunkerOptions } from '@/chunkers/token';
17 | 
18 | export { SentenceChunker } from '@/chunkers/sentence';
19 | export type { SentenceChunkerOptions } from '@/chunkers/sentence';
20 | 
21 | export { RecursiveChunker } from '@/chunkers/recursive';
22 | export type { RecursiveChunkerOptions } from '@/chunkers/recursive';
23 | 
24 | export { SemanticChunker } from '@/chunkers/semantic';
25 | export type { SemanticChunkerOptions } from '@/chunkers/semantic';
26 | 
27 | export { NeuralChunker } from '@/chunkers/neural';
28 | export type { NeuralChunkerOptions } from '@/chunkers/neural';
29 | 
30 | export { CodeChunker } from '@/chunkers/code';
31 | export type { CodeChunkerOptions } from '@/chunkers/code';
32 | 
33 | export { LateChunker } from '@/chunkers/late';
34 | export type { LateChunkerOptions } from '@/chunkers/late';
35 | 
36 | // Refineries
37 | export { EmbeddingsRefinery } from '@/refineries/embeddings';
38 | export type { EmbeddingsRefineryOptions } from '@/refineries/embeddings';
39 | 
40 | export { OverlapRefinery } from '@/refineries/overlap';
41 | export type { OverlapRefineryOptions } from '@/refineries/overlap';
42 | 
43 | // Pipeline
44 | export { Pipeline } from '@/pipeline';
45 | export type { PipelineOptions, PipelineStep, PipelineValidationResult } from '@/pipeline';
46 | 
--------------------------------------------------------------------------------
/packages/cloud/examples/token.example.ts:
--------------------------------------------------------------------------------
1 | import { TokenChunker } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing TokenChunker with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     // Create chunker (will use CHONKIE_API_KEY from environment)
8 |     const chunker = new TokenChunker({
9 |       chunkSize: 50,
10 |       chunkOverlap: 10,
11 |       tokenizer: 'gpt2'
12 |     });
13 | 
14 |     console.log('✅ TokenChunker created successfully');
15 |     console.log(`Config: ${chunker.toString()}\n`);
16 | 
17 |     // Test chunking
18 |     const text = 'This is a simple test to verify that the Chonkie cloud API is working correctly. We are testing the TokenChunker to make sure it can split text into token-based chunks using the remote API.';
19 | 
20 |     console.log(`📝 Input text (${text.length} chars):`);
21 |     console.log(`"${text}"\n`);
22 | 
23 |     console.log('🔄 Sending request to api.chonkie.ai...\n');
24 | 
25 |     const chunks = await chunker.chunk({ text });
26 | 
27 |     console.log(`✅ Received ${chunks.length} chunks:\n`);
28 | 
29 |     chunks.forEach((chunk, index) => {
30 |       console.log(`Chunk ${index + 1}:`);
31 |       console.log(`  Tokens: ${chunk.tokenCount}`);
32 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
33 |       console.log(`  Text: "${chunk.text}"`);
34 |       console.log();
35 |     });
36 | 
37 |     // Verify reconstruction
38 |     const reconstructed = chunks.map(c => c.text).join('');
39 |     const matches = reconstructed === text;
40 |     console.log(`🔍 Reconstruction: ${matches ? '✅ Perfect match' : '❌ Mismatch'}`);
41 | 
42 |   } catch (error) {
43 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
44 |     process.exit(1);
45 |   }
46 | }
47 | 
48 | main();
49 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/base.ts:
--------------------------------------------------------------------------------
1 | /** Base cloud client for Chonkie API. */
2 | 
3 | export interface CloudClientConfig {
4 |   apiKey: string;
5 |   baseUrl?: string;
6 | }
7 | 
8 | export interface ChunkerInput {
9 |   text?: string;
10 |   filepath?: string;
11 | }
12 | 
13 | export class CloudClient {
14 |   private readonly apiKey: string;
15 |   private readonly baseUrl: string;
16 | 
17 |   constructor(config: CloudClientConfig) {
18 |     this.apiKey = config.apiKey;
19 |     this.baseUrl = config.baseUrl || "https://api.chonkie.ai";
20 |   }
21 | 
22 |   protected async request<T>(
23 |     endpoint: string,
24 |     options: {
25 |       method?: string;
26 |       body?: any;
27 |       headers?: Record<string, string>;
28 |     } = {}
29 |   ): Promise<T> {
30 |     const { method = "POST", body, headers = {} } = options;
31 | 
32 |     // Don't set Content-Type or stringify body if it's FormData
33 |     const isFormData = body instanceof FormData;
34 |     const requestHeaders = {
35 |       "Authorization": `Bearer ${this.apiKey}`,
36 |       ...headers,
37 |     };
38 | 
39 |     const response = await fetch(`${this.baseUrl}${endpoint}`, {
40 |       method,
41 |       headers: requestHeaders,
42 |       body: isFormData ? body : (body ? JSON.stringify(body) : undefined),
43 |     });
44 | 
45 |     if (!response.ok) {
46 |       const error = await response.json().catch(() => ({ message: "Unknown error" }));
47 |       throw new Error(`API request failed: ${error.message}`);
48 |     }
49 | 
50 |     return response.json();
51 |   }
52 | 
53 |   protected async validateAuth(): Promise<boolean> {
54 |     try {
55 |       const response = await this.request<{ message: string; status: number }>("/v1/auth/validate");
56 |       return response.status === 200;
57 |     } catch (error) {
58 |       return false;
59 |     }
60 |   }
61 | }
--------------------------------------------------------------------------------
/packages/core/examples/token.example.ts:
--------------------------------------------------------------------------------
1 | import { TokenChunker } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing TokenChunker (Character-based)\n');
5 | 
6 |   try {
7 |     const chunker = await TokenChunker.create({
8 |       chunkSize: 50,
9 |       chunkOverlap: 10
10 |     });
11 | 
12 |     console.log('✅ TokenChunker created');
13 |     console.log(`Config: ${chunker.toString()}\n`);
14 | 
15 |     const text = 'This is a test of the TokenChunker. It splits text into fixed-size token chunks. With character-based tokenization, each character is one token.';
16 | 
17 |     console.log(`📝 Input (${text.length} chars):`);
18 |     console.log(`"${text}"\n`);
19 | 
20 |     const chunks = await chunker.chunk(text);
21 | 
22 |     console.log(`✅ Created ${chunks.length} chunks:\n`);
23 | 
24 |     chunks.forEach((chunk, index) => {
25 |       console.log(`Chunk ${index + 1}:`);
26 |       console.log(`  Tokens: ${chunk.tokenCount}`);
27 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
28 |       console.log(`  Text: "${chunk.text}"`);
29 |       console.log();
30 |     });
31 | 
32 |     // Test dynamic tokenizer (will show helpful error)
33 |     console.log('='.repeat(60));
34 |     console.log('\n🔍 Testing dynamic tokenizer detection:\n');
35 | 
36 |     try {
37 |       const gpt2Chunker = await TokenChunker.create({
38 |         tokenizer: 'gpt2',
39 |         chunkSize: 50
40 |       });
41 |       console.log('✅ GPT-2 tokenizer loaded (you have @chonkiejs/token installed!)');
42 |     } catch (error) {
43 |       console.log('ℹ️ Expected behavior - @chonkiejs/token not installed:');
44 |       console.log((error as Error).message);
45 |     }
46 | 
47 |   } catch (error) {
48 |     console.error('❌ Error:', error);
49 |     process.exit(1);
50 |   }
51 | }
52 | 
53 | main();
54 | 
--------------------------------------------------------------------------------
/legacy/chonkie/friends/utils.ts:
--------------------------------------------------------------------------------
1 | /** Utility functions for Chonkie's Handshakes. */
2 | 
3 | const ADJECTIVES = [
4 |   "happy", "chonky", "splashy", "munchy", "muddy", "groovy", "bubbly",
5 |   "swift", "lazy", "hungry", "glowing", "radiant", "mighty", "gentle",
6 |   "whimsical", "snug", "plump", "jovial", "sleepy", "sunny", "peppy",
7 |   "breezy", "sneaky", "clever", "peaceful", "dreamy",
8 | ];
9 | 
10 | const VERBS = [
11 |   "chomping", "splashing", "munching", "wading", "floating", "drifting", "chunking",
12 |   "slicing", "dancing", "wandering", "sleeping", "dreaming", "gliding", "swimming",
13 |   "bubbling", "giggling", "jumping", "diving", "hopping", "skipping", "trotting", "sneaking",
14 |   "exploring", "nibbling", "resting",
15 | ];
16 | 
17 | const NOUNS = [
18 |   "hippo", "river", "chunk", "lilypad", "mudbath", "stream", "pod", "chomp",
19 |   "byte", "fragment", "slice", "splash", "nugget", "lagoon", "marsh",
20 |   "pebble", "ripple", "cluster", "patch", "parcel", "meadow", "glade",
21 |   "puddle", "nook", "bite", "whisper", "journey", "haven", "buddy", "pal",
22 |   "snack", "secret"
23 | ];
24 | 
25 | /**
26 |  * Generate a random, fun, 3-part Chonkie-themed name (Adj-Verb-Noun).
27 |  *
28 |  * Combines one random adjective, one random verb, and one random noun from
29 |  * predefined lists, joined by a separator.
30 |  *
31 |  * @param sep - The separator to use between the words. Defaults to "-".
32 |  * @returns A randomly generated collection name string (e.g., "happy-splashing-hippo").
33 |  */
34 | export function generateRandomCollectionName(sep: string = "-"): string {
35 |   const adjective = ADJECTIVES[Math.floor(Math.random() * ADJECTIVES.length)];
36 |   const verb = VERBS[Math.floor(Math.random() * VERBS.length)];
37 |   const noun = NOUNS[Math.floor(Math.random() * NOUNS.length)];
38 | 
39 |   return `${adjective}${sep}${verb}${sep}${noun}`;
40 | }
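A quick usage sketch of the helper above. Output is random, so the names in the comments are examples only; the `chonkie/friends` import path follows the comment in `legacy/chonkie/index.ts`:

```ts
import { generateRandomCollectionName } from 'chonkie/friends';

const name = generateRandomCollectionName();      // e.g. "chonky-wading-lilypad"
const snake = generateRandomCollectionName('_');  // e.g. "sunny_splashing_hippo"
```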
--------------------------------------------------------------------------------
/legacy/chonkie/types/late.ts:
--------------------------------------------------------------------------------
1 | import { RecursiveChunk } from './recursive';
2 | 
3 | /** Interface for LateChunk data */
4 | interface LateChunkData {
5 |   text: string;
6 |   startIndex: number;
7 |   endIndex: number;
8 |   tokenCount: number;
9 |   embedding?: number[];
10 | }
11 | 
12 | /** Class to represent the late chunk
13 |  *
14 |  * @class LateChunk
15 |  */
16 | export class LateChunk extends RecursiveChunk {
17 |   /** The embedding of the chunk */
18 |   public embedding?: number[];
19 | 
20 |   constructor(data: {
21 |     text: string;
22 |     startIndex: number;
23 |     endIndex: number;
24 |     tokenCount: number;
25 |     embedding?: number[];
26 |   }) {
27 |     super(data);
28 |     this.embedding = data.embedding ?? undefined;
29 |   }
30 | 
31 |   /**
32 |    * Return a string representation of the LateChunk
33 |    *
34 |    * @returns {string} The string representation of the LateChunk.
35 |    */
36 |   public toString(): string {
37 |     return `LateChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, embedding=${this.embedding})`;
38 |   }
39 | 
40 |   /**
41 |    * Return the LateChunk as a dictionary-like object
42 |    *
43 |    * @returns {LateChunkData} The dictionary-like object.
44 |    */
45 |   public toDict(): LateChunkData {
46 |     return {
47 |       text: this.text,
48 |       startIndex: this.startIndex,
49 |       endIndex: this.endIndex,
50 |       tokenCount: this.tokenCount,
51 |       embedding: this.embedding,
52 |     };
53 |   }
54 | 
55 | 
56 |   /**
57 |    * Create a LateChunk object from a dictionary-like object.
58 |    *
59 |    * @param {LateChunkData} data - The dictionary-like object.
60 |    * @returns {LateChunk} The LateChunk object.
61 |    */
62 |   public static fromDict(data: LateChunkData): LateChunk {
63 |     return new LateChunk({
64 |       text: data.text,
65 |       startIndex: data.startIndex,
66 |       endIndex: data.endIndex,
67 |       tokenCount: data.tokenCount,
68 |       embedding: data.embedding,
69 |     });
70 |   }
71 | }
--------------------------------------------------------------------------------
/packages/core/examples/with-huggingface.example.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * This example demonstrates using RecursiveChunker with HuggingFace tokenizers.
3 |  * Requires: npm install @chonkiejs/token
4 |  */
5 | 
6 | import { RecursiveChunker, TokenChunker } from '../src';
7 | 
8 | async function main() {
9 |   console.log('🦛 Testing Chonkie with HuggingFace Tokenizers\n');
10 |   console.log('Note: This requires @chonkiejs/token to be installed\n');
11 |   console.log('='.repeat(60));
12 | 
13 |   const text = 'This is a test. We are testing GPT-2 tokenization with Chonkie!';
14 | 
15 |   // Test 1: TokenChunker with GPT-2
16 |   console.log('\n📝 Test 1: TokenChunker with GPT-2\n');
17 |   try {
18 |     const tokenChunker = await TokenChunker.create({
19 |       tokenizer: 'Xenova/gpt2',
20 |       chunkSize: 10
21 |     });
22 | 
23 |     console.log('✅ GPT-2 tokenizer loaded');
24 |     const chunks = await tokenChunker.chunk(text);
25 |     console.log(`Created ${chunks.length} chunks`);
26 |     chunks.forEach((c, i) => {
27 |       console.log(`  ${i + 1}. [${c.tokenCount} tokens]: "${c.text}"`);
28 |     });
29 |   } catch (error) {
30 |     console.log('ℹ️ Expected if @chonkiejs/token not installed:');
31 |     console.log((error as Error).message);
32 |   }
33 | 
34 |   // Test 2: RecursiveChunker with GPT-2
35 |   console.log('\n' + '='.repeat(60));
36 |   console.log('\n📝 Test 2: RecursiveChunker with GPT-2\n');
37 |   try {
38 |     const recursiveChunker = await RecursiveChunker.create({
39 |       tokenizer: 'Xenova/gpt2',
40 |       chunkSize: 15
41 |     });
42 | 
43 |     console.log('✅ GPT-2 tokenizer loaded');
44 |     const chunks = await recursiveChunker.chunk(text);
45 |     console.log(`Created ${chunks.length} chunks`);
46 |     chunks.forEach((c, i) => {
47 |       console.log(`  ${i + 1}. [${c.tokenCount} tokens]: "${c.text}"`);
48 |     });
49 |   } catch (error) {
50 |     console.log('ℹ️ Expected if @chonkiejs/token not installed:');
51 |     console.log((error as Error).message);
52 |   }
53 | 
54 |   console.log('\n' + '='.repeat(60));
55 |   console.log('\n🎉 Example completed!\n');
56 | }
57 | 
58 | main();
59 | 
--------------------------------------------------------------------------------
/packages/cloud/examples/embeddings-refinery.example.ts:
--------------------------------------------------------------------------------
1 | import { TokenChunker, EmbeddingsRefinery } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing EmbeddingsRefinery with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     // Step 1: Create chunks
8 |     console.log('Step 1: Creating chunks...\n');
9 | 
10 |     const chunker = new TokenChunker({ chunkSize: 50 });
11 |     const text = 'Artificial intelligence is transforming technology. Machine learning enables new possibilities. Neural networks process complex patterns.';
12 | 
13 |     console.log(`📝 Input (${text.length} chars):`);
14 |     console.log(`"${text}"\n`);
15 | 
16 |     const chunks = await chunker.chunk({ text });
17 |     console.log(`✅ Created ${chunks.length} chunks\n`);
18 | 
19 |     chunks.forEach((chunk, i) => {
20 |       console.log(`Chunk ${i + 1}: "${chunk.text}" (${chunk.tokenCount} tokens)`);
21 |     });
22 | 
23 |     // Step 2: Add embeddings
24 |     console.log('\n' + '='.repeat(60));
25 |     console.log('\nStep 2: Adding embeddings to chunks...\n');
26 | 
27 |     const refinery = new EmbeddingsRefinery({
28 |       embeddingModel: 'sentence-transformers/all-MiniLM-L6-v2'
29 |     });
30 | 
31 |     console.log('✅ EmbeddingsRefinery created');
32 |     console.log(`Config: ${refinery.toString()}\n`);
33 | 
34 |     console.log('🔄 Calling API to add embeddings...\n');
35 | 
36 |     const refinedChunks = await refinery.refine(chunks);
37 | 
38 |     console.log(`✅ Refined ${refinedChunks.length} chunks with embeddings\n`);
39 | 
40 |     refinedChunks.forEach((chunk, i) => {
41 |       console.log(`Refined Chunk ${i + 1}:`);
42 |       console.log(`  Text: "${chunk.text}"`);
43 |       console.log(`  Tokens: ${chunk.tokenCount}`);
44 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
45 |       console.log();
46 |     });
47 | 
48 |     console.log('='.repeat(60));
49 |     console.log('\n🎉 Embeddings successfully added!\n');
50 | 
51 |   } catch (error) {
52 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
53 |     process.exit(1);
54 |   }
55 | }
56 | 
57 | main();
58 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/neural.ts:
--------------------------------------------------------------------------------
1 | /** Neural chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { Chunk } from "../types/base";
5 | 
6 | export interface NeuralChunkerConfig {
7 |   model?: string;
8 |   minCharactersPerChunk?: number;
9 | }
10 | 
11 | export class NeuralChunker extends CloudClient {
12 |   private readonly config: Required<NeuralChunkerConfig>;
13 | 
14 |   constructor(apiKey: string, config: NeuralChunkerConfig = {}) {
15 |     super({ apiKey });
16 |     this.config = {
17 |       model: config.model || "mirth/chonky_modernbert_large_1",
18 |       minCharactersPerChunk: config.minCharactersPerChunk || 10,
19 |     };
20 |   }
21 | 
22 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
23 |     const formData = new FormData();
24 | 
25 |     if (input.filepath) {
26 |       formData.append("file", input.filepath);
27 |     } else if (input.text) {
28 |       // JSON encode the text
29 |       formData.append("text", JSON.stringify(input.text));
30 |       // Append empty file to ensure multipart form
31 |       formData.append("file", new Blob(), "text_input.txt");
32 |     } else {
33 |       throw new Error("Either text or file must be provided");
34 |     }
35 | 
36 |     formData.append("embedding_model", this.config.model);
37 |     formData.append("min_characters_per_chunk", this.config.minCharactersPerChunk.toString());
38 |     formData.append("return_type", "chunks");
39 | 
40 |     const data = await this.request("/v1/chunk/neural", {
41 |       method: "POST",
42 |       body: formData,
43 |     });
44 | 
45 |     // Convert from snake_case to camelCase
46 |     const camelCaseData = data.map((chunk: any) => {
47 |       return {
48 |         text: chunk.text,
49 |         startIndex: chunk.start_index,
50 |         endIndex: chunk.end_index,
51 |         tokenCount: chunk.token_count,
52 |         embedding: chunk.embedding || undefined,
53 |       };
54 |     });
55 | 
56 |     return camelCaseData.map((chunk: any) => Chunk.fromDict(chunk));
57 |   }
58 | 
59 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
60 |     return Promise.all(inputs.map(input => this.chunk(input)));
61 |   }
62 | }
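// Usage sketch (editorial addition, not part of the file above; assumes a valid
// Chonkie API key — note this legacy client takes it as a positional argument):
//
//   const chunker = new NeuralChunker(process.env.CHONKIE_API_KEY!);
//   const chunks = await chunker.chunk({ text: 'Some long document...' });
//   const batches = await chunker.chunkBatch([{ text: 'doc one' }, { text: 'doc two' }]);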
--------------------------------------------------------------------------------
/packages/cloud/examples/overlap-refinery.example.ts:
--------------------------------------------------------------------------------
1 | import { TokenChunker, OverlapRefinery } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing OverlapRefinery with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     // Step 1: Create chunks
8 |     console.log('Step 1: Creating chunks without overlap...\n');
9 | 
10 |     const chunker = new TokenChunker({
11 |       chunkSize: 40,
12 |       chunkOverlap: 0 // No overlap initially
13 |     });
14 | 
15 |     const text = 'The quick brown fox jumps over the lazy dog. This sentence demonstrates overlap refinement. Context is preserved across boundaries.';
16 | 
17 |     console.log(`📝 Input (${text.length} chars):`);
18 |     console.log(`"${text}"\n`);
19 | 
20 |     const chunks = await chunker.chunk({ text });
21 |     console.log(`✅ Created ${chunks.length} chunks (no overlap)\n`);
22 | 
23 |     chunks.forEach((chunk, i) => {
24 |       console.log(`Chunk ${i + 1}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? '...' : ''}" (${chunk.tokenCount} tokens)`);
25 |     });
26 | 
27 |     // Step 2: Add overlap
28 |     console.log('\n' + '='.repeat(60));
29 |     console.log('\nStep 2: Adding overlap for context...\n');
30 | 
31 |     const refinery = new OverlapRefinery({
32 |       contextSize: 0.25, // 25% overlap
33 |       mode: 'token',
34 |       method: 'suffix'
35 |     });
36 | 
37 |     console.log('✅ OverlapRefinery created');
38 |     console.log(`Config: ${refinery.toString()}\n`);
39 | 
40 |     console.log('🔄 Calling API to add overlap...\n');
41 | 
42 |     const refinedChunks = await refinery.refine(chunks);
43 | 
44 |     console.log(`✅ Refined ${refinedChunks.length} chunks with overlap\n`);
45 | 
46 |     refinedChunks.forEach((chunk, i) => {
47 |       console.log(`Refined Chunk ${i + 1}:`);
48 |       console.log(`  Text: "${chunk.text.substring(0, 60)}${chunk.text.length > 60 ? '...' : ''}"`);
49 |       console.log(`  Tokens: ${chunk.tokenCount}`);
50 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
51 |       console.log();
52 |     });
53 | 
54 |     console.log('='.repeat(60));
55 |     console.log('\n💡 Notice: Chunks now have overlapping context for better coherence!\n');
56 | 
57 |   } catch (error) {
58 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
59 |     process.exit(1);
60 |   }
61 | }
62 | 
63 | main();
64 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/token.ts:
--------------------------------------------------------------------------------
1 | /** Token chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { Chunk } from "../types/base";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface TokenChunkerConfig {
9 |   tokenizer?: string;
10 |   chunkSize?: number;
11 |   chunkOverlap?: number;
12 | }
13 | 
14 | export class TokenChunker extends CloudClient {
15 |   private readonly config: Required<TokenChunkerConfig>;
16 | 
17 |   constructor(apiKey: string, config: TokenChunkerConfig = {}) {
18 |     super({ apiKey });
19 |     this.config = {
20 |       tokenizer: config.tokenizer || "gpt2",
21 |       chunkSize: config.chunkSize || 512,
22 |       chunkOverlap: config.chunkOverlap || 0,
23 |     };
24 |   }
25 | 
26 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
27 |     const formData = new FormData();
28 | 
29 |     if (input.filepath) {
30 |       const fileContent = fs.readFileSync(input.filepath);
31 |       const fileName = path.basename(input.filepath) || 'file.txt';
32 |       formData.append("file", new Blob([fileContent]), fileName);
33 |     } else if (input.text) {
34 |       formData.append("text", input.text);
35 |       // Append empty file to ensure multipart form
36 |       formData.append("file", new Blob(), "text_input.txt");
37 |     } else {
38 |       throw new Error("Either text or filepath must be provided");
39 |     }
40 | 
41 |     formData.append("tokenizer_or_token_counter", this.config.tokenizer);
42 |     formData.append("chunk_size", this.config.chunkSize.toString());
43 |     formData.append("chunk_overlap", this.config.chunkOverlap.toString());
44 |     formData.append("return_type", "chunks");
45 | 
46 |     const data = await this.request("/v1/chunk/token", {
47 |       method: "POST",
48 |       body: formData,
49 |     });
50 | 
51 |     // Convert from snake_case to camelCase
52 |     const camelCaseData = data.map((chunk: any) => {
53 |       return {
54 |         text: chunk.text,
55 |         startIndex: chunk.start_index,
56 |         endIndex: chunk.end_index,
57 |         tokenCount: chunk.token_count,
58 |         context: chunk.context || undefined,
59 |       };
60 |     });
61 | 
62 |     return camelCaseData.map((chunk: any) => Chunk.fromDict(chunk));
63 |   }
64 | 
65 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
66 |     return Promise.all(inputs.map(input => this.chunk(input)));
67 |   }
68 | }
--------------------------------------------------------------------------------
/packages/cloud/src/refineries/embeddings.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Embeddings refinery that adds embeddings to existing chunks
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from "@chonkiejs/core";
7 | import { CloudBaseChunker } from "@/base";
8 | 
9 | export interface EmbeddingsRefineryOptions {
10 |   /** Embedding model to use (default: "minishlab/potion-retrieval-32M") */
11 |   embeddingModel?: string;
12 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
13 |   apiKey?: string;
14 |   /** Base URL for API (default: "https://api.chonkie.ai") */
15 |   baseUrl?: string;
16 | }
17 | 
18 | interface ChunkData {
19 |   text: string;
20 |   start_index: number;
21 |   end_index: number;
22 |   token_count: number;
23 |   embedding?: number[];
24 | }
25 | 
26 | /**
27 |  * Post-processes chunks by adding embeddings to them.
28 |  */
29 | export class EmbeddingsRefinery extends CloudBaseChunker {
30 |   private readonly embeddingModel: string;
31 | 
32 |   constructor(options: EmbeddingsRefineryOptions = {}) {
33 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
34 |     if (!apiKey) {
35 |       throw new Error(
36 |         "API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable."
37 |       );
38 |     }
39 | 
40 |     super({ apiKey, baseUrl: options.baseUrl });
41 |     this.embeddingModel = options.embeddingModel || 'minishlab/potion-retrieval-32M';
42 |   }
43 | 
44 |   /**
45 |    * Add embeddings to existing chunks.
46 |    *
47 |    * @param chunks - Array of chunks to add embeddings to
48 |    * @returns Array of chunks with embeddings added
49 |    */
50 |   async refine(chunks: Chunk[]): Promise<Chunk[]> {
51 |     const chunkData = chunks.map((chunk) => ({
52 |       text: chunk.text,
53 |       start_index: chunk.startIndex,
54 |       end_index: chunk.endIndex,
55 |       token_count: chunk.tokenCount,
56 |     }));
57 | 
58 |     const response = await this.request<ChunkData[]>("/v1/refine/embeddings", {
59 |       method: "POST",
60 |       body: {
61 |         chunks: chunkData,
62 |         embedding_model: this.embeddingModel,
63 |       },
64 |     });
65 | 
66 |     return response.map(
67 |       (chunkData) =>
68 |         new Chunk({
69 |           text: chunkData.text,
70 |           startIndex: chunkData.start_index,
71 |           endIndex: chunkData.end_index,
72 |           tokenCount: chunkData.token_count,
73 |           embedding: chunkData.embedding,
74 |         })
75 |     );
76 |   }
77 | 
78 |   toString(): string {
79 |     return `EmbeddingsRefinery(embeddingModel=${this.embeddingModel})`;
80 |   }
81 | }
82 | 
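// Usage sketch (editorial addition, not part of the file above; assumes
// CHONKIE_API_KEY is set and `chunks` came from any chunker):
//
//   const refinery = new EmbeddingsRefinery({ embeddingModel: 'minishlab/potion-retrieval-32M' });
//   const refined = await refinery.refine(chunks);
//   console.log(refined[0].embedding?.length); // dimensionality of the returned vectors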
--------------------------------------------------------------------------------
/packages/cloud/tests/fixtures/test-code.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * Example TypeScript code for testing CodeChunker
3 |  */
4 | 
5 | export interface User {
6 |   id: number;
7 |   name: string;
8 |   email: string;
9 |   createdAt: Date;
10 | }
11 | 
12 | export class UserService {
13 |   private users: Map<number, User> = new Map();
14 |   private nextId: number = 1;
15 | 
16 |   /**
17 |    * Create a new user
18 |    */
19 |   async createUser(name: string, email: string): Promise<User> {
20 |     const user: User = {
21 |       id: this.nextId++,
22 |       name,
23 |       email,
24 |       createdAt: new Date(),
25 |     };
26 |     this.users.set(user.id, user);
27 |     return user;
28 |   }
29 | 
30 |   /**
31 |    * Get user by ID
32 |    */
33 |   async getUserById(id: number): Promise<User | undefined> {
34 |     return this.users.get(id);
35 |   }
36 | 
37 |   /**
38 |    * Update user information
39 |    */
40 |   async updateUser(id: number, updates: Partial<Omit<User, 'id'>>): Promise<User | null> {
41 |     const user = this.users.get(id);
42 |     if (!user) {
43 |       return null;
44 |     }
45 | 
46 |     const updatedUser = {
47 |       ...user,
48 |       ...updates,
49 |     };
50 | 
51 |     this.users.set(id, updatedUser);
52 |     return updatedUser;
53 |   }
54 | 
55 |   /**
56 |    * Delete a user
57 |    */
58 |   async deleteUser(id: number): Promise<boolean> {
59 |     return this.users.delete(id);
60 |   }
61 | 
62 |   /**
63 |    * Get all users
64 |    */
65 |   async getAllUsers(): Promise<User[]> {
66 |     return Array.from(this.users.values());
67 |   }
68 | 
69 |   /**
70 |    * Find users by email
71 |    */
72 |   async findUsersByEmail(email: string): Promise<User[]> {
73 |     return Array.from(this.users.values()).filter(
74 |       user => user.email.toLowerCase().includes(email.toLowerCase())
75 |     );
76 |   }
77 | }
78 | 
79 | export class AuthService {
80 |   private tokens: Map<string, number> = new Map();
81 | 
82 |   /**
83 |    * Generate authentication token
84 |    */
85 |   generateToken(userId: number): string {
86 |     const token = Math.random().toString(36).substring(2);
87 |     this.tokens.set(token, userId);
88 |     return token;
89 |   }
90 | 
91 |   /**
92 |    * Validate token and return user ID
93 |    */
94 |   validateToken(token: string): number | null {
95 |     return this.tokens.get(token) || null;
96 |   }
97 | 
98 |   /**
99 |    * Revoke a token
100 |    */
101 |   revokeToken(token: string): boolean {
102 |     return this.tokens.delete(token);
103 |   }
104 | }
105 | 
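// Editorial note (not part of the fixture above): fixtures like this are consumed
// by the cloud chunker tests such as code.test.ts, roughly:
//
//   const chunker = new CodeChunker({ language: 'javascript' });
//   const chunks = await chunker.chunk({ filepath: 'tests/fixtures/test-code.js' });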
--------------------------------------------------------------------------------
/packages/cloud/tests/fixtures/test-code.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Example TypeScript code for testing CodeChunker
3 |  */
4 | 
5 | export interface User {
6 |   id: number;
7 |   name: string;
8 |   email: string;
9 |   createdAt: Date;
10 | }
11 | 
12 | export class UserService {
13 |   private users: Map<number, User> = new Map();
14 |   private nextId: number = 1;
15 | 
16 |   /**
17 |    * Create a new user
18 |    */
19 |   async createUser(name: string, email: string): Promise<User> {
20 |     const user: User = {
21 |       id: this.nextId++,
22 |       name,
23 |       email,
24 |       createdAt: new Date(),
25 |     };
26 |     this.users.set(user.id, user);
27 |     return user;
28 |   }
29 | 
30 |   /**
31 |    * Get user by ID
32 |    */
33 |   async getUserById(id: number): Promise<User | undefined> {
34 |     return this.users.get(id);
35 |   }
36 | 
37 |   /**
38 |    * Update user information
39 |    */
40 |   async updateUser(id: number, updates: Partial<Omit<User, 'id'>>): Promise<User | null> {
41 |     const user = this.users.get(id);
42 |     if (!user) {
43 |       return null;
44 |     }
45 | 
46 |     const updatedUser = {
47 |       ...user,
48 |       ...updates,
49 |     };
50 | 
51 |     this.users.set(id, updatedUser);
52 |     return updatedUser;
53 |   }
54 | 
55 |   /**
56 |    * Delete a user
57 |    */
58 |   async deleteUser(id: number): Promise<boolean> {
59 |     return this.users.delete(id);
60 |   }
61 | 
62 |   /**
63 |    * Get all users
64 |    */
65 |   async getAllUsers(): Promise<User[]> {
66 |     return Array.from(this.users.values());
67 |   }
68 | 
69 |   /**
70 |    * Find users by email
71 |    */
72 |   async findUsersByEmail(email: string): Promise<User[]> {
73 |     return Array.from(this.users.values()).filter(
74 |       user => user.email.toLowerCase().includes(email.toLowerCase())
75 |     );
76 |   }
77 | }
78 | 
79 | export class AuthService {
80 |   private tokens: Map<string, number> = new Map();
81 | 
82 |   /**
83 |    * Generate authentication token
84 |    */
85 |   generateToken(userId: number): string {
86 |     const token = Math.random().toString(36).substring(2);
87 |     this.tokens.set(token, userId);
88 |     return token;
89 |   }
90 | 
91 |   /**
92 |    * Validate token and return user ID
93 |    */
94 |   validateToken(token: string): number | null {
95 |     return this.tokens.get(token) || null;
96 |   }
97 | 
98 |   /**
99 |    * Revoke a token
100 |    */
101 |   revokeToken(token: string): boolean {
102 |     return this.tokens.delete(token);
103 |   }
104 | }
105 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/overlap_refinery.ts:
--------------------------------------------------------------------------------
1 | import { CloudClient } from "./base";
2 | import { Chunk } from "../types/base";
3 | 
4 | 
5 | export interface OverlapRefineryConfig {
6 |   tokenizerOrTokenCounter?: string;
7 |   contextSize?: number;
8 |   mode?: "token" | "recursive";
9 |   method?: "suffix" | "prefix";
10 |   recipe?: string;
11 |   lang?: string;
12 |   merge?: boolean;
13 | }
14 | export class OverlapRefinery extends CloudClient {
15 |   private readonly config: Required<OverlapRefineryConfig>;
16 | 
17 |   constructor(apiKey: string, config: OverlapRefineryConfig = {}) {
18 |     super({ apiKey });
19 |     this.config = {
20 |       tokenizerOrTokenCounter: config.tokenizerOrTokenCounter || "character",
21 |       contextSize: config.contextSize ?? 0.25,
22 |       mode: config.mode || "token",
23 |       method: config.method || "suffix",
24 |       recipe: config.recipe || "default",
25 |       lang: config.lang || "en",
26 |       merge: config.merge ?? true,
27 |     };
28 |   }
29 | 
30 |   async refine(chunks: Chunk[]): Promise<Chunk[]> {
31 |     // Create snake cased chunks for the request
32 |     const snakeCasedChunks = chunks.map(chunk => {
33 |       return {
34 |         text: chunk.text,
35 |         start_index: chunk.startIndex,
36 |         end_index: chunk.endIndex,
37 |         token_count: chunk.tokenCount,
38 |       };
39 |     });
40 |     const response = await this.request("/v1/refine/overlap", {
41 |       body: {
42 |         chunks: snakeCasedChunks,
43 |         tokenizer_or_token_counter: this.config.tokenizerOrTokenCounter,
44 |         context_size: this.config.contextSize,
45 |         mode: this.config.mode,
46 |         method: this.config.method,
47 |         recipe: this.config.recipe,
48 |         lang: this.config.lang,
49 |         merge: this.config.merge,
50 |       },
51 |       headers: {
52 |         "Content-Type": "application/json",
53 |       },
54 |     });
55 |     // Merge the response chunks with the original chunks
56 |     const mergedChunks = response.map((chunk: any, index: number) => {
57 |       const originalChunk = chunks[index];
58 |       return {
59 |         ...originalChunk,
60 |         text: chunk.text,
61 |         startIndex: chunk.start_index,
62 |         endIndex: chunk.end_index,
63 |         tokenCount: chunk.token_count,
64 |       };
65 |     });
66 |     return mergedChunks;
67 |   }
68 | }
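// Usage sketch (editorial addition, not part of the file above; note this legacy
// client takes the API key as a positional argument):
//
//   const refinery = new OverlapRefinery('sk-...', { contextSize: 0.25, method: 'suffix' });
//   const refined = await refinery.refine(chunks); // chunks: Chunk[] from any chunker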
--------------------------------------------------------------------------------
/packages/token/src/huggingface.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * HuggingFace tokenizer implementation using @huggingface/transformers
3 |  */
4 | 
5 | import { AutoTokenizer, PreTrainedTokenizer } from '@huggingface/transformers';
6 | import { Tokenizer } from '@chonkiejs/core';
7 | 
8 | /**
9 |  * Tokenizer that uses HuggingFace transformers.js for tokenization.
10 |  *
11 |  * Extends the base Tokenizer interface from @chonkiejs/core to provide
12 |  * real tokenization using models like GPT-2, BERT, etc.
13 |  */
14 | export class HuggingFaceTokenizer extends Tokenizer {
15 |   private hfTokenizer: PreTrainedTokenizer;
16 |   private modelName: string;
17 | 
18 |   private constructor(hfTokenizer: PreTrainedTokenizer, modelName: string) {
19 |     super();
20 |     this.hfTokenizer = hfTokenizer;
21 |     this.modelName = modelName;
22 |   }
23 | 
24 |   /**
25 |    * Create a HuggingFace tokenizer instance.
26 |    *
27 |    * @param model - HuggingFace model name (e.g., 'gpt2', 'Xenova/gpt-4', 'bert-base-uncased')
28 |    * @returns Promise resolving to HuggingFaceTokenizer instance
29 |    *
30 |    * @example
31 |    * const tokenizer = await HuggingFaceTokenizer.create('gpt2');
32 |    * const tokenizer = await HuggingFaceTokenizer.create('Xenova/gpt-4');
33 |    */
34 |   static async create(model: string): Promise<HuggingFaceTokenizer> {
35 |     try {
36 |       const hfTokenizer = await AutoTokenizer.from_pretrained(model);
37 |       return new HuggingFaceTokenizer(hfTokenizer, model);
38 |     } catch (error) {
39 |       throw new Error(`Failed to load HuggingFace tokenizer "${model}": ${error instanceof Error ? error.message : error}`);
40 |     }
41 |   }
42 | 
43 |   /**
44 |    * Count tokens in text using HuggingFace tokenizer.
45 |    */
46 |   countTokens(text: string): number {
47 |     const encoded = this.hfTokenizer.encode(text) as number[];
48 |     return encoded.length;
49 |   }
50 | 
51 |   /**
52 |    * Encode text into token IDs.
53 |    */
54 |   encode(text: string): number[] {
55 |     return this.hfTokenizer.encode(text) as number[];
56 |   }
57 | 
58 |   /**
59 |    * Decode token IDs back into text.
60 |    */
61 |   decode(tokens: number[]): string {
62 |     return this.hfTokenizer.decode(tokens, { skip_special_tokens: true });
63 |   }
64 | 
65 |   /**
66 |    * Decode a batch of token arrays.
67 |    */
68 |   decodeBatch(tokensBatch: number[][]): string[] {
69 |     return tokensBatch.map(tokens => this.decode(tokens));
70 |   }
71 | 
72 |   /**
73 |    * Get the model name.
74 |    */
75 |   getModelName(): string {
76 |     return this.modelName;
77 |   }
78 | 
79 |   toString(): string {
80 |     return `HuggingFaceTokenizer(model=${this.modelName})`;
81 |   }
82 | }
83 | 
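// Usage sketch (editorial addition, not part of the file above):
//
//   const tok = await HuggingFaceTokenizer.create('Xenova/gpt2');
//   const ids = tok.encode('chonkie chunks text');
//   tok.countTokens('chonkie chunks text') === ids.length; // true
//   tok.decode(ids); // round-trips to the original text (modulo special tokens)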
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/code.ts:
--------------------------------------------------------------------------------
1 | /** Code chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { CodeChunk } from "../types/code";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface CodeChunkerConfig {
9 |   tokenizerOrTokenCounter?: string;
10 |   chunkSize?: number;
11 |   language: string;
12 |   includeNodes?: boolean;
13 | }
14 | 
15 | export class CodeChunker extends CloudClient {
16 |   private readonly config: Required<CodeChunkerConfig>;
17 | 
18 |   constructor(apiKey: string, config: CodeChunkerConfig) {
19 |     super({ apiKey });
20 |     if (!config.language) {
21 |       throw new Error("Language is required for code chunking");
22 |     }
23 |     this.config = {
24 |       tokenizerOrTokenCounter: config.tokenizerOrTokenCounter || "gpt2",
25 |       chunkSize: config.chunkSize || 1500,
26 |       language: config.language,
27 |       includeNodes: config.includeNodes ?? false,
28 |     };
29 |   }
30 | 
31 |   async chunk(input: ChunkerInput): Promise<CodeChunk[]> {
32 |     const formData = new FormData();
33 | 
34 |     if (input.filepath) {
35 |       const fileContent = fs.readFileSync(input.filepath);
36 |       const fileName = path.basename(input.filepath) || 'file.txt';
37 |       formData.append("file", new Blob([fileContent]), fileName);
38 |     } else if (input.text) {
39 |       formData.append("text", input.text);
40 |       // Append empty file to ensure multipart form
41 |       formData.append("file", new Blob(), "text_input.txt");
42 |     } else {
43 |       throw new Error("Either text or filepath must be provided");
44 |     }
45 | 
46 |     formData.append("tokenizer_or_token_counter", this.config.tokenizerOrTokenCounter);
47 |     formData.append("chunk_size", this.config.chunkSize.toString());
48 |     formData.append("language", this.config.language);
49 |     formData.append("include_nodes", this.config.includeNodes.toString());
50 | 
51 |     const data = await this.request("/v1/chunk/code", {
52 |       method: "POST",
53 |       body: formData,
54 |     });
55 | 
56 |     // Convert from snake_case to camelCase
57 |     const camelCaseData = data.map((chunk: any) => {
58 |       return {
59 |         text: chunk.text,
60 |         startIndex: chunk.start_index,
61 |         endIndex: chunk.end_index,
62 |         tokenCount: chunk.token_count,
63 |         nodes: chunk.nodes || undefined,
64 |         embedding: chunk.embedding || undefined,
65 |       };
66 |     });
67 | 
68 |     return camelCaseData.map((chunk: any) => CodeChunk.fromDict(chunk));
69 |   }
70 | 
71 |   async chunkBatch(inputs: ChunkerInput[]): Promise<CodeChunk[][]> {
72 |     return Promise.all(inputs.map(input => this.chunk(input)));
73 |   }
74 | }
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/late.ts:
--------------------------------------------------------------------------------
1 | /** Late chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { LateChunk } from "../types/late";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface LateChunkerConfig {
9 |   embeddingModel?: string;
10 |   chunkSize?: number;
11 |   recipe?: string;
12 |   lang?: string;
13 |   minCharactersPerChunk?: number;
14 | }
15 | 
16 | export class LateChunker extends CloudClient {
17 |   private readonly config: Required<LateChunkerConfig>;
18 | 
19 |   constructor(apiKey: string, config: LateChunkerConfig = {}) {
20 |     super({ apiKey });
21 |     this.config = {
22 |       embeddingModel: config.embeddingModel || "all-MiniLM-L6-v2",
23 |       chunkSize: config.chunkSize || 512,
24 |       recipe: config.recipe || "default",
25 |       lang: config.lang || "en",
26 |       minCharactersPerChunk: config.minCharactersPerChunk || 24,
27 |     };
28 |   }
29 | 
30 |   async chunk(input: ChunkerInput): Promise<LateChunk[]> {
31 |     const formData = new FormData();
32 | 
33 |     if (input.filepath) {
34 |       const fileContent = fs.readFileSync(input.filepath);
35 |       const fileName = path.basename(input.filepath) || 'file.txt';
36 |       formData.append("file", new Blob([fileContent]), fileName);
37 |     } else if (input.text) {
38 |       // JSON encode the text
39 |       formData.append("text", JSON.stringify(input.text));
40 |       // Append empty file to ensure multipart form
41 |       formData.append("file", new Blob(), "text_input.txt");
42 |     } else {
43 |       throw new Error("Either text or filepath must be provided");
44 |     }
45 | 
46 |     formData.append("embedding_model", this.config.embeddingModel);
47 |     formData.append("chunk_size", this.config.chunkSize.toString());
48 |     formData.append("recipe", this.config.recipe);
49 |     formData.append("lang", this.config.lang);
50 |     formData.append("min_characters_per_chunk", this.config.minCharactersPerChunk.toString());
51 | 
52 |     const data = await this.request("/v1/chunk/late", {
53 |       method: "POST",
54 |       body: formData,
55 |     });
56 | 
57 |     // Convert from snake_case to camelCase
58 |     const camelCaseData = data.map((chunk: any) => {
59 |       return {
60 |         text: chunk.text,
61 |         startIndex: chunk.start_index,
62 |         endIndex: chunk.end_index,
63 |         tokenCount: chunk.token_count,
64 |         embedding: chunk.embedding || undefined,
65 |       };
66 |     });
67 | 
68 |     return camelCaseData.map((chunk: any) => LateChunk.fromDict(chunk));
69 |   }
70 | 
71 |   async chunkBatch(inputs: ChunkerInput[]): Promise<LateChunk[][]> {
72 |     return Promise.all(inputs.map(input => this.chunk(input)));
73 |   }
74 | }
75 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/slumber.ts:
--------------------------------------------------------------------------------
1 | /** Slumber chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { Chunk } from "../types/base";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface SlumberChunkerConfig {
9 |   tokenizerOrTokenCounter?: string;
10 |   chunkSize?: number;
11 |   candidateSize?: number;
12 |   minCharactersPerChunk?: number;
13 | }
14 | 
15 | export class SlumberChunker extends CloudClient {
16 |   private readonly config: Required<SlumberChunkerConfig>;
17 | 
18 |   constructor(apiKey: string, config: SlumberChunkerConfig = {}) {
19 |     super({ apiKey });
20 |     this.config = {
21 |       tokenizerOrTokenCounter: config.tokenizerOrTokenCounter || "gpt2",
22 |       chunkSize: config.chunkSize || 1024,
23 |       candidateSize: config.candidateSize || 32,
24 |       minCharactersPerChunk: config.minCharactersPerChunk || 12,
25 |     };
26 |   }
27 | 
28 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
29 |     const formData = new FormData();
30 | 
31 |     if (input.filepath) {
32 |       const fileContent = fs.readFileSync(input.filepath);
33 |       const fileName = path.basename(input.filepath) || 'file.txt';
34 |       formData.append("file", new Blob([fileContent]), fileName);
35 |     } else if (input.text) {
36 |       formData.append("text", input.text);
37 |       // Append empty file to ensure multipart form
38 |       formData.append("file", new Blob(), "text_input.txt");
39 |     } else {
40 |       throw new Error("Either text or filepath must be provided");
41 |     }
42 | 
43 |     formData.append("tokenizer_or_token_counter", this.config.tokenizerOrTokenCounter);
44 |     formData.append("chunk_size", this.config.chunkSize.toString());
45 |     formData.append("candidate_size", this.config.candidateSize.toString());
46 |     formData.append("min_characters_per_chunk", this.config.minCharactersPerChunk.toString());
47 |     formData.append("return_type", "chunks");
48 | 
49 |     const data = await this.request("/v1/chunk/slumber", {
50 |       method: "POST",
51 |       body: formData,
52 |     });
53 | 
54 |     // Convert from snake_case to camelCase
55 |     const camelCaseData = data.map((chunk: any) => {
56 |       return {
57 |         text: chunk.text,
58 |         startIndex: chunk.start_index,
59 |         endIndex: chunk.end_index,
60 |         tokenCount: chunk.token_count,
61 |         embedding: chunk.embedding || undefined,
62 |         context: chunk.context || undefined,
63 |       };
64 |     });
65 | 
66 |     return camelCaseData.map((chunk: any) => Chunk.fromDict(chunk));
67 |   }
68 | 
69 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
70 |     return Promise.all(inputs.map(input => this.chunk(input)));
71 |   }
72 | }
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | lerna-debug.log*
8 | .pnpm-debug.log*
9 | 
10 | # Diagnostic reports (https://nodejs.org/api/report.html)
11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
12 | 
13 | # Runtime data
14 | pids
15 | *.pid
16 | *.seed
17 | *.pid.lock
18 | 
19 | # Directory for instrumented libs generated by jscoverage/JSCover
20 | lib-cov
21 | 
22 | # Coverage directory used by tools like istanbul
23 | coverage
24 | *.lcov
25 | 
26 | # nyc test coverage
27 | .nyc_output
28 | 
29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
30 | .grunt
31 | 
32 | # Bower dependency directory (https://bower.io/)
33 | bower_components
34 | 
35 | # node-waf configuration
36 | .lock-wscript
37 | 
38 | # Compiled binary addons (https://nodejs.org/api/addons.html)
39 | build/Release
40 | 
41 | # Dependency directories
42 | node_modules/
43 | jspm_packages/
44 | 
45 | # Snowpack dependency directory (https://snowpack.dev/)
46 | web_modules/
47 | 
48 | # TypeScript cache
49 | *.tsbuildinfo
50 | 
51 | # Optional npm cache directory
52 | .npm
53 | 
54 | # Optional eslint cache
55 | .eslintcache
56 | 
57 | # Optional stylelint cache
58 | .stylelintcache
59 | 
60 | # Microbundle cache
61 | .rpt2_cache/
62 | .rts2_cache_cjs/
63 | .rts2_cache_es/
64 | .rts2_cache_umd/
65 | 
66 | # Optional REPL history
67 | .node_repl_history
68 | 
69 | # Output of 'npm pack'
70 | *.tgz
71 | 
72 | # Yarn Integrity file
73 | .yarn-integrity
74 | 
75 | # dotenv environment variable files
76 | .env
77 | .env.development.local
78 | .env.test.local
79 | .env.production.local
80 | .env.local
81 | 
82 | # parcel-bundler cache (https://parceljs.org/)
83 | .cache
84 | .parcel-cache
85 | 
86 | # Next.js build output
87 | .next
88 | out
89 | 
90 | # Nuxt.js build / generate output
91 | .nuxt
92 | dist
93 | 
94 | # Gatsby files
95 | .cache/
96 | # Comment in the public line in if your project uses Gatsby and not Next.js
97 | # https://nextjs.org/blog/next-9-1#public-directory-support
98 | # public
99 | 
100 | # vuepress build output
101 | .vuepress/dist
102 | 
103 | # vuepress v2.x temp and cache directory
104 | .temp
105 | .cache
106 | 
107 | # vitepress build output
108 | **/.vitepress/dist
109 | 
110 | # vitepress cache directory
111 | **/.vitepress/cache
112 | 
113 | # Docusaurus cache and generated files
114 | .docusaurus
115 | 
116 | # Serverless directories
117 | .serverless/
118 | 
119 | # FuseBox cache
120 | .fusebox/
121 | 
122 | # DynamoDB Local files
123 | .dynamodb/
124 | 
125 | # TernJS port file
126 | .tern-port
127 | 
128 | # Stores VSCode versions used for testing VSCode extensions
129 | .vscode-test
130 | 
131 | # yarn v2
132 | .yarn/cache
133 | .yarn/unplugged
134 | .yarn/build-state.yml
135 | .yarn/install-state.gz
136 | .pnp.*
137 | .vscode
138 | .chaider
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/recursive.ts:
--------------------------------------------------------------------------------
1 | /** Recursive chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { RecursiveChunk } from "../types/recursive";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface RecursiveChunkerConfig {
9 |   tokenizerOrTokenCounter?: string;
10 |   chunkSize?: number;
11 |   recipe?: string;
12 |   lang?: string;
13 |   minCharactersPerChunk?: number;
14 | }
15 | 
16 | export class RecursiveChunker extends CloudClient {
17 |   private readonly config: Required<RecursiveChunkerConfig>;
18 | 
19 |   constructor(apiKey: string, config: RecursiveChunkerConfig = {}) {
20 |     super({ apiKey });
21 |     this.config = {
22 |       tokenizerOrTokenCounter: config.tokenizerOrTokenCounter || "gpt2",
23 |       chunkSize: config.chunkSize || 512,
24 |       recipe: config.recipe || "default",
25 |       lang: config.lang || "en",
26 |       minCharactersPerChunk: config.minCharactersPerChunk || 12,
27 |     };
28 |   }
29 | 
30 |   async chunk(input: ChunkerInput): Promise<RecursiveChunk[]> {
31 |     const formData = new FormData();
32 | 
33 |     if (input.filepath) {
34 |       const fileContent = fs.readFileSync(input.filepath);
35 |       const fileName = path.basename(input.filepath) || 'file.txt';
36 |       formData.append("file", new Blob([fileContent]), fileName);
37 |     } else if (input.text) {
38 |       // JSON encode the text
39 |       formData.append("text", JSON.stringify(input.text));
40 |       // Append empty file to ensure multipart form
41 |       formData.append("file", new Blob(), "text_input.txt");
42 |     } else {
43 |       throw new Error("Either text or filepath must be provided");
44 |     }
45 |     formData.append("tokenizer_or_token_counter", this.config.tokenizerOrTokenCounter);
46 |     formData.append("chunk_size", this.config.chunkSize.toString());
47 |     formData.append("recipe", this.config.recipe);
48 |     formData.append("lang", this.config.lang);
49 |     formData.append("min_characters_per_chunk", this.config.minCharactersPerChunk.toString());
50 |     formData.append("return_type", "chunks");
51 | 
52 |     const data = await this.request("/v1/chunk/recursive", {
53 |       method: "POST",
54 |       body: formData,
55 |     });
56 | 
57 |     // Convert from snake_case to camelCase
58 |     const camelCaseData = data.map((chunk: any) => {
59 |       return {
60 |         text: chunk.text,
61 |         startIndex: chunk.start_index,
62 |         endIndex: chunk.end_index,
63 |         tokenCount: chunk.token_count,
64 |         embedding: chunk.embedding || undefined,
65 |         level: chunk.level,
66 |       };
67 |     });
68 | 
69 |     return camelCaseData.map((chunk: any) => RecursiveChunk.fromDict(chunk));
70 |   }
71 | 
72 |   async chunkBatch(inputs: ChunkerInput[]): Promise<RecursiveChunk[][]> {
73 |     return Promise.all(inputs.map(input => this.chunk(input)));
74 |   }
75 | }
--------------------------------------------------------------------------------
/packages/cloud/src/chunkers/neural.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Neural chunker that uses neural networks for intelligent chunking
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 | 
9 | export interface NeuralChunkerOptions {
10 |   /** Model to use (default: "mirth/chonky_modernbert_large_1") */
11 |   model?: string;
12 |   /** Minimum characters per chunk (default: 10) */
13 |   minCharactersPerChunk?: number;
14 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
15 |   apiKey?: string;
16 |   /** Base URL for API (default: "https://api.chonkie.ai") */
17 |   baseUrl?: string;
18 | }
19 | 
20 | interface ApiChunkResponse {
21 |   text: string;
22 |   start_index: number;
23 |   end_index: number;
24 |   token_count: number;
25 | }
26 | 
27 | interface NeuralChunkPayload extends Record<string, unknown> {
28 |   text?: string;
29 |   file?: { type: string; content: string };
30 |   embedding_model: string;
31 |   min_characters_per_chunk: number;
32 |   return_type: string;
33 | }
34 | 
35 | export class NeuralChunker extends CloudBaseChunker {
36 |   private readonly config: {
37 |     model: string;
38 |     minCharactersPerChunk: number;
39 |   };
40 | 
41 |   constructor(options: NeuralChunkerOptions = {}) {
42 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
43 |     if (!apiKey) {
44 |       throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
45 |     }
46 | 
47 |     super({ apiKey, baseUrl: options.baseUrl });
48 | 
49 |     this.config = {
50 |       model: options.model || 'mirth/chonky_modernbert_large_1',
51 |       minCharactersPerChunk: options.minCharactersPerChunk || 10,
52 |     };
53 |   }
54 | 
55 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
56 |     let fileRef = input.file;
57 | 
58 |     // If filepath is provided, upload it first to get a file reference
59 |     if (input.filepath) {
60 |       fileRef = await this.uploadFile(input.filepath);
61 |     }
62 | 
63 |     // Build the payload
64 |     const payload: NeuralChunkPayload = {
65 |       embedding_model: this.config.model,
66 |       min_characters_per_chunk: this.config.minCharactersPerChunk,
67 |       return_type: 'chunks',
68 |     };
69 | 
70 |     // Add either text or file to the payload
71 |     if (fileRef) {
72 |       payload.file = fileRef;
73 |     } else if (input.text) {
74 |       payload.text = input.text;
75 |     } else {
76 |       throw new Error('Either text, filepath, or file must be provided');
77 |     }
78 | 
79 |     const data = await this.request<ApiChunkResponse[]>('/v1/chunk/neural', {
80 |       method: 'POST',
81 |       body: payload,
82 |     });
83 | 
84 |     return data.map(chunk => new Chunk({
85 |       text: chunk.text,
86 |       startIndex: chunk.start_index,
87 |       endIndex: chunk.end_index,
88 |       tokenCount: chunk.token_count,
89 |     }));
90 |   }
91 | 
92 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
93 |     return Promise.all(inputs.map(input => this.chunk(input)));
94 |   }
95 | 
96 |   toString(): string {
97 |     return `NeuralChunker(model=${this.config.model})`;
98 |   }
99 | }
100 | 
--------------------------------------------------------------------------------
/legacy/chonkie/types/semantic.ts:
--------------------------------------------------------------------------------
1 | import { Sentence, SentenceData } from './sentence';
2 | import { SentenceChunk } from './sentence';
3 | 
4 | /**
5 |  * Represents a semantic sentence with metadata, including an embedding.
6 |  * Extends the base Sentence class.
7 |  */
8 | export interface SemanticSentenceData extends SentenceData {
9 |   embedding?: number[] | null;
10 | }
11 | 
12 | export class SemanticSentence extends Sentence {
13 |   /** The embedding vector for the sentence (array of numbers, or null if not present) */
14 |   public embedding: number[] | null;
15 | 
16 |   constructor(data: SemanticSentenceData) {
17 |     super(data);
18 |     this.embedding = data.embedding ?? null;
19 |   }
20 | 
21 |   /** Return the SemanticSentence as a dictionary-like object */
22 |   public toDict(): SemanticSentenceData {
23 |     return { ...super.toDict(), embedding: this.embedding ?? null };
24 |   }
25 | 
26 |   /** Create a SemanticSentence object from a dictionary */
27 |   public static fromDict(data: SemanticSentenceData): SemanticSentence {
28 |     // Defensive copy to avoid mutating input
29 |     const { embedding, ...rest } = data;
30 |     return new SemanticSentence({ ...rest, embedding: embedding ?? null });
31 |   }
32 | 
33 |   /** Return a string representation of the SemanticSentence */
34 |   public toString(): string {
35 |     return `SemanticSentence(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, embedding=${JSON.stringify(this.embedding)})`;
36 |   }
37 | }
38 | 
39 | /**
40 |  * Represents a semantic chunk with metadata, including a list of semantic sentences.
41 |  * Extends the base SentenceChunk class.
42 |  */
43 | export interface SemanticChunkData {
44 |   text: string;
45 |   startIndex: number;
46 |   endIndex: number;
47 |   tokenCount: number;
48 |   sentences: SemanticSentenceData[];
49 |   embedding?: number[];
50 | }
51 | 
52 | export class SemanticChunk extends SentenceChunk {
53 |   /** List of SemanticSentence objects in the chunk */
54 |   public sentences: SemanticSentence[];
55 | 
56 |   constructor(data: SemanticChunkData & { sentences: SemanticSentence[] }) {
57 |     super({
58 |       text: data.text,
59 |       startIndex: data.startIndex,
60 |       endIndex: data.endIndex,
61 |       tokenCount: data.tokenCount,
62 |       sentences: data.sentences,
63 |       embedding: data.embedding,
64 |     });
65 |     this.sentences = data.sentences;
66 |   }
67 | 
68 |   /** Return the SemanticChunk as a dictionary-like object */
69 |   public toDict(): SemanticChunkData {
70 |     const base = super.toDict() as SemanticChunkData;
71 |     return {
72 |       ...base,
73 |       sentences: this.sentences.map((s) => s.toDict()),
74 |     };
75 |   }
76 | 
77 |   /** Create a SemanticChunk object from a dictionary */
78 |   public static fromDict(data: SemanticChunkData): SemanticChunk {
79 |     const { sentences, ...rest } = data;
80 |     const semanticSentences = sentences.map((s) => SemanticSentence.fromDict(s));
81 |     return new SemanticChunk({ ...rest, sentences: semanticSentences });
82 |   }
83 | 
84 |   /** Return a string representation of the SemanticChunk */
85 |   public toString(): string {
86 |     return `SemanticChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, sentences=${JSON.stringify(this.sentences)})`;
87 |   }
88 | }
89 | 
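// Usage sketch (editorial addition, not part of the file above): toDict/fromDict
// round-trip a chunk through plain, JSON-serializable objects:
//
//   const dict = chunk.toDict();                        // SemanticChunkData
//   const restored = SemanticChunk.fromDict(dict);      // rebuilds sentence objects
//   restored.sentences[0] instanceof SemanticSentence;  // true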
--------------------------------------------------------------------------------
/packages/cloud/src/chunkers/code.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Code chunker that splits code into structurally meaningful chunks
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 | 
9 | export interface CodeChunkerOptions {
10 |   /** Tokenizer to use (default: "gpt2") */
11 |   tokenizer?: string;
12 |   /** Maximum tokens per chunk (default: 1500) */
13 |   chunkSize?: number;
14 |   /** Programming language (required, e.g., "python", "javascript", "typescript") */
15 |   language: string;
16 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
17 |   apiKey?: string;
18 |   /** Base URL for API (default: "https://api.chonkie.ai") */
19 |   baseUrl?: string;
20 | }
21 | 
22 | interface ApiChunkResponse {
23 |   text: string;
24 |   start_index: number;
25 |   end_index: number;
26 |   token_count: number;
27 | }
28 | 
29 | interface CodeChunkPayload extends Record<string, unknown> {
30 |   text?: string;
31 |   file?: { type: string; content: string };
32 |   tokenizer_or_token_counter: string;
33 |   chunk_size: number;
34 |   language: string;
35 | }
36 | 
37 | export class CodeChunker extends CloudBaseChunker {
38 |   private readonly config: {
39 |     tokenizer: string;
40 |     chunkSize: number;
41 |     language: string;
42 |   };
43 | 
44 |   constructor(options: CodeChunkerOptions) {
45 |     if (!options.language) {
46 |       throw new Error('Language is required for code chunking');
47 |     }
48 | 
49 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
50 |     if (!apiKey) {
51 |       throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
52 |     }
53 | 
54 |     super({ apiKey, baseUrl: options.baseUrl });
55 | 
56 |     this.config = {
57 |       tokenizer: options.tokenizer || 'gpt2',
58 |       chunkSize: options.chunkSize || 1500,
59 |       language: options.language,
60 |     };
61 |   }
62 | 
63 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
64 |     let fileRef = input.file;
65 | 
66 |     // If filepath is provided, upload it first to get a file reference
67 |     if (input.filepath) {
68 |       fileRef = await this.uploadFile(input.filepath);
69 |     }
70 | 
71 |     // Build the payload
72 |     const payload: CodeChunkPayload = {
73 |       tokenizer_or_token_counter: this.config.tokenizer,
74 |       chunk_size: this.config.chunkSize,
75 |       language: this.config.language,
76 |     };
77 | 
78 |     // Add either text or file to the payload
79 |     if (fileRef) {
80 |       payload.file = fileRef;
81 |     } else if (input.text) {
82 |       payload.text = input.text;
83 |     } else {
84 |       throw new Error('Either text, filepath, or file must be provided');
85 |     }
86 | 
87 |     const data = await this.request<ApiChunkResponse[]>('/v1/chunk/code', {
88 |       method: 'POST',
89 |       body: payload,
90 |     });
91 | 
92 |     return data.map(chunk => new Chunk({
93 |       text: chunk.text,
94 |       startIndex: chunk.start_index,
95 |       endIndex: chunk.end_index,
96 |       tokenCount: chunk.token_count,
97 |     }));
98 |   }
99 | 
100 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
101 |     return Promise.all(inputs.map(input => this.chunk(input)));
102 |   }
103 | 
104 |   toString(): string {
105 |     return `CodeChunker(language=${this.config.language}, chunkSize=${this.config.chunkSize})`;
106 |   }
107 | }
108 | 
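// Usage sketch (editorial addition, not part of the file above; assumes
// CHONKIE_API_KEY is set):
//
//   const chunker = new CodeChunker({ language: 'typescript', chunkSize: 1500 });
//   const chunks = await chunker.chunk({ text: sourceCode }); // sourceCode: string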
--------------------------------------------------------------------------------
/packages/cloud/src/chunkers/token.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Token chunker that splits text into fixed-size token chunks
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 | 
9 | export interface TokenChunkerOptions {
10 |   /** Tokenizer to use (default: "gpt2") */
11 |   tokenizer?: string;
12 |   /** Maximum tokens per chunk (default: 512) */
13 |   chunkSize?: number;
14 |   /** Number of tokens to overlap between chunks (default: 0) */
15 |   chunkOverlap?: number;
16 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
17 |   apiKey?: string;
18 |   /** Base URL for API (default: "https://api.chonkie.ai") */
19 |   baseUrl?: string;
20 | }
21 | 
22 | interface ApiChunkResponse {
23 |   text: string;
24 |   start_index: number;
25 |   end_index: number;
26 |   token_count: number;
27 | }
28 | 
29 | interface TokenChunkPayload extends Record<string, unknown> {
30 |   text?: string;
31 |   file?: { type: string; content: string };
32 |   tokenizer_or_token_counter: string;
33 |   chunk_size: number;
34 |   chunk_overlap: number;
35 |   return_type: string;
36 | }
37 | 
38 | export class TokenChunker extends CloudBaseChunker {
39 |   private readonly config: {
40 |     tokenizer: string;
41 |     chunkSize: number;
42 |     chunkOverlap: number;
43 |   };
44 | 
45 |   constructor(options: TokenChunkerOptions = {}) {
46 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
47 |     if (!apiKey) {
48 |       throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
49 |     }
50 | 
51 |     super({ apiKey, baseUrl: options.baseUrl });
52 | 
53 |     this.config = {
54 |       tokenizer: options.tokenizer || 'gpt2',
55 |       chunkSize: options.chunkSize || 512,
56 |       chunkOverlap: options.chunkOverlap || 0,
57 |     };
58 |   }
59 | 
60 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
61 |     let fileRef = input.file;
62 | 
63 |     // If filepath is provided, upload it first to get a file reference
64 |     if (input.filepath) {
65 |       fileRef = await this.uploadFile(input.filepath);
66 |     }
67 | 
68 |     // Build the payload
69 |     const payload: TokenChunkPayload = {
70 |       tokenizer_or_token_counter: this.config.tokenizer,
71 |       chunk_size: this.config.chunkSize,
72 |       chunk_overlap: this.config.chunkOverlap,
73 |       return_type: 'chunks',
74 |     };
75 | 
76 |     // Add either text or file to the payload
77 |     if (fileRef) {
78 |       payload.file = fileRef;
79 |     } else if (input.text) {
80 |       payload.text = input.text;
81 |     } else {
82 |       throw new Error('Either text, filepath, or file must be provided');
83 |     }
84 | 
85 |     const data = await this.request<ApiChunkResponse[]>('/v1/chunk/token', {
86 |       method: 'POST',
87 |       body: payload,
88 |     });
89 | 
90 |     // Convert API response to Chunk objects
91 |     return data.map(chunk => new Chunk({
92 |       text: chunk.text,
93 |       startIndex: chunk.start_index,
94 |       endIndex: chunk.end_index,
95 |       tokenCount: chunk.token_count,
96 |     }));
97 |   }
98 | 
99 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
100 |     return Promise.all(inputs.map(input => this.chunk(input)));
101 |   }
102 | 
103 |   toString(): string {
104 |     return `TokenChunker(tokenizer=${this.config.tokenizer}, chunkSize=${this.config.chunkSize}, overlap=${this.config.chunkOverlap})`;
105 |   }
106 | }
107 | 
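// Usage sketch (editorial addition, not part of the file above; assumes
// CHONKIE_API_KEY is set):
//
//   const chunker = new TokenChunker({ chunkSize: 512, chunkOverlap: 64 });
//   const chunks = await chunker.chunk({ text: longDocument }); // longDocument: string
//   const batches = await chunker.chunkBatch([{ text: 'a' }, { text: 'b' }]);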
--------------------------------------------------------------------------------
/packages/cloud/src/refineries/overlap.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Overlap refinery that adds context overlap to existing chunks
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker } from '@/base';
8 | 
9 | export interface OverlapRefineryOptions {
10 |   /** Tokenizer to use (default: "character") */
11 |   tokenizer?: string;
12 |   /** Context size as fraction or token count (default: 0.25) */
13 |   contextSize?: number;
14 |   /** Mode for overlap (default: "token") */
15 |   mode?: 'token' | 'recursive';
16 |   /** Method for adding context (default: "suffix") */
17 |   method?: 'suffix' | 'prefix';
18 |   /** Recipe name for recursive mode (default: "default") */
19 |   recipe?: string;
20 |   /** Language for recipe (default: "en") */
21 |   lang?: string;
22 |   /** Merge overlapping chunks (default: true) */
23 |   merge?: boolean;
24 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
25 |   apiKey?: string;
26 |   /** Base URL for API (default: "https://api.chonkie.ai") */
27 |   baseUrl?: string;
28 | }
29 | 
30 | interface ChunkData {
31 |   text: string;
32 |   start_index: number;
33 |   end_index: number;
34 |   token_count: number;
35 | }
36 | 
37 | /**
38 |  * Post-processes chunks by adding contextual overlap.
39 |  */
40 | export class OverlapRefinery extends CloudBaseChunker {
41 |   private readonly config: {
42 |     tokenizer: string;
43 |     contextSize: number;
44 |     mode: 'token' | 'recursive';
45 |     method: 'suffix' | 'prefix';
46 |     recipe: string;
47 |     lang: string;
48 |     merge: boolean;
49 |   };
50 | 
51 |   constructor(options: OverlapRefineryOptions = {}) {
52 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
53 |     if (!apiKey) {
54 |       throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
55 |     }
56 | 
57 |     super({ apiKey, baseUrl: options.baseUrl });
58 | 
59 |     this.config = {
60 |       tokenizer: options.tokenizer || 'character',
61 |       contextSize: options.contextSize ?? 0.25,
62 |       mode: options.mode || 'token',
63 |       method: options.method || 'suffix',
64 |       recipe: options.recipe || 'default',
65 |       lang: options.lang || 'en',
66 |       merge: options.merge ?? true,
67 |     };
68 |   }
69 | 
70 |   /**
71 |    * Add overlap context to existing chunks.
72 |    *
73 |    * @param chunks - Array of chunks to add overlap to
74 |    * @returns Array of chunks with overlap added
75 |    */
76 |   async refine(chunks: Chunk[]): Promise<Chunk[]> {
77 |     const chunkData = chunks.map(chunk => ({
78 |       text: chunk.text,
79 |       start_index: chunk.startIndex,
80 |       end_index: chunk.endIndex,
81 |       token_count: chunk.tokenCount,
82 |     }));
83 | 
84 |     const response = await this.request<ChunkData[]>('/v1/refine/overlap', {
85 |       method: 'POST',
86 |       body: {
87 |         chunks: chunkData,
88 |         tokenizer_or_token_counter: this.config.tokenizer,
89 |         context_size: this.config.contextSize,
90 |         mode: this.config.mode,
91 |         method: this.config.method,
92 |         recipe: this.config.recipe,
93 |         lang: this.config.lang,
94 |         merge: this.config.merge,
95 |       },
96 |     });
97 | 
98 |     return response.map(chunk => new Chunk({
99 |       text: chunk.text,
100 |       startIndex: chunk.start_index,
101 |       endIndex: chunk.end_index,
102 |       tokenCount: chunk.token_count,
103 |     }));
104 |   }
105 | 
106 |   toString(): string {
107 |     return `OverlapRefinery(mode=${this.config.mode}, contextSize=${this.config.contextSize})`;
108 |   }
109 | }
110 | 
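// Usage sketch (editorial addition, not part of the file above): refineries
// compose with any chunker's output, e.g. after a TokenChunker:
//
//   const refinery = new OverlapRefinery({ contextSize: 0.25, method: 'suffix' });
//   const withOverlap = await refinery.refine(await chunker.chunk({ text }));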
--------------------------------------------------------------------------------
/packages/core/src/tokenizer.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Simple character-based tokenizer for text chunking.
3 |  *
4 |  * This tokenizer treats each character as a single token, providing
5 |  * a straightforward and predictable tokenization strategy.
6 |  *
7 |  * For advanced tokenization (GPT-2, BERT, etc.), use the static `create()` method
8 |  * with the @chonkiejs/token package installed.
9 |  */
10 | export class Tokenizer {
11 |   /**
12 |    * Create a tokenizer instance.
13 |    *
14 |    * @param model - Tokenizer model to use. Use 'character' (default) for character-based,
15 |    *                or specify a HuggingFace model like 'gpt2', 'bert-base-uncased', etc.
16 |    * @returns Promise resolving to a Tokenizer instance
17 |    *
18 |    * @example
19 |    * // Character-based (no dependencies)
20 |    * const tokenizer = await Tokenizer.create();
21 |    * const tokenizer = await Tokenizer.create('character');
22 |    *
23 |    * @example
24 |    * // HuggingFace models (requires @chonkiejs/token)
25 |    * const tokenizer = await Tokenizer.create('gpt2');
26 |    * const tokenizer = await Tokenizer.create('Xenova/gpt-4');
27 |    */
28 |   static async create(model: string = 'character'): Promise<Tokenizer> {
29 |     if (model === 'character') {
30 |       return new Tokenizer();
31 |     }
32 | 
33 |     // Try to dynamically import @chonkiejs/token
34 |     try {
35 |       // Use dynamic import with string to avoid TypeScript resolution
36 |       const tokenPackage = await import('@chonkiejs/token' as any);
37 |       const { HuggingFaceTokenizer } = tokenPackage;
38 |       return await HuggingFaceTokenizer.create(model);
39 |     } catch (error) {
40 |       const errorMessage = error instanceof Error ? error.message : '';
41 | 
42 |       // Check if it's a module not found error
43 |       if (errorMessage.includes('Cannot find') || errorMessage.includes('MODULE_NOT_FOUND')) {
44 |         throw new Error(`
45 | To use "${model}" tokenizer, install @chonkiejs/token:
46 | 
47 |   npm install @chonkiejs/token
48 | 
49 | Or use character-based tokenization (no dependencies):
50 | 
51 |   const tokenizer = await Tokenizer.create();
52 |   const tokenizer = await Tokenizer.create('character');
53 | 
54 | Available with @chonkiejs/token: gpt2, bert-base-uncased, Xenova/gpt-4, etc.
55 |         `.trim());
56 |       }
57 | 
58 |       // Re-throw other errors
59 |       throw error;
60 |     }
61 |   }
62 | 
63 |   /**
64 |    * Count the number of tokens in the given text.
65 |    * For character-based tokenization, this is simply the length of the text.
66 |    *
67 |    * @param text - The text to count tokens for
68 |    * @returns The number of tokens (characters) in the text
69 |    */
70 |   countTokens(text: string): number {
71 |     return text.length;
72 |   }
73 | 
74 |   /**
75 |    * Encode text into token IDs.
76 |    * For character-based tokenization, returns character codes.
77 |    *
78 |    * @param text - The text to encode
79 |    * @returns Array of character codes
80 |    */
81 |   encode(text: string): number[] {
82 |     return Array.from(text).map(char => char.charCodeAt(0));
83 |   }
84 | 
85 |   /**
86 |    * Decode token IDs back into text.
87 |    * For character-based tokenization, converts character codes back to string.
88 |    *
89 |    * @param tokens - Array of token IDs (character codes)
90 |    * @returns The decoded text
91 |    */
92 |   decode(tokens: number[]): string {
93 |     return String.fromCharCode(...tokens);
94 |   }
95 | 
96 |   /**
97 |    * Decode a batch of token arrays.
98 |    *
99 |    * @param tokensBatch - Array of token arrays
100 |    * @returns Array of decoded texts
101 |    */
102 |   decodeBatch(tokensBatch: number[][]): string[] {
103 |     return tokensBatch.map(tokens => this.decode(tokens));
104 |   }
105 | }
106 | 
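// Usage sketch (editorial addition, not part of the file above): with the default
// character tokenizer, token counts are simply string lengths:
//
//   const tokenizer = await Tokenizer.create();
//   tokenizer.countTokens('hello');           // 5
//   tokenizer.decode(tokenizer.encode('hi')); // 'hi'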
 */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { SentenceChunk } from "../types/sentence";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface SentenceChunkerConfig {
9 |   tokenizerOrTokenCounter?: string;
10 |   chunkSize?: number;
11 |   chunkOverlap?: number;
12 |   minSentencesPerChunk?: number;
13 |   minCharactersPerSentence?: number;
14 |   approximate?: boolean;
15 |   delim?: string | string[];
16 |   includeDelim?: "prev" | "next" | null;
17 | }
18 | 
19 | export class SentenceChunker extends CloudClient {
20 |   private readonly config: Required<SentenceChunkerConfig>;
21 | 
22 |   constructor(apiKey: string, config: SentenceChunkerConfig = {}) {
23 |     super({ apiKey });
24 |     this.config = {
25 |       tokenizerOrTokenCounter: config.tokenizerOrTokenCounter || "gpt2",
26 |       chunkSize: config.chunkSize || 512,
27 |       chunkOverlap: config.chunkOverlap || 0,
28 |       minSentencesPerChunk: config.minSentencesPerChunk || 1,
29 |       minCharactersPerSentence: config.minCharactersPerSentence || 12,
30 |       approximate: config.approximate ?? false,
31 |       delim: config.delim || [".", "!", "?", "\n"],
32 |       includeDelim: config.includeDelim ?? "prev",
33 |     };
34 |   }
35 | 
36 |   async chunk(input: ChunkerInput): Promise<SentenceChunk[]> {
37 |     const formData = new FormData();
38 | 
39 |     if (input.filepath) {
40 |       const fileContent = fs.readFileSync(input.filepath);
41 |       const fileName = path.basename(input.filepath) || 'file.txt';
42 |       formData.append("file", new Blob([fileContent]), fileName);
43 |     } else if (input.text) {
44 |       // JSON encode the text
45 |       formData.append("text", JSON.stringify(input.text));
46 |       // Append empty file to ensure multipart form
47 |       formData.append("file", new Blob(), "text_input.txt");
48 |     } else {
49 |       throw new Error("Either text or filepath must be provided");
50 |     }
51 |     // Append all config options to the form data
52 |     formData.append("tokenizer_or_token_counter", this.config.tokenizerOrTokenCounter);
53 |     formData.append("chunk_size", this.config.chunkSize.toString());
54 |     formData.append("chunk_overlap", this.config.chunkOverlap.toString());
55 |     formData.append("min_sentences_per_chunk", this.config.minSentencesPerChunk.toString());
56 |     formData.append("min_characters_per_sentence", this.config.minCharactersPerSentence.toString());
57 |     formData.append("approximate", this.config.approximate.toString());
58 |     formData.append("delim", JSON.stringify(this.config.delim));
59 |     formData.append("include_delim", this.config.includeDelim || "prev");
60 |     formData.append("return_type", "chunks");
61 | 
62 |     const data = await this.request("/v1/chunk/sentence", {
63 |       method: "POST",
64 |       body: formData,
65 |     });
66 | 
67 |     // Convert from snake_case to camelCase
68 |     const camelCaseData = data.map((chunk: any) => {
69 |       return {
70 |         text: chunk.text,
71 |         startIndex: chunk.start_index,
72 |         endIndex: chunk.end_index,
73 |         tokenCount: chunk.token_count,
74 |         sentences: chunk.sentences.map((sentence: any) => {
75 |           return {
76 |             text: sentence.text,
77 |             startIndex: sentence.start_index,
78 |             endIndex: sentence.end_index,
79 |             tokenCount: sentence.token_count,
80 |             embedding: sentence.embedding || undefined,
81 |           };
82 |         }),
83 |       };
84 |     });
85 | 
86 |     return camelCaseData.map((chunk: any) => SentenceChunk.fromDict(chunk));
87 |   }
88 | 
89 |   async chunkBatch(inputs: ChunkerInput[]): Promise<SentenceChunk[][]> {
90 |     return Promise.all(inputs.map(input => this.chunk(input)));
91 |   }
92 | }
--------------------------------------------------------------------------------
/packages/core/README.md:
--------------------------------------------------------------------------------
1 | <div align="center">
2 | 3 | ![Chonkie Logo](../../assets/chonkie_logo_br_transparent_bg.png) 4 | 5 | # @chonkiejs/core 6 | 7 | _Core chunking library for Chonkie - lightweight and efficient text chunking with zero dependencies._ 8 | 9 | [![npm version](https://img.shields.io/npm/v/@chonkiejs/core)](https://www.npmjs.com/package/@chonkiejs/core) 10 | [![npm license](https://img.shields.io/npm/l/@chonkiejs/core)](https://www.npmjs.com/package/@chonkiejs/core) 11 | [![Documentation](https://img.shields.io/badge/docs-DOCS.md-blue.svg)](./DOCS.md) 12 | [![GitHub](https://img.shields.io/badge/github-chonkie--ts-black.svg?logo=github)](https://github.com/chonkie-inc/chonkie-ts) 13 | 14 |
15 | 16 | ## Features 17 | ✨ **Simple & Clean API** - Easy to use OOP design
18 | ⚡ **Zero Dependencies** - Minimal, lightweight, fast
19 | 🔤 **Character-based** - Simple tokenization (1 char = 1 token)
20 | 🎯 **Recursive Chunking** - Smart hierarchical text splitting
21 | 📦 **TypeScript First** - Full type safety with TypeScript
22 | 23 | ## Installation 24 | 25 | Install with `npm`: 26 | ```bash 27 | npm i @chonkiejs/core 28 | ``` 29 | 30 | Install with `pnpm`: 31 | ```bash 32 | pnpm add @chonkiejs/core 33 | ``` 34 | 35 | Install with `yarn`: 36 | ```bash 37 | yarn add @chonkiejs/core 38 | ``` 39 | 40 | Install with `bun`: 41 | ```bash 42 | bun add @chonkiejs/core 43 | ``` 44 | 45 | ## Quick Start 46 | 47 | ```typescript 48 | import { RecursiveChunker } from '@chonkiejs/core'; 49 | 50 | // Create a chunker 51 | const chunker = await RecursiveChunker.create({ 52 | chunkSize: 512, 53 | minCharactersPerChunk: 24 54 | }); 55 | 56 | // Chunk your text 57 | const chunks = await chunker.chunk('Your text here...'); 58 | 59 | // Use the chunks 60 | for (const chunk of chunks) { 61 | console.log(chunk.text); 62 | console.log(`Tokens: ${chunk.tokenCount}`); 63 | } 64 | ``` 65 | 66 | ## Available Chunkers 67 | 68 | | Name | Description | 69 | |------|-------------| 70 | | `RecursiveChunker` | Recursively splits text using hierarchical rules (paragraphs → sentences → punctuation → words → characters). Each level only activates if chunks exceed the configured size. | 71 | | `TokenChunker` | Splits text into fixed-size token chunks with optional overlap. Uses character-based tokenization by default, or HuggingFace models with @chonkiejs/token. | 72 | 73 | For detailed API documentation, configuration options, and advanced usage, see [DOCS.md](./DOCS.md). 74 | 75 | ## Contributing 76 | 77 | Want to help grow Chonkie? Check out [CONTRIBUTING.md](../../CONTRIBUTING.md) to get started! Whether you're fixing bugs, adding features, improving docs, or simply leaving a ⭐️ on the repo, every contribution helps make Chonkie a better CHONK for everyone. 78 | 79 | Remember: No contribution is too small for this tiny hippo! 80 | 81 | ## Acknowledgements 82 | 83 | Chonkie would like to CHONK its way through a special thanks to all the users and contributors who have helped make this library what it is today! Your feedback, issue reports, and improvements have helped make Chonkie the CHONKIEST it can be. 84 | 85 | And of course, special thanks to [Moto Moto](https://www.youtube.com/watch?v=I0zZC4wtqDQ&t=5s) for endorsing Chonkie with his famous quote: 86 | > "I like them big, I like them chonkie in TypeScript" ~ Moto Moto... definitely did not say this 87 | 88 | ## Citation 89 | 90 | If you use Chonkie in your research, please cite it as follows: 91 | 92 | ```bibtex 93 | @software{chonkie2025, 94 | author = {Bhavnick Minhas and Shreyash Nigam}, 95 | title = {Chonkie: A no-nonsense fast, lightweight, and efficient text chunking library}, 96 | year = {2025}, 97 | publisher = {GitHub}, 98 | howpublished = {\url{https://github.com/chonkie-inc}}, 99 | } 100 | ``` 101 | -------------------------------------------------------------------------------- /packages/core/examples/recursive.example.ts: -------------------------------------------------------------------------------- 1 | import { RecursiveChunker, RecursiveRules } from '../src'; 2 | 3 | async function main() { 4 | console.log('🦛 Chonkie RecursiveChunker Example\n'); 5 | console.log('='.repeat(60)); 6 | 7 | // Example 1: Basic usage with default settings 8 | console.log('\n📝 Example 1: Basic Chunking\n'); 9 | 10 | const chunker = await RecursiveChunker.create({ 11 | chunkSize: 100, 12 | minCharactersPerChunk: 20 13 | }); 14 | 15 | const text = ` 16 | Chonkie is a powerful text chunking library. It helps you break down large documents into manageable pieces. 
17 | 
18 | The library uses a recursive approach. It starts by splitting on paragraphs, then sentences, then punctuation, and finally words.
19 | 
20 | This hierarchical method ensures that chunks are semantically meaningful. Each chunk respects the configured size limits while maintaining context.
21 |   `.trim();
22 | 
23 |   const chunks = await chunker.chunk(text);
24 | 
25 |   console.log(`Input text length: ${text.length} characters`);
26 |   console.log(`Number of chunks created: ${chunks.length}\n`);
27 | 
28 |   chunks.forEach((chunk, index) => {
29 |     console.log(`Chunk ${index + 1}:`);
30 |     console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
31 |     console.log(`  Token count: ${chunk.tokenCount}`);
32 |     console.log(`  Text: "${chunk.text.substring(0, 60)}${chunk.text.length > 60 ? '...' : ''}"`);
33 |     console.log();
34 |   });
35 | 
36 |   // Example 2: Custom rules
37 |   console.log('='.repeat(60));
38 |   console.log('\n📝 Example 2: Custom Rules (Paragraphs Only)\n');
39 | 
40 |   const customChunker = await RecursiveChunker.create({
41 |     chunkSize: 150,
42 |     rules: new RecursiveRules({
43 |       levels: [
44 |         { delimiters: ['\n\n'] },  // Only split on paragraphs
45 |         { whitespace: true },      // Then words
46 |         {}                         // Then characters
47 |       ]
48 |     })
49 |   });
50 | 
51 |   const paragraphText = `First paragraph with some content.
52 | 
53 | Second paragraph with more information that needs to be chunked properly.
54 | 
55 | Third paragraph concludes the example.`;
56 | 
57 |   const customChunks = await customChunker.chunk(paragraphText);
58 | 
59 |   console.log(`Input text length: ${paragraphText.length} characters`);
60 |   console.log(`Number of chunks: ${customChunks.length}\n`);
61 | 
62 |   customChunks.forEach((chunk, index) => {
63 |     console.log(`Chunk ${index + 1}: ${chunk.tokenCount} tokens`);
64 |     console.log(`"${chunk.text}"`);
65 |     console.log();
66 |   });
67 | 
68 |   // Example 3: Very long text
69 |   console.log('='.repeat(60));
70 |   console.log('\n📝 Example 3: Long Text Handling\n');
71 | 
72 |   const longText = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. '.repeat(20);
73 |   const longChunker = await RecursiveChunker.create({ chunkSize: 100 });
74 |   const longChunks = await longChunker.chunk(longText);
75 | 
76 |   console.log(`Input text length: ${longText.length} characters`);
77 |   console.log(`Number of chunks: ${longChunks.length}`);
78 |   console.log(`Average chunk size: ${Math.round(longText.length / longChunks.length)} characters`);
79 |   console.log(`Max chunk tokens: ${Math.max(...longChunks.map(c => c.tokenCount))}`);
80 |   console.log(`Min chunk tokens: ${Math.min(...longChunks.map(c => c.tokenCount))}`);
81 | 
82 |   // Example 4: Verification
83 |   console.log('\n' + '='.repeat(60));
84 |   console.log('\n✅ Verification: Text Reconstruction\n');
85 | 
86 |   const reconstructed = chunks.map(c => c.text).join('');
87 |   const matches = reconstructed === text;
88 | 
89 |   console.log(`Original length: ${text.length}`);
90 |   console.log(`Reconstructed length: ${reconstructed.length}`);
91 |   console.log(`Reconstruction matches: ${matches ? '✅ Yes' : '❌ No'}`);
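  // Sanity check (a sketch assuming the default character tokenizer,
  // where one token equals one character):
  console.assert(
    chunks.every(c => c.tokenCount === c.text.length),
    'tokenCount should equal text.length for character tokenization'
  );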
92 | 
93 |   console.log('\n' + '='.repeat(60));
94 |   console.log('\n🎉 Example completed!\n');
95 | }
96 | 
97 | main().catch(console.error);
98 | 
--------------------------------------------------------------------------------
/packages/cloud/src/chunkers/late.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Late chunker that uses recursive chunking with embeddings
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 | 
9 | export interface LateChunkerOptions {
10 |   /** Embedding model to use (default: "all-MiniLM-L6-v2") */
11 |   embeddingModel?: string;
12 |   /** Maximum tokens per chunk (default: 512) */
13 |   chunkSize?: number;
14 |   /** Recipe name (default: "default") */
15 |   recipe?: string;
16 |   /** Language for recipe (default: "en") */
17 |   lang?: string;
18 |   /** Minimum characters per chunk (default: 24) */
19 |   minCharactersPerChunk?: number;
20 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
21 |   apiKey?: string;
22 |   /** Base URL for API (default: "https://api.chonkie.ai") */
23 |   baseUrl?: string;
24 | }
25 | 
26 | interface ApiChunkResponse {
27 |   text: string;
28 |   start_index: number;
29 |   end_index: number;
30 |   token_count: number;
31 |   embedding?: number[];
32 | }
33 | 
34 | interface LateChunkPayload extends Record<string, unknown> {
35 |   text?: string;
36 |   file?: { type: string; content: string };
37 |   embedding_model: string;
38 |   chunk_size: number;
39 |   recipe: string;
40 |   lang: string;
41 |   min_characters_per_chunk: number;
42 | }
43 | 
44 | export class LateChunker extends CloudBaseChunker {
45 |   private readonly config: {
46 |     embeddingModel: string;
47 |     chunkSize: number;
48 |     recipe: string;
49 |     lang: string;
50 |     minCharactersPerChunk: number;
51 |   };
52 | 
53 |   constructor(options: LateChunkerOptions = {}) {
54 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
55 |     if (!apiKey) {
56 |       throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
57 |     }
58 | 
59 |     super({ apiKey, baseUrl: options.baseUrl });
60 | 
61 |     this.config = {
62 |       embeddingModel: options.embeddingModel || 'all-MiniLM-L6-v2',
63 |       chunkSize: options.chunkSize || 512,
64 |       recipe: options.recipe || 'default',
65 |       lang: options.lang || 'en',
66 |       minCharactersPerChunk: options.minCharactersPerChunk || 24,
67 |     };
68 |   }
69 | 
70 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
71 |     let fileRef = input.file;
72 | 
73 |     // If filepath is provided, upload it first to get a file reference
74 |     if (input.filepath) {
75 |       fileRef = await this.uploadFile(input.filepath);
76 |     }
77 | 
78 |     // Build the payload
79 |     const payload: LateChunkPayload = {
80 |       embedding_model: this.config.embeddingModel,
81 |       chunk_size: this.config.chunkSize,
82 |       recipe: this.config.recipe,
83 |       lang: this.config.lang,
84 |       min_characters_per_chunk: this.config.minCharactersPerChunk,
85 |     };
86 | 
87 |     // Add either text or file to the payload
88 |     if (fileRef) {
89 |       payload.file = fileRef;
90 |     } else if (input.text) {
91 |       payload.text = input.text;
92 |     } else {
93 |       throw new Error('Either text, filepath, or file must be provided');
94 |     }
95 | 
96 |     const data = await this.request<ApiChunkResponse[]>('/v1/chunk/late', {
97 |       method: 'POST',
98 |       body: payload,
99 |     });
100 | 
101 |     return data.map(chunk => new Chunk({
102 |       text: chunk.text,
103 |       startIndex: chunk.start_index,
104 |       endIndex: chunk.end_index,
105 |       tokenCount: chunk.token_count,
106 |     }));
107 |   }
108 | 
109 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
110 |     return Promise.all(inputs.map(input => this.chunk(input)));
111 |   }
112 | 
113 |   toString(): string {
114 |     return `LateChunker(embeddingModel=${this.config.embeddingModel}, chunkSize=${this.config.chunkSize}, recipe=${this.config.recipe})`;
115 |   }
116 | }
117 | 
--------------------------------------------------------------------------------
/packages/cloud/src/utils.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Utility functions for cloud package
3 |  */
4 | 
5 | /**
6 |  * Format API error messages with helpful context and instructions
7 |  */
8 | export function formatApiError(
9 |   statusCode: number,
10 |   errorMessage: string,
11 |   endpoint: string
12 | ): string {
13 |   const baseMessage = `API Error (${statusCode}): ${errorMessage}`;
14 | 
15 |   let helpText = '';
16 | 
17 |   // Provide specific help based on error type
18 |   if (statusCode === 401 || errorMessage.toLowerCase().includes('invalid api key')) {
19 |     helpText = `
20 | Please check your API key:
21 |   - Ensure CHONKIE_API_KEY environment variable is set correctly
22 |   - Or pass apiKey in the constructor options
23 |   - Verify your key at https://api.chonkie.ai/dashboard`;
24 |   } else if (statusCode === 403) {
25 |     helpText = `
26 | You don't have permission to access this resource.
27 |   - Check your API key permissions
28 |   - Contact support if you believe this is an error`;
29 |   } else if (statusCode === 429) {
30 |     helpText = `
31 | Rate limit exceeded. Please:
32 |   - Wait a moment before retrying
33 |   - Check your usage limits at https://api.chonkie.ai/dashboard`;
34 |   } else if (statusCode === 404) {
35 |     helpText = `
36 | Endpoint not found: ${endpoint}
37 |   - Verify you're using the latest version of @chonkiejs/cloud
38 |   - Check the API documentation`;
39 |   } else if (statusCode >= 500) {
40 |     helpText = `
41 | Server error on api.chonkie.ai
42 |   - This is likely a temporary issue
43 |   - Try again in a few moments
44 |   - Check status at https://status.chonkie.ai (if available)`;
45 |   } else {
46 |     helpText = `
47 | Unexpected error occurred.`;
48 |   }
49 | 
50 |   const footer = `
51 | 
52 | If this error persists:
53 |   - Open an issue: https://github.com/chonkie-inc/chonkie-ts/issues
54 |   - Contact maintainer: bhavnick@chonkie.ai
55 |   - Include the error message and what you were trying to do`;
56 | 
57 |   return baseMessage + helpText + footer;
58 | }
59 | 
60 | /**
61 |  * Common API error types
62 |  */
63 | export const API_ERRORS = {
64 |   INVALID_API_KEY: 'Invalid API key',
65 |   RATE_LIMIT: 'Rate limit exceeded',
66 |   SERVER_ERROR: 'Server error',
67 |   NOT_FOUND: 'Endpoint not found',
68 |   FORBIDDEN: 'Access forbidden',
69 | } as const;
70 | 
71 | /**
72 |  * File reference type for API requests
73 |  */
74 | export interface FileReference {
75 |   /** Type of file reference */
76 |   type: 'document' | 'base64';
77 |   /** Content - either document name or base64 string */
78 |   content: string;
79 | }
80 | 
81 | /**
82 |  * Response from file upload endpoint
83 |  */
84 | export interface FileUploadResponse {
85 |   /** The document name/ID that can be used in subsequent API calls */
86 |   document: string;
87 |   /** Optional additional metadata */
88 |   [key: string]: unknown;
89 | }
90 | 
91 | /**
92 |  * Create a file reference object for use in JSON API requests
93 |  *
94 |  * @param type - Type of file reference ('document' or 'base64')
95 |  * @param content - The document name or base64 encoded string
96 |  * @returns FileReference object that can be included in API request bodies
97 |  *
98 |  * @example
99 |  * ```typescript
100 |  * // Using a document reference
101 |  * const fileRef = createFileReference('document', 'my-uploaded-file.pdf');
102 |  *
103 |  * // Using base64
104 |  * const base64Data = btoa('file contents');
105 |  * const fileRef = createFileReference('base64', base64Data);
106 |  * ```
107 |  */
108 | export function createFileReference(type: 'document' | 'base64', content: string): FileReference {
109 |   if (!type || (type !== 'document' && type !== 'base64')) {
110 |     throw new Error('File reference type must be either "document" or "base64"');
111 |   }
112 |   if (!content || typeof content !== 'string') {
113 |     throw new Error('File reference content must be a non-empty string');
114 |   }
115 |   return { type, content };
116 | }
117 | 
--------------------------------------------------------------------------------
/packages/cloud/src/chunkers/recursive.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Recursive chunker that uses hierarchical rules for chunking
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 | 
9 | export interface RecursiveChunkerOptions {
10 |   /** Tokenizer to use (default: "gpt2") */
11 |   tokenizer?: string;
12 |   /** Maximum tokens per chunk (default: 512) */
13 |   chunkSize?: number;
14 |   /** Recipe name (default: "default") */
15 |   recipe?: string;
16 |   /** Language for recipe (default: "en") */
17 |   lang?: string;
18 |   /** Minimum characters per chunk (default: 12) */
19 |   minCharactersPerChunk?: number;
20 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
21 |   apiKey?: string;
22 |   /** Base URL for API (default: "https://api.chonkie.ai") */
23 |   baseUrl?: string;
24 | }
25 | 
26 | interface ApiChunkResponse {
27 |   text: string;
28 |   start_index: number;
29 |   end_index: number;
30 |   token_count: number;
31 | }
32 | 
33 | interface RecursiveChunkPayload extends Record<string, unknown> {
34 |   text?: string;
35 |   file?: { type: string; content: string };
36 |   tokenizer_or_token_counter: string;
37 |   chunk_size: number;
38 |   recipe: string;
39 |   lang: string;
40 |   min_characters_per_chunk: number;
41 |   return_type: string;
42 | }
43 | 
44 | export class RecursiveChunker extends CloudBaseChunker {
45 |   private readonly config: {
46 |     tokenizer: string;
47 |     chunkSize: number;
48 |     recipe: string;
49 |     lang: string;
50 |     minCharactersPerChunk: number;
51 |   };
52 | 
53 |   constructor(options: RecursiveChunkerOptions = {}) {
54 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
55 |     if (!apiKey) {
56 |       throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
57 |     }
58 | 
59 |     super({ apiKey, baseUrl: options.baseUrl });
60 | 
61 |     this.config = {
62 |       tokenizer: options.tokenizer || 'gpt2',
63 |       chunkSize: options.chunkSize || 512,
64 |       recipe: options.recipe || 'default',
65 |       lang: options.lang || 'en',
66 |       minCharactersPerChunk: options.minCharactersPerChunk || 12,
67 |     };
68 |   }
69 | 
70 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
71 |     let fileRef = input.file;
72 | 
73 |     // If filepath is provided, upload it first to get a file reference
74 |     if (input.filepath) {
75 |       fileRef = await this.uploadFile(input.filepath);
76 |     }
77 | 
78 |     // Build the payload
79 |     const payload: RecursiveChunkPayload = {
80 |       tokenizer_or_token_counter: this.config.tokenizer,
81 |       chunk_size: this.config.chunkSize,
82 |       recipe: this.config.recipe,
83 |       lang: this.config.lang,
84 |       min_characters_per_chunk: this.config.minCharactersPerChunk,
85 |       return_type: 'chunks',
86 |     };
87 | 
88 |     // Add either text or file to the payload
89 |     if (fileRef) {
90 |       payload.file = fileRef;
91 |     } else if (input.text) {
92 |       payload.text = input.text;
93 |     } else {
94 |       throw new Error('Either text, filepath, or file must be provided');
95 |     }
96 | 
97 |     const data = await this.request<ApiChunkResponse[]>('/v1/chunk/recursive', {
98 |       method: 'POST',
99 |       body: payload,
100 |     });
101 | 
102 |     return data.map(chunk => new Chunk({
103 |       text: chunk.text,
104 |       startIndex: chunk.start_index,
105 |       endIndex: chunk.end_index,
106 |       tokenCount: chunk.token_count,
107 |     }));
108 |   }
109 | 
110 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
111 |     return Promise.all(inputs.map(input => this.chunk(input)));
112 |   }
113 | 
114 |   toString(): string {
115 |     return `RecursiveChunker(tokenizer=${this.config.tokenizer}, chunkSize=${this.config.chunkSize}, recipe=${this.config.recipe})`;
116 |   }
117 | }
118 | 
--------------------------------------------------------------------------------
/packages/token/README.md:
--------------------------------------------------------------------------------
1 | <div align="center">
2 | 
3 | ![Chonkie Logo](../../assets/chonkie_logo_br_transparent_bg.png)
4 | 
5 | # @chonkiejs/token
6 | 
7 | _HuggingFace tokenizer support for Chonkie - extends @chonkiejs/core with real tokenization._
8 | 
9 | [![npm version](https://img.shields.io/npm/v/@chonkiejs/token)](https://www.npmjs.com/package/@chonkiejs/token)
10 | [![npm license](https://img.shields.io/npm/l/@chonkiejs/token)](https://www.npmjs.com/package/@chonkiejs/token)
11 | [![GitHub](https://img.shields.io/badge/github-chonkie--ts-black.svg?logo=github)](https://github.com/chonkie-inc/chonkie-ts)
12 | 
13 | </div>
14 | 15 | ## Features 16 | 🤗 **HuggingFace Integration** - Use any HuggingFace tokenizer model
17 | 🔌 **Optional Plugin** - Install only when you need real tokenization
18 | 📦 **Zero Config** - Works automatically with @chonkiejs/core
19 | ⚡ **Progressive Enhancement** - Core works without it, better with it
20 | 21 | ## Installation 22 | 23 | Install with `npm`: 24 | ```bash 25 | npm i @chonkiejs/token @chonkiejs/core 26 | ``` 27 | 28 | Install with `pnpm`: 29 | ```bash 30 | pnpm add @chonkiejs/token @chonkiejs/core 31 | ``` 32 | 33 | Install with `yarn`: 34 | ```bash 35 | yarn add @chonkiejs/token @chonkiejs/core 36 | ``` 37 | 38 | Install with `bun`: 39 | ```bash 40 | bun add @chonkiejs/token @chonkiejs/core 41 | ``` 42 | 43 | ## Quick Start 44 | 45 | Simply install this package alongside `@chonkiejs/core`, then use tokenizer names: 46 | 47 | ```typescript 48 | import { RecursiveChunker, TokenChunker } from '@chonkiejs/core'; 49 | 50 | // Use GPT-2 tokenization (automatically uses @chonkiejs/token) 51 | const chunker = await RecursiveChunker.create({ 52 | tokenizer: 'Xenova/gpt2', 53 | chunkSize: 512 54 | }); 55 | 56 | const chunks = await chunker.chunk('Your text here...'); 57 | ``` 58 | 59 | ## Supported Models 60 | 61 | Any HuggingFace model from transformers.js: 62 | 63 | - `Xenova/gpt2` 64 | - `Xenova/gpt-4` 65 | - `bert-base-uncased` 66 | - `google-bert/bert-base-multilingual-cased` 67 | - And many more! 68 | 69 | See: https://huggingface.co/models?library=transformers.js 70 | 71 | ## Usage Examples 72 | 73 | ### With RecursiveChunker 74 | 75 | ```typescript 76 | import { RecursiveChunker } from '@chonkiejs/core'; 77 | 78 | const chunker = await RecursiveChunker.create({ 79 | tokenizer: 'Xenova/gpt2', 80 | chunkSize: 512 81 | }); 82 | 83 | const chunks = await chunker.chunk('Your document...'); 84 | ``` 85 | 86 | ### With TokenChunker 87 | 88 | ```typescript 89 | import { TokenChunker } from '@chonkiejs/core'; 90 | 91 | const chunker = await TokenChunker.create({ 92 | tokenizer: 'bert-base-uncased', 93 | chunkSize: 256, 94 | chunkOverlap: 50 95 | }); 96 | 97 | const chunks = await chunker.chunk('Your text...'); 98 | ``` 99 | 100 | ### Direct Tokenizer Usage 101 | 102 | ```typescript 103 | import { HuggingFaceTokenizer } from '@chonkiejs/token'; 104 | 105 | const tokenizer = await HuggingFaceTokenizer.create('Xenova/gpt2'); 106 | 107 | const count = tokenizer.countTokens('Hello world!'); 108 | const tokens = tokenizer.encode('Hello world!'); 109 | const text = tokenizer.decode(tokens); 110 | 111 | console.log(`Token count: ${count}`); 112 | ``` 113 | 114 | ## How It Works 115 | 116 | When you call `Tokenizer.create('gpt2')` in @chonkiejs/core: 117 | 118 | 1. Core tries to dynamically import `@chonkiejs/token` 119 | 2. **If installed:** Uses HuggingFaceTokenizer 120 | 3. **If not installed:** Shows helpful error message 121 | 122 | This keeps core lightweight while allowing advanced tokenization when needed! 123 | 124 | ## Contributing 125 | 126 | Want to help grow Chonkie? Check out [CONTRIBUTING.md](../../CONTRIBUTING.md) to get started! Whether you're fixing bugs, adding features, improving docs, or simply leaving a ⭐️ on the repo, every contribution helps make Chonkie a better CHONK for everyone. 127 | 128 | Remember: No contribution is too small for this tiny hippo! 129 | -------------------------------------------------------------------------------- /legacy/chonkie/friends/chroma.ts: -------------------------------------------------------------------------------- 1 | /** ChromaHandshake to integrate Chonkie with Chroma. 
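 *
 * A minimal usage sketch (the collection name and Chroma URL are illustrative
 * assumptions; `chunks` is any Chunk[] produced by a Chonkie chunker):
 *
 *   const handshake = new ChromaHandshake(undefined, 'chonkie-chunks', 'http://localhost:8000');
 *   await handshake.write(chunks);
 *   const results = await handshake.query('what is chonkie?', 5);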
 */
2 | 
3 | import { BaseHandshake } from "./base";
4 | import { ChromaClient } from "chromadb";
5 | import { generateRandomCollectionName } from "./utils";
6 | import { Chunk } from "../types/base";
7 | import { v5 as uuidv5 } from "uuid";
8 | 
9 | /**
10 |  * ChromaHandshake to integrate Chonkie with Chroma.
11 |  *
12 |  * @param client - The ChromaClient to use.
13 |  * @param collectionName - The name of the collection to use.
14 |  * @param path - The path to the Chroma database. Can point to the running instance, Docker or Cloud.
15 |  * @param logLevel - The log level ('verbose' or 'silent'). Default: 'verbose'.
16 |  */
17 | export class ChromaHandshake extends BaseHandshake {
18 | 
19 |   private client: ChromaClient;
20 |   private collectionName: string;
21 |   private logLevel: 'verbose' | 'silent';
22 | 
23 |   constructor(client?: ChromaClient, collectionName?: string, path?: string, logLevel: 'verbose' | 'silent' = 'verbose') {
24 |     super();
25 | 
26 |     // If the client is not provided, create a new one
27 |     this.client = client ?? new ChromaClient({ path });
28 |     // If the collection name is not provided, generate a random one
29 |     this.collectionName = collectionName ?? generateRandomCollectionName();
30 |     this.logLevel = logLevel;
31 | 
32 |     // Print to console the collection name if verbose
33 |     if (this.logLevel === 'verbose') {
34 |       console.log(`Using collection ${this.collectionName}`);
35 |     }
36 |   }
37 | 
38 |   private _getId(index: number, chunk: Chunk): string {
39 |     const id = uuidv5(`CHUNK-${index}:${chunk.text}`, uuidv5.DNS);
40 |     return id;
41 |   }
42 | 
43 |   /**
44 |    * Write chunks to the collection provided in the constructor.
45 |    * @param chunks - The chunks to write.
46 |    */
47 |   public async write(chunks: Chunk[]): Promise<void> {
48 |     // Check if the collection exists and if not, create it
49 |     const collection = await this.client.getOrCreateCollection({ name: this.collectionName });
50 | 
51 |     // Create a list of ids and documents to upsert
52 |     const ids: string[] = [];
53 |     const documents: string[] = [];
54 |     const metadatas: Record<string, any>[] = [];
55 |     for (const [index, chunk] of chunks.entries()) {
56 |       ids.push(this._getId(index, chunk));
57 |       documents.push(chunk.text);
58 |       metadatas.push({
59 |         "start_index": chunk.startIndex,
60 |         "end_index": chunk.endIndex,
61 |         "token_count": chunk.tokenCount,
62 |       });
63 |     }
64 | 
65 |     // Upsert the chunks into the collection
66 |     await collection.upsert({
67 |       ids: ids,
68 |       documents: documents,
69 |       metadatas: metadatas,
70 |     });
71 | 
72 |     // Print to console the number of chunks upserted if verbose
73 |     if (this.logLevel === 'verbose') {
74 |       console.log(`Upserted ${chunks.length} chunks into the collection ${this.collectionName}`);
75 |     }
76 |   }
77 | 
78 |   /**
79 |    * Query the collection provided in the constructor.
80 |    * @param query - The query to search for.
81 |    * @param nResults - The number of results to return.
82 |    * @returns The chunks that match the query.
83 |    */
84 |   public async query(query: string, nResults: number = 10): Promise<Chunk[]> {
85 |     const collection = await this.client.getCollection({ name: this.collectionName });
86 |     const results = await collection.query({
87 |       queryTexts: [query],
88 |       nResults: nResults,
89 |     });
90 | 
91 |     // Return the chunks
92 |     const { documents, metadatas } = results;
93 |     return documents[0].map((document, index) => {
94 |       const metadata = metadatas[0][index];
95 |       return new Chunk({
96 |         text: document ?? '',
97 |         startIndex: Number(metadata?.start_index) || 0,
98 |         endIndex: Number(metadata?.end_index) || 0,
99 |         tokenCount: Number(metadata?.token_count) || 0,
100 |       });
101 |     });
102 |   }
103 | }
--------------------------------------------------------------------------------
/packages/core/src/token.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Token chunker that splits text into fixed-size token chunks.
3 |  */
4 | 
5 | import { Tokenizer } from '@/tokenizer';
6 | import { Chunk } from '@/types';
7 | 
8 | export interface TokenChunkerOptions {
9 |   /** Tokenizer instance or model name (default: 'character') */
10 |   tokenizer?: Tokenizer | string;
11 |   /** Maximum tokens per chunk (default: 512) */
12 |   chunkSize?: number;
13 |   /** Number of tokens to overlap between chunks (default: 0) */
14 |   chunkOverlap?: number;
15 | }
16 | 
17 | /**
18 |  * Splits text into fixed-size token chunks with optional overlap.
19 |  *
20 |  * Uses character-based tokenization by default, but can use advanced
21 |  * tokenizers from @chonkiejs/token package.
22 |  */
23 | export class TokenChunker {
24 |   public readonly chunkSize: number;
25 |   public readonly chunkOverlap: number;
26 |   private tokenizer: Tokenizer;
27 | 
28 |   private constructor(
29 |     tokenizer: Tokenizer,
30 |     chunkSize: number,
31 |     chunkOverlap: number
32 |   ) {
33 |     if (chunkSize <= 0) {
34 |       throw new Error('chunkSize must be greater than 0');
35 |     }
36 |     if (chunkOverlap < 0) {
37 |       throw new Error('chunkOverlap must be non-negative');
38 |     }
39 |     if (chunkOverlap >= chunkSize) {
40 |       throw new Error('chunkOverlap must be less than chunkSize');
41 |     }
42 | 
43 |     this.tokenizer = tokenizer;
44 |     this.chunkSize = chunkSize;
45 |     this.chunkOverlap = chunkOverlap;
46 |   }
47 | 
48 |   /**
49 |    * Create a TokenChunker instance.
50 |    *
51 |    * @param options - Configuration options
52 |    * @returns Promise resolving to TokenChunker instance
53 |    *
54 |    * @example
55 |    * // Character-based (no dependencies)
56 |    * const chunker = await TokenChunker.create({ chunkSize: 512 });
57 |    *
58 |    * @example
59 |    * // With HuggingFace tokenizer (requires @chonkiejs/token)
60 |    * const chunker = await TokenChunker.create({
61 |    *   tokenizer: 'gpt2',
62 |    *   chunkSize: 512,
63 |    *   chunkOverlap: 50
64 |    * });
65 |    */
66 |   static async create(options: TokenChunkerOptions = {}): Promise<TokenChunker> {
67 |     const {
68 |       tokenizer = 'character',
69 |       chunkSize = 512,
70 |       chunkOverlap = 0,
71 |     } = options;
72 | 
73 |     let tokenizerInstance: Tokenizer;
74 | 
75 |     if (typeof tokenizer === 'string') {
76 |       tokenizerInstance = await Tokenizer.create(tokenizer);
77 |     } else {
78 |       tokenizerInstance = tokenizer;
79 |     }
80 | 
81 |     return new TokenChunker(tokenizerInstance, chunkSize, chunkOverlap);
82 |   }
83 | 
84 |   /**
85 |    * Chunk a single text into fixed-size token chunks.
86 |    *
87 |    * @param text - The text to chunk
88 |    * @returns Array of chunks
89 |    */
90 |   async chunk(text: string): Promise<Chunk[]> {
91 |     if (!text) {
92 |       return [];
93 |     }
94 | 
95 |     const tokens = this.tokenizer.encode(text);
96 |     const chunks: Chunk[] = [];
97 |     const step = this.chunkSize - this.chunkOverlap;
98 | 
99 |     for (let i = 0; i < tokens.length; i += step) {
100 |       const chunkTokens = tokens.slice(i, i + this.chunkSize);
101 |       const chunkText = this.tokenizer.decode(chunkTokens);
102 |       const startIndex = this.findStartIndex(text, chunkText, i > 0 ? chunks[chunks.length - 1].endIndex : 0);
103 |       const endIndex = startIndex + chunkText.length;
104 | 
105 |       chunks.push(new Chunk({
106 |         text: chunkText,
107 |         startIndex,
108 |         endIndex,
109 |         tokenCount: chunkTokens.length,
110 |       }));
111 |     }
112 | 
113 |     return chunks;
114 |   }
115 | 
116 |   /**
117 |    * Find the start index of chunk text in the original text.
118 |    * This handles overlaps correctly.
119 |    */
120 |   private findStartIndex(text: string, chunkText: string, searchFrom: number): number {
121 |     const index = text.indexOf(chunkText, searchFrom);
122 |     return index !== -1 ? index : searchFrom;
123 |   }
124 | 
125 |   toString(): string {
126 |     return `TokenChunker(chunkSize=${this.chunkSize}, overlap=${this.chunkOverlap})`;
127 |   }
128 | }
129 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/semantic.ts:
--------------------------------------------------------------------------------
1 | /** Semantic chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { SemanticChunk } from "../types/semantic";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface SemanticChunkerConfig {
9 |   embeddingModel?: string;
10 |   threshold?: number | "auto";
11 |   chunkSize?: number;
12 |   similarityWindow?: number;
13 |   minSentences?: number;
14 |   minChunkSize?: number;
15 |   minCharactersPerSentence?: number;
16 |   thresholdStep?: number;
17 |   delim?: string | string[];
18 |   includeDelim?: "prev" | "next" | null;
19 | }
20 | 
21 | export class SemanticChunker extends CloudClient {
22 |   private readonly config: Required<SemanticChunkerConfig>;
23 | 
24 |   constructor(apiKey: string, config: SemanticChunkerConfig = {}) {
25 |     super({ apiKey });
26 |     this.config = {
27 |       embeddingModel: config.embeddingModel || "minishlab/potion-base-8M",
28 |       threshold: config.threshold ?? "auto",
29 |       chunkSize: config.chunkSize || 512,
30 |       similarityWindow: config.similarityWindow || 1,
31 |       minSentences: config.minSentences || 1,
32 |       minChunkSize: config.minChunkSize || 2,
33 |       minCharactersPerSentence: config.minCharactersPerSentence || 12,
34 |       thresholdStep: config.thresholdStep || 0.01,
35 |       delim: config.delim || [".", "!", "?", "\n"],
36 |       includeDelim: config.includeDelim ?? "prev",
"prev", 37 | }; 38 | } 39 | 40 | async chunk(input: ChunkerInput): Promise { 41 | const formData = new FormData(); 42 | 43 | if (input.filepath) { 44 | const fileContent = fs.readFileSync(input.filepath); 45 | const fileName = path.basename(input.filepath) || 'file.txt'; 46 | formData.append("file", new Blob([fileContent]), fileName); 47 | } else if (input.text) { 48 | // JSON encode the text 49 | formData.append("text", JSON.stringify(input.text)); 50 | // Append empty file to ensure multipart form 51 | formData.append("file", new Blob(), "text_input.txt"); 52 | } else { 53 | throw new Error("Either text or filepath must be provided"); 54 | } 55 | 56 | // Add all config options to the form data 57 | formData.append("embedding_model", this.config.embeddingModel); 58 | formData.append("threshold", this.config.threshold.toString()); 59 | formData.append("chunk_size", this.config.chunkSize.toString()); 60 | formData.append("similarity_window", this.config.similarityWindow.toString()); 61 | formData.append("min_sentences", this.config.minSentences.toString()); 62 | formData.append("min_chunk_size", this.config.minChunkSize.toString()); 63 | formData.append("min_characters_per_sentence", this.config.minCharactersPerSentence.toString()); 64 | formData.append("threshold_step", this.config.thresholdStep.toString()); 65 | formData.append("delim", JSON.stringify(this.config.delim)); 66 | formData.append("include_delim", this.config.includeDelim || "prev"); 67 | formData.append("return_type", "chunks"); 68 | 69 | const data = await this.request("/v1/chunk/semantic", { 70 | method: "POST", 71 | body: formData, 72 | }); 73 | 74 | // Convert from snake_case to camelCase 75 | const camelCaseData = data.map((chunk: any) => { 76 | return { 77 | text: chunk.text, 78 | startIndex: chunk.start_index, 79 | endIndex: chunk.end_index, 80 | tokenCount: chunk.token_count, 81 | embedding: chunk.embedding || undefined, 82 | sentences: chunk.sentences.map((sentence: any) => { 83 | return { 84 | text: sentence.text, 85 | startIndex: sentence.start_index, 86 | endIndex: sentence.end_index, 87 | tokenCount: sentence.token_count, 88 | embedding: sentence.embedding || undefined, 89 | }; 90 | }), 91 | }; 92 | }); 93 | 94 | return camelCaseData.map((chunk: any) => SemanticChunk.fromDict(chunk)); 95 | } 96 | 97 | async chunkBatch(inputs: ChunkerInput[]): Promise { 98 | return Promise.all(inputs.map(input => this.chunk(input))); 99 | } 100 | } -------------------------------------------------------------------------------- /legacy/chonkie/cloud/sdpm.ts: -------------------------------------------------------------------------------- 1 | /** SDPM chunker client for Chonkie API. 
 */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { SemanticChunk } from "../types/semantic";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface SDPMChunkerConfig {
9 |   embeddingModel?: string;
10 |   threshold?: number | "auto";
11 |   mode?: "window" | "cumulative";
12 |   chunkSize?: number;
13 |   similarityWindow?: number;
14 |   minSentences?: number;
15 |   minCharactersPerSentence?: number;
16 |   thresholdStep?: number;
17 |   delim?: string | string[];
18 |   includeDelim?: "prev" | "next" | null;
19 | }
20 | 
21 | export class SDPMChunker extends CloudClient {
22 |   private readonly config: Required<SDPMChunkerConfig>;
23 | 
24 |   constructor(apiKey: string, config: SDPMChunkerConfig = {}) {
25 |     super({ apiKey });
26 |     this.config = {
27 |       embeddingModel: config.embeddingModel || "minishlab/potion-base-8M",
28 |       threshold: config.threshold ?? "auto",
29 |       mode: config.mode || "window",
30 |       chunkSize: config.chunkSize || 512,
31 |       similarityWindow: config.similarityWindow || 1,
32 |       minSentences: config.minSentences || 1,
33 |       minCharactersPerSentence: config.minCharactersPerSentence || 12,
34 |       thresholdStep: config.thresholdStep || 0.01,
35 |       delim: config.delim || [".", "!", "?", "\n"],
36 |       includeDelim: config.includeDelim ?? "prev",
37 |     };
38 |   }
39 | 
40 |   async chunk(input: ChunkerInput): Promise<SemanticChunk[]> {
41 |     const formData = new FormData();
42 | 
43 |     if (input.filepath) {
44 |       const fileContent = fs.readFileSync(input.filepath);
45 |       const fileName = path.basename(input.filepath) || 'file.txt';
46 |       formData.append("file", new Blob([fileContent]), fileName);
47 |     } else if (input.text) {
48 |       // JSON encode the text
49 |       formData.append("text", JSON.stringify(input.text));
50 |       // Append empty file to ensure multipart form
51 |       formData.append("file", new Blob(), "text_input.txt");
52 |     } else {
53 |       throw new Error("Either text or filepath must be provided");
54 |     }
55 | 
56 |     formData.append("embedding_model", this.config.embeddingModel);
57 |     if (typeof this.config.threshold === "number") {
58 |       formData.append("threshold", this.config.threshold.toString());
59 |     } else {
60 |       formData.append("threshold", this.config.threshold);
61 |     }
62 |     formData.append("mode", this.config.mode);
63 |     formData.append("chunk_size", this.config.chunkSize.toString());
64 |     formData.append("similarity_window", this.config.similarityWindow.toString());
65 |     formData.append("min_sentences", this.config.minSentences.toString());
66 |     formData.append("min_characters_per_sentence", this.config.minCharactersPerSentence.toString());
67 |     formData.append("threshold_step", this.config.thresholdStep.toString());
68 |     // Append delim as a string array
69 |     formData.append("delim", JSON.stringify(this.config.delim));
70 |     formData.append("include_delim", this.config.includeDelim || "prev");
71 |     formData.append("return_type", "chunks");
72 | 
73 |     const data = await this.request("/v1/chunk/sdpm", {
74 |       method: "POST",
75 |       body: formData,
76 |     });
77 | 
78 |     // Convert from snake_case to camelCase
79 |     const camelCaseData = data.map((chunk: any) => {
80 |       return {
81 |         text: chunk.text,
82 |         startIndex: chunk.start_index,
83 |         endIndex: chunk.end_index,
84 |         tokenCount: chunk.token_count,
85 |         embedding: chunk.embedding || undefined,
86 |         sentences: chunk.sentences.map((sentence: any) => {
87 |           return {
88 |             text: sentence.text,
89 |             startIndex: sentence.start_index,
90 |             endIndex: sentence.end_index,
91 |             tokenCount: sentence.token_count,
92 |             embedding: sentence.embedding || undefined,
93 |           };
94 |         }),
95 |       };
96 |     });
97 | 
98 |     return camelCaseData.map((chunk: any) => SemanticChunk.fromDict(chunk));
99 |   }
100 | 
101 |   async chunkBatch(inputs: ChunkerInput[]): Promise<SemanticChunk[][]> {
102 |     return Promise.all(inputs.map(input => this.chunk(input)));
103 |   }
104 | }
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # 🦛 Contributing to Chonkie
2 | 
3 | > "I like them big, I like them CONTRIBUTING" ~ Moto Moto, probably
4 | 
5 | Welcome fellow CHONKer! We're thrilled you want to contribute to Chonkie. Every contribution—whether fixing bugs, adding features, or improving documentation—makes Chonkie better for everyone.
6 | 
7 | ## 🚀 Getting Started
8 | 
9 | ### Before You Dive In
10 | 
11 | 1. **Check existing issues** or open a new one to start a discussion
12 | 2. **Read [Chonkie's documentation](https://docs.chonkie.ai)** and core [concepts](https://docs.chonkie.ai/getting-started/concepts)
13 | 3. **Set up your development environment** using the guide below
14 | 
15 | ### Development Setup
16 | 
17 | ```bash
18 | # 1. Fork and clone the repository
19 | git clone https://github.com/chonkie-inc/chonkie-ts.git
20 | cd chonkie-ts
21 | 
22 | # 2. Install dependencies (this repo is a pnpm workspace)
23 | pnpm install
24 | ```
25 | 
26 | ## 🧪 Testing & Code Quality
27 | 
28 | ### Running Tests
29 | 
30 | ```bash
31 | cd packages/cloud                    # Each package runs its own Vitest suite
32 | npx vitest run                       # Run all tests in the package
33 | npx vitest run tests/token.test.ts   # Run a specific test file
34 | ```
35 | 
36 | ### Documentation Style
37 | 
38 | We follow JSDoc-style doc comments:
39 | 
40 | ```typescript
41 | /**
42 |  * Splits text into chunks of specified size.
43 |  *
44 |  * @param text - Input text to chunk
45 |  * @param chunk_size - Maximum size of each chunk
46 |  *
47 |  * @returns Array of text chunks
48 |  *
49 |  * @throws Error if chunk_size <= 0
50 |  */
51 | function chunk_text(text: string, chunk_size: number): string[] {
52 |   return text.split(' ').slice(0, chunk_size);
53 | }
54 | ```
55 | 
56 | ## 📦 Project Structure
57 | 
58 | ```
59 | packages/
60 | ├── core/     # Local chunkers (zero dependencies)
61 | ├── cloud/    # Cloud API clients
62 | ├── token/    # HuggingFace tokenizer support
63 | legacy/chonkie/  # Original port (chunkers, cloud clients, types)
64 | ```
65 | 
66 | ## 🎯 Contribution Opportunities
67 | 
68 | ### For Beginners
69 | 
70 | Start with issues labeled [`good-first-issue`](https://github.com/chonkie-inc/chonkie/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)
71 | 
72 | ### Documentation
73 | 
74 | - Improve existing docs
75 | - Add examples or tutorials
76 | - Fix typos
77 | 
78 | ### Code Improvements
79 | 
80 | - Implement new chunking strategies
81 | - Add tokenizer support
82 | - Optimize existing chunkers
83 | - Improve test coverage
84 | - Bring to feature parity with Python library
85 | 
86 | ### Performance Enhancements
87 | 
88 | - Profile and optimize code
89 | - Add benchmarks
90 | - Improve memory usage
91 | 
92 | ### New Features
93 | 
94 | Look for issues with [FEAT] labels, especially those from Chonkie Maintainers
95 | 
96 | ## 🚦 Pull Request Process
97 | 
98 | ### 1. Branch Naming
99 | 
100 | - `feature/description` for new features
101 | - `fix/description` for bug fixes
102 | - `docs/description` for documentation changes
103 | 
104 | ### 2. Commit Messages
105 | 
106 | Write clear, descriptive commit messages:
107 | 
108 | ```
109 | feat: add batch processing to WordChunker
110 | 
111 | - Implement batch_process method
112 | - Add tests for batch processing
113 | - Update documentation
114 | ```
115 | 
116 | ### 3. Code Review
117 | 
118 | - **Make sure your PR is for the `development` branch**
119 | - All PRs need at least one review
120 | - Maintainers will review for:
121 |   - Code quality (linting)
122 |   - Test coverage
123 |   - Performance impact
124 |   - Documentation completeness
125 | 
126 | ## 🦛 Technical Details
127 | 
128 | ### Semantic Versioning
129 | 
130 | Chonkie does not follow strict semantic versioning. We follow the following rules:
131 | 
132 | - 'MAJOR' version when we refactor/rewrite large parts of the codebase
133 | - 'MINOR' version when we add breaking changes (e.g. changing a public API)
134 | - 'PATCH' version when we add non-breaking features (e.g. adding a new chunker) or fix bugs
135 | 
136 | ## 💡 Getting Help
137 | 
138 | - **Chat?** [Join our Discord!](https://discord.gg/Q6zkP8w6ur)
139 | - **Questions?** Open an issue or ask in Discord
140 | - **Bugs?** Open an issue or report in Discord
141 | - **Email?** Contact [support@chonkie.ai](mailto:support@chonkie.ai)
142 | 
143 | ## 🙏 Thank You
144 | 
145 | Every contribution helps make Chonkie better! We appreciate your time and effort in helping make Chonkie the CHONKiest it can be!
146 | 
147 | Remember:
148 | > "A journey of a thousand CHONKs begins with a single commit" ~ Ancient Proverb, probably
149 | 
--------------------------------------------------------------------------------
/legacy/chonkie/types/base.ts:
--------------------------------------------------------------------------------
1 | /** Custom base types for Chonkie. */
2 | 
3 | /**
4 |  * Represents the data structure for a chunk object.
5 |  *
6 |  * @property {string} text - The text of the chunk.
7 |  * @property {number} startIndex - The starting index of the chunk in the original text.
8 |  * @property {number} endIndex - The ending index of the chunk in the original text.
9 |  * @property {number} tokenCount - The number of tokens in the chunk.
10 |  */
11 | interface ChunkData {
12 |   text: string;
13 |   startIndex: number;
14 |   endIndex: number;
15 |   tokenCount: number;
16 |   embedding?: number[];
17 | }
18 | 
19 | /**
20 |  * Represents a chunk of text with associated metadata.
21 |  *
22 |  * @property {string} text - The text of the chunk.
23 |  * @property {number} startIndex - The starting index of the chunk in the original text.
24 |  * @property {number} endIndex - The ending index of the chunk in the original text.
25 |  * @property {number} tokenCount - The number of tokens in the chunk.
26 |  * @property {number[]} [embedding] - The embedding for the chunk.
27 |  */
28 | export class Chunk {
29 |   /** The text of the chunk. */
30 |   public text: string;
31 |   /** The starting index of the chunk in the original text. */
32 |   public startIndex: number;
33 |   /** The ending index of the chunk in the original text. */
34 |   public endIndex: number;
35 |   /** The number of tokens in the chunk. */
36 |   public tokenCount: number;
37 |   /** Optional embedding for the chunk. */
38 |   public embedding?: number[];
39 | 
40 |   /**
41 |    * Constructs a new Chunk object.
42 |    *
43 |    * @param {ChunkData} data - The data to construct the Chunk from.
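 *
 * @example
 * // A minimal construction sketch (values are illustrative); with
 * // character-based tokenization, tokenCount equals text.length.
 * const chunk = new Chunk({ text: 'Hello', startIndex: 0, endIndex: 5, tokenCount: 5 });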
44 |    */
45 |   constructor(data: {
46 |     text: string;
47 |     startIndex: number;
48 |     endIndex: number;
49 |     tokenCount: number;
50 |     embedding?: number[];
51 |   }) {
52 |     this.text = data.text;
53 |     this.startIndex = data.startIndex;
54 |     this.endIndex = data.endIndex;
55 |     this.tokenCount = data.tokenCount;
56 |     this.embedding = data.embedding;
57 | 
58 |     // Basic validation, more can be added if needed
59 |     if (this.startIndex > this.endIndex) {
60 |       throw new Error("Start index must be less than or equal to end index.");
61 |     }
62 |     if (this.tokenCount < 0) {
63 |       throw new Error("Token count must be a non-negative integer.");
64 |     }
65 |   }
66 | 
67 |   /** Return a string representation of the Chunk.
68 |    *
69 |    * @returns {string} The text of the chunk.
70 |    */
71 |   public toString(): string {
72 |     return this.text;
73 |   }
74 | 
75 |   /** Return a detailed string representation of the Chunk.
76 |    *
77 |    * @returns {string} The detailed string representation of the Chunk.
78 |    */
79 |   public toRepresentation(): string {
80 |     let repr = `Chunk(text='${this.text}', tokenCount=${this.tokenCount}, startIndex=${this.startIndex}, endIndex=${this.endIndex}`;
81 |     repr += ')';
82 |     return repr;
83 |   }
84 | 
85 |   /** Return a slice of the chunk's text.
86 |    *
87 |    * @param {number} [start] - The starting index of the slice.
88 |    * @param {number} [end] - The ending index of the slice.
89 |    * @returns {string} The slice of the chunk's text.
90 |    */
91 |   public slice(start?: number, end?: number): string {
92 |     return this.text.slice(start, end);
93 |   }
94 | 
95 |   /** Return the Chunk as a dictionary-like object.
96 |    *
97 |    * @returns {ChunkData} The dictionary-like object.
98 |    */
99 |   public toDict(): ChunkData {
100 |     return {
101 |       text: this.text,
102 |       startIndex: this.startIndex,
103 |       endIndex: this.endIndex,
104 |       tokenCount: this.tokenCount,
105 |       embedding: this.embedding,
106 |     };
107 |   }
108 | 
109 |   /** Create a Chunk object from a dictionary-like object.
110 |    *
111 |    * @param {ChunkData} data - The dictionary-like object.
112 |    * @returns {Chunk} The Chunk object.
113 |    */
114 |   public static fromDict(data: ChunkData): Chunk {
115 |     return new Chunk({
116 |       text: data.text,
117 |       startIndex: data.startIndex,
118 |       endIndex: data.endIndex,
119 |       tokenCount: data.tokenCount,
120 |       embedding: data.embedding,
121 |     });
122 |   }
123 | 
124 |   /** Return a deep copy of the chunk.
125 |    *
126 |    * @returns {Chunk} The deep copy of the chunk.
127 |    */
128 |   public copy(): Chunk {
129 |     return Chunk.fromDict(this.toDict());
130 |   }
131 | }
132 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | <div align="center">
2 | 
3 | ![Chonkie Logo](./assets/chonkie_logo_br_transparent_bg.png)
4 | 
5 | # 🦛 chonkiejs ✨
6 | 
7 | [![npm version](https://img.shields.io/npm/v/@chonkiejs/core)](https://www.npmjs.com/package/@chonkiejs/core)
8 | [![npm downloads](https://img.shields.io/npm/dt/@chonkiejs/core)](https://www.npmjs.com/package/@chonkiejs/core)
9 | [![npm license](https://img.shields.io/npm/l/@chonkiejs/core)](https://www.npmjs.com/package/@chonkiejs/core)
10 | [![npm bundle size](https://img.shields.io/bundlephobia/min/@chonkiejs/core)](https://www.npmjs.com/package/@chonkiejs/core)
11 | [![Discord](https://dcbadge.limes.pink/api/server/https://discord.gg/rYYp6DC4cv?style=flat)](https://discord.gg/rYYp6DC4cv)
12 | [![Github Stars](https://img.shields.io/github/stars/chonkie-inc/chonkie-ts?style=social)](https://github.com/chonkie-inc/chonkie-ts)
13 | 
14 | _🦛 CHONK your texts in TypeScript with Chonkie!✨ The no-nonsense lightweight and efficient chunking library._
15 | 
16 | [Installation](#-installation) •
17 | [Usage](#-usage) •
18 | [Packages](#-packages) •
19 | [Acknowledgements](#acknowledgements) •
20 | [Citation](#citation)
21 | 
22 | </div>
23 | 24 | We built `chonkiejs` while developing a TypeScript web app that needed fast, on-the-fly text chunking for RAG applications. After trying several existing libraries, we found them either too heavy or not flexible enough for our needs. `chonkiejs` is a port of the original `chonkie` library, but with some type-safety and a few extra features to make it more useful for TypeScript developers! 25 | 26 | **🚀 Feature-rich**: All the CHONKs you'd ever need
27 | **✨ Easy to use**: Install, Import, CHONK
28 | **⚡ Fast**: CHONK at the max speed of TypeScript! tssssooooooom
29 | **🪶 Light-weight**: No bloat, just CHONK
30 | **🦛 Cute CHONK mascot**: psst it's a pygmy hippo btw
31 | **❤️ [Moto Moto](#acknowledgements)'s favorite TypeScript library**
32 | 
33 | **Chonkie** is a chunking library that "**just works**" ✨
34 | 
35 | > [!NOTE]
36 | > This library is not a _binding_ but a _port_ of the original `chonkie` library written in Python, to TypeScript. This library is still under active development and not at feature parity with the original `chonkie` library yet. Please bear with us! 🫂
37 | 
38 | ## 📦 Installation
39 | 
40 | ```bash
41 | npm install @chonkiejs/core
42 | ```
43 | 
44 | ## 📚 Usage
45 | 
46 | ```typescript
47 | import { RecursiveChunker } from '@chonkiejs/core';
48 | 
49 | // Create a chunker
50 | const chunker = await RecursiveChunker.create({
51 |   chunkSize: 512
52 | });
53 | 
54 | // Chunk your text
55 | const chunks = await chunker.chunk('Your text here...');
56 | 
57 | // Use the chunks
58 | for (const chunk of chunks) {
59 |   console.log(chunk.text);
60 |   console.log(`Tokens: ${chunk.tokenCount}`);
61 | }
62 | ```
63 | 
64 | ## 📦 Packages
65 | 
66 | | Package | Description | Dependencies |
67 | |---------|-------------|--------------|
68 | | [@chonkiejs/core](./packages/core) | Local chunking (Recursive, Token) with character-based tokenization | Zero |
69 | | [@chonkiejs/cloud](./packages/cloud) | Cloud-based chunkers (Semantic, Neural, Code, etc.) via api.chonkie.ai | @chonkiejs/core |
70 | | [@chonkiejs/token](./packages/token) | HuggingFace tokenizer support for core chunkers | @huggingface/transformers |
71 | 
72 | ## Contributing
73 | 
74 | Want to help grow Chonkie? Check out [CONTRIBUTING.md](CONTRIBUTING.md) to get started! Whether you're fixing bugs, adding features, improving docs, or simply leaving a ⭐️ on the repo, every contribution helps make Chonkie a better CHONK for everyone.
75 | 
76 | Remember: No contribution is too small for this tiny hippo!
77 | 
78 | ## Acknowledgements
79 | 
80 | Chonkie would like to CHONK its way through a special thanks to all the users and contributors who have helped make this library what it is today! Your feedback, issue reports, and improvements have helped make Chonkie the CHONKIEST it can be.
81 | 
82 | And of course, special thanks to [Moto Moto](https://www.youtube.com/watch?v=I0zZC4wtqDQ&t=5s) for endorsing Chonkie with his famous quote:
83 | > "I like them big, I like them chonkie in TypeScript" ~ Moto Moto... definitely did not say this
84 | 
85 | ## Citation
86 | 
87 | If you use Chonkie in your research, please cite it as follows:
88 | 
89 | ```bibtex
90 | @software{chonkie2025,
91 |   author = {Bhavnick Minhas and Shreyash Nigam},
92 |   title = {Chonkie: A no-nonsense fast, lightweight, and efficient text chunking library},
93 |   year = {2025},
94 |   publisher = {GitHub},
95 |   howpublished = {\url{https://github.com/chonkie-inc/chonkie}},
96 | }
97 | ```
98 | 
--------------------------------------------------------------------------------
/packages/cloud/README.md:
--------------------------------------------------------------------------------
1 | <div align="center">
2 | 3 | ![Chonkie Logo](../../assets/chonkie_logo_br_transparent_bg.png) 4 | 5 | # @chonkiejs/cloud 6 | 7 | _Cloud-based chunkers for Chonkie via api.chonkie.ai - semantic, neural, and AI-powered text chunking._ 8 | 9 | [![npm version](https://img.shields.io/npm/v/@chonkiejs/cloud)](https://www.npmjs.com/package/@chonkiejs/cloud) 10 | [![npm license](https://img.shields.io/npm/l/@chonkiejs/cloud)](https://www.npmjs.com/package/@chonkiejs/cloud) 11 | [![Documentation](https://img.shields.io/badge/docs-DOCS.md-blue.svg)](./DOCS.md) 12 | [![GitHub](https://img.shields.io/badge/github-chonkie--ts-black.svg?logo=github)](https://github.com/chonkie-inc/chonkie-ts) 13 | 14 |
15 | 16 | ## Features 17 | 🌐 **Cloud-Powered** - Leverage powerful chunking via api.chonkie.ai
18 | 🧠 **Semantic & Neural** - AI-powered intelligent chunking
19 | 🔧 **Refineries** - Post-process chunks with embeddings and overlap
20 | 🔑 **Easy Auth** - Auto-reads CHONKIE_API_KEY from environment
21 | 📦 **Returns Chunk Objects** - Compatible with @chonkiejs/core
22 | ✨ **Clean API** - Simple, consistent interface across all chunkers
23 | 24 | ## Installation 25 | 26 | Install with `npm`: 27 | ```bash 28 | npm i @chonkiejs/cloud 29 | ``` 30 | 31 | Install with `pnpm`: 32 | ```bash 33 | pnpm add @chonkiejs/cloud 34 | ``` 35 | 36 | Install with `yarn`: 37 | ```bash 38 | yarn add @chonkiejs/cloud 39 | ``` 40 | 41 | Install with `bun`: 42 | ```bash 43 | bun add @chonkiejs/cloud 44 | ``` 45 | 46 | ## Quick Start 47 | 48 | Set your API key: 49 | ```bash 50 | export CHONKIE_API_KEY=your-api-key-here 51 | ``` 52 | 53 | Use a chunker: 54 | ```typescript 55 | import { SemanticChunker } from '@chonkiejs/cloud'; 56 | 57 | // Create a chunker (automatically uses CHONKIE_API_KEY) 58 | const chunker = new SemanticChunker({ 59 | chunkSize: 512, 60 | threshold: 0.5 61 | }); 62 | 63 | // Chunk your text 64 | const chunks = await chunker.chunk({ text: 'Your text here...' }); 65 | 66 | // Use the chunks 67 | for (const chunk of chunks) { 68 | console.log(chunk.text); 69 | console.log(`Tokens: ${chunk.tokenCount}`); 70 | } 71 | ``` 72 | 73 | ## Available Chunkers 74 | 75 | | Name | Description | 76 | |------|-------------| 77 | | `TokenChunker` | Splits text into fixed-size token chunks with optional overlap | 78 | | `SentenceChunker` | Splits text into sentence-based chunks respecting sentence boundaries | 79 | | `RecursiveChunker` | Uses hierarchical rules (paragraphs → sentences → punctuation → words) with customizable recipes | 80 | | `SemanticChunker` | Creates semantically coherent chunks using embedding-based similarity analysis | 81 | | `NeuralChunker` | Uses neural networks for intelligent, context-aware chunking | 82 | | `CodeChunker` | Splits code into structurally meaningful chunks based on AST parsing | 83 | | `LateChunker` | Recursive chunking with embeddings for enhanced semantic coherence | 84 | 85 | ## Available Refineries 86 | 87 | | Name | Description | 88 | |------|-------------| 89 | | `EmbeddingsRefinery` | Post-processes chunks by adding embeddings using specified embedding model | 90 | | `OverlapRefinery` | Adds contextual overlap between chunks for better coherence | 91 | 92 | For detailed API documentation, configuration options, and advanced usage, see [DOCS.md](./DOCS.md). 93 | 94 | ## Contributing 95 | 96 | Want to help grow Chonkie? Check out [CONTRIBUTING.md](../../CONTRIBUTING.md) to get started! Whether you're fixing bugs, adding features, improving docs, or simply leaving a ⭐️ on the repo, every contribution helps make Chonkie a better CHONK for everyone. 97 | 98 | Remember: No contribution is too small for this tiny hippo! 99 | 100 | ## Acknowledgements 101 | 102 | Chonkie would like to CHONK its way through a special thanks to all the users and contributors who have helped make this library what it is today! Your feedback, issue reports, and improvements have helped make Chonkie the CHONKIEST it can be. 103 | 104 | And of course, special thanks to [Moto Moto](https://www.youtube.com/watch?v=I0zZC4wtqDQ&t=5s) for endorsing Chonkie with his famous quote: 105 | > "I like them big, I like them chonkie in TypeScript" ~ Moto Moto... 
definitely did not say this 106 | 107 | ## Citation 108 | 109 | If you use Chonkie in your research, please cite it as follows: 110 | 111 | ```bibtex 112 | @software{chonkie2025, 113 | author = {Bhavnick Minhas and Shreyash Nigam}, 114 | title = {Chonkie: A no-nonsense fast, lightweight, and efficient text chunking library}, 115 | year = {2025}, 116 | publisher = {GitHub}, 117 | howpublished = {\url{https://github.com/chonkie-inc}}, 118 | } 119 | ``` 120 | -------------------------------------------------------------------------------- /packages/cloud/src/base.ts: --------------------------------------------------------------------------------
1 | /**
2 | * Base cloud client for interacting with api.chonkie.ai
3 | */
4 |
5 | import { formatApiError, FileUploadResponse, FileReference, createFileReference } from '@/utils';
6 | import * as fs from 'fs';
7 | import * as path from 'path';
8 | import * as mime from 'mime-types';
9 |
10 | export interface CloudClientConfig {
11 | apiKey: string;
12 | baseUrl?: string;
13 | }
14 |
15 | export interface ChunkerInput {
16 | text?: string;
17 | filepath?: string;
18 | file?: FileReference;
19 | }
20 |
21 | export class CloudBaseChunker {
22 | protected readonly apiKey: string;
23 | protected readonly baseUrl: string;
24 |
25 | constructor(config: CloudClientConfig) {
26 | if (!config.apiKey) {
27 | throw new Error('API key is required');
28 | }
29 | this.apiKey = config.apiKey;
30 | this.baseUrl = config.baseUrl || 'https://api.chonkie.ai';
31 | }
32 |
33 | protected async request<T>(
34 | endpoint: string,
35 | options: {
36 | method?: string;
37 | body?: FormData | Record<string, unknown>;
38 | headers?: Record<string, string>;
39 | } = {}
40 | ): Promise<T> {
41 | const { method = 'POST', body, headers = {} } = options;
42 |
43 | const isFormData = body instanceof FormData;
44 | const requestHeaders: Record<string, string> = {
45 | 'Authorization': `Bearer ${this.apiKey}`,
46 | ...headers,
47 | };
48 |
49 | // Don't set Content-Type for FormData
50 | if (!isFormData && body) {
51 | requestHeaders['Content-Type'] = 'application/json';
52 | }
53 |
54 | const response = await fetch(`${this.baseUrl}${endpoint}`, {
55 | method,
56 | headers: requestHeaders,
57 | body: isFormData ? body : (body ? JSON.stringify(body) : undefined),
58 | });
59 |
60 | if (!response.ok) {
61 | const errorText = await response.text();
62 | let errorMessage = response.statusText || 'Unknown error';
63 |
64 | try {
65 | const errorJson = JSON.parse(errorText) as { message?: string; error?: string; detail?: string };
66 | errorMessage = errorJson.message || errorJson.error || errorJson.detail || errorMessage;
67 | } catch {
68 | if (errorText) {
69 | errorMessage = errorText;
70 | }
71 | }
72 |
73 | const formattedError = formatApiError(response.status, errorMessage, endpoint);
74 | throw new Error(formattedError);
75 | }
76 |
77 | return response.json() as Promise<T>;
78 | }
79 |
80 | async validateAuth(): Promise<boolean> {
81 | try {
82 | const response = await this.request<{ message: string; status: number }>('/v1/auth/validate', {
83 | method: 'GET'
84 | });
85 | return response.status === 200;
86 | } catch (error) {
87 | return false;
88 | }
89 | }
90 |
91 | /**
92 | * Upload a file to the Chonkie API for OCR/document processing.
93 | * This is an internal method used by chunkers to upload files before chunking.
94 | *
95 | * @param filepath - Path to the file to upload
96 | * @returns FileReference object that can be used in subsequent API calls
97 | * @internal
98 | */
99 | protected async uploadFile(filepath: string): Promise<FileReference> {
100 | if (!filepath) {
101 | throw new Error('File path is required');
102 | }
103 |
104 | if (!fs.existsSync(filepath)) {
105 | throw new Error(`File not found: ${filepath}`);
106 | }
107 |
108 | const formData = new FormData();
109 | const fileContent = fs.readFileSync(filepath);
110 | const fileName = path.basename(filepath);
111 |
112 | // Detect MIME type from file extension
113 | const mimeType = mime.lookup(fileName) || 'application/octet-stream';
114 | const blob = new Blob([fileContent], { type: mimeType });
115 | formData.append('file', blob, fileName);
116 |
117 | const response = await this.request<FileUploadResponse>('/v1/files', {
118 | method: 'POST',
119 | body: formData,
120 | });
121 |
122 | // The API might return different field names, check common variations
123 | const documentName = response.document || (response as Record<string, unknown>).filename || (response as Record<string, unknown>).name || (response as Record<string, unknown>).id;
124 |
125 | if (!documentName || typeof documentName !== 'string') {
126 | throw new Error(`Invalid file upload response: missing document identifier. Response: ${JSON.stringify(response)}`);
127 | }
128 |
129 | // Return a FileReference with type 'document' and the document name
130 | return createFileReference('document', documentName);
131 | }
132 | }
133 | -------------------------------------------------------------------------------- /legacy/chonkie/utils/hub.ts: --------------------------------------------------------------------------------
1 | import { downloadFile, RepoType } from '@huggingface/hub';
2 | import * as fs from 'fs';
3 | import * as path from 'path';
4 | import * as jsonschema from 'jsonschema';
5 |
6 | /**
7 | * Hubbie is a Huggingface hub manager for Chonkie.
8 | */
9 | export class Hubbie {
10 | private static readonly SCHEMA_VERSION = "v1";
11 | private readonly getRecipeConfig: {
12 | repo: string;
13 | subfolder: string;
14 | repoType: RepoType;
15 | };
16 | private readonly recipeSchema: Promise<Record<string, any>>;
17 |
18 | constructor() {
19 | // Define the path to the recipes
20 | this.getRecipeConfig = {
21 | repo: "chonkie-ai/recipes",
22 | subfolder: "recipes",
23 | repoType: "dataset" as RepoType,
24 | };
25 |
26 | // Start fetching the current recipe schema from the hub; it is awaited on first validation
27 | this.recipeSchema = this.getRecipeSchema();
28 | }
29 |
30 | /**
31 | * Get the current recipe schema from the hub.
32 | */
33 | private async getRecipeSchema(): Promise<Record<string, any>> {
34 | const schemaBlob = await downloadFile({
35 | repo: {
36 | name: "chonkie-ai/recipes",
37 | type: "dataset" as RepoType,
38 | },
39 | path: `${Hubbie.SCHEMA_VERSION}.schema.json`,
40 | });
41 |
42 | if (!schemaBlob) {
43 | throw new Error("Failed to download schema file");
44 | }
45 |
46 | const schemaContent = await schemaBlob.text();
47 | return JSON.parse(schemaContent);
48 | }
49 |
50 | /**
51 | * Validate a recipe against the current schema. Throws if the recipe is invalid.
52 | */
53 | private async validateRecipe(recipe: Record<string, any>): Promise<boolean> {
54 | const schema = await this.recipeSchema;
55 | const result = jsonschema.validate(recipe, schema);
56 | if (!result.valid) {
57 | throw new Error(`Recipe is invalid. Please check the recipe and try again. Error: ${result.errors.map(e => e.stack).join('; ')}`);
58 | }
59 | return true;
60 | }
61 |
62 | /**
63 | * Get a recipe from the hub.
64 | *
65 | * @param name - The name of the recipe to get
66 | * @param language - The language of the recipe to get
67 | * @param filePath - Optionally, provide the path to the recipe
68 | * @returns The recipe
69 | * @throws Error if the recipe is not found or invalid
70 | */
71 | public async getRecipe(
72 | name: string = 'default',
73 | language: string = 'en',
74 | filePath?: string
75 | ): Promise<Record<string, any>> {
76 | // Check if either (name & language) or path is provided
77 | if ((!name || !language) && !filePath) {
78 | throw new Error("Either (name & language) or path must be provided.");
79 | }
80 |
81 | let recipeContent: string;
82 |
83 | // If path is not provided, download the recipe from the hub
84 | if (!filePath && name && language) {
85 | try {
86 | const recipeBlob = await downloadFile({
87 | repo: {
88 | name: this.getRecipeConfig.repo,
89 | type: this.getRecipeConfig.repoType,
90 | },
91 | path: `${this.getRecipeConfig.subfolder}/${name}_${language}.json`,
92 | });
93 |
94 | if (!recipeBlob) {
95 | throw new Error(`Could not download recipe '${name}_${language}'`);
96 | }
97 |
98 | recipeContent = await recipeBlob.text();
99 | } catch (error) {
100 | throw new Error(`Could not download recipe '${name}_${language}'. Ensure name and language are correct or provide a valid path. Error: ${error}`);
101 | }
102 | } else {
103 | // Read from local file
104 | try {
105 | recipeContent = fs.readFileSync(filePath!, 'utf-8');
106 | } catch (error) {
107 | throw new Error(`Failed to read the file ${filePath}. Please check if the file exists and if the path is correct. Error: ${error}`);
108 | }
109 | }
110 |
111 | // Parse the recipe
112 | let recipe: Record<string, any>;
113 | try {
114 | recipe = JSON.parse(recipeContent);
115 | } catch (error) {
116 | throw new Error(`Failed to parse recipe JSON. Error: ${error}`);
117 | }
118 |
119 | // Validate the recipe (throws if invalid)
120 | await this.validateRecipe(recipe);
121 |
122 | return recipe;
123 | }
124 | }
125 | -------------------------------------------------------------------------------- /packages/cloud/tests/pipeline.test.ts: --------------------------------------------------------------------------------
1 | import { Pipeline, PipelineStep } from '../src';
2 |
3 | const TEST_SLUG = `test-pipeline-${Date.now().toString(36)}`;
4 | const BASE_URL = process.env.CHONKIE_BASE_URL || 'https://api.chonkie.ai';
5 |
6 | describe.skipIf(!process.env.CHONKIE_API_KEY)('Pipeline', () => {
7 | // Clean up after all tests
8 | afterAll(async () => {
9 | try {
10 | const pipeline = await Pipeline.get(TEST_SLUG, { baseUrl: BASE_URL });
11 | await pipeline.delete();
12 | } catch {
13 | // Pipeline may not exist, ignore
14 | }
15 | });
16 |
17 | it('should create a pipeline and run it', async () => {
18 | const pipeline = new Pipeline({
19 | slug: TEST_SLUG,
20 | description: 'Test pipeline for unit tests',
21 | baseUrl: BASE_URL,
22 | });
23 |
24 | pipeline
25 | .chunkWith('recursive', { chunk_size: 256 })
26 | .refineWith('overlap', { context_size: 32 });
27 |
28 | expect(pipeline.slug).toBe(TEST_SLUG);
29 | expect(pipeline.isSaved).toBe(false);
30 |
31 | // Run pipeline (auto-saves)
32 | const chunks = await pipeline.run({
33 | text: 'This is a test document.
It contains multiple sentences for chunking.', 34 | }); 35 | 36 | expect(pipeline.isSaved).toBe(true); 37 | expect(chunks.length).toBeGreaterThan(0); 38 | expect(chunks[0]).toHaveProperty('text'); 39 | expect(chunks[0]).toHaveProperty('tokenCount'); 40 | }); 41 | 42 | it('should fetch an existing pipeline', async () => { 43 | const pipeline = await Pipeline.get(TEST_SLUG, { baseUrl: BASE_URL }); 44 | 45 | expect(pipeline.slug).toBe(TEST_SLUG); 46 | expect(pipeline.isSaved).toBe(true); 47 | expect(pipeline.steps.length).toBeGreaterThan(0); 48 | }); 49 | 50 | it('should list pipelines', async () => { 51 | const pipelines = await Pipeline.list({ baseUrl: BASE_URL }); 52 | 53 | expect(Array.isArray(pipelines)).toBe(true); 54 | 55 | // Should include our test pipeline 56 | const found = pipelines.find(p => p.slug === TEST_SLUG); 57 | expect(found).toBeDefined(); 58 | }); 59 | 60 | it('should update a pipeline', async () => { 61 | const pipeline = await Pipeline.get(TEST_SLUG, { baseUrl: BASE_URL }); 62 | 63 | // Modify steps 64 | pipeline.reset().chunkWith('sentence', { chunk_size: 128 }); 65 | 66 | await pipeline.update({ description: 'Updated description' }); 67 | 68 | expect(pipeline.description).toBe('Updated description'); 69 | }); 70 | 71 | it('should validate pipeline configuration', async () => { 72 | const validSteps: PipelineStep[] = [ 73 | { type: 'chunk', component: 'recursive', chunk_size: 256 }, 74 | ]; 75 | 76 | const result = await Pipeline.validate(validSteps, { baseUrl: BASE_URL }); 77 | 78 | expect(result.valid).toBe(true); 79 | expect(result.errors).toBeNull(); 80 | }); 81 | 82 | it('should reject invalid slug format', () => { 83 | expect(() => { 84 | new Pipeline({ slug: 'Invalid Slug!' }); 85 | }).toThrow(/Invalid slug/); 86 | }); 87 | 88 | it('should describe pipeline steps', () => { 89 | const pipeline = new Pipeline({ slug: 'desc-test' }); 90 | 91 | expect(pipeline.describe()).toBe('Empty pipeline'); 92 | 93 | pipeline 94 | .chunkWith('recursive') 95 | .refineWith('overlap'); 96 | 97 | expect(pipeline.describe()).toBe('chunk(recursive) -> refine(overlap)'); 98 | }); 99 | 100 | it('should export configuration', () => { 101 | const pipeline = new Pipeline({ slug: 'config-test' }); 102 | 103 | pipeline 104 | .chunkWith('token', { chunk_size: 512 }) 105 | .refineWith('embeddings', { embedding_model: 'test-model' }); 106 | 107 | const config = pipeline.toConfig(); 108 | 109 | expect(config).toHaveLength(2); 110 | expect(config[0]).toEqual({ 111 | type: 'chunk', 112 | component: 'token', 113 | chunk_size: 512, 114 | }); 115 | expect(config[1]).toEqual({ 116 | type: 'refine', 117 | component: 'embeddings', 118 | embedding_model: 'test-model', 119 | }); 120 | }); 121 | 122 | it('should delete a pipeline', async () => { 123 | // Create a temporary pipeline 124 | const tempSlug = `temp-${Date.now().toString(36)}`; 125 | const pipeline = new Pipeline({ 126 | slug: tempSlug, 127 | description: 'Temporary pipeline', 128 | baseUrl: BASE_URL, 129 | }); 130 | 131 | pipeline.chunkWith('token', { chunk_size: 256 }); 132 | await pipeline.run({ text: 'Test' }); 133 | 134 | expect(pipeline.isSaved).toBe(true); 135 | 136 | await pipeline.delete(); 137 | 138 | expect(pipeline.isSaved).toBe(false); 139 | 140 | // Verify it's actually deleted 141 | await expect(Pipeline.get(tempSlug, { baseUrl: BASE_URL })).rejects.toThrow(/not found/); 142 | }); 143 | }); 144 | -------------------------------------------------------------------------------- /packages/cloud/src/chunkers/sentence.ts: 
-------------------------------------------------------------------------------- 
1 | /**
2 | * Sentence chunker that splits text into sentence-based chunks
3 | * via api.chonkie.ai
4 | */
5 |
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 |
9 | export interface SentenceChunkerOptions {
10 | /** Tokenizer to use (default: "gpt2") */
11 | tokenizer?: string;
12 | /** Maximum tokens per chunk (default: 512) */
13 | chunkSize?: number;
14 | /** Number of tokens to overlap between chunks (default: 0) */
15 | chunkOverlap?: number;
16 | /** Minimum sentences per chunk (default: 1) */
17 | minSentencesPerChunk?: number;
18 | /** Minimum characters per sentence (default: 12) */
19 | minCharactersPerSentence?: number;
20 | /** Use approximate token counting (default: false) */
21 | approximate?: boolean;
22 | /** Sentence delimiters (default: [".", "!", "?", "\n"]) */
23 | delim?: string | string[];
24 | /** Where to include delimiter (default: "prev") */
25 | includeDelim?: 'prev' | 'next' | null;
26 | /** API key (reads from CHONKIE_API_KEY env var if not provided) */
27 | apiKey?: string;
28 | /** Base URL for API (default: "https://api.chonkie.ai") */
29 | baseUrl?: string;
30 | }
31 |
32 | interface ApiChunkResponse {
33 | text: string;
34 | start_index: number;
35 | end_index: number;
36 | token_count: number;
37 | }
38 |
39 | interface SentenceChunkPayload extends Record<string, unknown> {
40 | text?: string;
41 | file?: { type: string; content: string };
42 | tokenizer_or_token_counter: string;
43 | chunk_size: number;
44 | chunk_overlap: number;
45 | min_sentences_per_chunk: number;
46 | min_characters_per_sentence: number;
47 | approximate: boolean;
48 | delim: string | string[];
49 | include_delim: string;
50 | return_type: string;
51 | }
52 |
53 | export class SentenceChunker extends CloudBaseChunker {
54 | private readonly config: {
55 | tokenizer: string;
56 | chunkSize: number;
57 | chunkOverlap: number;
58 | minSentencesPerChunk: number;
59 | minCharactersPerSentence: number;
60 | approximate: boolean;
61 | delim: string | string[];
62 | includeDelim: 'prev' | 'next' | null;
63 | };
64 |
65 | constructor(options: SentenceChunkerOptions = {}) {
66 | const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
67 | if (!apiKey) {
68 | throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
69 | }
70 |
71 | super({ apiKey, baseUrl: options.baseUrl });
72 |
73 | this.config = {
74 | tokenizer: options.tokenizer || 'gpt2',
75 | chunkSize: options.chunkSize || 512,
76 | chunkOverlap: options.chunkOverlap || 0,
77 | minSentencesPerChunk: options.minSentencesPerChunk || 1,
78 | minCharactersPerSentence: options.minCharactersPerSentence || 12,
79 | approximate: options.approximate ?? false,
80 | delim: options.delim || ['.', '!', '?', '\n'],
81 | includeDelim: options.includeDelim ?? 'prev',
82 | };
83 | }
84 |
85 | async chunk(input: ChunkerInput): Promise<Chunk[]> {
86 | let fileRef = input.file;
87 |
88 | // If filepath is provided, upload it first to get a file reference
89 | if (input.filepath) {
90 | fileRef = await this.uploadFile(input.filepath);
91 | }
92 |
93 | // Build the payload
94 | const payload: SentenceChunkPayload = {
95 | tokenizer_or_token_counter: this.config.tokenizer,
96 | chunk_size: this.config.chunkSize,
97 | chunk_overlap: this.config.chunkOverlap,
98 | min_sentences_per_chunk: this.config.minSentencesPerChunk,
99 | min_characters_per_sentence: this.config.minCharactersPerSentence,
100 | approximate: this.config.approximate,
101 | delim: this.config.delim,
102 | include_delim: this.config.includeDelim || 'prev',
103 | return_type: 'chunks',
104 | };
105 |
106 | // Add either text or file to the payload
107 | if (fileRef) {
108 | payload.file = fileRef;
109 | } else if (input.text) {
110 | payload.text = input.text;
111 | } else {
112 | throw new Error('Either text, filepath, or file must be provided');
113 | }
114 |
115 | const data = await this.request<ApiChunkResponse[]>('/v1/chunk/sentence', {
116 | method: 'POST',
117 | body: payload,
118 | });
119 |
120 | return data.map(chunk => new Chunk({
121 | text: chunk.text,
122 | startIndex: chunk.start_index,
123 | endIndex: chunk.end_index,
124 | tokenCount: chunk.token_count,
125 | }));
126 | }
127 |
128 | async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
129 | return Promise.all(inputs.map(input => this.chunk(input)));
130 | }
131 |
132 | toString(): string {
133 | return `SentenceChunker(tokenizer=${this.config.tokenizer}, chunkSize=${this.config.chunkSize})`;
134 | }
135 | }
136 | -------------------------------------------------------------------------------- /packages/core/src/types.ts: --------------------------------------------------------------------------------
1 | /**
2 | * Base chunk type representing a piece of text with metadata.
3 | */
4 | export class Chunk {
5 | /** The text content of the chunk */
6 | public text: string;
7 | /** The starting index of the chunk in the original text */
8 | public startIndex: number;
9 | /** The ending index of the chunk in the original text */
10 | public endIndex: number;
11 | /** The number of tokens in the chunk */
12 | public tokenCount: number;
13 | /** Optional embedding vector for the chunk */
14 | public embedding?: number[];
15 |
16 | constructor(data: {
17 | text: string;
18 | startIndex: number;
19 | endIndex: number;
20 | tokenCount: number;
21 | embedding?: number[];
22 | }) {
23 | this.text = data.text;
24 | this.startIndex = data.startIndex;
25 | this.endIndex = data.endIndex;
26 | this.tokenCount = data.tokenCount;
27 | this.embedding = data.embedding;
28 |
29 | if (this.startIndex > this.endIndex) {
30 | throw new Error('Start index must be less than or equal to end index');
31 | }
32 | if (this.tokenCount < 0) {
33 | throw new Error('Token count must be non-negative');
34 | }
35 | }
36 |
37 | /**
38 | * Get a string representation of the chunk.
39 | */
40 | toString(): string {
41 | return this.text;
42 | }
43 | }
44 |
45 | /**
46 | * Type for specifying where delimiters should be included in chunks.
47 | */
48 | export type IncludeDelim = 'prev' | 'next' | 'none';
49 |
50 | /**
51 | * Configuration for a single level in the recursive chunking hierarchy.
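 *
 * @example
 * // Added sketch (not in the original source): a level that splits on
 * // sentence-ending punctuation and keeps each delimiter with the preceding
 * // piece, mirroring the defaults used by RecursiveRules below.
 * const sentenceLevel: RecursiveLevelConfig = {
 *   delimiters: ['. ', '! ', '? '],
 *   includeDelim: 'prev',
 * };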
52 | */ 53 | export interface RecursiveLevelConfig { 54 | /** Delimiters to split on at this level */ 55 | delimiters?: string | string[]; 56 | /** Whether to use whitespace as the delimiter */ 57 | whitespace?: boolean; 58 | /** Where to include the delimiter in the resulting chunks */ 59 | includeDelim?: IncludeDelim; 60 | } 61 | 62 | /** 63 | * Represents one level in the recursive chunking hierarchy. 64 | */ 65 | export class RecursiveLevel { 66 | public delimiters?: string | string[]; 67 | public whitespace: boolean; 68 | public includeDelim: IncludeDelim; 69 | 70 | constructor(config: RecursiveLevelConfig = {}) { 71 | this.delimiters = config.delimiters; 72 | this.whitespace = config.whitespace ?? false; 73 | this.includeDelim = config.includeDelim ?? 'prev'; 74 | 75 | this.validate(); 76 | } 77 | 78 | private validate(): void { 79 | if (this.delimiters !== undefined && this.whitespace) { 80 | throw new Error('Cannot use both custom delimiters and whitespace'); 81 | } 82 | if (this.delimiters !== undefined) { 83 | if (typeof this.delimiters === 'string' && this.delimiters.length === 0) { 84 | throw new Error('Delimiter cannot be empty string'); 85 | } 86 | if (Array.isArray(this.delimiters)) { 87 | if (this.delimiters.some(d => typeof d !== 'string' || d.length === 0)) { 88 | throw new Error('Delimiter cannot be empty string'); 89 | } 90 | if (this.delimiters.includes(' ')) { 91 | throw new Error('Use whitespace option instead of space delimiter'); 92 | } 93 | } 94 | } 95 | } 96 | 97 | toString(): string { 98 | return `RecursiveLevel(delimiters=${JSON.stringify(this.delimiters)}, whitespace=${this.whitespace}, includeDelim=${this.includeDelim})`; 99 | } 100 | } 101 | 102 | /** 103 | * Configuration for recursive chunking rules. 104 | */ 105 | export interface RecursiveRulesConfig { 106 | /** Array of levels to use for recursive chunking */ 107 | levels?: RecursiveLevelConfig[]; 108 | } 109 | 110 | /** 111 | * Defines the hierarchy of rules for recursive text chunking. 112 | * 113 | * Default hierarchy: 114 | * 1. Paragraphs (split on \n\n, \r\n, \n, \r) 115 | * 2. Sentences (split on . ! ?) 116 | * 3. Pauses (split on punctuation/symbols) 117 | * 4. Words (split on whitespace) 118 | * 5. Characters (token-level splitting) 119 | */ 120 | export class RecursiveRules { 121 | public levels: RecursiveLevel[]; 122 | 123 | constructor(config: RecursiveRulesConfig = {}) { 124 | if (config.levels === undefined) { 125 | // Default hierarchy 126 | this.levels = [ 127 | new RecursiveLevel({ delimiters: ['\n\n', '\r\n', '\n', '\r'] }), // Paragraphs 128 | new RecursiveLevel({ delimiters: ['. ', '! ', '? 
'] }), // Sentences
129 | new RecursiveLevel({
130 | delimiters: [
131 | '{', '}', '"', '[', ']', '<', '>', '(', ')', ':', ';', ',',
132 | '—', '|', '~', '-', '...', '`', "'"
133 | ]
134 | }), // Pauses
135 | new RecursiveLevel({ whitespace: true }), // Words
136 | new RecursiveLevel() // Characters/tokens
137 | ];
138 | } else {
139 | this.levels = config.levels.map(levelConfig => new RecursiveLevel(levelConfig));
140 | }
141 | }
142 |
143 | get length(): number {
144 | return this.levels.length;
145 | }
146 |
147 | getLevel(index: number): RecursiveLevel | undefined {
148 | return this.levels[index];
149 | }
150 |
151 | toString(): string {
152 | return `RecursiveRules(${this.levels.length} levels)`;
153 | }
154 | }
155 | -------------------------------------------------------------------------------- /packages/cloud/src/chunkers/semantic.ts: --------------------------------------------------------------------------------
1 | /**
2 | * Semantic chunker that uses embeddings to create semantically coherent chunks
3 | * via api.chonkie.ai
4 | */
5 |
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 |
9 | export interface SemanticChunkerOptions {
10 | /** Embedding model to use (default: "minishlab/potion-base-8M") */
11 | embeddingModel?: string;
12 | /** Similarity threshold for chunking (default: 0.5) */
13 | threshold?: number;
14 | /** Maximum tokens per chunk (default: 512) */
15 | chunkSize?: number;
16 | /** Window size for similarity comparison (default: 1) */
17 | similarityWindow?: number;
18 | /** Minimum sentences per chunk (default: 1) */
19 | minSentences?: number;
20 | /** Minimum chunk size (default: 2) */
21 | minChunkSize?: number;
22 | /** Minimum characters per sentence (default: 12) */
23 | minCharactersPerSentence?: number;
24 | /** Step size for threshold adjustment (default: 0.01) */
25 | thresholdStep?: number;
26 | /** Sentence delimiters (default: [".", "!", "?", "\n"]) */
27 | delim?: string | string[];
28 | /** Where to include delimiter (default: "prev") */
29 | includeDelim?: 'prev' | 'next' | null;
30 | /** API key (reads from CHONKIE_API_KEY env var if not provided) */
31 | apiKey?: string;
32 | /** Base URL for API (default: "https://api.chonkie.ai") */
33 | baseUrl?: string;
34 | }
35 |
36 | interface ApiChunkResponse {
37 | text: string;
38 | start_index: number;
39 | end_index: number;
40 | token_count: number;
41 | }
42 |
43 | interface SemanticChunkPayload extends Record<string, unknown> {
44 | text?: string;
45 | file?: { type: string; content: string };
46 | embedding_model: string;
47 | threshold: number;
48 | chunk_size: number;
49 | similarity_window: number;
50 | min_sentences: number;
51 | min_chunk_size: number;
52 | min_characters_per_sentence: number;
53 | threshold_step: number;
54 | delim: string | string[];
55 | include_delim: string;
56 | return_type: string;
57 | }
58 |
59 | export class SemanticChunker extends CloudBaseChunker {
60 | private readonly config: {
61 | embeddingModel: string;
62 | threshold: number;
63 | chunkSize: number;
64 | similarityWindow: number;
65 | minSentences: number;
66 | minChunkSize: number;
67 | minCharactersPerSentence: number;
68 | thresholdStep: number;
69 | delim: string | string[];
70 | includeDelim: 'prev' | 'next' | null;
71 | };
72 |
73 | constructor(options: SemanticChunkerOptions = {}) {
74 | const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
75 | if (!apiKey) {
76 | throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
77 | }
78 |
79 | super({ apiKey, baseUrl: options.baseUrl });
80 |
81 | this.config = {
82 | embeddingModel: options.embeddingModel || 'minishlab/potion-base-8M',
83 | threshold: options.threshold ?? 0.5,
84 | chunkSize: options.chunkSize || 512,
85 | similarityWindow: options.similarityWindow || 1,
86 | minSentences: options.minSentences || 1,
87 | minChunkSize: options.minChunkSize || 2,
88 | minCharactersPerSentence: options.minCharactersPerSentence || 12,
89 | thresholdStep: options.thresholdStep || 0.01,
90 | delim: options.delim || ['.', '!', '?', '\n'],
91 | includeDelim: options.includeDelim ?? 'prev',
92 | };
93 | }
94 |
95 | async chunk(input: ChunkerInput): Promise<Chunk[]> {
96 | let fileRef = input.file;
97 |
98 | // If filepath is provided, upload it first to get a file reference
99 | if (input.filepath) {
100 | fileRef = await this.uploadFile(input.filepath);
101 | }
102 |
103 | // Build the payload
104 | const payload: SemanticChunkPayload = {
105 | embedding_model: this.config.embeddingModel,
106 | threshold: this.config.threshold,
107 | chunk_size: this.config.chunkSize,
108 | similarity_window: this.config.similarityWindow,
109 | min_sentences: this.config.minSentences,
110 | min_chunk_size: this.config.minChunkSize,
111 | min_characters_per_sentence: this.config.minCharactersPerSentence,
112 | threshold_step: this.config.thresholdStep,
113 | delim: this.config.delim,
114 | include_delim: this.config.includeDelim || 'prev',
115 | return_type: 'chunks',
116 | };
117 |
118 | // Add either text or file to the payload
119 | if (fileRef) {
120 | payload.file = fileRef;
121 | } else if (input.text) {
122 | payload.text = input.text;
123 | } else {
124 | throw new Error('Either text, filepath, or file must be provided');
125 | }
126 |
127 | const data = await this.request<ApiChunkResponse[]>('/v1/chunk/semantic', {
128 | method: 'POST',
129 | body: payload,
130 | });
131 |
132 | return data.map(chunk => new Chunk({
133 | text: chunk.text,
134 | startIndex: chunk.start_index,
135 | endIndex: chunk.end_index,
136 | tokenCount: chunk.token_count,
137 | }));
138 | }
139 |
140 | async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
141 | return Promise.all(inputs.map(input => this.chunk(input)));
142 | }
143 |
144 | toString(): string {
145 | return `SemanticChunker(embeddingModel=${this.config.embeddingModel}, threshold=${this.config.threshold})`;
146 | }
147 | }
148 | -------------------------------------------------------------------------------- /legacy/chonkie/chunker/base.ts: --------------------------------------------------------------------------------
1 | /** Base Chunking Class. **/
2 |
3 | import { Tokenizer } from "../tokenizer";
4 | import { Chunk } from "../types/base";
5 |
6 | /**
7 | * Base class for all chunking classes.
8 | *
9 | * This abstract class provides a common interface and shared logic for all chunking implementations.
10 | * It supports chunking a single text or a batch of texts, with optional concurrency and progress reporting.
11 | *
12 | * Subclasses must implement the `chunk` method to define how a single text is chunked.
13 | *
14 | * Implementations produce `Chunk[]` for a single text and `Chunk[][]` for a batch of texts.
15 | *
16 | * @property {Tokenizer} tokenizer - The tokenizer instance used for chunking operations.
17 | * @property {boolean} _useConcurrency - Whether to use concurrent processing for batch chunking (default: true).
18 | *
19 | * @example
20 | * class MyChunker extends BaseChunker {
21 | *   async chunk(text: string): Promise<Chunk[]> {
22 | *     // ... implementation ...
23 | *   }
24 | * }
25 | *
26 | * const chunker = new MyChunker(tokenizer);
27 | * const chunks = await chunker.call("Some text");
28 | * const batchChunks = await chunker.call(["Text 1", "Text 2"], true);
29 | */
30 | export abstract class BaseChunker {
31 | protected tokenizer: Tokenizer;
32 | protected _useConcurrency: boolean = true; // Determines if batch processing uses Promise.all
33 |
34 | constructor(tokenizer: Tokenizer) {
35 | this.tokenizer = tokenizer;
36 | }
37 |
38 | /**
39 | * Returns a string representation of the chunker instance.
40 | *
41 | * @returns {string} The class name and constructor signature.
42 | */
43 | public toString(): string {
44 | return `${this.constructor.name}()`;
45 | }
46 |
47 | /**
48 | * Call the chunker with a single string or an array of strings.
49 | *
50 | * If a single string is provided, returns the result of `chunk(text)`.
51 | * If an array of strings is provided, returns the result of `chunkBatch(texts, showProgress)`.
52 | *
53 | * @param {string | string[]} textOrTexts - The text or array of texts to chunk.
54 | * @param {boolean} [showProgress=false] - Whether to display progress for batch operations (only applies to arrays).
55 | * @returns {Promise<Chunk[] | Chunk[][]>} The chunked result(s).
56 | * @throws {Error} If input is not a string or array of strings.
57 | */
58 | public async call(text: string, showProgress?: boolean): Promise<Chunk[]>;
59 | public async call(texts: string[], showProgress?: boolean): Promise<Chunk[][]>;
60 | public async call(
61 | textOrTexts: string | string[],
62 | showProgress: boolean = false
63 | ): Promise<Chunk[] | Chunk[][]> {
64 | if (typeof textOrTexts === 'string') {
65 | return this.chunk(textOrTexts) as Promise<Chunk[]>;
66 | } else if (Array.isArray(textOrTexts)) {
67 | return this.chunkBatch(textOrTexts, showProgress) as Promise<Chunk[][]>;
68 | } else {
69 | // This case should ideally not be reached due to TypeScript's type checking
70 | // if the public overloads are used correctly.
71 | throw new Error("Input must be a string or an array of strings.");
72 | }
73 | }
74 |
75 | /**
76 | * Process a batch of texts sequentially (one after another).
77 | *
78 | * @protected
79 | * @param {string[]} texts - The texts to chunk.
80 | * @param {boolean} [showProgress=false] - Whether to display progress in the console.
81 | * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
82 | */
83 | protected async _sequential_batch_processing(
84 | texts: string[],
85 | showProgress: boolean = false
86 | ): Promise<Chunk[][]> {
87 | const results: Chunk[][] = [];
88 | const total = texts.length;
89 | for (let i = 0; i < total; i++) {
90 | if (showProgress && total > 1) {
91 | const progress = Math.round(((i + 1) / total) * 100);
92 | process.stdout.write(`Sequential processing: Document ${i + 1}/${total} (${progress}%)\r`);
93 | }
94 | results.push(await this.chunk(texts[i]));
95 | }
96 | if (showProgress && total > 1) {
97 | process.stdout.write("\n"); // Newline after progress
98 | }
99 | return results;
100 | }
101 |
102 | /**
103 | * Process a batch of texts concurrently using Promise.all.
104 | *
105 | * @protected
106 | * @param {string[]} texts - The texts to chunk.
107 | * @param {boolean} [showProgress=false] - Whether to display progress in the console.
108 | * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
109 | */
110 | protected async _concurrent_batch_processing(
111 | texts: string[],
112 | showProgress: boolean = false
113 | ): Promise<Chunk[][]> {
114 | const total = texts.length;
115 | let completedCount = 0;
116 |
117 | const updateProgress = () => {
118 | if (showProgress && total > 1) {
119 | completedCount++;
120 | const progress = Math.round((completedCount / total) * 100);
121 | process.stdout.write(`Concurrent processing: Document ${completedCount}/${total} (${progress}%)\r`);
122 | }
123 | };
124 |
125 | const chunkPromises = texts.map(text =>
126 | this.chunk(text).then(result => {
127 | updateProgress();
128 | return result;
129 | })
130 | );
131 |
132 | const results = await Promise.all(chunkPromises);
133 | if (showProgress && total > 1 && completedCount > 0) { // ensure newline only if progress was shown
134 | process.stdout.write("\n"); // Newline after progress
135 | }
136 | return results;
137 | }
138 |
139 | /**
140 | * Abstract method to chunk a single text. Must be implemented by subclasses.
141 | *
142 | * @param {string} text - The text to chunk.
143 | * @returns {Promise<Chunk[]>} The chunked representation of the input text.
144 | * @abstract
145 | */
146 | public abstract chunk(text: string): Promise<Chunk[]>;
147 |
148 | /**
149 | * Chunk a batch of texts, using either concurrent or sequential processing.
150 | *
151 | * If only one text is provided, processes it directly without batch overhead.
152 | *
153 | * @param {string[]} texts - The texts to chunk.
154 | * @param {boolean} [showProgress=true] - Whether to display progress in the console.
155 | * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
156 | */
157 | public async chunkBatch(
158 | texts: string[],
159 | showProgress: boolean = true
160 | ): Promise<Chunk[][]> {
161 | if (texts.length === 0) {
162 | return [];
163 | }
164 | // If only one text, process it directly without batch overhead, progress not shown for single item.
165 | if (texts.length === 1) {
166 | return [await this.chunk(texts[0])];
167 | }
168 |
169 | // For multiple texts, use selected batch processing strategy
170 | if (this._useConcurrency) {
171 | return this._concurrent_batch_processing(texts, showProgress);
172 | } else {
173 | return this._sequential_batch_processing(texts, showProgress);
174 | }
175 | }
176 | }
177 | -------------------------------------------------------------------------------- /legacy/chonkie/types/sentence.ts: --------------------------------------------------------------------------------
1 | import { Chunk } from './base';
2 |
3 | /**
4 | * Represents the essential data for a sentence within a text.
5 | *
6 | * @property text - The actual sentence string as it appears in the source text.
7 | * @property startIndex - The zero-based index indicating where the sentence starts in the original text.
8 | * @property endIndex - The zero-based index indicating where the sentence ends in the original text (inclusive).
9 | * @property tokenCount - The number of tokens (words or subwords) in the sentence, useful for NLP tasks.
10 | */
11 | export interface SentenceData {
12 | text: string;
13 | startIndex: number;
14 | endIndex: number;
15 | tokenCount: number;
16 | }
17 |
18 | /**
19 | * Class to represent a sentence.
20 | *
21 | * Represents a single sentence within a text, including its text, position, and token count.
22 | *
23 | * @class
24 | * @param {SentenceData} data - The data required to construct a Sentence instance.
25 | * @property {string} text - The text of the sentence.
26 | * @property {number} startIndex - The starting index of the sentence in the original text. 27 | * @property {number} endIndex - The ending index of the sentence in the original text. 28 | * @property {number} tokenCount - The number of tokens in the sentence. 29 | * @property {number[]} [embedding] - The embedding vector for the sentence (array of numbers, or null if not present). 30 | * 31 | * @method toString Returns a string representation of the Sentence. 32 | * @returns {string} 33 | * 34 | * @method toDict Returns the Sentence as a dictionary-like object. 35 | * @returns {SentenceData} 36 | * 37 | * @method static fromDict Creates a Sentence object from a dictionary-like object. 38 | * @param {SentenceData} data - The data to create the Sentence from. 39 | * @returns {Sentence} 40 | */ 41 | export class Sentence { 42 | /** The text of the sentence */ 43 | public text: string; 44 | /** The starting index of the sentence in the original text */ 45 | public startIndex: number; 46 | /** The ending index of the sentence in the original text */ 47 | public endIndex: number; 48 | /** The number of tokens in the sentence */ 49 | public tokenCount: number; 50 | 51 | constructor(data: SentenceData) { 52 | this.text = data.text; 53 | this.startIndex = data.startIndex; 54 | this.endIndex = data.endIndex; 55 | this.tokenCount = data.tokenCount; 56 | } 57 | 58 | /** Return a string representation of the Sentence */ 59 | public toString(): string { 60 | return `Sentence(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount})`; 61 | } 62 | 63 | /** Return the Sentence as a dictionary-like object */ 64 | public toDict(): SentenceData { 65 | return { 66 | text: this.text, 67 | startIndex: this.startIndex, 68 | endIndex: this.endIndex, 69 | tokenCount: this.tokenCount, 70 | }; 71 | } 72 | 73 | /** Create a Sentence object from a dictionary-like object */ 74 | public static fromDict(data: SentenceData): Sentence { 75 | return new Sentence(data); 76 | } 77 | } 78 | 79 | /** 80 | * Represents the essential data for a chunk of sentences within a text. 81 | * 82 | * @property text - The combined text of all sentences in the chunk as it appears in the source text. 83 | * @property startIndex - The zero-based index indicating where the chunk starts in the original text. 84 | * @property endIndex - The zero-based index indicating where the chunk ends in the original text (inclusive). 85 | * @property tokenCount - The total number of tokens (words or subwords) in the chunk, useful for NLP tasks. 86 | * @property sentences - An array of SentenceData objects, each representing an individual sentence within the chunk. 87 | */ 88 | interface SentenceChunkData { 89 | text: string; 90 | startIndex: number; 91 | endIndex: number; 92 | tokenCount: number; 93 | sentences: SentenceData[]; 94 | embedding?: number[]; 95 | } 96 | 97 | /** 98 | * Represents a chunk of one or more sentences within a text. 99 | * 100 | * A SentenceChunk groups together multiple {@link Sentence} objects, providing their combined text, position, and token count within the original text. 101 | * 102 | * @class 103 | * @extends Chunk 104 | * 105 | * @param {Object} data - Data to construct a SentenceChunk instance. 106 | * @param {string} data.text - Combined text of all sentences in the chunk. 107 | * @param {number} data.startIndex - Zero-based index where the chunk starts in the original text. 
108 | * @param {number} data.endIndex - Zero-based index where the chunk ends in the original text (inclusive).
109 | * @param {number} data.tokenCount - Total number of tokens in the chunk.
110 | * @param {Sentence[]} data.sentences - Array of {@link Sentence} objects in the chunk.
111 | *
112 | * @property {string} text - Combined text of all sentences in the chunk.
113 | * @property {number} startIndex - Starting index of the chunk in the original text.
114 | * @property {number} endIndex - Ending index of the chunk in the original text.
115 | * @property {number} tokenCount - Total number of tokens in the chunk.
116 | * @property {Sentence[]} sentences - List of {@link Sentence} objects in the chunk.
117 | *
118 | * @method toString Returns a detailed string representation of the SentenceChunk, including its text, start and end indices, token count, and a list of all contained sentences with their metadata.
119 | * @method toDict Returns the SentenceChunk as a plain object (see {@link SentenceChunkData}).
120 | * @method static fromDict Creates a SentenceChunk from a {@link SentenceChunkData} object.
121 | */
122 | export class SentenceChunk extends Chunk {
123 | /** List of sentences in the chunk */
124 | public sentences: Sentence[];
125 |
126 | constructor(data: {
127 | text: string;
128 | startIndex: number;
129 | endIndex: number;
130 | tokenCount: number;
131 | sentences: Sentence[];
132 | embedding?: number[];
133 | }) {
134 | super(data);
135 | this.sentences = data.sentences;
136 | this.embedding = data.embedding ?? undefined;
137 | }
138 |
139 | /**
140 | * Returns a detailed string representation of the SentenceChunk, including its text, start and end indices, token count, and a list of all contained sentences with their metadata.
141 | *
142 | * This method overrides the base {@link Chunk} toString method to provide a more informative output, which is especially useful for debugging and logging. Each sentence in the chunk is represented using its own toString method, and all sentences are included in the output.
143 | *
144 | * @returns {string} A string describing the SentenceChunk and all its sentences, e.g.,
145 | * SentenceChunk(text=..., startIndex=..., endIndex=..., tokenCount=..., sentences=[Sentence(...), ...])
146 | */
147 | public toString(): string {
148 | const sentencesStr = this.sentences.map(s => s.toString()).join(', ');
149 | return `SentenceChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, sentences=[${sentencesStr}])`;
150 | }
151 |
152 | /**
153 | * Returns the SentenceChunk as a dictionary-like object.
154 | *
155 | * This method extends the base {@link Chunk} toDict method to include the sentences in the chunk.
156 | *
157 | * @returns {SentenceChunkData} A dictionary-like object containing the chunk's text, start and end indices, token count, and an array of sentence data.
158 | */
159 | public toDict(): SentenceChunkData {
160 | const baseDict = super.toDict();
161 | return {
162 | ...baseDict,
163 | sentences: this.sentences.map(sentence => sentence.toDict()),
164 | };
165 | }
166 |
167 | /**
168 | * Creates a SentenceChunk object from a dictionary-like object.
169 | *
170 | * This method extends the base {@link Chunk} fromDict method to include the sentences in the chunk.
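 *
 * @example
 * // Added sketch (not in the original docs): round-trip a chunk through its
 * // serialized form; `chunk` here is any existing SentenceChunk instance.
 * const restored = SentenceChunk.fromDict(chunk.toDict());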
171 | * 172 | * @param {SentenceChunkData} data - A dictionary-like object containing the chunk's text, start and end indices, token count, and an array of sentence data. 173 | * @returns {SentenceChunk} A new SentenceChunk object created from the provided dictionary-like object. 174 | */ 175 | public static fromDict(data: SentenceChunkData): SentenceChunk { 176 | const sentences = data.sentences.map(sentence => Sentence.fromDict(sentence)); 177 | return new SentenceChunk({ 178 | text: data.text, 179 | startIndex: data.startIndex, 180 | endIndex: data.endIndex, 181 | tokenCount: data.tokenCount, 182 | sentences, 183 | embedding: data.embedding ?? undefined, 184 | }); 185 | } 186 | } --------------------------------------------------------------------------------
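A minimal usage sketch for the sentence types above. The values are illustrative, and the import path assumes the legacy package layout shown in this repository:

```typescript
import { Sentence, SentenceChunk } from './legacy/chonkie/types/sentence';

// Two sentences with hand-picked indices and token counts (illustrative only).
const s1 = new Sentence({ text: 'Hippos are large.', startIndex: 0, endIndex: 16, tokenCount: 4 });
const s2 = new Sentence({ text: 'They love water.', startIndex: 18, endIndex: 33, tokenCount: 4 });

// Group the sentences into a single chunk spanning both of them.
const chunk = new SentenceChunk({
  text: 'Hippos are large. They love water.',
  startIndex: 0,
  endIndex: 33,
  tokenCount: 8,
  sentences: [s1, s2],
});

console.log(chunk.toString());                 // SentenceChunk(text=..., sentences=[Sentence(...), Sentence(...)])
console.log(chunk.toDict().sentences.length);  // 2
```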