├── pnpm-workspace.yaml
├── packages
│   ├── cloud
│   │   ├── .tsc-aliasrc.json
│   │   ├── vitest.config.ts
│   │   ├── tsconfig.json
│   │   ├── tests
│   │   │   ├── overlap-refinery.test.ts
│   │   │   ├── embeddings-refinery.test.ts
│   │   │   ├── token.test.ts
│   │   │   ├── recursive.test.ts
│   │   │   ├── sentence.test.ts
│   │   │   ├── neural.test.ts
│   │   │   ├── semantic.test.ts
│   │   │   ├── code.test.ts
│   │   │   ├── fixtures
│   │   │   │   ├── test-code.js
│   │   │   │   └── test-code.ts
│   │   │   └── pipeline.test.ts
│   │   ├── CHANGELOG.md
│   │   ├── examples
│   │   │   ├── sentence.example.ts
│   │   │   ├── neural.example.ts
│   │   │   ├── recursive.example.ts
│   │   │   ├── semantic.example.ts
│   │   │   ├── token.example.ts
│   │   │   ├── embeddings-refinery.example.ts
│   │   │   └── overlap-refinery.example.ts
│   │   ├── package.json
│   │   ├── src
│   │   │   ├── index.ts
│   │   │   ├── refineries
│   │   │   │   ├── embeddings.ts
│   │   │   │   └── overlap.ts
│   │   │   ├── chunkers
│   │   │   │   ├── neural.ts
│   │   │   │   ├── code.ts
│   │   │   │   ├── token.ts
│   │   │   │   ├── late.ts
│   │   │   │   ├── recursive.ts
│   │   │   │   ├── sentence.ts
│   │   │   │   └── semantic.ts
│   │   │   ├── utils.ts
│   │   │   └── base.ts
│   │   └── README.md
│   ├── core
│   │   ├── .tsc-aliasrc.json
│   │   ├── vitest.config.ts
│   │   ├── CHANGELOG.md
│   │   ├── src
│   │   │   ├── index.ts
│   │   │   ├── tokenizer.ts
│   │   │   ├── token.ts
│   │   │   └── types.ts
│   │   ├── tsconfig.json
│   │   ├── package.json
│   │   ├── examples
│   │   │   ├── token.example.ts
│   │   │   ├── with-huggingface.example.ts
│   │   │   └── recursive.example.ts
│   │   └── README.md
│   └── token
│       ├── src
│       │   ├── index.ts
│       │   └── huggingface.ts
│       ├── .tsc-aliasrc.json
│       ├── CHANGELOG.md
│       ├── tsconfig.json
│       ├── package.json
│       └── README.md
├── assets
│   └── chonkie_logo_br_transparent_bg.png
├── legacy
│   └── chonkie
│       ├── types
│       │   ├── wasm.d.ts
│       │   ├── index.ts
│       │   ├── code.ts
│       │   ├── late.ts
│       │   ├── semantic.ts
│       │   ├── base.ts
│       │   └── sentence.ts
│       ├── friends
│       │   ├── index.ts
│       │   ├── base.ts
│       │   ├── utils.ts
│       │   └── chroma.ts
│       ├── chunker
│       │   ├── index.ts
│       │   └── base.ts
│       ├── cloud
│       │   ├── index.ts
│       │   ├── embeddings_refinery.ts
│       │   ├── base.ts
│       │   ├── neural.ts
│       │   ├── token.ts
│       │   ├── overlap_refinery.ts
│       │   ├── code.ts
│       │   ├── late.ts
│       │   ├── slumber.ts
│       │   ├── recursive.ts
│       │   ├── sentence.ts
│       │   ├── semantic.ts
│       │   └── sdpm.ts
│       ├── index.ts
│       └── utils
│           └── hub.ts
├── .changeset
│   ├── config.json
│   └── README.md
├── .github
│   └── workflows
│       ├── auto-assign.yml
│       └── test.yml
├── package.json
├── LICENSE
├── .gitignore
├── CONTRIBUTING.md
└── README.md

/pnpm-workspace.yaml:
--------------------------------------------------------------------------------
1 | packages:
2 |   - 'packages/*'
3 | 
--------------------------------------------------------------------------------
/packages/cloud/.tsc-aliasrc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "resolveFullPaths": true,
3 |   "output": {
4 |     "fileExtension": ".js"
5 |   }
6 | }
7 | 
--------------------------------------------------------------------------------
/packages/core/.tsc-aliasrc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "resolveFullPaths": true,
3 |   "output": {
4 |     "fileExtension": ".js"
5 |   }
6 | }
7 | 
--------------------------------------------------------------------------------
/assets/chonkie_logo_br_transparent_bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonkie-inc/chonkiejs/HEAD/assets/chonkie_logo_br_transparent_bg.png

--------------------------------------------------------------------------------
/packages/token/src/index.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @chonkiejs/token
3 |  * HuggingFace tokenizer support for Chonkie
4 |  */
5 | 
6 | export { HuggingFaceTokenizer } from './huggingface';
7 | 
--------------------------------------------------------------------------------
/legacy/chonkie/types/wasm.d.ts:
--------------------------------------------------------------------------------
1 | declare module '*.wasm' {
2 |   const value: string; // The path to the wasm file or its content depending on the bundler
3 |   export default value;
4 | }
--------------------------------------------------------------------------------
/legacy/chonkie/friends/index.ts:
--------------------------------------------------------------------------------
1 | /** Chonkie's Friends. */
2 | 
3 | export { BaseHandshake } from "./base";
4 | export { ChromaHandshake } from "./chroma";
5 | export { generateRandomCollectionName } from "./utils";
--------------------------------------------------------------------------------
/packages/token/.tsc-aliasrc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "resolveFullPaths": true,
3 |   "output": {
4 |     "fileExtension": ".js"
5 |   },
6 |   "replacers": {
7 |     "~": {
8 |       "enabled": true
9 |     }
10 |   }
11 | }
12 | 
--------------------------------------------------------------------------------
/packages/core/vitest.config.ts:
--------------------------------------------------------------------------------
1 | import { defineConfig } from 'vitest/config';
2 | import tsconfigPaths from 'vite-tsconfig-paths';
3 | 
4 | export default defineConfig({
5 |   plugins: [tsconfigPaths()],
6 |   test: {
7 |     globals: true,
8 |     environment: 'node',
9 |   },
10 | });
11 | 
--------------------------------------------------------------------------------
/packages/token/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # @chonkiejs/token
2 | 
3 | ## 0.0.3
4 | 
5 | ### Patch Changes
6 | 
7 | - Fix: Add proper `.js` extension to the files
8 | 
9 | ## 0.0.2
10 | 
11 | ### Patch Changes
12 | 
13 | - Add Huggingface Tokenizer and TokenChunker support
14 | - Updated dependencies
15 |   - @chonkiejs/core@0.0.3
16 | 
--------------------------------------------------------------------------------
/legacy/chonkie/friends/base.ts:
--------------------------------------------------------------------------------
1 | /** Base class for Chonkie's Handshakes. */
2 | 
3 | import { Chunk } from "../types/base";
4 | 
5 | 
6 | export abstract class BaseHandshake {
7 | 
8 |   public abstract write(chunks: Chunk[]): Promise<void>;
9 |   public abstract query(query: string, nResults: number): Promise<any>;
10 | }
--------------------------------------------------------------------------------
/.changeset/config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "$schema": "https://unpkg.com/@changesets/config@3.1.1/schema.json",
3 |   "changelog": "@changesets/cli/changelog",
4 |   "commit": false,
5 |   "fixed": [],
6 |   "linked": [],
7 |   "access": "public",
8 |   "baseBranch": "main",
9 |   "updateInternalDependencies": "patch",
10 |   "ignore": []
11 | }
12 | 
--------------------------------------------------------------------------------
/packages/cloud/vitest.config.ts:
--------------------------------------------------------------------------------
1 | import { defineConfig } from 'vitest/config';
2 | import tsconfigPaths from 'vite-tsconfig-paths';
3 | 
4 | export default defineConfig({
5 |   plugins: [tsconfigPaths()],
6 |   test: {
7 |     globals: true,
8 |     environment: 'node',
9 |     testTimeout: 60000, // Longer timeout for slow API calls
10 |   },
11 | });
12 | 
--------------------------------------------------------------------------------
/legacy/chonkie/types/index.ts:
--------------------------------------------------------------------------------
1 | export { Chunk } from './base';
2 | export { SentenceData, Sentence, SentenceChunk } from './sentence';
3 | export { TreeSitterNode, CodeChunk } from './code';
4 | export { RecursiveLevel, RecursiveRules, RecursiveChunk } from './recursive';
5 | export { LateChunk } from './late';
6 | export { SemanticSentenceData, SemanticSentence, SemanticChunkData, SemanticChunk } from './semantic';
7 | 
--------------------------------------------------------------------------------
/packages/core/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # @chonkiejs/core
2 | 
3 | ## 0.0.5
4 | 
5 | ### Patch Changes
6 | 
7 | - Fix: Added full path resolution for .js files
8 | 
9 | ## 0.0.4
10 | 
11 | ### Patch Changes
12 | 
13 | - Fix: Add `embedding` to the `Chunk` for `EmbeddingsRefinery`
14 | 
15 | ## 0.0.3
16 | 
17 | ### Patch Changes
18 | 
19 | - Add Huggingface Tokenizer and TokenChunker support
20 | - Updated dependencies
21 |   - @chonkiejs/core@0.0.3
22 | 
--------------------------------------------------------------------------------
/legacy/chonkie/chunker/index.ts:
--------------------------------------------------------------------------------
1 | export { BaseChunker } from './base';
2 | export { CodeChunker, CallableCodeChunker, CodeChunkerOptions } from './code';
3 | export { RecursiveChunker, CallableRecursiveChunker, RecursiveChunkerOptions, RecursiveChunkerRecipeOptions } from './recursive';
4 | export { SentenceChunker, CallableSentenceChunker, SentenceChunkerOptions, SentenceChunkerRecipeOptions } from './sentence';
5 | export { TokenChunker, CallableTokenChunker, TokenChunkerOptions } from './token';
--------------------------------------------------------------------------------
/.changeset/README.md:
--------------------------------------------------------------------------------
1 | # Changesets
2 | 
3 | Hello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that works
4 | with multi-package repos, or single-package repos to help you version and publish your code. You can
5 | find the full documentation for it [in our repository](https://github.com/changesets/changesets)
6 | 
7 | We have a quick list of common questions to get you started engaging with this project in
8 | [our documentation](https://github.com/changesets/changesets/blob/main/docs/common-questions.md)
9 | 
--------------------------------------------------------------------------------
/packages/core/src/index.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @chonkiejs/core
3 |  * Core chunking library for Chonkie - lightweight and efficient text chunking
4 |  */
5 | 
6 | export { RecursiveChunker } from '@/recursive';
7 | export type { RecursiveChunkerOptions } from '@/recursive';
8 | 
9 | export { TokenChunker } from '@/token';
10 | export type { TokenChunkerOptions } from '@/token';
11 | 
12 | export { Tokenizer } from '@/tokenizer';
13 | 
14 | export { Chunk, RecursiveLevel, RecursiveRules } from '@/types';
15 | export type { RecursiveLevelConfig, RecursiveRulesConfig, IncludeDelim } from '@/types';
16 | 
--------------------------------------------------------------------------------
/packages/token/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "compilerOptions": {
3 |     "target": "ES2021",
4 |     "module": "ES2020",
5 |     "lib": ["ES2021"],
6 |     "moduleResolution": "node",
7 |     "outDir": "./dist",
8 |     "rootDir": "./src",
9 |     "strict": true,
10 |     "esModuleInterop": true,
11 |     "skipLibCheck": true,
12 |     "declaration": true,
13 |     "declarationMap": true,
14 |     "sourceMap": true,
15 |     "resolveJsonModule": true,
16 |     "types": ["node"]
17 |   },
18 |   "include": ["src/**/*"],
19 |   "exclude": ["node_modules", "dist", "**/*.test.ts", "**/*.spec.ts"]
20 | }
21 | 
--------------------------------------------------------------------------------
/.github/workflows/auto-assign.yml:
--------------------------------------------------------------------------------
1 | name: Auto Assign Issues
2 | 
3 | on:
4 |   issues:
5 |     types: [opened]
6 | 
7 | jobs:
8 |   assign:
9 |     runs-on: ubuntu-latest
10 |     permissions:
11 |       issues: write
12 |     steps:
13 |       - name: Assign issue to chonknick
14 |         uses: actions/github-script@v7
15 |         with:
16 |           script: |
17 |             github.rest.issues.addAssignees({
18 |               owner: context.repo.owner,
19 |               repo: context.repo.repo,
20 |               issue_number: context.issue.number,
21 |               assignees: ['chonknick']
22 |             });
23 | 
--------------------------------------------------------------------------------
/packages/cloud/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "compilerOptions": {
3 |     "target": "ES2021",
4 |     "module": "ES2020",
5 |     "lib": ["ES2021"],
6 |     "moduleResolution": "node",
7 |     "outDir": "./dist",
8 |     "rootDir": "./src",
9 |     "strict": true,
10 |     "esModuleInterop": true,
11 |     "skipLibCheck": true,
12 |     "declaration": true,
13 |     "declarationMap": true,
14 |     "sourceMap": true,
15 |     "resolveJsonModule": true,
16 |     "types": ["node"],
17 |     "baseUrl": "./src",
18 |     "paths": {
19 |       "@/*": ["./*"]
20 |     }
21 |   },
22 |   "include": ["src/**/*"],
23 |   "exclude": ["node_modules", "dist", "**/*.test.ts", "**/*.spec.ts"]
24 | }
25 | 
--------------------------------------------------------------------------------
/packages/core/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "compilerOptions": {
3 |     "target": "ES2021",
4 |     "module": "ES2020",
5 |     "lib": ["ES2021"],
6 |     "moduleResolution": "node",
"outDir": "./dist", 8 | "rootDir": "./src", 9 | "strict": true, 10 | "esModuleInterop": true, 11 | "skipLibCheck": true, 12 | "declaration": true, 13 | "declarationMap": true, 14 | "sourceMap": true, 15 | "resolveJsonModule": true, 16 | "types": ["node"], 17 | "baseUrl": "./src", 18 | "paths": { 19 | "@/*": ["./*"] 20 | } 21 | }, 22 | "include": ["src/**/*"], 23 | "exclude": ["node_modules", "dist", "**/*.test.ts", "**/*.spec.ts"] 24 | } 25 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Run Tests 2 | 3 | on: 4 | push: 5 | branches: [ '*' ] 6 | pull_request: 7 | branches: [ '*' ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | node-version: [20.x] 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: pnpm/action-setup@v2 21 | with: 22 | version: 10 23 | 24 | - name: Use Node.js ${{ matrix.node-version }} 25 | uses: actions/setup-node@v3 26 | with: 27 | node-version: ${{ matrix.node-version }} 28 | cache: 'pnpm' 29 | 30 | - name: Install dependencies 31 | run: pnpm install --frozen-lockfile 32 | 33 | - name: Run tests 34 | run: pnpm test 35 | env: 36 | CHONKIE_API_KEY: ${{ secrets.CHONKIE_API_KEY }} -------------------------------------------------------------------------------- /legacy/chonkie/cloud/index.ts: -------------------------------------------------------------------------------- 1 | /** Cloud client exports for Chonkie API. */ 2 | 3 | export { CloudClient, CloudClientConfig, ChunkerInput } from './base'; 4 | export { CodeChunker, CodeChunkerConfig } from './code'; 5 | export { LateChunker, LateChunkerConfig } from './late'; 6 | export { NeuralChunker, NeuralChunkerConfig } from './neural'; 7 | export { RecursiveChunker, RecursiveChunkerConfig } from './recursive'; 8 | export { EmbeddingsRefinery, EmbeddingsRefineryConfig } from './embeddings_refinery'; 9 | export { SDPMChunker, SDPMChunkerConfig } from './sdpm'; 10 | export { SemanticChunker, SemanticChunkerConfig } from './semantic'; 11 | export { SentenceChunker, SentenceChunkerConfig } from './sentence'; 12 | export { SlumberChunker, SlumberChunkerConfig } from './slumber'; 13 | export { TokenChunker, TokenChunkerConfig } from './token'; 14 | export { OverlapRefinery, OverlapRefineryConfig } from './overlap_refinery'; 15 | -------------------------------------------------------------------------------- /packages/cloud/tests/overlap-refinery.test.ts: -------------------------------------------------------------------------------- 1 | import { TokenChunker, OverlapRefinery } from '../src'; 2 | 3 | describe.skipIf(!process.env.CHONKIE_API_KEY)('OverlapRefinery', () => { 4 | it('should add overlap to chunks successfully', async () => { 5 | // First create some chunks 6 | const chunker = new TokenChunker({ chunkSize: 30 }); 7 | const chunks = await chunker.chunk({ text: 'This is a test for overlap refinery functionality.' 
7 |     const chunks = await chunker.chunk({ text: 'This is a test for overlap refinery functionality.' });
8 | 
9 |     // Add overlap
10 |     const refinery = new OverlapRefinery({
11 |       contextSize: 0.25,
12 |       method: 'suffix'
13 |     });
14 | 
15 |     const refinedChunks = await refinery.refine(chunks);
16 | 
17 |     expect(refinedChunks.length).toBeGreaterThan(0);
18 |     expect(refinedChunks[0]).toHaveProperty('text');
19 |     expect(refinedChunks[0]).toHaveProperty('tokenCount');
20 |     expect(refinedChunks[0]).toHaveProperty('startIndex');
21 |     expect(refinedChunks[0]).toHaveProperty('endIndex');
22 |   });
23 | });
24 | 
--------------------------------------------------------------------------------
/legacy/chonkie/index.ts:
--------------------------------------------------------------------------------
1 | // Import chunkers directly to avoid loading CodeChunker and web-tree-sitter
2 | export { TokenChunker } from './chunker/token';
3 | export { SentenceChunker } from './chunker/sentence';
4 | export { RecursiveChunker } from './chunker/recursive';
5 | // CodeChunker removed - use: import { CodeChunker } from "chonkie/chunker/code"
6 | export { Tokenizer } from './tokenizer';
7 | export { Visualizer } from './utils/viz';
8 | export { Hubbie } from './utils/hub';
9 | // ChromaHandshake removed - use: import { ChromaHandshake } from "chonkie/friends"
10 | export {
11 |   Chunk,
12 |   SentenceData,
13 |   Sentence,
14 |   SentenceChunk,
15 |   // TreeSitterNode, CodeChunk removed - use: import { TreeSitterNode, CodeChunk } from "chonkie/types"
16 |   RecursiveLevel,
17 |   RecursiveRules,
18 |   RecursiveChunk,
19 |   LateChunk,
20 |   SemanticSentenceData,
21 |   SemanticSentence,
22 |   SemanticChunkData,
23 |   SemanticChunk
24 | } from './types';
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "chonkie-main",
3 |   "version": "0.3.0",
4 |   "private": true,
5 |   "description": "Monorepo for Chonkie - lightweight and efficient text chunking library",
6 |   "repository": {
7 |     "type": "git",
8 |     "url": "git+https://github.com/chonkie-inc/chonkie-ts.git"
9 |   },
10 |   "workspaces": [
11 |     "packages/*"
12 |   ],
13 |   "scripts": {
14 |     "clean": "rimraf packages/*/dist",
15 |     "build": "npm run clean && npm run build --workspace=packages/core && npm run build --workspace=packages/cloud",
16 |     "test": "npm run build && npm run test --workspace=packages/core && npm run test --workspace=packages/cloud",
17 |     "changeset": "changeset",
18 |     "version": "changeset version",
19 |     "release": "npm run build && changeset publish"
20 |   },
21 |   "devDependencies": {
22 |     "@changesets/cli": "^2.29.7",
23 |     "@types/node": "^22.15.21",
24 |     "publint": "^0.3.14",
25 |     "rimraf": "^6.0.1",
26 |     "typescript": "^5.8.3",
27 |     "vitest": "^3.2.4"
28 |   }
29 | }
30 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 Chonkie
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/embeddings_refinery.ts:
--------------------------------------------------------------------------------
1 | /** Refinery clients for Chonkie API. */
2 | 
3 | import { CloudClient } from "./base";
4 | import { Chunk } from "../types/base";
5 | 
6 | export interface EmbeddingsRefineryConfig {
7 |   embeddingModel: string;
8 | }
9 | 
10 | export class EmbeddingsRefinery extends CloudClient {
11 |   private readonly config: Required<EmbeddingsRefineryConfig>;
12 | 
13 |   constructor(apiKey: string, config: EmbeddingsRefineryConfig) {
14 |     super({ apiKey });
15 |     if (!config.embeddingModel) {
16 |       throw new Error("Embedding model is required for embeddings refinement");
17 |     }
18 |     this.config = {
19 |       embeddingModel: config.embeddingModel,
20 |     };
21 |   }
22 | 
23 |   async refine(chunks: Chunk[]): Promise<Chunk[]> {
24 |     const response = await this.request<any[]>("/v1/refine/embeddings", {
25 |       body: {
26 |         chunks: chunks.map(chunk => chunk.toDict()),
27 |         embedding_model: this.config.embeddingModel,
28 |       },
29 |       headers: {
30 |         "Content-Type": "application/json",
31 |       },
32 |     });
33 | 
34 |     return response.map(chunk => Chunk.fromDict(chunk));
35 |   }
36 | }
--------------------------------------------------------------------------------
/packages/cloud/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # @chonkiejs/cloud
2 | 
3 | ## 0.1.0
4 | 
5 | ### Minor Changes
6 | 
7 | - Add Pipeline client for building and executing pipelines via api.chonkie.ai
8 | 
9 |   - New `Pipeline` class with fluent API for building pipelines
10 |   - Support for `chunkWith()`, `refineWith()`, and `processWith()` builder methods
11 |   - Static methods: `Pipeline.get()`, `Pipeline.list()`, `Pipeline.validate()`
12 |   - Instance methods: `run()`, `update()`, `delete()`, `reset()`
13 |   - Auto-save on first `run()` call
14 |   - File upload support via `filepath` option
15 |   - Full TypeScript types: `PipelineOptions`, `PipelineStep`, `PipelineValidationResult`
16 | 
17 | ## 0.0.6
18 | 
19 | ### Patch Changes
20 | 
21 | - Fix: Add proper `.js` extension to the files
22 | 
23 | ## 0.0.5
24 | 
25 | ### Patch Changes
26 | 
27 | - Fix: Add `embedding` to the `Chunk` for `EmbeddingsRefinery`
28 | - Updated dependencies
29 |   - @chonkiejs/core@0.0.4
30 | 
31 | ## 0.0.4
32 | 
33 | ### Patch Changes
34 | 
35 | - Add OverlapRefinery and EmbeddingsRefinery
36 | 
37 | ## 0.0.3
38 | 
39 | ### Patch Changes
40 | 
41 | - Updated dependencies
42 |   - @chonkiejs/core@0.0.3
43 | 
44 | ## 0.0.2
45 | 
46 | ### Patch Changes
47 | 
48 | - Fix: tsx alias not present error
49 | 
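A minimal usage sketch of the Pipeline fluent API described in the 0.1.0 entry above. The method names come straight from that changelog entry, but the step names and option shapes passed to `chunkWith()`/`refineWith()` and `run()` are illustrative assumptions, not confirmed against the shipped `PipelineOptions`/`PipelineStep` types:

```ts
import { Pipeline } from '@chonkiejs/cloud';

// Build a pipeline with the fluent builder methods listed in the changelog.
// The step identifiers and option objects below are assumed for illustration.
const pipeline = new Pipeline({ name: 'docs-pipeline' })
  .chunkWith('recursive', { chunkSize: 512 })
  .refineWith('embeddings', { embeddingModel: 'sentence-transformers/all-MiniLM-L6-v2' });

// Per the changelog, the pipeline auto-saves on the first run() call.
const chunks = await pipeline.run({ text: 'Some document text...' });

// Also available per the changelog: Pipeline.get(), Pipeline.list(),
// Pipeline.validate(), and instance update()/delete()/reset().
```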
--------------------------------------------------------------------------------
/packages/cloud/examples/sentence.example.ts:
--------------------------------------------------------------------------------
1 | import { SentenceChunker } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing SentenceChunker with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     const chunker = new SentenceChunker({
8 |       chunkSize: 50,
9 |       minSentencesPerChunk: 2
10 |     });
11 | 
12 |     console.log('✅ SentenceChunker created');
13 |     console.log(`Config: ${chunker.toString()}\n`);
14 | 
15 |     const text = 'This is the first sentence. Here is the second one. And a third sentence for testing. Finally, a fourth sentence to complete the example.';
16 | 
17 |     console.log(`📝 Input (${text.length} chars):`);
18 |     console.log(`"${text}"\n`);
19 | 
20 |     const chunks = await chunker.chunk({ text });
21 | 
22 |     console.log(`✅ Received ${chunks.length} chunks:\n`);
23 | 
24 |     chunks.forEach((chunk, index) => {
25 |       console.log(`Chunk ${index + 1}:`);
26 |       console.log(`  Tokens: ${chunk.tokenCount}`);
27 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
28 |       console.log(`  Text: "${chunk.text}"`);
29 |       console.log();
30 |     });
31 | 
32 |   } catch (error) {
33 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
34 |     process.exit(1);
35 |   }
36 | }
37 | 
38 | main();
39 | 
--------------------------------------------------------------------------------
/packages/core/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "@chonkiejs/core",
3 |   "version": "0.0.5",
4 |   "description": "Core chunking library for Chonkie - lightweight and efficient text chunking",
5 |   "license": "MIT",
6 |   "homepage": "https://docs.chonkie.ai",
7 |   "repository": {
8 |     "type": "git",
9 |     "url": "git+https://github.com/chonkie-inc/chonkie-ts.git",
10 |     "directory": "packages/core"
11 |   },
12 |   "author": "Bhavnick Minhas",
13 |   "type": "module",
14 |   "main": "./dist/index.js",
15 |   "types": "./dist/index.d.ts",
16 |   "exports": {
17 |     ".": {
18 |       "types": "./dist/index.d.ts",
19 |       "import": "./dist/index.js",
20 |       "default": "./dist/index.js"
21 |     }
22 |   },
23 |   "scripts": {
24 |     "clean": "rimraf dist",
25 |     "build": "npm run clean && tsc && tsc-alias -p tsconfig.json --resolve-full-paths",
26 |     "test": "vitest run"
27 |   },
28 |   "files": [
29 |     "dist"
30 |   ],
31 |   "keywords": [
32 |     "chonkie",
33 |     "chunking",
34 |     "text-processing",
35 |     "tokenization"
36 |   ],
37 |   "devDependencies": {
38 |     "rimraf": "^6.0.1",
39 |     "ts-node": "^10.9.2",
40 |     "tsc-alias": "^1.8.16",
41 |     "tsx": "^4.20.6",
42 |     "typescript": "^5.9.3",
43 |     "vite-tsconfig-paths": "^5.1.4",
44 |     "vitest": "^3.2.4"
45 |   }
46 | }
47 | 
--------------------------------------------------------------------------------
/packages/cloud/examples/neural.example.ts:
--------------------------------------------------------------------------------
1 | import { NeuralChunker } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing NeuralChunker with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     const chunker = new NeuralChunker();
8 | 
9 |     console.log('✅ NeuralChunker created');
10 |     console.log(`Config: ${chunker.toString()}\n`);
11 | 
12 |     const text = 'Neural networks are used for pattern recognition. They learn from data to make predictions. Deep learning has revolutionized AI applications. Modern architectures like transformers are very powerful.';
13 | 
14 |     console.log(`📝 Input (${text.length} chars):`);
15 |     console.log(`"${text}"\n`);
16 | 
17 |     console.log('🔄 Processing with neural model...\n');
18 | 
19 |     const chunks = await chunker.chunk({ text });
20 | 
21 |     console.log(`✅ Received ${chunks.length} chunks:\n`);
22 | 
23 |     chunks.forEach((chunk, index) => {
24 |       console.log(`Chunk ${index + 1}:`);
25 |       console.log(`  Tokens: ${chunk.tokenCount}`);
26 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
27 |       console.log(`  Text: "${chunk.text}"`);
28 |       console.log();
29 |     });
30 | 
31 |   } catch (error) {
32 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
33 |     process.exit(1);
34 |   }
35 | }
36 | 
37 | main();
38 | 
--------------------------------------------------------------------------------
/packages/cloud/examples/recursive.example.ts:
--------------------------------------------------------------------------------
1 | import { RecursiveChunker } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing RecursiveChunker with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     const chunker = new RecursiveChunker({
8 |       chunkSize: 60,
9 |       recipe: 'default',
10 |       lang: 'en'
11 |     });
12 | 
13 |     console.log('✅ RecursiveChunker created');
14 |     console.log(`Config: ${chunker.toString()}\n`);
15 | 
16 |     const text = `First paragraph with some content here.
17 | 
18 | Second paragraph with more details. It contains multiple sentences.
19 | 
20 | Third paragraph to test the recursive splitting.`;
21 | 
22 |     console.log(`📝 Input (${text.length} chars):`);
23 |     console.log(`"${text}"\n`);
24 | 
25 |     const chunks = await chunker.chunk({ text });
26 | 
27 |     console.log(`✅ Received ${chunks.length} chunks:\n`);
28 | 
29 |     chunks.forEach((chunk, index) => {
30 |       console.log(`Chunk ${index + 1}:`);
31 |       console.log(`  Tokens: ${chunk.tokenCount}`);
32 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
33 |       console.log(`  Text: "${chunk.text}"`);
34 |       console.log();
35 |     });
36 | 
37 |   } catch (error) {
38 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
39 |     process.exit(1);
40 |   }
41 | }
42 | 
43 | main();
44 | 
--------------------------------------------------------------------------------
/packages/cloud/tests/embeddings-refinery.test.ts:
--------------------------------------------------------------------------------
1 | import { TokenChunker, EmbeddingsRefinery } from '../src';
2 | 
3 | describe.skipIf(!process.env.CHONKIE_API_KEY)('EmbeddingsRefinery', () => {
4 |   it('should add embeddings to chunks successfully', async () => {
5 |     // First create some chunks
6 |     const chunker = new TokenChunker({ chunkSize: 30 });
7 |     const chunks = await chunker.chunk({ text: 'This is a test for embeddings refinery.' });
8 | 
9 |     // Verify chunks don't have embeddings initially
10 |     expect(chunks[0].embedding).toBeUndefined();
11 | 
12 |     // Add embeddings
13 |     const refinery = new EmbeddingsRefinery({
14 |       embeddingModel: 'sentence-transformers/all-MiniLM-L6-v2'
15 |     });
16 | 
17 |     const refinedChunks = await refinery.refine(chunks);
18 | 
19 |     expect(refinedChunks.length).toBeGreaterThan(0);
20 |     expect(refinedChunks[0]).toHaveProperty('text');
21 |     expect(refinedChunks[0]).toHaveProperty('tokenCount');
22 |     expect(refinedChunks[0]).toHaveProperty('startIndex');
23 |     expect(refinedChunks[0]).toHaveProperty('endIndex');
24 | 
25 |     // Verify embeddings are now present
26 |     expect(refinedChunks[0]).toHaveProperty('embedding');
27 |     expect(refinedChunks[0].embedding).toBeDefined();
28 |     expect(Array.isArray(refinedChunks[0].embedding)).toBe(true);
29 |     expect(refinedChunks[0].embedding!.length).toBeGreaterThan(0);
30 |   });
31 | });
32 | 
--------------------------------------------------------------------------------
/packages/cloud/examples/semantic.example.ts:
--------------------------------------------------------------------------------
1 | import { SemanticChunker } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing SemanticChunker with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     const chunker = new SemanticChunker({
8 |       chunkSize: 60,
9 |       threshold: 0.5
10 |     });
11 | 
12 |     console.log('✅ SemanticChunker created');
13 |     console.log(`Config: ${chunker.toString()}\n`);
14 | 
15 |     const text = 'Artificial intelligence is transforming technology. Machine learning models are becoming more powerful. Meanwhile, climate change poses significant challenges. Environmental protection is crucial for our future.';
16 | 
17 |     console.log(`📝 Input (${text.length} chars):`);
18 |     console.log(`"${text}"\n`);
19 | 
20 |     console.log('🔄 Analyzing semantic similarity...\n');
21 | 
22 |     const chunks = await chunker.chunk({ text });
23 | 
24 |     console.log(`✅ Received ${chunks.length} chunks:\n`);
25 | 
26 |     chunks.forEach((chunk, index) => {
27 |       console.log(`Chunk ${index + 1}:`);
28 |       console.log(`  Tokens: ${chunk.tokenCount}`);
29 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
30 |       console.log(`  Text: "${chunk.text}"`);
31 |       console.log();
32 |     });
33 | 
34 |   } catch (error) {
35 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
36 |     process.exit(1);
37 |   }
38 | }
39 | 
40 | main();
41 | 
--------------------------------------------------------------------------------
/packages/cloud/tests/token.test.ts:
--------------------------------------------------------------------------------
1 | import { TokenChunker } from '../src';
2 | import * as path from 'path';
3 | 
4 | describe.skipIf(!process.env.CHONKIE_API_KEY)('TokenChunker', () => {
5 |   it('should chunk text successfully', async () => {
6 |     const chunker = new TokenChunker({ chunkSize: 30 });
7 |     const text = 'This is a test. It should be chunked properly.';
8 | 
9 |     const chunks = await chunker.chunk({ text });
10 | 
11 |     expect(chunks.length).toBeGreaterThan(0);
12 |     expect(chunks[0]).toHaveProperty('text');
13 |     expect(chunks[0]).toHaveProperty('tokenCount');
14 |     expect(chunks[0]).toHaveProperty('startIndex');
15 |     expect(chunks[0]).toHaveProperty('endIndex');
16 |   });
17 | 
18 |   it('should chunk file successfully with file upload', async () => {
19 |     const chunker = new TokenChunker({ chunkSize: 150, chunkOverlap: 20 });
20 |     const testFilePath = path.join(__dirname, 'fixtures', 'test-document.md');
21 | 
22 |     const chunks = await chunker.chunk({ filepath: testFilePath });
23 | 
24 |     expect(chunks.length).toBeGreaterThan(0);
25 |     expect(chunks[0]).toHaveProperty('text');
26 |     expect(chunks[0]).toHaveProperty('tokenCount');
27 |     expect(chunks[0]).toHaveProperty('startIndex');
28 |     expect(chunks[0]).toHaveProperty('endIndex');
29 | 
30 |     // Verify chunks respect chunk size
31 |     chunks.forEach(chunk => {
32 |       expect(chunk.tokenCount).toBeLessThanOrEqual(150);
33 |     });
34 |   });
35 | });
36 | 
--------------------------------------------------------------------------------
/packages/cloud/tests/recursive.test.ts:
--------------------------------------------------------------------------------
1 | import { RecursiveChunker } from '../src';
2 | import * as path from 'path';
3 | 
4 | describe.skipIf(!process.env.CHONKIE_API_KEY)('RecursiveChunker', () => {
5 |   it('should chunk text successfully', async () => {
6 |     const chunker = new RecursiveChunker({ chunkSize: 50 });
7 |     const text = 'Paragraph one.\n\nParagraph two with more text.';
8 | 
9 |     const chunks = await chunker.chunk({ text });
10 | 
11 |     expect(chunks.length).toBeGreaterThan(0);
12 |     expect(chunks[0]).toHaveProperty('text');
13 |     expect(chunks[0]).toHaveProperty('tokenCount');
14 |     expect(chunks[0]).toHaveProperty('startIndex');
15 |     expect(chunks[0]).toHaveProperty('endIndex');
16 |   });
17 | 
18 |   it('should chunk file successfully with file upload', async () => {
19 |     const chunker = new RecursiveChunker({ chunkSize: 200, minCharactersPerChunk: 50 });
20 |     const testFilePath = path.join(__dirname, 'fixtures', 'test-document.md');
21 | 
22 |     const chunks = await chunker.chunk({ filepath: testFilePath });
23 | 
24 |     expect(chunks.length).toBeGreaterThan(0);
25 |     expect(chunks[0]).toHaveProperty('text');
26 |     expect(chunks[0]).toHaveProperty('tokenCount');
27 |     expect(chunks[0]).toHaveProperty('startIndex');
28 |     expect(chunks[0]).toHaveProperty('endIndex');
29 | 
30 |     // Verify chunks can reconstruct the file
31 |     const reconstructed = chunks.map(c => c.text).join('');
32 |     expect(reconstructed.length).toBeGreaterThan(0);
33 |   });
34 | });
35 | 
--------------------------------------------------------------------------------
/packages/cloud/tests/sentence.test.ts:
--------------------------------------------------------------------------------
1 | import { SentenceChunker } from '../src';
2 | import * as path from 'path';
3 | 
4 | describe.skipIf(!process.env.CHONKIE_API_KEY)('SentenceChunker', () => {
5 |   it('should chunk text successfully', async () => {
6 |     const chunker = new SentenceChunker({ chunkSize: 50 });
7 |     const text = 'First sentence here. Second sentence. Third one too.';
8 | 
9 |     const chunks = await chunker.chunk({ text });
10 | 
11 |     expect(chunks.length).toBeGreaterThan(0);
12 |     expect(chunks[0]).toHaveProperty('text');
13 |     expect(chunks[0]).toHaveProperty('tokenCount');
14 |     expect(chunks[0]).toHaveProperty('startIndex');
15 |     expect(chunks[0]).toHaveProperty('endIndex');
16 |   });
17 | 
18 |   it('should chunk file successfully with file upload', async () => {
19 |     const chunker = new SentenceChunker({ chunkSize: 150, minSentencesPerChunk: 2 });
20 |     const testFilePath = path.join(__dirname, 'fixtures', 'test-document.md');
21 | 
22 |     const chunks = await chunker.chunk({ filepath: testFilePath });
23 | 
24 |     expect(chunks.length).toBeGreaterThan(0);
25 |     expect(chunks[0]).toHaveProperty('text');
26 |     expect(chunks[0]).toHaveProperty('tokenCount');
27 |     expect(chunks[0]).toHaveProperty('startIndex');
28 |     expect(chunks[0]).toHaveProperty('endIndex');
29 | 
30 |     // Verify chunks can reconstruct the file
31 |     const reconstructed = chunks.map(c => c.text).join('');
32 |     expect(reconstructed.length).toBeGreaterThan(0);
33 |   });
34 | });
35 | 
--------------------------------------------------------------------------------
/packages/token/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "@chonkiejs/token",
3 |   "version": "0.0.3",
4 |   "description": "HuggingFace tokenizer support for Chonkie - extends @chonkiejs/core with real tokenization",
5 |   "license": "MIT",
6 |   "homepage": "https://docs.chonkie.ai",
7 |   "repository": {
8 |     "type": "git",
9 |     "url": "git+https://github.com/chonkie-inc/chonkie-ts.git",
10 |     "directory": "packages/token"
11 |   },
12 |   "author": "Bhavnick Minhas",
13 |   "type": "module",
14 |   "main": "./dist/index.js",
15 |   "types": "./dist/index.d.ts",
16 |   "exports": {
17 |     ".": {
18 |       "types": "./dist/index.d.ts",
19 |       "import": "./dist/index.js",
20 |       "default": "./dist/index.js"
21 |     }
22 |   },
23 |   "scripts": {
24 |     "clean": "rimraf dist",
25 |     "build": "npm run clean && tsc && tsc-alias -p tsconfig.json --resolve-full-paths",
26 |     "test": "echo 'No tests for @chonkiejs/token yet'"
27 |   },
28 |   "files": [
29 |     "dist"
30 |   ],
31 |   "keywords": [
32 |     "chonkie",
33 |     "tokenizer",
34 |     "huggingface",
35 |     "transformers",
36 |     "gpt2",
37 |     "bert"
38 |   ],
39 |   "peerDependencies": {
40 |     "@chonkiejs/core": ">=0.0.5"
41 |   },
42 |   "dependencies": {
43 |     "@huggingface/transformers": "^3.5.1"
44 |   },
45 |   "devDependencies": {
46 |     "@chonkiejs/core": "^0.0.5",
47 |     "rimraf": "^6.0.1",
48 |     "tsc-alias": "^1.8.16",
49 |     "tsx": "^4.20.6",
50 |     "typescript": "^5.9.3",
51 |     "vitest": "^3.2.4"
52 |   }
53 | }
54 | 
--------------------------------------------------------------------------------
/packages/cloud/tests/neural.test.ts:
--------------------------------------------------------------------------------
1 | import { NeuralChunker } from '../src';
2 | import * as path from 'path';
3 | 
4 | describe.skipIf(!process.env.CHONKIE_API_KEY)('NeuralChunker', () => {
5 |   it('should chunk text successfully', async () => {
6 |     const chunker = new NeuralChunker();
7 |     const text = 'Neural networks learn patterns. Deep learning is powerful. Transformers changed NLP. Modern AI is impressive.';
8 | 
9 |     const chunks = await chunker.chunk({ text });
10 | 
11 |     expect(chunks.length).toBeGreaterThan(0);
12 |     expect(chunks[0]).toHaveProperty('text');
13 |     expect(chunks[0]).toHaveProperty('tokenCount');
14 |     expect(chunks[0]).toHaveProperty('startIndex');
15 |     expect(chunks[0]).toHaveProperty('endIndex');
16 |   });
17 | 
18 |   it('should chunk file successfully with file upload', async () => {
19 |     const chunker = new NeuralChunker({ minCharactersPerChunk: 50 });
20 |     const testFilePath = path.join(__dirname, 'fixtures', 'test-document.md');
21 | 
22 |     const chunks = await chunker.chunk({ filepath: testFilePath });
23 | 
24 |     expect(chunks.length).toBeGreaterThan(0);
25 |     expect(chunks[0]).toHaveProperty('text');
26 |     expect(chunks[0]).toHaveProperty('tokenCount');
27 |     expect(chunks[0]).toHaveProperty('startIndex');
28 |     expect(chunks[0]).toHaveProperty('endIndex');
29 | 
30 |     // Verify chunks can reconstruct the file
31 |     const reconstructed = chunks.map(c => c.text).join('');
32 |     expect(reconstructed.length).toBeGreaterThan(0);
33 |   });
34 | });
35 | 
--------------------------------------------------------------------------------
/packages/cloud/tests/semantic.test.ts:
--------------------------------------------------------------------------------
1 | import { SemanticChunker } from '../src';
2 | import * as path from 'path';
3 | 
4 | describe.skipIf(!process.env.CHONKIE_API_KEY)('SemanticChunker', () => {
5 |   it('should chunk text successfully', async () => {
6 |     const chunker = new SemanticChunker({ chunkSize: 60 });
7 |     const text = 'AI is advancing rapidly. Technology continues to evolve. Climate change needs attention. Environmental issues are critical.';
8 | 
9 |     const chunks = await chunker.chunk({ text });
10 | 
11 |     expect(chunks.length).toBeGreaterThan(0);
12 |     expect(chunks[0]).toHaveProperty('text');
13 |     expect(chunks[0]).toHaveProperty('tokenCount');
14 |     expect(chunks[0]).toHaveProperty('startIndex');
15 |     expect(chunks[0]).toHaveProperty('endIndex');
16 |   });
17 | 
18 |   it('should chunk file successfully with file upload', async () => {
19 |     const chunker = new SemanticChunker({ chunkSize: 200, threshold: 0.5 });
20 |     const testFilePath = path.join(__dirname, 'fixtures', 'test-document.md');
21 | 
22 |     const chunks = await chunker.chunk({ filepath: testFilePath });
23 | 
24 |     expect(chunks.length).toBeGreaterThan(0);
25 |     expect(chunks[0]).toHaveProperty('text');
26 |     expect(chunks[0]).toHaveProperty('tokenCount');
27 |     expect(chunks[0]).toHaveProperty('startIndex');
28 |     expect(chunks[0]).toHaveProperty('endIndex');
29 | 
30 |     // Verify chunks can reconstruct the file
31 |     const reconstructed = chunks.map(c => c.text).join('');
32 |     expect(reconstructed.length).toBeGreaterThan(0);
33 |   });
34 | });
35 | 
--------------------------------------------------------------------------------
/packages/cloud/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "@chonkiejs/cloud",
3 |   "version": "0.1.0",
4 |   "description": "Cloud-based chunkers for Chonkie - semantic, neural, and AI-powered text chunking via api.chonkie.ai",
5 |   "license": "MIT",
6 |   "homepage": "https://docs.chonkie.ai",
7 |   "repository": {
8 |     "type": "git",
9 |     "url": "git+https://github.com/chonkie-inc/chonkie-ts.git",
10 |     "directory": "packages/cloud"
11 |   },
12 |   "author": "Bhavnick Minhas",
13 |   "type": "module",
14 |   "main": "./dist/index.js",
15 |   "types": "./dist/index.d.ts",
16 |   "exports": {
17 |     ".": {
"./dist/index.d.ts", 19 | "import": "./dist/index.js", 20 | "default": "./dist/index.js" 21 | } 22 | }, 23 | "scripts": { 24 | "clean": "rimraf dist", 25 | "build": "npm run clean && tsc && tsc-alias -p tsconfig.json --resolve-full-paths", 26 | "test": "vitest run" 27 | }, 28 | "files": [ 29 | "dist" 30 | ], 31 | "keywords": [ 32 | "chonkie", 33 | "chunking", 34 | "cloud", 35 | "semantic", 36 | "neural", 37 | "ai", 38 | "embeddings", 39 | "api" 40 | ], 41 | "peerDependencies": { 42 | "@chonkiejs/core": ">=0.0.5" 43 | }, 44 | "devDependencies": { 45 | "@chonkiejs/core": "^0.0.5", 46 | "@types/mime-types": "^3.0.1", 47 | "@types/node": "^22.15.21", 48 | "rimraf": "^6.0.1", 49 | "tsc-alias": "^1.8.16", 50 | "tsx": "^4.20.6", 51 | "typescript": "^5.9.3", 52 | "vite-tsconfig-paths": "^5.1.4", 53 | "vitest": "^3.2.4" 54 | }, 55 | "dependencies": { 56 | "mime-types": "^3.0.1" 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /packages/cloud/tests/code.test.ts: -------------------------------------------------------------------------------- 1 | import { CodeChunker } from '../src'; 2 | import * as path from 'path'; 3 | 4 | describe.skipIf(!process.env.CHONKIE_API_KEY)('CodeChunker', () => { 5 | it('should chunk TypeScript code successfully', async () => { 6 | const chunker = new CodeChunker({ language: 'typescript', chunkSize: 100 }); 7 | const code = ` 8 | function hello() { 9 | console.log('Hello world'); 10 | } 11 | 12 | class Example { 13 | constructor() { 14 | this.value = 42; 15 | } 16 | } 17 | `.trim(); 18 | 19 | const chunks = await chunker.chunk({ text: code }); 20 | 21 | expect(chunks.length).toBeGreaterThan(0); 22 | expect(chunks[0]).toHaveProperty('text'); 23 | expect(chunks[0]).toHaveProperty('tokenCount'); 24 | expect(chunks[0]).toHaveProperty('startIndex'); 25 | expect(chunks[0]).toHaveProperty('endIndex'); 26 | }); 27 | 28 | it('should chunk TypeScript file successfully with file upload', async () => { 29 | const chunker = new CodeChunker({ language: 'typescript', chunkSize: 200 }); 30 | const testFilePath = path.join(__dirname, 'fixtures', 'test-code.js'); 31 | 32 | const chunks = await chunker.chunk({ filepath: testFilePath }); 33 | 34 | expect(chunks.length).toBeGreaterThan(0); 35 | expect(chunks[0]).toHaveProperty('text'); 36 | expect(chunks[0]).toHaveProperty('tokenCount'); 37 | expect(chunks[0]).toHaveProperty('startIndex'); 38 | expect(chunks[0]).toHaveProperty('endIndex'); 39 | 40 | // Verify chunks can reconstruct the file 41 | const reconstructed = chunks.map(c => c.text).join(''); 42 | expect(reconstructed.length).toBeGreaterThan(0); 43 | }); 44 | }); 45 | -------------------------------------------------------------------------------- /legacy/chonkie/types/code.ts: -------------------------------------------------------------------------------- 1 | import { Chunk } from './base'; 2 | 3 | /** Interface for tree-sitter Node */ 4 | export interface TreeSitterNode { 5 | // This will be defined by tree-sitter when imported 6 | [key: string]: any; 7 | } 8 | 9 | /** Interface for CodeChunk data */ 10 | interface CodeChunkData { 11 | text: string; 12 | startIndex: number; 13 | endIndex: number; 14 | tokenCount: number; 15 | lang?: string; 16 | nodes?: TreeSitterNode[]; 17 | } 18 | 19 | /** Class to represent code chunks with metadata */ 20 | export class CodeChunk extends Chunk { 21 | /** The programming language of the code */ 22 | public lang?: string; 23 | /** The tree-sitter AST nodes in the chunk */ 24 | public nodes?: 
24 |   public nodes?: TreeSitterNode[];
25 | 
26 |   constructor(data: {
27 |     text: string;
28 |     startIndex: number;
29 |     endIndex: number;
30 |     tokenCount: number;
31 |     lang?: string;
32 |     nodes?: TreeSitterNode[];
33 |   }) {
34 |     super(data);
35 |     this.lang = data.lang;
36 |     this.nodes = data.nodes;
37 |   }
38 | 
39 |   /** Return a string representation of the CodeChunk */
40 |   public toString(): string {
41 |     return `CodeChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, lang=${this.lang}, nodes=${this.nodes})`;
42 |   }
43 | 
44 |   /** Return the CodeChunk as a dictionary-like object */
45 |   public toDict(): CodeChunkData {
46 |     const baseDict = super.toDict();
47 |     return {
48 |       ...baseDict,
49 |       lang: this.lang,
50 |       nodes: this.nodes,
51 |     };
52 |   }
53 | 
54 |   /** Create a CodeChunk object from a dictionary */
55 |   public static fromDict(data: CodeChunkData): CodeChunk {
56 |     return new CodeChunk(data);
57 |   }
58 | }
59 | 
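A brief sketch of the `toDict()`/`fromDict()` round-trip implemented above. The values are illustrative only, and the `chonkie/types` import path follows the comment in `legacy/chonkie/index.ts`:

```ts
import { CodeChunk } from 'chonkie/types';

// Build a chunk, serialize it to a plain object, and rehydrate it.
const chunk = new CodeChunk({
  text: "function hello() { return 'hi'; }",
  startIndex: 0,
  endIndex: 33,
  tokenCount: 9, // illustrative count
  lang: 'typescript',
});

const data = chunk.toDict();               // plain object, safe to JSON.stringify
const restored = CodeChunk.fromDict(data); // equivalent CodeChunk instance
console.log(restored.lang);                // "typescript"
```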
--------------------------------------------------------------------------------
/packages/cloud/src/index.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @chonkiejs/cloud
3 |  * Cloud-based chunkers and refineries for Chonkie via api.chonkie.ai
4 |  */
5 | 
6 | // Base
7 | export { CloudBaseChunker } from '@/base';
8 | export type { CloudClientConfig, ChunkerInput } from '@/base';
9 | 
10 | // Utils
11 | export { createFileReference } from '@/utils';
12 | export type { FileReference, FileUploadResponse } from '@/utils';
13 | 
14 | // Chunkers
15 | export { TokenChunker } from '@/chunkers/token';
16 | export type { TokenChunkerOptions } from '@/chunkers/token';
17 | 
18 | export { SentenceChunker } from '@/chunkers/sentence';
19 | export type { SentenceChunkerOptions } from '@/chunkers/sentence';
20 | 
21 | export { RecursiveChunker } from '@/chunkers/recursive';
22 | export type { RecursiveChunkerOptions } from '@/chunkers/recursive';
23 | 
24 | export { SemanticChunker } from '@/chunkers/semantic';
25 | export type { SemanticChunkerOptions } from '@/chunkers/semantic';
26 | 
27 | export { NeuralChunker } from '@/chunkers/neural';
28 | export type { NeuralChunkerOptions } from '@/chunkers/neural';
29 | 
30 | export { CodeChunker } from '@/chunkers/code';
31 | export type { CodeChunkerOptions } from '@/chunkers/code';
32 | 
33 | export { LateChunker } from '@/chunkers/late';
34 | export type { LateChunkerOptions } from '@/chunkers/late';
35 | 
36 | // Refineries
37 | export { EmbeddingsRefinery } from '@/refineries/embeddings';
38 | export type { EmbeddingsRefineryOptions } from '@/refineries/embeddings';
39 | 
40 | export { OverlapRefinery } from '@/refineries/overlap';
41 | export type { OverlapRefineryOptions } from '@/refineries/overlap';
42 | 
43 | // Pipeline
44 | export { Pipeline } from '@/pipeline';
45 | export type { PipelineOptions, PipelineStep, PipelineValidationResult } from '@/pipeline';
46 | 
--------------------------------------------------------------------------------
/packages/cloud/examples/token.example.ts:
--------------------------------------------------------------------------------
1 | import { TokenChunker } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing TokenChunker with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     // Create chunker (will use CHONKIE_API_KEY from environment)
8 |     const chunker = new TokenChunker({
9 |       chunkSize: 50,
10 |       chunkOverlap: 10,
11 |       tokenizer: 'gpt2'
12 |     });
13 | 
14 |     console.log('✅ TokenChunker created successfully');
15 |     console.log(`Config: ${chunker.toString()}\n`);
16 | 
17 |     // Test chunking
18 |     const text = 'This is a simple test to verify that the Chonkie cloud API is working correctly. We are testing the TokenChunker to make sure it can split text into token-based chunks using the remote API.';
19 | 
20 |     console.log(`📝 Input text (${text.length} chars):`);
21 |     console.log(`"${text}"\n`);
22 | 
23 |     console.log('🔄 Sending request to api.chonkie.ai...\n');
24 | 
25 |     const chunks = await chunker.chunk({ text });
26 | 
27 |     console.log(`✅ Received ${chunks.length} chunks:\n`);
28 | 
29 |     chunks.forEach((chunk, index) => {
30 |       console.log(`Chunk ${index + 1}:`);
31 |       console.log(`  Tokens: ${chunk.tokenCount}`);
32 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
33 |       console.log(`  Text: "${chunk.text}"`);
34 |       console.log();
35 |     });
36 | 
37 |     // Verify reconstruction
38 |     const reconstructed = chunks.map(c => c.text).join('');
39 |     const matches = reconstructed === text;
40 |     console.log(`🔍 Reconstruction: ${matches ? '✅ Perfect match' : '❌ Mismatch'}`);
41 | 
42 |   } catch (error) {
43 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
44 |     process.exit(1);
45 |   }
46 | }
47 | 
48 | main();
49 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/base.ts:
--------------------------------------------------------------------------------
1 | /** Base cloud client for Chonkie API. */
2 | 
3 | export interface CloudClientConfig {
4 |   apiKey: string;
5 |   baseUrl?: string;
6 | }
7 | 
8 | export interface ChunkerInput {
9 |   text?: string;
10 |   filepath?: string;
11 | }
12 | 
13 | export class CloudClient {
14 |   private readonly apiKey: string;
15 |   private readonly baseUrl: string;
16 | 
17 |   constructor(config: CloudClientConfig) {
18 |     this.apiKey = config.apiKey;
19 |     this.baseUrl = config.baseUrl || "https://api.chonkie.ai";
20 |   }
21 | 
22 |   protected async request<T>(
23 |     endpoint: string,
24 |     options: {
25 |       method?: string;
26 |       body?: any;
27 |       headers?: Record<string, string>;
28 |     } = {}
29 |   ): Promise<T> {
30 |     const { method = "POST", body, headers = {} } = options;
31 | 
32 |     // Don't set Content-Type or stringify body if it's FormData
33 |     const isFormData = body instanceof FormData;
34 |     const requestHeaders = {
35 |       "Authorization": `Bearer ${this.apiKey}`,
36 |       ...headers,
37 |     };
38 | 
39 |     const response = await fetch(`${this.baseUrl}${endpoint}`, {
40 |       method,
41 |       headers: requestHeaders,
42 |       body: isFormData ? body : (body ? JSON.stringify(body) : undefined),
43 |     });
44 | 
45 |     if (!response.ok) {
46 |       const error = await response.json().catch(() => ({ message: "Unknown error" }));
47 |       throw new Error(`API request failed: ${error.message}`);
48 |     }
49 | 
50 |     return response.json();
51 |   }
52 | 
53 |   protected async validateAuth(): Promise<boolean> {
54 |     try {
55 |       const response = await this.request<{ message: string; status: number }>("/v1/auth/validate");
56 |       return response.status === 200;
57 |     } catch (error) {
58 |       return false;
59 |     }
60 |   }
61 | }
--------------------------------------------------------------------------------
/packages/core/examples/token.example.ts:
--------------------------------------------------------------------------------
1 | import { TokenChunker } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing TokenChunker (Character-based)\n');
5 | 
6 |   try {
7 |     const chunker = await TokenChunker.create({
8 |       chunkSize: 50,
9 |       chunkOverlap: 10
10 |     });
11 | 
12 |     console.log('✅ TokenChunker created');
13 |     console.log(`Config: ${chunker.toString()}\n`);
14 | 
15 |     const text = 'This is a test of the TokenChunker. It splits text into fixed-size token chunks. With character-based tokenization, each character is one token.';
16 | 
17 |     console.log(`📝 Input (${text.length} chars):`);
18 |     console.log(`"${text}"\n`);
19 | 
20 |     const chunks = await chunker.chunk(text);
21 | 
22 |     console.log(`✅ Created ${chunks.length} chunks:\n`);
23 | 
24 |     chunks.forEach((chunk, index) => {
25 |       console.log(`Chunk ${index + 1}:`);
26 |       console.log(`  Tokens: ${chunk.tokenCount}`);
27 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
28 |       console.log(`  Text: "${chunk.text}"`);
29 |       console.log();
30 |     });
31 | 
32 |     // Test dynamic tokenizer (will show helpful error)
33 |     console.log('='.repeat(60));
34 |     console.log('\n🔍 Testing dynamic tokenizer detection:\n');
35 | 
36 |     try {
37 |       const gpt2Chunker = await TokenChunker.create({
38 |         tokenizer: 'gpt2',
39 |         chunkSize: 50
40 |       });
41 |       console.log('✅ GPT-2 tokenizer loaded (you have @chonkiejs/token installed!)');
42 |     } catch (error) {
43 |       console.log('ℹ️ Expected behavior - @chonkiejs/token not installed:');
44 |       console.log((error as Error).message);
45 |     }
46 | 
47 |   } catch (error) {
48 |     console.error('❌ Error:', error);
49 |     process.exit(1);
50 |   }
51 | }
52 | 
53 | main();
54 | 
--------------------------------------------------------------------------------
/legacy/chonkie/friends/utils.ts:
--------------------------------------------------------------------------------
1 | /** Utility functions for Chonkie's Handshakes. */
2 | 
3 | const ADJECTIVES = [
4 |   "happy", "chonky", "splashy", "munchy", "muddy", "groovy", "bubbly",
5 |   "swift", "lazy", "hungry", "glowing", "radiant", "mighty", "gentle",
6 |   "whimsical", "snug", "plump", "jovial", "sleepy", "sunny", "peppy",
7 |   "breezy", "sneaky", "clever", "peaceful", "dreamy",
8 | ];
9 | 
10 | const VERBS = [
11 |   "chomping", "splashing", "munching", "wading", "floating", "drifting", "chunking",
12 |   "slicing", "dancing", "wandering", "sleeping", "dreaming", "gliding", "swimming",
13 |   "bubbling", "giggling", "jumping", "diving", "hopping", "skipping", "trotting", "sneaking",
14 |   "exploring", "nibbling", "resting",
15 | ];
16 | 
17 | const NOUNS = [
18 |   "hippo", "river", "chunk", "lilypad", "mudbath", "stream", "pod", "chomp",
19 |   "byte", "fragment", "slice", "splash", "nugget", "lagoon", "marsh",
20 |   "pebble", "ripple", "cluster", "patch", "parcel", "meadow", "glade",
21 |   "puddle", "nook", "bite", "whisper", "journey", "haven", "buddy", "pal",
22 |   "snack", "secret"
23 | ];
24 | 
25 | /**
26 |  * Generate a random, fun, 3-part Chonkie-themed name (Adj-Verb-Noun).
27 |  *
28 |  * Combines one random adjective, one random verb, and one random noun from
29 |  * predefined lists, joined by a separator.
30 |  *
31 |  * @param sep - The separator to use between the words. Defaults to "-".
32 |  * @returns A randomly generated collection name string (e.g., "happy-splashing-hippo").
33 |  */
34 | export function generateRandomCollectionName(sep: string = "-"): string {
35 |   const adjective = ADJECTIVES[Math.floor(Math.random() * ADJECTIVES.length)];
36 |   const verb = VERBS[Math.floor(Math.random() * VERBS.length)];
37 |   const noun = NOUNS[Math.floor(Math.random() * NOUNS.length)];
38 | 
39 |   return `${adjective}${sep}${verb}${sep}${noun}`;
40 | }
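A quick usage sketch of the helper above. Output is random, so the names in the comments are examples only; the `chonkie/friends` import path follows the comment in `legacy/chonkie/index.ts`:

```ts
import { generateRandomCollectionName } from 'chonkie/friends';

const name = generateRandomCollectionName();      // e.g. "chonky-wading-lilypad"
const snake = generateRandomCollectionName('_');  // e.g. "sunny_splashing_hippo"
```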
--------------------------------------------------------------------------------
/legacy/chonkie/types/late.ts:
--------------------------------------------------------------------------------
1 | import { RecursiveChunk } from './recursive';
2 | 
3 | /** Interface for LateChunk data */
4 | interface LateChunkData {
5 |   text: string;
6 |   startIndex: number;
7 |   endIndex: number;
8 |   tokenCount: number;
9 |   embedding?: number[];
10 | }
11 | 
12 | /** Class to represent the late chunk
13 |  *
14 |  * @class LateChunk
15 |  */
16 | export class LateChunk extends RecursiveChunk {
17 |   /** The embedding of the chunk */
18 |   public embedding?: number[];
19 | 
20 |   constructor(data: {
21 |     text: string;
22 |     startIndex: number;
23 |     endIndex: number;
24 |     tokenCount: number;
25 |     embedding?: number[];
26 |   }) {
27 |     super(data);
28 |     this.embedding = data.embedding ?? undefined;
29 |   }
30 | 
31 |   /**
32 |    * Return a string representation of the LateChunk
33 |    *
34 |    * @returns {string} The string representation of the LateChunk.
35 |    */
36 |   public toString(): string {
37 |     return `LateChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, embedding=${this.embedding})`;
38 |   }
39 | 
40 |   /**
41 |    * Return the LateChunk as a dictionary-like object
42 |    *
43 |    * @returns {LateChunkData} The dictionary-like object.
44 |    */
45 |   public toDict(): LateChunkData {
46 |     return {
47 |       text: this.text,
48 |       startIndex: this.startIndex,
49 |       endIndex: this.endIndex,
50 |       tokenCount: this.tokenCount,
51 |       embedding: this.embedding,
52 |     };
53 |   }
54 | 
55 | 
56 |   /**
57 |    * Create a LateChunk object from a dictionary-like object.
58 |    *
59 |    * @param {LateChunkData} data - The dictionary-like object.
60 |    * @returns {LateChunk} The LateChunk object.
61 |    */
62 |   public static fromDict(data: LateChunkData): LateChunk {
63 |     return new LateChunk({
64 |       text: data.text,
65 |       startIndex: data.startIndex,
66 |       endIndex: data.endIndex,
67 |       tokenCount: data.tokenCount,
68 |       embedding: data.embedding,
69 |     });
70 |   }
71 | }
--------------------------------------------------------------------------------
/packages/core/examples/with-huggingface.example.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * This example demonstrates using RecursiveChunker with HuggingFace tokenizers.
3 |  * Requires: npm install @chonkiejs/token
4 |  */
5 | 
6 | import { RecursiveChunker, TokenChunker } from '../src';
7 | 
8 | async function main() {
9 |   console.log('🦛 Testing Chonkie with HuggingFace Tokenizers\n');
10 |   console.log('Note: This requires @chonkiejs/token to be installed\n');
11 |   console.log('='.repeat(60));
12 | 
13 |   const text = 'This is a test. We are testing GPT-2 tokenization with Chonkie!';
14 | 
15 |   // Test 1: TokenChunker with GPT-2
16 |   console.log('\n📝 Test 1: TokenChunker with GPT-2\n');
17 |   try {
18 |     const tokenChunker = await TokenChunker.create({
19 |       tokenizer: 'Xenova/gpt2',
20 |       chunkSize: 10
21 |     });
22 | 
23 |     console.log('✅ GPT-2 tokenizer loaded');
24 |     const chunks = await tokenChunker.chunk(text);
25 |     console.log(`Created ${chunks.length} chunks`);
26 |     chunks.forEach((c, i) => {
27 |       console.log(`  ${i + 1}. [${c.tokenCount} tokens]: "${c.text}"`);
28 |     });
29 |   } catch (error) {
30 |     console.log('ℹ️ Expected if @chonkiejs/token not installed:');
31 |     console.log((error as Error).message);
32 |   }
33 | 
34 |   // Test 2: RecursiveChunker with GPT-2
35 |   console.log('\n' + '='.repeat(60));
36 |   console.log('\n📝 Test 2: RecursiveChunker with GPT-2\n');
37 |   try {
38 |     const recursiveChunker = await RecursiveChunker.create({
39 |       tokenizer: 'Xenova/gpt2',
40 |       chunkSize: 15
41 |     });
42 | 
43 |     console.log('✅ GPT-2 tokenizer loaded');
44 |     const chunks = await recursiveChunker.chunk(text);
45 |     console.log(`Created ${chunks.length} chunks`);
46 |     chunks.forEach((c, i) => {
47 |       console.log(`  ${i + 1}. [${c.tokenCount} tokens]: "${c.text}"`);
48 |     });
49 |   } catch (error) {
50 |     console.log('ℹ️ Expected if @chonkiejs/token not installed:');
51 |     console.log((error as Error).message);
52 |   }
53 | 
54 |   console.log('\n' + '='.repeat(60));
55 |   console.log('\n🎉 Example completed!\n');
56 | }
57 | 
58 | main();
59 | 
--------------------------------------------------------------------------------
/packages/cloud/examples/embeddings-refinery.example.ts:
--------------------------------------------------------------------------------
1 | import { TokenChunker, EmbeddingsRefinery } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing EmbeddingsRefinery with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     // Step 1: Create chunks
8 |     console.log('Step 1: Creating chunks...\n');
9 | 
10 |     const chunker = new TokenChunker({ chunkSize: 50 });
11 |     const text = 'Artificial intelligence is transforming technology. Machine learning enables new possibilities. Neural networks process complex patterns.';
12 | 
13 |     console.log(`📝 Input (${text.length} chars):`);
14 |     console.log(`"${text}"\n`);
15 | 
16 |     const chunks = await chunker.chunk({ text });
17 |     console.log(`✅ Created ${chunks.length} chunks\n`);
18 | 
19 |     chunks.forEach((chunk, i) => {
20 |       console.log(`Chunk ${i + 1}: "${chunk.text}" (${chunk.tokenCount} tokens)`);
21 |     });
22 | 
23 |     // Step 2: Add embeddings
24 |     console.log('\n' + '='.repeat(60));
25 |     console.log('\nStep 2: Adding embeddings to chunks...\n');
26 | 
27 |     const refinery = new EmbeddingsRefinery({
28 |       embeddingModel: 'sentence-transformers/all-MiniLM-L6-v2'
29 |     });
30 | 
31 |     console.log('✅ EmbeddingsRefinery created');
32 |     console.log(`Config: ${refinery.toString()}\n`);
33 | 
34 |     console.log('🔄 Calling API to add embeddings...\n');
35 | 
36 |     const refinedChunks = await refinery.refine(chunks);
37 | 
38 |     console.log(`✅ Refined ${refinedChunks.length} chunks with embeddings\n`);
39 | 
40 |     refinedChunks.forEach((chunk, i) => {
41 |       console.log(`Refined Chunk ${i + 1}:`);
42 |       console.log(`  Text: "${chunk.text}"`);
43 |       console.log(`  Tokens: ${chunk.tokenCount}`);
44 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
45 |       console.log();
46 |     });
47 | 
48 |     console.log('='.repeat(60));
49 |     console.log('\n🎉 Embeddings successfully added!\n');
50 | 
51 |   } catch (error) {
52 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
53 |     process.exit(1);
54 |   }
55 | }
56 | 
57 | main();
58 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/neural.ts:
--------------------------------------------------------------------------------
1 | /** Neural chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { Chunk } from "../types/base";
5 | 
6 | export interface NeuralChunkerConfig {
7 |   model?: string;
8 |   minCharactersPerChunk?: number;
9 | }
10 | 
11 | export class NeuralChunker extends CloudClient {
12 |   private readonly config: Required<NeuralChunkerConfig>;
13 | 
14 |   constructor(apiKey: string, config: NeuralChunkerConfig = {}) {
15 |     super({ apiKey });
16 |     this.config = {
17 |       model: config.model || "mirth/chonky_modernbert_large_1",
18 |       minCharactersPerChunk: config.minCharactersPerChunk || 10,
19 |     };
20 |   }
21 | 
22 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
23 |     const formData = new FormData();
24 | 
25 |     if (input.filepath) {
26 |       formData.append("file", input.filepath);
27 |     } else if (input.text) {
28 |       // JSON encode the text
29 |       formData.append("text", JSON.stringify(input.text));
30 |       // Append empty file to ensure multipart form
31 |       formData.append("file", new Blob(), "text_input.txt");
32 |     } else {
33 |       throw new Error("Either text or file must be provided");
34 |     }
35 | 
36 |     formData.append("embedding_model", this.config.model);
37 |     formData.append("min_characters_per_chunk", this.config.minCharactersPerChunk.toString());
38 |     formData.append("return_type", "chunks");
39 | 
40 |     const data = await this.request("/v1/chunk/neural", {
41 |       method: "POST",
42 |       body: formData,
43 |     });
44 | 
45 |     // Convert from snake_case to camelCase
46 |     const camelCaseData = data.map((chunk: any) => {
47 |       return {
48 |         text: chunk.text,
49 |         startIndex: chunk.start_index,
50 |         endIndex: chunk.end_index,
51 |         tokenCount: chunk.token_count,
52 |         embedding: chunk.embedding || undefined,
53 |       };
54 |     });
55 | 
56 |     return camelCaseData.map((chunk: any) => Chunk.fromDict(chunk));
57 |   }
58 | 
59 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
60 |     return Promise.all(inputs.map(input => this.chunk(input)));
61 |   }
62 | }
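// Usage sketch (editorial addition, not part of the file above; assumes a valid
// Chonkie API key — note this legacy client takes it as a positional argument):
//
//   const chunker = new NeuralChunker(process.env.CHONKIE_API_KEY!);
//   const chunks = await chunker.chunk({ text: 'Some long document...' });
//   const batches = await chunker.chunkBatch([{ text: 'doc one' }, { text: 'doc two' }]);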
--------------------------------------------------------------------------------
/packages/cloud/examples/overlap-refinery.example.ts:
--------------------------------------------------------------------------------
1 | import { TokenChunker, OverlapRefinery } from '../src';
2 | 
3 | async function main() {
4 |   console.log('🦛 Testing OverlapRefinery with api.chonkie.ai\n');
5 | 
6 |   try {
7 |     // Step 1: Create chunks
8 |     console.log('Step 1: Creating chunks without overlap...\n');
9 | 
10 |     const chunker = new TokenChunker({
11 |       chunkSize: 40,
12 |       chunkOverlap: 0 // No overlap initially
13 |     });
14 | 
15 |     const text = 'The quick brown fox jumps over the lazy dog. This sentence demonstrates overlap refinement. Context is preserved across boundaries.';
16 | 
17 |     console.log(`📝 Input (${text.length} chars):`);
18 |     console.log(`"${text}"\n`);
19 | 
20 |     const chunks = await chunker.chunk({ text });
21 |     console.log(`✅ Created ${chunks.length} chunks (no overlap)\n`);
22 | 
23 |     chunks.forEach((chunk, i) => {
24 |       console.log(`Chunk ${i + 1}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? '...' : ''}" (${chunk.tokenCount} tokens)`);
25 |     });
26 | 
27 |     // Step 2: Add overlap
28 |     console.log('\n' + '='.repeat(60));
29 |     console.log('\nStep 2: Adding overlap for context...\n');
30 | 
31 |     const refinery = new OverlapRefinery({
32 |       contextSize: 0.25, // 25% overlap
33 |       mode: 'token',
34 |       method: 'suffix'
35 |     });
36 | 
37 |     console.log('✅ OverlapRefinery created');
38 |     console.log(`Config: ${refinery.toString()}\n`);
39 | 
40 |     console.log('🔄 Calling API to add overlap...\n');
41 | 
42 |     const refinedChunks = await refinery.refine(chunks);
43 | 
44 |     console.log(`✅ Refined ${refinedChunks.length} chunks with overlap\n`);
45 | 
46 |     refinedChunks.forEach((chunk, i) => {
47 |       console.log(`Refined Chunk ${i + 1}:`);
48 |       console.log(`  Text: "${chunk.text.substring(0, 60)}${chunk.text.length > 60 ? '...' : ''}"`);
49 |       console.log(`  Tokens: ${chunk.tokenCount}`);
50 |       console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
51 |       console.log();
52 |     });
53 | 
54 |     console.log('='.repeat(60));
55 |     console.log('\n💡 Notice: Chunks now have overlapping context for better coherence!\n');
56 | 
57 |   } catch (error) {
58 |     console.error('❌ Error:', error instanceof Error ? error.message : error);
59 |     process.exit(1);
60 |   }
61 | }
62 | 
63 | main();
64 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/token.ts:
--------------------------------------------------------------------------------
1 | /** Token chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { Chunk } from "../types/base";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface TokenChunkerConfig {
9 |   tokenizer?: string;
10 |   chunkSize?: number;
11 |   chunkOverlap?: number;
12 | }
13 | 
14 | export class TokenChunker extends CloudClient {
15 |   private readonly config: Required<TokenChunkerConfig>;
16 | 
17 |   constructor(apiKey: string, config: TokenChunkerConfig = {}) {
18 |     super({ apiKey });
19 |     this.config = {
20 |       tokenizer: config.tokenizer || "gpt2",
21 |       chunkSize: config.chunkSize || 512,
22 |       chunkOverlap: config.chunkOverlap || 0,
23 |     };
24 |   }
25 | 
26 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
27 |     const formData = new FormData();
28 | 
29 |     if (input.filepath) {
30 |       const fileContent = fs.readFileSync(input.filepath);
31 |       const fileName = path.basename(input.filepath) || 'file.txt';
32 |       formData.append("file", new Blob([fileContent]), fileName);
33 |     } else if (input.text) {
34 |       formData.append("text", input.text);
35 |       // Append empty file to ensure multipart form
36 |       formData.append("file", new Blob(), "text_input.txt");
37 |     } else {
38 |       throw new Error("Either text or filepath must be provided");
39 |     }
40 | 
41 |     formData.append("tokenizer_or_token_counter", this.config.tokenizer);
42 |     formData.append("chunk_size", this.config.chunkSize.toString());
43 |     formData.append("chunk_overlap", this.config.chunkOverlap.toString());
44 |     formData.append("return_type", "chunks");
45 | 
46 |     const data = await this.request("/v1/chunk/token", {
47 |       method: "POST",
48 |       body: formData,
49 |     });
50 | 
51 |     // Convert from snake_case to camelCase
52 |     const camelCaseData = data.map((chunk: any) => {
53 |       return {
54 |         text: chunk.text,
55 |         startIndex: chunk.start_index,
56 |         endIndex: chunk.end_index,
57 |         tokenCount: chunk.token_count,
58 |         context: chunk.context || undefined,
59 |       };
60 |     });
61 | 
62 |     return camelCaseData.map((chunk: any) => Chunk.fromDict(chunk));
63 |   }
64 | 
65 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
66 |     return Promise.all(inputs.map(input => this.chunk(input)));
67 |   }
68 | }
--------------------------------------------------------------------------------
/packages/cloud/src/refineries/embeddings.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Embeddings refinery that adds embeddings to existing chunks
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from "@chonkiejs/core";
7 | import { CloudBaseChunker } from "@/base";
8 | 
9 | export interface EmbeddingsRefineryOptions {
10 |   /** Embedding model to use (default: "minishlab/potion-retrieval-32M") */
11 |   embeddingModel?: string;
12 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
13 |   apiKey?: string;
14 |   /** Base URL for API (default: "https://api.chonkie.ai") */
15 |   baseUrl?: string;
16 | }
17 | 
18 | interface ChunkData {
19 |   text: string;
20 |   start_index: number;
21 |   end_index: number;
22 |   token_count: number;
23 |   embedding?: number[];
24 | }
25 | 
26 | /**
27 |  * Post-processes chunks by adding embeddings to them.
28 |  */
29 | export class EmbeddingsRefinery extends CloudBaseChunker {
30 |   private readonly embeddingModel: string;
31 | 
32 |   constructor(options: EmbeddingsRefineryOptions = {}) {
33 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
34 |     if (!apiKey) {
35 |       throw new Error(
36 |         "API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable."
37 |       );
38 |     }
39 | 
40 |     super({ apiKey, baseUrl: options.baseUrl });
41 |     this.embeddingModel = options.embeddingModel || 'minishlab/potion-retrieval-32M';
42 |   }
43 | 
44 |   /**
45 |    * Add embeddings to existing chunks.
46 |    *
47 |    * @param chunks - Array of chunks to add embeddings to
48 |    * @returns Array of chunks with embeddings added
49 |    */
50 |   async refine(chunks: Chunk[]): Promise<Chunk[]> {
51 |     const chunkData = chunks.map((chunk) => ({
52 |       text: chunk.text,
53 |       start_index: chunk.startIndex,
54 |       end_index: chunk.endIndex,
55 |       token_count: chunk.tokenCount,
56 |     }));
57 | 
58 |     const response = await this.request<ChunkData[]>("/v1/refine/embeddings", {
59 |       method: "POST",
60 |       body: {
61 |         chunks: chunkData,
62 |         embedding_model: this.embeddingModel,
63 |       },
64 |     });
65 | 
66 |     return response.map(
67 |       (chunkData) =>
68 |         new Chunk({
69 |           text: chunkData.text,
70 |           startIndex: chunkData.start_index,
71 |           endIndex: chunkData.end_index,
72 |           tokenCount: chunkData.token_count,
73 |           embedding: chunkData.embedding,
74 |         })
75 |     );
76 |   }
77 | 
78 |   toString(): string {
79 |     return `EmbeddingsRefinery(embeddingModel=${this.embeddingModel})`;
80 |   }
81 | }
82 | 
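// Usage sketch (editorial addition, not part of the file above; assumes
// CHONKIE_API_KEY is set and `chunks` came from any chunker):
//
//   const refinery = new EmbeddingsRefinery({ embeddingModel: 'minishlab/potion-retrieval-32M' });
//   const refined = await refinery.refine(chunks);
//   console.log(refined[0].embedding?.length); // dimensionality of the returned vectors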
--------------------------------------------------------------------------------
/packages/cloud/tests/fixtures/test-code.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * Example TypeScript code for testing CodeChunker
3 |  */
4 | 
5 | export interface User {
6 |   id: number;
7 |   name: string;
8 |   email: string;
9 |   createdAt: Date;
10 | }
11 | 
12 | export class UserService {
13 |   private users: Map<number, User> = new Map();
14 |   private nextId: number = 1;
15 | 
16 |   /**
17 |    * Create a new user
18 |    */
19 |   async createUser(name: string, email: string): Promise<User> {
20 |     const user: User = {
21 |       id: this.nextId++,
22 |       name,
23 |       email,
24 |       createdAt: new Date(),
25 |     };
26 |     this.users.set(user.id, user);
27 |     return user;
28 |   }
29 | 
30 |   /**
31 |    * Get user by ID
32 |    */
33 |   async getUserById(id: number): Promise<User | undefined> {
34 |     return this.users.get(id);
35 |   }
36 | 
37 |   /**
38 |    * Update user information
39 |    */
40 |   async updateUser(id: number, updates: Partial<Omit<User, 'id'>>): Promise<User | null> {
41 |     const user = this.users.get(id);
42 |     if (!user) {
43 |       return null;
44 |     }
45 | 
46 |     const updatedUser = {
47 |       ...user,
48 |       ...updates,
49 |     };
50 | 
51 |     this.users.set(id, updatedUser);
52 |     return updatedUser;
53 |   }
54 | 
55 |   /**
56 |    * Delete a user
57 |    */
58 |   async deleteUser(id: number): Promise<boolean> {
59 |     return this.users.delete(id);
60 |   }
61 | 
62 |   /**
63 |    * Get all users
64 |    */
65 |   async getAllUsers(): Promise<User[]> {
66 |     return Array.from(this.users.values());
67 |   }
68 | 
69 |   /**
70 |    * Find users by email
71 |    */
72 |   async findUsersByEmail(email: string): Promise<User[]> {
73 |     return Array.from(this.users.values()).filter(
74 |       user => user.email.toLowerCase().includes(email.toLowerCase())
75 |     );
76 |   }
77 | }
78 | 
79 | export class AuthService {
80 |   private tokens: Map<string, number> = new Map();
81 | 
82 |   /**
83 |    * Generate authentication token
84 |    */
85 |   generateToken(userId: number): string {
86 |     const token = Math.random().toString(36).substring(2);
87 |     this.tokens.set(token, userId);
88 |     return token;
89 |   }
90 | 
91 |   /**
92 |    * Validate token and return user ID
93 |    */
94 |   validateToken(token: string): number | null {
95 |     return this.tokens.get(token) || null;
96 |   }
97 | 
98 |   /**
99 |    * Revoke a token
100 |    */
101 |   revokeToken(token: string): boolean {
102 |     return this.tokens.delete(token);
103 |   }
104 | }
105 | 
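// Editorial note (not part of the fixture above): fixtures like this are consumed
// by the cloud chunker tests such as code.test.ts, roughly:
//
//   const chunker = new CodeChunker({ language: 'javascript' });
//   const chunks = await chunker.chunk({ filepath: 'tests/fixtures/test-code.js' });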
--------------------------------------------------------------------------------
/packages/cloud/tests/fixtures/test-code.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Example TypeScript code for testing CodeChunker
3 |  */
4 | 
5 | export interface User {
6 |   id: number;
7 |   name: string;
8 |   email: string;
9 |   createdAt: Date;
10 | }
11 | 
12 | export class UserService {
13 |   private users: Map<number, User> = new Map();
14 |   private nextId: number = 1;
15 | 
16 |   /**
17 |    * Create a new user
18 |    */
19 |   async createUser(name: string, email: string): Promise<User> {
20 |     const user: User = {
21 |       id: this.nextId++,
22 |       name,
23 |       email,
24 |       createdAt: new Date(),
25 |     };
26 |     this.users.set(user.id, user);
27 |     return user;
28 |   }
29 | 
30 |   /**
31 |    * Get user by ID
32 |    */
33 |   async getUserById(id: number): Promise<User | undefined> {
34 |     return this.users.get(id);
35 |   }
36 | 
37 |   /**
38 |    * Update user information
39 |    */
40 |   async updateUser(id: number, updates: Partial<Omit<User, 'id'>>): Promise<User | null> {
41 |     const user = this.users.get(id);
42 |     if (!user) {
43 |       return null;
44 |     }
45 | 
46 |     const updatedUser = {
47 |       ...user,
48 |       ...updates,
49 |     };
50 | 
51 |     this.users.set(id, updatedUser);
52 |     return updatedUser;
53 |   }
54 | 
55 |   /**
56 |    * Delete a user
57 |    */
58 |   async deleteUser(id: number): Promise<boolean> {
59 |     return this.users.delete(id);
60 |   }
61 | 
62 |   /**
63 |    * Get all users
64 |    */
65 |   async getAllUsers(): Promise<User[]> {
66 |     return Array.from(this.users.values());
67 |   }
68 | 
69 |   /**
70 |    * Find users by email
71 |    */
72 |   async findUsersByEmail(email: string): Promise<User[]> {
73 |     return Array.from(this.users.values()).filter(
74 |       user => user.email.toLowerCase().includes(email.toLowerCase())
75 |     );
76 |   }
77 | }
78 | 
79 | export class AuthService {
80 |   private tokens: Map<string, number> = new Map();
81 | 
82 |   /**
83 |    * Generate authentication token
84 |    */
85 |   generateToken(userId: number): string {
86 |     const token = Math.random().toString(36).substring(2);
87 |     this.tokens.set(token, userId);
88 |     return token;
89 |   }
90 | 
91 |   /**
92 |    * Validate token and return user ID
93 |    */
94 |   validateToken(token: string): number | null {
95 |     return this.tokens.get(token) || null;
96 |   }
97 | 
98 |   /**
99 |    * Revoke a token
100 |    */
101 |   revokeToken(token: string): boolean {
102 |     return this.tokens.delete(token);
103 |   }
104 | }
105 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/overlap_refinery.ts:
--------------------------------------------------------------------------------
1 | import { CloudClient } from "./base";
2 | import { Chunk } from "../types/base";
3 | 
4 | 
5 | export interface OverlapRefineryConfig {
6 |   tokenizerOrTokenCounter?: string;
7 |   contextSize?: number;
8 |   mode?: "token" | "recursive";
9 |   method?: "suffix" | "prefix";
10 |   recipe?: string;
11 |   lang?: string;
12 |   merge?: boolean;
13 | }
14 | export class OverlapRefinery extends CloudClient {
15 |   private readonly config: Required<OverlapRefineryConfig>;
16 | 
17 |   constructor(apiKey: string, config: OverlapRefineryConfig = {}) {
18 |     super({ apiKey });
19 |     this.config = {
20 |       tokenizerOrTokenCounter: config.tokenizerOrTokenCounter || "character",
21 |       contextSize: config.contextSize ?? 0.25,
22 |       mode: config.mode || "token",
23 |       method: config.method || "suffix",
24 |       recipe: config.recipe || "default",
25 |       lang: config.lang || "en",
26 |       merge: config.merge ?? true,
27 |     };
28 |   }
29 | 
30 |   async refine(chunks: Chunk[]): Promise<Chunk[]> {
31 |     // Create snake cased chunks for the request
32 |     const snakeCasedChunks = chunks.map(chunk => {
33 |       return {
34 |         text: chunk.text,
35 |         start_index: chunk.startIndex,
36 |         end_index: chunk.endIndex,
37 |         token_count: chunk.tokenCount,
38 |       };
39 |     });
40 |     const response = await this.request("/v1/refine/overlap", {
41 |       body: {
42 |         chunks: snakeCasedChunks,
43 |         tokenizer_or_token_counter: this.config.tokenizerOrTokenCounter,
44 |         context_size: this.config.contextSize,
45 |         mode: this.config.mode,
46 |         method: this.config.method,
47 |         recipe: this.config.recipe,
48 |         lang: this.config.lang,
49 |         merge: this.config.merge,
50 |       },
51 |       headers: {
52 |         "Content-Type": "application/json",
53 |       },
54 |     });
55 |     // Merge the response chunks with the original chunks
56 |     const mergedChunks = response.map((chunk: any, index: number) => {
57 |       const originalChunk = chunks[index];
58 |       return {
59 |         ...originalChunk,
60 |         text: chunk.text,
61 |         startIndex: chunk.start_index,
62 |         endIndex: chunk.end_index,
63 |         tokenCount: chunk.token_count,
64 |       };
65 |     });
66 |     return mergedChunks;
67 |   }
68 | }
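// Usage sketch (editorial addition, not part of the file above; note this legacy
// client takes the API key as a positional argument):
//
//   const refinery = new OverlapRefinery('sk-...', { contextSize: 0.25, method: 'suffix' });
//   const refined = await refinery.refine(chunks); // chunks: Chunk[] from any chunker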
--------------------------------------------------------------------------------
/packages/token/src/huggingface.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * HuggingFace tokenizer implementation using @huggingface/transformers
3 |  */
4 | 
5 | import { AutoTokenizer, PreTrainedTokenizer } from '@huggingface/transformers';
6 | import { Tokenizer } from '@chonkiejs/core';
7 | 
8 | /**
9 |  * Tokenizer that uses HuggingFace transformers.js for tokenization.
10 |  *
11 |  * Extends the base Tokenizer interface from @chonkiejs/core to provide
12 |  * real tokenization using models like GPT-2, BERT, etc.
13 |  */
14 | export class HuggingFaceTokenizer extends Tokenizer {
15 |   private hfTokenizer: PreTrainedTokenizer;
16 |   private modelName: string;
17 | 
18 |   private constructor(hfTokenizer: PreTrainedTokenizer, modelName: string) {
19 |     super();
20 |     this.hfTokenizer = hfTokenizer;
21 |     this.modelName = modelName;
22 |   }
23 | 
24 |   /**
25 |    * Create a HuggingFace tokenizer instance.
26 |    *
27 |    * @param model - HuggingFace model name (e.g., 'gpt2', 'Xenova/gpt-4', 'bert-base-uncased')
28 |    * @returns Promise resolving to HuggingFaceTokenizer instance
29 |    *
30 |    * @example
31 |    * const tokenizer = await HuggingFaceTokenizer.create('gpt2');
32 |    * const tokenizer = await HuggingFaceTokenizer.create('Xenova/gpt-4');
33 |    */
34 |   static async create(model: string): Promise<HuggingFaceTokenizer> {
35 |     try {
36 |       const hfTokenizer = await AutoTokenizer.from_pretrained(model);
37 |       return new HuggingFaceTokenizer(hfTokenizer, model);
38 |     } catch (error) {
39 |       throw new Error(`Failed to load HuggingFace tokenizer "${model}": ${error instanceof Error ? error.message : error}`);
40 |     }
41 |   }
42 | 
43 |   /**
44 |    * Count tokens in text using HuggingFace tokenizer.
45 |    */
46 |   countTokens(text: string): number {
47 |     const encoded = this.hfTokenizer.encode(text) as number[];
48 |     return encoded.length;
49 |   }
50 | 
51 |   /**
52 |    * Encode text into token IDs.
53 |    */
54 |   encode(text: string): number[] {
55 |     return this.hfTokenizer.encode(text) as number[];
56 |   }
57 | 
58 |   /**
59 |    * Decode token IDs back into text.
60 |    */
61 |   decode(tokens: number[]): string {
62 |     return this.hfTokenizer.decode(tokens, { skip_special_tokens: true });
63 |   }
64 | 
65 |   /**
66 |    * Decode a batch of token arrays.
67 |    */
68 |   decodeBatch(tokensBatch: number[][]): string[] {
69 |     return tokensBatch.map(tokens => this.decode(tokens));
70 |   }
71 | 
72 |   /**
73 |    * Get the model name.
74 |    */
75 |   getModelName(): string {
76 |     return this.modelName;
77 |   }
78 | 
79 |   toString(): string {
80 |     return `HuggingFaceTokenizer(model=${this.modelName})`;
81 |   }
82 | }
83 | 
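// Usage sketch (editorial addition, not part of the file above):
//
//   const tok = await HuggingFaceTokenizer.create('Xenova/gpt2');
//   const ids = tok.encode('chonkie chunks text');
//   tok.countTokens('chonkie chunks text') === ids.length; // true
//   tok.decode(ids); // round-trips to the original text (modulo special tokens)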
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/code.ts:
--------------------------------------------------------------------------------
1 | /** Code chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { CodeChunk } from "../types/code";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface CodeChunkerConfig {
9 |   tokenizerOrTokenCounter?: string;
10 |   chunkSize?: number;
11 |   language: string;
12 |   includeNodes?: boolean;
13 | }
14 | 
15 | export class CodeChunker extends CloudClient {
16 |   private readonly config: Required<CodeChunkerConfig>;
17 | 
18 |   constructor(apiKey: string, config: CodeChunkerConfig) {
19 |     super({ apiKey });
20 |     if (!config.language) {
21 |       throw new Error("Language is required for code chunking");
22 |     }
23 |     this.config = {
24 |       tokenizerOrTokenCounter: config.tokenizerOrTokenCounter || "gpt2",
25 |       chunkSize: config.chunkSize || 1500,
26 |       language: config.language,
27 |       includeNodes: config.includeNodes ?? false,
28 |     };
29 |   }
30 | 
31 |   async chunk(input: ChunkerInput): Promise<CodeChunk[]> {
32 |     const formData = new FormData();
33 | 
34 |     if (input.filepath) {
35 |       const fileContent = fs.readFileSync(input.filepath);
36 |       const fileName = path.basename(input.filepath) || 'file.txt';
37 |       formData.append("file", new Blob([fileContent]), fileName);
38 |     } else if (input.text) {
39 |       formData.append("text", input.text);
40 |       // Append empty file to ensure multipart form
41 |       formData.append("file", new Blob(), "text_input.txt");
42 |     } else {
43 |       throw new Error("Either text or filepath must be provided");
44 |     }
45 | 
46 |     formData.append("tokenizer_or_token_counter", this.config.tokenizerOrTokenCounter);
47 |     formData.append("chunk_size", this.config.chunkSize.toString());
48 |     formData.append("language", this.config.language);
49 |     formData.append("include_nodes", this.config.includeNodes.toString());
50 | 
51 |     const data = await this.request("/v1/chunk/code", {
52 |       method: "POST",
53 |       body: formData,
54 |     });
55 | 
56 |     // Convert from snake_case to camelCase
57 |     const camelCaseData = data.map((chunk: any) => {
58 |       return {
59 |         text: chunk.text,
60 |         startIndex: chunk.start_index,
61 |         endIndex: chunk.end_index,
62 |         tokenCount: chunk.token_count,
63 |         nodes: chunk.nodes || undefined,
64 |         embedding: chunk.embedding || undefined,
65 |       };
66 |     });
67 | 
68 |     return camelCaseData.map((chunk: any) => CodeChunk.fromDict(chunk));
69 |   }
70 | 
71 |   async chunkBatch(inputs: ChunkerInput[]): Promise<CodeChunk[][]> {
72 |     return Promise.all(inputs.map(input => this.chunk(input)));
73 |   }
74 | }
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/late.ts:
--------------------------------------------------------------------------------
1 | /** Late chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { LateChunk } from "../types/late";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface LateChunkerConfig {
9 |   embeddingModel?: string;
10 |   chunkSize?: number;
11 |   recipe?: string;
12 |   lang?: string;
13 |   minCharactersPerChunk?: number;
14 | }
15 | 
16 | export class LateChunker extends CloudClient {
17 |   private readonly config: Required<LateChunkerConfig>;
18 | 
19 |   constructor(apiKey: string, config: LateChunkerConfig = {}) {
20 |     super({ apiKey });
21 |     this.config = {
22 |       embeddingModel: config.embeddingModel || "all-MiniLM-L6-v2",
23 |       chunkSize: config.chunkSize || 512,
24 |       recipe: config.recipe || "default",
25 |       lang: config.lang || "en",
26 |       minCharactersPerChunk: config.minCharactersPerChunk || 24,
27 |     };
28 |   }
29 | 
30 |   async chunk(input: ChunkerInput): Promise<LateChunk[]> {
31 |     const formData = new FormData();
32 | 
33 |     if (input.filepath) {
34 |       const fileContent = fs.readFileSync(input.filepath);
35 |       const fileName = path.basename(input.filepath) || 'file.txt';
36 |       formData.append("file", new Blob([fileContent]), fileName);
37 |     } else if (input.text) {
38 |       // JSON encode the text
39 |       formData.append("text", JSON.stringify(input.text));
40 |       // Append empty file to ensure multipart form
41 |       formData.append("file", new Blob(), "text_input.txt");
42 |     } else {
43 |       throw new Error("Either text or filepath must be provided");
44 |     }
45 | 
46 |     formData.append("embedding_model", this.config.embeddingModel);
47 |     formData.append("chunk_size", this.config.chunkSize.toString());
48 |     formData.append("recipe", this.config.recipe);
49 |     formData.append("lang", this.config.lang);
50 |     formData.append("min_characters_per_chunk", this.config.minCharactersPerChunk.toString());
51 | 
52 |     const data = await this.request("/v1/chunk/late", {
53 |       method: "POST",
54 |       body: formData,
55 |     });
56 | 
57 |     // Convert from snake_case to camelCase
58 |     const camelCaseData = data.map((chunk: any) => {
59 |       return {
60 |         text: chunk.text,
61 |         startIndex: chunk.start_index,
62 |         endIndex: chunk.end_index,
63 |         tokenCount: chunk.token_count,
64 |         embedding: chunk.embedding || undefined,
65 |       };
66 |     });
67 | 
68 |     return camelCaseData.map((chunk: any) => LateChunk.fromDict(chunk));
69 |   }
70 | 
71 |   async chunkBatch(inputs: ChunkerInput[]): Promise<LateChunk[][]> {
72 |     return Promise.all(inputs.map(input => this.chunk(input)));
73 |   }
74 | }
75 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/slumber.ts:
--------------------------------------------------------------------------------
1 | /** Slumber chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { Chunk } from "../types/base";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface SlumberChunkerConfig {
9 |   tokenizerOrTokenCounter?: string;
10 |   chunkSize?: number;
11 |   candidateSize?: number;
12 |   minCharactersPerChunk?: number;
13 | }
14 | 
15 | export class SlumberChunker extends CloudClient {
16 |   private readonly config: Required<SlumberChunkerConfig>;
17 | 
18 |   constructor(apiKey: string, config: SlumberChunkerConfig = {}) {
19 |     super({ apiKey });
20 |     this.config = {
21 |       tokenizerOrTokenCounter: config.tokenizerOrTokenCounter || "gpt2",
22 |       chunkSize: config.chunkSize || 1024,
23 |       candidateSize: config.candidateSize || 32,
24 |       minCharactersPerChunk: config.minCharactersPerChunk || 12,
25 |     };
26 |   }
27 | 
28 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
29 |     const formData = new FormData();
30 | 
31 |     if (input.filepath) {
32 |       const fileContent = fs.readFileSync(input.filepath);
33 |       const fileName = path.basename(input.filepath) || 'file.txt';
34 |       formData.append("file", new Blob([fileContent]), fileName);
35 |     } else if (input.text) {
36 |       formData.append("text", input.text);
37 |       // Append empty file to ensure multipart form
38 |       formData.append("file", new Blob(), "text_input.txt");
39 |     } else {
40 |       throw new Error("Either text or filepath must be provided");
41 |     }
42 | 
43 |     formData.append("tokenizer_or_token_counter", this.config.tokenizerOrTokenCounter);
44 |     formData.append("chunk_size", this.config.chunkSize.toString());
45 |     formData.append("candidate_size", this.config.candidateSize.toString());
46 |     formData.append("min_characters_per_chunk", this.config.minCharactersPerChunk.toString());
47 |     formData.append("return_type", "chunks");
48 | 
49 |     const data = await this.request("/v1/chunk/slumber", {
50 |       method: "POST",
51 |       body: formData,
52 |     });
53 | 
54 |     // Convert from snake_case to camelCase
55 |     const camelCaseData = data.map((chunk: any) => {
56 |       return {
57 |         text: chunk.text,
58 |         startIndex: chunk.start_index,
59 |         endIndex: chunk.end_index,
60 |         tokenCount: chunk.token_count,
61 |         embedding: chunk.embedding || undefined,
62 |         context: chunk.context || undefined,
63 |       };
64 |     });
65 | 
66 |     return camelCaseData.map((chunk: any) => Chunk.fromDict(chunk));
67 |   }
68 | 
69 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
70 |     return Promise.all(inputs.map(input => this.chunk(input)));
71 |   }
72 | }
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | lerna-debug.log*
8 | .pnpm-debug.log*
9 | 
10 | # Diagnostic reports (https://nodejs.org/api/report.html)
11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
12 | 
13 | # Runtime data
14 | pids
15 | *.pid
16 | *.seed
17 | *.pid.lock
18 | 
19 | # Directory for instrumented libs generated by jscoverage/JSCover
20 | lib-cov
21 | 
22 | # Coverage directory used by tools like istanbul
23 | coverage
24 | *.lcov
25 | 
26 | # nyc test coverage
27 | .nyc_output
28 | 
29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
30 | .grunt
31 | 
32 | # Bower dependency directory (https://bower.io/)
33 | bower_components
34 | 
35 | # node-waf configuration
36 | .lock-wscript
37 | 
38 | # Compiled binary addons (https://nodejs.org/api/addons.html)
39 | build/Release
40 | 
41 | # Dependency directories
42 | node_modules/
43 | jspm_packages/
44 | 
45 | # Snowpack dependency directory (https://snowpack.dev/)
46 | web_modules/
47 | 
48 | # TypeScript cache
49 | *.tsbuildinfo
50 | 
51 | # Optional npm cache directory
52 | .npm
53 | 
54 | # Optional eslint cache
55 | .eslintcache
56 | 
57 | # Optional stylelint cache
58 | .stylelintcache
59 | 
60 | # Microbundle cache
61 | .rpt2_cache/
62 | .rts2_cache_cjs/
63 | .rts2_cache_es/
64 | .rts2_cache_umd/
65 | 
66 | # Optional REPL history
67 | .node_repl_history
68 | 
69 | # Output of 'npm pack'
70 | *.tgz
71 | 
72 | # Yarn Integrity file
73 | .yarn-integrity
74 | 
75 | # dotenv environment variable files
76 | .env
77 | .env.development.local
78 | .env.test.local
79 | .env.production.local
80 | .env.local
81 | 
82 | # parcel-bundler cache (https://parceljs.org/)
83 | .cache
84 | .parcel-cache
85 | 
86 | # Next.js build output
87 | .next
88 | out
89 | 
90 | # Nuxt.js build / generate output
91 | .nuxt
92 | dist
93 | 
94 | # Gatsby files
95 | .cache/
96 | # Comment in the public line in if your project uses Gatsby and not Next.js
97 | # https://nextjs.org/blog/next-9-1#public-directory-support
98 | # public
99 | 
100 | # vuepress build output
101 | .vuepress/dist
102 | 
103 | # vuepress v2.x temp and cache directory
104 | .temp
105 | .cache
106 | 
107 | # vitepress build output
108 | **/.vitepress/dist
109 | 
110 | # vitepress cache directory
111 | **/.vitepress/cache
112 | 
113 | # Docusaurus cache and generated files
114 | .docusaurus
115 | 
116 | # Serverless directories
117 | .serverless/
118 | 
119 | # FuseBox cache
120 | .fusebox/
121 | 
122 | # DynamoDB Local files
123 | .dynamodb/
124 | 
125 | # TernJS port file
126 | .tern-port
127 | 
128 | # Stores VSCode versions used for testing VSCode extensions
129 | .vscode-test
130 | 
131 | # yarn v2
132 | .yarn/cache
133 | .yarn/unplugged
134 | .yarn/build-state.yml
135 | .yarn/install-state.gz
136 | .pnp.*
137 | .vscode
138 | .chaider
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/recursive.ts:
--------------------------------------------------------------------------------
1 | /** Recursive chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { RecursiveChunk } from "../types/recursive";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface RecursiveChunkerConfig {
9 |   tokenizerOrTokenCounter?: string;
10 |   chunkSize?: number;
11 |   recipe?: string;
12 |   lang?: string;
13 |   minCharactersPerChunk?: number;
14 | }
15 | 
16 | export class RecursiveChunker extends CloudClient {
17 |   private readonly config: Required<RecursiveChunkerConfig>;
18 | 
19 |   constructor(apiKey: string, config: RecursiveChunkerConfig = {}) {
20 |     super({ apiKey });
21 |     this.config = {
22 |       tokenizerOrTokenCounter: config.tokenizerOrTokenCounter || "gpt2",
23 |       chunkSize: config.chunkSize || 512,
24 |       recipe: config.recipe || "default",
25 |       lang: config.lang || "en",
26 |       minCharactersPerChunk: config.minCharactersPerChunk || 12,
27 |     };
28 |   }
29 | 
30 |   async chunk(input: ChunkerInput): Promise<RecursiveChunk[]> {
31 |     const formData = new FormData();
32 | 
33 |     if (input.filepath) {
34 |       const fileContent = fs.readFileSync(input.filepath);
35 |       const fileName = path.basename(input.filepath) || 'file.txt';
36 |       formData.append("file", new Blob([fileContent]), fileName);
37 |     } else if (input.text) {
38 |       // JSON encode the text
39 |       formData.append("text", JSON.stringify(input.text));
40 |       // Append empty file to ensure multipart form
41 |       formData.append("file", new Blob(), "text_input.txt");
42 |     } else {
43 |       throw new Error("Either text or filepath must be provided");
44 |     }
45 |     formData.append("tokenizer_or_token_counter", this.config.tokenizerOrTokenCounter);
46 |     formData.append("chunk_size", this.config.chunkSize.toString());
47 |     formData.append("recipe", this.config.recipe);
48 |     formData.append("lang", this.config.lang);
49 |     formData.append("min_characters_per_chunk", this.config.minCharactersPerChunk.toString());
50 |     formData.append("return_type", "chunks");
51 | 
52 |     const data = await this.request("/v1/chunk/recursive", {
53 |       method: "POST",
54 |       body: formData,
55 |     });
56 | 
57 |     // Convert from snake_case to camelCase
58 |     const camelCaseData = data.map((chunk: any) => {
59 |       return {
60 |         text: chunk.text,
61 |         startIndex: chunk.start_index,
62 |         endIndex: chunk.end_index,
63 |         tokenCount: chunk.token_count,
64 |         embedding: chunk.embedding || undefined,
65 |         level: chunk.level,
66 |       };
67 |     });
68 | 
69 |     return camelCaseData.map((chunk: any) => RecursiveChunk.fromDict(chunk));
70 |   }
71 | 
72 |   async chunkBatch(inputs: ChunkerInput[]): Promise<RecursiveChunk[][]> {
73 |     return Promise.all(inputs.map(input => this.chunk(input)));
74 |   }
75 | }
--------------------------------------------------------------------------------
/packages/cloud/src/chunkers/neural.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Neural chunker that uses neural networks for intelligent chunking
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 | 
9 | export interface NeuralChunkerOptions {
10 |   /** Model to use (default: "mirth/chonky_modernbert_large_1") */
11 |   model?: string;
12 |   /** Minimum characters per chunk (default: 10) */
13 |   minCharactersPerChunk?: number;
14 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
15 |   apiKey?: string;
16 |   /** Base URL for API (default: "https://api.chonkie.ai") */
17 |   baseUrl?: string;
18 | }
19 | 
20 | interface ApiChunkResponse {
21 |   text: string;
22 |   start_index: number;
23 |   end_index: number;
24 |   token_count: number;
25 | }
26 | 
27 | interface NeuralChunkPayload extends Record<string, unknown> {
28 |   text?: string;
29 |   file?: { type: string; content: string };
30 |   embedding_model: string;
31 |   min_characters_per_chunk: number;
32 |   return_type: string;
33 | }
34 | 
35 | export class NeuralChunker extends CloudBaseChunker {
36 |   private readonly config: {
37 |     model: string;
38 |     minCharactersPerChunk: number;
39 |   };
40 | 
41 |   constructor(options: NeuralChunkerOptions = {}) {
42 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
43 |     if (!apiKey) {
44 |       throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
45 |     }
46 | 
47 |     super({ apiKey, baseUrl: options.baseUrl });
48 | 
49 |     this.config = {
50 |       model: options.model || 'mirth/chonky_modernbert_large_1',
51 |       minCharactersPerChunk: options.minCharactersPerChunk || 10,
52 |     };
53 |   }
54 | 
55 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
56 |     let fileRef = input.file;
57 | 
58 |     // If filepath is provided, upload it first to get a file reference
59 |     if (input.filepath) {
60 |       fileRef = await this.uploadFile(input.filepath);
61 |     }
62 | 
63 |     // Build the payload
64 |     const payload: NeuralChunkPayload = {
65 |       embedding_model: this.config.model,
66 |       min_characters_per_chunk: this.config.minCharactersPerChunk,
67 |       return_type: 'chunks',
68 |     };
69 | 
70 |     // Add either text or file to the payload
71 |     if (fileRef) {
72 |       payload.file = fileRef;
73 |     } else if (input.text) {
74 |       payload.text = input.text;
75 |     } else {
76 |       throw new Error('Either text, filepath, or file must be provided');
77 |     }
78 | 
79 |     const data = await this.request<ApiChunkResponse[]>('/v1/chunk/neural', {
80 |       method: 'POST',
81 |       body: payload,
82 |     });
83 | 
84 |     return data.map(chunk => new Chunk({
85 |       text: chunk.text,
86 |       startIndex: chunk.start_index,
87 |       endIndex: chunk.end_index,
88 |       tokenCount: chunk.token_count,
89 |     }));
90 |   }
91 | 
92 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
93 |     return Promise.all(inputs.map(input => this.chunk(input)));
94 |   }
95 | 
96 |   toString(): string {
97 |     return `NeuralChunker(model=${this.config.model})`;
98 |   }
99 | }
100 | 
--------------------------------------------------------------------------------
/legacy/chonkie/types/semantic.ts:
--------------------------------------------------------------------------------
1 | import { Sentence, SentenceData } from './sentence';
2 | import { SentenceChunk } from './sentence';
3 | 
4 | /**
5 |  * Represents a semantic sentence with metadata, including an embedding.
6 |  * Extends the base Sentence class.
7 |  */
8 | export interface SemanticSentenceData extends SentenceData {
9 |   embedding?: number[] | null;
10 | }
11 | 
12 | export class SemanticSentence extends Sentence {
13 |   /** The embedding vector for the sentence (array of numbers, or null if not present) */
14 |   public embedding: number[] | null;
15 | 
16 |   constructor(data: SemanticSentenceData) {
17 |     super(data);
18 |     this.embedding = data.embedding ?? null;
19 |   }
20 | 
21 |   /** Return the SemanticSentence as a dictionary-like object */
22 |   public toDict(): SemanticSentenceData {
23 |     return { ...super.toDict(), embedding: this.embedding ?? null };
24 |   }
25 | 
26 |   /** Create a SemanticSentence object from a dictionary */
27 |   public static fromDict(data: SemanticSentenceData): SemanticSentence {
28 |     // Defensive copy to avoid mutating input
29 |     const { embedding, ...rest } = data;
30 |     return new SemanticSentence({ ...rest, embedding: embedding ?? null });
31 |   }
32 | 
33 |   /** Return a string representation of the SemanticSentence */
34 |   public toString(): string {
35 |     return `SemanticSentence(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, embedding=${JSON.stringify(this.embedding)})`;
36 |   }
37 | }
38 | 
39 | /**
40 |  * Represents a semantic chunk with metadata, including a list of semantic sentences.
41 |  * Extends the base SentenceChunk class.
42 |  */
43 | export interface SemanticChunkData {
44 |   text: string;
45 |   startIndex: number;
46 |   endIndex: number;
47 |   tokenCount: number;
48 |   sentences: SemanticSentenceData[];
49 |   embedding?: number[];
50 | }
51 | 
52 | export class SemanticChunk extends SentenceChunk {
53 |   /** List of SemanticSentence objects in the chunk */
54 |   public sentences: SemanticSentence[];
55 | 
56 |   constructor(data: SemanticChunkData & { sentences: SemanticSentence[] }) {
57 |     super({
58 |       text: data.text,
59 |       startIndex: data.startIndex,
60 |       endIndex: data.endIndex,
61 |       tokenCount: data.tokenCount,
62 |       sentences: data.sentences,
63 |       embedding: data.embedding,
64 |     });
65 |     this.sentences = data.sentences;
66 |   }
67 | 
68 |   /** Return the SemanticChunk as a dictionary-like object */
69 |   public toDict(): SemanticChunkData {
70 |     const base = super.toDict() as SemanticChunkData;
71 |     return {
72 |       ...base,
73 |       sentences: this.sentences.map((s) => s.toDict()),
74 |     };
75 |   }
76 | 
77 |   /** Create a SemanticChunk object from a dictionary */
78 |   public static fromDict(data: SemanticChunkData): SemanticChunk {
79 |     const { sentences, ...rest } = data;
80 |     const semanticSentences = sentences.map((s) => SemanticSentence.fromDict(s));
81 |     return new SemanticChunk({ ...rest, sentences: semanticSentences });
82 |   }
83 | 
84 |   /** Return a string representation of the SemanticChunk */
85 |   public toString(): string {
86 |     return `SemanticChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, sentences=${JSON.stringify(this.sentences)})`;
87 |   }
88 | }
89 | 
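// Usage sketch (editorial addition, not part of the file above): toDict/fromDict
// round-trip a chunk through plain, JSON-serializable objects:
//
//   const dict = chunk.toDict();                        // SemanticChunkData
//   const restored = SemanticChunk.fromDict(dict);      // rebuilds sentence objects
//   restored.sentences[0] instanceof SemanticSentence;  // true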
--------------------------------------------------------------------------------
/packages/cloud/src/chunkers/code.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Code chunker that splits code into structurally meaningful chunks
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 | 
9 | export interface CodeChunkerOptions {
10 |   /** Tokenizer to use (default: "gpt2") */
11 |   tokenizer?: string;
12 |   /** Maximum tokens per chunk (default: 1500) */
13 |   chunkSize?: number;
14 |   /** Programming language (required, e.g., "python", "javascript", "typescript") */
15 |   language: string;
16 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
17 |   apiKey?: string;
18 |   /** Base URL for API (default: "https://api.chonkie.ai") */
19 |   baseUrl?: string;
20 | }
21 | 
22 | interface ApiChunkResponse {
23 |   text: string;
24 |   start_index: number;
25 |   end_index: number;
26 |   token_count: number;
27 | }
28 | 
29 | interface CodeChunkPayload extends Record<string, unknown> {
30 |   text?: string;
31 |   file?: { type: string; content: string };
32 |   tokenizer_or_token_counter: string;
33 |   chunk_size: number;
34 |   language: string;
35 | }
36 | 
37 | export class CodeChunker extends CloudBaseChunker {
38 |   private readonly config: {
39 |     tokenizer: string;
40 |     chunkSize: number;
41 |     language: string;
42 |   };
43 | 
44 |   constructor(options: CodeChunkerOptions) {
45 |     if (!options.language) {
46 |       throw new Error('Language is required for code chunking');
47 |     }
48 | 
49 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
50 |     if (!apiKey) {
51 |       throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
52 |     }
53 | 
54 |     super({ apiKey, baseUrl: options.baseUrl });
55 | 
56 |     this.config = {
57 |       tokenizer: options.tokenizer || 'gpt2',
58 |       chunkSize: options.chunkSize || 1500,
59 |       language: options.language,
60 |     };
61 |   }
62 | 
63 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
64 |     let fileRef = input.file;
65 | 
66 |     // If filepath is provided, upload it first to get a file reference
67 |     if (input.filepath) {
68 |       fileRef = await this.uploadFile(input.filepath);
69 |     }
70 | 
71 |     // Build the payload
72 |     const payload: CodeChunkPayload = {
73 |       tokenizer_or_token_counter: this.config.tokenizer,
74 |       chunk_size: this.config.chunkSize,
75 |       language: this.config.language,
76 |     };
77 | 
78 |     // Add either text or file to the payload
79 |     if (fileRef) {
80 |       payload.file = fileRef;
81 |     } else if (input.text) {
82 |       payload.text = input.text;
83 |     } else {
84 |       throw new Error('Either text, filepath, or file must be provided');
85 |     }
86 | 
87 |     const data = await this.request<ApiChunkResponse[]>('/v1/chunk/code', {
88 |       method: 'POST',
89 |       body: payload,
90 |     });
91 | 
92 |     return data.map(chunk => new Chunk({
93 |       text: chunk.text,
94 |       startIndex: chunk.start_index,
95 |       endIndex: chunk.end_index,
96 |       tokenCount: chunk.token_count,
97 |     }));
98 |   }
99 | 
100 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
101 |     return Promise.all(inputs.map(input => this.chunk(input)));
102 |   }
103 | 
104 |   toString(): string {
105 |     return `CodeChunker(language=${this.config.language}, chunkSize=${this.config.chunkSize})`;
106 |   }
107 | }
108 | 
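// Usage sketch (editorial addition, not part of the file above; assumes
// CHONKIE_API_KEY is set):
//
//   const chunker = new CodeChunker({ language: 'typescript', chunkSize: 1500 });
//   const chunks = await chunker.chunk({ text: sourceCode }); // sourceCode: string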
--------------------------------------------------------------------------------
/packages/cloud/src/chunkers/token.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Token chunker that splits text into fixed-size token chunks
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 | 
9 | export interface TokenChunkerOptions {
10 |   /** Tokenizer to use (default: "gpt2") */
11 |   tokenizer?: string;
12 |   /** Maximum tokens per chunk (default: 512) */
13 |   chunkSize?: number;
14 |   /** Number of tokens to overlap between chunks (default: 0) */
15 |   chunkOverlap?: number;
16 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
17 |   apiKey?: string;
18 |   /** Base URL for API (default: "https://api.chonkie.ai") */
19 |   baseUrl?: string;
20 | }
21 | 
22 | interface ApiChunkResponse {
23 |   text: string;
24 |   start_index: number;
25 |   end_index: number;
26 |   token_count: number;
27 | }
28 | 
29 | interface TokenChunkPayload extends Record<string, unknown> {
30 |   text?: string;
31 |   file?: { type: string; content: string };
32 |   tokenizer_or_token_counter: string;
33 |   chunk_size: number;
34 |   chunk_overlap: number;
35 |   return_type: string;
36 | }
37 | 
38 | export class TokenChunker extends CloudBaseChunker {
39 |   private readonly config: {
40 |     tokenizer: string;
41 |     chunkSize: number;
42 |     chunkOverlap: number;
43 |   };
44 | 
45 |   constructor(options: TokenChunkerOptions = {}) {
46 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
47 |     if (!apiKey) {
48 |       throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
49 |     }
50 | 
51 |     super({ apiKey, baseUrl: options.baseUrl });
52 | 
53 |     this.config = {
54 |       tokenizer: options.tokenizer || 'gpt2',
55 |       chunkSize: options.chunkSize || 512,
56 |       chunkOverlap: options.chunkOverlap || 0,
57 |     };
58 |   }
59 | 
60 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
61 |     let fileRef = input.file;
62 | 
63 |     // If filepath is provided, upload it first to get a file reference
64 |     if (input.filepath) {
65 |       fileRef = await this.uploadFile(input.filepath);
66 |     }
67 | 
68 |     // Build the payload
69 |     const payload: TokenChunkPayload = {
70 |       tokenizer_or_token_counter: this.config.tokenizer,
71 |       chunk_size: this.config.chunkSize,
72 |       chunk_overlap: this.config.chunkOverlap,
73 |       return_type: 'chunks',
74 |     };
75 | 
76 |     // Add either text or file to the payload
77 |     if (fileRef) {
78 |       payload.file = fileRef;
79 |     } else if (input.text) {
80 |       payload.text = input.text;
81 |     } else {
82 |       throw new Error('Either text, filepath, or file must be provided');
83 |     }
84 | 
85 |     const data = await this.request<ApiChunkResponse[]>('/v1/chunk/token', {
86 |       method: 'POST',
87 |       body: payload,
88 |     });
89 | 
90 |     // Convert API response to Chunk objects
91 |     return data.map(chunk => new Chunk({
92 |       text: chunk.text,
93 |       startIndex: chunk.start_index,
94 |       endIndex: chunk.end_index,
95 |       tokenCount: chunk.token_count,
96 |     }));
97 |   }
98 | 
99 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
100 |     return Promise.all(inputs.map(input => this.chunk(input)));
101 |   }
102 | 
103 |   toString(): string {
104 |     return `TokenChunker(tokenizer=${this.config.tokenizer}, chunkSize=${this.config.chunkSize}, overlap=${this.config.chunkOverlap})`;
105 |   }
106 | }
107 | 
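// Usage sketch (editorial addition, not part of the file above; assumes
// CHONKIE_API_KEY is set):
//
//   const chunker = new TokenChunker({ chunkSize: 512, chunkOverlap: 64 });
//   const chunks = await chunker.chunk({ text: longDocument }); // longDocument: string
//   const batches = await chunker.chunkBatch([{ text: 'a' }, { text: 'b' }]);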
--------------------------------------------------------------------------------
/packages/cloud/src/refineries/overlap.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Overlap refinery that adds context overlap to existing chunks
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker } from '@/base';
8 | 
9 | export interface OverlapRefineryOptions {
10 |   /** Tokenizer to use (default: "character") */
11 |   tokenizer?: string;
12 |   /** Context size as fraction or token count (default: 0.25) */
13 |   contextSize?: number;
14 |   /** Mode for overlap (default: "token") */
15 |   mode?: 'token' | 'recursive';
16 |   /** Method for adding context (default: "suffix") */
17 |   method?: 'suffix' | 'prefix';
18 |   /** Recipe name for recursive mode (default: "default") */
19 |   recipe?: string;
20 |   /** Language for recipe (default: "en") */
21 |   lang?: string;
22 |   /** Merge overlapping chunks (default: true) */
23 |   merge?: boolean;
24 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
25 |   apiKey?: string;
26 |   /** Base URL for API (default: "https://api.chonkie.ai") */
27 |   baseUrl?: string;
28 | }
29 | 
30 | interface ChunkData {
31 |   text: string;
32 |   start_index: number;
33 |   end_index: number;
34 |   token_count: number;
35 | }
36 | 
37 | /**
38 |  * Post-processes chunks by adding contextual overlap.
39 |  */
40 | export class OverlapRefinery extends CloudBaseChunker {
41 |   private readonly config: {
42 |     tokenizer: string;
43 |     contextSize: number;
44 |     mode: 'token' | 'recursive';
45 |     method: 'suffix' | 'prefix';
46 |     recipe: string;
47 |     lang: string;
48 |     merge: boolean;
49 |   };
50 | 
51 |   constructor(options: OverlapRefineryOptions = {}) {
52 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
53 |     if (!apiKey) {
54 |       throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
55 |     }
56 | 
57 |     super({ apiKey, baseUrl: options.baseUrl });
58 | 
59 |     this.config = {
60 |       tokenizer: options.tokenizer || 'character',
61 |       contextSize: options.contextSize ?? 0.25,
62 |       mode: options.mode || 'token',
63 |       method: options.method || 'suffix',
64 |       recipe: options.recipe || 'default',
65 |       lang: options.lang || 'en',
66 |       merge: options.merge ?? true,
67 |     };
68 |   }
69 | 
70 |   /**
71 |    * Add overlap context to existing chunks.
72 |    *
73 |    * @param chunks - Array of chunks to add overlap to
74 |    * @returns Array of chunks with overlap added
75 |    */
76 |   async refine(chunks: Chunk[]): Promise<Chunk[]> {
77 |     const chunkData = chunks.map(chunk => ({
78 |       text: chunk.text,
79 |       start_index: chunk.startIndex,
80 |       end_index: chunk.endIndex,
81 |       token_count: chunk.tokenCount,
82 |     }));
83 | 
84 |     const response = await this.request<ChunkData[]>('/v1/refine/overlap', {
85 |       method: 'POST',
86 |       body: {
87 |         chunks: chunkData,
88 |         tokenizer_or_token_counter: this.config.tokenizer,
89 |         context_size: this.config.contextSize,
90 |         mode: this.config.mode,
91 |         method: this.config.method,
92 |         recipe: this.config.recipe,
93 |         lang: this.config.lang,
94 |         merge: this.config.merge,
95 |       },
96 |     });
97 | 
98 |     return response.map(chunk => new Chunk({
99 |       text: chunk.text,
100 |       startIndex: chunk.start_index,
101 |       endIndex: chunk.end_index,
102 |       tokenCount: chunk.token_count,
103 |     }));
104 |   }
105 | 
106 |   toString(): string {
107 |     return `OverlapRefinery(mode=${this.config.mode}, contextSize=${this.config.contextSize})`;
108 |   }
109 | }
110 | 
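// Usage sketch (editorial addition, not part of the file above): refineries
// compose with any chunker's output, e.g. after a TokenChunker:
//
//   const refinery = new OverlapRefinery({ contextSize: 0.25, method: 'suffix' });
//   const withOverlap = await refinery.refine(await chunker.chunk({ text }));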
--------------------------------------------------------------------------------
/packages/core/src/tokenizer.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Simple character-based tokenizer for text chunking.
3 |  *
4 |  * This tokenizer treats each character as a single token, providing
5 |  * a straightforward and predictable tokenization strategy.
6 |  *
7 |  * For advanced tokenization (GPT-2, BERT, etc.), use the static `create()` method
8 |  * with the @chonkiejs/token package installed.
9 |  */
10 | export class Tokenizer {
11 |   /**
12 |    * Create a tokenizer instance.
13 |    *
14 |    * @param model - Tokenizer model to use. Use 'character' (default) for character-based,
15 |    *                or specify a HuggingFace model like 'gpt2', 'bert-base-uncased', etc.
16 |    * @returns Promise resolving to a Tokenizer instance
17 |    *
18 |    * @example
19 |    * // Character-based (no dependencies)
20 |    * const tokenizer = await Tokenizer.create();
21 |    * const tokenizer = await Tokenizer.create('character');
22 |    *
23 |    * @example
24 |    * // HuggingFace models (requires @chonkiejs/token)
25 |    * const tokenizer = await Tokenizer.create('gpt2');
26 |    * const tokenizer = await Tokenizer.create('Xenova/gpt-4');
27 |    */
28 |   static async create(model: string = 'character'): Promise<Tokenizer> {
29 |     if (model === 'character') {
30 |       return new Tokenizer();
31 |     }
32 | 
33 |     // Try to dynamically import @chonkiejs/token
34 |     try {
35 |       // Use dynamic import with string to avoid TypeScript resolution
36 |       const tokenPackage = await import('@chonkiejs/token' as any);
37 |       const { HuggingFaceTokenizer } = tokenPackage;
38 |       return await HuggingFaceTokenizer.create(model);
39 |     } catch (error) {
40 |       const errorMessage = error instanceof Error ? error.message : '';
41 | 
42 |       // Check if it's a module not found error
43 |       if (errorMessage.includes('Cannot find') || errorMessage.includes('MODULE_NOT_FOUND')) {
44 |         throw new Error(`
45 | To use "${model}" tokenizer, install @chonkiejs/token:
46 | 
47 |   npm install @chonkiejs/token
48 | 
49 | Or use character-based tokenization (no dependencies):
50 | 
51 |   const tokenizer = await Tokenizer.create();
52 |   const tokenizer = await Tokenizer.create('character');
53 | 
54 | Available with @chonkiejs/token: gpt2, bert-base-uncased, Xenova/gpt-4, etc.
55 |         `.trim());
56 |       }
57 | 
58 |       // Re-throw other errors
59 |       throw error;
60 |     }
61 |   }
62 | 
63 |   /**
64 |    * Count the number of tokens in the given text.
65 |    * For character-based tokenization, this is simply the length of the text.
66 |    *
67 |    * @param text - The text to count tokens for
68 |    * @returns The number of tokens (characters) in the text
69 |    */
70 |   countTokens(text: string): number {
71 |     return text.length;
72 |   }
73 | 
74 |   /**
75 |    * Encode text into token IDs.
76 |    * For character-based tokenization, returns character codes.
77 |    *
78 |    * @param text - The text to encode
79 |    * @returns Array of character codes
80 |    */
81 |   encode(text: string): number[] {
82 |     return Array.from(text).map(char => char.charCodeAt(0));
83 |   }
84 | 
85 |   /**
86 |    * Decode token IDs back into text.
87 |    * For character-based tokenization, converts character codes back to string.
88 |    *
89 |    * @param tokens - Array of token IDs (character codes)
90 |    * @returns The decoded text
91 |    */
92 |   decode(tokens: number[]): string {
93 |     return String.fromCharCode(...tokens);
94 |   }
95 | 
96 |   /**
97 |    * Decode a batch of token arrays.
98 |    *
99 |    * @param tokensBatch - Array of token arrays
100 |    * @returns Array of decoded texts
101 |    */
102 |   decodeBatch(tokensBatch: number[][]): string[] {
103 |     return tokensBatch.map(tokens => this.decode(tokens));
104 |   }
105 | }
106 | 
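// Usage sketch (editorial addition, not part of the file above): with the default
// character tokenizer, token counts are simply string lengths:
//
//   const tokenizer = await Tokenizer.create();
//   tokenizer.countTokens('hello');           // 5
//   tokenizer.decode(tokenizer.encode('hi')); // 'hi'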
 */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { SentenceChunk } from "../types/sentence";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface SentenceChunkerConfig {
9 |   tokenizerOrTokenCounter?: string;
10 |   chunkSize?: number;
11 |   chunkOverlap?: number;
12 |   minSentencesPerChunk?: number;
13 |   minCharactersPerSentence?: number;
14 |   approximate?: boolean;
15 |   delim?: string | string[];
16 |   includeDelim?: "prev" | "next" | null;
17 | }
18 | 
19 | export class SentenceChunker extends CloudClient {
20 |   private readonly config: Required<SentenceChunkerConfig>;
21 | 
22 |   constructor(apiKey: string, config: SentenceChunkerConfig = {}) {
23 |     super({ apiKey });
24 |     this.config = {
25 |       tokenizerOrTokenCounter: config.tokenizerOrTokenCounter || "gpt2",
26 |       chunkSize: config.chunkSize || 512,
27 |       chunkOverlap: config.chunkOverlap || 0,
28 |       minSentencesPerChunk: config.minSentencesPerChunk || 1,
29 |       minCharactersPerSentence: config.minCharactersPerSentence || 12,
30 |       approximate: config.approximate ?? false,
31 |       delim: config.delim || [".", "!", "?", "\n"],
32 |       includeDelim: config.includeDelim ?? "prev",
33 |     };
34 |   }
35 | 
36 |   async chunk(input: ChunkerInput): Promise<SentenceChunk[]> {
37 |     const formData = new FormData();
38 | 
39 |     if (input.filepath) {
40 |       const fileContent = fs.readFileSync(input.filepath);
41 |       const fileName = path.basename(input.filepath) || 'file.txt';
42 |       formData.append("file", new Blob([fileContent]), fileName);
43 |     } else if (input.text) {
44 |       // JSON encode the text
45 |       formData.append("text", JSON.stringify(input.text));
46 |       // Append empty file to ensure multipart form
47 |       formData.append("file", new Blob(), "text_input.txt");
48 |     } else {
49 |       throw new Error("Either text or filepath must be provided");
50 |     }
51 |     // Append all config options to the form data
52 |     formData.append("tokenizer_or_token_counter", this.config.tokenizerOrTokenCounter);
53 |     formData.append("chunk_size", this.config.chunkSize.toString());
54 |     formData.append("chunk_overlap", this.config.chunkOverlap.toString());
55 |     formData.append("min_sentences_per_chunk", this.config.minSentencesPerChunk.toString());
56 |     formData.append("min_characters_per_sentence", this.config.minCharactersPerSentence.toString());
57 |     formData.append("approximate", this.config.approximate.toString());
58 |     formData.append("delim", JSON.stringify(this.config.delim));
59 |     formData.append("include_delim", this.config.includeDelim || "prev");
60 |     formData.append("return_type", "chunks");
61 | 
62 |     const data = await this.request("/v1/chunk/sentence", {
63 |       method: "POST",
64 |       body: formData,
65 |     });
66 | 
67 |     // Convert from snake_case to camelCase
68 |     const camelCaseData = data.map((chunk: any) => {
69 |       return {
70 |         text: chunk.text,
71 |         startIndex: chunk.start_index,
72 |         endIndex: chunk.end_index,
73 |         tokenCount: chunk.token_count,
74 |         sentences: chunk.sentences.map((sentence: any) => {
75 |           return {
76 |             text: sentence.text,
77 |             startIndex: sentence.start_index,
78 |             endIndex: sentence.end_index,
79 |             tokenCount: sentence.token_count,
80 |             embedding: sentence.embedding || undefined,
81 |           };
82 |         }),
83 |       };
84 |     });
85 | 
86 |     return camelCaseData.map((chunk: any) => SentenceChunk.fromDict(chunk));
87 |   }
88 | 
89 |   async chunkBatch(inputs: ChunkerInput[]): Promise<SentenceChunk[][]> {
90 |     return Promise.all(inputs.map(input => this.chunk(input)));
91 |   }
92 | }
--------------------------------------------------------------------------------
/packages/core/README.md:
--------------------------------------------------------------------------------
1 | <div align="center">
2 | 3 | ![Chonkie Logo](../../assets/chonkie_logo_br_transparent_bg.png) 4 | 5 | # @chonkiejs/core 6 | 7 | _Core chunking library for Chonkie - lightweight and efficient text chunking with zero dependencies._ 8 | 9 | [![npm version](https://img.shields.io/npm/v/@chonkiejs/core)](https://www.npmjs.com/package/@chonkiejs/core) 10 | [![npm license](https://img.shields.io/npm/l/@chonkiejs/core)](https://www.npmjs.com/package/@chonkiejs/core) 11 | [![Documentation](https://img.shields.io/badge/docs-DOCS.md-blue.svg)](./DOCS.md) 12 | [![GitHub](https://img.shields.io/badge/github-chonkie--ts-black.svg?logo=github)](https://github.com/chonkie-inc/chonkie-ts) 13 | 14 |
15 | 16 | ## Features 17 | ✨ **Simple & Clean API** - Easy to use OOP design
18 | ⚡ **Zero Dependencies** - Minimal, lightweight, fast
19 | 🔤 **Character-based** - Simple tokenization (1 char = 1 token)
20 | 🎯 **Recursive Chunking** - Smart hierarchical text splitting
21 | 📦 **TypeScript First** - Full type safety with TypeScript
22 | 23 | ## Installation 24 | 25 | Install with `npm`: 26 | ```bash 27 | npm i @chonkiejs/core 28 | ``` 29 | 30 | Install with `pnpm`: 31 | ```bash 32 | pnpm add @chonkiejs/core 33 | ``` 34 | 35 | Install with `yarn`: 36 | ```bash 37 | yarn add @chonkiejs/core 38 | ``` 39 | 40 | Install with `bun`: 41 | ```bash 42 | bun add @chonkiejs/core 43 | ``` 44 | 45 | ## Quick Start 46 | 47 | ```typescript 48 | import { RecursiveChunker } from '@chonkiejs/core'; 49 | 50 | // Create a chunker 51 | const chunker = await RecursiveChunker.create({ 52 | chunkSize: 512, 53 | minCharactersPerChunk: 24 54 | }); 55 | 56 | // Chunk your text 57 | const chunks = await chunker.chunk('Your text here...'); 58 | 59 | // Use the chunks 60 | for (const chunk of chunks) { 61 | console.log(chunk.text); 62 | console.log(`Tokens: ${chunk.tokenCount}`); 63 | } 64 | ``` 65 | 66 | ## Available Chunkers 67 | 68 | | Name | Description | 69 | |------|-------------| 70 | | `RecursiveChunker` | Recursively splits text using hierarchical rules (paragraphs → sentences → punctuation → words → characters). Each level only activates if chunks exceed the configured size. | 71 | | `TokenChunker` | Splits text into fixed-size token chunks with optional overlap. Uses character-based tokenization by default, or HuggingFace models with @chonkiejs/token. | 72 | 73 | For detailed API documentation, configuration options, and advanced usage, see [DOCS.md](./DOCS.md). 74 | 75 | ## Contributing 76 | 77 | Want to help grow Chonkie? Check out [CONTRIBUTING.md](../../CONTRIBUTING.md) to get started! Whether you're fixing bugs, adding features, improving docs, or simply leaving a ⭐️ on the repo, every contribution helps make Chonkie a better CHONK for everyone. 78 | 79 | Remember: No contribution is too small for this tiny hippo! 80 | 81 | ## Acknowledgements 82 | 83 | Chonkie would like to CHONK its way through a special thanks to all the users and contributors who have helped make this library what it is today! Your feedback, issue reports, and improvements have helped make Chonkie the CHONKIEST it can be. 84 | 85 | And of course, special thanks to [Moto Moto](https://www.youtube.com/watch?v=I0zZC4wtqDQ&t=5s) for endorsing Chonkie with his famous quote: 86 | > "I like them big, I like them chonkie in TypeScript" ~ Moto Moto... definitely did not say this 87 | 88 | ## Citation 89 | 90 | If you use Chonkie in your research, please cite it as follows: 91 | 92 | ```bibtex 93 | @software{chonkie2025, 94 | author = {Bhavnick Minhas and Shreyash Nigam}, 95 | title = {Chonkie: A no-nonsense fast, lightweight, and efficient text chunking library}, 96 | year = {2025}, 97 | publisher = {GitHub}, 98 | howpublished = {\url{https://github.com/chonkie-inc}}, 99 | } 100 | ``` 101 | -------------------------------------------------------------------------------- /packages/core/examples/recursive.example.ts: -------------------------------------------------------------------------------- 1 | import { RecursiveChunker, RecursiveRules } from '../src'; 2 | 3 | async function main() { 4 | console.log('🦛 Chonkie RecursiveChunker Example\n'); 5 | console.log('='.repeat(60)); 6 | 7 | // Example 1: Basic usage with default settings 8 | console.log('\n📝 Example 1: Basic Chunking\n'); 9 | 10 | const chunker = await RecursiveChunker.create({ 11 | chunkSize: 100, 12 | minCharactersPerChunk: 20 13 | }); 14 | 15 | const text = ` 16 | Chonkie is a powerful text chunking library. It helps you break down large documents into manageable pieces. 
17 | 
18 | The library uses a recursive approach. It starts by splitting on paragraphs, then sentences, then punctuation, and finally words.
19 | 
20 | This hierarchical method ensures that chunks are semantically meaningful. Each chunk respects the configured size limits while maintaining context.
21 |   `.trim();
22 | 
23 |   const chunks = await chunker.chunk(text);
24 | 
25 |   console.log(`Input text length: ${text.length} characters`);
26 |   console.log(`Number of chunks created: ${chunks.length}\n`);
27 | 
28 |   chunks.forEach((chunk, index) => {
29 |     console.log(`Chunk ${index + 1}:`);
30 |     console.log(`  Position: [${chunk.startIndex}:${chunk.endIndex}]`);
31 |     console.log(`  Token count: ${chunk.tokenCount}`);
32 |     console.log(`  Text: "${chunk.text.substring(0, 60)}${chunk.text.length > 60 ? '...' : ''}"`);
33 |     console.log();
34 |   });
35 | 
36 |   // Example 2: Custom rules
37 |   console.log('='.repeat(60));
38 |   console.log('\n📝 Example 2: Custom Rules (Paragraphs Only)\n');
39 | 
40 |   const customChunker = await RecursiveChunker.create({
41 |     chunkSize: 150,
42 |     rules: new RecursiveRules({
43 |       levels: [
44 |         { delimiters: ['\n\n'] },  // Only split on paragraphs
45 |         { whitespace: true },      // Then words
46 |         {}                         // Then characters
47 |       ]
48 |     })
49 |   });
50 | 
51 |   const paragraphText = `First paragraph with some content.
52 | 
53 | Second paragraph with more information that needs to be chunked properly.
54 | 
55 | Third paragraph concludes the example.`;
56 | 
57 |   const customChunks = await customChunker.chunk(paragraphText);
58 | 
59 |   console.log(`Input text length: ${paragraphText.length} characters`);
60 |   console.log(`Number of chunks: ${customChunks.length}\n`);
61 | 
62 |   customChunks.forEach((chunk, index) => {
63 |     console.log(`Chunk ${index + 1}: ${chunk.tokenCount} tokens`);
64 |     console.log(`"${chunk.text}"`);
65 |     console.log();
66 |   });
67 | 
68 |   // Example 3: Very long text
69 |   console.log('='.repeat(60));
70 |   console.log('\n📝 Example 3: Long Text Handling\n');
71 | 
72 |   const longText = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. '.repeat(20);
73 |   const longChunker = await RecursiveChunker.create({ chunkSize: 100 });
74 |   const longChunks = await longChunker.chunk(longText);
75 | 
76 |   console.log(`Input text length: ${longText.length} characters`);
77 |   console.log(`Number of chunks: ${longChunks.length}`);
78 |   console.log(`Average chunk size: ${Math.round(longText.length / longChunks.length)} characters`);
79 |   console.log(`Max chunk tokens: ${Math.max(...longChunks.map(c => c.tokenCount))}`);
80 |   console.log(`Min chunk tokens: ${Math.min(...longChunks.map(c => c.tokenCount))}`);
81 | 
82 |   // Example 4: Verification
83 |   console.log('\n' + '='.repeat(60));
84 |   console.log('\n✅ Verification: Text Reconstruction\n');
85 | 
86 |   const reconstructed = chunks.map(c => c.text).join('');
87 |   const matches = reconstructed === text;
88 | 
89 |   console.log(`Original length: ${text.length}`);
90 |   console.log(`Reconstructed length: ${reconstructed.length}`);
91 |   console.log(`Reconstruction matches: ${matches ? '✅ Yes' : '❌ No'}`);
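  // Sanity check (a sketch assuming the default character tokenizer,
  // where one token equals one character):
  console.assert(
    chunks.every(c => c.tokenCount === c.text.length),
    'tokenCount should equal text.length for character tokenization'
  );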
92 | 
93 |   console.log('\n' + '='.repeat(60));
94 |   console.log('\n🎉 Example completed!\n');
95 | }
96 | 
97 | main().catch(console.error);
98 | 
--------------------------------------------------------------------------------
/packages/cloud/src/chunkers/late.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Late chunker that uses recursive chunking with embeddings
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 | 
9 | export interface LateChunkerOptions {
10 |   /** Embedding model to use (default: "all-MiniLM-L6-v2") */
11 |   embeddingModel?: string;
12 |   /** Maximum tokens per chunk (default: 512) */
13 |   chunkSize?: number;
14 |   /** Recipe name (default: "default") */
15 |   recipe?: string;
16 |   /** Language for recipe (default: "en") */
17 |   lang?: string;
18 |   /** Minimum characters per chunk (default: 24) */
19 |   minCharactersPerChunk?: number;
20 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
21 |   apiKey?: string;
22 |   /** Base URL for API (default: "https://api.chonkie.ai") */
23 |   baseUrl?: string;
24 | }
25 | 
26 | interface ApiChunkResponse {
27 |   text: string;
28 |   start_index: number;
29 |   end_index: number;
30 |   token_count: number;
31 |   embedding?: number[];
32 | }
33 | 
34 | interface LateChunkPayload extends Record<string, unknown> {
35 |   text?: string;
36 |   file?: { type: string; content: string };
37 |   embedding_model: string;
38 |   chunk_size: number;
39 |   recipe: string;
40 |   lang: string;
41 |   min_characters_per_chunk: number;
42 | }
43 | 
44 | export class LateChunker extends CloudBaseChunker {
45 |   private readonly config: {
46 |     embeddingModel: string;
47 |     chunkSize: number;
48 |     recipe: string;
49 |     lang: string;
50 |     minCharactersPerChunk: number;
51 |   };
52 | 
53 |   constructor(options: LateChunkerOptions = {}) {
54 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
55 |     if (!apiKey) {
56 |       throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
57 |     }
58 | 
59 |     super({ apiKey, baseUrl: options.baseUrl });
60 | 
61 |     this.config = {
62 |       embeddingModel: options.embeddingModel || 'all-MiniLM-L6-v2',
63 |       chunkSize: options.chunkSize || 512,
64 |       recipe: options.recipe || 'default',
65 |       lang: options.lang || 'en',
66 |       minCharactersPerChunk: options.minCharactersPerChunk || 24,
67 |     };
68 |   }
69 | 
70 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
71 |     let fileRef = input.file;
72 | 
73 |     // If filepath is provided, upload it first to get a file reference
74 |     if (input.filepath) {
75 |       fileRef = await this.uploadFile(input.filepath);
76 |     }
77 | 
78 |     // Build the payload
79 |     const payload: LateChunkPayload = {
80 |       embedding_model: this.config.embeddingModel,
81 |       chunk_size: this.config.chunkSize,
82 |       recipe: this.config.recipe,
83 |       lang: this.config.lang,
84 |       min_characters_per_chunk: this.config.minCharactersPerChunk,
85 |     };
86 | 
87 |     // Add either text or file to the payload
88 |     if (fileRef) {
89 |       payload.file = fileRef;
90 |     } else if (input.text) {
91 |       payload.text = input.text;
92 |     } else {
93 |       throw new Error('Either text, filepath, or file must be provided');
94 |     }
95 | 
96 |     const data = await this.request<ApiChunkResponse[]>('/v1/chunk/late', {
97 |       method: 'POST',
98 |       body: payload,
99 |     });
100 | 
101 |     return data.map(chunk => new Chunk({
102 |       text: chunk.text,
103 |       startIndex: chunk.start_index,
104 |       endIndex: chunk.end_index,
105 |       tokenCount: chunk.token_count,
106 |     }));
107 |   }
108 | 
109 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
110 |     return Promise.all(inputs.map(input => this.chunk(input)));
111 |   }
112 | 
113 |   toString(): string {
114 |     return `LateChunker(embeddingModel=${this.config.embeddingModel}, chunkSize=${this.config.chunkSize}, recipe=${this.config.recipe})`;
115 |   }
116 | }
117 | 
--------------------------------------------------------------------------------
/packages/cloud/src/utils.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Utility functions for cloud package
3 |  */
4 | 
5 | /**
6 |  * Format API error messages with helpful context and instructions
7 |  */
8 | export function formatApiError(
9 |   statusCode: number,
10 |   errorMessage: string,
11 |   endpoint: string
12 | ): string {
13 |   const baseMessage = `API Error (${statusCode}): ${errorMessage}`;
14 | 
15 |   let helpText = '';
16 | 
17 |   // Provide specific help based on error type
18 |   if (statusCode === 401 || errorMessage.toLowerCase().includes('invalid api key')) {
19 |     helpText = `
20 | Please check your API key:
21 |   - Ensure CHONKIE_API_KEY environment variable is set correctly
22 |   - Or pass apiKey in the constructor options
23 |   - Verify your key at https://api.chonkie.ai/dashboard`;
24 |   } else if (statusCode === 403) {
25 |     helpText = `
26 | You don't have permission to access this resource.
27 |   - Check your API key permissions
28 |   - Contact support if you believe this is an error`;
29 |   } else if (statusCode === 429) {
30 |     helpText = `
31 | Rate limit exceeded. Please:
32 |   - Wait a moment before retrying
33 |   - Check your usage limits at https://api.chonkie.ai/dashboard`;
34 |   } else if (statusCode === 404) {
35 |     helpText = `
36 | Endpoint not found: ${endpoint}
37 |   - Verify you're using the latest version of @chonkiejs/cloud
38 |   - Check the API documentation`;
39 |   } else if (statusCode >= 500) {
40 |     helpText = `
41 | Server error on api.chonkie.ai
42 |   - This is likely a temporary issue
43 |   - Try again in a few moments
44 |   - Check status at https://status.chonkie.ai (if available)`;
45 |   } else {
46 |     helpText = `
47 | Unexpected error occurred.`;
48 |   }
49 | 
50 |   const footer = `
51 | 
52 | If this error persists:
53 |   - Open an issue: https://github.com/chonkie-inc/chonkie-ts/issues
54 |   - Contact maintainer: bhavnick@chonkie.ai
55 |   - Include the error message and what you were trying to do`;
56 | 
57 |   return baseMessage + helpText + footer;
58 | }
59 | 
60 | /**
61 |  * Common API error types
62 |  */
63 | export const API_ERRORS = {
64 |   INVALID_API_KEY: 'Invalid API key',
65 |   RATE_LIMIT: 'Rate limit exceeded',
66 |   SERVER_ERROR: 'Server error',
67 |   NOT_FOUND: 'Endpoint not found',
68 |   FORBIDDEN: 'Access forbidden',
69 | } as const;
70 | 
71 | /**
72 |  * File reference type for API requests
73 |  */
74 | export interface FileReference {
75 |   /** Type of file reference */
76 |   type: 'document' | 'base64';
77 |   /** Content - either document name or base64 string */
78 |   content: string;
79 | }
80 | 
81 | /**
82 |  * Response from file upload endpoint
83 |  */
84 | export interface FileUploadResponse {
85 |   /** The document name/ID that can be used in subsequent API calls */
86 |   document: string;
87 |   /** Optional additional metadata */
88 |   [key: string]: unknown;
89 | }
90 | 
91 | /**
92 |  * Create a file reference object for use in JSON API requests
93 |  *
94 |  * @param type - Type of file reference ('document' or 'base64')
95 |  * @param content - The document name or base64 encoded string
96 |  * @returns FileReference object that can be included in API request bodies
97 |  *
98 |  * @example
99 |  * ```typescript
100 |  * // Using a document reference
101 |  * const fileRef = createFileReference('document', 'my-uploaded-file.pdf');
102 |  *
103 |  * // Using base64
104 |  * const base64Data = btoa('file contents');
105 |  * const fileRef = createFileReference('base64', base64Data);
106 |  * ```
107 |  */
108 | export function createFileReference(type: 'document' | 'base64', content: string): FileReference {
109 |   if (!type || (type !== 'document' && type !== 'base64')) {
110 |     throw new Error('File reference type must be either "document" or "base64"');
111 |   }
112 |   if (!content || typeof content !== 'string') {
113 |     throw new Error('File reference content must be a non-empty string');
114 |   }
115 |   return { type, content };
116 | }
117 | 
--------------------------------------------------------------------------------
/packages/cloud/src/chunkers/recursive.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Recursive chunker that uses hierarchical rules for chunking
3 |  * via api.chonkie.ai
4 |  */
5 | 
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 | 
9 | export interface RecursiveChunkerOptions {
10 |   /** Tokenizer to use (default: "gpt2") */
11 |   tokenizer?: string;
12 |   /** Maximum tokens per chunk (default: 512) */
13 |   chunkSize?: number;
14 |   /** Recipe name (default: "default") */
15 |   recipe?: string;
16 |   /** Language for recipe (default: "en") */
17 |   lang?: string;
18 |   /** Minimum characters per chunk (default: 12) */
19 |   minCharactersPerChunk?: number;
20 |   /** API key (reads from CHONKIE_API_KEY env var if not provided) */
21 |   apiKey?: string;
22 |   /** Base URL for API (default: "https://api.chonkie.ai") */
23 |   baseUrl?: string;
24 | }
25 | 
26 | interface ApiChunkResponse {
27 |   text: string;
28 |   start_index: number;
29 |   end_index: number;
30 |   token_count: number;
31 | }
32 | 
33 | interface RecursiveChunkPayload extends Record<string, unknown> {
34 |   text?: string;
35 |   file?: { type: string; content: string };
36 |   tokenizer_or_token_counter: string;
37 |   chunk_size: number;
38 |   recipe: string;
39 |   lang: string;
40 |   min_characters_per_chunk: number;
41 |   return_type: string;
42 | }
43 | 
44 | export class RecursiveChunker extends CloudBaseChunker {
45 |   private readonly config: {
46 |     tokenizer: string;
47 |     chunkSize: number;
48 |     recipe: string;
49 |     lang: string;
50 |     minCharactersPerChunk: number;
51 |   };
52 | 
53 |   constructor(options: RecursiveChunkerOptions = {}) {
54 |     const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
55 |     if (!apiKey) {
56 |       throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
57 |     }
58 | 
59 |     super({ apiKey, baseUrl: options.baseUrl });
60 | 
61 |     this.config = {
62 |       tokenizer: options.tokenizer || 'gpt2',
63 |       chunkSize: options.chunkSize || 512,
64 |       recipe: options.recipe || 'default',
65 |       lang: options.lang || 'en',
66 |       minCharactersPerChunk: options.minCharactersPerChunk || 12,
67 |     };
68 |   }
69 | 
70 |   async chunk(input: ChunkerInput): Promise<Chunk[]> {
71 |     let fileRef = input.file;
72 | 
73 |     // If filepath is provided, upload it first to get a file reference
74 |     if (input.filepath) {
75 |       fileRef = await this.uploadFile(input.filepath);
76 |     }
77 | 
78 |     // Build the payload
79 |     const payload: RecursiveChunkPayload = {
80 |       tokenizer_or_token_counter: this.config.tokenizer,
81 |       chunk_size: this.config.chunkSize,
82 |       recipe: this.config.recipe,
83 |       lang: this.config.lang,
84 |       min_characters_per_chunk: this.config.minCharactersPerChunk,
85 |       return_type: 'chunks',
86 |     };
87 | 
88 |     // Add either text or file to the payload
89 |     if (fileRef) {
90 |       payload.file = fileRef;
91 |     } else if (input.text) {
92 |       payload.text = input.text;
93 |     } else {
94 |       throw new Error('Either text, filepath, or file must be provided');
95 |     }
96 | 
97 |     const data = await this.request<ApiChunkResponse[]>('/v1/chunk/recursive', {
98 |       method: 'POST',
99 |       body: payload,
100 |     });
101 | 
102 |     return data.map(chunk => new Chunk({
103 |       text: chunk.text,
104 |       startIndex: chunk.start_index,
105 |       endIndex: chunk.end_index,
106 |       tokenCount: chunk.token_count,
107 |     }));
108 |   }
109 | 
110 |   async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
111 |     return Promise.all(inputs.map(input => this.chunk(input)));
112 |   }
113 | 
114 |   toString(): string {
115 |     return `RecursiveChunker(tokenizer=${this.config.tokenizer}, chunkSize=${this.config.chunkSize}, recipe=${this.config.recipe})`;
116 |   }
117 | }
118 | 
--------------------------------------------------------------------------------
/packages/token/README.md:
--------------------------------------------------------------------------------
1 | <div align="center">
2 | 
3 | ![Chonkie Logo](../../assets/chonkie_logo_br_transparent_bg.png)
4 | 
5 | # @chonkiejs/token
6 | 
7 | _HuggingFace tokenizer support for Chonkie - extends @chonkiejs/core with real tokenization._
8 | 
9 | [![npm version](https://img.shields.io/npm/v/@chonkiejs/token)](https://www.npmjs.com/package/@chonkiejs/token)
10 | [![npm license](https://img.shields.io/npm/l/@chonkiejs/token)](https://www.npmjs.com/package/@chonkiejs/token)
11 | [![GitHub](https://img.shields.io/badge/github-chonkie--ts-black.svg?logo=github)](https://github.com/chonkie-inc/chonkie-ts)
12 | 
13 | </div>
14 | 15 | ## Features 16 | 🤗 **HuggingFace Integration** - Use any HuggingFace tokenizer model
17 | 🔌 **Optional Plugin** - Install only when you need real tokenization
18 | 📦 **Zero Config** - Works automatically with @chonkiejs/core
19 | ⚡ **Progressive Enhancement** - Core works without it, better with it
20 | 21 | ## Installation 22 | 23 | Install with `npm`: 24 | ```bash 25 | npm i @chonkiejs/token @chonkiejs/core 26 | ``` 27 | 28 | Install with `pnpm`: 29 | ```bash 30 | pnpm add @chonkiejs/token @chonkiejs/core 31 | ``` 32 | 33 | Install with `yarn`: 34 | ```bash 35 | yarn add @chonkiejs/token @chonkiejs/core 36 | ``` 37 | 38 | Install with `bun`: 39 | ```bash 40 | bun add @chonkiejs/token @chonkiejs/core 41 | ``` 42 | 43 | ## Quick Start 44 | 45 | Simply install this package alongside `@chonkiejs/core`, then use tokenizer names: 46 | 47 | ```typescript 48 | import { RecursiveChunker, TokenChunker } from '@chonkiejs/core'; 49 | 50 | // Use GPT-2 tokenization (automatically uses @chonkiejs/token) 51 | const chunker = await RecursiveChunker.create({ 52 | tokenizer: 'Xenova/gpt2', 53 | chunkSize: 512 54 | }); 55 | 56 | const chunks = await chunker.chunk('Your text here...'); 57 | ``` 58 | 59 | ## Supported Models 60 | 61 | Any HuggingFace model from transformers.js: 62 | 63 | - `Xenova/gpt2` 64 | - `Xenova/gpt-4` 65 | - `bert-base-uncased` 66 | - `google-bert/bert-base-multilingual-cased` 67 | - And many more! 68 | 69 | See: https://huggingface.co/models?library=transformers.js 70 | 71 | ## Usage Examples 72 | 73 | ### With RecursiveChunker 74 | 75 | ```typescript 76 | import { RecursiveChunker } from '@chonkiejs/core'; 77 | 78 | const chunker = await RecursiveChunker.create({ 79 | tokenizer: 'Xenova/gpt2', 80 | chunkSize: 512 81 | }); 82 | 83 | const chunks = await chunker.chunk('Your document...'); 84 | ``` 85 | 86 | ### With TokenChunker 87 | 88 | ```typescript 89 | import { TokenChunker } from '@chonkiejs/core'; 90 | 91 | const chunker = await TokenChunker.create({ 92 | tokenizer: 'bert-base-uncased', 93 | chunkSize: 256, 94 | chunkOverlap: 50 95 | }); 96 | 97 | const chunks = await chunker.chunk('Your text...'); 98 | ``` 99 | 100 | ### Direct Tokenizer Usage 101 | 102 | ```typescript 103 | import { HuggingFaceTokenizer } from '@chonkiejs/token'; 104 | 105 | const tokenizer = await HuggingFaceTokenizer.create('Xenova/gpt2'); 106 | 107 | const count = tokenizer.countTokens('Hello world!'); 108 | const tokens = tokenizer.encode('Hello world!'); 109 | const text = tokenizer.decode(tokens); 110 | 111 | console.log(`Token count: ${count}`); 112 | ``` 113 | 114 | ## How It Works 115 | 116 | When you call `Tokenizer.create('gpt2')` in @chonkiejs/core: 117 | 118 | 1. Core tries to dynamically import `@chonkiejs/token` 119 | 2. **If installed:** Uses HuggingFaceTokenizer 120 | 3. **If not installed:** Shows helpful error message 121 | 122 | This keeps core lightweight while allowing advanced tokenization when needed! 123 | 124 | ## Contributing 125 | 126 | Want to help grow Chonkie? Check out [CONTRIBUTING.md](../../CONTRIBUTING.md) to get started! Whether you're fixing bugs, adding features, improving docs, or simply leaving a ⭐️ on the repo, every contribution helps make Chonkie a better CHONK for everyone. 127 | 128 | Remember: No contribution is too small for this tiny hippo! 129 | -------------------------------------------------------------------------------- /legacy/chonkie/friends/chroma.ts: -------------------------------------------------------------------------------- 1 | /** ChromaHandshake to integrate Chonkie with Chroma. 
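 *
 * A minimal usage sketch (the collection name and Chroma URL are illustrative
 * assumptions; `chunks` is any Chunk[] produced by a Chonkie chunker):
 *
 *   const handshake = new ChromaHandshake(undefined, 'chonkie-chunks', 'http://localhost:8000');
 *   await handshake.write(chunks);
 *   const results = await handshake.query('what is chonkie?', 5);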
 */
2 | 
3 | import { BaseHandshake } from "./base";
4 | import { ChromaClient } from "chromadb";
5 | import { generateRandomCollectionName } from "./utils";
6 | import { Chunk } from "../types/base";
7 | import { v5 as uuidv5 } from "uuid";
8 | 
9 | /**
10 |  * ChromaHandshake to integrate Chonkie with Chroma.
11 |  *
12 |  * @param client - The ChromaClient to use.
13 |  * @param collectionName - The name of the collection to use.
14 |  * @param path - The path to the Chroma database. Can point to the running instance, Docker or Cloud.
15 |  * @param logLevel - The log level ('verbose' or 'silent'). Default: 'verbose'.
16 |  */
17 | export class ChromaHandshake extends BaseHandshake {
18 | 
19 |   private client: ChromaClient;
20 |   private collectionName: string;
21 |   private logLevel: 'verbose' | 'silent';
22 | 
23 |   constructor(client?: ChromaClient, collectionName?: string, path?: string, logLevel: 'verbose' | 'silent' = 'verbose') {
24 |     super();
25 | 
26 |     // If the client is not provided, create a new one
27 |     this.client = client ?? new ChromaClient({ path });
28 |     // If the collection name is not provided, generate a random one
29 |     this.collectionName = collectionName ?? generateRandomCollectionName();
30 |     this.logLevel = logLevel;
31 | 
32 |     // Print to console the collection name if verbose
33 |     if (this.logLevel === 'verbose') {
34 |       console.log(`Using collection ${this.collectionName}`);
35 |     }
36 |   }
37 | 
38 |   private _getId(index: number, chunk: Chunk): string {
39 |     const id = uuidv5(`CHUNK-${index}:${chunk.text}`, uuidv5.DNS);
40 |     return id;
41 |   }
42 | 
43 |   /**
44 |    * Write chunks to the collection provided in the constructor.
45 |    * @param chunks - The chunks to write.
46 |    */
47 |   public async write(chunks: Chunk[]): Promise<void> {
48 |     // Check if the collection exists and if not, create it
49 |     const collection = await this.client.getOrCreateCollection({ name: this.collectionName });
50 | 
51 |     // Create a list of ids and documents to upsert
52 |     const ids: string[] = [];
53 |     const documents: string[] = [];
54 |     const metadatas: Record<string, any>[] = [];
55 |     for (const [index, chunk] of chunks.entries()) {
56 |       ids.push(this._getId(index, chunk));
57 |       documents.push(chunk.text);
58 |       metadatas.push({
59 |         "start_index": chunk.startIndex,
60 |         "end_index": chunk.endIndex,
61 |         "token_count": chunk.tokenCount,
62 |       });
63 |     }
64 | 
65 |     // Upsert the chunks into the collection
66 |     await collection.upsert({
67 |       ids: ids,
68 |       documents: documents,
69 |       metadatas: metadatas,
70 |     });
71 | 
72 |     // Print to console the number of chunks upserted if verbose
73 |     if (this.logLevel === 'verbose') {
74 |       console.log(`Upserted ${chunks.length} chunks into the collection ${this.collectionName}`);
75 |     }
76 |   }
77 | 
78 |   /**
79 |    * Query the collection provided in the constructor.
80 |    * @param query - The query to search for.
81 |    * @param nResults - The number of results to return.
82 |    * @returns The chunks that match the query.
83 |    */
84 |   public async query(query: string, nResults: number = 10): Promise<Chunk[]> {
85 |     const collection = await this.client.getCollection({ name: this.collectionName });
86 |     const results = await collection.query({
87 |       queryTexts: [query],
88 |       nResults: nResults,
89 |     });
90 | 
91 |     // Return the chunks
92 |     const { documents, metadatas } = results;
93 |     return documents[0].map((document, index) => {
94 |       const metadata = metadatas[0][index];
95 |       return new Chunk({
96 |         text: document ?? '',
97 |         startIndex: Number(metadata?.start_index) || 0,
98 |         endIndex: Number(metadata?.end_index) || 0,
99 |         tokenCount: Number(metadata?.token_count) || 0,
100 |       });
101 |     });
102 |   }
103 | }
--------------------------------------------------------------------------------
/packages/core/src/token.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Token chunker that splits text into fixed-size token chunks.
3 |  */
4 | 
5 | import { Tokenizer } from '@/tokenizer';
6 | import { Chunk } from '@/types';
7 | 
8 | export interface TokenChunkerOptions {
9 |   /** Tokenizer instance or model name (default: 'character') */
10 |   tokenizer?: Tokenizer | string;
11 |   /** Maximum tokens per chunk (default: 512) */
12 |   chunkSize?: number;
13 |   /** Number of tokens to overlap between chunks (default: 0) */
14 |   chunkOverlap?: number;
15 | }
16 | 
17 | /**
18 |  * Splits text into fixed-size token chunks with optional overlap.
19 |  *
20 |  * Uses character-based tokenization by default, but can use advanced
21 |  * tokenizers from @chonkiejs/token package.
22 |  */
23 | export class TokenChunker {
24 |   public readonly chunkSize: number;
25 |   public readonly chunkOverlap: number;
26 |   private tokenizer: Tokenizer;
27 | 
28 |   private constructor(
29 |     tokenizer: Tokenizer,
30 |     chunkSize: number,
31 |     chunkOverlap: number
32 |   ) {
33 |     if (chunkSize <= 0) {
34 |       throw new Error('chunkSize must be greater than 0');
35 |     }
36 |     if (chunkOverlap < 0) {
37 |       throw new Error('chunkOverlap must be non-negative');
38 |     }
39 |     if (chunkOverlap >= chunkSize) {
40 |       throw new Error('chunkOverlap must be less than chunkSize');
41 |     }
42 | 
43 |     this.tokenizer = tokenizer;
44 |     this.chunkSize = chunkSize;
45 |     this.chunkOverlap = chunkOverlap;
46 |   }
47 | 
48 |   /**
49 |    * Create a TokenChunker instance.
50 |    *
51 |    * @param options - Configuration options
52 |    * @returns Promise resolving to TokenChunker instance
53 |    *
54 |    * @example
55 |    * // Character-based (no dependencies)
56 |    * const chunker = await TokenChunker.create({ chunkSize: 512 });
57 |    *
58 |    * @example
59 |    * // With HuggingFace tokenizer (requires @chonkiejs/token)
60 |    * const chunker = await TokenChunker.create({
61 |    *   tokenizer: 'gpt2',
62 |    *   chunkSize: 512,
63 |    *   chunkOverlap: 50
64 |    * });
65 |    */
66 |   static async create(options: TokenChunkerOptions = {}): Promise<TokenChunker> {
67 |     const {
68 |       tokenizer = 'character',
69 |       chunkSize = 512,
70 |       chunkOverlap = 0,
71 |     } = options;
72 | 
73 |     let tokenizerInstance: Tokenizer;
74 | 
75 |     if (typeof tokenizer === 'string') {
76 |       tokenizerInstance = await Tokenizer.create(tokenizer);
77 |     } else {
78 |       tokenizerInstance = tokenizer;
79 |     }
80 | 
81 |     return new TokenChunker(tokenizerInstance, chunkSize, chunkOverlap);
82 |   }
83 | 
84 |   /**
85 |    * Chunk a single text into fixed-size token chunks.
86 |    *
87 |    * @param text - The text to chunk
88 |    * @returns Array of chunks
89 |    */
90 |   async chunk(text: string): Promise<Chunk[]> {
91 |     if (!text) {
92 |       return [];
93 |     }
94 | 
95 |     const tokens = this.tokenizer.encode(text);
96 |     const chunks: Chunk[] = [];
97 |     const step = this.chunkSize - this.chunkOverlap;
98 | 
99 |     for (let i = 0; i < tokens.length; i += step) {
100 |       const chunkTokens = tokens.slice(i, i + this.chunkSize);
101 |       const chunkText = this.tokenizer.decode(chunkTokens);
102 |       const startIndex = this.findStartIndex(text, chunkText, i > 0 ? chunks[chunks.length - 1].endIndex : 0);
103 |       const endIndex = startIndex + chunkText.length;
104 | 
105 |       chunks.push(new Chunk({
106 |         text: chunkText,
107 |         startIndex,
108 |         endIndex,
109 |         tokenCount: chunkTokens.length,
110 |       }));
111 |     }
112 | 
113 |     return chunks;
114 |   }
115 | 
116 |   /**
117 |    * Find the start index of chunk text in the original text.
118 |    * This handles overlaps correctly.
119 |    */
120 |   private findStartIndex(text: string, chunkText: string, searchFrom: number): number {
121 |     const index = text.indexOf(chunkText, searchFrom);
122 |     return index !== -1 ? index : searchFrom;
123 |   }
124 | 
125 |   toString(): string {
126 |     return `TokenChunker(chunkSize=${this.chunkSize}, overlap=${this.chunkOverlap})`;
127 |   }
128 | }
129 | 
--------------------------------------------------------------------------------
/legacy/chonkie/cloud/semantic.ts:
--------------------------------------------------------------------------------
1 | /** Semantic chunker client for Chonkie API. */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { SemanticChunk } from "../types/semantic";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface SemanticChunkerConfig {
9 |   embeddingModel?: string;
10 |   threshold?: number | "auto";
11 |   chunkSize?: number;
12 |   similarityWindow?: number;
13 |   minSentences?: number;
14 |   minChunkSize?: number;
15 |   minCharactersPerSentence?: number;
16 |   thresholdStep?: number;
17 |   delim?: string | string[];
18 |   includeDelim?: "prev" | "next" | null;
19 | }
20 | 
21 | export class SemanticChunker extends CloudClient {
22 |   private readonly config: Required<SemanticChunkerConfig>;
23 | 
24 |   constructor(apiKey: string, config: SemanticChunkerConfig = {}) {
25 |     super({ apiKey });
26 |     this.config = {
27 |       embeddingModel: config.embeddingModel || "minishlab/potion-base-8M",
28 |       threshold: config.threshold ?? "auto",
29 |       chunkSize: config.chunkSize || 512,
30 |       similarityWindow: config.similarityWindow || 1,
31 |       minSentences: config.minSentences || 1,
32 |       minChunkSize: config.minChunkSize || 2,
33 |       minCharactersPerSentence: config.minCharactersPerSentence || 12,
34 |       thresholdStep: config.thresholdStep || 0.01,
35 |       delim: config.delim || [".", "!", "?", "\n"],
36 |       includeDelim: config.includeDelim ?? "prev",
"prev", 37 | }; 38 | } 39 | 40 | async chunk(input: ChunkerInput): Promise { 41 | const formData = new FormData(); 42 | 43 | if (input.filepath) { 44 | const fileContent = fs.readFileSync(input.filepath); 45 | const fileName = path.basename(input.filepath) || 'file.txt'; 46 | formData.append("file", new Blob([fileContent]), fileName); 47 | } else if (input.text) { 48 | // JSON encode the text 49 | formData.append("text", JSON.stringify(input.text)); 50 | // Append empty file to ensure multipart form 51 | formData.append("file", new Blob(), "text_input.txt"); 52 | } else { 53 | throw new Error("Either text or filepath must be provided"); 54 | } 55 | 56 | // Add all config options to the form data 57 | formData.append("embedding_model", this.config.embeddingModel); 58 | formData.append("threshold", this.config.threshold.toString()); 59 | formData.append("chunk_size", this.config.chunkSize.toString()); 60 | formData.append("similarity_window", this.config.similarityWindow.toString()); 61 | formData.append("min_sentences", this.config.minSentences.toString()); 62 | formData.append("min_chunk_size", this.config.minChunkSize.toString()); 63 | formData.append("min_characters_per_sentence", this.config.minCharactersPerSentence.toString()); 64 | formData.append("threshold_step", this.config.thresholdStep.toString()); 65 | formData.append("delim", JSON.stringify(this.config.delim)); 66 | formData.append("include_delim", this.config.includeDelim || "prev"); 67 | formData.append("return_type", "chunks"); 68 | 69 | const data = await this.request("/v1/chunk/semantic", { 70 | method: "POST", 71 | body: formData, 72 | }); 73 | 74 | // Convert from snake_case to camelCase 75 | const camelCaseData = data.map((chunk: any) => { 76 | return { 77 | text: chunk.text, 78 | startIndex: chunk.start_index, 79 | endIndex: chunk.end_index, 80 | tokenCount: chunk.token_count, 81 | embedding: chunk.embedding || undefined, 82 | sentences: chunk.sentences.map((sentence: any) => { 83 | return { 84 | text: sentence.text, 85 | startIndex: sentence.start_index, 86 | endIndex: sentence.end_index, 87 | tokenCount: sentence.token_count, 88 | embedding: sentence.embedding || undefined, 89 | }; 90 | }), 91 | }; 92 | }); 93 | 94 | return camelCaseData.map((chunk: any) => SemanticChunk.fromDict(chunk)); 95 | } 96 | 97 | async chunkBatch(inputs: ChunkerInput[]): Promise { 98 | return Promise.all(inputs.map(input => this.chunk(input))); 99 | } 100 | } -------------------------------------------------------------------------------- /legacy/chonkie/cloud/sdpm.ts: -------------------------------------------------------------------------------- 1 | /** SDPM chunker client for Chonkie API. 
 */
2 | 
3 | import { CloudClient, ChunkerInput } from "./base";
4 | import { SemanticChunk } from "../types/semantic";
5 | import * as fs from 'fs';
6 | import * as path from 'path';
7 | 
8 | export interface SDPMChunkerConfig {
9 |   embeddingModel?: string;
10 |   threshold?: number | "auto";
11 |   mode?: "window" | "cumulative";
12 |   chunkSize?: number;
13 |   similarityWindow?: number;
14 |   minSentences?: number;
15 |   minCharactersPerSentence?: number;
16 |   thresholdStep?: number;
17 |   delim?: string | string[];
18 |   includeDelim?: "prev" | "next" | null;
19 | }
20 | 
21 | export class SDPMChunker extends CloudClient {
22 |   private readonly config: Required<SDPMChunkerConfig>;
23 | 
24 |   constructor(apiKey: string, config: SDPMChunkerConfig = {}) {
25 |     super({ apiKey });
26 |     this.config = {
27 |       embeddingModel: config.embeddingModel || "minishlab/potion-base-8M",
28 |       threshold: config.threshold ?? "auto",
29 |       mode: config.mode || "window",
30 |       chunkSize: config.chunkSize || 512,
31 |       similarityWindow: config.similarityWindow || 1,
32 |       minSentences: config.minSentences || 1,
33 |       minCharactersPerSentence: config.minCharactersPerSentence || 12,
34 |       thresholdStep: config.thresholdStep || 0.01,
35 |       delim: config.delim || [".", "!", "?", "\n"],
36 |       includeDelim: config.includeDelim ?? "prev",
37 |     };
38 |   }
39 | 
40 |   async chunk(input: ChunkerInput): Promise<SemanticChunk[]> {
41 |     const formData = new FormData();
42 | 
43 |     if (input.filepath) {
44 |       const fileContent = fs.readFileSync(input.filepath);
45 |       const fileName = path.basename(input.filepath) || 'file.txt';
46 |       formData.append("file", new Blob([fileContent]), fileName);
47 |     } else if (input.text) {
48 |       // JSON encode the text
49 |       formData.append("text", JSON.stringify(input.text));
50 |       // Append empty file to ensure multipart form
51 |       formData.append("file", new Blob(), "text_input.txt");
52 |     } else {
53 |       throw new Error("Either text or filepath must be provided");
54 |     }
55 | 
56 |     formData.append("embedding_model", this.config.embeddingModel);
57 |     if (typeof this.config.threshold === "number") {
58 |       formData.append("threshold", this.config.threshold.toString());
59 |     } else {
60 |       formData.append("threshold", this.config.threshold);
61 |     }
62 |     formData.append("mode", this.config.mode);
63 |     formData.append("chunk_size", this.config.chunkSize.toString());
64 |     formData.append("similarity_window", this.config.similarityWindow.toString());
65 |     formData.append("min_sentences", this.config.minSentences.toString());
66 |     formData.append("min_characters_per_sentence", this.config.minCharactersPerSentence.toString());
67 |     formData.append("threshold_step", this.config.thresholdStep.toString());
68 |     // Append delim as a string array
69 |     formData.append("delim", JSON.stringify(this.config.delim));
70 |     formData.append("include_delim", this.config.includeDelim || "prev");
71 |     formData.append("return_type", "chunks");
72 | 
73 |     const data = await this.request("/v1/chunk/sdpm", {
74 |       method: "POST",
75 |       body: formData,
76 |     });
77 | 
78 |     // Convert from snake_case to camelCase
79 |     const camelCaseData = data.map((chunk: any) => {
80 |       return {
81 |         text: chunk.text,
82 |         startIndex: chunk.start_index,
83 |         endIndex: chunk.end_index,
84 |         tokenCount: chunk.token_count,
85 |         embedding: chunk.embedding || undefined,
86 |         sentences: chunk.sentences.map((sentence: any) => {
87 |           return {
88 |             text: sentence.text,
89 |             startIndex: sentence.start_index,
90 |             endIndex: sentence.end_index,
91 |             tokenCount: sentence.token_count,
92 |             embedding: sentence.embedding || undefined,
93 |           };
94 |         }),
95 |       };
96 |     });
97 | 
98 |     return camelCaseData.map((chunk: any) => SemanticChunk.fromDict(chunk));
99 |   }
100 | 
101 |   async chunkBatch(inputs: ChunkerInput[]): Promise<SemanticChunk[][]> {
102 |     return Promise.all(inputs.map(input => this.chunk(input)));
103 |   }
104 | }
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # 🦛 Contributing to Chonkie
2 | 
3 | > "I like them big, I like them CONTRIBUTING" ~ Moto Moto, probably
4 | 
5 | Welcome fellow CHONKer! We're thrilled you want to contribute to Chonkie. Every contribution—whether fixing bugs, adding features, or improving documentation—makes Chonkie better for everyone.
6 | 
7 | ## 🚀 Getting Started
8 | 
9 | ### Before You Dive In
10 | 
11 | 1. **Check existing issues** or open a new one to start a discussion
12 | 2. **Read [Chonkie's documentation](https://docs.chonkie.ai)** and core [concepts](https://docs.chonkie.ai/getting-started/concepts)
13 | 3. **Set up your development environment** using the guide below
14 | 
15 | ### Development Setup
16 | 
17 | ```bash
18 | # 1. Fork and clone the repository
19 | git clone https://github.com/chonkie-inc/chonkie-ts.git
20 | cd chonkie-ts
21 | 
22 | # 2. Install dependencies (this repo is a pnpm workspace)
23 | pnpm install
24 | ```
25 | 
26 | ## 🧪 Testing & Code Quality
27 | 
28 | ### Running Tests
29 | 
30 | ```bash
31 | cd packages/cloud                    # Each package runs its own Vitest suite
32 | npx vitest run                       # Run all tests in the package
33 | npx vitest run tests/token.test.ts   # Run a specific test file
34 | ```
35 | 
36 | ### Documentation Style
37 | 
38 | We follow JSDoc-style doc comments:
39 | 
40 | ```typescript
41 | /**
42 |  * Splits text into chunks of specified size.
43 |  *
44 |  * @param text - Input text to chunk
45 |  * @param chunk_size - Maximum size of each chunk
46 |  *
47 |  * @returns Array of text chunks
48 |  *
49 |  * @throws Error if chunk_size <= 0
50 |  */
51 | function chunk_text(text: string, chunk_size: number): string[] {
52 |   return text.split(' ').slice(0, chunk_size);
53 | }
54 | ```
55 | 
56 | ## 📦 Project Structure
57 | 
58 | ```
59 | packages/
60 | ├── core/     # Local chunkers (zero dependencies)
61 | ├── cloud/    # Cloud API clients
62 | ├── token/    # HuggingFace tokenizer support
63 | legacy/chonkie/  # Original port (chunkers, cloud clients, types)
64 | ```
65 | 
66 | ## 🎯 Contribution Opportunities
67 | 
68 | ### For Beginners
69 | 
70 | Start with issues labeled [`good-first-issue`](https://github.com/chonkie-inc/chonkie/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)
71 | 
72 | ### Documentation
73 | 
74 | - Improve existing docs
75 | - Add examples or tutorials
76 | - Fix typos
77 | 
78 | ### Code Improvements
79 | 
80 | - Implement new chunking strategies
81 | - Add tokenizer support
82 | - Optimize existing chunkers
83 | - Improve test coverage
84 | - Bring to feature parity with Python library
85 | 
86 | ### Performance Enhancements
87 | 
88 | - Profile and optimize code
89 | - Add benchmarks
90 | - Improve memory usage
91 | 
92 | ### New Features
93 | 
94 | Look for issues with [FEAT] labels, especially those from Chonkie Maintainers
95 | 
96 | ## 🚦 Pull Request Process
97 | 
98 | ### 1. Branch Naming
99 | 
100 | - `feature/description` for new features
101 | - `fix/description` for bug fixes
102 | - `docs/description` for documentation changes
103 | 
104 | ### 2. Commit Messages
105 | 
106 | Write clear, descriptive commit messages:
107 | 
108 | ```
109 | feat: add batch processing to WordChunker
110 | 
111 | - Implement batch_process method
112 | - Add tests for batch processing
113 | - Update documentation
114 | ```
115 | 
116 | ### 3. Code Review
117 | 
118 | - **Make sure your PR is for the `development` branch**
119 | - All PRs need at least one review
120 | - Maintainers will review for:
121 |   - Code quality (linting)
122 |   - Test coverage
123 |   - Performance impact
124 |   - Documentation completeness
125 | 
126 | ## 🦛 Technical Details
127 | 
128 | ### Semantic Versioning
129 | 
130 | Chonkie does not follow strict semantic versioning. We follow the following rules:
131 | 
132 | - 'MAJOR' version when we refactor/rewrite large parts of the codebase
133 | - 'MINOR' version when we add breaking changes (e.g. changing a public API)
134 | - 'PATCH' version when we add non-breaking features (e.g. adding a new chunker) or fix bugs
135 | 
136 | ## 💡 Getting Help
137 | 
138 | - **Chat?** [Join our Discord!](https://discord.gg/Q6zkP8w6ur)
139 | - **Questions?** Open an issue or ask in Discord
140 | - **Bugs?** Open an issue or report in Discord
141 | - **Email?** Contact [support@chonkie.ai](mailto:support@chonkie.ai)
142 | 
143 | ## 🙏 Thank You
144 | 
145 | Every contribution helps make Chonkie better! We appreciate your time and effort in helping make Chonkie the CHONKiest it can be!
146 | 
147 | Remember:
148 | > "A journey of a thousand CHONKs begins with a single commit" ~ Ancient Proverb, probably
149 | 
--------------------------------------------------------------------------------
/legacy/chonkie/types/base.ts:
--------------------------------------------------------------------------------
1 | /** Custom base types for Chonkie. */
2 | 
3 | /**
4 |  * Represents the data structure for a chunk object.
5 |  *
6 |  * @property {string} text - The text of the chunk.
7 |  * @property {number} startIndex - The starting index of the chunk in the original text.
8 |  * @property {number} endIndex - The ending index of the chunk in the original text.
9 |  * @property {number} tokenCount - The number of tokens in the chunk.
10 |  */
11 | interface ChunkData {
12 |   text: string;
13 |   startIndex: number;
14 |   endIndex: number;
15 |   tokenCount: number;
16 |   embedding?: number[];
17 | }
18 | 
19 | /**
20 |  * Represents a chunk of text with associated metadata.
21 |  *
22 |  * @property {string} text - The text of the chunk.
23 |  * @property {number} startIndex - The starting index of the chunk in the original text.
24 |  * @property {number} endIndex - The ending index of the chunk in the original text.
25 |  * @property {number} tokenCount - The number of tokens in the chunk.
26 |  * @property {number[]} [embedding] - The embedding for the chunk.
27 |  */
28 | export class Chunk {
29 |   /** The text of the chunk. */
30 |   public text: string;
31 |   /** The starting index of the chunk in the original text. */
32 |   public startIndex: number;
33 |   /** The ending index of the chunk in the original text. */
34 |   public endIndex: number;
35 |   /** The number of tokens in the chunk. */
36 |   public tokenCount: number;
37 |   /** Optional embedding for the chunk. */
38 |   public embedding?: number[];
39 | 
40 |   /**
41 |    * Constructs a new Chunk object.
42 |    *
43 |    * @param {ChunkData} data - The data to construct the Chunk from.
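 *
 * @example
 * // A minimal construction sketch (values are illustrative); with
 * // character-based tokenization, tokenCount equals text.length.
 * const chunk = new Chunk({ text: 'Hello', startIndex: 0, endIndex: 5, tokenCount: 5 });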
44 |    */
45 |   constructor(data: {
46 |     text: string;
47 |     startIndex: number;
48 |     endIndex: number;
49 |     tokenCount: number;
50 |     embedding?: number[];
51 |   }) {
52 |     this.text = data.text;
53 |     this.startIndex = data.startIndex;
54 |     this.endIndex = data.endIndex;
55 |     this.tokenCount = data.tokenCount;
56 |     this.embedding = data.embedding;
57 | 
58 |     // Basic validation, more can be added if needed
59 |     if (this.startIndex > this.endIndex) {
60 |       throw new Error("Start index must be less than or equal to end index.");
61 |     }
62 |     if (this.tokenCount < 0) {
63 |       throw new Error("Token count must be a non-negative integer.");
64 |     }
65 |   }
66 | 
67 |   /** Return a string representation of the Chunk.
68 |    *
69 |    * @returns {string} The text of the chunk.
70 |    */
71 |   public toString(): string {
72 |     return this.text;
73 |   }
74 | 
75 |   /** Return a detailed string representation of the Chunk.
76 |    *
77 |    * @returns {string} The detailed string representation of the Chunk.
78 |    */
79 |   public toRepresentation(): string {
80 |     let repr = `Chunk(text='${this.text}', tokenCount=${this.tokenCount}, startIndex=${this.startIndex}, endIndex=${this.endIndex}`;
81 |     repr += ')';
82 |     return repr;
83 |   }
84 | 
85 |   /** Return a slice of the chunk's text.
86 |    *
87 |    * @param {number} [start] - The starting index of the slice.
88 |    * @param {number} [end] - The ending index of the slice.
89 |    * @returns {string} The slice of the chunk's text.
90 |    */
91 |   public slice(start?: number, end?: number): string {
92 |     return this.text.slice(start, end);
93 |   }
94 | 
95 |   /** Return the Chunk as a dictionary-like object.
96 |    *
97 |    * @returns {ChunkData} The dictionary-like object.
98 |    */
99 |   public toDict(): ChunkData {
100 |     return {
101 |       text: this.text,
102 |       startIndex: this.startIndex,
103 |       endIndex: this.endIndex,
104 |       tokenCount: this.tokenCount,
105 |       embedding: this.embedding,
106 |     };
107 |   }
108 | 
109 |   /** Create a Chunk object from a dictionary-like object.
110 |    *
111 |    * @param {ChunkData} data - The dictionary-like object.
112 |    * @returns {Chunk} The Chunk object.
113 |    */
114 |   public static fromDict(data: ChunkData): Chunk {
115 |     return new Chunk({
116 |       text: data.text,
117 |       startIndex: data.startIndex,
118 |       endIndex: data.endIndex,
119 |       tokenCount: data.tokenCount,
120 |       embedding: data.embedding,
121 |     });
122 |   }
123 | 
124 |   /** Return a deep copy of the chunk.
125 |    *
126 |    * @returns {Chunk} The deep copy of the chunk.
127 |    */
128 |   public copy(): Chunk {
129 |     return Chunk.fromDict(this.toDict());
130 |   }
131 | }
132 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | <div align="center">
2 | 
3 | ![Chonkie Logo](./assets/chonkie_logo_br_transparent_bg.png)
4 | 
5 | # 🦛 chonkiejs ✨
6 | 
7 | [![npm version](https://img.shields.io/npm/v/@chonkiejs/core)](https://www.npmjs.com/package/@chonkiejs/core)
8 | [![npm downloads](https://img.shields.io/npm/dt/@chonkiejs/core)](https://www.npmjs.com/package/@chonkiejs/core)
9 | [![npm license](https://img.shields.io/npm/l/@chonkiejs/core)](https://www.npmjs.com/package/@chonkiejs/core)
10 | [![npm bundle size](https://img.shields.io/bundlephobia/min/@chonkiejs/core)](https://www.npmjs.com/package/@chonkiejs/core)
11 | [![Discord](https://dcbadge.limes.pink/api/server/https://discord.gg/rYYp6DC4cv?style=flat)](https://discord.gg/rYYp6DC4cv)
12 | [![Github Stars](https://img.shields.io/github/stars/chonkie-inc/chonkie-ts?style=social)](https://github.com/chonkie-inc/chonkie-ts)
13 | 
14 | _🦛 CHONK your texts in TypeScript with Chonkie!✨ The no-nonsense lightweight and efficient chunking library._
15 | 
16 | [Installation](#-installation) •
17 | [Usage](#-usage) •
18 | [Packages](#-packages) •
19 | [Acknowledgements](#acknowledgements) •
20 | [Citation](#citation)
21 | 
22 | </div>
23 | 24 | We built `chonkiejs` while developing a TypeScript web app that needed fast, on-the-fly text chunking for RAG applications. After trying several existing libraries, we found them either too heavy or not flexible enough for our needs. `chonkiejs` is a port of the original `chonkie` library, but with some type-safety and a few extra features to make it more useful for TypeScript developers! 25 | 26 | **🚀 Feature-rich**: All the CHONKs you'd ever need
27 | **✨ Easy to use**: Install, Import, CHONK
28 | **⚡ Fast**: CHONK at the max speed of TypeScript! tssssooooooom
29 | **🪶 Light-weight**: No bloat, just CHONK
30 | **🦛 Cute CHONK mascot**: psst it's a pygmy hippo btw
31 | **❤️ [Moto Moto](#acknowledgements)'s favorite TypeScript library**
32 | 
33 | **Chonkie** is a chunking library that "**just works**" ✨
34 | 
35 | > [!NOTE]
36 | > This library is not a _binding_ but a _port_ of the original `chonkie` library written in Python, to TypeScript. This library is still under active development and not at feature parity with the original `chonkie` library yet. Please bear with us! 🫂
37 | 
38 | ## 📦 Installation
39 | 
40 | ```bash
41 | npm install @chonkiejs/core
42 | ```
43 | 
44 | ## 📚 Usage
45 | 
46 | ```typescript
47 | import { RecursiveChunker } from '@chonkiejs/core';
48 | 
49 | // Create a chunker
50 | const chunker = await RecursiveChunker.create({
51 |   chunkSize: 512
52 | });
53 | 
54 | // Chunk your text
55 | const chunks = await chunker.chunk('Your text here...');
56 | 
57 | // Use the chunks
58 | for (const chunk of chunks) {
59 |   console.log(chunk.text);
60 |   console.log(`Tokens: ${chunk.tokenCount}`);
61 | }
62 | ```
63 | 
64 | ## 📦 Packages
65 | 
66 | | Package | Description | Dependencies |
67 | |---------|-------------|--------------|
68 | | [@chonkiejs/core](./packages/core) | Local chunking (Recursive, Token) with character-based tokenization | Zero |
69 | | [@chonkiejs/cloud](./packages/cloud) | Cloud-based chunkers (Semantic, Neural, Code, etc.) via api.chonkie.ai | @chonkiejs/core |
70 | | [@chonkiejs/token](./packages/token) | HuggingFace tokenizer support for core chunkers | @huggingface/transformers |
71 | 
72 | ## Contributing
73 | 
74 | Want to help grow Chonkie? Check out [CONTRIBUTING.md](CONTRIBUTING.md) to get started! Whether you're fixing bugs, adding features, improving docs, or simply leaving a ⭐️ on the repo, every contribution helps make Chonkie a better CHONK for everyone.
75 | 
76 | Remember: No contribution is too small for this tiny hippo!
77 | 
78 | ## Acknowledgements
79 | 
80 | Chonkie would like to CHONK its way through a special thanks to all the users and contributors who have helped make this library what it is today! Your feedback, issue reports, and improvements have helped make Chonkie the CHONKIEST it can be.
81 | 
82 | And of course, special thanks to [Moto Moto](https://www.youtube.com/watch?v=I0zZC4wtqDQ&t=5s) for endorsing Chonkie with his famous quote:
83 | > "I like them big, I like them chonkie in TypeScript" ~ Moto Moto... definitely did not say this
84 | 
85 | ## Citation
86 | 
87 | If you use Chonkie in your research, please cite it as follows:
88 | 
89 | ```bibtex
90 | @software{chonkie2025,
91 |   author = {Bhavnick Minhas and Shreyash Nigam},
92 |   title = {Chonkie: A no-nonsense fast, lightweight, and efficient text chunking library},
93 |   year = {2025},
94 |   publisher = {GitHub},
95 |   howpublished = {\url{https://github.com/chonkie-inc/chonkie}},
96 | }
97 | ```
98 | 
--------------------------------------------------------------------------------
/packages/cloud/README.md:
--------------------------------------------------------------------------------
1 | <div align="center">
2 | 3 | ![Chonkie Logo](../../assets/chonkie_logo_br_transparent_bg.png) 4 | 5 | # @chonkiejs/cloud 6 | 7 | _Cloud-based chunkers for Chonkie via api.chonkie.ai - semantic, neural, and AI-powered text chunking._ 8 | 9 | [![npm version](https://img.shields.io/npm/v/@chonkiejs/cloud)](https://www.npmjs.com/package/@chonkiejs/cloud) 10 | [![npm license](https://img.shields.io/npm/l/@chonkiejs/cloud)](https://www.npmjs.com/package/@chonkiejs/cloud) 11 | [![Documentation](https://img.shields.io/badge/docs-DOCS.md-blue.svg)](./DOCS.md) 12 | [![GitHub](https://img.shields.io/badge/github-chonkie--ts-black.svg?logo=github)](https://github.com/chonkie-inc/chonkie-ts) 13 | 14 |
15 | 16 | ## Features 17 | 🌐 **Cloud-Powered** - Leverage powerful chunking via api.chonkie.ai
18 | 🧠 **Semantic & Neural** - AI-powered intelligent chunking
19 | 🔧 **Refineries** - Post-process chunks with embeddings and overlap
20 | 🔑 **Easy Auth** - Auto-reads CHONKIE_API_KEY from environment
21 | 📦 **Returns Chunk Objects** - Compatible with @chonkiejs/core
22 | ✨ **Clean API** - Simple, consistent interface across all chunkers
23 | 24 | ## Installation 25 | 26 | Install with `npm`: 27 | ```bash 28 | npm i @chonkiejs/cloud 29 | ``` 30 | 31 | Install with `pnpm`: 32 | ```bash 33 | pnpm add @chonkiejs/cloud 34 | ``` 35 | 36 | Install with `yarn`: 37 | ```bash 38 | yarn add @chonkiejs/cloud 39 | ``` 40 | 41 | Install with `bun`: 42 | ```bash 43 | bun add @chonkiejs/cloud 44 | ``` 45 | 46 | ## Quick Start 47 | 48 | Set your API key: 49 | ```bash 50 | export CHONKIE_API_KEY=your-api-key-here 51 | ``` 52 | 53 | Use a chunker: 54 | ```typescript 55 | import { SemanticChunker } from '@chonkiejs/cloud'; 56 | 57 | // Create a chunker (automatically uses CHONKIE_API_KEY) 58 | const chunker = new SemanticChunker({ 59 | chunkSize: 512, 60 | threshold: 0.5 61 | }); 62 | 63 | // Chunk your text 64 | const chunks = await chunker.chunk({ text: 'Your text here...' }); 65 | 66 | // Use the chunks 67 | for (const chunk of chunks) { 68 | console.log(chunk.text); 69 | console.log(`Tokens: ${chunk.tokenCount}`); 70 | } 71 | ``` 72 | 73 | ## Available Chunkers 74 | 75 | | Name | Description | 76 | |------|-------------| 77 | | `TokenChunker` | Splits text into fixed-size token chunks with optional overlap | 78 | | `SentenceChunker` | Splits text into sentence-based chunks respecting sentence boundaries | 79 | | `RecursiveChunker` | Uses hierarchical rules (paragraphs → sentences → punctuation → words) with customizable recipes | 80 | | `SemanticChunker` | Creates semantically coherent chunks using embedding-based similarity analysis | 81 | | `NeuralChunker` | Uses neural networks for intelligent, context-aware chunking | 82 | | `CodeChunker` | Splits code into structurally meaningful chunks based on AST parsing | 83 | | `LateChunker` | Recursive chunking with embeddings for enhanced semantic coherence | 84 | 85 | ## Available Refineries 86 | 87 | | Name | Description | 88 | |------|-------------| 89 | | `EmbeddingsRefinery` | Post-processes chunks by adding embeddings using specified embedding model | 90 | | `OverlapRefinery` | Adds contextual overlap between chunks for better coherence | 91 | 92 | For detailed API documentation, configuration options, and advanced usage, see [DOCS.md](./DOCS.md). 93 | 94 | ## Contributing 95 | 96 | Want to help grow Chonkie? Check out [CONTRIBUTING.md](../../CONTRIBUTING.md) to get started! Whether you're fixing bugs, adding features, improving docs, or simply leaving a ⭐️ on the repo, every contribution helps make Chonkie a better CHONK for everyone. 97 | 98 | Remember: No contribution is too small for this tiny hippo! 99 | 100 | ## Acknowledgements 101 | 102 | Chonkie would like to CHONK its way through a special thanks to all the users and contributors who have helped make this library what it is today! Your feedback, issue reports, and improvements have helped make Chonkie the CHONKIEST it can be. 103 | 104 | And of course, special thanks to [Moto Moto](https://www.youtube.com/watch?v=I0zZC4wtqDQ&t=5s) for endorsing Chonkie with his famous quote: 105 | > "I like them big, I like them chonkie in TypeScript" ~ Moto Moto... 
definitely did not say this 106 | 107 | ## Citation 108 | 109 | If you use Chonkie in your research, please cite it as follows: 110 | 111 | ```bibtex 112 | @software{chonkie2025, 113 | author = {Bhavnick Minhas and Shreyash Nigam}, 114 | title = {Chonkie: A no-nonsense fast, lightweight, and efficient text chunking library}, 115 | year = {2025}, 116 | publisher = {GitHub}, 117 | howpublished = {\url{https://github.com/chonkie-inc}}, 118 | } 119 | ``` 120 | -------------------------------------------------------------------------------- /packages/cloud/src/base.ts: --------------------------------------------------------------------------------
1 | /**
2 | * Base cloud client for interacting with api.chonkie.ai
3 | */
4 |
5 | import { formatApiError, FileUploadResponse, FileReference, createFileReference } from '@/utils';
6 | import * as fs from 'fs';
7 | import * as path from 'path';
8 | import * as mime from 'mime-types';
9 |
10 | export interface CloudClientConfig {
11 | apiKey: string;
12 | baseUrl?: string;
13 | }
14 |
15 | export interface ChunkerInput {
16 | text?: string;
17 | filepath?: string;
18 | file?: FileReference;
19 | }
20 |
21 | export class CloudBaseChunker {
22 | protected readonly apiKey: string;
23 | protected readonly baseUrl: string;
24 |
25 | constructor(config: CloudClientConfig) {
26 | if (!config.apiKey) {
27 | throw new Error('API key is required');
28 | }
29 | this.apiKey = config.apiKey;
30 | this.baseUrl = config.baseUrl || 'https://api.chonkie.ai';
31 | }
32 |
33 | protected async request<T>(
34 | endpoint: string,
35 | options: {
36 | method?: string;
37 | body?: FormData | Record<string, unknown>;
38 | headers?: Record<string, string>;
39 | } = {}
40 | ): Promise<T> {
41 | const { method = 'POST', body, headers = {} } = options;
42 |
43 | const isFormData = body instanceof FormData;
44 | const requestHeaders: Record<string, string> = {
45 | 'Authorization': `Bearer ${this.apiKey}`,
46 | ...headers,
47 | };
48 |
49 | // Don't set Content-Type for FormData
50 | if (!isFormData && body) {
51 | requestHeaders['Content-Type'] = 'application/json';
52 | }
53 |
54 | const response = await fetch(`${this.baseUrl}${endpoint}`, {
55 | method,
56 | headers: requestHeaders,
57 | body: isFormData ? body : (body ? JSON.stringify(body) : undefined),
58 | });
59 |
60 | if (!response.ok) {
61 | const errorText = await response.text();
62 | let errorMessage = response.statusText || 'Unknown error';
63 |
64 | try {
65 | const errorJson = JSON.parse(errorText) as { message?: string; error?: string; detail?: string };
66 | errorMessage = errorJson.message || errorJson.error || errorJson.detail || errorMessage;
67 | } catch {
68 | if (errorText) {
69 | errorMessage = errorText;
70 | }
71 | }
72 |
73 | const formattedError = formatApiError(response.status, errorMessage, endpoint);
74 | throw new Error(formattedError);
75 | }
76 |
77 | return response.json() as Promise<T>;
78 | }
79 |
80 | async validateAuth(): Promise<boolean> {
81 | try {
82 | const response = await this.request<{ message: string; status: number }>('/v1/auth/validate', {
83 | method: 'GET'
84 | });
85 | return response.status === 200;
86 | } catch (error) {
87 | return false;
88 | }
89 | }
90 |
91 | /**
92 | * Upload a file to the Chonkie API for OCR/document processing.
93 | * This is an internal method used by chunkers to upload files before chunking.
94 | *
95 | * @param filepath - Path to the file to upload
96 | * @returns FileReference object that can be used in subsequent API calls
97 | * @internal
98 | */
99 | protected async uploadFile(filepath: string): Promise<FileReference> {
100 | if (!filepath) {
101 | throw new Error('File path is required');
102 | }
103 |
104 | if (!fs.existsSync(filepath)) {
105 | throw new Error(`File not found: ${filepath}`);
106 | }
107 |
108 | const formData = new FormData();
109 | const fileContent = fs.readFileSync(filepath);
110 | const fileName = path.basename(filepath);
111 |
112 | // Detect MIME type from file extension
113 | const mimeType = mime.lookup(fileName) || 'application/octet-stream';
114 | const blob = new Blob([fileContent], { type: mimeType });
115 | formData.append('file', blob, fileName);
116 |
117 | const response = await this.request<FileUploadResponse>('/v1/files', {
118 | method: 'POST',
119 | body: formData,
120 | });
121 |
122 | // The API might return different field names, check common variations
123 | const documentName = response.document || (response as Record<string, unknown>).filename || (response as Record<string, unknown>).name || (response as Record<string, unknown>).id;
124 |
125 | if (!documentName || typeof documentName !== 'string') {
126 | throw new Error(`Invalid file upload response: missing document identifier. Response: ${JSON.stringify(response)}`);
127 | }
128 |
129 | // Return a FileReference with type 'document' and the document name
130 | return createFileReference('document', documentName);
131 | }
132 | }
133 | -------------------------------------------------------------------------------- /legacy/chonkie/utils/hub.ts: --------------------------------------------------------------------------------
1 | import { downloadFile, RepoType } from '@huggingface/hub';
2 | import * as fs from 'fs';
3 | import * as path from 'path';
4 | import * as jsonschema from 'jsonschema';
5 |
6 | /**
7 | * Hubbie is a Huggingface hub manager for Chonkie.
8 | */
9 | export class Hubbie {
10 | private static readonly SCHEMA_VERSION = "v1";
11 | private readonly getRecipeConfig: {
12 | repo: string;
13 | subfolder: string;
14 | repoType: RepoType;
15 | };
16 | private readonly recipeSchema: Promise<Record<string, any>>;
17 |
18 | constructor() {
19 | // Define the path to the recipes
20 | this.getRecipeConfig = {
21 | repo: "chonkie-ai/recipes",
22 | subfolder: "recipes",
23 | repoType: "dataset" as RepoType,
24 | };
25 |
26 | // Start fetching the current recipe schema from the hub; it is awaited on first validation
27 | this.recipeSchema = this.getRecipeSchema();
28 | }
29 |
30 | /**
31 | * Get the current recipe schema from the hub.
32 | */
33 | private async getRecipeSchema(): Promise<Record<string, any>> {
34 | const schemaBlob = await downloadFile({
35 | repo: {
36 | name: "chonkie-ai/recipes",
37 | type: "dataset" as RepoType,
38 | },
39 | path: `${Hubbie.SCHEMA_VERSION}.schema.json`,
40 | });
41 |
42 | if (!schemaBlob) {
43 | throw new Error("Failed to download schema file");
44 | }
45 |
46 | const schemaContent = await schemaBlob.text();
47 | return JSON.parse(schemaContent);
48 | }
49 |
50 | /**
51 | * Validate a recipe against the current schema. Throws if the recipe is invalid.
52 | */
53 | private async validateRecipe(recipe: Record<string, any>): Promise<boolean> {
54 | const schema = await this.recipeSchema;
55 | const result = jsonschema.validate(recipe, schema);
56 | if (!result.valid) {
57 | throw new Error(`Recipe is invalid. Please check the recipe and try again. Error: ${result.errors.map(e => e.stack).join('; ')}`);
58 | }
59 | return true;
60 | }
61 |
62 | /**
63 | * Get a recipe from the hub.
64 | *
65 | * @param name - The name of the recipe to get
66 | * @param language - The language of the recipe to get
67 | * @param filePath - Optionally, provide the path to the recipe
68 | * @returns The recipe
69 | * @throws Error if the recipe is not found or invalid
70 | */
71 | public async getRecipe(
72 | name: string = 'default',
73 | language: string = 'en',
74 | filePath?: string
75 | ): Promise<Record<string, any>> {
76 | // Check if either (name & language) or path is provided
77 | if ((!name || !language) && !filePath) {
78 | throw new Error("Either (name & language) or path must be provided.");
79 | }
80 |
81 | let recipeContent: string;
82 |
83 | // If path is not provided, download the recipe from the hub
84 | if (!filePath && name && language) {
85 | try {
86 | const recipeBlob = await downloadFile({
87 | repo: {
88 | name: this.getRecipeConfig.repo,
89 | type: this.getRecipeConfig.repoType,
90 | },
91 | path: `${this.getRecipeConfig.subfolder}/${name}_${language}.json`,
92 | });
93 |
94 | if (!recipeBlob) {
95 | throw new Error(`Could not download recipe '${name}_${language}'`);
96 | }
97 |
98 | recipeContent = await recipeBlob.text();
99 | } catch (error) {
100 | throw new Error(`Could not download recipe '${name}_${language}'. Ensure name and language are correct or provide a valid path. Error: ${error}`);
101 | }
102 | } else {
103 | // Read from local file
104 | try {
105 | recipeContent = fs.readFileSync(filePath!, 'utf-8');
106 | } catch (error) {
107 | throw new Error(`Failed to read the file ${filePath}. Please check if the file exists and if the path is correct. Error: ${error}`);
108 | }
109 | }
110 |
111 | // Parse the recipe
112 | let recipe: Record<string, any>;
113 | try {
114 | recipe = JSON.parse(recipeContent);
115 | } catch (error) {
116 | throw new Error(`Failed to parse recipe JSON. Error: ${error}`);
117 | }
118 |
119 | // Validate the recipe (throws if invalid)
120 | await this.validateRecipe(recipe);
121 |
122 | return recipe;
123 | }
124 | }
125 | -------------------------------------------------------------------------------- /packages/cloud/tests/pipeline.test.ts: --------------------------------------------------------------------------------
1 | import { Pipeline, PipelineStep } from '../src';
2 |
3 | const TEST_SLUG = `test-pipeline-${Date.now().toString(36)}`;
4 | const BASE_URL = process.env.CHONKIE_BASE_URL || 'https://api.chonkie.ai';
5 |
6 | describe.skipIf(!process.env.CHONKIE_API_KEY)('Pipeline', () => {
7 | // Clean up after all tests
8 | afterAll(async () => {
9 | try {
10 | const pipeline = await Pipeline.get(TEST_SLUG, { baseUrl: BASE_URL });
11 | await pipeline.delete();
12 | } catch {
13 | // Pipeline may not exist, ignore
14 | }
15 | });
16 |
17 | it('should create a pipeline and run it', async () => {
18 | const pipeline = new Pipeline({
19 | slug: TEST_SLUG,
20 | description: 'Test pipeline for unit tests',
21 | baseUrl: BASE_URL,
22 | });
23 |
24 | pipeline
25 | .chunkWith('recursive', { chunk_size: 256 })
26 | .refineWith('overlap', { context_size: 32 });
27 |
28 | expect(pipeline.slug).toBe(TEST_SLUG);
29 | expect(pipeline.isSaved).toBe(false);
30 |
31 | // Run pipeline (auto-saves)
32 | const chunks = await pipeline.run({
33 | text: 'This is a test document.
It contains multiple sentences for chunking.', 34 | }); 35 | 36 | expect(pipeline.isSaved).toBe(true); 37 | expect(chunks.length).toBeGreaterThan(0); 38 | expect(chunks[0]).toHaveProperty('text'); 39 | expect(chunks[0]).toHaveProperty('tokenCount'); 40 | }); 41 | 42 | it('should fetch an existing pipeline', async () => { 43 | const pipeline = await Pipeline.get(TEST_SLUG, { baseUrl: BASE_URL }); 44 | 45 | expect(pipeline.slug).toBe(TEST_SLUG); 46 | expect(pipeline.isSaved).toBe(true); 47 | expect(pipeline.steps.length).toBeGreaterThan(0); 48 | }); 49 | 50 | it('should list pipelines', async () => { 51 | const pipelines = await Pipeline.list({ baseUrl: BASE_URL }); 52 | 53 | expect(Array.isArray(pipelines)).toBe(true); 54 | 55 | // Should include our test pipeline 56 | const found = pipelines.find(p => p.slug === TEST_SLUG); 57 | expect(found).toBeDefined(); 58 | }); 59 | 60 | it('should update a pipeline', async () => { 61 | const pipeline = await Pipeline.get(TEST_SLUG, { baseUrl: BASE_URL }); 62 | 63 | // Modify steps 64 | pipeline.reset().chunkWith('sentence', { chunk_size: 128 }); 65 | 66 | await pipeline.update({ description: 'Updated description' }); 67 | 68 | expect(pipeline.description).toBe('Updated description'); 69 | }); 70 | 71 | it('should validate pipeline configuration', async () => { 72 | const validSteps: PipelineStep[] = [ 73 | { type: 'chunk', component: 'recursive', chunk_size: 256 }, 74 | ]; 75 | 76 | const result = await Pipeline.validate(validSteps, { baseUrl: BASE_URL }); 77 | 78 | expect(result.valid).toBe(true); 79 | expect(result.errors).toBeNull(); 80 | }); 81 | 82 | it('should reject invalid slug format', () => { 83 | expect(() => { 84 | new Pipeline({ slug: 'Invalid Slug!' }); 85 | }).toThrow(/Invalid slug/); 86 | }); 87 | 88 | it('should describe pipeline steps', () => { 89 | const pipeline = new Pipeline({ slug: 'desc-test' }); 90 | 91 | expect(pipeline.describe()).toBe('Empty pipeline'); 92 | 93 | pipeline 94 | .chunkWith('recursive') 95 | .refineWith('overlap'); 96 | 97 | expect(pipeline.describe()).toBe('chunk(recursive) -> refine(overlap)'); 98 | }); 99 | 100 | it('should export configuration', () => { 101 | const pipeline = new Pipeline({ slug: 'config-test' }); 102 | 103 | pipeline 104 | .chunkWith('token', { chunk_size: 512 }) 105 | .refineWith('embeddings', { embedding_model: 'test-model' }); 106 | 107 | const config = pipeline.toConfig(); 108 | 109 | expect(config).toHaveLength(2); 110 | expect(config[0]).toEqual({ 111 | type: 'chunk', 112 | component: 'token', 113 | chunk_size: 512, 114 | }); 115 | expect(config[1]).toEqual({ 116 | type: 'refine', 117 | component: 'embeddings', 118 | embedding_model: 'test-model', 119 | }); 120 | }); 121 | 122 | it('should delete a pipeline', async () => { 123 | // Create a temporary pipeline 124 | const tempSlug = `temp-${Date.now().toString(36)}`; 125 | const pipeline = new Pipeline({ 126 | slug: tempSlug, 127 | description: 'Temporary pipeline', 128 | baseUrl: BASE_URL, 129 | }); 130 | 131 | pipeline.chunkWith('token', { chunk_size: 256 }); 132 | await pipeline.run({ text: 'Test' }); 133 | 134 | expect(pipeline.isSaved).toBe(true); 135 | 136 | await pipeline.delete(); 137 | 138 | expect(pipeline.isSaved).toBe(false); 139 | 140 | // Verify it's actually deleted 141 | await expect(Pipeline.get(tempSlug, { baseUrl: BASE_URL })).rejects.toThrow(/not found/); 142 | }); 143 | }); 144 | -------------------------------------------------------------------------------- /packages/cloud/src/chunkers/sentence.ts: 
-------------------------------------------------------------------------------- 
1 | /**
2 | * Sentence chunker that splits text into sentence-based chunks
3 | * via api.chonkie.ai
4 | */
5 |
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 |
9 | export interface SentenceChunkerOptions {
10 | /** Tokenizer to use (default: "gpt2") */
11 | tokenizer?: string;
12 | /** Maximum tokens per chunk (default: 512) */
13 | chunkSize?: number;
14 | /** Number of tokens to overlap between chunks (default: 0) */
15 | chunkOverlap?: number;
16 | /** Minimum sentences per chunk (default: 1) */
17 | minSentencesPerChunk?: number;
18 | /** Minimum characters per sentence (default: 12) */
19 | minCharactersPerSentence?: number;
20 | /** Use approximate token counting (default: false) */
21 | approximate?: boolean;
22 | /** Sentence delimiters (default: [".", "!", "?", "\n"]) */
23 | delim?: string | string[];
24 | /** Where to include delimiter (default: "prev") */
25 | includeDelim?: 'prev' | 'next' | null;
26 | /** API key (reads from CHONKIE_API_KEY env var if not provided) */
27 | apiKey?: string;
28 | /** Base URL for API (default: "https://api.chonkie.ai") */
29 | baseUrl?: string;
30 | }
31 |
32 | interface ApiChunkResponse {
33 | text: string;
34 | start_index: number;
35 | end_index: number;
36 | token_count: number;
37 | }
38 |
39 | interface SentenceChunkPayload extends Record<string, unknown> {
40 | text?: string;
41 | file?: { type: string; content: string };
42 | tokenizer_or_token_counter: string;
43 | chunk_size: number;
44 | chunk_overlap: number;
45 | min_sentences_per_chunk: number;
46 | min_characters_per_sentence: number;
47 | approximate: boolean;
48 | delim: string | string[];
49 | include_delim: string;
50 | return_type: string;
51 | }
52 |
53 | export class SentenceChunker extends CloudBaseChunker {
54 | private readonly config: {
55 | tokenizer: string;
56 | chunkSize: number;
57 | chunkOverlap: number;
58 | minSentencesPerChunk: number;
59 | minCharactersPerSentence: number;
60 | approximate: boolean;
61 | delim: string | string[];
62 | includeDelim: 'prev' | 'next' | null;
63 | };
64 |
65 | constructor(options: SentenceChunkerOptions = {}) {
66 | const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
67 | if (!apiKey) {
68 | throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
69 | }
70 |
71 | super({ apiKey, baseUrl: options.baseUrl });
72 |
73 | this.config = {
74 | tokenizer: options.tokenizer || 'gpt2',
75 | chunkSize: options.chunkSize || 512,
76 | chunkOverlap: options.chunkOverlap || 0,
77 | minSentencesPerChunk: options.minSentencesPerChunk || 1,
78 | minCharactersPerSentence: options.minCharactersPerSentence || 12,
79 | approximate: options.approximate ?? false,
80 | delim: options.delim || ['.', '!', '?', '\n'],
81 | includeDelim: options.includeDelim ?? 'prev',
82 | };
83 | }
84 |
85 | async chunk(input: ChunkerInput): Promise<Chunk[]> {
86 | let fileRef = input.file;
87 |
88 | // If filepath is provided, upload it first to get a file reference
89 | if (input.filepath) {
90 | fileRef = await this.uploadFile(input.filepath);
91 | }
92 |
93 | // Build the payload
94 | const payload: SentenceChunkPayload = {
95 | tokenizer_or_token_counter: this.config.tokenizer,
96 | chunk_size: this.config.chunkSize,
97 | chunk_overlap: this.config.chunkOverlap,
98 | min_sentences_per_chunk: this.config.minSentencesPerChunk,
99 | min_characters_per_sentence: this.config.minCharactersPerSentence,
100 | approximate: this.config.approximate,
101 | delim: this.config.delim,
102 | include_delim: this.config.includeDelim || 'prev',
103 | return_type: 'chunks',
104 | };
105 |
106 | // Add either text or file to the payload
107 | if (fileRef) {
108 | payload.file = fileRef;
109 | } else if (input.text) {
110 | payload.text = input.text;
111 | } else {
112 | throw new Error('Either text, filepath, or file must be provided');
113 | }
114 |
115 | const data = await this.request<ApiChunkResponse[]>('/v1/chunk/sentence', {
116 | method: 'POST',
117 | body: payload,
118 | });
119 |
120 | return data.map(chunk => new Chunk({
121 | text: chunk.text,
122 | startIndex: chunk.start_index,
123 | endIndex: chunk.end_index,
124 | tokenCount: chunk.token_count,
125 | }));
126 | }
127 |
128 | async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
129 | return Promise.all(inputs.map(input => this.chunk(input)));
130 | }
131 |
132 | toString(): string {
133 | return `SentenceChunker(tokenizer=${this.config.tokenizer}, chunkSize=${this.config.chunkSize})`;
134 | }
135 | }
136 | -------------------------------------------------------------------------------- /packages/core/src/types.ts: --------------------------------------------------------------------------------
1 | /**
2 | * Base chunk type representing a piece of text with metadata.
3 | */
4 | export class Chunk {
5 | /** The text content of the chunk */
6 | public text: string;
7 | /** The starting index of the chunk in the original text */
8 | public startIndex: number;
9 | /** The ending index of the chunk in the original text */
10 | public endIndex: number;
11 | /** The number of tokens in the chunk */
12 | public tokenCount: number;
13 | /** Optional embedding vector for the chunk */
14 | public embedding?: number[];
15 |
16 | constructor(data: {
17 | text: string;
18 | startIndex: number;
19 | endIndex: number;
20 | tokenCount: number;
21 | embedding?: number[];
22 | }) {
23 | this.text = data.text;
24 | this.startIndex = data.startIndex;
25 | this.endIndex = data.endIndex;
26 | this.tokenCount = data.tokenCount;
27 | this.embedding = data.embedding;
28 |
29 | if (this.startIndex > this.endIndex) {
30 | throw new Error('Start index must be less than or equal to end index');
31 | }
32 | if (this.tokenCount < 0) {
33 | throw new Error('Token count must be non-negative');
34 | }
35 | }
36 |
37 | /**
38 | * Get a string representation of the chunk.
39 | */
40 | toString(): string {
41 | return this.text;
42 | }
43 | }
44 |
45 | /**
46 | * Type for specifying where delimiters should be included in chunks.
47 | */
48 | export type IncludeDelim = 'prev' | 'next' | 'none';
49 |
50 | /**
51 | * Configuration for a single level in the recursive chunking hierarchy.
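 *
 * @example
 * // Added sketch (not in the original source): a level that splits on
 * // sentence-ending punctuation and keeps each delimiter with the preceding
 * // piece, mirroring the defaults used by RecursiveRules below.
 * const sentenceLevel: RecursiveLevelConfig = {
 *   delimiters: ['. ', '! ', '? '],
 *   includeDelim: 'prev',
 * };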
52 | */ 53 | export interface RecursiveLevelConfig { 54 | /** Delimiters to split on at this level */ 55 | delimiters?: string | string[]; 56 | /** Whether to use whitespace as the delimiter */ 57 | whitespace?: boolean; 58 | /** Where to include the delimiter in the resulting chunks */ 59 | includeDelim?: IncludeDelim; 60 | } 61 | 62 | /** 63 | * Represents one level in the recursive chunking hierarchy. 64 | */ 65 | export class RecursiveLevel { 66 | public delimiters?: string | string[]; 67 | public whitespace: boolean; 68 | public includeDelim: IncludeDelim; 69 | 70 | constructor(config: RecursiveLevelConfig = {}) { 71 | this.delimiters = config.delimiters; 72 | this.whitespace = config.whitespace ?? false; 73 | this.includeDelim = config.includeDelim ?? 'prev'; 74 | 75 | this.validate(); 76 | } 77 | 78 | private validate(): void { 79 | if (this.delimiters !== undefined && this.whitespace) { 80 | throw new Error('Cannot use both custom delimiters and whitespace'); 81 | } 82 | if (this.delimiters !== undefined) { 83 | if (typeof this.delimiters === 'string' && this.delimiters.length === 0) { 84 | throw new Error('Delimiter cannot be empty string'); 85 | } 86 | if (Array.isArray(this.delimiters)) { 87 | if (this.delimiters.some(d => typeof d !== 'string' || d.length === 0)) { 88 | throw new Error('Delimiter cannot be empty string'); 89 | } 90 | if (this.delimiters.includes(' ')) { 91 | throw new Error('Use whitespace option instead of space delimiter'); 92 | } 93 | } 94 | } 95 | } 96 | 97 | toString(): string { 98 | return `RecursiveLevel(delimiters=${JSON.stringify(this.delimiters)}, whitespace=${this.whitespace}, includeDelim=${this.includeDelim})`; 99 | } 100 | } 101 | 102 | /** 103 | * Configuration for recursive chunking rules. 104 | */ 105 | export interface RecursiveRulesConfig { 106 | /** Array of levels to use for recursive chunking */ 107 | levels?: RecursiveLevelConfig[]; 108 | } 109 | 110 | /** 111 | * Defines the hierarchy of rules for recursive text chunking. 112 | * 113 | * Default hierarchy: 114 | * 1. Paragraphs (split on \n\n, \r\n, \n, \r) 115 | * 2. Sentences (split on . ! ?) 116 | * 3. Pauses (split on punctuation/symbols) 117 | * 4. Words (split on whitespace) 118 | * 5. Characters (token-level splitting) 119 | */ 120 | export class RecursiveRules { 121 | public levels: RecursiveLevel[]; 122 | 123 | constructor(config: RecursiveRulesConfig = {}) { 124 | if (config.levels === undefined) { 125 | // Default hierarchy 126 | this.levels = [ 127 | new RecursiveLevel({ delimiters: ['\n\n', '\r\n', '\n', '\r'] }), // Paragraphs 128 | new RecursiveLevel({ delimiters: ['. ', '! ', '? 
'] }), // Sentences
129 | new RecursiveLevel({
130 | delimiters: [
131 | '{', '}', '"', '[', ']', '<', '>', '(', ')', ':', ';', ',',
132 | '—', '|', '~', '-', '...', '`', "'"
133 | ]
134 | }), // Pauses
135 | new RecursiveLevel({ whitespace: true }), // Words
136 | new RecursiveLevel() // Characters/tokens
137 | ];
138 | } else {
139 | this.levels = config.levels.map(levelConfig => new RecursiveLevel(levelConfig));
140 | }
141 | }
142 |
143 | get length(): number {
144 | return this.levels.length;
145 | }
146 |
147 | getLevel(index: number): RecursiveLevel | undefined {
148 | return this.levels[index];
149 | }
150 |
151 | toString(): string {
152 | return `RecursiveRules(${this.levels.length} levels)`;
153 | }
154 | }
155 | -------------------------------------------------------------------------------- /packages/cloud/src/chunkers/semantic.ts: --------------------------------------------------------------------------------
1 | /**
2 | * Semantic chunker that uses embeddings to create semantically coherent chunks
3 | * via api.chonkie.ai
4 | */
5 |
6 | import { Chunk } from '@chonkiejs/core';
7 | import { CloudBaseChunker, ChunkerInput } from '@/base';
8 |
9 | export interface SemanticChunkerOptions {
10 | /** Embedding model to use (default: "minishlab/potion-base-8M") */
11 | embeddingModel?: string;
12 | /** Similarity threshold for chunking (default: 0.5) */
13 | threshold?: number;
14 | /** Maximum tokens per chunk (default: 512) */
15 | chunkSize?: number;
16 | /** Window size for similarity comparison (default: 1) */
17 | similarityWindow?: number;
18 | /** Minimum sentences per chunk (default: 1) */
19 | minSentences?: number;
20 | /** Minimum chunk size (default: 2) */
21 | minChunkSize?: number;
22 | /** Minimum characters per sentence (default: 12) */
23 | minCharactersPerSentence?: number;
24 | /** Step size for threshold adjustment (default: 0.01) */
25 | thresholdStep?: number;
26 | /** Sentence delimiters (default: [".", "!", "?", "\n"]) */
27 | delim?: string | string[];
28 | /** Where to include delimiter (default: "prev") */
29 | includeDelim?: 'prev' | 'next' | null;
30 | /** API key (reads from CHONKIE_API_KEY env var if not provided) */
31 | apiKey?: string;
32 | /** Base URL for API (default: "https://api.chonkie.ai") */
33 | baseUrl?: string;
34 | }
35 |
36 | interface ApiChunkResponse {
37 | text: string;
38 | start_index: number;
39 | end_index: number;
40 | token_count: number;
41 | }
42 |
43 | interface SemanticChunkPayload extends Record<string, unknown> {
44 | text?: string;
45 | file?: { type: string; content: string };
46 | embedding_model: string;
47 | threshold: number;
48 | chunk_size: number;
49 | similarity_window: number;
50 | min_sentences: number;
51 | min_chunk_size: number;
52 | min_characters_per_sentence: number;
53 | threshold_step: number;
54 | delim: string | string[];
55 | include_delim: string;
56 | return_type: string;
57 | }
58 |
59 | export class SemanticChunker extends CloudBaseChunker {
60 | private readonly config: {
61 | embeddingModel: string;
62 | threshold: number;
63 | chunkSize: number;
64 | similarityWindow: number;
65 | minSentences: number;
66 | minChunkSize: number;
67 | minCharactersPerSentence: number;
68 | thresholdStep: number;
69 | delim: string | string[];
70 | includeDelim: 'prev' | 'next' | null;
71 | };
72 |
73 | constructor(options: SemanticChunkerOptions = {}) {
74 | const apiKey = options.apiKey || process.env.CHONKIE_API_KEY;
75 | if (!apiKey) {
76 | throw new Error('API key is required. Provide it in options.apiKey or set CHONKIE_API_KEY environment variable.');
77 | }
78 |
79 | super({ apiKey, baseUrl: options.baseUrl });
80 |
81 | this.config = {
82 | embeddingModel: options.embeddingModel || 'minishlab/potion-base-8M',
83 | threshold: options.threshold ?? 0.5,
84 | chunkSize: options.chunkSize || 512,
85 | similarityWindow: options.similarityWindow || 1,
86 | minSentences: options.minSentences || 1,
87 | minChunkSize: options.minChunkSize || 2,
88 | minCharactersPerSentence: options.minCharactersPerSentence || 12,
89 | thresholdStep: options.thresholdStep || 0.01,
90 | delim: options.delim || ['.', '!', '?', '\n'],
91 | includeDelim: options.includeDelim ?? 'prev',
92 | };
93 | }
94 |
95 | async chunk(input: ChunkerInput): Promise<Chunk[]> {
96 | let fileRef = input.file;
97 |
98 | // If filepath is provided, upload it first to get a file reference
99 | if (input.filepath) {
100 | fileRef = await this.uploadFile(input.filepath);
101 | }
102 |
103 | // Build the payload
104 | const payload: SemanticChunkPayload = {
105 | embedding_model: this.config.embeddingModel,
106 | threshold: this.config.threshold,
107 | chunk_size: this.config.chunkSize,
108 | similarity_window: this.config.similarityWindow,
109 | min_sentences: this.config.minSentences,
110 | min_chunk_size: this.config.minChunkSize,
111 | min_characters_per_sentence: this.config.minCharactersPerSentence,
112 | threshold_step: this.config.thresholdStep,
113 | delim: this.config.delim,
114 | include_delim: this.config.includeDelim || 'prev',
115 | return_type: 'chunks',
116 | };
117 |
118 | // Add either text or file to the payload
119 | if (fileRef) {
120 | payload.file = fileRef;
121 | } else if (input.text) {
122 | payload.text = input.text;
123 | } else {
124 | throw new Error('Either text, filepath, or file must be provided');
125 | }
126 |
127 | const data = await this.request<ApiChunkResponse[]>('/v1/chunk/semantic', {
128 | method: 'POST',
129 | body: payload,
130 | });
131 |
132 | return data.map(chunk => new Chunk({
133 | text: chunk.text,
134 | startIndex: chunk.start_index,
135 | endIndex: chunk.end_index,
136 | tokenCount: chunk.token_count,
137 | }));
138 | }
139 |
140 | async chunkBatch(inputs: ChunkerInput[]): Promise<Chunk[][]> {
141 | return Promise.all(inputs.map(input => this.chunk(input)));
142 | }
143 |
144 | toString(): string {
145 | return `SemanticChunker(embeddingModel=${this.config.embeddingModel}, threshold=${this.config.threshold})`;
146 | }
147 | }
148 | -------------------------------------------------------------------------------- /legacy/chonkie/chunker/base.ts: --------------------------------------------------------------------------------
1 | /** Base Chunking Class. **/
2 |
3 | import { Tokenizer } from "../tokenizer";
4 | import { Chunk } from "../types/base";
5 |
6 | /**
7 | * Base class for all chunking classes.
8 | *
9 | * This abstract class provides a common interface and shared logic for all chunking implementations.
10 | * It supports chunking a single text or a batch of texts, with optional concurrency and progress reporting.
11 | *
12 | * Subclasses must implement the `chunk` method to define how a single text is chunked.
13 | *
14 | * Implementations produce `Chunk[]` for a single text and `Chunk[][]` for a batch of texts.
15 | *
16 | * @property {Tokenizer} tokenizer - The tokenizer instance used for chunking operations.
17 | * @property {boolean} _useConcurrency - Whether to use concurrent processing for batch chunking (default: true).
18 | *
19 | * @example
20 | * class MyChunker extends BaseChunker {
21 | *   async chunk(text: string): Promise<Chunk[]> {
22 | *     // ... implementation ...
23 | *   }
24 | * }
25 | *
26 | * const chunker = new MyChunker(tokenizer);
27 | * const chunks = await chunker.call("Some text");
28 | * const batchChunks = await chunker.call(["Text 1", "Text 2"], true);
29 | */
30 | export abstract class BaseChunker {
31 | protected tokenizer: Tokenizer;
32 | protected _useConcurrency: boolean = true; // Determines if batch processing uses Promise.all
33 |
34 | constructor(tokenizer: Tokenizer) {
35 | this.tokenizer = tokenizer;
36 | }
37 |
38 | /**
39 | * Returns a string representation of the chunker instance.
40 | *
41 | * @returns {string} The class name and constructor signature.
42 | */
43 | public toString(): string {
44 | return `${this.constructor.name}()`;
45 | }
46 |
47 | /**
48 | * Call the chunker with a single string or an array of strings.
49 | *
50 | * If a single string is provided, returns the result of `chunk(text)`.
51 | * If an array of strings is provided, returns the result of `chunkBatch(texts, showProgress)`.
52 | *
53 | * @param {string | string[]} textOrTexts - The text or array of texts to chunk.
54 | * @param {boolean} [showProgress=false] - Whether to display progress for batch operations (only applies to arrays).
55 | * @returns {Promise<Chunk[] | Chunk[][]>} The chunked result(s).
56 | * @throws {Error} If input is not a string or array of strings.
57 | */
58 | public async call(text: string, showProgress?: boolean): Promise<Chunk[]>;
59 | public async call(texts: string[], showProgress?: boolean): Promise<Chunk[][]>;
60 | public async call(
61 | textOrTexts: string | string[],
62 | showProgress: boolean = false
63 | ): Promise<Chunk[] | Chunk[][]> {
64 | if (typeof textOrTexts === 'string') {
65 | return this.chunk(textOrTexts) as Promise<Chunk[]>;
66 | } else if (Array.isArray(textOrTexts)) {
67 | return this.chunkBatch(textOrTexts, showProgress) as Promise<Chunk[][]>;
68 | } else {
69 | // This case should ideally not be reached due to TypeScript's type checking
70 | // if the public overloads are used correctly.
71 | throw new Error("Input must be a string or an array of strings.");
72 | }
73 | }
74 |
75 | /**
76 | * Process a batch of texts sequentially (one after another).
77 | *
78 | * @protected
79 | * @param {string[]} texts - The texts to chunk.
80 | * @param {boolean} [showProgress=false] - Whether to display progress in the console.
81 | * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
82 | */
83 | protected async _sequential_batch_processing(
84 | texts: string[],
85 | showProgress: boolean = false
86 | ): Promise<Chunk[][]> {
87 | const results: Chunk[][] = [];
88 | const total = texts.length;
89 | for (let i = 0; i < total; i++) {
90 | if (showProgress && total > 1) {
91 | const progress = Math.round(((i + 1) / total) * 100);
92 | process.stdout.write(`Sequential processing: Document ${i + 1}/${total} (${progress}%)\r`);
93 | }
94 | results.push(await this.chunk(texts[i]));
95 | }
96 | if (showProgress && total > 1) {
97 | process.stdout.write("\n"); // Newline after progress
98 | }
99 | return results;
100 | }
101 |
102 | /**
103 | * Process a batch of texts concurrently using Promise.all.
104 | *
105 | * @protected
106 | * @param {string[]} texts - The texts to chunk.
107 | * @param {boolean} [showProgress=false] - Whether to display progress in the console.
108 | * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
109 | */
110 | protected async _concurrent_batch_processing(
111 | texts: string[],
112 | showProgress: boolean = false
113 | ): Promise<Chunk[][]> {
114 | const total = texts.length;
115 | let completedCount = 0;
116 |
117 | const updateProgress = () => {
118 | if (showProgress && total > 1) {
119 | completedCount++;
120 | const progress = Math.round((completedCount / total) * 100);
121 | process.stdout.write(`Concurrent processing: Document ${completedCount}/${total} (${progress}%)\r`);
122 | }
123 | };
124 |
125 | const chunkPromises = texts.map(text =>
126 | this.chunk(text).then(result => {
127 | updateProgress();
128 | return result;
129 | })
130 | );
131 |
132 | const results = await Promise.all(chunkPromises);
133 | if (showProgress && total > 1 && completedCount > 0) { // ensure newline only if progress was shown
134 | process.stdout.write("\n"); // Newline after progress
135 | }
136 | return results;
137 | }
138 |
139 | /**
140 | * Abstract method to chunk a single text. Must be implemented by subclasses.
141 | *
142 | * @param {string} text - The text to chunk.
143 | * @returns {Promise<Chunk[]>} The chunked representation of the input text.
144 | * @abstract
145 | */
146 | public abstract chunk(text: string): Promise<Chunk[]>;
147 |
148 | /**
149 | * Chunk a batch of texts, using either concurrent or sequential processing.
150 | *
151 | * If only one text is provided, processes it directly without batch overhead.
152 | *
153 | * @param {string[]} texts - The texts to chunk.
154 | * @param {boolean} [showProgress=true] - Whether to display progress in the console.
155 | * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
156 | */
157 | public async chunkBatch(
158 | texts: string[],
159 | showProgress: boolean = true
160 | ): Promise<Chunk[][]> {
161 | if (texts.length === 0) {
162 | return [];
163 | }
164 | // If only one text, process it directly without batch overhead, progress not shown for single item.
165 | if (texts.length === 1) {
166 | return [await this.chunk(texts[0])];
167 | }
168 |
169 | // For multiple texts, use selected batch processing strategy
170 | if (this._useConcurrency) {
171 | return this._concurrent_batch_processing(texts, showProgress);
172 | } else {
173 | return this._sequential_batch_processing(texts, showProgress);
174 | }
175 | }
176 | }
177 | -------------------------------------------------------------------------------- /legacy/chonkie/types/sentence.ts: --------------------------------------------------------------------------------
1 | import { Chunk } from './base';
2 |
3 | /**
4 | * Represents the essential data for a sentence within a text.
5 | *
6 | * @property text - The actual sentence string as it appears in the source text.
7 | * @property startIndex - The zero-based index indicating where the sentence starts in the original text.
8 | * @property endIndex - The zero-based index indicating where the sentence ends in the original text (inclusive).
9 | * @property tokenCount - The number of tokens (words or subwords) in the sentence, useful for NLP tasks.
10 | */
11 | export interface SentenceData {
12 | text: string;
13 | startIndex: number;
14 | endIndex: number;
15 | tokenCount: number;
16 | }
17 |
18 | /**
19 | * Class to represent a sentence.
20 | *
21 | * Represents a single sentence within a text, including its text, position, and token count.
22 | *
23 | * @class
24 | * @param {SentenceData} data - The data required to construct a Sentence instance.
25 | * @property {string} text - The text of the sentence.
26 | * @property {number} startIndex - The starting index of the sentence in the original text. 27 | * @property {number} endIndex - The ending index of the sentence in the original text. 28 | * @property {number} tokenCount - The number of tokens in the sentence. 29 | * @property {number[]} [embedding] - The embedding vector for the sentence (array of numbers, or null if not present). 30 | * 31 | * @method toString Returns a string representation of the Sentence. 32 | * @returns {string} 33 | * 34 | * @method toDict Returns the Sentence as a dictionary-like object. 35 | * @returns {SentenceData} 36 | * 37 | * @method static fromDict Creates a Sentence object from a dictionary-like object. 38 | * @param {SentenceData} data - The data to create the Sentence from. 39 | * @returns {Sentence} 40 | */ 41 | export class Sentence { 42 | /** The text of the sentence */ 43 | public text: string; 44 | /** The starting index of the sentence in the original text */ 45 | public startIndex: number; 46 | /** The ending index of the sentence in the original text */ 47 | public endIndex: number; 48 | /** The number of tokens in the sentence */ 49 | public tokenCount: number; 50 | 51 | constructor(data: SentenceData) { 52 | this.text = data.text; 53 | this.startIndex = data.startIndex; 54 | this.endIndex = data.endIndex; 55 | this.tokenCount = data.tokenCount; 56 | } 57 | 58 | /** Return a string representation of the Sentence */ 59 | public toString(): string { 60 | return `Sentence(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount})`; 61 | } 62 | 63 | /** Return the Sentence as a dictionary-like object */ 64 | public toDict(): SentenceData { 65 | return { 66 | text: this.text, 67 | startIndex: this.startIndex, 68 | endIndex: this.endIndex, 69 | tokenCount: this.tokenCount, 70 | }; 71 | } 72 | 73 | /** Create a Sentence object from a dictionary-like object */ 74 | public static fromDict(data: SentenceData): Sentence { 75 | return new Sentence(data); 76 | } 77 | } 78 | 79 | /** 80 | * Represents the essential data for a chunk of sentences within a text. 81 | * 82 | * @property text - The combined text of all sentences in the chunk as it appears in the source text. 83 | * @property startIndex - The zero-based index indicating where the chunk starts in the original text. 84 | * @property endIndex - The zero-based index indicating where the chunk ends in the original text (inclusive). 85 | * @property tokenCount - The total number of tokens (words or subwords) in the chunk, useful for NLP tasks. 86 | * @property sentences - An array of SentenceData objects, each representing an individual sentence within the chunk. 87 | */ 88 | interface SentenceChunkData { 89 | text: string; 90 | startIndex: number; 91 | endIndex: number; 92 | tokenCount: number; 93 | sentences: SentenceData[]; 94 | embedding?: number[]; 95 | } 96 | 97 | /** 98 | * Represents a chunk of one or more sentences within a text. 99 | * 100 | * A SentenceChunk groups together multiple {@link Sentence} objects, providing their combined text, position, and token count within the original text. 101 | * 102 | * @class 103 | * @extends Chunk 104 | * 105 | * @param {Object} data - Data to construct a SentenceChunk instance. 106 | * @param {string} data.text - Combined text of all sentences in the chunk. 107 | * @param {number} data.startIndex - Zero-based index where the chunk starts in the original text. 
108 | * @param {number} data.endIndex - Zero-based index where the chunk ends in the original text (inclusive).
109 | * @param {number} data.tokenCount - Total number of tokens in the chunk.
110 | * @param {Sentence[]} data.sentences - Array of {@link Sentence} objects in the chunk.
111 | *
112 | * @property {string} text - Combined text of all sentences in the chunk.
113 | * @property {number} startIndex - Starting index of the chunk in the original text.
114 | * @property {number} endIndex - Ending index of the chunk in the original text.
115 | * @property {number} tokenCount - Total number of tokens in the chunk.
116 | * @property {Sentence[]} sentences - List of {@link Sentence} objects in the chunk.
117 | *
118 | * @method toString Returns a detailed string representation of the SentenceChunk, including its text, start and end indices, token count, and a list of all contained sentences with their metadata.
119 | * @method toDict Returns the SentenceChunk as a plain object (see {@link SentenceChunkData}).
120 | * @method static fromDict Creates a SentenceChunk from a {@link SentenceChunkData} object.
121 | */
122 | export class SentenceChunk extends Chunk {
123 | /** List of sentences in the chunk */
124 | public sentences: Sentence[];
125 |
126 | constructor(data: {
127 | text: string;
128 | startIndex: number;
129 | endIndex: number;
130 | tokenCount: number;
131 | sentences: Sentence[];
132 | embedding?: number[];
133 | }) {
134 | super(data);
135 | this.sentences = data.sentences;
136 | this.embedding = data.embedding ?? undefined;
137 | }
138 |
139 | /**
140 | * Returns a detailed string representation of the SentenceChunk, including its text, start and end indices, token count, and a list of all contained sentences with their metadata.
141 | *
142 | * This method overrides the base {@link Chunk} toString method to provide a more informative output, which is especially useful for debugging and logging. Each sentence in the chunk is represented using its own toString method, and all sentences are included in the output.
143 | *
144 | * @returns {string} A string describing the SentenceChunk and all its sentences, e.g.,
145 | * SentenceChunk(text=..., startIndex=..., endIndex=..., tokenCount=..., sentences=[Sentence(...), ...])
146 | */
147 | public toString(): string {
148 | const sentencesStr = this.sentences.map(s => s.toString()).join(', ');
149 | return `SentenceChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, sentences=[${sentencesStr}])`;
150 | }
151 |
152 | /**
153 | * Returns the SentenceChunk as a dictionary-like object.
154 | *
155 | * This method extends the base {@link Chunk} toDict method to include the sentences in the chunk.
156 | *
157 | * @returns {SentenceChunkData} A dictionary-like object containing the chunk's text, start and end indices, token count, and an array of sentence data.
158 | */
159 | public toDict(): SentenceChunkData {
160 | const baseDict = super.toDict();
161 | return {
162 | ...baseDict,
163 | sentences: this.sentences.map(sentence => sentence.toDict()),
164 | };
165 | }
166 |
167 | /**
168 | * Creates a SentenceChunk object from a dictionary-like object.
169 | *
170 | * This method extends the base {@link Chunk} fromDict method to include the sentences in the chunk.
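 *
 * @example
 * // Added sketch (not in the original docs): round-trip a chunk through its
 * // serialized form; `chunk` here is any existing SentenceChunk instance.
 * const restored = SentenceChunk.fromDict(chunk.toDict());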
171 | * 172 | * @param {SentenceChunkData} data - A dictionary-like object containing the chunk's text, start and end indices, token count, and an array of sentence data. 173 | * @returns {SentenceChunk} A new SentenceChunk object created from the provided dictionary-like object. 174 | */ 175 | public static fromDict(data: SentenceChunkData): SentenceChunk { 176 | const sentences = data.sentences.map(sentence => Sentence.fromDict(sentence)); 177 | return new SentenceChunk({ 178 | text: data.text, 179 | startIndex: data.startIndex, 180 | endIndex: data.endIndex, 181 | tokenCount: data.tokenCount, 182 | sentences, 183 | embedding: data.embedding ?? undefined, 184 | }); 185 | } 186 | } --------------------------------------------------------------------------------
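A minimal usage sketch for the sentence types above. The values are illustrative, and the import path assumes the legacy package layout shown in this repository:

```typescript
import { Sentence, SentenceChunk } from './legacy/chonkie/types/sentence';

// Two sentences with hand-picked indices and token counts (illustrative only).
const s1 = new Sentence({ text: 'Hippos are large.', startIndex: 0, endIndex: 16, tokenCount: 4 });
const s2 = new Sentence({ text: 'They love water.', startIndex: 18, endIndex: 33, tokenCount: 4 });

// Group the sentences into a single chunk spanning both of them.
const chunk = new SentenceChunk({
  text: 'Hippos are large. They love water.',
  startIndex: 0,
  endIndex: 33,
  tokenCount: 8,
  sentences: [s1, s2],
});

console.log(chunk.toString());                 // SentenceChunk(text=..., sentences=[Sentence(...), Sentence(...)])
console.log(chunk.toDict().sentences.length);  // 2
```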