├── .gitignore
├── node
│   ├── mod.mts
│   └── parsers.mts
├── tokenizer
│   ├── mod.mts
│   ├── common.mts
│   ├── data.mts
│   ├── parsers.mts
│   ├── gpt.mts
│   └── codex.mts
├── test
│   ├── fixtures
│   │   ├── nested-javascript.js
│   │   ├── single-paragraph.txt
│   │   ├── sample-html.html
│   │   └── multiple-paragraphs.txt
│   ├── common.mts
│   ├── CostEstimator.test.ts
│   └── Tokenizer.test.ts
├── mod.mts
├── models
│   ├── chat-gpt.mts
│   ├── curie.mts
│   ├── ada.mts
│   ├── gpt-4.mts
│   ├── babbage.mts
│   ├── mod.mts
│   ├── davinci.mts
│   └── common.mts
├── tsconfig.json
├── examples
│   └── gpt-token-encoding.mts
├── LICENSE
├── CostEstimator.mts
├── patterns.mts
├── README.md
├── RanksMap.mts
├── BytePairDecoder.mts
├── BytePairTokenMap.mts
├── BytePairEncoding.mts
├── CodePointByteMap.mts
├── package.json
├── EncoderResult.mts
└── BytePairEncoder.mts

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .npmrc
3 | dist
4 | 
--------------------------------------------------------------------------------
/node/mod.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | export * from './parsers.mjs'
9 | 
--------------------------------------------------------------------------------
/tokenizer/mod.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | export * from './codex.mjs'
9 | export * from './common.mjs'
10 | export * from './data.mjs'
11 | export * from './encoder.mjs'
12 | export * from './gpt.mjs'
13 | export * from './parsers.mjs'
14 | export * from './vocab.mjs'
15 | 
--------------------------------------------------------------------------------
/test/fixtures/nested-javascript.js:
--------------------------------------------------------------------------------
1 | function deeplyNested () {
2 |   return {
3 |     the: {
4 |       quick: {
5 |         brown: {
6 |           fox: {
7 |             jumps: {
8 |               over: {
9 |                 the: {
10 |                   lazy: {
11 |                     dog: {
12 |                     }
13 |                   }
14 |                 }
15 |               }
16 |             }
17 |           }
18 |         }
19 |       }
20 |     }
21 |   }
22 | }
--------------------------------------------------------------------------------
/test/fixtures/single-paragraph.txt:
--------------------------------------------------------------------------------
1 | Lorem elit in ullamco deserunt et tempor pariatur do est cupidatat commodo elit ex.
2 | In proident non irure esse nisi quis ullamco.
3 | Quis est sint veniam exercitation et sint enim.
4 | Occaecat officia dolore occaecat sunt minim deserunt.
5 | In voluptate nostrud enim sint voluptate nulla amet adipisicing.
6 | Et cillum quis officia dolore aliqua sint sit non non irure ea tempor.
7 | Quis duis adipisicing esse nostrud do veniam occaecat.
8 | 
--------------------------------------------------------------------------------
/mod.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 | */ 7 | 8 | export * from './BytePairDecoder.mjs' 9 | export * from './BytePairEncoder.mjs' 10 | export * from './BytePairEncoding.mjs' 11 | export * from './BytePairTokenMap.mjs' 12 | export * from './CodePointByteMap.mjs' 13 | export * from './CostEstimator.mjs' 14 | export * from './EncoderResult.mjs' 15 | export * from './models/mod.mjs' 16 | export * from './patterns.mjs' 17 | export * from './RanksMap.mjs' 18 | export * from './tokenizer/mod.mjs' 19 | -------------------------------------------------------------------------------- /models/chat-gpt.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs' 9 | 10 | export const ChatGPTModelFamily: ModelFamily = { 11 | familyID: ModelFamilyIDs.ChatGPT, 12 | tokenLimit: 4096, 13 | mergeSpaces: 0, 14 | modelIDs: ['gpt-3.5-turbo-0301', 'gpt-3.5-turbo'], 15 | preferredModelID: 'gpt-3.5-turbo', 16 | pricing: { 17 | usage: 0.002, 18 | prompt: null, 19 | completion: null, 20 | fineTunedTraining: null, 21 | fineTunedUsage: null, 22 | }, 23 | } 24 | -------------------------------------------------------------------------------- /test/fixtures/sample-html.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |

Hello world!

11 | 12 | 19 | 20 |

Lorem ipsum dolor sit amet consectetur adipi

21 | 24 | 25 | 26 | 
--------------------------------------------------------------------------------
/tokenizer/common.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | /**
9 |  * Serialized vocabulary. Used with the bundled vocabulary.
10 |  * @internal
11 |  * @ignore
12 |  */
13 | export type BundledVocab = readonly string[]
14 | 
15 | /**
16 |  * Serialized encoder keys. Used with the bundled encoder.
17 |  * @internal
18 |  * @ignore
19 |  */
20 | export type BundledEncoderKeys = readonly string[]
21 | 
22 | /**
23 |  * Serialized encoder values. Used with the bundled encoder.
24 |  * @internal
25 |  * @ignore
26 |  */
27 | export type BundledEncoderValues = readonly number[]
28 | 
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   // This file is not used in compilation. It is here just for a nice editor experience.
3 | 
4 |   "compilerOptions": {
5 |     "allowJs": true,
6 |     "jsx": "react",
7 |     "lib": [
8 |       "dom",
9 |       "dom.iterable",
10 |       "esnext"
11 |     ],
12 |     "sourceMap": false,
13 |     "target": "ESNext",
14 |     "module": "ES2020",
15 |     "outDir": "./dist",
16 |     "esModuleInterop": true,
17 |     "moduleResolution": "nodenext",
18 |     "declaration": true,
19 |     "baseUrl": ".",
20 |     "strict": true,
21 |     "skipLibCheck": true,
22 |   },
23 |   "include": [
24 |     "./**/*.ts",
25 |     "./**/*.mts",
26 |   ],
27 |   "exclude": [
28 |     "dist",
29 |     "node_modules",
30 |   ]
31 | }
32 | 
--------------------------------------------------------------------------------
/test/common.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { readFileSync } from 'node:fs'
9 | import * as path from 'node:path'
10 | import { fileURLToPath } from 'url'
11 | import { IBytePairEncodingOptions } from '../mod.mjs'
12 | 
13 | export interface TestCase<G, E> {
14 |   label: string
15 |   given: G
16 |   expected: E
17 |   options?: Partial<IBytePairEncodingOptions>
18 | }
19 | 
20 | const __dirname = path.dirname(fileURLToPath(import.meta.url))
21 | const fixturesPath = path.join(__dirname, 'fixtures')
22 | 
23 | export function readFixture(fileName: string): string {
24 |   return readFileSync(path.join(fixturesPath, fileName), 'utf8').trim()
25 | }
26 | 
--------------------------------------------------------------------------------
/examples/gpt-token-encoding.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { gptDecoder, gptEncoder } from '../mod.mjs'
9 | 
10 | const str = 'This is an example sentence to try encoding out on!'
11 | console.log('Encoding...', str)
12 | const encoded = gptEncoder.encode(str)
13 | console.log('Encoded this string looks like: ')
14 | 
15 | for (const token of encoded) {
16 |   console.log(token)
17 | }
18 | 
19 | console.log('We can look at each token and what it represents')
20 | for (const token of encoded) {
21 |   console.log({ token, string: gptDecoder.decode([token]) })
22 | }
23 | 
24 | const decoded = gptDecoder.decode(encoded)
25 | console.log('We can decode it back into:\n', decoded)
26 | 
--------------------------------------------------------------------------------
/models/curie.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 | 
10 | export const CurieModelFamily: ModelFamily = {
11 |   familyID: ModelFamilyIDs.Curie,
12 |   tokenLimit: 2049,
13 |   mergeSpaces: 0,
14 |   modelIDs: [
15 |     'curie-instruct-beta',
16 |     'curie-search-document',
17 |     'curie-search-query',
18 |     'curie-similarity',
19 |     'curie:2020-05-03',
20 |     'curie',
21 |     'if-curie-v2',
22 |     'text-curie-001',
23 |     'text-curie:001',
24 |     'text-search-curie-doc-001',
25 |     'text-search-curie-query-001',
26 |     'text-similarity-curie-001',
27 |   ],
28 |   pricing: {
29 |     usage: 0.002,
30 |     prompt: 0.002,
31 |     completion: 0.002,
32 |     fineTunedTraining: 0.003,
33 |     fineTunedUsage: 0.012,
34 |   },
35 | }
36 | 
--------------------------------------------------------------------------------
/tokenizer/data.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import type { IBytePairEncodingOptions } from '../BytePairEncoding.mjs'
9 | import { DEFAULT_ENCODER_KEYS, DEFAULT_ENCODER_VALUES } from './encoder.mjs'
10 | import { parseBundledEncoder, parseBundledVocab } from './parsers.mjs'
11 | import { DEFAULT_VOCAB } from './vocab.mjs'
12 | 
13 | /**
14 |  * @internal
15 |  */
16 | export function createDefaultBPEOptions(): Readonly<IBytePairEncodingOptions> {
17 |   return {
18 |     tokenEncodings: parseBundledEncoder(DEFAULT_ENCODER_KEYS, DEFAULT_ENCODER_VALUES),
19 |     vocab: parseBundledVocab(DEFAULT_VOCAB),
20 |   }
21 | }
22 | 
23 | /**
24 |  * Default options for byte pair encoding.
25 |  *
26 |  * Note that referencing this object will incur a filesize penalty when bundling.
27 |  */
28 | export const DEFAULT_BPE_OPTIONS = createDefaultBPEOptions()
29 | 
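These options feed directly into `BytePairEncoding`. A minimal sketch of building a standalone encoder from the bundled defaults while overriding `mergeSpaces` (mirroring how the test suite constructs encoders; the input string is illustrative):

```ts
import { BytePairEncoder, BytePairEncoding, createDefaultBPEOptions } from 'gpt-token-utils'

// Build a fresh encoding rather than sharing the bundled singleton,
// switching whitespace handling to the Codex-style merged spaces.
const encoding = new BytePairEncoding({ ...createDefaultBPEOptions(), mergeSpaces: 'codex' })
const encoder = new BytePairEncoder(encoding)

// `encode` returns an EncoderResult whose `tokens` array holds the token IDs.
const result = encoder.encode('    deeply indented code')
console.log(result.tokens.length)
```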
--------------------------------------------------------------------------------
/models/ada.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 | 
10 | export const AdaModelFamily: ModelFamily = {
11 |   familyID: ModelFamilyIDs.Ada,
12 |   tokenLimit: 2049,
13 |   mergeSpaces: 0,
14 |   modelIDs: [
15 |     'ada-code-search-code',
16 |     'ada-code-search-text',
17 |     'ada-search-document',
18 |     'ada-search-query',
19 |     'ada-similarity',
20 |     'ada:2020-05-03',
21 |     'ada',
22 |     'code-search-ada-code-001',
23 |     'code-search-ada-text-001',
24 |     'text-ada-001',
25 |     'text-ada:001',
26 |     'text-embedding-ada-002',
27 |     'text-search-ada-doc-001',
28 |     'text-search-ada-query-001',
29 |     'text-similarity-ada-001',
30 |   ],
31 |   pricing: {
32 |     usage: 0.0004,
33 |     prompt: 0.0004,
34 |     completion: 0.0004,
35 |     fineTunedTraining: 0.0004,
36 |     fineTunedUsage: 0.0016,
37 |   },
38 | }
39 | 
--------------------------------------------------------------------------------
/models/gpt-4.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 | 
10 | export const GPT4_8KModelFamily: ModelFamily = {
11 |   familyID: ModelFamilyIDs.GPT4,
12 |   tokenLimit: 8192,
13 |   mergeSpaces: 0,
14 |   modelIDs: ['gpt-4', 'gpt-4-0314'],
15 |   preferredModelID: 'gpt-4',
16 |   pricing: {
17 |     prompt: 0.03,
18 |     completion: 0.06,
19 |     usage: null,
20 |     fineTunedTraining: null,
21 |     fineTunedUsage: null,
22 |   },
23 | }
24 | 
25 | export const GPT4_32KModelFamily: ModelFamily = {
26 |   familyID: ModelFamilyIDs.GPT4_32K,
27 |   tokenLimit: 32768,
28 |   mergeSpaces: 0,
29 |   modelIDs: ['gpt-4-32k', 'gpt-4-32k-0314'],
30 |   preferredModelID: 'gpt-4-32k',
31 |   pricing: {
32 |     prompt: 0.06,
33 |     completion: 0.12,
34 |     usage: null,
35 |     fineTunedTraining: null,
36 |     fineTunedUsage: null,
37 |   },
38 | }
39 | 
--------------------------------------------------------------------------------
/models/babbage.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 | 
10 | export const BabbageModelFamily: ModelFamily = {
11 |   familyID: ModelFamilyIDs.Babbage,
12 |   tokenLimit: 2049,
13 |   mergeSpaces: 0,
14 |   modelIDs: [
15 |     'babbage-code-search-code',
16 |     'babbage-code-search-text',
17 |     'babbage-search-document',
18 |     'babbage-search-query',
19 |     'babbage-similarity',
20 |     'babbage:2020-05-03',
21 |     'babbage',
22 |     'code-search-babbage-code-001',
23 |     'code-search-babbage-text-001',
24 |     'text-babbage-001',
25 |     'text-babbage:001',
26 |     'text-search-babbage-doc-001',
27 |     'text-search-babbage-query-001',
28 |     'text-similarity-babbage-001',
29 |   ],
30 |   pricing: {
31 |     usage: 0.0005,
32 |     prompt: 0.0005,
33 |     completion: 0.0005,
34 |     fineTunedTraining: 0.0006,
35 |     fineTunedUsage: 0.0024,
36 |   },
37 | }
38 | 
--------------------------------------------------------------------------------
/models/mod.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { AdaModelFamily } from './ada.mjs'
9 | import { BabbageModelFamily } from './babbage.mjs'
10 | import { ChatGPTModelFamily } from './chat-gpt.mjs'
11 | import { ModelFamiliesMap } from './common.mjs'
12 | import { CurieModelFamily } from './curie.mjs'
13 | import { DavinciModelFamily } from './davinci.mjs'
14 | import { GPT4_32KModelFamily, GPT4_8KModelFamily } from './gpt-4.mjs'
15 | 
16 | /**
17 |  * A global store of all model families.
18 |  */
19 | export const ModelFamilyStore = new ModelFamiliesMap()
20 | 
21 | ModelFamilyStore.addFamily(AdaModelFamily)
22 | ModelFamilyStore.addFamily(BabbageModelFamily)
23 | ModelFamilyStore.addFamily(CurieModelFamily)
24 | ModelFamilyStore.addFamily(DavinciModelFamily)
25 | ModelFamilyStore.addFamily(ChatGPTModelFamily)
26 | ModelFamilyStore.addFamily(GPT4_32KModelFamily)
27 | ModelFamilyStore.addFamily(GPT4_8KModelFamily)
28 | 
29 | export * from './common.mjs'
30 | 
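Since `ModelFamilyStore` indexes families both by family ID and by individual model ID, lookups work with either; a short sketch:

```ts
import { ModelFamilyStore } from 'gpt-token-utils/models'

// A concrete model ID and its family ID resolve to the same record.
const byFamilyID = ModelFamilyStore.get('davinci')
const byModelID = ModelFamilyStore.get('text-davinci-003')

console.log(byFamilyID === byModelID) // true
console.log(byFamilyID.tokenLimit) // 2049
```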
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Sister Software
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/models/davinci.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 | 
10 | export const DavinciModelFamily: ModelFamily = {
11 |   familyID: ModelFamilyIDs.Davinci,
12 |   tokenLimit: 2049,
13 |   mergeSpaces: 0,
14 |   modelIDs: [
15 |     'davinci-if:3.0.0',
16 |     'davinci-instruct-beta:2.0.0',
17 |     'davinci-instruct-beta',
18 |     'davinci-search-document',
19 |     'davinci-search-query',
20 |     'davinci-similarity',
21 |     'davinci:2020-05-03',
22 |     'davinci',
23 |     'if-davinci-v2',
24 |     'if-davinci:3.0.0',
25 |     'text-davinci-001',
26 |     'text-davinci-002',
27 |     'text-davinci-003',
28 |     'text-davinci-edit-001',
29 |     'text-davinci-insert-001',
30 |     'text-davinci-insert-002',
31 |     'text-davinci:001',
32 |     'text-search-davinci-doc-001',
33 |     'text-search-davinci-query-001',
34 |     'text-similarity-davinci-001',
35 |   ],
36 |   pricing: {
37 |     usage: 0.02,
38 |     prompt: 0.02,
39 |     completion: 0.02,
40 |     fineTunedTraining: 0.03,
41 |     fineTunedUsage: 0.12,
42 |   },
43 | }
44 | 
--------------------------------------------------------------------------------
/CostEstimator.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { EncoderInput } from './BytePairEncoder.mjs'
9 | import { EncoderResult } from './EncoderResult.mjs'
10 | import { ModelFamily, ModelFamilyStore, ModelPricingTypes } from './models/mod.mjs'
11 | import { encode } from './tokenizer/mod.mjs'
12 | 
13 | export type CostEstimatorInput = string | EncoderResult
14 | 
15 | export type ICostEstimationResult = Record<ModelPricingTypes, number | null>
16 | 
17 | export interface NormalizeInputResult {
18 |   modelFamily: ModelFamily
19 |   encodedResults: EncoderResult[]
20 | }
21 | 
22 | export interface EstimateCostFn {
23 |   (modelOrFamilyID: string, ...inputs: EncoderInput[]): ICostEstimationResult
24 |   (modelFamily: ModelFamily, ...inputs: EncoderInput[]): ICostEstimationResult
25 | }
26 | 
27 | export const estimateCost: EstimateCostFn = (modelInput: string | ModelFamily, ...inputs: EncoderInput[]) => {
28 |   const modelFamily = ModelFamilyStore.get(modelInput)
29 |   const encodedResults = inputs.map((input) => encode(input))
30 |   const tokenCount = encodedResults.reduce((acc, result) => acc + result.tokens.length, 0)
31 | 
32 |   // Remember that pricing is per 1000 tokens
33 |   const pricedUnits = tokenCount / 1000
34 | 
35 |   const result = {} as ICostEstimationResult
36 | 
37 |   for (const [pricingType, pricePer] of Object.entries(modelFamily.pricing)) {
38 |     const price = typeof pricePer === 'number' ? pricePer * pricedUnits : null
39 | 
40 |     result[pricingType as ModelPricingTypes] = price
41 |   }
42 | 
43 |   return result
44 | }
45 | 
--------------------------------------------------------------------------------
/tokenizer/parsers.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { TokenEncodingsRecord } from '../BytePairTokenMap.mjs'
9 | import { BPEVocab, VocabEntry } from '../RanksMap.mjs'
10 | import { BundledEncoderKeys, BundledEncoderValues, BundledVocab } from './common.mjs'
11 | 
12 | /**
13 |  * Parses a bundled vocabulary into a list of bigrams.
14 | * @internal 15 | */ 16 | export function parseBundledVocab(bundledVocab: BundledVocab): BPEVocab { 17 | if (bundledVocab.length % 2 !== 0) { 18 | throw new Error('Invalid bundled vocabulary format: vocab must be an even number of entries') 19 | } 20 | 21 | const entries: VocabEntry[] = [] 22 | 23 | for (let i = 0; i < bundledVocab.length; i += 2) { 24 | const prefix = bundledVocab[i] 25 | const suffix = bundledVocab[i + 1] 26 | 27 | entries.push({ 28 | prefix, 29 | suffix, 30 | }) 31 | } 32 | 33 | return { 34 | version: 'bundled', 35 | entries, 36 | } 37 | } 38 | 39 | /** 40 | * Parses a bundled encoder into a record. 41 | * @internal 42 | */ 43 | export function parseBundledEncoder( 44 | encoderKeys: BundledEncoderKeys, 45 | encoderValues: BundledEncoderValues 46 | ): TokenEncodingsRecord { 47 | if (encoderKeys.length !== encoderValues.length) { 48 | throw new Error('Invalid bundled encoder: keys and values are not the same length') 49 | } 50 | 51 | const tokenEncodings: TokenEncodingsRecord = {} 52 | 53 | for (let i = 0; i < encoderKeys.length; i++) { 54 | const key = encoderKeys[i] 55 | const value = encoderValues[i] 56 | 57 | tokenEncodings[key] = value 58 | } 59 | 60 | return tokenEncodings 61 | } 62 | -------------------------------------------------------------------------------- /patterns.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | /** 9 | * Default contractions used by the tokenizer pattern. 10 | * Note that order matters here, as the pattern will match the first contraction that matches. 11 | */ 12 | export const DEFAULT_CONTRACTIONS = [ 13 | /** @example "John's" */ 14 | `'s`, 15 | /** @example "can't" */ 16 | `'t`, 17 | /** @example "they're" */ 18 | `'re`, 19 | /** @example "I've" */ 20 | `'ve`, 21 | /** @example "I'm" */ 22 | `'m`, 23 | /** @example "they'll" */ 24 | `'ll`, 25 | /** @example "he'd" */ 26 | `'d`, 27 | ] as const satisfies readonly string[] 28 | 29 | /** 30 | * Default tokenizer rules used to build the tokenizer pattern. 31 | */ 32 | export const DEFAULT_TOKENIZER_RULES = [ 33 | /** Matches one or more letters optionally preceded by a space. */ 34 | ' ?\\p{L}+', 35 | /** Matches one or more digits optionally preceded by a space. */ 36 | ' ?\\p{N}+', 37 | /** Matches one or more non-space, non-letter, non-digit characters optionally preceded by a space. */ 38 | ' ?[^\\s\\p{L}\\p{N}]+', 39 | /** Matches one or more spaces that are not followed by a non-space character (i.e. end of word). */ 40 | '\\s+(?!\\S)', 41 | /** Matches one or more spaces. */ 42 | '\\s+', 43 | ] as const satisfies readonly string[] 44 | 45 | /** 46 | * Creates a regular expression pattern used to tokenize text into individual tokens. 47 | * @param contractions - Contractions used by the tokenizer pattern. 48 | * @param rules - Rules used to build the tokenizer pattern. 
49 |  * @see {@linkcode DEFAULT_CONTRACTIONS}
50 |  * @see {@linkcode DEFAULT_TOKENIZER_RULES}
51 |  */
52 | export function createTokenizerPattern(
53 |   contractions: string[] = DEFAULT_CONTRACTIONS.slice(),
54 |   rules: string[] = DEFAULT_TOKENIZER_RULES.slice()
55 | ): RegExp {
56 |   const pattern = [...contractions, ...rules].join('|')
57 |   return new RegExp(pattern, 'gu')
58 | }
59 | 
--------------------------------------------------------------------------------
/tokenizer/gpt.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { BytePairDecoder } from '../BytePairDecoder.mjs'
9 | import { BytePairEncoder } from '../BytePairEncoder.mjs'
10 | import { BytePairEncoding } from '../BytePairEncoding.mjs'
11 | import { DEFAULT_BPE_OPTIONS } from './data.mjs'
12 | 
13 | // We can use a single instance for both encoding and decoding GPT tokens.
14 | const gptEncoding = new BytePairEncoding(DEFAULT_BPE_OPTIONS)
15 | 
16 | /**
17 |  * Default GPT-3 encoder.
18 |  * This is a singleton instance of {@linkcode BytePairEncoder} that is pre-configured to encode GPT-3 tokens.
19 |  */
20 | export const gptEncoder = new BytePairEncoder(gptEncoding)
21 | 
22 | /**
23 |  * Encodes a given UTF-8 string into a list of GPT-3 tokens.
24 |  *
25 |  * ```js
26 |  * const text = "Do androids dream of electric sheep?"
27 |  * const tokens = encode(text)
28 |  * console.log(tokens) // [5211, 290, 305, 2340, 4320, 286, 5186, 15900, 30]
29 |  * ```
30 |  *
31 |  * @see {@linkcode decode} for the inverse function.
32 |  * @see {@linkcode BytePairEncoder} for more information on how the tokens are encoded.
33 |  */
34 | export const encode = gptEncoder.encode
35 | 
36 | /**
37 |  * Default GPT-3 decoder.
38 |  * This is a singleton instance of {@linkcode BytePairDecoder} that is pre-configured to decode GPT-3 tokens.
39 |  */
40 | export const gptDecoder = new BytePairDecoder(gptEncoding)
41 | 
42 | /**
43 |  * Converts a list of GPT-3 tokens into a string.
44 |  *
45 |  * ```ts
46 |  * const tokens = [5211, 290, 305, 2340, 4320, 286, 5186, 15900, 30]
47 |  * const text = decode(tokens)
48 |  * console.log(text) // "Do androids dream of electric sheep?"
49 |  * ```
50 |  *
51 |  * @see {@linkcode encode} for the inverse function.
52 |  * @see {@linkcode BytePairDecoder} for more information on how the tokens are decoded.
53 |  */
54 | export const decode = gptDecoder.decode
55 | 
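The encoders above ultimately split text with the regular expression built in `patterns.mts`. A sketch of how the default contractions and rules segment a sentence (output follows `String.prototype.match` with the global flag):

```ts
import { createTokenizerPattern } from 'gpt-token-utils'

const pattern = createTokenizerPattern()

// Contractions split off their stems; leading spaces stay attached to words.
console.log("I'm testing page 42".match(pattern))
// [ "I", "'m", " testing", " page", " 42" ]
```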
--------------------------------------------------------------------------------
/test/fixtures/multiple-paragraphs.txt:
--------------------------------------------------------------------------------
1 | Adipisicing minim aliquip irure nisi mollit dolore nostrud ea incididunt occaecat.
2 | Non voluptate ea sint eiusmod et pariatur incididunt commodo commodo veniam reprehenderit sunt.
3 | Elit velit ullamco ea id sit elit Lorem irure cupidatat ea dolor. Eu ex eiusmod dolore duis.
4 | Esse dolor amet incididunt cupidatat amet velit nostrud.
5 | 
6 | Ut anim et ea minim nisi fugiat officia exercitation minim aliqua velit nostrud.
7 | Anim cillum ea sunt ad eu laboris.
8 | Incididunt sint eiusmod do consequat eiusmod esse eiusmod sit irure esse anim veniam officia aliquip.
9 | Nulla esse eiusmod est aliqua. Magna ad aute anim qui et irure eu.
10 | 
11 | Incididunt dolore adipisicing excepteur sunt dolor tempor nostrud cupidatat ullamco id Lorem esse.
12 | Non velit ex adipisicing esse proident anim irure exercitation id eiusmod officia proident.
13 | Sint incididunt cillum non reprehenderit nisi pariatur ipsum mollit enim commodo incididunt Lorem fugiat deserunt.
14 | Veniam do pariatur duis magna id sit dolore amet aliqua magna aliqua voluptate exercitation.
15 | Nostrud tempor consectetur ut ad consequat. Ex commodo in sunt dolor irure. Nostrud elit et ipsum minim.
16 | 
17 | Amet officia nostrud amet cillum ea nisi.
18 | In in nulla ullamco amet velit nostrud exercitation do nisi reprehenderit.
19 | Laborum aute dolore in ut aliqua reprehenderit amet nisi qui esse occaecat cupidatat cupidatat labore.
20 | Veniam sunt labore excepteur aliqua aliqua sunt do enim do anim in occaecat consequat aute.
21 | Sit labore irure quis commodo non elit sunt ex ea. Nisi do tempor aliquip aliqua ut veniam culpa dolore.
22 | 
23 | Exercitation sint consequat sint exercitation enim officia non velit.
24 | Cillum ut sint dolore minim aliqua. Id excepteur fugiat magna laborum in dolore laboris cupidatat occaecat aliquip ipsum laboris.
25 | Duis ex ut qui sit id fugiat aute laborum nulla.
26 | Minim laboris consectetur non occaecat dolor commodo sit adipisicing consequat magna.
27 | Incididunt elit amet fugiat ex voluptate.
28 | Nulla occaecat sit sunt voluptate ex id adipisicing excepteur fugiat reprehenderit.
29 | 
--------------------------------------------------------------------------------
/node/parsers.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { TokenEncodingsRecord } from '../BytePairTokenMap.mjs'
9 | import { BPEVocab, VocabEntry } from '../RanksMap.mjs'
10 | 
11 | /**
12 |  * Parses a BPE file into a list of bigrams
13 |  *
14 |  * The vocab.bpe file is a text file that contains a set of byte pair encoding (BPE) codes
15 |  * that are used in the tokenization process.
16 |  *
17 |  * The file should be in the following format:
18 |  *
19 |  * ```text
20 |  * #version: VERSION_STRING
21 |  * [prefix1] [suffix1]
22 |  * [prefixN] [suffixN]
23 |  * ...
24 |  * ```
25 |  */
26 | export function parseBPEFile(bpeFileContents: string): BPEVocab {
27 |   const lines = bpeFileContents.trim().split('\n')
28 |   const [versionLine, ...bpeMerges] = lines
29 |   const [, version = 'unknown'] = versionLine.trim().match(/^#version: (\d.+)$/) || []
30 | 
31 |   const entries = bpeMerges.map((line, lineIndex) => {
32 |     const segments = line
33 |       // Each line contains a pair of tokens separated by a space
34 |       .split(/(\s+)/)
35 |       // Clean up the tokens...
36 |       .map((x) => x.trim())
37 |       .filter(Boolean)
38 | 
39 |     if (segments.length < 2) {
40 |       throw new Error(`Invalid BPE file format: line ${lineIndex + 1} is not a valid bigram`)
41 |     }
42 | 
43 |     const [prefix, suffix] = segments
44 | 
45 |     const entry: VocabEntry = {
46 |       prefix,
47 |       suffix,
48 |     }
49 | 
50 |     return entry
51 |   })
52 | 
53 |   return {
54 |     version,
55 |     entries,
56 |   }
57 | }
58 | 
59 | /**
60 |  * Parse a token encoder file, usually from a file named `encoder.json`
61 |  */
62 | export function parseEncoderFile(
63 |   /**
64 |    * The token encoder content, either as a string or as a parsed object.
65 |    */
66 |   tokenEncoderContent: string | TokenEncodingsRecord
67 | ): TokenEncodingsRecord {
68 |   const tokenEncodings: TokenEncodingsRecord =
69 |     typeof tokenEncoderContent === 'string' ? JSON.parse(tokenEncoderContent) : tokenEncoderContent
70 | 
71 |   return tokenEncodings
72 | }
73 | 
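A sketch of loading local copies of OpenAI's `vocab.bpe` and `encoder.json` assets with these Node-only parsers (the file paths here are assumptions for illustration):

```ts
import { readFileSync } from 'node:fs'
import { parseBPEFile, parseEncoderFile } from 'gpt-token-utils/node'

// Both files are assumed to be local copies of the GPT-2 era tokenizer assets.
const vocab = parseBPEFile(readFileSync('./vocab.bpe', 'utf8'))
const tokenEncodings = parseEncoderFile(readFileSync('./encoder.json', 'utf8'))

console.log(vocab.version) // e.g. "0.2"
console.log(vocab.entries.length, Object.keys(tokenEncodings).length)
```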
--------------------------------------------------------------------------------
/tokenizer/codex.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { BytePairDecoder } from '../BytePairDecoder.mjs'
9 | import { BytePairEncoder } from '../BytePairEncoder.mjs'
10 | import { BytePairEncoding } from '../BytePairEncoding.mjs'
11 | import { DEFAULT_BPE_OPTIONS } from './data.mjs'
12 | 
13 | const codexEncoding = new BytePairEncoding({
14 |   ...DEFAULT_BPE_OPTIONS,
15 |   mergeSpaces: 'codex',
16 | })
17 | 
18 | /**
19 |  * Default Codex encoder.
20 |  * This is a singleton instance of {@linkcode BytePairEncoder} that is pre-configured to encode Codex tokens.
21 |  */
22 | export const codexEncoder = new BytePairEncoder(codexEncoding)
23 | 
24 | /**
25 |  * Encodes a given UTF-8 string into a list of Codex tokens.
26 |  *
27 |  * ```js
28 |  * const codeText = [
29 |  *   'function deeplyNested () {',
30 |  *   '  return {',
31 |  *   '    the: {',
32 |  *   '      quick: {',
33 |  *   '        brown: {',
34 |  *   '...etc'
35 |  * ].join('\n')
36 |  *
37 |  * const codexTokens = encodeCodex(codeText)
38 |  * ```
39 |  *
40 |  * @see {@linkcode decodeCodex} for the inverse function.
41 |  * @see {@linkcode BytePairEncoder} for more information on how the tokens are encoded.
42 |  */
43 | export const encodeCodex = codexEncoder.encode
44 | 
45 | /**
46 |  * Default Codex decoder.
47 |  * This is a singleton instance of {@linkcode BytePairDecoder} that is pre-configured to decode Codex tokens.
48 |  */
49 | export const codexDecoder = new BytePairDecoder(codexEncoding)
50 | 
51 | /**
52 |  * Converts a list of Codex tokens into a string.
53 |  *
54 |  * ```ts
55 |  * // Truncated for brevity...
56 |  * const tokens = [8818, 7744, 45, 7287, 7499]
57 |  * const text = codexDecoder.decode(tokens)
58 |  * console.log(text)
59 |  *
60 |  * // `function deeplyNested () {
61 |  * //   return {
62 |  * //     the: {
63 |  * //       quick: {
64 |  * //         brown: {
65 |  * // ...`
66 |  * ```
67 |  *
68 |  * @see {@linkcode codexEncoder} for the inverse object.
69 |  * @see {@linkcode BytePairDecoder} for more information on how the tokens are decoded.
70 |  */
71 | export const decodeCodex = codexDecoder.decode
72 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # What is this?
2 | 
3 | **GPT Token Utilities** is a small library for encoding and decoding text to and from the tokenized format used by OpenAI's GPT models.
4 | 
5 | ## Why should I use this?
6 | 
7 | ### 🏃‍♀️ Fast
8 | 
9 | Our token encoder/decoder is optimized for a balance of speed and ease of use. No external dependencies are required.
10 | 
11 | ### 🤸‍♀️ Flexible
12 | 
13 | Everything is written in TypeScript and includes type definitions with full documentation. This library is isomorphic and can be used in both Node and the browser!
14 | 
15 | ### ⚖️ Light in size. Heavy in features.
16 | 
17 | GPT Token Utils balances a small footprint with a full-featured API.
18 | It's also tree-shakeable, so you can import only the functions you need.
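For example, the published subpath exports let you pull in a single module (a sketch; actual bundle savings depend on your bundler):

```js
// Importing from a subpath keeps unrelated modules out of the bundle.
import { encode } from 'gpt-token-utils/tokenizer'

const tokens = encode('Only the tokenizer ships with this build.')
```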
19 | 
20 | ## Installation
21 | 
22 | ### NPM
23 | 
24 | ```bash
25 | yarn add gpt-token-utils
26 | # or
27 | npm install --save gpt-token-utils
28 | ```
29 | 
30 | ## Usage
31 | 
32 | ### Encoding and Decoding Text
33 | 
34 | The `encode` and `decode` exports are the main functions you'll use to work with GPT tokens.
35 | 
36 | ```js
37 | import { encode, decode } from 'gpt-token-utils'
38 | 
39 | // Encode a string to a list of tokens...
40 | const tokens = encode('Humans are strange creatures, and ever so curious too!')
41 | 
42 | // You've got a list of tokens!
43 | console.log(tokens) // [32661, 504, 389, 6283, 8109, 11, 290, 1683, 523, 11040, 1165, 0]
44 | 
45 | // How many tokens are there?
46 | console.log(tokens.length) // 12
47 | 
48 | // Can we decode it back to text?
49 | console.log(decode(tokens)) // "Humans are strange creatures...."
50 | ```
51 | 
52 | ### Advanced Usage
53 | 
54 | By default, GPT Token Utils includes a sizable vocabulary and encoder. Alternatively, you can pass in your own to customize the encoding/decoding process.
55 | 
56 | ```js
57 | import {BytePairEncoder} from 'gpt-token-utils/BytePairEncoder'
58 | 
59 | const tokenEncoder = new BytePairEncoder({...})
60 | ```
61 | 
62 | ```js
63 | import {BytePairDecoder} from 'gpt-token-utils/BytePairDecoder'
64 | 
65 | const tokenDecoder = new BytePairDecoder({...})
66 | ```
67 | 
68 | # License
69 | 
70 | GPT Token Utils is licensed under the [MIT License](https://opensource.org/licenses/MIT). If you've got something cool to share that's built with this library, let us know at [@SisterSoftware](https://twitter.com/SisterSoftware)! We would love to see it!
71 | 
--------------------------------------------------------------------------------
/RanksMap.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | const nodeInspectSymbol = Symbol.for('nodejs.util.inspect.custom')
9 | 
10 | /**
11 |  * Map of byte-pair encodings according to their BPE rank
12 |  * @internal
13 |  */
14 | export class RanksMap {
15 |   protected _prefixToSuffixRankMap: Map<
16 |     /** Prefix */
17 |     string,
18 |     Map<
19 |       /** Suffix */
20 |       string,
21 |       /** Rank */
22 |       number
23 |     >
24 |   > = new Map()
25 | 
26 |   public getRank(prefix: string, suffix: string): number | undefined {
27 |     const suffixMap = this._prefixToSuffixRankMap.get(prefix)
28 | 
29 |     if (suffixMap) {
30 |       return suffixMap.get(suffix)
31 |     }
32 |   }
33 | 
34 |   constructor(vocab: VocabEntry[] | BPEVocab, mergesSpacesCount = 0) {
35 |     const normalizedVocab = Array.isArray(vocab) ? 
vocab.slice() : vocab.entries.slice() 36 | 37 | if (mergesSpacesCount > 0) { 38 | for (let i = 1; i < mergesSpacesCount; i++) { 39 | for (let j = 1; j < mergesSpacesCount; j++) { 40 | if (i + j <= mergesSpacesCount) { 41 | normalizedVocab.push({ 42 | prefix: '\u0120'.repeat(i), 43 | suffix: '\u0120'.repeat(j), 44 | }) 45 | } 46 | } 47 | } 48 | } 49 | 50 | for (const [rank, entry] of normalizedVocab.entries()) { 51 | let suffixMap = this._prefixToSuffixRankMap.get(entry.prefix) 52 | 53 | if (!suffixMap) { 54 | suffixMap = new Map() 55 | this._prefixToSuffixRankMap.set(entry.prefix, suffixMap) 56 | } 57 | 58 | suffixMap.set(entry.suffix, rank) 59 | } 60 | } 61 | 62 | public get size() { 63 | return this._prefixToSuffixRankMap.size 64 | } 65 | 66 | [nodeInspectSymbol]() { 67 | return `RanksMap(${this.size})` 68 | } 69 | } 70 | 71 | /** 72 | * A parsed vocabulary entry. 73 | * The rank of the byte-pair encoding is derived from the index of the pair in the `vocab.bpe` file. 74 | */ 75 | export interface VocabEntry { 76 | /** The word stem prefix in the pair. */ 77 | prefix: string 78 | /** The suffix token in the pair. */ 79 | suffix: string 80 | } 81 | 82 | /** 83 | * A vocabulary of byte-pair encodings. 84 | * 85 | * @see {@linkcode parseBPEFile} 86 | */ 87 | export interface BPEVocab { 88 | version: string 89 | entries: VocabEntry[] 90 | } 91 | -------------------------------------------------------------------------------- /BytePairDecoder.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | import { BytePairEncoding } from './BytePairEncoding.mjs' 9 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 10 | import type { BytePairEncoder } from './BytePairEncoder.mjs' 11 | import type { EncoderResult } from './EncoderResult.mjs' 12 | 13 | /** 14 | * Methods associated with decoding a list of tokens into a string. 15 | */ 16 | export interface TokenDecodeFn { 17 | ( 18 | /** 19 | * The list of tokens to decode. 20 | */ 21 | tokens: number[] 22 | ): string 23 | 24 | ( 25 | /** 26 | * The resulting object of the {@linkcode BytePairEncoder.encode} function. 27 | */ 28 | encoderResult: EncoderResult 29 | ): string 30 | } 31 | 32 | /** 33 | * GPT Token Decoder. 34 | * 35 | * Generally, you should not need to use this class directly unless you are 36 | * implementing a custom token decoder. 37 | * 38 | * @see {@linkcode BytePairEncoder} for the encoder. 39 | * 40 | * ```ts 41 | * const decoder = new BytePairDecoder({codePointByteMap, bpeTokenMap}) 42 | * const text = decoder.decode(tokens) 43 | * ``` 44 | */ 45 | export class BytePairDecoder { 46 | constructor(protected _bpe: BytePairEncoding, protected _textDecoder = new TextDecoder()) {} 47 | 48 | /** 49 | * Converts a list of tokens into a string. 50 | * 51 | * ```ts 52 | * const tokens = [5211, 290, 305, 2340, 4320, 286, 5186, 15900, 30] 53 | * const text = decoder.decode(tokens) 54 | * console.log(text) // "Do androids dream of electric sheep?" 55 | * ``` 56 | * 57 | * @returns The decoded string. 58 | */ 59 | public decode: TokenDecodeFn = (tokens: number[] | EncoderResult): string => { 60 | const source = Array.isArray(tokens) ? tokens : tokens.tokens 61 | 62 | const bytePairEncodings = source 63 | // First, we convert the tokens into BPE... 
64 |       .map((token) => this._bpe.tokenMap.tokenToBytePair(token))
65 |       // The pairs are then combined into a single string, joining the graphemes.
66 |       .join('')
67 | 
68 |     // We then convert the BPE back into UTF-8 by splitting the string...
69 |     // ...into an array of characters, so that each character can be mapped to its byte.
70 |     const bytes = Array.from(bytePairEncodings, (x) => this._bpe.codePointByteMap.codePointToByte(x))
71 | 
72 |     // Finally, we convert the bytes into a string.
73 |     const text = this._textDecoder.decode(new Uint8Array(bytes))
74 | 
75 |     return text
76 |   }
77 | }
78 | 
--------------------------------------------------------------------------------
/BytePairTokenMap.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | /**
9 |  * A map of byte-pair encodings to their corresponding tokens.
10 |  * @internal
11 |  */
12 | export type TokenEncodingsRecord = Record<string, number | undefined>
13 | 
14 | const nodeInspectSymbol = Symbol.for('nodejs.util.inspect.custom')
15 | 
16 | /**
17 |  * Two-way map between Unicode byte-pairs and tokens.
18 |  * @internal
19 |  */
20 | export class BytePairTokenMap {
21 |   protected _bpeTokenMap: Map<
22 |     /**
23 |      * Byte paired character(s), e.g. `'!'`, `'\u00a8'`
24 |      */
25 |     string,
26 |     /**
27 |      * The corresponding token, e.g. `0`, `101`
28 |      */
29 |     number
30 |   >
31 |   protected _tokenBPEMap: Map<
32 |     /**
33 |      * The corresponding token, e.g. `0`, `101`
34 |      */
35 |     number,
36 |     /**
37 |      * Byte paired character(s), e.g. `'!'`, `'\u00a8'`
38 |      */
39 |     string
40 |   >
41 | 
42 |   constructor(tokenEncodings: TokenEncodingsRecord, nMergedSpaces = 0) {
43 |     this._bpeTokenMap = new Map()
44 |     this._tokenBPEMap = new Map()
45 | 
46 |     for (const [key, value] of Object.entries(tokenEncodings)) {
47 |       this.addBytePair(key, value!)
48 |     }
49 | 
50 |     // add merged spaces for codex tokenizer
51 |     const normalizeVocabLength = this._bpeTokenMap.size + nMergedSpaces
52 | 
53 |     for (let i = 0; i < nMergedSpaces; i++) {
54 |       const key = '\u0120'.repeat(i + 2)
55 |       const value = normalizeVocabLength - nMergedSpaces + i
56 | 
57 |       this.addBytePair(key, value)
58 |     }
59 |   }
60 | 
61 |   public addBytePair(bytePair: string, token: number): void {
62 |     this._bpeTokenMap.set(bytePair, token)
63 |     this._tokenBPEMap.set(token, bytePair)
64 |   }
65 | 
66 |   public tokenToBytePair(token: number): string {
67 |     const bytePair = this._tokenBPEMap.get(token)
68 | 
69 |     if (typeof bytePair === 'undefined') {
70 |       throw new Error(`Token "${token}" was not found in the token encoder.`)
71 |     }
72 | 
73 |     return bytePair
74 |   }
75 | 
76 |   public bytePairToToken(bytePair: string): number {
77 |     const token = this._bpeTokenMap.get(bytePair)
78 | 
79 |     if (typeof token === 'undefined') {
80 |       throw new Error(`Byte pair "${bytePair}" was not found in the token encoder.`)
81 |     }
82 | 
83 |     return token
84 |   }
85 | 
86 |   public get size() {
87 |     return this._bpeTokenMap.size
88 |   }
89 | 
90 |   public [nodeInspectSymbol]() {
91 |     return `BytePairTokenMap(${this.size})`
92 |   }
93 | }
94 | 
--------------------------------------------------------------------------------
/BytePairEncoding.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | import { BytePairTokenMap, TokenEncodingsRecord } from './BytePairTokenMap.mjs' 9 | import { CodePointByteMap } from './CodePointByteMap.mjs' 10 | import { createTokenizerPattern } from './patterns.mjs' 11 | import { BPEVocab, RanksMap, VocabEntry } from './RanksMap.mjs' 12 | 13 | export interface IBytePairEncodingOptions { 14 | /** 15 | * The token encoder map. This is typically derived from a `encoder.json` file: 16 | * 17 | * ```ts 18 | * const tokenEncodings = parseEncoderFile(fs.readFileSync('./encoder.json', 'utf-8')) 19 | * ``` 20 | */ 21 | tokenEncodings: TokenEncodingsRecord 22 | 23 | /** 24 | * The BPE ranks map. This is typically derived from a `vocab.bpe` file: 25 | * 26 | * ```ts 27 | * const vocab = parseBPEFile(fs.readFileSync('./vocab.bpe', 'utf-8')) 28 | * ``` 29 | * 30 | * You should only use this option if you are using a custom vocabulary. 31 | * 32 | * @see {@linkcode parseBPEFile} 33 | * 34 | * @default parseBPEFile(DEFAULT_VOCAB) 35 | */ 36 | vocab: VocabEntry[] | BPEVocab 37 | 38 | /** 39 | * The number of spaces to merge into a single token. 40 | * 41 | * Codex models use a different set of encodings that handle whitespace more efficiently. 42 | * @default 'none' 43 | */ 44 | mergeSpaces?: 'none' | 'codex' | number 45 | 46 | /** 47 | * Optional override of the regular expression used to tokenize text. 48 | * @default createTokenizerPattern() 49 | */ 50 | tokenizationPattern?: RegExp 51 | } 52 | 53 | /** 54 | * A base class for the Byte Pair Encoding (BPE) encoder and decoder. 55 | * @internal 56 | */ 57 | export class BytePairEncoding { 58 | public codePointByteMap: CodePointByteMap 59 | public mergesSpacesCount: number 60 | 61 | public tokenMap: BytePairTokenMap 62 | public ranksMap: RanksMap 63 | 64 | public tokenizationPattern: RegExp 65 | 66 | constructor(options: IBytePairEncodingOptions) { 67 | this.tokenizationPattern = options.tokenizationPattern ?? createTokenizerPattern() 68 | this.codePointByteMap = new CodePointByteMap() 69 | 70 | if (typeof options.mergeSpaces === 'string') { 71 | this.mergesSpacesCount = options.mergeSpaces === 'codex' ? 30 : 0 72 | } else { 73 | this.mergesSpacesCount = options.mergeSpaces ?? 0 74 | } 75 | 76 | this.tokenMap = new BytePairTokenMap(options.tokenEncodings, this.mergesSpacesCount) 77 | this.ranksMap = new RanksMap(options.vocab, this.mergesSpacesCount) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /test/CostEstimator.test.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 
6 | */ 7 | 8 | import { expect, test } from 'vitest' 9 | import { 10 | BytePairEncoder, 11 | BytePairEncoding, 12 | createDefaultBPEOptions, 13 | estimateCost, 14 | ICostEstimationResult, 15 | ModelFamilyIDs, 16 | } from '../mod.mjs' 17 | import { readFixture, TestCase } from './common.mjs' 18 | 19 | interface CostEstimatorTestCase extends TestCase { 20 | modelID: string 21 | } 22 | 23 | const testCases: CostEstimatorTestCase[] = [ 24 | { 25 | label: 'Empty string', 26 | modelID: ModelFamilyIDs.Davinci, 27 | given: '', 28 | expected: { 29 | usage: 0, 30 | fineTunedUsage: 0, 31 | fineTunedTraining: 0, 32 | prompt: 0, 33 | completion: 0, 34 | }, 35 | }, 36 | { 37 | label: 'Just a space', 38 | modelID: ModelFamilyIDs.Davinci, 39 | given: ' ', 40 | expected: { 41 | completion: 0.00002, 42 | fineTunedTraining: 0.00003, 43 | fineTunedUsage: 0.00012, 44 | prompt: 0.00002, 45 | usage: 0.00002, 46 | }, 47 | }, 48 | { 49 | label: 'Tab', 50 | modelID: ModelFamilyIDs.Davinci, 51 | given: '\t', 52 | expected: { 53 | completion: 0.00002, 54 | fineTunedTraining: 0.00003, 55 | fineTunedUsage: 0.00012, 56 | prompt: 0.00002, 57 | usage: 0.00002, 58 | }, 59 | }, 60 | { 61 | label: 'Single paragraph', 62 | modelID: ModelFamilyIDs.Davinci, 63 | given: readFixture('single-paragraph.txt'), 64 | expected: { 65 | completion: 0.0031, 66 | fineTunedTraining: 0.00465, 67 | fineTunedUsage: 0.0186, 68 | prompt: 0.0031, 69 | usage: 0.0031, 70 | }, 71 | }, 72 | { 73 | label: 'Multiple paragraphs', 74 | modelID: ModelFamilyIDs.Davinci, 75 | given: readFixture('multiple-paragraphs.txt'), 76 | expected: { 77 | completion: 0.01434, 78 | fineTunedTraining: 0.021509999999999998, 79 | fineTunedUsage: 0.08603999999999999, 80 | prompt: 0.01434, 81 | usage: 0.01434, 82 | }, 83 | }, 84 | // { 85 | // label: 'HTML content', 86 | // modelID: ModelFamilyIDs.GPT4, 87 | // given: readFixture('sample-html.html'), 88 | // expected: { 89 | // completion: 0.005659999999999999, 90 | // fineTunedTraining: 0.00849, 91 | // fineTunedUsage: 0.03396, 92 | // prompt: 0.005659999999999999, 93 | // usage: 0.005659999999999999, 94 | // }, 95 | // }, 96 | ] 97 | 98 | for (const { label, given, modelID, expected, options } of testCases) { 99 | test(label, () => { 100 | const gptEncoding = new BytePairEncoding({ ...createDefaultBPEOptions(), ...options }) 101 | const encoder = new BytePairEncoder(gptEncoding) 102 | 103 | const encoded = encoder.encode(given) 104 | const estimatedCosts = estimateCost(modelID, encoded) 105 | 106 | expect(estimatedCosts).toEqual(expected) 107 | }) 108 | } 109 | -------------------------------------------------------------------------------- /CodePointByteMap.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | const nodeInspectSymbol = Symbol.for('nodejs.util.inspect.custom') 9 | 10 | /** 11 | * Two-way map of byte values to their corresponding Unicode codepoints. 12 | */ 13 | export class CodePointByteMap { 14 | /** 15 | * Maps each byte value to its corresponding Unicode character. 16 | */ 17 | protected _byteToCodePoint: Map 18 | /** 19 | * Maps each Unicode character to its corresponding byte value. 
20 | */ 21 | protected _codePointToByte: Map 22 | 23 | constructor() { 24 | // Contains all the byte values corresponding to printable ASCII characters 25 | const basicBytes = Array.from({ length: 94 }, (_, i) => i + 33) 26 | 27 | // Contains all the byte values corresponding to extended ASCII characters 28 | // that are not already included in `basicBytes` 29 | const extendedBytes = [ 30 | ...Array.from({ length: 12 }, (_, i) => i + 161), 31 | ...Array.from({ length: 82 }, (_, i) => i + 174), 32 | ] 33 | 34 | // Combine `basicBytes` and `extendedBytes` to get a list of all byte values 35 | const allBytes = basicBytes.concat(extendedBytes) 36 | const cs = allBytes.slice() 37 | 38 | let unicodeIndex = 0 39 | 40 | // Then assign unique Unicode characters to the bytes in `allBytes` that are not 41 | // already in `basicBytes` or `extendedBytes`... 42 | // For each possible byte value (0-255)... 43 | for (let byteValue = 0; byteValue < 256; byteValue++) { 44 | // If the byte value is not in `allBytes`, it needs to be added to the dictionary 45 | if (!allBytes.includes(byteValue)) { 46 | // Add the byte value to `allBytes` 47 | allBytes.push(byteValue) 48 | 49 | cs.push(256 + unicodeIndex) 50 | 51 | // Increment `unicodeIndex` so the next new byte value will get a unique Unicode character 52 | unicodeIndex++ 53 | } 54 | } 55 | 56 | this._byteToCodePoint = new Map() 57 | this._codePointToByte = new Map() 58 | 59 | for (let i = 0; i < cs.length; i++) { 60 | const key = allBytes[i] 61 | const value = String.fromCharCode(cs[i]) 62 | 63 | this._byteToCodePoint.set(key, value) 64 | this._codePointToByte.set(value, key) 65 | } 66 | } 67 | 68 | public byteToCodePoint(byte: number): string { 69 | const codePoint = this._byteToCodePoint.get(byte) 70 | 71 | if (typeof codePoint === 'undefined') { 72 | throw new Error(`Byte "${byte}" was not found in the byte map.`) 73 | } 74 | 75 | return codePoint 76 | } 77 | 78 | public codePointToByte(codePoint: string): number { 79 | const byte = this._codePointToByte.get(codePoint) 80 | 81 | if (typeof byte === 'undefined') { 82 | throw new Error(`Unicode character "${codePoint}" was not found in the byte map.`) 83 | } 84 | 85 | return byte 86 | } 87 | 88 | public get size() { 89 | return this._byteToCodePoint.size 90 | } 91 | 92 | public get byteToCodePointMap() { 93 | return this._byteToCodePoint 94 | } 95 | 96 | public get codePointToByteMap() { 97 | return this._codePointToByte 98 | } 99 | 100 | [nodeInspectSymbol]() { 101 | return `CodePointByteMap(${this.size})` 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gpt-token-utils", 3 | "version": "1.2.0", 4 | "description": "Isomorphic utilities for GPT-3 tokenization and prompt building.", 5 | "repository": "git@github.com:sister-software/gpt-token-utils.git", 6 | "bugs": { 7 | "url": "https://github.com/sister-software/gpt-token-utils/issues" 8 | }, 9 | "author": "Teffen Ellis ", 10 | "homepage": "https://github.com/sister-software/gpt-token-utils#readme", 11 | "license": "MIT", 12 | "keywords": [ 13 | "gpt", 14 | "gpt-3", 15 | "gpt3", 16 | "openai", 17 | "Open AI", 18 | "tokenization", 19 | "tokenizer", 20 | "prompt", 21 | "prompt-builder" 22 | ], 23 | "main": "dist/mod.mjs", 24 | "types": "dist/mod.d.mts", 25 | "type": "module", 26 | "files": [ 27 | "dist/**/*" 28 | ], 29 | "exports": { 30 | "./package.json": "./package.json", 31 | ".": { 32 | 
"import": "./dist/mod.mjs", 33 | "types": "./dist/mod.d.mts" 34 | }, 35 | "./mod": { 36 | "import": "./dist/mod.mjs", 37 | "types": "./dist/mod.d.mts" 38 | }, 39 | "./mod.mjs": { 40 | "import": "./dist/mod.mjs", 41 | "types": "./dist/mod.d.mts" 42 | }, 43 | "./BytePairDecoder": { 44 | "import": "./dist/BytePairDecoder/mod.mjs", 45 | "types": "./dist/BytePairDecoder/mod.d.mts" 46 | }, 47 | "./BytePairDecoder.mjs": { 48 | "import": "./dist/BytePairDecoder/mod.mjs", 49 | "types": "./dist/BytePairDecoder/mod.d.mts" 50 | }, 51 | "./BytePairEncoder": { 52 | "import": "./dist/BytePairEncoder/mod.mjs", 53 | "types": "./dist/BytePairEncoder/mod.d.mts" 54 | }, 55 | "./BytePairEncoder.mjs": { 56 | "import": "./dist/BytePairEncoder/mod.mjs", 57 | "types": "./dist/BytePairEncoder/mod.d.mts" 58 | }, 59 | "./tokenizer": { 60 | "import": "./dist/tokenizer/mod.mjs", 61 | "types": "./dist/tokenizer/mod.d.mts" 62 | }, 63 | "./tokenizer.mjs": { 64 | "import": "./dist/tokenizer/mod.mjs", 65 | "types": "./dist/tokenizer/mod.d.mts" 66 | }, 67 | "./models": { 68 | "import": "./dist/models/mod.mjs", 69 | "types": "./dist/models/mod.d.mts" 70 | }, 71 | "./models.mjs": { 72 | "import": "./dist/models/mod.mjs", 73 | "types": "./dist/models/mod.d.mts" 74 | }, 75 | "./node": { 76 | "import": "./dist/node/mod.mjs", 77 | "types": "./dist/node/mod.d.mts" 78 | }, 79 | "./node.mjs": { 80 | "import": "./dist/node/mod.mjs", 81 | "types": "./dist/node/mod.d.mts" 82 | } 83 | }, 84 | "scripts": { 85 | "test": "vitest", 86 | "build": "tsc -p ./tsconfig.json", 87 | "start": "http-server ./ -p 8081", 88 | "cli-tiktoken": "NODE_OPTIONS=\"--loader ts-node/esm --no-warnings\" ts-node ./internal/tiktoken.mts", 89 | "cli-example": "NODE_OPTIONS=\"--loader ts-node/esm --no-warnings\" ts-node ./example.mts" 90 | }, 91 | "devDependencies": { 92 | "@sister.software/eslint-config": "^1.0.0", 93 | "@sister.software/prettier-config": "^1.0.0", 94 | "@sister.software/stylelint-config": "^1.0.0", 95 | "@types/node": "^18.14.4", 96 | "@typescript-eslint/eslint-plugin": "^5.53.0", 97 | "@typescript-eslint/parser": "^5.53.0", 98 | "eslint": "^8.34.0", 99 | "http-server": "^14.1.1", 100 | "prettier": "^2.8.1", 101 | "prettier-plugin-organize-imports": "^3.2.2", 102 | "react": "^18.2.0", 103 | "react-dom": "^18.2.0", 104 | "stylelint": "~14", 105 | "ts-node": "^10.9.1", 106 | "typescript": "4.9.5", 107 | "vitest": "^0.29.2" 108 | }, 109 | "prettier": "@sister.software/prettier-config", 110 | "stylelint": { 111 | "extends": [ 112 | "@sister.software/stylelint-config" 113 | ] 114 | }, 115 | "eslintConfig": { 116 | "extends": [ 117 | "@sister.software/eslint-config" 118 | ], 119 | "ignorePatterns": [ 120 | "dist", 121 | "results", 122 | "node_modules", 123 | "test/fixtures" 124 | ] 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /models/common.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | /** 9 | * The IDs of available model families. 10 | */ 11 | export const ModelFamilyIDs = { 12 | Ada: 'ada', 13 | Babbage: 'babbage', 14 | Curie: 'curie', 15 | Davinci: 'davinci', 16 | ChatGPT: 'chat-gpt', 17 | GPT4: 'gpt-4', 18 | GPT4_32K: 'gpt-4-32k', 19 | } as const 20 | 21 | /** 22 | * A model family is a group of models that share a common lineage or training data. 
23 | */ 24 | export interface ModelFamily { 25 | familyID: string 26 | /** 27 | * The number of tokens that can be used with this model in a single request. 28 | */ 29 | tokenLimit: number 30 | /** 31 | * The number of spaces to merge into a single token. 32 | * 33 | * Codex models use a different set of encodings that handle whitespace more efficiently. 34 | */ 35 | mergeSpaces: number 36 | pricing: ModelPricing 37 | /** 38 | * The IDs of available models, matching the IDs used in the OpenAI API. 39 | */ 40 | modelIDs: string[] 41 | 42 | /** 43 | * The ID of the preferred model in this family. 44 | */ 45 | preferredModelID?: string 46 | } 47 | 48 | export type ModelPricingTypes = 'usage' | 'fineTunedUsage' | 'fineTunedTraining' | 'prompt' | 'completion' 49 | /** 50 | * The pricing of a model in US dollars. 51 | * @see https://openai.com/pricing 52 | */ 53 | export interface ModelPricing { 54 | /** 55 | * The price of model usage per 1000 tokens. 56 | */ 57 | usage: number | null 58 | /** 59 | * The price of fine-tuned model usage per 1000 tokens. 60 | */ 61 | fineTunedUsage: number | null 62 | /** 63 | * The price of fine-tuned model training per 1000 tokens. 64 | */ 65 | fineTunedTraining: number | null 66 | /** 67 | * The price of usage for the prompt endpoint per 1000 tokens. 68 | */ 69 | prompt: number | null 70 | /** 71 | * The price of usage for the completion endpoint per 1000 tokens. 72 | */ 73 | completion: number | null 74 | } 75 | 76 | export interface GetModelFamilyFn { 77 | ( 78 | /** 79 | * The ID of a model within a family, e.g. `"text-davinci-003"` 80 | * @returns The family that the model belongs to. 81 | */ 82 | modelID: string 83 | ): ModelFamily 84 | ( 85 | /** 86 | * The ID of a model family, e.g. `"davinci"` 87 | * @returns The family associated with the ID. 88 | */ 89 | familyID: string 90 | ): ModelFamily 91 | ( 92 | /** 93 | * A model family. This is useful when you already have a model family object. 94 | * @returns The same family object that was passed in. 95 | */ 96 | modelFamily: ModelFamily 97 | ): ModelFamily 98 | 99 | (input: string | ModelFamily): ModelFamily 100 | } 101 | 102 | export class ModelFamiliesMap { 103 | protected _familyMap = new Map<string, ModelFamily>() 104 | protected _modelToFamilyMap = new Map<string, ModelFamily>() 105 | 106 | public addFamily(family: ModelFamily): void { 107 | this._familyMap.set(family.familyID, family) 108 | for (const modelID of family.modelIDs) { 109 | this._modelToFamilyMap.set(modelID, family) 110 | } 111 | } 112 | 113 | public getFamilyByFamilyID(familyID: string): ModelFamily | undefined { 114 | return this._familyMap.get(familyID) 115 | } 116 | 117 | public getFamilyByModelID(modelID: string): ModelFamily | undefined { 118 | return this._modelToFamilyMap.get(modelID) 119 | } 120 | 121 | public get: GetModelFamilyFn = (input) => { 122 | if (typeof input === 'string') { 123 | const family = this.getFamilyByFamilyID(input) || this.getFamilyByModelID(input) 124 | 125 | if (!family) { 126 | throw new Error(`No model ID or family found with ID: ${input}`) 127 | } 128 | 129 | return family 130 | } 131 | 132 | return input 133 | } 134 | 135 | public isModelInFamily(modelID: string, familyID: string): boolean { 136 | const family = this.getFamilyByFamilyID(familyID) 137 | 138 | return family?.modelIDs.includes(modelID) ??
false 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /test/Tokenizer.test.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | import { expect, test } from 'vitest' 9 | import { BytePairDecoder, BytePairEncoder, BytePairEncoding } from '../mod.mjs' 10 | import { createDefaultBPEOptions } from '../tokenizer/mod.mjs' 11 | import { readFixture, TestCase } from './common.mjs' 12 | 13 | type TestCases = TestCase[] 14 | 15 | const testCases: TestCases = [ 16 | { 17 | label: 'Empty string', 18 | given: '', 19 | expected: [], 20 | }, 21 | { 22 | label: 'Just a space', 23 | given: ' ', 24 | expected: [220], 25 | }, 26 | { 27 | label: 'Tab', 28 | given: '\t', 29 | expected: [197], 30 | }, 31 | { 32 | label: 'Simple text', 33 | given: 'This is some text', 34 | expected: [1212, 318, 617, 2420], 35 | }, 36 | { 37 | label: 'Text with special characters', 38 | given: `This is some text with a few special characters: !@#$%^&*()_+-=~[]{}|;:'",./<>?`, 39 | expected: [ 40 | 1212, 318, 617, 2420, 351, 257, 1178, 2041, 3435, 25, 5145, 31, 29953, 4, 61, 5, 9, 3419, 62, 10, 12, 31820, 41 | 21737, 90, 92, 91, 26, 32105, 1600, 19571, 27, 29, 30, 42 | ], 43 | }, 44 | { 45 | label: 'Text with numbers', 46 | given: 'This is some text with numbers 1234567890', 47 | expected: [1212, 318, 617, 2420, 351, 3146, 17031, 2231, 30924, 3829], 48 | }, 49 | 50 | { 51 | label: 'Non-European text', 52 | given: '你好世界', 53 | expected: [19526, 254, 25001, 121, 10310, 244, 45911, 234], 54 | }, 55 | { 56 | label: 'Bubble text', 57 | given: 'Ⓗⓔⓛⓛⓞ Ⓑⓤⓑⓑⓛⓔ', 58 | expected: [ 59 | 158, 240, 121, 158, 241, 242, 158, 241, 249, 158, 241, 249, 158, 241, 252, 2343, 240, 115, 158, 241, 97, 158, 241, 60 | 239, 158, 241, 239, 158, 241, 249, 158, 241, 242, 61 | ], 62 | }, 63 | { 64 | label: 'Multi-token word', 65 | given: 'indivisible', 66 | expected: [521, 452, 12843], 67 | }, 68 | { 69 | label: 'Emojis', 70 | given: 'hello 👋 world 🌍', 71 | expected: [31373, 50169, 233, 995, 12520, 234, 235], 72 | }, 73 | // We include a few properties of Object here to test that the tokenizer 74 | // doesn't include inherited properties. 75 | { 76 | label: 'properties of Object', 77 | given: 'toString constructor hasOwnProperty valueOf', 78 | expected: [1462, 10100, 23772, 468, 23858, 21746, 1988, 5189], 79 | }, 80 | 81 | // Codex models use additional tokens for whitespace... 
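// Compare the two cases below: without Codex-style merging, indentation encodes as repeated
// single-space tokens (220), while `mergeSpaces: 'codex'` collapses each run of spaces into a
// single token in the 50258+ range.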
82 | { 83 | label: 'Without Codex', 84 | given: readFixture('nested-javascript.js'), 85 | expected: [ 86 | 8818, 7744, 45, 7287, 7499, 1391, 198, 220, 1441, 1391, 198, 220, 220, 220, 262, 25, 1391, 198, 220, 220, 220, 87 | 220, 220, 2068, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 7586, 25, 1391, 198, 220, 220, 220, 220, 220, 88 | 220, 220, 220, 220, 21831, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 18045, 25, 1391, 89 | 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 625, 25, 1391, 198, 220, 220, 220, 220, 220, 90 | 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 262, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 91 | 220, 220, 220, 220, 220, 220, 220, 220, 16931, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 92 | 220, 220, 220, 220, 220, 220, 220, 220, 220, 3290, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 93 | 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 94 | 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 95 | 220, 220, 1782, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 96 | 220, 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 97 | 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 1782, 198, 220, 1782, 98 | 198, 92, 99 | ], 100 | }, 101 | // Codex models use additional tokens for whitespace... 102 | { 103 | label: 'With Codex', 104 | given: readFixture('nested-javascript.js'), 105 | expected: [ 106 | 8818, 7744, 45, 7287, 7499, 1391, 198, 220, 1441, 1391, 198, 50258, 262, 25, 1391, 198, 50260, 2068, 25, 1391, 107 | 198, 50262, 7586, 25, 1391, 198, 50264, 21831, 25, 1391, 198, 50266, 18045, 25, 1391, 198, 50268, 625, 25, 1391, 108 | 198, 50270, 262, 25, 1391, 198, 50272, 16931, 25, 1391, 198, 50274, 3290, 25, 1391, 198, 50274, 1782, 198, 50272, 109 | 1782, 198, 50270, 1782, 198, 50268, 1782, 198, 50266, 1782, 198, 50264, 1782, 198, 50262, 1782, 198, 50260, 1782, 110 | 198, 50258, 1782, 198, 220, 1782, 198, 92, 111 | ], 112 | options: { 113 | mergeSpaces: 'codex', 114 | }, 115 | }, 116 | ] 117 | 118 | for (const { label, given, expected, options } of testCases) { 119 | test(label, () => { 120 | const gptEncoding = new BytePairEncoding({ ...createDefaultBPEOptions(), ...options }) 121 | const encoder = new BytePairEncoder(gptEncoding) 122 | const decoder = new BytePairDecoder(gptEncoding) 123 | 124 | const encoded = encoder.encode(given).tokens 125 | const decoded = decoder.decode(encoded) 126 | 127 | expect(encoded).toEqual(expected) 128 | expect(decoded).toEqual(given) 129 | }) 130 | } 131 | -------------------------------------------------------------------------------- /EncoderResult.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 
6 | */ 7 | 8 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 9 | import type { BytePairEncoder } from './BytePairEncoder.mjs' 10 | 11 | const nodeInspectSymbol = Symbol.for('nodejs.util.inspect.custom') 12 | const supportsSegmenter = typeof Intl !== 'undefined' && typeof Intl.Segmenter !== 'undefined' 13 | 14 | export interface IEncoderResult { 15 | /** 16 | * The tokens that were encoded. 17 | */ 18 | readonly tokens: number[] 19 | /** 20 | * The BPE token pairs that were used during encoding. 21 | */ 22 | readonly bpeTokenPairs: string[] 23 | 24 | /** 25 | * The original text content that was encoded. 26 | */ 27 | readonly originalInput: string 28 | 29 | /** 30 | * The matched text segments found during encoding. 31 | */ 32 | readonly matchedTextSegments: string[] 33 | } 34 | 35 | /** 36 | * The `EncoderResult` includes information for post-encoding analysis such as... 37 | * 38 | * - The tokens that were encoded. 39 | * - The BPE token pairs that were used during encoding. 40 | * - Two-way maps of tokens to BPE token pairs. 41 | * 42 | * This information can be used to analyze the encoding process and to 43 | * reconstruct the original string from the encoded tokens. 44 | * 45 | * Note that this object is considered immutable. Consider encoding a new string 46 | * if you need an updated `EncoderResult`. 47 | * 48 | * @see {@linkcode BytePairEncoder} 49 | */ 50 | export class EncoderResult implements IEncoderResult { 51 | /** 52 | * A map of tokens to their corresponding BPE token pairs. 53 | */ 54 | public tokenBPEMap: ReadonlyMap<number, string> 55 | /** 56 | * A map of BPE token pairs to their corresponding tokens. 57 | */ 58 | public bpeTokenMap: ReadonlyMap<string, number> 59 | 60 | /** 61 | * A map of BPE token pairs to the number of times they were used during encoding. 62 | * The key is the BPE token pair and the value is the number of times it appeared. 63 | */ 64 | public bpeCountsMap: ReadonlyMap<string, number> 65 | 66 | /** 67 | * A map of tokens to the number of times they were used during encoding. 68 | * The key is the token and the value is the number of times it appeared.
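 *
 * A small usage sketch (`result` is an `EncoderResult`; the token value is illustrative):
 *
 * ```ts
 * result.tokenCountsMap.get(5211) // 1 if token 5211 appeared once, undefined if absent
 * ```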
69 | */ 70 | public tokenCountsMap: ReadonlyMap<number, number> 71 | 72 | public readonly tokens: number[] 73 | public readonly bpeTokenPairs: string[] 74 | public readonly originalInput: string 75 | public readonly matchedTextSegments: string[] 76 | 77 | public segmenter: Intl.Segmenter | undefined 78 | 79 | constructor({ tokens, bpeTokenPairs, originalInput, matchedTextSegments }: IEncoderResult, locale?: string) { 80 | if (bpeTokenPairs.length !== tokens.length) { 81 | throw new Error('The number of BPE token pairs must match the number of tokens.') 82 | } 83 | 84 | const tokenToBPE: Array<[number, string]> = [] 85 | const BPEToToken: Array<[string, number]> = [] 86 | 87 | const tokenCountsMap = new Map<number, number>() 88 | const bpeCountsMap = new Map<string, number>() 89 | 90 | for (let i = 0; i < bpeTokenPairs.length; i++) { 91 | const token = tokens[i] 92 | const bpeTokenPair = bpeTokenPairs[i] 93 | 94 | const tokenCount = tokenCountsMap.get(token) || 0 95 | const bpeCount = bpeCountsMap.get(bpeTokenPair) || 0 96 | 97 | tokenCountsMap.set(token, tokenCount + 1) 98 | bpeCountsMap.set(bpeTokenPair, bpeCount + 1) 99 | 100 | tokenToBPE.push([token, bpeTokenPair]) 101 | BPEToToken.push([bpeTokenPair, token]) 102 | } 103 | 104 | this.tokenBPEMap = new Map(tokenToBPE) 105 | this.bpeTokenMap = new Map(BPEToToken) 106 | 107 | this.tokenCountsMap = tokenCountsMap 108 | this.bpeCountsMap = bpeCountsMap 109 | 110 | this.tokens = tokens 111 | this.bpeTokenPairs = bpeTokenPairs 112 | this.originalInput = originalInput 113 | this.matchedTextSegments = matchedTextSegments 114 | 115 | if (supportsSegmenter) { 116 | this.segmenter = new Intl.Segmenter(locale) 117 | } 118 | } 119 | 120 | /** 121 | * Get the encoded byte-pair for a given token. 122 | */ 123 | public getBPE(token: number) { 124 | return this.tokenBPEMap.get(token) 125 | } 126 | 127 | /** 128 | * Get the number of times a given token appeared during encoding. 129 | * @see {@linkcode EncoderResult.length} if you're just trying to count the number of tokens. 130 | */ 131 | public getTokenCount(token: number): number { 132 | return this.tokenCountsMap.get(token) || 0 133 | } 134 | 135 | /** 136 | * Get the number of times a given byte-pair appeared during encoding. 137 | */ 138 | public getBPECount(bpe: string): number { 139 | return this.bpeCountsMap.get(bpe) || 0 140 | } 141 | 142 | /** 143 | * Iterate over the tokens in the result. 144 | */ 145 | public [Symbol.iterator]() { 146 | return this.tokens[Symbol.iterator]() 147 | } 148 | 149 | /** 150 | * The number of tokens in the result. 151 | */ 152 | public get length() { 153 | return this.tokens.length 154 | } 155 | 156 | /** 157 | * The number of characters in the original text. 158 | * 159 | * @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter Intl.Segmenter} 160 | */ 161 | public get characterCount(): number { 162 | if (!this.segmenter) { 163 | console.warn('Intl.Segmenter is not supported.
Falling back to string length.') 164 | return this.originalInput.length 165 | } 166 | 167 | return Array.from(this.segmenter.segment(this.originalInput)).length 168 | } 169 | 170 | public [nodeInspectSymbol]() { 171 | return `EncoderResult(${this.length})` 172 | } 173 | 174 | public toString() { 175 | return this.tokens.toString() 176 | } 177 | 178 | public toJSON(): IEncoderResult { 179 | return { 180 | tokens: this.tokens, 181 | bpeTokenPairs: this.bpeTokenPairs, 182 | originalInput: this.originalInput, 183 | matchedTextSegments: this.matchedTextSegments, 184 | } 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /BytePairEncoder.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 9 | import type { BytePairDecoder } from './BytePairDecoder.mjs' 10 | import { BytePairEncoding } from './BytePairEncoding.mjs' 11 | import { EncoderResult } from './EncoderResult.mjs' 12 | 13 | /** 14 | * A valid input for the encoder. 15 | * @internal 16 | */ 17 | export type EncoderInput = string | EncoderResult 18 | 19 | export interface TokenEncodeFn { 20 | ( 21 | /** 22 | * The string to encode. 23 | */ 24 | text: string 25 | ): EncoderResult 26 | 27 | ( 28 | /** 29 | * The string to encode. 30 | */ 31 | text: string, 32 | /** 33 | * Skip post-encoding processing for a slight performance boost. 34 | */ 35 | skipPostProcessing?: boolean 36 | ): EncoderResult 37 | 38 | ( 39 | /** 40 | * A previous encoder result to use as a starting point. 41 | * This will simply pass back the same result. 42 | * Useful when batch processing a mixed list of strings and encoder results. 43 | */ 44 | encoderResult: EncoderResult 45 | ): EncoderResult 46 | 47 | (input: EncoderInput, skipPostProcessing?: boolean): EncoderResult 48 | } 49 | 50 | /** 51 | * GPT Token Encoder. 52 | * 53 | * Generally, you should not need to use this class directly unless you are 54 | * implementing a custom token encoder. 55 | * 56 | * @see {@linkcode BytePairDecoder} for the decoder. 57 | * 58 | * ```ts 59 | * const encoder = new BytePairEncoder(encoding) // `encoding` is a `BytePairEncoding` 60 | * const result = encoder.encode(text) 61 | * ``` 62 | */ 63 | export class BytePairEncoder { 64 | constructor( 65 | protected _bpe: BytePairEncoding, 66 | protected _textEncoder = new TextEncoder(), 67 | protected _bpeTokenCache = new Map<string, string[]>() 68 | ) {} 69 | //#region Public Methods 70 | 71 | /** 72 | * Encodes a given string into a list of tokens. 73 | * 74 | * ```ts 75 | * const text = "Do androids dream of electric sheep?" 76 | * const result = encoder.encode(text) 77 | * console.log(result.tokens) // [5211, 290, 305, 2340, 4320, 286, 5186, 15900, 30] 78 | * ``` 79 | * 80 | * @returns An {@linkcode EncoderResult} containing the encoded tokens. 81 | */ 82 | public encode: TokenEncodeFn = (input, skipPostProcessing = false): any => { 83 | if (typeof input !== 'string') { 84 | return input 85 | } 86 | 87 | // First, we run the pattern matcher on the text... 88 | const matchedTextSegments = Array.from(input.matchAll(this._bpe.tokenizationPattern), (x) => x[0]) 89 | 90 | // Then we convert each matched text segment into a string of printable code points representing its UTF-8 bytes...
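// (See `CodePointByteMap`: printable ASCII bytes map to themselves, while other bytes such as
// 0x20 receive stand-in code points at 256 and above, keeping every byte representable as a
// printable character for the merge step.)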
91 | const utf8Tokens = matchedTextSegments.map((textSegment) => { 92 | // `TextEncoder` converts the UTF-16 text segment into its UTF-8 byte representation. 93 | const asUTF8 = this._textEncoder.encode(textSegment) 94 | // We then use our byte map to get the Unicode code point for each byte. 95 | const codePoints = Array.from(asUTF8, (byte) => { 96 | const codePoint = this._bpe.codePointByteMap.byteToCodePoint(byte) 97 | 98 | return codePoint 99 | }) 100 | 101 | return codePoints.join('') 102 | }) 103 | 104 | // Then we run the BPE merge algorithm on each code point string to produce BPE token pairs... 105 | const bpeTokenPairs = utf8Tokens.flatMap((token) => this._tokenToBPE(token)) 106 | 107 | const tokens = bpeTokenPairs.map((bpeToken) => { 108 | return this._bpe.tokenMap.bytePairToToken(bpeToken) 109 | }) 110 | 111 | if (skipPostProcessing) { 112 | return tokens 113 | } 114 | 115 | const result = new EncoderResult({ tokens, bpeTokenPairs, originalInput: input, matchedTextSegments }) 116 | 117 | return result 118 | } 119 | 120 | /** 121 | * Merges the pair of characters with the given values in the given word. 122 | * 123 | * @param word - An array of individual characters in the word. 124 | * @param first - The first character in the pair to merge. 125 | * @param second - The second character in the pair to merge. 126 | * 127 | * @returns The word with the pair of characters merged. 128 | */ 129 | public mergePair(word: string[], first: string, second: string) { 130 | const newWord: string[] = [] 131 | let i = 0 132 | 133 | while (i < word.length) { 134 | const j = word.indexOf(first, i) 135 | if (j === -1) { 136 | newWord.push(...word.slice(i)) 137 | break 138 | } 139 | newWord.push(...word.slice(i, j)) 140 | if (word[j + 1] === second) { 141 | newWord.push(first + second) 142 | i = j + 2 143 | } else { 144 | newWord.push(first) 145 | i = j + 1 146 | } 147 | } 148 | 149 | return newWord 150 | } 151 | 152 | /** 153 | * Returns an array of all unique pairs of adjacent characters in the given word. 154 | * 155 | * @param word - An array of individual characters in the word. 156 | * @returns An array of all unique pairs of adjacent characters in the word. 157 | */ 158 | public getPairs(word: string[]) { 159 | const characters = word.slice() 160 | const pairingsFound: Record<string, boolean> = {} 161 | const pairs: string[][] = [] 162 | let previousCharacterIndex = 0 163 | 164 | for (let i = 1; i < characters.length; i++) { 165 | const previousCharacter = characters[previousCharacterIndex] 166 | const character = characters[i] 167 | 168 | previousCharacterIndex = i 169 | 170 | const pair = [previousCharacter, character] 171 | const grapheme = pair.join('') 172 | 173 | if (Object.hasOwn(pairingsFound, grapheme)) { 174 | continue 175 | } 176 | 177 | pairs.push(pair) 178 | pairingsFound[grapheme] = true 179 | } 180 | 181 | return pairs 182 | } 183 | 184 | //#endregion 185 | 186 | //#region Protected Methods 187 | 188 | /** 189 | * Applies byte pair encoding (BPE) to the given token using the encoding's BPE ranks and the internal cache. 190 | * If the token is already in the cache, returns its value from the cache. 191 | * 192 | * @param token - The token to encode using BPE. This is derived from text passed through the `tokenizationPattern` RegExp. 193 | * 194 | * @returns The BPE-encoded token as an array of byte-pair strings. 195 | */ 196 | protected _tokenToBPE(token: string): string[] { 197 | if (this._bpeTokenCache.has(token)) { 198 | return this._bpeTokenCache.get(token)!
199 | } 200 | 201 | // Convert the input token to an array of individual characters 202 | let word = Array.from(token) 203 | 204 | // Get all possible pairs of characters in the token 205 | let pairs = this.getPairs(word) 206 | 207 | // Loop until there are no more pairs to merge 208 | // eslint-disable-next-line no-constant-condition 209 | while (true) { 210 | // If there are no pairs, return the original token 211 | if (!pairs || pairs.length === 0) { 212 | const word = [token] 213 | this._bpeTokenCache.set(token, word) 214 | 215 | return word 216 | } 217 | 218 | // Find the pair with the lowest rank (pairs without a numeric rank are ignored) 219 | const minRankPair = this._findMinRankPair(pairs) 220 | 221 | // If no valid pair is found, exit the loop 222 | if (!minRankPair || minRankPair.length === 0) { 223 | break 224 | } 225 | 226 | // Merge the pair with the lowest rank 227 | const [first, second] = minRankPair 228 | 229 | let newWord: string[] = [] 230 | let i = 0 231 | 232 | while (i < word.length) { 233 | const j = word.indexOf(first, i) 234 | if (j === -1) { 235 | newWord = newWord.concat(word.slice(i)) 236 | break 237 | } 238 | newWord = newWord.concat(word.slice(i, j)) 239 | i = j 240 | 241 | if (word[i] === first && i < word.length - 1 && word[i + 1] === second) { 242 | newWord.push(first + second) 243 | i = i + 2 244 | } else { 245 | newWord.push(word[i]) 246 | i = i + 1 247 | } 248 | } 249 | 250 | // Update the word with the merged pair 251 | word = newWord 252 | 253 | // If the word is reduced to a single element, exit the loop 254 | if (word.length === 1) { 255 | break 256 | } 257 | 258 | // Otherwise, get all possible pairs of characters in the updated word 259 | pairs = this.getPairs(word) 260 | } 261 | 262 | this._bpeTokenCache.set(token, word) 263 | 264 | return word 265 | } 266 | 267 | /** 268 | * Finds the pair with the lowest rank in the given array of pairs. 269 | * Pairs without a numeric rank are skipped; ranks are read from the encoding's `ranksMap`. 270 | * 271 | * @param pairs - An array of pairs of characters. 272 | * @returns The pair with the lowest rank, or null if no valid pair is found. 273 | */ 274 | protected _findMinRankPair(pairs: string[][]): string[] | null { 275 | let minPair: string[] | null = null 276 | let minRank = Infinity 277 | 278 | for (const pair of pairs) { 279 | const rank = this._bpe.ranksMap.getRank(pair[0], pair[1]) 280 | if (typeof rank !== 'number') { 281 | continue 282 | } 283 | 284 | if (rank < minRank) { 285 | minPair = pair 286 | minRank = rank 287 | } 288 | } 289 | 290 | return minPair || null 291 | } 292 | //#endregion 293 | } 294 | --------------------------------------------------------------------------------