├── .gitignore
├── node
├── mod.mts
└── parsers.mts
├── tokenizer
├── mod.mts
├── common.mts
├── data.mts
├── parsers.mts
├── gpt.mts
└── codex.mts
├── test
├── fixtures
│ ├── nested-javascript.js
│ ├── single-paragraph.txt
│ ├── sample-html.html
│ └── multiple-paragraphs.txt
├── common.mts
├── CostEstimator.test.ts
└── Tokenizer.test.ts
├── mod.mts
├── models
├── chat-gpt.mts
├── curie.mts
├── ada.mts
├── gpt-4.mts
├── babbage.mts
├── mod.mts
├── davinci.mts
└── common.mts
├── tsconfig.json
├── examples
└── gpt-token-encoding.mts
├── LICENSE
├── CostEstimator.mts
├── patterns.mts
├── README.md
├── RanksMap.mts
├── BytePairDecoder.mts
├── BytePairTokenMap.mts
├── BytePairEncoding.mts
├── CodePointByteMap.mts
├── package.json
├── EncoderResult.mts
└── BytePairEncoder.mts
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .npmrc
3 | dist
4 |
--------------------------------------------------------------------------------
/node/mod.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | export * from './parsers.mjs'
9 |
--------------------------------------------------------------------------------
/tokenizer/mod.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | export * from './codex.mjs'
9 | export * from './common.mjs'
10 | export * from './data.mjs'
11 | export * from './encoder.mjs'
12 | export * from './gpt.mjs'
13 | export * from './parsers.mjs'
14 | export * from './vocab.mjs'
15 |
--------------------------------------------------------------------------------
/test/fixtures/nested-javascript.js:
--------------------------------------------------------------------------------
1 | function deeplyNested () {
2 | return {
3 | the: {
4 | quick: {
5 | brown: {
6 | fox: {
7 | jumps: {
8 | over: {
9 | the: {
10 | lazy: {
11 | dog: {
12 | }
13 | }
14 | }
15 | }
16 | }
17 | }
18 | }
19 | }
20 | }
21 | }
22 | }
--------------------------------------------------------------------------------
/test/fixtures/single-paragraph.txt:
--------------------------------------------------------------------------------
1 | Lorem elit in ullamco deserunt et tempor pariatur do est cupidatat commodo elit ex.
2 | In proident non irure esse nisi quis ullamco.
3 | Quis est sint veniam exercitation et sint enim.
4 | Occaecat officia dolore occaecat sunt minim deserunt.
5 | In voluptate nostrud enim sint voluptate nulla amet adipisicing.
6 | Et cillum quis officia dolore aliqua sint sit non non irure ea tempor.
7 | Quis duis adipisicing esse nostrud do veniam occaecat.
8 |
--------------------------------------------------------------------------------
/mod.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | export * from './BytePairDecoder.mjs'
9 | export * from './BytePairEncoder.mjs'
10 | export * from './BytePairEncoding.mjs'
11 | export * from './BytePairTokenMap.mjs'
12 | export * from './CodePointByteMap.mjs'
13 | export * from './CostEstimator.mjs'
14 | export * from './EncoderResult.mjs'
15 | export * from './models/mod.mjs'
16 | export * from './patterns.mjs'
17 | export * from './RanksMap.mjs'
18 | export * from './tokenizer/mod.mjs'
19 |
--------------------------------------------------------------------------------
/models/chat-gpt.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 |
10 | export const ChatGPTModelFamily: ModelFamily = {
11 | familyID: ModelFamilyIDs.ChatGPT,
12 | tokenLimit: 4096,
13 | mergeSpaces: 0,
14 | modelIDs: ['gpt-3.5-turbo-0301', 'gpt-3.5-turbo'],
15 | preferredModelID: 'gpt-3.5-turbo',
16 | pricing: {
17 | usage: 0.002,
18 | prompt: null,
19 | completion: null,
20 | fineTunedTraining: null,
21 | fineTunedUsage: null,
22 | },
23 | }
24 |
--------------------------------------------------------------------------------
/test/fixtures/sample-html.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Document
8 |
9 |
10 | Hello world!
11 |
12 |
19 |
20 | Lorem ipsum dolor sit amet consectetur adipi
21 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/tokenizer/common.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | /**
9 | * Serialized vocabulary. Used with the bundled vocabulary.
10 | * @internal
11 | * @ignore
12 | */
13 | export type BundledVocab = readonly string[]
14 |
15 | /**
16 | * Serialized encoder keys. Used with the bundled encoder.
17 | * @internal
18 | * @ignore
19 | */
20 | export type BundledEncoderKeys = readonly string[]
21 |
22 | /**
23 | * Serialized encoder values. Used with the bundled encoder.
24 | * @internal
25 | * @ignore
26 | */
27 | export type BundledEncoderValues = readonly number[]
28 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | // This file is not used in compilation. It is here just for a nice editor experience.
3 |
4 | "compilerOptions": {
5 | "allowJs": true,
6 | "jsx": "react",
7 | "lib": [
8 | "dom",
9 | "dom.iterable",
10 | "esnext"
11 | ],
12 | "sourceMap": false,
13 | "target": "ESNext",
14 | "module": "ES2020",
15 | "outDir": "./dist",
16 | "esModuleInterop": true,
17 | "moduleResolution": "nodenext",
18 | "declaration": true,
19 | "baseUrl": ".",
20 | "strict": true,
21 | "skipLibCheck": true,
22 | },
23 | "include": [
24 | "./**/*.ts",
25 | "./**/*.mts",
26 | ],
27 | "exclude": [
28 | "dist",
29 | "node_modules",
30 | ]
31 | }
32 |
--------------------------------------------------------------------------------
/test/common.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { readFileSync } from 'node:fs'
9 | import * as path from 'node:path'
10 | import { fileURLToPath } from 'node:url'
11 | import { IBytePairEncodingOptions } from '../mod.mjs'
12 |
13 | export interface TestCase<G, E> {
14 | label: string
15 | given: G
16 | expected: E
17 | options?: Partial<IBytePairEncodingOptions>
18 | }
19 |
20 | const __dirname = path.dirname(fileURLToPath(import.meta.url))
21 | const fixturesPath = path.join(__dirname, 'fixtures')
22 |
23 | export function readFixture(fileName: string): string {
24 | return readFileSync(path.join(fixturesPath, fileName), 'utf8').trim()
25 | }
26 |
--------------------------------------------------------------------------------
/examples/gpt-token-encoding.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { gptDecoder, gptEncoder } from '../mod.mjs'
9 |
10 | const str = 'This is an example sentence to try encoding out on!'
11 | console.log('Encoding...', str)
12 | const encoded = gptEncoder.encode(str)
13 | console.log('Encoded this string looks like: ')
14 |
15 | for (const token of encoded) {
16 | console.log(token)
17 | }
18 |
19 | console.log('We can look at each token and what it represents')
20 | for (const token of encoded) {
21 | console.log({ token, string: gptDecoder.decode([token]) })
22 | }
23 |
24 | const decoded = gptDecoder.decode(encoded)
25 | console.log('We can decode it back into:\n', decoded)
26 |
--------------------------------------------------------------------------------
/models/curie.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 |
10 | export const CurieModelFamily: ModelFamily = {
11 | familyID: ModelFamilyIDs.Curie,
12 | tokenLimit: 2049,
13 | mergeSpaces: 0,
14 | modelIDs: [
15 | 'curie-instruct-beta',
16 | 'curie-search-document',
17 | 'curie-search-query',
18 | 'curie-similarity',
19 | 'curie:2020-05-03',
20 | 'curie',
21 | 'if-curie-v2',
22 | 'text-curie-001',
23 | 'text-curie:001',
24 | 'text-search-curie-doc-001',
25 | 'text-search-curie-query-001',
26 | 'text-similarity-curie-001',
27 | ],
28 | pricing: {
29 | usage: 0.002,
30 | prompt: 0.002,
31 | completion: 0.002,
32 | fineTunedTraining: 0.003,
33 | fineTunedUsage: 0.012,
34 | },
35 | }
36 |
--------------------------------------------------------------------------------
/tokenizer/data.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import type { IBytePairEncodingOptions } from '../BytePairEncoding.mjs'
9 | import { DEFAULT_ENCODER_KEYS, DEFAULT_ENCODER_VALUES } from './encoder.mjs'
10 | import { parseBundledEncoder, parseBundledVocab } from './parsers.mjs'
11 | import { DEFAULT_VOCAB } from './vocab.mjs'
12 |
13 | /**
14 | * @internal
15 | */
16 | export function createDefaultBPEOptions(): Readonly<IBytePairEncodingOptions> {
17 | return {
18 | tokenEncodings: parseBundledEncoder(DEFAULT_ENCODER_KEYS, DEFAULT_ENCODER_VALUES),
19 | vocab: parseBundledVocab(DEFAULT_VOCAB),
20 | }
21 | }
22 |
23 | /**
24 | * Default options for byte pair encoding.
25 | *
26 | * Note that referencing to this object will incur a filesize penalty when bundling.
27 | */
28 | export const DEFAULT_BPE_OPTIONS = createDefaultBPEOptions()
29 |
--------------------------------------------------------------------------------
/models/ada.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 |
10 | export const AdaModelFamily: ModelFamily = {
11 | familyID: ModelFamilyIDs.Ada,
12 | tokenLimit: 2049,
13 | mergeSpaces: 0,
14 | modelIDs: [
15 | 'ada-code-search-code',
16 | 'ada-code-search-text',
17 | 'ada-search-document',
18 | 'ada-search-query',
19 | 'ada-similarity',
20 | 'ada:2020-05-03',
21 | 'ada',
22 | 'code-search-ada-code-001',
23 | 'code-search-ada-text-001',
24 | 'text-ada-001',
25 | 'text-ada:001',
26 | 'text-embedding-ada-002',
27 | 'text-search-ada-doc-001',
28 | 'text-search-ada-query-001',
29 | 'text-similarity-ada-001',
30 | ],
31 | pricing: {
32 | usage: 0.0004,
33 | prompt: 0.0004,
34 | completion: 0.0004,
35 | fineTunedTraining: 0.0004,
36 | fineTunedUsage: 0.0016,
37 | },
38 | }
39 |
--------------------------------------------------------------------------------
/models/gpt-4.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 |
10 | export const GPT4_8KModelFamily: ModelFamily = {
11 | familyID: ModelFamilyIDs.GPT4,
12 | tokenLimit: 8192,
13 | mergeSpaces: 0,
14 | modelIDs: ['gpt-4', 'gpt-4-0314'],
15 | preferredModelID: 'gpt-4',
16 | pricing: {
17 | prompt: 0.03,
18 | completion: 0.06,
19 | usage: null,
20 | fineTunedTraining: null,
21 | fineTunedUsage: null,
22 | },
23 | }
24 |
25 | export const GPT4_32KModelFamily: ModelFamily = {
26 | familyID: ModelFamilyIDs.GPT4,
27 | tokenLimit: 32768,
28 | mergeSpaces: 0,
29 | modelIDs: ['gpt-4-32k', 'gpt-4-32k-0314'],
30 | preferredModelID: 'gpt-4-32k',
31 | pricing: {
32 | prompt: 0.06,
33 | completion: 0.12,
34 | usage: null,
35 | fineTunedTraining: null,
36 | fineTunedUsage: null,
37 | },
38 | }
39 |
--------------------------------------------------------------------------------
/models/babbage.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 |
10 | export const BabbageModelFamily: ModelFamily = {
11 | familyID: ModelFamilyIDs.Babbage,
12 | tokenLimit: 2049,
13 | mergeSpaces: 0,
14 | modelIDs: [
15 | 'babbage-code-search-code',
16 | 'babbage-code-search-text',
17 | 'babbage-search-document',
18 | 'babbage-search-query',
19 | 'babbage-similarity',
20 | 'babbage:2020-05-03',
21 | 'babbage',
22 | 'code-search-babbage-code-001',
23 | 'code-search-babbage-text-001',
24 | 'text-babbage-001',
25 | 'text-babbage:001',
26 | 'text-search-babbage-doc-001',
27 | 'text-search-babbage-query-001',
28 | 'text-similarity-babbage-001',
29 | ],
30 | pricing: {
31 | usage: 0.0005,
32 | prompt: 0.0005,
33 | completion: 0.0005,
34 | fineTunedTraining: 0.0006,
35 | fineTunedUsage: 0.0024,
36 | },
37 | }
38 |
--------------------------------------------------------------------------------
/models/mod.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { AdaModelFamily } from './ada.mjs'
9 | import { BabbageModelFamily } from './babbage.mjs'
10 | import { ChatGPTModelFamily } from './chat-gpt.mjs'
11 | import { ModelFamiliesMap } from './common.mjs'
12 | import { CurieModelFamily } from './curie.mjs'
13 | import { DavinciModelFamily } from './davinci.mjs'
14 | import { GPT4_32KModelFamily, GPT4_8KModelFamily } from './gpt-4.mjs'
15 |
16 | /**
17 | * A global store of all model families.
18 | */
19 | export const ModelFamilyStore = new ModelFamiliesMap()
20 |
21 | ModelFamilyStore.addFamily(AdaModelFamily)
22 | ModelFamilyStore.addFamily(BabbageModelFamily)
23 | ModelFamilyStore.addFamily(CurieModelFamily)
24 | ModelFamilyStore.addFamily(DavinciModelFamily)
25 | ModelFamilyStore.addFamily(ChatGPTModelFamily)
26 | ModelFamilyStore.addFamily(GPT4_32KModelFamily)
27 | ModelFamilyStore.addFamily(GPT4_8KModelFamily)
28 |
29 | export * from './common.mjs'
30 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Sister Software
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/models/davinci.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 |
10 | export const DavinciModelFamily: ModelFamily = {
11 | familyID: ModelFamilyIDs.Davinci,
12 | tokenLimit: 2049,
13 | mergeSpaces: 0,
14 | modelIDs: [
15 | 'davinci-if:3.0.0',
16 | 'davinci-instruct-beta:2.0.0',
17 | 'davinci-instruct-beta',
18 | 'davinci-search-document',
19 | 'davinci-search-query',
20 | 'davinci-similarity',
21 | 'davinci:2020-05-03',
22 | 'davinci',
23 | 'if-davinci-v2',
24 | 'if-davinci:3.0.0',
25 | 'text-davinci-001',
26 | 'text-davinci-002',
27 | 'text-davinci-003',
28 | 'text-davinci-edit-001',
29 | 'text-davinci-insert-001',
30 | 'text-davinci-insert-002',
31 | 'text-davinci:001',
32 | 'text-search-davinci-doc-001',
33 | 'text-search-davinci-query-001',
34 | 'text-similarity-davinci-001',
35 | ],
36 | pricing: {
37 | usage: 0.02,
38 | prompt: 0.02,
39 | completion: 0.02,
40 | fineTunedTraining: 0.03,
41 | fineTunedUsage: 0.12,
42 | },
43 | }
44 |
--------------------------------------------------------------------------------
/CostEstimator.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { EncoderInput } from './BytePairEncoder.mjs'
9 | import { EncoderResult } from './EncoderResult.mjs'
10 | import { ModelFamily, ModelFamilyStore, ModelPricingTypes } from './models/mod.mjs'
11 | import { encode } from './tokenizer/mod.mjs'
12 |
13 | export type CostEstimatorInput = string | EncoderResult
14 |
15 | export type ICostEstimationResult = Record<ModelPricingTypes, number | null>
16 |
17 | export interface NormalizeInputResult {
18 | modelFamily: ModelFamily
19 | encodedResults: EncoderResult[]
20 | }
21 |
22 | export interface EstimateCostFn {
23 | (modelOrFamilyID: string, ...inputs: EncoderInput[]): ICostEstimationResult
24 | (modelFamily: ModelFamily, ...inputs: EncoderInput[]): ICostEstimationResult
25 | }
26 |
27 | export const estimateCost: EstimateCostFn = (modelInput: string | ModelFamily, ...inputs: EncoderInput[]) => {
28 | const modelFamily = ModelFamilyStore.get(modelInput)
29 | const encodedResults = inputs.map((input) => encode(input))
30 | const tokenCount = encodedResults.reduce((acc, result) => acc + result.tokens.length, 0)
31 |
32 | // Remember that pricing is per 1000 tokens
33 | const pricedUnits = tokenCount / 1000
34 |
35 | const result = {} as ICostEstimationResult
36 |
37 | for (const [pricingType, pricePer] of Object.entries(modelFamily.pricing)) {
38 | const price = typeof pricePer === 'number' ? pricePer * pricedUnits : null
39 |
40 | result[pricingType as ModelPricingTypes] = price
41 | }
42 |
43 | return result
44 | }
45 |
--------------------------------------------------------------------------------
/tokenizer/parsers.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { TokenEncodingsRecord } from '../BytePairTokenMap.mjs'
9 | import { BPEVocab, VocabEntry } from '../RanksMap.mjs'
10 | import { BundledEncoderKeys, BundledEncoderValues, BundledVocab } from './common.mjs'
11 |
12 | /**
13 | * Parses a bundled vocabulary into a list of bigrams.
14 | * @internal
15 | */
16 | export function parseBundledVocab(bundledVocab: BundledVocab): BPEVocab {
17 | if (bundledVocab.length % 2 !== 0) {
18 | throw new Error('Invalid bundled vocabulary format: vocab must be an even number of entries')
19 | }
20 |
21 | const entries: VocabEntry[] = []
22 |
23 | for (let i = 0; i < bundledVocab.length; i += 2) {
24 | const prefix = bundledVocab[i]
25 | const suffix = bundledVocab[i + 1]
26 |
27 | entries.push({
28 | prefix,
29 | suffix,
30 | })
31 | }
32 |
33 | return {
34 | version: 'bundled',
35 | entries,
36 | }
37 | }
38 |
39 | /**
40 | * Parses a bundled encoder into a record.
41 | * @internal
42 | */
43 | export function parseBundledEncoder(
44 | encoderKeys: BundledEncoderKeys,
45 | encoderValues: BundledEncoderValues
46 | ): TokenEncodingsRecord {
47 | if (encoderKeys.length !== encoderValues.length) {
48 | throw new Error('Invalid bundled encoder: keys and values are not the same length')
49 | }
50 |
51 | const tokenEncodings: TokenEncodingsRecord = {}
52 |
53 | for (let i = 0; i < encoderKeys.length; i++) {
54 | const key = encoderKeys[i]
55 | const value = encoderValues[i]
56 |
57 | tokenEncodings[key] = value
58 | }
59 |
60 | return tokenEncodings
61 | }
62 |
--------------------------------------------------------------------------------
/patterns.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | /**
9 | * Default contractions used by the tokenizer pattern.
10 | * Note that order matters here, as the pattern will match the first contraction that matches.
11 | */
12 | export const DEFAULT_CONTRACTIONS = [
13 | /** @example "John's" */
14 | `'s`,
15 | /** @example "can't" */
16 | `'t`,
17 | /** @example "they're" */
18 | `'re`,
19 | /** @example "I've" */
20 | `'ve`,
21 | /** @example "I'm" */
22 | `'m`,
23 | /** @example "they'll" */
24 | `'ll`,
25 | /** @example "he'd" */
26 | `'d`,
27 | ] as const satisfies readonly string[]
28 |
29 | /**
30 | * Default tokenizer rules used to build the tokenizer pattern.
31 | */
32 | export const DEFAULT_TOKENIZER_RULES = [
33 | /** Matches one or more letters optionally preceded by a space. */
34 | ' ?\\p{L}+',
35 | /** Matches one or more digits optionally preceded by a space. */
36 | ' ?\\p{N}+',
37 | /** Matches one or more non-space, non-letter, non-digit characters optionally preceded by a space. */
38 | ' ?[^\\s\\p{L}\\p{N}]+',
39 | /** Matches one or more spaces that are not followed by a non-space character (i.e. end of word). */
40 | '\\s+(?!\\S)',
41 | /** Matches one or more spaces. */
42 | '\\s+',
43 | ] as const satisfies readonly string[]
44 |
45 | /**
46 | * Creates a regular expression pattern used to tokenize text into individual tokens.
47 | * @param contractions - Contractions used by the tokenizer pattern.
48 | * @param rules - Rules used to build the tokenizer pattern.
49 | *
50 | * @see {@linkcode DEFAULT_TOKENIZER_RULES}
51 | */
52 | export function createTokenizerPattern(
53 | contractions: string[] = DEFAULT_CONTRACTIONS.slice(),
54 | rules: string[] = DEFAULT_TOKENIZER_RULES.slice()
55 | ): RegExp {
56 | const pattern = [...contractions, ...rules].join('|')
57 | return new RegExp(pattern, 'gu')
58 | }
59 |
--------------------------------------------------------------------------------
/tokenizer/gpt.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { BytePairDecoder } from '../BytePairDecoder.mjs'
9 | import { BytePairEncoder } from '../BytePairEncoder.mjs'
10 | import { BytePairEncoding } from '../BytePairEncoding.mjs'
11 | import { DEFAULT_BPE_OPTIONS } from './data.mjs'
12 |
13 | // We can use a single instance for both encoding and decoding GPT tokens.
14 | const gptEncoding = new BytePairEncoding(DEFAULT_BPE_OPTIONS)
15 |
16 | /**
17 | * Default GPT-3 encoder.
18 | * This is a singleton instance of {@linkcode BytePairEncoder} that is pre-configured to encode GPT-3 tokens.
19 | */
20 | export const gptEncoder = new BytePairEncoder(gptEncoding)
21 |
22 | /**
23 | * Encodes a given UTF-8 string into a list of GPT-3 tokens.
24 | *
25 | * ```js
26 | * const text = "Do androids dream of electric sheep?"
27 | * const tokens = encoder.encode(text)
28 | * console.log(tokens) // [5211, 290, 305, 2340, 4320, 286, 5186, 15900, 30]
29 | * ```
30 | *
31 | * @see {@linkcode decode} for the inverse function.
32 | * @see {@linkcode BytePairEncoder} for more information on how the tokens are decoded.
33 | */
34 | export const encode = gptEncoder.encode
35 |
36 | /**
37 | * Default GPT-3 decoder.
38 | * This is a singleton instance of {@linkcode BytePairDecoder} that is pre-configured to decode GPT-3 tokens.
39 | */
40 | export const gptDecoder = new BytePairDecoder(gptEncoding)
41 |
42 | /**
43 | * Converts a list of GPT-3 tokens into a string.
44 | *
45 | * ```ts
46 | * const tokens = [5211, 290, 305, 2340, 4320, 286, 5186, 15900, 30]
47 | * const text = decode(tokens)
48 | * console.log(text) // "Do androids dream of electric sheep?"
49 | * ```
50 | *
51 | * @see {@linkcode encode} for the inverse function.
52 | * @see {@linkcode BytePairDecoder} for more information on how the tokens are decoded.
53 | */
54 | export const decode = gptDecoder.decode
55 |
--------------------------------------------------------------------------------
/test/fixtures/multiple-paragraphs.txt:
--------------------------------------------------------------------------------
1 | Adipisicing minim aliquip irure nisi mollit dolore nostrud ea incididunt occaecat.
2 | Non voluptate ea sint eiusmod et pariatur incididunt commodo commodo veniam reprehenderit sunt.
3 | Elit velit ullamco ea id sit elit Lorem irure cupidatat ea dolor. Eu ex eiusmod dolore duis.
4 | Esse dolor amet incididunt cupidatat amet velit nostrud.
5 |
6 | Ut anim et ea minim nisi fugiat officia exercitation minim aliqua velit nostrud.
7 | Anim cillum ea sunt ad eu laboris.
8 | Incididunt sint eiusmod do consequat eiusmod esse eiusmod sit irure esse anim veniam officia aliquip.
9 | Nulla esse eiusmod est aliqua. Magna ad aute anim qui et irure eu.
10 |
11 | Incididunt dolore adipisicing excepteur sunt dolor tempor nostrud cupidatat ullamco id Lorem esse.
12 | Non velit ex adipisicing esse proident anim irure exercitation id eiusmod officia proident.
13 | Sint incididunt cillum non reprehenderit nisi pariatur ipsum mollit enim commodo incididunt Lorem fugiat deserunt.
14 | Veniam do pariatur duis magna id sit dolore amet aliqua magna aliqua voluptate exercitation.
15 | Nostrud tempor consectetur ut ad consequat. Ex commodo in sunt dolor irure. Nostrud elit et ipsum minim.
16 |
17 | Amet officia nostrud amet cillum ea nisi.
18 | In in nulla ullamco amet velit nostrud exercitation do nisi reprehenderit.
19 | Laborum aute dolore in ut aliqua reprehenderit amet nisi qui esse occaecat cupidatat cupidatat labore.
20 | Veniam sunt labore excepteur aliqua aliqua sunt do enim do anim in occaecat consequat aute.
21 | Sit labore irure quis commodo non elit sunt ex ea. Nisi do tempor aliquip aliqua ut veniam culpa dolore.
22 |
23 | Exercitation sint consequat sint exercitation enim officia non velit.
24 | Cillum ut sint dolore minim aliqua. Id excepteur fugiat magna laborum in dolore laboris cupidatat occaecat aliquip ipsum laboris.
25 | Duis ex ut qui sit id fugiat aute laborum nulla.
26 | Minim laboris consectetur non occaecat dolor commodo sit adipisicing consequat magna.
27 | Incididunt elit amet fugiat ex voluptate.
28 | Nulla occaecat sit sunt voluptate ex id adipisicing excepteur fugiat reprehenderit.
29 |
--------------------------------------------------------------------------------
/node/parsers.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { TokenEncodingsRecord } from '../BytePairTokenMap.mjs'
9 | import { BPEVocab, VocabEntry } from '../RanksMap.mjs'
10 |
11 | /**
12 | * Parses a BPE file into a list of bigrams
13 | *
14 | * The vocab.bpe file is a text file that contains a set of byte pair encoding (BPE) codes
15 | * that are used in the tokenization process.
16 | *
17 | * The file should be in the following format:
18 | *
19 | * ```text
20 | * #version: VERSION_STRING
21 | * [prefix1] [suffix1]
22 | * [prefixN] [suffixN]
23 | * ...
24 | * ```
25 | */
26 | export function parseBPEFile(bpeFileContents: string): BPEVocab {
27 | const lines = bpeFileContents.trim().split('\n')
28 | const [versionLine, ...bpeMerges] = lines
29 | const [, version = 'unknown'] = versionLine.trim().match(/^#version: (\d.+)$/) || []
30 |
31 | const entries = bpeMerges.map((line, lineIndex) => {
32 | const segments = line
33 | // Each line contains a pair of tokens separated by a space
34 | .split(/(\s+)/)
35 | // Clean up the tokens...
36 | .map((x) => x.trim())
37 | .filter(Boolean)
38 |
39 | if (segments.length < 2) {
40 | throw new Error(`Invalid BPE file format: line ${lineIndex + 1} is not a valid bigram`)
41 | }
42 |
43 | const [prefix, suffix] = segments
44 |
45 | const entry: VocabEntry = {
46 | prefix,
47 | suffix,
48 | }
49 |
50 | return entry
51 | })
52 |
53 | return {
54 | version,
55 | entries,
56 | }
57 | }
58 |
59 | /**
60 | * Parse a token encoder file, usually from a file named `encoder.json`
61 | */
62 | export function parseEncoderFile(
63 | /**
64 | * The token encoder content, either as a string or as a parsed object.
65 | */
66 | tokenEncoderContent: string | TokenEncodingsRecord
67 | ): TokenEncodingsRecord {
68 | const tokenEncodings: TokenEncodingsRecord =
69 | typeof tokenEncoderContent === 'string' ? JSON.parse(tokenEncoderContent) : tokenEncoderContent
70 |
71 | return tokenEncodings
72 | }
73 |
--------------------------------------------------------------------------------
/tokenizer/codex.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { BytePairDecoder } from '../BytePairDecoder.mjs'
9 | import { BytePairEncoder } from '../BytePairEncoder.mjs'
10 | import { BytePairEncoding } from '../BytePairEncoding.mjs'
11 | import { DEFAULT_BPE_OPTIONS } from './data.mjs'
12 |
// Shared BPE configuration for Codex models: the default GPT options with
// Codex-style whitespace merging enabled.
const codexEncoding = new BytePairEncoding({
  ...DEFAULT_BPE_OPTIONS,
  mergeSpaces: 'codex',
})
17 |
/**
 * Default Codex encoder.
 * This is a singleton instance of {@linkcode BytePairEncoder} that is pre-configured to encode text into Codex tokens.
 */
export const codexEncoder = new BytePairEncoder(codexEncoding)
23 |
/**
 * Encodes a given UTF-8 string into a list of Codex tokens.
 *
 * ```js
 * const codeText = [
 *   'function deeplyNested () {',
 *   ' return {',
 *   ' the: {',
 *   ' quick: {',
 *   ' brown: {',
 *   '...etc'
 * ].join('')
 *
 * const codexTokens = encodeCodex(codeText)
 * ```
 *
 * @see {@linkcode decodeCodex} for the inverse function.
 * @see {@linkcode BytePairEncoder} for more information on how the tokens are encoded.
 */
export const encodeCodex = codexEncoder.encode
44 |
/**
 * Default Codex decoder.
 * This is a singleton instance of {@linkcode BytePairDecoder} that is pre-configured to decode Codex tokens.
 */
export const codexDecoder = new BytePairDecoder(codexEncoding)
50 |
/**
 * Converts a list of Codex tokens into a string.
 * Also accepts an `EncoderResult` object, as returned by {@linkcode encodeCodex}.
 *
 * ```ts
 * // Truncated for brevity...
 * const tokens = [8818, 7744, 45, 7287, 7499]
 * const text = codexDecoder.decode(tokens)
 * console.log(text)
 *
 * // `function deeplyNested () {
 * //   return {
 * //     the: {
 * //       quick: {
 * //         brown: {
 * // ...`
 * ```
 *
 * @see {@linkcode codexEncoder} for the inverse object.
 * @see {@linkcode BytePairDecoder} for more information on how the tokens are decoded.
 */
export const decodeCodex = codexDecoder.decode
72 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # What is this?
2 |
3 | **GPT Token Utilities** is a small library for encoding and decoding text to and from the tokenized format used by OpenAI's GPT models.
4 |
5 | ## Why should I use this?
6 |
### 🏃‍♀️ Fast
8 |
9 | Our token encoder/decoder is optimized for a balance of speed and ease of use. No external dependencies are required.
10 |
### 🤸‍♀️ Flexible
12 |
13 | Everything is written in TypeScript and includes type definitions with full documentation. This library is isomorphic and can be used in both Node and the browser!
14 |
15 | ### ⚖️ Light in size. Heavy in features.
16 |
17 | GPT Token Utils balances a small footprint with a full-featured API.
18 | It's also tree-shakeable, so you can import only the functions you need.
19 |
20 | ## Installation
21 |
22 | ### NPM
23 |
24 | ```bash
25 | yarn add gpt-token-utils
26 | # or
27 | npm install --save gpt-token-utils
28 | ```
29 |
30 | ## Usage
31 |
32 | ## Encoding and Decoding Text
33 |
34 | The `encode` and `decode` exports are the main functions you'll use to work with GPT tokens.
35 |
36 | ```js
37 | import { encode, decode } from 'gpt-token-utils'
38 |
39 | // Encode a string to a list of tokens...
40 | const tokens = encode('Humans are strange creatures, and ever so curious too!')
41 |
42 | // You've got a list of tokens!
43 | console.log(tokens) // [32661, 504, 389, 6283, 8109, 11, 290, 1683, 523, 11040, 1165, 0]
44 |
// How many tokens are there?
console.log(tokens.length) // 12
47 |
48 | // Can we decode it back to text?
49 | console.log(decode(tokens)) // "Humans are strange creatures...."
50 | ```
51 |
52 | ### Advanced Usage
53 |
54 | By default, GPT Token Utils includes a sizable vocabulary and encoder. Alternatively, you can pass in your own to customize the encoding/decoding process.
55 |
56 | ```js
57 | import {BytePairEncoder} from 'gpt-token-utils/BytePairEncoder'
58 |
59 | const tokenEncoder = new BytePairEncoder({...})
60 | ```
61 |
62 | ```js
63 | import {BytePairDecoder} from 'gpt-token-utils/BytePairDecoder'
64 |
const tokenDecoder = new BytePairDecoder({...})
66 | ```
67 |
68 | # License
69 |
70 | GPT Token Utils is licensed under the [MIT License](https://opensource.org/licenses/MIT). If you've got something cool to share that's built with this library, let us know at [@SisterSoftware](https://twitter.com/SisterSoftware)! We would love to see it!
71 |
--------------------------------------------------------------------------------
/RanksMap.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | const nodeInspectSymbol = Symbol.for('nodejs.util.inspect.custom')
9 |
10 | /**
11 | * Map of byte-pair encodings according to their BPE rank
12 | * @internal
13 | */
14 | export class RanksMap {
15 | protected _prefixToSuffixRankMap: Map<
16 | /** Prefix */
17 | string,
18 | Map<
19 | /** Suffix */
20 | string,
21 | /** Rank */
22 | number
23 | >
24 | > = new Map()
25 |
26 | public getRank(prefix: string, suffix: string): number | undefined {
27 | const suffixMap = this._prefixToSuffixRankMap.get(prefix)
28 |
29 | if (suffixMap) {
30 | return suffixMap.get(suffix)
31 | }
32 | }
33 |
34 | constructor(vocab: VocabEntry[] | BPEVocab, mergesSpacesCount = 0) {
35 | const normalizedVocab = Array.isArray(vocab) ? vocab.slice() : vocab.entries.slice()
36 |
37 | if (mergesSpacesCount > 0) {
38 | for (let i = 1; i < mergesSpacesCount; i++) {
39 | for (let j = 1; j < mergesSpacesCount; j++) {
40 | if (i + j <= mergesSpacesCount) {
41 | normalizedVocab.push({
42 | prefix: '\u0120'.repeat(i),
43 | suffix: '\u0120'.repeat(j),
44 | })
45 | }
46 | }
47 | }
48 | }
49 |
50 | for (const [rank, entry] of normalizedVocab.entries()) {
51 | let suffixMap = this._prefixToSuffixRankMap.get(entry.prefix)
52 |
53 | if (!suffixMap) {
54 | suffixMap = new Map()
55 | this._prefixToSuffixRankMap.set(entry.prefix, suffixMap)
56 | }
57 |
58 | suffixMap.set(entry.suffix, rank)
59 | }
60 | }
61 |
62 | public get size() {
63 | return this._prefixToSuffixRankMap.size
64 | }
65 |
66 | [nodeInspectSymbol]() {
67 | return `RanksMap(${this.size})`
68 | }
69 | }
70 |
/**
 * A parsed vocabulary entry: a single bigram (prefix/suffix pair).
 * The rank of the byte-pair encoding is derived from the index of the pair in the `vocab.bpe` file.
 */
export interface VocabEntry {
  /** The word stem prefix in the pair. */
  prefix: string
  /** The suffix token in the pair. */
  suffix: string
}

/**
 * A vocabulary of byte-pair encodings, in ascending rank order.
 *
 * @see {@linkcode parseBPEFile}
 */
export interface BPEVocab {
  /** The version string declared at the top of the `vocab.bpe` file. */
  version: string
  /** The parsed bigrams; each entry's rank is its index in this array. */
  entries: VocabEntry[]
}
91 |
--------------------------------------------------------------------------------
/BytePairDecoder.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { BytePairEncoding } from './BytePairEncoding.mjs'
9 | // eslint-disable-next-line @typescript-eslint/no-unused-vars
10 | import type { BytePairEncoder } from './BytePairEncoder.mjs'
11 | import type { EncoderResult } from './EncoderResult.mjs'
12 |
/**
 * Methods associated with decoding a list of tokens into a string.
 * Implementations accept either a plain token array or a full `EncoderResult`.
 */
export interface TokenDecodeFn {
  (
    /**
     * The list of tokens to decode.
     */
    tokens: number[]
  ): string

  (
    /**
     * The resulting object of the {@linkcode BytePairEncoder.encode} function.
     */
    encoderResult: EncoderResult
  ): string
}
31 |
32 | /**
33 | * GPT Token Decoder.
34 | *
35 | * Generally, you should not need to use this class directly unless you are
36 | * implementing a custom token decoder.
37 | *
38 | * @see {@linkcode BytePairEncoder} for the encoder.
39 | *
40 | * ```ts
41 | * const decoder = new BytePairDecoder({codePointByteMap, bpeTokenMap})
42 | * const text = decoder.decode(tokens)
43 | * ```
44 | */
45 | export class BytePairDecoder {
46 | constructor(protected _bpe: BytePairEncoding, protected _textDecoder = new TextDecoder()) {}
47 |
48 | /**
49 | * Converts a list of tokens into a string.
50 | *
51 | * ```ts
52 | * const tokens = [5211, 290, 305, 2340, 4320, 286, 5186, 15900, 30]
53 | * const text = decoder.decode(tokens)
54 | * console.log(text) // "Do androids dream of electric sheep?"
55 | * ```
56 | *
57 | * @returns The decoded string.
58 | */
59 | public decode: TokenDecodeFn = (tokens: number[] | EncoderResult): string => {
60 | const source = Array.isArray(tokens) ? tokens : tokens.tokens
61 |
62 | const bytePairEncodings = source
63 | // First, we convert the tokens into BPE...
64 | .map((token) => this._bpe.tokenMap.tokenToBytePair(token))
65 | // The pairs combined into a single string to combine the graphemes.
66 | .join('')
67 |
68 | // We then convert the BPE into UTF-8 by split the string...
69 | //...into an array of characters to convert the characters into bytes
70 | const bytes = Array.from(bytePairEncodings, (x) => this._bpe.codePointByteMap.codePointToByte(x))
71 |
72 | // Finally, we convert the bytes into a string.
73 | const text = this._textDecoder.decode(new Uint8Array(bytes))
74 |
75 | return text
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/BytePairTokenMap.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | /**
9 | * A map of byte-pair encodings to their corresponding tokens.
10 | * @internal
11 | */
12 | export type TokenEncodingsRecord = Record
13 |
14 | const nodeInspectSymbol = Symbol.for('nodejs.util.inspect.custom')
15 |
16 | /**
17 | * Two-way map between Unicode byte-pairs and tokens.
18 | * @internal
19 | */
20 | export class BytePairTokenMap {
21 | protected _bpeTokenMap: Map<
22 | /**
23 | * Byte paired character(s), e.g. `'!'`, `'\u00a8'`
24 | */
25 | string,
26 | /**
27 | * The corresponding token, e.g. `0`, `101`
28 | */
29 | number
30 | >
31 | protected _tokenBPEMap: Map<
32 | /**
33 | * The corresponding token, e.g. `0`, `101`
34 | */
35 | number,
36 | /**
37 | * Byte paired character(s), e.g. `'!'`, `'\u00a8'`
38 | */
39 | string
40 | >
41 |
42 | constructor(tokenEncodings: TokenEncodingsRecord, nMergedSpaces = 0) {
43 | this._bpeTokenMap = new Map()
44 | this._tokenBPEMap = new Map()
45 |
46 | for (const [key, value] of Object.entries(tokenEncodings)) {
47 | this.addBytePair(key, value!)
48 | }
49 |
50 | // add merged spaces for codex tokenizer
51 | const normalizeVocabLength = this._bpeTokenMap.size + nMergedSpaces
52 |
53 | for (let i = 0; i < nMergedSpaces; i++) {
54 | const key = '\u0120'.repeat(i + 2)
55 | const value = normalizeVocabLength - nMergedSpaces + i
56 |
57 | this.addBytePair(key, value)
58 | }
59 | }
60 |
61 | public addBytePair(bytePair: string, token: number): void {
62 | this._bpeTokenMap.set(bytePair, token)
63 | this._tokenBPEMap.set(token, bytePair)
64 | }
65 |
66 | public tokenToBytePair(token: number): string {
67 | const bytePair = this._tokenBPEMap.get(token)
68 |
69 | if (typeof bytePair === 'undefined') {
70 | throw new Error(`Token "${token}" was not found in the token encoder.`)
71 | }
72 |
73 | return bytePair
74 | }
75 |
76 | public bytePairToToken(bytePair: string): number {
77 | const token = this._bpeTokenMap.get(bytePair)
78 |
79 | if (typeof token === 'undefined') {
80 | throw new Error(`Byte pair "${bytePair}" was not found in the token encoder.`)
81 | }
82 |
83 | return token
84 | }
85 |
86 | public get size() {
87 | return this._bpeTokenMap.size
88 | }
89 |
90 | public [nodeInspectSymbol]() {
91 | return `BytePairTokenMap(${this.size})`
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/BytePairEncoding.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { BytePairTokenMap, TokenEncodingsRecord } from './BytePairTokenMap.mjs'
9 | import { CodePointByteMap } from './CodePointByteMap.mjs'
10 | import { createTokenizerPattern } from './patterns.mjs'
11 | import { BPEVocab, RanksMap, VocabEntry } from './RanksMap.mjs'
12 |
/**
 * Options for configuring a {@linkcode BytePairEncoding} instance.
 */
export interface IBytePairEncodingOptions {
  /**
   * The token encoder map. This is typically derived from a `encoder.json` file:
   *
   * ```ts
   * const tokenEncodings = parseEncoderFile(fs.readFileSync('./encoder.json', 'utf-8'))
   * ```
   */
  tokenEncodings: TokenEncodingsRecord

  /**
   * The BPE ranks map. This is typically derived from a `vocab.bpe` file:
   *
   * ```ts
   * const vocab = parseBPEFile(fs.readFileSync('./vocab.bpe', 'utf-8'))
   * ```
   *
   * You should only use this option if you are using a custom vocabulary.
   *
   * @see {@linkcode parseBPEFile}
   *
   * @default parseBPEFile(DEFAULT_VOCAB)
   */
  vocab: VocabEntry[] | BPEVocab

  /**
   * The number of spaces to merge into a single token, or `'codex'` for the
   * Codex preset, or `'none'` to disable merging.
   *
   * Codex models use a different set of encodings that handle whitespace more efficiently.
   * @default 'none'
   */
  mergeSpaces?: 'none' | 'codex' | number

  /**
   * Optional override of the regular expression used to tokenize text.
   * @default createTokenizerPattern()
   */
  tokenizationPattern?: RegExp
}
52 |
53 | /**
54 | * A base class for the Byte Pair Encoding (BPE) encoder and decoder.
55 | * @internal
56 | */
57 | export class BytePairEncoding {
58 | public codePointByteMap: CodePointByteMap
59 | public mergesSpacesCount: number
60 |
61 | public tokenMap: BytePairTokenMap
62 | public ranksMap: RanksMap
63 |
64 | public tokenizationPattern: RegExp
65 |
66 | constructor(options: IBytePairEncodingOptions) {
67 | this.tokenizationPattern = options.tokenizationPattern ?? createTokenizerPattern()
68 | this.codePointByteMap = new CodePointByteMap()
69 |
70 | if (typeof options.mergeSpaces === 'string') {
71 | this.mergesSpacesCount = options.mergeSpaces === 'codex' ? 30 : 0
72 | } else {
73 | this.mergesSpacesCount = options.mergeSpaces ?? 0
74 | }
75 |
76 | this.tokenMap = new BytePairTokenMap(options.tokenEncodings, this.mergesSpacesCount)
77 | this.ranksMap = new RanksMap(options.vocab, this.mergesSpacesCount)
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/test/CostEstimator.test.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { expect, test } from 'vitest'
9 | import {
10 | BytePairEncoder,
11 | BytePairEncoding,
12 | createDefaultBPEOptions,
13 | estimateCost,
14 | ICostEstimationResult,
15 | ModelFamilyIDs,
16 | } from '../mod.mjs'
17 | import { readFixture, TestCase } from './common.mjs'
18 |
// A cost-estimation test case: input text plus the exact per-endpoint USD
// costs expected for the given model.
interface CostEstimatorTestCase extends TestCase {
  modelID: string
}

// Expected values are the raw results of JS floating-point arithmetic
// (hence artifacts like 0.021509999999999998) compared exactly.
const testCases: CostEstimatorTestCase[] = [
  {
    label: 'Empty string',
    modelID: ModelFamilyIDs.Davinci,
    given: '',
    expected: {
      usage: 0,
      fineTunedUsage: 0,
      fineTunedTraining: 0,
      prompt: 0,
      completion: 0,
    },
  },
  {
    label: 'Just a space',
    modelID: ModelFamilyIDs.Davinci,
    given: ' ',
    expected: {
      completion: 0.00002,
      fineTunedTraining: 0.00003,
      fineTunedUsage: 0.00012,
      prompt: 0.00002,
      usage: 0.00002,
    },
  },
  {
    label: 'Tab',
    modelID: ModelFamilyIDs.Davinci,
    given: '\t',
    expected: {
      completion: 0.00002,
      fineTunedTraining: 0.00003,
      fineTunedUsage: 0.00012,
      prompt: 0.00002,
      usage: 0.00002,
    },
  },
  {
    label: 'Single paragraph',
    modelID: ModelFamilyIDs.Davinci,
    given: readFixture('single-paragraph.txt'),
    expected: {
      completion: 0.0031,
      fineTunedTraining: 0.00465,
      fineTunedUsage: 0.0186,
      prompt: 0.0031,
      usage: 0.0031,
    },
  },
  {
    label: 'Multiple paragraphs',
    modelID: ModelFamilyIDs.Davinci,
    given: readFixture('multiple-paragraphs.txt'),
    expected: {
      completion: 0.01434,
      fineTunedTraining: 0.021509999999999998,
      fineTunedUsage: 0.08603999999999999,
      prompt: 0.01434,
      usage: 0.01434,
    },
  },
  // NOTE(review): disabled case — presumably pending GPT-4 pricing support; confirm before re-enabling.
  // {
  //   label: 'HTML content',
  //   modelID: ModelFamilyIDs.GPT4,
  //   given: readFixture('sample-html.html'),
  //   expected: {
  //     completion: 0.005659999999999999,
  //     fineTunedTraining: 0.00849,
  //     fineTunedUsage: 0.03396,
  //     prompt: 0.005659999999999999,
  //     usage: 0.005659999999999999,
  //   },
  // },
]

// Encode each input with the default BPE options (plus any per-case overrides)
// and compare the estimated costs exactly.
for (const { label, given, modelID, expected, options } of testCases) {
  test(label, () => {
    const gptEncoding = new BytePairEncoding({ ...createDefaultBPEOptions(), ...options })
    const encoder = new BytePairEncoder(gptEncoding)

    const encoded = encoder.encode(given)
    const estimatedCosts = estimateCost(modelID, encoded)

    expect(estimatedCosts).toEqual(expected)
  })
}
109 |
--------------------------------------------------------------------------------
/CodePointByteMap.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | const nodeInspectSymbol = Symbol.for('nodejs.util.inspect.custom')
9 |
10 | /**
11 | * Two-way map of byte values to their corresponding Unicode codepoints.
12 | */
13 | export class CodePointByteMap {
14 | /**
15 | * Maps each byte value to its corresponding Unicode character.
16 | */
17 | protected _byteToCodePoint: Map
18 | /**
19 | * Maps each Unicode character to its corresponding byte value.
20 | */
21 | protected _codePointToByte: Map
22 |
23 | constructor() {
24 | // Contains all the byte values corresponding to printable ASCII characters
25 | const basicBytes = Array.from({ length: 94 }, (_, i) => i + 33)
26 |
27 | // Contains all the byte values corresponding to extended ASCII characters
28 | // that are not already included in `basicBytes`
29 | const extendedBytes = [
30 | ...Array.from({ length: 12 }, (_, i) => i + 161),
31 | ...Array.from({ length: 82 }, (_, i) => i + 174),
32 | ]
33 |
34 | // Combine `basicBytes` and `extendedBytes` to get a list of all byte values
35 | const allBytes = basicBytes.concat(extendedBytes)
36 | const cs = allBytes.slice()
37 |
38 | let unicodeIndex = 0
39 |
40 | // Then assign unique Unicode characters to the bytes in `allBytes` that are not
41 | // already in `basicBytes` or `extendedBytes`...
42 | // For each possible byte value (0-255)...
43 | for (let byteValue = 0; byteValue < 256; byteValue++) {
44 | // If the byte value is not in `allBytes`, it needs to be added to the dictionary
45 | if (!allBytes.includes(byteValue)) {
46 | // Add the byte value to `allBytes`
47 | allBytes.push(byteValue)
48 |
49 | cs.push(256 + unicodeIndex)
50 |
51 | // Increment `unicodeIndex` so the next new byte value will get a unique Unicode character
52 | unicodeIndex++
53 | }
54 | }
55 |
56 | this._byteToCodePoint = new Map()
57 | this._codePointToByte = new Map()
58 |
59 | for (let i = 0; i < cs.length; i++) {
60 | const key = allBytes[i]
61 | const value = String.fromCharCode(cs[i])
62 |
63 | this._byteToCodePoint.set(key, value)
64 | this._codePointToByte.set(value, key)
65 | }
66 | }
67 |
68 | public byteToCodePoint(byte: number): string {
69 | const codePoint = this._byteToCodePoint.get(byte)
70 |
71 | if (typeof codePoint === 'undefined') {
72 | throw new Error(`Byte "${byte}" was not found in the byte map.`)
73 | }
74 |
75 | return codePoint
76 | }
77 |
78 | public codePointToByte(codePoint: string): number {
79 | const byte = this._codePointToByte.get(codePoint)
80 |
81 | if (typeof byte === 'undefined') {
82 | throw new Error(`Unicode character "${codePoint}" was not found in the byte map.`)
83 | }
84 |
85 | return byte
86 | }
87 |
88 | public get size() {
89 | return this._byteToCodePoint.size
90 | }
91 |
92 | public get byteToCodePointMap() {
93 | return this._byteToCodePoint
94 | }
95 |
96 | public get codePointToByteMap() {
97 | return this._codePointToByte
98 | }
99 |
100 | [nodeInspectSymbol]() {
101 | return `CodePointByteMap(${this.size})`
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "gpt-token-utils",
3 | "version": "1.2.0",
4 | "description": "Isomorphic utilities for GPT-3 tokenization and prompt building.",
5 | "repository": "git@github.com:sister-software/gpt-token-utils.git",
6 | "bugs": {
7 | "url": "https://github.com/sister-software/gpt-token-utils/issues"
8 | },
9 | "author": "Teffen Ellis ",
10 | "homepage": "https://github.com/sister-software/gpt-token-utils#readme",
11 | "license": "MIT",
12 | "keywords": [
13 | "gpt",
14 | "gpt-3",
15 | "gpt3",
16 | "openai",
17 | "Open AI",
18 | "tokenization",
19 | "tokenizer",
20 | "prompt",
21 | "prompt-builder"
22 | ],
23 | "main": "dist/mod.mjs",
24 | "types": "dist/mod.d.mts",
25 | "type": "module",
26 | "files": [
27 | "dist/**/*"
28 | ],
29 | "exports": {
30 | "./package.json": "./package.json",
31 | ".": {
32 | "import": "./dist/mod.mjs",
33 | "types": "./dist/mod.d.mts"
34 | },
35 | "./mod": {
36 | "import": "./dist/mod.mjs",
37 | "types": "./dist/mod.d.mts"
38 | },
39 | "./mod.mjs": {
40 | "import": "./dist/mod.mjs",
41 | "types": "./dist/mod.d.mts"
42 | },
43 | "./BytePairDecoder": {
44 | "import": "./dist/BytePairDecoder/mod.mjs",
45 | "types": "./dist/BytePairDecoder/mod.d.mts"
46 | },
47 | "./BytePairDecoder.mjs": {
48 | "import": "./dist/BytePairDecoder/mod.mjs",
49 | "types": "./dist/BytePairDecoder/mod.d.mts"
50 | },
51 | "./BytePairEncoder": {
52 | "import": "./dist/BytePairEncoder/mod.mjs",
53 | "types": "./dist/BytePairEncoder/mod.d.mts"
54 | },
55 | "./BytePairEncoder.mjs": {
56 | "import": "./dist/BytePairEncoder/mod.mjs",
57 | "types": "./dist/BytePairEncoder/mod.d.mts"
58 | },
59 | "./tokenizer": {
60 | "import": "./dist/tokenizer/mod.mjs",
61 | "types": "./dist/tokenizer/mod.d.mts"
62 | },
63 | "./tokenizer.mjs": {
64 | "import": "./dist/tokenizer/mod.mjs",
65 | "types": "./dist/tokenizer/mod.d.mts"
66 | },
67 | "./models": {
68 | "import": "./dist/models/mod.mjs",
69 | "types": "./dist/models/mod.d.mts"
70 | },
71 | "./models.mjs": {
72 | "import": "./dist/models/mod.mjs",
73 | "types": "./dist/models/mod.d.mts"
74 | },
75 | "./node": {
76 | "import": "./dist/node/mod.mjs",
77 | "types": "./dist/node/mod.d.mts"
78 | },
79 | "./node.mjs": {
80 | "import": "./dist/node/mod.mjs",
81 | "types": "./dist/node/mod.d.mts"
82 | }
83 | },
84 | "scripts": {
85 | "test": "vitest",
86 | "build": "tsc -p ./tsconfig.json",
87 | "start": "http-server ./ -p 8081",
88 | "cli-tiktoken": "NODE_OPTIONS=\"--loader ts-node/esm --no-warnings\" ts-node ./internal/tiktoken.mts",
89 | "cli-example": "NODE_OPTIONS=\"--loader ts-node/esm --no-warnings\" ts-node ./example.mts"
90 | },
91 | "devDependencies": {
92 | "@sister.software/eslint-config": "^1.0.0",
93 | "@sister.software/prettier-config": "^1.0.0",
94 | "@sister.software/stylelint-config": "^1.0.0",
95 | "@types/node": "^18.14.4",
96 | "@typescript-eslint/eslint-plugin": "^5.53.0",
97 | "@typescript-eslint/parser": "^5.53.0",
98 | "eslint": "^8.34.0",
99 | "http-server": "^14.1.1",
100 | "prettier": "^2.8.1",
101 | "prettier-plugin-organize-imports": "^3.2.2",
102 | "react": "^18.2.0",
103 | "react-dom": "^18.2.0",
104 | "stylelint": "~14",
105 | "ts-node": "^10.9.1",
106 | "typescript": "4.9.5",
107 | "vitest": "^0.29.2"
108 | },
109 | "prettier": "@sister.software/prettier-config",
110 | "stylelint": {
111 | "extends": [
112 | "@sister.software/stylelint-config"
113 | ]
114 | },
115 | "eslintConfig": {
116 | "extends": [
117 | "@sister.software/eslint-config"
118 | ],
119 | "ignorePatterns": [
120 | "dist",
121 | "results",
122 | "node_modules",
123 | "test/fixtures"
124 | ]
125 | }
126 | }
127 |
--------------------------------------------------------------------------------
/models/common.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
/**
 * The IDs of available model families.
 * These values can be passed to {@linkcode ModelFamiliesMap.get} as family IDs.
 */
export const ModelFamilyIDs = {
  Ada: 'ada',
  Babbage: 'babbage',
  Curie: 'curie',
  Davinci: 'davinci',
  ChatGPT: 'chat-gpt',
  GPT4: 'gpt-4',
  GPT4_32K: 'gpt-4-32k',
} as const
20 |
/**
 * A model family is a group of models that share a common lineage or training data.
 */
export interface ModelFamily {
  /** The unique ID of the family, e.g. `"davinci"`. */
  familyID: string
  /**
   * The number of tokens that can be used with this model in a single request.
   */
  tokenLimit: number
  /**
   * The number of spaces to merge into a single token.
   *
   * Codex models use a different set of encodings that handle whitespace more efficiently.
   */
  mergeSpaces: number
  /** Per-endpoint pricing for this family. */
  pricing: ModelPricing
  /**
   * The IDs of available models, matches the IDs used in the OpenAI API.
   */
  modelIDs: string[]

  /**
   * The ID of the preferred model in this family.
   */
  preferredModelID?: string
}

/** The kinds of pricing tracked for a model. */
export type ModelPricingTypes = 'usage' | 'fineTunedUsage' | 'fineTunedTraining' | 'prompt' | 'completion'
/**
 * The pricing of a model in US dollars.
 * NOTE(review): `null` appears to mean no published price for that endpoint — confirm.
 * @see https://openai.com/pricing
 */
export interface ModelPricing {
  /**
   * The price of model usage per 1000 tokens.
   */
  usage: number | null
  /**
   * The price of fine-tuned model usage per 1000 tokens.
   */
  fineTunedUsage: number | null
  /**
   * The price of fine-tuned model training per 1000 tokens.
   */
  fineTunedTraining: number | null
  /**
   * The price of usage for the prompt endpoint per 1000 tokens.
   */
  prompt: number | null
  /**
   * The price of usage for the completion endpoint per 1000 tokens.
   */
  completion: number | null
}

/**
 * Call signatures for resolving a model family from several input shapes.
 */
export interface GetModelFamilyFn {
  (
    /**
     * The ID of a model within a family, e.g. `"text-davinci-003"`
     * @returns The family that the model belongs to.
     */
    modelID: string
  ): ModelFamily
  (
    /**
     * The ID of a model family, e.g. `"davinci"`
     * @returns The family associated with the ID.
     */
    familyID: string
  ): ModelFamily
  (
    /**
     * A model family. This is useful for when you already have a model family object.
     * @returns The same family object that was passed in.
     */
    modelFamily: ModelFamily
  ): ModelFamily

  (input: string | ModelFamily): ModelFamily
}
101 |
102 | export class ModelFamiliesMap {
103 | protected _familyMap = new Map()
104 | protected _modelToFamilyMap = new Map()
105 |
106 | public addFamily(family: ModelFamily): void {
107 | this._familyMap.set(family.familyID, family)
108 | for (const modelID of family.modelIDs) {
109 | this._modelToFamilyMap.set(modelID, family)
110 | }
111 | }
112 |
113 | public getFamilyByFamilyID(familyID: string): ModelFamily | undefined {
114 | return this._familyMap.get(familyID)
115 | }
116 |
117 | public getFamilyByModelID(modelID: string): ModelFamily | undefined {
118 | return this._modelToFamilyMap.get(modelID)
119 | }
120 |
121 | public get: GetModelFamilyFn = (input) => {
122 | if (typeof input === 'string') {
123 | const family = this.getFamilyByFamilyID(input) || this.getFamilyByModelID(input)
124 |
125 | if (!family) {
126 | throw new Error(`No model ID or family found with ID: ${input}`)
127 | }
128 |
129 | return family
130 | }
131 |
132 | return input
133 | }
134 |
135 | public isModelInFamily(modelID: string, familyID: string): boolean {
136 | const family = this.getFamilyByFamilyID(familyID)
137 |
138 | return family?.modelIDs.includes(modelID) ?? false
139 | }
140 | }
141 |
--------------------------------------------------------------------------------
/test/Tokenizer.test.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | import { expect, test } from 'vitest'
9 | import { BytePairDecoder, BytePairEncoder, BytePairEncoding } from '../mod.mjs'
10 | import { createDefaultBPEOptions } from '../tokenizer/mod.mjs'
11 | import { readFixture, TestCase } from './common.mjs'
12 |
type TestCases = TestCase[]

// Each case pairs an input string with the exact token sequence the encoder
// (built from `createDefaultBPEOptions()`, plus any per-case `options`) is
// expected to produce. The final two cases exercise the same fixture with and
// without Codex space merging.
const testCases: TestCases = [
  {
    label: 'Empty string',
    given: '',
    expected: [],
  },
  {
    label: 'Just a space',
    given: ' ',
    expected: [220],
  },
  {
    label: 'Tab',
    given: '\t',
    expected: [197],
  },
  {
    label: 'Simple text',
    given: 'This is some text',
    expected: [1212, 318, 617, 2420],
  },
  {
    label: 'Text with special characters',
    given: `This is some text with a few special characters: !@#$%^&*()_+-=~[]{}|;:'",./<>?`,
    expected: [
      1212, 318, 617, 2420, 351, 257, 1178, 2041, 3435, 25, 5145, 31, 29953, 4, 61, 5, 9, 3419, 62, 10, 12, 31820,
      21737, 90, 92, 91, 26, 32105, 1600, 19571, 27, 29, 30,
    ],
  },
  {
    label: 'Text with numbers',
    given: 'This is some text with numbers 1234567890',
    expected: [1212, 318, 617, 2420, 351, 3146, 17031, 2231, 30924, 3829],
  },

  {
    label: 'Non-European text',
    given: '你好世界',
    expected: [19526, 254, 25001, 121, 10310, 244, 45911, 234],
  },
  {
    label: 'Bubble text',
    given: 'Ⓗⓔⓛⓛⓞ Ⓑⓤⓑⓑⓛⓔ',
    expected: [
      158, 240, 121, 158, 241, 242, 158, 241, 249, 158, 241, 249, 158, 241, 252, 2343, 240, 115, 158, 241, 97, 158, 241,
      239, 158, 241, 239, 158, 241, 249, 158, 241, 242,
    ],
  },
  {
    label: 'Multi-token word',
    given: 'indivisible',
    expected: [521, 452, 12843],
  },
  {
    label: 'Emojis',
    given: 'hello 👋 world 🌍',
    expected: [31373, 50169, 233, 995, 12520, 234, 235],
  },
  // We include a few properties of Object here to test that the tokenizer
  // doesn't include inherited properties.
  {
    label: 'properties of Object',
    given: 'toString constructor hasOwnProperty valueOf',
    expected: [1462, 10100, 23772, 468, 23858, 21746, 1988, 5189],
  },

  // Codex models use additional tokens for whitespace...
  {
    label: 'Without Codex',
    given: readFixture('nested-javascript.js'),
    expected: [
      8818, 7744, 45, 7287, 7499, 1391, 198, 220, 1441, 1391, 198, 220, 220, 220, 262, 25, 1391, 198, 220, 220, 220,
      220, 220, 2068, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 7586, 25, 1391, 198, 220, 220, 220, 220, 220,
      220, 220, 220, 220, 21831, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 18045, 25, 1391,
      198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 625, 25, 1391, 198, 220, 220, 220, 220, 220,
      220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 262, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220,
      220, 220, 220, 220, 220, 220, 220, 220, 16931, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220,
      220, 220, 220, 220, 220, 220, 220, 220, 220, 3290, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220,
      220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220,
      220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220,
      220, 220, 1782, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220,
      220, 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220,
      220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 1782, 198, 220, 1782,
      198, 92,
    ],
  },
  // Codex models use additional tokens for whitespace...
  {
    label: 'With Codex',
    given: readFixture('nested-javascript.js'),
    expected: [
      8818, 7744, 45, 7287, 7499, 1391, 198, 220, 1441, 1391, 198, 50258, 262, 25, 1391, 198, 50260, 2068, 25, 1391,
      198, 50262, 7586, 25, 1391, 198, 50264, 21831, 25, 1391, 198, 50266, 18045, 25, 1391, 198, 50268, 625, 25, 1391,
      198, 50270, 262, 25, 1391, 198, 50272, 16931, 25, 1391, 198, 50274, 3290, 25, 1391, 198, 50274, 1782, 198, 50272,
      1782, 198, 50270, 1782, 198, 50268, 1782, 198, 50266, 1782, 198, 50264, 1782, 198, 50262, 1782, 198, 50260, 1782,
      198, 50258, 1782, 198, 220, 1782, 198, 92,
    ],
    options: {
      mergeSpaces: 'codex',
    },
  },
]
117 |
118 | for (const { label, given, expected, options } of testCases) {
119 | test(label, () => {
120 | const gptEncoding = new BytePairEncoding({ ...createDefaultBPEOptions(), ...options })
121 | const encoder = new BytePairEncoder(gptEncoding)
122 | const decoder = new BytePairDecoder(gptEncoding)
123 |
124 | const encoded = encoder.encode(given).tokens
125 | const decoded = decoder.decode(encoded)
126 |
127 | expect(encoded).toEqual(expected)
128 | expect(decoded).toEqual(given)
129 | })
130 | }
131 |
--------------------------------------------------------------------------------
/EncoderResult.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | // eslint-disable-next-line @typescript-eslint/no-unused-vars
9 | import type { BytePairEncoder } from './BytePairEncoder.mjs'
10 |
11 | const nodeInspectSymbol = Symbol.for('nodejs.util.inspect.custom')
12 | const supportsSegmenter = typeof Intl !== 'undefined' && typeof Intl.Segmenter !== 'undefined'
13 |
14 | export interface IEncoderResult {
15 | /**
16 | * The tokens that were encoded.
17 | */
18 | readonly tokens: number[]
19 | /**
20 | * The BPE token pairs that were used during encoded.
21 | */
22 | readonly bpeTokenPairs: string[]
23 |
24 | /**
25 | * The original text content that was encoded.
26 | */
27 | readonly originalInput: string
28 |
29 | /**
30 | * The matched text segments found during encoding.
31 | */
32 | readonly matchedTextSegments: string[]
33 | }
34 |
35 | /**
36 | * The `EncoderResult` includes information for post-encoding analysis such as...
37 | *
38 | * - The tokens that were encoded.
39 | * - The BPE token pairs that were used during encoded.
40 | * - Two-way maps of tokens to BPE token pairs.
41 | *
42 | * This information can be used to analyze the encoding process and to
43 | * reconstruct the original string from the encoded tokens.
44 | *
45 | * Note that this object is considered immutable. Consider encoding a new string
46 | * if you need an updated `EncoderResult`.
47 | *
48 | * @see {@linkcode BytePairEncoder}
49 | */
50 | export class EncoderResult implements IEncoderResult {
51 | /**
52 | * A map of BPE token pairs to the corresponding token.
53 | */
54 | public tokenBPEMap: ReadonlyMap
55 | /**
56 | * A map of tokens to the corresponding BPE token pair.
57 | */
58 | public bpeTokenMap: ReadonlyMap
59 |
60 | /**
61 | * A map of BPE token pairs to the number of times they were used during encoding.
62 | * The key is the BPE token pair and the value is the number of times it appeared.
63 | */
64 | public bpeCountsMap: ReadonlyMap
65 |
66 | /**
67 | * A map of tokens to the number of times they were used during encoding.
68 | * The key is the token and the value is the number of times it appeared.
69 | */
70 | public tokenCountsMap: ReadonlyMap
71 |
72 | public readonly tokens: number[]
73 | public readonly bpeTokenPairs: string[]
74 | public readonly originalInput: string
75 | public readonly matchedTextSegments: string[]
76 |
77 | public segmenter: Intl.Segmenter | undefined
78 |
79 | constructor({ tokens, bpeTokenPairs, originalInput, matchedTextSegments }: IEncoderResult, locale?: string) {
80 | if (bpeTokenPairs.length !== tokens.length) {
81 | throw new Error('The number of BPE token pairs must match the number of tokens.')
82 | }
83 |
84 | const tokenToBPE: Array<[number, string]> = []
85 | const BPEToToken: Array<[string, number]> = []
86 |
87 | const tokenCountsMap = new Map()
88 | const bpeCountsMap = new Map()
89 |
90 | for (let i = 0; i < bpeTokenPairs.length; i++) {
91 | const token = tokens[i]
92 | const bpeTokenPair = bpeTokenPairs[i]
93 |
94 | const tokenCount = tokenCountsMap.get(token) || 0
95 | const bpeCount = bpeCountsMap.get(bpeTokenPair) || 0
96 |
97 | tokenCountsMap.set(token, tokenCount + 1)
98 | bpeCountsMap.set(bpeTokenPair, bpeCount + 1)
99 |
100 | tokenToBPE.push([token, bpeTokenPair])
101 | BPEToToken.push([bpeTokenPair, token])
102 | }
103 |
104 | this.tokenBPEMap = new Map(tokenToBPE)
105 | this.bpeTokenMap = new Map(BPEToToken)
106 |
107 | this.tokenCountsMap = tokenCountsMap
108 | this.bpeCountsMap = bpeCountsMap
109 |
110 | this.tokens = tokens
111 | this.bpeTokenPairs = bpeTokenPairs
112 | this.originalInput = originalInput
113 | this.matchedTextSegments = matchedTextSegments
114 |
115 | if (supportsSegmenter) {
116 | this.segmenter = new Intl.Segmenter(locale)
117 | }
118 | }
119 |
120 | /**
121 | * Get the encoded byte-pair for a given token.
122 | */
123 | public getBPE(token: number) {
124 | return this.tokenBPEMap.get(token)
125 | }
126 |
127 | /**
128 | * Get the number of times a given token appeared during encoding.
129 | * @see {@linkcode EncoderResult.length} if you're just trying count number of tokens.
130 | */
131 | public getTokenCount(token: number): number {
132 | return this.tokenCountsMap.get(token) || 0
133 | }
134 |
135 | /**
136 | * Get the number of times a given byte-pair appeared during encoding.
137 | */
138 | public getBPECount(bpe: string): number {
139 | return this.bpeCountsMap.get(bpe) || 0
140 | }
141 |
142 | /**
143 | * Iterate over the tokens in the result.
144 | */
145 | public [Symbol.iterator]() {
146 | return this.tokens[Symbol.iterator]()
147 | }
148 |
149 | /**
150 | * The number of tokens in the result.
151 | */
152 | public get length() {
153 | return this.tokens.length
154 | }
155 |
156 | /**
157 | * The number of characters in the original text.
158 | *
159 | * @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter Intl.Segmenter}
160 | */
161 | public get characterCount(): number {
162 | if (!this.segmenter) {
163 | console.warn('Intl.Segmenter is not supported. Falling back to string length.')
164 | return this.originalInput.length
165 | }
166 |
167 | return Array.from(this.segmenter.segment(this.originalInput)).length
168 | }
169 |
170 | public [nodeInspectSymbol]() {
171 | return `EncoderResult(${this.length})`
172 | }
173 |
174 | public toString() {
175 | return this[Symbol.iterator]().toString()
176 | }
177 |
178 | public toJSON(): IEncoderResult {
179 | return {
180 | tokens: this.tokens,
181 | bpeTokenPairs: this.bpeTokenPairs,
182 | originalInput: this.originalInput,
183 | matchedTextSegments: this.matchedTextSegments,
184 | }
185 | }
186 | }
187 |
--------------------------------------------------------------------------------
/BytePairEncoder.mts:
--------------------------------------------------------------------------------
1 | /**
2 | * @copyright Sister Software. All rights reserved.
3 | * @author Teffen Ellis, et al.
4 | * @license
5 | * See LICENSE file in the project root for full license information.
6 | */
7 |
8 | // eslint-disable-next-line @typescript-eslint/no-unused-vars
9 | import type { BytePairDecoder } from './BytePairDecoder.mjs'
10 | import { BytePairEncoding } from './BytePairEncoding.mjs'
11 | import { EncoderResult } from './EncoderResult.mjs'
12 |
/**
 * A valid input for the encoder.
 * @internal
 */
export type EncoderInput = string | EncoderResult

/**
 * Call signatures for {@linkcode BytePairEncoder.encode}.
 *
 * Overloaded so callers may pass either raw text or a previously computed
 * `EncoderResult` (which is passed back unchanged).
 */
export interface TokenEncodeFn {
  (
    /**
     * The string to encode.
     */
    text: string
  ): EncoderResult

  (
    /**
     * The string to encode.
     */
    text: string,
    /**
     * Skip post-encoding processing for a slight performance boost.
     */
    skipPostProcessing?: boolean
  ): EncoderResult

  (
    /**
     * A previous encoder result to use as a starting point.
     * This will simply pass back the same result.
     * Useful when batch processing a mixed list of strings and encoder results.
     */
    encoderResult: EncoderResult
  ): EncoderResult

  // Implementation signature covering both input forms.
  (input: EncoderInput, skipPostProcessing?: boolean): EncoderResult
}
49 |
50 | /**
51 | * GPT Token Encoder.
52 | *
53 | * Generally, you should not need to use this class directly unless you are
54 | * implementing a custom token encoder.
55 | *
56 | * @see {@linkcode BytePairDecoder} for the decoder.
57 | *
58 | * ```ts
59 | * const encoder = new BytePairEncoder(bpeTokenMap, ranksMap)
60 | * const tokens = encoder.encode(encoder)
61 | * ```
62 | */
63 | export class BytePairEncoder {
64 | constructor(
65 | protected _bpe: BytePairEncoding,
66 | protected _textEncoder = new TextEncoder(),
67 | protected _bpeTokenCache = new Map()
68 | ) {}
69 | //#region Public Methods
70 |
71 | /**
72 | * Encodes a given string into a list of tokens.
73 | *
74 | * ```ts
75 | * const text = "Do androids dream of electric sheep?"
76 | * const tokens = encoder.encode(text)
77 | * console.log(tokens) // [5211, 290, 305, 2340, 4320, 286, 5186, 15900, 30]
78 | * ```
79 | *
80 | * @returns The list of encoded tokens.
81 | */
82 | public encode: TokenEncodeFn = (input, skipPostProcessing = false): any => {
83 | if (typeof input !== 'string') {
84 | return input
85 | }
86 |
87 | // First, we run the pattern matcher on the text...
88 | const matchedTextSegments = Array.from(input.matchAll(this._bpe.tokenizationPattern), (x) => x[0])
89 |
90 | // Then we convert the tokens into UTF-8 byte arrays...
91 | const utf8Tokens = matchedTextSegments.map((textSegment) => {
92 | // The individual text segments are already UTF-8 encoded, so we can just convert them to byte arrays.
93 | const asUTF8 = this._textEncoder.encode(textSegment)
94 | // We then use our byte map to get the Unicode code point for each byte.
95 | const codePoints = Array.from(asUTF8, (byte) => {
96 | const codePoint = this._bpe.codePointByteMap.byteToCodePoint(byte)
97 |
98 | return codePoint
99 | })
100 |
101 | return codePoints.join('')
102 | })
103 |
104 | // Then we convert the UTF-8 byte arrays into BPE tokens...
105 | const bpeTokenPairs = utf8Tokens.flatMap((token) => this._tokenToBPE(token))
106 |
107 | const tokens = bpeTokenPairs.map((bpeToken) => {
108 | return this._bpe.tokenMap.bytePairToToken(bpeToken)
109 | })
110 |
111 | if (skipPostProcessing) {
112 | return tokens
113 | }
114 |
115 | const result = new EncoderResult({ tokens, bpeTokenPairs, originalInput: input, matchedTextSegments })
116 |
117 | return result
118 | }
119 |
120 | /**
121 | * Merges the pair of characters with the given values in the given word.
122 | *
123 | * @param word - An array of individual characters in the word.
124 | * @param first - The first character in the pair to merge.
125 | * @param second - The second character in the pair to merge.
126 | *
127 | * @returns The word with the pair of characters merged.
128 | */
129 | public mergePair(word: string[], first: string, second: string) {
130 | const newWord: string[] = []
131 | let i = 0
132 |
133 | while (i < word.length) {
134 | const j = word.indexOf(first, i)
135 | if (j === -1) {
136 | newWord.push(...word.slice(i))
137 | break
138 | }
139 | newWord.push(...word.slice(i, j))
140 | if (word[j + 1] === second) {
141 | newWord.push(first + second)
142 | i = j + 2
143 | } else {
144 | newWord.push(first)
145 | i = j + 1
146 | }
147 | }
148 |
149 | return newWord
150 | }
151 |
152 | /**
153 | * Returns an array of all possible pairs of adjacent characters in the given word.
154 | *
155 | * @param word - An array of individual characters in the word.
156 | * @returns An array of all possible pairs of adjacent characters in the word.
157 | */
158 | public getPairs(word: string[]) {
159 | const characters = word.slice()
160 | const pairingsFound: Record = {}
161 | const pairs: string[][] = []
162 | let previousCharacterIndex = 0
163 |
164 | for (let i = 1; i < characters.length; i++) {
165 | const previousCharacter = characters[previousCharacterIndex]
166 | const character = characters[i]
167 |
168 | previousCharacterIndex = i
169 |
170 | const pair = [previousCharacter, character]
171 | const grapheme = pair.join('')
172 |
173 | if (Object.hasOwn(pairingsFound, grapheme)) {
174 | continue
175 | }
176 |
177 | pairs.push(pair)
178 | pairingsFound[grapheme] = true
179 | }
180 |
181 | return pairs
182 | }
183 |
184 | //#endregion
185 |
186 | //#region Protected Methods
187 |
188 | /**
189 | * Applies byte pair encoding (BPE) to the given token using the provided BPE ranks and cache.
190 | * If the token is already in the cache, returns its value from the cache.
191 | *
192 | * @param token - The token to encode using BPE. This is derived from text passed through the `tokenizerPattern` RegExp.
193 | *
194 | * @returns The BPE-encoded token.
195 | */
196 | protected _tokenToBPE(token: string): string[] {
197 | if (this._bpeTokenCache.has(token)) {
198 | return this._bpeTokenCache.get(token)!
199 | }
200 |
201 | // Convert the input token to an array of individual characters
202 | let word = Array.from(token)
203 |
204 | // Get all possible pairs of characters in the token
205 | let pairs = this.getPairs(word)
206 |
207 | // Loop until there are no more pairs to merge
208 | // eslint-disable-next-line no-constant-condition
209 | while (true) {
210 | // If there are no pairs, return the original token
211 | if (!pairs || pairs.length === 0) {
212 | const word = [token]
213 | this._bpeTokenCache.set(token, word)
214 |
215 | return word
216 | }
217 |
218 | // Find the pair with the lowest rank (or highest numeric value if the rank is NaN)
219 | const minRankPair = this._findMinRankPair(pairs)
220 |
221 | // If no valid pair is found, exit the loop
222 | if (!minRankPair || minRankPair.length === 0) {
223 | break
224 | }
225 |
226 | // Merge the pair with the lowest rank
227 | const [first, second] = minRankPair
228 |
229 | let newWord: string[] = []
230 | let i = 0
231 |
232 | while (i < word.length) {
233 | const j = word.indexOf(first, i)
234 | if (j === -1) {
235 | newWord = newWord.concat(word.slice(i))
236 | break
237 | }
238 | newWord = newWord.concat(word.slice(i, j))
239 | i = j
240 |
241 | if (word[i] === first && i < word.length - 1 && word[i + 1] === second) {
242 | newWord.push(first + second)
243 | i = i + 2
244 | } else {
245 | newWord.push(word[i])
246 | i = i + 1
247 | }
248 | }
249 |
250 | // Update the word with the merged pair
251 | word = newWord
252 |
253 | // If the word is reduced to a single character, exit the loop
254 | if (word.length === 1) {
255 | break
256 | }
257 |
258 | // Otherwise, get all possible pairs of characters in the updated word
259 | pairs = this.getPairs(word)
260 | }
261 |
262 | this._bpeTokenCache.set(token, word)
263 |
264 | return word
265 | }
266 |
267 | /**
268 | * Finds the pair with the lowest rank (or highest numeric value if the rank is NaN) in the given array of pairs.
269 | *
270 | * @param pairs - An array of pairs of characters.
271 | * @param bpeRanks - An object containing the BPE ranks for all pairs of characters.
272 | * @returns The pair with the lowest rank, or null if no valid pair is found.
273 | */
274 | protected _findMinRankPair(pairs: string[][]): string[] | null {
275 | let minPair: string[] | null = null
276 | let minRank = Infinity
277 |
278 | for (const pair of pairs) {
279 | const rank = this._bpe.ranksMap.getRank(pair[0], pair[1])
280 | if (typeof rank !== 'number') {
281 | continue
282 | }
283 |
284 | if (rank < minRank) {
285 | minPair = pair
286 | minRank = rank
287 | }
288 | }
289 |
290 | return minPair || null
291 | }
292 | //#endregion
293 | }
294 |
--------------------------------------------------------------------------------