├── .gitignore
├── node
│   ├── mod.mts
│   └── parsers.mts
├── tokenizer
│   ├── mod.mts
│   ├── common.mts
│   ├── data.mts
│   ├── parsers.mts
│   ├── gpt.mts
│   └── codex.mts
├── test
│   ├── fixtures
│   │   ├── nested-javascript.js
│   │   ├── single-paragraph.txt
│   │   ├── sample-html.html
│   │   └── multiple-paragraphs.txt
│   ├── common.mts
│   ├── CostEstimator.test.ts
│   └── Tokenizer.test.ts
├── mod.mts
├── models
│   ├── chat-gpt.mts
│   ├── curie.mts
│   ├── ada.mts
│   ├── gpt-4.mts
│   ├── babbage.mts
│   ├── mod.mts
│   ├── davinci.mts
│   └── common.mts
├── tsconfig.json
├── examples
│   └── gpt-token-encoding.mts
├── LICENSE
├── CostEstimator.mts
├── patterns.mts
├── README.md
├── RanksMap.mts
├── BytePairDecoder.mts
├── BytePairTokenMap.mts
├── BytePairEncoding.mts
├── CodePointByteMap.mts
├── package.json
├── EncoderResult.mts
└── BytePairEncoder.mts

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .npmrc
3 | dist
4 | 
--------------------------------------------------------------------------------
/node/mod.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | export * from './parsers.mjs'
9 | 
--------------------------------------------------------------------------------
/tokenizer/mod.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | export * from './codex.mjs'
9 | export * from './common.mjs'
10 | export * from './data.mjs'
11 | export * from './encoder.mjs'
12 | export * from './gpt.mjs'
13 | export * from './parsers.mjs'
14 | export * from './vocab.mjs'
15 | 
--------------------------------------------------------------------------------
/test/fixtures/nested-javascript.js:
--------------------------------------------------------------------------------
1 | function deeplyNested () {
2 |   return {
3 |     the: {
4 |       quick: {
5 |         brown: {
6 |           fox: {
7 |             jumps: {
8 |               over: {
9 |                 the: {
10 |                   lazy: {
11 |                     dog: {
12 |                     }
13 |                   }
14 |                 }
15 |               }
16 |             }
17 |           }
18 |         }
19 |       }
20 |     }
21 |   }
22 | }
--------------------------------------------------------------------------------
/test/fixtures/single-paragraph.txt:
--------------------------------------------------------------------------------
1 | Lorem elit in ullamco deserunt et tempor pariatur do est cupidatat commodo elit ex.
2 | In proident non irure esse nisi quis ullamco.
3 | Quis est sint veniam exercitation et sint enim.
4 | Occaecat officia dolore occaecat sunt minim deserunt.
5 | In voluptate nostrud enim sint voluptate nulla amet adipisicing.
6 | Et cillum quis officia dolore aliqua sint sit non non irure ea tempor.
7 | Quis duis adipisicing esse nostrud do veniam occaecat.
8 | 
--------------------------------------------------------------------------------
/mod.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 | */ 7 | 8 | export * from './BytePairDecoder.mjs' 9 | export * from './BytePairEncoder.mjs' 10 | export * from './BytePairEncoding.mjs' 11 | export * from './BytePairTokenMap.mjs' 12 | export * from './CodePointByteMap.mjs' 13 | export * from './CostEstimator.mjs' 14 | export * from './EncoderResult.mjs' 15 | export * from './models/mod.mjs' 16 | export * from './patterns.mjs' 17 | export * from './RanksMap.mjs' 18 | export * from './tokenizer/mod.mjs' 19 | -------------------------------------------------------------------------------- /models/chat-gpt.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs' 9 | 10 | export const ChatGPTModelFamily: ModelFamily = { 11 | familyID: ModelFamilyIDs.ChatGPT, 12 | tokenLimit: 4096, 13 | mergeSpaces: 0, 14 | modelIDs: ['gpt-3.5-turbo-0301', 'gpt-3.5-turbo'], 15 | preferredModelID: 'gpt-3.5-turbo', 16 | pricing: { 17 | usage: 0.002, 18 | prompt: null, 19 | completion: null, 20 | fineTunedTraining: null, 21 | fineTunedUsage: null, 22 | }, 23 | } 24 | -------------------------------------------------------------------------------- /test/fixtures/sample-html.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |

Hello world!

11 | 12 | 19 | 20 |

Lorem ipsum dolor sit amet consectetur adipi

21 | 24 | 25 | 26 | 
--------------------------------------------------------------------------------
/tokenizer/common.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | /**
9 |  * Serialized vocabulary. Used with the bundled vocabulary.
10 |  * @internal
11 |  * @ignore
12 |  */
13 | export type BundledVocab = readonly string[]
14 | 
15 | /**
16 |  * Serialized encoder keys. Used with the bundled encoder.
17 |  * @internal
18 |  * @ignore
19 |  */
20 | export type BundledEncoderKeys = readonly string[]
21 | 
22 | /**
23 |  * Serialized encoder values. Used with the bundled encoder.
24 |  * @internal
25 |  * @ignore
26 |  */
27 | export type BundledEncoderValues = readonly number[]
28 | 
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   // This file is not used in compilation. It is here just for a nice editor experience.
3 | 
4 |   "compilerOptions": {
5 |     "allowJs": true,
6 |     "jsx": "react",
7 |     "lib": [
8 |       "dom",
9 |       "dom.iterable",
10 |       "esnext"
11 |     ],
12 |     "sourceMap": false,
13 |     "target": "ESNext",
14 |     "module": "ES2020",
15 |     "outDir": "./dist",
16 |     "esModuleInterop": true,
17 |     "moduleResolution": "nodenext",
18 |     "declaration": true,
19 |     "baseUrl": ".",
20 |     "strict": true,
21 |     "skipLibCheck": true,
22 |   },
23 |   "include": [
24 |     "./**/*.ts",
25 |     "./**/*.mts",
26 |   ],
27 |   "exclude": [
28 |     "dist",
29 |     "node_modules",
30 |   ]
31 | }
32 | 
--------------------------------------------------------------------------------
/test/common.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { readFileSync } from 'node:fs'
9 | import * as path from 'node:path'
10 | import { fileURLToPath } from 'url'
11 | import { IBytePairEncodingOptions } from '../mod.mjs'
12 | 
13 | export interface TestCase<G, E> {
14 |   label: string
15 |   given: G
16 |   expected: E
17 |   options?: Partial<IBytePairEncodingOptions>
18 | }
19 | 
20 | const __dirname = path.dirname(fileURLToPath(import.meta.url))
21 | const fixturesPath = path.join(__dirname, 'fixtures')
22 | 
23 | export function readFixture(fileName: string): string {
24 |   return readFileSync(path.join(fixturesPath, fileName), 'utf8').trim()
25 | }
26 | 
--------------------------------------------------------------------------------
/examples/gpt-token-encoding.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { gptDecoder, gptEncoder } from '../mod.mjs'
9 | 
10 | const str = 'This is an example sentence to try encoding out on!'
11 | console.log('Encoding...', str)
12 | const encoded = gptEncoder.encode(str)
13 | console.log('Encoded this string looks like: ')
14 | 
15 | for (const token of encoded) {
16 |   console.log(token)
17 | }
18 | 
19 | console.log('We can look at each token and what it represents')
20 | for (const token of encoded) {
21 |   console.log({ token, string: gptDecoder.decode([token]) })
22 | }
23 | 
24 | const decoded = gptDecoder.decode(encoded)
25 | console.log('We can decode it back into:\n', decoded)
26 | 
--------------------------------------------------------------------------------
/models/curie.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 | 
10 | export const CurieModelFamily: ModelFamily = {
11 |   familyID: ModelFamilyIDs.Curie,
12 |   tokenLimit: 2049,
13 |   mergeSpaces: 0,
14 |   modelIDs: [
15 |     'curie-instruct-beta',
16 |     'curie-search-document',
17 |     'curie-search-query',
18 |     'curie-similarity',
19 |     'curie:2020-05-03',
20 |     'curie',
21 |     'if-curie-v2',
22 |     'text-curie-001',
23 |     'text-curie:001',
24 |     'text-search-curie-doc-001',
25 |     'text-search-curie-query-001',
26 |     'text-similarity-curie-001',
27 |   ],
28 |   pricing: {
29 |     usage: 0.002,
30 |     prompt: 0.002,
31 |     completion: 0.002,
32 |     fineTunedTraining: 0.003,
33 |     fineTunedUsage: 0.012,
34 |   },
35 | }
36 | 
--------------------------------------------------------------------------------
/tokenizer/data.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import type { IBytePairEncodingOptions } from '../BytePairEncoding.mjs'
9 | import { DEFAULT_ENCODER_KEYS, DEFAULT_ENCODER_VALUES } from './encoder.mjs'
10 | import { parseBundledEncoder, parseBundledVocab } from './parsers.mjs'
11 | import { DEFAULT_VOCAB } from './vocab.mjs'
12 | 
13 | /**
14 |  * @internal
15 |  */
16 | export function createDefaultBPEOptions(): Readonly<IBytePairEncodingOptions> {
17 |   return {
18 |     tokenEncodings: parseBundledEncoder(DEFAULT_ENCODER_KEYS, DEFAULT_ENCODER_VALUES),
19 |     vocab: parseBundledVocab(DEFAULT_VOCAB),
20 |   }
21 | }
22 | 
23 | /**
24 |  * Default options for byte pair encoding.
25 |  *
26 |  * Note that referencing this object will incur a filesize penalty when bundling.
27 |  */
28 | export const DEFAULT_BPE_OPTIONS = createDefaultBPEOptions()
29 | 
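These options feed directly into `BytePairEncoding`. A minimal sketch of building a standalone encoder from the bundled defaults while overriding `mergeSpaces` (mirroring how the test suite constructs encoders; the input string is illustrative):

```ts
import { BytePairEncoder, BytePairEncoding, createDefaultBPEOptions } from 'gpt-token-utils'

// Build a fresh encoding rather than sharing the bundled singleton,
// switching whitespace handling to the Codex-style merged spaces.
const encoding = new BytePairEncoding({ ...createDefaultBPEOptions(), mergeSpaces: 'codex' })
const encoder = new BytePairEncoder(encoding)

// `encode` returns an EncoderResult whose `tokens` array holds the token IDs.
const result = encoder.encode('    deeply indented code')
console.log(result.tokens.length)
```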
--------------------------------------------------------------------------------
/models/ada.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 | 
10 | export const AdaModelFamily: ModelFamily = {
11 |   familyID: ModelFamilyIDs.Ada,
12 |   tokenLimit: 2049,
13 |   mergeSpaces: 0,
14 |   modelIDs: [
15 |     'ada-code-search-code',
16 |     'ada-code-search-text',
17 |     'ada-search-document',
18 |     'ada-search-query',
19 |     'ada-similarity',
20 |     'ada:2020-05-03',
21 |     'ada',
22 |     'code-search-ada-code-001',
23 |     'code-search-ada-text-001',
24 |     'text-ada-001',
25 |     'text-ada:001',
26 |     'text-embedding-ada-002',
27 |     'text-search-ada-doc-001',
28 |     'text-search-ada-query-001',
29 |     'text-similarity-ada-001',
30 |   ],
31 |   pricing: {
32 |     usage: 0.0004,
33 |     prompt: 0.0004,
34 |     completion: 0.0004,
35 |     fineTunedTraining: 0.0004,
36 |     fineTunedUsage: 0.0016,
37 |   },
38 | }
39 | 
--------------------------------------------------------------------------------
/models/gpt-4.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 | 
10 | export const GPT4_8KModelFamily: ModelFamily = {
11 |   familyID: ModelFamilyIDs.GPT4,
12 |   tokenLimit: 8192,
13 |   mergeSpaces: 0,
14 |   modelIDs: ['gpt-4', 'gpt-4-0314'],
15 |   preferredModelID: 'gpt-4',
16 |   pricing: {
17 |     prompt: 0.03,
18 |     completion: 0.06,
19 |     usage: null,
20 |     fineTunedTraining: null,
21 |     fineTunedUsage: null,
22 |   },
23 | }
24 | 
25 | export const GPT4_32KModelFamily: ModelFamily = {
26 |   familyID: ModelFamilyIDs.GPT4_32K,
27 |   tokenLimit: 32768,
28 |   mergeSpaces: 0,
29 |   modelIDs: ['gpt-4-32k', 'gpt-4-32k-0314'],
30 |   preferredModelID: 'gpt-4-32k',
31 |   pricing: {
32 |     prompt: 0.06,
33 |     completion: 0.12,
34 |     usage: null,
35 |     fineTunedTraining: null,
36 |     fineTunedUsage: null,
37 |   },
38 | }
39 | 
--------------------------------------------------------------------------------
/models/babbage.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 | 
10 | export const BabbageModelFamily: ModelFamily = {
11 |   familyID: ModelFamilyIDs.Babbage,
12 |   tokenLimit: 2049,
13 |   mergeSpaces: 0,
14 |   modelIDs: [
15 |     'babbage-code-search-code',
16 |     'babbage-code-search-text',
17 |     'babbage-search-document',
18 |     'babbage-search-query',
19 |     'babbage-similarity',
20 |     'babbage:2020-05-03',
21 |     'babbage',
22 |     'code-search-babbage-code-001',
23 |     'code-search-babbage-text-001',
24 |     'text-babbage-001',
25 |     'text-babbage:001',
26 |     'text-search-babbage-doc-001',
27 |     'text-search-babbage-query-001',
28 |     'text-similarity-babbage-001',
29 |   ],
30 |   pricing: {
31 |     usage: 0.0005,
32 |     prompt: 0.0005,
33 |     completion: 0.0005,
34 |     fineTunedTraining: 0.0006,
35 |     fineTunedUsage: 0.0024,
36 |   },
37 | }
38 | 
--------------------------------------------------------------------------------
/models/mod.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { AdaModelFamily } from './ada.mjs'
9 | import { BabbageModelFamily } from './babbage.mjs'
10 | import { ChatGPTModelFamily } from './chat-gpt.mjs'
11 | import { ModelFamiliesMap } from './common.mjs'
12 | import { CurieModelFamily } from './curie.mjs'
13 | import { DavinciModelFamily } from './davinci.mjs'
14 | import { GPT4_32KModelFamily, GPT4_8KModelFamily } from './gpt-4.mjs'
15 | 
16 | /**
17 |  * A global store of all model families.
18 |  */
19 | export const ModelFamilyStore = new ModelFamiliesMap()
20 | 
21 | ModelFamilyStore.addFamily(AdaModelFamily)
22 | ModelFamilyStore.addFamily(BabbageModelFamily)
23 | ModelFamilyStore.addFamily(CurieModelFamily)
24 | ModelFamilyStore.addFamily(DavinciModelFamily)
25 | ModelFamilyStore.addFamily(ChatGPTModelFamily)
26 | ModelFamilyStore.addFamily(GPT4_32KModelFamily)
27 | ModelFamilyStore.addFamily(GPT4_8KModelFamily)
28 | 
29 | export * from './common.mjs'
30 | 
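Since `ModelFamilyStore` indexes families both by family ID and by individual model ID, lookups work with either; a short sketch:

```ts
import { ModelFamilyStore } from 'gpt-token-utils/models'

// A concrete model ID and its family ID resolve to the same record.
const byFamilyID = ModelFamilyStore.get('davinci')
const byModelID = ModelFamilyStore.get('text-davinci-003')

console.log(byFamilyID === byModelID) // true
console.log(byFamilyID.tokenLimit) // 2049
```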
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Sister Software
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/models/davinci.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { ModelFamily, ModelFamilyIDs } from './common.mjs'
9 | 
10 | export const DavinciModelFamily: ModelFamily = {
11 |   familyID: ModelFamilyIDs.Davinci,
12 |   tokenLimit: 2049,
13 |   mergeSpaces: 0,
14 |   modelIDs: [
15 |     'davinci-if:3.0.0',
16 |     'davinci-instruct-beta:2.0.0',
17 |     'davinci-instruct-beta',
18 |     'davinci-search-document',
19 |     'davinci-search-query',
20 |     'davinci-similarity',
21 |     'davinci:2020-05-03',
22 |     'davinci',
23 |     'if-davinci-v2',
24 |     'if-davinci:3.0.0',
25 |     'text-davinci-001',
26 |     'text-davinci-002',
27 |     'text-davinci-003',
28 |     'text-davinci-edit-001',
29 |     'text-davinci-insert-001',
30 |     'text-davinci-insert-002',
31 |     'text-davinci:001',
32 |     'text-search-davinci-doc-001',
33 |     'text-search-davinci-query-001',
34 |     'text-similarity-davinci-001',
35 |   ],
36 |   pricing: {
37 |     usage: 0.02,
38 |     prompt: 0.02,
39 |     completion: 0.02,
40 |     fineTunedTraining: 0.03,
41 |     fineTunedUsage: 0.12,
42 |   },
43 | }
44 | 
--------------------------------------------------------------------------------
/CostEstimator.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { EncoderInput } from './BytePairEncoder.mjs'
9 | import { EncoderResult } from './EncoderResult.mjs'
10 | import { ModelFamily, ModelFamilyStore, ModelPricingTypes } from './models/mod.mjs'
11 | import { encode } from './tokenizer/mod.mjs'
12 | 
13 | export type CostEstimatorInput = string | EncoderResult
14 | 
15 | export type ICostEstimationResult = Record<ModelPricingTypes, number | null>
16 | 
17 | export interface NormalizeInputResult {
18 |   modelFamily: ModelFamily
19 |   encodedResults: EncoderResult[]
20 | }
21 | 
22 | export interface EstimateCostFn {
23 |   (modelOrFamilyID: string, ...inputs: EncoderInput[]): ICostEstimationResult
24 |   (modelFamily: ModelFamily, ...inputs: EncoderInput[]): ICostEstimationResult
25 | }
26 | 
27 | export const estimateCost: EstimateCostFn = (modelInput: string | ModelFamily, ...inputs: EncoderInput[]) => {
28 |   const modelFamily = ModelFamilyStore.get(modelInput)
29 |   const encodedResults = inputs.map((input) => encode(input))
30 |   const tokenCount = encodedResults.reduce((acc, result) => acc + result.tokens.length, 0)
31 | 
32 |   // Remember that pricing is per 1000 tokens
33 |   const pricedUnits = tokenCount / 1000
34 | 
35 |   const result = {} as ICostEstimationResult
36 | 
37 |   for (const [pricingType, pricePer] of Object.entries(modelFamily.pricing)) {
38 |     const price = typeof pricePer === 'number' ? pricePer * pricedUnits : null
39 | 
40 |     result[pricingType as ModelPricingTypes] = price
41 |   }
42 | 
43 |   return result
44 | }
45 | 
--------------------------------------------------------------------------------
/tokenizer/parsers.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { TokenEncodingsRecord } from '../BytePairTokenMap.mjs'
9 | import { BPEVocab, VocabEntry } from '../RanksMap.mjs'
10 | import { BundledEncoderKeys, BundledEncoderValues, BundledVocab } from './common.mjs'
11 | 
12 | /**
13 |  * Parses a bundled vocabulary into a list of bigrams.
14 | * @internal 15 | */ 16 | export function parseBundledVocab(bundledVocab: BundledVocab): BPEVocab { 17 | if (bundledVocab.length % 2 !== 0) { 18 | throw new Error('Invalid bundled vocabulary format: vocab must be an even number of entries') 19 | } 20 | 21 | const entries: VocabEntry[] = [] 22 | 23 | for (let i = 0; i < bundledVocab.length; i += 2) { 24 | const prefix = bundledVocab[i] 25 | const suffix = bundledVocab[i + 1] 26 | 27 | entries.push({ 28 | prefix, 29 | suffix, 30 | }) 31 | } 32 | 33 | return { 34 | version: 'bundled', 35 | entries, 36 | } 37 | } 38 | 39 | /** 40 | * Parses a bundled encoder into a record. 41 | * @internal 42 | */ 43 | export function parseBundledEncoder( 44 | encoderKeys: BundledEncoderKeys, 45 | encoderValues: BundledEncoderValues 46 | ): TokenEncodingsRecord { 47 | if (encoderKeys.length !== encoderValues.length) { 48 | throw new Error('Invalid bundled encoder: keys and values are not the same length') 49 | } 50 | 51 | const tokenEncodings: TokenEncodingsRecord = {} 52 | 53 | for (let i = 0; i < encoderKeys.length; i++) { 54 | const key = encoderKeys[i] 55 | const value = encoderValues[i] 56 | 57 | tokenEncodings[key] = value 58 | } 59 | 60 | return tokenEncodings 61 | } 62 | -------------------------------------------------------------------------------- /patterns.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | /** 9 | * Default contractions used by the tokenizer pattern. 10 | * Note that order matters here, as the pattern will match the first contraction that matches. 11 | */ 12 | export const DEFAULT_CONTRACTIONS = [ 13 | /** @example "John's" */ 14 | `'s`, 15 | /** @example "can't" */ 16 | `'t`, 17 | /** @example "they're" */ 18 | `'re`, 19 | /** @example "I've" */ 20 | `'ve`, 21 | /** @example "I'm" */ 22 | `'m`, 23 | /** @example "they'll" */ 24 | `'ll`, 25 | /** @example "he'd" */ 26 | `'d`, 27 | ] as const satisfies readonly string[] 28 | 29 | /** 30 | * Default tokenizer rules used to build the tokenizer pattern. 31 | */ 32 | export const DEFAULT_TOKENIZER_RULES = [ 33 | /** Matches one or more letters optionally preceded by a space. */ 34 | ' ?\\p{L}+', 35 | /** Matches one or more digits optionally preceded by a space. */ 36 | ' ?\\p{N}+', 37 | /** Matches one or more non-space, non-letter, non-digit characters optionally preceded by a space. */ 38 | ' ?[^\\s\\p{L}\\p{N}]+', 39 | /** Matches one or more spaces that are not followed by a non-space character (i.e. end of word). */ 40 | '\\s+(?!\\S)', 41 | /** Matches one or more spaces. */ 42 | '\\s+', 43 | ] as const satisfies readonly string[] 44 | 45 | /** 46 | * Creates a regular expression pattern used to tokenize text into individual tokens. 47 | * @param contractions - Contractions used by the tokenizer pattern. 48 | * @param rules - Rules used to build the tokenizer pattern. 
49 |  * @see {@linkcode DEFAULT_CONTRACTIONS}
50 |  * @see {@linkcode DEFAULT_TOKENIZER_RULES}
51 |  */
52 | export function createTokenizerPattern(
53 |   contractions: string[] = DEFAULT_CONTRACTIONS.slice(),
54 |   rules: string[] = DEFAULT_TOKENIZER_RULES.slice()
55 | ): RegExp {
56 |   const pattern = [...contractions, ...rules].join('|')
57 |   return new RegExp(pattern, 'gu')
58 | }
59 | 
--------------------------------------------------------------------------------
/tokenizer/gpt.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { BytePairDecoder } from '../BytePairDecoder.mjs'
9 | import { BytePairEncoder } from '../BytePairEncoder.mjs'
10 | import { BytePairEncoding } from '../BytePairEncoding.mjs'
11 | import { DEFAULT_BPE_OPTIONS } from './data.mjs'
12 | 
13 | // We can use a single instance for both encoding and decoding GPT tokens.
14 | const gptEncoding = new BytePairEncoding(DEFAULT_BPE_OPTIONS)
15 | 
16 | /**
17 |  * Default GPT-3 encoder.
18 |  * This is a singleton instance of {@linkcode BytePairEncoder} that is pre-configured to encode GPT-3 tokens.
19 |  */
20 | export const gptEncoder = new BytePairEncoder(gptEncoding)
21 | 
22 | /**
23 |  * Encodes a given UTF-8 string into a list of GPT-3 tokens.
24 |  *
25 |  * ```js
26 |  * const text = "Do androids dream of electric sheep?"
27 |  * const tokens = encode(text)
28 |  * console.log(tokens) // [5211, 290, 305, 2340, 4320, 286, 5186, 15900, 30]
29 |  * ```
30 |  *
31 |  * @see {@linkcode decode} for the inverse function.
32 |  * @see {@linkcode BytePairEncoder} for more information on how the tokens are encoded.
33 |  */
34 | export const encode = gptEncoder.encode
35 | 
36 | /**
37 |  * Default GPT-3 decoder.
38 |  * This is a singleton instance of {@linkcode BytePairDecoder} that is pre-configured to decode GPT-3 tokens.
39 |  */
40 | export const gptDecoder = new BytePairDecoder(gptEncoding)
41 | 
42 | /**
43 |  * Converts a list of GPT-3 tokens into a string.
44 |  *
45 |  * ```ts
46 |  * const tokens = [5211, 290, 305, 2340, 4320, 286, 5186, 15900, 30]
47 |  * const text = decode(tokens)
48 |  * console.log(text) // "Do androids dream of electric sheep?"
49 |  * ```
50 |  *
51 |  * @see {@linkcode encode} for the inverse function.
52 |  * @see {@linkcode BytePairDecoder} for more information on how the tokens are decoded.
53 |  */
54 | export const decode = gptDecoder.decode
55 | 
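The encoders above ultimately split text with the regular expression built in `patterns.mts`. A sketch of how the default contractions and rules segment a sentence (output follows `String.prototype.match` with the global flag):

```ts
import { createTokenizerPattern } from 'gpt-token-utils'

const pattern = createTokenizerPattern()

// Contractions split off their stems; leading spaces stay attached to words.
console.log("I'm testing page 42".match(pattern))
// [ "I", "'m", " testing", " page", " 42" ]
```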
--------------------------------------------------------------------------------
/test/fixtures/multiple-paragraphs.txt:
--------------------------------------------------------------------------------
1 | Adipisicing minim aliquip irure nisi mollit dolore nostrud ea incididunt occaecat.
2 | Non voluptate ea sint eiusmod et pariatur incididunt commodo commodo veniam reprehenderit sunt.
3 | Elit velit ullamco ea id sit elit Lorem irure cupidatat ea dolor. Eu ex eiusmod dolore duis.
4 | Esse dolor amet incididunt cupidatat amet velit nostrud.
5 | 
6 | Ut anim et ea minim nisi fugiat officia exercitation minim aliqua velit nostrud.
7 | Anim cillum ea sunt ad eu laboris.
8 | Incididunt sint eiusmod do consequat eiusmod esse eiusmod sit irure esse anim veniam officia aliquip.
9 | Nulla esse eiusmod est aliqua. Magna ad aute anim qui et irure eu.
10 | 
11 | Incididunt dolore adipisicing excepteur sunt dolor tempor nostrud cupidatat ullamco id Lorem esse.
12 | Non velit ex adipisicing esse proident anim irure exercitation id eiusmod officia proident.
13 | Sint incididunt cillum non reprehenderit nisi pariatur ipsum mollit enim commodo incididunt Lorem fugiat deserunt.
14 | Veniam do pariatur duis magna id sit dolore amet aliqua magna aliqua voluptate exercitation.
15 | Nostrud tempor consectetur ut ad consequat. Ex commodo in sunt dolor irure. Nostrud elit et ipsum minim.
16 | 
17 | Amet officia nostrud amet cillum ea nisi.
18 | In in nulla ullamco amet velit nostrud exercitation do nisi reprehenderit.
19 | Laborum aute dolore in ut aliqua reprehenderit amet nisi qui esse occaecat cupidatat cupidatat labore.
20 | Veniam sunt labore excepteur aliqua aliqua sunt do enim do anim in occaecat consequat aute.
21 | Sit labore irure quis commodo non elit sunt ex ea. Nisi do tempor aliquip aliqua ut veniam culpa dolore.
22 | 
23 | Exercitation sint consequat sint exercitation enim officia non velit.
24 | Cillum ut sint dolore minim aliqua. Id excepteur fugiat magna laborum in dolore laboris cupidatat occaecat aliquip ipsum laboris.
25 | Duis ex ut qui sit id fugiat aute laborum nulla.
26 | Minim laboris consectetur non occaecat dolor commodo sit adipisicing consequat magna.
27 | Incididunt elit amet fugiat ex voluptate.
28 | Nulla occaecat sit sunt voluptate ex id adipisicing excepteur fugiat reprehenderit.
29 | 
--------------------------------------------------------------------------------
/node/parsers.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { TokenEncodingsRecord } from '../BytePairTokenMap.mjs'
9 | import { BPEVocab, VocabEntry } from '../RanksMap.mjs'
10 | 
11 | /**
12 |  * Parses a BPE file into a list of bigrams
13 |  *
14 |  * The vocab.bpe file is a text file that contains a set of byte pair encoding (BPE) codes
15 |  * that are used in the tokenization process.
16 |  *
17 |  * The file should be in the following format:
18 |  *
19 |  * ```text
20 |  * #version: VERSION_STRING
21 |  * [prefix1] [suffix1]
22 |  * [prefixN] [suffixN]
23 |  * ...
24 |  * ```
25 |  */
26 | export function parseBPEFile(bpeFileContents: string): BPEVocab {
27 |   const lines = bpeFileContents.trim().split('\n')
28 |   const [versionLine, ...bpeMerges] = lines
29 |   const [, version = 'unknown'] = versionLine.trim().match(/^#version: (\d.+)$/) || []
30 | 
31 |   const entries = bpeMerges.map((line, lineIndex) => {
32 |     const segments = line
33 |       // Each line contains a pair of tokens separated by a space
34 |       .split(/(\s+)/)
35 |       // Clean up the tokens...
36 |       .map((x) => x.trim())
37 |       .filter(Boolean)
38 | 
39 |     if (segments.length < 2) {
40 |       throw new Error(`Invalid BPE file format: line ${lineIndex + 1} is not a valid bigram`)
41 |     }
42 | 
43 |     const [prefix, suffix] = segments
44 | 
45 |     const entry: VocabEntry = {
46 |       prefix,
47 |       suffix,
48 |     }
49 | 
50 |     return entry
51 |   })
52 | 
53 |   return {
54 |     version,
55 |     entries,
56 |   }
57 | }
58 | 
59 | /**
60 |  * Parse a token encoder file, usually from a file named `encoder.json`
61 |  */
62 | export function parseEncoderFile(
63 |   /**
64 |    * The token encoder content, either as a string or as a parsed object.
65 |    */
66 |   tokenEncoderContent: string | TokenEncodingsRecord
67 | ): TokenEncodingsRecord {
68 |   const tokenEncodings: TokenEncodingsRecord =
69 |     typeof tokenEncoderContent === 'string' ? JSON.parse(tokenEncoderContent) : tokenEncoderContent
70 | 
71 |   return tokenEncodings
72 | }
73 | 
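A sketch of loading local copies of OpenAI's `vocab.bpe` and `encoder.json` assets with these Node-only parsers (the file paths here are assumptions for illustration):

```ts
import { readFileSync } from 'node:fs'
import { parseBPEFile, parseEncoderFile } from 'gpt-token-utils/node'

// Both files are assumed to be local copies of the GPT-2 era tokenizer assets.
const vocab = parseBPEFile(readFileSync('./vocab.bpe', 'utf8'))
const tokenEncodings = parseEncoderFile(readFileSync('./encoder.json', 'utf8'))

console.log(vocab.version) // e.g. "0.2"
console.log(vocab.entries.length, Object.keys(tokenEncodings).length)
```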
--------------------------------------------------------------------------------
/tokenizer/codex.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | import { BytePairDecoder } from '../BytePairDecoder.mjs'
9 | import { BytePairEncoder } from '../BytePairEncoder.mjs'
10 | import { BytePairEncoding } from '../BytePairEncoding.mjs'
11 | import { DEFAULT_BPE_OPTIONS } from './data.mjs'
12 | 
13 | const codexEncoding = new BytePairEncoding({
14 |   ...DEFAULT_BPE_OPTIONS,
15 |   mergeSpaces: 'codex',
16 | })
17 | 
18 | /**
19 |  * Default Codex encoder.
20 |  * This is a singleton instance of {@linkcode BytePairEncoder} that is pre-configured to encode Codex tokens.
21 |  */
22 | export const codexEncoder = new BytePairEncoder(codexEncoding)
23 | 
24 | /**
25 |  * Encodes a given UTF-8 string into a list of Codex tokens.
26 |  *
27 |  * ```js
28 |  * const codeText = [
29 |  *   'function deeplyNested () {',
30 |  *   '  return {',
31 |  *   '    the: {',
32 |  *   '      quick: {',
33 |  *   '        brown: {',
34 |  *   '...etc'
35 |  * ].join('\n')
36 |  *
37 |  * const codexTokens = encodeCodex(codeText)
38 |  * ```
39 |  *
40 |  * @see {@linkcode decodeCodex} for the inverse function.
41 |  * @see {@linkcode BytePairEncoder} for more information on how the tokens are encoded.
42 |  */
43 | export const encodeCodex = codexEncoder.encode
44 | 
45 | /**
46 |  * Default Codex decoder.
47 |  * This is a singleton instance of {@linkcode BytePairDecoder} that is pre-configured to decode Codex tokens.
48 |  */
49 | export const codexDecoder = new BytePairDecoder(codexEncoding)
50 | 
51 | /**
52 |  * Converts a list of Codex tokens into a string.
53 |  *
54 |  * ```ts
55 |  * // Truncated for brevity...
56 |  * const tokens = [8818, 7744, 45, 7287, 7499]
57 |  * const text = codexDecoder.decode(tokens)
58 |  * console.log(text)
59 |  *
60 |  * // `function deeplyNested () {
61 |  * //   return {
62 |  * //     the: {
63 |  * //       quick: {
64 |  * //         brown: {
65 |  * // ...`
66 |  * ```
67 |  *
68 |  * @see {@linkcode codexEncoder} for the inverse object.
69 |  * @see {@linkcode BytePairDecoder} for more information on how the tokens are decoded.
70 |  */
71 | export const decodeCodex = codexDecoder.decode
72 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # What is this?
2 | 
3 | **GPT Token Utilities** is a small library for encoding and decoding text to and from the tokenized format used by OpenAI's GPT models.
4 | 
5 | ## Why should I use this?
6 | 
7 | ### 🏃‍♀️ Fast
8 | 
9 | Our token encoder/decoder is optimized for a balance of speed and ease of use. No external dependencies are required.
10 | 
11 | ### 🤸‍♀️ Flexible
12 | 
13 | Everything is written in TypeScript and includes type definitions with full documentation. This library is isomorphic and can be used in both Node and the browser!
14 | 
15 | ### ⚖️ Light in size. Heavy in features.
16 | 
17 | GPT Token Utils balances a small footprint with a full-featured API.
18 | It's also tree-shakeable, so you can import only the functions you need.
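For example, the published subpath exports let you pull in a single module (a sketch; actual bundle savings depend on your bundler):

```js
// Importing from a subpath keeps unrelated modules out of the bundle.
import { encode } from 'gpt-token-utils/tokenizer'

const tokens = encode('Only the tokenizer ships with this build.')
```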
19 | 
20 | ## Installation
21 | 
22 | ### NPM
23 | 
24 | ```bash
25 | yarn add gpt-token-utils
26 | # or
27 | npm install --save gpt-token-utils
28 | ```
29 | 
30 | ## Usage
31 | 
32 | ### Encoding and Decoding Text
33 | 
34 | The `encode` and `decode` exports are the main functions you'll use to work with GPT tokens.
35 | 
36 | ```js
37 | import { encode, decode } from 'gpt-token-utils'
38 | 
39 | // Encode a string to a list of tokens...
40 | const tokens = encode('Humans are strange creatures, and ever so curious too!')
41 | 
42 | // You've got a list of tokens!
43 | console.log(tokens) // [32661, 504, 389, 6283, 8109, 11, 290, 1683, 523, 11040, 1165, 0]
44 | 
45 | // How many tokens are there?
46 | console.log(tokens.length) // 12
47 | 
48 | // Can we decode it back to text?
49 | console.log(decode(tokens)) // "Humans are strange creatures...."
50 | ```
51 | 
52 | ### Advanced Usage
53 | 
54 | By default, GPT Token Utils includes a sizable vocabulary and encoder. Alternatively, you can pass in your own to customize the encoding/decoding process.
55 | 
56 | ```js
57 | import {BytePairEncoder} from 'gpt-token-utils/BytePairEncoder'
58 | 
59 | const tokenEncoder = new BytePairEncoder({...})
60 | ```
61 | 
62 | ```js
63 | import {BytePairDecoder} from 'gpt-token-utils/BytePairDecoder'
64 | 
65 | const tokenDecoder = new BytePairDecoder({...})
66 | ```
67 | 
68 | # License
69 | 
70 | GPT Token Utils is licensed under the [MIT License](https://opensource.org/licenses/MIT). If you've got something cool to share that's built with this library, let us know at [@SisterSoftware](https://twitter.com/SisterSoftware)! We would love to see it!
71 | 
--------------------------------------------------------------------------------
/RanksMap.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | const nodeInspectSymbol = Symbol.for('nodejs.util.inspect.custom')
9 | 
10 | /**
11 |  * Map of byte-pair encodings according to their BPE rank
12 |  * @internal
13 |  */
14 | export class RanksMap {
15 |   protected _prefixToSuffixRankMap: Map<
16 |     /** Prefix */
17 |     string,
18 |     Map<
19 |       /** Suffix */
20 |       string,
21 |       /** Rank */
22 |       number
23 |     >
24 |   > = new Map()
25 | 
26 |   public getRank(prefix: string, suffix: string): number | undefined {
27 |     const suffixMap = this._prefixToSuffixRankMap.get(prefix)
28 | 
29 |     if (suffixMap) {
30 |       return suffixMap.get(suffix)
31 |     }
32 |   }
33 | 
34 |   constructor(vocab: VocabEntry[] | BPEVocab, mergesSpacesCount = 0) {
35 |     const normalizedVocab = Array.isArray(vocab) ? 
vocab.slice() : vocab.entries.slice() 36 | 37 | if (mergesSpacesCount > 0) { 38 | for (let i = 1; i < mergesSpacesCount; i++) { 39 | for (let j = 1; j < mergesSpacesCount; j++) { 40 | if (i + j <= mergesSpacesCount) { 41 | normalizedVocab.push({ 42 | prefix: '\u0120'.repeat(i), 43 | suffix: '\u0120'.repeat(j), 44 | }) 45 | } 46 | } 47 | } 48 | } 49 | 50 | for (const [rank, entry] of normalizedVocab.entries()) { 51 | let suffixMap = this._prefixToSuffixRankMap.get(entry.prefix) 52 | 53 | if (!suffixMap) { 54 | suffixMap = new Map() 55 | this._prefixToSuffixRankMap.set(entry.prefix, suffixMap) 56 | } 57 | 58 | suffixMap.set(entry.suffix, rank) 59 | } 60 | } 61 | 62 | public get size() { 63 | return this._prefixToSuffixRankMap.size 64 | } 65 | 66 | [nodeInspectSymbol]() { 67 | return `RanksMap(${this.size})` 68 | } 69 | } 70 | 71 | /** 72 | * A parsed vocabulary entry. 73 | * The rank of the byte-pair encoding is derived from the index of the pair in the `vocab.bpe` file. 74 | */ 75 | export interface VocabEntry { 76 | /** The word stem prefix in the pair. */ 77 | prefix: string 78 | /** The suffix token in the pair. */ 79 | suffix: string 80 | } 81 | 82 | /** 83 | * A vocabulary of byte-pair encodings. 84 | * 85 | * @see {@linkcode parseBPEFile} 86 | */ 87 | export interface BPEVocab { 88 | version: string 89 | entries: VocabEntry[] 90 | } 91 | -------------------------------------------------------------------------------- /BytePairDecoder.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | import { BytePairEncoding } from './BytePairEncoding.mjs' 9 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 10 | import type { BytePairEncoder } from './BytePairEncoder.mjs' 11 | import type { EncoderResult } from './EncoderResult.mjs' 12 | 13 | /** 14 | * Methods associated with decoding a list of tokens into a string. 15 | */ 16 | export interface TokenDecodeFn { 17 | ( 18 | /** 19 | * The list of tokens to decode. 20 | */ 21 | tokens: number[] 22 | ): string 23 | 24 | ( 25 | /** 26 | * The resulting object of the {@linkcode BytePairEncoder.encode} function. 27 | */ 28 | encoderResult: EncoderResult 29 | ): string 30 | } 31 | 32 | /** 33 | * GPT Token Decoder. 34 | * 35 | * Generally, you should not need to use this class directly unless you are 36 | * implementing a custom token decoder. 37 | * 38 | * @see {@linkcode BytePairEncoder} for the encoder. 39 | * 40 | * ```ts 41 | * const decoder = new BytePairDecoder({codePointByteMap, bpeTokenMap}) 42 | * const text = decoder.decode(tokens) 43 | * ``` 44 | */ 45 | export class BytePairDecoder { 46 | constructor(protected _bpe: BytePairEncoding, protected _textDecoder = new TextDecoder()) {} 47 | 48 | /** 49 | * Converts a list of tokens into a string. 50 | * 51 | * ```ts 52 | * const tokens = [5211, 290, 305, 2340, 4320, 286, 5186, 15900, 30] 53 | * const text = decoder.decode(tokens) 54 | * console.log(text) // "Do androids dream of electric sheep?" 55 | * ``` 56 | * 57 | * @returns The decoded string. 58 | */ 59 | public decode: TokenDecodeFn = (tokens: number[] | EncoderResult): string => { 60 | const source = Array.isArray(tokens) ? tokens : tokens.tokens 61 | 62 | const bytePairEncodings = source 63 | // First, we convert the tokens into BPE... 
64 |       .map((token) => this._bpe.tokenMap.tokenToBytePair(token))
65 |       // The pairs are then combined into a single string, joining the graphemes.
66 |       .join('')
67 | 
68 |     // We then convert the BPE back into UTF-8 by splitting the string...
69 |     // ...into an array of characters, so that each character can be mapped to its byte.
70 |     const bytes = Array.from(bytePairEncodings, (x) => this._bpe.codePointByteMap.codePointToByte(x))
71 | 
72 |     // Finally, we convert the bytes into a string.
73 |     const text = this._textDecoder.decode(new Uint8Array(bytes))
74 | 
75 |     return text
76 |   }
77 | }
78 | 
--------------------------------------------------------------------------------
/BytePairTokenMap.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 |  * @license
5 |  * See LICENSE file in the project root for full license information.
6 |  */
7 | 
8 | /**
9 |  * A map of byte-pair encodings to their corresponding tokens.
10 |  * @internal
11 |  */
12 | export type TokenEncodingsRecord = Record<string, number | undefined>
13 | 
14 | const nodeInspectSymbol = Symbol.for('nodejs.util.inspect.custom')
15 | 
16 | /**
17 |  * Two-way map between Unicode byte-pairs and tokens.
18 |  * @internal
19 |  */
20 | export class BytePairTokenMap {
21 |   protected _bpeTokenMap: Map<
22 |     /**
23 |      * Byte paired character(s), e.g. `'!'`, `'\u00a8'`
24 |      */
25 |     string,
26 |     /**
27 |      * The corresponding token, e.g. `0`, `101`
28 |      */
29 |     number
30 |   >
31 |   protected _tokenBPEMap: Map<
32 |     /**
33 |      * The corresponding token, e.g. `0`, `101`
34 |      */
35 |     number,
36 |     /**
37 |      * Byte paired character(s), e.g. `'!'`, `'\u00a8'`
38 |      */
39 |     string
40 |   >
41 | 
42 |   constructor(tokenEncodings: TokenEncodingsRecord, nMergedSpaces = 0) {
43 |     this._bpeTokenMap = new Map()
44 |     this._tokenBPEMap = new Map()
45 | 
46 |     for (const [key, value] of Object.entries(tokenEncodings)) {
47 |       this.addBytePair(key, value!)
48 |     }
49 | 
50 |     // add merged spaces for codex tokenizer
51 |     const normalizeVocabLength = this._bpeTokenMap.size + nMergedSpaces
52 | 
53 |     for (let i = 0; i < nMergedSpaces; i++) {
54 |       const key = '\u0120'.repeat(i + 2)
55 |       const value = normalizeVocabLength - nMergedSpaces + i
56 | 
57 |       this.addBytePair(key, value)
58 |     }
59 |   }
60 | 
61 |   public addBytePair(bytePair: string, token: number): void {
62 |     this._bpeTokenMap.set(bytePair, token)
63 |     this._tokenBPEMap.set(token, bytePair)
64 |   }
65 | 
66 |   public tokenToBytePair(token: number): string {
67 |     const bytePair = this._tokenBPEMap.get(token)
68 | 
69 |     if (typeof bytePair === 'undefined') {
70 |       throw new Error(`Token "${token}" was not found in the token encoder.`)
71 |     }
72 | 
73 |     return bytePair
74 |   }
75 | 
76 |   public bytePairToToken(bytePair: string): number {
77 |     const token = this._bpeTokenMap.get(bytePair)
78 | 
79 |     if (typeof token === 'undefined') {
80 |       throw new Error(`Byte pair "${bytePair}" was not found in the token encoder.`)
81 |     }
82 | 
83 |     return token
84 |   }
85 | 
86 |   public get size() {
87 |     return this._bpeTokenMap.size
88 |   }
89 | 
90 |   public [nodeInspectSymbol]() {
91 |     return `BytePairTokenMap(${this.size})`
92 |   }
93 | }
94 | 
--------------------------------------------------------------------------------
/BytePairEncoding.mts:
--------------------------------------------------------------------------------
1 | /**
2 |  * @copyright Sister Software. All rights reserved.
3 |  * @author Teffen Ellis, et al.
4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | import { BytePairTokenMap, TokenEncodingsRecord } from './BytePairTokenMap.mjs' 9 | import { CodePointByteMap } from './CodePointByteMap.mjs' 10 | import { createTokenizerPattern } from './patterns.mjs' 11 | import { BPEVocab, RanksMap, VocabEntry } from './RanksMap.mjs' 12 | 13 | export interface IBytePairEncodingOptions { 14 | /** 15 | * The token encoder map. This is typically derived from a `encoder.json` file: 16 | * 17 | * ```ts 18 | * const tokenEncodings = parseEncoderFile(fs.readFileSync('./encoder.json', 'utf-8')) 19 | * ``` 20 | */ 21 | tokenEncodings: TokenEncodingsRecord 22 | 23 | /** 24 | * The BPE ranks map. This is typically derived from a `vocab.bpe` file: 25 | * 26 | * ```ts 27 | * const vocab = parseBPEFile(fs.readFileSync('./vocab.bpe', 'utf-8')) 28 | * ``` 29 | * 30 | * You should only use this option if you are using a custom vocabulary. 31 | * 32 | * @see {@linkcode parseBPEFile} 33 | * 34 | * @default parseBPEFile(DEFAULT_VOCAB) 35 | */ 36 | vocab: VocabEntry[] | BPEVocab 37 | 38 | /** 39 | * The number of spaces to merge into a single token. 40 | * 41 | * Codex models use a different set of encodings that handle whitespace more efficiently. 42 | * @default 'none' 43 | */ 44 | mergeSpaces?: 'none' | 'codex' | number 45 | 46 | /** 47 | * Optional override of the regular expression used to tokenize text. 48 | * @default createTokenizerPattern() 49 | */ 50 | tokenizationPattern?: RegExp 51 | } 52 | 53 | /** 54 | * A base class for the Byte Pair Encoding (BPE) encoder and decoder. 55 | * @internal 56 | */ 57 | export class BytePairEncoding { 58 | public codePointByteMap: CodePointByteMap 59 | public mergesSpacesCount: number 60 | 61 | public tokenMap: BytePairTokenMap 62 | public ranksMap: RanksMap 63 | 64 | public tokenizationPattern: RegExp 65 | 66 | constructor(options: IBytePairEncodingOptions) { 67 | this.tokenizationPattern = options.tokenizationPattern ?? createTokenizerPattern() 68 | this.codePointByteMap = new CodePointByteMap() 69 | 70 | if (typeof options.mergeSpaces === 'string') { 71 | this.mergesSpacesCount = options.mergeSpaces === 'codex' ? 30 : 0 72 | } else { 73 | this.mergesSpacesCount = options.mergeSpaces ?? 0 74 | } 75 | 76 | this.tokenMap = new BytePairTokenMap(options.tokenEncodings, this.mergesSpacesCount) 77 | this.ranksMap = new RanksMap(options.vocab, this.mergesSpacesCount) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /test/CostEstimator.test.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 
6 | */ 7 | 8 | import { expect, test } from 'vitest' 9 | import { 10 | BytePairEncoder, 11 | BytePairEncoding, 12 | createDefaultBPEOptions, 13 | estimateCost, 14 | ICostEstimationResult, 15 | ModelFamilyIDs, 16 | } from '../mod.mjs' 17 | import { readFixture, TestCase } from './common.mjs' 18 | 19 | interface CostEstimatorTestCase extends TestCase { 20 | modelID: string 21 | } 22 | 23 | const testCases: CostEstimatorTestCase[] = [ 24 | { 25 | label: 'Empty string', 26 | modelID: ModelFamilyIDs.Davinci, 27 | given: '', 28 | expected: { 29 | usage: 0, 30 | fineTunedUsage: 0, 31 | fineTunedTraining: 0, 32 | prompt: 0, 33 | completion: 0, 34 | }, 35 | }, 36 | { 37 | label: 'Just a space', 38 | modelID: ModelFamilyIDs.Davinci, 39 | given: ' ', 40 | expected: { 41 | completion: 0.00002, 42 | fineTunedTraining: 0.00003, 43 | fineTunedUsage: 0.00012, 44 | prompt: 0.00002, 45 | usage: 0.00002, 46 | }, 47 | }, 48 | { 49 | label: 'Tab', 50 | modelID: ModelFamilyIDs.Davinci, 51 | given: '\t', 52 | expected: { 53 | completion: 0.00002, 54 | fineTunedTraining: 0.00003, 55 | fineTunedUsage: 0.00012, 56 | prompt: 0.00002, 57 | usage: 0.00002, 58 | }, 59 | }, 60 | { 61 | label: 'Single paragraph', 62 | modelID: ModelFamilyIDs.Davinci, 63 | given: readFixture('single-paragraph.txt'), 64 | expected: { 65 | completion: 0.0031, 66 | fineTunedTraining: 0.00465, 67 | fineTunedUsage: 0.0186, 68 | prompt: 0.0031, 69 | usage: 0.0031, 70 | }, 71 | }, 72 | { 73 | label: 'Multiple paragraphs', 74 | modelID: ModelFamilyIDs.Davinci, 75 | given: readFixture('multiple-paragraphs.txt'), 76 | expected: { 77 | completion: 0.01434, 78 | fineTunedTraining: 0.021509999999999998, 79 | fineTunedUsage: 0.08603999999999999, 80 | prompt: 0.01434, 81 | usage: 0.01434, 82 | }, 83 | }, 84 | // { 85 | // label: 'HTML content', 86 | // modelID: ModelFamilyIDs.GPT4, 87 | // given: readFixture('sample-html.html'), 88 | // expected: { 89 | // completion: 0.005659999999999999, 90 | // fineTunedTraining: 0.00849, 91 | // fineTunedUsage: 0.03396, 92 | // prompt: 0.005659999999999999, 93 | // usage: 0.005659999999999999, 94 | // }, 95 | // }, 96 | ] 97 | 98 | for (const { label, given, modelID, expected, options } of testCases) { 99 | test(label, () => { 100 | const gptEncoding = new BytePairEncoding({ ...createDefaultBPEOptions(), ...options }) 101 | const encoder = new BytePairEncoder(gptEncoding) 102 | 103 | const encoded = encoder.encode(given) 104 | const estimatedCosts = estimateCost(modelID, encoded) 105 | 106 | expect(estimatedCosts).toEqual(expected) 107 | }) 108 | } 109 | -------------------------------------------------------------------------------- /CodePointByteMap.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | const nodeInspectSymbol = Symbol.for('nodejs.util.inspect.custom') 9 | 10 | /** 11 | * Two-way map of byte values to their corresponding Unicode codepoints. 12 | */ 13 | export class CodePointByteMap { 14 | /** 15 | * Maps each byte value to its corresponding Unicode character. 16 | */ 17 | protected _byteToCodePoint: Map 18 | /** 19 | * Maps each Unicode character to its corresponding byte value. 
20 | */ 21 | protected _codePointToByte: Map 22 | 23 | constructor() { 24 | // Contains all the byte values corresponding to printable ASCII characters 25 | const basicBytes = Array.from({ length: 94 }, (_, i) => i + 33) 26 | 27 | // Contains all the byte values corresponding to extended ASCII characters 28 | // that are not already included in `basicBytes` 29 | const extendedBytes = [ 30 | ...Array.from({ length: 12 }, (_, i) => i + 161), 31 | ...Array.from({ length: 82 }, (_, i) => i + 174), 32 | ] 33 | 34 | // Combine `basicBytes` and `extendedBytes` to get a list of all byte values 35 | const allBytes = basicBytes.concat(extendedBytes) 36 | const cs = allBytes.slice() 37 | 38 | let unicodeIndex = 0 39 | 40 | // Then assign unique Unicode characters to the bytes in `allBytes` that are not 41 | // already in `basicBytes` or `extendedBytes`... 42 | // For each possible byte value (0-255)... 43 | for (let byteValue = 0; byteValue < 256; byteValue++) { 44 | // If the byte value is not in `allBytes`, it needs to be added to the dictionary 45 | if (!allBytes.includes(byteValue)) { 46 | // Add the byte value to `allBytes` 47 | allBytes.push(byteValue) 48 | 49 | cs.push(256 + unicodeIndex) 50 | 51 | // Increment `unicodeIndex` so the next new byte value will get a unique Unicode character 52 | unicodeIndex++ 53 | } 54 | } 55 | 56 | this._byteToCodePoint = new Map() 57 | this._codePointToByte = new Map() 58 | 59 | for (let i = 0; i < cs.length; i++) { 60 | const key = allBytes[i] 61 | const value = String.fromCharCode(cs[i]) 62 | 63 | this._byteToCodePoint.set(key, value) 64 | this._codePointToByte.set(value, key) 65 | } 66 | } 67 | 68 | public byteToCodePoint(byte: number): string { 69 | const codePoint = this._byteToCodePoint.get(byte) 70 | 71 | if (typeof codePoint === 'undefined') { 72 | throw new Error(`Byte "${byte}" was not found in the byte map.`) 73 | } 74 | 75 | return codePoint 76 | } 77 | 78 | public codePointToByte(codePoint: string): number { 79 | const byte = this._codePointToByte.get(codePoint) 80 | 81 | if (typeof byte === 'undefined') { 82 | throw new Error(`Unicode character "${codePoint}" was not found in the byte map.`) 83 | } 84 | 85 | return byte 86 | } 87 | 88 | public get size() { 89 | return this._byteToCodePoint.size 90 | } 91 | 92 | public get byteToCodePointMap() { 93 | return this._byteToCodePoint 94 | } 95 | 96 | public get codePointToByteMap() { 97 | return this._codePointToByte 98 | } 99 | 100 | [nodeInspectSymbol]() { 101 | return `CodePointByteMap(${this.size})` 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gpt-token-utils", 3 | "version": "1.2.0", 4 | "description": "Isomorphic utilities for GPT-3 tokenization and prompt building.", 5 | "repository": "git@github.com:sister-software/gpt-token-utils.git", 6 | "bugs": { 7 | "url": "https://github.com/sister-software/gpt-token-utils/issues" 8 | }, 9 | "author": "Teffen Ellis ", 10 | "homepage": "https://github.com/sister-software/gpt-token-utils#readme", 11 | "license": "MIT", 12 | "keywords": [ 13 | "gpt", 14 | "gpt-3", 15 | "gpt3", 16 | "openai", 17 | "Open AI", 18 | "tokenization", 19 | "tokenizer", 20 | "prompt", 21 | "prompt-builder" 22 | ], 23 | "main": "dist/mod.mjs", 24 | "types": "dist/mod.d.mts", 25 | "type": "module", 26 | "files": [ 27 | "dist/**/*" 28 | ], 29 | "exports": { 30 | "./package.json": "./package.json", 31 | ".": { 32 | 
"import": "./dist/mod.mjs", 33 | "types": "./dist/mod.d.mts" 34 | }, 35 | "./mod": { 36 | "import": "./dist/mod.mjs", 37 | "types": "./dist/mod.d.mts" 38 | }, 39 | "./mod.mjs": { 40 | "import": "./dist/mod.mjs", 41 | "types": "./dist/mod.d.mts" 42 | }, 43 | "./BytePairDecoder": { 44 | "import": "./dist/BytePairDecoder/mod.mjs", 45 | "types": "./dist/BytePairDecoder/mod.d.mts" 46 | }, 47 | "./BytePairDecoder.mjs": { 48 | "import": "./dist/BytePairDecoder/mod.mjs", 49 | "types": "./dist/BytePairDecoder/mod.d.mts" 50 | }, 51 | "./BytePairEncoder": { 52 | "import": "./dist/BytePairEncoder/mod.mjs", 53 | "types": "./dist/BytePairEncoder/mod.d.mts" 54 | }, 55 | "./BytePairEncoder.mjs": { 56 | "import": "./dist/BytePairEncoder/mod.mjs", 57 | "types": "./dist/BytePairEncoder/mod.d.mts" 58 | }, 59 | "./tokenizer": { 60 | "import": "./dist/tokenizer/mod.mjs", 61 | "types": "./dist/tokenizer/mod.d.mts" 62 | }, 63 | "./tokenizer.mjs": { 64 | "import": "./dist/tokenizer/mod.mjs", 65 | "types": "./dist/tokenizer/mod.d.mts" 66 | }, 67 | "./models": { 68 | "import": "./dist/models/mod.mjs", 69 | "types": "./dist/models/mod.d.mts" 70 | }, 71 | "./models.mjs": { 72 | "import": "./dist/models/mod.mjs", 73 | "types": "./dist/models/mod.d.mts" 74 | }, 75 | "./node": { 76 | "import": "./dist/node/mod.mjs", 77 | "types": "./dist/node/mod.d.mts" 78 | }, 79 | "./node.mjs": { 80 | "import": "./dist/node/mod.mjs", 81 | "types": "./dist/node/mod.d.mts" 82 | } 83 | }, 84 | "scripts": { 85 | "test": "vitest", 86 | "build": "tsc -p ./tsconfig.json", 87 | "start": "http-server ./ -p 8081", 88 | "cli-tiktoken": "NODE_OPTIONS=\"--loader ts-node/esm --no-warnings\" ts-node ./internal/tiktoken.mts", 89 | "cli-example": "NODE_OPTIONS=\"--loader ts-node/esm --no-warnings\" ts-node ./example.mts" 90 | }, 91 | "devDependencies": { 92 | "@sister.software/eslint-config": "^1.0.0", 93 | "@sister.software/prettier-config": "^1.0.0", 94 | "@sister.software/stylelint-config": "^1.0.0", 95 | "@types/node": "^18.14.4", 96 | "@typescript-eslint/eslint-plugin": "^5.53.0", 97 | "@typescript-eslint/parser": "^5.53.0", 98 | "eslint": "^8.34.0", 99 | "http-server": "^14.1.1", 100 | "prettier": "^2.8.1", 101 | "prettier-plugin-organize-imports": "^3.2.2", 102 | "react": "^18.2.0", 103 | "react-dom": "^18.2.0", 104 | "stylelint": "~14", 105 | "ts-node": "^10.9.1", 106 | "typescript": "4.9.5", 107 | "vitest": "^0.29.2" 108 | }, 109 | "prettier": "@sister.software/prettier-config", 110 | "stylelint": { 111 | "extends": [ 112 | "@sister.software/stylelint-config" 113 | ] 114 | }, 115 | "eslintConfig": { 116 | "extends": [ 117 | "@sister.software/eslint-config" 118 | ], 119 | "ignorePatterns": [ 120 | "dist", 121 | "results", 122 | "node_modules", 123 | "test/fixtures" 124 | ] 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /models/common.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | /** 9 | * The IDs of available model families. 10 | */ 11 | export const ModelFamilyIDs = { 12 | Ada: 'ada', 13 | Babbage: 'babbage', 14 | Curie: 'curie', 15 | Davinci: 'davinci', 16 | ChatGPT: 'chat-gpt', 17 | GPT4: 'gpt-4', 18 | GPT4_32K: 'gpt-4-32k', 19 | } as const 20 | 21 | /** 22 | * A model family is a group of models that share a common lineage or training data. 
23 | */ 24 | export interface ModelFamily { 25 | familyID: string 26 | /** 27 | * The number of tokens that can be used with this model in a single request. 28 | */ 29 | tokenLimit: number 30 | /** 31 | * The number of spaces to merge into a single token. 32 | * 33 | * Codex models use a different set of encodings that handle whitespace more efficiently. 34 | */ 35 | mergeSpaces: number 36 | pricing: ModelPricing 37 | /** 38 | * The IDs of available models, matching the IDs used in the OpenAI API. 39 | */ 40 | modelIDs: string[] 41 | 42 | /** 43 | * The ID of the preferred model in this family. 44 | */ 45 | preferredModelID?: string 46 | } 47 | 48 | export type ModelPricingTypes = 'usage' | 'fineTunedUsage' | 'fineTunedTraining' | 'prompt' | 'completion' 49 | /** 50 | * The pricing of a model in US dollars. 51 | * @see https://openai.com/pricing 52 | */ 53 | export interface ModelPricing { 54 | /** 55 | * The price of model usage per 1000 tokens. 56 | */ 57 | usage: number | null 58 | /** 59 | * The price of fine-tuned model usage per 1000 tokens. 60 | */ 61 | fineTunedUsage: number | null 62 | /** 63 | * The price of fine-tuned model training per 1000 tokens. 64 | */ 65 | fineTunedTraining: number | null 66 | /** 67 | * The price of usage for the prompt endpoint per 1000 tokens. 68 | */ 69 | prompt: number | null 70 | /** 71 | * The price of usage for the completion endpoint per 1000 tokens. 72 | */ 73 | completion: number | null 74 | } 75 | 76 | export interface GetModelFamilyFn { 77 | ( 78 | /** 79 | * The ID of a model within a family, e.g. `"text-davinci-003"` 80 | * @returns The family that the model belongs to. 81 | */ 82 | modelID: string 83 | ): ModelFamily 84 | ( 85 | /** 86 | * The ID of a model family, e.g. `"davinci"` 87 | * @returns The family associated with the ID. 88 | */ 89 | familyID: string 90 | ): ModelFamily 91 | ( 92 | /** 93 | * A model family. This is useful when you already have a model family object. 94 | * @returns The same family object that was passed in. 95 | */ 96 | modelFamily: ModelFamily 97 | ): ModelFamily 98 | 99 | (input: string | ModelFamily): ModelFamily 100 | } 101 | 102 | export class ModelFamiliesMap { 103 | protected _familyMap = new Map<string, ModelFamily>() 104 | protected _modelToFamilyMap = new Map<string, ModelFamily>() 105 | 106 | public addFamily(family: ModelFamily): void { 107 | this._familyMap.set(family.familyID, family) 108 | for (const modelID of family.modelIDs) { 109 | this._modelToFamilyMap.set(modelID, family) 110 | } 111 | } 112 | 113 | public getFamilyByFamilyID(familyID: string): ModelFamily | undefined { 114 | return this._familyMap.get(familyID) 115 | } 116 | 117 | public getFamilyByModelID(modelID: string): ModelFamily | undefined { 118 | return this._modelToFamilyMap.get(modelID) 119 | } 120 | 121 | public get: GetModelFamilyFn = (input) => { 122 | if (typeof input === 'string') { 123 | const family = this.getFamilyByFamilyID(input) || this.getFamilyByModelID(input) 124 | 125 | if (!family) { 126 | throw new Error(`No model ID or family found with ID: ${input}`) 127 | } 128 | 129 | return family 130 | } 131 | 132 | return input 133 | } 134 | 135 | public isModelInFamily(modelID: string, familyID: string): boolean { 136 | const family = this.getFamilyByFamilyID(familyID) 137 | 138 | return family?.modelIDs.includes(modelID) ??
false 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /test/Tokenizer.test.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | import { expect, test } from 'vitest' 9 | import { BytePairDecoder, BytePairEncoder, BytePairEncoding } from '../mod.mjs' 10 | import { createDefaultBPEOptions } from '../tokenizer/mod.mjs' 11 | import { readFixture, TestCase } from './common.mjs' 12 | 13 | type TestCases = TestCase[] 14 | 15 | const testCases: TestCases = [ 16 | { 17 | label: 'Empty string', 18 | given: '', 19 | expected: [], 20 | }, 21 | { 22 | label: 'Just a space', 23 | given: ' ', 24 | expected: [220], 25 | }, 26 | { 27 | label: 'Tab', 28 | given: '\t', 29 | expected: [197], 30 | }, 31 | { 32 | label: 'Simple text', 33 | given: 'This is some text', 34 | expected: [1212, 318, 617, 2420], 35 | }, 36 | { 37 | label: 'Text with special characters', 38 | given: `This is some text with a few special characters: !@#$%^&*()_+-=~[]{}|;:'",./<>?`, 39 | expected: [ 40 | 1212, 318, 617, 2420, 351, 257, 1178, 2041, 3435, 25, 5145, 31, 29953, 4, 61, 5, 9, 3419, 62, 10, 12, 31820, 41 | 21737, 90, 92, 91, 26, 32105, 1600, 19571, 27, 29, 30, 42 | ], 43 | }, 44 | { 45 | label: 'Text with numbers', 46 | given: 'This is some text with numbers 1234567890', 47 | expected: [1212, 318, 617, 2420, 351, 3146, 17031, 2231, 30924, 3829], 48 | }, 49 | 50 | { 51 | label: 'Non-European text', 52 | given: '你好世界', 53 | expected: [19526, 254, 25001, 121, 10310, 244, 45911, 234], 54 | }, 55 | { 56 | label: 'Bubble text', 57 | given: 'Ⓗⓔⓛⓛⓞ Ⓑⓤⓑⓑⓛⓔ', 58 | expected: [ 59 | 158, 240, 121, 158, 241, 242, 158, 241, 249, 158, 241, 249, 158, 241, 252, 2343, 240, 115, 158, 241, 97, 158, 241, 60 | 239, 158, 241, 239, 158, 241, 249, 158, 241, 242, 61 | ], 62 | }, 63 | { 64 | label: 'Multi-token word', 65 | given: 'indivisible', 66 | expected: [521, 452, 12843], 67 | }, 68 | { 69 | label: 'Emojis', 70 | given: 'hello 👋 world 🌍', 71 | expected: [31373, 50169, 233, 995, 12520, 234, 235], 72 | }, 73 | // We include a few properties of Object here to test that the tokenizer 74 | // doesn't include inherited properties. 75 | { 76 | label: 'properties of Object', 77 | given: 'toString constructor hasOwnProperty valueOf', 78 | expected: [1462, 10100, 23772, 468, 23858, 21746, 1988, 5189], 79 | }, 80 | 81 | // Codex models use additional tokens for whitespace... 
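// Compare the two cases below: without Codex-style merging, indentation encodes as repeated
// single-space tokens (220), while `mergeSpaces: 'codex'` collapses each run of spaces into a
// single token in the 50258+ range.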
82 | { 83 | label: 'Without Codex', 84 | given: readFixture('nested-javascript.js'), 85 | expected: [ 86 | 8818, 7744, 45, 7287, 7499, 1391, 198, 220, 1441, 1391, 198, 220, 220, 220, 262, 25, 1391, 198, 220, 220, 220, 87 | 220, 220, 2068, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 7586, 25, 1391, 198, 220, 220, 220, 220, 220, 88 | 220, 220, 220, 220, 21831, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 18045, 25, 1391, 89 | 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 625, 25, 1391, 198, 220, 220, 220, 220, 220, 90 | 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 262, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 91 | 220, 220, 220, 220, 220, 220, 220, 220, 16931, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 92 | 220, 220, 220, 220, 220, 220, 220, 220, 220, 3290, 25, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 93 | 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 94 | 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 95 | 220, 220, 1782, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 96 | 220, 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 1782, 198, 220, 97 | 220, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 220, 220, 1782, 198, 220, 220, 220, 1782, 198, 220, 1782, 98 | 198, 92, 99 | ], 100 | }, 101 | // Codex models use additional tokens for whitespace... 102 | { 103 | label: 'With Codex', 104 | given: readFixture('nested-javascript.js'), 105 | expected: [ 106 | 8818, 7744, 45, 7287, 7499, 1391, 198, 220, 1441, 1391, 198, 50258, 262, 25, 1391, 198, 50260, 2068, 25, 1391, 107 | 198, 50262, 7586, 25, 1391, 198, 50264, 21831, 25, 1391, 198, 50266, 18045, 25, 1391, 198, 50268, 625, 25, 1391, 108 | 198, 50270, 262, 25, 1391, 198, 50272, 16931, 25, 1391, 198, 50274, 3290, 25, 1391, 198, 50274, 1782, 198, 50272, 109 | 1782, 198, 50270, 1782, 198, 50268, 1782, 198, 50266, 1782, 198, 50264, 1782, 198, 50262, 1782, 198, 50260, 1782, 110 | 198, 50258, 1782, 198, 220, 1782, 198, 92, 111 | ], 112 | options: { 113 | mergeSpaces: 'codex', 114 | }, 115 | }, 116 | ] 117 | 118 | for (const { label, given, expected, options } of testCases) { 119 | test(label, () => { 120 | const gptEncoding = new BytePairEncoding({ ...createDefaultBPEOptions(), ...options }) 121 | const encoder = new BytePairEncoder(gptEncoding) 122 | const decoder = new BytePairDecoder(gptEncoding) 123 | 124 | const encoded = encoder.encode(given).tokens 125 | const decoded = decoder.decode(encoded) 126 | 127 | expect(encoded).toEqual(expected) 128 | expect(decoded).toEqual(given) 129 | }) 130 | } 131 | -------------------------------------------------------------------------------- /EncoderResult.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 
6 | */ 7 | 8 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 9 | import type { BytePairEncoder } from './BytePairEncoder.mjs' 10 | 11 | const nodeInspectSymbol = Symbol.for('nodejs.util.inspect.custom') 12 | const supportsSegmenter = typeof Intl !== 'undefined' && typeof Intl.Segmenter !== 'undefined' 13 | 14 | export interface IEncoderResult { 15 | /** 16 | * The tokens that were encoded. 17 | */ 18 | readonly tokens: number[] 19 | /** 20 | * The BPE token pairs that were used during encoding. 21 | */ 22 | readonly bpeTokenPairs: string[] 23 | 24 | /** 25 | * The original text content that was encoded. 26 | */ 27 | readonly originalInput: string 28 | 29 | /** 30 | * The matched text segments found during encoding. 31 | */ 32 | readonly matchedTextSegments: string[] 33 | } 34 | 35 | /** 36 | * The `EncoderResult` includes information for post-encoding analysis such as... 37 | * 38 | * - The tokens that were encoded. 39 | * - The BPE token pairs that were used during encoding. 40 | * - Two-way maps of tokens to BPE token pairs. 41 | * 42 | * This information can be used to analyze the encoding process and to 43 | * reconstruct the original string from the encoded tokens. 44 | * 45 | * Note that this object is considered immutable. Consider encoding a new string 46 | * if you need an updated `EncoderResult`. 47 | * 48 | * @see {@linkcode BytePairEncoder} 49 | */ 50 | export class EncoderResult implements IEncoderResult { 51 | /** 52 | * A map of tokens to their corresponding BPE token pairs. 53 | */ 54 | public tokenBPEMap: ReadonlyMap<number, string> 55 | /** 56 | * A map of BPE token pairs to their corresponding tokens. 57 | */ 58 | public bpeTokenMap: ReadonlyMap<string, number> 59 | 60 | /** 61 | * A map of BPE token pairs to the number of times they were used during encoding. 62 | * The key is the BPE token pair and the value is the number of times it appeared. 63 | */ 64 | public bpeCountsMap: ReadonlyMap<string, number> 65 | 66 | /** 67 | * A map of tokens to the number of times they were used during encoding. 68 | * The key is the token and the value is the number of times it appeared.
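 *
 * A small usage sketch (`result` is an `EncoderResult`; the token value is illustrative):
 *
 * ```ts
 * result.tokenCountsMap.get(5211) // 1 if token 5211 appeared once, undefined if absent
 * ```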
69 | */ 70 | public tokenCountsMap: ReadonlyMap<number, number> 71 | 72 | public readonly tokens: number[] 73 | public readonly bpeTokenPairs: string[] 74 | public readonly originalInput: string 75 | public readonly matchedTextSegments: string[] 76 | 77 | public segmenter: Intl.Segmenter | undefined 78 | 79 | constructor({ tokens, bpeTokenPairs, originalInput, matchedTextSegments }: IEncoderResult, locale?: string) { 80 | if (bpeTokenPairs.length !== tokens.length) { 81 | throw new Error('The number of BPE token pairs must match the number of tokens.') 82 | } 83 | 84 | const tokenToBPE: Array<[number, string]> = [] 85 | const BPEToToken: Array<[string, number]> = [] 86 | 87 | const tokenCountsMap = new Map<number, number>() 88 | const bpeCountsMap = new Map<string, number>() 89 | 90 | for (let i = 0; i < bpeTokenPairs.length; i++) { 91 | const token = tokens[i] 92 | const bpeTokenPair = bpeTokenPairs[i] 93 | 94 | const tokenCount = tokenCountsMap.get(token) || 0 95 | const bpeCount = bpeCountsMap.get(bpeTokenPair) || 0 96 | 97 | tokenCountsMap.set(token, tokenCount + 1) 98 | bpeCountsMap.set(bpeTokenPair, bpeCount + 1) 99 | 100 | tokenToBPE.push([token, bpeTokenPair]) 101 | BPEToToken.push([bpeTokenPair, token]) 102 | } 103 | 104 | this.tokenBPEMap = new Map(tokenToBPE) 105 | this.bpeTokenMap = new Map(BPEToToken) 106 | 107 | this.tokenCountsMap = tokenCountsMap 108 | this.bpeCountsMap = bpeCountsMap 109 | 110 | this.tokens = tokens 111 | this.bpeTokenPairs = bpeTokenPairs 112 | this.originalInput = originalInput 113 | this.matchedTextSegments = matchedTextSegments 114 | 115 | if (supportsSegmenter) { 116 | this.segmenter = new Intl.Segmenter(locale) 117 | } 118 | } 119 | 120 | /** 121 | * Get the encoded byte-pair for a given token. 122 | */ 123 | public getBPE(token: number) { 124 | return this.tokenBPEMap.get(token) 125 | } 126 | 127 | /** 128 | * Get the number of times a given token appeared during encoding. 129 | * @see {@linkcode EncoderResult.length} if you're just trying to count the number of tokens. 130 | */ 131 | public getTokenCount(token: number): number { 132 | return this.tokenCountsMap.get(token) || 0 133 | } 134 | 135 | /** 136 | * Get the number of times a given byte-pair appeared during encoding. 137 | */ 138 | public getBPECount(bpe: string): number { 139 | return this.bpeCountsMap.get(bpe) || 0 140 | } 141 | 142 | /** 143 | * Iterate over the tokens in the result. 144 | */ 145 | public [Symbol.iterator]() { 146 | return this.tokens[Symbol.iterator]() 147 | } 148 | 149 | /** 150 | * The number of tokens in the result. 151 | */ 152 | public get length() { 153 | return this.tokens.length 154 | } 155 | 156 | /** 157 | * The number of characters in the original text. 158 | * 159 | * @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter Intl.Segmenter} 160 | */ 161 | public get characterCount(): number { 162 | if (!this.segmenter) { 163 | console.warn('Intl.Segmenter is not supported.
Falling back to string length.') 164 | return this.originalInput.length 165 | } 166 | 167 | return Array.from(this.segmenter.segment(this.originalInput)).length 168 | } 169 | 170 | public [nodeInspectSymbol]() { 171 | return `EncoderResult(${this.length})` 172 | } 173 | 174 | public toString() { 175 | return this.tokens.toString() 176 | } 177 | 178 | public toJSON(): IEncoderResult { 179 | return { 180 | tokens: this.tokens, 181 | bpeTokenPairs: this.bpeTokenPairs, 182 | originalInput: this.originalInput, 183 | matchedTextSegments: this.matchedTextSegments, 184 | } 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /BytePairEncoder.mts: -------------------------------------------------------------------------------- 1 | /** 2 | * @copyright Sister Software. All rights reserved. 3 | * @author Teffen Ellis, et al. 4 | * @license 5 | * See LICENSE file in the project root for full license information. 6 | */ 7 | 8 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 9 | import type { BytePairDecoder } from './BytePairDecoder.mjs' 10 | import { BytePairEncoding } from './BytePairEncoding.mjs' 11 | import { EncoderResult } from './EncoderResult.mjs' 12 | 13 | /** 14 | * A valid input for the encoder. 15 | * @internal 16 | */ 17 | export type EncoderInput = string | EncoderResult 18 | 19 | export interface TokenEncodeFn { 20 | ( 21 | /** 22 | * The string to encode. 23 | */ 24 | text: string 25 | ): EncoderResult 26 | 27 | ( 28 | /** 29 | * The string to encode. 30 | */ 31 | text: string, 32 | /** 33 | * Skip post-encoding processing for a slight performance boost. 34 | */ 35 | skipPostProcessing?: boolean 36 | ): EncoderResult 37 | 38 | ( 39 | /** 40 | * A previous encoder result to use as a starting point. 41 | * This will simply pass back the same result. 42 | * Useful when batch processing a mixed list of strings and encoder results. 43 | */ 44 | encoderResult: EncoderResult 45 | ): EncoderResult 46 | 47 | (input: EncoderInput, skipPostProcessing?: boolean): EncoderResult 48 | } 49 | 50 | /** 51 | * GPT Token Encoder. 52 | * 53 | * Generally, you should not need to use this class directly unless you are 54 | * implementing a custom token encoder. 55 | * 56 | * @see {@linkcode BytePairDecoder} for the decoder. 57 | * 58 | * ```ts 59 | * const encoder = new BytePairEncoder(encoding) // `encoding` is a `BytePairEncoding` 60 | * const result = encoder.encode(text) 61 | * ``` 62 | */ 63 | export class BytePairEncoder { 64 | constructor( 65 | protected _bpe: BytePairEncoding, 66 | protected _textEncoder = new TextEncoder(), 67 | protected _bpeTokenCache = new Map<string, string[]>() 68 | ) {} 69 | //#region Public Methods 70 | 71 | /** 72 | * Encodes a given string into a list of tokens. 73 | * 74 | * ```ts 75 | * const text = "Do androids dream of electric sheep?" 76 | * const result = encoder.encode(text) 77 | * console.log(result.tokens) // [5211, 290, 305, 2340, 4320, 286, 5186, 15900, 30] 78 | * ``` 79 | * 80 | * @returns An {@linkcode EncoderResult} containing the encoded tokens. 81 | */ 82 | public encode: TokenEncodeFn = (input, skipPostProcessing = false): any => { 83 | if (typeof input !== 'string') { 84 | return input 85 | } 86 | 87 | // First, we run the pattern matcher on the text... 88 | const matchedTextSegments = Array.from(input.matchAll(this._bpe.tokenizationPattern), (x) => x[0]) 89 | 90 | // Then we convert each matched text segment into a string of printable code points representing its UTF-8 bytes...
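// (See `CodePointByteMap`: printable ASCII bytes map to themselves, while other bytes such as
// 0x20 receive stand-in code points at 256 and above, keeping every byte representable as a
// printable character for the merge step.)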
91 | const utf8Tokens = matchedTextSegments.map((textSegment) => { 92 | // `TextEncoder` converts the UTF-16 text segment into its UTF-8 byte representation. 93 | const asUTF8 = this._textEncoder.encode(textSegment) 94 | // We then use our byte map to get the Unicode code point for each byte. 95 | const codePoints = Array.from(asUTF8, (byte) => { 96 | const codePoint = this._bpe.codePointByteMap.byteToCodePoint(byte) 97 | 98 | return codePoint 99 | }) 100 | 101 | return codePoints.join('') 102 | }) 103 | 104 | // Then we run the BPE merge algorithm on each code point string to produce BPE token pairs... 105 | const bpeTokenPairs = utf8Tokens.flatMap((token) => this._tokenToBPE(token)) 106 | 107 | const tokens = bpeTokenPairs.map((bpeToken) => { 108 | return this._bpe.tokenMap.bytePairToToken(bpeToken) 109 | }) 110 | 111 | if (skipPostProcessing) { 112 | return tokens 113 | } 114 | 115 | const result = new EncoderResult({ tokens, bpeTokenPairs, originalInput: input, matchedTextSegments }) 116 | 117 | return result 118 | } 119 | 120 | /** 121 | * Merges the pair of characters with the given values in the given word. 122 | * 123 | * @param word - An array of individual characters in the word. 124 | * @param first - The first character in the pair to merge. 125 | * @param second - The second character in the pair to merge. 126 | * 127 | * @returns The word with the pair of characters merged. 128 | */ 129 | public mergePair(word: string[], first: string, second: string) { 130 | const newWord: string[] = [] 131 | let i = 0 132 | 133 | while (i < word.length) { 134 | const j = word.indexOf(first, i) 135 | if (j === -1) { 136 | newWord.push(...word.slice(i)) 137 | break 138 | } 139 | newWord.push(...word.slice(i, j)) 140 | if (word[j + 1] === second) { 141 | newWord.push(first + second) 142 | i = j + 2 143 | } else { 144 | newWord.push(first) 145 | i = j + 1 146 | } 147 | } 148 | 149 | return newWord 150 | } 151 | 152 | /** 153 | * Returns an array of all unique pairs of adjacent characters in the given word. 154 | * 155 | * @param word - An array of individual characters in the word. 156 | * @returns An array of all unique pairs of adjacent characters in the word. 157 | */ 158 | public getPairs(word: string[]) { 159 | const characters = word.slice() 160 | const pairingsFound: Record<string, boolean> = {} 161 | const pairs: string[][] = [] 162 | let previousCharacterIndex = 0 163 | 164 | for (let i = 1; i < characters.length; i++) { 165 | const previousCharacter = characters[previousCharacterIndex] 166 | const character = characters[i] 167 | 168 | previousCharacterIndex = i 169 | 170 | const pair = [previousCharacter, character] 171 | const grapheme = pair.join('') 172 | 173 | if (Object.hasOwn(pairingsFound, grapheme)) { 174 | continue 175 | } 176 | 177 | pairs.push(pair) 178 | pairingsFound[grapheme] = true 179 | } 180 | 181 | return pairs 182 | } 183 | 184 | //#endregion 185 | 186 | //#region Protected Methods 187 | 188 | /** 189 | * Applies byte pair encoding (BPE) to the given token using the encoding's BPE ranks and the internal cache. 190 | * If the token is already in the cache, returns its value from the cache. 191 | * 192 | * @param token - The token to encode using BPE. This is derived from text passed through the `tokenizationPattern` RegExp. 193 | * 194 | * @returns The BPE-encoded token as an array of byte-pair strings. 195 | */ 196 | protected _tokenToBPE(token: string): string[] { 197 | if (this._bpeTokenCache.has(token)) { 198 | return this._bpeTokenCache.get(token)!
199 | } 200 | 201 | // Convert the input token to an array of individual characters 202 | let word = Array.from(token) 203 | 204 | // Get all possible pairs of characters in the token 205 | let pairs = this.getPairs(word) 206 | 207 | // Loop until there are no more pairs to merge 208 | // eslint-disable-next-line no-constant-condition 209 | while (true) { 210 | // If there are no pairs, return the original token 211 | if (!pairs || pairs.length === 0) { 212 | const word = [token] 213 | this._bpeTokenCache.set(token, word) 214 | 215 | return word 216 | } 217 | 218 | // Find the pair with the lowest rank (pairs without a numeric rank are ignored) 219 | const minRankPair = this._findMinRankPair(pairs) 220 | 221 | // If no valid pair is found, exit the loop 222 | if (!minRankPair || minRankPair.length === 0) { 223 | break 224 | } 225 | 226 | // Merge the pair with the lowest rank 227 | const [first, second] = minRankPair 228 | 229 | let newWord: string[] = [] 230 | let i = 0 231 | 232 | while (i < word.length) { 233 | const j = word.indexOf(first, i) 234 | if (j === -1) { 235 | newWord = newWord.concat(word.slice(i)) 236 | break 237 | } 238 | newWord = newWord.concat(word.slice(i, j)) 239 | i = j 240 | 241 | if (word[i] === first && i < word.length - 1 && word[i + 1] === second) { 242 | newWord.push(first + second) 243 | i = i + 2 244 | } else { 245 | newWord.push(word[i]) 246 | i = i + 1 247 | } 248 | } 249 | 250 | // Update the word with the merged pair 251 | word = newWord 252 | 253 | // If the word is reduced to a single element, exit the loop 254 | if (word.length === 1) { 255 | break 256 | } 257 | 258 | // Otherwise, get all possible pairs of characters in the updated word 259 | pairs = this.getPairs(word) 260 | } 261 | 262 | this._bpeTokenCache.set(token, word) 263 | 264 | return word 265 | } 266 | 267 | /** 268 | * Finds the pair with the lowest rank in the given array of pairs. 269 | * Pairs without a numeric rank are skipped; ranks are read from the encoding's `ranksMap`. 270 | * 271 | * @param pairs - An array of pairs of characters. 272 | * @returns The pair with the lowest rank, or null if no valid pair is found. 273 | */ 274 | protected _findMinRankPair(pairs: string[][]): string[] | null { 275 | let minPair: string[] | null = null 276 | let minRank = Infinity 277 | 278 | for (const pair of pairs) { 279 | const rank = this._bpe.ranksMap.getRank(pair[0], pair[1]) 280 | if (typeof rank !== 'number') { 281 | continue 282 | } 283 | 284 | if (rank < minRank) { 285 | minPair = pair 286 | minRank = rank 287 | } 288 | } 289 | 290 | return minPair || null 291 | } 292 | //#endregion 293 | } 294 | --------------------------------------------------------------------------------