├── .gitignore ├── LICENSE ├── README.md ├── README_zh-hans.md ├── benchmark_test.go ├── bpe.go ├── core_bpe.go ├── doc └── test_result.md ├── encoding.go ├── go.mod ├── go.sum ├── load.go ├── regex_test.go ├── test ├── benchmark.py ├── benchmark_test.go ├── test.txt ├── token_num.go └── token_num.py ├── tiktoken.go └── tiktoken_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 ImmortalFog 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tiktoken-go 2 | [简体中文](./README_zh-hans.md) 3 | 4 | OpenAI's tiktoken in Go. 5 | 6 | Tiktoken is a fast BPE tokeniser for use with OpenAI's models. 7 | 8 | This is a port of the original [tiktoken](https://github.com/openai/tiktoken). 9 | 10 | # Usage 11 | ## Install 12 | 13 | ```bash 14 | go get github.com/linux-do/tiktoken-go 15 | ``` 16 | ## Cache 17 | Tiktoken-go has the same cache mechanism as the original Tiktoken library. 18 | 19 | You can set the cache directory by using the environment variable TIKTOKEN_CACHE_DIR. 20 | 21 | Once this variable is set, tiktoken-go will use this directory to cache the token dictionary. 22 | 23 | If you don't set this environment variable, tiktoken-go will download the dictionary each time you initialize an encoding for the first time. 24 | 25 | ## Alternative BPE loaders 26 | If you don't want to use cache or download the dictionary each time, you can use alternative BPE loader. 27 | 28 | Just call `tiktoken.SetBpeLoader` before calling `tiktoken.GetEncoding` or `tiktoken.EncodingForModel`. 29 | 30 | `BpeLoader` is an interface, you can implement your own BPE loader by implementing this interface. 31 | 32 | ### Offline BPE loader 33 | The offline BPE loader loads the BPE dictionary from embed files, it helps if you don't want to download the dictionary at runtime. 34 | 35 | Due to the size of the BPE dictionary, this loader is in other project. 36 | 37 | Include if you require this loader: [tiktoken_loader](https://github.com/pkoukk/tiktoken-go-loader) 38 | 39 | ## Examples 40 | ### Get Token By Encoding 41 | 42 | ```go 43 | package main 44 | 45 | import ( 46 | "fmt" 47 | "github.com/linux-do/tiktoken-go" 48 | ) 49 | 50 | func main() { 51 | text := "Hello, world!" 
52 | encoding := "cl100k_base" 53 | 54 | // if you don't want download dictionary at runtime, you can use offline loader 55 | // tiktoken.SetBpeLoader(tiktoken_loader.NewOfflineLoader()) 56 | tke, err := tiktoken.GetEncoding(encoding) 57 | if err != nil { 58 | err = fmt.Errorf("getEncoding: %v", err) 59 | return 60 | } 61 | 62 | // encode 63 | token := tke.Encode(text, nil, nil) 64 | 65 | //tokens 66 | fmt.Println((token)) 67 | // num_tokens 68 | fmt.Println(len(token)) 69 | } 70 | ``` 71 | 72 | ### Get Token By Model 73 | 74 | ```go 75 | package main 76 | 77 | import ( 78 | "fmt" 79 | "github.com/linux-do/tiktoken-go" 80 | ) 81 | 82 | func main() { 83 | text := "Hello, world!" 84 | encoding := "gpt-3.5-turbo" 85 | 86 | tkm, err := tiktoken.EncodingForModel(encoding) 87 | if err != nil { 88 | err = fmt.Errorf("getEncoding: %v", err) 89 | return 90 | } 91 | 92 | // encode 93 | token := tkm.Encode(text, nil, nil) 94 | 95 | // tokens 96 | fmt.Println(token) 97 | // num_tokens 98 | fmt.Println(len(token)) 99 | } 100 | ``` 101 | 102 | ### Counting Tokens For Chat API Calls 103 | Below is an example function for counting tokens for messages passed to gpt-3.5-turbo or gpt-4. 104 | 105 | The following code was written based on [openai-cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) examples at `Wednesday, 28 June 2023`. 106 | 107 | Please note that the token calculation method for the message may change at any time, so this code may not necessarily be applicable in the future. 108 | 109 | If you need accurate calculation, please refer to the official documentation. 110 | 111 | If you find that this code is no longer applicable, please feel free to submit a PR or Issue. 
112 | 113 | 114 | ```go 115 | package main 116 | 117 | import ( 118 | "fmt"; "log"; "strings" 119 | 120 | "github.com/linux-do/tiktoken-go" 121 | "github.com/sashabaranov/go-openai" 122 | ) 123 | 124 | // OpenAI Cookbook: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb 125 | func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string) (numTokens int) { 126 | tkm, err := tiktoken.EncodingForModel(model) 127 | if err != nil { 128 | err = fmt.Errorf("encoding for model: %v", err) 129 | log.Println(err) 130 | return 131 | } 132 | 133 | var tokensPerMessage, tokensPerName int 134 | switch model { 135 | case "gpt-3.5-turbo-0613", 136 | "gpt-3.5-turbo-16k-0613", 137 | "gpt-4-0314", 138 | "gpt-4-32k-0314", 139 | "gpt-4-0613", 140 | "gpt-4-32k-0613": 141 | tokensPerMessage = 3 142 | tokensPerName = 1 143 | case "gpt-3.5-turbo-0301": 144 | tokensPerMessage = 4 // every message follows <|start|>{role/name}\n{content}<|end|>\n 145 | tokensPerName = -1 // if there's a name, the role is omitted 146 | default: 147 | if strings.Contains(model, "gpt-3.5-turbo") { 148 | log.Println("warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.") 149 | return NumTokensFromMessages(messages, "gpt-3.5-turbo-0613") 150 | } else if strings.Contains(model, "gpt-4") { 151 | log.Println("warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.") 152 | return NumTokensFromMessages(messages, "gpt-4-0613") 153 | } else { 154 | err = fmt.Errorf("num_tokens_from_messages() is not implemented for model %s. 
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.", model) 155 | log.Println(err) 156 | return 157 | } 158 | } 159 | 160 | for _, message := range messages { 161 | numTokens += tokensPerMessage 162 | numTokens += len(tkm.Encode(message.Content, nil, nil)) 163 | numTokens += len(tkm.Encode(message.Role, nil, nil)) 164 | numTokens += len(tkm.Encode(message.Name, nil, nil)) 165 | if message.Name != "" { 166 | numTokens += tokensPerName 167 | } 168 | } 169 | numTokens += 3 // every reply is primed with <|start|>assistant<|message|> 170 | return numTokens 171 | } 172 | 173 | ``` 174 | 175 | 176 | # Available Encodings 177 | | Encoding name | OpenAI models | 178 | |-------------------------|------------------------------------------------------| 179 | | `o200k_base` | `gpt-4o` | 180 | | `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002` | 181 | | `cl100k_base` | `text-embedding-3-large`, `text-embedding-3-small` | 182 | | `p50k_base` | Codex models, `text-davinci-002`, `text-davinci-003` | 183 | | `r50k_base` (or `gpt2`) | GPT-3 models like `davinci` | 184 | 185 | 186 | 187 | # Available Models 188 | | Model name | OpenAI models | 189 | |------------------------------| ------------- | 190 | | gpt-4o-* | o200k_base | 191 | | gpt-4-* | cl100k_base | 192 | | gpt-3.5-turbo-* | cl100k_base | 193 | | gpt-4 | cl100k_base | 194 | | gpt-3.5-turbo | cl100k_base | 195 | | text-davinci-003 | p50k_base | 196 | | text-davinci-002 | p50k_base | 197 | | text-davinci-001 | r50k_base | 198 | | text-curie-001 | r50k_base | 199 | | text-babbage-001 | r50k_base | 200 | | text-ada-001 | r50k_base | 201 | | davinci | r50k_base | 202 | | curie | r50k_base | 203 | | babbage | r50k_base | 204 | | ada | r50k_base | 205 | | code-davinci-002 | p50k_base | 206 | | code-davinci-001 | p50k_base | 207 | | code-cushman-002 | p50k_base | 208 | | code-cushman-001 | p50k_base | 209 | | davinci-codex | p50k_base | 210 | | 
cushman-codex | p50k_base | 211 | | text-davinci-edit-001 | p50k_edit | 212 | | code-davinci-edit-001 | p50k_edit | 213 | | text-embedding-ada-002 | cl100k_base | 214 | | text-embedding-3-small | cl100k_base | 215 | | text-embedding-3-large | cl100k_base | 216 | | text-similarity-davinci-001 | r50k_base | 217 | | text-similarity-curie-001 | r50k_base | 218 | | text-similarity-babbage-001 | r50k_base | 219 | | text-similarity-ada-001 | r50k_base | 220 | | text-search-davinci-doc-001 | r50k_base | 221 | | text-search-curie-doc-001 | r50k_base | 222 | | text-search-babbage-doc-001 | r50k_base | 223 | | text-search-ada-doc-001 | r50k_base | 224 | | code-search-babbage-code-001 | r50k_base | 225 | | code-search-ada-code-001 | r50k_base | 226 | | gpt2 | gpt2 | 227 | 228 | 229 | 230 | # Test 231 | > you can run test in [test](./test) folder 232 | 233 | ## compare with original [tiktoken](https://github.com/openai/tiktoken) 234 | 235 | ## get token by encoding 236 | [result](./doc/test_result.md#encoding-test-result) 237 | 238 | ## get token by model 239 | [result](./doc/test_result.md#model-test-result) 240 | 241 | 242 | 243 | # Benchmark 244 | > you can run benchmark in [test](./test) folder 245 | 246 | ## Benchmark result 247 | | name | time/op | os | cpu | text | times | 248 | | ----------- | ------- | ---------- | -------- | -------------------------------- | ------ | 249 | | tiktoken-go | 8795ns | macOS 13.2 | Apple M1 | [UDHR](https://unicode.org/udhr) | 100000 | 250 | | tiktoken | 8838ns | macOS 13.2 | Apple M1 | [UDHR](https://unicode.org/udhr) | 100000 | 251 | 252 | It looks like the performance is almost the same. 253 | 254 | Maybe the difference is due to the difference in the performance of the machine. 255 | 256 | Or maybe my benchmark method is not appropriate. 257 | 258 | If you have better benchmark method or if you want add your benchmark result, please feel free to submit a PR. 
259 | 260 | # License 261 | [MIT](./LICENSE) 262 | -------------------------------------------------------------------------------- /README_zh-hans.md: -------------------------------------------------------------------------------- 1 | # tiktoken-go 2 | Go 语言版本的 OpenAI 的 tiktoken。 3 | 帮你把文本转换成 OpenAI 的模型可以识别的 token。 4 | tiktoken的原项目地址[tiktoken](https://github.com/openai/tiktoken). 5 | 6 | # 用法 7 | 8 | ## 安装 9 | 10 | ```bash 11 | go get github.com/linux-do/tiktoken-go 12 | ``` 13 | ## 缓存 14 | Tiktoken-go 和原始的 Tiktoken 库一样,具有相同的缓存机制。 15 | 16 | 您可以使用环境变量 TIKTOKEN_CACHE_DIR 来设置缓存目录。 17 | 18 | 一旦设置了该变量,tiktoken-go 将使用该目录来缓存令牌字典。 19 | 20 | 如果您未设置此环境变量,则 tiktoken-go 将在每次首次初始化编码时下载字典。 21 | 22 | 23 | ## 替代 BPE 加载器 24 | 默认情况下,tiktoken-go 会在运行时下载字典,如果您不想使用缓存或每次下载字典,您可以使用替代 BPE 加载器。 25 | 26 | 只需在调用 `tiktoken.GetEncoding` 或 `tiktoken.EncodingForModel` 之前调用 `tiktoken.SetBpeLoader`。 27 | 28 | `BpeLoader` 是一个接口,您可以通过实现此接口来实现自己的 BPE 加载器。 29 | 30 | ### 离线 BPE 加载器 31 | 离线 BPE 加载器从嵌入文件加载 BPE 字典。 32 | 33 | 由于 BPE 字典的文件较大,不适合包含在本项目中,故此加载器在其他项目中。 34 | 35 | 如果需要使用,请引用:[tiktoken_loader](https://github.com/pkoukk/tiktoken-go-loader) 36 | 37 | ## 例子 38 | 39 | ### Get Token By Encoding 40 | 41 | ```go 42 | package main 43 | 44 | import ( 45 | "fmt" 46 | "github.com/linux-do/tiktoken-go" 47 | ) 48 | 49 | func main() { 50 | text := "Hello, world!" 51 | encoding := "cl100k_base" 52 | 53 | // 如果你不想在运行时下载字典,你可以使用离线加载器 54 | // tiktoken.SetBpeLoader(tiktoken_loader.NewOfflineLoader()) 55 | tke, err := tiktoken.GetEncoding(encoding) 56 | if err != nil { 57 | err = fmt.Errorf("getEncoding: %v", err) 58 | return 59 | } 60 | 61 | // encode 62 | token := tke.Encode(text, nil, nil) 63 | 64 | //tokens 65 | fmt.Println((token)) 66 | // num_tokens 67 | fmt.Println(len(token)) 68 | } 69 | ``` 70 | 71 | ### get token by Model 72 | 73 | ```go 74 | package main 75 | 76 | import ( 77 | "fmt" 78 | "github.com/linux-do/tiktoken-go" 79 | ) 80 | 81 | func main() { 82 | text := "Hello, world!" 
83 | encoding := "gpt-3.5-turbo" 84 | 85 | tkm, err := tiktoken.EncodingForModel(encoding) 86 | if err != nil { 87 | err = fmt.Errorf("getEncoding: %v", err) 88 | return 89 | } 90 | 91 | // encode 92 | token := tkm.Encode(text, nil, nil) 93 | 94 | // tokens 95 | fmt.Println(token) 96 | // num_tokens 97 | fmt.Println(len(token)) 98 | } 99 | ``` 100 | 101 | ### 计算chat API消息当中的token消耗 102 | 这段代码根据[官方示例](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb)编写 103 | 104 | 编写时间: `2023-06-28` 105 | 106 | 请注意,消息的token计算方式可能随时会发生改变,以下代码并不一定在将来适用,如果您需要精确的计算,请关注官方文档。 107 | 108 | 如果您发现这段代码不再适用,欢迎您提PR或Issue。 109 | 110 | ```go 111 | package main 112 | 113 | import ( 114 | "fmt"; "log"; "strings" 115 | 116 | "github.com/linux-do/tiktoken-go" 117 | "github.com/sashabaranov/go-openai" 118 | ) 119 | 120 | func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string) (numTokens int) { 121 | tkm, err := tiktoken.EncodingForModel(model) 122 | if err != nil { 123 | err = fmt.Errorf("encoding for model: %v", err) 124 | log.Println(err) 125 | return 126 | } 127 | 128 | var tokensPerMessage, tokensPerName int 129 | switch model { 130 | case "gpt-3.5-turbo-0613", 131 | "gpt-3.5-turbo-16k-0613", 132 | "gpt-4-0314", 133 | "gpt-4-32k-0314", 134 | "gpt-4-0613", 135 | "gpt-4-32k-0613": 136 | tokensPerMessage = 3 137 | tokensPerName = 1 138 | case "gpt-3.5-turbo-0301": 139 | tokensPerMessage = 4 // every message follows <|start|>{role/name}\n{content}<|end|>\n 140 | tokensPerName = -1 // if there's a name, the role is omitted 141 | default: 142 | if strings.Contains(model, "gpt-3.5-turbo") { 143 | log.Println("warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.") 144 | return NumTokensFromMessages(messages, "gpt-3.5-turbo-0613") 145 | } else if strings.Contains(model, "gpt-4") { 146 | log.Println("warning: gpt-4 may update over time. 
Returning num tokens assuming gpt-4-0613.") 147 | return NumTokensFromMessages(messages, "gpt-4-0613") 148 | } else { 149 | err = fmt.Errorf("num_tokens_from_messages() is not implemented for model %s. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.", model) 150 | log.Println(err) 151 | return 152 | } 153 | } 154 | 155 | for _, message := range messages { 156 | numTokens += tokensPerMessage 157 | numTokens += len(tkm.Encode(message.Content, nil, nil)) 158 | numTokens += len(tkm.Encode(message.Role, nil, nil)) 159 | numTokens += len(tkm.Encode(message.Name, nil, nil)) 160 | if message.Name != "" { 161 | numTokens += tokensPerName 162 | } 163 | } 164 | numTokens += 3 // every reply is primed with <|start|>assistant<|message|> 165 | return numTokens 166 | } 167 | ``` 168 | 169 | # Available Encodings 170 | | Encoding name | OpenAI models | 171 | |-------------------------|------------------------------------------------------| 172 | | `o200k_base` | `gpt-4o` | 173 | | `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002` | 174 | | `cl100k_base` | `text-embedding-3-large`, `text-embedding-3-small` | 175 | | `p50k_base` | Codex models, `text-davinci-002`, `text-davinci-003` | 176 | | `r50k_base` (or `gpt2`) | GPT-3 models like `davinci` | 177 | 178 | 179 | # Available Models 180 | | Model name | OpenAI models | 181 | |------------------------------| ------------- | 182 | | gpt-4o-* | o200k_base | 183 | | gpt-4-* | cl100k_base | 184 | | gpt-3.5-turbo-* | cl100k_base | 185 | | gpt-4 | cl100k_base | 186 | | gpt-3.5-turbo | cl100k_base | 187 | | text-davinci-003 | p50k_base | 188 | | text-davinci-002 | p50k_base | 189 | | text-davinci-001 | r50k_base | 190 | | text-curie-001 | r50k_base | 191 | | text-babbage-001 | r50k_base | 192 | | text-ada-001 | r50k_base | 193 | | davinci | r50k_base | 194 | | curie | r50k_base | 195 | | babbage | r50k_base | 196 | | ada | r50k_base | 197 | | 
code-davinci-002 | p50k_base | 198 | | code-davinci-001 | p50k_base | 199 | | code-cushman-002 | p50k_base | 200 | | code-cushman-001 | p50k_base | 201 | | davinci-codex | p50k_base | 202 | | cushman-codex | p50k_base | 203 | | text-davinci-edit-001 | p50k_edit | 204 | | code-davinci-edit-001 | p50k_edit | 205 | | text-embedding-ada-002 | cl100k_base | 206 | | text-embedding-3-small | cl100k_base | 207 | | text-embedding-3-large | cl100k_base | 208 | | text-similarity-davinci-001 | r50k_base | 209 | | text-similarity-curie-001 | r50k_base | 210 | | text-similarity-babbage-001 | r50k_base | 211 | | text-similarity-ada-001 | r50k_base | 212 | | text-search-davinci-doc-001 | r50k_base | 213 | | text-search-curie-doc-001 | r50k_base | 214 | | text-search-babbage-doc-001 | r50k_base | 215 | | text-search-ada-doc-001 | r50k_base | 216 | | code-search-babbage-code-001 | r50k_base | 217 | | code-search-ada-code-001 | r50k_base | 218 | | gpt2 | gpt2 | 219 | 220 | # 与官方 [tiktoken](https://github.com/openai/tiktoken) 的对比 221 | 222 | ## get token by encoding 223 | [测试结果](./doc/test_result.md#encoding-test-result) 224 | 225 | ## get token by model 226 | [测试结果](./doc/test_result.md#model-test-result) 227 | 228 | # Benchmark 229 | > 你可以使用 [test](./test) 目录下的文件执行基准测试。 230 | 231 | ## Benchmark result 232 | | name | time/op | os | cpu | text | times | 233 | | ----------- | ------- | ---------- | -------- | -------------------------------- | ------ | 234 | | tiktoken-go | 8795ns | macOS 13.2 | Apple M1 | [UDHR](https://unicode.org/udhr) | 100000 | 235 | | tiktoken | 8838ns | macOS 13.2 | Apple M1 | [UDHR](https://unicode.org/udhr) | 100000 | 236 | 237 | 看上去tiktoken-go的性能基本与原tiktoken一致。 238 | 239 | 也许在不同的机器上的测试结果会有所不同。也可能是我的测试方法并不恰当。 240 | 241 | 如果你有更好的测试方法,或者说你想添加在你机器上的测试结果,欢迎提PR。 242 | 243 | # License 244 | [MIT](./LICENSE) 245 | -------------------------------------------------------------------------------- /benchmark_test.go: 
-------------------------------------------------------------------------------- 1 | package tiktoken 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "strings" 7 | "testing" 8 | ) 9 | 10 | const TEST_FILE = "test/test.txt" 11 | 12 | func ReadTestFile() ([]byte, error) { 13 | // open and read TEST_FILE 14 | return os.ReadFile(TEST_FILE) 15 | } 16 | 17 | func BenchmarkEncoding(b *testing.B) { 18 | fileContent, err := ReadTestFile() 19 | if err != nil { 20 | panic(err) 21 | } 22 | 23 | tkm, err := EncodingForModel("gpt-4") 24 | if err != nil { 25 | panic(err) 26 | } 27 | 28 | text := string(fileContent) 29 | 30 | for ordersOfMagnitude := 0; ordersOfMagnitude < 4; ordersOfMagnitude++ { 31 | // do actual encoding 32 | fmt.Printf("Encoding %d bytes\n", len(text)) 33 | tkm.Encode(text, nil, nil) 34 | 35 | stringBuilder := strings.Builder{} 36 | for i := 0; i < 10; i++ { 37 | stringBuilder.WriteString(text) 38 | } 39 | 40 | text = stringBuilder.String() 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /bpe.go: -------------------------------------------------------------------------------- 1 | package tiktoken 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | func bytePairMerge[T any](piece []byte, ranks map[string]int, f func(start, end int) T) []T { 8 | parts := make([][2]int, len(piece)+1) 9 | for i := 0; i < len(parts); i++ { 10 | parts[i][0], parts[i][1] = i, math.MaxInt // use max int as sentinel 11 | } 12 | 13 | getRank := func(startIdx, skip int) int { 14 | if startIdx+skip+2 < len(parts) { 15 | b := piece[parts[startIdx][0]:parts[startIdx+skip+2][0]] 16 | rank, ok := ranks[string(b)] 17 | if ok { 18 | return rank 19 | } 20 | } 21 | return -1 // use -1 to represent None 22 | } 23 | 24 | for i := 0; i < len(parts)-2; i++ { 25 | if rank := getRank(i, 0); rank >= 0 { 26 | parts[i][1] = rank 27 | } 28 | } 29 | 30 | for len(parts) > 1 { 31 | minRank, minIdx := math.MaxInt, -1 32 | for i := 0; i < len(parts)-1; i++ { 33 | if 
parts[i][1] < minRank { 34 | minRank, minIdx = parts[i][1], i 35 | } 36 | } 37 | 38 | if minRank < math.MaxInt { 39 | i := minIdx 40 | rank := getRank(i, 1) 41 | if rank >= 0 { 42 | parts[i][1] = rank 43 | } else { 44 | parts[i][1] = math.MaxInt 45 | } 46 | if i > 0 { 47 | rk := getRank(i-1, 1) 48 | if rk >= 0 { 49 | parts[i-1][1] = rk 50 | } else { 51 | parts[i-1][1] = math.MaxInt 52 | } 53 | } 54 | parts = append(parts[:i+1], parts[i+2:]...) 55 | } else { 56 | break 57 | } 58 | } 59 | 60 | out := make([]T, len(parts)-1) 61 | for i := 0; i < len(out); i++ { 62 | out[i] = f(parts[i][0], parts[i+1][0]) 63 | } 64 | return out 65 | } 66 | 67 | func bytePairEncode(piece []byte, ranks map[string]int) []int { 68 | if len(piece) == 1 { 69 | v := ranks[string(piece)] 70 | return []int{v} 71 | } 72 | return bytePairMerge(piece, ranks, func(start, end int) int { 73 | return ranks[string(piece[start:end])] 74 | }) 75 | } 76 | -------------------------------------------------------------------------------- /core_bpe.go: -------------------------------------------------------------------------------- 1 | package tiktoken 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "fmt" 7 | "regexp" 8 | "sort" 9 | "strings" 10 | 11 | "github.com/dlclark/regexp2" 12 | ) 13 | 14 | type CoreBPE struct { 15 | encoder map[string]int 16 | decoder map[int]string 17 | specialTokensEncoder map[string]int 18 | specialTokensDecoder map[int]string 19 | tlRegex *regexp2.Regexp 20 | tlSpecialRegex *regexp2.Regexp 21 | sortedTokenBytes [][]byte 22 | } 23 | 24 | func NewCoreBPE(encoder map[string]int, specialTokensEncoder map[string]int, pattern string) (*CoreBPE, error) { 25 | regex, err := regexp2.Compile(pattern, regexp2.None) 26 | if err != nil { 27 | return nil, fmt.Errorf("error compiling regex: %s", err) 28 | } 29 | 30 | specialRegexStrs := make([]string, 0, len(specialTokensEncoder)) 31 | for k := range specialTokensEncoder { 32 | specialRegexStrs = append(specialRegexStrs, regexp.QuoteMeta(k)) 33 | 
} 34 | specialRegex, err := regexp2.Compile(strings.Join(specialRegexStrs, "|"), regexp2.None) 35 | if err != nil { 36 | return nil, fmt.Errorf("error compiling special regex: %s", err) 37 | } 38 | 39 | decoder := make(map[int]string, len(encoder)) 40 | for k, v := range encoder { 41 | decoder[v] = k 42 | } 43 | 44 | if len(encoder) != len(decoder) { 45 | return nil, errors.New("encoder and decoder map sizes are different") 46 | } 47 | 48 | specialTokensDecoder := make(map[int]string, len(specialTokensEncoder)) 49 | for k, v := range specialTokensEncoder { 50 | specialTokensDecoder[v] = k 51 | } 52 | 53 | sortedTokenBytes := make([][]byte, 0, len(encoder)) 54 | for k := range encoder { 55 | sortedTokenBytes = append(sortedTokenBytes, []byte(k)) 56 | } 57 | sort.Slice(sortedTokenBytes, func(i, j int) bool { 58 | return bytes.Compare(sortedTokenBytes[i], sortedTokenBytes[j]) < 0 59 | }) 60 | 61 | return &CoreBPE{ 62 | encoder: encoder, 63 | specialTokensEncoder: specialTokensEncoder, 64 | decoder: decoder, 65 | specialTokensDecoder: specialTokensDecoder, 66 | tlRegex: regex, 67 | tlSpecialRegex: specialRegex, 68 | sortedTokenBytes: sortedTokenBytes, 69 | }, nil 70 | } 71 | 72 | func (bp *CoreBPE) encodeNative(text string, allowedSpecial map[string]any) ([]int, int) { 73 | specialRegex := bp.tlSpecialRegex 74 | regex := bp.tlRegex 75 | ret := []int{} 76 | lastPieceTokenLen := 0 77 | textRunes := []rune(text) 78 | 79 | start := 0 80 | for { 81 | var nextSpecial []int 82 | startFind := start 83 | for { 84 | // Find the next allowed special token, if any 85 | temp := cutRunes(textRunes, startFind, len(textRunes)) 86 | nextSpecial = findRegex2StringIndex(temp, specialRegex) 87 | if nextSpecial != nil { 88 | token := cutRunes(textRunes, startFind+nextSpecial[0], startFind+nextSpecial[1]) 89 | if _, ok := allowedSpecial[token]; ok { 90 | break 91 | } 92 | startFind += nextSpecial[1] 93 | } else { 94 | break 95 | } 96 | } 97 | 98 | end := len([]rune(text)) 99 | if 
nextSpecial != nil { 100 | end = start + nextSpecial[0] 101 | } 102 | 103 | // Okay, here we go, compare this logic to _encode_ordinary_native 104 | for _, mat := range findRegex2AllStringMatchIndex(cutRunes(textRunes, start, end), regex) { 105 | piece := cutRunes(textRunes, start+mat[0], start+mat[1]) 106 | if token, ok := bp.encoder[piece]; ok { 107 | lastPieceTokenLen = 1 108 | ret = append(ret, token) 109 | continue 110 | } 111 | tokens := bytePairEncode([]byte(piece), bp.encoder) 112 | lastPieceTokenLen = len(tokens) 113 | ret = append(ret, tokens...) 114 | } 115 | 116 | if nextSpecial != nil { 117 | temp := cutRunes(textRunes, start+nextSpecial[0], start+nextSpecial[1]) 118 | token := bp.specialTokensEncoder[temp] 119 | ret = append(ret, token) 120 | start = start + nextSpecial[1] 121 | lastPieceTokenLen = 0 122 | } else { 123 | break 124 | } 125 | } 126 | 127 | return ret, lastPieceTokenLen 128 | } 129 | 130 | func (bp *CoreBPE) encodeOrdinaryNative(text string) []int { 131 | ret := []int{} 132 | textRunes := []rune(text) 133 | for _, mat := range findRegex2AllStringMatchIndex(text, bp.tlRegex) { 134 | piece := cutRunes(textRunes, mat[0], mat[1]) 135 | if token, ok := bp.encoder[piece]; ok { 136 | ret = append(ret, token) 137 | continue 138 | } 139 | tokens := bytePairEncode([]byte(piece), bp.encoder) 140 | ret = append(ret, tokens...) 141 | } 142 | return ret 143 | } 144 | 145 | func (bpe *CoreBPE) decodeNative(tokens []int) []byte { 146 | ret := make([]byte, 0, len(tokens)*2) 147 | for _, token := range tokens { 148 | tokenBytes, ok := bpe.decoder[token] 149 | if !ok { 150 | tokenBytes = bpe.specialTokensDecoder[token] 151 | } 152 | if len(tokenBytes) > 0 { 153 | ret = append(ret, tokenBytes...) 
154 | } 155 | } 156 | return ret 157 | } 158 | 159 | func findRegex2StringIndex(text string, reg *regexp2.Regexp) []int { 160 | m, _ := reg.FindStringMatch(text) 161 | if m == nil { 162 | return nil 163 | } 164 | result := make([]int, 2) 165 | result[0] = m.Index 166 | result[1] = m.Index + m.Length 167 | return result 168 | } 169 | 170 | func findRegex2AllStringMatchIndex(text string, reg *regexp2.Regexp) [][]int { 171 | var matches [][]int 172 | m, _ := reg.FindStringMatch(text) 173 | for m != nil { 174 | result := make([]int, 2) 175 | result[0] = m.Index 176 | result[1] = m.Index + m.Length 177 | matches = append(matches, result) 178 | m, _ = reg.FindNextMatch(m) 179 | } 180 | return matches 181 | } 182 | 183 | func cutRunes(runes []rune, start, end int) string { 184 | if start < 0 { 185 | start = 0 186 | } 187 | if end > len(runes) { 188 | end = len(runes) 189 | } 190 | return string(runes[start:end]) 191 | } 192 | -------------------------------------------------------------------------------- /doc/test_result.md: -------------------------------------------------------------------------------- 1 | # Encoding Test Result 2 | | python tiktoken | golang tiktoken-go | 3 | | :------------------------------------------------------- | :------------------------------------------------------- | 4 | | text: hallo world!, encoding: o200k_base, token: 4 | text: hallo world!, encoding: o200k_base, token: 4 | 5 | | text: hallo world!, encoding: cl100k_base, token: 4 | text: hallo world!, encoding: cl100k_base, token: 4 | 6 | | text: hallo world!, encoding: p50k_base, token: 4 | text: hallo world!, encoding: p50k_base, token: 4 | 7 | | text: hallo world!, encoding: r50k_base, token: 4 | text: hallo world!, encoding: r50k_base, token: 4 | 8 | | text: 你好世界!, encoding: o200k_base, token: 3 | text: 你好世界!, encoding: o200k_base, token: 3 | 9 | | text: 你好世界!, encoding: cl100k_base, token: 6 | text: 你好世界!, encoding: cl100k_base, token: 6 | 10 | | text: 你好世界!, encoding: p50k_base, 
token: 11 | text: 你好世界!, encoding: p50k_base, token: 11 | 11 | | text: 你好世界!, encoding: r50k_base, token: 11 | text: 你好世界!, encoding: r50k_base, token: 11 | 12 | | text: こんにちは世界!, encoding: o200k_base, token: 3 | text: こんにちは世界!, encoding: o200k_base, token: 3 | 13 | | text: こんにちは世界!, encoding: cl100k_base, token: 5 | text: こんにちは世界!, encoding: cl100k_base, token: 5 | 14 | | text: こんにちは世界!, encoding: p50k_base, token: 13 | text: こんにちは世界!, encoding: p50k_base, token: 13 | 15 | | text: こんにちは世界!, encoding: r50k_base, token: 13 | text: こんにちは世界!, encoding: r50k_base, token: 13 | 16 | | text: 안녕하세요 세계!, encoding: o200k_base, token: 4 | text: 안녕하세요 세계!, encoding: o200k_base, token: 4 | 17 | | text: 안녕하세요 세계!, encoding: cl100k_base, token: 10 | text: 안녕하세요 세계!, encoding: cl100k_base, token: 10 | 18 | | text: 안녕하세요 세계!, encoding: p50k_base, token: 21 | text: 안녕하세요 세계!, encoding: p50k_base, token: 21 | 19 | | text: 안녕하세요 세계!, encoding: r50k_base, token: 21 | text: 안녕하세요 세계!, encoding: r50k_base, token: 21 | 20 | | text: Привет мир!, encoding: o200k_base, token: 4 | text: Привет мир!, encoding: o200k_base, token: 4 | 21 | | text: Привет мир!, encoding: cl100k_base, token: 6 | text: Привет мир!, encoding: cl100k_base, token: 6 | 22 | | text: Привет мир!, encoding: p50k_base, token: 12 | text: Привет мир!, encoding: p50k_base, token: 12 | 23 | | text: Привет мир!, encoding: r50k_base, token: 12 | text: Привет мир!, encoding: r50k_base, token: 12 | 24 | | text: ¡Hola mundo!, encoding: o200k_base, token: 4 | text: ¡Hola mundo!, encoding: o200k_base, token: 4 | 25 | | text: ¡Hola mundo!, encoding: cl100k_base, token: 4 | text: ¡Hola mundo!, encoding: cl100k_base, token: 4 | 26 | | text: ¡Hola mundo!, encoding: p50k_base, token: 7 | text: ¡Hola mundo!, encoding: p50k_base, token: 7 | 27 | | text: ¡Hola mundo!, encoding: r50k_base, token: 7 | text: ¡Hola mundo!, encoding: r50k_base, token: 7 | 28 | | text: Hallo Welt!, encoding: o200k_base, token: 3 | text: Hallo Welt!, encoding: 
o200k_base, token: 3 | 29 | | text: Hallo Welt!, encoding: cl100k_base, token: 3 | text: Hallo Welt!, encoding: cl100k_base, token: 3 | 30 | | text: Hallo Welt!, encoding: p50k_base, token: 5 | text: Hallo Welt!, encoding: p50k_base, token: 5 | 31 | | text: Hallo Welt!, encoding: r50k_base, token: 5 | text: Hallo Welt!, encoding: r50k_base, token: 5 | 32 | | text: Bonjour le monde!, encoding: o200k_base, token: 4 | text: Bonjour le monde!, encoding: o200k_base, token: 4 | 33 | | text: Bonjour le monde!, encoding: cl100k_base, token: 4 | text: Bonjour le monde!, encoding: cl100k_base, token: 4 | 34 | | text: Bonjour le monde!, encoding: p50k_base, token: 7 | text: Bonjour le monde!, encoding: p50k_base, token: 7 | 35 | | text: Bonjour le monde!, encoding: r50k_base, token: 7 | text: Bonjour le monde!, encoding: r50k_base, token: 7 | 36 | | text: Ciao mondo!, encoding: o200k_base, token: 4 | text: Ciao mondo!, encoding: o200k_base, token: 4 | 37 | | text: Ciao mondo!, encoding: cl100k_base, token: 4 | text: Ciao mondo!, encoding: cl100k_base, token: 4 | 38 | | text: Ciao mondo!, encoding: p50k_base, token: 5 | text: Ciao mondo!, encoding: p50k_base, token: 5 | 39 | | text: Ciao mondo!, encoding: r50k_base, token: 5 | text: Ciao mondo!, encoding: r50k_base, token: 5 | 40 | | text: Hej världen!, encoding: o200k_base, token: 3 | text: Hej världen!, encoding: o200k_base, token: 3 | 41 | | text: Hej världen!, encoding: cl100k_base, token: 7 | text: Hej världen!, encoding: cl100k_base, token: 7 | 42 | | text: Hej världen!, encoding: p50k_base, token: 8 | text: Hej världen!, encoding: p50k_base, token: 8 | 43 | | text: Hej världen!, encoding: r50k_base, token: 8 | text: Hej världen!, encoding: r50k_base, token: 8 | 44 | | text: Hallo wereld!, encoding: o200k_base, token: 3 | text: Hallo wereld!, encoding: o200k_base, token: 3 | 45 | | text: Hallo wereld!, encoding: cl100k_base, token: 3 | text: Hallo wereld!, encoding: cl100k_base, token: 3 | 46 | | text: Hallo wereld!, 
encoding: p50k_base, token: 5 | text: Hallo wereld!, encoding: p50k_base, token: 5 | 47 | | text: Hallo wereld!, encoding: r50k_base, token: 5 | text: Hallo wereld!, encoding: r50k_base, token: 5 | 48 | | text: Hallo verden!, encoding: cl100k_base, token: 4 | text: Hallo verden!, encoding: cl100k_base, token: 4 | 49 | | text: Hallo verden!, encoding: p50k_base, token: 5 | text: Hallo verden!, encoding: p50k_base, token: 5 | 50 | | text: Hallo verden!, encoding: r50k_base, token: 5 | text: Hallo verden!, encoding: r50k_base, token: 5 | 51 | | text: Hallo wereld!, encoding: o200k_base, token: 3 | text: Hallo wereld!, encoding: o200k_base, token: 3 | 52 | | text: Hallo wereld!, encoding: cl100k_base, token: 3 | text: Hallo wereld!, encoding: cl100k_base, token: 3 | 53 | | text: Hallo wereld!, encoding: p50k_base, token: 5 | text: Hallo wereld!, encoding: p50k_base, token: 5 | 54 | | text: Hallo wereld!, encoding: r50k_base, token: 5 | text: Hallo wereld!, encoding: r50k_base, token: 5 | 55 | | text: Hallo verden!, encoding: cl100k_base, token: 4 | text: Hallo verden!, encoding: cl100k_base, token: 4 | 56 | | text: Hallo verden!, encoding: p50k_base, token: 5 | text: Hallo verden!, encoding: p50k_base, token: 5 | 57 | | text: Hallo verden!, encoding: r50k_base, token: 5 | text: Hallo verden!, encoding: r50k_base, token: 5 | 58 | 59 | # Model Test Result 60 | | python tiktoken | golang tiktoken-go | 61 | | --------------------------------------------------------------------- | --------------------------------------------------------------------- | 62 | | text: hallo world!, model: gpt-4, token: 4 | text: hallo world!, model: gpt-4, token: 4 | 63 | | text: hallo world!, model: gpt-3.5-turbo, token: 4 | text: hallo world!, model: gpt-3.5-turbo, token: 4 | 64 | | text: hallo world!, model: text-davinci-003, token: 4 | text: hallo world!, model: text-davinci-003, token: 4 | 65 | | text: hallo world!, model: text-davinci-002, token: 4 | text: hallo world!, model: 
text-davinci-002, token: 4 | 66 | | text: hallo world!, model: text-davinci-001, token: 4 | text: hallo world!, model: text-davinci-001, token: 4 | 67 | | text: hallo world!, model: text-curie-001, token: 4 | text: hallo world!, model: text-curie-001, token: 4 | 68 | | text: hallo world!, model: text-babbage-001, token: 4 | text: hallo world!, model: text-babbage-001, token: 4 | 69 | | text: hallo world!, model: text-ada-001, token: 4 | text: hallo world!, model: text-ada-001, token: 4 | 70 | | text: hallo world!, model: davinci, token: 4 | text: hallo world!, model: davinci, token: 4 | 71 | | text: hallo world!, model: curie, token: 4 | text: hallo world!, model: curie, token: 4 | 72 | | text: hallo world!, model: babbage, token: 4 | text: hallo world!, model: babbage, token: 4 | 73 | | text: hallo world!, model: ada, token: 4 | text: hallo world!, model: ada, token: 4 | 74 | | text: hallo world!, model: code-davinci-002, token: 4 | text: hallo world!, model: code-davinci-002, token: 4 | 75 | | text: hallo world!, model: code-davinci-001, token: 4 | text: hallo world!, model: code-davinci-001, token: 4 | 76 | | text: hallo world!, model: code-cushman-002, token: 4 | text: hallo world!, model: code-cushman-002, token: 4 | 77 | | text: hallo world!, model: code-cushman-001, token: 4 | text: hallo world!, model: code-cushman-001, token: 4 | 78 | | text: hallo world!, model: davinci-codex, token: 4 | text: hallo world!, model: davinci-codex, token: 4 | 79 | | text: hallo world!, model: cushman-codex, token: 4 | text: hallo world!, model: cushman-codex, token: 4 | 80 | | text: hallo world!, model: text-davinci-edit-001, token: 4 | text: hallo world!, model: text-davinci-edit-001, token: 4 | 81 | | text: hallo world!, model: code-davinci-edit-001, token: 4 | text: hallo world!, model: code-davinci-edit-001, token: 4 | 82 | | text: hallo world!, model: text-embedding-ada-002, token: 4 | text: hallo world!, model: text-embedding-ada-002, token: 4 | 83 | | text: hallo 
world!, model: text-similarity-davinci-001, token: 4 | text: hallo world!, model: text-similarity-davinci-001, token: 4 | 84 | | text: 你好世界!, model: gpt-4, token: 6 | text: 你好世界!, model: gpt-4, token: 6 | 85 | | text: 你好世界!, model: gpt-3.5-turbo, token: 6 | text: 你好世界!, model: gpt-3.5-turbo, token: 6 | 86 | | text: 你好世界!, model: text-davinci-003, token: 11 | text: 你好世界!, model: text-davinci-003, token: 11 | 87 | | text: 你好世界!, model: text-davinci-002, token: 11 | text: 你好世界!, model: text-davinci-002, token: 11 | 88 | | text: 你好世界!, model: text-davinci-001, token: 11 | text: 你好世界!, model: text-davinci-001, token: 11 | 89 | | text: 你好世界!, model: text-curie-001, token: 11 | text: 你好世界!, model: text-curie-001, token: 11 | 90 | | text: 你好世界!, model: text-babbage-001, token: 11 | text: 你好世界!, model: text-babbage-001, token: 11 | 91 | | text: 你好世界!, model: text-ada-001, token: 11 | text: 你好世界!, model: text-ada-001, token: 11 | 92 | | text: 你好世界!, model: davinci, token: 11 | text: 你好世界!, model: davinci, token: 11 | 93 | | text: 你好世界!, model: curie, token: 11 | text: 你好世界!, model: curie, token: 11 | 94 | | text: 你好世界!, model: babbage, token: 11 | text: 你好世界!, model: babbage, token: 11 | 95 | | text: 你好世界!, model: ada, token: 11 | text: 你好世界!, model: ada, token: 11 | 96 | | text: 你好世界!, model: code-davinci-002, token: 11 | text: 你好世界!, model: code-davinci-002, token: 11 | 97 | | text: 你好世界!, model: code-davinci-001, token: 11 | text: 你好世界!, model: code-davinci-001, token: 11 | 98 | | text: 你好世界!, model: code-cushman-002, token: 11 | text: 你好世界!, model: code-cushman-002, token: 11 | 99 | | text: 你好世界!, model: code-cushman-001, token: 11 | text: 你好世界!, model: code-cushman-001, token: 11 | 100 | | text: 你好世界!, model: davinci-codex, token: 11 | text: 你好世界!, model: davinci-codex, token: 11 | 101 | | text: 你好世界!, model: cushman-codex, token: 11 | text: 你好世界!, model: cushman-codex, token: 11 | 102 | | text: 你好世界!, model: text-davinci-edit-001, token: 11 | text: 你好世界!, model: 
text-davinci-edit-001, token: 11 | 103 | | text: 你好世界!, model: code-davinci-edit-001, token: 11 | text: 你好世界!, model: code-davinci-edit-001, token: 11 | 104 | | text: 你好世界!, model: text-embedding-ada-002, token: 6 | text: 你好世界!, model: text-embedding-ada-002, token: 6 | 105 | | text: 你好世界!, model: text-similarity-davinci-001, token: 11 | text: 你好世界!, model: text-similarity-davinci-001, token: 11 | 106 | | text: こんにちは世界!, model: gpt-4, token: 5 | text: こんにちは世界!, model: gpt-4, token: 5 | 107 | | text: こんにちは世界!, model: gpt-3.5-turbo, token: 5 | text: こんにちは世界!, model: gpt-3.5-turbo, token: 5 | 108 | | text: こんにちは世界!, model: text-davinci-003, token: 13 | text: こんにちは世界!, model: text-davinci-003, token: 13 | 109 | | text: こんにちは世界!, model: text-davinci-002, token: 13 | text: こんにちは世界!, model: text-davinci-002, token: 13 | 110 | | text: こんにちは世界!, model: text-davinci-001, token: 13 | text: こんにちは世界!, model: text-davinci-001, token: 13 | 111 | | text: こんにちは世界!, model: text-curie-001, token: 13 | text: こんにちは世界!, model: text-curie-001, token: 13 | 112 | | text: こんにちは世界!, model: text-babbage-001, token: 13 | text: こんにちは世界!, model: text-babbage-001, token: 13 | 113 | | text: こんにちは世界!, model: text-ada-001, token: 13 | text: こんにちは世界!, model: text-ada-001, token: 13 | 114 | | text: こんにちは世界!, model: davinci, token: 13 | text: こんにちは世界!, model: davinci, token: 13 | 115 | | text: こんにちは世界!, model: curie, token: 13 | text: こんにちは世界!, model: curie, token: 13 | 116 | | text: こんにちは世界!, model: babbage, token: 13 | text: こんにちは世界!, model: babbage, token: 13 | 117 | | text: こんにちは世界!, model: ada, token: 13 | text: こんにちは世界!, model: ada, token: 13 | 118 | | text: こんにちは世界!, model: code-davinci-002, token: 13 | text: こんにちは世界!, model: code-davinci-002, token: 13 | 119 | | text: こんにちは世界!, model: code-davinci-001, token: 13 | text: こんにちは世界!, model: code-davinci-001, token: 13 | 120 | | text: こんにちは世界!, model: code-cushman-002, token: 13 | text: こんにちは世界!, model: code-cushman-002, token: 13 | 121 | | text: 
こんにちは世界!, model: code-cushman-001, token: 13 | text: こんにちは世界!, model: code-cushman-001, token: 13 | 122 | | text: こんにちは世界!, model: davinci-codex, token: 13 | text: こんにちは世界!, model: davinci-codex, token: 13 | 123 | | text: こんにちは世界!, model: cushman-codex, token: 13 | text: こんにちは世界!, model: cushman-codex, token: 13 | 124 | | text: こんにちは世界!, model: text-davinci-edit-001, token: 13 | text: こんにちは世界!, model: text-davinci-edit-001, token: 13 | 125 | | text: こんにちは世界!, model: code-davinci-edit-001, token: 13 | text: こんにちは世界!, model: code-davinci-edit-001, token: 13 | 126 | | text: こんにちは世界!, model: text-embedding-ada-002, token: 5 | text: こんにちは世界!, model: text-embedding-ada-002, token: 5 | 127 | | text: こんにちは世界!, model: text-similarity-davinci-001, token: 13 | text: こんにちは世界!, model: text-similarity-davinci-001, token: 13 | 128 | | text: 안녕하세요 세계!, model: gpt-4, token: 10 | text: 안녕하세요 세계!, model: gpt-4, token: 10 | 129 | | text: 안녕하세요 세계!, model: gpt-3.5-turbo, token: 10 | text: 안녕하세요 세계!, model: gpt-3.5-turbo, token: 10 | 130 | | text: 안녕하세요 세계!, model: text-davinci-003, token: 21 | text: 안녕하세요 세계!, model: text-davinci-003, token: 21 | 131 | | text: 안녕하세요 세계!, model: text-davinci-002, token: 21 | text: 안녕하세요 세계!, model: text-davinci-002, token: 21 | 132 | | text: 안녕하세요 세계!, model: text-davinci-001, token: 21 | text: 안녕하세요 세계!, model: text-davinci-001, token: 21 | 133 | | text: 안녕하세요 세계!, model: text-curie-001, token: 21 | text: 안녕하세요 세계!, model: text-curie-001, token: 21 | 134 | | text: 안녕하세요 세계!, model: text-babbage-001, token: 21 | text: 안녕하세요 세계!, model: text-babbage-001, token: 21 | 135 | | text: 안녕하세요 세계!, model: text-ada-001, token: 21 | text: 안녕하세요 세계!, model: text-ada-001, token: 21 | 136 | | text: 안녕하세요 세계!, model: davinci, token: 21 | text: 안녕하세요 세계!, model: davinci, token: 21 | 137 | | text: 안녕하세요 세계!, model: curie, token: 21 | text: 안녕하세요 세계!, model: curie, token: 21 | 138 | | text: 안녕하세요 세계!, model: babbage, token: 21 | text: 안녕하세요 세계!, model: babbage, token: 21 
| 139 | | text: 안녕하세요 세계!, model: ada, token: 21 | text: 안녕하세요 세계!, model: ada, token: 21 | 140 | | text: 안녕하세요 세계!, model: code-davinci-002, token: 21 | text: 안녕하세요 세계!, model: code-davinci-002, token: 21 | 141 | | text: 안녕하세요 세계!, model: code-davinci-001, token: 21 | text: 안녕하세요 세계!, model: code-davinci-001, token: 21 | 142 | | text: 안녕하세요 세계!, model: code-cushman-002, token: 21 | text: 안녕하세요 세계!, model: code-cushman-002, token: 21 | 143 | | text: 안녕하세요 세계!, model: code-cushman-001, token: 21 | text: 안녕하세요 세계!, model: code-cushman-001, token: 21 | 144 | | text: 안녕하세요 세계!, model: davinci-codex, token: 21 | text: 안녕하세요 세계!, model: davinci-codex, token: 21 | 145 | | text: 안녕하세요 세계!, model: cushman-codex, token: 21 | text: 안녕하세요 세계!, model: cushman-codex, token: 21 | 146 | | text: 안녕하세요 세계!, model: text-davinci-edit-001, token: 21 | text: 안녕하세요 세계!, model: text-davinci-edit-001, token: 21 | 147 | | text: 안녕하세요 세계!, model: code-davinci-edit-001, token: 21 | text: 안녕하세요 세계!, model: code-davinci-edit-001, token: 21 | 148 | | text: 안녕하세요 세계!, model: text-embedding-ada-002, token: 10 | text: 안녕하세요 세계!, model: text-embedding-ada-002, token: 10 | 149 | | text: 안녕하세요 세계!, model: text-similarity-davinci-001, token: 21 | text: 안녕하세요 세계!, model: text-similarity-davinci-001, token: 21 | 150 | | text: Привет мир!, model: gpt-4, token: 6 | text: Привет мир!, model: gpt-4, token: 6 | 151 | | text: Привет мир!, model: gpt-3.5-turbo, token: 6 | text: Привет мир!, model: gpt-3.5-turbo, token: 6 | 152 | | text: Привет мир!, model: text-davinci-003, token: 12 | text: Привет мир!, model: text-davinci-003, token: 12 | 153 | | text: Привет мир!, model: text-davinci-002, token: 12 | text: Привет мир!, model: text-davinci-002, token: 12 | 154 | | text: Привет мир!, model: text-davinci-001, token: 12 | text: Привет мир!, model: text-davinci-001, token: 12 | 155 | | text: Привет мир!, model: text-curie-001, token: 12 | text: Привет мир!, model: text-curie-001, token: 12 | 156 | | text: Привет 
мир!, model: text-babbage-001, token: 12 | text: Привет мир!, model: text-babbage-001, token: 12 | 157 | | text: Привет мир!, model: text-ada-001, token: 12 | text: Привет мир!, model: text-ada-001, token: 12 | 158 | | text: Привет мир!, model: davinci, token: 12 | text: Привет мир!, model: davinci, token: 12 | 159 | | text: Привет мир!, model: curie, token: 12 | text: Привет мир!, model: curie, token: 12 | 160 | | text: Привет мир!, model: babbage, token: 12 | text: Привет мир!, model: babbage, token: 12 | 161 | | text: Привет мир!, model: ada, token: 12 | text: Привет мир!, model: ada, token: 12 | 162 | | text: Привет мир!, model: code-davinci-002, token: 12 | text: Привет мир!, model: code-davinci-002, token: 12 | 163 | | text: Привет мир!, model: code-davinci-001, token: 12 | text: Привет мир!, model: code-davinci-001, token: 12 | 164 | | text: Привет мир!, model: code-cushman-002, token: 12 | text: Привет мир!, model: code-cushman-002, token: 12 | 165 | | text: Привет мир!, model: code-cushman-001, token: 12 | text: Привет мир!, model: code-cushman-001, token: 12 | 166 | | text: Привет мир!, model: davinci-codex, token: 12 | text: Привет мир!, model: davinci-codex, token: 12 | 167 | | text: Привет мир!, model: cushman-codex, token: 12 | text: Привет мир!, model: cushman-codex, token: 12 | 168 | | text: Привет мир!, model: text-davinci-edit-001, token: 12 | text: Привет мир!, model: text-davinci-edit-001, token: 12 | 169 | | text: Привет мир!, model: code-davinci-edit-001, token: 12 | text: Привет мир!, model: code-davinci-edit-001, token: 12 | 170 | | text: Привет мир!, model: text-embedding-ada-002, token: 6 | text: Привет мир!, model: text-embedding-ada-002, token: 6 | 171 | | text: Привет мир!, model: text-similarity-davinci-001, token: 12 | text: Привет мир!, model: text-similarity-davinci-001, token: 12 | 172 | | text: ¡Hola mundo!, model: gpt-4, token: 4 | text: ¡Hola mundo!, model: gpt-4, token: 4 | 173 | | text: ¡Hola mundo!, model: gpt-3.5-turbo, 
token: 4 | text: ¡Hola mundo!, model: gpt-3.5-turbo, token: 4 | 174 | | text: ¡Hola mundo!, model: text-davinci-003, token: 7 | text: ¡Hola mundo!, model: text-davinci-003, token: 7 | 175 | | text: ¡Hola mundo!, model: text-davinci-002, token: 7 | text: ¡Hola mundo!, model: text-davinci-002, token: 7 | 176 | | text: ¡Hola mundo!, model: text-davinci-001, token: 7 | text: ¡Hola mundo!, model: text-davinci-001, token: 7 | 177 | | text: ¡Hola mundo!, model: text-curie-001, token: 7 | text: ¡Hola mundo!, model: text-curie-001, token: 7 | 178 | | text: ¡Hola mundo!, model: text-babbage-001, token: 7 | text: ¡Hola mundo!, model: text-babbage-001, token: 7 | 179 | | text: ¡Hola mundo!, model: text-ada-001, token: 7 | text: ¡Hola mundo!, model: text-ada-001, token: 7 | 180 | | text: ¡Hola mundo!, model: davinci, token: 7 | text: ¡Hola mundo!, model: davinci, token: 7 | 181 | | text: ¡Hola mundo!, model: curie, token: 7 | text: ¡Hola mundo!, model: curie, token: 7 | 182 | | text: ¡Hola mundo!, model: babbage, token: 7 | text: ¡Hola mundo!, model: babbage, token: 7 | 183 | | text: ¡Hola mundo!, model: ada, token: 7 | text: ¡Hola mundo!, model: ada, token: 7 | 184 | | text: ¡Hola mundo!, model: code-davinci-002, token: 7 | text: ¡Hola mundo!, model: code-davinci-002, token: 7 | 185 | | text: ¡Hola mundo!, model: code-davinci-001, token: 7 | text: ¡Hola mundo!, model: code-davinci-001, token: 7 | 186 | | text: ¡Hola mundo!, model: code-cushman-002, token: 7 | text: ¡Hola mundo!, model: code-cushman-002, token: 7 | 187 | | text: ¡Hola mundo!, model: code-cushman-001, token: 7 | text: ¡Hola mundo!, model: code-cushman-001, token: 7 | 188 | | text: ¡Hola mundo!, model: davinci-codex, token: 7 | text: ¡Hola mundo!, model: davinci-codex, token: 7 | 189 | | text: ¡Hola mundo!, model: cushman-codex, token: 7 | text: ¡Hola mundo!, model: cushman-codex, token: 7 | 190 | | text: ¡Hola mundo!, model: text-davinci-edit-001, token: 7 | text: ¡Hola mundo!, model: text-davinci-edit-001, 
token: 7 | 191 | | text: ¡Hola mundo!, model: code-davinci-edit-001, token: 7 | text: ¡Hola mundo!, model: code-davinci-edit-001, token: 7 | 192 | | text: ¡Hola mundo!, model: text-embedding-ada-002, token: 4 | text: ¡Hola mundo!, model: text-embedding-ada-002, token: 4 | 193 | | text: ¡Hola mundo!, model: text-similarity-davinci-001, token: 7 | text: ¡Hola mundo!, model: text-similarity-davinci-001, token: 7 | 194 | | text: Hallo Welt!, model: gpt-4, token: 3 | text: Hallo Welt!, model: gpt-4, token: 3 | 195 | | text: Hallo Welt!, model: gpt-3.5-turbo, token: 3 | text: Hallo Welt!, model: gpt-3.5-turbo, token: 3 | 196 | | text: Hallo Welt!, model: text-davinci-003, token: 5 | text: Hallo Welt!, model: text-davinci-003, token: 5 | 197 | | text: Hallo Welt!, model: text-davinci-002, token: 5 | text: Hallo Welt!, model: text-davinci-002, token: 5 | 198 | | text: Hallo Welt!, model: text-davinci-001, token: 5 | text: Hallo Welt!, model: text-davinci-001, token: 5 | 199 | | text: Hallo Welt!, model: text-curie-001, token: 5 | text: Hallo Welt!, model: text-curie-001, token: 5 | 200 | | text: Hallo Welt!, model: text-babbage-001, token: 5 | text: Hallo Welt!, model: text-babbage-001, token: 5 | 201 | | text: Hallo Welt!, model: text-ada-001, token: 5 | text: Hallo Welt!, model: text-ada-001, token: 5 | 202 | | text: Hallo Welt!, model: davinci, token: 5 | text: Hallo Welt!, model: davinci, token: 5 | 203 | | text: Hallo Welt!, model: curie, token: 5 | text: Hallo Welt!, model: curie, token: 5 | 204 | | text: Hallo Welt!, model: babbage, token: 5 | text: Hallo Welt!, model: babbage, token: 5 | 205 | | text: Hallo Welt!, model: ada, token: 5 | text: Hallo Welt!, model: ada, token: 5 | 206 | | text: Hallo Welt!, model: code-davinci-002, token: 5 | text: Hallo Welt!, model: code-davinci-002, token: 5 | 207 | | text: Hallo Welt!, model: code-davinci-001, token: 5 | text: Hallo Welt!, model: code-davinci-001, token: 5 | 208 | | text: Hallo Welt!, model: code-cushman-002, 
token: 5 | text: Hallo Welt!, model: code-cushman-002, token: 5 | 209 | | text: Hallo Welt!, model: code-cushman-001, token: 5 | text: Hallo Welt!, model: code-cushman-001, token: 5 | 210 | | text: Hallo Welt!, model: davinci-codex, token: 5 | text: Hallo Welt!, model: davinci-codex, token: 5 | 211 | | text: Hallo Welt!, model: cushman-codex, token: 5 | text: Hallo Welt!, model: cushman-codex, token: 5 | 212 | | text: Hallo Welt!, model: text-davinci-edit-001, token: 5 | text: Hallo Welt!, model: text-davinci-edit-001, token: 5 | 213 | | text: Hallo Welt!, model: code-davinci-edit-001, token: 5 | text: Hallo Welt!, model: code-davinci-edit-001, token: 5 | 214 | | text: Hallo Welt!, model: text-embedding-ada-002, token: 3 | text: Hallo Welt!, model: text-embedding-ada-002, token: 3 | 215 | | text: Hallo Welt!, model: text-similarity-davinci-001, token: 5 | text: Hallo Welt!, model: text-similarity-davinci-001, token: 5 | 216 | | text: Bonjour le monde!, model: gpt-4, token: 4 | text: Bonjour le monde!, model: gpt-4, token: 4 | 217 | | text: Bonjour le monde!, model: gpt-3.5-turbo, token: 4 | text: Bonjour le monde!, model: gpt-3.5-turbo, token: 4 | 218 | | text: Bonjour le monde!, model: text-davinci-003, token: 7 | text: Bonjour le monde!, model: text-davinci-003, token: 7 | 219 | | text: Bonjour le monde!, model: text-davinci-002, token: 7 | text: Bonjour le monde!, model: text-davinci-002, token: 7 | 220 | | text: Bonjour le monde!, model: text-davinci-001, token: 7 | text: Bonjour le monde!, model: text-davinci-001, token: 7 | 221 | | text: Bonjour le monde!, model: text-curie-001, token: 7 | text: Bonjour le monde!, model: text-curie-001, token: 7 | 222 | | text: Bonjour le monde!, model: text-babbage-001, token: 7 | text: Bonjour le monde!, model: text-babbage-001, token: 7 | 223 | | text: Bonjour le monde!, model: text-ada-001, token: 7 | text: Bonjour le monde!, model: text-ada-001, token: 7 | 224 | | text: Bonjour le monde!, model: davinci, token: 7 | text: 
Bonjour le monde!, model: davinci, token: 7 | 225 | | text: Bonjour le monde!, model: curie, token: 7 | text: Bonjour le monde!, model: curie, token: 7 | 226 | | text: Bonjour le monde!, model: babbage, token: 7 | text: Bonjour le monde!, model: babbage, token: 7 | 227 | | text: Bonjour le monde!, model: ada, token: 7 | text: Bonjour le monde!, model: ada, token: 7 | 228 | | text: Bonjour le monde!, model: code-davinci-002, token: 7 | text: Bonjour le monde!, model: code-davinci-002, token: 7 | 229 | | text: Bonjour le monde!, model: code-davinci-001, token: 7 | text: Bonjour le monde!, model: code-davinci-001, token: 7 | 230 | | text: Bonjour le monde!, model: code-cushman-002, token: 7 | text: Bonjour le monde!, model: code-cushman-002, token: 7 | 231 | | text: Bonjour le monde!, model: code-cushman-001, token: 7 | text: Bonjour le monde!, model: code-cushman-001, token: 7 | 232 | | text: Bonjour le monde!, model: davinci-codex, token: 7 | text: Bonjour le monde!, model: davinci-codex, token: 7 | 233 | | text: Bonjour le monde!, model: cushman-codex, token: 7 | text: Bonjour le monde!, model: cushman-codex, token: 7 | 234 | | text: Bonjour le monde!, model: text-davinci-edit-001, token: 7 | text: Bonjour le monde!, model: text-davinci-edit-001, token: 7 | 235 | | text: Bonjour le monde!, model: code-davinci-edit-001, token: 7 | text: Bonjour le monde!, model: code-davinci-edit-001, token: 7 | 236 | | text: Bonjour le monde!, model: text-embedding-ada-002, token: 4 | text: Bonjour le monde!, model: text-embedding-ada-002, token: 4 | 237 | | text: Bonjour le monde!, model: text-similarity-davinci-001, token: 7 | text: Bonjour le monde!, model: text-similarity-davinci-001, token: 7 | 238 | | text: Ciao mondo!, model: gpt-4, token: 4 | text: Ciao mondo!, model: gpt-4, token: 4 | 239 | | text: Ciao mondo!, model: gpt-3.5-turbo, token: 4 | text: Ciao mondo!, model: gpt-3.5-turbo, token: 4 | 240 | | text: Ciao mondo!, model: text-davinci-003, token: 5 | text: Ciao 
mondo!, model: text-davinci-003, token: 5 | 241 | | text: Ciao mondo!, model: text-davinci-002, token: 5 | text: Ciao mondo!, model: text-davinci-002, token: 5 | 242 | | text: Ciao mondo!, model: text-davinci-001, token: 5 | text: Ciao mondo!, model: text-davinci-001, token: 5 | 243 | | text: Ciao mondo!, model: text-curie-001, token: 5 | text: Ciao mondo!, model: text-curie-001, token: 5 | 244 | | text: Ciao mondo!, model: text-babbage-001, token: 5 | text: Ciao mondo!, model: text-babbage-001, token: 5 | 245 | | text: Ciao mondo!, model: text-ada-001, token: 5 | text: Ciao mondo!, model: text-ada-001, token: 5 | 246 | | text: Ciao mondo!, model: davinci, token: 5 | text: Ciao mondo!, model: davinci, token: 5 | 247 | | text: Ciao mondo!, model: curie, token: 5 | text: Ciao mondo!, model: curie, token: 5 | 248 | | text: Ciao mondo!, model: babbage, token: 5 | text: Ciao mondo!, model: babbage, token: 5 | 249 | | text: Ciao mondo!, model: ada, token: 5 | text: Ciao mondo!, model: ada, token: 5 | 250 | | text: Ciao mondo!, model: code-davinci-002, token: 5 | text: Ciao mondo!, model: code-davinci-002, token: 5 | 251 | | text: Ciao mondo!, model: code-davinci-001, token: 5 | text: Ciao mondo!, model: code-davinci-001, token: 5 | 252 | | text: Ciao mondo!, model: code-cushman-002, token: 5 | text: Ciao mondo!, model: code-cushman-002, token: 5 | 253 | | text: Ciao mondo!, model: code-cushman-001, token: 5 | text: Ciao mondo!, model: code-cushman-001, token: 5 | 254 | | text: Ciao mondo!, model: davinci-codex, token: 5 | text: Ciao mondo!, model: davinci-codex, token: 5 | 255 | | text: Ciao mondo!, model: cushman-codex, token: 5 | text: Ciao mondo!, model: cushman-codex, token: 5 | 256 | | text: Ciao mondo!, model: text-davinci-edit-001, token: 5 | text: Ciao mondo!, model: text-davinci-edit-001, token: 5 | 257 | | text: Ciao mondo!, model: code-davinci-edit-001, token: 5 | text: Ciao mondo!, model: code-davinci-edit-001, token: 5 | 258 | | text: Ciao mondo!, model: 
text-embedding-ada-002, token: 4 | text: Ciao mondo!, model: text-embedding-ada-002, token: 4 | 259 | | text: Ciao mondo!, model: text-similarity-davinci-001, token: 5 | text: Ciao mondo!, model: text-similarity-davinci-001, token: 5 | 260 | | text: Hej världen!, model: gpt-4, token: 7 | text: Hej världen!, model: gpt-4, token: 7 | 261 | | text: Hej världen!, model: gpt-3.5-turbo, token: 7 | text: Hej världen!, model: gpt-3.5-turbo, token: 7 | 262 | | text: Hej världen!, model: text-davinci-003, token: 8 | text: Hej världen!, model: text-davinci-003, token: 8 | 263 | | text: Hej världen!, model: text-davinci-002, token: 8 | text: Hej världen!, model: text-davinci-002, token: 8 | 264 | | text: Hej världen!, model: text-davinci-001, token: 8 | text: Hej världen!, model: text-davinci-001, token: 8 | 265 | | text: Hej världen!, model: text-curie-001, token: 8 | text: Hej världen!, model: text-curie-001, token: 8 | 266 | | text: Hej världen!, model: text-babbage-001, token: 8 | text: Hej världen!, model: text-babbage-001, token: 8 | 267 | | text: Hej världen!, model: text-ada-001, token: 8 | text: Hej världen!, model: text-ada-001, token: 8 | 268 | | text: Hej världen!, model: davinci, token: 8 | text: Hej världen!, model: davinci, token: 8 | 269 | | text: Hej världen!, model: curie, token: 8 | text: Hej världen!, model: curie, token: 8 | 270 | | text: Hej världen!, model: babbage, token: 8 | text: Hej världen!, model: babbage, token: 8 | 271 | | text: Hej världen!, model: ada, token: 8 | text: Hej världen!, model: ada, token: 8 | 272 | | text: Hej världen!, model: code-davinci-002, token: 8 | text: Hej världen!, model: code-davinci-002, token: 8 | 273 | | text: Hej världen!, model: code-davinci-001, token: 8 | text: Hej världen!, model: code-davinci-001, token: 8 | 274 | | text: Hej världen!, model: code-cushman-002, token: 8 | text: Hej världen!, model: code-cushman-002, token: 8 | 275 | | text: Hej världen!, model: code-cushman-001, token: 8 | text: Hej världen!, 
model: code-cushman-001, token: 8 | 276 | | text: Hej världen!, model: davinci-codex, token: 8 | text: Hej världen!, model: davinci-codex, token: 8 | 277 | | text: Hej världen!, model: cushman-codex, token: 8 | text: Hej världen!, model: cushman-codex, token: 8 | 278 | | text: Hej världen!, model: text-davinci-edit-001, token: 8 | text: Hej världen!, model: text-davinci-edit-001, token: 8 | 279 | | text: Hej världen!, model: code-davinci-edit-001, token: 8 | text: Hej världen!, model: code-davinci-edit-001, token: 8 | 280 | | text: Hej världen!, model: text-embedding-ada-002, token: 7 | text: Hej världen!, model: text-embedding-ada-002, token: 7 | 281 | | text: Hej världen!, model: text-similarity-davinci-001, token: 8 | text: Hej världen!, model: text-similarity-davinci-001, token: 8 | 282 | | text: Hallo wereld!, model: gpt-4, token: 3 | text: Hallo wereld!, model: gpt-4, token: 3 | 283 | | text: Hallo wereld!, model: gpt-3.5-turbo, token: 3 | text: Hallo wereld!, model: gpt-3.5-turbo, token: 3 | 284 | | text: Hallo wereld!, model: text-davinci-003, token: 5 | text: Hallo wereld!, model: text-davinci-003, token: 5 | 285 | | text: Hallo wereld!, model: text-davinci-002, token: 5 | text: Hallo wereld!, model: text-davinci-002, token: 5 | 286 | | text: Hallo wereld!, model: text-davinci-001, token: 5 | text: Hallo wereld!, model: text-davinci-001, token: 5 | 287 | | text: Hallo wereld!, model: text-curie-001, token: 5 | text: Hallo wereld!, model: text-curie-001, token: 5 | 288 | | text: Hallo wereld!, model: text-babbage-001, token: 5 | text: Hallo wereld!, model: text-babbage-001, token: 5 | 289 | | text: Hallo wereld!, model: text-ada-001, token: 5 | text: Hallo wereld!, model: text-ada-001, token: 5 | 290 | | text: Hallo wereld!, model: davinci, token: 5 | text: Hallo wereld!, model: davinci, token: 5 | 291 | | text: Hallo wereld!, model: curie, token: 5 | text: Hallo wereld!, model: curie, token: 5 | 292 | | text: Hallo wereld!, model: babbage, token: 5 | text: 
Hallo wereld!, model: babbage, token: 5 | 293 | | text: Hallo wereld!, model: ada, token: 5 | text: Hallo wereld!, model: ada, token: 5 | 294 | | text: Hallo wereld!, model: code-davinci-002, token: 5 | text: Hallo wereld!, model: code-davinci-002, token: 5 | 295 | | text: Hallo wereld!, model: code-davinci-001, token: 5 | text: Hallo wereld!, model: code-davinci-001, token: 5 | 296 | | text: Hallo wereld!, model: code-cushman-002, token: 5 | text: Hallo wereld!, model: code-cushman-002, token: 5 | 297 | | text: Hallo wereld!, model: code-cushman-001, token: 5 | text: Hallo wereld!, model: code-cushman-001, token: 5 | 298 | | text: Hallo wereld!, model: davinci-codex, token: 5 | text: Hallo wereld!, model: davinci-codex, token: 5 | 299 | | text: Hallo wereld!, model: cushman-codex, token: 5 | text: Hallo wereld!, model: cushman-codex, token: 5 | 300 | | text: Hallo wereld!, model: text-davinci-edit-001, token: 5 | text: Hallo wereld!, model: text-davinci-edit-001, token: 5 | 301 | | text: Hallo wereld!, model: code-davinci-edit-001, token: 5 | text: Hallo wereld!, model: code-davinci-edit-001, token: 5 | 302 | | text: Hallo wereld!, model: text-embedding-ada-002, token: 3 | text: Hallo wereld!, model: text-embedding-ada-002, token: 3 | 303 | | text: Hallo wereld!, model: text-similarity-davinci-001, token: 5 | text: Hallo wereld!, model: text-similarity-davinci-001, token: 5 | 304 | | text: Hallo verden!, model: gpt-4, token: 4 | text: Hallo verden!, model: gpt-4, token: 4 | 305 | | text: Hallo verden!, model: gpt-3.5-turbo, token: 4 | text: Hallo verden!, model: gpt-3.5-turbo, token: 4 | 306 | | text: Hallo verden!, model: text-davinci-003, token: 5 | text: Hallo verden!, model: text-davinci-003, token: 5 | 307 | | text: Hallo verden!, model: text-davinci-002, token: 5 | text: Hallo verden!, model: text-davinci-002, token: 5 | 308 | | text: Hallo verden!, model: text-davinci-001, token: 5 | text: Hallo verden!, model: text-davinci-001, token: 5 | 309 | | text: 
Hallo verden!, model: text-curie-001, token: 5 | text: Hallo verden!, model: text-curie-001, token: 5 | 310 | | text: Hallo verden!, model: text-babbage-001, token: 5 | text: Hallo verden!, model: text-babbage-001, token: 5 | 311 | | text: Hallo verden!, model: text-ada-001, token: 5 | text: Hallo verden!, model: text-ada-001, token: 5 | 312 | | text: Hallo verden!, model: davinci, token: 5 | text: Hallo verden!, model: davinci, token: 5 | 313 | | text: Hallo verden!, model: curie, token: 5 | text: Hallo verden!, model: curie, token: 5 | 314 | | text: Hallo verden!, model: babbage, token: 5 | text: Hallo verden!, model: babbage, token: 5 | 315 | | text: Hallo verden!, model: ada, token: 5 | text: Hallo verden!, model: ada, token: 5 | 316 | | text: Hallo verden!, model: code-davinci-002, token: 5 | text: Hallo verden!, model: code-davinci-002, token: 5 | 317 | | text: Hallo verden!, model: code-davinci-001, token: 5 | text: Hallo verden!, model: code-davinci-001, token: 5 | 318 | | text: Hallo verden!, model: code-cushman-002, token: 5 | text: Hallo verden!, model: code-cushman-002, token: 5 | 319 | | text: Hallo verden!, model: code-cushman-001, token: 5 | text: Hallo verden!, model: code-cushman-001, token: 5 | 320 | | text: Hallo verden!, model: davinci-codex, token: 5 | text: Hallo verden!, model: davinci-codex, token: 5 | 321 | | text: Hallo verden!, model: cushman-codex, token: 5 | text: Hallo verden!, model: cushman-codex, token: 5 | 322 | | text: Hallo verden!, model: text-davinci-edit-001, token: 5 | text: Hallo verden!, model: text-davinci-edit-001, token: 5 | 323 | | text: Hallo verden!, model: code-davinci-edit-001, token: 5 | text: Hallo verden!, model: code-davinci-edit-001, token: 5 | 324 | | text: Hallo verden!, model: text-embedding-ada-002, token: 4 | text: Hallo verden!, model: text-embedding-ada-002, token: 4 | 325 | | text: Hallo verden!, model: text-similarity-davinci-001, token: 5 | text: Hallo verden!, model: text-similarity-davinci-001, 
token: 5 | 326 | | text: Hallo wereld!, model: gpt-4, token: 3 | text: Hallo wereld!, model: gpt-4, token: 3 | 327 | | text: Hallo wereld!, model: gpt-3.5-turbo, token: 3 | text: Hallo wereld!, model: gpt-3.5-turbo, token: 3 | 328 | | text: Hallo wereld!, model: text-davinci-003, token: 5 | text: Hallo wereld!, model: text-davinci-003, token: 5 | 329 | | text: Hallo wereld!, model: text-davinci-002, token: 5 | text: Hallo wereld!, model: text-davinci-002, token: 5 | 330 | | text: Hallo wereld!, model: text-davinci-001, token: 5 | text: Hallo wereld!, model: text-davinci-001, token: 5 | 331 | | text: Hallo wereld!, model: text-curie-001, token: 5 | text: Hallo wereld!, model: text-curie-001, token: 5 | 332 | | text: Hallo wereld!, model: text-babbage-001, token: 5 | text: Hallo wereld!, model: text-babbage-001, token: 5 | 333 | | text: Hallo wereld!, model: text-ada-001, token: 5 | text: Hallo wereld!, model: text-ada-001, token: 5 | 334 | | text: Hallo wereld!, model: davinci, token: 5 | text: Hallo wereld!, model: davinci, token: 5 | 335 | | text: Hallo wereld!, model: curie, token: 5 | text: Hallo wereld!, model: curie, token: 5 | 336 | | text: Hallo wereld!, model: babbage, token: 5 | text: Hallo wereld!, model: babbage, token: 5 | 337 | | text: Hallo wereld!, model: ada, token: 5 | text: Hallo wereld!, model: ada, token: 5 | 338 | | text: Hallo wereld!, model: code-davinci-002, token: 5 | text: Hallo wereld!, model: code-davinci-002, token: 5 | 339 | | text: Hallo wereld!, model: code-davinci-001, token: 5 | text: Hallo wereld!, model: code-davinci-001, token: 5 | 340 | | text: Hallo wereld!, model: code-cushman-002, token: 5 | text: Hallo wereld!, model: code-cushman-002, token: 5 | 341 | | text: Hallo wereld!, model: code-cushman-001, token: 5 | text: Hallo wereld!, model: code-cushman-001, token: 5 | 342 | | text: Hallo wereld!, model: davinci-codex, token: 5 | text: Hallo wereld!, model: davinci-codex, token: 5 | 343 | | text: Hallo wereld!, model: 
cushman-codex, token: 5 | text: Hallo wereld!, model: cushman-codex, token: 5 | 344 | | text: Hallo wereld!, model: text-davinci-edit-001, token: 5 | text: Hallo wereld!, model: text-davinci-edit-001, token: 5 | 345 | | text: Hallo wereld!, model: code-davinci-edit-001, token: 5 | text: Hallo wereld!, model: code-davinci-edit-001, token: 5 | 346 | | text: Hallo wereld!, model: text-embedding-ada-002, token: 3 | text: Hallo wereld!, model: text-embedding-ada-002, token: 3 | 347 | | text: Hallo wereld!, model: text-similarity-davinci-001, token: 5 | text: Hallo wereld!, model: text-similarity-davinci-001, token: 5 | 348 | | text: Hallo verden!, model: gpt-4, token: 4 | text: Hallo verden!, model: gpt-4, token: 4 | 349 | | text: Hallo verden!, model: gpt-3.5-turbo, token: 4 | text: Hallo verden!, model: gpt-3.5-turbo, token: 4 | 350 | | text: Hallo verden!, model: text-davinci-003, token: 5 | text: Hallo verden!, model: text-davinci-003, token: 5 | 351 | | text: Hallo verden!, model: text-davinci-002, token: 5 | text: Hallo verden!, model: text-davinci-002, token: 5 | 352 | | text: Hallo verden!, model: text-davinci-001, token: 5 | text: Hallo verden!, model: text-davinci-001, token: 5 | 353 | | text: Hallo verden!, model: text-curie-001, token: 5 | text: Hallo verden!, model: text-curie-001, token: 5 | 354 | | text: Hallo verden!, model: text-babbage-001, token: 5 | text: Hallo verden!, model: text-babbage-001, token: 5 | 355 | | text: Hallo verden!, model: text-ada-001, token: 5 | text: Hallo verden!, model: text-ada-001, token: 5 | 356 | | text: Hallo verden!, model: davinci, token: 5 | text: Hallo verden!, model: davinci, token: 5 | 357 | | text: Hallo verden!, model: curie, token: 5 | text: Hallo verden!, model: curie, token: 5 | 358 | | text: Hallo verden!, model: babbage, token: 5 | text: Hallo verden!, model: babbage, token: 5 | 359 | | text: Hallo verden!, model: ada, token: 5 | text: Hallo verden!, model: ada, token: 5 | 360 | | text: Hallo verden!, model: 
code-davinci-002, token: 5 | text: Hallo verden!, model: code-davinci-002, token: 5 | 361 | | text: Hallo verden!, model: code-davinci-001, token: 5 | text: Hallo verden!, model: code-davinci-001, token: 5 | 362 | | text: Hallo verden!, model: code-cushman-002, token: 5 | text: Hallo verden!, model: code-cushman-002, token: 5 | 363 | | text: Hallo verden!, model: code-cushman-001, token: 5 | text: Hallo verden!, model: code-cushman-001, token: 5 | 364 | | text: Hallo verden!, model: davinci-codex, token: 5 | text: Hallo verden!, model: davinci-codex, token: 5 | 365 | | text: Hallo verden!, model: cushman-codex, token: 5 | text: Hallo verden!, model: cushman-codex, token: 5 | 366 | | text: Hallo verden!, model: text-davinci-edit-001, token: 5 | text: Hallo verden!, model: text-davinci-edit-001, token: 5 | 367 | | text: Hallo verden!, model: code-davinci-edit-001, token: 5 | text: Hallo verden!, model: code-davinci-edit-001, token: 5 | 368 | | text: Hallo verden!, model: text-embedding-ada-002, token: 4 | text: Hallo verden!, model: text-embedding-ada-002, token: 4 | 369 | | text: Hallo verden!, model: text-similarity-davinci-001, token: 5 | text: Hallo verden!, model: text-similarity-davinci-001, token: 5 | -------------------------------------------------------------------------------- /encoding.go: -------------------------------------------------------------------------------- 1 | package tiktoken 2 | 3 | import ( 4 | "errors" 5 | "strings" 6 | "sync" 7 | ) 8 | 9 | const ENDOFTEXT string = "<|endoftext|>" 10 | const FIM_PREFIX string = "<|fim_prefix|>" 11 | const FIM_MIDDLE string = "<|fim_middle|>" 12 | const FIM_SUFFIX string = "<|fim_suffix|>" 13 | const ENDOFPROMPT string = "<|endofprompt|>" 14 | 15 | const ( 16 | MODEL_O200K_BASE string = "o200k_base" 17 | MODEL_CL100K_BASE string = "cl100k_base" 18 | MODEL_P50K_BASE string = "p50k_base" 19 | MODEL_P50K_EDIT string = "p50k_edit" 20 | MODEL_R50K_BASE string = "r50k_base" 21 | MODEL_GPT2 string = "gpt2" 22 | ) 
23 | 24 | var MODEL_TO_ENCODING = map[string]string{ 25 | // chat 26 | "gpt-4o": MODEL_O200K_BASE, 27 | "gpt-4": MODEL_CL100K_BASE, 28 | "gpt-3.5-turbo": MODEL_CL100K_BASE, 29 | "gpt-3.5": MODEL_CL100K_BASE, // Common shorthand 30 | "gpt-35-turbo": MODEL_CL100K_BASE, // Azure deployment name 31 | // base 32 | "davinci-002": MODEL_CL100K_BASE, 33 | "babbage-002": MODEL_CL100K_BASE, 34 | // embeddings 35 | "text-embedding-ada-002": MODEL_CL100K_BASE, 36 | "text-embedding-3-small": MODEL_CL100K_BASE, 37 | "text-embedding-3-large": MODEL_CL100K_BASE, 38 | // DEPRECATED MODELS 39 | // text (DEPRECATED) 40 | "text-davinci-003": MODEL_P50K_BASE, 41 | "text-davinci-002": MODEL_P50K_BASE, 42 | "text-davinci-001": MODEL_P50K_BASE, 43 | "text-curie-001": MODEL_P50K_BASE, 44 | "text-babbage-001": MODEL_P50K_BASE, 45 | "text-ada-001": MODEL_P50K_BASE, 46 | "davinci": MODEL_P50K_BASE, 47 | "curie": MODEL_P50K_BASE, 48 | "babbage": MODEL_P50K_BASE, 49 | "ada": MODEL_P50K_BASE, 50 | // code (DEPRECATED) 51 | "code-davinci-002": MODEL_P50K_BASE, 52 | "code-davinci-001": MODEL_P50K_BASE, 53 | "code-cushman-002": MODEL_P50K_BASE, 54 | "code-cushman-001": MODEL_P50K_BASE, 55 | "davinci-codex": MODEL_P50K_BASE, 56 | "cushman-codex": MODEL_P50K_BASE, 57 | // edit (DEPRECATED) 58 | "text-davinci-edit-001": MODEL_P50K_EDIT, 59 | "code-davinci-edit-001": MODEL_P50K_EDIT, 60 | // old embeddings (DEPRECATED) 61 | "text-similarity-davinci-001": MODEL_R50K_BASE, 62 | "text-similarity-curie-001": MODEL_R50K_BASE, 63 | "text-similarity-babbage-001": MODEL_R50K_BASE, 64 | "text-similarity-ada-001": MODEL_R50K_BASE, 65 | "text-search-davinci-doc-001": MODEL_R50K_BASE, 66 | "text-search-curie-doc-001": MODEL_R50K_BASE, 67 | "text-search-babbage-doc-001": MODEL_R50K_BASE, 68 | "text-search-ada-doc-001": MODEL_R50K_BASE, 69 | "code-search-babbage-code-001": MODEL_R50K_BASE, 70 | "code-search-ada-code-001": MODEL_R50K_BASE, 71 | // open source 72 | "gpt2": MODEL_GPT2, 73 | "gpt-2": MODEL_GPT2, // 
Maintains consistency with gpt-4 74 | } 75 | 76 | var MODEL_PREFIX_TO_ENCODING = map[string]string{ 77 | // chat 78 | "gpt-4o-": MODEL_O200K_BASE, // e.g., gpt-4o-2024-05-13 79 | "gpt-4-": MODEL_CL100K_BASE, // e.g., gpt-4-0314, etc., plus gpt-4-32k 80 | "gpt-3.5-turbo-": MODEL_CL100K_BASE, // e.g, gpt-3.5-turbo-0301, -0401, etc. 81 | "gpt-35-turbo-": MODEL_CL100K_BASE, // Azure deployment name 82 | // fine-tuned 83 | "ft:gpt-4": MODEL_CL100K_BASE, 84 | "ft:gpt-3.5-turbo": MODEL_CL100K_BASE, 85 | "ft:davinci-002": MODEL_CL100K_BASE, 86 | "ft:babbage-002": MODEL_CL100K_BASE, 87 | } 88 | 89 | var encodingMap map[string]*Encoding 90 | var l *sync.Mutex 91 | 92 | func init() { 93 | encodingMap = make(map[string]*Encoding) 94 | l = &sync.Mutex{} 95 | } 96 | 97 | type Encoding struct { 98 | Name string 99 | PatStr string 100 | MergeableRanks map[string]int 101 | SpecialTokens map[string]int 102 | ExplicitNVocab int 103 | } 104 | 105 | func getEncoding(encodingName string) (*Encoding, error) { 106 | l.Lock() 107 | defer l.Unlock() 108 | if encoding, ok := encodingMap[encodingName]; ok { 109 | return encoding, nil 110 | } 111 | initEncoding, err := initEncoding(encodingName) 112 | if err != nil { 113 | return nil, err 114 | } 115 | encodingMap[encodingName] = initEncoding 116 | return encodingMap[encodingName], nil 117 | } 118 | 119 | func initEncoding(encodingName string) (*Encoding, error) { 120 | switch encodingName { 121 | case MODEL_O200K_BASE: 122 | return o200k_base() 123 | case MODEL_CL100K_BASE: 124 | return cl100k_base() 125 | case MODEL_P50K_BASE: 126 | return p50k_base() 127 | case MODEL_R50K_BASE: 128 | return r50k_base() 129 | case MODEL_P50K_EDIT: 130 | return p50k_edit() 131 | default: 132 | return nil, errors.New("Unknown encoding: " + encodingName) 133 | } 134 | } 135 | 136 | func o200k_base() (*Encoding, error) { 137 | ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken") 138 | if err != nil { 
139 | return nil, err 140 | } 141 | special_tokens := map[string]int{ 142 | ENDOFTEXT: 199999, 143 | ENDOFPROMPT: 200018, 144 | } 145 | patStr := []string{ 146 | `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?`, 147 | `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?`, 148 | `\p{N}{1,3}`, 149 | ` ?[^\s\p{L}\p{N}]+[\r\n/]*`, 150 | `\s*[\r\n]+`, 151 | `\s+(?!\S)`, 152 | `\s+`, 153 | } 154 | 155 | return &Encoding{ 156 | Name: MODEL_O200K_BASE, 157 | PatStr: strings.Join(patStr, "|"), 158 | MergeableRanks: ranks, 159 | SpecialTokens: special_tokens, 160 | }, nil 161 | } 162 | 163 | func cl100k_base() (*Encoding, error) { 164 | ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken") 165 | if err != nil { 166 | return nil, err 167 | } 168 | special_tokens := map[string]int{ 169 | ENDOFTEXT: 100257, 170 | FIM_PREFIX: 100258, 171 | FIM_MIDDLE: 100259, 172 | FIM_SUFFIX: 100260, 173 | ENDOFPROMPT: 100276, 174 | } 175 | return &Encoding{ 176 | Name: MODEL_CL100K_BASE, 177 | PatStr: `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`, 178 | MergeableRanks: ranks, 179 | SpecialTokens: special_tokens, 180 | }, nil 181 | } 182 | 183 | func p50k_edit() (*Encoding, error) { 184 | ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken") 185 | if err != nil { 186 | return nil, err 187 | } 188 | special_tokens := map[string]int{ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283} 189 | return &Encoding{ 190 | Name: MODEL_P50K_EDIT, 191 | PatStr: `'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, 192 | MergeableRanks: ranks, 193 | SpecialTokens: special_tokens, 194 | }, nil 195 | } 196 | 197 | func p50k_base() (*Encoding, error) { 198 | 
ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken") 199 | if err != nil { 200 | return nil, err 201 | } 202 | special_tokens := map[string]int{ENDOFTEXT: 50256} 203 | 204 | // ExplicitNVocab := 50281 205 | // max_tokens := int(math.Max(float64(len(special_tokens)), float64(len(ranks)))) 206 | 207 | // if len(special_tokens)+len(ranks) != max_tokens { 208 | // return nil, errors.New("special_tokens and ranks must be disjoint") 209 | // } 210 | 211 | return &Encoding{ 212 | Name: MODEL_P50K_BASE, 213 | PatStr: `'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, 214 | MergeableRanks: ranks, 215 | SpecialTokens: special_tokens, 216 | ExplicitNVocab: 50281, 217 | }, nil 218 | } 219 | 220 | func r50k_base() (*Encoding, error) { 221 | ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken") 222 | if err != nil { 223 | return nil, err 224 | } 225 | special_tokens := map[string]int{ENDOFTEXT: 50256} 226 | return &Encoding{ 227 | Name: MODEL_R50K_BASE, 228 | MergeableRanks: ranks, 229 | PatStr: `'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, 230 | SpecialTokens: special_tokens, 231 | ExplicitNVocab: 50257, 232 | }, nil 233 | } 234 | 235 | // var ENCODING_MAP = map[string]*Encoding{} 236 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/linux-do/tiktoken-go 2 | 3 | go 1.19 4 | 5 | require ( 6 | github.com/dlclark/regexp2 v1.11.0 7 | github.com/google/uuid v1.6.0 8 | github.com/stretchr/testify v1.8.2 9 | ) 10 | 11 | require ( 12 | github.com/davecgh/go-spew v1.1.1 // indirect 13 | github.com/pmezard/go-difflib v1.0.0 // indirect 14 | gopkg.in/yaml.v3 v3.0.1 // indirect 15 | ) 16 | -------------------------------------------------------------------------------- 
/go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 2 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 3 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0= 5 | github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= 6 | github.com/dlclark/regexp2 v1.11.0 h1:G/nrcoOa7ZXlpoa/91N3X7mM3r8eIlMBBJZvsz/mxKI= 7 | github.com/dlclark/regexp2 v1.11.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= 8 | github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= 9 | github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 10 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 11 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 12 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 13 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 14 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 15 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 16 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 17 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 18 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 19 | github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= 20 | github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= 21 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 
h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 22 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 23 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 24 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 25 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 26 | -------------------------------------------------------------------------------- /load.go: -------------------------------------------------------------------------------- 1 | package tiktoken 2 | 3 | import ( 4 | "crypto/sha1" 5 | "encoding/base64" 6 | "fmt" 7 | "io/ioutil" 8 | "net/http" 9 | "os" 10 | "path/filepath" 11 | "strconv" 12 | "strings" 13 | 14 | "github.com/google/uuid" 15 | ) 16 | 17 | type BpeLoader interface { 18 | LoadTiktokenBpe(tiktokenBpeFile string) (map[string]int, error) 19 | } 20 | 21 | func readFile(blobpath string) ([]byte, error) { 22 | if !strings.HasPrefix(blobpath, "http://") && !strings.HasPrefix(blobpath, "https://") { 23 | file, err := os.Open(blobpath) 24 | if err != nil { 25 | return nil, err 26 | } 27 | defer file.Close() 28 | return ioutil.ReadAll(file) 29 | } 30 | // avoiding blobfile for public files helps avoid auth issues, like MFA prompts 31 | resp, err := http.Get(blobpath) 32 | if err != nil { 33 | return nil, err 34 | } 35 | defer resp.Body.Close() 36 | return ioutil.ReadAll(resp.Body) 37 | } 38 | 39 | func readFileCached(blobpath string) ([]byte, error) { 40 | var cacheDir string 41 | if os.Getenv("TIKTOKEN_CACHE_DIR") != "" { 42 | cacheDir = os.Getenv("TIKTOKEN_CACHE_DIR") 43 | } else if os.Getenv("DATA_GYM_CACHE_DIR") != "" { 44 | cacheDir = os.Getenv("DATA_GYM_CACHE_DIR") 45 | } else { 46 | cacheDir = filepath.Join(os.TempDir(), "data-gym-cache") 47 | } 48 | 49 | if cacheDir == "" { 50 | // disable caching 51 | return readFile(blobpath) 52 | } 53 | 54 | cacheKey := fmt.Sprintf("%x", 
sha1.Sum([]byte(blobpath))) 55 | 56 | cachePath := filepath.Join(cacheDir, cacheKey) 57 | if _, err := os.Stat(cachePath); err == nil { 58 | return ioutil.ReadFile(cachePath) 59 | } 60 | 61 | contents, err := readFile(blobpath) 62 | if err != nil { 63 | return nil, err 64 | } 65 | 66 | os.MkdirAll(cacheDir, os.ModePerm) 67 | tmpFilename := cachePath + "." + uuid.New().String() + ".tmp" 68 | if err := ioutil.WriteFile(tmpFilename, contents, os.ModePerm); err != nil { 69 | return nil, err 70 | } 71 | return contents, os.Rename(tmpFilename, cachePath) 72 | } 73 | 74 | func loadTiktokenBpe(tiktokenBpeFile string) (map[string]int, error) { 75 | contents, err := readFileCached(tiktokenBpeFile) 76 | if err != nil { 77 | return nil, err 78 | } 79 | 80 | bpeRanks := make(map[string]int) 81 | for _, line := range strings.Split(string(contents), "\n") { 82 | if line == "" { 83 | continue 84 | } 85 | parts := strings.Split(line, " ") 86 | token, err := base64.StdEncoding.DecodeString(parts[0]) 87 | if err != nil { 88 | return nil, err 89 | } 90 | rank, err := strconv.Atoi(parts[1]) 91 | if err != nil { 92 | return nil, err 93 | } 94 | bpeRanks[string(token)] = rank 95 | } 96 | return bpeRanks, nil 97 | } 98 | 99 | type defaultBpeLoader struct{} 100 | 101 | func (l *defaultBpeLoader) LoadTiktokenBpe(tiktokenBpeFile string) (map[string]int, error) { 102 | return loadTiktokenBpe(tiktokenBpeFile) 103 | } 104 | 105 | func NewDefaultBpeLoader() BpeLoader { 106 | return &defaultBpeLoader{} 107 | } 108 | -------------------------------------------------------------------------------- /regex_test.go: -------------------------------------------------------------------------------- 1 | package tiktoken 2 | 3 | import ( 4 | "regexp" 5 | "testing" 6 | 7 | "github.com/dlclark/regexp2" 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func TestRegex2Func(t *testing.T) { 12 | ass := assert.New(t) 13 | pattern := `[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}` 14 | re := 
regexp.MustCompile(pattern) 15 | re2 := regexp2.MustCompile(pattern, regexp2.None) 16 | 17 | words := []string{ 18 | "this is my email hi@google.com,and this is john's email world@outlook.com", 19 | "hi@google.com is email for google", 20 | "outlook email world@outlook.com is work for microsoft", 21 | } 22 | 23 | for _, word := range words { 24 | ass.ElementsMatch(re.FindStringIndex(word), findRegex2StringIndex(word, re2)) 25 | ass.ElementsMatch(re.FindAllStringSubmatchIndex(word, -1), findRegex2AllStringMatchIndex(word, re2)) 26 | ass.Equal(re.FindString(word), findRegex2StringMatch(word, re2)) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /test/benchmark.py: -------------------------------------------------------------------------------- 1 | import tiktoken as tk 2 | import requests 3 | import time 4 | 5 | def benchmark_test(text_list,enc): 6 | """ 7 | Benchmark test 8 | :return: None 9 | """ 10 | start = time.perf_counter_ns() 11 | for index in range(100000): 12 | text = text_list[index] 13 | num_tokens = len(enc.encode(text)) 14 | end = time.perf_counter_ns() 15 | print('benchmark test: {} ns/op'.format((end - start)/100000)) 16 | 17 | if __name__ == '__main__': 18 | r = requests.get('https://unicode.org/udhr/assemblies/full_all.txt') 19 | text_list = r.text.splitlines() 20 | cursor = 0 21 | enc=tk.get_encoding('cl100k_base') 22 | benchmark_test(text_list,enc) 23 | 24 | -------------------------------------------------------------------------------- /test/benchmark_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "io" 5 | "log" 6 | "net/http" 7 | "strings" 8 | "testing" 9 | 10 | "github.com/linux-do/tiktoken-go" 11 | ) 12 | 13 | func BenchmarkEncodingInFullLanguage(b *testing.B) { 14 | // Universal Declaration of Human Rights in all languages 15 | url := "https://unicode.org/udhr/assemblies/full_all.txt" 16 | response, err := 
http.Get(url) 17 | if err != nil { 18 | log.Fatal(err) 19 | } 20 | defer response.Body.Close() 21 | 22 | responseData, err := io.ReadAll(response.Body) 23 | if err != nil { 24 | log.Fatal(err) 25 | } 26 | 27 | responseString := string(responseData) 28 | lines := strings.Split(responseString, "\n") 29 | tkm, err := tiktoken.EncodingForModel("gpt-4") 30 | lineCount := len(lines) 31 | if err != nil { 32 | log.Fatal(err) 33 | } 34 | b.ResetTimer() 35 | for n := 0; n < b.N; n++ { 36 | tkm.EncodeOrdinary(lines[n%lineCount]) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /test/test.txt: -------------------------------------------------------------------------------- 1 | hallo world!,你好世界!,こんにちは世界!,안녕하세요 세계!,Привет мир!,¡Hola mundo!,Hallo Welt!,Bonjour le monde!,Ciao mondo!,Hej världen!,Hallo wereld!,Hallo verden!,Hallo wereld!,Hallo verden! 2 | gpt-4o,gpt-4-turbo,gpt-4,gpt-3.5-turbo,text-davinci-003,text-davinci-002,text-davinci-001,text-curie-001,text-babbage-001,text-ada-001,davinci,curie,babbage,ada,code-davinci-002,code-davinci-001,code-cushman-002,code-cushman-001,davinci-codex,cushman-codex,text-davinci-edit-001,code-davinci-edit-001,text-embedding-ada-002,text-similarity-davinci-001 3 | o200k_base,cl100k_base,p50k_base,r50k_base -------------------------------------------------------------------------------- /test/token_num.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "log" 7 | "os" 8 | "strings" 9 | 10 | "github.com/linux-do/tiktoken-go" 11 | ) 12 | 13 | // main 14 | func main() { 15 | textList, modelList, encodingList := ReadTestFile() 16 | testTokenByModel(textList, modelList) 17 | fmt.Println("=========================================") 18 | testTokenByEncoding(textList, encodingList) 19 | } 20 | 21 | // read all columns from a file 22 | func ReadTestFile() (textList []string, modelList []string, encodingList 
[]string) { 23 | file, err := os.Open("test/test.txt") 24 | if err != nil { 25 | log.Fatal(err) 26 | } 27 | defer file.Close() 28 | 29 | var lines []string 30 | scanner := bufio.NewScanner(file) 31 | for scanner.Scan() { 32 | lines = append(lines, scanner.Text()) 33 | } 34 | 35 | if err := scanner.Err(); err != nil { 36 | log.Fatal(err) 37 | } 38 | textList = strings.Split(lines[0], ",") 39 | modelList = strings.Split(lines[1], ",") 40 | encodingList = strings.Split(lines[2], ",") 41 | 42 | return 43 | } 44 | 45 | // getTokenByModel 46 | func getTokenByModel(text string, model string) (num_tokens int) { 47 | 48 | tkm, err := tiktoken.EncodingForModel(model) 49 | if err != nil { 50 | err = fmt.Errorf(": %v", err) 51 | return 52 | } 53 | 54 | token := tkm.Encode(text, nil, nil) 55 | 56 | return len(token) 57 | } 58 | 59 | // getTokenByEncoding 60 | func getTokenByEncoding(text string, encoding string) (num_tokens int) { 61 | 62 | tke, err := tiktoken.GetEncoding(encoding) 63 | if err != nil { 64 | err = fmt.Errorf(": %v", err) 65 | return 66 | } 67 | 68 | token := tke.Encode(text, nil, nil) 69 | 70 | return len(token) 71 | } 72 | 73 | // testTokenByModel 74 | func testTokenByModel(textList []string, modelList []string) { 75 | for i := 0; i < len(textList); i++ { 76 | for j := 0; j < len(modelList); j++ { 77 | fmt.Printf("text: %s, model: %s, token: %d \n", textList[i], modelList[j], getTokenByModel(textList[i], modelList[j])) 78 | } 79 | } 80 | } 81 | 82 | // testTokenByEncoding 83 | func testTokenByEncoding(textList []string, encodingList []string) { 84 | for i := 0; i < len(textList); i++ { 85 | for j := 0; j < len(encodingList); j++ { 86 | fmt.Printf("text: %s, encoding: %s, token: %d \n", textList[i], encodingList[j], getTokenByEncoding(textList[i], encodingList[j])) 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /test/token_num.py: 
-------------------------------------------------------------------------------- 1 | import tiktoken as tk 2 | 3 | def read_data_from_file(filename): 4 | """ 5 | Read data from file 6 | :param filename: file name 7 | :return: text_list, model_list, encoding_list 8 | """ 9 | with open(filename, 'r', encoding='utf-8') as f: 10 | data = f.read() 11 | text_list = data.splitlines()[0].split(',') 12 | model_list = data.splitlines()[1].split(',') 13 | encoding_list = data.splitlines()[2].split(',') 14 | 15 | return text_list, model_list, encoding_list 16 | 17 | def get_token_by_model(text, model): 18 | """ 19 | Get token by model 20 | :param text: text 21 | :param model: model 22 | :return: num_tokens 23 | """ 24 | encoding = tk.encoding_for_model(model) 25 | num_tokens = len(encoding.encode(text)) 26 | return num_tokens 27 | 28 | def get_token_by_encoding(text, encoding_name): 29 | """ 30 | Get token by encoding 31 | :param text: text 32 | :param encoding: encoding 33 | :return: num_tokens 34 | """ 35 | encoding = tk.get_encoding(encoding_name) 36 | num_tokens = len(encoding.encode(text)) 37 | return num_tokens 38 | 39 | 40 | def test_token_by_model(text_list, model_list): 41 | """ 42 | Test token by model 43 | :param text_list: text list 44 | :param model_list: model list 45 | :return: None 46 | """ 47 | for text in text_list: 48 | for model in model_list: 49 | num_tokens = get_token_by_model(text, model) 50 | print('text: {}, model: {}, token: {}'.format(text, model, num_tokens)) 51 | 52 | def test_token_by_encoding(text_list, encoding_list): 53 | """ 54 | Test token by encoding 55 | :param text_list: text list 56 | :param encoding_list: encoding list 57 | :return: None 58 | """ 59 | for text in text_list: 60 | for encoding in encoding_list: 61 | num_tokens = get_token_by_encoding(text, encoding) 62 | print('text: {}, encoding: {}, token: {}'.format(text, encoding, num_tokens)) 63 | 64 | if __name__ == '__main__': 65 | text_list, model_list, encoding_list = 
read_data_from_file('test/test.txt') 66 | test_token_by_model(text_list, model_list) 67 | print("=====================================") 68 | test_token_by_encoding(text_list, encoding_list) 69 | 70 | -------------------------------------------------------------------------------- /tiktoken.go: -------------------------------------------------------------------------------- 1 | package tiktoken 2 | 3 | import ( 4 | "fmt" 5 | "regexp" 6 | "strings" 7 | 8 | "github.com/dlclark/regexp2" 9 | ) 10 | 11 | var bpeLoader BpeLoader = NewDefaultBpeLoader() 12 | 13 | func SetBpeLoader(loader BpeLoader) { 14 | bpeLoader = loader 15 | } 16 | 17 | func GetEncoding(encodingName string) (*Tiktoken, error) { 18 | enc, err := getEncoding(encodingName) 19 | if err != nil { 20 | return nil, err 21 | } 22 | pbe, err := NewCoreBPE(enc.MergeableRanks, enc.SpecialTokens, enc.PatStr) 23 | if err != nil { 24 | return nil, err 25 | } 26 | specialTokensSet := map[string]any{} 27 | for k := range enc.SpecialTokens { 28 | specialTokensSet[k] = true 29 | } 30 | return NewTiktoken(pbe, enc, specialTokensSet), nil 31 | } 32 | 33 | func EncodingForModel(modelName string) (*Tiktoken, error) { 34 | if encodingName, ok := MODEL_TO_ENCODING[modelName]; ok { 35 | return GetEncoding(encodingName) 36 | } else { 37 | for prefix, encodingName := range MODEL_PREFIX_TO_ENCODING { 38 | if strings.HasPrefix(modelName, prefix) { 39 | return GetEncoding(encodingName) 40 | } 41 | } 42 | } 43 | return nil, fmt.Errorf("no encoding for model %s", modelName) 44 | } 45 | 46 | type Tiktoken struct { 47 | bpe *CoreBPE 48 | pbeEncoding *Encoding 49 | specialTokensSet map[string]any 50 | } 51 | 52 | func (t *Tiktoken) Encode(text string, allowedSpecial []string, disallowedSpecial []string) []int { 53 | var allowedSpecialSet map[string]any 54 | if len(allowedSpecial) == 0 { 55 | allowedSpecialSet = map[string]any{} 56 | } else if len(allowedSpecial) == 1 && allowedSpecial[0] == "all" { 57 | allowedSpecialSet = 
t.specialTokensSet
	} else {
		allowedSpecialSet = map[string]any{}
		for _, v := range allowedSpecial {
			allowedSpecialSet[v] = nil
		}
	}

	disallowedSpecialSet := map[string]any{}
	for _, v := range disallowedSpecial {
		disallowedSpecialSet[v] = nil
	}
	if len(disallowedSpecial) == 1 && disallowedSpecial[0] == "all" {
		// "all" means: every known special token that was not explicitly allowed.
		disallowedSpecialSet = difference(t.specialTokensSet, allowedSpecialSet)
	}

	if len(disallowedSpecialSet) > 0 {
		specialRegex := t.SpecialTokenRegex(disallowedSpecialSet)
		if m := findRegex2StringMatch(text, specialRegex); m != "" {
			panic(fmt.Sprintf("text contains disallowed special token %s", m))
		}
	}

	tokens, _ := t.bpe.encodeNative(text, allowedSpecialSet)
	return tokens
}

// EncodeOrdinary tokenises text while treating special-token strings as
// ordinary text.
func (t *Tiktoken) EncodeOrdinary(text string) []int {
	return t.bpe.encodeOrdinaryNative(text)
}

// Decode converts token ids back into the original string.
func (t *Tiktoken) Decode(tokens []int) string {
	return string(t.bpe.decodeNative(tokens))
}

// EncoderName reports the underlying encoding's name, e.g. "cl100k_base".
func (t *Tiktoken) EncoderName() string {
	return t.pbeEncoding.Name
}

// SpecialTokenRegex builds an alternation regex that matches any of the given
// special tokens, with regex metacharacters escaped.
func (t *Tiktoken) SpecialTokenRegex(disallowedSpecialSet map[string]any) *regexp2.Regexp {
	specialRegexStrs := make([]string, 0, len(disallowedSpecialSet))
	for k := range disallowedSpecialSet {
		specialRegexStrs = append(specialRegexStrs, regexp.QuoteMeta(k))
	}
	return regexp2.MustCompile(strings.Join(specialRegexStrs, "|"), regexp2.None)
}

// findRegex2StringMatch returns the first match of reg in text, or "" when
// there is none.
func findRegex2StringMatch(text string, reg *regexp2.Regexp) string {
	m, _ := reg.FindStringMatch(text)
	if m == nil {
		return ""
	}
	return m.String()
}

// difference returns the keys of setA that are absent from setB.
func difference(setA, setB map[string]any) map[string]any {
	result := make(map[string]any)
	for k := range setA {
		if _, ok := setB[k]; !ok {
			result[k] = true
		}
	}
	return result
}

125 | // NewTiktoken can be used to create a *Tiktoken with custom parameters. 126 | func NewTiktoken(bpe *CoreBPE, encoding *Encoding, specialTokensSet map[string]any) *Tiktoken { 127 | return &Tiktoken{ 128 | bpe: bpe, 129 | pbeEncoding: encoding, 130 | specialTokensSet: specialTokensSet, 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /tiktoken_test.go: -------------------------------------------------------------------------------- 1 | package tiktoken 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestEncoding(t *testing.T) { 10 | ass := assert.New(t) 11 | enc, err := EncodingForModel("gpt-3.5-turbo-16k") 12 | ass.Nil(err, "Encoding init should not be nil") 13 | tokens := enc.Encode("hello world!你好,世界!", []string{"all"}, []string{"all"}) 14 | // these tokens are converted from the original python code 15 | sourceTokens := []int{15339, 1917, 0, 57668, 53901, 3922, 3574, 244, 98220, 6447} 16 | ass.ElementsMatch(sourceTokens, tokens, "Encoding should be equal") 17 | 18 | tokens = enc.Encode("hello <|endoftext|>", []string{"<|endoftext|>"}, nil) 19 | sourceTokens = []int{15339, 220, 100257} 20 | ass.ElementsMatch(sourceTokens, tokens, "Encoding should be equal") 21 | 22 | tokens = enc.Encode("hello <|endoftext|>", []string{"<|endoftext|>"}, []string{"all"}) 23 | sourceTokens = []int{15339, 220, 100257} 24 | ass.ElementsMatch(sourceTokens, tokens, "Encoding should be equal") 25 | 26 | ass.Panics(func() { 27 | tokens = enc.Encode("hello <|endoftext|><|endofprompt|>", []string{"<|endoftext|>"}, []string{"all"}) 28 | }) 29 | ass.Panics(func() { 30 | tokens = enc.Encode("hello <|endoftext|>", []string{"<|endoftext|>"}, []string{"<|endoftext|>"}) 31 | }) 32 | } 33 | 34 | func TestDecoding(t *testing.T) { 35 | ass := assert.New(t) 36 | // enc, err := GetEncoding("cl100k_base") 37 | enc, err := GetEncoding(MODEL_CL100K_BASE) 38 | ass.Nil(err, "Encoding init should not be 
nil") 39 | sourceTokens := []int{15339, 1917, 0, 57668, 53901, 3922, 3574, 244, 98220, 6447} 40 | txt := enc.Decode(sourceTokens) 41 | ass.Equal("hello world!你好,世界!", txt, "Decoding should be equal") 42 | } 43 | --------------------------------------------------------------------------------