├── .github └── workflows │ └── go.yml ├── LICENSE ├── README.md ├── cmd └── tokenizer │ └── main.go ├── codec ├── cl100k_base.go ├── cl100k_base_vocab.go ├── codec.go ├── o200k_base.go ├── o200k_base_vocab.go ├── p50k_base.go ├── p50k_base_vocab.go ├── p50k_edit.go ├── r50k_base.go ├── r50k_base_vocab.go └── vocab.go ├── go.mod ├── go.sum ├── internal └── cmd │ └── vocab.go ├── tokenizer.go └── tokenizer_test.go /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a golang project 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-go 3 | 4 | name: Go 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | 14 | build: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v3 18 | 19 | - name: Set up Go 20 | uses: actions/setup-go@v3 21 | with: 22 | go-version: "1.24" 23 | 24 | - name: Build 25 | run: go build -v ./... 26 | 27 | - name: Test 28 | run: go test -v ./... 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 tiktoken-go 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Tests](https://github.com/tiktoken-go/tokenizer/actions/workflows/go.yml/badge.svg) 2 | 3 | # Tokenizer 4 | 5 | This is a pure go port of OpenAI's tokenizer. 
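It provides the o200k_base, cl100k_base, p50k_base, p50k_edit and r50k_base encodings (see the Todo list below), usable both as a library and through a small command-line tool.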
6 | 
7 | Buy Me A Coffee
8 | 
9 | ## Usage
10 | 
11 | ```go
12 | package main
13 | 
14 | import (
15 |     "fmt"
16 |     "github.com/tiktoken-go/tokenizer"
17 | )
18 | 
19 | func main() {
20 |     enc, err := tokenizer.Get(tokenizer.Cl100kBase)
21 |     if err != nil {
22 |         panic("oh oh")
23 |     }
24 | 
25 |     // this should print a list of token ids
26 |     ids, _, _ := enc.Encode("supercalifragilistic")
27 |     fmt.Println(ids)
28 | 
29 |     // this should print the original string back
30 |     text, _ := enc.Decode(ids)
31 |     fmt.Println(text)
32 | }
33 | ```
34 | 
35 | Alternatively, you can use the included command-line tool (the help below mirrors the flags defined in cmd/tokenizer/main.go):
36 | 
37 | ```sh
38 | > tokenizer -h
39 | 
40 | Usage of tokenizer:
41 |   -decode string
42 |         space separated list of token ids to decode
43 |   -encode string
44 |         text to encode
45 |   -encoding string
46 |         the encoding format. (takes precedence over -model when set)
47 |   -list-encodings
48 |         list all supported encoding formats
49 |   -list-models
50 |         list all supported models
51 |   -model string
52 |         the target OpenAI model to generate tokens for (default "gpt-3.5-turbo")
53 |   -tokens
54 |         if true will output the tokens instead of the token ids
55 | 
56 | > tokenizer -encode supercalifragilistic
57 | 13066 3035 278 333 4193 321 4633
58 | ```
59 | 
60 | ## Todo
61 | 
62 | - ✅ port code
63 | - ✅ o200k_base encoding
64 | - ✅ cl100k_base encoding
65 | - ✅ r50k_base encoding
66 | - ✅ p50k_base encoding
67 | - ✅ p50k_edit encoding
68 | - ✅ tests
69 | - ❌ handle special tokens
70 | - ❌ gpt-2 model
71 | 
72 | ## Caveats
73 | 
74 | This library embeds OpenAI's vocabularies, which are not small (~4 MB), as Go
75 | maps. This differs from the Python version of tiktoken, which downloads the
76 | dictionaries at runtime and keeps them in a cache folder.
77 | 
78 | However, since the dictionaries are compiled in during the Go build,
79 | performance and start-up times should be better than downloading and loading
80 | them at runtime.
81 | 
82 | ## Alternatives
83 | 
84 | Here is a list of other libraries that do something similar.
85 | 
86 | - [https://github.com/sugarme/tokenizer](https://github.com/sugarme/tokenizer) (a different tokenizer algorithm than OpenAI's)
87 | - [https://github.com/pandodao/tokenizer-go](https://github.com/pandodao/tokenizer-go) (deprecated, calls into JavaScript)
88 | - [https://github.com/pkoukk/tiktoken-go](https://github.com/pkoukk/tiktoken-go)
89 | 
--------------------------------------------------------------------------------
/cmd/tokenizer/main.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 |     "flag"
5 |     "fmt"
6 |     "log"
7 |     "os"
8 |     "strconv"
9 |     "strings"
10 | 
11 |     "github.com/tiktoken-go/tokenizer"
12 | )
13 | 
14 | func main() {
15 |     model := flag.String("model", "gpt-3.5-turbo", "the target OpenAI model to generate tokens for")
16 |     encoding := flag.String("encoding", "", "the encoding format. 
(takes precedence over -model when set)")
17 |     encode := flag.String("encode", "", "text to encode")
18 |     decode := flag.String("decode", "", "space separated list of token ids to decode")
19 |     emitTokens := flag.Bool("tokens", false, "if true will output the tokens instead of the token ids")
20 |     listModels := flag.Bool("list-models", false, "list all supported models")
21 |     listEncodings := flag.Bool("list-encodings", false, "list all supported encoding formats")
22 |     flag.Parse()
23 | 
24 |     if *listModels {
25 |         printModels()
26 |         os.Exit(0)
27 |     }
28 | 
29 |     if *listEncodings {
30 |         printEncodings()
31 |         os.Exit(0)
32 |     }
33 | 
34 |     // at least one of model or encoding must be specified
35 |     if *model == "" && *encoding == "" {
36 |         flag.PrintDefaults()
37 |         os.Exit(1)
38 |     }
39 | 
40 |     // exactly one of the encode or decode operations must be requested
41 |     if (*encode != "" && *decode != "") || (*encode == "" && *decode == "") {
42 |         flag.PrintDefaults()
43 |         os.Exit(1)
44 |     }
45 | 
46 |     codec := getCodec(*model, *encoding)
47 | 
48 |     if *encode != "" {
49 |         encodeInput(codec, *encode, *emitTokens)
50 |     } else {
51 |         decodeInput(codec, *decode+" "+strings.Join(flag.Args(), " "))
52 |     }
53 | }
54 | 
55 | func getCodec(model, encoding string) tokenizer.Codec {
56 |     // check encoding first: -model always has a default value, so checking
57 |     // it first would silently ignore an explicitly requested -encoding
58 |     if encoding != "" {
59 |         c, err := tokenizer.Get(tokenizer.Encoding(encoding))
60 |         if err != nil {
61 |             log.Fatalf("error creating tokenizer: %v", err)
62 |         }
63 |         return c
64 |     }
65 | 
66 |     c, err := tokenizer.ForModel(tokenizer.Model(model))
67 |     if err != nil {
68 |         log.Fatalf("error creating tokenizer: %v", err)
69 |     }
70 |     return c
71 | }
72 | 
73 | func encodeInput(codec tokenizer.Codec, text string, wantTokens bool) {
74 |     ids, tokens, err := codec.Encode(text)
75 |     if err != nil {
76 |         log.Fatalf("error encoding: %v", err)
77 |     }
78 | 
79 |     if wantTokens {
80 |         fmt.Println(strings.Join(tokens, " "))
81 |     } else {
82 |         var textIds []string
83 |         for _, id := range ids {
84 |             textIds = append(textIds, strconv.Itoa(int(id)))
85 |         }
86 |         fmt.Println(strings.Join(textIds, " "))
87 |     }
88 | }
89 | 
90 | func decodeInput(codec tokenizer.Codec, tokens string) {
91 |     var ids []uint
92 |     // Fields (unlike Split) ignores the extra whitespace introduced when the
93 |     // -decode value is joined with any remaining positional arguments
94 |     for _, t := range strings.Fields(tokens) {
95 |         id, err := strconv.Atoi(t)
96 |         if err != nil {
97 |             log.Fatalf("invalid token id: %s", t)
98 |         }
99 |         ids = append(ids, uint(id))
100 |     }
101 | 
102 |     text, err := codec.Decode(ids)
103 |     if err != nil {
104 |         log.Fatalf("error decoding: %v", err)
105 |     }
106 |     fmt.Println(text)
107 | }
108 | 
109 | func printEncodings() {
110 |     encodings := []tokenizer.Encoding{
111 |         tokenizer.R50kBase,
112 |         tokenizer.P50kBase,
113 |         tokenizer.P50kEdit,
114 |         tokenizer.Cl100kBase,
115 |         tokenizer.O200kBase,
116 |     }
117 | 
118 |     for _, e := range encodings {
119 |         fmt.Println(e)
120 |     }
121 | }
122 | 
123 | func printModels() {
124 |     models := []tokenizer.Model{
125 |         tokenizer.O1Preview,
126 |         tokenizer.O1Mini,
127 |         tokenizer.O3Mini,
128 |         tokenizer.GPT4o,
129 |         tokenizer.GPT4,
130 |         tokenizer.GPT35Turbo,
131 |         tokenizer.TextEmbeddingAda002,
132 |         tokenizer.TextDavinci003,
133 |         tokenizer.TextDavinci002,
134 |         tokenizer.CodeDavinci002,
135 |         tokenizer.CodeDavinci001,
136 |         tokenizer.CodeCushman002,
137 |         tokenizer.CodeCushman001,
138 |         tokenizer.DavinciCodex,
139 |         tokenizer.CushmanCodex,
140 |         tokenizer.TextDavinci001,
141 |         tokenizer.TextCurie001,
142 |         tokenizer.TextBabbage001,
143 |         tokenizer.TextAda001,
144 |         tokenizer.Davinci,
145 |         tokenizer.Curie,
146 |         tokenizer.Babbage,
147 |         tokenizer.Ada,
148 |         tokenizer.TextSimilarityDavinci001,
149 |         tokenizer.TextSimilarityCurie001,
150 |         tokenizer.TextSimilarityBabbage001,
151 |         
tokenizer.TextSimilarityAda001,
152 |         tokenizer.TextSearchDavinciDoc001,
153 |         tokenizer.TextSearchCurieDoc001,
154 |         tokenizer.TextSearchAdaDoc001,
155 |         tokenizer.TextSearchBabbageDoc001,
156 |         tokenizer.CodeSearchBabbageCode001,
157 |         tokenizer.CodeSearchAdaCode001,
158 |         tokenizer.TextDavinciEdit001,
159 |         tokenizer.CodeDavinciEdit001,
160 |     }
161 | 
162 |     for _, m := range models {
163 |         fmt.Println(m)
164 |     }
165 | }
166 | 
--------------------------------------------------------------------------------
/codec/cl100k_base.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | import "github.com/dlclark/regexp2"
4 | 
5 | func NewCl100kBase() *Codec {
6 |     cl100kBaseVocabOnce.Do(cl100kBaseVocabInit)
7 | 
8 |     splitRegexp := regexp2.MustCompile(`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`, regexp2.None)
9 | 
10 |     return &Codec{
11 |         name:        "cl100k_base",
12 |         vocabulary:  cl100kBaseVocab,
13 |         splitRegexp: splitRegexp,
14 |         specialTokens: map[string]uint{
15 |             "<|endoftext|>":   100257,
16 |             "<|fim_prefix|>":  100258,
17 |             "<|fim_middle|>":  100259,
18 |             "<|fim_suffix|>":  100260,
19 |             "<|endofprompt|>": 100276,
20 |         },
21 |     }
22 | }
23 | 
--------------------------------------------------------------------------------
/codec/codec.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | import (
4 |     "fmt"
5 |     "math"
6 | 
7 |     "github.com/dlclark/regexp2"
8 | )
9 | 
10 | type Codec struct {
11 |     vocabulary        vocab
12 |     reverseVocabulary reverse
13 |     specialTokens     map[string]uint
14 |     splitRegexp       *regexp2.Regexp
15 |     name              string
16 | }
17 | 
18 | func (c *Codec) GetName() string {
19 |     return c.name
20 | }
21 | 
22 | // Count returns the number of tokens in the input string.
23 | func (c *Codec) Count(input string) (int, error) {
24 |     var count int
25 | 
26 |     err := c.tokenize(input, func(_ uint, _ string) {
27 |         count++
28 |     })
29 | 
30 |     return count, err
31 | }
32 | 
33 | // Encode returns the token IDs and tokens for the input string. 
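// For example, with the cl100k_base codec, Encode("hello world") returns
// the IDs [15339, 1917] together with the matching token substrings
// ["hello", " world"] (the IDs come from the test vectors in tokenizer_test.go).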
34 | func (c *Codec) Encode(input string) ([]uint, []string, error) { 35 | 36 | var ids []uint 37 | var tokens []string 38 | 39 | err := c.tokenize(input, func(id uint, token string) { 40 | ids = append(ids, id) 41 | tokens = append(tokens, token) 42 | }) 43 | 44 | return ids, tokens, err 45 | } 46 | 47 | func (c *Codec) tokenize(input string, yield func(uint, string)) error { 48 | match, err := c.splitRegexp.FindStringMatch(input) 49 | if err != nil { 50 | return fmt.Errorf("error matching: %v", err) 51 | } 52 | for match != nil { 53 | piece := match.String() 54 | if id, ok := c.vocabulary[piece]; ok { 55 | yield(id, piece) 56 | } else { 57 | parts := c.mergePairs(piece) 58 | 59 | for i := range len(parts) - 1 { 60 | token := piece[parts[i].offset:parts[i+1].offset] 61 | yield(c.vocabulary[token], token) 62 | } 63 | } 64 | match, err = c.splitRegexp.FindNextMatch(match) 65 | if err != nil { 66 | return fmt.Errorf("error matching: %v", err) 67 | } 68 | } 69 | 70 | return nil 71 | } 72 | 73 | func (c *Codec) Decode(tokens []uint) (string, error) { 74 | if c.reverseVocabulary == nil { 75 | c.reverseVocabulary = make(map[uint]string) 76 | for k, v := range c.vocabulary { 77 | c.reverseVocabulary[v] = k 78 | } 79 | } 80 | 81 | var out string 82 | for _, t := range tokens { 83 | piece, ok := c.reverseVocabulary[t] 84 | if !ok { 85 | return "", fmt.Errorf("invalid token: %d", t) 86 | } 87 | out += piece 88 | } 89 | return out, nil 90 | } 91 | 92 | type part struct { 93 | offset int 94 | rank uint 95 | } 96 | 97 | func (c *Codec) mergePairs(piece string) []part { 98 | parts := make([]part, len(piece)+1) 99 | for i := range len(parts) { 100 | parts[i] = part{i, math.MaxUint} 101 | } 102 | 103 | getRank := func(index, skip int) uint { 104 | if index+skip+2 < len(parts) { 105 | start := parts[index].offset 106 | end := parts[index+skip+2].offset 107 | if rank, ok := c.vocabulary[piece[start:end]]; ok { 108 | return rank 109 | } 110 | } 111 | return math.MaxUint 112 | } 113 | 114 | for i := 0; i < len(parts)-2; i++ { 115 | parts[i].rank = getRank(i, 0) 116 | } 117 | 118 | for { 119 | if len(parts) == 1 { 120 | break 121 | } 122 | 123 | minRank := uint(math.MaxUint) 124 | minIndex := 0 125 | for i, p := range parts[:len(parts)-1] { 126 | if p.rank < minRank { 127 | minRank = p.rank 128 | minIndex = i 129 | } 130 | } 131 | 132 | if minRank == math.MaxUint { 133 | break 134 | } 135 | 136 | parts[minIndex].rank = getRank(minIndex, 1) 137 | 138 | if minIndex > 0 { 139 | parts[minIndex-1].rank = getRank(minIndex-1, 1) 140 | } 141 | 142 | parts = append(parts[:minIndex+1], parts[minIndex+2:]...) 
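        // the append above removes the boundary entry at minIndex+1, merging
        // parts[minIndex] with its right neighbour into a single token; the
        // rank updates just before it already account for the new pairing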
143 |     }
144 | 
145 |     return parts
146 | }
147 | 
--------------------------------------------------------------------------------
/codec/o200k_base.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | import "github.com/dlclark/regexp2"
4 | 
5 | func NewO200kBase() *Codec {
6 |     o200kBaseVocabOnce.Do(o200kBaseVocabInit)
7 | 
8 |     splitRegexp := regexp2.MustCompile(
9 |         `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
10 |         regexp2.None)
11 | 
12 |     return &Codec{
13 |         name:        "o200k_base",
14 |         vocabulary:  o200kBaseVocab,
15 |         splitRegexp: splitRegexp,
16 |         specialTokens: map[string]uint{
17 |             "<|endoftext|>":   199999,
18 |             "<|endofprompt|>": 200018,
19 |         },
20 |     }
21 | }
22 | 
--------------------------------------------------------------------------------
/codec/p50k_base.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | import "github.com/dlclark/regexp2"
4 | 
5 | func NewP50kBase() *Codec {
6 |     p50kBaseVocabOnce.Do(p50kBaseVocabInit)
7 | 
8 |     splitRegexp := regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None)
9 | 
10 |     return &Codec{
11 |         name:        "p50k_base",
12 |         vocabulary:  p50kBaseVocab,
13 |         splitRegexp: splitRegexp,
14 |         specialTokens: map[string]uint{
15 |             "<|endoftext|>": 50256,
16 |         },
17 |     }
18 | }
19 | 
--------------------------------------------------------------------------------
/codec/p50k_edit.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | import "github.com/dlclark/regexp2"
4 | 
5 | func NewP50kEdit() *Codec {
6 |     p50kBaseVocabOnce.Do(p50kBaseVocabInit)
7 | 
8 |     splitRegexp := regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None)
9 | 
10 |     return &Codec{
11 |         name:        "p50k_edit",
12 |         vocabulary:  p50kBaseVocab,
13 |         splitRegexp: splitRegexp,
14 |         specialTokens: map[string]uint{
15 |             "<|endoftext|>":  50256,
16 |             "<|fim_prefix|>": 50281,
17 |             "<|fim_middle|>": 50282,
18 |             "<|fim_suffix|>": 50283,
19 |         },
20 |     }
21 | }
22 | 
--------------------------------------------------------------------------------
/codec/r50k_base.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | import "github.com/dlclark/regexp2"
4 | 
5 | func NewR50kBase() *Codec {
6 |     r50kBaseVocabOnce.Do(r50kBaseVocabInit)
7 | 
8 |     splitRegexp := regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None)
9 | 
10 |     return &Codec{
11 |         name:        "r50k_base",
12 |         vocabulary:  r50kBaseVocab,
13 |         splitRegexp: splitRegexp,
14 |         specialTokens: map[string]uint{
15 |             "<|endoftext|>": 50256,
16 |         },
17 |     }
18 | }
19 | 
--------------------------------------------------------------------------------
/codec/vocab.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | type vocab map[string]uint
4 | type reverse map[uint]string
5 | 
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/tiktoken-go/tokenizer
2 | 
3 | go 1.24
4 | 
5 | 
tool github.com/dlclark/regexp2cg
6 | 
7 | require github.com/dlclark/regexp2 v1.11.5
8 | 
9 | require (
10 |     github.com/dlclark/regexp2cg v0.2.0 // indirect
11 |     github.com/pkg/errors v0.9.1 // indirect
12 | )
13 | 
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/dlclark/regexp2 v1.11.5 h1:Q/sSnsKerHeCkc/jSTNq1oCm7KiVgUMZRDUoRu0JQZQ=
2 | github.com/dlclark/regexp2 v1.11.5/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
3 | github.com/dlclark/regexp2cg v0.2.0 h1:YTk+oP9dO74myroxiopnf/zlGOSuTGIuYhRx769YFk4=
4 | github.com/dlclark/regexp2cg v0.2.0/go.mod h1:K2c4ctxtSQjzgeMKKgi1rEflZVVJWZWlUUdmtjOp/y8=
5 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
6 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
7 | 
--------------------------------------------------------------------------------
/internal/cmd/vocab.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 |     "bufio"
5 |     "bytes"
6 |     "encoding/base64"
7 |     "flag"
8 |     "fmt"
9 |     "go/format"
10 |     "io"
11 |     "log"
12 |     "net/http"
13 |     "os"
14 |     "strconv"
15 |     "strings"
16 | )
17 | 
18 | const (
19 |     packageName = "codec"
20 | )
21 | 
22 | type config struct {
23 |     url      string
24 |     mapName  string
25 |     filename string
26 | }
27 | 
28 | func main() {
29 |     encoding := flag.String("encoding", "", "encoding format. (e.g. cl100k_base)")
30 |     flag.Parse()
31 | 
32 |     // flag.String never returns nil, so test against the empty default instead
33 |     if *encoding == "" {
34 |         flag.PrintDefaults()
35 |         os.Exit(1)
36 |     }
37 | 
38 |     cfg := getConfig(*encoding)
39 | 
40 |     buf := new(bytes.Buffer)
41 |     generatePreamble(buf, *encoding)
42 |     generateVocabulary(buf, cfg.mapName, cfg.url)
43 | 
44 |     src, err := format.Source(buf.Bytes())
45 |     if err != nil {
46 |         log.Fatalf("error preparing source: %v", err)
47 |     }
48 | 
49 |     if err := os.WriteFile(cfg.filename, src, 0o644); err != nil {
50 |         log.Fatalf("error writing file: %v", err)
51 |     }
52 | }
53 | 
54 | func generatePreamble(w io.Writer, encoding string) {
55 |     fmt.Fprintf(w, "// Code generated by internal/cmd/vocab.go. 
DO NOT EDIT.\n\n") 55 | fmt.Fprintf(w, "//go:generate go run ../internal/cmd/vocab.go -encoding %s\n\n", encoding) 56 | fmt.Fprintf(w, "package %s\n", packageName) 57 | } 58 | 59 | func generateVocabulary(w io.Writer, mapName string, uri string) { 60 | resp, err := http.Get(uri) 61 | if err != nil { 62 | log.Fatalf("error fetching file: %v", err) 63 | } 64 | defer resp.Body.Close() 65 | 66 | fmt.Fprintf(w, "import \"sync\"\n") 67 | fmt.Fprintf(w, "var (\n") 68 | fmt.Fprintf(w, "%v vocab\n", mapName) 69 | fmt.Fprintf(w, "%vOnce sync.Once\n", mapName) 70 | fmt.Fprintf(w, ")\n") 71 | fmt.Fprintf(w, "func %sInit() {\n", mapName) 72 | fmt.Fprintf(w, "%s = vocab{\n", mapName) 73 | 74 | scanner := bufio.NewScanner(resp.Body) 75 | for scanner.Scan() { 76 | line := scanner.Text() 77 | 78 | wordInput, idInput, ok := strings.Cut(line, " ") 79 | if !ok { 80 | log.Fatalf("invalid line: %q", line) 81 | } 82 | 83 | word, err := base64.StdEncoding.DecodeString(wordInput) 84 | if err != nil { 85 | log.Fatalf("invalid word: %q", wordInput) 86 | } 87 | 88 | id, err := strconv.ParseUint(idInput, 10, 0) 89 | if err != nil { 90 | log.Fatalf("invalid id: %q", idInput) 91 | } 92 | 93 | fmt.Fprintf(w, "%q: %d,\n", word, id) 94 | } 95 | 96 | fmt.Fprintf(w, "}\n}\n") 97 | } 98 | 99 | func getConfig(encoding string) config { 100 | switch encoding { 101 | case "o200k_base": 102 | return config{ 103 | mapName: "o200kBaseVocab", 104 | url: "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken", 105 | filename: "o200k_base_vocab.go", 106 | } 107 | case "cl100k_base": 108 | return config{ 109 | mapName: "cl100kBaseVocab", 110 | url: "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken", 111 | filename: "cl100k_base_vocab.go", 112 | } 113 | case "r50k_base": 114 | return config{ 115 | mapName: "r50kBaseVocab", 116 | url: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken", 117 | filename: "r50k_base_vocab.go", 118 | } 119 | case "p50k_base": 120 | return config{ 121 | mapName: "p50kBaseVocab", 122 | url: "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken", 123 | filename: "p50k_base_vocab.go", 124 | } 125 | default: 126 | log.Fatal("config not found") 127 | return config{} 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /tokenizer.go: -------------------------------------------------------------------------------- 1 | package tokenizer 2 | 3 | // Package tokenizer provides functions for encoding and decoding text using 4 | // different tokenization schemes. 
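// Each scheme pairs a splitting regular expression with an embedded BPE
// vocabulary: input is first split into pieces, and any piece that is not
// itself a vocabulary entry is byte-pair merged (see codec.mergePairs).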
5 | //
6 | // Encoding Formats
7 | //
8 | // The following encoding formats are supported:
9 | //  - O200kBase
10 | //  - Cl100kBase
11 | //  - R50kBase
12 | //  - P50kBase
13 | //  - P50kEdit
14 | //
15 | // Alternatively, you can request a tokenizer using OpenAI's model name; the
16 | // following OpenAI models are supported:
17 | //  - O1Preview
18 | //  - O1Mini
19 | //  - O3Mini
20 | //  - GPT4o
21 | //  - GPT4
22 | //  - GPT35Turbo
23 | //  - TextEmbeddingAda002
24 | //  - TextDavinci003
25 | //  - TextDavinci002
26 | //  - CodeDavinci002
27 | //  - CodeDavinci001
28 | //  - CodeCushman002
29 | //  - CodeCushman001
30 | //  - DavinciCodex
31 | //  - CushmanCodex
32 | //  - TextDavinci001
33 | //  - TextCurie001
34 | //  - TextBabbage001
35 | //  - TextAda001
36 | //  - Davinci
37 | //  - Curie
38 | //  - Babbage
39 | //  - Ada
40 | //  - TextSimilarityDavinci001
41 | //  - TextSimilarityCurie001
42 | //  - TextSimilarityBabbage001
43 | //  - TextSimilarityAda001
44 | //  - TextSearchDavinciDoc001
45 | //  - TextSearchCurieDoc001
46 | //  - TextSearchAdaDoc001
47 | //  - TextSearchBabbageDoc001
48 | //  - CodeSearchBabbageCode001
49 | //  - CodeSearchAdaCode001
50 | //  - TextDavinciEdit001
51 | //  - CodeDavinciEdit001
52 | //
53 | // Usage Example
54 | //
55 | // Here is an example of how to encode a string using the `Get` function:
56 | //
57 | //	package main
58 | //
59 | //	import (
60 | //		"fmt"
61 | //		"github.com/tiktoken-go/tokenizer"
62 | //	)
63 | //
64 | //	func main() {
65 | //		enc, err := tokenizer.Get(tokenizer.Cl100kBase)
66 | //		if err != nil {
67 | //			panic("oh oh")
68 | //		}
69 | //
70 | //		// this should print a list of token ids
71 | //		ids, _, _ := enc.Encode("supercalifragilistic")
72 | //		fmt.Println(ids)
73 | //
74 | //		// this should print the original string back
75 | //		text, _ := enc.Decode(ids)
76 | //		fmt.Println(text)
77 | //	}
78 | 
79 | import (
80 |     "errors"
81 |     "strings"
82 | 
83 |     "github.com/tiktoken-go/tokenizer/codec"
84 | )
85 | 
86 | var (
87 |     ErrModelNotSupported    = errors.New("model not supported")
88 |     ErrEncodingNotSupported = errors.New("encoding not supported")
89 | )
90 | 
91 | type Codec interface {
92 |     GetName() string
93 |     Count(string) (int, error)
94 |     Encode(string) ([]uint, []string, error)
95 |     Decode([]uint) (string, error)
96 | }
97 | 
98 | type Model string
99 | 
100 | const (
101 |     O1Preview Model = "o1-preview"
102 |     O1Mini Model = "o1-mini"
103 |     O3Mini Model = "o3-mini"
104 |     GPT4o Model = "gpt-4o"
105 |     GPT4 Model = "gpt-4"
106 |     GPT35Turbo Model = "gpt-3.5-turbo"
107 |     GPT35 Model = "gpt-3.5"
108 |     TextEmbeddingAda002 Model = "text-embedding-ada-002"
109 |     TextDavinci003 Model = "text-davinci-003"
110 |     TextDavinci002 Model = "text-davinci-002"
111 |     CodeDavinci002 Model = "code-davinci-002"
112 |     CodeDavinci001 Model = "code-davinci-001"
113 |     CodeCushman002 Model = "code-cushman-002"
114 |     CodeCushman001 Model = "code-cushman-001"
115 |     DavinciCodex Model = "davinci-codex"
116 |     CushmanCodex Model = "cushman-codex"
117 |     TextDavinci001 Model = "text-davinci-001"
118 |     TextCurie001 Model = "text-curie-001"
119 |     TextBabbage001 Model = "text-babbage-001"
120 |     TextAda001 Model = "text-ada-001"
121 |     Davinci Model = "davinci"
122 |     Curie Model = "curie"
123 |     Babbage Model = "babbage"
124 |     Ada Model = "ada"
125 |     TextSimilarityDavinci001 Model = "text-similarity-davinci-001"
126 |     TextSimilarityCurie001 Model = "text-similarity-curie-001"
127 |     TextSimilarityBabbage001 Model = "text-similarity-babbage-001"
128 |     TextSimilarityAda001 Model = "text-similarity-ada-001"
129 |     TextSearchDavinciDoc001 Model = 
"text-search-davinci-doc-001" 127 | TextSearchCurieDoc001 Model = "text-search-curie-doc-001" 128 | TextSearchAdaDoc001 Model = "text-search-ada-doc-001" 129 | TextSearchBabbageDoc001 Model = "text-search-babbage-doc-001" 130 | CodeSearchBabbageCode001 Model = "code-search-babbage-code-001" 131 | CodeSearchAdaCode001 Model = "code-search-ada-code-001" 132 | TextDavinciEdit001 Model = "text-davinci-edit-001" 133 | CodeDavinciEdit001 Model = "code-davinci-edit-001" 134 | GPT2 Model = "gpt2" 135 | ) 136 | 137 | type Encoding string 138 | 139 | const ( 140 | GPT2Enc Encoding = "gpt2" 141 | R50kBase Encoding = "r50k_base" 142 | P50kBase Encoding = "p50k_base" 143 | P50kEdit Encoding = "p50k_edit" 144 | Cl100kBase Encoding = "cl100k_base" 145 | O200kBase Encoding = "o200k_base" 146 | ) 147 | 148 | var modelPrefixToEncoding map[Model]Encoding = map[Model]Encoding{ 149 | "o1-": O200kBase, 150 | // chat 151 | "chatgpt-4o-": O200kBase, 152 | "gpt-4o-": O200kBase, 153 | "gpt-4-": Cl100kBase, 154 | "gpt-3.5-turbo-": Cl100kBase, 155 | "gpt-35-turbo-": Cl100kBase, 156 | // fine-tuned 157 | "ft:gpt-4": Cl100kBase, 158 | "ft:gpt-3.5-turbo": Cl100kBase, 159 | "ft:davinci-002": Cl100kBase, 160 | "ft:babbage-002": Cl100kBase, 161 | } 162 | 163 | // Get returns a new instance of a Codec implementation based on the specified 164 | // encoding format. The returned Codec instance can be used to encode (tokenize) 165 | // and decode (reassemble) text. If the specified encoding is not supported, 166 | // an error is returned. 167 | func Get(encoding Encoding) (Codec, error) { 168 | switch encoding { 169 | case O200kBase: 170 | return codec.NewO200kBase(), nil 171 | case Cl100kBase: 172 | return codec.NewCl100kBase(), nil 173 | case R50kBase: 174 | return codec.NewR50kBase(), nil 175 | case P50kBase: 176 | return codec.NewP50kBase(), nil 177 | case P50kEdit: 178 | return codec.NewP50kEdit(), nil 179 | default: 180 | return nil, ErrEncodingNotSupported 181 | } 182 | } 183 | 184 | // ForModel returns a new instance of a Codec implementation based on the 185 | // specified OpenAI model. If the specified model is not supported, an error 186 | // is returned. 
187 | func ForModel(model Model) (Codec, error) { 188 | switch model { 189 | case O1Preview, O1Mini, GPT4o, O3Mini: 190 | return Get(O200kBase) 191 | 192 | case GPT4, GPT35, GPT35Turbo, TextEmbeddingAda002: 193 | return Get(Cl100kBase) 194 | 195 | case TextDavinci003, TextDavinci002, CodeDavinci001, 196 | CodeDavinci002, CodeCushman002, CodeCushman001, 197 | DavinciCodex, CushmanCodex: 198 | return Get(P50kBase) 199 | 200 | case TextDavinci001, TextCurie001, TextBabbage001, TextAda001, Davinci, 201 | Curie, Babbage, Ada, TextSimilarityDavinci001, TextSimilarityCurie001, 202 | TextSimilarityBabbage001, TextSimilarityAda001, TextSearchDavinciDoc001, 203 | TextSearchCurieDoc001, TextSearchAdaDoc001, TextSearchBabbageDoc001, 204 | CodeSearchBabbageCode001, CodeSearchAdaCode001: 205 | return Get(R50kBase) 206 | 207 | case TextDavinciEdit001, CodeDavinciEdit001: 208 | return Get(P50kEdit) 209 | 210 | case GPT2: 211 | return Get(GPT2Enc) 212 | default: 213 | for prefix, enc := range modelPrefixToEncoding { 214 | if strings.HasPrefix(string(model), string(prefix)) { 215 | return Get(enc) 216 | } 217 | } 218 | return nil, ErrModelNotSupported 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /tokenizer_test.go: -------------------------------------------------------------------------------- 1 | package tokenizer_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/tiktoken-go/tokenizer" 7 | ) 8 | 9 | type testCase struct { 10 | text string 11 | ids []uint 12 | } 13 | 14 | func TestO200kBase(t *testing.T) { 15 | tok, err := tokenizer.Get(tokenizer.O200kBase) 16 | if err != nil { 17 | t.Fatalf("can't create tokenizer: %v", err) 18 | } 19 | 20 | tests := []testCase{ 21 | {text: "hello world", ids: []uint{24912, 2375}}, 22 | {text: "hello world", ids: []uint{24912, 220, 2375}}, 23 | {text: "hello world", ids: []uint{24912, 256, 2375}}, 24 | {text: "supercalifragilistic", ids: []uint{17789, 5842, 366, 17764, 311, 6207}}, 25 | {text: "We know what we are, but know not what we may be.", ids: []uint{2167, 1761, 1412, 581, 553, 11, 889, 1761, 625, 1412, 581, 1340, 413, 13}}, 26 | } 27 | 28 | runTests(t, tok, tests) 29 | } 30 | 31 | func TestCl100kBase(t *testing.T) { 32 | tok, err := tokenizer.Get(tokenizer.Cl100kBase) 33 | if err != nil { 34 | t.Fatalf("can't create tokenizer: %v", err) 35 | } 36 | 37 | tests := []testCase{ 38 | {text: "hello world", ids: []uint{15339, 1917}}, 39 | {text: "hello world", ids: []uint{15339, 220, 1917}}, 40 | {text: "hello world", ids: []uint{15339, 256, 1917}}, 41 | {text: "supercalifragilistic", ids: []uint{13066, 3035, 278, 333, 4193, 321, 4633}}, 42 | {text: "We know what we are, but know not what we may be.", ids: []uint{1687, 1440, 1148, 584, 527, 11, 719, 1440, 539, 1148, 584, 1253, 387, 13}}, 43 | } 44 | 45 | runTests(t, tok, tests) 46 | } 47 | 48 | func TestR50kBase(t *testing.T) { 49 | tok, err := tokenizer.Get(tokenizer.R50kBase) 50 | if err != nil { 51 | t.Fatalf("can't create tokenizer: %v", err) 52 | } 53 | 54 | tests := []testCase{ 55 | {text: "hello world", ids: []uint{31373, 995}}, 56 | {text: "hello world", ids: []uint{31373, 220, 995}}, 57 | {text: "hello world", ids: []uint{31373, 220, 220, 995}}, 58 | {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, 59 | {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, 60 | } 61 | 62 | runTests(t, tok, tests) 63 | } 64 | 65 | func 
TestP50kBase(t *testing.T) { 66 | tok, err := tokenizer.Get(tokenizer.P50kBase) 67 | if err != nil { 68 | t.Fatalf("can't create tokenizer: %v", err) 69 | } 70 | 71 | tests := []testCase{ 72 | {text: "hello world", ids: []uint{31373, 995}}, 73 | {text: "hello world", ids: []uint{31373, 220, 995}}, 74 | {text: "hello world", ids: []uint{31373, 50257, 995}}, 75 | {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, 76 | {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, 77 | } 78 | 79 | runTests(t, tok, tests) 80 | } 81 | 82 | func runTests(t *testing.T, tok tokenizer.Codec, tests []testCase) { 83 | for _, test := range tests { 84 | t.Run(test.text, func(t *testing.T) { 85 | ids, _, err := tok.Encode(test.text) 86 | if err != nil { 87 | t.Fatalf("error encoding: %v", err) 88 | } 89 | if !sliceEqual(ids, test.ids) { 90 | t.Errorf("encoding mismatch - want: %v got: %v", test.ids, ids) 91 | } 92 | 93 | text, err := tok.Decode(ids) 94 | if err != nil { 95 | t.Fatalf("error decoding: %v", err) 96 | } 97 | if text != test.text { 98 | t.Errorf("decoding mismatch - want: %s got: %s", test.text, text) 99 | } 100 | 101 | count, err := tok.Count(test.text) 102 | if err != nil { 103 | t.Fatalf("error counting: %v", err) 104 | } 105 | if count != len(test.ids) { 106 | t.Errorf("count mismatch - want: %d got: %d", len(test.ids), count) 107 | } 108 | }) 109 | } 110 | } 111 | 112 | func sliceEqual(a, b []uint) bool { 113 | if len(a) != len(b) { 114 | return false 115 | } 116 | for i, elem := range a { 117 | if elem != b[i] { 118 | return false 119 | } 120 | } 121 | return true 122 | } 123 | --------------------------------------------------------------------------------
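As a closing usage sketch (this program is not a file in the repository), the snippet below ties together `ForModel`, `Count`, and `GetName` from tokenizer.go; the expected count of 14 comes from the o200k_base test vector for the same sentence in tokenizer_test.go.

```go
package main

import (
	"fmt"
	"log"

	"github.com/tiktoken-go/tokenizer"
)

func main() {
	// "gpt-4o" resolves to the o200k_base encoding (see ForModel above)
	enc, err := tokenizer.ForModel(tokenizer.GPT4o)
	if err != nil {
		log.Fatalf("unsupported model: %v", err)
	}

	// Count tokenizes without materializing the id and token slices,
	// which is all you need when budgeting a prompt
	n, err := enc.Count("We know what we are, but know not what we may be.")
	if err != nil {
		log.Fatalf("count failed: %v", err)
	}

	// per the test vectors, this prints: o200k_base: 14 tokens
	fmt.Printf("%s: %d tokens\n", enc.GetName(), n)
}
```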