├── .github └── workflows │ └── go.yml ├── LICENSE ├── README.md ├── cmd └── tokenizer │ └── main.go ├── codec ├── cl100k_base.go ├── cl100k_base_vocab.go ├── codec.go ├── o200k_base.go ├── o200k_base_vocab.go ├── p50k_base.go ├── p50k_base_vocab.go ├── p50k_edit.go ├── r50k_base.go ├── r50k_base_vocab.go └── vocab.go ├── go.mod ├── go.sum ├── internal └── cmd │ └── vocab.go ├── tokenizer.go └── tokenizer_test.go /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a golang project 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-go 3 | 4 | name: Go 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | 14 | build: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v3 18 | 19 | - name: Set up Go 20 | uses: actions/setup-go@v3 21 | with: 22 | go-version: "1.24" 23 | 24 | - name: Build 25 | run: go build -v ./... 26 | 27 | - name: Test 28 | run: go test -v ./... 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 tiktoken-go 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Tests](https://github.com/tiktoken-go/tokenizer/actions/workflows/go.yml/badge.svg) 2 | 3 | # Tokenizer 4 | 5 | This is a pure go port of OpenAI's tokenizer. 
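It provides the o200k_base, cl100k_base, p50k_base, p50k_edit and r50k_base encodings (see the Todo list below), usable both as a library and through a small command-line tool.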
6 | 
7 | Buy Me A Coffee
8 | 
9 | ## Usage
10 | 
11 | ```go
12 | package main
13 | 
14 | import (
15 |     "fmt"
16 |     "github.com/tiktoken-go/tokenizer"
17 | )
18 | 
19 | func main() {
20 |     enc, err := tokenizer.Get(tokenizer.Cl100kBase)
21 |     if err != nil {
22 |         panic("oh oh")
23 |     }
24 | 
25 |     // this should print a list of token ids
26 |     ids, _, _ := enc.Encode("supercalifragilistic")
27 |     fmt.Println(ids)
28 | 
29 |     // this should print the original string back
30 |     text, _ := enc.Decode(ids)
31 |     fmt.Println(text)
32 | }
33 | ```
34 | 
35 | Alternatively, you can use the included command-line tool (the help below mirrors the flags defined in cmd/tokenizer/main.go):
36 | 
37 | ```sh
38 | > tokenizer -h
39 | 
40 | Usage of tokenizer:
41 |   -decode string
42 |         space separated list of token ids to decode
43 |   -encode string
44 |         text to encode
45 |   -encoding string
46 |         the encoding format. (takes precedence over -model when set)
47 |   -list-encodings
48 |         list all supported encoding formats
49 |   -list-models
50 |         list all supported models
51 |   -model string
52 |         the target OpenAI model to generate tokens for (default "gpt-3.5-turbo")
53 |   -tokens
54 |         if true will output the tokens instead of the token ids
55 | 
56 | > tokenizer -encode supercalifragilistic
57 | 13066 3035 278 333 4193 321 4633
58 | ```
59 | 
60 | ## Todo
61 | 
62 | - ✅ port code
63 | - ✅ o200k_base encoding
64 | - ✅ cl100k_base encoding
65 | - ✅ r50k_base encoding
66 | - ✅ p50k_base encoding
67 | - ✅ p50k_edit encoding
68 | - ✅ tests
69 | - ❌ handle special tokens
70 | - ❌ gpt-2 model
71 | 
72 | ## Caveats
73 | 
74 | This library embeds OpenAI's vocabularies, which are not small (~4 MB), as Go
75 | maps. This differs from the Python version of tiktoken, which downloads the
76 | dictionaries at runtime and keeps them in a cache folder.
77 | 
78 | However, since the dictionaries are compiled in during the Go build,
79 | performance and start-up times should be better than downloading and loading
80 | them at runtime.
81 | 
82 | ## Alternatives
83 | 
84 | Here is a list of other libraries that do something similar.
85 | 
86 | - [https://github.com/sugarme/tokenizer](https://github.com/sugarme/tokenizer) (a different tokenizer algorithm than OpenAI's)
87 | - [https://github.com/pandodao/tokenizer-go](https://github.com/pandodao/tokenizer-go) (deprecated, calls into JavaScript)
88 | - [https://github.com/pkoukk/tiktoken-go](https://github.com/pkoukk/tiktoken-go)
89 | 
--------------------------------------------------------------------------------
/cmd/tokenizer/main.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 |     "flag"
5 |     "fmt"
6 |     "log"
7 |     "os"
8 |     "strconv"
9 |     "strings"
10 | 
11 |     "github.com/tiktoken-go/tokenizer"
12 | )
13 | 
14 | func main() {
15 |     model := flag.String("model", "gpt-3.5-turbo", "the target OpenAI model to generate tokens for")
16 |     encoding := flag.String("encoding", "", "the encoding format. 
(takes precedence over -model when set)")
17 |     encode := flag.String("encode", "", "text to encode")
18 |     decode := flag.String("decode", "", "space separated list of token ids to decode")
19 |     emitTokens := flag.Bool("tokens", false, "if true will output the tokens instead of the token ids")
20 |     listModels := flag.Bool("list-models", false, "list all supported models")
21 |     listEncodings := flag.Bool("list-encodings", false, "list all supported encoding formats")
22 |     flag.Parse()
23 | 
24 |     if *listModels {
25 |         printModels()
26 |         os.Exit(0)
27 |     }
28 | 
29 |     if *listEncodings {
30 |         printEncodings()
31 |         os.Exit(0)
32 |     }
33 | 
34 |     // at least one of model or encoding must be specified
35 |     if *model == "" && *encoding == "" {
36 |         flag.PrintDefaults()
37 |         os.Exit(1)
38 |     }
39 | 
40 |     // exactly one of the encode or decode operations must be requested
41 |     if (*encode != "" && *decode != "") || (*encode == "" && *decode == "") {
42 |         flag.PrintDefaults()
43 |         os.Exit(1)
44 |     }
45 | 
46 |     codec := getCodec(*model, *encoding)
47 | 
48 |     if *encode != "" {
49 |         encodeInput(codec, *encode, *emitTokens)
50 |     } else {
51 |         decodeInput(codec, *decode+" "+strings.Join(flag.Args(), " "))
52 |     }
53 | }
54 | 
55 | func getCodec(model, encoding string) tokenizer.Codec {
56 |     // check encoding first: -model always has a default value, so checking
57 |     // it first would silently ignore an explicitly requested -encoding
58 |     if encoding != "" {
59 |         c, err := tokenizer.Get(tokenizer.Encoding(encoding))
60 |         if err != nil {
61 |             log.Fatalf("error creating tokenizer: %v", err)
62 |         }
63 |         return c
64 |     }
65 | 
66 |     c, err := tokenizer.ForModel(tokenizer.Model(model))
67 |     if err != nil {
68 |         log.Fatalf("error creating tokenizer: %v", err)
69 |     }
70 |     return c
71 | }
72 | 
73 | func encodeInput(codec tokenizer.Codec, text string, wantTokens bool) {
74 |     ids, tokens, err := codec.Encode(text)
75 |     if err != nil {
76 |         log.Fatalf("error encoding: %v", err)
77 |     }
78 | 
79 |     if wantTokens {
80 |         fmt.Println(strings.Join(tokens, " "))
81 |     } else {
82 |         var textIds []string
83 |         for _, id := range ids {
84 |             textIds = append(textIds, strconv.Itoa(int(id)))
85 |         }
86 |         fmt.Println(strings.Join(textIds, " "))
87 |     }
88 | }
89 | 
90 | func decodeInput(codec tokenizer.Codec, tokens string) {
91 |     var ids []uint
92 |     // Fields (unlike Split) ignores the extra whitespace introduced when the
93 |     // -decode value is joined with any remaining positional arguments
94 |     for _, t := range strings.Fields(tokens) {
95 |         id, err := strconv.Atoi(t)
96 |         if err != nil {
97 |             log.Fatalf("invalid token id: %s", t)
98 |         }
99 |         ids = append(ids, uint(id))
100 |     }
101 | 
102 |     text, err := codec.Decode(ids)
103 |     if err != nil {
104 |         log.Fatalf("error decoding: %v", err)
105 |     }
106 |     fmt.Println(text)
107 | }
108 | 
109 | func printEncodings() {
110 |     encodings := []tokenizer.Encoding{
111 |         tokenizer.R50kBase,
112 |         tokenizer.P50kBase,
113 |         tokenizer.P50kEdit,
114 |         tokenizer.Cl100kBase,
115 |         tokenizer.O200kBase,
116 |     }
117 | 
118 |     for _, e := range encodings {
119 |         fmt.Println(e)
120 |     }
121 | }
122 | 
123 | func printModels() {
124 |     models := []tokenizer.Model{
125 |         tokenizer.O1Preview,
126 |         tokenizer.O1Mini,
127 |         tokenizer.O3Mini,
128 |         tokenizer.GPT4o,
129 |         tokenizer.GPT4,
130 |         tokenizer.GPT35Turbo,
131 |         tokenizer.TextEmbeddingAda002,
132 |         tokenizer.TextDavinci003,
133 |         tokenizer.TextDavinci002,
134 |         tokenizer.CodeDavinci002,
135 |         tokenizer.CodeDavinci001,
136 |         tokenizer.CodeCushman002,
137 |         tokenizer.CodeCushman001,
138 |         tokenizer.DavinciCodex,
139 |         tokenizer.CushmanCodex,
140 |         tokenizer.TextDavinci001,
141 |         tokenizer.TextCurie001,
142 |         tokenizer.TextBabbage001,
143 |         tokenizer.TextAda001,
144 |         tokenizer.Davinci,
145 |         tokenizer.Curie,
146 |         tokenizer.Babbage,
147 |         tokenizer.Ada,
148 |         tokenizer.TextSimilarityDavinci001,
149 |         tokenizer.TextSimilarityCurie001,
150 |         tokenizer.TextSimilarityBabbage001,
151 |         
tokenizer.TextSimilarityAda001,
152 |         tokenizer.TextSearchDavinciDoc001,
153 |         tokenizer.TextSearchCurieDoc001,
154 |         tokenizer.TextSearchAdaDoc001,
155 |         tokenizer.TextSearchBabbageDoc001,
156 |         tokenizer.CodeSearchBabbageCode001,
157 |         tokenizer.CodeSearchAdaCode001,
158 |         tokenizer.TextDavinciEdit001,
159 |         tokenizer.CodeDavinciEdit001,
160 |     }
161 | 
162 |     for _, m := range models {
163 |         fmt.Println(m)
164 |     }
165 | }
166 | 
--------------------------------------------------------------------------------
/codec/cl100k_base.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | import "github.com/dlclark/regexp2"
4 | 
5 | func NewCl100kBase() *Codec {
6 |     cl100kBaseVocabOnce.Do(cl100kBaseVocabInit)
7 | 
8 |     splitRegexp := regexp2.MustCompile(`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`, regexp2.None)
9 | 
10 |     return &Codec{
11 |         name:        "cl100k_base",
12 |         vocabulary:  cl100kBaseVocab,
13 |         splitRegexp: splitRegexp,
14 |         specialTokens: map[string]uint{
15 |             "<|endoftext|>":   100257,
16 |             "<|fim_prefix|>":  100258,
17 |             "<|fim_middle|>":  100259,
18 |             "<|fim_suffix|>":  100260,
19 |             "<|endofprompt|>": 100276,
20 |         },
21 |     }
22 | }
23 | 
--------------------------------------------------------------------------------
/codec/codec.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | import (
4 |     "fmt"
5 |     "math"
6 | 
7 |     "github.com/dlclark/regexp2"
8 | )
9 | 
10 | type Codec struct {
11 |     vocabulary        vocab
12 |     reverseVocabulary reverse
13 |     specialTokens     map[string]uint
14 |     splitRegexp       *regexp2.Regexp
15 |     name              string
16 | }
17 | 
18 | func (c *Codec) GetName() string {
19 |     return c.name
20 | }
21 | 
22 | // Count returns the number of tokens in the input string.
23 | func (c *Codec) Count(input string) (int, error) {
24 |     var count int
25 | 
26 |     err := c.tokenize(input, func(_ uint, _ string) {
27 |         count++
28 |     })
29 | 
30 |     return count, err
31 | }
32 | 
33 | // Encode returns the token IDs and tokens for the input string. 
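// For example, with the cl100k_base codec, Encode("hello world") returns
// the IDs [15339, 1917] together with the matching token substrings
// ["hello", " world"] (the IDs come from the test vectors in tokenizer_test.go).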
34 | func (c *Codec) Encode(input string) ([]uint, []string, error) { 35 | 36 | var ids []uint 37 | var tokens []string 38 | 39 | err := c.tokenize(input, func(id uint, token string) { 40 | ids = append(ids, id) 41 | tokens = append(tokens, token) 42 | }) 43 | 44 | return ids, tokens, err 45 | } 46 | 47 | func (c *Codec) tokenize(input string, yield func(uint, string)) error { 48 | match, err := c.splitRegexp.FindStringMatch(input) 49 | if err != nil { 50 | return fmt.Errorf("error matching: %v", err) 51 | } 52 | for match != nil { 53 | piece := match.String() 54 | if id, ok := c.vocabulary[piece]; ok { 55 | yield(id, piece) 56 | } else { 57 | parts := c.mergePairs(piece) 58 | 59 | for i := range len(parts) - 1 { 60 | token := piece[parts[i].offset:parts[i+1].offset] 61 | yield(c.vocabulary[token], token) 62 | } 63 | } 64 | match, err = c.splitRegexp.FindNextMatch(match) 65 | if err != nil { 66 | return fmt.Errorf("error matching: %v", err) 67 | } 68 | } 69 | 70 | return nil 71 | } 72 | 73 | func (c *Codec) Decode(tokens []uint) (string, error) { 74 | if c.reverseVocabulary == nil { 75 | c.reverseVocabulary = make(map[uint]string) 76 | for k, v := range c.vocabulary { 77 | c.reverseVocabulary[v] = k 78 | } 79 | } 80 | 81 | var out string 82 | for _, t := range tokens { 83 | piece, ok := c.reverseVocabulary[t] 84 | if !ok { 85 | return "", fmt.Errorf("invalid token: %d", t) 86 | } 87 | out += piece 88 | } 89 | return out, nil 90 | } 91 | 92 | type part struct { 93 | offset int 94 | rank uint 95 | } 96 | 97 | func (c *Codec) mergePairs(piece string) []part { 98 | parts := make([]part, len(piece)+1) 99 | for i := range len(parts) { 100 | parts[i] = part{i, math.MaxUint} 101 | } 102 | 103 | getRank := func(index, skip int) uint { 104 | if index+skip+2 < len(parts) { 105 | start := parts[index].offset 106 | end := parts[index+skip+2].offset 107 | if rank, ok := c.vocabulary[piece[start:end]]; ok { 108 | return rank 109 | } 110 | } 111 | return math.MaxUint 112 | } 113 | 114 | for i := 0; i < len(parts)-2; i++ { 115 | parts[i].rank = getRank(i, 0) 116 | } 117 | 118 | for { 119 | if len(parts) == 1 { 120 | break 121 | } 122 | 123 | minRank := uint(math.MaxUint) 124 | minIndex := 0 125 | for i, p := range parts[:len(parts)-1] { 126 | if p.rank < minRank { 127 | minRank = p.rank 128 | minIndex = i 129 | } 130 | } 131 | 132 | if minRank == math.MaxUint { 133 | break 134 | } 135 | 136 | parts[minIndex].rank = getRank(minIndex, 1) 137 | 138 | if minIndex > 0 { 139 | parts[minIndex-1].rank = getRank(minIndex-1, 1) 140 | } 141 | 142 | parts = append(parts[:minIndex+1], parts[minIndex+2:]...) 
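        // the append above removes the boundary entry at minIndex+1, merging
        // parts[minIndex] with its right neighbour into a single token; the
        // rank updates just before it already account for the new pairing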
143 |     }
144 | 
145 |     return parts
146 | }
147 | 
--------------------------------------------------------------------------------
/codec/o200k_base.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | import "github.com/dlclark/regexp2"
4 | 
5 | func NewO200kBase() *Codec {
6 |     o200kBaseVocabOnce.Do(o200kBaseVocabInit)
7 | 
8 |     splitRegexp := regexp2.MustCompile(
9 |         `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
10 |         regexp2.None)
11 | 
12 |     return &Codec{
13 |         name:        "o200k_base",
14 |         vocabulary:  o200kBaseVocab,
15 |         splitRegexp: splitRegexp,
16 |         specialTokens: map[string]uint{
17 |             "<|endoftext|>":   199999,
18 |             "<|endofprompt|>": 200018,
19 |         },
20 |     }
21 | }
22 | 
--------------------------------------------------------------------------------
/codec/p50k_base.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | import "github.com/dlclark/regexp2"
4 | 
5 | func NewP50kBase() *Codec {
6 |     p50kBaseVocabOnce.Do(p50kBaseVocabInit)
7 | 
8 |     splitRegexp := regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None)
9 | 
10 |     return &Codec{
11 |         name:        "p50k_base",
12 |         vocabulary:  p50kBaseVocab,
13 |         splitRegexp: splitRegexp,
14 |         specialTokens: map[string]uint{
15 |             "<|endoftext|>": 50256,
16 |         },
17 |     }
18 | }
19 | 
--------------------------------------------------------------------------------
/codec/p50k_edit.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | import "github.com/dlclark/regexp2"
4 | 
5 | func NewP50kEdit() *Codec {
6 |     p50kBaseVocabOnce.Do(p50kBaseVocabInit)
7 | 
8 |     splitRegexp := regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None)
9 | 
10 |     return &Codec{
11 |         name:        "p50k_edit",
12 |         vocabulary:  p50kBaseVocab,
13 |         splitRegexp: splitRegexp,
14 |         specialTokens: map[string]uint{
15 |             "<|endoftext|>":  50256,
16 |             "<|fim_prefix|>": 50281,
17 |             "<|fim_middle|>": 50282,
18 |             "<|fim_suffix|>": 50283,
19 |         },
20 |     }
21 | }
22 | 
--------------------------------------------------------------------------------
/codec/r50k_base.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | import "github.com/dlclark/regexp2"
4 | 
5 | func NewR50kBase() *Codec {
6 |     r50kBaseVocabOnce.Do(r50kBaseVocabInit)
7 | 
8 |     splitRegexp := regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None)
9 | 
10 |     return &Codec{
11 |         name:        "r50k_base",
12 |         vocabulary:  r50kBaseVocab,
13 |         splitRegexp: splitRegexp,
14 |         specialTokens: map[string]uint{
15 |             "<|endoftext|>": 50256,
16 |         },
17 |     }
18 | }
19 | 
--------------------------------------------------------------------------------
/codec/vocab.go:
--------------------------------------------------------------------------------
1 | package codec
2 | 
3 | type vocab map[string]uint
4 | type reverse map[uint]string
5 | 
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/tiktoken-go/tokenizer
2 | 
3 | go 1.24
4 | 
5 | 
tool github.com/dlclark/regexp2cg
6 | 
7 | require github.com/dlclark/regexp2 v1.11.5
8 | 
9 | require (
10 |     github.com/dlclark/regexp2cg v0.2.0 // indirect
11 |     github.com/pkg/errors v0.9.1 // indirect
12 | )
13 | 
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/dlclark/regexp2 v1.11.5 h1:Q/sSnsKerHeCkc/jSTNq1oCm7KiVgUMZRDUoRu0JQZQ=
2 | github.com/dlclark/regexp2 v1.11.5/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
3 | github.com/dlclark/regexp2cg v0.2.0 h1:YTk+oP9dO74myroxiopnf/zlGOSuTGIuYhRx769YFk4=
4 | github.com/dlclark/regexp2cg v0.2.0/go.mod h1:K2c4ctxtSQjzgeMKKgi1rEflZVVJWZWlUUdmtjOp/y8=
5 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
6 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
7 | 
--------------------------------------------------------------------------------
/internal/cmd/vocab.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 |     "bufio"
5 |     "bytes"
6 |     "encoding/base64"
7 |     "flag"
8 |     "fmt"
9 |     "go/format"
10 |     "io"
11 |     "log"
12 |     "net/http"
13 |     "os"
14 |     "strconv"
15 |     "strings"
16 | )
17 | 
18 | const (
19 |     packageName = "codec"
20 | )
21 | 
22 | type config struct {
23 |     url      string
24 |     mapName  string
25 |     filename string
26 | }
27 | 
28 | func main() {
29 |     encoding := flag.String("encoding", "", "encoding format. (e.g. cl100k_base)")
30 |     flag.Parse()
31 | 
32 |     // flag.String never returns nil, so test against the empty default instead
33 |     if *encoding == "" {
34 |         flag.PrintDefaults()
35 |         os.Exit(1)
36 |     }
37 | 
38 |     cfg := getConfig(*encoding)
39 | 
40 |     buf := new(bytes.Buffer)
41 |     generatePreamble(buf, *encoding)
42 |     generateVocabulary(buf, cfg.mapName, cfg.url)
43 | 
44 |     src, err := format.Source(buf.Bytes())
45 |     if err != nil {
46 |         log.Fatalf("error preparing source: %v", err)
47 |     }
48 | 
49 |     if err := os.WriteFile(cfg.filename, src, 0o644); err != nil {
50 |         log.Fatalf("error writing file: %v", err)
51 |     }
52 | }
53 | 
54 | func generatePreamble(w io.Writer, encoding string) {
55 |     fmt.Fprintf(w, "// Code generated by internal/cmd/vocab.go. 
DO NOT EDIT.\n\n") 55 | fmt.Fprintf(w, "//go:generate go run ../internal/cmd/vocab.go -encoding %s\n\n", encoding) 56 | fmt.Fprintf(w, "package %s\n", packageName) 57 | } 58 | 59 | func generateVocabulary(w io.Writer, mapName string, uri string) { 60 | resp, err := http.Get(uri) 61 | if err != nil { 62 | log.Fatalf("error fetching file: %v", err) 63 | } 64 | defer resp.Body.Close() 65 | 66 | fmt.Fprintf(w, "import \"sync\"\n") 67 | fmt.Fprintf(w, "var (\n") 68 | fmt.Fprintf(w, "%v vocab\n", mapName) 69 | fmt.Fprintf(w, "%vOnce sync.Once\n", mapName) 70 | fmt.Fprintf(w, ")\n") 71 | fmt.Fprintf(w, "func %sInit() {\n", mapName) 72 | fmt.Fprintf(w, "%s = vocab{\n", mapName) 73 | 74 | scanner := bufio.NewScanner(resp.Body) 75 | for scanner.Scan() { 76 | line := scanner.Text() 77 | 78 | wordInput, idInput, ok := strings.Cut(line, " ") 79 | if !ok { 80 | log.Fatalf("invalid line: %q", line) 81 | } 82 | 83 | word, err := base64.StdEncoding.DecodeString(wordInput) 84 | if err != nil { 85 | log.Fatalf("invalid word: %q", wordInput) 86 | } 87 | 88 | id, err := strconv.ParseUint(idInput, 10, 0) 89 | if err != nil { 90 | log.Fatalf("invalid id: %q", idInput) 91 | } 92 | 93 | fmt.Fprintf(w, "%q: %d,\n", word, id) 94 | } 95 | 96 | fmt.Fprintf(w, "}\n}\n") 97 | } 98 | 99 | func getConfig(encoding string) config { 100 | switch encoding { 101 | case "o200k_base": 102 | return config{ 103 | mapName: "o200kBaseVocab", 104 | url: "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken", 105 | filename: "o200k_base_vocab.go", 106 | } 107 | case "cl100k_base": 108 | return config{ 109 | mapName: "cl100kBaseVocab", 110 | url: "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken", 111 | filename: "cl100k_base_vocab.go", 112 | } 113 | case "r50k_base": 114 | return config{ 115 | mapName: "r50kBaseVocab", 116 | url: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken", 117 | filename: "r50k_base_vocab.go", 118 | } 119 | case "p50k_base": 120 | return config{ 121 | mapName: "p50kBaseVocab", 122 | url: "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken", 123 | filename: "p50k_base_vocab.go", 124 | } 125 | default: 126 | log.Fatal("config not found") 127 | return config{} 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /tokenizer.go: -------------------------------------------------------------------------------- 1 | package tokenizer 2 | 3 | // Package tokenizer provides functions for encoding and decoding text using 4 | // different tokenization schemes. 
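// Each scheme pairs a splitting regular expression with an embedded BPE
// vocabulary: input is first split into pieces, and any piece that is not
// itself a vocabulary entry is byte-pair merged (see codec.mergePairs).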
5 | //
6 | // Encoding Formats
7 | //
8 | // The following encoding formats are supported:
9 | //  - O200kBase
10 | //  - Cl100kBase
11 | //  - R50kBase
12 | //  - P50kBase
13 | //  - P50kEdit
14 | //
15 | // Alternatively, you can request a tokenizer using OpenAI's model name; the
16 | // following OpenAI models are supported:
17 | //  - O1Preview
18 | //  - O1Mini
19 | //  - O3Mini
20 | //  - GPT4o
21 | //  - GPT4
22 | //  - GPT35Turbo
23 | //  - TextEmbeddingAda002
24 | //  - TextDavinci003
25 | //  - TextDavinci002
26 | //  - CodeDavinci002
27 | //  - CodeDavinci001
28 | //  - CodeCushman002
29 | //  - CodeCushman001
30 | //  - DavinciCodex
31 | //  - CushmanCodex
32 | //  - TextDavinci001
33 | //  - TextCurie001
34 | //  - TextBabbage001
35 | //  - TextAda001
36 | //  - Davinci
37 | //  - Curie
38 | //  - Babbage
39 | //  - Ada
40 | //  - TextSimilarityDavinci001
41 | //  - TextSimilarityCurie001
42 | //  - TextSimilarityBabbage001
43 | //  - TextSimilarityAda001
44 | //  - TextSearchDavinciDoc001
45 | //  - TextSearchCurieDoc001
46 | //  - TextSearchAdaDoc001
47 | //  - TextSearchBabbageDoc001
48 | //  - CodeSearchBabbageCode001
49 | //  - CodeSearchAdaCode001
50 | //  - TextDavinciEdit001
51 | //  - CodeDavinciEdit001
52 | //
53 | // Usage Example
54 | //
55 | // Here is an example of how to encode a string using the `Get` function:
56 | //
57 | //	package main
58 | //
59 | //	import (
60 | //		"fmt"
61 | //		"github.com/tiktoken-go/tokenizer"
62 | //	)
63 | //
64 | //	func main() {
65 | //		enc, err := tokenizer.Get(tokenizer.Cl100kBase)
66 | //		if err != nil {
67 | //			panic("oh oh")
68 | //		}
69 | //
70 | //		// this should print a list of token ids
71 | //		ids, _, _ := enc.Encode("supercalifragilistic")
72 | //		fmt.Println(ids)
73 | //
74 | //		// this should print the original string back
75 | //		text, _ := enc.Decode(ids)
76 | //		fmt.Println(text)
77 | //	}
78 | 
79 | import (
80 |     "errors"
81 |     "strings"
82 | 
83 |     "github.com/tiktoken-go/tokenizer/codec"
84 | )
85 | 
86 | var (
87 |     ErrModelNotSupported    = errors.New("model not supported")
88 |     ErrEncodingNotSupported = errors.New("encoding not supported")
89 | )
90 | 
91 | type Codec interface {
92 |     GetName() string
93 |     Count(string) (int, error)
94 |     Encode(string) ([]uint, []string, error)
95 |     Decode([]uint) (string, error)
96 | }
97 | 
98 | type Model string
99 | 
100 | const (
101 |     O1Preview Model = "o1-preview"
102 |     O1Mini Model = "o1-mini"
103 |     O3Mini Model = "o3-mini"
104 |     GPT4o Model = "gpt-4o"
105 |     GPT4 Model = "gpt-4"
106 |     GPT35Turbo Model = "gpt-3.5-turbo"
107 |     GPT35 Model = "gpt-3.5"
108 |     TextEmbeddingAda002 Model = "text-embedding-ada-002"
109 |     TextDavinci003 Model = "text-davinci-003"
110 |     TextDavinci002 Model = "text-davinci-002"
111 |     CodeDavinci002 Model = "code-davinci-002"
112 |     CodeDavinci001 Model = "code-davinci-001"
113 |     CodeCushman002 Model = "code-cushman-002"
114 |     CodeCushman001 Model = "code-cushman-001"
115 |     DavinciCodex Model = "davinci-codex"
116 |     CushmanCodex Model = "cushman-codex"
117 |     TextDavinci001 Model = "text-davinci-001"
118 |     TextCurie001 Model = "text-curie-001"
119 |     TextBabbage001 Model = "text-babbage-001"
120 |     TextAda001 Model = "text-ada-001"
121 |     Davinci Model = "davinci"
122 |     Curie Model = "curie"
123 |     Babbage Model = "babbage"
124 |     Ada Model = "ada"
125 |     TextSimilarityDavinci001 Model = "text-similarity-davinci-001"
126 |     TextSimilarityCurie001 Model = "text-similarity-curie-001"
127 |     TextSimilarityBabbage001 Model = "text-similarity-babbage-001"
128 |     TextSimilarityAda001 Model = "text-similarity-ada-001"
129 |     TextSearchDavinciDoc001 Model = 
"text-search-davinci-doc-001" 127 | TextSearchCurieDoc001 Model = "text-search-curie-doc-001" 128 | TextSearchAdaDoc001 Model = "text-search-ada-doc-001" 129 | TextSearchBabbageDoc001 Model = "text-search-babbage-doc-001" 130 | CodeSearchBabbageCode001 Model = "code-search-babbage-code-001" 131 | CodeSearchAdaCode001 Model = "code-search-ada-code-001" 132 | TextDavinciEdit001 Model = "text-davinci-edit-001" 133 | CodeDavinciEdit001 Model = "code-davinci-edit-001" 134 | GPT2 Model = "gpt2" 135 | ) 136 | 137 | type Encoding string 138 | 139 | const ( 140 | GPT2Enc Encoding = "gpt2" 141 | R50kBase Encoding = "r50k_base" 142 | P50kBase Encoding = "p50k_base" 143 | P50kEdit Encoding = "p50k_edit" 144 | Cl100kBase Encoding = "cl100k_base" 145 | O200kBase Encoding = "o200k_base" 146 | ) 147 | 148 | var modelPrefixToEncoding map[Model]Encoding = map[Model]Encoding{ 149 | "o1-": O200kBase, 150 | // chat 151 | "chatgpt-4o-": O200kBase, 152 | "gpt-4o-": O200kBase, 153 | "gpt-4-": Cl100kBase, 154 | "gpt-3.5-turbo-": Cl100kBase, 155 | "gpt-35-turbo-": Cl100kBase, 156 | // fine-tuned 157 | "ft:gpt-4": Cl100kBase, 158 | "ft:gpt-3.5-turbo": Cl100kBase, 159 | "ft:davinci-002": Cl100kBase, 160 | "ft:babbage-002": Cl100kBase, 161 | } 162 | 163 | // Get returns a new instance of a Codec implementation based on the specified 164 | // encoding format. The returned Codec instance can be used to encode (tokenize) 165 | // and decode (reassemble) text. If the specified encoding is not supported, 166 | // an error is returned. 167 | func Get(encoding Encoding) (Codec, error) { 168 | switch encoding { 169 | case O200kBase: 170 | return codec.NewO200kBase(), nil 171 | case Cl100kBase: 172 | return codec.NewCl100kBase(), nil 173 | case R50kBase: 174 | return codec.NewR50kBase(), nil 175 | case P50kBase: 176 | return codec.NewP50kBase(), nil 177 | case P50kEdit: 178 | return codec.NewP50kEdit(), nil 179 | default: 180 | return nil, ErrEncodingNotSupported 181 | } 182 | } 183 | 184 | // ForModel returns a new instance of a Codec implementation based on the 185 | // specified OpenAI model. If the specified model is not supported, an error 186 | // is returned. 
187 | func ForModel(model Model) (Codec, error) { 188 | switch model { 189 | case O1Preview, O1Mini, GPT4o, O3Mini: 190 | return Get(O200kBase) 191 | 192 | case GPT4, GPT35, GPT35Turbo, TextEmbeddingAda002: 193 | return Get(Cl100kBase) 194 | 195 | case TextDavinci003, TextDavinci002, CodeDavinci001, 196 | CodeDavinci002, CodeCushman002, CodeCushman001, 197 | DavinciCodex, CushmanCodex: 198 | return Get(P50kBase) 199 | 200 | case TextDavinci001, TextCurie001, TextBabbage001, TextAda001, Davinci, 201 | Curie, Babbage, Ada, TextSimilarityDavinci001, TextSimilarityCurie001, 202 | TextSimilarityBabbage001, TextSimilarityAda001, TextSearchDavinciDoc001, 203 | TextSearchCurieDoc001, TextSearchAdaDoc001, TextSearchBabbageDoc001, 204 | CodeSearchBabbageCode001, CodeSearchAdaCode001: 205 | return Get(R50kBase) 206 | 207 | case TextDavinciEdit001, CodeDavinciEdit001: 208 | return Get(P50kEdit) 209 | 210 | case GPT2: 211 | return Get(GPT2Enc) 212 | default: 213 | for prefix, enc := range modelPrefixToEncoding { 214 | if strings.HasPrefix(string(model), string(prefix)) { 215 | return Get(enc) 216 | } 217 | } 218 | return nil, ErrModelNotSupported 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /tokenizer_test.go: -------------------------------------------------------------------------------- 1 | package tokenizer_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/tiktoken-go/tokenizer" 7 | ) 8 | 9 | type testCase struct { 10 | text string 11 | ids []uint 12 | } 13 | 14 | func TestO200kBase(t *testing.T) { 15 | tok, err := tokenizer.Get(tokenizer.O200kBase) 16 | if err != nil { 17 | t.Fatalf("can't create tokenizer: %v", err) 18 | } 19 | 20 | tests := []testCase{ 21 | {text: "hello world", ids: []uint{24912, 2375}}, 22 | {text: "hello world", ids: []uint{24912, 220, 2375}}, 23 | {text: "hello world", ids: []uint{24912, 256, 2375}}, 24 | {text: "supercalifragilistic", ids: []uint{17789, 5842, 366, 17764, 311, 6207}}, 25 | {text: "We know what we are, but know not what we may be.", ids: []uint{2167, 1761, 1412, 581, 553, 11, 889, 1761, 625, 1412, 581, 1340, 413, 13}}, 26 | } 27 | 28 | runTests(t, tok, tests) 29 | } 30 | 31 | func TestCl100kBase(t *testing.T) { 32 | tok, err := tokenizer.Get(tokenizer.Cl100kBase) 33 | if err != nil { 34 | t.Fatalf("can't create tokenizer: %v", err) 35 | } 36 | 37 | tests := []testCase{ 38 | {text: "hello world", ids: []uint{15339, 1917}}, 39 | {text: "hello world", ids: []uint{15339, 220, 1917}}, 40 | {text: "hello world", ids: []uint{15339, 256, 1917}}, 41 | {text: "supercalifragilistic", ids: []uint{13066, 3035, 278, 333, 4193, 321, 4633}}, 42 | {text: "We know what we are, but know not what we may be.", ids: []uint{1687, 1440, 1148, 584, 527, 11, 719, 1440, 539, 1148, 584, 1253, 387, 13}}, 43 | } 44 | 45 | runTests(t, tok, tests) 46 | } 47 | 48 | func TestR50kBase(t *testing.T) { 49 | tok, err := tokenizer.Get(tokenizer.R50kBase) 50 | if err != nil { 51 | t.Fatalf("can't create tokenizer: %v", err) 52 | } 53 | 54 | tests := []testCase{ 55 | {text: "hello world", ids: []uint{31373, 995}}, 56 | {text: "hello world", ids: []uint{31373, 220, 995}}, 57 | {text: "hello world", ids: []uint{31373, 220, 220, 995}}, 58 | {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, 59 | {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, 60 | } 61 | 62 | runTests(t, tok, tests) 63 | } 64 | 65 | func 
TestP50kBase(t *testing.T) { 66 | tok, err := tokenizer.Get(tokenizer.P50kBase) 67 | if err != nil { 68 | t.Fatalf("can't create tokenizer: %v", err) 69 | } 70 | 71 | tests := []testCase{ 72 | {text: "hello world", ids: []uint{31373, 995}}, 73 | {text: "hello world", ids: []uint{31373, 220, 995}}, 74 | {text: "hello world", ids: []uint{31373, 50257, 995}}, 75 | {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, 76 | {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, 77 | } 78 | 79 | runTests(t, tok, tests) 80 | } 81 | 82 | func runTests(t *testing.T, tok tokenizer.Codec, tests []testCase) { 83 | for _, test := range tests { 84 | t.Run(test.text, func(t *testing.T) { 85 | ids, _, err := tok.Encode(test.text) 86 | if err != nil { 87 | t.Fatalf("error encoding: %v", err) 88 | } 89 | if !sliceEqual(ids, test.ids) { 90 | t.Errorf("encoding mismatch - want: %v got: %v", test.ids, ids) 91 | } 92 | 93 | text, err := tok.Decode(ids) 94 | if err != nil { 95 | t.Fatalf("error decoding: %v", err) 96 | } 97 | if text != test.text { 98 | t.Errorf("decoding mismatch - want: %s got: %s", test.text, text) 99 | } 100 | 101 | count, err := tok.Count(test.text) 102 | if err != nil { 103 | t.Fatalf("error counting: %v", err) 104 | } 105 | if count != len(test.ids) { 106 | t.Errorf("count mismatch - want: %d got: %d", len(test.ids), count) 107 | } 108 | }) 109 | } 110 | } 111 | 112 | func sliceEqual(a, b []uint) bool { 113 | if len(a) != len(b) { 114 | return false 115 | } 116 | for i, elem := range a { 117 | if elem != b[i] { 118 | return false 119 | } 120 | } 121 | return true 122 | } 123 | --------------------------------------------------------------------------------
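As a closing usage sketch (this program is not a file in the repository), the snippet below ties together `ForModel`, `Count`, and `GetName` from tokenizer.go; the expected count of 14 comes from the o200k_base test vector for the same sentence in tokenizer_test.go.

```go
package main

import (
	"fmt"
	"log"

	"github.com/tiktoken-go/tokenizer"
)

func main() {
	// "gpt-4o" resolves to the o200k_base encoding (see ForModel above)
	enc, err := tokenizer.ForModel(tokenizer.GPT4o)
	if err != nil {
		log.Fatalf("unsupported model: %v", err)
	}

	// Count tokenizes without materializing the id and token slices,
	// which is all you need when budgeting a prompt
	n, err := enc.Count("We know what we are, but know not what we may be.")
	if err != nil {
		log.Fatalf("count failed: %v", err)
	}

	// per the test vectors, this prints: o200k_base: 14 tokens
	fmt.Printf("%s: %d tokens\n", enc.GetName(), n)
}
```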