├── .gitignore ├── LICENSE ├── README.md ├── cmd └── tokenizer │ └── main.go ├── go.mod ├── go.sum ├── js ├── array-keyed-map.js ├── gpt3-tokenizer.cjs.development.js └── text.min.js ├── tokenizer.go └── tokenizer_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | # vendor/ 16 | /tokenizer 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Pando 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | In the beginning, we couldn't find a suitable Go package to calculate tokens for Japanese and Chinese text. As a result, we called into JavaScript as a workaround, but that was neither efficient nor elegant. However, a native Go package, [tiktoken-go](https://github.com/pkoukk/tiktoken-go), is now available. It is faster, so please prefer it. 2 | 3 | --- 4 | 5 | # tokenizer-go 6 | 7 | tokenizer-go is a Go package that simplifies token calculation for OpenAI API users. Although OpenAI does not provide a native Go package for token calculation, tokenizer-go fills the gap by embedding an implementation of an npm package and extracting the results through JavaScript calls. This allows you to use tokenizer-go just like any other Go package in your projects, making it easier to work with token calculations in the Go programming language. 8 | 9 | ## Install 10 | 11 | ```shell 12 | # Use as a module 13 | go get -u github.com/pandodao/tokenizer-go 14 | 15 | # Use as a command line program 16 | go install github.com/pandodao/tokenizer-go/cmd/tokenizer@latest 17 | ``` 18 | 19 | ## Usage 20 | 21 | * As a module 22 | ```go 23 | package main 24 | 25 | import ( 26 | "fmt" 27 | 28 | "github.com/pandodao/tokenizer-go" 29 | ) 30 | 31 | func main() { 32 | t := tokenizer.MustCalToken(`Many words map to one token, but some don't: indivisible. 
33 | 34 | Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾 35 | 36 | Sequences of characters commonly found next to each other may be grouped together: 1234567890`) 37 | fmt.Println(t) // Output: 64 38 | 39 | // Output: {Bpe:[7085 2456 3975 284 530 11241] Text:[Many words map to one token]} 40 | fmt.Printf("%+v\n", tokenizer.MustEncode("Many words map to one token")) 41 | 42 | // Output: Many words map to one token 43 | fmt.Println(tokenizer.MustDecode([]int{7085, 2456, 3975, 284, 530, 11241})) 44 | } 45 | ``` 46 | 47 | * As a command line program 48 | ``` 49 | ~ % tokenizer -token "hello world" 50 | 2 51 | ~ % 52 | ~ % tokenizer -encode "hello world" 53 | {"bpe":[31373,995],"text":["hello"," world"]} 54 | ~ % 55 | ~ % tokenizer -decode "[31373,995]" 56 | hello world 57 | ~ % 58 | ~ % tokenizer 59 | Usage of tokenizer: 60 | -decode string 61 | tokens to decode 62 | -encode string 63 | text to encode 64 | -token string 65 | text to calculate token 66 | ~ % 67 | ``` 68 | 69 | ## Benchmark 70 | 71 | ``` 72 | % go test -v -bench=. 73 | === RUN TestNewGojaRuntime 74 | --- PASS: TestNewGojaRuntime (0.00s) 75 | === RUN TestValidateFunctionsWithinGojaRuntime 76 | --- PASS: TestValidateFunctionsWithinGojaRuntime (0.61s) 77 | === RUN TestEncode 78 | === RUN TestEncode/ASCII_Characters 79 | tokenizer_test.go:117: Encode(Hello World) cost: 620.252292ms 80 | === RUN TestEncode/CJK_Characters 81 | tokenizer_test.go:117: Encode(你好,世界) cost: 387.25µs 82 | === RUN TestEncode/WithConcurrency 83 | tokenizer_test.go:172: Encode(ASCII_Characters) ran 20 times concurrently, cost average: 361.588418ms, cost min: 75.833µs, cost max: 1.829107916s 84 | tokenizer_test.go:178: Encode(CJK_Characters) ran 20 times concurrently, cost average: 446.462658ms, cost min: 170.292µs, cost max: 1.831984708s 85 | --- PASS: TestEncode (2.45s) 86 | --- PASS: TestEncode/ASCII_Characters (0.62s) 87 | --- PASS: TestEncode/CJK_Characters (0.00s) 88 | --- PASS: TestEncode/WithConcurrency (1.83s) 89 | === RUN TestDecode 90 | === RUN TestDecode/ASCII_Characters 91 | tokenizer_test.go:212: Decode([15496 2159]) cost: 150.416µs 92 | === RUN TestDecode/CJK_Characters 93 | tokenizer_test.go:212: Decode([19526 254 25001 121 171 120 234 10310 244 45911 234]) cost: 34.584µs 94 | === RUN TestDecode/WithConcurrency 95 | tokenizer_test.go:258: Decode(ASCII_Characters) ran 20 times concurrently, cost average: 45.558µs, cost min: 29.708µs, cost max: 153.458µs 96 | tokenizer_test.go:264: Decode(CJK_Characters) ran 20 times concurrently, cost average: 62.145µs, cost min: 37.291µs, cost max: 183.292µs 97 | --- PASS: TestDecode (0.00s) 98 | --- PASS: TestDecode/ASCII_Characters (0.00s) 99 | --- PASS: TestDecode/CJK_Characters (0.00s) 100 | --- PASS: TestDecode/WithConcurrency (0.00s) 101 | === RUN TestCalToken 102 | === RUN TestCalToken/ASCII_Characters 103 | tokenizer_test.go:298: CalToken(Hello World) cost: 357.583µs 104 | === RUN TestCalToken/CJK_Characters 105 | tokenizer_test.go:298: CalToken(你好,世界) cost: 217.709µs 106 | === RUN TestCalToken/WithConcurrency 107 | tokenizer_test.go:344: Decode(ASCII_Characters) ran 20 times concurrently, cost average: 32.636206ms, cost min: 96.75µs, cost max: 647.582833ms 108 | tokenizer_test.go:350: Decode(CJK_Characters) ran 20 times concurrently, cost average: 429.197µs, cost min: 230.375µs, cost max: 1.167416ms 109 | --- PASS: TestCalToken (0.65s) 110 | --- PASS: TestCalToken/ASCII_Characters (0.00s) 111 | --- PASS: TestCalToken/CJK_Characters (0.00s) 112 | --- 
PASS: TestCalToken/WithConcurrency (0.65s) 113 | goos: darwin 114 | goarch: arm64 115 | pkg: github.com/pandodao/tokenizer-go 116 | BenchmarkCalToken 117 | BenchmarkCalToken/ASCII_Characters 118 | BenchmarkCalToken/ASCII_Characters-10 546 2186558 ns/op 119 | BenchmarkCalToken/CJK_Characters 120 | BenchmarkCalToken/CJK_Characters-10 420 2942631 ns/op 121 | PASS 122 | ok github.com/pandodao/tokenizer-go 10.869s 123 | ``` 124 | 125 | ## Thanks 126 | 127 | * https://github.com/botisan-ai/gpt3-tokenizer 128 | * https://github.com/dop251/goja 129 | 130 | ## License 131 | See the [LICENSE](https://github.com/pandodao/tokenizer-go/blob/main/LICENSE) file. 132 | -------------------------------------------------------------------------------- /cmd/tokenizer/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "flag" 6 | "fmt" 7 | "os" 8 | 9 | "github.com/pandodao/tokenizer-go" 10 | ) 11 | 12 | func main() { 13 | token := flag.String("token", "", "text to calculate token") 14 | encode := flag.String("encode", "", "text to encode") 15 | decode := flag.String("decode", "", "tokens to decode") 16 | flag.Parse() 17 | 18 | switch { 19 | case *token != "": 20 | fmt.Println(tokenizer.MustCalToken(*token)) 21 | case *encode != "": 22 | data, _ := json.Marshal(tokenizer.MustEncode(*encode)) 23 | fmt.Println(string(data)) 24 | case *decode != "": 25 | var s []int 26 | if err := json.Unmarshal([]byte(*decode), &s); err != nil { 27 | fmt.Println(err.Error()) 28 | os.Exit(1) 29 | } 30 | fmt.Println(tokenizer.MustDecode(s)) 31 | default: 32 | flag.Usage() 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/pandodao/tokenizer-go 2 | 3 | go 1.19 4 | 5 | require ( 6 | github.com/dop251/goja v0.0.0-20230304130813-e2f543bf4b4c 7 | github.com/dop251/goja_nodejs v0.0.0-20230226152057-060fa99b809f 8 | github.com/stretchr/testify v1.8.2 9 | ) 10 | 11 | require ( 12 | github.com/davecgh/go-spew v1.1.1 // indirect 13 | github.com/dlclark/regexp2 v1.8.1 // indirect 14 | github.com/go-sourcemap/sourcemap v2.1.3+incompatible // indirect 15 | github.com/google/pprof v0.0.0-20230309165930-d61513b1440d // indirect 16 | github.com/pmezard/go-difflib v1.0.0 // indirect 17 | golang.org/x/text v0.8.0 // indirect 18 | gopkg.in/yaml.v3 v3.0.1 // indirect 19 | ) 20 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/chzyer/logex v1.2.0/go.mod h1:9+9sk7u7pGNWYMkh0hdiL++6OeibzJccyQU4p4MedaY= 2 | github.com/chzyer/readline v1.5.0/go.mod h1:x22KAscuvRqlLoK9CsoYsmxoXZMMFVyOl86cAH8qUic= 3 | github.com/chzyer/test v0.0.0-20210722231415-061457976a23/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= 4 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 5 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 6 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 7 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 8 | github.com/dlclark/regexp2 v1.4.1-0.20201116162257-a2a8dda75c91/go.mod h1:2pZnwuY/m+8K6iRw6wQdMtk+rH5tNGR1i55kozfMjCc= 9 | github.com/dlclark/regexp2 v1.7.0/go.mod 
h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= 10 | github.com/dlclark/regexp2 v1.8.1 h1:6Lcdwya6GjPUNsBct8Lg/yRPwMhABj269AAzdGSiR+0= 11 | github.com/dlclark/regexp2 v1.8.1/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= 12 | github.com/dop251/goja v0.0.0-20211022113120-dc8c55024d06/go.mod h1:R9ET47fwRVRPZnOGvHxxhuZcbrMCuiqOz3Rlrh4KSnk= 13 | github.com/dop251/goja v0.0.0-20221118162653-d4bf6fde1b86/go.mod h1:yRkwfj0CBpOGre+TwBsqPV0IH0Pk73e4PXJOeNDboGs= 14 | github.com/dop251/goja v0.0.0-20230304130813-e2f543bf4b4c h1:/utv6nmTctV6OVgfk5+O6lEMEWL+6KJy4h9NZ5fnkQQ= 15 | github.com/dop251/goja v0.0.0-20230304130813-e2f543bf4b4c/go.mod h1:QMWlm50DNe14hD7t24KEqZuUdC9sOTy8W6XbCU1mlw4= 16 | github.com/dop251/goja_nodejs v0.0.0-20210225215109-d91c329300e7/go.mod h1:hn7BA7c8pLvoGndExHudxTDKZ84Pyvv+90pbBjbTz0Y= 17 | github.com/dop251/goja_nodejs v0.0.0-20211022123610-8dd9abb0616d/go.mod h1:DngW8aVqWbuLRMHItjPUyqdj+HWPvnQe8V8y1nDpIbM= 18 | github.com/dop251/goja_nodejs v0.0.0-20230226152057-060fa99b809f h1:mmnNidRg3cMfcgyeNtIBSDZgjf/85lA/2pplccwSxYg= 19 | github.com/dop251/goja_nodejs v0.0.0-20230226152057-060fa99b809f/go.mod h1:0tlktQL7yHfYEtjcRGi/eiOkbDR5XF7gyFFvbC5//E0= 20 | github.com/go-sourcemap/sourcemap v2.1.3+incompatible h1:W1iEw64niKVGogNgBN3ePyLFfuisuzeidWPMPWmECqU= 21 | github.com/go-sourcemap/sourcemap v2.1.3+incompatible/go.mod h1:F8jJfvm2KbVjc5NqelyYJmf/v5J0dwNLS2mL4sNA1Jg= 22 | github.com/google/pprof v0.0.0-20230207041349-798e818bf904/go.mod h1:uglQLonpP8qtYCYyzA+8c/9qtqgA3qsXGYqCPKARAFg= 23 | github.com/google/pprof v0.0.0-20230309165930-d61513b1440d h1:um9/pc7tKMINFfP1eE7Wv6PRGXlcCSJkVajF7KJw3uQ= 24 | github.com/google/pprof v0.0.0-20230309165930-d61513b1440d/go.mod h1:79YE0hCXdHag9sBkw2o+N/YnZtTkXi0UT9Nnixa5eYk= 25 | github.com/ianlancetaylor/demangle v0.0.0-20220319035150-800ac71e25c2/go.mod h1:aYm2/VgdVmcIU8iMfdMvDMsRAQjcfZSKFby6HOFvi/w= 26 | github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= 27 | github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= 28 | github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= 29 | github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= 30 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 31 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 32 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 33 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 34 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 35 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 36 | github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= 37 | github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= 38 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 39 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 40 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 41 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 42 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 43 | github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= 44 | github.com/stretchr/testify v1.8.2/go.mod 
h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= 45 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 46 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 47 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 48 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 49 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 50 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 51 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 52 | golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= 53 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 54 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 55 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 56 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 57 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 58 | golang.org/x/sys v0.0.0-20220310020820-b874c991c1a5/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 59 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 60 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 61 | golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 62 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 63 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 64 | golang.org/x/term v0.3.0/go.mod h1:q750SLmJuPmVoN1blW3UFBPREJfb1KmY3vwxfr+nFDA= 65 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 66 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 67 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 68 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 69 | golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= 70 | golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 71 | golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68= 72 | golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 73 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 74 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 75 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 76 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 77 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 78 | gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 79 | gopkg.in/check.v1 
v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 80 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 81 | gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= 82 | gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= 83 | gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= 84 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 85 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 86 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 87 | -------------------------------------------------------------------------------- /js/array-keyed-map.js: -------------------------------------------------------------------------------- 1 | /* 2 | # Implementation strategy 3 | 4 | Create a tree of `Map`s, such that indexing the tree recursively (with items 5 | of a key array, sequentially), traverses the tree, so that when the key array 6 | is exhausted, the tree node we arrive at contains the value for that key 7 | array under the guaranteed-unique `Symbol` key `dataSymbol`. 8 | 9 | ## Example 10 | 11 | Start with an empty `ArrayKeyedMap` tree: 12 | 13 | { 14 | } 15 | 16 | Add ['a'] → 1: 17 | 18 | { 19 | 'a': { 20 | [dataSymbol]: 1, 21 | }, 22 | } 23 | 24 | Add [] → 0: 25 | 26 | { 27 | [dataSymbol]: 0, 28 | 'a': { 29 | [dataSymbol]: 1, 30 | }, 31 | } 32 | 33 | Add ['a', 'b', 'c', 'd'] → 4: 34 | 35 | { 36 | [dataSymbol]: 0, 37 | 'a': { 38 | [dataSymbol]: 1, 39 | 'b': { 40 | 'c': { 41 | 'd': { 42 | [dataSymbol]: 4, 43 | }, 44 | }, 45 | }, 46 | }, 47 | } 48 | 49 | String array keys are used in the above example for simplicity. In reality, 50 | we can support any values in array keys, because `Map`s do. 51 | */ 52 | 53 | const dataSymbol = Symbol('path-store-trunk') 54 | 55 | // 56 | // This class represents the external API 57 | // 58 | 59 | class ArrayKeyedMap { 60 | constructor (initialEntries = []) { 61 | this._root = new Map() 62 | this._size = 0 63 | for (const [k, v] of initialEntries) { this.set(k, v) } 64 | } 65 | 66 | set (path, value) { return set.call(this, path, value) } 67 | 68 | has (path) { return has.call(this, path) } 69 | 70 | get (path) { return get.call(this, path) } 71 | 72 | delete (path) { return del.call(this, path) } 73 | 74 | get size () { return this._size } 75 | 76 | clear () { 77 | this._root.clear() 78 | this._size = 0 79 | } 80 | 81 | hasPrefix (path) { return hasPrefix.call(this, path) } 82 | 83 | get [Symbol.toStringTag] () { return 'ArrayKeyedMap' } 84 | 85 | * [Symbol.iterator] () { yield * entries.call(this) } 86 | 87 | * entries () { yield * entries.call(this) } 88 | 89 | * keys () { yield * keys.call(this) } 90 | 91 | * values () { yield * values.call(this) } 92 | 93 | forEach (callback, thisArg) { forEach.call(this, callback, thisArg) } 94 | } 95 | 96 | module.exports = ArrayKeyedMap 97 | 98 | // 99 | // These stateless functions implement the internals 100 | // 101 | 102 | function set (path, value) { 103 | let map = this._root 104 | for (const item of path) { 105 | let nextMap = map.get(item) 106 | if (!nextMap) { 107 | // Create next map if none exists 108 | nextMap = new Map() 109 | map.set(item, nextMap) 110 | } 111 | map = nextMap 112 | } 113 | 114 | // Reached end of path. Set the data symbol to the given value, and 115 | // increment size if nothing was here before. 
116 | if (!map.has(dataSymbol)) this._size += 1 117 | map.set(dataSymbol, value) 118 | return this 119 | } 120 | 121 | function has (path) { 122 | let map = this._root 123 | for (const item of path) { 124 | const nextMap = map.get(item) 125 | if (nextMap) { 126 | map = nextMap 127 | } else { 128 | return false 129 | } 130 | } 131 | return map.has(dataSymbol) 132 | } 133 | 134 | function get (path) { 135 | let map = this._root 136 | for (const item of path) { 137 | map = map.get(item) 138 | if (!map) return undefined 139 | } 140 | return map.get(dataSymbol) 141 | } 142 | 143 | function del (path) { 144 | let map = this._root 145 | 146 | // Maintain a stack of maps we visited, so we can go back and trim empty ones 147 | // if we delete something. 148 | const stack = [] 149 | 150 | for (const item of path) { 151 | const nextMap = map.get(item) 152 | if (nextMap) { 153 | stack.unshift({ parent: map, child: nextMap, item }) 154 | map = nextMap 155 | } else { 156 | // Nothing to delete 157 | return false 158 | } 159 | } 160 | 161 | // Reached end of path. Delete data, if it exists. 162 | const hadPreviousValue = map.delete(dataSymbol) 163 | 164 | // If something was deleted, decrement size and go through the stack of 165 | // visited maps, trimming any that are now empty. 166 | if (hadPreviousValue) { 167 | this._size -= 1 168 | 169 | for (const { parent, child, item } of stack) { 170 | if (child.size === 0) { 171 | parent.delete(item) 172 | } 173 | } 174 | } 175 | return hadPreviousValue 176 | } 177 | 178 | function hasPrefix (path) { 179 | let map = this._root 180 | for (const item of path) { 181 | map = map.get(item) 182 | if (!map) return false 183 | } 184 | return true 185 | } 186 | 187 | function * entries () { 188 | const stack = [{ path: [], map: this._root }] 189 | while (stack.length > 0) { 190 | const { path, map } = stack.pop() 191 | for (const [k, v] of map.entries()) { 192 | if (k === dataSymbol) yield [path, v] 193 | else stack.push({ path: path.concat([k]), map: v }) 194 | } 195 | } 196 | } 197 | 198 | function * keys () { 199 | for (const [k] of this.entries()) yield k 200 | } 201 | 202 | function * values () { 203 | for (const [, v] of this.entries()) yield v 204 | } 205 | 206 | function forEach (callback, thisArg) { 207 | for (const [k, v] of this.entries()) callback.call(thisArg, v, k, this) 208 | } 209 | -------------------------------------------------------------------------------- /js/text.min.js: -------------------------------------------------------------------------------- 1 | (function(scope) {'use strict'; 2 | function B(r,e){var f;return r instanceof Buffer?f=r:f=Buffer.from(r.buffer,r.byteOffset,r.byteLength),f.toString(e)}var w=function(r){return Buffer.from(r)};function h(r){for(var e=0,f=Math.min(256*256,r.length+1),n=new Uint16Array(f),i=[],o=0;;){var t=e=f-1){var s=n.subarray(0,o),m=s;if(i.push(String.fromCharCode.apply(null,m)),!t)return i.join("");r=r.subarray(e),e=0,o=0}var a=r[e++];if((a&128)===0)n[o++]=a;else if((a&224)===192){var d=r[e++]&63;n[o++]=(a&31)<<6|d}else if((a&240)===224){var d=r[e++]&63,l=r[e++]&63;n[o++]=(a&31)<<12|d<<6|l}else if((a&248)===240){var d=r[e++]&63,l=r[e++]&63,R=r[e++]&63,c=(a&7)<<18|d<<12|l<<6|R;c>65535&&(c-=65536,n[o++]=c>>>10&1023|55296,c=56320|c&1023),n[o++]=c}}}function F(r){for(var e=0,f=r.length,n=0,i=Math.max(32,f+(f>>>1)+7),o=new Uint8Array(i>>>3<<3);e=55296&&t<=56319){if(e=55296&&t<=56319)continue}if(n+4>o.length){i+=8,i*=1+e/r.length*2,i=i>>>3<<3;var m=new 
Uint8Array(i);m.set(o),o=m}if((t&4294967168)===0){o[n++]=t;continue}else if((t&4294965248)===0)o[n++]=t>>>6&31|192;else if((t&4294901760)===0)o[n++]=t>>>12&15|224,o[n++]=t>>>6&63|128;else if((t&4292870144)===0)o[n++]=t>>>18&7|240,o[n++]=t>>>12&63|128,o[n++]=t>>>6&63|128;else continue;o[n++]=t&63|128}return o.slice?o.slice(0,n):o.subarray(0,n)}var u="Failed to ",p=function(r,e,f){if(r)throw new Error("".concat(u).concat(e,": the '").concat(f,"' option is unsupported."))};var x=typeof Buffer=="function"&&Buffer.from;var A=x?w:F;function v(){this.encoding="utf-8"}v.prototype.encode=function(r,e){return p(e&&e.stream,"encode","stream"),A(r)};function U(r){var e;try{var f=new Blob([r],{type:"text/plain;charset=UTF-8"});e=URL.createObjectURL(f);var n=new XMLHttpRequest;return n.open("GET",e,!1),n.send(),n.responseText}finally{e&&URL.revokeObjectURL(e)}}var O=!x&&typeof Blob=="function"&&typeof URL=="function"&&typeof URL.createObjectURL=="function",S=["utf-8","utf8","unicode-1-1-utf-8"],T=h;x?T=B:O&&(T=function(r){try{return U(r)}catch(e){return h(r)}});var y="construct 'TextDecoder'",E="".concat(u," ").concat(y,": the ");function g(r,e){p(e&&e.fatal,y,"fatal"),r=r||"utf-8";var f;if(x?f=Buffer.isEncoding(r):f=S.indexOf(r.toLowerCase())!==-1,!f)throw new RangeError("".concat(E," encoding label provided ('").concat(r,"') is invalid."));this.encoding=r,this.fatal=!1,this.ignoreBOM=!1}g.prototype.decode=function(r,e){p(e&&e.stream,"decode","stream");var f;return r instanceof Uint8Array?f=r:r.buffer instanceof ArrayBuffer?f=new Uint8Array(r.buffer):f=new Uint8Array(r),T(f,this.encoding)};scope.TextEncoder=scope.TextEncoder||v;scope.TextDecoder=scope.TextDecoder||g; 3 | }(typeof window !== 'undefined' ? window : (typeof global !== 'undefined' ? global : this))); 4 | -------------------------------------------------------------------------------- /tokenizer.go: -------------------------------------------------------------------------------- 1 | package tokenizer 2 | 3 | import ( 4 | _ "embed" 5 | "encoding/json" 6 | "errors" 7 | "path" 8 | "sync" 9 | 10 | "github.com/dop251/goja" 11 | "github.com/dop251/goja_nodejs/require" 12 | ) 13 | 14 | var ( 15 | //go:embed js/gpt3-tokenizer.cjs.development.js 16 | tokenizerJs string 17 | 18 | //go:embed js/array-keyed-map.js 19 | arrayKeyedMapJs string 20 | 21 | //go:embed js/text.min.js 22 | fastTextEncodingJs string 23 | 24 | registry *require.Registry 25 | 26 | // optimize the alloc and instancing performance of 27 | // *goja.Runtime 28 | pool sync.Pool = sync.Pool{ 29 | New: func() any { 30 | return newGojaRuntime() 31 | }, 32 | } 33 | ) 34 | 35 | // gojaRuntime is a wrapper of *goja.Runtime with the error 36 | type gojaRuntime struct { 37 | // runtime itself 38 | vm *goja.Runtime 39 | // encode function that registered in the runtime 40 | encode goja.Callable 41 | // decode function that registered in the runtime 42 | decode goja.Callable 43 | 44 | // err is the error occurred during the initialization 45 | err error 46 | } 47 | 48 | type EncodeResult struct { 49 | Bpe []int `json:"bpe"` 50 | Text []string `json:"text"` 51 | } 52 | 53 | func init() { 54 | registry = require.NewRegistry(require.WithLoader(func(p string) ([]byte, error) { 55 | switch path.Base(p) { 56 | case "array-keyed-map": 57 | return []byte(arrayKeyedMapJs), nil 58 | case "fast-text-encoding": 59 | return []byte(fastTextEncodingJs), nil 60 | } 61 | return nil, require.IllegalModuleNameError 62 | })) 63 | 64 | // pre-alloc the *goja.Runtime once 65 | runtime := pool.Get().(*gojaRuntime) 66 
| if runtime.err != nil { 67 | panic(runtime.err) 68 | } 69 | 70 | pool.Put(runtime) // put it back to the pool 71 | } 72 | 73 | // newGojaRuntime create a new *goja.Runtime and declare the 74 | // tokenizer functions, it returns the wrapped *gojaRuntime with 75 | // the error if any occurred during the initialization 76 | func newGojaRuntime() *gojaRuntime { 77 | vm := goja.New() 78 | registry.Enable(vm) 79 | _, err := vm.RunString(tokenizerJs + "\n" + 80 | `const tokenizer = new GPT3NodeTokenizer({type: 'gpt3'}); 81 | function encode(str) {return tokenizer.encode(str)} 82 | function decode(tokens) {return tokenizer.decode(tokens)}`) 83 | if err != nil { 84 | return &gojaRuntime{ 85 | vm: vm, 86 | err: err, 87 | } 88 | } 89 | 90 | encode, decode, err := getEncodeAndDecodeFunctionsWithinGojaRuntime(vm) 91 | return &gojaRuntime{ 92 | vm: vm, 93 | encode: encode, 94 | decode: decode, 95 | err: err, 96 | } 97 | } 98 | 99 | // getEncodeAndDecodeFunctionsWithinGojaRuntime returns the encode and 100 | // decode functions within the *goja.Runtime 101 | func getEncodeAndDecodeFunctionsWithinGojaRuntime(vm *goja.Runtime) (goja.Callable, goja.Callable, error) { 102 | encode, ok := goja.AssertFunction(vm.Get("encode")) 103 | if !ok { 104 | return nil, nil, errors.New("encode is not a function") 105 | } 106 | decode, ok := goja.AssertFunction(vm.Get("decode")) 107 | if !ok { 108 | return nil, nil, errors.New("decode is not a function") 109 | } 110 | 111 | return encode, decode, nil 112 | } 113 | 114 | func MustCalToken(str string) int { 115 | token, err := CalToken(str) 116 | if err != nil { 117 | panic(err) 118 | } 119 | 120 | return token 121 | } 122 | 123 | func CalToken(str string) (int, error) { 124 | r, err := Encode(str) 125 | if err != nil { 126 | return 0, err 127 | } 128 | 129 | return len(r.Bpe), nil 130 | } 131 | 132 | func MustEncode(str string) EncodeResult { 133 | r, err := Encode(str) 134 | if err != nil { 135 | panic(err) 136 | } 137 | 138 | return *r 139 | } 140 | 141 | func Encode(str string) (*EncodeResult, error) { 142 | gojaRuntime := pool.Get().(*gojaRuntime) 143 | if gojaRuntime.err != nil { 144 | return nil, gojaRuntime.err 145 | } 146 | defer pool.Put(gojaRuntime) // put it back to the pool 147 | 148 | v, err := gojaRuntime.encode(goja.Undefined(), gojaRuntime.vm.ToValue(str)) 149 | if err != nil { 150 | return nil, err 151 | } 152 | 153 | data, _ := json.Marshal(v.Export()) 154 | r := &EncodeResult{} 155 | if err := json.Unmarshal(data, r); err != nil { 156 | return nil, err 157 | } 158 | 159 | return r, nil 160 | } 161 | 162 | func MustDecode(tokens []int) string { 163 | r, err := Decode(tokens) 164 | if err != nil { 165 | panic(err) 166 | } 167 | 168 | return r 169 | } 170 | 171 | func Decode(tokens []int) (string, error) { 172 | gojaRuntime := pool.Get().(*gojaRuntime) 173 | if gojaRuntime.err != nil { 174 | return "", gojaRuntime.err 175 | } 176 | defer pool.Put(gojaRuntime) // put it back to the pool 177 | 178 | v, err := gojaRuntime.decode(goja.Undefined(), gojaRuntime.vm.ToValue(tokens)) 179 | if err != nil { 180 | return "", err 181 | } 182 | 183 | return v.String(), nil 184 | } 185 | -------------------------------------------------------------------------------- /tokenizer_test.go: -------------------------------------------------------------------------------- 1 | package tokenizer 2 | 3 | import ( 4 | "sort" 5 | "sync" 6 | "testing" 7 | "time" 8 | 9 | "github.com/dop251/goja" 10 | "github.com/stretchr/testify/assert" 11 | "github.com/stretchr/testify/require" 12 
| ) 13 | 14 | func max[T int | time.Duration](slice []T) T { 15 | sort.SliceStable(slice, func(i, j int) bool { 16 | return slice[i] < slice[j] 17 | }) 18 | 19 | return slice[len(slice)-1] 20 | } 21 | 22 | func min[T int | time.Duration](slice []T) T { 23 | sort.SliceStable(slice, func(i, j int) bool { 24 | return slice[i] < slice[j] 25 | }) 26 | 27 | return slice[0] 28 | } 29 | 30 | func average[T int | time.Duration](slice []T) T { 31 | var sum T 32 | for _, item := range slice { 33 | sum += item 34 | } 35 | 36 | return sum / T(len(slice)) 37 | } 38 | 39 | func TestNewGojaRuntime(t *testing.T) { 40 | originalTokenizerJs := tokenizerJs 41 | defer func() { 42 | tokenizerJs = originalTokenizerJs 43 | }() 44 | 45 | tokenizerJs = "" 46 | runtime := newGojaRuntime() 47 | require.Error(t, runtime.err) 48 | assert.EqualError(t, runtime.err, "ReferenceError: GPT3NodeTokenizer is not defined at :2:23(3)") 49 | } 50 | 51 | func TestValidateFunctionsWithinGojaRuntime(t *testing.T) { 52 | vm := goja.New() 53 | registry.Enable(vm) 54 | 55 | encode, decode, err := getEncodeAndDecodeFunctionsWithinGojaRuntime(vm) 56 | require.Error(t, err) 57 | assert.EqualError(t, err, "encode is not a function") 58 | assert.Nil(t, encode) 59 | assert.Nil(t, decode) 60 | 61 | _, err = vm.RunString(tokenizerJs + "\n" + 62 | `const tokenizer = new GPT3NodeTokenizer({type: 'gpt3'}); 63 | function encode(str) {return tokenizer.encode(str)}`) 64 | require.NoError(t, err) 65 | 66 | encode, decode, err = getEncodeAndDecodeFunctionsWithinGojaRuntime(vm) 67 | require.Error(t, err) 68 | assert.EqualError(t, err, "decode is not a function") 69 | assert.Nil(t, encode) 70 | assert.Nil(t, decode) 71 | 72 | _, err = vm.RunString("function decode(tokens) {return tokenizer.decode(tokens)}") 73 | require.NoError(t, err) 74 | 75 | encode, decode, err = getEncodeAndDecodeFunctionsWithinGojaRuntime(vm) 76 | require.NoError(t, err) 77 | assert.NotNil(t, encode) 78 | assert.NotNil(t, decode) 79 | } 80 | 81 | type testEncode struct { 82 | testName string 83 | input string 84 | result EncodeResult 85 | ignoreText bool 86 | } 87 | 88 | func TestEncode(t *testing.T) { 89 | tables := []testEncode{ 90 | { 91 | testName: "ASCII_Characters", 92 | input: "Hello World", 93 | result: EncodeResult{ 94 | Bpe: []int{15496, 2159}, 95 | Text: []string{"Hello", " World"}, 96 | }, 97 | }, 98 | { 99 | testName: "CJK_Characters", 100 | input: "你好,世界", 101 | ignoreText: true, 102 | result: EncodeResult{ 103 | Bpe: []int{19526, 254, 25001, 121, 171, 120, 234, 10310, 244, 45911, 234}, 104 | }, 105 | }, 106 | } 107 | 108 | var ignoreTextRan bool 109 | for _, table := range tables { 110 | t.Run(table.testName, func(t *testing.T) { 111 | start := time.Now() 112 | r := MustEncode(table.input) 113 | assert.Equal(t, table.result.Bpe, r.Bpe) 114 | if !table.ignoreText { 115 | assert.Equal(t, table.result.Text, r.Text) 116 | ignoreTextRan = true 117 | } 118 | 119 | t.Logf("Encode(%s) cost: %s", table.input, time.Since(start)) 120 | }) 121 | } 122 | assert.True(t, ignoreTextRan) 123 | ignoreTextRan = false 124 | 125 | t.Run("WithConcurrency", func(t *testing.T) { 126 | concurrency := 20 127 | 128 | tablesMat := make([][]testEncode, concurrency) 129 | resultsMat := make([][]EncodeResult, concurrency) 130 | timeCostsMat := make([][]time.Duration, concurrency) 131 | for i := range tablesMat { 132 | tablesMat[i] = tables 133 | resultsMat[i] = make([]EncodeResult, len(tables)) // init 134 | timeCostsMat[i] = make([]time.Duration, len(tables)) // init 135 | } 136 | 137 | var wg 
sync.WaitGroup 138 | for i, elem := range tablesMat { 139 | for j := range elem { 140 | wg.Add(1) 141 | go func(iIndex, jIndex int) { 142 | start := time.Now() 143 | table := tablesMat[iIndex][jIndex] 144 | result := MustEncode(table.input) 145 | 146 | resultsMat[iIndex][jIndex] = result 147 | timeCostsMat[iIndex][jIndex] = time.Since(start) 148 | wg.Done() 149 | }(i, j) 150 | } 151 | } 152 | wg.Wait() 153 | 154 | for i, ts := range tablesMat { 155 | for j := range ts { 156 | r := resultsMat[i][j] 157 | assert.Equal(t, ts[j].result.Bpe, r.Bpe) 158 | if !ts[j].ignoreText { 159 | assert.Equal(t, ts[j].result.Text, r.Text) 160 | ignoreTextRan = true 161 | } 162 | } 163 | } 164 | 165 | assert.True(t, ignoreTextRan) 166 | 167 | timeCostsForASCIICharacters := make([]time.Duration, len(timeCostsMat)) 168 | timeCostsForCJKCharacters := make([]time.Duration, len(timeCostsMat)) 169 | for i := range timeCostsMat { 170 | timeCostsForASCIICharacters[i] = timeCostsMat[i][0] 171 | timeCostsForCJKCharacters[i] = timeCostsMat[i][1] 172 | } 173 | 174 | t.Logf("Encode(ASCII_Characters) ran %d times concurrently, cost average: %s, cost min: %s, cost max: %s", 175 | concurrency, 176 | average(timeCostsForASCIICharacters), 177 | min(timeCostsForASCIICharacters), 178 | max(timeCostsForASCIICharacters), 179 | ) 180 | t.Logf("Encode(CJK_Characters) ran %d times concurrently, cost average: %s, cost min: %s, cost max: %s", 181 | concurrency, 182 | average(timeCostsForCJKCharacters), 183 | min(timeCostsForCJKCharacters), 184 | max(timeCostsForCJKCharacters), 185 | ) 186 | }) 187 | } 188 | 189 | type testDecode struct { 190 | testName string 191 | input []int 192 | result string 193 | } 194 | 195 | func TestDecode(t *testing.T) { 196 | tables := []testDecode{ 197 | { 198 | testName: "ASCII_Characters", 199 | input: []int{15496, 2159}, 200 | result: "Hello World", 201 | }, 202 | { 203 | testName: "CJK_Characters", 204 | input: []int{19526, 254, 25001, 121, 171, 120, 234, 10310, 244, 45911, 234}, 205 | result: "你好,世界", 206 | }, 207 | } 208 | 209 | for _, table := range tables { 210 | t.Run(table.testName, func(t *testing.T) { 211 | start := time.Now() 212 | r := MustDecode(table.input) 213 | assert.Equal(t, table.result, r) 214 | t.Logf("Decode(%v) cost: %s", table.input, time.Since(start)) 215 | }) 216 | } 217 | 218 | t.Run("WithConcurrency", func(t *testing.T) { 219 | concurrency := 20 220 | 221 | tablesMat := make([][]testDecode, concurrency) 222 | resultsMat := make([][]string, concurrency) 223 | timeCostsMat := make([][]time.Duration, concurrency) 224 | for i := range tablesMat { 225 | tablesMat[i] = tables 226 | resultsMat[i] = make([]string, len(tables)) // init 227 | timeCostsMat[i] = make([]time.Duration, len(tables)) // init 228 | } 229 | 230 | var wg sync.WaitGroup 231 | for i, elem := range tablesMat { 232 | for j := range elem { 233 | wg.Add(1) 234 | go func(iIndex, jIndex int) { 235 | start := time.Now() 236 | table := tablesMat[iIndex][jIndex] 237 | result := MustDecode(table.input) 238 | 239 | resultsMat[iIndex][jIndex] = result 240 | timeCostsMat[iIndex][jIndex] = time.Since(start) 241 | wg.Done() 242 | }(i, j) 243 | } 244 | } 245 | wg.Wait() 246 | 247 | for i, elem := range tablesMat { 248 | for j := range elem { 249 | r := resultsMat[i][j] 250 | assert.Equal(t, elem[j].result, r) 251 | } 252 | } 253 | 254 | timeCostsForASCIICharacters := make([]time.Duration, len(timeCostsMat)) 255 | timeCostsForCJKCharacters := make([]time.Duration, len(timeCostsMat)) 256 | for i := range timeCostsMat { 257 | 
timeCostsForASCIICharacters[i] = timeCostsMat[i][0] 258 | timeCostsForCJKCharacters[i] = timeCostsMat[i][1] 259 | } 260 | 261 | t.Logf("Decode(ASCII_Characters) ran %d times concurrently, cost average: %s, cost min: %s, cost max: %s", 262 | concurrency, 263 | average(timeCostsForASCIICharacters), 264 | min(timeCostsForASCIICharacters), 265 | max(timeCostsForASCIICharacters), 266 | ) 267 | t.Logf("Decode(CJK_Characters) ran %d times concurrently, cost average: %s, cost min: %s, cost max: %s", 268 | concurrency, 269 | average(timeCostsForCJKCharacters), 270 | min(timeCostsForCJKCharacters), 271 | max(timeCostsForCJKCharacters), 272 | ) 273 | }) 274 | } 275 | 276 | type testCalToken struct { 277 | testName string 278 | input string 279 | token int 280 | } 281 | 282 | func TestCalToken(t *testing.T) { 283 | tables := []testCalToken{ 284 | { 285 | testName: "ASCII_Characters", 286 | input: "Hello World", 287 | token: 2, 288 | }, 289 | { 290 | testName: "CJK_Characters", 291 | input: "你好,世界", 292 | token: 11, 293 | }, 294 | } 295 | 296 | for _, table := range tables { 297 | t.Run(table.testName, func(t *testing.T) { 298 | start := time.Now() 299 | token := MustCalToken(table.input) 300 | assert.Equal(t, table.token, token) 301 | t.Logf("CalToken(%s) cost: %s", table.input, time.Since(start)) 302 | }) 303 | } 304 | 305 | t.Run("WithConcurrency", func(t *testing.T) { 306 | concurrency := 20 307 | 308 | tablesMat := make([][]testCalToken, concurrency) 309 | resultsMat := make([][]int, concurrency) 310 | timeCostsMat := make([][]time.Duration, concurrency) 311 | for i := range tablesMat { 312 | tablesMat[i] = tables 313 | resultsMat[i] = make([]int, len(tables)) // init 314 | timeCostsMat[i] = make([]time.Duration, len(tables)) // init 315 | } 316 | 317 | var wg sync.WaitGroup 318 | for i, elem := range tablesMat { 319 | for j := range elem { 320 | wg.Add(1) 321 | go func(iIndex, jIndex int) { 322 | start := time.Now() 323 | table := tablesMat[iIndex][jIndex] 324 | result := MustCalToken(table.input) 325 | 326 | resultsMat[iIndex][jIndex] = result 327 | timeCostsMat[iIndex][jIndex] = time.Since(start) 328 | wg.Done() 329 | }(i, j) 330 | } 331 | } 332 | wg.Wait() 333 | 334 | for i, elem := range tablesMat { 335 | for j := range elem { 336 | token := resultsMat[i][j] 337 | assert.Equal(t, elem[j].token, token) 338 | } 339 | } 340 | 341 | timeCostsForASCIICharacters := make([]time.Duration, len(timeCostsMat)) 342 | timeCostsForCJKCharacters := make([]time.Duration, len(timeCostsMat)) 343 | for i := range timeCostsMat { 344 | timeCostsForASCIICharacters[i] = timeCostsMat[i][0] 345 | timeCostsForCJKCharacters[i] = timeCostsMat[i][1] 346 | } 347 | 348 | t.Logf("Decode(ASCII_Characters) ran %d times concurrently, cost average: %s, cost min: %s, cost max: %s", 349 | concurrency, 350 | average(timeCostsForASCIICharacters), 351 | min(timeCostsForASCIICharacters), 352 | max(timeCostsForASCIICharacters), 353 | ) 354 | t.Logf("Decode(CJK_Characters) ran %d times concurrently, cost average: %s, cost min: %s, cost max: %s", 355 | concurrency, 356 | average(timeCostsForCJKCharacters), 357 | min(timeCostsForCJKCharacters), 358 | max(timeCostsForCJKCharacters), 359 | ) 360 | }) 361 | } 362 | 363 | func BenchmarkCalToken(b *testing.B) { 364 | b.Run("ASCII_Characters", func(b *testing.B) { 365 | for i := 0; i < b.N; i++ { 366 | _ = MustCalToken(`Many words map to one token, but some don't: indivisible. 
367 | 368 | Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾 369 | 370 | Sequences of characters commonly found next to each other may be grouped together: 1234567890`) 371 | } 372 | }) 373 | 374 | b.Run("CJK_Characters", func(b *testing.B) { 375 | for i := 0; i < b.N; i++ { 376 | _ = MustCalToken(`许多词都会被映射到一个令牌上,但有些词的类型不会:不可分割的。 377 | 378 | 像 Emoji 这样的 Unicode 字符可以被分割成许多包含底层字节的标记:🤚🏾 379 | 380 | 常见的字符序列彼此相邻,可以归为一组:1234567890`) 381 | } 382 | }) 383 | } 384 | --------------------------------------------------------------------------------
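One practical pattern the README does not show is trimming a prompt to a fixed token budget before sending it to the OpenAI API. The sketch below is illustrative only: it relies solely on the exported `tokenizer.Encode` and `tokenizer.Decode` functions defined in `tokenizer.go` above, and `TruncateTokens` is a hypothetical helper, not part of the package.

```go
package main

import (
    "fmt"

    "github.com/pandodao/tokenizer-go"
)

// TruncateTokens is a hypothetical helper (not part of tokenizer-go).
// It encodes the input and, if the BPE token count exceeds limit,
// decodes only the first `limit` tokens back into text. Cutting a BPE
// sequence can split a multi-byte character, so treat the result as
// approximate near the boundary.
func TruncateTokens(s string, limit int) (string, error) {
    r, err := tokenizer.Encode(s)
    if err != nil {
        return "", err
    }
    if len(r.Bpe) <= limit {
        return s, nil
    }
    return tokenizer.Decode(r.Bpe[:limit])
}

func main() {
    out, err := TruncateTokens("Many words map to one token, but some don't: indivisible.", 5)
    if err != nil {
        panic(err)
    }
    fmt.Println(out) // text recovered from the first 5 BPE tokens
}
```

Because every `Encode`/`Decode` call checks a pooled goja runtime in and out (see the `sync.Pool` in `tokenizer.go`), a helper like this can be called concurrently, as the `WithConcurrency` tests above exercise.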