├── .sccignore ├── tokenizers ├── sentencepiece │ ├── private │ │ ├── README.md │ │ └── protos │ │ │ ├── protos.go │ │ │ ├── README.md │ │ │ ├── gen_protos.sh │ │ │ ├── sentencepiece_model.proto │ │ │ └── sentencepiece_model.pb.go │ └── sentencepiece.go ├── api │ ├── api.go │ ├── config.go │ └── specialtoken_enumer.go └── tokenizers.go ├── huggingface.go ├── .gitignore ├── go.mod ├── hub ├── files_test.go ├── README.md ├── hub.go ├── info.go ├── download.go ├── repo.go └── files.go ├── internal ├── files │ └── files.go └── downloader │ ├── semaphore.go │ └── downloader.go ├── docs └── CHANGELOG.md ├── go.sum ├── README.md ├── LICENSE └── go-huggingface.ipynb /.sccignore: -------------------------------------------------------------------------------- 1 | internal/protos 2 | LICENSE 3 | .gitignore 4 | .idea 5 | -------------------------------------------------------------------------------- /tokenizers/sentencepiece/private/README.md: -------------------------------------------------------------------------------- 1 | # Private Packages: don't depend on these 2 | 3 | We don't use `internal/` because we need access to Jupyter Notebooks that we use for test and development. -------------------------------------------------------------------------------- /tokenizers/sentencepiece/private/protos/protos.go: -------------------------------------------------------------------------------- 1 | // Package protos have the Proto Buffer code for the sentencepiece_model.proto file, 2 | // downloaded from https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto. 3 | // 4 | // The Model 5 | package protos 6 | 7 | //go:generate ./gen_protos.sh 8 | -------------------------------------------------------------------------------- /huggingface.go: -------------------------------------------------------------------------------- 1 | // Package huggingface only holds the version of the set of tools to interact with HuggingFace using GoMLX. 2 | // 3 | // There are 3 main sub-packages: 4 | // 5 | // - hub: to download files from HuggingFace Hub, be it model files, tokenizers, data, etc. 6 | // - tokenizers: to create tokenizers from downloaded HuggingFace models. 7 | // - models: to convert model weights from different formats to GoMLX. 8 | package huggingface 9 | 10 | // Version of the library. 11 | // Manually kept in sync with project releases. 12 | var Version = "v0.0.0-dev" 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | go.work.sum 23 | 24 | # IDEs 25 | .idea/ 26 | 27 | # Notebooks temporary files. 
28 | .ipynb_checkpoints -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/gomlx/go-huggingface 2 | 3 | go 1.24.0 4 | 5 | require ( 6 | github.com/dustin/go-humanize v1.0.1 7 | github.com/eliben/go-sentencepiece v0.6.0 8 | github.com/gofrs/flock v0.13.0 9 | github.com/google/uuid v1.6.0 10 | github.com/pkg/errors v0.9.1 11 | github.com/stretchr/testify v1.11.1 12 | google.golang.org/protobuf v1.36.10 13 | ) 14 | 15 | require ( 16 | github.com/davecgh/go-spew v1.1.1 // indirect 17 | github.com/kr/text v0.2.0 // indirect 18 | github.com/pmezard/go-difflib v1.0.0 // indirect 19 | github.com/rogpeppe/go-internal v1.14.1 // indirect 20 | golang.org/x/sys v0.38.0 // indirect 21 | gopkg.in/yaml.v3 v3.0.1 // indirect 22 | ) 23 | -------------------------------------------------------------------------------- /tokenizers/sentencepiece/private/protos/README.md: -------------------------------------------------------------------------------- 1 | # Proto Files 2 | 3 | * [`sentencepiece_model.proto`](https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto) is 4 | downloaded from the C++ original source, in [https://github.com/google/sentencepiece/](https://github.com/google/sentencepiece), 5 | but it should match the one used by the [github.com/eliben/go-sentencepiece](https://github.com/eliben/go-sentencepiece) 6 | library. 7 | 8 | Because of protoc unique file naming requirement (!?), described in email thread in https://groups.google.com/g/protobuf/c/UWWuoRWz1Uk, 9 | we compile by first creating a unique prefix directory. See `gen_protos.sh` script. 10 | -------------------------------------------------------------------------------- /tokenizers/sentencepiece/private/protos/gen_protos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Find Go import path: presumably unique. 5 | import_path="$(go list -f '{{.ImportPath}}')" 6 | 7 | # Extract the domain and the rest of the path 8 | domain=$(echo "$import_path" | cut -d '/' -f 1) 9 | rest_of_path=$(echo "$import_path" | cut -d '/' -f 2-) 10 | 11 | # Reverse the domain part (split by '.') 12 | reversed_domain=$(echo "$domain" | awk -F '.' '{ for (i=NF; i>1; i--) printf "%s.", $i; print $1 }' | sed 's/\.$//') 13 | 14 | # Combine the reversed domain with the rest of the path 15 | tmp_link="$reversed_domain/$rest_of_path" 16 | tmp_link=$(echo "$tmp_link" | tr '/.' '__') 17 | rm -f "${tmp_link}" 18 | ln -s . "${tmp_link}" 19 | protoc --go_out=. 
--go_opt=paths=source_relative "./${tmp_link}/sentencepiece_model.proto" 20 | rm -f "${tmp_link}" 21 | -------------------------------------------------------------------------------- /hub/files_test.go: -------------------------------------------------------------------------------- 1 | package hub 2 | 3 | import ( 4 | "fmt" 5 | "github.com/stretchr/testify/assert" 6 | "path/filepath" 7 | "testing" 8 | ) 9 | 10 | func TestCleanRelativeFilePath(t *testing.T) { 11 | testCases := []struct { 12 | input string 13 | expected string 14 | }{ 15 | {"foo/bar", "foo/bar"}, 16 | {"foo/../bar", "bar"}, 17 | {"foo/./bar", "foo/bar"}, 18 | {"/foo/bar", "foo/bar"}, 19 | {"foo//bar", "foo/bar"}, 20 | {"foo/bar/..", "foo"}, 21 | {"../foo/bar", "foo/bar"}, 22 | {"foo/../../../..", "."}, 23 | {"foo/../../../bar", "bar"}, 24 | {"", "."}, 25 | {".", "."}, 26 | {"..", "."}, 27 | } 28 | 29 | for _, tc := range testCases { 30 | expected := filepath.FromSlash(tc.expected) 31 | got := cleanRelativeFilePath(tc.input) 32 | fmt.Printf("\tcleanRelativeFilePath(%q) = %q\n", tc.input, got) 33 | assert.Equal(t, expected, got) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /tokenizers/api/api.go: -------------------------------------------------------------------------------- 1 | // Package api defines the Tokenizer API. 2 | // It's just a hack to break the cyclic dependency, and allow the users to import `tokenizers` and get the 3 | // default implementations. 4 | package api 5 | 6 | // Tokenizer interface allows one to convert text to "tokens" (integer ids) and back. 7 | // 8 | // It also allows mapping of special tokens: tokens with a common semantic (like padding) but that 9 | // may map to different ids (int) for different tokenizers. 10 | type Tokenizer interface { 11 | Encode(text string) []int 12 | Decode([]int) string 13 | 14 | // SpecialTokenID returns the ID for the given special token if registered, or an error if not. 15 | SpecialTokenID(token SpecialToken) (int, error) 16 | } 17 | 18 | // SpecialToken is an enum of commonly used special tokens. 19 | type SpecialToken int 20 | 21 | const ( 22 | TokBeginningOfSentence SpecialToken = iota 23 | TokEndOfSentence 24 | TokUnknown 25 | TokPad 26 | TokMask 27 | TokClassification 28 | TokSpecialTokensCount 29 | ) 30 | 31 | //go:generate enumer -type=SpecialToken -trimprefix=Tok -transform=snake -values -text -json -yaml api.go 32 | -------------------------------------------------------------------------------- /internal/files/files.go: -------------------------------------------------------------------------------- 1 | // Package files implements generic file tools missing from the standard library. 2 | package files 3 | 4 | import ( 5 | "github.com/pkg/errors" 6 | "os" 7 | "os/user" 8 | "path" 9 | "strings" 10 | ) 11 | 12 | // Exists returns true if file or directory exists. 13 | func Exists(filePath string) bool { 14 | _, err := os.Stat(filePath) 15 | return err == nil 16 | } 17 | 18 | // ReplaceTildeInDir by the user's home directory. Returns dir if it doesn't start with "~".
19 | // 20 | // It returns an error if `dir` has an unknown user (e.g: `~unknown/...`) 21 | func ReplaceTildeInDir(dir string) (string, error) { 22 | if len(dir) == 0 { 23 | return dir, nil 24 | } 25 | if dir[0] != '~' { 26 | return dir, nil 27 | } 28 | var userName string 29 | if dir != "~" && !strings.HasPrefix(dir, "~/") { 30 | sepIdx := strings.IndexRune(dir, '/') 31 | if sepIdx == -1 { 32 | userName = dir[1:] 33 | } else { 34 | userName = dir[1:sepIdx] 35 | } 36 | } 37 | var usr *user.User 38 | var err error 39 | if userName == "" { 40 | usr, err = user.Current() 41 | } else { 42 | usr, err = user.Lookup(userName) 43 | } 44 | if err != nil { 45 | return dir, errors.Wrapf(err, "failed to lookup home directory for user in path %q", dir) 46 | } 47 | homeDir := usr.HomeDir 48 | return path.Join(homeDir, dir[1+len(userName):]), nil 49 | } 50 | -------------------------------------------------------------------------------- /hub/README.md: -------------------------------------------------------------------------------- 1 | # hub package 2 | Downloads HuggingFace Hub files, a port of huggingFace_hub python library to Go. 3 | 4 | ## Introduction 5 | 6 | A simple, straight-forward port of [github.com/huggingface/huggingface_hub](https://github.com/huggingface/huggingface_hub) library for Go. 7 | 8 | Features supported: 9 | 10 | - Cache system that matches HuggingFace Hub, so the same cache can be shared with Python. 11 | - Concurrency safe: only one download when multiple workers are trying to download simultaneously the same model. 12 | - Allow arbitrary progress function to be called (for progress bar). 13 | - Arbitrary revision. 14 | - Parallel download of files, max=20 by default. 15 | 16 | TODOs: 17 | 18 | - Add support for optional parameters. 19 | - Authentication tokens: should be relatively easy. 20 | - Resume downloads from interrupted connections. 21 | - Check disk-space before starting to download. 22 | 23 | ## Example 24 | 25 | Enumerate files from a HuggingFace repository and download all of them to a cache. 26 | 27 | ```go 28 | repo := hub.New(modelID).WithAuth(hfAuthToken) 29 | var fileNames []string 30 | for fileName, err := range repo.IterFileNames() { 31 | if err != nil { panic(err) } 32 | fmt.Printf("\t%s\n", fileName) 33 | fileNames = append(fileNames, fileName) 34 | } 35 | downloadedFiles, err := repo.DownloadFiles(fileNames...) 36 | if err != nil { ... } 37 | ``` -------------------------------------------------------------------------------- /docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # `go-huggingface` Changelog 2 | 3 | ## v0.3.1 4 | 5 | - Fixed go.mod/go.sum. 6 | 7 | ## v0.3.0 8 | 9 | - Bumped the version of GoMLX in tests and documentation. 10 | - Bumped version of dependencies: including github.com/daulet/tokenizers, which requires a fresh download of the 11 | corresponding c++ library libtokenizers.a. 12 | 13 | ## v0.2.2 14 | 15 | * Fixed file truncation issues during download. 16 | 17 | ## v0.2.1 18 | 19 | * Forcefully refresh (download) the revision's hash at least once before using. 20 | 21 | ## v0.2.0 22 | 23 | * Add Windows support by moving to the cross-platform flock: see PR #6, thanks to @mrmichaeladavis 24 | 25 | ## v0.1.2 26 | 27 | * If verbosity is 0, it won't print progress. 28 | * Added support for custom end-points. Default being "https://huggingface.co" or the environment variable 29 | `$HF_ENDPOINT` if defined. 30 | 31 | ## v0.1.1 32 | 33 | * Fixed URL resolution of non-model repos. 
34 | * Fixed sentencepiece Tokenizer and tokenizer API string methods (using `enumer`). 35 | * Added dataset example. 36 | * Added usage with Rust tokenizer. 37 | * Improved README.md 38 | * Added SentencePiece proto support – to be used in future conversion of SentencePiece models. 39 | * Improved documentation. 40 | 41 | ## v0.1.0 42 | 43 | * package `hub`: inspect and download files from arbitrary repos. Very functional. 44 | * package `tokenizers`: 45 | * Interfaces, types and constants. 46 | * Gemma tokenizer implementation. 47 | * Not any other tokenizer implemented yet. 48 | * Examples in `README.md`. 49 | -------------------------------------------------------------------------------- /internal/downloader/semaphore.go: -------------------------------------------------------------------------------- 1 | package downloader 2 | 3 | import "sync" 4 | 5 | // Semaphore that allows dynamic resizing. 6 | // 7 | // It uses a sync.Cond to allow dynamic resizing, but it will be slower than a pure channel implementation 8 | // with a fixed capacity. This cost shouldn't matter for more coarse resource control. 9 | // 10 | // Implementation copied from github.com/gomlx/gomlx/pkg/support/xsync. 11 | type Semaphore struct { 12 | cond sync.Cond 13 | capacity, current int // Tracks capacity and current usage. 14 | } 15 | 16 | // NewSemaphore returns a Semaphore that allows at most capacity simultaneous acquisitions. 17 | // If capacity <= 0, there is no limit on acquisitions. 18 | // 19 | // FIFO ordering may be lost during resizes (Semaphore.Resize) to larger capacity, but otherwise it is respected. 20 | func NewSemaphore(capacity int) *Semaphore { 21 | return &Semaphore{ 22 | cond: sync.Cond{L: &sync.Mutex{}}, 23 | capacity: capacity, 24 | } 25 | } 26 | 27 | // Acquire resource observing current semaphore capacity. 28 | // It must be matched by exactly one call to Semaphore.Release after the reservation is no longer needed. 29 | func (s *Semaphore) Acquire() { 30 | s.cond.L.Lock() 31 | defer s.cond.L.Unlock() 32 | for { 33 | if s.capacity <= 0 || s.current < s.capacity { 34 | // No limits. 35 | s.current++ 36 | return 37 | } 38 | s.cond.Wait() 39 | } 40 | } 41 | 42 | // Release resource previously allocated with Semaphore.Acquire. 43 | func (s *Semaphore) Release() { 44 | s.cond.L.Lock() 45 | defer s.cond.L.Unlock() 46 | s.current-- 47 | if s.capacity == 0 || s.current < s.capacity-1 { 48 | return 49 | } 50 | s.cond.Signal() 51 | } 52 | 53 | // Resize the number of available resources in the Semaphore. 54 | // 55 | // If the newCapacity is larger than the previous one, this may immediately allow pending Semaphore.Acquire to proceed. 56 | // Notice since all waiting Semaphore.Acquire are awoken (broadcast), the queue order may be lost. 57 | // 58 | // If the newCapacity is smaller than the previous one, it doesn't have any effect on current acquisitions. So if the Semaphore 59 | // is being used to control a worker pool, reducing its size won't stop workers currently executing. 60 | func (s *Semaphore) Resize(newCapacity int) { 61 | s.cond.L.Lock() 62 | defer s.cond.L.Unlock() 63 | if newCapacity == s.capacity { 64 | return // No change needed. 65 | } 66 | if (newCapacity > 0 && newCapacity < s.capacity) || s.capacity == 0 { 67 | // Capacity is shrinking, no Semaphore.Acquire will be released. 
68 | s.capacity = newCapacity 69 | return 70 | } 71 | 72 | // Wake-up everyone -- to preserve the queue order, we would need to call s.cond.Signal() for the amount of 73 | // increased capacity, but that would make this call O(capacity), potentially slow for large capacities. 74 | s.capacity = newCapacity 75 | s.cond.Broadcast() 76 | } 77 | -------------------------------------------------------------------------------- /tokenizers/sentencepiece/sentencepiece.go: -------------------------------------------------------------------------------- 1 | // Package sentencepiece implements a tokenizers.Tokenizer based on SentencePiece tokenizer. 2 | package sentencepiece 3 | 4 | import ( 5 | esentencepiece "github.com/eliben/go-sentencepiece" 6 | "github.com/gomlx/go-huggingface/hub" 7 | "github.com/gomlx/go-huggingface/tokenizers/api" 8 | "github.com/pkg/errors" 9 | ) 10 | 11 | // New creates a SentencePiece tokenizer based on the "tokenizer.model" file, which must be a 12 | // SentencePiece Model proto (see protos.Model). 13 | // 14 | // It implements the tokenizers.TokenizerConstructor function signature. 15 | func New(config *api.Config, repo *hub.Repo) (api.Tokenizer, error) { 16 | if !repo.HasFile("tokenizer.model") { 17 | return nil, errors.Errorf("\"tokenizer.model\" file not found in repo") 18 | } 19 | tokenizerFile, err := repo.DownloadFile("tokenizer.model") 20 | if err != nil { 21 | return nil, errors.Wrapf(err, "can't download tokenizer.model file") 22 | } 23 | proc, err := esentencepiece.NewProcessorFromPath(tokenizerFile) 24 | if err != nil { 25 | return nil, errors.Wrapf(err, "can't create sentencepiece tokenizer") 26 | } 27 | return &Tokenizer{ 28 | Processor: proc, 29 | Info: proc.ModelInfo(), 30 | }, nil 31 | } 32 | 33 | // Tokenizer implements tokenizers.Tokenizer interface based on SentencePiece tokenizer by Google. 34 | type Tokenizer struct { 35 | *esentencepiece.Processor 36 | Info *esentencepiece.ModelInfo 37 | } 38 | 39 | // Compile time assert that sentencepiece.Tokenizer implements tokenizers.Tokenizer interface. 40 | var _ api.Tokenizer = &Tokenizer{} 41 | 42 | // Encode returns the text encoded into a sequence of ids. 43 | // It implements sampler.Vocabulary. 44 | func (p *Tokenizer) Encode(text string) []int { 45 | tokens := p.Processor.Encode(text) 46 | return sliceMap(tokens, func(t esentencepiece.Token) int { return t.ID }) 47 | } 48 | 49 | // Decode returns the text from a sequence of ids. 50 | // It implements sampler.Vocabulary. 51 | func (p *Tokenizer) Decode(ids []int) string { 52 | return p.Processor.Decode(ids) 53 | } 54 | 55 | // SpecialTokenID returns the token for the given symbol, or an error if not known. 56 | func (p *Tokenizer) SpecialTokenID(token api.SpecialToken) (int, error) { 57 | switch token { 58 | case api.TokUnknown: 59 | return p.Info.UnknownID, nil 60 | case api.TokPad: 61 | return p.Info.PadID, nil 62 | case api.TokBeginningOfSentence: 63 | return p.Info.BeginningOfSentenceID, nil 64 | case api.TokEndOfSentence: 65 | return p.Info.EndOfSentenceID, nil 66 | default: 67 | return 0, errors.Errorf("unknown special token: %s (%d)", token, int(token)) 68 | } 69 | } 70 | 71 | // sliceMap executes the given function sequentially for every element of in, and returns a mapped slice.
72 | func sliceMap[In, Out any](in []In, fn func(e In) Out) (out []Out) { 73 | out = make([]Out, len(in)) 74 | for ii, e := range in { 75 | out[ii] = fn(e) 76 | } 77 | return 78 | } 79 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 2 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 3 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= 5 | github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= 6 | github.com/eliben/go-sentencepiece v0.6.0 h1:wbnefMCxYyVYmeTVtiMJet+mS9CVwq5klveLpfQLsnk= 7 | github.com/eliben/go-sentencepiece v0.6.0/go.mod h1:nNYk4aMzgBoI6QFp4LUG8Eu1uO9fHD9L5ZEre93o9+c= 8 | github.com/gofrs/flock v0.13.0 h1:95JolYOvGMqeH31+FC7D2+uULf6mG61mEZ/A8dRYMzw= 9 | github.com/gofrs/flock v0.13.0/go.mod h1:jxeyy9R1auM5S6JYDBhDt+E2TCo7DkratH4Pgi8P+Z0= 10 | github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= 11 | github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= 12 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 13 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 14 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 15 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 16 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 17 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 18 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 19 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 20 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 21 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 22 | github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= 23 | github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= 24 | github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= 25 | github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= 26 | golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= 27 | golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= 28 | google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= 29 | google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= 30 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 31 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 32 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 33 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 34 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 35 | -------------------------------------------------------------------------------- /tokenizers/api/config.go: 
-------------------------------------------------------------------------------- 1 | package api 2 | 3 | import ( 4 | "encoding/json" 5 | "github.com/pkg/errors" 6 | "os" 7 | ) 8 | 9 | type TokensDecoder struct { 10 | Content string `json:"content"` 11 | Lstrip bool `json:"lstrip"` 12 | Normalized bool `json:"normalized"` 13 | Rstrip bool `json:"rstrip"` 14 | SingleWord bool `json:"single_word"` 15 | Special bool `json:"special"` 16 | } 17 | 18 | // Config struct to hold HuggingFace's tokenizer_config.json contents. 19 | // There is no formal schema for this file, but these are some common fields that may be of use. 20 | // Specific tokenizer classes are free to implement additional features as they see fit. 21 | // 22 | // The extra field ConfigFile holds the path to the file with the full config. 23 | type Config struct { 24 | ConfigFile string 25 | TokenizerClass string `json:"tokenizer_class"` 26 | 27 | ChatTemplate string `json:"chat_template"` 28 | UseDefaultSystemPrompt bool `json:"use_default_system_prompt"` 29 | 30 | ModelMaxLength float64 `json:"model_max_length"` 31 | MaxLength float64 `json:"max_length"` 32 | SpModelKwargs map[string]any `json:"sp_model_kwargs"` 33 | 34 | ClsToken string `json:"cls_token"` 35 | UnkToken string `json:"unk_token"` 36 | SepToken string `json:"sep_token"` 37 | MaskToken string `json:"mask_token"` 38 | BosToken string `json:"bos_token"` 39 | EosToken string `json:"eos_token"` 40 | PadToken string `json:"pad_token"` 41 | 42 | AddBosToken bool `json:"add_bos_token"` 43 | AddEosToken bool `json:"add_eos_token"` 44 | AddedTokensDecoder map[int]TokensDecoder `json:"added_tokens_decoder"` 45 | AdditionalSpecialTokens []string `json:"additional_special_tokens"` 46 | 47 | DoLowerCase bool `json:"do_lower_case"` 48 | CleanUpTokenizationSpaces bool `json:"clean_up_tokenization_spaces"` 49 | SpacesBetweenSpecialTokens bool `json:"spaces_between_special_tokens"` 50 | 51 | TokenizeChineseChars bool `json:"tokenize_chinese_chars"` 52 | StripAccents any `json:"strip_accents"` 53 | NameOrPath string `json:"name_or_path"` 54 | DoBasicTokenize bool `json:"do_basic_tokenize"` 55 | NeverSplit any `json:"never_split"` 56 | 57 | Stride int `json:"stride"` 58 | TruncationSide string `json:"truncation_side"` 59 | TruncationStrategy string `json:"truncation_strategy"` 60 | } 61 | 62 | // ParseConfigFile parses the given file (holding a tokenizer_config.json file) into a Config structure. 63 | func ParseConfigFile(filePath string) (*Config, error) { 64 | content, err := os.ReadFile(filePath) 65 | if err != nil { 66 | return nil, errors.Wrapf(err, "failed to read file %q", filePath) 67 | } 68 | config, err := ParseConfigContent(content) 69 | if err != nil { 70 | return nil, errors.WithMessagef(err, "read from file %q", filePath) 71 | } 72 | config.ConfigFile = filePath 73 | return config, nil 74 | } 75 | 76 | // ParseConfigContent parses the given json content (of a tokenizer_config.json file) into a Config structure. 
77 | func ParseConfigContent(jsonContent []byte) (*Config, error) { 78 | config := &Config{} 79 | err := json.Unmarshal(jsonContent, config) 80 | if err != nil { 81 | return nil, errors.Wrapf(err, "failed to parse tokenizer_config json content") 82 | } 83 | return config, nil 84 | } 85 | -------------------------------------------------------------------------------- /hub/hub.go: -------------------------------------------------------------------------------- 1 | // Package hub can be used to download and cache files from HuggingFace Hub, which may 2 | // be models, tokenizers or anything. 3 | // 4 | // It is meant to be a port of huggingFace_hub python library to Go, and be able to share the same 5 | // cache structure (usually under "~/.cache/huggingface/hub"). 6 | // 7 | // It is also safe to be used concurrently by multiple programs -- it uses file system lock to control concurrency. 8 | // 9 | // Typical usage will be something like: 10 | // 11 | // repo := hub.New(modelID).WithAuth(hfAuthToken) 12 | // var fileNames []string 13 | // for fileName, err := range repo.IterFileNames() { 14 | // if err != nil { panic(err) } 15 | // fmt.Printf("\t%s\n", fileName) 16 | // fileNames = append(fileNames, fileName) 17 | // } 18 | // downloadedFiles, err := repo.DownloadFiles(fileNames...) 19 | // if err != nil { ... } 20 | // 21 | // From here, downloadedFiles will point to files in the local cache that one can read. 22 | // 23 | // Environment variables: 24 | // 25 | // - HF_ENDPOINT: Where to connect to huggingface, default is https://huggingface.co 26 | // - XDG_CACHE_HOME: Cache directory, defaults to ${HOME}/.cache 27 | package hub 28 | 29 | import ( 30 | "fmt" 31 | "github.com/gomlx/go-huggingface" 32 | "github.com/google/uuid" 33 | "github.com/pkg/errors" 34 | "os" 35 | "path" 36 | "runtime" 37 | "strings" 38 | ) 39 | 40 | // SessionId is unique and always created anew at the start of the program, and used during the life of the program. 41 | var SessionId string 42 | 43 | // panicf generates an error message and panics with it, in one function. 44 | func panicf(format string, args ...any) { 45 | err := errors.Errorf(format, args...) 46 | panic(err) 47 | } 48 | 49 | func init() { 50 | sessionUUID, err := uuid.NewRandom() 51 | if err != nil { 52 | panicf("failed generating UUID for SessionId: %v", err) 53 | } 54 | SessionId = strings.Replace(sessionUUID.String(), "-", "", -1) 55 | } 56 | 57 | var ( 58 | // DefaultDirCreationPerm is used when creating new cache subdirectories. 59 | DefaultDirCreationPerm = os.FileMode(0755) 60 | 61 | // DefaultFileCreationPerm is used when creating files inside the cache subdirectories. 62 | DefaultFileCreationPerm = os.FileMode(0644) 63 | ) 64 | 65 | const ( 66 | tokenizersVersion = "0.0.1" 67 | ) 68 | 69 | const ( 70 | HeaderXRepoCommit = "X-Repo-Commit" 71 | HeaderXLinkedETag = "X-Linked-Etag" 72 | HeaderXLinkedSize = "X-Linked-Size" 73 | ) 74 | 75 | func getEnvOr(key, defaultValue string) string { 76 | v := os.Getenv(key) 77 | if v == "" { 78 | return defaultValue 79 | } 80 | return v 81 | } 82 | 83 | // DefaultCacheDir for HuggingFace Hub, same used by the python library. 84 | // 85 | // Its prefix is either `${XDG_CACHE_HOME}` if set, or `~/.cache` otherwise. Followed by `/huggingface/hub/`. 86 | // So typically: `~/.cache/huggingface/hub/`. 
87 | func DefaultCacheDir() string { 88 | cacheDir := getEnvOr("XDG_CACHE_HOME", path.Join(os.Getenv("HOME"), ".cache")) 89 | cacheDir = path.Join(cacheDir, "huggingface", "hub") 90 | return cacheDir 91 | } 92 | 93 | // DefaultHttpUserAgent returns a user agent to use with HuggingFace Hub API. 94 | func DefaultHttpUserAgent() string { 95 | return fmt.Sprintf("go-huggingface/%v; golang/%s; session_id/%s", 96 | huggingface.Version, runtime.Version(), SessionId) 97 | } 98 | 99 | // RepoIdSeparator is used to separate repository/model names parts when mapping to file names. 100 | // Likely only for internal use. 101 | const RepoIdSeparator = "--" 102 | 103 | // RepoType supported by HuggingFace-Hub 104 | type RepoType string 105 | 106 | const ( 107 | RepoTypeDataset RepoType = "datasets" 108 | RepoTypeSpace RepoType = "spaces" 109 | RepoTypeModel RepoType = "models" 110 | ) 111 | -------------------------------------------------------------------------------- /hub/info.go: -------------------------------------------------------------------------------- 1 | package hub 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "encoding/json" 7 | "fmt" 8 | "log" 9 | "os" 10 | "path" 11 | 12 | "github.com/gomlx/go-huggingface/internal/files" 13 | "github.com/pkg/errors" 14 | ) 15 | 16 | // RepoInfo holds information about a HuggingFace repo, it is the json served when hitting the URL 17 | // https://huggingface.co/api// 18 | // 19 | // TODO: Not complete, only holding the fields used so far by the library. 20 | type RepoInfo struct { 21 | ID string `json:"id"` 22 | ModelID string `json:"model_id"` 23 | Author string `json:"author"` 24 | CommitHash string `json:"sha"` 25 | Tags []string `json:"tags"` 26 | Siblings []*FileInfo `json:"siblings"` 27 | SafeTensors SafeTensorsInfo `json:"safetensors"` 28 | } 29 | 30 | // FileInfo represents one of the model file, in the Info structure. 31 | type FileInfo struct { 32 | Name string `json:"rfilename"` 33 | } 34 | 35 | // SafeTensorsInfo holds counts on number of parameters of various types. 36 | type SafeTensorsInfo struct { 37 | Total int 38 | 39 | // Parameters: maps dtype name to int. 40 | Parameters map[string]int 41 | } 42 | 43 | // Info returns the RepoInfo structure about the model. 44 | // Most users don't need to call this directly, instead use the various iterators. 45 | // 46 | // If it hasn't been downloaded or loaded from the cache yet, it loads it first. 47 | // 48 | // It may return nil if there was an issue with the downloading of the RepoInfo json from HuggingFace. 49 | // Try DownloadInfo to get an error. 50 | func (r *Repo) Info() *RepoInfo { 51 | if r.info == nil { 52 | err := r.DownloadInfo(false) 53 | if err != nil { 54 | log.Printf("Error while downloading info about Repo: %+v", err) 55 | } 56 | } 57 | return r.info 58 | } 59 | 60 | // infoURL for the API that returns the info about a repository. 61 | func (r *Repo) infoURL() string { 62 | return fmt.Sprintf("%s/api/%s/%s/revision/%s", r.hfEndpoint, r.repoType, r.ID, r.revision) 63 | } 64 | 65 | // DownloadInfo about the model, if it hasn't yet. 66 | // 67 | // It will attempt to use the "_info_.json" file in the cache directory first. 68 | // 69 | // If forceDownload is set to true, it ignores the current info or the cached one, and download it again from HuggingFace. 70 | // 71 | // See Repo.Info to access the Info directory. 72 | // Most users don't need to call this directly, instead use the various iterators. 
73 | func (r *Repo) DownloadInfo(forceDownload bool) error { 74 | if r.info != nil && !forceDownload { 75 | return nil 76 | } 77 | 78 | // Create directory and file path for the info file. 79 | infoFilePath, err := r.repoCacheDir() 80 | if err != nil { 81 | return err 82 | } 83 | infoFilePath = path.Join(infoFilePath, "info") 84 | if err = os.MkdirAll(infoFilePath, DefaultDirCreationPerm); err != nil { 85 | return errors.Wrapf(err, "while creating info directory %q", infoFilePath) 86 | } 87 | infoFilePath = path.Join(infoFilePath, r.revision) 88 | 89 | // Download info file if needed. 90 | if !files.Exists(infoFilePath) || forceDownload { 91 | err := r.lockedDownload(context.Background(), r.infoURL(), infoFilePath, forceDownload, nil) 92 | if err != nil { 93 | return errors.WithMessagef(err, "failed to download repository info") 94 | } 95 | } 96 | 97 | // Read the cached info file from disk. 98 | infoJson, err := os.ReadFile(infoFilePath) 99 | if err != nil { 100 | return errors.Wrapf(err, "failed to read info for model from disk in %q -- remove the file if you want to have it re-downloaded", 101 | infoFilePath) 102 | } 103 | 104 | decoder := json.NewDecoder(bytes.NewReader(infoJson)) 105 | newInfo := &RepoInfo{} 106 | if err = decoder.Decode(newInfo); err != nil { 107 | return errors.Wrapf(err, "failed to parse info for model in %q (downloaded from %q)", 108 | infoFilePath, r.infoURL()) 109 | } 110 | r.info = newInfo 111 | return nil 112 | } 113 | -------------------------------------------------------------------------------- /tokenizers/tokenizers.go: -------------------------------------------------------------------------------- 1 | // Package tokenizers creates tokenizers from HuggingFace models. 2 | // 3 | // Given a HuggingFace repository (see hub.New to create one), tokenizers will use its "tokenizer_config.json" 4 | // and "tokenizer.json" to instantiate a Tokenizer. 5 | package tokenizers 6 | 7 | import ( 8 | "github.com/gomlx/go-huggingface/hub" 9 | "github.com/gomlx/go-huggingface/tokenizers/api" 10 | "github.com/gomlx/go-huggingface/tokenizers/sentencepiece" 11 | "github.com/pkg/errors" 12 | 13 | // Blank import. 14 | _ "github.com/gomlx/go-huggingface/tokenizers/sentencepiece" 15 | ) 16 | 17 | // Tokenizer interface allows one to convert text to "tokens" (integer ids) and back. 18 | // 19 | // It also allows mapping of special tokens: tokens with a common semantic (like padding) but that 20 | // may map to different ids (int) for different tokenizers. 21 | type Tokenizer = api.Tokenizer 22 | 23 | // SpecialToken is an enum of commonly used special tokens. 24 | type SpecialToken = api.SpecialToken 25 | 26 | const ( 27 | TokBeginningOfSentence = api.TokBeginningOfSentence 28 | TokEndOfSentence = api.TokEndOfSentence 29 | TokUnknown = api.TokUnknown 30 | TokPad = api.TokPad 31 | TokMask = api.TokMask 32 | TokClassification = api.TokClassification 33 | TokSpecialTokensCount = api.TokSpecialTokensCount 34 | ) 35 | 36 | // New creates a new tokenizer from the given HuggingFace repo (see hub.New). 37 | // 38 | // Currently, it only supports "SentencePiece" encoders, and it attempts to download details from 39 | // the repo files "tokenizer_config.json" and "tokenizer.json". 40 | // 41 | // If it fails to load those files, or to create a tokenizer, it returns an error.
42 | func New(repo *hub.Repo) (Tokenizer, error) { 43 | err := repo.DownloadInfo(false) 44 | if err != nil { 45 | return nil, err 46 | } 47 | 48 | config, err := GetConfig(repo) 49 | if err != nil { 50 | return nil, err 51 | } 52 | 53 | constructor, found := registerOfClasses[config.TokenizerClass] 54 | if !found { 55 | return nil, errors.Errorf("unknown tokenizer class %q", config.TokenizerClass) 56 | } 57 | return constructor(config, repo) 58 | } 59 | 60 | // GetConfig returns the parsed "tokenizer_config.json" Config object for the repo. 61 | func GetConfig(repo *hub.Repo) (*api.Config, error) { 62 | err := repo.DownloadInfo(false) 63 | if err != nil { 64 | return nil, err 65 | } 66 | localConfigFile, err := repo.DownloadFile("tokenizer_config.json") 67 | if err != nil { 68 | return nil, err 69 | } 70 | config, err := api.ParseConfigFile(localConfigFile) // tokenizer_config.json 71 | if err != nil { 72 | return nil, err 73 | } 74 | return config, nil 75 | } 76 | 77 | // Config struct to hold HuggingFace's tokenizer_config.json contents. 78 | // There is no formal schema for this file, but these are some common fields that may be of use. 79 | // Specific tokenizer classes are free to implement additional features as they see fit. 80 | // 81 | // The extra field ConfigFile holds the path to the file with the full config. 82 | type Config = api.Config 83 | 84 | // TokenizerConstructor is used by Tokenizer implementations to provide implementations for different 85 | // tokenizer classes. 86 | type TokenizerConstructor func(config *api.Config, repo *hub.Repo) (api.Tokenizer, error) 87 | 88 | // RegisterTokenizerClass used by Tokenizer implementations. 89 | func RegisterTokenizerClass(name string, constructor TokenizerConstructor) { 90 | registerOfClasses[name] = constructor 91 | } 92 | 93 | var ( 94 | registerOfClasses = make(map[string]TokenizerConstructor) 95 | ) 96 | 97 | func init() { 98 | // Initialize sentencepiece tokenizer classes, always included. 99 | RegisterTokenizerClass("GemmaTokenizer", sentencepiece.New) 100 | 101 | //for _, className := range []string{ 102 | // "GemmaTokenizer", "BertTokenizer", "DebertaV2Tokenizer", "DistilBertTokenizer", 103 | // "DistilBertTokenizer", "RobertaTokenizer"} { 104 | //} 105 | } 106 | -------------------------------------------------------------------------------- /tokenizers/api/specialtoken_enumer.go: -------------------------------------------------------------------------------- 1 | // Code generated by "enumer -type=SpecialToken -trimprefix=Tok -transform=snake -values -text -json -yaml api.go"; DO NOT EDIT. 2 | 3 | package api 4 | 5 | import ( 6 | "encoding/json" 7 | "fmt" 8 | "strings" 9 | ) 10 | 11 | const _SpecialTokenName = "beginning_of_sentenceend_of_sentenceunknownpadmaskclassificationspecial_tokens_count" 12 | 13 | var _SpecialTokenIndex = [...]uint8{0, 21, 36, 43, 46, 50, 64, 84} 14 | 15 | const _SpecialTokenLowerName = "beginning_of_sentenceend_of_sentenceunknownpadmaskclassificationspecial_tokens_count" 16 | 17 | func (i SpecialToken) String() string { 18 | if i < 0 || i >= SpecialToken(len(_SpecialTokenIndex)-1) { 19 | return fmt.Sprintf("SpecialToken(%d)", i) 20 | } 21 | return _SpecialTokenName[_SpecialTokenIndex[i]:_SpecialTokenIndex[i+1]] 22 | } 23 | 24 | func (SpecialToken) Values() []string { 25 | return SpecialTokenStrings() 26 | } 27 | 28 | // An "invalid array index" compiler error signifies that the constant values have changed. 29 | // Re-run the stringer command to generate them again. 
30 | func _SpecialTokenNoOp() { 31 | var x [1]struct{} 32 | _ = x[TokBeginningOfSentence-(0)] 33 | _ = x[TokEndOfSentence-(1)] 34 | _ = x[TokUnknown-(2)] 35 | _ = x[TokPad-(3)] 36 | _ = x[TokMask-(4)] 37 | _ = x[TokClassification-(5)] 38 | _ = x[TokSpecialTokensCount-(6)] 39 | } 40 | 41 | var _SpecialTokenValues = []SpecialToken{TokBeginningOfSentence, TokEndOfSentence, TokUnknown, TokPad, TokMask, TokClassification, TokSpecialTokensCount} 42 | 43 | var _SpecialTokenNameToValueMap = map[string]SpecialToken{ 44 | _SpecialTokenName[0:21]: TokBeginningOfSentence, 45 | _SpecialTokenLowerName[0:21]: TokBeginningOfSentence, 46 | _SpecialTokenName[21:36]: TokEndOfSentence, 47 | _SpecialTokenLowerName[21:36]: TokEndOfSentence, 48 | _SpecialTokenName[36:43]: TokUnknown, 49 | _SpecialTokenLowerName[36:43]: TokUnknown, 50 | _SpecialTokenName[43:46]: TokPad, 51 | _SpecialTokenLowerName[43:46]: TokPad, 52 | _SpecialTokenName[46:50]: TokMask, 53 | _SpecialTokenLowerName[46:50]: TokMask, 54 | _SpecialTokenName[50:64]: TokClassification, 55 | _SpecialTokenLowerName[50:64]: TokClassification, 56 | _SpecialTokenName[64:84]: TokSpecialTokensCount, 57 | _SpecialTokenLowerName[64:84]: TokSpecialTokensCount, 58 | } 59 | 60 | var _SpecialTokenNames = []string{ 61 | _SpecialTokenName[0:21], 62 | _SpecialTokenName[21:36], 63 | _SpecialTokenName[36:43], 64 | _SpecialTokenName[43:46], 65 | _SpecialTokenName[46:50], 66 | _SpecialTokenName[50:64], 67 | _SpecialTokenName[64:84], 68 | } 69 | 70 | // SpecialTokenString retrieves an enum value from the enum constants string name. 71 | // Throws an error if the param is not part of the enum. 72 | func SpecialTokenString(s string) (SpecialToken, error) { 73 | if val, ok := _SpecialTokenNameToValueMap[s]; ok { 74 | return val, nil 75 | } 76 | 77 | if val, ok := _SpecialTokenNameToValueMap[strings.ToLower(s)]; ok { 78 | return val, nil 79 | } 80 | return 0, fmt.Errorf("%s does not belong to SpecialToken values", s) 81 | } 82 | 83 | // SpecialTokenValues returns all values of the enum 84 | func SpecialTokenValues() []SpecialToken { 85 | return _SpecialTokenValues 86 | } 87 | 88 | // SpecialTokenStrings returns a slice of all String values of the enum 89 | func SpecialTokenStrings() []string { 90 | strs := make([]string, len(_SpecialTokenNames)) 91 | copy(strs, _SpecialTokenNames) 92 | return strs 93 | } 94 | 95 | // IsASpecialToken returns "true" if the value is listed in the enum definition. 
"false" otherwise 96 | func (i SpecialToken) IsASpecialToken() bool { 97 | for _, v := range _SpecialTokenValues { 98 | if i == v { 99 | return true 100 | } 101 | } 102 | return false 103 | } 104 | 105 | // MarshalJSON implements the json.Marshaler interface for SpecialToken 106 | func (i SpecialToken) MarshalJSON() ([]byte, error) { 107 | return json.Marshal(i.String()) 108 | } 109 | 110 | // UnmarshalJSON implements the json.Unmarshaler interface for SpecialToken 111 | func (i *SpecialToken) UnmarshalJSON(data []byte) error { 112 | var s string 113 | if err := json.Unmarshal(data, &s); err != nil { 114 | return fmt.Errorf("SpecialToken should be a string, got %s", data) 115 | } 116 | 117 | var err error 118 | *i, err = SpecialTokenString(s) 119 | return err 120 | } 121 | 122 | // MarshalText implements the encoding.TextMarshaler interface for SpecialToken 123 | func (i SpecialToken) MarshalText() ([]byte, error) { 124 | return []byte(i.String()), nil 125 | } 126 | 127 | // UnmarshalText implements the encoding.TextUnmarshaler interface for SpecialToken 128 | func (i *SpecialToken) UnmarshalText(text []byte) error { 129 | var err error 130 | *i, err = SpecialTokenString(string(text)) 131 | return err 132 | } 133 | 134 | // MarshalYAML implements a YAML Marshaler for SpecialToken 135 | func (i SpecialToken) MarshalYAML() (interface{}, error) { 136 | return i.String(), nil 137 | } 138 | 139 | // UnmarshalYAML implements a YAML Unmarshaler for SpecialToken 140 | func (i *SpecialToken) UnmarshalYAML(unmarshal func(interface{}) error) error { 141 | var s string 142 | if err := unmarshal(&s); err != nil { 143 | return err 144 | } 145 | 146 | var err error 147 | *i, err = SpecialTokenString(s) 148 | return err 149 | } 150 | -------------------------------------------------------------------------------- /hub/download.go: -------------------------------------------------------------------------------- 1 | package hub 2 | 3 | import ( 4 | "context" 5 | "log" 6 | "math/rand" 7 | "os" 8 | "path" 9 | "time" 10 | 11 | "github.com/gofrs/flock" 12 | "github.com/gomlx/go-huggingface/internal/downloader" 13 | "github.com/gomlx/go-huggingface/internal/files" 14 | "github.com/pkg/errors" 15 | ) 16 | 17 | // Generic download utilities. 18 | 19 | // getDownloadManager returns current downloader.Manager, or creates a new one for this Repo. 20 | func (r *Repo) getDownloadManager() *downloader.Manager { 21 | if r.downloadManager == nil { 22 | r.downloadManager = downloader.New().MaxParallel(r.MaxParallelDownload).WithAuthToken(r.authToken) 23 | } 24 | return r.downloadManager 25 | } 26 | 27 | // lockedDownload url to the given filePath. 28 | // 29 | // If filePath exits and forceDownload is false, it is assumed to already have been correctly downloaded, and it will return immediately. 30 | // 31 | // It downloads the file to filePath+".tmp" and then atomically move it to filePath. 32 | // 33 | // It uses a temporary filePath+".lock" to coordinate multiple processes/programs trying to download the same file at the same time. 34 | func (r *Repo) lockedDownload(ctx context.Context, url, filePath string, forceDownload bool, progressCallback downloader.ProgressCallback) error { 35 | if files.Exists(filePath) { 36 | if !forceDownload { 37 | return nil 38 | } 39 | err := os.Remove(filePath) 40 | if err != nil { 41 | return errors.Wrapf(err, "failed to remove %q while force-downloading %q", filePath, url) 42 | } 43 | } 44 | 45 | // Checks whether context has already been cancelled, and exit immediately. 
46 | if err := ctx.Err(); err != nil { 47 | return err 48 | } 49 | 50 | // Create a directory for the file. 51 | if err := os.MkdirAll(path.Dir(filePath), DefaultDirCreationPerm); err != nil { 52 | return errors.Wrapf(err, "failed to create directory for file %q", filePath) 53 | } 54 | 55 | // Lock file to avoid parallel downloads. 56 | lockPath := filePath + ".lock" 57 | var mainErr error 58 | errLock := execOnFileLock(lockPath, func() { 59 | if files.Exists(filePath) { 60 | // Some concurrent other process (or goroutine) already downloaded the file. 61 | return 62 | } 63 | 64 | // Create tmpFile where to download. 65 | var tmpFileClosed bool 66 | tmpPath := filePath + ".downloading" 67 | tmpFile, err := os.Create(tmpPath) 68 | if err != nil { 69 | mainErr = errors.Wrapf(err, "creating temporary file for download in %q", tmpPath) 70 | return 71 | } 72 | defer func() { 73 | // If we exit with an error, make sure to close and remove unfinished temporary file. 74 | if !tmpFileClosed { 75 | err := tmpFile.Close() 76 | if err != nil { 77 | log.Printf("Failed closing temporary file %q: %v", tmpPath, err) 78 | } 79 | err = os.Remove(tmpPath) 80 | if err != nil { 81 | log.Printf("Failed removing temporary file %q: %v", tmpPath, err) 82 | } 83 | } 84 | }() 85 | 86 | downloadManager := r.getDownloadManager() 87 | mainErr = downloadManager.Download(ctx, url, tmpPath, progressCallback) 88 | if mainErr != nil { 89 | mainErr = errors.WithMessagef(mainErr, "while downloading %q to %q", url, tmpPath) 90 | return 91 | } 92 | 93 | // Download succeeded, move to our target location. 94 | tmpFileClosed = true 95 | if err := tmpFile.Close(); err != nil { 96 | mainErr = errors.Wrapf(err, "failed to close temporary download file %q", tmpPath) 97 | return 98 | } 99 | if err := os.Rename(tmpPath, filePath); err != nil { 100 | mainErr = errors.Wrapf(err, "failed to move downloaded file %q to %q", tmpPath, filePath) 101 | return 102 | } 103 | 104 | // File already exists, so we no longer need the lock file. 105 | err = os.Remove(lockPath) 106 | if err != nil { 107 | log.Printf("Warning: error removing lock file %q: %+v", lockPath, err) 108 | } 109 | }) 110 | if mainErr != nil { 111 | return mainErr 112 | } 113 | if errLock != nil { 114 | return errors.WithMessagef(errLock, "while locking %q to download %q", lockPath, url) 115 | } 116 | return nil 117 | } 118 | 119 | // execOnFileLock opens the lockPath file (or creates if it doesn't yet exist), locks it, and executes the function. 120 | // If the lockPath is already locked, it polls with a 1 to 2 seconds period (randomly), until it acquires the lock. 121 | // 122 | // The lockPath is not removed. It's safe to remove it from the given fn, if one knows that no new calls to 123 | // execOnFileLock with the same lockPath is going to be made. 124 | func execOnFileLock(lockPath string, fn func()) (err error) { 125 | // Create a new flock instance directly using gofrs/flock 126 | fileLock := flock.New(lockPath) 127 | 128 | // Acquire lock with retry logic 129 | for { 130 | // Try to acquire the lock 131 | locked, err := fileLock.TryLock() 132 | if err != nil { 133 | return errors.Wrapf(err, "while trying to lock %q", lockPath) 134 | } 135 | 136 | // If we got the lock, break out of the retry loop 137 | if locked { 138 | break 139 | } 140 | 141 | // Wait from 1 to 2 seconds. 142 | time.Sleep(time.Millisecond * time.Duration(1000+rand.Intn(1000))) 143 | } 144 | 145 | // Setup clean up in a deferred function, so it happens even if `fn()` panics. 
146 | defer func() { 147 | unlockErr := fileLock.Unlock() 148 | if unlockErr != nil { 149 | // If we already have an error, don't overwrite it 150 | if err == nil { 151 | err = errors.Wrapf(unlockErr, "unlocking file %q", lockPath) 152 | } else { 153 | log.Printf("Error unlocking file %q: %v", lockPath, unlockErr) 154 | } 155 | } 156 | }() 157 | 158 | // We got the lock, run the function. 159 | fn() 160 | 161 | return 162 | } 163 | -------------------------------------------------------------------------------- /internal/downloader/downloader.go: -------------------------------------------------------------------------------- 1 | // Package downloader implements download in parallel of various URLs, with various progress report callback. 2 | // 3 | // It is used by the `hub` package, but it's also left public, in case it becomes useful for others. 4 | package downloader 5 | 6 | import ( 7 | "context" 8 | "fmt" 9 | "io" 10 | "net/http" 11 | "os" 12 | "path" 13 | 14 | "github.com/pkg/errors" 15 | 16 | "github.com/gomlx/go-huggingface/internal/files" 17 | ) 18 | 19 | // ProgressCallback is called as download progresses. 20 | // - totalBytes may be set to 0 if total size is not yet known. 21 | type ProgressCallback func(downloadedBytes, totalBytes int64) 22 | 23 | // Manager handles downloads, reporting back progress and errors. 24 | type Manager struct { 25 | semaphore *Semaphore 26 | authToken, userAgent string 27 | } 28 | 29 | // New creates a Manager that download files in parallel -- by default mostly 20 in parallel. 30 | func New() *Manager { 31 | return &Manager{semaphore: NewSemaphore(20)} 32 | } 33 | 34 | // MaxParallel indicates how many files to download at the same time. Default is 20. 35 | // If set to <= 0 it will download all files in parallel. 36 | // Set to 1 to make downloads sequential. 37 | func (m *Manager) MaxParallel(n int) *Manager { 38 | m.semaphore.Resize(n) 39 | return m 40 | } 41 | 42 | // WithAuthToken sets the authentication token to use in the requests. 43 | // It is passed in the header "Authorization" and prefixed with "Bearer ". 44 | // 45 | // Setting it to empty ("") is the same as resetting and not using authentication. 46 | func (m *Manager) WithAuthToken(authToken string) *Manager { 47 | m.authToken = authToken 48 | return m 49 | } 50 | 51 | // WithUserAgent sets the user agent to user. 52 | func (m *Manager) WithUserAgent(userAgent string) *Manager { 53 | m.userAgent = userAgent 54 | return m 55 | } 56 | 57 | var CancellationError = errors.New("download cancelled") 58 | 59 | // setRequestHeader with configured fields. 60 | func (m *Manager) setRequestHeader(req *http.Request) { 61 | if m.authToken != "" { 62 | req.Header.Set("Authorization", "Bearer "+m.authToken) 63 | } 64 | if m.userAgent != "" { 65 | req.Header.Set("user-agent", m.userAgent) 66 | } 67 | } 68 | 69 | // Download downloads the given url to be downloaded to the given filePath. 70 | // This may lock if it reached the maximum number of parallel downloads. 71 | // Consider calling this on its own go-routine. 72 | // 73 | // Progress of download is reported back to the given callback, if not nil. 74 | // 75 | // The context ctx can be used to interrupt the downloading. 
76 | func (m *Manager) Download(ctx context.Context, url string, filePath string, callback ProgressCallback) error { 77 | m.semaphore.Acquire() 78 | defer m.semaphore.Release() 79 | 80 | client := &http.Client{ 81 | CheckRedirect: func(r *http.Request, via []*http.Request) error { 82 | r.URL.Opaque = r.URL.Path 83 | return nil 84 | }, 85 | } 86 | 87 | var err error 88 | filePath, err = files.ReplaceTildeInDir(filePath) 89 | if err != nil { 90 | return errors.Wrapf(err, "Failed to resolve user name in tilde (~) expansion: %q", filePath) 91 | } 92 | if err = os.MkdirAll(path.Dir(filePath), 0777); err != nil { 93 | return errors.Wrapf(err, "Failed to create the directory for the path: %q", path.Dir(filePath)) 94 | } 95 | var file *os.File 96 | file, err = os.Create(filePath) 97 | if err != nil { 98 | return errors.Wrapf(err, "failed creating file %q", filePath) 99 | } 100 | defer func() { 101 | if file != nil { 102 | _ = file.Close() 103 | } 104 | }() 105 | 106 | req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) 107 | if err != nil { 108 | return errors.Wrapf(err, "failed creating request for %q", url) 109 | } 110 | m.setRequestHeader(req) 111 | var resp *http.Response 112 | resp, err = client.Do(req) 113 | if err != nil { 114 | return errors.Wrapf(err, "failed downloading %q", url) 115 | } 116 | // _ = resp.Header.Write(os.Stdout) 117 | if resp.StatusCode != http.StatusOK { 118 | return fmt.Errorf("bad status code %d: %q", resp.StatusCode, resp.Header.Get("X-Error-Message")) 119 | } 120 | 121 | contentLength := resp.ContentLength 122 | if callback != nil { 123 | callback(0, contentLength) 124 | } 125 | const maxBufferSize = 1 * 1024 * 1024 126 | var buf [maxBufferSize]byte 127 | downloadedBytes := int64(0) 128 | for { 129 | if ctx.Err() != nil { 130 | return CancellationError 131 | } 132 | n, readErr := resp.Body.Read(buf[:]) 133 | if readErr != nil && readErr != io.EOF { 134 | if ctx.Err() != nil { 135 | return CancellationError 136 | } 137 | return errors.Wrapf(readErr, "failed downloading %q", url) 138 | } 139 | if n > 0 { 140 | wn, writeErr := file.Write(buf[:n]) 141 | if writeErr != nil && writeErr != io.EOF { 142 | return errors.Wrapf(writeErr, "failed writing %q to %q", url, filePath) 143 | } 144 | if wn != n { 145 | return errors.Wrapf(io.ErrShortWrite, "failed writing %q to %q: not enough bytes written (wanted %d, wrote only %d)", 146 | url, filePath, n, wn) 147 | } 148 | } 149 | if readErr == io.EOF { 150 | break 151 | } 152 | downloadedBytes += int64(n) 153 | if callback != nil { 154 | callback(downloadedBytes, contentLength) 155 | } 156 | } 157 | err = file.Close() 158 | file = nil 159 | if err != nil { 160 | return errors.Wrapf(err, "failed closing file %q", filePath) 161 | } 162 | if err = resp.Body.Close(); err != nil { 163 | return errors.Wrapf(err, "failed closing connection to %q", url) 164 | } 165 | return nil 166 | } 167 | 168 | // FetchHeader fetches the header of a URL (using HTTP method "HEAD"). 169 | // 170 | // Notice it may lock on the maximum number of parallel requests, so consider calling this on a separate goroutine. 171 | // 172 | // The context ctx can be used to interrupt the downloading.
173 | func (m *Manager) FetchHeader(ctx context.Context, url string) (header http.Header, contentLength int64, err error) { 174 | m.semaphore.Acquire() 175 | defer m.semaphore.Release() 176 | 177 | client := &http.Client{ 178 | CheckRedirect: func(r *http.Request, via []*http.Request) error { 179 | r.URL.Opaque = r.URL.Path 180 | return nil 181 | }, 182 | } 183 | req, err := http.NewRequestWithContext(ctx, http.MethodHead, url, nil) 184 | if err != nil { 185 | err = errors.Wrapf(err, "failed creating request for %q", url) 186 | return 187 | } 188 | m.setRequestHeader(req) 189 | req.Header.Set("Accept-Encoding", "identity") 190 | 191 | // Make the request and download the tokenizer. 192 | resp, err := client.Do(req) 193 | if err != nil { 194 | err = errors.Wrap(err, "failed request for metadata: ") 195 | return 196 | } 197 | 198 | // TODO: handle redirects. 199 | defer func() { _ = resp.Body.Close() }() 200 | _, err = io.ReadAll(resp.Body) 201 | if err != nil { 202 | err = errors.Wrapf(err, "failed reading response (%d) for metadata: ", resp.StatusCode) 203 | return 204 | } 205 | 206 | // Check status code. 207 | if resp.StatusCode != 200 { 208 | err = errors.Errorf("request for metadata from %q failed with the following message: %q", 209 | url, resp.Status) 210 | return 211 | } 212 | header = resp.Header 213 | contentLength = resp.ContentLength 214 | err = nil 215 | return 216 | } 217 | -------------------------------------------------------------------------------- /hub/repo.go: -------------------------------------------------------------------------------- 1 | package hub 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "path" 8 | "strings" 9 | 10 | "github.com/gomlx/go-huggingface/internal/downloader" 11 | "github.com/gomlx/go-huggingface/internal/files" 12 | "github.com/pkg/errors" 13 | ) 14 | 15 | // Repo from which one wants to download files. Create it with New. 16 | type Repo struct { 17 | // ID of the Repo may include owner/model. E.g.: google/gemma-2-2b-it 18 | ID string 19 | 20 | // Hugginface endpint to use, defaults to "https://huggingface.co". 21 | hfEndpoint string 22 | 23 | // repoType of the repository, usually RepoTypeModel. 24 | repoType RepoType 25 | 26 | // revision to download, usually set to "main", but it can use a commit-hash version. 27 | revision string 28 | 29 | // revisionHashRefreshed indicates whether the revision hash has been refreshed. 30 | // We force it to be refreshed at least once before hitting the server, just in case. 31 | revisionHashRefreshed bool 32 | 33 | // authToken is the HuggingFace authentication token to be used when downloading the files. 34 | authToken string 35 | 36 | // Verbosity: 0 for quiet operation; 1 for information about progress; 2 and higher for debugging. 37 | Verbosity int 38 | 39 | // MaxParallelDownload indicates how many files to download at the same time. Default is 20. 40 | // If set to <= 0 it will download all files in parallel. 41 | // Set to 1 to make downloads sequential. 42 | MaxParallelDownload int 43 | 44 | // cacheDir is where to store the downloaded files. 45 | cacheDir string 46 | 47 | // Info about the Repo in HuggingFace, including the list of files. 48 | // It is only available after DownloadInfo is called. 49 | info *RepoInfo 50 | 51 | downloadManager *downloader.Manager 52 | 53 | useProgressBar bool 54 | } 55 | 56 | // New creates a reference to a HuggingFace model given its id. 
57 | //
58 | // It uses the default cache directory under ${XDG_CACHE_HOME} (if set) or `~/.cache`, in a format that is
59 | // shared with the huggingface-hub Python library. The cache is shared across various programs, including Python
60 | // programs.
61 | // Use Repo.WithCacheDir to change it, or NewWithDir to use a plain directory structure that is not shared across programs.
62 | //
63 | // The id typically includes owner/model. E.g.: "google/gemma-2-2b-it"
64 | //
65 | // It defaults to being a RepoTypeModel repository, but you can change it with Repo.WithType.
66 | //
67 | // If authentication is needed, use Repo.WithAuth.
68 | func New(id string) *Repo {
69 | 	hfEndpoint := os.Getenv("HF_ENDPOINT")
70 | 	if hfEndpoint == "" {
71 | 		hfEndpoint = "https://huggingface.co"
72 | 	} else {
73 | 		hfEndpoint = strings.TrimSuffix(hfEndpoint, "/")
74 | 	}
75 | 	return &Repo{
76 | 		ID:                  id,
77 | 		repoType:            RepoTypeModel,
78 | 		revision:            "main",
79 | 		hfEndpoint:          hfEndpoint,
80 | 		cacheDir:            DefaultCacheDir(),
81 | 		Verbosity:           1,
82 | 		MaxParallelDownload: 20, // At most 20 parallel downloads.
83 | 	}
84 | }
85 | 
86 | // WithAuth sets the authentication token to use during downloads.
87 | //
88 | // Setting it to empty ("") is the same as resetting and not using authentication.
89 | func (r *Repo) WithAuth(authToken string) *Repo {
90 | 	r.authToken = authToken
91 | 	return r
92 | }
93 | 
94 | // WithType sets the repository type to use during downloads.
95 | func (r *Repo) WithType(repoType RepoType) *Repo {
96 | 	r.repoType = repoType
97 | 	return r
98 | }
99 | 
100 | // WithEndpoint sets the HuggingFace endpoint to use.
101 | // Default is "https://huggingface.co" or, if set, the environment variable HF_ENDPOINT.
102 | func (r *Repo) WithEndpoint(endpoint string) *Repo {
103 | 	r.hfEndpoint = endpoint
104 | 	return r
105 | }
106 | 
107 | // WithRevision sets the revision to use for this Repo; it defaults to "main", but can be set to a commit-hash value.
108 | func (r *Repo) WithRevision(revision string) *Repo {
109 | 	r.revision = revision
110 | 	return r
111 | }
112 | 
113 | // WithCacheDir sets the cacheDir to the given directory.
114 | //
115 | // The default is given by DefaultCacheDir: `${XDG_CACHE_HOME}/huggingface/hub` if set, or `~/.cache/huggingface/hub` otherwise.
116 | func (r *Repo) WithCacheDir(cacheDir string) *Repo {
117 | 	newCacheDir, err := files.ReplaceTildeInDir(cacheDir)
118 | 	if err == nil {
119 | 		r.cacheDir = path.Clean(newCacheDir)
120 | 	} else {
121 | 		log.Printf("Failed to resolve directory for %q: %+v", cacheDir, err)
122 | 	}
123 | 	return r
124 | }
125 | 
126 | // WithDownloadManager sets the downloader.Manager to use for download.
127 | // This is not needed: one will be created automatically if not set.
128 | // This is useful when downloading multiple Repos simultaneously, to coordinate limits by sharing the download manager.
129 | func (r *Repo) WithDownloadManager(manager *downloader.Manager) *Repo {
130 | 	r.downloadManager = manager
131 | 	return r
132 | }
133 | 
134 | // WithProgressBar configures the usage of a progress bar during download. Defaults to true.
135 | func (r *Repo) WithProgressBar(useProgressBar bool) *Repo {
136 | 	r.useProgressBar = useProgressBar
137 | 	return r
138 | }
139 | 
140 | // flatFolderName returns a serialized version of a hf.co repo name and type, safe for disk storage
141 | // as a single non-nested folder.
142 | //
143 | // Based on github.com/huggingface/huggingface_hub repo_folder_name.
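// For example, assuming RepoIdSeparator is "--" and RepoTypeModel serializes to "models" (matching
// the Python huggingface_hub cache layout), the ID "google/gemma-2-2b-it" would map to the folder
// name "models--google--gemma-2-2b-it".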
144 | func (r *Repo) flatFolderName() string {
145 | 	parts := []string{string(r.repoType)}
146 | 	parts = append(parts, strings.Split(r.ID, "/")...)
147 | 	return strings.Join(parts, RepoIdSeparator)
148 | }
149 | 
150 | // repoCacheDir joins cacheDir and flatFolderName to return the cache subdirectory for the repository.
151 | // It also creates the directory, and returns an error if creation fails.
152 | func (r *Repo) repoCacheDir() (string, error) {
153 | 	dir := path.Join(r.cacheDir, r.flatFolderName())
154 | 	err := os.MkdirAll(dir, DefaultDirCreationPerm)
155 | 	if err != nil {
156 | 		return "", errors.Wrapf(err, "while creating cache directory %q", dir)
157 | 	}
158 | 	return dir, nil
159 | }
160 | 
161 | // FileURL returns the URL from which to download the file from HuggingFace.
162 | //
163 | // Usually not used directly (use DownloadFile instead), but it can be handy for debugging.
164 | func (r *Repo) FileURL(fileName string) (string, error) {
165 | 	commitHash, err := r.readCommitHashForRevision()
166 | 	if err != nil {
167 | 		return "", err
168 | 	}
169 | 	if r.repoType == RepoTypeModel {
170 | 		return fmt.Sprintf("%s/%s/resolve/%s/%s", r.hfEndpoint, r.ID, commitHash, fileName), nil
171 | 	} else {
172 | 		return fmt.Sprintf("%s/%s/%s/resolve/%s/%s", r.hfEndpoint, r.repoType, r.ID, commitHash, fileName), nil
173 | 	}
174 | }
175 | 
176 | // readCommitHashForRevision finds the commit-hash for the revision; it should already be written to disk.
177 | // The revision can itself be a commit-hash, in which case it is returned directly.
178 | //
179 | // If needed, it calls Repo.DownloadInfo to refresh the revision hash from the server.
180 | func (r *Repo) readCommitHashForRevision() (string, error) {
181 | 	forceDownload := !r.revisionHashRefreshed
182 | 	err := r.DownloadInfo(forceDownload)
183 | 	if err != nil {
184 | 		return "", err
185 | 	}
186 | 	r.revisionHashRefreshed = true
187 | 	return r.info.CommitHash, nil
188 | }
189 | 
190 | // repoSnapshotsDir returns the snapshots directory for this repo at its revision.
191 | func (r *Repo) repoSnapshotsDir() (string, error) {
192 | 	cacheDir, err := r.repoCacheDir()
193 | 	if err != nil {
194 | 		return "", err
195 | 	}
196 | 	commitHash, err := r.readCommitHashForRevision()
197 | 	if err != nil {
198 | 		return "", err
199 | 	}
200 | 	snapshotsDir := path.Join(cacheDir, "snapshots", commitHash)
201 | 	if err = os.MkdirAll(snapshotsDir, DefaultDirCreationPerm); err != nil {
202 | 		return "", errors.Wrapf(err, "while creating snapshots directory %q", snapshotsDir)
203 | 	}
204 | 	return snapshotsDir, nil
205 | }
206 | 
207 | // String implements fmt.Stringer.
208 | func (r *Repo) String() string {
209 | 	return r.ID
210 | }
211 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # **go-huggingface**: download, tokenize and convert models from HuggingFace.
2 | 
3 | [![GoDev](https://img.shields.io/badge/go.dev-reference-007d9c?logo=go&logoColor=white)](https://pkg.go.dev/github.com/gomlx/go-huggingface?tab=doc)
4 | 
5 | ## 📖 Overview
6 | 
7 | Simple APIs for downloading (`hub`), tokenizing (`tokenizers`) and (**future work**) model conversion (`models`) of
8 | [HuggingFace🤗](https://huggingface.co) models using [GoMLX](https://github.com/gomlx/gomlx).
9 | 
10 | 🚧 **EXPERIMENTAL and IN DEVELOPMENT**: While the `hub` package has been stable, the `tokenizers` package so far only supports
11 | SentencePiece models (saved as protos), but it has been working.
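As a quick start, here is a minimal sketch that downloads a single file from a model repository and prints where it was cached locally (the model ID and file name are just examples; errors are simply `panic`ed for brevity):

```go
package main

import (
	"fmt"

	"github.com/gomlx/go-huggingface/hub"
)

func main() {
	repo := hub.New("sentence-transformers/all-MiniLM-L6-v2")
	localPath, err := repo.DownloadFile("config.json")
	if err != nil {
		panic(err)
	}
	fmt.Println("Downloaded to", localPath)
}
```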
12 | 13 | ## Examples 14 | 15 | ### Preamble: Imports And Variables 16 | 17 | ```go 18 | import ( 19 | "github.com/gomlx/go-huggingface/hub" 20 | "github.com/gomlx/go-huggingface/tokenizers" 21 | ) 22 | 23 | var ( 24 | // HuggingFace authentication token read from environment. 25 | // It can be created in https://huggingface.co 26 | // Some files may require it for downloading. 27 | hfAuthToken = os.Getenv("HF_TOKEN") 28 | 29 | // Model IDs we use for testing. 30 | hfModelIDs = []string{ 31 | "google/gemma-2-2b-it", 32 | "sentence-transformers/all-MiniLM-L6-v2", 33 | "protectai/deberta-v3-base-zeroshot-v1-onnx", 34 | "KnightsAnalytics/distilbert-base-uncased-finetuned-sst-2-english", 35 | "KnightsAnalytics/distilbert-NER", 36 | "SamLowe/roberta-base-go_emotions-onnx", 37 | } 38 | ) 39 | ``` 40 | 41 | ### List files for each model 42 | 43 | ```go 44 | for _, modelID := range hfModelIDs { 45 | fmt.Printf("\n%s:\n", modelID) 46 | repo := hub.New(modelID).WithAuth(hfAuthToken) 47 | for fileName, err := range repo.IterFileNames() { 48 | if err != nil { panic(err) } 49 | fmt.Printf("\t%s\n", fileName) 50 | } 51 | } 52 | ``` 53 | 54 | The result looks like this: 55 | 56 | ``` 57 | google/gemma-2-2b-it: 58 | .gitattributes 59 | README.md 60 | config.json 61 | generation_config.json 62 | model-00001-of-00002.safetensors 63 | model-00002-of-00002.safetensors 64 | model.safetensors.index.json 65 | special_tokens_map.json 66 | tokenizer.json 67 | tokenizer.model 68 | tokenizer_config.json 69 | … 70 | ``` 71 | 72 | 73 | ### List tokenizer classes for each model 74 | 75 | ```go 76 | for _, modelID := range hfModelIDs { 77 | fmt.Printf("\n%s:\n", modelID) 78 | repo := hub.New(modelID).WithAuth(hfAuthToken) 79 | config, err := tokenizers.GetConfig(repo) 80 | if err != nil { panic(err) } 81 | fmt.Printf("\ttokenizer_class=%s\n", config.TokenizerClass) 82 | } 83 | ``` 84 | 85 | Results: 86 | 87 | ``` 88 | google/gemma-2-2b-it: 89 | tokenizer_class=GemmaTokenizer 90 | 91 | sentence-transformers/all-MiniLM-L6-v2: 92 | tokenizer_class=BertTokenizer 93 | 94 | protectai/deberta-v3-base-zeroshot-v1-onnx: 95 | tokenizer_class=DebertaV2Tokenizer 96 | … 97 | ``` 98 | 99 | 100 | ### Tokenize for [`google/gemma-2-2b-it`](https://huggingface.co/google/gemma-2-2b-it) using Go-only "SentencePiece" tokenizer 101 | 102 | * The output "Downloaded" message happens only the tokenizer file is not yet cached, so only the first time: 103 | 104 | ```go 105 | repo := hub.New("google/gemma-2-2b-it").WithAuth(hfAuthToken) 106 | tokenizer, err := tokenizers.New(repo) 107 | if err != nil { panic(err) } 108 | 109 | sentence := "The book is on the table." 110 | tokens := tokenizer.Encode(sentence) 111 | fmt.Printf("Sentence:\t%s\n", sentence) 112 | fmt.Printf("Tokens: \t%v\n", tokens) 113 | ``` 114 | 115 | ``` 116 | Downloaded 1/1 files, 4.2 MB downloaded 117 | Sentence: The book is on the table. 118 | Tokens: [651 2870 603 611 573 3037 235265] 119 | ``` 120 | 121 | ### Tokenize for a [Sentence Transformer](https://www.sbert.net/) derived model, using Rust's based [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers) tokenizer 122 | 123 | For most tokenizers in HuggingFace though, there is no Go-only version yet, and for now we use the 124 | [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers), which is based on a fast tokenizer written in Rust. 
125 | 
126 | It requires the compiled Rust library to be installed though;
127 | see [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers) for how to install it --
128 | they provide prebuilt binaries.
129 | 
130 | > **Note**: `daulet/tokenizers` also provides a simple downloader, so `go-huggingface` is not strictly necessary --
131 | > if you don't want the extra dependency and only need the tokenizer, you don't need to use it. `go-huggingface`
132 | > helps by also allowing you to download other files (models, datasets), and by sharing its cache across different projects
133 | > and with `huggingface-hub` (the Python downloader library).
134 | 
135 | ```go
136 | import dtok "github.com/daulet/tokenizers"
137 | 
138 | %%
139 | modelID := "KnightsAnalytics/all-MiniLM-L6-v2"
140 | repo := hub.New(modelID).WithAuth(hfAuthToken)
141 | localFile := must.M1(repo.DownloadFile("tokenizer.json"))
142 | tokenizer := must.M1(dtok.FromFile(localFile))
143 | defer tokenizer.Close()
144 | tokens, _ := tokenizer.Encode(sentence, true)
145 | 
146 | fmt.Printf("Sentence:\t%s\n", sentence)
147 | fmt.Printf("Tokens: \t%v\n", tokens)
148 | ```
149 | 
150 | ```
151 | Sentence: The book is on the table.
152 | Tokens: [101 1996 2338 2003 2006 1996 2795 1012 102 0 0 0…]
153 | ```
154 | 
155 | ### Download and execute ONNX model for [`sentence-transformers/all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
156 | 
157 | Only the first 3 lines are actually demonstrating `go-huggingface`.
158 | The remaining lines use [`github.com/gomlx/onnx-gomlx`](https://github.com/gomlx/onnx-gomlx)
159 | to parse and convert the ONNX model to GoMLX, and then
160 | [`github.com/gomlx/gomlx`](https://github.com/gomlx/gomlx) to execute the converted model
161 | on a couple of sentences.
162 | 
163 | ```go
164 | // Get ONNX model.
165 | repo := hub.New("sentence-transformers/all-MiniLM-L6-v2").WithAuth(hfAuthToken)
166 | onnxFilePath, err := repo.DownloadFile("onnx/model.onnx")
167 | if err != nil { panic(err) }
168 | onnxModel, err := onnx.ReadFile(onnxFilePath)
169 | if err != nil { panic(err) }
170 | 
171 | // Convert ONNX variables to GoMLX context (which stores variables):
172 | ctx := context.New()
173 | err = onnxModel.VariablesToContext(ctx)
174 | if err != nil { panic(err) }
175 | 
176 | // Test input.
177 | sentences := []string{
178 | 	"This is an example sentence",
179 | 	"Each sentence is converted"}
180 | inputIDs := [][]int64{
181 | 	{101, 2023, 2003, 2019, 2742, 6251, 102},
182 | 	{ 101, 2169, 6251, 2003, 4991,  102,    0}}
183 | tokenTypeIDs := [][]int64{
184 | 	{0, 0, 0, 0, 0, 0, 0},
185 | 	{0, 0, 0, 0, 0, 0, 0}}
186 | attentionMask := [][]int64{
187 | 	{1, 1, 1, 1, 1, 1, 1},
188 | 	{1, 1, 1, 1, 1, 1, 0}}
189 | 
190 | // Execute GoMLX graph with model.
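// Note: context.ExecOnce (from GoMLX) is expected to compile the graph function below and execute
// it once with the given inputs, converting the Go slices to tensors; see the GoMLX documentation for details.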
191 | embeddings := context.ExecOnce( 192 | backends.New(), ctx, 193 | func (ctx *context.Context, inputs []*graph.Node) *graph.Node { 194 | modelOutputs := onnxModel.CallGraph(ctx, inputs[0].Graph(), map[string]*graph.Node{ 195 | "input_ids": inputs[0], 196 | "attention_mask": inputs[1], 197 | "token_type_ids": inputs[2]}) 198 | return modelOutputs[0] 199 | }, 200 | inputIDs, attentionMask, tokenTypeIDs) 201 | 202 | fmt.Printf("Sentences: \t%q\n", sentences) 203 | fmt.Printf("Embeddings:\t%s\n", embeddings) 204 | ``` 205 | 206 | ``` 207 | Sentences: ["This is an example sentence" "Each sentence is converted"] 208 | Embeddings: [2][7][384]float32{ 209 | {{0.0366, -0.0162, 0.1682, ..., 0.0554, -0.1644, -0.2967}, 210 | {0.7239, 0.6399, 0.1888, ..., 0.5946, 0.6206, 0.4897}, 211 | {0.0064, 0.0203, 0.0448, ..., 0.3464, 1.3170, -0.1670}, 212 | ..., 213 | {0.1479, -0.0643, 0.1457, ..., 0.8837, -0.3316, 0.2975}, 214 | {0.5212, 0.6563, 0.5607, ..., -0.0399, 0.0412, -1.4036}, 215 | {1.0824, 0.7140, 0.3986, ..., -0.2301, 0.3243, -1.0313}}, 216 | {{0.2802, 0.1165, -0.0418, ..., 0.2711, -0.1685, -0.2961}, 217 | {0.8729, 0.4545, -0.1091, ..., 0.1365, 0.4580, -0.2042}, 218 | {0.4752, 0.5731, 0.6304, ..., 0.6526, 0.5612, -1.3268}, 219 | ..., 220 | {0.6113, 0.7920, -0.4685, ..., 0.0854, 1.0592, -0.2983}, 221 | {0.4115, 1.0946, 0.2385, ..., 0.8984, 0.3684, -0.7333}, 222 | {0.1374, 0.5555, 0.2678, ..., 0.5426, 0.4665, -0.5284}}} 223 | ``` 224 | 225 | ## Download Dataset Files 226 | 227 | We are going to use the [HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) as an example, download one of its sample files (~2.5Gb of data) and parse the `.parquet` file. 228 | 229 | ### Structure of file 230 | First we define the structure of each entry, with the tags for the Parquet parser: 231 | 232 | ```go 233 | var ( 234 | FineWebID = "HuggingFaceFW/fineweb" 235 | FineWebSampleFile = "sample/10BT/000_00000.parquet" 236 | ) 237 | 238 | // FineWebEntry: inspection of fields in parque file done with tool in 239 | // github.com/xitongsys/parquet-go/tool/parquet-tools. 240 | // 241 | // The parquet annotations are described in: https://pkg.go.dev/github.com/parquet-go/parquet-go#SchemaOf 242 | type FineWebEntry struct { 243 | Text string `parquet:"text,snappy"` 244 | ID string `parquet:"id,snappy"` 245 | Dump string `parquet:"dump,snappy"` 246 | URL string `parquet:"url,snappy"` 247 | Score float64 `parquet:"language_score"` 248 | } 249 | 250 | // TrimString returns s trimmed to at most maxLength runes. If trimmed it appends "…" at the end. 251 | func TrimString(s string, maxLength int) string { 252 | if utf8.RuneCountInString(s) <= maxLength { 253 | return s 254 | } 255 | runes := []rune(s) 256 | return string(runes[:maxLength-1]) + "…" 257 | } 258 | ``` 259 | 260 | Now we read the `parquet` files using the library [github.com/parquet-go/parquet-go](https://github.com/parquet-go/parquet-go). 261 | 262 | ```go 263 | import ( 264 | parquet "github.com/parquet-go/parquet-go" 265 | ) 266 | 267 | func main() { 268 | // Download repo file. 269 | repo := hub.New(FineWebID).WithType(hub.RepoTypeDataset).WithAuth(hfAuthToken) 270 | localSampleFile := must.M1(repo.DownloadFile(FineWebSampleFile)) 271 | 272 | // Parquet reading using parquet-go: it's somewhat cumbersome (to open the file it needs its size!?), but it works. 
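	// The steps below: derive the schema from the Go struct tags, stat the file to learn its size,
	// open it, wrap it in a parquet.File, and finally create a typed reader over it.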
273 | schema := parquet.SchemaOf(&FineWebEntry{}) 274 | fSize := must.M1(os.Stat(localSampleFile)).Size() 275 | fReader := must.M1(os.Open(localSampleFile)) 276 | fParquet := must.M1(parquet.OpenFile(fReader, fSize)) 277 | reader := parquet.NewGenericReader[FineWebEntry](fParquet, schema) 278 | defer reader.Close() 279 | 280 | // Print first 10 rows: 281 | rows := make([]FineWebEntry, 10) 282 | n := must.M1(reader.Read(rows)) 283 | fmt.Printf("%d rows read\n", n) 284 | for ii, row := range rows { 285 | fmt.Printf("Row %0d:\tScore=%.3f Text=[%q], URL=[%s]\n", ii, row.Score, TrimString(row.Text, 50), TrimString(row.URL, 40)) 286 | } 287 | } 288 | ``` 289 | 290 | Results: 291 | 292 | ``` 293 | 10 rows read 294 | Row 0: Score=0.823 Text=["|Viewing Single Post From: Spoilers for the Week …"], URL=[http://daytimeroyaltyonline.com/single/…] 295 | Row 1: Score=0.974 Text=["*sigh* Fundamentalist community, let me pass on s…"], URL=[http://endogenousretrovirus.blogspot.co…] 296 | Row 2: Score=0.873 Text=["A novel two-step immunotherapy approach has shown…"], URL=[http://news.cancerconnect.com/] 297 | Row 3: Score=0.932 Text=["Free the Cans! Working Together to Reduce Waste\nI…"], URL=[http://sharingsolution.com/2009/05/23/f…] 298 | … 299 | ``` 300 | 301 | ## [Demo Notebook](https://github.com/gomlx/go-huggingface/blob/main/go-huggingface.ipynb) 302 | 303 | All examples were taken from the [demo notebook](https://github.com/gomlx/go-huggingface/blob/main/go-huggingface.ipynb). 304 | It works it also as an easy playground to try out the functionality. 305 | 306 | You can try it out using the [GoMLX docker that includes JupyterLab](https://hub.docker.com/r/janpfeifer/gomlx_jupyterlab). -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /hub/files.go: -------------------------------------------------------------------------------- 1 | package hub 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "iter" 7 | "net/http" 8 | "os" 9 | "path" 10 | "path/filepath" 11 | "strconv" 12 | "strings" 13 | "sync" 14 | "time" 15 | 16 | "github.com/dustin/go-humanize" 17 | "github.com/gomlx/go-huggingface/internal/files" 18 | "github.com/pkg/errors" 19 | ) 20 | 21 | // IterFileNames iterate over the file names stored in the repo. 
22 | // It doesn't trigger the downloading of the repo, only of the repo info. 23 | func (r *Repo) IterFileNames() iter.Seq2[string, error] { 24 | // Download info and files. 25 | err := r.DownloadInfo(false) 26 | if err != nil { 27 | // Error downloading: yield error only. 28 | return func(yield func(string, error) bool) { 29 | yield("", err) 30 | return 31 | } 32 | } 33 | return func(yield func(string, error) bool) { 34 | for _, si := range r.info.Siblings { 35 | fileName := si.Name 36 | if path.IsAbs(fileName) || strings.Index(fileName, "..") != -1 { 37 | yield("", errors.Errorf("model %q contains illegal file name %q -- it cannot be an absolute path, nor contain \"..\"", 38 | r.ID, fileName)) 39 | return 40 | } 41 | if !yield(fileName, nil) { 42 | return 43 | } 44 | } 45 | return 46 | } 47 | } 48 | 49 | // HasFile returns whether the repo has given fileName. 50 | // Notice fileName is relative to the repository, not in local disk. 51 | // 52 | // If the Repo hasn't downloaded its info yet, it attempts to download it here. 53 | // If it fails, it simply return false. 54 | // Call Repo.DownloadInfo to handle errors downloading the info. 55 | func (r *Repo) HasFile(fileName string) bool { 56 | if r.DownloadInfo(false) != nil { 57 | return false 58 | } 59 | for _, si := range r.info.Siblings { 60 | if si.Name == fileName { 61 | return true 62 | } 63 | } 64 | return false 65 | } 66 | 67 | // cleanRelativeFilePath sanitizes a file path by removing empty segments 68 | // and parent directory references ("..") for security reasons. 69 | func cleanRelativeFilePath(repoFileName string) string { 70 | // Convert to forward slashes and clean the path 71 | normalized := filepath.ToSlash(repoFileName) 72 | 73 | // Remove leading slash if present 74 | normalized = strings.TrimPrefix(normalized, "/") 75 | 76 | // Split into path components 77 | parts := strings.Split(normalized, "/") 78 | 79 | // Process parts to handle ".." components 80 | var stack []string 81 | for _, part := range parts { 82 | if part == "" || part == "." { 83 | continue 84 | } 85 | if part == ".." { 86 | if len(stack) > 0 { 87 | // Remove last element if we have something to pop 88 | stack = stack[:len(stack)-1] 89 | } 90 | continue 91 | } 92 | stack = append(stack, part) 93 | } 94 | 95 | if len(stack) == 0 { 96 | return "." 97 | } 98 | 99 | // Join with platform-specific separator 100 | return filepath.FromSlash(strings.Join(stack, "/")) 101 | } 102 | 103 | // DownloadFiles downloads the repository files (the names returned by repo.IterFileNames), and return the path to the 104 | // downloaded files in the cache structure. 105 | // 106 | // The returned downloadPaths can be read, but shouldn't be modified, since there may be other programs using the same 107 | // files. 108 | func (r *Repo) DownloadFiles(repoFiles ...string) (downloadedPaths []string, err error) { 109 | if len(repoFiles) == 0 { 110 | return nil, nil 111 | } 112 | 113 | // Create download manager, if one hasn't been created yet. 114 | downloadManager := r.getDownloadManager() 115 | 116 | // Get/create repoCacheDir. 117 | var repoCacheDir string 118 | repoCacheDir, err = r.repoCacheDir() 119 | if err != nil { 120 | return nil, err 121 | } 122 | _ = repoCacheDir 123 | 124 | // Get snapshot dir: 125 | snapshotDir, err := r.repoSnapshotsDir() 126 | if err != nil { 127 | return nil, err 128 | } 129 | 130 | // Create context to stop any downloading of files if any error occur. 
131 | // The deferred cancel both cleans up the context, and also stops any pending/ongoing 132 | // transfer that may be happening if an error occurs and the function exits. 133 | ctx, cancelFn := context.WithCancel(context.Background()) 134 | defer cancelFn() 135 | 136 | // Store results. 137 | downloadedPaths = make([]string, len(repoFiles)) 138 | 139 | // Information about download progress, and firstError to report back if needed. 140 | var downloadingMu sync.Mutex 141 | var firstError error 142 | var requireDownload int // number of files that require download (and are not in cache yet). 143 | perFileDownloaded := make([]uint64, len(repoFiles)) 144 | var allFilesDownloaded uint64 145 | var numDownloadedFiles int 146 | busyLoop := `-\|/` 147 | busyLoopPos := 0 148 | lastPrintTime := time.Now() 149 | 150 | // Print downloading progress. 151 | ratePrintFn := func() { 152 | if firstError == nil { 153 | fmt.Printf("\rDownloaded %d/%d files %c %s downloaded ", 154 | numDownloadedFiles, requireDownload, busyLoop[busyLoopPos], humanize.Bytes(allFilesDownloaded)) 155 | } else { 156 | fmt.Printf("\rDownloaded %d/%d files, %s downloaded: error - %v ", 157 | numDownloadedFiles, requireDownload, humanize.Bytes(allFilesDownloaded), 158 | firstError) 159 | } 160 | busyLoopPos = (busyLoopPos + 1) % len(busyLoop) 161 | lastPrintTime = time.Now() 162 | } 163 | 164 | // Report error for a download, and interrupt everyone. 165 | reportErrorFn := func(err error) { 166 | downloadingMu.Lock() 167 | if firstError == nil { 168 | firstError = err 169 | } 170 | cancelFn() 171 | downloadingMu.Unlock() 172 | return 173 | } 174 | 175 | // Loop over each file to download. 176 | var wg sync.WaitGroup 177 | for idxFile, repoFileName := range repoFiles { 178 | fileURL, err := r.FileURL(repoFileName) 179 | if err != nil { 180 | return nil, err 181 | } 182 | 183 | // Join the path parts of fileName using the current OS separator. 184 | relativeFilePath := cleanRelativeFilePath(repoFileName) 185 | if relativeFilePath == "." { 186 | return nil, errors.Errorf("invalid file name %q", repoFileName) 187 | } 188 | snapshotPath := path.Join(snapshotDir, relativeFilePath) 189 | downloadedPaths[idxFile] = snapshotPath // This is the file pointer we are returning. 190 | if files.Exists(snapshotPath) { 191 | // File already downloaded, skip. 192 | continue 193 | } 194 | 195 | // Create directory for this individual file. 196 | dir, _ := path.Split(snapshotPath) 197 | if err = os.MkdirAll(dir, DefaultDirCreationPerm); err != nil { 198 | return nil, errors.Wrapf(err, "while creating directory to download %q", snapshotPath) 199 | } 200 | 201 | // Start downloading in a separate goroutine. 202 | wg.Add(1) 203 | go func() { 204 | defer wg.Done() 205 | 206 | // Download header of file for safety checks, and so we can find the blobPath. 
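			// (The ETag returned in this header becomes the blob's file name under "blobs/", so identical
			// content is stored only once and shared by all snapshots that link to it.)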
207 | 			header, contentLength, err := downloadManager.FetchHeader(ctx, fileURL)
208 | 			if err != nil {
209 | 				reportErrorFn(err)
210 | 				return
211 | 			}
212 | 			metadata := extractFileMetadata(header, fileURL, contentLength)
213 | 			etag := metadata.ETag
214 | 			if etag == "" {
215 | 				reportErrorFn(errors.Errorf("resource %q for %q doesn't have an ETag, not able to ensure reproducibility",
216 | 					repoFileName, r.ID))
217 | 				return
218 | 			}
219 | 			if metadata.Location != fileURL {
220 | 				// A redirect would require dropping the authorization header before downloading the blob; that is not supported yet, so fail for safety.
221 | 				reportErrorFn(errors.Errorf("resource %q for %q has a redirect from %q to %q: this can be unsafe if we send our authorization token to the new URL",
222 | 					repoFileName, r.ID, fileURL, metadata.Location))
223 | 				return
224 | 			}
225 | 
226 | 			// blobPath: download only if it hasn't already been downloaded.
227 | 			blobPath := path.Join(repoCacheDir, "blobs", etag)
228 | 			if !files.Exists(blobPath) {
229 | 				requireDownload++ // This file requires download.
230 | 				err := r.lockedDownload(ctx, fileURL, blobPath, false, func(downloadedBytes, totalBytes int64) {
231 | 					// Execute at every report of download.
232 | 					downloadingMu.Lock()
233 | 					defer downloadingMu.Unlock()
234 | 					lastReportedBytes := perFileDownloaded[idxFile]
235 | 					newDownloaded := uint64(downloadedBytes) - lastReportedBytes
236 | 					allFilesDownloaded += newDownloaded
237 | 					perFileDownloaded[idxFile] = uint64(downloadedBytes)
238 | 					if r.Verbosity > 0 && time.Since(lastPrintTime) > time.Second {
239 | 						ratePrintFn()
240 | 					}
241 | 				})
242 | 				if err != nil {
243 | 					reportErrorFn(err)
244 | 					return
245 | 				}
246 | 
247 | 				// Done, print out progress.
248 | 				numDownloadedFiles++
249 | 				if r.Verbosity > 0 {
250 | 					ratePrintFn()
251 | 				}
252 | 			}
253 | 
254 | 			// Link blob file to snapshot.
255 | 			err = createSymLink(snapshotPath, blobPath)
256 | 			if err != nil {
257 | 				reportErrorFn(errors.WithMessagef(err, "while downloading %q from repository %q", repoFileName, r.ID))
258 | 			}
259 | 		}()
260 | 	}
261 | 	wg.Wait()
262 | 	if requireDownload > 0 {
263 | 		if r.Verbosity > 0 {
264 | 			if firstError != nil {
265 | 				fmt.Println()
266 | 			} else {
267 | 				fmt.Printf("\rDownloaded %d/%d files, %s downloaded                    \n",
268 | 					numDownloadedFiles, requireDownload, humanize.Bytes(allFilesDownloaded))
269 | 			}
270 | 		}
271 | 	}
272 | 	if firstError != nil {
273 | 		return nil, firstError
274 | 	}
275 | 	return downloadedPaths, nil
276 | }
277 | 
278 | // DownloadFile is a shortcut to DownloadFiles with only one file.
279 | func (r *Repo) DownloadFile(file string) (downloadedPath string, err error) {
280 | 	res, err := r.DownloadFiles(file)
281 | 	if err != nil {
282 | 		return "", err
283 | 	}
284 | 	return res[0], nil
285 | }
286 | 
287 | // fileMetadata holds the per-file metadata returned by HuggingFace Hub (extracted from the response headers).
288 | type fileMetadata struct { 289 | CommitHash, ETag, Location string 290 | Size int 291 | } 292 | 293 | func extractFileMetadata(header http.Header, url string, contentLength int64) (metadata fileMetadata) { 294 | metadata.CommitHash = header.Get(HeaderXRepoCommit) 295 | metadata.ETag = header.Get(HeaderXLinkedETag) 296 | if metadata.ETag == "" { 297 | metadata.ETag = header.Get("ETag") 298 | } 299 | metadata.ETag = removeQuotes(metadata.ETag) 300 | metadata.Location = header.Get("Location") 301 | if metadata.Location == "" { 302 | metadata.Location = url 303 | } 304 | 305 | if sizeStr := header.Get(HeaderXLinkedSize); sizeStr != "" { 306 | var err error 307 | metadata.Size, err = strconv.Atoi(sizeStr) 308 | if err != nil { 309 | metadata.Size = 0 310 | } 311 | } 312 | if metadata.Size == 0 { 313 | metadata.Size = int(contentLength) 314 | } 315 | return 316 | } 317 | 318 | func removeQuotes(str string) string { 319 | return strings.TrimRight(strings.TrimLeft(str, "\""), "\"") 320 | } 321 | 322 | // createSymlink creates a symbolic link named dst pointing to src, using a relative path if possible. 323 | // It removes previous link/file if it already exists. 324 | // 325 | // We use relative paths because: 326 | // * It's what `huggingface_hub` library does, and we want to keep things compatible. 327 | // * If the cache folder is moved or backed up, links won't break. 328 | // * Relative paths seem better handled on Windows -- although Windows is not yet fully supported for this package. 329 | // 330 | // Example layout: 331 | // 332 | // └── [ 128] snapshots 333 | // ├── [ 128] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f 334 | // │ ├── [ 52] README.md -> ../../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812 335 | // │ └── [ 76] pytorch_model.bin -> ../../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd 336 | func createSymLink(dst, src string) error { 337 | relLink, err := filepath.Rel(path.Dir(dst), src) 338 | if err != nil { 339 | relLink = src // Take the absolute path instead. 340 | } 341 | 342 | // Remove link/file if it already exists. 343 | err = os.Remove(dst) 344 | if err != nil && !errors.Is(err, os.ErrNotExist) { 345 | return errors.Wrapf(err, "failed to remove dst=%q before linking it to %q", dst, relLink) 346 | } 347 | 348 | if err = os.Symlink(relLink, dst); err != nil { 349 | return errors.Wrapf(err, "while symlink'ing %q to %q using %q", src, dst, relLink) 350 | } 351 | return nil 352 | } 353 | -------------------------------------------------------------------------------- /tokenizers/sentencepiece/private/protos/sentencepiece_model.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | syntax = "proto2"; 16 | 17 | // Unique package name to avoid conflicts: the proto library won't allow two different 18 | // packages to define the same proto (under the same namespace). 
19 | // This is broken, since that's what is needed ... a bad design from the ProtoBuf in Go. See more details here: 20 | // https://protobuf.dev/reference/go/faq/#namespace-conflict 21 | // So instead we change the proto namespace (package) name to globally unique package name: 22 | package com.github.gomlx.go_huggingface.sentencepiece; 23 | 24 | option go_package="github.com/gomlx/go-huggingface/tokenizers/sentencepiece/private/protos"; 25 | 26 | // TODO(taku): Needs to use LITE RUNTIME in OSS release. 27 | option optimize_for = LITE_RUNTIME; 28 | 29 | 30 | 31 | // TrainerSpec encodes a various parameters for SentencePiece training. 32 | // Next id: 55 33 | message TrainerSpec { 34 | /////////////////////////////////////////////////////////////////// 35 | // General parameters 36 | // 37 | // Input corpus files. 38 | // Trainer accepts the following two formats: 39 | // A) Monolingual: plain text, one sentence per line. 40 | // B) Bilingual: TSV, source sentence target sentence 41 | // When bilingual data is passed, shared vocabulary model is built. 42 | // Note that the input file must be raw corpus, not a preprocessed corpus. 43 | // Trainer only loads the first `input_sentence_size` sentences specified 44 | // with this parameter. 45 | repeated string input = 1; 46 | 47 | // Input corpus format: 48 | // "text": one-sentence-per-line text format (default) 49 | // "tsv": sentence freq 50 | optional string input_format = 7; 51 | 52 | // Output model file prefix. 53 | // .model and .vocab are generated. 54 | optional string model_prefix = 2; 55 | 56 | // Model type. only have UNIGRAM now. 57 | enum ModelType { 58 | UNIGRAM = 1; // Unigram language model with dynamic algorithm 59 | BPE = 2; // Byte Pair Encoding 60 | WORD = 3; // Delimitered by whitespace. 61 | CHAR = 4; // tokenizes into character sequence 62 | } 63 | optional ModelType model_type = 3 [default = UNIGRAM]; 64 | 65 | // Vocabulary size. 8k is the default size. 66 | optional int32 vocab_size = 4 [default = 8000]; 67 | 68 | // List of the languages this model can accept. 69 | // Since the model is language-agnostic, this field is used as a reference. 70 | repeated string accept_language = 5; 71 | 72 | // Size of self-test samples, which are encoded in the model file. 73 | optional int32 self_test_sample_size = 6 [default = 0]; 74 | 75 | // Whether to use DP version of sentencepiece. Use it with TSV input format 76 | // (requires precomputed word tab counts to work). 77 | optional bool enable_differential_privacy = 50 [default = false]; 78 | // Set these parameters if you need DP version of sentencepiece. 79 | // std of noise to add. 80 | optional float differential_privacy_noise_level = 51 [default = 0.0]; 81 | // Clipping threshold to apply after adding noise. All the words with 82 | // frequency less than this value are dropped. 83 | optional uint64 differential_privacy_clipping_threshold = 52 [default = 0]; 84 | 85 | /////////////////////////////////////////////////////////////////// 86 | // Training parameters. 87 | // 88 | // Uses characters which cover the corpus with the ratio of `chars_coverage`. 89 | // This parameter determines the set of basic Alphabet of sentence piece. 90 | // 1.0 - `chars_coverage` characters are treated as UNK. 91 | // See also required_chars field. 92 | optional float character_coverage = 10 [default = 0.9995]; 93 | 94 | // Maximum size of sentences the trainer loads from `input` parameter. 95 | // Trainer simply loads the `input` files in sequence. 
96 | // It is better to shuffle the input corpus randomly. 97 | optional uint64 input_sentence_size = 11 [default = 0]; 98 | optional bool shuffle_input_sentence = 19 [default = true]; 99 | 100 | // Maximum size of sentences to make seed sentence pieces. 101 | // Extended suffix array is constructed to extract frequent 102 | // sub-strings from the corpus. This uses 20N working space, 103 | // where N is the size of corpus. 104 | optional int32 mining_sentence_size = 12 [deprecated = true]; 105 | 106 | // Maximum size of sentences to train sentence pieces. 107 | optional int32 training_sentence_size = 13 [deprecated = true]; 108 | 109 | // The size of seed sentencepieces. 110 | // `seed_sentencepiece_size` must be larger than `vocab_size`. 111 | optional int32 seed_sentencepiece_size = 14 [default = 1000000]; 112 | 113 | // In every EM sub-iterations, keeps top 114 | // `shrinking_factor` * `current sentencepieces size` with respect to 115 | // the loss of the sentence piece. This value should be smaller than 1.0. 116 | optional float shrinking_factor = 15 [default = 0.75]; 117 | 118 | // The maximum sentence length in byte. The sentences with the length 119 | // larger than `max_sentence_length` is simply ignored. 120 | // Longer input tends to bring the following risks: 121 | // * Overflow during EM training (unigram language model only) 122 | // * Performance drop because of O(n log n) cost in BPE. 123 | optional int32 max_sentence_length = 18 [default = 4192]; 124 | 125 | // Number of threads in the training. 126 | optional int32 num_threads = 16 [default = 16]; 127 | 128 | // Number of EM sub iterations. 129 | optional int32 num_sub_iterations = 17 [default = 2]; 130 | 131 | /////////////////////////////////////////////////////////////////// 132 | // SentencePiece parameters which control the shapes of sentence piece. 133 | // 134 | // Maximum length of sentencepiece. 135 | optional int32 max_sentencepiece_length = 20 [default = 16]; 136 | 137 | // Uses Unicode script to split sentence pieces. 138 | // When `split_by_unicode_script` is true, we do not allow sentence piece to 139 | // include multiple Unicode scripts, e.g. "F1" is not a valid piece. 140 | // Exception: CJ characters (Hiragana/Katakana/Han) are all handled 141 | // as one script type, since Japanese word can consist of multiple scripts. 142 | // This exception is always applied regardless of the accept-language 143 | // parameter. 144 | optional bool split_by_unicode_script = 21 [default = true]; 145 | 146 | // When `split_by_number` is true, put a boundary between number and 147 | // non-number transition. If we want to treat "F1" is one token, set this flag 148 | // to be false. 149 | optional bool split_by_number = 23 [default = true]; 150 | 151 | // Use a white space to split sentence pieces. 152 | // When `split_by_whitespace` is false, we may have the piece containing 153 | // a white space in the middle. e.g., "in_the". 154 | optional bool split_by_whitespace = 22 [default = true]; 155 | 156 | // Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello => 157 | // hello_. When `treat_whitespace_as_suffix` is true, 158 | // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end 159 | // of sentence. 160 | optional bool treat_whitespace_as_suffix = 24 [default = false]; 161 | 162 | // Allows pieces that only contain whitespaces instead of appearing only as 163 | // prefix or suffix of other pieces. 
164 | optional bool allow_whitespace_only_pieces = 26 [default = false]; 165 | 166 | // Split all digits (0-9) into separate pieces. 167 | optional bool split_digits = 25 [default = false]; 168 | 169 | // Defines the pre-tokenization delimiter. 170 | // When specified, no pieces crossing this delimiter is not included 171 | // in the vocab. Then the delimiter string is virtually ignored 172 | // during the training. This field can allows constraints on the vocabulary 173 | // selection. Note that this field is available on unigram mode. 174 | optional string pretokenization_delimiter = 53 [ default = ""]; 175 | 176 | /////////////////////////////////////////////////////////////////// 177 | // Vocabulary management 178 | // 179 | // Defines control symbols used as an indicator to 180 | // change the behavior of the decoder. and are pre-defined. 181 | // We can use this field to encode various meta information, 182 | // including language indicator in multilingual model. 183 | // These symbols are not visible to users, but visible to 184 | // the decoder. Note that when the input sentence contains control symbols, 185 | // they are not treated as one token, but segmented into normal pieces. 186 | // Control symbols must be inserted independently from the segmentation. 187 | repeated string control_symbols = 30; 188 | 189 | // Defines user defined symbols. 190 | // These symbols are added with extremely high score 191 | // so they are always treated as one unique symbol in any context. 192 | // Typical usage of user_defined_symbols is placeholder for named entities. 193 | repeated string user_defined_symbols = 31; 194 | 195 | // Defines required characters. Each UTF8 character in this string is included 196 | // in the character set regardless of character_coverage value. Unlike 197 | // user_defined_symbols, these characters have scores based on the frequency 198 | // on input sentences, and the model can form subwords using characters 199 | // in this field. 200 | optional string required_chars = 36; 201 | 202 | // Decomposes unknown pieces into UTF-8 bytes. 203 | optional bool byte_fallback = 35 [default = false]; 204 | 205 | // When creating the vocabulary file, defines whether or not to additionally 206 | // output the score for each piece. 207 | optional bool vocabulary_output_piece_score = 32 [default = true]; 208 | 209 | // `vocab_size` is treated as hard limit. Crash if 210 | // the model can not produce the vocab of size `vocab_size`, 211 | // When `hard_vocab_limit` is false, vocab_size is treated 212 | // as soft limit. Note that when model_type=char, 213 | // always assumes hard_vocab_limit = false. 214 | optional bool hard_vocab_limit = 33 [default = true]; 215 | 216 | // use all symbols for vocab extraction. This flag is valid 217 | // if model type is either CHAR or WORD 218 | optional bool use_all_vocab = 34 [default = false]; 219 | 220 | /////////////////////////////////////////////////////////////////// 221 | // Reserved special meta tokens. 222 | // * -1 is not used. 223 | // * unk_id must not be -1. 224 | // Id must starts with 0 and be contigous. 
225 | optional int32 unk_id = 40 [default = 0]; // 226 | optional int32 bos_id = 41 [default = 1]; // 227 | optional int32 eos_id = 42 [default = 2]; // 228 | optional int32 pad_id = 43 [default = -1]; // (padding) 229 | optional string unk_piece = 45 [default = ""]; 230 | optional string bos_piece = 46 [default = ""]; 231 | optional string eos_piece = 47 [default = ""]; 232 | optional string pad_piece = 48 [default = ""]; 233 | 234 | // Encodes into U+2047 (DOUBLE QUESTION MARK), 235 | // since this character can be useful both for user and 236 | // developer. We can easily figure out that is emitted. 237 | optional string unk_surface = 44 [default = " \xE2\x81\x87 "]; 238 | 239 | // Increase bit depth to allow unigram model training on large 240 | // (>10M sentences) corpora. A Side-effect of enabling this flag 241 | // is increased memory usage. 242 | optional bool train_extremely_large_corpus = 49 [default = false]; 243 | 244 | // Path to a seed sentencepieces file, with one tab-separated 245 | // seed sentencepiece frequency per line. 246 | optional string seed_sentencepieces_file = 54 [default = ""]; 247 | 248 | // Customized extensions: the range of field numbers 249 | // are open to third-party extensions. 250 | extensions 200 to max; 251 | } 252 | 253 | // NormalizerSpec encodes a various parameters for string normalizaiton 254 | message NormalizerSpec { 255 | // name of normalization rule. 256 | optional string name = 1; 257 | 258 | // Pre-compiled normalization rule created by 259 | // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method. 260 | // Usually this field is set by Builder::GetNormalizerSpec() method. 261 | optional bytes precompiled_charsmap = 2; 262 | 263 | // Adds dummy whitespace at the beginning of text in order to 264 | // treat "world" in "world" and "hello world" in the same way. 265 | optional bool add_dummy_prefix = 3 [default = true]; 266 | 267 | // Removes leading, trailing, and duplicate internal whitespace. 268 | optional bool remove_extra_whitespaces = 4 [default = true]; 269 | 270 | // Replaces whitespace with meta symbol. 271 | // This field must be true to train sentence piece model. 272 | optional bool escape_whitespaces = 5 [default = true]; 273 | 274 | // Custom normalization rule file in TSV format. 275 | // https://github.com/google/sentencepiece/blob/master/doc/normalization.md 276 | // This field is only used in SentencePieceTrainer::Train() method, which 277 | // compiles the rule into the binary rule stored in `precompiled_charsmap`. 278 | optional string normalization_rule_tsv = 6; 279 | 280 | // Customized extensions: the range of field numbers 281 | // are open to third-party extensions. 282 | extensions 200 to max; 283 | } 284 | 285 | // Proto to store samples for self-testing. 286 | message SelfTestData { 287 | message Sample { 288 | optional string input = 1; 289 | optional string expected = 2; 290 | } 291 | repeated Sample samples = 1; 292 | 293 | // Customized extensions: the range of field numbers 294 | // are open to third-party extensions. 295 | extensions 200 to max; 296 | } 297 | 298 | // ModelProto stores model parameters. 299 | // SentencePieceProcessor is supposed to be self-contained. 300 | // All settings/parameters which may change the behavior must be encoded 301 | // in ModelProto. 302 | message ModelProto { 303 | message SentencePiece { 304 | enum Type { 305 | NORMAL = 1; // normal symbol 306 | UNKNOWN = 2; // unknown symbol. only for now. 307 | CONTROL = 3; // control symbols. , , <2ja> etc. 
308 | USER_DEFINED = 4; // user defined symbols. 309 | // Typical usage of USER_DEFINED symbol 310 | // is placeholder. 311 | BYTE = 6; // byte symbols. Used when `byte_fallback` is true. 312 | UNUSED = 5; // this piece is not used. 313 | } 314 | optional string piece = 1; // piece must not be empty. 315 | optional float score = 2; 316 | optional Type type = 3 [default = NORMAL]; 317 | 318 | // Customized extensions: the range of field numbers 319 | // are open to third-party extensions. 320 | extensions 200 to max; 321 | } 322 | 323 | // Sentence pieces with scores. 324 | repeated SentencePiece pieces = 1; 325 | 326 | // Spec used to generate this model file. 327 | optional TrainerSpec trainer_spec = 2; 328 | 329 | // Spec for text normalization. 330 | optional NormalizerSpec normalizer_spec = 3; 331 | 332 | // Stores sample input and its expected segmentation to verify the model. 333 | optional SelfTestData self_test_data = 4; 334 | 335 | // Spec for text de-normalization. 336 | optional NormalizerSpec denormalizer_spec = 5; 337 | 338 | // Customized extensions: the range of field numbers 339 | // are open to third-party extensions. 340 | extensions 200 to max; 341 | } 342 | -------------------------------------------------------------------------------- /go-huggingface.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f64e2dff-097a-4e59-a53a-773d1c2356da", 6 | "metadata": {}, 7 | "source": [ 8 | "# `go-huggingface` Demo\n", 9 | "\n", 10 | "This demo shows how to download files and create tokenizers from HuggingFace models." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "d36287c1-e5e0-4985-83ac-1f40e8b850c7", 16 | "metadata": {}, 17 | "source": [ 18 | "## Imports and `go work` setup\n", 19 | "\n", 20 | "This is used during development, to instruct the Notebook kernel [gonb](https://github.com/janpfeifer/gonb) to use the local version of the libraries." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "id": "8941d23e-19f2-4cb4-8538-b6acecfdba61", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/markdown": [ 32 | "## GoNB version: `v0.11.1`\n", 33 | "\n", 34 | "### Build Info\n", 35 | "- Go version: go1.25.3 (OS: linux, Arch: amd64)\n" 36 | ] 37 | }, 38 | "metadata": {}, 39 | "output_type": "display_data" 40 | }, 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "\t- Replace rule for module \"github.com/gomlx/gomlx\" to local directory \"/home/janpf/Projects/gomlx\" already exists.\n", 46 | "\t- Added replace rule for module \"github.com/gomlx/onnx-gomlx\" to local directory \"/home/janpf/Projects/onnx-gomlx\".\n", 47 | "\t- Added replace rule for module \"github.com/gomlx/gemma\" to local directory \"/home/janpf/Projects/gemma\".\n", 48 | "\t- Added replace rule for module \"github.com/gomlx/go-huggingface\" to local directory \"/home/janpf/Projects/go-huggingface\".\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "%version\n", 54 | "!*rm -f go.work && go work init\n", 55 | "!*go work use . 
\"${HOME}/Projects/gomlx\" \"${HOME}/Projects/go-huggingface\" \"${HOME}/Projects/gemma\" \"${HOME}/Projects/onnx-gomlx\"\n", 56 | "%goworkfix" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 2, 62 | "id": "fe1baf48-95fd-4e8c-94a5-e09f1a723559", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "import (\n", 67 | " \"github.com/janpfeifer/must\"\n", 68 | " \"github.com/gomlx/go-huggingface/hub\"\n", 69 | " \"github.com/gomlx/go-huggingface/tokenizers\"\n", 70 | ")" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "8bc02638-f04d-4fd6-a199-8cb5fec99600", 76 | "metadata": {}, 77 | "source": [ 78 | "## Download `tokenizer_config.json` and enumerate `tokenizer_class` for several models" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 3, 84 | "id": "43f3d2af-29b2-488e-97c4-22942dcecab2", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "var (\n", 89 | " // HuggingFace authentication token read from environment.\n", 90 | " // It can be created in https://huggingface.co\n", 91 | " // Some files may require it for downloading.\n", 92 | " hfAuthToken = os.Getenv(\"HF_TOKEN\")\n", 93 | "\n", 94 | " // Model ids for testing.\n", 95 | " hfModelIDs = []string{\n", 96 | " \"google/gemma-2-2b-it\",\n", 97 | " \"sentence-transformers/all-MiniLM-L6-v2\",\n", 98 | " \"protectai/deberta-v3-base-zeroshot-v1-onnx\",\n", 99 | " \"KnightsAnalytics/distilbert-base-uncased-finetuned-sst-2-english\",\n", 100 | " \"KnightsAnalytics/distilbert-NER\",\n", 101 | " \"KnightsAnalytics/all-MiniLM-L6-v2\",\n", 102 | " \"SamLowe/roberta-base-go_emotions-onnx\",\n", 103 | " }\n", 104 | ")" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "id": "dd4f535a-cfd7-4e10-9660-78179caa949b", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "\n", 118 | "google/gemma-2-2b-it:\n", 119 | "\t.gitattributes\n", 120 | "\tREADME.md\n", 121 | "\tconfig.json\n", 122 | "\tgeneration_config.json\n", 123 | "\tmodel-00001-of-00002.safetensors\n", 124 | "\tmodel-00002-of-00002.safetensors\n", 125 | "\tmodel.safetensors.index.json\n", 126 | "\tspecial_tokens_map.json\n", 127 | "\ttokenizer.json\n", 128 | "\ttokenizer.model\n", 129 | "\ttokenizer_config.json\n", 130 | "\n", 131 | "sentence-transformers/all-MiniLM-L6-v2:\n", 132 | "\t.gitattributes\n", 133 | "\t1_Pooling/config.json\n", 134 | "\tREADME.md\n", 135 | "\tconfig.json\n", 136 | "\tconfig_sentence_transformers.json\n", 137 | "\tdata_config.json\n", 138 | "\tmodel.safetensors\n", 139 | "\tmodules.json\n", 140 | "\tonnx/model.onnx\n", 141 | "\tonnx/model_O1.onnx\n", 142 | "\tonnx/model_O2.onnx\n", 143 | "\tonnx/model_O3.onnx\n", 144 | "\tonnx/model_O4.onnx\n", 145 | "\tonnx/model_qint8_arm64.onnx\n", 146 | "\tonnx/model_qint8_avx512.onnx\n", 147 | "\tonnx/model_qint8_avx512_vnni.onnx\n", 148 | "\tonnx/model_quint8_avx2.onnx\n", 149 | "\topenvino/openvino_model.bin\n", 150 | "\topenvino/openvino_model.xml\n", 151 | "\topenvino/openvino_model_qint8_quantized.bin\n", 152 | "\topenvino/openvino_model_qint8_quantized.xml\n", 153 | "\tpytorch_model.bin\n", 154 | "\trust_model.ot\n", 155 | "\tsentence_bert_config.json\n", 156 | "\tspecial_tokens_map.json\n", 157 | "\ttf_model.h5\n", 158 | "\ttokenizer.json\n", 159 | "\ttokenizer_config.json\n", 160 | "\ttrain_script.py\n", 161 | "\tvocab.txt\n", 162 | "\n", 163 | "protectai/deberta-v3-base-zeroshot-v1-onnx:\n", 164 | "\t.gitattributes\n", 165 | 
"\tREADME.md\n", 166 | "\tadded_tokens.json\n", 167 | "\tconfig.json\n", 168 | "\tmerges.txt\n", 169 | "\tmodel.onnx\n", 170 | "\tspecial_tokens_map.json\n", 171 | "\tspm.model\n", 172 | "\ttokenizer.json\n", 173 | "\ttokenizer_config.json\n", 174 | "\tvocab.json\n", 175 | "\n", 176 | "KnightsAnalytics/distilbert-base-uncased-finetuned-sst-2-english:\n", 177 | "\t.gitattributes\n", 178 | "\tadded_tokens.json\n", 179 | "\tconfig.json\n", 180 | "\tmodel.onnx\n", 181 | "\tspecial_tokens_map.json\n", 182 | "\ttokenizer.json\n", 183 | "\ttokenizer_config.json\n", 184 | "\tvocab.txt\n", 185 | "\n", 186 | "KnightsAnalytics/distilbert-NER:\n", 187 | "\t.gitattributes\n", 188 | "\tconfig.json\n", 189 | "\tmodel.onnx\n", 190 | "\tspecial_tokens_map.json\n", 191 | "\ttokenizer.json\n", 192 | "\ttokenizer_config.json\n", 193 | "\tvocab.txt\n", 194 | "\n", 195 | "KnightsAnalytics/all-MiniLM-L6-v2:\n", 196 | "\t.gitattributes\n", 197 | "\tconfig.json\n", 198 | "\tdata_config.json\n", 199 | "\tmodel.onnx\n", 200 | "\tmodules.json\n", 201 | "\tspecial_tokens_map.json\n", 202 | "\ttokenizer.json\n", 203 | "\ttokenizer_config.json\n", 204 | "\tvocab.txt\n", 205 | "\n", 206 | "SamLowe/roberta-base-go_emotions-onnx:\n", 207 | "\t.gitattributes\n", 208 | "\tREADME.md\n", 209 | "\tconfig.json\n", 210 | "\tmerges.txt\n", 211 | "\tonnx/config.json\n", 212 | "\tonnx/merges.txt\n", 213 | "\tonnx/model.onnx\n", 214 | "\tonnx/model_quantized.onnx\n", 215 | "\tonnx/ort_config.json\n", 216 | "\tonnx/ort_config_quantized.json\n", 217 | "\tonnx/special_tokens_map.json\n", 218 | "\tonnx/tokenizer.json\n", 219 | "\tonnx/tokenizer_config.json\n", 220 | "\tonnx/vocab.json\n", 221 | "\tspecial_tokens_map.json\n", 222 | "\ttokenizer.json\n", 223 | "\ttokenizer_config.json\n", 224 | "\ttrainer_state.json\n", 225 | "\tvocab.json\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "%%\n", 231 | "for _, modelID := range hfModelIDs {\n", 232 | " fmt.Printf(\"\\n%s:\\n\", modelID)\n", 233 | " repo := hub.New(modelID).WithAuth(hfAuthToken)\n", 234 | " for fileName, err := range repo.IterFileNames() {\n", 235 | " if err != nil { panic(err) }\n", 236 | " fmt.Printf(\"\\t%s\\n\", fileName)\n", 237 | " }\n", 238 | "}" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 5, 244 | "id": "80f9eaee-7507-4921-bccd-b8dbcd8bf86a", 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "\n", 252 | "google/gemma-2-2b-it:\n", 253 | "\ttokenizer_class=GemmaTokenizer\n", 254 | "\n", 255 | "sentence-transformers/all-MiniLM-L6-v2:\n", 256 | "\ttokenizer_class=BertTokenizer\n", 257 | "\n", 258 | "protectai/deberta-v3-base-zeroshot-v1-onnx:\n", 259 | "\ttokenizer_class=DebertaV2Tokenizer\n", 260 | "\n", 261 | "KnightsAnalytics/distilbert-base-uncased-finetuned-sst-2-english:\n", 262 | "\ttokenizer_class=DistilBertTokenizer\n", 263 | "\n", 264 | "KnightsAnalytics/distilbert-NER:\n", 265 | "\ttokenizer_class=DistilBertTokenizer\n", 266 | "\n", 267 | "KnightsAnalytics/all-MiniLM-L6-v2:\n", 268 | "\ttokenizer_class=BertTokenizer\n", 269 | "\n", 270 | "SamLowe/roberta-base-go_emotions-onnx:\n", 271 | "\ttokenizer_class=RobertaTokenizer\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "%%\n", 277 | "for _, modelID := range hfModelIDs {\n", 278 | " fmt.Printf(\"\\n%s:\\n\", modelID)\n", 279 | " repo := hub.New(modelID).WithAuth(hfAuthToken)\n", 280 | " config := must.M1(tokenizers.GetConfig(repo))\n", 281 | " fmt.Printf(\"\\ttokenizer_class=%s\\n\", 
config.TokenizerClass)\n", 282 | "}" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "id": "3c0a7ae7-ace6-4675-873b-0336efa3c68a", 288 | "metadata": {}, 289 | "source": [ 290 | "## Create a Tokenizer\n", 291 | "\n", 292 | "### Go-only SentencePiece tokenizer:" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 6, 298 | "id": "f37e14c8-8321-40ac-b87f-1d4a222d6123", 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "Sentence:\tThe book is on the table.\n", 306 | "Tokens: \t[651 2870 603 611 573 3037 235265]\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "var sentence = \"The book is on the table.\"\n", 312 | "\n", 313 | "%%\n", 314 | "repo := hub.New(\"google/gemma-2-2b-it\").WithAuth(hfAuthToken)\n", 315 | "tokenizer := must.M1(tokenizers.New(repo))\n", 316 | "tokens := tokenizer.Encode(sentence)\n", 317 | "fmt.Printf(\"Sentence:\\t%s\\n\", sentence)\n", 318 | "fmt.Printf(\"Tokens: \\t%v\\n\", tokens)\n" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "id": "cd2f6792-2e9b-427d-a2c6-316038624349", 324 | "metadata": {}, 325 | "source": [ 326 | "### Rust based [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers) tokenizer\n", 327 | "\n", 328 | "For most tokenizers in HuggingFace though, there is no Go-only version yet, and for now we use the [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers), which is based on a fast tokenizer written in Rust.\n", 329 | "\n", 330 | "It requires installation of the built Rust library though, see [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers) on how to install it, they provide prebuilt binaries.\n", 331 | "\n", 332 | "> **Note**: `daulet/tokenizers` also provides a simple downloader, so `go-huggingface` is not strictly necessary -- if you don't want the extra dependency and only need the tokenizer, you don't need to use it. `go-huggingface` helps by allowing also downloading other files (models, datasets), and a shared cache across different projects and `huggingface-hub` (the python downloader library)." 
333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 13, 338 | "id": "cd706316-ef19-4f25-92dc-a1283af8987d", 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "Sentence:\tThe book is on the table.\n", 346 | "Tokens: \t[101 1996 2338 2003 2006 1996 2795 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n" 347 | ] 348 | } 349 | ], 350 | "source": [ 351 | "import dtok \"github.com/daulet/tokenizers\"\n", 352 | "\n", 353 | "%%\n", 354 | "modelID := \"KnightsAnalytics/all-MiniLM-L6-v2\"\n", 355 | "repo := hub.New(modelID).WithAuth(hfAuthToken)\n", 356 | "localFile := must.M1(repo.DownloadFile(\"tokenizer.json\"))\n", 357 | "tokenizer := must.M1(dtok.FromFile(localFile))\n", 358 | "defer tokenizer.Close()\n", 359 | "tokens, _ := tokenizer.Encode(sentence, true)\n", 360 | "\n", 361 | "fmt.Printf(\"Sentence:\\t%s\\n\", sentence)\n", 362 | "fmt.Printf(\"Tokens: \\t%v\\n\", tokens)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "id": "012cda0f-5ed7-418b-a7eb-3de5040a7e2c", 368 | "metadata": {}, 369 | "source": [ 370 | "## Convert ONNX model" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 10, 376 | "id": "7e7751da-e53f-47c6-a7d6-e6b760f95417", 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "name": "stdout", 381 | "output_type": "stream", 382 | "text": [ 383 | "Downloaded 1/1 files, 90 MB downloaded \n", 384 | "Sentences: \t[\"This is an example sentence\" \"Each sentence is converted\"]\n", 385 | "Embeddings:\t[2][7][384]float32{\n", 386 | " {{0.0365, -0.01629, 0.1682, ..., 0.05536, -0.1644, -0.2967},\n", 387 | " {0.7237, 0.6396, 0.1882, ..., 0.5939, 0.6204, 0.4902},\n", 388 | " {0.006478, 0.02025, 0.04475, ..., 0.3469, 1.317, -0.1669},\n", 389 | " ...,\n", 390 | " {0.1479, -0.06461, 0.1457, ..., 0.8841, -0.3322, 0.2979},\n", 391 | " {0.5212, 0.6562, 0.5608, ..., -0.03991, 0.04111, -1.404},\n", 392 | " {1.082, 0.7136, 0.3983, ..., -0.2299, 0.3247, -1.031}},\n", 393 | " {{0.28, 0.1164, -0.04185, ..., 0.2709, -0.1684, -0.2962},\n", 394 | " {0.8734, 0.454, -0.1082, ..., 0.1364, 0.4583, -0.2042},\n", 395 | " {0.4748, 0.5727, 0.6301, ..., 0.6525, 0.5614, -1.327},\n", 396 | " ...,\n", 397 | " {0.6108, 0.792, -0.4682, ..., 0.08599, 1.059, -0.2985},\n", 398 | " {0.4115, 1.094, 0.2389, ..., 0.8984, 0.3688, -0.7335},\n", 399 | " {0.1356, 0.5588, 0.2701, ..., 0.5426, 0.4699, -0.5305}}}\n" 400 | ] 401 | } 402 | ], 403 | "source": [ 404 | "import (\n", 405 | " \"github.com/gomlx/onnx-gomlx/onnx\"\n", 406 | " \"github.com/gomlx/gomlx/pkg/core/graph\"\n", 407 | " \"github.com/gomlx/gomlx/pkg/ml/context\"\n", 408 | " \"github.com/gomlx/gomlx/backends\"\n", 409 | " _ \"github.com/gomlx/gomlx/backends/default\"\n", 410 | ")\n", 411 | "\n", 412 | "%%\n", 413 | "// Get ONNX model.\n", 414 | "repo := hub.New(\"sentence-transformers/all-MiniLM-L6-v2\").WithAuth(hfAuthToken)\n", 415 | "onnxFilePath, err := repo.DownloadFile(\"onnx/model.onnx\")\n", 416 | "if err != nil { panic(err) }\n", 417 | "onnxModel, err := onnx.ReadFile(onnxFilePath)\n", 418 | "if err != nil { panic(err) }\n", 419 | "\n", 420 | "// Convert ONNX variables to GoMLX context (which stores variables):\n", 421 | "ctx := context.New()\n", 422 | "err = onnxModel.VariablesToContext(ctx)\n", 423 | "if err != nil { 
panic(err) }\n", 424 | "\n", 425 | "sentences := []string{\n", 426 | " \"This is an example sentence\", \n", 427 | " \"Each sentence is converted\"}\n", 428 | "inputIDs := [][]int64{\n", 429 | " {101, 2023, 2003, 2019, 2742, 6251, 102},\n", 430 | " { 101, 2169, 6251, 2003, 4991, 102, 0}}\n", 431 | "tokenTypeIDs := [][]int64{\n", 432 | " {0, 0, 0, 0, 0, 0, 0},\n", 433 | " {0, 0, 0, 0, 0, 0, 0}}\n", 434 | "attentionMask := [][]int64{\n", 435 | " {1, 1, 1, 1, 1, 1, 1},\n", 436 | " {1, 1, 1, 1, 1, 1, 0}}\n", 437 | "embeddings := context.MustExecOnce(\n", 438 | " backends.MustNew(), ctx, \n", 439 | " func (ctx *context.Context, inputs []*graph.Node) *graph.Node {\n", 440 | " modelOutputs := onnxModel.CallGraph(ctx, inputs[0].Graph(), map[string]*graph.Node{\n", 441 | " \"input_ids\": inputs[0],\n", 442 | " \"attention_mask\": inputs[1],\n", 443 | " \"token_type_ids\": inputs[2]})\n", 444 | " return modelOutputs[0]\n", 445 | " }, inputIDs, attentionMask, tokenTypeIDs)\n", 446 | "\n", 447 | "fmt.Printf(\"Sentences: \\t%q\\n\", sentences)\n", 448 | "fmt.Printf(\"Embeddings:\\t%s\\n\", embeddings)\n" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "id": "f4ee266a-f0ee-48e0-ad2d-a5bd85a47043", 454 | "metadata": {}, 455 | "source": [ 456 | "## Download Dataset Files\n", 457 | "\n", 458 | "We are going to use the [HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) as an example, download one of its sample files (~2.5Gb of data) and parse the `.parquet` file.\n", 459 | "\n", 460 | "### Structure of file\n", 461 | "First we define the structure of each entry, with the tags for the Parquet parser:" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 11, 467 | "id": "3963f645-a63b-43da-9c9d-3340a330fca7", 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "var (\n", 472 | " FineWebID = \"HuggingFaceFW/fineweb\"\n", 473 | " FineWebSampleFile = \"sample/10BT/000_00000.parquet\"\n", 474 | ")\n", 475 | "\n", 476 | "// FineWebEntry: inspection of fields in parque file done with tool in \n", 477 | "// github.com/xitongsys/parquet-go/tool/parquet-tools.\n", 478 | "//\n", 479 | "// The parquet annotations are described in: https://pkg.go.dev/github.com/parquet-go/parquet-go#SchemaOf\n", 480 | "type FineWebEntry struct {\n", 481 | " Text string `parquet:\"text,snappy\"`\n", 482 | " ID string `parquet:\"id,snappy\"`\n", 483 | " Dump string `parquet:\"dump,snappy\"`\n", 484 | " URL string `parquet:\"url,snappy\"`\n", 485 | " Score float64 `parquet:\"language_score\"`\n", 486 | "}\n", 487 | "\n", 488 | "// TrimString returns s trimmed to at most maxLength runes. If trimmed it appends \"…\" at the end.\n", 489 | "func TrimString(s string, maxLength int) string {\n", 490 | " if utf8.RuneCountInString(s) <= maxLength {\n", 491 | " return s\n", 492 | " }\n", 493 | " runes := []rune(s)\n", 494 | " return string(runes[:maxLength-1]) + \"…\"\n", 495 | "}" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "id": "6a0c90c8-ba8b-4182-92f2-4f7921f8a4f6", 501 | "metadata": {}, 502 | "source": [ 503 | "### Read the Parquet\n", 504 | "\n", 505 | "Using the library [github.com/parquet-go/parquet-go](https://github.com/parquet-go/parquet-go)." 
506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 12, 511 | "id": "bc2f3084-05fd-450b-b939-9095234fb225", 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "name": "stdout", 516 | "output_type": "stream", 517 | "text": [ 518 | "Downloaded 1/1 files, 2.1 GB downloaded \n", 519 | "10 rows read\n", 520 | "Row 0:\tScore=0.823 Text=[\"|Viewing Single Post From: Spoilers for the Week …\"], URL=[http://daytimeroyaltyonline.com/single/…]\n", 521 | "Row 1:\tScore=0.974 Text=[\"*sigh* Fundamentalist community, let me pass on s…\"], URL=[http://endogenousretrovirus.blogspot.co…]\n", 522 | "Row 2:\tScore=0.873 Text=[\"A novel two-step immunotherapy approach has shown…\"], URL=[http://news.cancerconnect.com/]\n", 523 | "Row 3:\tScore=0.932 Text=[\"Free the Cans! Working Together to Reduce Waste\\nI…\"], URL=[http://sharingsolution.com/2009/05/23/f…]\n", 524 | "Row 4:\tScore=0.955 Text=[\"ORLANDO, Fla. — While the Rapid Recall Exchange, …\"], URL=[http://supermarketnews.com/food-safety/…]\n", 525 | "Row 5:\tScore=0.954 Text=[\"September 28, 2010\\n2010 Season - Bowman pulls dow…\"], URL=[http://www.augustana.edu/x22236.xml]\n", 526 | "Row 6:\tScore=0.967 Text=[\"Kraft Foods has taken the Cadbury chocolate brand…\"], URL=[http://www.fdin.org.uk/2012/01/kraft-la…]\n", 527 | "Row 7:\tScore=0.874 Text=[\"You must be a registered member to view this page…\"], URL=[http://www.golivewire.com/forums/profil…]\n", 528 | "Row 8:\tScore=0.912 Text=[\"|Facility Type:||Full Service Restaurant|\\n|Inspec…\"], URL=[http://www.healthspace.com/Clients/VDH/…]\n", 529 | "Row 9:\tScore=0.925 Text=[\"News of the Week\\nBarrie Spring Studio Tour\\nApril …\"], URL=[http://www.jillpricestudios.ca/artist/w…]\n" 530 | ] 531 | } 532 | ], 533 | "source": [ 534 | "import (\n", 535 | " parquet \"github.com/parquet-go/parquet-go\"\n", 536 | ")\n", 537 | "\n", 538 | "%%\n", 539 | "// Download repo file.\n", 540 | "repo := hub.New(FineWebID).WithType(hub.RepoTypeDataset).WithAuth(hfAuthToken)\n", 541 | "localSampleFile := must.M1(repo.DownloadFile(FineWebSampleFile))\n", 542 | "\n", 543 | "// Parquet reading using parquet-go: it's somewhat cumbersome (to open the file it needs its size!?), but it works.\n", 544 | "schema := parquet.SchemaOf(&FineWebEntry{})\n", 545 | "fSize := must.M1(os.Stat(localSampleFile)).Size()\n", 546 | "fReader := must.M1(os.Open(localSampleFile))\n", 547 | "fParquet := must.M1(parquet.OpenFile(fReader, fSize))\n", 548 | "reader := parquet.NewGenericReader[FineWebEntry](fParquet, schema)\n", 549 | "defer reader.Close()\n", 550 | "\n", 551 | "// Print first 10 rows:\n", 552 | "rows := make([]FineWebEntry, 10)\n", 553 | "n := must.M1(reader.Read(rows))\n", 554 | "fmt.Printf(\"%d rows read\\n\", n)\n", 555 | "for ii, row := range rows {\n", 556 | " fmt.Printf(\"Row %0d:\\tScore=%.3f Text=[%q], URL=[%s]\\n\", ii, row.Score, TrimString(row.Text, 50), TrimString(row.URL, 40))\n", 557 | "}\n" 558 | ] 559 | } 560 | ], 561 | "metadata": { 562 | "kernelspec": { 563 | "display_name": "Go (gonb)", 564 | "language": "go", 565 | "name": "gonb" 566 | }, 567 | "language_info": { 568 | "codemirror_mode": "", 569 | "file_extension": ".go", 570 | "mimetype": "text/x-go", 571 | "name": "go", 572 | "nbconvert_exporter": "", 573 | "pygments_lexer": "", 574 | "version": "go1.25.3" 575 | } 576 | }, 577 | "nbformat": 4, 578 | "nbformat_minor": 5 579 | } 580 | -------------------------------------------------------------------------------- /tokenizers/sentencepiece/private/protos/sentencepiece_model.pb.go: 
-------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | // Code generated by protoc-gen-go. DO NOT EDIT. 16 | // versions: 17 | // protoc-gen-go v1.35.1 18 | // protoc v3.21.12 19 | // source: com_github_gomlx_go-huggingface_tokenizers_sentencepiece_private_protos/sentencepiece_model.proto 20 | 21 | // Unique package name to avoid conflicts: the proto library won't allow two different 22 | // packages to define the same proto (under the same namespace). 23 | // This is broken, since that's what is needed ... a bad design from the ProtoBuf in Go. See more details here: 24 | // https://protobuf.dev/reference/go/faq/#namespace-conflict 25 | // So instead we change the proto namespace (package) name to globally unique package name: 26 | 27 | package protos 28 | 29 | import ( 30 | protoreflect "google.golang.org/protobuf/reflect/protoreflect" 31 | protoimpl "google.golang.org/protobuf/runtime/protoimpl" 32 | reflect "reflect" 33 | sync "sync" 34 | ) 35 | 36 | const ( 37 | // Verify that this generated code is sufficiently up-to-date. 38 | _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) 39 | // Verify that runtime/protoimpl is sufficiently up-to-date. 40 | _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) 41 | ) 42 | 43 | // Model type. only have UNIGRAM now. 44 | type TrainerSpec_ModelType int32 45 | 46 | const ( 47 | TrainerSpec_UNIGRAM TrainerSpec_ModelType = 1 // Unigram language model with dynamic algorithm 48 | TrainerSpec_BPE TrainerSpec_ModelType = 2 // Byte Pair Encoding 49 | TrainerSpec_WORD TrainerSpec_ModelType = 3 // Delimitered by whitespace. 50 | TrainerSpec_CHAR TrainerSpec_ModelType = 4 // tokenizes into character sequence 51 | ) 52 | 53 | // Enum value maps for TrainerSpec_ModelType. 
54 | var ( 55 | TrainerSpec_ModelType_name = map[int32]string{ 56 | 1: "UNIGRAM", 57 | 2: "BPE", 58 | 3: "WORD", 59 | 4: "CHAR", 60 | } 61 | TrainerSpec_ModelType_value = map[string]int32{ 62 | "UNIGRAM": 1, 63 | "BPE": 2, 64 | "WORD": 3, 65 | "CHAR": 4, 66 | } 67 | ) 68 | 69 | func (x TrainerSpec_ModelType) Enum() *TrainerSpec_ModelType { 70 | p := new(TrainerSpec_ModelType) 71 | *p = x 72 | return p 73 | } 74 | 75 | func (x TrainerSpec_ModelType) String() string { 76 | return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) 77 | } 78 | 79 | func (TrainerSpec_ModelType) Descriptor() protoreflect.EnumDescriptor { 80 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_enumTypes[0].Descriptor() 81 | } 82 | 83 | func (TrainerSpec_ModelType) Type() protoreflect.EnumType { 84 | return &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_enumTypes[0] 85 | } 86 | 87 | func (x TrainerSpec_ModelType) Number() protoreflect.EnumNumber { 88 | return protoreflect.EnumNumber(x) 89 | } 90 | 91 | // Deprecated: Do not use. 92 | func (x *TrainerSpec_ModelType) UnmarshalJSON(b []byte) error { 93 | num, err := protoimpl.X.UnmarshalJSONEnum(x.Descriptor(), b) 94 | if err != nil { 95 | return err 96 | } 97 | *x = TrainerSpec_ModelType(num) 98 | return nil 99 | } 100 | 101 | // Deprecated: Use TrainerSpec_ModelType.Descriptor instead. 102 | func (TrainerSpec_ModelType) EnumDescriptor() ([]byte, []int) { 103 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{0, 0} 104 | } 105 | 106 | type ModelProto_SentencePiece_Type int32 107 | 108 | const ( 109 | ModelProto_SentencePiece_NORMAL ModelProto_SentencePiece_Type = 1 // normal symbol 110 | ModelProto_SentencePiece_UNKNOWN ModelProto_SentencePiece_Type = 2 // unknown symbol. only for now. 111 | ModelProto_SentencePiece_CONTROL ModelProto_SentencePiece_Type = 3 // control symbols. , , <2ja> etc. 112 | ModelProto_SentencePiece_USER_DEFINED ModelProto_SentencePiece_Type = 4 // user defined symbols. 113 | // Typical usage of USER_DEFINED symbol 114 | // is placeholder. 115 | ModelProto_SentencePiece_BYTE ModelProto_SentencePiece_Type = 6 // byte symbols. Used when `byte_fallback` is true. 116 | ModelProto_SentencePiece_UNUSED ModelProto_SentencePiece_Type = 5 // this piece is not used. 117 | ) 118 | 119 | // Enum value maps for ModelProto_SentencePiece_Type. 
120 | var ( 121 | ModelProto_SentencePiece_Type_name = map[int32]string{ 122 | 1: "NORMAL", 123 | 2: "UNKNOWN", 124 | 3: "CONTROL", 125 | 4: "USER_DEFINED", 126 | 6: "BYTE", 127 | 5: "UNUSED", 128 | } 129 | ModelProto_SentencePiece_Type_value = map[string]int32{ 130 | "NORMAL": 1, 131 | "UNKNOWN": 2, 132 | "CONTROL": 3, 133 | "USER_DEFINED": 4, 134 | "BYTE": 6, 135 | "UNUSED": 5, 136 | } 137 | ) 138 | 139 | func (x ModelProto_SentencePiece_Type) Enum() *ModelProto_SentencePiece_Type { 140 | p := new(ModelProto_SentencePiece_Type) 141 | *p = x 142 | return p 143 | } 144 | 145 | func (x ModelProto_SentencePiece_Type) String() string { 146 | return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) 147 | } 148 | 149 | func (ModelProto_SentencePiece_Type) Descriptor() protoreflect.EnumDescriptor { 150 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_enumTypes[1].Descriptor() 151 | } 152 | 153 | func (ModelProto_SentencePiece_Type) Type() protoreflect.EnumType { 154 | return &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_enumTypes[1] 155 | } 156 | 157 | func (x ModelProto_SentencePiece_Type) Number() protoreflect.EnumNumber { 158 | return protoreflect.EnumNumber(x) 159 | } 160 | 161 | // Deprecated: Do not use. 162 | func (x *ModelProto_SentencePiece_Type) UnmarshalJSON(b []byte) error { 163 | num, err := protoimpl.X.UnmarshalJSONEnum(x.Descriptor(), b) 164 | if err != nil { 165 | return err 166 | } 167 | *x = ModelProto_SentencePiece_Type(num) 168 | return nil 169 | } 170 | 171 | // Deprecated: Use ModelProto_SentencePiece_Type.Descriptor instead. 172 | func (ModelProto_SentencePiece_Type) EnumDescriptor() ([]byte, []int) { 173 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{3, 0, 0} 174 | } 175 | 176 | // TrainerSpec encodes a various parameters for SentencePiece training. 177 | // Next id: 55 178 | type TrainerSpec struct { 179 | state protoimpl.MessageState 180 | sizeCache protoimpl.SizeCache 181 | unknownFields protoimpl.UnknownFields 182 | extensionFields protoimpl.ExtensionFields 183 | 184 | // ///////////////////////////////////////////////////////////////// 185 | // General parameters 186 | // 187 | // Input corpus files. 188 | // 189 | // Trainer accepts the following two formats: 190 | // A) Monolingual: plain text, one sentence per line. 191 | // B) Bilingual: TSV, source sentence target sentence 192 | // When bilingual data is passed, shared vocabulary model is built. 193 | // Note that the input file must be raw corpus, not a preprocessed corpus. 194 | // Trainer only loads the first `input_sentence_size` sentences specified 195 | // with this parameter. 196 | Input []string `protobuf:"bytes,1,rep,name=input" json:"input,omitempty"` 197 | // Input corpus format: 198 | // "text": one-sentence-per-line text format (default) 199 | // "tsv": sentence freq 200 | InputFormat *string `protobuf:"bytes,7,opt,name=input_format,json=inputFormat" json:"input_format,omitempty"` 201 | // Output model file prefix. 202 | // .model and .vocab are generated. 
203 | ModelPrefix *string `protobuf:"bytes,2,opt,name=model_prefix,json=modelPrefix" json:"model_prefix,omitempty"` 204 | ModelType *TrainerSpec_ModelType `protobuf:"varint,3,opt,name=model_type,json=modelType,enum=com.github.gomlx.go_huggingface.sentencepiece.TrainerSpec_ModelType,def=1" json:"model_type,omitempty"` 205 | // Vocabulary size. 8k is the default size. 206 | VocabSize *int32 `protobuf:"varint,4,opt,name=vocab_size,json=vocabSize,def=8000" json:"vocab_size,omitempty"` 207 | // List of the languages this model can accept. 208 | // Since the model is language-agnostic, this field is used as a reference. 209 | AcceptLanguage []string `protobuf:"bytes,5,rep,name=accept_language,json=acceptLanguage" json:"accept_language,omitempty"` 210 | // Size of self-test samples, which are encoded in the model file. 211 | SelfTestSampleSize *int32 `protobuf:"varint,6,opt,name=self_test_sample_size,json=selfTestSampleSize,def=0" json:"self_test_sample_size,omitempty"` 212 | // Whether to use DP version of sentencepiece. Use it with TSV input format 213 | // (requires precomputed word tab counts to work). 214 | EnableDifferentialPrivacy *bool `protobuf:"varint,50,opt,name=enable_differential_privacy,json=enableDifferentialPrivacy,def=0" json:"enable_differential_privacy,omitempty"` 215 | // Set these parameters if you need DP version of sentencepiece. 216 | // std of noise to add. 217 | DifferentialPrivacyNoiseLevel *float32 `protobuf:"fixed32,51,opt,name=differential_privacy_noise_level,json=differentialPrivacyNoiseLevel,def=0" json:"differential_privacy_noise_level,omitempty"` 218 | // Clipping threshold to apply after adding noise. All the words with 219 | // frequency less than this value are dropped. 220 | DifferentialPrivacyClippingThreshold *uint64 `protobuf:"varint,52,opt,name=differential_privacy_clipping_threshold,json=differentialPrivacyClippingThreshold,def=0" json:"differential_privacy_clipping_threshold,omitempty"` 221 | // ///////////////////////////////////////////////////////////////// 222 | // Training parameters. 223 | // 224 | // Uses characters which cover the corpus with the ratio of `chars_coverage`. 225 | // This parameter determines the set of basic Alphabet of sentence piece. 226 | // 1.0 - `chars_coverage` characters are treated as UNK. 227 | // See also required_chars field. 228 | CharacterCoverage *float32 `protobuf:"fixed32,10,opt,name=character_coverage,json=characterCoverage,def=0.9995" json:"character_coverage,omitempty"` 229 | // Maximum size of sentences the trainer loads from `input` parameter. 230 | // Trainer simply loads the `input` files in sequence. 231 | // It is better to shuffle the input corpus randomly. 232 | InputSentenceSize *uint64 `protobuf:"varint,11,opt,name=input_sentence_size,json=inputSentenceSize,def=0" json:"input_sentence_size,omitempty"` 233 | ShuffleInputSentence *bool `protobuf:"varint,19,opt,name=shuffle_input_sentence,json=shuffleInputSentence,def=1" json:"shuffle_input_sentence,omitempty"` 234 | // Maximum size of sentences to make seed sentence pieces. 235 | // Extended suffix array is constructed to extract frequent 236 | // sub-strings from the corpus. This uses 20N working space, 237 | // where N is the size of corpus. 238 | // 239 | // Deprecated: Marked as deprecated in com_github_gomlx_go-huggingface_tokenizers_sentencepiece_private_protos/sentencepiece_model.proto. 
240 | MiningSentenceSize *int32 `protobuf:"varint,12,opt,name=mining_sentence_size,json=miningSentenceSize" json:"mining_sentence_size,omitempty"` 241 | // Maximum size of sentences to train sentence pieces. 242 | // 243 | // Deprecated: Marked as deprecated in com_github_gomlx_go-huggingface_tokenizers_sentencepiece_private_protos/sentencepiece_model.proto. 244 | TrainingSentenceSize *int32 `protobuf:"varint,13,opt,name=training_sentence_size,json=trainingSentenceSize" json:"training_sentence_size,omitempty"` 245 | // The size of seed sentencepieces. 246 | // `seed_sentencepiece_size` must be larger than `vocab_size`. 247 | SeedSentencepieceSize *int32 `protobuf:"varint,14,opt,name=seed_sentencepiece_size,json=seedSentencepieceSize,def=1000000" json:"seed_sentencepiece_size,omitempty"` 248 | // In every EM sub-iterations, keeps top 249 | // `shrinking_factor` * `current sentencepieces size` with respect to 250 | // the loss of the sentence piece. This value should be smaller than 1.0. 251 | ShrinkingFactor *float32 `protobuf:"fixed32,15,opt,name=shrinking_factor,json=shrinkingFactor,def=0.75" json:"shrinking_factor,omitempty"` 252 | // The maximum sentence length in byte. The sentences with the length 253 | // larger than `max_sentence_length` is simply ignored. 254 | // Longer input tends to bring the following risks: 255 | // - Overflow during EM training (unigram language model only) 256 | // - Performance drop because of O(n log n) cost in BPE. 257 | MaxSentenceLength *int32 `protobuf:"varint,18,opt,name=max_sentence_length,json=maxSentenceLength,def=4192" json:"max_sentence_length,omitempty"` 258 | // Number of threads in the training. 259 | NumThreads *int32 `protobuf:"varint,16,opt,name=num_threads,json=numThreads,def=16" json:"num_threads,omitempty"` 260 | // Number of EM sub iterations. 261 | NumSubIterations *int32 `protobuf:"varint,17,opt,name=num_sub_iterations,json=numSubIterations,def=2" json:"num_sub_iterations,omitempty"` 262 | // ///////////////////////////////////////////////////////////////// 263 | // SentencePiece parameters which control the shapes of sentence piece. 264 | // 265 | // Maximum length of sentencepiece. 266 | MaxSentencepieceLength *int32 `protobuf:"varint,20,opt,name=max_sentencepiece_length,json=maxSentencepieceLength,def=16" json:"max_sentencepiece_length,omitempty"` 267 | // Uses Unicode script to split sentence pieces. 268 | // When `split_by_unicode_script` is true, we do not allow sentence piece to 269 | // include multiple Unicode scripts, e.g. "F1" is not a valid piece. 270 | // Exception: CJ characters (Hiragana/Katakana/Han) are all handled 271 | // as one script type, since Japanese word can consist of multiple scripts. 272 | // This exception is always applied regardless of the accept-language 273 | // parameter. 274 | SplitByUnicodeScript *bool `protobuf:"varint,21,opt,name=split_by_unicode_script,json=splitByUnicodeScript,def=1" json:"split_by_unicode_script,omitempty"` 275 | // When `split_by_number` is true, put a boundary between number and 276 | // non-number transition. If we want to treat "F1" is one token, set this flag 277 | // to be false. 278 | SplitByNumber *bool `protobuf:"varint,23,opt,name=split_by_number,json=splitByNumber,def=1" json:"split_by_number,omitempty"` 279 | // Use a white space to split sentence pieces. 280 | // When `split_by_whitespace` is false, we may have the piece containing 281 | // a white space in the middle. e.g., "in_the". 
282 | SplitByWhitespace *bool `protobuf:"varint,22,opt,name=split_by_whitespace,json=splitByWhitespace,def=1" json:"split_by_whitespace,omitempty"` 283 | // Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello => 284 | // hello_. When `treat_whitespace_as_suffix` is true, 285 | // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end 286 | // of sentence. 287 | TreatWhitespaceAsSuffix *bool `protobuf:"varint,24,opt,name=treat_whitespace_as_suffix,json=treatWhitespaceAsSuffix,def=0" json:"treat_whitespace_as_suffix,omitempty"` 288 | // Allows pieces that only contain whitespaces instead of appearing only as 289 | // prefix or suffix of other pieces. 290 | AllowWhitespaceOnlyPieces *bool `protobuf:"varint,26,opt,name=allow_whitespace_only_pieces,json=allowWhitespaceOnlyPieces,def=0" json:"allow_whitespace_only_pieces,omitempty"` 291 | // Split all digits (0-9) into separate pieces. 292 | SplitDigits *bool `protobuf:"varint,25,opt,name=split_digits,json=splitDigits,def=0" json:"split_digits,omitempty"` 293 | // Defines the pre-tokenization delimiter. 294 | // When specified, no pieces crossing this delimiter is not included 295 | // in the vocab. Then the delimiter string is virtually ignored 296 | // during the training. This field can allows constraints on the vocabulary 297 | // selection. Note that this field is available on unigram mode. 298 | PretokenizationDelimiter *string `protobuf:"bytes,53,opt,name=pretokenization_delimiter,json=pretokenizationDelimiter,def=" json:"pretokenization_delimiter,omitempty"` 299 | // ///////////////////////////////////////////////////////////////// 300 | // Vocabulary management 301 | // 302 | // Defines control symbols used as an indicator to 303 | // change the behavior of the decoder. and are pre-defined. 304 | // We can use this field to encode various meta information, 305 | // including language indicator in multilingual model. 306 | // These symbols are not visible to users, but visible to 307 | // the decoder. Note that when the input sentence contains control symbols, 308 | // they are not treated as one token, but segmented into normal pieces. 309 | // Control symbols must be inserted independently from the segmentation. 310 | ControlSymbols []string `protobuf:"bytes,30,rep,name=control_symbols,json=controlSymbols" json:"control_symbols,omitempty"` 311 | // Defines user defined symbols. 312 | // These symbols are added with extremely high score 313 | // so they are always treated as one unique symbol in any context. 314 | // Typical usage of user_defined_symbols is placeholder for named entities. 315 | UserDefinedSymbols []string `protobuf:"bytes,31,rep,name=user_defined_symbols,json=userDefinedSymbols" json:"user_defined_symbols,omitempty"` 316 | // Defines required characters. Each UTF8 character in this string is included 317 | // in the character set regardless of character_coverage value. Unlike 318 | // user_defined_symbols, these characters have scores based on the frequency 319 | // on input sentences, and the model can form subwords using characters 320 | // in this field. 321 | RequiredChars *string `protobuf:"bytes,36,opt,name=required_chars,json=requiredChars" json:"required_chars,omitempty"` 322 | // Decomposes unknown pieces into UTF-8 bytes. 323 | ByteFallback *bool `protobuf:"varint,35,opt,name=byte_fallback,json=byteFallback,def=0" json:"byte_fallback,omitempty"` 324 | // When creating the vocabulary file, defines whether or not to additionally 325 | // output the score for each piece. 
326 | VocabularyOutputPieceScore *bool `protobuf:"varint,32,opt,name=vocabulary_output_piece_score,json=vocabularyOutputPieceScore,def=1" json:"vocabulary_output_piece_score,omitempty"` 327 | // `vocab_size` is treated as hard limit. Crash if 328 | // the model can not produce the vocab of size `vocab_size`, 329 | // When `hard_vocab_limit` is false, vocab_size is treated 330 | // as soft limit. Note that when model_type=char, 331 | // always assumes hard_vocab_limit = false. 332 | HardVocabLimit *bool `protobuf:"varint,33,opt,name=hard_vocab_limit,json=hardVocabLimit,def=1" json:"hard_vocab_limit,omitempty"` 333 | // use all symbols for vocab extraction. This flag is valid 334 | // if model type is either CHAR or WORD 335 | UseAllVocab *bool `protobuf:"varint,34,opt,name=use_all_vocab,json=useAllVocab,def=0" json:"use_all_vocab,omitempty"` 336 | // ///////////////////////////////////////////////////////////////// 337 | // Reserved special meta tokens. 338 | // * -1 is not used. 339 | // * unk_id must not be -1. 340 | // Id must starts with 0 and be contigous. 341 | UnkId *int32 `protobuf:"varint,40,opt,name=unk_id,json=unkId,def=0" json:"unk_id,omitempty"` // <unk> 342 | BosId *int32 `protobuf:"varint,41,opt,name=bos_id,json=bosId,def=1" json:"bos_id,omitempty"` // <s> 343 | EosId *int32 `protobuf:"varint,42,opt,name=eos_id,json=eosId,def=2" json:"eos_id,omitempty"` // </s> 344 | PadId *int32 `protobuf:"varint,43,opt,name=pad_id,json=padId,def=-1" json:"pad_id,omitempty"` // <pad> (padding) 345 | UnkPiece *string `protobuf:"bytes,45,opt,name=unk_piece,json=unkPiece,def=<unk>" json:"unk_piece,omitempty"` 346 | BosPiece *string `protobuf:"bytes,46,opt,name=bos_piece,json=bosPiece,def=<s>" json:"bos_piece,omitempty"` 347 | EosPiece *string `protobuf:"bytes,47,opt,name=eos_piece,json=eosPiece,def=</s>" json:"eos_piece,omitempty"` 348 | PadPiece *string `protobuf:"bytes,48,opt,name=pad_piece,json=padPiece,def=<pad>" json:"pad_piece,omitempty"` 349 | // Encodes <unk> into U+2047 (DOUBLE QUESTION MARK), 350 | // since this character can be useful both for user and 351 | // developer. We can easily figure out that <unk> is emitted. 352 | UnkSurface *string `protobuf:"bytes,44,opt,name=unk_surface,json=unkSurface,def= ⁇ " json:"unk_surface,omitempty"` 353 | // Increase bit depth to allow unigram model training on large 354 | // (>10M sentences) corpora. A Side-effect of enabling this flag 355 | // is increased memory usage. 356 | TrainExtremelyLargeCorpus *bool `protobuf:"varint,49,opt,name=train_extremely_large_corpus,json=trainExtremelyLargeCorpus,def=0" json:"train_extremely_large_corpus,omitempty"` 357 | // Path to a seed sentencepieces file, with one tab-separated 358 | // seed sentencepiece frequency per line. 359 | SeedSentencepiecesFile *string `protobuf:"bytes,54,opt,name=seed_sentencepieces_file,json=seedSentencepiecesFile,def=" json:"seed_sentencepieces_file,omitempty"` 360 | } 361 | 362 | // Default values for TrainerSpec fields. 
363 | const ( 364 | Default_TrainerSpec_ModelType = TrainerSpec_UNIGRAM 365 | Default_TrainerSpec_VocabSize = int32(8000) 366 | Default_TrainerSpec_SelfTestSampleSize = int32(0) 367 | Default_TrainerSpec_EnableDifferentialPrivacy = bool(false) 368 | Default_TrainerSpec_DifferentialPrivacyNoiseLevel = float32(0) 369 | Default_TrainerSpec_DifferentialPrivacyClippingThreshold = uint64(0) 370 | Default_TrainerSpec_CharacterCoverage = float32(0.9994999766349792) 371 | Default_TrainerSpec_InputSentenceSize = uint64(0) 372 | Default_TrainerSpec_ShuffleInputSentence = bool(true) 373 | Default_TrainerSpec_SeedSentencepieceSize = int32(1000000) 374 | Default_TrainerSpec_ShrinkingFactor = float32(0.75) 375 | Default_TrainerSpec_MaxSentenceLength = int32(4192) 376 | Default_TrainerSpec_NumThreads = int32(16) 377 | Default_TrainerSpec_NumSubIterations = int32(2) 378 | Default_TrainerSpec_MaxSentencepieceLength = int32(16) 379 | Default_TrainerSpec_SplitByUnicodeScript = bool(true) 380 | Default_TrainerSpec_SplitByNumber = bool(true) 381 | Default_TrainerSpec_SplitByWhitespace = bool(true) 382 | Default_TrainerSpec_TreatWhitespaceAsSuffix = bool(false) 383 | Default_TrainerSpec_AllowWhitespaceOnlyPieces = bool(false) 384 | Default_TrainerSpec_SplitDigits = bool(false) 385 | Default_TrainerSpec_PretokenizationDelimiter = string("") 386 | Default_TrainerSpec_ByteFallback = bool(false) 387 | Default_TrainerSpec_VocabularyOutputPieceScore = bool(true) 388 | Default_TrainerSpec_HardVocabLimit = bool(true) 389 | Default_TrainerSpec_UseAllVocab = bool(false) 390 | Default_TrainerSpec_UnkId = int32(0) 391 | Default_TrainerSpec_BosId = int32(1) 392 | Default_TrainerSpec_EosId = int32(2) 393 | Default_TrainerSpec_PadId = int32(-1) 394 | Default_TrainerSpec_UnkPiece = string("<unk>") 395 | Default_TrainerSpec_BosPiece = string("<s>") 396 | Default_TrainerSpec_EosPiece = string("</s>") 397 | Default_TrainerSpec_PadPiece = string("<pad>") 398 | Default_TrainerSpec_UnkSurface = string(" ⁇ ") 399 | Default_TrainerSpec_TrainExtremelyLargeCorpus = bool(false) 400 | Default_TrainerSpec_SeedSentencepiecesFile = string("") 401 | ) 402 | 403 | func (x *TrainerSpec) Reset() { 404 | *x = TrainerSpec{} 405 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[0] 406 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 407 | ms.StoreMessageInfo(mi) 408 | } 409 | 410 | func (x *TrainerSpec) String() string { 411 | return protoimpl.X.MessageStringOf(x) 412 | } 413 | 414 | func (*TrainerSpec) ProtoMessage() {} 415 | 416 | func (x *TrainerSpec) ProtoReflect() protoreflect.Message { 417 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[0] 418 | if x != nil { 419 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 420 | if ms.LoadMessageInfo() == nil { 421 | ms.StoreMessageInfo(mi) 422 | } 423 | return ms 424 | } 425 | return mi.MessageOf(x) 426 | } 427 | 428 | // Deprecated: Use TrainerSpec.ProtoReflect.Descriptor instead. 
429 | func (*TrainerSpec) Descriptor() ([]byte, []int) { 430 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{0} 431 | } 432 | 433 | func (x *TrainerSpec) GetInput() []string { 434 | if x != nil { 435 | return x.Input 436 | } 437 | return nil 438 | } 439 | 440 | func (x *TrainerSpec) GetInputFormat() string { 441 | if x != nil && x.InputFormat != nil { 442 | return *x.InputFormat 443 | } 444 | return "" 445 | } 446 | 447 | func (x *TrainerSpec) GetModelPrefix() string { 448 | if x != nil && x.ModelPrefix != nil { 449 | return *x.ModelPrefix 450 | } 451 | return "" 452 | } 453 | 454 | func (x *TrainerSpec) GetModelType() TrainerSpec_ModelType { 455 | if x != nil && x.ModelType != nil { 456 | return *x.ModelType 457 | } 458 | return Default_TrainerSpec_ModelType 459 | } 460 | 461 | func (x *TrainerSpec) GetVocabSize() int32 { 462 | if x != nil && x.VocabSize != nil { 463 | return *x.VocabSize 464 | } 465 | return Default_TrainerSpec_VocabSize 466 | } 467 | 468 | func (x *TrainerSpec) GetAcceptLanguage() []string { 469 | if x != nil { 470 | return x.AcceptLanguage 471 | } 472 | return nil 473 | } 474 | 475 | func (x *TrainerSpec) GetSelfTestSampleSize() int32 { 476 | if x != nil && x.SelfTestSampleSize != nil { 477 | return *x.SelfTestSampleSize 478 | } 479 | return Default_TrainerSpec_SelfTestSampleSize 480 | } 481 | 482 | func (x *TrainerSpec) GetEnableDifferentialPrivacy() bool { 483 | if x != nil && x.EnableDifferentialPrivacy != nil { 484 | return *x.EnableDifferentialPrivacy 485 | } 486 | return Default_TrainerSpec_EnableDifferentialPrivacy 487 | } 488 | 489 | func (x *TrainerSpec) GetDifferentialPrivacyNoiseLevel() float32 { 490 | if x != nil && x.DifferentialPrivacyNoiseLevel != nil { 491 | return *x.DifferentialPrivacyNoiseLevel 492 | } 493 | return Default_TrainerSpec_DifferentialPrivacyNoiseLevel 494 | } 495 | 496 | func (x *TrainerSpec) GetDifferentialPrivacyClippingThreshold() uint64 { 497 | if x != nil && x.DifferentialPrivacyClippingThreshold != nil { 498 | return *x.DifferentialPrivacyClippingThreshold 499 | } 500 | return Default_TrainerSpec_DifferentialPrivacyClippingThreshold 501 | } 502 | 503 | func (x *TrainerSpec) GetCharacterCoverage() float32 { 504 | if x != nil && x.CharacterCoverage != nil { 505 | return *x.CharacterCoverage 506 | } 507 | return Default_TrainerSpec_CharacterCoverage 508 | } 509 | 510 | func (x *TrainerSpec) GetInputSentenceSize() uint64 { 511 | if x != nil && x.InputSentenceSize != nil { 512 | return *x.InputSentenceSize 513 | } 514 | return Default_TrainerSpec_InputSentenceSize 515 | } 516 | 517 | func (x *TrainerSpec) GetShuffleInputSentence() bool { 518 | if x != nil && x.ShuffleInputSentence != nil { 519 | return *x.ShuffleInputSentence 520 | } 521 | return Default_TrainerSpec_ShuffleInputSentence 522 | } 523 | 524 | // Deprecated: Marked as deprecated in com_github_gomlx_go-huggingface_tokenizers_sentencepiece_private_protos/sentencepiece_model.proto. 525 | func (x *TrainerSpec) GetMiningSentenceSize() int32 { 526 | if x != nil && x.MiningSentenceSize != nil { 527 | return *x.MiningSentenceSize 528 | } 529 | return 0 530 | } 531 | 532 | // Deprecated: Marked as deprecated in com_github_gomlx_go-huggingface_tokenizers_sentencepiece_private_protos/sentencepiece_model.proto. 
533 | func (x *TrainerSpec) GetTrainingSentenceSize() int32 { 534 | if x != nil && x.TrainingSentenceSize != nil { 535 | return *x.TrainingSentenceSize 536 | } 537 | return 0 538 | } 539 | 540 | func (x *TrainerSpec) GetSeedSentencepieceSize() int32 { 541 | if x != nil && x.SeedSentencepieceSize != nil { 542 | return *x.SeedSentencepieceSize 543 | } 544 | return Default_TrainerSpec_SeedSentencepieceSize 545 | } 546 | 547 | func (x *TrainerSpec) GetShrinkingFactor() float32 { 548 | if x != nil && x.ShrinkingFactor != nil { 549 | return *x.ShrinkingFactor 550 | } 551 | return Default_TrainerSpec_ShrinkingFactor 552 | } 553 | 554 | func (x *TrainerSpec) GetMaxSentenceLength() int32 { 555 | if x != nil && x.MaxSentenceLength != nil { 556 | return *x.MaxSentenceLength 557 | } 558 | return Default_TrainerSpec_MaxSentenceLength 559 | } 560 | 561 | func (x *TrainerSpec) GetNumThreads() int32 { 562 | if x != nil && x.NumThreads != nil { 563 | return *x.NumThreads 564 | } 565 | return Default_TrainerSpec_NumThreads 566 | } 567 | 568 | func (x *TrainerSpec) GetNumSubIterations() int32 { 569 | if x != nil && x.NumSubIterations != nil { 570 | return *x.NumSubIterations 571 | } 572 | return Default_TrainerSpec_NumSubIterations 573 | } 574 | 575 | func (x *TrainerSpec) GetMaxSentencepieceLength() int32 { 576 | if x != nil && x.MaxSentencepieceLength != nil { 577 | return *x.MaxSentencepieceLength 578 | } 579 | return Default_TrainerSpec_MaxSentencepieceLength 580 | } 581 | 582 | func (x *TrainerSpec) GetSplitByUnicodeScript() bool { 583 | if x != nil && x.SplitByUnicodeScript != nil { 584 | return *x.SplitByUnicodeScript 585 | } 586 | return Default_TrainerSpec_SplitByUnicodeScript 587 | } 588 | 589 | func (x *TrainerSpec) GetSplitByNumber() bool { 590 | if x != nil && x.SplitByNumber != nil { 591 | return *x.SplitByNumber 592 | } 593 | return Default_TrainerSpec_SplitByNumber 594 | } 595 | 596 | func (x *TrainerSpec) GetSplitByWhitespace() bool { 597 | if x != nil && x.SplitByWhitespace != nil { 598 | return *x.SplitByWhitespace 599 | } 600 | return Default_TrainerSpec_SplitByWhitespace 601 | } 602 | 603 | func (x *TrainerSpec) GetTreatWhitespaceAsSuffix() bool { 604 | if x != nil && x.TreatWhitespaceAsSuffix != nil { 605 | return *x.TreatWhitespaceAsSuffix 606 | } 607 | return Default_TrainerSpec_TreatWhitespaceAsSuffix 608 | } 609 | 610 | func (x *TrainerSpec) GetAllowWhitespaceOnlyPieces() bool { 611 | if x != nil && x.AllowWhitespaceOnlyPieces != nil { 612 | return *x.AllowWhitespaceOnlyPieces 613 | } 614 | return Default_TrainerSpec_AllowWhitespaceOnlyPieces 615 | } 616 | 617 | func (x *TrainerSpec) GetSplitDigits() bool { 618 | if x != nil && x.SplitDigits != nil { 619 | return *x.SplitDigits 620 | } 621 | return Default_TrainerSpec_SplitDigits 622 | } 623 | 624 | func (x *TrainerSpec) GetPretokenizationDelimiter() string { 625 | if x != nil && x.PretokenizationDelimiter != nil { 626 | return *x.PretokenizationDelimiter 627 | } 628 | return Default_TrainerSpec_PretokenizationDelimiter 629 | } 630 | 631 | func (x *TrainerSpec) GetControlSymbols() []string { 632 | if x != nil { 633 | return x.ControlSymbols 634 | } 635 | return nil 636 | } 637 | 638 | func (x *TrainerSpec) GetUserDefinedSymbols() []string { 639 | if x != nil { 640 | return x.UserDefinedSymbols 641 | } 642 | return nil 643 | } 644 | 645 | func (x *TrainerSpec) GetRequiredChars() string { 646 | if x != nil && x.RequiredChars != nil { 647 | return *x.RequiredChars 648 | } 649 | return "" 650 | } 651 | 652 | func (x *TrainerSpec) 
GetByteFallback() bool { 653 | if x != nil && x.ByteFallback != nil { 654 | return *x.ByteFallback 655 | } 656 | return Default_TrainerSpec_ByteFallback 657 | } 658 | 659 | func (x *TrainerSpec) GetVocabularyOutputPieceScore() bool { 660 | if x != nil && x.VocabularyOutputPieceScore != nil { 661 | return *x.VocabularyOutputPieceScore 662 | } 663 | return Default_TrainerSpec_VocabularyOutputPieceScore 664 | } 665 | 666 | func (x *TrainerSpec) GetHardVocabLimit() bool { 667 | if x != nil && x.HardVocabLimit != nil { 668 | return *x.HardVocabLimit 669 | } 670 | return Default_TrainerSpec_HardVocabLimit 671 | } 672 | 673 | func (x *TrainerSpec) GetUseAllVocab() bool { 674 | if x != nil && x.UseAllVocab != nil { 675 | return *x.UseAllVocab 676 | } 677 | return Default_TrainerSpec_UseAllVocab 678 | } 679 | 680 | func (x *TrainerSpec) GetUnkId() int32 { 681 | if x != nil && x.UnkId != nil { 682 | return *x.UnkId 683 | } 684 | return Default_TrainerSpec_UnkId 685 | } 686 | 687 | func (x *TrainerSpec) GetBosId() int32 { 688 | if x != nil && x.BosId != nil { 689 | return *x.BosId 690 | } 691 | return Default_TrainerSpec_BosId 692 | } 693 | 694 | func (x *TrainerSpec) GetEosId() int32 { 695 | if x != nil && x.EosId != nil { 696 | return *x.EosId 697 | } 698 | return Default_TrainerSpec_EosId 699 | } 700 | 701 | func (x *TrainerSpec) GetPadId() int32 { 702 | if x != nil && x.PadId != nil { 703 | return *x.PadId 704 | } 705 | return Default_TrainerSpec_PadId 706 | } 707 | 708 | func (x *TrainerSpec) GetUnkPiece() string { 709 | if x != nil && x.UnkPiece != nil { 710 | return *x.UnkPiece 711 | } 712 | return Default_TrainerSpec_UnkPiece 713 | } 714 | 715 | func (x *TrainerSpec) GetBosPiece() string { 716 | if x != nil && x.BosPiece != nil { 717 | return *x.BosPiece 718 | } 719 | return Default_TrainerSpec_BosPiece 720 | } 721 | 722 | func (x *TrainerSpec) GetEosPiece() string { 723 | if x != nil && x.EosPiece != nil { 724 | return *x.EosPiece 725 | } 726 | return Default_TrainerSpec_EosPiece 727 | } 728 | 729 | func (x *TrainerSpec) GetPadPiece() string { 730 | if x != nil && x.PadPiece != nil { 731 | return *x.PadPiece 732 | } 733 | return Default_TrainerSpec_PadPiece 734 | } 735 | 736 | func (x *TrainerSpec) GetUnkSurface() string { 737 | if x != nil && x.UnkSurface != nil { 738 | return *x.UnkSurface 739 | } 740 | return Default_TrainerSpec_UnkSurface 741 | } 742 | 743 | func (x *TrainerSpec) GetTrainExtremelyLargeCorpus() bool { 744 | if x != nil && x.TrainExtremelyLargeCorpus != nil { 745 | return *x.TrainExtremelyLargeCorpus 746 | } 747 | return Default_TrainerSpec_TrainExtremelyLargeCorpus 748 | } 749 | 750 | func (x *TrainerSpec) GetSeedSentencepiecesFile() string { 751 | if x != nil && x.SeedSentencepiecesFile != nil { 752 | return *x.SeedSentencepiecesFile 753 | } 754 | return Default_TrainerSpec_SeedSentencepiecesFile 755 | } 756 | 757 | // NormalizerSpec encodes a various parameters for string normalizaiton 758 | type NormalizerSpec struct { 759 | state protoimpl.MessageState 760 | sizeCache protoimpl.SizeCache 761 | unknownFields protoimpl.UnknownFields 762 | extensionFields protoimpl.ExtensionFields 763 | 764 | // name of normalization rule. 765 | Name *string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"` 766 | // Pre-compiled normalization rule created by 767 | // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method. 768 | // Usually this field is set by Builder::GetNormalizerSpec() method. 
769 | PrecompiledCharsmap []byte `protobuf:"bytes,2,opt,name=precompiled_charsmap,json=precompiledCharsmap" json:"precompiled_charsmap,omitempty"` 770 | // Adds dummy whitespace at the beginning of text in order to 771 | // treat "world" in "world" and "hello world" in the same way. 772 | AddDummyPrefix *bool `protobuf:"varint,3,opt,name=add_dummy_prefix,json=addDummyPrefix,def=1" json:"add_dummy_prefix,omitempty"` 773 | // Removes leading, trailing, and duplicate internal whitespace. 774 | RemoveExtraWhitespaces *bool `protobuf:"varint,4,opt,name=remove_extra_whitespaces,json=removeExtraWhitespaces,def=1" json:"remove_extra_whitespaces,omitempty"` 775 | // Replaces whitespace with meta symbol. 776 | // This field must be true to train sentence piece model. 777 | EscapeWhitespaces *bool `protobuf:"varint,5,opt,name=escape_whitespaces,json=escapeWhitespaces,def=1" json:"escape_whitespaces,omitempty"` 778 | // Custom normalization rule file in TSV format. 779 | // https://github.com/google/sentencepiece/blob/master/doc/normalization.md 780 | // This field is only used in SentencePieceTrainer::Train() method, which 781 | // compiles the rule into the binary rule stored in `precompiled_charsmap`. 782 | NormalizationRuleTsv *string `protobuf:"bytes,6,opt,name=normalization_rule_tsv,json=normalizationRuleTsv" json:"normalization_rule_tsv,omitempty"` 783 | } 784 | 785 | // Default values for NormalizerSpec fields. 786 | const ( 787 | Default_NormalizerSpec_AddDummyPrefix = bool(true) 788 | Default_NormalizerSpec_RemoveExtraWhitespaces = bool(true) 789 | Default_NormalizerSpec_EscapeWhitespaces = bool(true) 790 | ) 791 | 792 | func (x *NormalizerSpec) Reset() { 793 | *x = NormalizerSpec{} 794 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[1] 795 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 796 | ms.StoreMessageInfo(mi) 797 | } 798 | 799 | func (x *NormalizerSpec) String() string { 800 | return protoimpl.X.MessageStringOf(x) 801 | } 802 | 803 | func (*NormalizerSpec) ProtoMessage() {} 804 | 805 | func (x *NormalizerSpec) ProtoReflect() protoreflect.Message { 806 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[1] 807 | if x != nil { 808 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 809 | if ms.LoadMessageInfo() == nil { 810 | ms.StoreMessageInfo(mi) 811 | } 812 | return ms 813 | } 814 | return mi.MessageOf(x) 815 | } 816 | 817 | // Deprecated: Use NormalizerSpec.ProtoReflect.Descriptor instead. 
818 | func (*NormalizerSpec) Descriptor() ([]byte, []int) { 819 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{1} 820 | } 821 | 822 | func (x *NormalizerSpec) GetName() string { 823 | if x != nil && x.Name != nil { 824 | return *x.Name 825 | } 826 | return "" 827 | } 828 | 829 | func (x *NormalizerSpec) GetPrecompiledCharsmap() []byte { 830 | if x != nil { 831 | return x.PrecompiledCharsmap 832 | } 833 | return nil 834 | } 835 | 836 | func (x *NormalizerSpec) GetAddDummyPrefix() bool { 837 | if x != nil && x.AddDummyPrefix != nil { 838 | return *x.AddDummyPrefix 839 | } 840 | return Default_NormalizerSpec_AddDummyPrefix 841 | } 842 | 843 | func (x *NormalizerSpec) GetRemoveExtraWhitespaces() bool { 844 | if x != nil && x.RemoveExtraWhitespaces != nil { 845 | return *x.RemoveExtraWhitespaces 846 | } 847 | return Default_NormalizerSpec_RemoveExtraWhitespaces 848 | } 849 | 850 | func (x *NormalizerSpec) GetEscapeWhitespaces() bool { 851 | if x != nil && x.EscapeWhitespaces != nil { 852 | return *x.EscapeWhitespaces 853 | } 854 | return Default_NormalizerSpec_EscapeWhitespaces 855 | } 856 | 857 | func (x *NormalizerSpec) GetNormalizationRuleTsv() string { 858 | if x != nil && x.NormalizationRuleTsv != nil { 859 | return *x.NormalizationRuleTsv 860 | } 861 | return "" 862 | } 863 | 864 | // Proto to store samples for self-testing. 865 | type SelfTestData struct { 866 | state protoimpl.MessageState 867 | sizeCache protoimpl.SizeCache 868 | unknownFields protoimpl.UnknownFields 869 | extensionFields protoimpl.ExtensionFields 870 | 871 | Samples []*SelfTestData_Sample `protobuf:"bytes,1,rep,name=samples" json:"samples,omitempty"` 872 | } 873 | 874 | func (x *SelfTestData) Reset() { 875 | *x = SelfTestData{} 876 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[2] 877 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 878 | ms.StoreMessageInfo(mi) 879 | } 880 | 881 | func (x *SelfTestData) String() string { 882 | return protoimpl.X.MessageStringOf(x) 883 | } 884 | 885 | func (*SelfTestData) ProtoMessage() {} 886 | 887 | func (x *SelfTestData) ProtoReflect() protoreflect.Message { 888 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[2] 889 | if x != nil { 890 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 891 | if ms.LoadMessageInfo() == nil { 892 | ms.StoreMessageInfo(mi) 893 | } 894 | return ms 895 | } 896 | return mi.MessageOf(x) 897 | } 898 | 899 | // Deprecated: Use SelfTestData.ProtoReflect.Descriptor instead. 900 | func (*SelfTestData) Descriptor() ([]byte, []int) { 901 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{2} 902 | } 903 | 904 | func (x *SelfTestData) GetSamples() []*SelfTestData_Sample { 905 | if x != nil { 906 | return x.Samples 907 | } 908 | return nil 909 | } 910 | 911 | // ModelProto stores model parameters. 912 | // SentencePieceProcessor is supposed to be self-contained. 913 | // All settings/parameters which may change the behavior must be encoded 914 | // in ModelProto. 915 | type ModelProto struct { 916 | state protoimpl.MessageState 917 | sizeCache protoimpl.SizeCache 918 | unknownFields protoimpl.UnknownFields 919 | extensionFields protoimpl.ExtensionFields 920 | 921 | // Sentence pieces with scores. 
922 | Pieces []*ModelProto_SentencePiece `protobuf:"bytes,1,rep,name=pieces" json:"pieces,omitempty"` 923 | // Spec used to generate this model file. 924 | TrainerSpec *TrainerSpec `protobuf:"bytes,2,opt,name=trainer_spec,json=trainerSpec" json:"trainer_spec,omitempty"` 925 | // Spec for text normalization. 926 | NormalizerSpec *NormalizerSpec `protobuf:"bytes,3,opt,name=normalizer_spec,json=normalizerSpec" json:"normalizer_spec,omitempty"` 927 | // Stores sample input and its expected segmentation to verify the model. 928 | SelfTestData *SelfTestData `protobuf:"bytes,4,opt,name=self_test_data,json=selfTestData" json:"self_test_data,omitempty"` 929 | // Spec for text de-normalization. 930 | DenormalizerSpec *NormalizerSpec `protobuf:"bytes,5,opt,name=denormalizer_spec,json=denormalizerSpec" json:"denormalizer_spec,omitempty"` 931 | } 932 | 933 | func (x *ModelProto) Reset() { 934 | *x = ModelProto{} 935 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[3] 936 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 937 | ms.StoreMessageInfo(mi) 938 | } 939 | 940 | func (x *ModelProto) String() string { 941 | return protoimpl.X.MessageStringOf(x) 942 | } 943 | 944 | func (*ModelProto) ProtoMessage() {} 945 | 946 | func (x *ModelProto) ProtoReflect() protoreflect.Message { 947 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[3] 948 | if x != nil { 949 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 950 | if ms.LoadMessageInfo() == nil { 951 | ms.StoreMessageInfo(mi) 952 | } 953 | return ms 954 | } 955 | return mi.MessageOf(x) 956 | } 957 | 958 | // Deprecated: Use ModelProto.ProtoReflect.Descriptor instead. 
959 | func (*ModelProto) Descriptor() ([]byte, []int) { 960 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{3} 961 | } 962 | 963 | func (x *ModelProto) GetPieces() []*ModelProto_SentencePiece { 964 | if x != nil { 965 | return x.Pieces 966 | } 967 | return nil 968 | } 969 | 970 | func (x *ModelProto) GetTrainerSpec() *TrainerSpec { 971 | if x != nil { 972 | return x.TrainerSpec 973 | } 974 | return nil 975 | } 976 | 977 | func (x *ModelProto) GetNormalizerSpec() *NormalizerSpec { 978 | if x != nil { 979 | return x.NormalizerSpec 980 | } 981 | return nil 982 | } 983 | 984 | func (x *ModelProto) GetSelfTestData() *SelfTestData { 985 | if x != nil { 986 | return x.SelfTestData 987 | } 988 | return nil 989 | } 990 | 991 | func (x *ModelProto) GetDenormalizerSpec() *NormalizerSpec { 992 | if x != nil { 993 | return x.DenormalizerSpec 994 | } 995 | return nil 996 | } 997 | 998 | type SelfTestData_Sample struct { 999 | state protoimpl.MessageState 1000 | sizeCache protoimpl.SizeCache 1001 | unknownFields protoimpl.UnknownFields 1002 | 1003 | Input *string `protobuf:"bytes,1,opt,name=input" json:"input,omitempty"` 1004 | Expected *string `protobuf:"bytes,2,opt,name=expected" json:"expected,omitempty"` 1005 | } 1006 | 1007 | func (x *SelfTestData_Sample) Reset() { 1008 | *x = SelfTestData_Sample{} 1009 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[4] 1010 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 1011 | ms.StoreMessageInfo(mi) 1012 | } 1013 | 1014 | func (x *SelfTestData_Sample) String() string { 1015 | return protoimpl.X.MessageStringOf(x) 1016 | } 1017 | 1018 | func (*SelfTestData_Sample) ProtoMessage() {} 1019 | 1020 | func (x *SelfTestData_Sample) ProtoReflect() protoreflect.Message { 1021 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[4] 1022 | if x != nil { 1023 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 1024 | if ms.LoadMessageInfo() == nil { 1025 | ms.StoreMessageInfo(mi) 1026 | } 1027 | return ms 1028 | } 1029 | return mi.MessageOf(x) 1030 | } 1031 | 1032 | // Deprecated: Use SelfTestData_Sample.ProtoReflect.Descriptor instead. 1033 | func (*SelfTestData_Sample) Descriptor() ([]byte, []int) { 1034 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{2, 0} 1035 | } 1036 | 1037 | func (x *SelfTestData_Sample) GetInput() string { 1038 | if x != nil && x.Input != nil { 1039 | return *x.Input 1040 | } 1041 | return "" 1042 | } 1043 | 1044 | func (x *SelfTestData_Sample) GetExpected() string { 1045 | if x != nil && x.Expected != nil { 1046 | return *x.Expected 1047 | } 1048 | return "" 1049 | } 1050 | 1051 | type ModelProto_SentencePiece struct { 1052 | state protoimpl.MessageState 1053 | sizeCache protoimpl.SizeCache 1054 | unknownFields protoimpl.UnknownFields 1055 | extensionFields protoimpl.ExtensionFields 1056 | 1057 | Piece *string `protobuf:"bytes,1,opt,name=piece" json:"piece,omitempty"` // piece must not be empty. 
1058 | Score *float32 `protobuf:"fixed32,2,opt,name=score" json:"score,omitempty"` 1059 | Type *ModelProto_SentencePiece_Type `protobuf:"varint,3,opt,name=type,enum=com.github.gomlx.go_huggingface.sentencepiece.ModelProto_SentencePiece_Type,def=1" json:"type,omitempty"` 1060 | } 1061 | 1062 | // Default values for ModelProto_SentencePiece fields. 1063 | const ( 1064 | Default_ModelProto_SentencePiece_Type = ModelProto_SentencePiece_NORMAL 1065 | ) 1066 | 1067 | func (x *ModelProto_SentencePiece) Reset() { 1068 | *x = ModelProto_SentencePiece{} 1069 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[5] 1070 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 1071 | ms.StoreMessageInfo(mi) 1072 | } 1073 | 1074 | func (x *ModelProto_SentencePiece) String() string { 1075 | return protoimpl.X.MessageStringOf(x) 1076 | } 1077 | 1078 | func (*ModelProto_SentencePiece) ProtoMessage() {} 1079 | 1080 | func (x *ModelProto_SentencePiece) ProtoReflect() protoreflect.Message { 1081 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[5] 1082 | if x != nil { 1083 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 1084 | if ms.LoadMessageInfo() == nil { 1085 | ms.StoreMessageInfo(mi) 1086 | } 1087 | return ms 1088 | } 1089 | return mi.MessageOf(x) 1090 | } 1091 | 1092 | // Deprecated: Use ModelProto_SentencePiece.ProtoReflect.Descriptor instead. 1093 | func (*ModelProto_SentencePiece) Descriptor() ([]byte, []int) { 1094 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{3, 0} 1095 | } 1096 | 1097 | func (x *ModelProto_SentencePiece) GetPiece() string { 1098 | if x != nil && x.Piece != nil { 1099 | return *x.Piece 1100 | } 1101 | return "" 1102 | } 1103 | 1104 | func (x *ModelProto_SentencePiece) GetScore() float32 { 1105 | if x != nil && x.Score != nil { 1106 | return *x.Score 1107 | } 1108 | return 0 1109 | } 1110 | 1111 | func (x *ModelProto_SentencePiece) GetType() ModelProto_SentencePiece_Type { 1112 | if x != nil && x.Type != nil { 1113 | return *x.Type 1114 | } 1115 | return Default_ModelProto_SentencePiece_Type 1116 | } 1117 | 1118 | var File_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto protoreflect.FileDescriptor 1119 | 1120 | var file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDesc = []byte{ 1121 | 0x0a, 0x61, 0x63, 0x6f, 0x6d, 0x5f, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x5f, 0x67, 0x6f, 0x6d, 1122 | 0x6c, 0x78, 0x5f, 0x67, 0x6f, 0x2d, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 0x63, 1123 | 0x65, 0x5f, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x65, 0x72, 0x73, 0x5f, 0x73, 0x65, 0x6e, 1124 | 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x70, 0x72, 0x69, 0x76, 0x61, 1125 | 0x74, 0x65, 0x5f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x73, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 1126 | 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x70, 0x72, 1127 | 0x6f, 0x74, 0x6f, 0x12, 0x2d, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 1128 | 0x67, 0x6f, 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 1129 | 0x66, 0x61, 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 1130 | 0x63, 0x65, 0x22, 0xe6, 0x12, 0x0a, 0x0b, 0x54, 0x72, 
0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 1131 | 0x65, 0x63, 0x12, 0x14, 0x0a, 0x05, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, 0x03, 0x28, 1132 | 0x09, 0x52, 0x05, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x69, 0x6e, 0x70, 0x75, 1133 | 0x74, 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 1134 | 0x69, 0x6e, 0x70, 0x75, 0x74, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x6d, 1135 | 0x6f, 0x64, 0x65, 0x6c, 0x5f, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, 0x18, 0x02, 0x20, 0x01, 0x28, 1136 | 0x09, 0x52, 0x0b, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x50, 0x72, 0x65, 0x66, 0x69, 0x78, 0x12, 0x6c, 1137 | 0x0a, 0x0a, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x18, 0x03, 0x20, 0x01, 1138 | 0x28, 0x0e, 0x32, 0x44, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 1139 | 0x67, 0x6f, 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 1140 | 0x66, 0x61, 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 1141 | 0x63, 0x65, 0x2e, 0x54, 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x2e, 0x4d, 1142 | 0x6f, 0x64, 0x65, 0x6c, 0x54, 0x79, 0x70, 0x65, 0x3a, 0x07, 0x55, 0x4e, 0x49, 0x47, 0x52, 0x41, 1143 | 0x4d, 0x52, 0x09, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x54, 0x79, 0x70, 0x65, 0x12, 0x23, 0x0a, 0x0a, 1144 | 0x76, 0x6f, 0x63, 0x61, 0x62, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 1145 | 0x3a, 0x04, 0x38, 0x30, 0x30, 0x30, 0x52, 0x09, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x53, 0x69, 0x7a, 1146 | 0x65, 0x12, 0x27, 0x0a, 0x0f, 0x61, 0x63, 0x63, 0x65, 0x70, 0x74, 0x5f, 0x6c, 0x61, 0x6e, 0x67, 1147 | 0x75, 0x61, 0x67, 0x65, 0x18, 0x05, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0e, 0x61, 0x63, 0x63, 0x65, 1148 | 0x70, 0x74, 0x4c, 0x61, 0x6e, 0x67, 0x75, 0x61, 0x67, 0x65, 0x12, 0x34, 0x0a, 0x15, 0x73, 0x65, 1149 | 0x6c, 0x66, 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x73, 1150 | 0x69, 0x7a, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x30, 0x52, 0x12, 0x73, 0x65, 1151 | 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x53, 0x69, 0x7a, 0x65, 1152 | 0x12, 0x45, 0x0a, 0x1b, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x64, 0x69, 0x66, 0x66, 0x65, 1153 | 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x70, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x18, 1154 | 0x32, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x19, 0x65, 0x6e, 1155 | 0x61, 0x62, 0x6c, 0x65, 0x44, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 1156 | 0x50, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x12, 0x4a, 0x0a, 0x20, 0x64, 0x69, 0x66, 0x66, 0x65, 1157 | 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x70, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x5f, 1158 | 0x6e, 0x6f, 0x69, 0x73, 0x65, 0x5f, 0x6c, 0x65, 0x76, 0x65, 0x6c, 0x18, 0x33, 0x20, 0x01, 0x28, 1159 | 0x02, 0x3a, 0x01, 0x30, 0x52, 0x1d, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 1160 | 0x61, 0x6c, 0x50, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x4e, 0x6f, 0x69, 0x73, 0x65, 0x4c, 0x65, 1161 | 0x76, 0x65, 0x6c, 0x12, 0x58, 0x0a, 0x27, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 1162 | 0x69, 0x61, 0x6c, 0x5f, 0x70, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x5f, 0x63, 0x6c, 0x69, 0x70, 1163 | 0x70, 0x69, 0x6e, 0x67, 0x5f, 0x74, 0x68, 0x72, 0x65, 0x73, 0x68, 0x6f, 0x6c, 0x64, 0x18, 0x34, 1164 | 0x20, 0x01, 0x28, 0x04, 0x3a, 0x01, 0x30, 0x52, 0x24, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 1165 | 
0x6e, 0x74, 0x69, 0x61, 0x6c, 0x50, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x43, 0x6c, 0x69, 0x70, 1166 | 0x70, 0x69, 0x6e, 0x67, 0x54, 0x68, 0x72, 0x65, 0x73, 0x68, 0x6f, 0x6c, 0x64, 0x12, 0x35, 0x0a, 1167 | 0x12, 0x63, 0x68, 0x61, 0x72, 0x61, 0x63, 0x74, 0x65, 0x72, 0x5f, 0x63, 0x6f, 0x76, 0x65, 0x72, 1168 | 0x61, 0x67, 0x65, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x02, 0x3a, 0x06, 0x30, 0x2e, 0x39, 0x39, 0x39, 1169 | 0x35, 0x52, 0x11, 0x63, 0x68, 0x61, 0x72, 0x61, 0x63, 0x74, 0x65, 0x72, 0x43, 0x6f, 0x76, 0x65, 1170 | 0x72, 0x61, 0x67, 0x65, 0x12, 0x31, 0x0a, 0x13, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x65, 1171 | 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x0b, 0x20, 0x01, 0x28, 1172 | 0x04, 0x3a, 0x01, 0x30, 0x52, 0x11, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x53, 0x65, 0x6e, 0x74, 0x65, 1173 | 0x6e, 0x63, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x3a, 0x0a, 0x16, 0x73, 0x68, 0x75, 0x66, 0x66, 1174 | 0x6c, 0x65, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 1175 | 0x65, 0x18, 0x13, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x14, 0x73, 1176 | 0x68, 0x75, 0x66, 0x66, 0x6c, 0x65, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x53, 0x65, 0x6e, 0x74, 0x65, 1177 | 0x6e, 0x63, 0x65, 0x12, 0x34, 0x0a, 0x14, 0x6d, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x5f, 0x73, 0x65, 1178 | 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x0c, 0x20, 0x01, 0x28, 1179 | 0x05, 0x42, 0x02, 0x18, 0x01, 0x52, 0x12, 0x6d, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x53, 0x65, 0x6e, 1180 | 0x74, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x38, 0x0a, 0x16, 0x74, 0x72, 0x61, 1181 | 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, 1182 | 0x69, 0x7a, 0x65, 0x18, 0x0d, 0x20, 0x01, 0x28, 0x05, 0x42, 0x02, 0x18, 0x01, 0x52, 0x14, 0x74, 1183 | 0x72, 0x61, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x53, 1184 | 0x69, 0x7a, 0x65, 0x12, 0x3f, 0x0a, 0x17, 0x73, 0x65, 0x65, 0x64, 0x5f, 0x73, 0x65, 0x6e, 0x74, 1185 | 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x0e, 1186 | 0x20, 0x01, 0x28, 0x05, 0x3a, 0x07, 0x31, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x52, 0x15, 0x73, 1187 | 0x65, 0x65, 0x64, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 1188 | 0x53, 0x69, 0x7a, 0x65, 0x12, 0x2f, 0x0a, 0x10, 0x73, 0x68, 0x72, 0x69, 0x6e, 0x6b, 0x69, 0x6e, 1189 | 0x67, 0x5f, 0x66, 0x61, 0x63, 0x74, 0x6f, 0x72, 0x18, 0x0f, 0x20, 0x01, 0x28, 0x02, 0x3a, 0x04, 1190 | 0x30, 0x2e, 0x37, 0x35, 0x52, 0x0f, 0x73, 0x68, 0x72, 0x69, 0x6e, 0x6b, 0x69, 0x6e, 0x67, 0x46, 1191 | 0x61, 0x63, 0x74, 0x6f, 0x72, 0x12, 0x34, 0x0a, 0x13, 0x6d, 0x61, 0x78, 0x5f, 0x73, 0x65, 0x6e, 1192 | 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x18, 0x12, 0x20, 0x01, 1193 | 0x28, 0x05, 0x3a, 0x04, 0x34, 0x31, 0x39, 0x32, 0x52, 0x11, 0x6d, 0x61, 0x78, 0x53, 0x65, 0x6e, 1194 | 0x74, 0x65, 0x6e, 0x63, 0x65, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x12, 0x23, 0x0a, 0x0b, 0x6e, 1195 | 0x75, 0x6d, 0x5f, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x18, 0x10, 0x20, 0x01, 0x28, 0x05, 1196 | 0x3a, 0x02, 0x31, 0x36, 0x52, 0x0a, 0x6e, 0x75, 0x6d, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 1197 | 0x12, 0x2f, 0x0a, 0x12, 0x6e, 0x75, 0x6d, 0x5f, 0x73, 0x75, 0x62, 0x5f, 0x69, 0x74, 0x65, 0x72, 1198 | 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x11, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x32, 0x52, 1199 | 0x10, 0x6e, 0x75, 0x6d, 0x53, 0x75, 0x62, 0x49, 
0x74, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 1200 | 0x73, 0x12, 0x3c, 0x0a, 0x18, 0x6d, 0x61, 0x78, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 1201 | 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x18, 0x14, 0x20, 1202 | 0x01, 0x28, 0x05, 0x3a, 0x02, 0x31, 0x36, 0x52, 0x16, 0x6d, 0x61, 0x78, 0x53, 0x65, 0x6e, 0x74, 1203 | 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x12, 1204 | 0x3b, 0x0a, 0x17, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x62, 0x79, 0x5f, 0x75, 0x6e, 0x69, 0x63, 1205 | 0x6f, 0x64, 0x65, 0x5f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x18, 0x15, 0x20, 0x01, 0x28, 0x08, 1206 | 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x14, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x42, 0x79, 0x55, 1207 | 0x6e, 0x69, 0x63, 0x6f, 0x64, 0x65, 0x53, 0x63, 0x72, 0x69, 0x70, 0x74, 0x12, 0x2c, 0x0a, 0x0f, 1208 | 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x62, 0x79, 0x5f, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x18, 1209 | 0x17, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x0d, 0x73, 0x70, 0x6c, 1210 | 0x69, 0x74, 0x42, 0x79, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x12, 0x34, 0x0a, 0x13, 0x73, 0x70, 1211 | 0x6c, 0x69, 0x74, 0x5f, 0x62, 0x79, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 1212 | 0x65, 0x18, 0x16, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x11, 0x73, 1213 | 0x70, 0x6c, 0x69, 0x74, 0x42, 0x79, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 1214 | 0x12, 0x42, 0x0a, 0x1a, 0x74, 0x72, 0x65, 0x61, 0x74, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 1215 | 0x70, 0x61, 0x63, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x73, 0x75, 0x66, 0x66, 0x69, 0x78, 0x18, 0x18, 1216 | 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x17, 0x74, 0x72, 0x65, 1217 | 0x61, 0x74, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x41, 0x73, 0x53, 0x75, 1218 | 0x66, 0x66, 0x69, 0x78, 0x12, 0x46, 0x0a, 0x1c, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x5f, 0x77, 0x68, 1219 | 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x5f, 0x70, 0x69, 1220 | 0x65, 0x63, 0x65, 0x73, 0x18, 0x1a, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 1221 | 0x65, 0x52, 0x19, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 1222 | 0x63, 0x65, 0x4f, 0x6e, 0x6c, 0x79, 0x50, 0x69, 0x65, 0x63, 0x65, 0x73, 0x12, 0x28, 0x0a, 0x0c, 1223 | 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x64, 0x69, 0x67, 0x69, 0x74, 0x73, 0x18, 0x19, 0x20, 0x01, 1224 | 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x0b, 0x73, 0x70, 0x6c, 0x69, 0x74, 1225 | 0x44, 0x69, 0x67, 0x69, 0x74, 0x73, 0x12, 0x3d, 0x0a, 0x19, 0x70, 0x72, 0x65, 0x74, 0x6f, 0x6b, 1226 | 0x65, 0x6e, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x64, 0x65, 0x6c, 0x69, 0x6d, 0x69, 1227 | 0x74, 0x65, 0x72, 0x18, 0x35, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x00, 0x52, 0x18, 0x70, 0x72, 0x65, 1228 | 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x44, 0x65, 0x6c, 0x69, 1229 | 0x6d, 0x69, 0x74, 0x65, 0x72, 0x12, 0x27, 0x0a, 0x0f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 1230 | 0x5f, 0x73, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x18, 0x1e, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0e, 1231 | 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x53, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x12, 0x30, 1232 | 0x0a, 0x14, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x5f, 0x73, 1233 | 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x18, 0x1f, 0x20, 0x03, 0x28, 0x09, 0x52, 0x12, 0x75, 0x73, 1234 
| 0x65, 0x72, 0x44, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x53, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 1235 | 0x12, 0x25, 0x0a, 0x0e, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x5f, 0x63, 0x68, 0x61, 1236 | 0x72, 0x73, 0x18, 0x24, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 1237 | 0x65, 0x64, 0x43, 0x68, 0x61, 0x72, 0x73, 0x12, 0x2a, 0x0a, 0x0d, 0x62, 0x79, 0x74, 0x65, 0x5f, 1238 | 0x66, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x18, 0x23, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 1239 | 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x0c, 0x62, 0x79, 0x74, 0x65, 0x46, 0x61, 0x6c, 0x6c, 0x62, 1240 | 0x61, 0x63, 0x6b, 0x12, 0x47, 0x0a, 0x1d, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x75, 0x6c, 0x61, 0x72, 1241 | 0x79, 0x5f, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x73, 1242 | 0x63, 0x6f, 0x72, 0x65, 0x18, 0x20, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 1243 | 0x52, 0x1a, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x75, 0x6c, 0x61, 0x72, 0x79, 0x4f, 0x75, 0x74, 0x70, 1244 | 0x75, 0x74, 0x50, 0x69, 0x65, 0x63, 0x65, 0x53, 0x63, 0x6f, 0x72, 0x65, 0x12, 0x2e, 0x0a, 0x10, 1245 | 0x68, 0x61, 0x72, 0x64, 0x5f, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x5f, 0x6c, 0x69, 0x6d, 0x69, 0x74, 1246 | 0x18, 0x21, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x0e, 0x68, 0x61, 1247 | 0x72, 0x64, 0x56, 0x6f, 0x63, 0x61, 0x62, 0x4c, 0x69, 0x6d, 0x69, 0x74, 0x12, 0x29, 0x0a, 0x0d, 1248 | 0x75, 0x73, 0x65, 0x5f, 0x61, 0x6c, 0x6c, 0x5f, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x18, 0x22, 0x20, 1249 | 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x0b, 0x75, 0x73, 0x65, 0x41, 1250 | 0x6c, 0x6c, 0x56, 0x6f, 0x63, 0x61, 0x62, 0x12, 0x18, 0x0a, 0x06, 0x75, 0x6e, 0x6b, 0x5f, 0x69, 1251 | 0x64, 0x18, 0x28, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x30, 0x52, 0x05, 0x75, 0x6e, 0x6b, 0x49, 1252 | 0x64, 0x12, 0x18, 0x0a, 0x06, 0x62, 0x6f, 0x73, 0x5f, 0x69, 0x64, 0x18, 0x29, 0x20, 0x01, 0x28, 1253 | 0x05, 0x3a, 0x01, 0x31, 0x52, 0x05, 0x62, 0x6f, 0x73, 0x49, 0x64, 0x12, 0x18, 0x0a, 0x06, 0x65, 1254 | 0x6f, 0x73, 0x5f, 0x69, 0x64, 0x18, 0x2a, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x32, 0x52, 0x05, 1255 | 0x65, 0x6f, 0x73, 0x49, 0x64, 0x12, 0x19, 0x0a, 0x06, 0x70, 0x61, 0x64, 0x5f, 0x69, 0x64, 0x18, 1256 | 0x2b, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x02, 0x2d, 0x31, 0x52, 0x05, 0x70, 0x61, 0x64, 0x49, 0x64, 1257 | 0x12, 0x22, 0x0a, 0x09, 0x75, 0x6e, 0x6b, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x2d, 0x20, 1258 | 0x01, 0x28, 0x09, 0x3a, 0x05, 0x3c, 0x75, 0x6e, 0x6b, 0x3e, 0x52, 0x08, 0x75, 0x6e, 0x6b, 0x50, 1259 | 0x69, 0x65, 0x63, 0x65, 0x12, 0x20, 0x0a, 0x09, 0x62, 0x6f, 0x73, 0x5f, 0x70, 0x69, 0x65, 0x63, 1260 | 0x65, 0x18, 0x2e, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x03, 0x3c, 0x73, 0x3e, 0x52, 0x08, 0x62, 0x6f, 1261 | 0x73, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x21, 0x0a, 0x09, 0x65, 0x6f, 0x73, 0x5f, 0x70, 0x69, 1262 | 0x65, 0x63, 0x65, 0x18, 0x2f, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x04, 0x3c, 0x2f, 0x73, 0x3e, 0x52, 1263 | 0x08, 0x65, 0x6f, 0x73, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x22, 0x0a, 0x09, 0x70, 0x61, 0x64, 1264 | 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x30, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x05, 0x3c, 0x70, 1265 | 0x61, 0x64, 0x3e, 0x52, 0x08, 0x70, 0x61, 0x64, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x26, 0x0a, 1266 | 0x0b, 0x75, 0x6e, 0x6b, 0x5f, 0x73, 0x75, 0x72, 0x66, 0x61, 0x63, 0x65, 0x18, 0x2c, 0x20, 0x01, 1267 | 0x28, 0x09, 0x3a, 0x05, 0x20, 0xe2, 0x81, 0x87, 0x20, 0x52, 0x0a, 0x75, 0x6e, 0x6b, 0x53, 0x75, 1268 | 0x72, 0x66, 0x61, 0x63, 0x65, 0x12, 0x46, 0x0a, 
0x1c, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x5f, 0x65, 1269 | 0x78, 0x74, 0x72, 0x65, 0x6d, 0x65, 0x6c, 0x79, 0x5f, 0x6c, 0x61, 0x72, 0x67, 0x65, 0x5f, 0x63, 1270 | 0x6f, 0x72, 0x70, 0x75, 0x73, 0x18, 0x31, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 1271 | 0x73, 0x65, 0x52, 0x19, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x45, 0x78, 0x74, 0x72, 0x65, 0x6d, 0x65, 1272 | 0x6c, 0x79, 0x4c, 0x61, 0x72, 0x67, 0x65, 0x43, 0x6f, 0x72, 0x70, 0x75, 0x73, 0x12, 0x3a, 0x0a, 1273 | 0x18, 0x73, 0x65, 0x65, 0x64, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 1274 | 0x65, 0x63, 0x65, 0x73, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x18, 0x36, 0x20, 0x01, 0x28, 0x09, 0x3a, 1275 | 0x00, 0x52, 0x16, 0x73, 0x65, 0x65, 0x64, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 1276 | 0x69, 0x65, 0x63, 0x65, 0x73, 0x46, 0x69, 0x6c, 0x65, 0x22, 0x35, 0x0a, 0x09, 0x4d, 0x6f, 0x64, 1277 | 0x65, 0x6c, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x4e, 0x49, 0x47, 0x52, 0x41, 1278 | 0x4d, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x42, 0x50, 0x45, 0x10, 0x02, 0x12, 0x08, 0x0a, 0x04, 1279 | 0x57, 0x4f, 0x52, 0x44, 0x10, 0x03, 0x12, 0x08, 0x0a, 0x04, 0x43, 0x48, 0x41, 0x52, 0x10, 0x04, 1280 | 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 0x22, 0xbd, 0x02, 0x0a, 0x0e, 1281 | 0x4e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x12, 1282 | 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 1283 | 0x6d, 0x65, 0x12, 0x31, 0x0a, 0x14, 0x70, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 1284 | 0x64, 0x5f, 0x63, 0x68, 0x61, 0x72, 0x73, 0x6d, 0x61, 0x70, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 1285 | 0x52, 0x13, 0x70, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x43, 0x68, 0x61, 1286 | 0x72, 0x73, 0x6d, 0x61, 0x70, 0x12, 0x2e, 0x0a, 0x10, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x75, 0x6d, 1287 | 0x6d, 0x79, 0x5f, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x3a, 1288 | 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x0e, 0x61, 0x64, 0x64, 0x44, 0x75, 0x6d, 0x6d, 0x79, 0x50, 1289 | 0x72, 0x65, 0x66, 0x69, 0x78, 0x12, 0x3e, 0x0a, 0x18, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x5f, 1290 | 0x65, 0x78, 0x74, 0x72, 0x61, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 1291 | 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x16, 0x72, 1292 | 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x45, 0x78, 0x74, 0x72, 0x61, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 1293 | 0x70, 0x61, 0x63, 0x65, 0x73, 0x12, 0x33, 0x0a, 0x12, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x5f, 1294 | 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 1295 | 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x11, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x57, 1296 | 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73, 0x12, 0x34, 0x0a, 0x16, 0x6e, 0x6f, 1297 | 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 1298 | 0x5f, 0x74, 0x73, 0x76, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x14, 0x6e, 0x6f, 0x72, 0x6d, 1299 | 0x61, 0x6c, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x75, 0x6c, 0x65, 0x54, 0x73, 0x76, 1300 | 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 0x22, 0xb3, 0x01, 0x0a, 0x0c, 1301 | 0x53, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x5c, 0x0a, 0x07, 1302 | 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x42, 0x2e, 1303 
| 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x67, 0x6f, 0x6d, 0x6c, 0x78, 1304 | 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 0x63, 0x65, 0x2e, 1305 | 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x53, 0x65, 1306 | 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x44, 0x61, 0x74, 0x61, 0x2e, 0x53, 0x61, 0x6d, 0x70, 0x6c, 1307 | 0x65, 0x52, 0x07, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x73, 0x1a, 0x3a, 0x0a, 0x06, 0x53, 0x61, 1308 | 0x6d, 0x70, 0x6c, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, 1309 | 0x01, 0x28, 0x09, 0x52, 0x05, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x12, 0x1a, 0x0a, 0x08, 0x65, 0x78, 1310 | 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x65, 0x78, 1311 | 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 1312 | 0x02, 0x22, 0x97, 0x06, 0x0a, 0x0a, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 1313 | 0x12, 0x5f, 0x0a, 0x06, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 1314 | 0x32, 0x47, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x67, 0x6f, 1315 | 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 1316 | 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 1317 | 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x53, 0x65, 0x6e, 0x74, 1318 | 0x65, 0x6e, 0x63, 0x65, 0x50, 0x69, 0x65, 0x63, 0x65, 0x52, 0x06, 0x70, 0x69, 0x65, 0x63, 0x65, 1319 | 0x73, 0x12, 0x5d, 0x0a, 0x0c, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x5f, 0x73, 0x70, 0x65, 1320 | 0x63, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x3a, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 1321 | 0x74, 0x68, 0x75, 0x62, 0x2e, 0x67, 0x6f, 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 1322 | 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 1323 | 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x54, 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 1324 | 0x70, 0x65, 0x63, 0x52, 0x0b, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 1325 | 0x12, 0x66, 0x0a, 0x0f, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x5f, 0x73, 1326 | 0x70, 0x65, 0x63, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x3d, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 1327 | 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x67, 0x6f, 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 1328 | 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 1329 | 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x4e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 1330 | 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x52, 0x0e, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 1331 | 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x61, 0x0a, 0x0e, 0x73, 0x65, 0x6c, 0x66, 1332 | 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 1333 | 0x32, 0x3b, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x67, 0x6f, 1334 | 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 1335 | 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 1336 | 0x2e, 0x53, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0c, 0x73, 1337 | 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x44, 
0x61, 0x74, 0x61, 0x12, 0x6a, 0x0a, 0x11, 0x64, 1338 | 0x65, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x5f, 0x73, 0x70, 0x65, 0x63, 1339 | 0x18, 0x05, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x3d, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 1340 | 0x68, 0x75, 0x62, 0x2e, 0x67, 0x6f, 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 1341 | 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 1342 | 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x4e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 1343 | 0x72, 0x53, 0x70, 0x65, 0x63, 0x52, 0x10, 0x64, 0x65, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 1344 | 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x1a, 0x86, 0x02, 0x0a, 0x0d, 0x53, 0x65, 0x6e, 0x74, 1345 | 0x65, 0x6e, 0x63, 0x65, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x70, 0x69, 0x65, 1346 | 0x63, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x70, 0x69, 0x65, 0x63, 0x65, 0x12, 1347 | 0x14, 0x0a, 0x05, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x02, 0x52, 0x05, 1348 | 0x73, 0x63, 0x6f, 0x72, 0x65, 0x12, 0x68, 0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x03, 0x20, 1349 | 0x01, 0x28, 0x0e, 0x32, 0x4c, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 1350 | 0x2e, 0x67, 0x6f, 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 1351 | 0x67, 0x66, 0x61, 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 1352 | 0x65, 0x63, 0x65, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x53, 1353 | 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x50, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x54, 0x79, 0x70, 1354 | 0x65, 0x3a, 0x06, 0x4e, 0x4f, 0x52, 0x4d, 0x41, 0x4c, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x22, 1355 | 0x54, 0x0a, 0x04, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0a, 0x0a, 0x06, 0x4e, 0x4f, 0x52, 0x4d, 0x41, 1356 | 0x4c, 0x10, 0x01, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x02, 1357 | 0x12, 0x0b, 0x0a, 0x07, 0x43, 0x4f, 0x4e, 0x54, 0x52, 0x4f, 0x4c, 0x10, 0x03, 0x12, 0x10, 0x0a, 1358 | 0x0c, 0x55, 0x53, 0x45, 0x52, 0x5f, 0x44, 0x45, 0x46, 0x49, 0x4e, 0x45, 0x44, 0x10, 0x04, 0x12, 1359 | 0x08, 0x0a, 0x04, 0x42, 0x59, 0x54, 0x45, 0x10, 0x06, 0x12, 0x0a, 0x0a, 0x06, 0x55, 0x4e, 0x55, 1360 | 0x53, 0x45, 0x44, 0x10, 0x05, 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 1361 | 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 0x42, 0x4b, 0x48, 0x03, 0x5a, 1362 | 0x47, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x6f, 0x6d, 0x6c, 1363 | 0x78, 0x2f, 0x67, 0x6f, 0x2d, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 0x63, 0x65, 1364 | 0x2f, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x65, 0x72, 0x73, 0x2f, 0x73, 0x65, 0x6e, 0x74, 1365 | 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2f, 0x70, 0x72, 0x69, 0x76, 0x61, 0x74, 1366 | 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x73, 1367 | } 1368 | 1369 | var ( 1370 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescOnce sync.Once 1371 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescData = file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDesc 1372 | ) 1373 | 1374 | func file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP() []byte { 1375 | 
file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescOnce.Do(func() { 1376 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescData = protoimpl.X.CompressGZIP(file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescData) 1377 | }) 1378 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescData 1379 | } 1380 | 1381 | var file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_enumTypes = make([]protoimpl.EnumInfo, 2) 1382 | var file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes = make([]protoimpl.MessageInfo, 6) 1383 | var file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_goTypes = []any{ 1384 | (TrainerSpec_ModelType)(0), // 0: com.github.gomlx.go_huggingface.sentencepiece.TrainerSpec.ModelType 1385 | (ModelProto_SentencePiece_Type)(0), // 1: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.SentencePiece.Type 1386 | (*TrainerSpec)(nil), // 2: com.github.gomlx.go_huggingface.sentencepiece.TrainerSpec 1387 | (*NormalizerSpec)(nil), // 3: com.github.gomlx.go_huggingface.sentencepiece.NormalizerSpec 1388 | (*SelfTestData)(nil), // 4: com.github.gomlx.go_huggingface.sentencepiece.SelfTestData 1389 | (*ModelProto)(nil), // 5: com.github.gomlx.go_huggingface.sentencepiece.ModelProto 1390 | (*SelfTestData_Sample)(nil), // 6: com.github.gomlx.go_huggingface.sentencepiece.SelfTestData.Sample 1391 | (*ModelProto_SentencePiece)(nil), // 7: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.SentencePiece 1392 | } 1393 | var file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_depIdxs = []int32{ 1394 | 0, // 0: com.github.gomlx.go_huggingface.sentencepiece.TrainerSpec.model_type:type_name -> com.github.gomlx.go_huggingface.sentencepiece.TrainerSpec.ModelType 1395 | 6, // 1: com.github.gomlx.go_huggingface.sentencepiece.SelfTestData.samples:type_name -> com.github.gomlx.go_huggingface.sentencepiece.SelfTestData.Sample 1396 | 7, // 2: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.pieces:type_name -> com.github.gomlx.go_huggingface.sentencepiece.ModelProto.SentencePiece 1397 | 2, // 3: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.trainer_spec:type_name -> com.github.gomlx.go_huggingface.sentencepiece.TrainerSpec 1398 | 3, // 4: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.normalizer_spec:type_name -> com.github.gomlx.go_huggingface.sentencepiece.NormalizerSpec 1399 | 4, // 5: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.self_test_data:type_name -> com.github.gomlx.go_huggingface.sentencepiece.SelfTestData 1400 | 3, // 6: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.denormalizer_spec:type_name -> com.github.gomlx.go_huggingface.sentencepiece.NormalizerSpec 1401 | 1, // 7: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.SentencePiece.type:type_name -> com.github.gomlx.go_huggingface.sentencepiece.ModelProto.SentencePiece.Type 1402 | 8, // [8:8] is the sub-list for method output_type 1403 | 8, // [8:8] is the sub-list for method input_type 1404 | 8, // [8:8] is the sub-list for extension type_name 1405 | 8, // [8:8] is the sub-list for extension extendee 1406 | 0, // [0:8] is the sub-list 
for field type_name 1407 | } 1408 | 1409 | func init() { 1410 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_init() 1411 | } 1412 | func file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_init() { 1413 | if File_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto != nil { 1414 | return 1415 | } 1416 | type x struct{} 1417 | out := protoimpl.TypeBuilder{ 1418 | File: protoimpl.DescBuilder{ 1419 | GoPackagePath: reflect.TypeOf(x{}).PkgPath(), 1420 | RawDescriptor: file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDesc, 1421 | NumEnums: 2, 1422 | NumMessages: 6, 1423 | NumExtensions: 0, 1424 | NumServices: 0, 1425 | }, 1426 | GoTypes: file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_goTypes, 1427 | DependencyIndexes: file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_depIdxs, 1428 | EnumInfos: file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_enumTypes, 1429 | MessageInfos: file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes, 1430 | }.Build() 1431 | File_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto = out.File 1432 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDesc = nil 1433 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_goTypes = nil 1434 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_depIdxs = nil 1435 | } 1436 | --------------------------------------------------------------------------------
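The generated ModelProto, TrainerSpec and NormalizerSpec messages above are the data structures a serialized SentencePiece model (e.g. a "tokenizer.model" file) is encoded with. As a minimal, hypothetical sketch (not part of the repository), decoding such a file only needs the google.golang.org/protobuf runtime that the generated code already relies on, plus the nil-safe getters defined above; the file name and the cap of five printed pieces are illustrative assumptions.

package main

import (
	"fmt"
	"os"

	"google.golang.org/protobuf/proto"

	"github.com/gomlx/go-huggingface/tokenizers/sentencepiece/private/protos"
)

func main() {
	// Read a serialized ModelProto, e.g. a SentencePiece "tokenizer.model" file.
	raw, err := os.ReadFile("tokenizer.model")
	if err != nil {
		panic(err)
	}

	// Unmarshal into the generated message defined in sentencepiece_model.pb.go.
	var model protos.ModelProto
	if err := proto.Unmarshal(raw, &model); err != nil {
		panic(err)
	}

	// The generated getters are nil-safe and fall back to the proto2 defaults
	// encoded in the raw descriptor (e.g. model_type defaults to UNIGRAM).
	fmt.Println("model type:", model.GetTrainerSpec().GetModelType())
	fmt.Println("pieces:", len(model.GetPieces()))
	for i, p := range model.GetPieces() {
		if i >= 5 {
			break
		}
		fmt.Printf("%q score=%.3f type=%v\n", p.GetPiece(), p.GetScore(), p.GetType())
	}
}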