├── .sccignore ├── tokenizers ├── sentencepiece │ ├── private │ │ ├── README.md │ │ └── protos │ │ │ ├── protos.go │ │ │ ├── README.md │ │ │ ├── gen_protos.sh │ │ │ ├── sentencepiece_model.proto │ │ │ └── sentencepiece_model.pb.go │ └── sentencepiece.go ├── api │ ├── api.go │ ├── config.go │ └── specialtoken_enumer.go └── tokenizers.go ├── huggingface.go ├── .gitignore ├── go.mod ├── hub ├── files_test.go ├── README.md ├── hub.go ├── info.go ├── download.go ├── repo.go └── files.go ├── internal ├── files │ └── files.go └── downloader │ ├── semaphore.go │ └── downloader.go ├── docs └── CHANGELOG.md ├── go.sum ├── README.md ├── LICENSE └── go-huggingface.ipynb /.sccignore: -------------------------------------------------------------------------------- 1 | internal/protos 2 | LICENSE 3 | .gitignore 4 | .idea 5 | -------------------------------------------------------------------------------- /tokenizers/sentencepiece/private/README.md: -------------------------------------------------------------------------------- 1 | # Private Packages: don't depend on these 2 | 3 | We don't use `internal/` because we need access to Jupyter Notebooks that we use for test and development. -------------------------------------------------------------------------------- /tokenizers/sentencepiece/private/protos/protos.go: -------------------------------------------------------------------------------- 1 | // Package protos have the Proto Buffer code for the sentencepiece_model.proto file, 2 | // downloaded from https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto. 3 | // 4 | // The Model 5 | package protos 6 | 7 | //go:generate ./gen_protos.sh 8 | -------------------------------------------------------------------------------- /huggingface.go: -------------------------------------------------------------------------------- 1 | // Package huggingface only holds the version of the set of tools to interact with HuggingFace using GoMLX. 2 | // 3 | // There are 3 main sub-packages: 4 | // 5 | // - hub: to download files from HuggingFace Hub, be it model files, tokenizers, data, etc. 6 | // - tokenizers: to create tokenizers from downloaded HuggingFace models. 7 | // - models: to convert model weights from different formats to GoMLX. 8 | package huggingface 9 | 10 | // Version of the library. 11 | // Manually kept in sync with project releases. 12 | var Version = "v0.0.0-dev" 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | go.work.sum 23 | 24 | # IDEs 25 | .idea/ 26 | 27 | # Notebooks temporary files. 
28 | .ipynb_checkpoints -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/gomlx/go-huggingface 2 | 3 | go 1.24.0 4 | 5 | require ( 6 | github.com/dustin/go-humanize v1.0.1 7 | github.com/eliben/go-sentencepiece v0.6.0 8 | github.com/gofrs/flock v0.13.0 9 | github.com/google/uuid v1.6.0 10 | github.com/pkg/errors v0.9.1 11 | github.com/stretchr/testify v1.11.1 12 | google.golang.org/protobuf v1.36.10 13 | ) 14 | 15 | require ( 16 | github.com/davecgh/go-spew v1.1.1 // indirect 17 | github.com/kr/text v0.2.0 // indirect 18 | github.com/pmezard/go-difflib v1.0.0 // indirect 19 | github.com/rogpeppe/go-internal v1.14.1 // indirect 20 | golang.org/x/sys v0.38.0 // indirect 21 | gopkg.in/yaml.v3 v3.0.1 // indirect 22 | ) 23 | -------------------------------------------------------------------------------- /tokenizers/sentencepiece/private/protos/README.md: -------------------------------------------------------------------------------- 1 | # Proto Files 2 | 3 | * [`sentencepiece_model.proto`](https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto) is 4 | downloaded from the C++ original source, in [https://github.com/google/sentencepiece/](https://github.com/google/sentencepiece), 5 | but it should match the one used by the [github.com/eliben/go-sentencepiece](https://github.com/eliben/go-sentencepiece) 6 | library. 7 | 8 | Because of protoc unique file naming requirement (!?), described in email thread in https://groups.google.com/g/protobuf/c/UWWuoRWz1Uk, 9 | we compile by first creating a unique prefix directory. See `gen_protos.sh` script. 10 | -------------------------------------------------------------------------------- /tokenizers/sentencepiece/private/protos/gen_protos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Find Go import path: presumably unique. 5 | import_path="$(go list -f '{{.ImportPath}}')" 6 | 7 | # Extract the domain and the rest of the path 8 | domain=$(echo "$import_path" | cut -d '/' -f 1) 9 | rest_of_path=$(echo "$import_path" | cut -d '/' -f 2-) 10 | 11 | # Reverse the domain part (split by '.') 12 | reversed_domain=$(echo "$domain" | awk -F '.' '{ for (i=NF; i>1; i--) printf "%s.", $i; print $1 }' | sed 's/\.$//') 13 | 14 | # Combine the reversed domain with the rest of the path 15 | tmp_link="$reversed_domain/$rest_of_path" 16 | tmp_link=$(echo "$tmp_link" | tr '/.' '__') 17 | rm -f "${tmp_link}" 18 | ln -s . "${tmp_link}" 19 | protoc --go_out=. 
--go_opt=paths=source_relative "./${tmp_link}/sentencepiece_model.proto" 20 | rm -f "${tmp_link}" 21 | -------------------------------------------------------------------------------- /hub/files_test.go: -------------------------------------------------------------------------------- 1 | package hub 2 | 3 | import ( 4 | "fmt" 5 | "github.com/stretchr/testify/assert" 6 | "path/filepath" 7 | "testing" 8 | ) 9 | 10 | func TestCleanRelativeFilePath(t *testing.T) { 11 | testCases := []struct { 12 | input string 13 | expected string 14 | }{ 15 | {"foo/bar", "foo/bar"}, 16 | {"foo/../bar", "bar"}, 17 | {"foo/./bar", "foo/bar"}, 18 | {"/foo/bar", "foo/bar"}, 19 | {"foo//bar", "foo/bar"}, 20 | {"foo/bar/..", "foo"}, 21 | {"../foo/bar", "foo/bar"}, 22 | {"foo/../../../..", "."}, 23 | {"foo/../../../bar", "bar"}, 24 | {"", "."}, 25 | {".", "."}, 26 | {"..", "."}, 27 | } 28 | 29 | for _, tc := range testCases { 30 | expected := filepath.FromSlash(tc.expected) 31 | got := cleanRelativeFilePath(tc.input) 32 | fmt.Printf("\tcleanRelativeFilePath(%q) = %q\n", tc.input, got) 33 | assert.Equal(t, expected, got) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /tokenizers/api/api.go: -------------------------------------------------------------------------------- 1 | // Package api defines the Tokenizer API. 2 | // It's just a hack to break the cyclic dependency, and allow the users to import `tokenizers` and get the 3 | // default implementations. 4 | package api 5 | 6 | // Tokenizer interface allows one to convert text to "tokens" (integer ids) and back. 7 | // 8 | // It also allows mapping of special tokens: tokens with a common semantic (like padding) but that 9 | // may map to different ids (int) for different tokenizers. 10 | type Tokenizer interface { 11 | Encode(text string) []int 12 | Decode([]int) string 13 | 14 | // SpecialTokenID returns the ID for the given special token if registered, or an error if not. 15 | SpecialTokenID(token SpecialToken) (int, error) 16 | } 17 | 18 | // SpecialToken is an enum of commonly used special tokens. 19 | type SpecialToken int 20 | 21 | const ( 22 | TokBeginningOfSentence SpecialToken = iota 23 | TokEndOfSentence 24 | TokUnknown 25 | TokPad 26 | TokMask 27 | TokClassification 28 | TokSpecialTokensCount 29 | ) 30 | 31 | //go:generate enumer -type=SpecialToken -trimprefix=Tok -transform=snake -values -text -json -yaml api.go 32 | -------------------------------------------------------------------------------- /internal/files/files.go: -------------------------------------------------------------------------------- 1 | // Package files implements generic file tools missing from the standard library. 2 | package files 3 | 4 | import ( 5 | "github.com/pkg/errors" 6 | "os" 7 | "os/user" 8 | "path" 9 | "strings" 10 | ) 11 | 12 | // Exists returns true if file or directory exists. 13 | func Exists(filePath string) bool { 14 | _, err := os.Stat(filePath) 15 | return err == nil 16 | } 17 | 18 | // ReplaceTildeInDir by the user's home directory. Returns dir if it doesn't start with "~".
19 | // 20 | // It returns an error if `dir` has an unknown user (e.g: `~unknown/...`) 21 | func ReplaceTildeInDir(dir string) (string, error) { 22 | if len(dir) == 0 { 23 | return dir, nil 24 | } 25 | if dir[0] != '~' { 26 | return dir, nil 27 | } 28 | var userName string 29 | if dir != "~" && !strings.HasPrefix(dir, "~/") { 30 | sepIdx := strings.IndexRune(dir, '/') 31 | if sepIdx == -1 { 32 | userName = dir[1:] 33 | } else { 34 | userName = dir[1:sepIdx] 35 | } 36 | } 37 | var usr *user.User 38 | var err error 39 | if userName == "" { 40 | usr, err = user.Current() 41 | } else { 42 | usr, err = user.Lookup(userName) 43 | } 44 | if err != nil { 45 | return dir, errors.Wrapf(err, "failed to lookup home directory for user in path %q", dir) 46 | } 47 | homeDir := usr.HomeDir 48 | return path.Join(homeDir, dir[1+len(userName):]), nil 49 | } 50 | -------------------------------------------------------------------------------- /hub/README.md: -------------------------------------------------------------------------------- 1 | # hub package 2 | Downloads HuggingFace Hub files, a port of huggingFace_hub python library to Go. 3 | 4 | ## Introduction 5 | 6 | A simple, straight-forward port of [github.com/huggingface/huggingface_hub](https://github.com/huggingface/huggingface_hub) library for Go. 7 | 8 | Features supported: 9 | 10 | - Cache system that matches HuggingFace Hub, so the same cache can be shared with Python. 11 | - Concurrency safe: only one download when multiple workers are trying to download simultaneously the same model. 12 | - Allow arbitrary progress function to be called (for progress bar). 13 | - Arbitrary revision. 14 | - Parallel download of files, max=20 by default. 15 | 16 | TODOs: 17 | 18 | - Add support for optional parameters. 19 | - Authentication tokens: should be relatively easy. 20 | - Resume downloads from interrupted connections. 21 | - Check disk-space before starting to download. 22 | 23 | ## Example 24 | 25 | Enumerate files from a HuggingFace repository and download all of them to a cache. 26 | 27 | ```go 28 | repo := hub.New(modelID).WithAuth(hfAuthToken) 29 | var fileNames []string 30 | for fileName, err := range repo.IterFileNames() { 31 | if err != nil { panic(err) } 32 | fmt.Printf("\t%s\n", fileName) 33 | fileNames = append(fileNames, fileName) 34 | } 35 | downloadedFiles, err := repo.DownloadFiles(fileNames...) 36 | if err != nil { ... } 37 | ``` -------------------------------------------------------------------------------- /docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # `go-huggingface` Changelog 2 | 3 | ## v0.3.1 4 | 5 | - Fixed go.mod/go.sum. 6 | 7 | ## v0.3.0 8 | 9 | - Bumped the version of GoMLX in tests and documentation. 10 | - Bumped version of dependencies: including github.com/daulet/tokenizers, which requires a fresh download of the 11 | corresponding c++ library libtokenizers.a. 12 | 13 | ## v0.2.2 14 | 15 | * Fixed file truncation issues during download. 16 | 17 | ## v0.2.1 18 | 19 | * Forcefully refresh (download) the revision's hash at least once before using. 20 | 21 | ## v0.2.0 22 | 23 | * Add Windows support by moving to the cross-platform flock: see PR #6, thanks to @mrmichaeladavis 24 | 25 | ## v0.1.2 26 | 27 | * If verbosity is 0, it won't print progress. 28 | * Added support for custom end-points. Default being "https://huggingface.co" or the environment variable 29 | `$HF_ENDPOINT` if defined. 30 | 31 | ## v0.1.1 32 | 33 | * Fixed URL resolution of non-model repos. 
34 | * Fixed sentencepiece Tokenizer and tokenizer API string methods (using `enumer`). 35 | * Added dataset example. 36 | * Added usage with Rust tokenizer. 37 | * Improved README.md 38 | * Added SentencePiece proto support – to be used in future conversion of SentencePiece models. 39 | * Improved documentation. 40 | 41 | ## v0.1.0 42 | 43 | * package `hub`: inspect and download files from arbitrary repos. Very functional. 44 | * package `tokenizers`: 45 | * Interfaces, types and constants. 46 | * Gemma tokenizer implementation. 47 | * Not any other tokenizer implemented yet. 48 | * Examples in `README.md`. 49 | -------------------------------------------------------------------------------- /internal/downloader/semaphore.go: -------------------------------------------------------------------------------- 1 | package downloader 2 | 3 | import "sync" 4 | 5 | // Semaphore that allows dynamic resizing. 6 | // 7 | // It uses a sync.Cond to allow dynamic resizing, but it will be slower than a pure channel implementation 8 | // with a fixed capacity. This cost shouldn't matter for more coarse resource control. 9 | // 10 | // Implementation copied from github.com/gomlx/gomlx/pkg/support/xsync. 11 | type Semaphore struct { 12 | cond sync.Cond 13 | capacity, current int // Tracks capacity and current usage. 14 | } 15 | 16 | // NewSemaphore returns a Semaphore that allows at most capacity simultaneous acquisitions. 17 | // If capacity <= 0, there is no limit on acquisitions. 18 | // 19 | // FIFO ordering may be lost during resizes (Semaphore.Resize) to larger capacity, but otherwise it is respected. 20 | func NewSemaphore(capacity int) *Semaphore { 21 | return &Semaphore{ 22 | cond: sync.Cond{L: &sync.Mutex{}}, 23 | capacity: capacity, 24 | } 25 | } 26 | 27 | // Acquire resource observing current semaphore capacity. 28 | // It must be matched by exactly one call to Semaphore.Release after the reservation is no longer needed. 29 | func (s *Semaphore) Acquire() { 30 | s.cond.L.Lock() 31 | defer s.cond.L.Unlock() 32 | for { 33 | if s.capacity <= 0 || s.current < s.capacity { 34 | // No limits. 35 | s.current++ 36 | return 37 | } 38 | s.cond.Wait() 39 | } 40 | } 41 | 42 | // Release resource previously allocated with Semaphore.Acquire. 43 | func (s *Semaphore) Release() { 44 | s.cond.L.Lock() 45 | defer s.cond.L.Unlock() 46 | s.current-- 47 | if s.capacity == 0 || s.current < s.capacity-1 { 48 | return 49 | } 50 | s.cond.Signal() 51 | } 52 | 53 | // Resize the number of available resources in the Semaphore. 54 | // 55 | // If the newCapacity is larger than the previous one, this may immediately allow pending Semaphore.Acquire to proceed. 56 | // Notice since all waiting Semaphore.Acquire are awoken (broadcast), the queue order may be lost. 57 | // 58 | // If the newCapacity is smaller than the previous one, it doesn't have any effect on current acquisitions. So if the Semaphore 59 | // is being used to control a worker pool, reducing its size won't stop workers currently executing. 60 | func (s *Semaphore) Resize(newCapacity int) { 61 | s.cond.L.Lock() 62 | defer s.cond.L.Unlock() 63 | if newCapacity == s.capacity { 64 | return // No change needed. 65 | } 66 | if (newCapacity > 0 && newCapacity < s.capacity) || s.capacity == 0 { 67 | // Capacity is shrinking, no Semaphore.Acquire will be released. 
68 | s.capacity = newCapacity 69 | return 70 | } 71 | 72 | // Wake-up everyone -- to preserve the queue order, we would need to call s.cond.Signal() for the amount of 73 | // increased capacity, but that would make this call O(capacity), potentially slow for large capacities. 74 | s.capacity = newCapacity 75 | s.cond.Broadcast() 76 | } 77 | -------------------------------------------------------------------------------- /tokenizers/sentencepiece/sentencepiece.go: -------------------------------------------------------------------------------- 1 | // Package sentencepiece implements a tokenizers.Tokenizer based on SentencePiece tokenizer. 2 | package sentencepiece 3 | 4 | import ( 5 | esentencepiece "github.com/eliben/go-sentencepiece" 6 | "github.com/gomlx/go-huggingface/hub" 7 | "github.com/gomlx/go-huggingface/tokenizers/api" 8 | "github.com/pkg/errors" 9 | ) 10 | 11 | // New creates a SentencePiece tokenizer based on the "tokenizer.model" file, which must be a 12 | // SentencePiece Model proto (see protos.Model). 13 | // 14 | // It implements the tokenizers.TokenizerConstructor function signature. 15 | func New(config *api.Config, repo *hub.Repo) (api.Tokenizer, error) { 16 | if !repo.HasFile("tokenizer.model") { 17 | return nil, errors.Errorf("\"tokenizer.model\" file not found in repo") 18 | } 19 | tokenizerFile, err := repo.DownloadFile("tokenizer.model") 20 | if err != nil { 21 | return nil, errors.Wrapf(err, "can't download tokenizer.model file") 22 | } 23 | proc, err := esentencepiece.NewProcessorFromPath(tokenizerFile) 24 | if err != nil { 25 | return nil, errors.Wrapf(err, "can't create sentencepiece tokenizer") 26 | } 27 | return &Tokenizer{ 28 | Processor: proc, 29 | Info: proc.ModelInfo(), 30 | }, nil 31 | } 32 | 33 | // Tokenizer implements tokenizers.Tokenizer interface based on SentencePiece tokenizer by Google. 34 | type Tokenizer struct { 35 | *esentencepiece.Processor 36 | Info *esentencepiece.ModelInfo 37 | } 38 | 39 | // Compile time assert that sentencepiece.Tokenizer implements tokenizers.Tokenizer interface. 40 | var _ api.Tokenizer = &Tokenizer{} 41 | 42 | // Encode returns the text encoded into a sequence of ids. 43 | // It implements sampler.Vocabulary. 44 | func (p *Tokenizer) Encode(text string) []int { 45 | tokens := p.Processor.Encode(text) 46 | return sliceMap(tokens, func(t esentencepiece.Token) int { return t.ID }) 47 | } 48 | 49 | // Decode returns the text from a sequence of ids. 50 | // It implements sampler.Vocabulary. 51 | func (p *Tokenizer) Decode(ids []int) string { 52 | return p.Processor.Decode(ids) 53 | } 54 | 55 | // SpecialTokenID returns the token for the given symbol, or an error if not known. 56 | func (p *Tokenizer) SpecialTokenID(token api.SpecialToken) (int, error) { 57 | switch token { 58 | case api.TokUnknown: 59 | return p.Info.UnknownID, nil 60 | case api.TokPad: 61 | return p.Info.PadID, nil 62 | case api.TokBeginningOfSentence: 63 | return p.Info.BeginningOfSentenceID, nil 64 | case api.TokEndOfSentence: 65 | return p.Info.EndOfSentenceID, nil 66 | default: 67 | return 0, errors.Errorf("unknown special token: %s (%d)", token, int(token)) 68 | } 69 | } 70 | 71 | // sliceMap executes the given function sequentially for every element of in, and returns a mapped slice.
72 | func sliceMap[In, Out any](in []In, fn func(e In) Out) (out []Out) { 73 | out = make([]Out, len(in)) 74 | for ii, e := range in { 75 | out[ii] = fn(e) 76 | } 77 | return 78 | } 79 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 2 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 3 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= 5 | github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= 6 | github.com/eliben/go-sentencepiece v0.6.0 h1:wbnefMCxYyVYmeTVtiMJet+mS9CVwq5klveLpfQLsnk= 7 | github.com/eliben/go-sentencepiece v0.6.0/go.mod h1:nNYk4aMzgBoI6QFp4LUG8Eu1uO9fHD9L5ZEre93o9+c= 8 | github.com/gofrs/flock v0.13.0 h1:95JolYOvGMqeH31+FC7D2+uULf6mG61mEZ/A8dRYMzw= 9 | github.com/gofrs/flock v0.13.0/go.mod h1:jxeyy9R1auM5S6JYDBhDt+E2TCo7DkratH4Pgi8P+Z0= 10 | github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= 11 | github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= 12 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 13 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 14 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 15 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 16 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 17 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 18 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 19 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 20 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 21 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 22 | github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= 23 | github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= 24 | github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= 25 | github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= 26 | golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= 27 | golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= 28 | google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= 29 | google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= 30 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 31 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 32 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 33 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 34 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 35 | -------------------------------------------------------------------------------- /tokenizers/api/config.go: 
-------------------------------------------------------------------------------- 1 | package api 2 | 3 | import ( 4 | "encoding/json" 5 | "github.com/pkg/errors" 6 | "os" 7 | ) 8 | 9 | type TokensDecoder struct { 10 | Content string `json:"content"` 11 | Lstrip bool `json:"lstrip"` 12 | Normalized bool `json:"normalized"` 13 | Rstrip bool `json:"rstrip"` 14 | SingleWord bool `json:"single_word"` 15 | Special bool `json:"special"` 16 | } 17 | 18 | // Config struct to hold HuggingFace's tokenizer_config.json contents. 19 | // There is no formal schema for this file, but these are some common fields that may be of use. 20 | // Specific tokenizer classes are free to implement additional features as they see fit. 21 | // 22 | // The extra field ConfigFile holds the path to the file with the full config. 23 | type Config struct { 24 | ConfigFile string 25 | TokenizerClass string `json:"tokenizer_class"` 26 | 27 | ChatTemplate string `json:"chat_template"` 28 | UseDefaultSystemPrompt bool `json:"use_default_system_prompt"` 29 | 30 | ModelMaxLength float64 `json:"model_max_length"` 31 | MaxLength float64 `json:"max_length"` 32 | SpModelKwargs map[string]any `json:"sp_model_kwargs"` 33 | 34 | ClsToken string `json:"cls_token"` 35 | UnkToken string `json:"unk_token"` 36 | SepToken string `json:"sep_token"` 37 | MaskToken string `json:"mask_token"` 38 | BosToken string `json:"bos_token"` 39 | EosToken string `json:"eos_token"` 40 | PadToken string `json:"pad_token"` 41 | 42 | AddBosToken bool `json:"add_bos_token"` 43 | AddEosToken bool `json:"add_eos_token"` 44 | AddedTokensDecoder map[int]TokensDecoder `json:"added_tokens_decoder"` 45 | AdditionalSpecialTokens []string `json:"additional_special_tokens"` 46 | 47 | DoLowerCase bool `json:"do_lower_case"` 48 | CleanUpTokenizationSpaces bool `json:"clean_up_tokenization_spaces"` 49 | SpacesBetweenSpecialTokens bool `json:"spaces_between_special_tokens"` 50 | 51 | TokenizeChineseChars bool `json:"tokenize_chinese_chars"` 52 | StripAccents any `json:"strip_accents"` 53 | NameOrPath string `json:"name_or_path"` 54 | DoBasicTokenize bool `json:"do_basic_tokenize"` 55 | NeverSplit any `json:"never_split"` 56 | 57 | Stride int `json:"stride"` 58 | TruncationSide string `json:"truncation_side"` 59 | TruncationStrategy string `json:"truncation_strategy"` 60 | } 61 | 62 | // ParseConfigFile parses the given file (holding a tokenizer_config.json file) into a Config structure. 63 | func ParseConfigFile(filePath string) (*Config, error) { 64 | content, err := os.ReadFile(filePath) 65 | if err != nil { 66 | return nil, errors.Wrapf(err, "failed to read file %q", filePath) 67 | } 68 | config, err := ParseConfigContent(content) 69 | if err != nil { 70 | return nil, errors.WithMessagef(err, "read from file %q", filePath) 71 | } 72 | config.ConfigFile = filePath 73 | return config, nil 74 | } 75 | 76 | // ParseConfigContent parses the given json content (of a tokenizer_config.json file) into a Config structure. 
77 | func ParseConfigContent(jsonContent []byte) (*Config, error) { 78 | config := &Config{} 79 | err := json.Unmarshal(jsonContent, config) 80 | if err != nil { 81 | return nil, errors.Wrapf(err, "failed to parse tokenizer_config json content") 82 | } 83 | return config, nil 84 | } 85 | -------------------------------------------------------------------------------- /hub/hub.go: -------------------------------------------------------------------------------- 1 | // Package hub can be used to download and cache files from HuggingFace Hub, which may 2 | // be models, tokenizers or anything. 3 | // 4 | // It is meant to be a port of huggingFace_hub python library to Go, and be able to share the same 5 | // cache structure (usually under "~/.cache/huggingface/hub"). 6 | // 7 | // It is also safe to be used concurrently by multiple programs -- it uses file system lock to control concurrency. 8 | // 9 | // Typical usage will be something like: 10 | // 11 | // repo := hub.New(modelID).WithAuth(hfAuthToken) 12 | // var fileNames []string 13 | // for fileName, err := range repo.IterFileNames() { 14 | // if err != nil { panic(err) } 15 | // fmt.Printf("\t%s\n", fileName) 16 | // fileNames = append(fileNames, fileName) 17 | // } 18 | // downloadedFiles, err := repo.DownloadFiles(fileNames...) 19 | // if err != nil { ... } 20 | // 21 | // From here, downloadedFiles will point to files in the local cache that one can read. 22 | // 23 | // Environment variables: 24 | // 25 | // - HF_ENDPOINT: Where to connect to huggingface, default is https://huggingface.co 26 | // - XDG_CACHE_HOME: Cache directory, defaults to ${HOME}/.cache 27 | package hub 28 | 29 | import ( 30 | "fmt" 31 | "github.com/gomlx/go-huggingface" 32 | "github.com/google/uuid" 33 | "github.com/pkg/errors" 34 | "os" 35 | "path" 36 | "runtime" 37 | "strings" 38 | ) 39 | 40 | // SessionId is unique and always created anew at the start of the program, and used during the life of the program. 41 | var SessionId string 42 | 43 | // panicf generates an error message and panics with it, in one function. 44 | func panicf(format string, args ...any) { 45 | err := errors.Errorf(format, args...) 46 | panic(err) 47 | } 48 | 49 | func init() { 50 | sessionUUID, err := uuid.NewRandom() 51 | if err != nil { 52 | panicf("failed generating UUID for SessionId: %v", err) 53 | } 54 | SessionId = strings.Replace(sessionUUID.String(), "-", "", -1) 55 | } 56 | 57 | var ( 58 | // DefaultDirCreationPerm is used when creating new cache subdirectories. 59 | DefaultDirCreationPerm = os.FileMode(0755) 60 | 61 | // DefaultFileCreationPerm is used when creating files inside the cache subdirectories. 62 | DefaultFileCreationPerm = os.FileMode(0644) 63 | ) 64 | 65 | const ( 66 | tokenizersVersion = "0.0.1" 67 | ) 68 | 69 | const ( 70 | HeaderXRepoCommit = "X-Repo-Commit" 71 | HeaderXLinkedETag = "X-Linked-Etag" 72 | HeaderXLinkedSize = "X-Linked-Size" 73 | ) 74 | 75 | func getEnvOr(key, defaultValue string) string { 76 | v := os.Getenv(key) 77 | if v == "" { 78 | return defaultValue 79 | } 80 | return v 81 | } 82 | 83 | // DefaultCacheDir for HuggingFace Hub, same used by the python library. 84 | // 85 | // Its prefix is either `${XDG_CACHE_HOME}` if set, or `~/.cache` otherwise. Followed by `/huggingface/hub/`. 86 | // So typically: `~/.cache/huggingface/hub/`. 
87 | func DefaultCacheDir() string { 88 | cacheDir := getEnvOr("XDG_CACHE_HOME", path.Join(os.Getenv("HOME"), ".cache")) 89 | cacheDir = path.Join(cacheDir, "huggingface", "hub") 90 | return cacheDir 91 | } 92 | 93 | // DefaultHttpUserAgent returns a user agent to use with HuggingFace Hub API. 94 | func DefaultHttpUserAgent() string { 95 | return fmt.Sprintf("go-huggingface/%v; golang/%s; session_id/%s", 96 | huggingface.Version, runtime.Version(), SessionId) 97 | } 98 | 99 | // RepoIdSeparator is used to separate repository/model names parts when mapping to file names. 100 | // Likely only for internal use. 101 | const RepoIdSeparator = "--" 102 | 103 | // RepoType supported by HuggingFace-Hub 104 | type RepoType string 105 | 106 | const ( 107 | RepoTypeDataset RepoType = "datasets" 108 | RepoTypeSpace RepoType = "spaces" 109 | RepoTypeModel RepoType = "models" 110 | ) 111 | -------------------------------------------------------------------------------- /hub/info.go: -------------------------------------------------------------------------------- 1 | package hub 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "encoding/json" 7 | "fmt" 8 | "log" 9 | "os" 10 | "path" 11 | 12 | "github.com/gomlx/go-huggingface/internal/files" 13 | "github.com/pkg/errors" 14 | ) 15 | 16 | // RepoInfo holds information about a HuggingFace repo, it is the json served when hitting the URL 17 | // https://huggingface.co/api// 18 | // 19 | // TODO: Not complete, only holding the fields used so far by the library. 20 | type RepoInfo struct { 21 | ID string `json:"id"` 22 | ModelID string `json:"model_id"` 23 | Author string `json:"author"` 24 | CommitHash string `json:"sha"` 25 | Tags []string `json:"tags"` 26 | Siblings []*FileInfo `json:"siblings"` 27 | SafeTensors SafeTensorsInfo `json:"safetensors"` 28 | } 29 | 30 | // FileInfo represents one of the model file, in the Info structure. 31 | type FileInfo struct { 32 | Name string `json:"rfilename"` 33 | } 34 | 35 | // SafeTensorsInfo holds counts on number of parameters of various types. 36 | type SafeTensorsInfo struct { 37 | Total int 38 | 39 | // Parameters: maps dtype name to int. 40 | Parameters map[string]int 41 | } 42 | 43 | // Info returns the RepoInfo structure about the model. 44 | // Most users don't need to call this directly, instead use the various iterators. 45 | // 46 | // If it hasn't been downloaded or loaded from the cache yet, it loads it first. 47 | // 48 | // It may return nil if there was an issue with the downloading of the RepoInfo json from HuggingFace. 49 | // Try DownloadInfo to get an error. 50 | func (r *Repo) Info() *RepoInfo { 51 | if r.info == nil { 52 | err := r.DownloadInfo(false) 53 | if err != nil { 54 | log.Printf("Error while downloading info about Repo: %+v", err) 55 | } 56 | } 57 | return r.info 58 | } 59 | 60 | // infoURL for the API that returns the info about a repository. 61 | func (r *Repo) infoURL() string { 62 | return fmt.Sprintf("%s/api/%s/%s/revision/%s", r.hfEndpoint, r.repoType, r.ID, r.revision) 63 | } 64 | 65 | // DownloadInfo about the model, if it hasn't yet. 66 | // 67 | // It will attempt to use the "_info_.json" file in the cache directory first. 68 | // 69 | // If forceDownload is set to true, it ignores the current info or the cached one, and download it again from HuggingFace. 70 | // 71 | // See Repo.Info to access the Info directory. 72 | // Most users don't need to call this directly, instead use the various iterators. 
73 | func (r *Repo) DownloadInfo(forceDownload bool) error { 74 | if r.info != nil && !forceDownload { 75 | return nil 76 | } 77 | 78 | // Create directory and file path for the info file. 79 | infoFilePath, err := r.repoCacheDir() 80 | if err != nil { 81 | return err 82 | } 83 | infoFilePath = path.Join(infoFilePath, "info") 84 | if err = os.MkdirAll(infoFilePath, DefaultDirCreationPerm); err != nil { 85 | return errors.Wrapf(err, "while creating info directory %q", infoFilePath) 86 | } 87 | infoFilePath = path.Join(infoFilePath, r.revision) 88 | 89 | // Download info file if needed. 90 | if !files.Exists(infoFilePath) || forceDownload { 91 | err := r.lockedDownload(context.Background(), r.infoURL(), infoFilePath, forceDownload, nil) 92 | if err != nil { 93 | return errors.WithMessagef(err, "failed to download repository info") 94 | } 95 | } 96 | 97 | // Read the cached info file from disk. 98 | infoJson, err := os.ReadFile(infoFilePath) 99 | if err != nil { 100 | return errors.Wrapf(err, "failed to read info for model from disk in %q -- remove the file if you want to have it re-downloaded", 101 | infoFilePath) 102 | } 103 | 104 | decoder := json.NewDecoder(bytes.NewReader(infoJson)) 105 | newInfo := &RepoInfo{} 106 | if err = decoder.Decode(newInfo); err != nil { 107 | return errors.Wrapf(err, "failed to parse info for model in %q (downloaded from %q)", 108 | infoFilePath, r.infoURL()) 109 | } 110 | r.info = newInfo 111 | return nil 112 | } 113 | -------------------------------------------------------------------------------- /tokenizers/tokenizers.go: -------------------------------------------------------------------------------- 1 | // Package tokenizers creates tokenizers from HuggingFace models. 2 | // 3 | // Given a HuggingFace repository (see hub.New to create one), tokenizers will use its "tokenizer_config.json" 4 | // and "tokenizer.json" to instantiate a Tokenizer. 5 | package tokenizers 6 | 7 | import ( 8 | "github.com/gomlx/go-huggingface/hub" 9 | "github.com/gomlx/go-huggingface/tokenizers/api" 10 | "github.com/gomlx/go-huggingface/tokenizers/sentencepiece" 11 | "github.com/pkg/errors" 12 | 13 | // Blank import. 14 | _ "github.com/gomlx/go-huggingface/tokenizers/sentencepiece" 15 | ) 16 | 17 | // Tokenizer interface allows one to convert text to "tokens" (integer ids) and back. 18 | // 19 | // It also allows mapping of special tokens: tokens with a common semantic (like padding) but that 20 | // may map to different ids (int) for different tokenizers. 21 | type Tokenizer = api.Tokenizer 22 | 23 | // SpecialToken is an enum of commonly used special tokens. 24 | type SpecialToken = api.SpecialToken 25 | 26 | const ( 27 | TokBeginningOfSentence = api.TokBeginningOfSentence 28 | TokEndOfSentence = api.TokEndOfSentence 29 | TokUnknown = api.TokUnknown 30 | TokPad = api.TokPad 31 | TokMask = api.TokMask 32 | TokClassification = api.TokClassification 33 | TokSpecialTokensCount = api.TokSpecialTokensCount 34 | ) 35 | 36 | // New creates a new tokenizer from the given HuggingFace repo (see hub.New). 37 | // 38 | // Currently, it only supports "SentencePiece" encoders, and it attempts to download details from 39 | // the repo files "tokenizer_config.json" and "tokenizer.json". 40 | // 41 | // If it fails to load those files, or to create a tokenizer, it returns an error.
42 | func New(repo *hub.Repo) (Tokenizer, error) { 43 | err := repo.DownloadInfo(false) 44 | if err != nil { 45 | return nil, err 46 | } 47 | 48 | config, err := GetConfig(repo) 49 | if err != nil { 50 | return nil, err 51 | } 52 | 53 | constructor, found := registerOfClasses[config.TokenizerClass] 54 | if !found { 55 | return nil, errors.Errorf("unknown tokenizer class %q", config.TokenizerClass) 56 | } 57 | return constructor(config, repo) 58 | } 59 | 60 | // GetConfig returns the parsed "tokenizer_config.json" Config object for the repo. 61 | func GetConfig(repo *hub.Repo) (*api.Config, error) { 62 | err := repo.DownloadInfo(false) 63 | if err != nil { 64 | return nil, err 65 | } 66 | localConfigFile, err := repo.DownloadFile("tokenizer_config.json") 67 | if err != nil { 68 | return nil, err 69 | } 70 | config, err := api.ParseConfigFile(localConfigFile) // tokenizer_config.json 71 | if err != nil { 72 | return nil, err 73 | } 74 | return config, nil 75 | } 76 | 77 | // Config struct to hold HuggingFace's tokenizer_config.json contents. 78 | // There is no formal schema for this file, but these are some common fields that may be of use. 79 | // Specific tokenizer classes are free to implement additional features as they see fit. 80 | // 81 | // The extra field ConfigFile holds the path to the file with the full config. 82 | type Config = api.Config 83 | 84 | // TokenizerConstructor is used by Tokenizer implementations to provide implementations for different 85 | // tokenizer classes. 86 | type TokenizerConstructor func(config *api.Config, repo *hub.Repo) (api.Tokenizer, error) 87 | 88 | // RegisterTokenizerClass used by Tokenizer implementations. 89 | func RegisterTokenizerClass(name string, constructor TokenizerConstructor) { 90 | registerOfClasses[name] = constructor 91 | } 92 | 93 | var ( 94 | registerOfClasses = make(map[string]TokenizerConstructor) 95 | ) 96 | 97 | func init() { 98 | // Initialize sentencepiece tokenizer classes, always included. 99 | RegisterTokenizerClass("GemmaTokenizer", sentencepiece.New) 100 | 101 | //for _, className := range []string{ 102 | // "GemmaTokenizer", "BertTokenizer", "DebertaV2Tokenizer", "DistilBertTokenizer", 103 | // "DistilBertTokenizer", "RobertaTokenizer"} { 104 | //} 105 | } 106 | -------------------------------------------------------------------------------- /tokenizers/api/specialtoken_enumer.go: -------------------------------------------------------------------------------- 1 | // Code generated by "enumer -type=SpecialToken -trimprefix=Tok -transform=snake -values -text -json -yaml api.go"; DO NOT EDIT. 2 | 3 | package api 4 | 5 | import ( 6 | "encoding/json" 7 | "fmt" 8 | "strings" 9 | ) 10 | 11 | const _SpecialTokenName = "beginning_of_sentenceend_of_sentenceunknownpadmaskclassificationspecial_tokens_count" 12 | 13 | var _SpecialTokenIndex = [...]uint8{0, 21, 36, 43, 46, 50, 64, 84} 14 | 15 | const _SpecialTokenLowerName = "beginning_of_sentenceend_of_sentenceunknownpadmaskclassificationspecial_tokens_count" 16 | 17 | func (i SpecialToken) String() string { 18 | if i < 0 || i >= SpecialToken(len(_SpecialTokenIndex)-1) { 19 | return fmt.Sprintf("SpecialToken(%d)", i) 20 | } 21 | return _SpecialTokenName[_SpecialTokenIndex[i]:_SpecialTokenIndex[i+1]] 22 | } 23 | 24 | func (SpecialToken) Values() []string { 25 | return SpecialTokenStrings() 26 | } 27 | 28 | // An "invalid array index" compiler error signifies that the constant values have changed. 29 | // Re-run the stringer command to generate them again. 
30 | func _SpecialTokenNoOp() { 31 | var x [1]struct{} 32 | _ = x[TokBeginningOfSentence-(0)] 33 | _ = x[TokEndOfSentence-(1)] 34 | _ = x[TokUnknown-(2)] 35 | _ = x[TokPad-(3)] 36 | _ = x[TokMask-(4)] 37 | _ = x[TokClassification-(5)] 38 | _ = x[TokSpecialTokensCount-(6)] 39 | } 40 | 41 | var _SpecialTokenValues = []SpecialToken{TokBeginningOfSentence, TokEndOfSentence, TokUnknown, TokPad, TokMask, TokClassification, TokSpecialTokensCount} 42 | 43 | var _SpecialTokenNameToValueMap = map[string]SpecialToken{ 44 | _SpecialTokenName[0:21]: TokBeginningOfSentence, 45 | _SpecialTokenLowerName[0:21]: TokBeginningOfSentence, 46 | _SpecialTokenName[21:36]: TokEndOfSentence, 47 | _SpecialTokenLowerName[21:36]: TokEndOfSentence, 48 | _SpecialTokenName[36:43]: TokUnknown, 49 | _SpecialTokenLowerName[36:43]: TokUnknown, 50 | _SpecialTokenName[43:46]: TokPad, 51 | _SpecialTokenLowerName[43:46]: TokPad, 52 | _SpecialTokenName[46:50]: TokMask, 53 | _SpecialTokenLowerName[46:50]: TokMask, 54 | _SpecialTokenName[50:64]: TokClassification, 55 | _SpecialTokenLowerName[50:64]: TokClassification, 56 | _SpecialTokenName[64:84]: TokSpecialTokensCount, 57 | _SpecialTokenLowerName[64:84]: TokSpecialTokensCount, 58 | } 59 | 60 | var _SpecialTokenNames = []string{ 61 | _SpecialTokenName[0:21], 62 | _SpecialTokenName[21:36], 63 | _SpecialTokenName[36:43], 64 | _SpecialTokenName[43:46], 65 | _SpecialTokenName[46:50], 66 | _SpecialTokenName[50:64], 67 | _SpecialTokenName[64:84], 68 | } 69 | 70 | // SpecialTokenString retrieves an enum value from the enum constants string name. 71 | // Throws an error if the param is not part of the enum. 72 | func SpecialTokenString(s string) (SpecialToken, error) { 73 | if val, ok := _SpecialTokenNameToValueMap[s]; ok { 74 | return val, nil 75 | } 76 | 77 | if val, ok := _SpecialTokenNameToValueMap[strings.ToLower(s)]; ok { 78 | return val, nil 79 | } 80 | return 0, fmt.Errorf("%s does not belong to SpecialToken values", s) 81 | } 82 | 83 | // SpecialTokenValues returns all values of the enum 84 | func SpecialTokenValues() []SpecialToken { 85 | return _SpecialTokenValues 86 | } 87 | 88 | // SpecialTokenStrings returns a slice of all String values of the enum 89 | func SpecialTokenStrings() []string { 90 | strs := make([]string, len(_SpecialTokenNames)) 91 | copy(strs, _SpecialTokenNames) 92 | return strs 93 | } 94 | 95 | // IsASpecialToken returns "true" if the value is listed in the enum definition. 
"false" otherwise 96 | func (i SpecialToken) IsASpecialToken() bool { 97 | for _, v := range _SpecialTokenValues { 98 | if i == v { 99 | return true 100 | } 101 | } 102 | return false 103 | } 104 | 105 | // MarshalJSON implements the json.Marshaler interface for SpecialToken 106 | func (i SpecialToken) MarshalJSON() ([]byte, error) { 107 | return json.Marshal(i.String()) 108 | } 109 | 110 | // UnmarshalJSON implements the json.Unmarshaler interface for SpecialToken 111 | func (i *SpecialToken) UnmarshalJSON(data []byte) error { 112 | var s string 113 | if err := json.Unmarshal(data, &s); err != nil { 114 | return fmt.Errorf("SpecialToken should be a string, got %s", data) 115 | } 116 | 117 | var err error 118 | *i, err = SpecialTokenString(s) 119 | return err 120 | } 121 | 122 | // MarshalText implements the encoding.TextMarshaler interface for SpecialToken 123 | func (i SpecialToken) MarshalText() ([]byte, error) { 124 | return []byte(i.String()), nil 125 | } 126 | 127 | // UnmarshalText implements the encoding.TextUnmarshaler interface for SpecialToken 128 | func (i *SpecialToken) UnmarshalText(text []byte) error { 129 | var err error 130 | *i, err = SpecialTokenString(string(text)) 131 | return err 132 | } 133 | 134 | // MarshalYAML implements a YAML Marshaler for SpecialToken 135 | func (i SpecialToken) MarshalYAML() (interface{}, error) { 136 | return i.String(), nil 137 | } 138 | 139 | // UnmarshalYAML implements a YAML Unmarshaler for SpecialToken 140 | func (i *SpecialToken) UnmarshalYAML(unmarshal func(interface{}) error) error { 141 | var s string 142 | if err := unmarshal(&s); err != nil { 143 | return err 144 | } 145 | 146 | var err error 147 | *i, err = SpecialTokenString(s) 148 | return err 149 | } 150 | -------------------------------------------------------------------------------- /hub/download.go: -------------------------------------------------------------------------------- 1 | package hub 2 | 3 | import ( 4 | "context" 5 | "log" 6 | "math/rand" 7 | "os" 8 | "path" 9 | "time" 10 | 11 | "github.com/gofrs/flock" 12 | "github.com/gomlx/go-huggingface/internal/downloader" 13 | "github.com/gomlx/go-huggingface/internal/files" 14 | "github.com/pkg/errors" 15 | ) 16 | 17 | // Generic download utilities. 18 | 19 | // getDownloadManager returns current downloader.Manager, or creates a new one for this Repo. 20 | func (r *Repo) getDownloadManager() *downloader.Manager { 21 | if r.downloadManager == nil { 22 | r.downloadManager = downloader.New().MaxParallel(r.MaxParallelDownload).WithAuthToken(r.authToken) 23 | } 24 | return r.downloadManager 25 | } 26 | 27 | // lockedDownload url to the given filePath. 28 | // 29 | // If filePath exits and forceDownload is false, it is assumed to already have been correctly downloaded, and it will return immediately. 30 | // 31 | // It downloads the file to filePath+".tmp" and then atomically move it to filePath. 32 | // 33 | // It uses a temporary filePath+".lock" to coordinate multiple processes/programs trying to download the same file at the same time. 34 | func (r *Repo) lockedDownload(ctx context.Context, url, filePath string, forceDownload bool, progressCallback downloader.ProgressCallback) error { 35 | if files.Exists(filePath) { 36 | if !forceDownload { 37 | return nil 38 | } 39 | err := os.Remove(filePath) 40 | if err != nil { 41 | return errors.Wrapf(err, "failed to remove %q while force-downloading %q", filePath, url) 42 | } 43 | } 44 | 45 | // Checks whether context has already been cancelled, and exit immediately. 
46 | if err := ctx.Err(); err != nil { 47 | return err 48 | } 49 | 50 | // Create a directory for the file. 51 | if err := os.MkdirAll(path.Dir(filePath), DefaultDirCreationPerm); err != nil { 52 | return errors.Wrapf(err, "failed to create directory for file %q", filePath) 53 | } 54 | 55 | // Lock file to avoid parallel downloads. 56 | lockPath := filePath + ".lock" 57 | var mainErr error 58 | errLock := execOnFileLock(lockPath, func() { 59 | if files.Exists(filePath) { 60 | // Some concurrent other process (or goroutine) already downloaded the file. 61 | return 62 | } 63 | 64 | // Create tmpFile where to download. 65 | var tmpFileClosed bool 66 | tmpPath := filePath + ".downloading" 67 | tmpFile, err := os.Create(tmpPath) 68 | if err != nil { 69 | mainErr = errors.Wrapf(err, "creating temporary file for download in %q", tmpPath) 70 | return 71 | } 72 | defer func() { 73 | // If we exit with an error, make sure to close and remove unfinished temporary file. 74 | if !tmpFileClosed { 75 | err := tmpFile.Close() 76 | if err != nil { 77 | log.Printf("Failed closing temporary file %q: %v", tmpPath, err) 78 | } 79 | err = os.Remove(tmpPath) 80 | if err != nil { 81 | log.Printf("Failed removing temporary file %q: %v", tmpPath, err) 82 | } 83 | } 84 | }() 85 | 86 | downloadManager := r.getDownloadManager() 87 | mainErr = downloadManager.Download(ctx, url, tmpPath, progressCallback) 88 | if mainErr != nil { 89 | mainErr = errors.WithMessagef(mainErr, "while downloading %q to %q", url, tmpPath) 90 | return 91 | } 92 | 93 | // Download succeeded, move to our target location. 94 | tmpFileClosed = true 95 | if err := tmpFile.Close(); err != nil { 96 | mainErr = errors.Wrapf(err, "failed to close temporary download file %q", tmpPath) 97 | return 98 | } 99 | if err := os.Rename(tmpPath, filePath); err != nil { 100 | mainErr = errors.Wrapf(err, "failed to move downloaded file %q to %q", tmpPath, filePath) 101 | return 102 | } 103 | 104 | // File already exists, so we no longer need the lock file. 105 | err = os.Remove(lockPath) 106 | if err != nil { 107 | log.Printf("Warning: error removing lock file %q: %+v", lockPath, err) 108 | } 109 | }) 110 | if mainErr != nil { 111 | return mainErr 112 | } 113 | if errLock != nil { 114 | return errors.WithMessagef(errLock, "while locking %q to download %q", lockPath, url) 115 | } 116 | return nil 117 | } 118 | 119 | // execOnFileLock opens the lockPath file (or creates if it doesn't yet exist), locks it, and executes the function. 120 | // If the lockPath is already locked, it polls with a 1 to 2 seconds period (randomly), until it acquires the lock. 121 | // 122 | // The lockPath is not removed. It's safe to remove it from the given fn, if one knows that no new calls to 123 | // execOnFileLock with the same lockPath is going to be made. 124 | func execOnFileLock(lockPath string, fn func()) (err error) { 125 | // Create a new flock instance directly using gofrs/flock 126 | fileLock := flock.New(lockPath) 127 | 128 | // Acquire lock with retry logic 129 | for { 130 | // Try to acquire the lock 131 | locked, err := fileLock.TryLock() 132 | if err != nil { 133 | return errors.Wrapf(err, "while trying to lock %q", lockPath) 134 | } 135 | 136 | // If we got the lock, break out of the retry loop 137 | if locked { 138 | break 139 | } 140 | 141 | // Wait from 1 to 2 seconds. 142 | time.Sleep(time.Millisecond * time.Duration(1000+rand.Intn(1000))) 143 | } 144 | 145 | // Setup clean up in a deferred function, so it happens even if `fn()` panics. 
146 | defer func() { 147 | unlockErr := fileLock.Unlock() 148 | if unlockErr != nil { 149 | // If we already have an error, don't overwrite it 150 | if err == nil { 151 | err = errors.Wrapf(unlockErr, "unlocking file %q", lockPath) 152 | } else { 153 | log.Printf("Error unlocking file %q: %v", lockPath, unlockErr) 154 | } 155 | } 156 | }() 157 | 158 | // We got the lock, run the function. 159 | fn() 160 | 161 | return 162 | } 163 | -------------------------------------------------------------------------------- /internal/downloader/downloader.go: -------------------------------------------------------------------------------- 1 | // Package downloader implements download in parallel of various URLs, with various progress report callback. 2 | // 3 | // It is used by the `hub` package, but it's also left public, in case it becomes useful for others. 4 | package downloader 5 | 6 | import ( 7 | "context" 8 | "fmt" 9 | "io" 10 | "net/http" 11 | "os" 12 | "path" 13 | 14 | "github.com/pkg/errors" 15 | 16 | "github.com/gomlx/go-huggingface/internal/files" 17 | ) 18 | 19 | // ProgressCallback is called as download progresses. 20 | // - totalBytes may be set to 0 if total size is not yet known. 21 | type ProgressCallback func(downloadedBytes, totalBytes int64) 22 | 23 | // Manager handles downloads, reporting back progress and errors. 24 | type Manager struct { 25 | semaphore *Semaphore 26 | authToken, userAgent string 27 | } 28 | 29 | // New creates a Manager that download files in parallel -- by default mostly 20 in parallel. 30 | func New() *Manager { 31 | return &Manager{semaphore: NewSemaphore(20)} 32 | } 33 | 34 | // MaxParallel indicates how many files to download at the same time. Default is 20. 35 | // If set to <= 0 it will download all files in parallel. 36 | // Set to 1 to make downloads sequential. 37 | func (m *Manager) MaxParallel(n int) *Manager { 38 | m.semaphore.Resize(n) 39 | return m 40 | } 41 | 42 | // WithAuthToken sets the authentication token to use in the requests. 43 | // It is passed in the header "Authorization" and prefixed with "Bearer ". 44 | // 45 | // Setting it to empty ("") is the same as resetting and not using authentication. 46 | func (m *Manager) WithAuthToken(authToken string) *Manager { 47 | m.authToken = authToken 48 | return m 49 | } 50 | 51 | // WithUserAgent sets the user agent to user. 52 | func (m *Manager) WithUserAgent(userAgent string) *Manager { 53 | m.userAgent = userAgent 54 | return m 55 | } 56 | 57 | var CancellationError = errors.New("download cancelled") 58 | 59 | // setRequestHeader with configured fields. 60 | func (m *Manager) setRequestHeader(req *http.Request) { 61 | if m.authToken != "" { 62 | req.Header.Set("Authorization", "Bearer "+m.authToken) 63 | } 64 | if m.userAgent != "" { 65 | req.Header.Set("user-agent", m.userAgent) 66 | } 67 | } 68 | 69 | // Download downloads the given url to be downloaded to the given filePath. 70 | // This may lock if it reached the maximum number of parallel downloads. 71 | // Consider calling this on its own go-routine. 72 | // 73 | // Progress of download is reported back to the given callback, if not nil. 74 | // 75 | // The context ctx can be used to interrupt the downloading. 
76 | func (m *Manager) Download(ctx context.Context, url string, filePath string, callback ProgressCallback) error { 77 | m.semaphore.Acquire() 78 | defer m.semaphore.Release() 79 | 80 | client := &http.Client{ 81 | CheckRedirect: func(r *http.Request, via []*http.Request) error { 82 | r.URL.Opaque = r.URL.Path 83 | return nil 84 | }, 85 | } 86 | 87 | var err error 88 | filePath, err = files.ReplaceTildeInDir(filePath) 89 | if err != nil { 90 | return errors.Wrapf(err, "Failed to resolve user name in tilde (~) expansion: %q", filePath) 91 | } 92 | if err = os.MkdirAll(path.Dir(filePath), 0777); err != nil { 93 | return errors.Wrapf(err, "Failed to create the directory for the path: %q", path.Dir(filePath)) 94 | } 95 | var file *os.File 96 | file, err = os.Create(filePath) 97 | if err != nil { 98 | return errors.Wrapf(err, "failed creating file %q", filePath) 99 | } 100 | defer func() { 101 | if file != nil { 102 | _ = file.Close() 103 | } 104 | }() 105 | 106 | req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) 107 | if err != nil { 108 | return errors.Wrapf(err, "failed creating request for %q", url) 109 | } 110 | m.setRequestHeader(req) 111 | var resp *http.Response 112 | resp, err = client.Do(req) 113 | if err != nil { 114 | return errors.Wrapf(err, "failed downloading %q", url) 115 | } 116 | // _ = resp.Header.Write(os.Stdout) 117 | if resp.StatusCode != http.StatusOK { 118 | return fmt.Errorf("bad status code %d: %q", resp.StatusCode, resp.Header.Get("X-Error-Message")) 119 | } 120 | 121 | contentLength := resp.ContentLength 122 | if callback != nil { 123 | callback(0, contentLength) 124 | } 125 | const maxBufferSize = 1 * 1024 * 1024 126 | var buf [maxBufferSize]byte 127 | downloadedBytes := int64(0) 128 | for { 129 | if ctx.Err() != nil { 130 | return CancellationError 131 | } 132 | n, readErr := resp.Body.Read(buf[:]) 133 | if readErr != nil && readErr != io.EOF { 134 | if ctx.Err() != nil { 135 | return CancellationError 136 | } 137 | return errors.Wrapf(readErr, "failed downloading %q", url) 138 | } 139 | if n > 0 { 140 | wn, writeErr := file.Write(buf[:n]) 141 | if writeErr != nil && writeErr != io.EOF { 142 | return errors.Wrapf(writeErr, "failed writing %q to %q", url, filePath) 143 | } 144 | if wn != n { 145 | return errors.Wrapf(io.ErrShortWrite, "failed writing %q to %q: not enough bytes written (wanted %d, wrote only %d)", 146 | url, filePath, n, wn) 147 | } 148 | } 149 | if readErr == io.EOF { 150 | break 151 | } 152 | downloadedBytes += int64(n) 153 | if callback != nil { 154 | callback(downloadedBytes, contentLength) 155 | } 156 | } 157 | err = file.Close() 158 | file = nil 159 | if err != nil { 160 | return errors.Wrapf(err, "failed closing file %q", filePath) 161 | } 162 | if err = resp.Body.Close(); err != nil { 163 | return errors.Wrapf(err, "failed closing connection to %q", url) 164 | } 165 | return nil 166 | } 167 | 168 | // FetchHeader fetches the header of a URL (using HTTP method "HEAD"). 169 | // 170 | // Notice it may lock on the maximum number of parallel requests, so consider calling this on a separate goroutine. 171 | // 172 | // The context ctx can be used to interrupt the downloading.
173 | func (m *Manager) FetchHeader(ctx context.Context, url string) (header http.Header, contentLength int64, err error) { 174 | m.semaphore.Acquire() 175 | defer m.semaphore.Release() 176 | 177 | client := &http.Client{ 178 | CheckRedirect: func(r *http.Request, via []*http.Request) error { 179 | r.URL.Opaque = r.URL.Path 180 | return nil 181 | }, 182 | } 183 | req, err := http.NewRequestWithContext(ctx, http.MethodHead, url, nil) 184 | if err != nil { 185 | err = errors.Wrapf(err, "failed creating request for %q", url) 186 | return 187 | } 188 | m.setRequestHeader(req) 189 | req.Header.Set("Accept-Encoding", "identity") 190 | 191 | // Make the request and download the tokenizer. 192 | resp, err := client.Do(req) 193 | if err != nil { 194 | err = errors.Wrap(err, "failed request for metadata: ") 195 | return 196 | } 197 | 198 | // TODO: handle redirects. 199 | defer func() { _ = resp.Body.Close() }() 200 | _, err = io.ReadAll(resp.Body) 201 | if err != nil { 202 | err = errors.Wrapf(err, "failed reading response (%d) for metadata: ", resp.StatusCode) 203 | return 204 | } 205 | 206 | // Check status code. 207 | if resp.StatusCode != 200 { 208 | err = errors.Errorf("request for metadata from %q failed with the following message: %q", 209 | url, resp.Status) 210 | return 211 | } 212 | header = resp.Header 213 | contentLength = resp.ContentLength 214 | err = nil 215 | return 216 | } 217 | -------------------------------------------------------------------------------- /hub/repo.go: -------------------------------------------------------------------------------- 1 | package hub 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "path" 8 | "strings" 9 | 10 | "github.com/gomlx/go-huggingface/internal/downloader" 11 | "github.com/gomlx/go-huggingface/internal/files" 12 | "github.com/pkg/errors" 13 | ) 14 | 15 | // Repo from which one wants to download files. Create it with New. 16 | type Repo struct { 17 | // ID of the Repo may include owner/model. E.g.: google/gemma-2-2b-it 18 | ID string 19 | 20 | // Hugginface endpint to use, defaults to "https://huggingface.co". 21 | hfEndpoint string 22 | 23 | // repoType of the repository, usually RepoTypeModel. 24 | repoType RepoType 25 | 26 | // revision to download, usually set to "main", but it can use a commit-hash version. 27 | revision string 28 | 29 | // revisionHashRefreshed indicates whether the revision hash has been refreshed. 30 | // We force it to be refreshed at least once before hitting the server, just in case. 31 | revisionHashRefreshed bool 32 | 33 | // authToken is the HuggingFace authentication token to be used when downloading the files. 34 | authToken string 35 | 36 | // Verbosity: 0 for quiet operation; 1 for information about progress; 2 and higher for debugging. 37 | Verbosity int 38 | 39 | // MaxParallelDownload indicates how many files to download at the same time. Default is 20. 40 | // If set to <= 0 it will download all files in parallel. 41 | // Set to 1 to make downloads sequential. 42 | MaxParallelDownload int 43 | 44 | // cacheDir is where to store the downloaded files. 45 | cacheDir string 46 | 47 | // Info about the Repo in HuggingFace, including the list of files. 48 | // It is only available after DownloadInfo is called. 49 | info *RepoInfo 50 | 51 | downloadManager *downloader.Manager 52 | 53 | useProgressBar bool 54 | } 55 | 56 | // New creates a reference to a HuggingFace model given its id. 
57 | //
58 | // It uses the default cache directory under ${XDG_CACHE_HOME} (if set) or `~/.cache`, in a format that is
59 | // shared with the huggingface-hub Python library. The cache is shared across various programs, including Python
60 | // programs.
61 | // Use Repo.WithCacheDir to change it, or NewWithDir to use a plain directory structure that is not shared across programs.
62 | //
63 | // The id typically includes owner/model. E.g.: "google/gemma-2-2b-it"
64 | //
65 | // It defaults to being a RepoTypeModel repository, but you can change it with Repo.WithType.
66 | //
67 | // If authentication is needed, use Repo.WithAuth.
68 | func New(id string) *Repo {
69 | 	hfEndpoint := os.Getenv("HF_ENDPOINT")
70 | 	if hfEndpoint == "" {
71 | 		hfEndpoint = "https://huggingface.co"
72 | 	} else {
73 | 		hfEndpoint = strings.TrimSuffix(hfEndpoint, "/")
74 | 	}
75 | 	return &Repo{
76 | 		ID:                  id,
77 | 		repoType:            RepoTypeModel,
78 | 		revision:            "main",
79 | 		hfEndpoint:          hfEndpoint,
80 | 		cacheDir:            DefaultCacheDir(),
81 | 		Verbosity:           1,
82 | 		MaxParallelDownload: 20, // At most 20 parallel downloads.
83 | 	}
84 | }
85 | 
86 | // WithAuth sets the authentication token to use during downloads.
87 | //
88 | // Setting it to empty ("") is the same as resetting and not using authentication.
89 | func (r *Repo) WithAuth(authToken string) *Repo {
90 | 	r.authToken = authToken
91 | 	return r
92 | }
93 | 
94 | // WithType sets the repository type to use during downloads.
95 | func (r *Repo) WithType(repoType RepoType) *Repo {
96 | 	r.repoType = repoType
97 | 	return r
98 | }
99 | 
100 | // WithEndpoint sets the HuggingFace endpoint to use.
101 | // Default is "https://huggingface.co" or, if set, the environment variable HF_ENDPOINT.
102 | func (r *Repo) WithEndpoint(endpoint string) *Repo {
103 | 	r.hfEndpoint = endpoint
104 | 	return r
105 | }
106 | 
107 | // WithRevision sets the revision to use for this Repo; it defaults to "main", but can be set to a commit-hash value.
108 | func (r *Repo) WithRevision(revision string) *Repo {
109 | 	r.revision = revision
110 | 	return r
111 | }
112 | 
113 | // WithCacheDir sets the cacheDir to the given directory.
114 | //
115 | // The default is given by DefaultCacheDir: `${XDG_CACHE_HOME}/huggingface/hub` if set, or `~/.cache/huggingface/hub` otherwise.
116 | func (r *Repo) WithCacheDir(cacheDir string) *Repo {
117 | 	newCacheDir, err := files.ReplaceTildeInDir(cacheDir)
118 | 	if err == nil {
119 | 		r.cacheDir = path.Clean(newCacheDir)
120 | 	} else {
121 | 		log.Printf("Failed to resolve directory for %q: %+v", cacheDir, err)
122 | 	}
123 | 	return r
124 | }
125 | 
126 | // WithDownloadManager sets the downloader.Manager to use for download.
127 | // This is not needed: one will be created automatically if not set.
128 | // This is useful when downloading multiple Repos simultaneously, to coordinate limits by sharing the download manager.
129 | func (r *Repo) WithDownloadManager(manager *downloader.Manager) *Repo {
130 | 	r.downloadManager = manager
131 | 	return r
132 | }
133 | 
134 | // WithProgressBar configures the usage of a progress bar during download. Defaults to true.
135 | func (r *Repo) WithProgressBar(useProgressBar bool) *Repo {
136 | 	r.useProgressBar = useProgressBar
137 | 	return r
138 | }
139 | 
140 | // flatFolderName returns a serialized version of a hf.co repo name and type, safe for disk storage
141 | // as a single non-nested folder.
142 | //
143 | // Based on github.com/huggingface/huggingface_hub repo_folder_name.
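// For example, assuming RepoIdSeparator is "--" and RepoTypeModel serializes to "models" (matching
// the Python huggingface_hub cache layout), the ID "google/gemma-2-2b-it" would map to the folder
// name "models--google--gemma-2-2b-it".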
144 | func (r *Repo) flatFolderName() string {
145 | 	parts := []string{string(r.repoType)}
146 | 	parts = append(parts, strings.Split(r.ID, "/")...)
147 | 	return strings.Join(parts, RepoIdSeparator)
148 | }
149 | 
150 | // repoCacheDir joins cacheDir and flatFolderName to return the cache subdirectory for the repository.
151 | // It also creates the directory, and returns an error if creation fails.
152 | func (r *Repo) repoCacheDir() (string, error) {
153 | 	dir := path.Join(r.cacheDir, r.flatFolderName())
154 | 	err := os.MkdirAll(dir, DefaultDirCreationPerm)
155 | 	if err != nil {
156 | 		return "", errors.Wrapf(err, "while creating cache directory %q", dir)
157 | 	}
158 | 	return dir, nil
159 | }
160 | 
161 | // FileURL returns the URL from which to download the file from HuggingFace.
162 | //
163 | // Usually not used directly (use DownloadFile instead), but it can be handy for debugging.
164 | func (r *Repo) FileURL(fileName string) (string, error) {
165 | 	commitHash, err := r.readCommitHashForRevision()
166 | 	if err != nil {
167 | 		return "", err
168 | 	}
169 | 	if r.repoType == RepoTypeModel {
170 | 		return fmt.Sprintf("%s/%s/resolve/%s/%s", r.hfEndpoint, r.ID, commitHash, fileName), nil
171 | 	} else {
172 | 		return fmt.Sprintf("%s/%s/%s/resolve/%s/%s", r.hfEndpoint, r.repoType, r.ID, commitHash, fileName), nil
173 | 	}
174 | }
175 | 
176 | // readCommitHashForRevision finds the commit-hash for the revision; it should already be written to disk.
177 | // The revision can itself be a commit-hash, in which case it is returned directly.
178 | //
179 | // If needed, it calls Repo.DownloadInfo to refresh the revision hash from the server.
180 | func (r *Repo) readCommitHashForRevision() (string, error) {
181 | 	forceDownload := !r.revisionHashRefreshed
182 | 	err := r.DownloadInfo(forceDownload)
183 | 	if err != nil {
184 | 		return "", err
185 | 	}
186 | 	r.revisionHashRefreshed = true
187 | 	return r.info.CommitHash, nil
188 | }
189 | 
190 | // repoSnapshotsDir returns the snapshots directory for this repo at its revision.
191 | func (r *Repo) repoSnapshotsDir() (string, error) {
192 | 	cacheDir, err := r.repoCacheDir()
193 | 	if err != nil {
194 | 		return "", err
195 | 	}
196 | 	commitHash, err := r.readCommitHashForRevision()
197 | 	if err != nil {
198 | 		return "", err
199 | 	}
200 | 	snapshotsDir := path.Join(cacheDir, "snapshots", commitHash)
201 | 	if err = os.MkdirAll(snapshotsDir, DefaultDirCreationPerm); err != nil {
202 | 		return "", errors.Wrapf(err, "while creating snapshots directory %q", snapshotsDir)
203 | 	}
204 | 	return snapshotsDir, nil
205 | }
206 | 
207 | // String implements fmt.Stringer.
208 | func (r *Repo) String() string {
209 | 	return r.ID
210 | }
211 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # **go-huggingface**: download, tokenize and convert models from HuggingFace.
2 | 
3 | [![GoDev](https://img.shields.io/badge/go.dev-reference-007d9c?logo=go&logoColor=white)](https://pkg.go.dev/github.com/gomlx/go-huggingface?tab=doc)
4 | 
5 | ## 📖 Overview
6 | 
7 | Simple APIs for downloading (`hub`), tokenizing (`tokenizers`) and (**future work**) model conversion (`models`) of
8 | [HuggingFace🤗](https://huggingface.co) models using [GoMLX](https://github.com/gomlx/gomlx).
9 | 
10 | 🚧 **EXPERIMENTAL and IN DEVELOPMENT**: While the `hub` package has been stable, the `tokenizers` package so far only supports
11 | SentencePiece models (saved as protos), but it has been working.
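As a quick start, here is a minimal sketch that downloads a single file from a model repository and prints where it was cached locally (the model ID and file name are just examples; errors are simply `panic`ed for brevity):

```go
package main

import (
	"fmt"

	"github.com/gomlx/go-huggingface/hub"
)

func main() {
	repo := hub.New("sentence-transformers/all-MiniLM-L6-v2")
	localPath, err := repo.DownloadFile("config.json")
	if err != nil {
		panic(err)
	}
	fmt.Println("Downloaded to", localPath)
}
```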
12 | 13 | ## Examples 14 | 15 | ### Preamble: Imports And Variables 16 | 17 | ```go 18 | import ( 19 | "github.com/gomlx/go-huggingface/hub" 20 | "github.com/gomlx/go-huggingface/tokenizers" 21 | ) 22 | 23 | var ( 24 | // HuggingFace authentication token read from environment. 25 | // It can be created in https://huggingface.co 26 | // Some files may require it for downloading. 27 | hfAuthToken = os.Getenv("HF_TOKEN") 28 | 29 | // Model IDs we use for testing. 30 | hfModelIDs = []string{ 31 | "google/gemma-2-2b-it", 32 | "sentence-transformers/all-MiniLM-L6-v2", 33 | "protectai/deberta-v3-base-zeroshot-v1-onnx", 34 | "KnightsAnalytics/distilbert-base-uncased-finetuned-sst-2-english", 35 | "KnightsAnalytics/distilbert-NER", 36 | "SamLowe/roberta-base-go_emotions-onnx", 37 | } 38 | ) 39 | ``` 40 | 41 | ### List files for each model 42 | 43 | ```go 44 | for _, modelID := range hfModelIDs { 45 | fmt.Printf("\n%s:\n", modelID) 46 | repo := hub.New(modelID).WithAuth(hfAuthToken) 47 | for fileName, err := range repo.IterFileNames() { 48 | if err != nil { panic(err) } 49 | fmt.Printf("\t%s\n", fileName) 50 | } 51 | } 52 | ``` 53 | 54 | The result looks like this: 55 | 56 | ``` 57 | google/gemma-2-2b-it: 58 | .gitattributes 59 | README.md 60 | config.json 61 | generation_config.json 62 | model-00001-of-00002.safetensors 63 | model-00002-of-00002.safetensors 64 | model.safetensors.index.json 65 | special_tokens_map.json 66 | tokenizer.json 67 | tokenizer.model 68 | tokenizer_config.json 69 | … 70 | ``` 71 | 72 | 73 | ### List tokenizer classes for each model 74 | 75 | ```go 76 | for _, modelID := range hfModelIDs { 77 | fmt.Printf("\n%s:\n", modelID) 78 | repo := hub.New(modelID).WithAuth(hfAuthToken) 79 | config, err := tokenizers.GetConfig(repo) 80 | if err != nil { panic(err) } 81 | fmt.Printf("\ttokenizer_class=%s\n", config.TokenizerClass) 82 | } 83 | ``` 84 | 85 | Results: 86 | 87 | ``` 88 | google/gemma-2-2b-it: 89 | tokenizer_class=GemmaTokenizer 90 | 91 | sentence-transformers/all-MiniLM-L6-v2: 92 | tokenizer_class=BertTokenizer 93 | 94 | protectai/deberta-v3-base-zeroshot-v1-onnx: 95 | tokenizer_class=DebertaV2Tokenizer 96 | … 97 | ``` 98 | 99 | 100 | ### Tokenize for [`google/gemma-2-2b-it`](https://huggingface.co/google/gemma-2-2b-it) using Go-only "SentencePiece" tokenizer 101 | 102 | * The output "Downloaded" message happens only the tokenizer file is not yet cached, so only the first time: 103 | 104 | ```go 105 | repo := hub.New("google/gemma-2-2b-it").WithAuth(hfAuthToken) 106 | tokenizer, err := tokenizers.New(repo) 107 | if err != nil { panic(err) } 108 | 109 | sentence := "The book is on the table." 110 | tokens := tokenizer.Encode(sentence) 111 | fmt.Printf("Sentence:\t%s\n", sentence) 112 | fmt.Printf("Tokens: \t%v\n", tokens) 113 | ``` 114 | 115 | ``` 116 | Downloaded 1/1 files, 4.2 MB downloaded 117 | Sentence: The book is on the table. 118 | Tokens: [651 2870 603 611 573 3037 235265] 119 | ``` 120 | 121 | ### Tokenize for a [Sentence Transformer](https://www.sbert.net/) derived model, using Rust's based [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers) tokenizer 122 | 123 | For most tokenizers in HuggingFace though, there is no Go-only version yet, and for now we use the 124 | [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers), which is based on a fast tokenizer written in Rust. 
125 | 
126 | It requires the compiled Rust library to be installed though;
127 | see [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers) for how to install it --
128 | they provide prebuilt binaries.
129 | 
130 | > **Note**: `daulet/tokenizers` also provides a simple downloader, so `go-huggingface` is not strictly necessary --
131 | > if you don't want the extra dependency and only need the tokenizer, you don't need to use it. `go-huggingface`
132 | > helps by also allowing you to download other files (models, datasets), and by sharing its cache across different projects
133 | > and with `huggingface-hub` (the Python downloader library).
134 | 
135 | ```go
136 | import dtok "github.com/daulet/tokenizers"
137 | 
138 | %%
139 | modelID := "KnightsAnalytics/all-MiniLM-L6-v2"
140 | repo := hub.New(modelID).WithAuth(hfAuthToken)
141 | localFile := must.M1(repo.DownloadFile("tokenizer.json"))
142 | tokenizer := must.M1(dtok.FromFile(localFile))
143 | defer tokenizer.Close()
144 | tokens, _ := tokenizer.Encode(sentence, true)
145 | 
146 | fmt.Printf("Sentence:\t%s\n", sentence)
147 | fmt.Printf("Tokens: \t%v\n", tokens)
148 | ```
149 | 
150 | ```
151 | Sentence: The book is on the table.
152 | Tokens: [101 1996 2338 2003 2006 1996 2795 1012 102 0 0 0…]
153 | ```
154 | 
155 | ### Download and execute ONNX model for [`sentence-transformers/all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
156 | 
157 | Only the first 3 lines are actually demonstrating `go-huggingface`.
158 | The remaining lines use [`github.com/gomlx/onnx-gomlx`](https://github.com/gomlx/onnx-gomlx)
159 | to parse and convert the ONNX model to GoMLX, and then
160 | [`github.com/gomlx/gomlx`](https://github.com/gomlx/gomlx) to execute the converted model
161 | on a couple of sentences.
162 | 
163 | ```go
164 | // Get ONNX model.
165 | repo := hub.New("sentence-transformers/all-MiniLM-L6-v2").WithAuth(hfAuthToken)
166 | onnxFilePath, err := repo.DownloadFile("onnx/model.onnx")
167 | if err != nil { panic(err) }
168 | onnxModel, err := onnx.ReadFile(onnxFilePath)
169 | if err != nil { panic(err) }
170 | 
171 | // Convert ONNX variables to GoMLX context (which stores variables):
172 | ctx := context.New()
173 | err = onnxModel.VariablesToContext(ctx)
174 | if err != nil { panic(err) }
175 | 
176 | // Test input.
177 | sentences := []string{
178 | 	"This is an example sentence",
179 | 	"Each sentence is converted"}
180 | inputIDs := [][]int64{
181 | 	{101, 2023, 2003, 2019, 2742, 6251, 102},
182 | 	{ 101, 2169, 6251, 2003, 4991,  102,    0}}
183 | tokenTypeIDs := [][]int64{
184 | 	{0, 0, 0, 0, 0, 0, 0},
185 | 	{0, 0, 0, 0, 0, 0, 0}}
186 | attentionMask := [][]int64{
187 | 	{1, 1, 1, 1, 1, 1, 1},
188 | 	{1, 1, 1, 1, 1, 1, 0}}
189 | 
190 | // Execute GoMLX graph with model.
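// Note: context.ExecOnce (from GoMLX) is expected to compile the graph function below and execute
// it once with the given inputs, converting the Go slices to tensors; see the GoMLX documentation for details.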
191 | embeddings := context.ExecOnce( 192 | backends.New(), ctx, 193 | func (ctx *context.Context, inputs []*graph.Node) *graph.Node { 194 | modelOutputs := onnxModel.CallGraph(ctx, inputs[0].Graph(), map[string]*graph.Node{ 195 | "input_ids": inputs[0], 196 | "attention_mask": inputs[1], 197 | "token_type_ids": inputs[2]}) 198 | return modelOutputs[0] 199 | }, 200 | inputIDs, attentionMask, tokenTypeIDs) 201 | 202 | fmt.Printf("Sentences: \t%q\n", sentences) 203 | fmt.Printf("Embeddings:\t%s\n", embeddings) 204 | ``` 205 | 206 | ``` 207 | Sentences: ["This is an example sentence" "Each sentence is converted"] 208 | Embeddings: [2][7][384]float32{ 209 | {{0.0366, -0.0162, 0.1682, ..., 0.0554, -0.1644, -0.2967}, 210 | {0.7239, 0.6399, 0.1888, ..., 0.5946, 0.6206, 0.4897}, 211 | {0.0064, 0.0203, 0.0448, ..., 0.3464, 1.3170, -0.1670}, 212 | ..., 213 | {0.1479, -0.0643, 0.1457, ..., 0.8837, -0.3316, 0.2975}, 214 | {0.5212, 0.6563, 0.5607, ..., -0.0399, 0.0412, -1.4036}, 215 | {1.0824, 0.7140, 0.3986, ..., -0.2301, 0.3243, -1.0313}}, 216 | {{0.2802, 0.1165, -0.0418, ..., 0.2711, -0.1685, -0.2961}, 217 | {0.8729, 0.4545, -0.1091, ..., 0.1365, 0.4580, -0.2042}, 218 | {0.4752, 0.5731, 0.6304, ..., 0.6526, 0.5612, -1.3268}, 219 | ..., 220 | {0.6113, 0.7920, -0.4685, ..., 0.0854, 1.0592, -0.2983}, 221 | {0.4115, 1.0946, 0.2385, ..., 0.8984, 0.3684, -0.7333}, 222 | {0.1374, 0.5555, 0.2678, ..., 0.5426, 0.4665, -0.5284}}} 223 | ``` 224 | 225 | ## Download Dataset Files 226 | 227 | We are going to use the [HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) as an example, download one of its sample files (~2.5Gb of data) and parse the `.parquet` file. 228 | 229 | ### Structure of file 230 | First we define the structure of each entry, with the tags for the Parquet parser: 231 | 232 | ```go 233 | var ( 234 | FineWebID = "HuggingFaceFW/fineweb" 235 | FineWebSampleFile = "sample/10BT/000_00000.parquet" 236 | ) 237 | 238 | // FineWebEntry: inspection of fields in parque file done with tool in 239 | // github.com/xitongsys/parquet-go/tool/parquet-tools. 240 | // 241 | // The parquet annotations are described in: https://pkg.go.dev/github.com/parquet-go/parquet-go#SchemaOf 242 | type FineWebEntry struct { 243 | Text string `parquet:"text,snappy"` 244 | ID string `parquet:"id,snappy"` 245 | Dump string `parquet:"dump,snappy"` 246 | URL string `parquet:"url,snappy"` 247 | Score float64 `parquet:"language_score"` 248 | } 249 | 250 | // TrimString returns s trimmed to at most maxLength runes. If trimmed it appends "…" at the end. 251 | func TrimString(s string, maxLength int) string { 252 | if utf8.RuneCountInString(s) <= maxLength { 253 | return s 254 | } 255 | runes := []rune(s) 256 | return string(runes[:maxLength-1]) + "…" 257 | } 258 | ``` 259 | 260 | Now we read the `parquet` files using the library [github.com/parquet-go/parquet-go](https://github.com/parquet-go/parquet-go). 261 | 262 | ```go 263 | import ( 264 | parquet "github.com/parquet-go/parquet-go" 265 | ) 266 | 267 | func main() { 268 | // Download repo file. 269 | repo := hub.New(FineWebID).WithType(hub.RepoTypeDataset).WithAuth(hfAuthToken) 270 | localSampleFile := must.M1(repo.DownloadFile(FineWebSampleFile)) 271 | 272 | // Parquet reading using parquet-go: it's somewhat cumbersome (to open the file it needs its size!?), but it works. 
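	// The steps below: derive the schema from the Go struct tags, stat the file to learn its size,
	// open it, wrap it in a parquet.File, and finally create a typed reader over it.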
273 | schema := parquet.SchemaOf(&FineWebEntry{}) 274 | fSize := must.M1(os.Stat(localSampleFile)).Size() 275 | fReader := must.M1(os.Open(localSampleFile)) 276 | fParquet := must.M1(parquet.OpenFile(fReader, fSize)) 277 | reader := parquet.NewGenericReader[FineWebEntry](fParquet, schema) 278 | defer reader.Close() 279 | 280 | // Print first 10 rows: 281 | rows := make([]FineWebEntry, 10) 282 | n := must.M1(reader.Read(rows)) 283 | fmt.Printf("%d rows read\n", n) 284 | for ii, row := range rows { 285 | fmt.Printf("Row %0d:\tScore=%.3f Text=[%q], URL=[%s]\n", ii, row.Score, TrimString(row.Text, 50), TrimString(row.URL, 40)) 286 | } 287 | } 288 | ``` 289 | 290 | Results: 291 | 292 | ``` 293 | 10 rows read 294 | Row 0: Score=0.823 Text=["|Viewing Single Post From: Spoilers for the Week …"], URL=[http://daytimeroyaltyonline.com/single/…] 295 | Row 1: Score=0.974 Text=["*sigh* Fundamentalist community, let me pass on s…"], URL=[http://endogenousretrovirus.blogspot.co…] 296 | Row 2: Score=0.873 Text=["A novel two-step immunotherapy approach has shown…"], URL=[http://news.cancerconnect.com/] 297 | Row 3: Score=0.932 Text=["Free the Cans! Working Together to Reduce Waste\nI…"], URL=[http://sharingsolution.com/2009/05/23/f…] 298 | … 299 | ``` 300 | 301 | ## [Demo Notebook](https://github.com/gomlx/go-huggingface/blob/main/go-huggingface.ipynb) 302 | 303 | All examples were taken from the [demo notebook](https://github.com/gomlx/go-huggingface/blob/main/go-huggingface.ipynb). 304 | It works it also as an easy playground to try out the functionality. 305 | 306 | You can try it out using the [GoMLX docker that includes JupyterLab](https://hub.docker.com/r/janpfeifer/gomlx_jupyterlab). -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /hub/files.go: -------------------------------------------------------------------------------- 1 | package hub 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "iter" 7 | "net/http" 8 | "os" 9 | "path" 10 | "path/filepath" 11 | "strconv" 12 | "strings" 13 | "sync" 14 | "time" 15 | 16 | "github.com/dustin/go-humanize" 17 | "github.com/gomlx/go-huggingface/internal/files" 18 | "github.com/pkg/errors" 19 | ) 20 | 21 | // IterFileNames iterate over the file names stored in the repo. 
22 | // It doesn't trigger the downloading of the repo, only of the repo info. 23 | func (r *Repo) IterFileNames() iter.Seq2[string, error] { 24 | // Download info and files. 25 | err := r.DownloadInfo(false) 26 | if err != nil { 27 | // Error downloading: yield error only. 28 | return func(yield func(string, error) bool) { 29 | yield("", err) 30 | return 31 | } 32 | } 33 | return func(yield func(string, error) bool) { 34 | for _, si := range r.info.Siblings { 35 | fileName := si.Name 36 | if path.IsAbs(fileName) || strings.Index(fileName, "..") != -1 { 37 | yield("", errors.Errorf("model %q contains illegal file name %q -- it cannot be an absolute path, nor contain \"..\"", 38 | r.ID, fileName)) 39 | return 40 | } 41 | if !yield(fileName, nil) { 42 | return 43 | } 44 | } 45 | return 46 | } 47 | } 48 | 49 | // HasFile returns whether the repo has given fileName. 50 | // Notice fileName is relative to the repository, not in local disk. 51 | // 52 | // If the Repo hasn't downloaded its info yet, it attempts to download it here. 53 | // If it fails, it simply return false. 54 | // Call Repo.DownloadInfo to handle errors downloading the info. 55 | func (r *Repo) HasFile(fileName string) bool { 56 | if r.DownloadInfo(false) != nil { 57 | return false 58 | } 59 | for _, si := range r.info.Siblings { 60 | if si.Name == fileName { 61 | return true 62 | } 63 | } 64 | return false 65 | } 66 | 67 | // cleanRelativeFilePath sanitizes a file path by removing empty segments 68 | // and parent directory references ("..") for security reasons. 69 | func cleanRelativeFilePath(repoFileName string) string { 70 | // Convert to forward slashes and clean the path 71 | normalized := filepath.ToSlash(repoFileName) 72 | 73 | // Remove leading slash if present 74 | normalized = strings.TrimPrefix(normalized, "/") 75 | 76 | // Split into path components 77 | parts := strings.Split(normalized, "/") 78 | 79 | // Process parts to handle ".." components 80 | var stack []string 81 | for _, part := range parts { 82 | if part == "" || part == "." { 83 | continue 84 | } 85 | if part == ".." { 86 | if len(stack) > 0 { 87 | // Remove last element if we have something to pop 88 | stack = stack[:len(stack)-1] 89 | } 90 | continue 91 | } 92 | stack = append(stack, part) 93 | } 94 | 95 | if len(stack) == 0 { 96 | return "." 97 | } 98 | 99 | // Join with platform-specific separator 100 | return filepath.FromSlash(strings.Join(stack, "/")) 101 | } 102 | 103 | // DownloadFiles downloads the repository files (the names returned by repo.IterFileNames), and return the path to the 104 | // downloaded files in the cache structure. 105 | // 106 | // The returned downloadPaths can be read, but shouldn't be modified, since there may be other programs using the same 107 | // files. 108 | func (r *Repo) DownloadFiles(repoFiles ...string) (downloadedPaths []string, err error) { 109 | if len(repoFiles) == 0 { 110 | return nil, nil 111 | } 112 | 113 | // Create download manager, if one hasn't been created yet. 114 | downloadManager := r.getDownloadManager() 115 | 116 | // Get/create repoCacheDir. 117 | var repoCacheDir string 118 | repoCacheDir, err = r.repoCacheDir() 119 | if err != nil { 120 | return nil, err 121 | } 122 | _ = repoCacheDir 123 | 124 | // Get snapshot dir: 125 | snapshotDir, err := r.repoSnapshotsDir() 126 | if err != nil { 127 | return nil, err 128 | } 129 | 130 | // Create context to stop any downloading of files if any error occur. 
131 | // The deferred cancel both cleans up the context, and also stops any pending/ongoing 132 | // transfer that may be happening if an error occurs and the function exits. 133 | ctx, cancelFn := context.WithCancel(context.Background()) 134 | defer cancelFn() 135 | 136 | // Store results. 137 | downloadedPaths = make([]string, len(repoFiles)) 138 | 139 | // Information about download progress, and firstError to report back if needed. 140 | var downloadingMu sync.Mutex 141 | var firstError error 142 | var requireDownload int // number of files that require download (and are not in cache yet). 143 | perFileDownloaded := make([]uint64, len(repoFiles)) 144 | var allFilesDownloaded uint64 145 | var numDownloadedFiles int 146 | busyLoop := `-\|/` 147 | busyLoopPos := 0 148 | lastPrintTime := time.Now() 149 | 150 | // Print downloading progress. 151 | ratePrintFn := func() { 152 | if firstError == nil { 153 | fmt.Printf("\rDownloaded %d/%d files %c %s downloaded ", 154 | numDownloadedFiles, requireDownload, busyLoop[busyLoopPos], humanize.Bytes(allFilesDownloaded)) 155 | } else { 156 | fmt.Printf("\rDownloaded %d/%d files, %s downloaded: error - %v ", 157 | numDownloadedFiles, requireDownload, humanize.Bytes(allFilesDownloaded), 158 | firstError) 159 | } 160 | busyLoopPos = (busyLoopPos + 1) % len(busyLoop) 161 | lastPrintTime = time.Now() 162 | } 163 | 164 | // Report error for a download, and interrupt everyone. 165 | reportErrorFn := func(err error) { 166 | downloadingMu.Lock() 167 | if firstError == nil { 168 | firstError = err 169 | } 170 | cancelFn() 171 | downloadingMu.Unlock() 172 | return 173 | } 174 | 175 | // Loop over each file to download. 176 | var wg sync.WaitGroup 177 | for idxFile, repoFileName := range repoFiles { 178 | fileURL, err := r.FileURL(repoFileName) 179 | if err != nil { 180 | return nil, err 181 | } 182 | 183 | // Join the path parts of fileName using the current OS separator. 184 | relativeFilePath := cleanRelativeFilePath(repoFileName) 185 | if relativeFilePath == "." { 186 | return nil, errors.Errorf("invalid file name %q", repoFileName) 187 | } 188 | snapshotPath := path.Join(snapshotDir, relativeFilePath) 189 | downloadedPaths[idxFile] = snapshotPath // This is the file pointer we are returning. 190 | if files.Exists(snapshotPath) { 191 | // File already downloaded, skip. 192 | continue 193 | } 194 | 195 | // Create directory for this individual file. 196 | dir, _ := path.Split(snapshotPath) 197 | if err = os.MkdirAll(dir, DefaultDirCreationPerm); err != nil { 198 | return nil, errors.Wrapf(err, "while creating directory to download %q", snapshotPath) 199 | } 200 | 201 | // Start downloading in a separate goroutine. 202 | wg.Add(1) 203 | go func() { 204 | defer wg.Done() 205 | 206 | // Download header of file for safety checks, and so we can find the blobPath. 
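			// (The ETag returned in this header becomes the blob's file name under "blobs/", so identical
			// content is stored only once and shared by all snapshots that link to it.)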
207 | 			header, contentLength, err := downloadManager.FetchHeader(ctx, fileURL)
208 | 			if err != nil {
209 | 				reportErrorFn(err)
210 | 				return
211 | 			}
212 | 			metadata := extractFileMetadata(header, fileURL, contentLength)
213 | 			etag := metadata.ETag
214 | 			if etag == "" {
215 | 				reportErrorFn(errors.Errorf("resource %q for %q doesn't have an ETag, not able to ensure reproducibility",
216 | 					repoFileName, r.ID))
217 | 				return
218 | 			}
219 | 			if metadata.Location != fileURL {
220 | 				// A redirect would require dropping the authorization header before downloading the blob; that is not supported yet, so fail for safety.
221 | 				reportErrorFn(errors.Errorf("resource %q for %q has a redirect from %q to %q: this can be unsafe if we send our authorization token to the new URL",
222 | 					repoFileName, r.ID, fileURL, metadata.Location))
223 | 				return
224 | 			}
225 | 
226 | 			// blobPath: download only if it hasn't already been downloaded.
227 | 			blobPath := path.Join(repoCacheDir, "blobs", etag)
228 | 			if !files.Exists(blobPath) {
229 | 				requireDownload++ // This file requires download.
230 | 				err := r.lockedDownload(ctx, fileURL, blobPath, false, func(downloadedBytes, totalBytes int64) {
231 | 					// Execute at every report of download.
232 | 					downloadingMu.Lock()
233 | 					defer downloadingMu.Unlock()
234 | 					lastReportedBytes := perFileDownloaded[idxFile]
235 | 					newDownloaded := uint64(downloadedBytes) - lastReportedBytes
236 | 					allFilesDownloaded += newDownloaded
237 | 					perFileDownloaded[idxFile] = uint64(downloadedBytes)
238 | 					if r.Verbosity > 0 && time.Since(lastPrintTime) > time.Second {
239 | 						ratePrintFn()
240 | 					}
241 | 				})
242 | 				if err != nil {
243 | 					reportErrorFn(err)
244 | 					return
245 | 				}
246 | 
247 | 				// Done, print out progress.
248 | 				numDownloadedFiles++
249 | 				if r.Verbosity > 0 {
250 | 					ratePrintFn()
251 | 				}
252 | 			}
253 | 
254 | 			// Link blob file to snapshot.
255 | 			err = createSymLink(snapshotPath, blobPath)
256 | 			if err != nil {
257 | 				reportErrorFn(errors.WithMessagef(err, "while downloading %q from repository %q", repoFileName, r.ID))
258 | 			}
259 | 		}()
260 | 	}
261 | 	wg.Wait()
262 | 	if requireDownload > 0 {
263 | 		if r.Verbosity > 0 {
264 | 			if firstError != nil {
265 | 				fmt.Println()
266 | 			} else {
267 | 				fmt.Printf("\rDownloaded %d/%d files, %s downloaded                    \n",
268 | 					numDownloadedFiles, requireDownload, humanize.Bytes(allFilesDownloaded))
269 | 			}
270 | 		}
271 | 	}
272 | 	if firstError != nil {
273 | 		return nil, firstError
274 | 	}
275 | 	return downloadedPaths, nil
276 | }
277 | 
278 | // DownloadFile is a shortcut to DownloadFiles with only one file.
279 | func (r *Repo) DownloadFile(file string) (downloadedPath string, err error) {
280 | 	res, err := r.DownloadFiles(file)
281 | 	if err != nil {
282 | 		return "", err
283 | 	}
284 | 	return res[0], nil
285 | }
286 | 
287 | // fileMetadata holds the per-file metadata returned by HuggingFace Hub (extracted from the response headers).
288 | type fileMetadata struct { 289 | CommitHash, ETag, Location string 290 | Size int 291 | } 292 | 293 | func extractFileMetadata(header http.Header, url string, contentLength int64) (metadata fileMetadata) { 294 | metadata.CommitHash = header.Get(HeaderXRepoCommit) 295 | metadata.ETag = header.Get(HeaderXLinkedETag) 296 | if metadata.ETag == "" { 297 | metadata.ETag = header.Get("ETag") 298 | } 299 | metadata.ETag = removeQuotes(metadata.ETag) 300 | metadata.Location = header.Get("Location") 301 | if metadata.Location == "" { 302 | metadata.Location = url 303 | } 304 | 305 | if sizeStr := header.Get(HeaderXLinkedSize); sizeStr != "" { 306 | var err error 307 | metadata.Size, err = strconv.Atoi(sizeStr) 308 | if err != nil { 309 | metadata.Size = 0 310 | } 311 | } 312 | if metadata.Size == 0 { 313 | metadata.Size = int(contentLength) 314 | } 315 | return 316 | } 317 | 318 | func removeQuotes(str string) string { 319 | return strings.TrimRight(strings.TrimLeft(str, "\""), "\"") 320 | } 321 | 322 | // createSymlink creates a symbolic link named dst pointing to src, using a relative path if possible. 323 | // It removes previous link/file if it already exists. 324 | // 325 | // We use relative paths because: 326 | // * It's what `huggingface_hub` library does, and we want to keep things compatible. 327 | // * If the cache folder is moved or backed up, links won't break. 328 | // * Relative paths seem better handled on Windows -- although Windows is not yet fully supported for this package. 329 | // 330 | // Example layout: 331 | // 332 | // └── [ 128] snapshots 333 | // ├── [ 128] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f 334 | // │ ├── [ 52] README.md -> ../../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812 335 | // │ └── [ 76] pytorch_model.bin -> ../../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd 336 | func createSymLink(dst, src string) error { 337 | relLink, err := filepath.Rel(path.Dir(dst), src) 338 | if err != nil { 339 | relLink = src // Take the absolute path instead. 340 | } 341 | 342 | // Remove link/file if it already exists. 343 | err = os.Remove(dst) 344 | if err != nil && !errors.Is(err, os.ErrNotExist) { 345 | return errors.Wrapf(err, "failed to remove dst=%q before linking it to %q", dst, relLink) 346 | } 347 | 348 | if err = os.Symlink(relLink, dst); err != nil { 349 | return errors.Wrapf(err, "while symlink'ing %q to %q using %q", src, dst, relLink) 350 | } 351 | return nil 352 | } 353 | -------------------------------------------------------------------------------- /tokenizers/sentencepiece/private/protos/sentencepiece_model.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | syntax = "proto2"; 16 | 17 | // Unique package name to avoid conflicts: the proto library won't allow two different 18 | // packages to define the same proto (under the same namespace). 
19 | // This is broken, since that's what is needed ... a bad design from the ProtoBuf in Go. See more details here: 20 | // https://protobuf.dev/reference/go/faq/#namespace-conflict 21 | // So instead we change the proto namespace (package) name to globally unique package name: 22 | package com.github.gomlx.go_huggingface.sentencepiece; 23 | 24 | option go_package="github.com/gomlx/go-huggingface/tokenizers/sentencepiece/private/protos"; 25 | 26 | // TODO(taku): Needs to use LITE RUNTIME in OSS release. 27 | option optimize_for = LITE_RUNTIME; 28 | 29 | 30 | 31 | // TrainerSpec encodes a various parameters for SentencePiece training. 32 | // Next id: 55 33 | message TrainerSpec { 34 | /////////////////////////////////////////////////////////////////// 35 | // General parameters 36 | // 37 | // Input corpus files. 38 | // Trainer accepts the following two formats: 39 | // A) Monolingual: plain text, one sentence per line. 40 | // B) Bilingual: TSV, source sentence target sentence 41 | // When bilingual data is passed, shared vocabulary model is built. 42 | // Note that the input file must be raw corpus, not a preprocessed corpus. 43 | // Trainer only loads the first `input_sentence_size` sentences specified 44 | // with this parameter. 45 | repeated string input = 1; 46 | 47 | // Input corpus format: 48 | // "text": one-sentence-per-line text format (default) 49 | // "tsv": sentence freq 50 | optional string input_format = 7; 51 | 52 | // Output model file prefix. 53 | // .model and .vocab are generated. 54 | optional string model_prefix = 2; 55 | 56 | // Model type. only have UNIGRAM now. 57 | enum ModelType { 58 | UNIGRAM = 1; // Unigram language model with dynamic algorithm 59 | BPE = 2; // Byte Pair Encoding 60 | WORD = 3; // Delimitered by whitespace. 61 | CHAR = 4; // tokenizes into character sequence 62 | } 63 | optional ModelType model_type = 3 [default = UNIGRAM]; 64 | 65 | // Vocabulary size. 8k is the default size. 66 | optional int32 vocab_size = 4 [default = 8000]; 67 | 68 | // List of the languages this model can accept. 69 | // Since the model is language-agnostic, this field is used as a reference. 70 | repeated string accept_language = 5; 71 | 72 | // Size of self-test samples, which are encoded in the model file. 73 | optional int32 self_test_sample_size = 6 [default = 0]; 74 | 75 | // Whether to use DP version of sentencepiece. Use it with TSV input format 76 | // (requires precomputed word tab counts to work). 77 | optional bool enable_differential_privacy = 50 [default = false]; 78 | // Set these parameters if you need DP version of sentencepiece. 79 | // std of noise to add. 80 | optional float differential_privacy_noise_level = 51 [default = 0.0]; 81 | // Clipping threshold to apply after adding noise. All the words with 82 | // frequency less than this value are dropped. 83 | optional uint64 differential_privacy_clipping_threshold = 52 [default = 0]; 84 | 85 | /////////////////////////////////////////////////////////////////// 86 | // Training parameters. 87 | // 88 | // Uses characters which cover the corpus with the ratio of `chars_coverage`. 89 | // This parameter determines the set of basic Alphabet of sentence piece. 90 | // 1.0 - `chars_coverage` characters are treated as UNK. 91 | // See also required_chars field. 92 | optional float character_coverage = 10 [default = 0.9995]; 93 | 94 | // Maximum size of sentences the trainer loads from `input` parameter. 95 | // Trainer simply loads the `input` files in sequence. 
96 | // It is better to shuffle the input corpus randomly. 97 | optional uint64 input_sentence_size = 11 [default = 0]; 98 | optional bool shuffle_input_sentence = 19 [default = true]; 99 | 100 | // Maximum size of sentences to make seed sentence pieces. 101 | // Extended suffix array is constructed to extract frequent 102 | // sub-strings from the corpus. This uses 20N working space, 103 | // where N is the size of corpus. 104 | optional int32 mining_sentence_size = 12 [deprecated = true]; 105 | 106 | // Maximum size of sentences to train sentence pieces. 107 | optional int32 training_sentence_size = 13 [deprecated = true]; 108 | 109 | // The size of seed sentencepieces. 110 | // `seed_sentencepiece_size` must be larger than `vocab_size`. 111 | optional int32 seed_sentencepiece_size = 14 [default = 1000000]; 112 | 113 | // In every EM sub-iterations, keeps top 114 | // `shrinking_factor` * `current sentencepieces size` with respect to 115 | // the loss of the sentence piece. This value should be smaller than 1.0. 116 | optional float shrinking_factor = 15 [default = 0.75]; 117 | 118 | // The maximum sentence length in byte. The sentences with the length 119 | // larger than `max_sentence_length` is simply ignored. 120 | // Longer input tends to bring the following risks: 121 | // * Overflow during EM training (unigram language model only) 122 | // * Performance drop because of O(n log n) cost in BPE. 123 | optional int32 max_sentence_length = 18 [default = 4192]; 124 | 125 | // Number of threads in the training. 126 | optional int32 num_threads = 16 [default = 16]; 127 | 128 | // Number of EM sub iterations. 129 | optional int32 num_sub_iterations = 17 [default = 2]; 130 | 131 | /////////////////////////////////////////////////////////////////// 132 | // SentencePiece parameters which control the shapes of sentence piece. 133 | // 134 | // Maximum length of sentencepiece. 135 | optional int32 max_sentencepiece_length = 20 [default = 16]; 136 | 137 | // Uses Unicode script to split sentence pieces. 138 | // When `split_by_unicode_script` is true, we do not allow sentence piece to 139 | // include multiple Unicode scripts, e.g. "F1" is not a valid piece. 140 | // Exception: CJ characters (Hiragana/Katakana/Han) are all handled 141 | // as one script type, since Japanese word can consist of multiple scripts. 142 | // This exception is always applied regardless of the accept-language 143 | // parameter. 144 | optional bool split_by_unicode_script = 21 [default = true]; 145 | 146 | // When `split_by_number` is true, put a boundary between number and 147 | // non-number transition. If we want to treat "F1" is one token, set this flag 148 | // to be false. 149 | optional bool split_by_number = 23 [default = true]; 150 | 151 | // Use a white space to split sentence pieces. 152 | // When `split_by_whitespace` is false, we may have the piece containing 153 | // a white space in the middle. e.g., "in_the". 154 | optional bool split_by_whitespace = 22 [default = true]; 155 | 156 | // Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello => 157 | // hello_. When `treat_whitespace_as_suffix` is true, 158 | // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end 159 | // of sentence. 160 | optional bool treat_whitespace_as_suffix = 24 [default = false]; 161 | 162 | // Allows pieces that only contain whitespaces instead of appearing only as 163 | // prefix or suffix of other pieces. 
164 | optional bool allow_whitespace_only_pieces = 26 [default = false]; 165 | 166 | // Split all digits (0-9) into separate pieces. 167 | optional bool split_digits = 25 [default = false]; 168 | 169 | // Defines the pre-tokenization delimiter. 170 | // When specified, no pieces crossing this delimiter is not included 171 | // in the vocab. Then the delimiter string is virtually ignored 172 | // during the training. This field can allows constraints on the vocabulary 173 | // selection. Note that this field is available on unigram mode. 174 | optional string pretokenization_delimiter = 53 [ default = ""]; 175 | 176 | /////////////////////////////////////////////////////////////////// 177 | // Vocabulary management 178 | // 179 | // Defines control symbols used as an indicator to 180 | // change the behavior of the decoder. and are pre-defined. 181 | // We can use this field to encode various meta information, 182 | // including language indicator in multilingual model. 183 | // These symbols are not visible to users, but visible to 184 | // the decoder. Note that when the input sentence contains control symbols, 185 | // they are not treated as one token, but segmented into normal pieces. 186 | // Control symbols must be inserted independently from the segmentation. 187 | repeated string control_symbols = 30; 188 | 189 | // Defines user defined symbols. 190 | // These symbols are added with extremely high score 191 | // so they are always treated as one unique symbol in any context. 192 | // Typical usage of user_defined_symbols is placeholder for named entities. 193 | repeated string user_defined_symbols = 31; 194 | 195 | // Defines required characters. Each UTF8 character in this string is included 196 | // in the character set regardless of character_coverage value. Unlike 197 | // user_defined_symbols, these characters have scores based on the frequency 198 | // on input sentences, and the model can form subwords using characters 199 | // in this field. 200 | optional string required_chars = 36; 201 | 202 | // Decomposes unknown pieces into UTF-8 bytes. 203 | optional bool byte_fallback = 35 [default = false]; 204 | 205 | // When creating the vocabulary file, defines whether or not to additionally 206 | // output the score for each piece. 207 | optional bool vocabulary_output_piece_score = 32 [default = true]; 208 | 209 | // `vocab_size` is treated as hard limit. Crash if 210 | // the model can not produce the vocab of size `vocab_size`, 211 | // When `hard_vocab_limit` is false, vocab_size is treated 212 | // as soft limit. Note that when model_type=char, 213 | // always assumes hard_vocab_limit = false. 214 | optional bool hard_vocab_limit = 33 [default = true]; 215 | 216 | // use all symbols for vocab extraction. This flag is valid 217 | // if model type is either CHAR or WORD 218 | optional bool use_all_vocab = 34 [default = false]; 219 | 220 | /////////////////////////////////////////////////////////////////// 221 | // Reserved special meta tokens. 222 | // * -1 is not used. 223 | // * unk_id must not be -1. 224 | // Id must starts with 0 and be contigous. 
225 | optional int32 unk_id = 40 [default = 0]; // 226 | optional int32 bos_id = 41 [default = 1]; // 227 | optional int32 eos_id = 42 [default = 2]; // 228 | optional int32 pad_id = 43 [default = -1]; // (padding) 229 | optional string unk_piece = 45 [default = ""]; 230 | optional string bos_piece = 46 [default = ""]; 231 | optional string eos_piece = 47 [default = ""]; 232 | optional string pad_piece = 48 [default = ""]; 233 | 234 | // Encodes into U+2047 (DOUBLE QUESTION MARK), 235 | // since this character can be useful both for user and 236 | // developer. We can easily figure out that is emitted. 237 | optional string unk_surface = 44 [default = " \xE2\x81\x87 "]; 238 | 239 | // Increase bit depth to allow unigram model training on large 240 | // (>10M sentences) corpora. A Side-effect of enabling this flag 241 | // is increased memory usage. 242 | optional bool train_extremely_large_corpus = 49 [default = false]; 243 | 244 | // Path to a seed sentencepieces file, with one tab-separated 245 | // seed sentencepiece frequency per line. 246 | optional string seed_sentencepieces_file = 54 [default = ""]; 247 | 248 | // Customized extensions: the range of field numbers 249 | // are open to third-party extensions. 250 | extensions 200 to max; 251 | } 252 | 253 | // NormalizerSpec encodes a various parameters for string normalizaiton 254 | message NormalizerSpec { 255 | // name of normalization rule. 256 | optional string name = 1; 257 | 258 | // Pre-compiled normalization rule created by 259 | // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method. 260 | // Usually this field is set by Builder::GetNormalizerSpec() method. 261 | optional bytes precompiled_charsmap = 2; 262 | 263 | // Adds dummy whitespace at the beginning of text in order to 264 | // treat "world" in "world" and "hello world" in the same way. 265 | optional bool add_dummy_prefix = 3 [default = true]; 266 | 267 | // Removes leading, trailing, and duplicate internal whitespace. 268 | optional bool remove_extra_whitespaces = 4 [default = true]; 269 | 270 | // Replaces whitespace with meta symbol. 271 | // This field must be true to train sentence piece model. 272 | optional bool escape_whitespaces = 5 [default = true]; 273 | 274 | // Custom normalization rule file in TSV format. 275 | // https://github.com/google/sentencepiece/blob/master/doc/normalization.md 276 | // This field is only used in SentencePieceTrainer::Train() method, which 277 | // compiles the rule into the binary rule stored in `precompiled_charsmap`. 278 | optional string normalization_rule_tsv = 6; 279 | 280 | // Customized extensions: the range of field numbers 281 | // are open to third-party extensions. 282 | extensions 200 to max; 283 | } 284 | 285 | // Proto to store samples for self-testing. 286 | message SelfTestData { 287 | message Sample { 288 | optional string input = 1; 289 | optional string expected = 2; 290 | } 291 | repeated Sample samples = 1; 292 | 293 | // Customized extensions: the range of field numbers 294 | // are open to third-party extensions. 295 | extensions 200 to max; 296 | } 297 | 298 | // ModelProto stores model parameters. 299 | // SentencePieceProcessor is supposed to be self-contained. 300 | // All settings/parameters which may change the behavior must be encoded 301 | // in ModelProto. 302 | message ModelProto { 303 | message SentencePiece { 304 | enum Type { 305 | NORMAL = 1; // normal symbol 306 | UNKNOWN = 2; // unknown symbol. only for now. 307 | CONTROL = 3; // control symbols. , , <2ja> etc. 
308 | USER_DEFINED = 4; // user defined symbols. 309 | // Typical usage of USER_DEFINED symbol 310 | // is placeholder. 311 | BYTE = 6; // byte symbols. Used when `byte_fallback` is true. 312 | UNUSED = 5; // this piece is not used. 313 | } 314 | optional string piece = 1; // piece must not be empty. 315 | optional float score = 2; 316 | optional Type type = 3 [default = NORMAL]; 317 | 318 | // Customized extensions: the range of field numbers 319 | // are open to third-party extensions. 320 | extensions 200 to max; 321 | } 322 | 323 | // Sentence pieces with scores. 324 | repeated SentencePiece pieces = 1; 325 | 326 | // Spec used to generate this model file. 327 | optional TrainerSpec trainer_spec = 2; 328 | 329 | // Spec for text normalization. 330 | optional NormalizerSpec normalizer_spec = 3; 331 | 332 | // Stores sample input and its expected segmentation to verify the model. 333 | optional SelfTestData self_test_data = 4; 334 | 335 | // Spec for text de-normalization. 336 | optional NormalizerSpec denormalizer_spec = 5; 337 | 338 | // Customized extensions: the range of field numbers 339 | // are open to third-party extensions. 340 | extensions 200 to max; 341 | } 342 | -------------------------------------------------------------------------------- /go-huggingface.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f64e2dff-097a-4e59-a53a-773d1c2356da", 6 | "metadata": {}, 7 | "source": [ 8 | "# `go-huggingface` Demo\n", 9 | "\n", 10 | "This demo shows how to download files and create tokenizers from HuggingFace models." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "d36287c1-e5e0-4985-83ac-1f40e8b850c7", 16 | "metadata": {}, 17 | "source": [ 18 | "## Imports and `go work` setup\n", 19 | "\n", 20 | "This is used during development, to instruct the Notebook kernel [gonb](https://github.com/janpfeifer/gonb) to use the local version of the libraries." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "id": "8941d23e-19f2-4cb4-8538-b6acecfdba61", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/markdown": [ 32 | "## GoNB version: `v0.11.1`\n", 33 | "\n", 34 | "### Build Info\n", 35 | "- Go version: go1.25.3 (OS: linux, Arch: amd64)\n" 36 | ] 37 | }, 38 | "metadata": {}, 39 | "output_type": "display_data" 40 | }, 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "\t- Replace rule for module \"github.com/gomlx/gomlx\" to local directory \"/home/janpf/Projects/gomlx\" already exists.\n", 46 | "\t- Added replace rule for module \"github.com/gomlx/onnx-gomlx\" to local directory \"/home/janpf/Projects/onnx-gomlx\".\n", 47 | "\t- Added replace rule for module \"github.com/gomlx/gemma\" to local directory \"/home/janpf/Projects/gemma\".\n", 48 | "\t- Added replace rule for module \"github.com/gomlx/go-huggingface\" to local directory \"/home/janpf/Projects/go-huggingface\".\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "%version\n", 54 | "!*rm -f go.work && go work init\n", 55 | "!*go work use . 
\"${HOME}/Projects/gomlx\" \"${HOME}/Projects/go-huggingface\" \"${HOME}/Projects/gemma\" \"${HOME}/Projects/onnx-gomlx\"\n", 56 | "%goworkfix" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 2, 62 | "id": "fe1baf48-95fd-4e8c-94a5-e09f1a723559", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "import (\n", 67 | " \"github.com/janpfeifer/must\"\n", 68 | " \"github.com/gomlx/go-huggingface/hub\"\n", 69 | " \"github.com/gomlx/go-huggingface/tokenizers\"\n", 70 | ")" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "8bc02638-f04d-4fd6-a199-8cb5fec99600", 76 | "metadata": {}, 77 | "source": [ 78 | "## Download `tokenizer_config.json` and enumerate `tokenizer_class` for several models" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 3, 84 | "id": "43f3d2af-29b2-488e-97c4-22942dcecab2", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "var (\n", 89 | " // HuggingFace authentication token read from environment.\n", 90 | " // It can be created in https://huggingface.co\n", 91 | " // Some files may require it for downloading.\n", 92 | " hfAuthToken = os.Getenv(\"HF_TOKEN\")\n", 93 | "\n", 94 | " // Model ids for testing.\n", 95 | " hfModelIDs = []string{\n", 96 | " \"google/gemma-2-2b-it\",\n", 97 | " \"sentence-transformers/all-MiniLM-L6-v2\",\n", 98 | " \"protectai/deberta-v3-base-zeroshot-v1-onnx\",\n", 99 | " \"KnightsAnalytics/distilbert-base-uncased-finetuned-sst-2-english\",\n", 100 | " \"KnightsAnalytics/distilbert-NER\",\n", 101 | " \"KnightsAnalytics/all-MiniLM-L6-v2\",\n", 102 | " \"SamLowe/roberta-base-go_emotions-onnx\",\n", 103 | " }\n", 104 | ")" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "id": "dd4f535a-cfd7-4e10-9660-78179caa949b", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "\n", 118 | "google/gemma-2-2b-it:\n", 119 | "\t.gitattributes\n", 120 | "\tREADME.md\n", 121 | "\tconfig.json\n", 122 | "\tgeneration_config.json\n", 123 | "\tmodel-00001-of-00002.safetensors\n", 124 | "\tmodel-00002-of-00002.safetensors\n", 125 | "\tmodel.safetensors.index.json\n", 126 | "\tspecial_tokens_map.json\n", 127 | "\ttokenizer.json\n", 128 | "\ttokenizer.model\n", 129 | "\ttokenizer_config.json\n", 130 | "\n", 131 | "sentence-transformers/all-MiniLM-L6-v2:\n", 132 | "\t.gitattributes\n", 133 | "\t1_Pooling/config.json\n", 134 | "\tREADME.md\n", 135 | "\tconfig.json\n", 136 | "\tconfig_sentence_transformers.json\n", 137 | "\tdata_config.json\n", 138 | "\tmodel.safetensors\n", 139 | "\tmodules.json\n", 140 | "\tonnx/model.onnx\n", 141 | "\tonnx/model_O1.onnx\n", 142 | "\tonnx/model_O2.onnx\n", 143 | "\tonnx/model_O3.onnx\n", 144 | "\tonnx/model_O4.onnx\n", 145 | "\tonnx/model_qint8_arm64.onnx\n", 146 | "\tonnx/model_qint8_avx512.onnx\n", 147 | "\tonnx/model_qint8_avx512_vnni.onnx\n", 148 | "\tonnx/model_quint8_avx2.onnx\n", 149 | "\topenvino/openvino_model.bin\n", 150 | "\topenvino/openvino_model.xml\n", 151 | "\topenvino/openvino_model_qint8_quantized.bin\n", 152 | "\topenvino/openvino_model_qint8_quantized.xml\n", 153 | "\tpytorch_model.bin\n", 154 | "\trust_model.ot\n", 155 | "\tsentence_bert_config.json\n", 156 | "\tspecial_tokens_map.json\n", 157 | "\ttf_model.h5\n", 158 | "\ttokenizer.json\n", 159 | "\ttokenizer_config.json\n", 160 | "\ttrain_script.py\n", 161 | "\tvocab.txt\n", 162 | "\n", 163 | "protectai/deberta-v3-base-zeroshot-v1-onnx:\n", 164 | "\t.gitattributes\n", 165 | 
"\tREADME.md\n", 166 | "\tadded_tokens.json\n", 167 | "\tconfig.json\n", 168 | "\tmerges.txt\n", 169 | "\tmodel.onnx\n", 170 | "\tspecial_tokens_map.json\n", 171 | "\tspm.model\n", 172 | "\ttokenizer.json\n", 173 | "\ttokenizer_config.json\n", 174 | "\tvocab.json\n", 175 | "\n", 176 | "KnightsAnalytics/distilbert-base-uncased-finetuned-sst-2-english:\n", 177 | "\t.gitattributes\n", 178 | "\tadded_tokens.json\n", 179 | "\tconfig.json\n", 180 | "\tmodel.onnx\n", 181 | "\tspecial_tokens_map.json\n", 182 | "\ttokenizer.json\n", 183 | "\ttokenizer_config.json\n", 184 | "\tvocab.txt\n", 185 | "\n", 186 | "KnightsAnalytics/distilbert-NER:\n", 187 | "\t.gitattributes\n", 188 | "\tconfig.json\n", 189 | "\tmodel.onnx\n", 190 | "\tspecial_tokens_map.json\n", 191 | "\ttokenizer.json\n", 192 | "\ttokenizer_config.json\n", 193 | "\tvocab.txt\n", 194 | "\n", 195 | "KnightsAnalytics/all-MiniLM-L6-v2:\n", 196 | "\t.gitattributes\n", 197 | "\tconfig.json\n", 198 | "\tdata_config.json\n", 199 | "\tmodel.onnx\n", 200 | "\tmodules.json\n", 201 | "\tspecial_tokens_map.json\n", 202 | "\ttokenizer.json\n", 203 | "\ttokenizer_config.json\n", 204 | "\tvocab.txt\n", 205 | "\n", 206 | "SamLowe/roberta-base-go_emotions-onnx:\n", 207 | "\t.gitattributes\n", 208 | "\tREADME.md\n", 209 | "\tconfig.json\n", 210 | "\tmerges.txt\n", 211 | "\tonnx/config.json\n", 212 | "\tonnx/merges.txt\n", 213 | "\tonnx/model.onnx\n", 214 | "\tonnx/model_quantized.onnx\n", 215 | "\tonnx/ort_config.json\n", 216 | "\tonnx/ort_config_quantized.json\n", 217 | "\tonnx/special_tokens_map.json\n", 218 | "\tonnx/tokenizer.json\n", 219 | "\tonnx/tokenizer_config.json\n", 220 | "\tonnx/vocab.json\n", 221 | "\tspecial_tokens_map.json\n", 222 | "\ttokenizer.json\n", 223 | "\ttokenizer_config.json\n", 224 | "\ttrainer_state.json\n", 225 | "\tvocab.json\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "%%\n", 231 | "for _, modelID := range hfModelIDs {\n", 232 | " fmt.Printf(\"\\n%s:\\n\", modelID)\n", 233 | " repo := hub.New(modelID).WithAuth(hfAuthToken)\n", 234 | " for fileName, err := range repo.IterFileNames() {\n", 235 | " if err != nil { panic(err) }\n", 236 | " fmt.Printf(\"\\t%s\\n\", fileName)\n", 237 | " }\n", 238 | "}" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 5, 244 | "id": "80f9eaee-7507-4921-bccd-b8dbcd8bf86a", 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "\n", 252 | "google/gemma-2-2b-it:\n", 253 | "\ttokenizer_class=GemmaTokenizer\n", 254 | "\n", 255 | "sentence-transformers/all-MiniLM-L6-v2:\n", 256 | "\ttokenizer_class=BertTokenizer\n", 257 | "\n", 258 | "protectai/deberta-v3-base-zeroshot-v1-onnx:\n", 259 | "\ttokenizer_class=DebertaV2Tokenizer\n", 260 | "\n", 261 | "KnightsAnalytics/distilbert-base-uncased-finetuned-sst-2-english:\n", 262 | "\ttokenizer_class=DistilBertTokenizer\n", 263 | "\n", 264 | "KnightsAnalytics/distilbert-NER:\n", 265 | "\ttokenizer_class=DistilBertTokenizer\n", 266 | "\n", 267 | "KnightsAnalytics/all-MiniLM-L6-v2:\n", 268 | "\ttokenizer_class=BertTokenizer\n", 269 | "\n", 270 | "SamLowe/roberta-base-go_emotions-onnx:\n", 271 | "\ttokenizer_class=RobertaTokenizer\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "%%\n", 277 | "for _, modelID := range hfModelIDs {\n", 278 | " fmt.Printf(\"\\n%s:\\n\", modelID)\n", 279 | " repo := hub.New(modelID).WithAuth(hfAuthToken)\n", 280 | " config := must.M1(tokenizers.GetConfig(repo))\n", 281 | " fmt.Printf(\"\\ttokenizer_class=%s\\n\", 
config.TokenizerClass)\n", 282 | "}" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "id": "3c0a7ae7-ace6-4675-873b-0336efa3c68a", 288 | "metadata": {}, 289 | "source": [ 290 | "## Create a Tokenizer\n", 291 | "\n", 292 | "### Go-only SentencePiece tokenizer:" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 6, 298 | "id": "f37e14c8-8321-40ac-b87f-1d4a222d6123", 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "Sentence:\tThe book is on the table.\n", 306 | "Tokens: \t[651 2870 603 611 573 3037 235265]\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "var sentence = \"The book is on the table.\"\n", 312 | "\n", 313 | "%%\n", 314 | "repo := hub.New(\"google/gemma-2-2b-it\").WithAuth(hfAuthToken)\n", 315 | "tokenizer := must.M1(tokenizers.New(repo))\n", 316 | "tokens := tokenizer.Encode(sentence)\n", 317 | "fmt.Printf(\"Sentence:\\t%s\\n\", sentence)\n", 318 | "fmt.Printf(\"Tokens: \\t%v\\n\", tokens)\n" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "id": "cd2f6792-2e9b-427d-a2c6-316038624349", 324 | "metadata": {}, 325 | "source": [ 326 | "### Rust based [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers) tokenizer\n", 327 | "\n", 328 | "For most tokenizers in HuggingFace though, there is no Go-only version yet, and for now we use the [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers), which is based on a fast tokenizer written in Rust.\n", 329 | "\n", 330 | "It requires installation of the built Rust library though, see [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers) on how to install it, they provide prebuilt binaries.\n", 331 | "\n", 332 | "> **Note**: `daulet/tokenizers` also provides a simple downloader, so `go-huggingface` is not strictly necessary -- if you don't want the extra dependency and only need the tokenizer, you don't need to use it. `go-huggingface` helps by allowing also downloading other files (models, datasets), and a shared cache across different projects and `huggingface-hub` (the python downloader library)." 
333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 13, 338 | "id": "cd706316-ef19-4f25-92dc-a1283af8987d", 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "Sentence:\tThe book is on the table.\n", 346 | "Tokens: \t[101 1996 2338 2003 2006 1996 2795 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n" 347 | ] 348 | } 349 | ], 350 | "source": [ 351 | "import dtok \"github.com/daulet/tokenizers\"\n", 352 | "\n", 353 | "%%\n", 354 | "modelID := \"KnightsAnalytics/all-MiniLM-L6-v2\"\n", 355 | "repo := hub.New(modelID).WithAuth(hfAuthToken)\n", 356 | "localFile := must.M1(repo.DownloadFile(\"tokenizer.json\"))\n", 357 | "tokenizer := must.M1(dtok.FromFile(localFile))\n", 358 | "defer tokenizer.Close()\n", 359 | "tokens, _ := tokenizer.Encode(sentence, true)\n", 360 | "\n", 361 | "fmt.Printf(\"Sentence:\\t%s\\n\", sentence)\n", 362 | "fmt.Printf(\"Tokens: \\t%v\\n\", tokens)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "id": "012cda0f-5ed7-418b-a7eb-3de5040a7e2c", 368 | "metadata": {}, 369 | "source": [ 370 | "## Convert ONNX model" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 10, 376 | "id": "7e7751da-e53f-47c6-a7d6-e6b760f95417", 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "name": "stdout", 381 | "output_type": "stream", 382 | "text": [ 383 | "Downloaded 1/1 files, 90 MB downloaded \n", 384 | "Sentences: \t[\"This is an example sentence\" \"Each sentence is converted\"]\n", 385 | "Embeddings:\t[2][7][384]float32{\n", 386 | " {{0.0365, -0.01629, 0.1682, ..., 0.05536, -0.1644, -0.2967},\n", 387 | " {0.7237, 0.6396, 0.1882, ..., 0.5939, 0.6204, 0.4902},\n", 388 | " {0.006478, 0.02025, 0.04475, ..., 0.3469, 1.317, -0.1669},\n", 389 | " ...,\n", 390 | " {0.1479, -0.06461, 0.1457, ..., 0.8841, -0.3322, 0.2979},\n", 391 | " {0.5212, 0.6562, 0.5608, ..., -0.03991, 0.04111, -1.404},\n", 392 | " {1.082, 0.7136, 0.3983, ..., -0.2299, 0.3247, -1.031}},\n", 393 | " {{0.28, 0.1164, -0.04185, ..., 0.2709, -0.1684, -0.2962},\n", 394 | " {0.8734, 0.454, -0.1082, ..., 0.1364, 0.4583, -0.2042},\n", 395 | " {0.4748, 0.5727, 0.6301, ..., 0.6525, 0.5614, -1.327},\n", 396 | " ...,\n", 397 | " {0.6108, 0.792, -0.4682, ..., 0.08599, 1.059, -0.2985},\n", 398 | " {0.4115, 1.094, 0.2389, ..., 0.8984, 0.3688, -0.7335},\n", 399 | " {0.1356, 0.5588, 0.2701, ..., 0.5426, 0.4699, -0.5305}}}\n" 400 | ] 401 | } 402 | ], 403 | "source": [ 404 | "import (\n", 405 | " \"github.com/gomlx/onnx-gomlx/onnx\"\n", 406 | " \"github.com/gomlx/gomlx/pkg/core/graph\"\n", 407 | " \"github.com/gomlx/gomlx/pkg/ml/context\"\n", 408 | " \"github.com/gomlx/gomlx/backends\"\n", 409 | " _ \"github.com/gomlx/gomlx/backends/default\"\n", 410 | ")\n", 411 | "\n", 412 | "%%\n", 413 | "// Get ONNX model.\n", 414 | "repo := hub.New(\"sentence-transformers/all-MiniLM-L6-v2\").WithAuth(hfAuthToken)\n", 415 | "onnxFilePath, err := repo.DownloadFile(\"onnx/model.onnx\")\n", 416 | "if err != nil { panic(err) }\n", 417 | "onnxModel, err := onnx.ReadFile(onnxFilePath)\n", 418 | "if err != nil { panic(err) }\n", 419 | "\n", 420 | "// Convert ONNX variables to GoMLX context (which stores variables):\n", 421 | "ctx := context.New()\n", 422 | "err = onnxModel.VariablesToContext(ctx)\n", 423 | "if err != nil { 
panic(err) }\n", 424 | "\n", 425 | "sentences := []string{\n", 426 | " \"This is an example sentence\", \n", 427 | " \"Each sentence is converted\"}\n", 428 | "inputIDs := [][]int64{\n", 429 | " {101, 2023, 2003, 2019, 2742, 6251, 102},\n", 430 | " { 101, 2169, 6251, 2003, 4991, 102, 0}}\n", 431 | "tokenTypeIDs := [][]int64{\n", 432 | " {0, 0, 0, 0, 0, 0, 0},\n", 433 | " {0, 0, 0, 0, 0, 0, 0}}\n", 434 | "attentionMask := [][]int64{\n", 435 | " {1, 1, 1, 1, 1, 1, 1},\n", 436 | " {1, 1, 1, 1, 1, 1, 0}}\n", 437 | "embeddings := context.MustExecOnce(\n", 438 | " backends.MustNew(), ctx, \n", 439 | " func (ctx *context.Context, inputs []*graph.Node) *graph.Node {\n", 440 | " modelOutputs := onnxModel.CallGraph(ctx, inputs[0].Graph(), map[string]*graph.Node{\n", 441 | " \"input_ids\": inputs[0],\n", 442 | " \"attention_mask\": inputs[1],\n", 443 | " \"token_type_ids\": inputs[2]})\n", 444 | " return modelOutputs[0]\n", 445 | " }, inputIDs, attentionMask, tokenTypeIDs)\n", 446 | "\n", 447 | "fmt.Printf(\"Sentences: \\t%q\\n\", sentences)\n", 448 | "fmt.Printf(\"Embeddings:\\t%s\\n\", embeddings)\n" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "id": "f4ee266a-f0ee-48e0-ad2d-a5bd85a47043", 454 | "metadata": {}, 455 | "source": [ 456 | "## Download Dataset Files\n", 457 | "\n", 458 | "We are going to use the [HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) as an example, download one of its sample files (~2.5Gb of data) and parse the `.parquet` file.\n", 459 | "\n", 460 | "### Structure of file\n", 461 | "First we define the structure of each entry, with the tags for the Parquet parser:" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 11, 467 | "id": "3963f645-a63b-43da-9c9d-3340a330fca7", 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "var (\n", 472 | " FineWebID = \"HuggingFaceFW/fineweb\"\n", 473 | " FineWebSampleFile = \"sample/10BT/000_00000.parquet\"\n", 474 | ")\n", 475 | "\n", 476 | "// FineWebEntry: inspection of fields in parque file done with tool in \n", 477 | "// github.com/xitongsys/parquet-go/tool/parquet-tools.\n", 478 | "//\n", 479 | "// The parquet annotations are described in: https://pkg.go.dev/github.com/parquet-go/parquet-go#SchemaOf\n", 480 | "type FineWebEntry struct {\n", 481 | " Text string `parquet:\"text,snappy\"`\n", 482 | " ID string `parquet:\"id,snappy\"`\n", 483 | " Dump string `parquet:\"dump,snappy\"`\n", 484 | " URL string `parquet:\"url,snappy\"`\n", 485 | " Score float64 `parquet:\"language_score\"`\n", 486 | "}\n", 487 | "\n", 488 | "// TrimString returns s trimmed to at most maxLength runes. If trimmed it appends \"…\" at the end.\n", 489 | "func TrimString(s string, maxLength int) string {\n", 490 | " if utf8.RuneCountInString(s) <= maxLength {\n", 491 | " return s\n", 492 | " }\n", 493 | " runes := []rune(s)\n", 494 | " return string(runes[:maxLength-1]) + \"…\"\n", 495 | "}" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "id": "6a0c90c8-ba8b-4182-92f2-4f7921f8a4f6", 501 | "metadata": {}, 502 | "source": [ 503 | "### Read the Parquet\n", 504 | "\n", 505 | "Using the library [github.com/parquet-go/parquet-go](https://github.com/parquet-go/parquet-go)." 
506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 12, 511 | "id": "bc2f3084-05fd-450b-b939-9095234fb225", 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "name": "stdout", 516 | "output_type": "stream", 517 | "text": [ 518 | "Downloaded 1/1 files, 2.1 GB downloaded \n", 519 | "10 rows read\n", 520 | "Row 0:\tScore=0.823 Text=[\"|Viewing Single Post From: Spoilers for the Week …\"], URL=[http://daytimeroyaltyonline.com/single/…]\n", 521 | "Row 1:\tScore=0.974 Text=[\"*sigh* Fundamentalist community, let me pass on s…\"], URL=[http://endogenousretrovirus.blogspot.co…]\n", 522 | "Row 2:\tScore=0.873 Text=[\"A novel two-step immunotherapy approach has shown…\"], URL=[http://news.cancerconnect.com/]\n", 523 | "Row 3:\tScore=0.932 Text=[\"Free the Cans! Working Together to Reduce Waste\\nI…\"], URL=[http://sharingsolution.com/2009/05/23/f…]\n", 524 | "Row 4:\tScore=0.955 Text=[\"ORLANDO, Fla. — While the Rapid Recall Exchange, …\"], URL=[http://supermarketnews.com/food-safety/…]\n", 525 | "Row 5:\tScore=0.954 Text=[\"September 28, 2010\\n2010 Season - Bowman pulls dow…\"], URL=[http://www.augustana.edu/x22236.xml]\n", 526 | "Row 6:\tScore=0.967 Text=[\"Kraft Foods has taken the Cadbury chocolate brand…\"], URL=[http://www.fdin.org.uk/2012/01/kraft-la…]\n", 527 | "Row 7:\tScore=0.874 Text=[\"You must be a registered member to view this page…\"], URL=[http://www.golivewire.com/forums/profil…]\n", 528 | "Row 8:\tScore=0.912 Text=[\"|Facility Type:||Full Service Restaurant|\\n|Inspec…\"], URL=[http://www.healthspace.com/Clients/VDH/…]\n", 529 | "Row 9:\tScore=0.925 Text=[\"News of the Week\\nBarrie Spring Studio Tour\\nApril …\"], URL=[http://www.jillpricestudios.ca/artist/w…]\n" 530 | ] 531 | } 532 | ], 533 | "source": [ 534 | "import (\n", 535 | " parquet \"github.com/parquet-go/parquet-go\"\n", 536 | ")\n", 537 | "\n", 538 | "%%\n", 539 | "// Download repo file.\n", 540 | "repo := hub.New(FineWebID).WithType(hub.RepoTypeDataset).WithAuth(hfAuthToken)\n", 541 | "localSampleFile := must.M1(repo.DownloadFile(FineWebSampleFile))\n", 542 | "\n", 543 | "// Parquet reading using parquet-go: it's somewhat cumbersome (to open the file it needs its size!?), but it works.\n", 544 | "schema := parquet.SchemaOf(&FineWebEntry{})\n", 545 | "fSize := must.M1(os.Stat(localSampleFile)).Size()\n", 546 | "fReader := must.M1(os.Open(localSampleFile))\n", 547 | "fParquet := must.M1(parquet.OpenFile(fReader, fSize))\n", 548 | "reader := parquet.NewGenericReader[FineWebEntry](fParquet, schema)\n", 549 | "defer reader.Close()\n", 550 | "\n", 551 | "// Print first 10 rows:\n", 552 | "rows := make([]FineWebEntry, 10)\n", 553 | "n := must.M1(reader.Read(rows))\n", 554 | "fmt.Printf(\"%d rows read\\n\", n)\n", 555 | "for ii, row := range rows {\n", 556 | " fmt.Printf(\"Row %0d:\\tScore=%.3f Text=[%q], URL=[%s]\\n\", ii, row.Score, TrimString(row.Text, 50), TrimString(row.URL, 40))\n", 557 | "}\n" 558 | ] 559 | } 560 | ], 561 | "metadata": { 562 | "kernelspec": { 563 | "display_name": "Go (gonb)", 564 | "language": "go", 565 | "name": "gonb" 566 | }, 567 | "language_info": { 568 | "codemirror_mode": "", 569 | "file_extension": ".go", 570 | "mimetype": "text/x-go", 571 | "name": "go", 572 | "nbconvert_exporter": "", 573 | "pygments_lexer": "", 574 | "version": "go1.25.3" 575 | } 576 | }, 577 | "nbformat": 4, 578 | "nbformat_minor": 5 579 | } 580 | -------------------------------------------------------------------------------- /tokenizers/sentencepiece/private/protos/sentencepiece_model.pb.go: 
-------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | // Code generated by protoc-gen-go. DO NOT EDIT. 16 | // versions: 17 | // protoc-gen-go v1.35.1 18 | // protoc v3.21.12 19 | // source: com_github_gomlx_go-huggingface_tokenizers_sentencepiece_private_protos/sentencepiece_model.proto 20 | 21 | // Unique package name to avoid conflicts: the proto library won't allow two different 22 | // packages to define the same proto (under the same namespace). 23 | // This is broken, since that's what is needed ... a bad design from the ProtoBuf in Go. See more details here: 24 | // https://protobuf.dev/reference/go/faq/#namespace-conflict 25 | // So instead we change the proto namespace (package) name to globally unique package name: 26 | 27 | package protos 28 | 29 | import ( 30 | protoreflect "google.golang.org/protobuf/reflect/protoreflect" 31 | protoimpl "google.golang.org/protobuf/runtime/protoimpl" 32 | reflect "reflect" 33 | sync "sync" 34 | ) 35 | 36 | const ( 37 | // Verify that this generated code is sufficiently up-to-date. 38 | _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) 39 | // Verify that runtime/protoimpl is sufficiently up-to-date. 40 | _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) 41 | ) 42 | 43 | // Model type. only have UNIGRAM now. 44 | type TrainerSpec_ModelType int32 45 | 46 | const ( 47 | TrainerSpec_UNIGRAM TrainerSpec_ModelType = 1 // Unigram language model with dynamic algorithm 48 | TrainerSpec_BPE TrainerSpec_ModelType = 2 // Byte Pair Encoding 49 | TrainerSpec_WORD TrainerSpec_ModelType = 3 // Delimitered by whitespace. 50 | TrainerSpec_CHAR TrainerSpec_ModelType = 4 // tokenizes into character sequence 51 | ) 52 | 53 | // Enum value maps for TrainerSpec_ModelType. 
54 | var ( 55 | TrainerSpec_ModelType_name = map[int32]string{ 56 | 1: "UNIGRAM", 57 | 2: "BPE", 58 | 3: "WORD", 59 | 4: "CHAR", 60 | } 61 | TrainerSpec_ModelType_value = map[string]int32{ 62 | "UNIGRAM": 1, 63 | "BPE": 2, 64 | "WORD": 3, 65 | "CHAR": 4, 66 | } 67 | ) 68 | 69 | func (x TrainerSpec_ModelType) Enum() *TrainerSpec_ModelType { 70 | p := new(TrainerSpec_ModelType) 71 | *p = x 72 | return p 73 | } 74 | 75 | func (x TrainerSpec_ModelType) String() string { 76 | return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) 77 | } 78 | 79 | func (TrainerSpec_ModelType) Descriptor() protoreflect.EnumDescriptor { 80 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_enumTypes[0].Descriptor() 81 | } 82 | 83 | func (TrainerSpec_ModelType) Type() protoreflect.EnumType { 84 | return &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_enumTypes[0] 85 | } 86 | 87 | func (x TrainerSpec_ModelType) Number() protoreflect.EnumNumber { 88 | return protoreflect.EnumNumber(x) 89 | } 90 | 91 | // Deprecated: Do not use. 92 | func (x *TrainerSpec_ModelType) UnmarshalJSON(b []byte) error { 93 | num, err := protoimpl.X.UnmarshalJSONEnum(x.Descriptor(), b) 94 | if err != nil { 95 | return err 96 | } 97 | *x = TrainerSpec_ModelType(num) 98 | return nil 99 | } 100 | 101 | // Deprecated: Use TrainerSpec_ModelType.Descriptor instead. 102 | func (TrainerSpec_ModelType) EnumDescriptor() ([]byte, []int) { 103 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{0, 0} 104 | } 105 | 106 | type ModelProto_SentencePiece_Type int32 107 | 108 | const ( 109 | ModelProto_SentencePiece_NORMAL ModelProto_SentencePiece_Type = 1 // normal symbol 110 | ModelProto_SentencePiece_UNKNOWN ModelProto_SentencePiece_Type = 2 // unknown symbol. only for now. 111 | ModelProto_SentencePiece_CONTROL ModelProto_SentencePiece_Type = 3 // control symbols. , , <2ja> etc. 112 | ModelProto_SentencePiece_USER_DEFINED ModelProto_SentencePiece_Type = 4 // user defined symbols. 113 | // Typical usage of USER_DEFINED symbol 114 | // is placeholder. 115 | ModelProto_SentencePiece_BYTE ModelProto_SentencePiece_Type = 6 // byte symbols. Used when `byte_fallback` is true. 116 | ModelProto_SentencePiece_UNUSED ModelProto_SentencePiece_Type = 5 // this piece is not used. 117 | ) 118 | 119 | // Enum value maps for ModelProto_SentencePiece_Type. 
120 | var ( 121 | ModelProto_SentencePiece_Type_name = map[int32]string{ 122 | 1: "NORMAL", 123 | 2: "UNKNOWN", 124 | 3: "CONTROL", 125 | 4: "USER_DEFINED", 126 | 6: "BYTE", 127 | 5: "UNUSED", 128 | } 129 | ModelProto_SentencePiece_Type_value = map[string]int32{ 130 | "NORMAL": 1, 131 | "UNKNOWN": 2, 132 | "CONTROL": 3, 133 | "USER_DEFINED": 4, 134 | "BYTE": 6, 135 | "UNUSED": 5, 136 | } 137 | ) 138 | 139 | func (x ModelProto_SentencePiece_Type) Enum() *ModelProto_SentencePiece_Type { 140 | p := new(ModelProto_SentencePiece_Type) 141 | *p = x 142 | return p 143 | } 144 | 145 | func (x ModelProto_SentencePiece_Type) String() string { 146 | return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) 147 | } 148 | 149 | func (ModelProto_SentencePiece_Type) Descriptor() protoreflect.EnumDescriptor { 150 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_enumTypes[1].Descriptor() 151 | } 152 | 153 | func (ModelProto_SentencePiece_Type) Type() protoreflect.EnumType { 154 | return &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_enumTypes[1] 155 | } 156 | 157 | func (x ModelProto_SentencePiece_Type) Number() protoreflect.EnumNumber { 158 | return protoreflect.EnumNumber(x) 159 | } 160 | 161 | // Deprecated: Do not use. 162 | func (x *ModelProto_SentencePiece_Type) UnmarshalJSON(b []byte) error { 163 | num, err := protoimpl.X.UnmarshalJSONEnum(x.Descriptor(), b) 164 | if err != nil { 165 | return err 166 | } 167 | *x = ModelProto_SentencePiece_Type(num) 168 | return nil 169 | } 170 | 171 | // Deprecated: Use ModelProto_SentencePiece_Type.Descriptor instead. 172 | func (ModelProto_SentencePiece_Type) EnumDescriptor() ([]byte, []int) { 173 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{3, 0, 0} 174 | } 175 | 176 | // TrainerSpec encodes a various parameters for SentencePiece training. 177 | // Next id: 55 178 | type TrainerSpec struct { 179 | state protoimpl.MessageState 180 | sizeCache protoimpl.SizeCache 181 | unknownFields protoimpl.UnknownFields 182 | extensionFields protoimpl.ExtensionFields 183 | 184 | // ///////////////////////////////////////////////////////////////// 185 | // General parameters 186 | // 187 | // Input corpus files. 188 | // 189 | // Trainer accepts the following two formats: 190 | // A) Monolingual: plain text, one sentence per line. 191 | // B) Bilingual: TSV, source sentence target sentence 192 | // When bilingual data is passed, shared vocabulary model is built. 193 | // Note that the input file must be raw corpus, not a preprocessed corpus. 194 | // Trainer only loads the first `input_sentence_size` sentences specified 195 | // with this parameter. 196 | Input []string `protobuf:"bytes,1,rep,name=input" json:"input,omitempty"` 197 | // Input corpus format: 198 | // "text": one-sentence-per-line text format (default) 199 | // "tsv": sentence freq 200 | InputFormat *string `protobuf:"bytes,7,opt,name=input_format,json=inputFormat" json:"input_format,omitempty"` 201 | // Output model file prefix. 202 | // .model and .vocab are generated. 
203 | ModelPrefix *string `protobuf:"bytes,2,opt,name=model_prefix,json=modelPrefix" json:"model_prefix,omitempty"` 204 | ModelType *TrainerSpec_ModelType `protobuf:"varint,3,opt,name=model_type,json=modelType,enum=com.github.gomlx.go_huggingface.sentencepiece.TrainerSpec_ModelType,def=1" json:"model_type,omitempty"` 205 | // Vocabulary size. 8k is the default size. 206 | VocabSize *int32 `protobuf:"varint,4,opt,name=vocab_size,json=vocabSize,def=8000" json:"vocab_size,omitempty"` 207 | // List of the languages this model can accept. 208 | // Since the model is language-agnostic, this field is used as a reference. 209 | AcceptLanguage []string `protobuf:"bytes,5,rep,name=accept_language,json=acceptLanguage" json:"accept_language,omitempty"` 210 | // Size of self-test samples, which are encoded in the model file. 211 | SelfTestSampleSize *int32 `protobuf:"varint,6,opt,name=self_test_sample_size,json=selfTestSampleSize,def=0" json:"self_test_sample_size,omitempty"` 212 | // Whether to use DP version of sentencepiece. Use it with TSV input format 213 | // (requires precomputed word tab counts to work). 214 | EnableDifferentialPrivacy *bool `protobuf:"varint,50,opt,name=enable_differential_privacy,json=enableDifferentialPrivacy,def=0" json:"enable_differential_privacy,omitempty"` 215 | // Set these parameters if you need DP version of sentencepiece. 216 | // std of noise to add. 217 | DifferentialPrivacyNoiseLevel *float32 `protobuf:"fixed32,51,opt,name=differential_privacy_noise_level,json=differentialPrivacyNoiseLevel,def=0" json:"differential_privacy_noise_level,omitempty"` 218 | // Clipping threshold to apply after adding noise. All the words with 219 | // frequency less than this value are dropped. 220 | DifferentialPrivacyClippingThreshold *uint64 `protobuf:"varint,52,opt,name=differential_privacy_clipping_threshold,json=differentialPrivacyClippingThreshold,def=0" json:"differential_privacy_clipping_threshold,omitempty"` 221 | // ///////////////////////////////////////////////////////////////// 222 | // Training parameters. 223 | // 224 | // Uses characters which cover the corpus with the ratio of `chars_coverage`. 225 | // This parameter determines the set of basic Alphabet of sentence piece. 226 | // 1.0 - `chars_coverage` characters are treated as UNK. 227 | // See also required_chars field. 228 | CharacterCoverage *float32 `protobuf:"fixed32,10,opt,name=character_coverage,json=characterCoverage,def=0.9995" json:"character_coverage,omitempty"` 229 | // Maximum size of sentences the trainer loads from `input` parameter. 230 | // Trainer simply loads the `input` files in sequence. 231 | // It is better to shuffle the input corpus randomly. 232 | InputSentenceSize *uint64 `protobuf:"varint,11,opt,name=input_sentence_size,json=inputSentenceSize,def=0" json:"input_sentence_size,omitempty"` 233 | ShuffleInputSentence *bool `protobuf:"varint,19,opt,name=shuffle_input_sentence,json=shuffleInputSentence,def=1" json:"shuffle_input_sentence,omitempty"` 234 | // Maximum size of sentences to make seed sentence pieces. 235 | // Extended suffix array is constructed to extract frequent 236 | // sub-strings from the corpus. This uses 20N working space, 237 | // where N is the size of corpus. 238 | // 239 | // Deprecated: Marked as deprecated in com_github_gomlx_go-huggingface_tokenizers_sentencepiece_private_protos/sentencepiece_model.proto. 
240 | MiningSentenceSize *int32 `protobuf:"varint,12,opt,name=mining_sentence_size,json=miningSentenceSize" json:"mining_sentence_size,omitempty"` 241 | // Maximum size of sentences to train sentence pieces. 242 | // 243 | // Deprecated: Marked as deprecated in com_github_gomlx_go-huggingface_tokenizers_sentencepiece_private_protos/sentencepiece_model.proto. 244 | TrainingSentenceSize *int32 `protobuf:"varint,13,opt,name=training_sentence_size,json=trainingSentenceSize" json:"training_sentence_size,omitempty"` 245 | // The size of seed sentencepieces. 246 | // `seed_sentencepiece_size` must be larger than `vocab_size`. 247 | SeedSentencepieceSize *int32 `protobuf:"varint,14,opt,name=seed_sentencepiece_size,json=seedSentencepieceSize,def=1000000" json:"seed_sentencepiece_size,omitempty"` 248 | // In every EM sub-iterations, keeps top 249 | // `shrinking_factor` * `current sentencepieces size` with respect to 250 | // the loss of the sentence piece. This value should be smaller than 1.0. 251 | ShrinkingFactor *float32 `protobuf:"fixed32,15,opt,name=shrinking_factor,json=shrinkingFactor,def=0.75" json:"shrinking_factor,omitempty"` 252 | // The maximum sentence length in byte. The sentences with the length 253 | // larger than `max_sentence_length` is simply ignored. 254 | // Longer input tends to bring the following risks: 255 | // - Overflow during EM training (unigram language model only) 256 | // - Performance drop because of O(n log n) cost in BPE. 257 | MaxSentenceLength *int32 `protobuf:"varint,18,opt,name=max_sentence_length,json=maxSentenceLength,def=4192" json:"max_sentence_length,omitempty"` 258 | // Number of threads in the training. 259 | NumThreads *int32 `protobuf:"varint,16,opt,name=num_threads,json=numThreads,def=16" json:"num_threads,omitempty"` 260 | // Number of EM sub iterations. 261 | NumSubIterations *int32 `protobuf:"varint,17,opt,name=num_sub_iterations,json=numSubIterations,def=2" json:"num_sub_iterations,omitempty"` 262 | // ///////////////////////////////////////////////////////////////// 263 | // SentencePiece parameters which control the shapes of sentence piece. 264 | // 265 | // Maximum length of sentencepiece. 266 | MaxSentencepieceLength *int32 `protobuf:"varint,20,opt,name=max_sentencepiece_length,json=maxSentencepieceLength,def=16" json:"max_sentencepiece_length,omitempty"` 267 | // Uses Unicode script to split sentence pieces. 268 | // When `split_by_unicode_script` is true, we do not allow sentence piece to 269 | // include multiple Unicode scripts, e.g. "F1" is not a valid piece. 270 | // Exception: CJ characters (Hiragana/Katakana/Han) are all handled 271 | // as one script type, since Japanese word can consist of multiple scripts. 272 | // This exception is always applied regardless of the accept-language 273 | // parameter. 274 | SplitByUnicodeScript *bool `protobuf:"varint,21,opt,name=split_by_unicode_script,json=splitByUnicodeScript,def=1" json:"split_by_unicode_script,omitempty"` 275 | // When `split_by_number` is true, put a boundary between number and 276 | // non-number transition. If we want to treat "F1" is one token, set this flag 277 | // to be false. 278 | SplitByNumber *bool `protobuf:"varint,23,opt,name=split_by_number,json=splitByNumber,def=1" json:"split_by_number,omitempty"` 279 | // Use a white space to split sentence pieces. 280 | // When `split_by_whitespace` is false, we may have the piece containing 281 | // a white space in the middle. e.g., "in_the". 
282 | SplitByWhitespace *bool `protobuf:"varint,22,opt,name=split_by_whitespace,json=splitByWhitespace,def=1" json:"split_by_whitespace,omitempty"` 283 | // Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello => 284 | // hello_. When `treat_whitespace_as_suffix` is true, 285 | // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end 286 | // of sentence. 287 | TreatWhitespaceAsSuffix *bool `protobuf:"varint,24,opt,name=treat_whitespace_as_suffix,json=treatWhitespaceAsSuffix,def=0" json:"treat_whitespace_as_suffix,omitempty"` 288 | // Allows pieces that only contain whitespaces instead of appearing only as 289 | // prefix or suffix of other pieces. 290 | AllowWhitespaceOnlyPieces *bool `protobuf:"varint,26,opt,name=allow_whitespace_only_pieces,json=allowWhitespaceOnlyPieces,def=0" json:"allow_whitespace_only_pieces,omitempty"` 291 | // Split all digits (0-9) into separate pieces. 292 | SplitDigits *bool `protobuf:"varint,25,opt,name=split_digits,json=splitDigits,def=0" json:"split_digits,omitempty"` 293 | // Defines the pre-tokenization delimiter. 294 | // When specified, no pieces crossing this delimiter is not included 295 | // in the vocab. Then the delimiter string is virtually ignored 296 | // during the training. This field can allows constraints on the vocabulary 297 | // selection. Note that this field is available on unigram mode. 298 | PretokenizationDelimiter *string `protobuf:"bytes,53,opt,name=pretokenization_delimiter,json=pretokenizationDelimiter,def=" json:"pretokenization_delimiter,omitempty"` 299 | // ///////////////////////////////////////////////////////////////// 300 | // Vocabulary management 301 | // 302 | // Defines control symbols used as an indicator to 303 | // change the behavior of the decoder. and are pre-defined. 304 | // We can use this field to encode various meta information, 305 | // including language indicator in multilingual model. 306 | // These symbols are not visible to users, but visible to 307 | // the decoder. Note that when the input sentence contains control symbols, 308 | // they are not treated as one token, but segmented into normal pieces. 309 | // Control symbols must be inserted independently from the segmentation. 310 | ControlSymbols []string `protobuf:"bytes,30,rep,name=control_symbols,json=controlSymbols" json:"control_symbols,omitempty"` 311 | // Defines user defined symbols. 312 | // These symbols are added with extremely high score 313 | // so they are always treated as one unique symbol in any context. 314 | // Typical usage of user_defined_symbols is placeholder for named entities. 315 | UserDefinedSymbols []string `protobuf:"bytes,31,rep,name=user_defined_symbols,json=userDefinedSymbols" json:"user_defined_symbols,omitempty"` 316 | // Defines required characters. Each UTF8 character in this string is included 317 | // in the character set regardless of character_coverage value. Unlike 318 | // user_defined_symbols, these characters have scores based on the frequency 319 | // on input sentences, and the model can form subwords using characters 320 | // in this field. 321 | RequiredChars *string `protobuf:"bytes,36,opt,name=required_chars,json=requiredChars" json:"required_chars,omitempty"` 322 | // Decomposes unknown pieces into UTF-8 bytes. 323 | ByteFallback *bool `protobuf:"varint,35,opt,name=byte_fallback,json=byteFallback,def=0" json:"byte_fallback,omitempty"` 324 | // When creating the vocabulary file, defines whether or not to additionally 325 | // output the score for each piece. 
326 | VocabularyOutputPieceScore *bool `protobuf:"varint,32,opt,name=vocabulary_output_piece_score,json=vocabularyOutputPieceScore,def=1" json:"vocabulary_output_piece_score,omitempty"` 327 | // `vocab_size` is treated as hard limit. Crash if 328 | // the model can not produce the vocab of size `vocab_size`, 329 | // When `hard_vocab_limit` is false, vocab_size is treated 330 | // as soft limit. Note that when model_type=char, 331 | // always assumes hard_vocab_limit = false. 332 | HardVocabLimit *bool `protobuf:"varint,33,opt,name=hard_vocab_limit,json=hardVocabLimit,def=1" json:"hard_vocab_limit,omitempty"` 333 | // use all symbols for vocab extraction. This flag is valid 334 | // if model type is either CHAR or WORD 335 | UseAllVocab *bool `protobuf:"varint,34,opt,name=use_all_vocab,json=useAllVocab,def=0" json:"use_all_vocab,omitempty"` 336 | // ///////////////////////////////////////////////////////////////// 337 | // Reserved special meta tokens. 338 | // * -1 is not used. 339 | // * unk_id must not be -1. 340 | // Id must starts with 0 and be contigous. 341 | UnkId *int32 `protobuf:"varint,40,opt,name=unk_id,json=unkId,def=0" json:"unk_id,omitempty"` // <unk> 342 | BosId *int32 `protobuf:"varint,41,opt,name=bos_id,json=bosId,def=1" json:"bos_id,omitempty"` // <s> 343 | EosId *int32 `protobuf:"varint,42,opt,name=eos_id,json=eosId,def=2" json:"eos_id,omitempty"` // </s> 344 | PadId *int32 `protobuf:"varint,43,opt,name=pad_id,json=padId,def=-1" json:"pad_id,omitempty"` // <pad> (padding) 345 | UnkPiece *string `protobuf:"bytes,45,opt,name=unk_piece,json=unkPiece,def=<unk>" json:"unk_piece,omitempty"` 346 | BosPiece *string `protobuf:"bytes,46,opt,name=bos_piece,json=bosPiece,def=<s>" json:"bos_piece,omitempty"` 347 | EosPiece *string `protobuf:"bytes,47,opt,name=eos_piece,json=eosPiece,def=</s>" json:"eos_piece,omitempty"` 348 | PadPiece *string `protobuf:"bytes,48,opt,name=pad_piece,json=padPiece,def=<pad>" json:"pad_piece,omitempty"` 349 | // Encodes <unk> into U+2047 (DOUBLE QUESTION MARK), 350 | // since this character can be useful both for user and 351 | // developer. We can easily figure out that <unk> is emitted. 352 | UnkSurface *string `protobuf:"bytes,44,opt,name=unk_surface,json=unkSurface,def= ⁇ " json:"unk_surface,omitempty"` 353 | // Increase bit depth to allow unigram model training on large 354 | // (>10M sentences) corpora. A Side-effect of enabling this flag 355 | // is increased memory usage. 356 | TrainExtremelyLargeCorpus *bool `protobuf:"varint,49,opt,name=train_extremely_large_corpus,json=trainExtremelyLargeCorpus,def=0" json:"train_extremely_large_corpus,omitempty"` 357 | // Path to a seed sentencepieces file, with one tab-separated 358 | // seed sentencepiece frequency per line. 359 | SeedSentencepiecesFile *string `protobuf:"bytes,54,opt,name=seed_sentencepieces_file,json=seedSentencepiecesFile,def=" json:"seed_sentencepieces_file,omitempty"` 360 | } 361 | 362 | // Default values for TrainerSpec fields. 
363 | const ( 364 | Default_TrainerSpec_ModelType = TrainerSpec_UNIGRAM 365 | Default_TrainerSpec_VocabSize = int32(8000) 366 | Default_TrainerSpec_SelfTestSampleSize = int32(0) 367 | Default_TrainerSpec_EnableDifferentialPrivacy = bool(false) 368 | Default_TrainerSpec_DifferentialPrivacyNoiseLevel = float32(0) 369 | Default_TrainerSpec_DifferentialPrivacyClippingThreshold = uint64(0) 370 | Default_TrainerSpec_CharacterCoverage = float32(0.9994999766349792) 371 | Default_TrainerSpec_InputSentenceSize = uint64(0) 372 | Default_TrainerSpec_ShuffleInputSentence = bool(true) 373 | Default_TrainerSpec_SeedSentencepieceSize = int32(1000000) 374 | Default_TrainerSpec_ShrinkingFactor = float32(0.75) 375 | Default_TrainerSpec_MaxSentenceLength = int32(4192) 376 | Default_TrainerSpec_NumThreads = int32(16) 377 | Default_TrainerSpec_NumSubIterations = int32(2) 378 | Default_TrainerSpec_MaxSentencepieceLength = int32(16) 379 | Default_TrainerSpec_SplitByUnicodeScript = bool(true) 380 | Default_TrainerSpec_SplitByNumber = bool(true) 381 | Default_TrainerSpec_SplitByWhitespace = bool(true) 382 | Default_TrainerSpec_TreatWhitespaceAsSuffix = bool(false) 383 | Default_TrainerSpec_AllowWhitespaceOnlyPieces = bool(false) 384 | Default_TrainerSpec_SplitDigits = bool(false) 385 | Default_TrainerSpec_PretokenizationDelimiter = string("") 386 | Default_TrainerSpec_ByteFallback = bool(false) 387 | Default_TrainerSpec_VocabularyOutputPieceScore = bool(true) 388 | Default_TrainerSpec_HardVocabLimit = bool(true) 389 | Default_TrainerSpec_UseAllVocab = bool(false) 390 | Default_TrainerSpec_UnkId = int32(0) 391 | Default_TrainerSpec_BosId = int32(1) 392 | Default_TrainerSpec_EosId = int32(2) 393 | Default_TrainerSpec_PadId = int32(-1) 394 | Default_TrainerSpec_UnkPiece = string("<unk>") 395 | Default_TrainerSpec_BosPiece = string("<s>") 396 | Default_TrainerSpec_EosPiece = string("</s>") 397 | Default_TrainerSpec_PadPiece = string("<pad>") 398 | Default_TrainerSpec_UnkSurface = string(" ⁇ ") 399 | Default_TrainerSpec_TrainExtremelyLargeCorpus = bool(false) 400 | Default_TrainerSpec_SeedSentencepiecesFile = string("") 401 | ) 402 | 403 | func (x *TrainerSpec) Reset() { 404 | *x = TrainerSpec{} 405 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[0] 406 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 407 | ms.StoreMessageInfo(mi) 408 | } 409 | 410 | func (x *TrainerSpec) String() string { 411 | return protoimpl.X.MessageStringOf(x) 412 | } 413 | 414 | func (*TrainerSpec) ProtoMessage() {} 415 | 416 | func (x *TrainerSpec) ProtoReflect() protoreflect.Message { 417 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[0] 418 | if x != nil { 419 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 420 | if ms.LoadMessageInfo() == nil { 421 | ms.StoreMessageInfo(mi) 422 | } 423 | return ms 424 | } 425 | return mi.MessageOf(x) 426 | } 427 | 428 | // Deprecated: Use TrainerSpec.ProtoReflect.Descriptor instead. 
429 | func (*TrainerSpec) Descriptor() ([]byte, []int) { 430 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{0} 431 | } 432 | 433 | func (x *TrainerSpec) GetInput() []string { 434 | if x != nil { 435 | return x.Input 436 | } 437 | return nil 438 | } 439 | 440 | func (x *TrainerSpec) GetInputFormat() string { 441 | if x != nil && x.InputFormat != nil { 442 | return *x.InputFormat 443 | } 444 | return "" 445 | } 446 | 447 | func (x *TrainerSpec) GetModelPrefix() string { 448 | if x != nil && x.ModelPrefix != nil { 449 | return *x.ModelPrefix 450 | } 451 | return "" 452 | } 453 | 454 | func (x *TrainerSpec) GetModelType() TrainerSpec_ModelType { 455 | if x != nil && x.ModelType != nil { 456 | return *x.ModelType 457 | } 458 | return Default_TrainerSpec_ModelType 459 | } 460 | 461 | func (x *TrainerSpec) GetVocabSize() int32 { 462 | if x != nil && x.VocabSize != nil { 463 | return *x.VocabSize 464 | } 465 | return Default_TrainerSpec_VocabSize 466 | } 467 | 468 | func (x *TrainerSpec) GetAcceptLanguage() []string { 469 | if x != nil { 470 | return x.AcceptLanguage 471 | } 472 | return nil 473 | } 474 | 475 | func (x *TrainerSpec) GetSelfTestSampleSize() int32 { 476 | if x != nil && x.SelfTestSampleSize != nil { 477 | return *x.SelfTestSampleSize 478 | } 479 | return Default_TrainerSpec_SelfTestSampleSize 480 | } 481 | 482 | func (x *TrainerSpec) GetEnableDifferentialPrivacy() bool { 483 | if x != nil && x.EnableDifferentialPrivacy != nil { 484 | return *x.EnableDifferentialPrivacy 485 | } 486 | return Default_TrainerSpec_EnableDifferentialPrivacy 487 | } 488 | 489 | func (x *TrainerSpec) GetDifferentialPrivacyNoiseLevel() float32 { 490 | if x != nil && x.DifferentialPrivacyNoiseLevel != nil { 491 | return *x.DifferentialPrivacyNoiseLevel 492 | } 493 | return Default_TrainerSpec_DifferentialPrivacyNoiseLevel 494 | } 495 | 496 | func (x *TrainerSpec) GetDifferentialPrivacyClippingThreshold() uint64 { 497 | if x != nil && x.DifferentialPrivacyClippingThreshold != nil { 498 | return *x.DifferentialPrivacyClippingThreshold 499 | } 500 | return Default_TrainerSpec_DifferentialPrivacyClippingThreshold 501 | } 502 | 503 | func (x *TrainerSpec) GetCharacterCoverage() float32 { 504 | if x != nil && x.CharacterCoverage != nil { 505 | return *x.CharacterCoverage 506 | } 507 | return Default_TrainerSpec_CharacterCoverage 508 | } 509 | 510 | func (x *TrainerSpec) GetInputSentenceSize() uint64 { 511 | if x != nil && x.InputSentenceSize != nil { 512 | return *x.InputSentenceSize 513 | } 514 | return Default_TrainerSpec_InputSentenceSize 515 | } 516 | 517 | func (x *TrainerSpec) GetShuffleInputSentence() bool { 518 | if x != nil && x.ShuffleInputSentence != nil { 519 | return *x.ShuffleInputSentence 520 | } 521 | return Default_TrainerSpec_ShuffleInputSentence 522 | } 523 | 524 | // Deprecated: Marked as deprecated in com_github_gomlx_go-huggingface_tokenizers_sentencepiece_private_protos/sentencepiece_model.proto. 525 | func (x *TrainerSpec) GetMiningSentenceSize() int32 { 526 | if x != nil && x.MiningSentenceSize != nil { 527 | return *x.MiningSentenceSize 528 | } 529 | return 0 530 | } 531 | 532 | // Deprecated: Marked as deprecated in com_github_gomlx_go-huggingface_tokenizers_sentencepiece_private_protos/sentencepiece_model.proto. 
533 | func (x *TrainerSpec) GetTrainingSentenceSize() int32 { 534 | if x != nil && x.TrainingSentenceSize != nil { 535 | return *x.TrainingSentenceSize 536 | } 537 | return 0 538 | } 539 | 540 | func (x *TrainerSpec) GetSeedSentencepieceSize() int32 { 541 | if x != nil && x.SeedSentencepieceSize != nil { 542 | return *x.SeedSentencepieceSize 543 | } 544 | return Default_TrainerSpec_SeedSentencepieceSize 545 | } 546 | 547 | func (x *TrainerSpec) GetShrinkingFactor() float32 { 548 | if x != nil && x.ShrinkingFactor != nil { 549 | return *x.ShrinkingFactor 550 | } 551 | return Default_TrainerSpec_ShrinkingFactor 552 | } 553 | 554 | func (x *TrainerSpec) GetMaxSentenceLength() int32 { 555 | if x != nil && x.MaxSentenceLength != nil { 556 | return *x.MaxSentenceLength 557 | } 558 | return Default_TrainerSpec_MaxSentenceLength 559 | } 560 | 561 | func (x *TrainerSpec) GetNumThreads() int32 { 562 | if x != nil && x.NumThreads != nil { 563 | return *x.NumThreads 564 | } 565 | return Default_TrainerSpec_NumThreads 566 | } 567 | 568 | func (x *TrainerSpec) GetNumSubIterations() int32 { 569 | if x != nil && x.NumSubIterations != nil { 570 | return *x.NumSubIterations 571 | } 572 | return Default_TrainerSpec_NumSubIterations 573 | } 574 | 575 | func (x *TrainerSpec) GetMaxSentencepieceLength() int32 { 576 | if x != nil && x.MaxSentencepieceLength != nil { 577 | return *x.MaxSentencepieceLength 578 | } 579 | return Default_TrainerSpec_MaxSentencepieceLength 580 | } 581 | 582 | func (x *TrainerSpec) GetSplitByUnicodeScript() bool { 583 | if x != nil && x.SplitByUnicodeScript != nil { 584 | return *x.SplitByUnicodeScript 585 | } 586 | return Default_TrainerSpec_SplitByUnicodeScript 587 | } 588 | 589 | func (x *TrainerSpec) GetSplitByNumber() bool { 590 | if x != nil && x.SplitByNumber != nil { 591 | return *x.SplitByNumber 592 | } 593 | return Default_TrainerSpec_SplitByNumber 594 | } 595 | 596 | func (x *TrainerSpec) GetSplitByWhitespace() bool { 597 | if x != nil && x.SplitByWhitespace != nil { 598 | return *x.SplitByWhitespace 599 | } 600 | return Default_TrainerSpec_SplitByWhitespace 601 | } 602 | 603 | func (x *TrainerSpec) GetTreatWhitespaceAsSuffix() bool { 604 | if x != nil && x.TreatWhitespaceAsSuffix != nil { 605 | return *x.TreatWhitespaceAsSuffix 606 | } 607 | return Default_TrainerSpec_TreatWhitespaceAsSuffix 608 | } 609 | 610 | func (x *TrainerSpec) GetAllowWhitespaceOnlyPieces() bool { 611 | if x != nil && x.AllowWhitespaceOnlyPieces != nil { 612 | return *x.AllowWhitespaceOnlyPieces 613 | } 614 | return Default_TrainerSpec_AllowWhitespaceOnlyPieces 615 | } 616 | 617 | func (x *TrainerSpec) GetSplitDigits() bool { 618 | if x != nil && x.SplitDigits != nil { 619 | return *x.SplitDigits 620 | } 621 | return Default_TrainerSpec_SplitDigits 622 | } 623 | 624 | func (x *TrainerSpec) GetPretokenizationDelimiter() string { 625 | if x != nil && x.PretokenizationDelimiter != nil { 626 | return *x.PretokenizationDelimiter 627 | } 628 | return Default_TrainerSpec_PretokenizationDelimiter 629 | } 630 | 631 | func (x *TrainerSpec) GetControlSymbols() []string { 632 | if x != nil { 633 | return x.ControlSymbols 634 | } 635 | return nil 636 | } 637 | 638 | func (x *TrainerSpec) GetUserDefinedSymbols() []string { 639 | if x != nil { 640 | return x.UserDefinedSymbols 641 | } 642 | return nil 643 | } 644 | 645 | func (x *TrainerSpec) GetRequiredChars() string { 646 | if x != nil && x.RequiredChars != nil { 647 | return *x.RequiredChars 648 | } 649 | return "" 650 | } 651 | 652 | func (x *TrainerSpec) 
GetByteFallback() bool { 653 | if x != nil && x.ByteFallback != nil { 654 | return *x.ByteFallback 655 | } 656 | return Default_TrainerSpec_ByteFallback 657 | } 658 | 659 | func (x *TrainerSpec) GetVocabularyOutputPieceScore() bool { 660 | if x != nil && x.VocabularyOutputPieceScore != nil { 661 | return *x.VocabularyOutputPieceScore 662 | } 663 | return Default_TrainerSpec_VocabularyOutputPieceScore 664 | } 665 | 666 | func (x *TrainerSpec) GetHardVocabLimit() bool { 667 | if x != nil && x.HardVocabLimit != nil { 668 | return *x.HardVocabLimit 669 | } 670 | return Default_TrainerSpec_HardVocabLimit 671 | } 672 | 673 | func (x *TrainerSpec) GetUseAllVocab() bool { 674 | if x != nil && x.UseAllVocab != nil { 675 | return *x.UseAllVocab 676 | } 677 | return Default_TrainerSpec_UseAllVocab 678 | } 679 | 680 | func (x *TrainerSpec) GetUnkId() int32 { 681 | if x != nil && x.UnkId != nil { 682 | return *x.UnkId 683 | } 684 | return Default_TrainerSpec_UnkId 685 | } 686 | 687 | func (x *TrainerSpec) GetBosId() int32 { 688 | if x != nil && x.BosId != nil { 689 | return *x.BosId 690 | } 691 | return Default_TrainerSpec_BosId 692 | } 693 | 694 | func (x *TrainerSpec) GetEosId() int32 { 695 | if x != nil && x.EosId != nil { 696 | return *x.EosId 697 | } 698 | return Default_TrainerSpec_EosId 699 | } 700 | 701 | func (x *TrainerSpec) GetPadId() int32 { 702 | if x != nil && x.PadId != nil { 703 | return *x.PadId 704 | } 705 | return Default_TrainerSpec_PadId 706 | } 707 | 708 | func (x *TrainerSpec) GetUnkPiece() string { 709 | if x != nil && x.UnkPiece != nil { 710 | return *x.UnkPiece 711 | } 712 | return Default_TrainerSpec_UnkPiece 713 | } 714 | 715 | func (x *TrainerSpec) GetBosPiece() string { 716 | if x != nil && x.BosPiece != nil { 717 | return *x.BosPiece 718 | } 719 | return Default_TrainerSpec_BosPiece 720 | } 721 | 722 | func (x *TrainerSpec) GetEosPiece() string { 723 | if x != nil && x.EosPiece != nil { 724 | return *x.EosPiece 725 | } 726 | return Default_TrainerSpec_EosPiece 727 | } 728 | 729 | func (x *TrainerSpec) GetPadPiece() string { 730 | if x != nil && x.PadPiece != nil { 731 | return *x.PadPiece 732 | } 733 | return Default_TrainerSpec_PadPiece 734 | } 735 | 736 | func (x *TrainerSpec) GetUnkSurface() string { 737 | if x != nil && x.UnkSurface != nil { 738 | return *x.UnkSurface 739 | } 740 | return Default_TrainerSpec_UnkSurface 741 | } 742 | 743 | func (x *TrainerSpec) GetTrainExtremelyLargeCorpus() bool { 744 | if x != nil && x.TrainExtremelyLargeCorpus != nil { 745 | return *x.TrainExtremelyLargeCorpus 746 | } 747 | return Default_TrainerSpec_TrainExtremelyLargeCorpus 748 | } 749 | 750 | func (x *TrainerSpec) GetSeedSentencepiecesFile() string { 751 | if x != nil && x.SeedSentencepiecesFile != nil { 752 | return *x.SeedSentencepiecesFile 753 | } 754 | return Default_TrainerSpec_SeedSentencepiecesFile 755 | } 756 | 757 | // NormalizerSpec encodes a various parameters for string normalizaiton 758 | type NormalizerSpec struct { 759 | state protoimpl.MessageState 760 | sizeCache protoimpl.SizeCache 761 | unknownFields protoimpl.UnknownFields 762 | extensionFields protoimpl.ExtensionFields 763 | 764 | // name of normalization rule. 765 | Name *string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"` 766 | // Pre-compiled normalization rule created by 767 | // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method. 768 | // Usually this field is set by Builder::GetNormalizerSpec() method. 
769 | PrecompiledCharsmap []byte `protobuf:"bytes,2,opt,name=precompiled_charsmap,json=precompiledCharsmap" json:"precompiled_charsmap,omitempty"` 770 | // Adds dummy whitespace at the beginning of text in order to 771 | // treat "world" in "world" and "hello world" in the same way. 772 | AddDummyPrefix *bool `protobuf:"varint,3,opt,name=add_dummy_prefix,json=addDummyPrefix,def=1" json:"add_dummy_prefix,omitempty"` 773 | // Removes leading, trailing, and duplicate internal whitespace. 774 | RemoveExtraWhitespaces *bool `protobuf:"varint,4,opt,name=remove_extra_whitespaces,json=removeExtraWhitespaces,def=1" json:"remove_extra_whitespaces,omitempty"` 775 | // Replaces whitespace with meta symbol. 776 | // This field must be true to train sentence piece model. 777 | EscapeWhitespaces *bool `protobuf:"varint,5,opt,name=escape_whitespaces,json=escapeWhitespaces,def=1" json:"escape_whitespaces,omitempty"` 778 | // Custom normalization rule file in TSV format. 779 | // https://github.com/google/sentencepiece/blob/master/doc/normalization.md 780 | // This field is only used in SentencePieceTrainer::Train() method, which 781 | // compiles the rule into the binary rule stored in `precompiled_charsmap`. 782 | NormalizationRuleTsv *string `protobuf:"bytes,6,opt,name=normalization_rule_tsv,json=normalizationRuleTsv" json:"normalization_rule_tsv,omitempty"` 783 | } 784 | 785 | // Default values for NormalizerSpec fields. 786 | const ( 787 | Default_NormalizerSpec_AddDummyPrefix = bool(true) 788 | Default_NormalizerSpec_RemoveExtraWhitespaces = bool(true) 789 | Default_NormalizerSpec_EscapeWhitespaces = bool(true) 790 | ) 791 | 792 | func (x *NormalizerSpec) Reset() { 793 | *x = NormalizerSpec{} 794 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[1] 795 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 796 | ms.StoreMessageInfo(mi) 797 | } 798 | 799 | func (x *NormalizerSpec) String() string { 800 | return protoimpl.X.MessageStringOf(x) 801 | } 802 | 803 | func (*NormalizerSpec) ProtoMessage() {} 804 | 805 | func (x *NormalizerSpec) ProtoReflect() protoreflect.Message { 806 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[1] 807 | if x != nil { 808 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 809 | if ms.LoadMessageInfo() == nil { 810 | ms.StoreMessageInfo(mi) 811 | } 812 | return ms 813 | } 814 | return mi.MessageOf(x) 815 | } 816 | 817 | // Deprecated: Use NormalizerSpec.ProtoReflect.Descriptor instead. 
818 | func (*NormalizerSpec) Descriptor() ([]byte, []int) { 819 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{1} 820 | } 821 | 822 | func (x *NormalizerSpec) GetName() string { 823 | if x != nil && x.Name != nil { 824 | return *x.Name 825 | } 826 | return "" 827 | } 828 | 829 | func (x *NormalizerSpec) GetPrecompiledCharsmap() []byte { 830 | if x != nil { 831 | return x.PrecompiledCharsmap 832 | } 833 | return nil 834 | } 835 | 836 | func (x *NormalizerSpec) GetAddDummyPrefix() bool { 837 | if x != nil && x.AddDummyPrefix != nil { 838 | return *x.AddDummyPrefix 839 | } 840 | return Default_NormalizerSpec_AddDummyPrefix 841 | } 842 | 843 | func (x *NormalizerSpec) GetRemoveExtraWhitespaces() bool { 844 | if x != nil && x.RemoveExtraWhitespaces != nil { 845 | return *x.RemoveExtraWhitespaces 846 | } 847 | return Default_NormalizerSpec_RemoveExtraWhitespaces 848 | } 849 | 850 | func (x *NormalizerSpec) GetEscapeWhitespaces() bool { 851 | if x != nil && x.EscapeWhitespaces != nil { 852 | return *x.EscapeWhitespaces 853 | } 854 | return Default_NormalizerSpec_EscapeWhitespaces 855 | } 856 | 857 | func (x *NormalizerSpec) GetNormalizationRuleTsv() string { 858 | if x != nil && x.NormalizationRuleTsv != nil { 859 | return *x.NormalizationRuleTsv 860 | } 861 | return "" 862 | } 863 | 864 | // Proto to store samples for self-testing. 865 | type SelfTestData struct { 866 | state protoimpl.MessageState 867 | sizeCache protoimpl.SizeCache 868 | unknownFields protoimpl.UnknownFields 869 | extensionFields protoimpl.ExtensionFields 870 | 871 | Samples []*SelfTestData_Sample `protobuf:"bytes,1,rep,name=samples" json:"samples,omitempty"` 872 | } 873 | 874 | func (x *SelfTestData) Reset() { 875 | *x = SelfTestData{} 876 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[2] 877 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 878 | ms.StoreMessageInfo(mi) 879 | } 880 | 881 | func (x *SelfTestData) String() string { 882 | return protoimpl.X.MessageStringOf(x) 883 | } 884 | 885 | func (*SelfTestData) ProtoMessage() {} 886 | 887 | func (x *SelfTestData) ProtoReflect() protoreflect.Message { 888 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[2] 889 | if x != nil { 890 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 891 | if ms.LoadMessageInfo() == nil { 892 | ms.StoreMessageInfo(mi) 893 | } 894 | return ms 895 | } 896 | return mi.MessageOf(x) 897 | } 898 | 899 | // Deprecated: Use SelfTestData.ProtoReflect.Descriptor instead. 900 | func (*SelfTestData) Descriptor() ([]byte, []int) { 901 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{2} 902 | } 903 | 904 | func (x *SelfTestData) GetSamples() []*SelfTestData_Sample { 905 | if x != nil { 906 | return x.Samples 907 | } 908 | return nil 909 | } 910 | 911 | // ModelProto stores model parameters. 912 | // SentencePieceProcessor is supposed to be self-contained. 913 | // All settings/parameters which may change the behavior must be encoded 914 | // in ModelProto. 915 | type ModelProto struct { 916 | state protoimpl.MessageState 917 | sizeCache protoimpl.SizeCache 918 | unknownFields protoimpl.UnknownFields 919 | extensionFields protoimpl.ExtensionFields 920 | 921 | // Sentence pieces with scores. 
922 | Pieces []*ModelProto_SentencePiece `protobuf:"bytes,1,rep,name=pieces" json:"pieces,omitempty"` 923 | // Spec used to generate this model file. 924 | TrainerSpec *TrainerSpec `protobuf:"bytes,2,opt,name=trainer_spec,json=trainerSpec" json:"trainer_spec,omitempty"` 925 | // Spec for text normalization. 926 | NormalizerSpec *NormalizerSpec `protobuf:"bytes,3,opt,name=normalizer_spec,json=normalizerSpec" json:"normalizer_spec,omitempty"` 927 | // Stores sample input and its expected segmentation to verify the model. 928 | SelfTestData *SelfTestData `protobuf:"bytes,4,opt,name=self_test_data,json=selfTestData" json:"self_test_data,omitempty"` 929 | // Spec for text de-normalization. 930 | DenormalizerSpec *NormalizerSpec `protobuf:"bytes,5,opt,name=denormalizer_spec,json=denormalizerSpec" json:"denormalizer_spec,omitempty"` 931 | } 932 | 933 | func (x *ModelProto) Reset() { 934 | *x = ModelProto{} 935 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[3] 936 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 937 | ms.StoreMessageInfo(mi) 938 | } 939 | 940 | func (x *ModelProto) String() string { 941 | return protoimpl.X.MessageStringOf(x) 942 | } 943 | 944 | func (*ModelProto) ProtoMessage() {} 945 | 946 | func (x *ModelProto) ProtoReflect() protoreflect.Message { 947 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[3] 948 | if x != nil { 949 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 950 | if ms.LoadMessageInfo() == nil { 951 | ms.StoreMessageInfo(mi) 952 | } 953 | return ms 954 | } 955 | return mi.MessageOf(x) 956 | } 957 | 958 | // Deprecated: Use ModelProto.ProtoReflect.Descriptor instead. 
959 | func (*ModelProto) Descriptor() ([]byte, []int) { 960 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{3} 961 | } 962 | 963 | func (x *ModelProto) GetPieces() []*ModelProto_SentencePiece { 964 | if x != nil { 965 | return x.Pieces 966 | } 967 | return nil 968 | } 969 | 970 | func (x *ModelProto) GetTrainerSpec() *TrainerSpec { 971 | if x != nil { 972 | return x.TrainerSpec 973 | } 974 | return nil 975 | } 976 | 977 | func (x *ModelProto) GetNormalizerSpec() *NormalizerSpec { 978 | if x != nil { 979 | return x.NormalizerSpec 980 | } 981 | return nil 982 | } 983 | 984 | func (x *ModelProto) GetSelfTestData() *SelfTestData { 985 | if x != nil { 986 | return x.SelfTestData 987 | } 988 | return nil 989 | } 990 | 991 | func (x *ModelProto) GetDenormalizerSpec() *NormalizerSpec { 992 | if x != nil { 993 | return x.DenormalizerSpec 994 | } 995 | return nil 996 | } 997 | 998 | type SelfTestData_Sample struct { 999 | state protoimpl.MessageState 1000 | sizeCache protoimpl.SizeCache 1001 | unknownFields protoimpl.UnknownFields 1002 | 1003 | Input *string `protobuf:"bytes,1,opt,name=input" json:"input,omitempty"` 1004 | Expected *string `protobuf:"bytes,2,opt,name=expected" json:"expected,omitempty"` 1005 | } 1006 | 1007 | func (x *SelfTestData_Sample) Reset() { 1008 | *x = SelfTestData_Sample{} 1009 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[4] 1010 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 1011 | ms.StoreMessageInfo(mi) 1012 | } 1013 | 1014 | func (x *SelfTestData_Sample) String() string { 1015 | return protoimpl.X.MessageStringOf(x) 1016 | } 1017 | 1018 | func (*SelfTestData_Sample) ProtoMessage() {} 1019 | 1020 | func (x *SelfTestData_Sample) ProtoReflect() protoreflect.Message { 1021 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[4] 1022 | if x != nil { 1023 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 1024 | if ms.LoadMessageInfo() == nil { 1025 | ms.StoreMessageInfo(mi) 1026 | } 1027 | return ms 1028 | } 1029 | return mi.MessageOf(x) 1030 | } 1031 | 1032 | // Deprecated: Use SelfTestData_Sample.ProtoReflect.Descriptor instead. 1033 | func (*SelfTestData_Sample) Descriptor() ([]byte, []int) { 1034 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{2, 0} 1035 | } 1036 | 1037 | func (x *SelfTestData_Sample) GetInput() string { 1038 | if x != nil && x.Input != nil { 1039 | return *x.Input 1040 | } 1041 | return "" 1042 | } 1043 | 1044 | func (x *SelfTestData_Sample) GetExpected() string { 1045 | if x != nil && x.Expected != nil { 1046 | return *x.Expected 1047 | } 1048 | return "" 1049 | } 1050 | 1051 | type ModelProto_SentencePiece struct { 1052 | state protoimpl.MessageState 1053 | sizeCache protoimpl.SizeCache 1054 | unknownFields protoimpl.UnknownFields 1055 | extensionFields protoimpl.ExtensionFields 1056 | 1057 | Piece *string `protobuf:"bytes,1,opt,name=piece" json:"piece,omitempty"` // piece must not be empty. 
1058 | Score *float32 `protobuf:"fixed32,2,opt,name=score" json:"score,omitempty"` 1059 | Type *ModelProto_SentencePiece_Type `protobuf:"varint,3,opt,name=type,enum=com.github.gomlx.go_huggingface.sentencepiece.ModelProto_SentencePiece_Type,def=1" json:"type,omitempty"` 1060 | } 1061 | 1062 | // Default values for ModelProto_SentencePiece fields. 1063 | const ( 1064 | Default_ModelProto_SentencePiece_Type = ModelProto_SentencePiece_NORMAL 1065 | ) 1066 | 1067 | func (x *ModelProto_SentencePiece) Reset() { 1068 | *x = ModelProto_SentencePiece{} 1069 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[5] 1070 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 1071 | ms.StoreMessageInfo(mi) 1072 | } 1073 | 1074 | func (x *ModelProto_SentencePiece) String() string { 1075 | return protoimpl.X.MessageStringOf(x) 1076 | } 1077 | 1078 | func (*ModelProto_SentencePiece) ProtoMessage() {} 1079 | 1080 | func (x *ModelProto_SentencePiece) ProtoReflect() protoreflect.Message { 1081 | mi := &file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes[5] 1082 | if x != nil { 1083 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 1084 | if ms.LoadMessageInfo() == nil { 1085 | ms.StoreMessageInfo(mi) 1086 | } 1087 | return ms 1088 | } 1089 | return mi.MessageOf(x) 1090 | } 1091 | 1092 | // Deprecated: Use ModelProto_SentencePiece.ProtoReflect.Descriptor instead. 1093 | func (*ModelProto_SentencePiece) Descriptor() ([]byte, []int) { 1094 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP(), []int{3, 0} 1095 | } 1096 | 1097 | func (x *ModelProto_SentencePiece) GetPiece() string { 1098 | if x != nil && x.Piece != nil { 1099 | return *x.Piece 1100 | } 1101 | return "" 1102 | } 1103 | 1104 | func (x *ModelProto_SentencePiece) GetScore() float32 { 1105 | if x != nil && x.Score != nil { 1106 | return *x.Score 1107 | } 1108 | return 0 1109 | } 1110 | 1111 | func (x *ModelProto_SentencePiece) GetType() ModelProto_SentencePiece_Type { 1112 | if x != nil && x.Type != nil { 1113 | return *x.Type 1114 | } 1115 | return Default_ModelProto_SentencePiece_Type 1116 | } 1117 | 1118 | var File_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto protoreflect.FileDescriptor 1119 | 1120 | var file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDesc = []byte{ 1121 | 0x0a, 0x61, 0x63, 0x6f, 0x6d, 0x5f, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x5f, 0x67, 0x6f, 0x6d, 1122 | 0x6c, 0x78, 0x5f, 0x67, 0x6f, 0x2d, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 0x63, 1123 | 0x65, 0x5f, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x65, 0x72, 0x73, 0x5f, 0x73, 0x65, 0x6e, 1124 | 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x70, 0x72, 0x69, 0x76, 0x61, 1125 | 0x74, 0x65, 0x5f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x73, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 1126 | 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x70, 0x72, 1127 | 0x6f, 0x74, 0x6f, 0x12, 0x2d, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 1128 | 0x67, 0x6f, 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 1129 | 0x66, 0x61, 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 1130 | 0x63, 0x65, 0x22, 0xe6, 0x12, 0x0a, 0x0b, 0x54, 0x72, 
0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 1131 | 0x65, 0x63, 0x12, 0x14, 0x0a, 0x05, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, 0x03, 0x28, 1132 | 0x09, 0x52, 0x05, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x69, 0x6e, 0x70, 0x75, 1133 | 0x74, 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 1134 | 0x69, 0x6e, 0x70, 0x75, 0x74, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x6d, 1135 | 0x6f, 0x64, 0x65, 0x6c, 0x5f, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, 0x18, 0x02, 0x20, 0x01, 0x28, 1136 | 0x09, 0x52, 0x0b, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x50, 0x72, 0x65, 0x66, 0x69, 0x78, 0x12, 0x6c, 1137 | 0x0a, 0x0a, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x18, 0x03, 0x20, 0x01, 1138 | 0x28, 0x0e, 0x32, 0x44, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 1139 | 0x67, 0x6f, 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 1140 | 0x66, 0x61, 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 1141 | 0x63, 0x65, 0x2e, 0x54, 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x2e, 0x4d, 1142 | 0x6f, 0x64, 0x65, 0x6c, 0x54, 0x79, 0x70, 0x65, 0x3a, 0x07, 0x55, 0x4e, 0x49, 0x47, 0x52, 0x41, 1143 | 0x4d, 0x52, 0x09, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x54, 0x79, 0x70, 0x65, 0x12, 0x23, 0x0a, 0x0a, 1144 | 0x76, 0x6f, 0x63, 0x61, 0x62, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 1145 | 0x3a, 0x04, 0x38, 0x30, 0x30, 0x30, 0x52, 0x09, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x53, 0x69, 0x7a, 1146 | 0x65, 0x12, 0x27, 0x0a, 0x0f, 0x61, 0x63, 0x63, 0x65, 0x70, 0x74, 0x5f, 0x6c, 0x61, 0x6e, 0x67, 1147 | 0x75, 0x61, 0x67, 0x65, 0x18, 0x05, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0e, 0x61, 0x63, 0x63, 0x65, 1148 | 0x70, 0x74, 0x4c, 0x61, 0x6e, 0x67, 0x75, 0x61, 0x67, 0x65, 0x12, 0x34, 0x0a, 0x15, 0x73, 0x65, 1149 | 0x6c, 0x66, 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x73, 1150 | 0x69, 0x7a, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x30, 0x52, 0x12, 0x73, 0x65, 1151 | 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x53, 0x69, 0x7a, 0x65, 1152 | 0x12, 0x45, 0x0a, 0x1b, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x64, 0x69, 0x66, 0x66, 0x65, 1153 | 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x70, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x18, 1154 | 0x32, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x19, 0x65, 0x6e, 1155 | 0x61, 0x62, 0x6c, 0x65, 0x44, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 1156 | 0x50, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x12, 0x4a, 0x0a, 0x20, 0x64, 0x69, 0x66, 0x66, 0x65, 1157 | 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x70, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x5f, 1158 | 0x6e, 0x6f, 0x69, 0x73, 0x65, 0x5f, 0x6c, 0x65, 0x76, 0x65, 0x6c, 0x18, 0x33, 0x20, 0x01, 0x28, 1159 | 0x02, 0x3a, 0x01, 0x30, 0x52, 0x1d, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 1160 | 0x61, 0x6c, 0x50, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x4e, 0x6f, 0x69, 0x73, 0x65, 0x4c, 0x65, 1161 | 0x76, 0x65, 0x6c, 0x12, 0x58, 0x0a, 0x27, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 1162 | 0x69, 0x61, 0x6c, 0x5f, 0x70, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x5f, 0x63, 0x6c, 0x69, 0x70, 1163 | 0x70, 0x69, 0x6e, 0x67, 0x5f, 0x74, 0x68, 0x72, 0x65, 0x73, 0x68, 0x6f, 0x6c, 0x64, 0x18, 0x34, 1164 | 0x20, 0x01, 0x28, 0x04, 0x3a, 0x01, 0x30, 0x52, 0x24, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 1165 | 
0x6e, 0x74, 0x69, 0x61, 0x6c, 0x50, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x43, 0x6c, 0x69, 0x70, 1166 | 0x70, 0x69, 0x6e, 0x67, 0x54, 0x68, 0x72, 0x65, 0x73, 0x68, 0x6f, 0x6c, 0x64, 0x12, 0x35, 0x0a, 1167 | 0x12, 0x63, 0x68, 0x61, 0x72, 0x61, 0x63, 0x74, 0x65, 0x72, 0x5f, 0x63, 0x6f, 0x76, 0x65, 0x72, 1168 | 0x61, 0x67, 0x65, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x02, 0x3a, 0x06, 0x30, 0x2e, 0x39, 0x39, 0x39, 1169 | 0x35, 0x52, 0x11, 0x63, 0x68, 0x61, 0x72, 0x61, 0x63, 0x74, 0x65, 0x72, 0x43, 0x6f, 0x76, 0x65, 1170 | 0x72, 0x61, 0x67, 0x65, 0x12, 0x31, 0x0a, 0x13, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x65, 1171 | 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x0b, 0x20, 0x01, 0x28, 1172 | 0x04, 0x3a, 0x01, 0x30, 0x52, 0x11, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x53, 0x65, 0x6e, 0x74, 0x65, 1173 | 0x6e, 0x63, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x3a, 0x0a, 0x16, 0x73, 0x68, 0x75, 0x66, 0x66, 1174 | 0x6c, 0x65, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 1175 | 0x65, 0x18, 0x13, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x14, 0x73, 1176 | 0x68, 0x75, 0x66, 0x66, 0x6c, 0x65, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x53, 0x65, 0x6e, 0x74, 0x65, 1177 | 0x6e, 0x63, 0x65, 0x12, 0x34, 0x0a, 0x14, 0x6d, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x5f, 0x73, 0x65, 1178 | 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x0c, 0x20, 0x01, 0x28, 1179 | 0x05, 0x42, 0x02, 0x18, 0x01, 0x52, 0x12, 0x6d, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x53, 0x65, 0x6e, 1180 | 0x74, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x38, 0x0a, 0x16, 0x74, 0x72, 0x61, 1181 | 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, 1182 | 0x69, 0x7a, 0x65, 0x18, 0x0d, 0x20, 0x01, 0x28, 0x05, 0x42, 0x02, 0x18, 0x01, 0x52, 0x14, 0x74, 1183 | 0x72, 0x61, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x53, 1184 | 0x69, 0x7a, 0x65, 0x12, 0x3f, 0x0a, 0x17, 0x73, 0x65, 0x65, 0x64, 0x5f, 0x73, 0x65, 0x6e, 0x74, 1185 | 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x0e, 1186 | 0x20, 0x01, 0x28, 0x05, 0x3a, 0x07, 0x31, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x52, 0x15, 0x73, 1187 | 0x65, 0x65, 0x64, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 1188 | 0x53, 0x69, 0x7a, 0x65, 0x12, 0x2f, 0x0a, 0x10, 0x73, 0x68, 0x72, 0x69, 0x6e, 0x6b, 0x69, 0x6e, 1189 | 0x67, 0x5f, 0x66, 0x61, 0x63, 0x74, 0x6f, 0x72, 0x18, 0x0f, 0x20, 0x01, 0x28, 0x02, 0x3a, 0x04, 1190 | 0x30, 0x2e, 0x37, 0x35, 0x52, 0x0f, 0x73, 0x68, 0x72, 0x69, 0x6e, 0x6b, 0x69, 0x6e, 0x67, 0x46, 1191 | 0x61, 0x63, 0x74, 0x6f, 0x72, 0x12, 0x34, 0x0a, 0x13, 0x6d, 0x61, 0x78, 0x5f, 0x73, 0x65, 0x6e, 1192 | 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x18, 0x12, 0x20, 0x01, 1193 | 0x28, 0x05, 0x3a, 0x04, 0x34, 0x31, 0x39, 0x32, 0x52, 0x11, 0x6d, 0x61, 0x78, 0x53, 0x65, 0x6e, 1194 | 0x74, 0x65, 0x6e, 0x63, 0x65, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x12, 0x23, 0x0a, 0x0b, 0x6e, 1195 | 0x75, 0x6d, 0x5f, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x18, 0x10, 0x20, 0x01, 0x28, 0x05, 1196 | 0x3a, 0x02, 0x31, 0x36, 0x52, 0x0a, 0x6e, 0x75, 0x6d, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 1197 | 0x12, 0x2f, 0x0a, 0x12, 0x6e, 0x75, 0x6d, 0x5f, 0x73, 0x75, 0x62, 0x5f, 0x69, 0x74, 0x65, 0x72, 1198 | 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x11, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x32, 0x52, 1199 | 0x10, 0x6e, 0x75, 0x6d, 0x53, 0x75, 0x62, 0x49, 
0x74, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 1200 | 0x73, 0x12, 0x3c, 0x0a, 0x18, 0x6d, 0x61, 0x78, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 1201 | 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x18, 0x14, 0x20, 1202 | 0x01, 0x28, 0x05, 0x3a, 0x02, 0x31, 0x36, 0x52, 0x16, 0x6d, 0x61, 0x78, 0x53, 0x65, 0x6e, 0x74, 1203 | 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x12, 1204 | 0x3b, 0x0a, 0x17, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x62, 0x79, 0x5f, 0x75, 0x6e, 0x69, 0x63, 1205 | 0x6f, 0x64, 0x65, 0x5f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x18, 0x15, 0x20, 0x01, 0x28, 0x08, 1206 | 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x14, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x42, 0x79, 0x55, 1207 | 0x6e, 0x69, 0x63, 0x6f, 0x64, 0x65, 0x53, 0x63, 0x72, 0x69, 0x70, 0x74, 0x12, 0x2c, 0x0a, 0x0f, 1208 | 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x62, 0x79, 0x5f, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x18, 1209 | 0x17, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x0d, 0x73, 0x70, 0x6c, 1210 | 0x69, 0x74, 0x42, 0x79, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x12, 0x34, 0x0a, 0x13, 0x73, 0x70, 1211 | 0x6c, 0x69, 0x74, 0x5f, 0x62, 0x79, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 1212 | 0x65, 0x18, 0x16, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x11, 0x73, 1213 | 0x70, 0x6c, 0x69, 0x74, 0x42, 0x79, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 1214 | 0x12, 0x42, 0x0a, 0x1a, 0x74, 0x72, 0x65, 0x61, 0x74, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 1215 | 0x70, 0x61, 0x63, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x73, 0x75, 0x66, 0x66, 0x69, 0x78, 0x18, 0x18, 1216 | 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x17, 0x74, 0x72, 0x65, 1217 | 0x61, 0x74, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x41, 0x73, 0x53, 0x75, 1218 | 0x66, 0x66, 0x69, 0x78, 0x12, 0x46, 0x0a, 0x1c, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x5f, 0x77, 0x68, 1219 | 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x5f, 0x70, 0x69, 1220 | 0x65, 0x63, 0x65, 0x73, 0x18, 0x1a, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 1221 | 0x65, 0x52, 0x19, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 1222 | 0x63, 0x65, 0x4f, 0x6e, 0x6c, 0x79, 0x50, 0x69, 0x65, 0x63, 0x65, 0x73, 0x12, 0x28, 0x0a, 0x0c, 1223 | 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x64, 0x69, 0x67, 0x69, 0x74, 0x73, 0x18, 0x19, 0x20, 0x01, 1224 | 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x0b, 0x73, 0x70, 0x6c, 0x69, 0x74, 1225 | 0x44, 0x69, 0x67, 0x69, 0x74, 0x73, 0x12, 0x3d, 0x0a, 0x19, 0x70, 0x72, 0x65, 0x74, 0x6f, 0x6b, 1226 | 0x65, 0x6e, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x64, 0x65, 0x6c, 0x69, 0x6d, 0x69, 1227 | 0x74, 0x65, 0x72, 0x18, 0x35, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x00, 0x52, 0x18, 0x70, 0x72, 0x65, 1228 | 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x44, 0x65, 0x6c, 0x69, 1229 | 0x6d, 0x69, 0x74, 0x65, 0x72, 0x12, 0x27, 0x0a, 0x0f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 1230 | 0x5f, 0x73, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x18, 0x1e, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0e, 1231 | 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x53, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x12, 0x30, 1232 | 0x0a, 0x14, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x5f, 0x73, 1233 | 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x18, 0x1f, 0x20, 0x03, 0x28, 0x09, 0x52, 0x12, 0x75, 0x73, 1234 
| 0x65, 0x72, 0x44, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x53, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 1235 | 0x12, 0x25, 0x0a, 0x0e, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x5f, 0x63, 0x68, 0x61, 1236 | 0x72, 0x73, 0x18, 0x24, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 1237 | 0x65, 0x64, 0x43, 0x68, 0x61, 0x72, 0x73, 0x12, 0x2a, 0x0a, 0x0d, 0x62, 0x79, 0x74, 0x65, 0x5f, 1238 | 0x66, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x18, 0x23, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 1239 | 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x0c, 0x62, 0x79, 0x74, 0x65, 0x46, 0x61, 0x6c, 0x6c, 0x62, 1240 | 0x61, 0x63, 0x6b, 0x12, 0x47, 0x0a, 0x1d, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x75, 0x6c, 0x61, 0x72, 1241 | 0x79, 0x5f, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x73, 1242 | 0x63, 0x6f, 0x72, 0x65, 0x18, 0x20, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 1243 | 0x52, 0x1a, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x75, 0x6c, 0x61, 0x72, 0x79, 0x4f, 0x75, 0x74, 0x70, 1244 | 0x75, 0x74, 0x50, 0x69, 0x65, 0x63, 0x65, 0x53, 0x63, 0x6f, 0x72, 0x65, 0x12, 0x2e, 0x0a, 0x10, 1245 | 0x68, 0x61, 0x72, 0x64, 0x5f, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x5f, 0x6c, 0x69, 0x6d, 0x69, 0x74, 1246 | 0x18, 0x21, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x0e, 0x68, 0x61, 1247 | 0x72, 0x64, 0x56, 0x6f, 0x63, 0x61, 0x62, 0x4c, 0x69, 0x6d, 0x69, 0x74, 0x12, 0x29, 0x0a, 0x0d, 1248 | 0x75, 0x73, 0x65, 0x5f, 0x61, 0x6c, 0x6c, 0x5f, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x18, 0x22, 0x20, 1249 | 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x0b, 0x75, 0x73, 0x65, 0x41, 1250 | 0x6c, 0x6c, 0x56, 0x6f, 0x63, 0x61, 0x62, 0x12, 0x18, 0x0a, 0x06, 0x75, 0x6e, 0x6b, 0x5f, 0x69, 1251 | 0x64, 0x18, 0x28, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x30, 0x52, 0x05, 0x75, 0x6e, 0x6b, 0x49, 1252 | 0x64, 0x12, 0x18, 0x0a, 0x06, 0x62, 0x6f, 0x73, 0x5f, 0x69, 0x64, 0x18, 0x29, 0x20, 0x01, 0x28, 1253 | 0x05, 0x3a, 0x01, 0x31, 0x52, 0x05, 0x62, 0x6f, 0x73, 0x49, 0x64, 0x12, 0x18, 0x0a, 0x06, 0x65, 1254 | 0x6f, 0x73, 0x5f, 0x69, 0x64, 0x18, 0x2a, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x32, 0x52, 0x05, 1255 | 0x65, 0x6f, 0x73, 0x49, 0x64, 0x12, 0x19, 0x0a, 0x06, 0x70, 0x61, 0x64, 0x5f, 0x69, 0x64, 0x18, 1256 | 0x2b, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x02, 0x2d, 0x31, 0x52, 0x05, 0x70, 0x61, 0x64, 0x49, 0x64, 1257 | 0x12, 0x22, 0x0a, 0x09, 0x75, 0x6e, 0x6b, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x2d, 0x20, 1258 | 0x01, 0x28, 0x09, 0x3a, 0x05, 0x3c, 0x75, 0x6e, 0x6b, 0x3e, 0x52, 0x08, 0x75, 0x6e, 0x6b, 0x50, 1259 | 0x69, 0x65, 0x63, 0x65, 0x12, 0x20, 0x0a, 0x09, 0x62, 0x6f, 0x73, 0x5f, 0x70, 0x69, 0x65, 0x63, 1260 | 0x65, 0x18, 0x2e, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x03, 0x3c, 0x73, 0x3e, 0x52, 0x08, 0x62, 0x6f, 1261 | 0x73, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x21, 0x0a, 0x09, 0x65, 0x6f, 0x73, 0x5f, 0x70, 0x69, 1262 | 0x65, 0x63, 0x65, 0x18, 0x2f, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x04, 0x3c, 0x2f, 0x73, 0x3e, 0x52, 1263 | 0x08, 0x65, 0x6f, 0x73, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x22, 0x0a, 0x09, 0x70, 0x61, 0x64, 1264 | 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x30, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x05, 0x3c, 0x70, 1265 | 0x61, 0x64, 0x3e, 0x52, 0x08, 0x70, 0x61, 0x64, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x26, 0x0a, 1266 | 0x0b, 0x75, 0x6e, 0x6b, 0x5f, 0x73, 0x75, 0x72, 0x66, 0x61, 0x63, 0x65, 0x18, 0x2c, 0x20, 0x01, 1267 | 0x28, 0x09, 0x3a, 0x05, 0x20, 0xe2, 0x81, 0x87, 0x20, 0x52, 0x0a, 0x75, 0x6e, 0x6b, 0x53, 0x75, 1268 | 0x72, 0x66, 0x61, 0x63, 0x65, 0x12, 0x46, 0x0a, 
0x1c, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x5f, 0x65, 1269 | 0x78, 0x74, 0x72, 0x65, 0x6d, 0x65, 0x6c, 0x79, 0x5f, 0x6c, 0x61, 0x72, 0x67, 0x65, 0x5f, 0x63, 1270 | 0x6f, 0x72, 0x70, 0x75, 0x73, 0x18, 0x31, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 1271 | 0x73, 0x65, 0x52, 0x19, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x45, 0x78, 0x74, 0x72, 0x65, 0x6d, 0x65, 1272 | 0x6c, 0x79, 0x4c, 0x61, 0x72, 0x67, 0x65, 0x43, 0x6f, 0x72, 0x70, 0x75, 0x73, 0x12, 0x3a, 0x0a, 1273 | 0x18, 0x73, 0x65, 0x65, 0x64, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 1274 | 0x65, 0x63, 0x65, 0x73, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x18, 0x36, 0x20, 0x01, 0x28, 0x09, 0x3a, 1275 | 0x00, 0x52, 0x16, 0x73, 0x65, 0x65, 0x64, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 1276 | 0x69, 0x65, 0x63, 0x65, 0x73, 0x46, 0x69, 0x6c, 0x65, 0x22, 0x35, 0x0a, 0x09, 0x4d, 0x6f, 0x64, 1277 | 0x65, 0x6c, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x4e, 0x49, 0x47, 0x52, 0x41, 1278 | 0x4d, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x42, 0x50, 0x45, 0x10, 0x02, 0x12, 0x08, 0x0a, 0x04, 1279 | 0x57, 0x4f, 0x52, 0x44, 0x10, 0x03, 0x12, 0x08, 0x0a, 0x04, 0x43, 0x48, 0x41, 0x52, 0x10, 0x04, 1280 | 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 0x22, 0xbd, 0x02, 0x0a, 0x0e, 1281 | 0x4e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x12, 1282 | 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 1283 | 0x6d, 0x65, 0x12, 0x31, 0x0a, 0x14, 0x70, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 1284 | 0x64, 0x5f, 0x63, 0x68, 0x61, 0x72, 0x73, 0x6d, 0x61, 0x70, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 1285 | 0x52, 0x13, 0x70, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x43, 0x68, 0x61, 1286 | 0x72, 0x73, 0x6d, 0x61, 0x70, 0x12, 0x2e, 0x0a, 0x10, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x75, 0x6d, 1287 | 0x6d, 0x79, 0x5f, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x3a, 1288 | 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x0e, 0x61, 0x64, 0x64, 0x44, 0x75, 0x6d, 0x6d, 0x79, 0x50, 1289 | 0x72, 0x65, 0x66, 0x69, 0x78, 0x12, 0x3e, 0x0a, 0x18, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x5f, 1290 | 0x65, 0x78, 0x74, 0x72, 0x61, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 1291 | 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x16, 0x72, 1292 | 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x45, 0x78, 0x74, 0x72, 0x61, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 1293 | 0x70, 0x61, 0x63, 0x65, 0x73, 0x12, 0x33, 0x0a, 0x12, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x5f, 1294 | 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 1295 | 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x11, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x57, 1296 | 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73, 0x12, 0x34, 0x0a, 0x16, 0x6e, 0x6f, 1297 | 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 1298 | 0x5f, 0x74, 0x73, 0x76, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x14, 0x6e, 0x6f, 0x72, 0x6d, 1299 | 0x61, 0x6c, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x75, 0x6c, 0x65, 0x54, 0x73, 0x76, 1300 | 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 0x22, 0xb3, 0x01, 0x0a, 0x0c, 1301 | 0x53, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x5c, 0x0a, 0x07, 1302 | 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x42, 0x2e, 1303 
| 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x67, 0x6f, 0x6d, 0x6c, 0x78, 1304 | 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 0x63, 0x65, 0x2e, 1305 | 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x53, 0x65, 1306 | 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x44, 0x61, 0x74, 0x61, 0x2e, 0x53, 0x61, 0x6d, 0x70, 0x6c, 1307 | 0x65, 0x52, 0x07, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x73, 0x1a, 0x3a, 0x0a, 0x06, 0x53, 0x61, 1308 | 0x6d, 0x70, 0x6c, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, 1309 | 0x01, 0x28, 0x09, 0x52, 0x05, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x12, 0x1a, 0x0a, 0x08, 0x65, 0x78, 1310 | 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x65, 0x78, 1311 | 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 1312 | 0x02, 0x22, 0x97, 0x06, 0x0a, 0x0a, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 1313 | 0x12, 0x5f, 0x0a, 0x06, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 1314 | 0x32, 0x47, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x67, 0x6f, 1315 | 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 1316 | 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 1317 | 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x53, 0x65, 0x6e, 0x74, 1318 | 0x65, 0x6e, 0x63, 0x65, 0x50, 0x69, 0x65, 0x63, 0x65, 0x52, 0x06, 0x70, 0x69, 0x65, 0x63, 0x65, 1319 | 0x73, 0x12, 0x5d, 0x0a, 0x0c, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x5f, 0x73, 0x70, 0x65, 1320 | 0x63, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x3a, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 1321 | 0x74, 0x68, 0x75, 0x62, 0x2e, 0x67, 0x6f, 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 1322 | 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 1323 | 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x54, 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 1324 | 0x70, 0x65, 0x63, 0x52, 0x0b, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 1325 | 0x12, 0x66, 0x0a, 0x0f, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x5f, 0x73, 1326 | 0x70, 0x65, 0x63, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x3d, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 1327 | 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x67, 0x6f, 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 1328 | 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 1329 | 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x4e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 1330 | 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x52, 0x0e, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 1331 | 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x61, 0x0a, 0x0e, 0x73, 0x65, 0x6c, 0x66, 1332 | 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 1333 | 0x32, 0x3b, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x67, 0x6f, 1334 | 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 1335 | 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 1336 | 0x2e, 0x53, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0c, 0x73, 1337 | 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x44, 
0x61, 0x74, 0x61, 0x12, 0x6a, 0x0a, 0x11, 0x64, 1338 | 0x65, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x5f, 0x73, 0x70, 0x65, 0x63, 1339 | 0x18, 0x05, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x3d, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 1340 | 0x68, 0x75, 0x62, 0x2e, 0x67, 0x6f, 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 1341 | 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 1342 | 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x4e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 1343 | 0x72, 0x53, 0x70, 0x65, 0x63, 0x52, 0x10, 0x64, 0x65, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 1344 | 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x1a, 0x86, 0x02, 0x0a, 0x0d, 0x53, 0x65, 0x6e, 0x74, 1345 | 0x65, 0x6e, 0x63, 0x65, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x70, 0x69, 0x65, 1346 | 0x63, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x70, 0x69, 0x65, 0x63, 0x65, 0x12, 1347 | 0x14, 0x0a, 0x05, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x02, 0x52, 0x05, 1348 | 0x73, 0x63, 0x6f, 0x72, 0x65, 0x12, 0x68, 0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x03, 0x20, 1349 | 0x01, 0x28, 0x0e, 0x32, 0x4c, 0x2e, 0x63, 0x6f, 0x6d, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 1350 | 0x2e, 0x67, 0x6f, 0x6d, 0x6c, 0x78, 0x2e, 0x67, 0x6f, 0x5f, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 1351 | 0x67, 0x66, 0x61, 0x63, 0x65, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 1352 | 0x65, 0x63, 0x65, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x53, 1353 | 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x50, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x54, 0x79, 0x70, 1354 | 0x65, 0x3a, 0x06, 0x4e, 0x4f, 0x52, 0x4d, 0x41, 0x4c, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x22, 1355 | 0x54, 0x0a, 0x04, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0a, 0x0a, 0x06, 0x4e, 0x4f, 0x52, 0x4d, 0x41, 1356 | 0x4c, 0x10, 0x01, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x02, 1357 | 0x12, 0x0b, 0x0a, 0x07, 0x43, 0x4f, 0x4e, 0x54, 0x52, 0x4f, 0x4c, 0x10, 0x03, 0x12, 0x10, 0x0a, 1358 | 0x0c, 0x55, 0x53, 0x45, 0x52, 0x5f, 0x44, 0x45, 0x46, 0x49, 0x4e, 0x45, 0x44, 0x10, 0x04, 0x12, 1359 | 0x08, 0x0a, 0x04, 0x42, 0x59, 0x54, 0x45, 0x10, 0x06, 0x12, 0x0a, 0x0a, 0x06, 0x55, 0x4e, 0x55, 1360 | 0x53, 0x45, 0x44, 0x10, 0x05, 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 1361 | 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 0x42, 0x4b, 0x48, 0x03, 0x5a, 1362 | 0x47, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x6f, 0x6d, 0x6c, 1363 | 0x78, 0x2f, 0x67, 0x6f, 0x2d, 0x68, 0x75, 0x67, 0x67, 0x69, 0x6e, 0x67, 0x66, 0x61, 0x63, 0x65, 1364 | 0x2f, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x65, 0x72, 0x73, 0x2f, 0x73, 0x65, 0x6e, 0x74, 1365 | 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2f, 0x70, 0x72, 0x69, 0x76, 0x61, 0x74, 1366 | 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x73, 1367 | } 1368 | 1369 | var ( 1370 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescOnce sync.Once 1371 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescData = file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDesc 1372 | ) 1373 | 1374 | func file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescGZIP() []byte { 1375 | 
file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescOnce.Do(func() { 1376 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescData = protoimpl.X.CompressGZIP(file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescData) 1377 | }) 1378 | return file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDescData 1379 | } 1380 | 1381 | var file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_enumTypes = make([]protoimpl.EnumInfo, 2) 1382 | var file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes = make([]protoimpl.MessageInfo, 6) 1383 | var file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_goTypes = []any{ 1384 | (TrainerSpec_ModelType)(0), // 0: com.github.gomlx.go_huggingface.sentencepiece.TrainerSpec.ModelType 1385 | (ModelProto_SentencePiece_Type)(0), // 1: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.SentencePiece.Type 1386 | (*TrainerSpec)(nil), // 2: com.github.gomlx.go_huggingface.sentencepiece.TrainerSpec 1387 | (*NormalizerSpec)(nil), // 3: com.github.gomlx.go_huggingface.sentencepiece.NormalizerSpec 1388 | (*SelfTestData)(nil), // 4: com.github.gomlx.go_huggingface.sentencepiece.SelfTestData 1389 | (*ModelProto)(nil), // 5: com.github.gomlx.go_huggingface.sentencepiece.ModelProto 1390 | (*SelfTestData_Sample)(nil), // 6: com.github.gomlx.go_huggingface.sentencepiece.SelfTestData.Sample 1391 | (*ModelProto_SentencePiece)(nil), // 7: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.SentencePiece 1392 | } 1393 | var file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_depIdxs = []int32{ 1394 | 0, // 0: com.github.gomlx.go_huggingface.sentencepiece.TrainerSpec.model_type:type_name -> com.github.gomlx.go_huggingface.sentencepiece.TrainerSpec.ModelType 1395 | 6, // 1: com.github.gomlx.go_huggingface.sentencepiece.SelfTestData.samples:type_name -> com.github.gomlx.go_huggingface.sentencepiece.SelfTestData.Sample 1396 | 7, // 2: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.pieces:type_name -> com.github.gomlx.go_huggingface.sentencepiece.ModelProto.SentencePiece 1397 | 2, // 3: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.trainer_spec:type_name -> com.github.gomlx.go_huggingface.sentencepiece.TrainerSpec 1398 | 3, // 4: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.normalizer_spec:type_name -> com.github.gomlx.go_huggingface.sentencepiece.NormalizerSpec 1399 | 4, // 5: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.self_test_data:type_name -> com.github.gomlx.go_huggingface.sentencepiece.SelfTestData 1400 | 3, // 6: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.denormalizer_spec:type_name -> com.github.gomlx.go_huggingface.sentencepiece.NormalizerSpec 1401 | 1, // 7: com.github.gomlx.go_huggingface.sentencepiece.ModelProto.SentencePiece.type:type_name -> com.github.gomlx.go_huggingface.sentencepiece.ModelProto.SentencePiece.Type 1402 | 8, // [8:8] is the sub-list for method output_type 1403 | 8, // [8:8] is the sub-list for method input_type 1404 | 8, // [8:8] is the sub-list for extension type_name 1405 | 8, // [8:8] is the sub-list for extension extendee 1406 | 0, // [0:8] is the sub-list 
for field type_name 1407 | } 1408 | 1409 | func init() { 1410 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_init() 1411 | } 1412 | func file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_init() { 1413 | if File_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto != nil { 1414 | return 1415 | } 1416 | type x struct{} 1417 | out := protoimpl.TypeBuilder{ 1418 | File: protoimpl.DescBuilder{ 1419 | GoPackagePath: reflect.TypeOf(x{}).PkgPath(), 1420 | RawDescriptor: file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDesc, 1421 | NumEnums: 2, 1422 | NumMessages: 6, 1423 | NumExtensions: 0, 1424 | NumServices: 0, 1425 | }, 1426 | GoTypes: file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_goTypes, 1427 | DependencyIndexes: file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_depIdxs, 1428 | EnumInfos: file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_enumTypes, 1429 | MessageInfos: file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_msgTypes, 1430 | }.Build() 1431 | File_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto = out.File 1432 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_rawDesc = nil 1433 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_goTypes = nil 1434 | file_com_github_gomlx_go_huggingface_tokenizers_sentencepiece_private_protos_sentencepiece_model_proto_depIdxs = nil 1435 | } 1436 | --------------------------------------------------------------------------------
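The generated ModelProto, TrainerSpec and NormalizerSpec messages above are the data structures a serialized SentencePiece model (e.g. a "tokenizer.model" file) is encoded with. As a minimal, hypothetical sketch (not part of the repository), decoding such a file only needs the google.golang.org/protobuf runtime that the generated code already relies on, plus the nil-safe getters defined above; the file name and the cap of five printed pieces are illustrative assumptions.

package main

import (
	"fmt"
	"os"

	"google.golang.org/protobuf/proto"

	"github.com/gomlx/go-huggingface/tokenizers/sentencepiece/private/protos"
)

func main() {
	// Read a serialized ModelProto, e.g. a SentencePiece "tokenizer.model" file.
	raw, err := os.ReadFile("tokenizer.model")
	if err != nil {
		panic(err)
	}

	// Unmarshal into the generated message defined in sentencepiece_model.pb.go.
	var model protos.ModelProto
	if err := proto.Unmarshal(raw, &model); err != nil {
		panic(err)
	}

	// The generated getters are nil-safe and fall back to the proto2 defaults
	// encoded in the raw descriptor (e.g. model_type defaults to UNIGRAM).
	fmt.Println("model type:", model.GetTrainerSpec().GetModelType())
	fmt.Println("pieces:", len(model.GetPieces()))
	for i, p := range model.GetPieces() {
		if i >= 5 {
			break
		}
		fmt.Printf("%q score=%.3f type=%v\n", p.GetPiece(), p.GetScore(), p.GetType())
	}
}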