├── .github
    └── workflows
    │   └── pages.yml
├── .gitignore
├── LICENSE
├── README.md
├── benchmark_test.go
├── doc
    └── toklogo2.png
├── example_test.go
├── go.mod
├── go.sum
├── internal
    ├── cmd
    │   ├── dumper
    │   │   └── main.go
    │   └── wasm
    │   │   ├── .gitignore
    │   │   ├── Makefile
    │   │   ├── assets
    │   │       ├── index.html
    │   │       ├── script.js
    │   │       └── wasm_exec.js
    │   │   └── main.go
    ├── model
    │   ├── gen.sh
    │   ├── sentencepiece_model.pb.go
    │   └── sentencepiece_model.proto
    ├── prefixmatcher
    │   ├── prefixmatcher.go
    │   └── prefixmatcher_test.go
    └── priorityqueue
    │   ├── priorityqueue.go
    │   └── priorityqueue_test.go
├── normalize.go
├── processor.go
├── processor_test.go
├── system_test.go
├── test
    ├── gocode1.txt
    ├── htmlcode1.txt
    ├── latexcode1.txt
    ├── opening-multilang.txt
    ├── perlcode1.txt
    ├── pg2000_spanish.txt
    ├── pg41845_telugu.txt
    ├── pg7193_english.txt
    ├── pycode1.txt
    ├── romeo-juliet-english.txt
    └── sp-dump-ids.py
└── token.go


/.github/workflows/pages.yml:
--------------------------------------------------------------------------------
 1 | # Simple workflow for deploying static content to GitHub Pages
 2 | name: Deploy static content to Pages
 3 | 
 4 | on:
 5 |   # Runs on pushes targeting the default branch
 6 |   push:
 7 |     branches: ["main"]
 8 | 
 9 |   # Allows you to run this workflow manually from the Actions tab
10 |   workflow_dispatch:
11 | 
12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
13 | permissions:
14 |   contents: read
15 |   pages: write
16 |   id-token: write
17 | 
18 | # Allow one concurrent deployment
19 | concurrency:
20 |   group: "pages"
21 |   cancel-in-progress: true
22 | 
23 | jobs:
24 |   # Single deploy job since we're just deploying
25 |   deploy:
26 |     environment:
27 |       name: github-pages
28 |       url: ${{ steps.deployment.outputs.page_url }}
29 |     runs-on: ubuntu-latest
30 |     steps:
31 |       - name: Checkout
32 |         uses: actions/checkout@v4
33 | 
34 |       - name: Set up Go
35 |         uses: actions/setup-go@v5
36 |         with:
37 |           go-version: 1.22.5
38 | 
39 |       - name: Setup Pages
40 |         uses: actions/configure-pages@v5
41 | 
42 |       - name: Build wasm
43 |         run: |
44 |           cd internal/cmd/wasm
45 |           make build
46 | 
47 |       - name: Upload artifact
48 |         uses: actions/upload-pages-artifact@v3
49 |         with:
50 |           # Publish the assets directory, which `make build` fills with gospm.wasm
51 |           path: 'internal/cmd/wasm/assets'
52 | 
53 |       - name: Deploy to GitHub Pages
54 |         id: deployment
55 |         uses: actions/deploy-pages@v4
56 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # If you prefer the allow list template instead of the deny list, see community template:
 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
 3 | #
 4 | # Binaries for programs and plugins
 5 | *.exe
 6 | *.exe~
 7 | *.dll
 8 | *.so
 9 | *.dylib
10 | 
11 | # Test binary, built with `go test -c`
12 | *.test
13 | 
14 | # Output of the go coverage tool, specifically when used with LiteIDE
15 | *.out
16 | 
17 | # Dependency directories (remove the comment below to include it)
18 | # vendor/
19 | 
20 | # Go workspace file
21 | go.work
22 | go.work.sum
23 | 
24 | # env file
25 | .env
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # go-sentencepiece
 2 | 
 3 | <p align="center">
 4 |   <img alt="Logo" src="doc/toklogo2.png" />
 5 | </p>
 6 | 
 7 | ----
 8 | 
 9 | [![Go Reference](https://pkg.go.dev/badge/github.com/eliben/go-sentencepiece.svg)](https://pkg.go.dev/github.com/eliben/go-sentencepiece)
10 | 
11 | This is a pure Go implementation of encoding and decoding text with
12 | the [SentencePiece tokenizer](https://github.com/google/sentencepiece).
13 | 
14 | "Encoding" is the operation used to split text into tokens, using
15 | a trained tokenizer model. "Decoding" is the reverse process - converting
16 | a list of tokens into the original text.
17 | 
18 | SentencePiece is a general family of tokenizers that is configured
19 | by a protobuf configuration file. This repository currently focuses
20 | on implementing just the functionality required to reproduce the
21 | tokenization of [Gemma models](https://ai.google.dev/gemma) (the same
22 | tokenizer is used for Google's proprietary Gemini family of models).
23 | Specifically, it only implements BPE tokenization since this is what
24 | Gemma uses.
25 | 
26 | ## Current status
27 | 
28 | This package should be ready to use for encoding text into tokens
29 | using the Gemma tokenizer; it's been reasonably optimized and extensively
30 | tested vs. the [SentencePiece Python bindings](https://pypi.org/project/sentencepiece/)
31 | (see `system_test.go` in this repository).
32 | 
33 | If you find any problems or discrepancies, please open an issue.
34 | 
35 | ## Tokenizer configuration
36 | 
37 | The configuration file for the tokenizer is a protobuf (structured
38 | data, serialized in the [protocol buffer format](https://protobuf.dev/))
39 | that describes a trained tokenizer model; it includes
40 | the complete learned vocabulary used for tokenization, as well as
41 | other configuration information.
42 | 
43 | It is not part of this repository. Please fetch it from the
44 | [official Gemma implementation repository](https://github.com/google/gemma_pytorch/tree/main/tokenizer).
45 | `NewProcessor*` constructors will expect to read this file.
46 | 
47 | ## Developing
48 | 
49 | A protobuf is used to configure the tokenizer. The structure of the
50 | protobuf is described by the `internal/model/sentencepiece_model.proto` file,
51 | which is vendored from https://github.com/google/sentencepiece.
52 | 
53 | To re-generate the `*.pb.go` file from it:
54 | 
55 | ```
56 | $ cd internal/model
57 | $ ./gen.sh
58 | ```
59 | 
60 | The configuration protobuf itself is obtained as described in the
61 | [Tokenizer configuration](#tokenizer-configuration) section. All
62 | tests require the `MODELPATH` env var to point to a local
63 | copy of the tokenizer configuration file.
64 | 
65 | ## Online demo
66 | 
67 | To see an in-browser demo of this tokenizer in action, visit
68 | https://eliben.github.io/go-sentencepiece/
69 | 
70 | The Go code is compiled to WebAssembly and loaded from a small
71 | JS program to allow interactive encoding of text.
72 | 


--------------------------------------------------------------------------------
/benchmark_test.go:
--------------------------------------------------------------------------------
 1 | package sentencepiece
 2 | 
 3 | import (
 4 | 	"io/ioutil"
 5 | 	"path/filepath"
 6 | 	"runtime"
 7 | 	"testing"
 8 | )
 9 | 
10 | func BenchmarkEncoder(b *testing.B) {
11 | 	buf, err := ioutil.ReadFile(filepath.Join("test", "pg7193_english.txt"))
12 | 	if err != nil {
13 | 		b.Fatal(err)
14 | 	}
15 | 	sbuf := string(buf)
16 | 
17 | 	proc := createProcessor(b)
18 | 	b.ResetTimer()
19 | 	total := 0
20 | 
21 | 	for range b.N {
22 | 		toks := proc.Encode(sbuf)
23 | 		total += len(toks)
24 | 	}
25 | 	runtime.KeepAlive(total)
26 | 
27 | 	b.ReportMetric(float64(total)/float64(b.Elapsed().Seconds()), "tokens/sec")
28 | }
29 | 
30 | func BenchmarkDecoder(b *testing.B) {
31 | 	buf, err := ioutil.ReadFile(filepath.Join("test", "pg7193_english.txt"))
32 | 	if err != nil {
33 | 		b.Fatal(err)
34 | 	}
35 | 	sbuf := string(buf)
36 | 
37 | 	proc := createProcessor(b)
38 | 	toks := proc.Encode(sbuf)
39 | 
40 | 	b.ResetTimer()
41 | 	total := 0
42 | 
43 | 	for range b.N {
44 | 		t := proc.DecodeTokens(toks)
45 | 		total += len(t)
46 | 	}
47 | 	runtime.KeepAlive(total)
48 | 
49 | 	b.ReportMetric(float64(len(toks)*b.N)/float64(b.Elapsed().Seconds()), "tokens/sec")
50 | }
51 | 


--------------------------------------------------------------------------------
/doc/toklogo2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eliben/go-sentencepiece/dd59fe97df461d1fa84d15c25a51f025156eece1/doc/toklogo2.png


--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
 1 | package sentencepiece_test
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 	"os"
 7 | 
 8 | 	"github.com/eliben/go-sentencepiece"
 9 | )
10 | 
11 | func ExampleEncode() {
12 | 	protoFile := os.Getenv("MODELPATH")
13 | 	if protoFile == "" {
14 | 		log.Println("Need MODELPATH env var to run example")
15 | 		return
16 | 	}
17 | 
18 | 	proc, err := sentencepiece.NewProcessorFromPath(protoFile)
19 | 	if err != nil {
20 | 		log.Fatal(err)
21 | 	}
22 | 
23 | 	text := "Encoding produces tokens that LLMs can learn and understand"
24 | 	tokens := proc.Encode(text)
25 | 
26 | 	for _, token := range tokens {
27 | 		fmt.Println(token)
28 | 	}
29 | }
30 | 
31 | // ExampleDecode loads the model named by MODELPATH and prints the text
32 | // that Decode returns for a fixed pair of IDs.
33 | func ExampleDecode() {
34 | 	modelPath := os.Getenv("MODELPATH")
35 | 	if modelPath == "" {
36 | 		log.Println("Need MODELPATH env var to run example")
37 | 		return
38 | 	}
39 | 
40 | 	processor, loadErr := sentencepiece.NewProcessorFromPath(modelPath)
41 | 	if loadErr != nil {
42 | 		log.Fatal(loadErr)
43 | 	}
44 | 
45 | 	decoded := processor.Decode([]int{17534, 2134})
46 | 	fmt.Println(decoded)
47 | }
48 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/eliben/go-sentencepiece
2 | 
3 | go 1.22.5
4 | 
5 | require google.golang.org/protobuf v1.34.2
6 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=
2 | github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
3 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
4 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
5 | google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
6 | google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
7 | 


--------------------------------------------------------------------------------
/internal/cmd/dumper/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | // Command dumper is a debugging utility for internal use. It helps explore
 4 | // the model proto and compare results with other tools.
 5 | 
 6 | import (
 7 | 	"flag"
 8 | 	"fmt"
 9 | 	"io/ioutil"
10 | 	"log"
11 | 	"os"
12 | 	"unicode"
13 | 
14 | 	"github.com/eliben/go-sentencepiece"
15 | 	"github.com/eliben/go-sentencepiece/internal/model"
16 | 	"google.golang.org/protobuf/encoding/prototext"
17 | 	"google.golang.org/protobuf/proto"
18 | )
19 | 
20 | // main parses the command-line flags, loads the model proto from the file
21 | // named by the MODELPATH env var, and runs the first selected action.
22 | func main() {
23 | 	fDumpAll := flag.Bool("dumpall", false, "dump entire model proto")
24 | 	fFindUni := flag.Bool("finduni", false, "find unicode runes not in pieces")
25 | 	fFindBytes := flag.Bool("findbytes", false, "show all byte pieces with their IDs")
26 | 	fEncodeFile := flag.String("encodefile", "", "file name to open and encode")
27 | 	flag.Parse()
28 | 
29 | 	modelPath := os.Getenv("MODELPATH")
30 | 	if modelPath == "" {
31 | 		log.Fatal("Need MODELPATH env var to run")
32 | 	}
33 | 	raw, err := ioutil.ReadFile(modelPath)
34 | 	if err != nil {
35 | 		log.Fatal(err)
36 | 	}
37 | 
38 | 	var protomodel model.ModelProto
39 | 	if err := proto.Unmarshal(raw, &protomodel); err != nil {
40 | 		log.Fatal(err)
41 | 	}
42 | 
43 | 	// Precedence when several flags are given: dumpall, findbytes, finduni,
44 | 	// encodefile. With no flag set, nothing is printed.
45 | 	switch {
46 | 	case *fDumpAll:
47 | 		fmt.Println(prototext.Format(&protomodel))
48 | 	case *fFindBytes:
49 | 		for id, p := range protomodel.GetPieces() {
50 | 			if p.GetType() == model.ModelProto_SentencePiece_BYTE {
51 | 				fmt.Printf("%5d: %s\n", id, p.GetPiece())
52 | 			}
53 | 		}
54 | 	case *fFindUni:
55 | 		// Index every piece by its text so single runes can be looked up.
56 | 		known := make(map[string]int)
57 | 		for id, p := range protomodel.GetPieces() {
58 | 			known[p.GetPiece()] = id
59 | 		}
60 | 		for r := rune(0); r <= unicode.MaxRune; r++ {
61 | 			if !unicode.IsPrint(r) {
62 | 				continue
63 | 			}
64 | 			if _, ok := known[string(r)]; !ok {
65 | 				fmt.Printf("not in pieces: %U %q\n", r, string(r))
66 | 			}
67 | 		}
68 | 	case *fEncodeFile != "":
69 | 		proc, err := sentencepiece.NewProcessorFromPath(modelPath)
70 | 		if err != nil {
71 | 			log.Fatal(err)
72 | 		}
73 | 		input, err := ioutil.ReadFile(*fEncodeFile)
74 | 		if err != nil {
75 | 			log.Fatal(err)
76 | 		}
77 | 		for _, tok := range proc.Encode(string(input)) {
78 | 			fmt.Println(tok.ID)
79 | 		}
80 | 	}
81 | }
82 | 


--------------------------------------------------------------------------------
/internal/cmd/wasm/.gitignore:
--------------------------------------------------------------------------------
1 | *.wasm
2 | embed_data
3 | 


--------------------------------------------------------------------------------
/internal/cmd/wasm/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: build serve clean
 2 | # build: download the tokenizer model into embed_data/ and compile main.go to assets/gospm.wasm.
 3 | build:
 4 | 	mkdir -p embed_data
 5 | 	wget https://github.com/google/gemma_pytorch/raw/main/tokenizer/tokenizer.model -O embed_data/tokenizer.model
 6 | 	GOOS=js GOARCH=wasm go build -o assets/gospm.wasm main.go
 7 | # serve: run static-server with port 8873 and the assets directory (presumably serving it locally).
 8 | serve:
 9 | 	go run github.com/eliben/static-server@latest -port 8873 assets
10 | # clean: delete the downloaded model directory and the built wasm binary.
11 | clean:
12 | 	rm -rf embed_data assets/gospm.wasm
13 | 


--------------------------------------------------------------------------------
/internal/cmd/wasm/assets/index.html:
--------------------------------------------------------------------------------
  1 | <!doctype html>
  2 | <style>
  3 |   .container {
  4 |     font-family: Verdana, sans-serif;
  5 |     margin: 0 auto;
  6 |     display: grid;
  7 |     max-width: 900px;
  8 |     grid-template-columns: 2fr 1fr;
  9 |     grid-template-rows: repeat(5, auto); /* five rows, each sized to its content */
 10 |     grid-template-areas:
 11 |       "textTitle  sidebar"
 12 |       "textBox    sidebar"
 13 |       "outTitle   sidebar"
 14 |       "outBox     sidebar"
 15 |       "flipbox    sidebar";
 16 |   }
 17 | 
 18 |   .textTitle {
 19 |     grid-area: textTitle;
 20 |     justify-self: center;
 21 |     font-weight: bold;
 22 |   }
 23 | 
 24 |   .textBox {
 25 |     grid-area: textBox;
 26 |     justify-self: center;
 27 |   }
 28 | 
 29 |   .outTitle {
 30 |     grid-area: outTitle;
 31 |     justify-self: center;
 32 |     font-weight: bold;
 33 |     padding-top: 40px;
 34 |   }
 35 | 
 36 |   .outBox {
 37 |     display: inline-block;
 38 |     grid-area: outBox;
 39 |     justify-self: center;
 40 |     padding: 10px;
 41 |     max-height: 500px;
 42 |     min-height: 200px;
 43 |     width: 550px;
 44 |     background: #dbdbdb;
 45 |     overflow: auto;
 46 |     font-family: Verdana, monospace;
 47 |     font-size: 14px;
 48 |   }
 49 | 
 50 |   .flipbox {
 51 |     grid-area: flipbox;
 52 |     justify-self: center;
 53 |     text-align: center;
 54 |     padding-top: 6px;
 55 |     padding-right: 10px;
 56 |   }
 57 | 
 58 |   .sidebar {
 59 |     grid-area: sidebar;
 60 |     font-family: Verdana, sans-serif;
 61 |     font-size: 14px;
 62 |     border: solid 1px;
 63 |     border-color: darkgray;
 64 |     padding: 5px;
 65 |   }
 66 | 
 67 |   .sidebar-list {
 68 |     padding-left: 1em;
 69 |   }
 70 | 
 71 |   .toggle-switch {
 72 |     display: flex;
 73 |     border: 2px solid #656865;
 74 |     border-radius: 10px;
 75 |     overflow: hidden;
 76 |   }
 77 | 
 78 |   .option {
 79 |     flex: 1;
 80 |     text-align: center;
 81 |     padding: 6px 6px;
 82 |     cursor: pointer;
 83 |     transition: background-color 0.3s ease;
 84 |     user-select: none;
 85 |   }
 86 | 
 87 |   input[type="radio"] {
 88 |     display: none;
 89 |   }
 90 | 
 91 |   input[type="radio"]:checked+label {
 92 |     background-color: #4c6faf;
 93 |     color: white;
 94 |   }
 95 | </style>
 96 | 
 97 | <script src="wasm_exec.js"></script>
 98 | <script>
 99 |   const go = new Go();
100 |   WebAssembly.instantiateStreaming(fetch("gospm.wasm"), go.importObject).then(
101 |     (result) => {
102 |       go.run(result.instance);
103 |       init();
104 |     });
105 | </script>
106 | 
107 | <body>
108 |   <div class="container">
109 |     <div class="textTitle">Text</div>
110 |     <div class="textBox">
111 |       <textarea id="text" name="text" rows="12" cols="70" spellcheck="false"></textarea>
112 |     </div>
113 |     <div class="outTitle">Tokens</div>
114 |     <div class="outBox">
115 |       <div id="tokens" name="tokens"></div>
116 |     </div>
117 |     <div class="flipbox">
118 |       <div class="toggle-switch">
119 |         <input type="radio" id="showText" name="toggle" value="showText" checked>
120 |         <label for="showText" class="option">Text</label>
121 | 
122 |         <input type="radio" id="showTokens" name="toggle" value="showTokens">
123 |         <label for="showTokens" class="option">IDs</label>
124 |       </div>
125 |     </div>
126 |     <div class="sidebar">
127 |       <p><b>SentencePiece tokenizer</b></p>
128 |       <p>
129 |         Enter text in the box - tokenization is done as you type.
130 |       </p>
131 |       <hr/>
132 |       <p>
133 |         This is a <a href="https://github.com/google/sentencepiece">SentencePiece</a>
134 |         tokenizer implemented in pure Go and compiled to WebAssembly.
135 |         The vocabulary and settings are taken from the
136 |         <a href="https://ai.google.dev/gemma">Google AI Gemma open model</a>.
137 |       </p>
138 | 
139 |     </div>
140 |   </div>
141 |   <script src="script.js"></script>
142 | </body>


--------------------------------------------------------------------------------
/internal/cmd/wasm/assets/script.js:
--------------------------------------------------------------------------------
 1 | 'use strict';
 2 | 
 3 | const TextBox = document.querySelector('#text');
 4 | TextBox.addEventListener('input', onStateChange);
 5 | 
 6 | const OutBox = document.querySelector('#tokens');
 7 | 
 8 | let radioText = document.querySelector('#showText');
 9 | let radioTokens = document.querySelector('#showTokens');
10 | radioText.addEventListener('change', onStateChange);
11 | radioTokens.addEventListener('change', onStateChange);
12 | 
13 | function init() {
14 |     // Called by index.html after go.run() on the wasm instance: do a first redraw.
15 |     onStateChange();
16 | }
17 | 
18 | //------------------
19 | 
20 | function onStateChange() {
21 |     const text = TextBox.value;
22 | 
23 |     if (radioTokens.checked) {
24 |         const start = performance.now();
25 |         let tokens = textToIDs(text);
26 |         const end = performance.now();
27 |         console.log("textToIDs elapsed (ms): ", end - start);
28 |         OutBox.textContent = "[" + tokens.join(", ") + "]";
29 |     } else {
30 |         const start = performance.now();
31 |         let pieces = textToPieces(text);
32 |         const end = performance.now();
33 |         console.log("textToPieces elapsed (ms): ", end - start);
34 |         console.log(pieces);
35 | 
36 |         OutBox.innerHTML = '';
37 |         // To have different background colors for each piece, we need to
38 |         // wrap each piece in a span. The color is cycled between 8 different
39 |         // colors, in jumps of 135 degrees to make them sufficiently far apart
40 |         // and not repeat for 8 cycles (since 360/8 = 45, we could use any
41 |         // multiple of 45 that's not also a multiple of 180).
42 |         for (let i = 0; i < pieces.length; i++) {
43 |             if (pieces[i] === '\n') {
44 |                 OutBox.appendChild(document.createElement('br'));
45 |             } else {
46 |                 let color = i % 8;
47 |                 let span = document.createElement('span');
48 |                 span.textContent = pieces[i];
49 |                 span.style.lineHeight = 1.5;
50 |                 span.style.backgroundColor = `hsl(${color * 135}, 40%, 70%)`;
51 |                 span.style.whiteSpace = 'pre';
52 |                 span.style.display = 'inline-block';
53 |                 OutBox.appendChild(span);
54 |             }
55 |         }
56 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/internal/cmd/wasm/assets/wasm_exec.js:
--------------------------------------------------------------------------------
  1 | // Copyright 2018 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | "use strict";
  6 | 
  7 | (() => {
  8 | 	const enosys = () => {
  9 | 		const err = new Error("not implemented");
 10 | 		err.code = "ENOSYS";
 11 | 		return err;
 12 | 	};
 13 | 
 14 | 	if (!globalThis.fs) {
 15 | 		let outputBuf = "";
 16 | 		globalThis.fs = {
 17 | 			constants: { O_WRONLY: -1, O_RDWR: -1, O_CREAT: -1, O_TRUNC: -1, O_APPEND: -1, O_EXCL: -1 }, // unused
 18 | 			writeSync(fd, buf) {
 19 | 				outputBuf += decoder.decode(buf);
 20 | 				const nl = outputBuf.lastIndexOf("\n");
 21 | 				if (nl != -1) {
 22 | 					console.log(outputBuf.substring(0, nl));
 23 | 					outputBuf = outputBuf.substring(nl + 1);
 24 | 				}
 25 | 				return buf.length;
 26 | 			},
 27 | 			write(fd, buf, offset, length, position, callback) {
 28 | 				if (offset !== 0 || length !== buf.length || position !== null) {
 29 | 					callback(enosys());
 30 | 					return;
 31 | 				}
 32 | 				const n = this.writeSync(fd, buf);
 33 | 				callback(null, n);
 34 | 			},
 35 | 			chmod(path, mode, callback) { callback(enosys()); },
 36 | 			chown(path, uid, gid, callback) { callback(enosys()); },
 37 | 			close(fd, callback) { callback(enosys()); },
 38 | 			fchmod(fd, mode, callback) { callback(enosys()); },
 39 | 			fchown(fd, uid, gid, callback) { callback(enosys()); },
 40 | 			fstat(fd, callback) { callback(enosys()); },
 41 | 			fsync(fd, callback) { callback(null); },
 42 | 			ftruncate(fd, length, callback) { callback(enosys()); },
 43 | 			lchown(path, uid, gid, callback) { callback(enosys()); },
 44 | 			link(path, link, callback) { callback(enosys()); },
 45 | 			lstat(path, callback) { callback(enosys()); },
 46 | 			mkdir(path, perm, callback) { callback(enosys()); },
 47 | 			open(path, flags, mode, callback) { callback(enosys()); },
 48 | 			read(fd, buffer, offset, length, position, callback) { callback(enosys()); },
 49 | 			readdir(path, callback) { callback(enosys()); },
 50 | 			readlink(path, callback) { callback(enosys()); },
 51 | 			rename(from, to, callback) { callback(enosys()); },
 52 | 			rmdir(path, callback) { callback(enosys()); },
 53 | 			stat(path, callback) { callback(enosys()); },
 54 | 			symlink(path, link, callback) { callback(enosys()); },
 55 | 			truncate(path, length, callback) { callback(enosys()); },
 56 | 			unlink(path, callback) { callback(enosys()); },
 57 | 			utimes(path, atime, mtime, callback) { callback(enosys()); },
 58 | 		};
 59 | 	}
 60 | 
 61 | 	if (!globalThis.process) {
 62 | 		globalThis.process = {
 63 | 			getuid() { return -1; },
 64 | 			getgid() { return -1; },
 65 | 			geteuid() { return -1; },
 66 | 			getegid() { return -1; },
 67 | 			getgroups() { throw enosys(); },
 68 | 			pid: -1,
 69 | 			ppid: -1,
 70 | 			umask() { throw enosys(); },
 71 | 			cwd() { throw enosys(); },
 72 | 			chdir() { throw enosys(); },
 73 | 		}
 74 | 	}
 75 | 
 76 | 	if (!globalThis.crypto) {
 77 | 		throw new Error("globalThis.crypto is not available, polyfill required (crypto.getRandomValues only)");
 78 | 	}
 79 | 
 80 | 	if (!globalThis.performance) {
 81 | 		throw new Error("globalThis.performance is not available, polyfill required (performance.now only)");
 82 | 	}
 83 | 
 84 | 	if (!globalThis.TextEncoder) {
 85 | 		throw new Error("globalThis.TextEncoder is not available, polyfill required");
 86 | 	}
 87 | 
 88 | 	if (!globalThis.TextDecoder) {
 89 | 		throw new Error("globalThis.TextDecoder is not available, polyfill required");
 90 | 	}
 91 | 
 92 | 	const encoder = new TextEncoder("utf-8");
 93 | 	const decoder = new TextDecoder("utf-8");
 94 | 
 95 | 	globalThis.Go = class {
 96 | 		constructor() {
 97 | 			this.argv = ["js"];
 98 | 			this.env = {};
 99 | 			this.exit = (code) => {
100 | 				if (code !== 0) {
101 | 					console.warn("exit code:", code);
102 | 				}
103 | 			};
104 | 			this._exitPromise = new Promise((resolve) => {
105 | 				this._resolveExitPromise = resolve;
106 | 			});
107 | 			this._pendingEvent = null;
108 | 			this._scheduledTimeouts = new Map();
109 | 			this._nextCallbackTimeoutID = 1;
110 | 
111 | 			const setInt64 = (addr, v) => {
112 | 				this.mem.setUint32(addr + 0, v, true);
113 | 				this.mem.setUint32(addr + 4, Math.floor(v / 4294967296), true);
114 | 			}
115 | 
116 | 			const setInt32 = (addr, v) => {
117 | 				this.mem.setUint32(addr + 0, v, true);
118 | 			}
119 | 
120 | 			const getInt64 = (addr) => {
121 | 				const low = this.mem.getUint32(addr + 0, true);
122 | 				const high = this.mem.getInt32(addr + 4, true);
123 | 				return low + high * 4294967296;
124 | 			}
125 | 
126 | 			const loadValue = (addr) => {
127 | 				const f = this.mem.getFloat64(addr, true);
128 | 				if (f === 0) {
129 | 					return undefined;
130 | 				}
131 | 				if (!isNaN(f)) {
132 | 					return f;
133 | 				}
134 | 
135 | 				const id = this.mem.getUint32(addr, true);
136 | 				return this._values[id];
137 | 			}
138 | 
139 | 			const storeValue = (addr, v) => {
140 | 				const nanHead = 0x7FF80000;
141 | 
142 | 				if (typeof v === "number" && v !== 0) {
143 | 					if (isNaN(v)) {
144 | 						this.mem.setUint32(addr + 4, nanHead, true);
145 | 						this.mem.setUint32(addr, 0, true);
146 | 						return;
147 | 					}
148 | 					this.mem.setFloat64(addr, v, true);
149 | 					return;
150 | 				}
151 | 
152 | 				if (v === undefined) {
153 | 					this.mem.setFloat64(addr, 0, true);
154 | 					return;
155 | 				}
156 | 
157 | 				let id = this._ids.get(v);
158 | 				if (id === undefined) {
159 | 					id = this._idPool.pop();
160 | 					if (id === undefined) {
161 | 						id = this._values.length;
162 | 					}
163 | 					this._values[id] = v;
164 | 					this._goRefCounts[id] = 0;
165 | 					this._ids.set(v, id);
166 | 				}
167 | 				this._goRefCounts[id]++;
168 | 				let typeFlag = 0;
169 | 				switch (typeof v) {
170 | 					case "object":
171 | 						if (v !== null) {
172 | 							typeFlag = 1;
173 | 						}
174 | 						break;
175 | 					case "string":
176 | 						typeFlag = 2;
177 | 						break;
178 | 					case "symbol":
179 | 						typeFlag = 3;
180 | 						break;
181 | 					case "function":
182 | 						typeFlag = 4;
183 | 						break;
184 | 				}
185 | 				this.mem.setUint32(addr + 4, nanHead | typeFlag, true);
186 | 				this.mem.setUint32(addr, id, true);
187 | 			}
188 | 
189 | 			const loadSlice = (addr) => {
190 | 				const array = getInt64(addr + 0);
191 | 				const len = getInt64(addr + 8);
192 | 				return new Uint8Array(this._inst.exports.mem.buffer, array, len);
193 | 			}
194 | 
195 | 			const loadSliceOfValues = (addr) => {
196 | 				const array = getInt64(addr + 0);
197 | 				const len = getInt64(addr + 8);
198 | 				const a = new Array(len);
199 | 				for (let i = 0; i < len; i++) {
200 | 					a[i] = loadValue(array + i * 8);
201 | 				}
202 | 				return a;
203 | 			}
204 | 
205 | 			const loadString = (addr) => {
206 | 				const saddr = getInt64(addr + 0);
207 | 				const len = getInt64(addr + 8);
208 | 				return decoder.decode(new DataView(this._inst.exports.mem.buffer, saddr, len));
209 | 			}
210 | 
211 | 			const timeOrigin = Date.now() - performance.now();
212 | 			this.importObject = {
213 | 				_gotest: {
214 | 					add: (a, b) => a + b,
215 | 				},
216 | 				gojs: {
217 | 					// Go's SP does not change as long as no Go code is running. Some operations (e.g. calls, getters and setters)
218 | 					// may synchronously trigger a Go event handler. This makes Go code get executed in the middle of the imported
219 | 					// function. A goroutine can switch to a new stack if the current stack is too small (see morestack function).
220 | 					// This changes the SP, thus we have to update the SP used by the imported function.
221 | 
222 | 					// func wasmExit(code int32)
223 | 					"runtime.wasmExit": (sp) => {
224 | 						sp >>>= 0;
225 | 						const code = this.mem.getInt32(sp + 8, true);
226 | 						this.exited = true;
227 | 						delete this._inst;
228 | 						delete this._values;
229 | 						delete this._goRefCounts;
230 | 						delete this._ids;
231 | 						delete this._idPool;
232 | 						this.exit(code);
233 | 					},
234 | 
235 | 					// func wasmWrite(fd uintptr, p unsafe.Pointer, n int32)
236 | 					"runtime.wasmWrite": (sp) => {
237 | 						sp >>>= 0;
238 | 						const fd = getInt64(sp + 8);
239 | 						const p = getInt64(sp + 16);
240 | 						const n = this.mem.getInt32(sp + 24, true);
241 | 						fs.writeSync(fd, new Uint8Array(this._inst.exports.mem.buffer, p, n));
242 | 					},
243 | 
244 | 					// func resetMemoryDataView()
245 | 					"runtime.resetMemoryDataView": (sp) => {
246 | 						sp >>>= 0;
247 | 						this.mem = new DataView(this._inst.exports.mem.buffer);
248 | 					},
249 | 
250 | 					// func nanotime1() int64
251 | 					"runtime.nanotime1": (sp) => {
252 | 						sp >>>= 0;
253 | 						setInt64(sp + 8, (timeOrigin + performance.now()) * 1000000);
254 | 					},
255 | 
256 | 					// func walltime() (sec int64, nsec int32)
257 | 					"runtime.walltime": (sp) => {
258 | 						sp >>>= 0;
259 | 						const msec = (new Date).getTime();
260 | 						setInt64(sp + 8, msec / 1000);
261 | 						this.mem.setInt32(sp + 16, (msec % 1000) * 1000000, true);
262 | 					},
263 | 
264 | 					// func scheduleTimeoutEvent(delay int64) int32
265 | 					"runtime.scheduleTimeoutEvent": (sp) => {
266 | 						sp >>>= 0;
267 | 						const id = this._nextCallbackTimeoutID;
268 | 						this._nextCallbackTimeoutID++;
269 | 						this._scheduledTimeouts.set(id, setTimeout(
270 | 							() => {
271 | 								this._resume();
272 | 								while (this._scheduledTimeouts.has(id)) {
273 | 									// for some reason Go failed to register the timeout event, log and try again
274 | 									// (temporary workaround for https://github.com/golang/go/issues/28975)
275 | 									console.warn("scheduleTimeoutEvent: missed timeout event");
276 | 									this._resume();
277 | 								}
278 | 							},
279 | 							getInt64(sp + 8),
280 | 						));
281 | 						this.mem.setInt32(sp + 16, id, true);
282 | 					},
283 | 
284 | 					// func clearTimeoutEvent(id int32)
285 | 					"runtime.clearTimeoutEvent": (sp) => {
286 | 						sp >>>= 0;
287 | 						const id = this.mem.getInt32(sp + 8, true);
288 | 						clearTimeout(this._scheduledTimeouts.get(id));
289 | 						this._scheduledTimeouts.delete(id);
290 | 					},
291 | 
292 | 					// func getRandomData(r []byte)
293 | 					"runtime.getRandomData": (sp) => {
294 | 						sp >>>= 0;
295 | 						crypto.getRandomValues(loadSlice(sp + 8));
296 | 					},
297 | 
298 | 					// func finalizeRef(v ref)
299 | 					"syscall/js.finalizeRef": (sp) => {
300 | 						sp >>>= 0;
301 | 						const id = this.mem.getUint32(sp + 8, true);
302 | 						this._goRefCounts[id]--;
303 | 						if (this._goRefCounts[id] === 0) {
304 | 							const v = this._values[id];
305 | 							this._values[id] = null;
306 | 							this._ids.delete(v);
307 | 							this._idPool.push(id);
308 | 						}
309 | 					},
310 | 
311 | 					// func stringVal(value string) ref
312 | 					"syscall/js.stringVal": (sp) => {
313 | 						sp >>>= 0;
314 | 						storeValue(sp + 24, loadString(sp + 8));
315 | 					},
316 | 
317 | 					// func valueGet(v ref, p string) ref
318 | 					"syscall/js.valueGet": (sp) => {
319 | 						sp >>>= 0;
320 | 						const result = Reflect.get(loadValue(sp + 8), loadString(sp + 16));
321 | 						sp = this._inst.exports.getsp() >>> 0; // see comment above
322 | 						storeValue(sp + 32, result);
323 | 					},
324 | 
325 | 					// func valueSet(v ref, p string, x ref)
326 | 					"syscall/js.valueSet": (sp) => {
327 | 						sp >>>= 0;
328 | 						Reflect.set(loadValue(sp + 8), loadString(sp + 16), loadValue(sp + 32));
329 | 					},
330 | 
331 | 					// func valueDelete(v ref, p string)
332 | 					"syscall/js.valueDelete": (sp) => {
333 | 						sp >>>= 0;
334 | 						Reflect.deleteProperty(loadValue(sp + 8), loadString(sp + 16));
335 | 					},
336 | 
337 | 					// func valueIndex(v ref, i int) ref
338 | 					"syscall/js.valueIndex": (sp) => {
339 | 						sp >>>= 0;
340 | 						storeValue(sp + 24, Reflect.get(loadValue(sp + 8), getInt64(sp + 16)));
341 | 					},
342 | 
343 | 					// valueSetIndex(v ref, i int, x ref)
344 | 					"syscall/js.valueSetIndex": (sp) => {
345 | 						sp >>>= 0;
346 | 						Reflect.set(loadValue(sp + 8), getInt64(sp + 16), loadValue(sp + 24));
347 | 					},
348 | 
349 | 					// func valueCall(v ref, m string, args []ref) (ref, bool)
350 | 					"syscall/js.valueCall": (sp) => {
351 | 						sp >>>= 0;
352 | 						try {
353 | 							const v = loadValue(sp + 8);
354 | 							const m = Reflect.get(v, loadString(sp + 16));
355 | 							const args = loadSliceOfValues(sp + 32);
356 | 							const result = Reflect.apply(m, v, args);
357 | 							sp = this._inst.exports.getsp() >>> 0; // see comment above
358 | 							storeValue(sp + 56, result);
359 | 							this.mem.setUint8(sp + 64, 1);
360 | 						} catch (err) {
361 | 							sp = this._inst.exports.getsp() >>> 0; // see comment above
362 | 							storeValue(sp + 56, err);
363 | 							this.mem.setUint8(sp + 64, 0);
364 | 						}
365 | 					},
366 | 
367 | 					// func valueInvoke(v ref, args []ref) (ref, bool)
368 | 					"syscall/js.valueInvoke": (sp) => {
369 | 						sp >>>= 0;
370 | 						try {
371 | 							const v = loadValue(sp + 8);
372 | 							const args = loadSliceOfValues(sp + 16);
373 | 							const result = Reflect.apply(v, undefined, args);
374 | 							sp = this._inst.exports.getsp() >>> 0; // see comment above
375 | 							storeValue(sp + 40, result);
376 | 							this.mem.setUint8(sp + 48, 1);
377 | 						} catch (err) {
378 | 							sp = this._inst.exports.getsp() >>> 0; // see comment above
379 | 							storeValue(sp + 40, err);
380 | 							this.mem.setUint8(sp + 48, 0);
381 | 						}
382 | 					},
383 | 
384 | 					// func valueNew(v ref, args []ref) (ref, bool)
385 | 					"syscall/js.valueNew": (sp) => {
386 | 						sp >>>= 0;
387 | 						try {
388 | 							const v = loadValue(sp + 8);
389 | 							const args = loadSliceOfValues(sp + 16);
390 | 							const result = Reflect.construct(v, args);
391 | 							sp = this._inst.exports.getsp() >>> 0; // see comment above
392 | 							storeValue(sp + 40, result);
393 | 							this.mem.setUint8(sp + 48, 1);
394 | 						} catch (err) {
395 | 							sp = this._inst.exports.getsp() >>> 0; // see comment above
396 | 							storeValue(sp + 40, err);
397 | 							this.mem.setUint8(sp + 48, 0);
398 | 						}
399 | 					},
400 | 
401 | 					// func valueLength(v ref) int
402 | 					"syscall/js.valueLength": (sp) => {
403 | 						sp >>>= 0;
404 | 						setInt64(sp + 16, parseInt(loadValue(sp + 8).length));
405 | 					},
406 | 
407 | 					// valuePrepareString(v ref) (ref, int)
408 | 					"syscall/js.valuePrepareString": (sp) => {
409 | 						sp >>>= 0;
410 | 						const str = encoder.encode(String(loadValue(sp + 8)));
411 | 						storeValue(sp + 16, str);
412 | 						setInt64(sp + 24, str.length);
413 | 					},
414 | 
415 | 					// valueLoadString(v ref, b []byte)
416 | 					"syscall/js.valueLoadString": (sp) => {
417 | 						sp >>>= 0;
418 | 						const str = loadValue(sp + 8);
419 | 						loadSlice(sp + 16).set(str);
420 | 					},
421 | 
422 | 					// func valueInstanceOf(v ref, t ref) bool
423 | 					"syscall/js.valueInstanceOf": (sp) => {
424 | 						sp >>>= 0;
425 | 						this.mem.setUint8(sp + 24, (loadValue(sp + 8) instanceof loadValue(sp + 16)) ? 1 : 0);
426 | 					},
427 | 
428 | 					// func copyBytesToGo(dst []byte, src ref) (int, bool)
429 | 					"syscall/js.copyBytesToGo": (sp) => {
430 | 						sp >>>= 0;
431 | 						const dst = loadSlice(sp + 8);
432 | 						const src = loadValue(sp + 32);
433 | 						if (!(src instanceof Uint8Array || src instanceof Uint8ClampedArray)) {
434 | 							this.mem.setUint8(sp + 48, 0);
435 | 							return;
436 | 						}
437 | 						const toCopy = src.subarray(0, dst.length);
438 | 						dst.set(toCopy);
439 | 						setInt64(sp + 40, toCopy.length);
440 | 						this.mem.setUint8(sp + 48, 1);
441 | 					},
442 | 
443 | 					// func copyBytesToJS(dst ref, src []byte) (int, bool)
444 | 					"syscall/js.copyBytesToJS": (sp) => {
445 | 						sp >>>= 0;
446 | 						const dst = loadValue(sp + 8);
447 | 						const src = loadSlice(sp + 16);
448 | 						if (!(dst instanceof Uint8Array || dst instanceof Uint8ClampedArray)) {
449 | 							this.mem.setUint8(sp + 48, 0);
450 | 							return;
451 | 						}
452 | 						const toCopy = src.subarray(0, dst.length);
453 | 						dst.set(toCopy);
454 | 						setInt64(sp + 40, toCopy.length);
455 | 						this.mem.setUint8(sp + 48, 1);
456 | 					},
457 | 
458 | 					"debug": (value) => {
459 | 						console.log(value);
460 | 					},
461 | 				}
462 | 			};
463 | 		}
464 | 
465 | 		async run(instance) {
466 | 			if (!(instance instanceof WebAssembly.Instance)) {
467 | 				throw new Error("Go.run: WebAssembly.Instance expected");
468 | 			}
469 | 			this._inst = instance;
470 | 			this.mem = new DataView(this._inst.exports.mem.buffer);
471 | 			this._values = [ // JS values that Go currently has references to, indexed by reference id
472 | 				NaN,
473 | 				0,
474 | 				null,
475 | 				true,
476 | 				false,
477 | 				globalThis,
478 | 				this,
479 | 			];
480 | 			this._goRefCounts = new Array(this._values.length).fill(Infinity); // number of references that Go has to a JS value, indexed by reference id
481 | 			this._ids = new Map([ // mapping from JS values to reference ids
482 | 				[0, 1],
483 | 				[null, 2],
484 | 				[true, 3],
485 | 				[false, 4],
486 | 				[globalThis, 5],
487 | 				[this, 6],
488 | 			]);
489 | 			this._idPool = [];   // unused ids that have been garbage collected
490 | 			this.exited = false; // whether the Go program has exited
491 | 
492 | 			// Pass command line arguments and environment variables to WebAssembly by writing them to the linear memory.
493 | 			let offset = 4096;
494 | 
495 | 			const strPtr = (str) => {
496 | 				const ptr = offset;
497 | 				const bytes = encoder.encode(str + "\0");
498 | 				new Uint8Array(this.mem.buffer, offset, bytes.length).set(bytes);
499 | 				offset += bytes.length;
500 | 				if (offset % 8 !== 0) {
501 | 					offset += 8 - (offset % 8);
502 | 				}
503 | 				return ptr;
504 | 			};
505 | 
506 | 			const argc = this.argv.length;
507 | 
508 | 			const argvPtrs = [];
509 | 			this.argv.forEach((arg) => {
510 | 				argvPtrs.push(strPtr(arg));
511 | 			});
512 | 			argvPtrs.push(0);
513 | 
514 | 			const keys = Object.keys(this.env).sort();
515 | 			keys.forEach((key) => {
516 | 				argvPtrs.push(strPtr(`${key}=${this.env[key]}`));
517 | 			});
518 | 			argvPtrs.push(0);
519 | 
520 | 			const argv = offset;
521 | 			argvPtrs.forEach((ptr) => {
522 | 				this.mem.setUint32(offset, ptr, true);
523 | 				this.mem.setUint32(offset + 4, 0, true);
524 | 				offset += 8;
525 | 			});
526 | 
527 | 			// The linker guarantees global data starts from at least wasmMinDataAddr.
528 | 			// Keep in sync with cmd/link/internal/ld/data.go:wasmMinDataAddr.
529 | 			const wasmMinDataAddr = 4096 + 8192;
530 | 			if (offset >= wasmMinDataAddr) {
531 | 				throw new Error("total length of command line and environment variables exceeds limit");
532 | 			}
533 | 
534 | 			this._inst.exports.run(argc, argv);
535 | 			if (this.exited) {
536 | 				this._resolveExitPromise();
537 | 			}
538 | 			await this._exitPromise;
539 | 		}
540 | 
541 | 		_resume() { // re-enters the Go program through its exported resume()
542 | 			if (this.exited) {
543 | 				throw new Error("Go program has already exited");
544 | 			}
545 | 			this._inst.exports.resume();
546 | 			if (this.exited) { // Go exited during resume(): settle the promise run() awaits
547 | 				this._resolveExitPromise();
548 | 			}
549 | 		}
550 | 
551 | 		_makeFuncWrapper(id) { // returns a function that records its call as the pending event and resumes Go
552 | 			const go = this;
553 | 			return function () {
554 | 				const event = { id: id, this: this, args: arguments }; // `this` is the wrapper's own receiver, not the Go instance
555 | 				go._pendingEvent = event;
556 | 				go._resume();
557 | 				return event.result; // NOTE(review): not assigned in this file; presumably set from Go during _resume() — confirm
558 | 			};
559 | 		}
560 | 	}
561 | })();
562 | 


--------------------------------------------------------------------------------
/internal/cmd/wasm/main.go:
--------------------------------------------------------------------------------
 1 | //go:build js && wasm
 2 | 
 3 | // Main binary for exposing the go-sentencepiece functionality in the browser
 4 | // via WASM. The required functionality is exposed via the syscall/js interface.
 5 | // This module should only be built in js && wasm mode.
 6 | package main
 7 | 
 8 | import (
 9 | 	_ "embed"
10 | 	"fmt"
11 | 	"log"
12 | 	"strings"
13 | 	"sync"
14 | 	"syscall/js"
15 | 
16 | 	"github.com/eliben/go-sentencepiece"
17 | )
18 | 
19 | //go:embed embed_data/tokenizer.model
20 | var modelFileData string         // contents of the model file, embedded at build time
21 | var spm *sentencepiece.Processor // tokenizer built from modelFileData in main
22 | 
23 | func main() {
24 | 	var once sync.Once
25 | 	once.Do(func() {
26 | 		var err error
27 | 		spm, err = sentencepiece.NewProcessor(strings.NewReader(modelFileData))
28 | 		if err != nil {
29 | 			log.Fatal(err)
30 | 		}
31 | 		fmt.Printf("processor loaded, vocab len=%v\n", spm.ModelInfo().VocabularySize)
32 | 	})
33 | 
34 | 	js.Global().Set("textToIDs", jsTextToIDs)
35 | 	js.Global().Set("textToPieces", jsTextToPieces)
36 | 
37 | 	// For the Go code to be usable from JS, the main function has to run forever.
38 | 	<-make(chan bool)
39 | }
40 | 
41 | var jsTextToIDs = js.FuncOf(func(this js.Value, args []js.Value) interface{} {
42 | 	if len(args) != 1 {
43 | 		return "expected 1 argument: text to tokenize"
44 | 	}
45 | 	txt := args[0].String()
46 | 	tokens := spm.Encode(txt)
47 | 
48 | 	jsTokens := js.Global().Get("Array").New()
49 | 	for _, t := range tokens {
50 | 		jsTokens.Call("push", js.ValueOf(t.ID))
51 | 	}
52 | 	return jsTokens
53 | })
54 | 
55 | var jsTextToPieces = js.FuncOf(func(this js.Value, args []js.Value) interface{} {
56 | 	if len(args) != 1 {
57 | 		return "expected 1 argument: text to tokenize"
58 | 	}
59 | 	txt := args[0].String()
60 | 	tokens := spm.Encode(txt)
61 | 
62 | 	jsTokens := js.Global().Get("Array").New()
63 | 	for _, t := range tokens {
64 | 		jsTokens.Call("push", js.ValueOf(t.Text))
65 | 	}
66 | 	return jsTokens
67 | })
68 | 


--------------------------------------------------------------------------------
/internal/model/gen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Regenerates the Go bindings for sentencepiece_model.proto (Go package "model").
 3 | set -o pipefail
 4 | set -eux
 5 | # Paths below are relative, so run this from the directory holding the .proto file.
 6 | protoc \
 7 |   --go_out=. \
 8 |   --go_opt="Msentencepiece_model.proto=;model" sentencepiece_model.proto
 9 | 
10 | goimports -w .  # fix up imports/formatting of the Go files here, in place
11 | 
12 | 


--------------------------------------------------------------------------------
/internal/model/sentencepiece_model.proto:
--------------------------------------------------------------------------------
  1 | // Copyright 2024 Google LLC
  2 | //
  3 | // Licensed under the Apache License, Version 2.0 (the "License");
  4 | // you may not use this file except in compliance with the License.
  5 | // You may obtain a copy of the License at
  6 | //
  7 | //      http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | // Unless required by applicable law or agreed to in writing, software
 10 | // distributed under the License is distributed on an "AS IS" BASIS,
 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | // See the License for the specific language governing permissions and
 13 | // limitations under the License.
 14 | 
 15 | syntax = "proto2";
 16 | 
 17 | // TODO(taku): Needs to use LITE RUNTIME in OSS release.
 18 | option optimize_for = LITE_RUNTIME;
 19 | 
 20 | package sentencepiece;
 21 | 
 22 | // TrainerSpec encodes various parameters for SentencePiece training.
 23 | // Next id: 55
 24 | message TrainerSpec {
 25 |   ///////////////////////////////////////////////////////////////////
 26 |   // General parameters
 27 |   //
 28 |   // Input corpus files.
 29 |   //  Trainer accepts the following two formats:
 30 |   //  A) Monolingual: plain text, one sentence per line.
 31 |   //  B) Bilingual:   TSV, source sentence <tab> target sentence
 32 |   //  When bilingual data is passed, shared vocabulary model is built.
 33 |   //  Note that the input file must be raw corpus, not a preprocessed corpus.
 34 |   //  Trainer only loads the first `input_sentence_size` sentences specified
 35 |   //  with this parameter.
 36 |   repeated string input = 1;
 37 | 
 38 |   // Input corpus format:
 39 |   // "text": one-sentence-per-line text format (default)
 40 |   // "tsv":  sentence <tab> freq
 41 |   optional string input_format = 7;
 42 | 
 43 |   // Output model file prefix.
 44 |   // <model_prefix>.model and <model_prefix>.vocab are generated.
 45 |   optional string model_prefix = 2;
 46 | 
 47 |   // Model type. UNIGRAM is the default.
 48 |   enum ModelType {
 49 |     UNIGRAM = 1;  // Unigram language model with dynamic algorithm
 50 |     BPE = 2;      // Byte Pair Encoding
 51 |     WORD = 3;     // Delimitered by whitespace.
 52 |     CHAR = 4;     // tokenizes into character sequence
 53 |   }
 54 |   optional ModelType model_type = 3 [default = UNIGRAM];
 55 | 
 56 |   // Vocabulary size. 8k is the default size.
 57 |   optional int32 vocab_size = 4 [default = 8000];
 58 | 
 59 |   // List of the languages this model can accept.
 60 |   // Since the model is language-agnostic, this field is used as a reference.
 61 |   repeated string accept_language = 5;
 62 | 
 63 |   // Size of self-test samples, which are encoded in the model file.
 64 |   optional int32 self_test_sample_size = 6 [default = 0];
 65 | 
 66 |   // Whether to use DP version of sentencepiece. Use it with TSV input format
 67 |   // (requires precomputed word tab counts to work).
 68 |   optional bool enable_differential_privacy = 50 [default = false];
 69 |   // Set these parameters if you need DP version of sentencepiece.
 70 |   // std of noise to add.
 71 |   optional float differential_privacy_noise_level = 51 [default = 0.0];
 72 |   // Clipping threshold to apply after adding noise. All the words with
 73 |   // frequency less than this value are dropped.
 74 |   optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
 75 | 
 76 |   ///////////////////////////////////////////////////////////////////
 77 |   // Training parameters.
 78 |   //
 79 |   // Uses characters which cover the corpus with the ratio of `character_coverage`.
 80 |   // This parameter determines the basic alphabet of the sentence pieces.
 81 |   // The remaining (1.0 - `character_coverage`) fraction of characters is treated as UNK.
 82 |   // See also required_chars field.
 83 |   optional float character_coverage = 10 [default = 0.9995];
 84 | 
 85 |   // Maximum size of sentences the trainer loads from `input` parameter.
 86 |   // Trainer simply loads the `input` files in sequence.
 87 |   // It is better to shuffle the input corpus randomly.
 88 |   optional uint64 input_sentence_size = 11 [default = 0];
 89 |   optional bool shuffle_input_sentence = 19 [default = true];
 90 | 
 91 |   // Maximum size of sentences to make seed sentence pieces.
 92 |   // Extended suffix array is constructed to extract frequent
 93 |   // sub-strings from the corpus. This uses 20N working space,
 94 |   // where N is the size of corpus.
 95 |   optional int32 mining_sentence_size = 12 [deprecated = true];
 96 | 
 97 |   // Maximum size of sentences to train sentence pieces.
 98 |   optional int32 training_sentence_size = 13 [deprecated = true];
 99 | 
100 |   // The size of seed sentencepieces.
101 |   // `seed_sentencepiece_size` must be larger than `vocab_size`.
102 |   optional int32 seed_sentencepiece_size = 14 [default = 1000000];
103 | 
104 |   // In every EM sub-iteration, keeps the top
105 |   // `shrinking_factor` * `current sentencepieces size` with respect to
106 |   // the loss of the sentence piece. This value should be smaller than 1.0.
107 |   optional float shrinking_factor = 15 [default = 0.75];
108 | 
109 |   // The maximum sentence length in bytes. Sentences longer than
110 |   // `max_sentence_length` are simply ignored.
111 |   // Longer input tends to bring the following risks:
112 |   //  * Overflow during EM training (unigram language model only)
113 |   //  * Performance drop because of O(n log n) cost in BPE.
114 |   optional int32 max_sentence_length = 18 [default = 4192];
115 | 
116 |   // Number of threads in the training.
117 |   optional int32 num_threads = 16 [default = 16];
118 | 
119 |   // Number of EM sub iterations.
120 |   optional int32 num_sub_iterations = 17 [default = 2];
121 | 
122 |   ///////////////////////////////////////////////////////////////////
123 |   // SentencePiece parameters which control the shapes of sentence piece.
124 |   //
125 |   // Maximum length of sentencepiece.
126 |   optional int32 max_sentencepiece_length = 20 [default = 16];
127 | 
128 |   // Uses Unicode script to split sentence pieces.
129 |   // When `split_by_unicode_script` is true, we do not allow sentence piece to
130 |   // include multiple Unicode scripts, e.g. "F1" is not a valid piece.
131 |   // Exception: CJ characters (Hiragana/Katakana/Han) are all handled
132 |   // as one script type, since Japanese word can consist of multiple scripts.
133 |   // This exception is always applied regardless of the accept-language
134 |   // parameter.
135 |   optional bool split_by_unicode_script = 21 [default = true];
136 | 
137 |   // When `split_by_number` is true, put a boundary between number and
138 |   // non-number transition. If we want to treat "F1" as one token, set this flag
139 |   // to be false.
140 |   optional bool split_by_number = 23 [default = true];
141 | 
142 |   // Use a white space to split sentence pieces.
143 |   // When `split_by_whitespace` is false, we may have the piece containing
144 |   // a white space in the middle. e.g., "in_the".
145 |   optional bool split_by_whitespace = 22 [default = true];
146 | 
147 |   // Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello =>
148 |   // hello_. When `treat_whitespace_as_suffix` is true,
149 |   // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
150 |   // of sentence.
151 |   optional bool treat_whitespace_as_suffix = 24 [default = false];
152 | 
153 |   // Allows pieces that only contain whitespaces instead of appearing only as
154 |   // prefix or suffix of other pieces.
155 |   optional bool allow_whitespace_only_pieces = 26 [default = false];
156 | 
157 |   // Split all digits (0-9) into separate pieces.
158 |   optional bool split_digits = 25 [default = false];
159 | 
160 |   // Defines the pre-tokenization delimiter.
161 |   // When specified, pieces crossing this delimiter are not included
162 |   // in the vocab. Then the delimiter string is virtually ignored
163 |   // during the training. This field allows constraints on the vocabulary
164 |   // selection. Note that this field is available on unigram mode.
165 |   optional string pretokenization_delimiter = 53 [ default = ""];
166 | 
167 |   ///////////////////////////////////////////////////////////////////
168 |   // Vocabulary management
169 |   //
170 |   // Defines control symbols used as an indicator to
171 |   // change the behavior of the decoder. <s> and </s> are pre-defined.
172 |   // We can use this field to encode various meta information,
173 |   // including language indicator in multilingual model.
174 |   // These symbols are not visible to users, but visible to
175 |   // the decoder. Note that when the input sentence contains control symbols,
176 |   // they are not treated as one token, but segmented into normal pieces.
177 |   // Control symbols must be inserted independently from the segmentation.
178 |   repeated string control_symbols = 30;
179 | 
180 |   // Defines user defined symbols.
181 |   // These symbols are added with extremely high score
182 |   // so they are always treated as one unique symbol in any context.
183 |   // Typical usage of user_defined_symbols is placeholder for named entities.
184 |   repeated string user_defined_symbols = 31;
185 | 
186 |   // Defines required characters. Each UTF8 character in this string is included
187 |   // in the character set regardless of character_coverage value. Unlike
188 |   // user_defined_symbols, these characters have scores based on the frequency
189 |   // on input sentences, and the model can form subwords using characters
190 |   // in this field.
191 |   optional string required_chars = 36;
192 | 
193 |   // Decomposes unknown pieces into UTF-8 bytes.
194 |   optional bool byte_fallback = 35 [default = false];
195 | 
196 |   // When creating the vocabulary file, defines whether or not to additionally
197 |   // output the score for each piece.
198 |   optional bool vocabulary_output_piece_score = 32 [default = true];
199 | 
200 |   // `vocab_size` is treated as hard limit. Crash if
201 |   // the model can not produce the vocab of size `vocab_size`,
202 |   // When `hard_vocab_limit` is false, vocab_size is treated
203 |   // as soft limit. Note that when model_type=char,
204 |   // always assumes hard_vocab_limit = false.
205 |   optional bool hard_vocab_limit = 33 [default = true];
206 | 
207 |   // use all symbols for vocab extraction. This flag is valid
208 |   // if model type is either CHAR or WORD
209 |   optional bool use_all_vocab = 34 [default = false];
210 | 
211 |   ///////////////////////////////////////////////////////////////////
212 |   // Reserved special meta tokens.
213 |   // * -1 is not used.
214 |   // * unk_id must not be -1.
215 |   // Ids must start with 0 and be contiguous.
216 |   optional int32 unk_id = 40 [default = 0];   // <unk>
217 |   optional int32 bos_id = 41 [default = 1];   // <s>
218 |   optional int32 eos_id = 42 [default = 2];   // </s>
219 |   optional int32 pad_id = 43 [default = -1];  // <pad> (padding)
220 |   optional string unk_piece = 45 [default = "<unk>"];
221 |   optional string bos_piece = 46 [default = "<s>"];
222 |   optional string eos_piece = 47 [default = "</s>"];
223 |   optional string pad_piece = 48 [default = "<pad>"];
224 | 
225 |   // Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
226 |   // since this character can be useful both for user and
227 |   // developer. We can easily figure out that <unk> is emitted.
228 |   optional string unk_surface = 44 [default = " \xE2\x81\x87 "];
229 | 
230 |   // Increase bit depth to allow unigram model training on large
231 |   // (>10M sentences) corpora. A Side-effect of enabling this flag
232 |   // is increased memory usage.
233 |   optional bool train_extremely_large_corpus = 49 [default = false];
234 | 
235 |  // Path to a seed sentencepieces file, with one tab-separated
236 |   // seed sentencepiece <tab> frequency per line.
237 |   optional string seed_sentencepieces_file = 54 [default = ""];
238 | 
239 |   // Customized extensions: the range of field numbers
240 |   // are open to third-party extensions.
241 |   extensions 200 to max;
242 | }
243 | 
244 | // NormalizerSpec encodes various parameters for string normalization.
245 | message NormalizerSpec {
246 |   // name of normalization rule.
247 |   optional string name = 1;
248 | 
249 |   // Pre-compiled normalization rule created by
250 |   // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
251 |   // Usually this field is set by Builder::GetNormalizerSpec() method.
252 |   optional bytes precompiled_charsmap = 2;
253 | 
254 |   // Adds dummy whitespace at the beginning of text in order to
255 |   // treat "world" in "world" and "hello world" in the same way.
256 |   optional bool add_dummy_prefix = 3 [default = true];
257 | 
258 |   // Removes leading, trailing, and duplicate internal whitespace.
259 |   optional bool remove_extra_whitespaces = 4 [default = true];
260 | 
261 |   // Replaces whitespace with meta symbol.
262 |   // This field must be true to train sentence piece model.
263 |   optional bool escape_whitespaces = 5 [default = true];
264 | 
265 |   // Custom normalization rule file in TSV format.
266 |   // https://github.com/google/sentencepiece/blob/master/doc/normalization.md
267 |   // This field is only used in SentencePieceTrainer::Train() method, which
268 |   // compiles the rule into the binary rule stored in `precompiled_charsmap`.
269 |   optional string normalization_rule_tsv = 6;
270 | 
271 |   // Customized extensions: the range of field numbers
272 |   // are open to third-party extensions.
273 |   extensions 200 to max;
274 | }
275 | 
276 | // Proto to store samples for self-testing.
277 | message SelfTestData {
278 |   message Sample {
279 |     optional string input = 1;     // sample input text.
280 |     optional string expected = 2;  // expected segmentation of `input`.
281 |   }
282 |   repeated Sample samples = 1;
283 | 
284 |   // Customized extensions: the range of field numbers
285 |   // are open to third-party extensions.
286 |   extensions 200 to max;
287 | }
288 | 
289 | // ModelProto stores model parameters.
290 | // SentencePieceProcessor is supposed to be self-contained.
291 | // All settings/parameters which may change the behavior must be encoded
292 | // in ModelProto.
293 | message ModelProto {
294 |   message SentencePiece {
295 |     enum Type {
296 |       NORMAL = 1;        // normal symbol
297 |       UNKNOWN = 2;       // unknown symbol. only <unk> for now.
298 |       CONTROL = 3;       // control symbols. </s>, <s>, <2ja> etc.
299 |       USER_DEFINED = 4;  // user defined symbols.
300 |                          // Typical usage of USER_DEFINED symbol
301 |                          // is placeholder.
302 |       BYTE = 6;          // byte symbols. Used when `byte_fallback` is true.
303 |       UNUSED = 5;        // this piece is not used.
304 |     }
305 |     optional string piece = 1;  // piece must not be empty.
306 |     optional float score = 2;   // score associated with this piece.
307 |     optional Type type = 3 [default = NORMAL];  // kind of piece; NORMAL when unset.
308 | 
309 |     // Customized extensions: the range of field numbers
310 |     // are open to third-party extensions.
311 |     extensions 200 to max;
312 |   }
313 | 
314 |   // Sentence pieces with scores.
315 |   repeated SentencePiece pieces = 1;
316 | 
317 |   // Spec used to generate this model file.
318 |   optional TrainerSpec trainer_spec = 2;
319 | 
320 |   // Spec for text normalization.
321 |   optional NormalizerSpec normalizer_spec = 3;
322 | 
323 |   // Stores sample input and its expected segmentation to verify the model.
324 |   optional SelfTestData self_test_data = 4;
325 | 
326 |   // Spec for text de-normalization.
327 |   optional NormalizerSpec denormalizer_spec = 5;
328 | 
329 |   // Customized extensions: the range of field numbers
330 |   // are open to third-party extensions.
331 |   extensions 200 to max;
332 | }
333 | 


--------------------------------------------------------------------------------
/internal/prefixmatcher/prefixmatcher.go:
--------------------------------------------------------------------------------
 1 | package prefixmatcher
 2 | 
 3 | import (
 4 | 	"unicode/utf8"
 5 | )
 6 | 
 7 | // PrefixMatcher helps find longest prefixes. See [FindPrefixLen].
 8 | type PrefixMatcher struct {
 9 | 	root *trieNode
10 | }
11 | 
12 | type trieNode struct {
13 | 	children map[rune]*trieNode
14 | 	final    bool
15 | }
16 | 
17 | // NewFromSet creates a new [PrefixMatcher] from a set of strings tha represent
18 | // the vocabulary.
19 | func NewFromSet(vocab map[string]bool) *PrefixMatcher {
20 | 	pm := &PrefixMatcher{root: newNode()}
21 | 	for word := range vocab {
22 | 		pm.add(word)
23 | 	}
24 | 	return pm
25 | }
26 | 
27 | // FindPrefixLen finds the longest prefix of text that matches a vocabulary
28 | // word, and returns it. If 0 is returned, no prefix was found.
29 | func (pm *PrefixMatcher) FindPrefixLen(text string) int {
30 | 	node := pm.root
31 | 	maxLen := 0
32 | 
33 | 	for i, r := range text {
34 | 		child := node.children[r]
35 | 		if child == nil {
36 | 			// r not found in this node, so we're done.
37 | 			return maxLen
38 | 		}
39 | 		if child.final {
40 | 			maxLen = i + utf8.RuneLen(r)
41 | 		}
42 | 		node = child
43 | 	}
44 | 
45 | 	return maxLen
46 | }
47 | 
48 | func (pm *PrefixMatcher) add(word string) {
49 | 	node := pm.root
50 | 
51 | 	for _, r := range word {
52 | 		child := node.children[r]
53 | 		if child == nil {
54 | 			child = newNode()
55 | 			node.children[r] = child
56 | 		}
57 | 		node = child
58 | 	}
59 | 
60 | 	node.final = true
61 | }
62 | 
63 | func newNode() *trieNode {
64 | 	return &trieNode{
65 | 		children: make(map[rune]*trieNode),
66 | 		final:    false,
67 | 	}
68 | }
69 | 


--------------------------------------------------------------------------------
/internal/prefixmatcher/prefixmatcher_test.go:
--------------------------------------------------------------------------------
  1 | package prefixmatcher
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"testing"
  6 | )
  7 | 
  8 | func dumpNode(n *trieNode, prefix string) string {
  9 | 	var s string
 10 | 	if n.final {
 11 | 		s = fmt.Sprintf("%sfinal\n", prefix)
 12 | 	}
 13 | 	for r, c := range n.children {
 14 | 		s += fmt.Sprintf("%s%q ->\n%s", prefix, r, dumpNode(c, prefix+"  "))
 15 | 	}
 16 | 	return s
 17 | }
 18 | 
 19 | func TestSmallVocab(t *testing.T) {
 20 | 	vocab := map[string]bool{
 21 | 		"ham":    true,
 22 | 		"yefet":  true,
 23 | 		"hamat":  true,
 24 | 		"hamela": true,
 25 | 		"世界":     true,
 26 | 
 27 | 		"▁▁":     true,
 28 | 		"▁▁▁":    true,
 29 | 		"▁▁▁▁":   true,
 30 | 		"▁▁▁▁▁":  true,
 31 | 		"▁▁▁▁▁▁": true,
 32 | 	}
 33 | 	pm := NewFromSet(vocab)
 34 | 
 35 | 	var tests = []struct {
 36 | 		text    string
 37 | 		wantLen int
 38 | 	}{
 39 | 		{"zyx", 0},
 40 | 		{"ham", 3},
 41 | 		{"hama", 3},
 42 | 		{"zham", 0},
 43 | 		{"hame", 3},
 44 | 		{"hamy", 3},
 45 | 		{"hamat", 5},
 46 | 		{"hamatar", 5},
 47 | 		{"hamela", 6},
 48 | 		{"hamelar", 6},
 49 | 		{"y", 0},
 50 | 		{"ye", 0},
 51 | 		{"yefet", 5},
 52 | 		{"yefeton", 5},
 53 | 		{"世界", 6},
 54 | 		{"世", 0},
 55 | 		{"世p", 0},
 56 | 		{"世界foo", 6},
 57 | 		{"▁", 0},
 58 | 		{"▁▁", 6},
 59 | 		{"▁▁▁", 9},
 60 | 		{"▁▁▁▁", 12},
 61 | 		{"▁▁▁▁▁", 15},
 62 | 		{"▁▁▁▁▁▁", 18},
 63 | 		{"▁▁▁▁▁▁▁", 18},
 64 | 		{"▁▁▁▁▁▁p", 18},
 65 | 	}
 66 | 
 67 | 	for _, tt := range tests {
 68 | 		t.Run(tt.text, func(t *testing.T) {
 69 | 			gotLen := pm.FindPrefixLen(tt.text)
 70 | 			if gotLen != tt.wantLen {
 71 | 				t.Errorf("got %v, want %v", gotLen, tt.wantLen)
 72 | 			}
 73 | 		})
 74 | 	}
 75 | }
 76 | 
 77 | func TestSingleAndDoubleLetter(t *testing.T) {
 78 | 	vocab := make(map[string]bool)
 79 | 
 80 | 	for r1 := 'a'; r1 <= 'z'; r1++ {
 81 | 		vocab[string(r1)] = true
 82 | 
 83 | 		for r2 := 'a'; r2 <= 'z'; r2++ {
 84 | 			vocab[string(r1)+string(r2)] = true
 85 | 		}
 86 | 	}
 87 | 
 88 | 	pm := NewFromSet(vocab)
 89 | 
 90 | 	assertLen := func(text string, wantLen int) {
 91 | 		t.Helper()
 92 | 		gotLen := pm.FindPrefixLen(text)
 93 | 		if gotLen != wantLen {
 94 | 			t.Errorf("got %v, want %v", gotLen, wantLen)
 95 | 		}
 96 | 	}
 97 | 
 98 | 	for r1 := 'a'; r1 <= 'z'; r1++ {
 99 | 		assertLen(string(r1), 1)
100 | 		for r2 := 'a'; r2 <= 'z'; r2++ {
101 | 			assertLen(string(r1)+string(r2), 2)
102 | 			for r3 := 'a'; r3 <= 'z'; r3++ {
103 | 				assertLen(string(r1)+string(r2)+string(r3), 2)
104 | 			}
105 | 		}
106 | 	}
107 | }
108 | 


--------------------------------------------------------------------------------
/internal/priorityqueue/priorityqueue.go:
--------------------------------------------------------------------------------
  1 | // Package priorityqueue provides a generic priority queue with Insert,
  2 | // PopMax, and RemoveFunc operations.
  3 | package priorityqueue
  4 | 
  5 | // PriorityQueue is a generic priority queue with a configurable comparison
  6 | // function.
  7 | type PriorityQueue[T any] struct {
  8 | 	cmp func(a, b T) int
  9 | 
 10 | 	// items holds the queue's items as a binary heap.
 11 | 	// items[0] is a dummy element that's not used. If the queue has N elements,
 12 | 	// they are stored at indices 1...N (N == len(items)-1)
 13 | 	// For an element at index i, its parent is at index i/2, and its children
 14 | 	// are at indices 2i and 2i+1. The root of the heap is at index 1.
 15 | 	items []T
 16 | }
 17 | 
 18 | // New creates a new PriorityQueue, configured with a function that
 19 | // compares the priorities of two items a and b; it should return a number > 0
 20 | // if the priority of a is higher, 0 if the priorities are equal, and a
 21 | // number < 0 otherwise.
 22 | // sizeHint sets the initial capacity of the queue; -1 means to use the default.
 23 | func New[T any](sizeHint int, cmp func(a, b T) int) *PriorityQueue[T] {
 24 | 	return &PriorityQueue[T]{cmp: cmp, items: make([]T, 1, max(1, sizeHint+1))}
 25 | }
 26 | 
 27 | // Len returns the length (number of items) of the priority queue.
 28 | func (pq *PriorityQueue[T]) Len() int {
 29 | 	return len(pq.items) - 1
 30 | }
 31 | 
 32 | // Insert inserts a new element into the priority queue.
 33 | func (pq *PriorityQueue[T]) Insert(elem T) {
 34 | 	pq.items = append(pq.items, elem)
 35 | 	pq.siftup(len(pq.items) - 1)
 36 | }
 37 | 
 38 | // PopMax returns the element with the maximal priority in the queue, and
 39 | // removes it from the queue. Warning: to maintain a clean API, PopMax panics
 40 | // if the queue is empty. Make sure to check Len() first.
 41 | func (pq *PriorityQueue[T]) PopMax() T {
 42 | 	if len(pq.items) < 2 {
 43 | 		panic("popping from empty priority queue")
 44 | 	}
 45 | 	maxItem := pq.items[1]
 46 | 	pq.items[1] = pq.items[len(pq.items)-1]
 47 | 	pq.items = pq.items[:len(pq.items)-1]
 48 | 	pq.siftdown(1)
 49 | 	return maxItem
 50 | }
 51 | 
 52 | // RemoveFunc removes all elements for which rm returns true.
 53 | func (pq *PriorityQueue[T]) RemoveFunc(rm func(T) bool) {
 54 | 	// This is effectively slices.DeleteFunc, but inlined because we start from index 1.
 55 | 	i := 1
 56 | 	for ; i < len(pq.items); i++ {
 57 | 		if rm(pq.items[i]) {
 58 | 			break
 59 | 		}
 60 | 	}
 61 | 	if i == len(pq.items) {
 62 | 		return // nothing to remove
 63 | 	}
 64 | 	for j := i + 1; j < len(pq.items); j++ {
 65 | 		if v := pq.items[j]; !rm(v) {
 66 | 			pq.items[i] = v
 67 | 			i++
 68 | 		}
 69 | 	}
 70 | 	// Clear the tail.
 71 | 	clear(pq.items[i:])
 72 | 	pq.items = pq.items[:i]
 73 | 	pq.rebuildHeap()
 74 | }
 75 | 
 76 | // rebuildHeap rebuilds the entire heap from scratch.
 77 | func (pq *PriorityQueue[T]) rebuildHeap() {
 78 | 	for i := len(pq.items) / 2; i >= 1; i-- {
 79 | 		pq.siftdown(i)
 80 | 	}
 81 | }
 82 | 
 83 | func (pq *PriorityQueue[T]) siftup(n int) {
 84 | 	i := n
 85 | 	for {
 86 | 		if i == 1 {
 87 | 			// Reached root, we're done.
 88 | 			return
 89 | 		}
 90 | 		// p is the index of i's parent
 91 | 		// if p parent has a higher priority than i, we're done.
 92 | 		p := i / 2
 93 | 		if pq.cmp(pq.items[p], pq.items[i]) >= 0 {
 94 | 			return
 95 | 		}
 96 | 		pq.items[i], pq.items[p] = pq.items[p], pq.items[i]
 97 | 		i = p
 98 | 	}
 99 | }
100 | 
101 | func (pq *PriorityQueue[T]) siftdown(i int) {
102 | 	for {
103 | 		c := 2 * i
104 | 		if c >= len(pq.items) {
105 | 			return
106 | 		}
107 | 		// c is not out of bounds, so it's the index of the left child of i
108 | 
109 | 		// Figure out the child index with the maximal priority
110 | 		maxChild := c
111 | 		if c+1 < len(pq.items) {
112 | 			// c+1 is not out of bounds, so it's the index of the right child of i
113 | 			if pq.cmp(pq.items[c+1], pq.items[c]) > 0 {
114 | 				maxChild = c + 1
115 | 			}
116 | 		}
117 | 		if pq.cmp(pq.items[i], pq.items[maxChild]) >= 0 {
118 | 			// i has higher priority than either child, so we're done.
119 | 			return
120 | 		}
121 | 
122 | 		pq.items[i], pq.items[maxChild] = pq.items[maxChild], pq.items[i]
123 | 		i = maxChild
124 | 	}
125 | }
126 | 


--------------------------------------------------------------------------------
/internal/priorityqueue/priorityqueue_test.go:
--------------------------------------------------------------------------------
  1 | package priorityqueue
  2 | 
  3 | import (
  4 | 	"math/rand"
  5 | 	"slices"
  6 | 	"testing"
  7 | )
  8 | 
  9 | func TestBasicQueueWithStrings(t *testing.T) {
 10 | 	stringLenCmp := func(a, b string) int {
 11 | 		return len(a) - len(b)
 12 | 	}
 13 | 
 14 | 	pq := New(-1, stringLenCmp)
 15 | 
 16 | 	assertPopAndSize := func(s string, n int) {
 17 | 		t.Helper()
 18 | 		got := pq.PopMax()
 19 | 		if got != s {
 20 | 			t.Errorf("got %v, want %v", got, s)
 21 | 		}
 22 | 		if n != pq.Len() {
 23 | 			t.Errorf("got len=%v, want %v", pq.Len(), n)
 24 | 		}
 25 | 	}
 26 | 
 27 | 	pq.Insert("one")
 28 | 	pq.Insert("four")
 29 | 	pq.Insert("sixteen")
 30 | 	pq.Insert("un")
 31 | 
 32 | 	// Pop all elements in max order
 33 | 	assertPopAndSize("sixteen", 3)
 34 | 	assertPopAndSize("four", 2)
 35 | 	assertPopAndSize("one", 1)
 36 | 	assertPopAndSize("un", 0)
 37 | 
 38 | 	// Insert+pop, insert+pop...
 39 | 	pq.Insert("xyz")
 40 | 	assertPopAndSize("xyz", 0)
 41 | 	pq.Insert("foobarbaz")
 42 | 	assertPopAndSize("foobarbaz", 0)
 43 | 	pq.Insert("1")
 44 | 	assertPopAndSize("1", 0)
 45 | 
 46 | 	// Inserts after popping some
 47 | 	pq.Insert("mercury")
 48 | 	pq.Insert("venus")
 49 | 	assertPopAndSize("mercury", 1)
 50 | 	pq.Insert("jupiter")
 51 | 	assertPopAndSize("jupiter", 1)
 52 | 	pq.Insert("moon")
 53 | 	assertPopAndSize("venus", 1)
 54 | 	assertPopAndSize("moon", 0)
 55 | 
 56 | 	// Insert two, pop 1, a few times
 57 | 	pq.Insert("mercury")
 58 | 	pq.Insert("venus")
 59 | 	assertPopAndSize("mercury", 1)
 60 | 	pq.Insert("mars")
 61 | 	pq.Insert("jupiter")
 62 | 	assertPopAndSize("jupiter", 2) // contains: venus, mars
 63 | 	pq.Insert("ganimede")
 64 | 	pq.Insert("europa")
 65 | 	assertPopAndSize("ganimede", 3) // contains: venus, mars, europa
 66 | 	pq.Insert("enceladus")
 67 | 	pq.Insert("io")
 68 | 	assertPopAndSize("enceladus", 4)
 69 | 	assertPopAndSize("europa", 3)
 70 | 	assertPopAndSize("venus", 2)
 71 | 	assertPopAndSize("mars", 1)
 72 | 	assertPopAndSize("io", 0)
 73 | 
 74 | 	// Insert these words in random orders; they should still all pop in the
 75 | 	// expected order by length.
 76 | 	words := []string{"z", "xy", "uvw", "post", "dworb"}
 77 | 	for i := 0; i < 100; i++ {
 78 | 		w := slices.Clone(words)
 79 | 		rand.Shuffle(len(w), func(i, j int) {
 80 | 			w[i], w[j] = w[j], w[i]
 81 | 		})
 82 | 
 83 | 		for _, word := range w {
 84 | 			pq.Insert(word)
 85 | 		}
 86 | 
 87 | 		assertPopAndSize("dworb", 4)
 88 | 		assertPopAndSize("post", 3)
 89 | 		assertPopAndSize("uvw", 2)
 90 | 		assertPopAndSize("xy", 1)
 91 | 		assertPopAndSize("z", 0)
 92 | 	}
 93 | }
 94 | 
 95 | func TestBasicQueueWithCustomType(t *testing.T) {
 96 | 	type Item struct {
 97 | 		Name string
 98 | 		Cost int
 99 | 	}
100 | 
101 | 	itemCostCmp := func(a, b Item) int {
102 | 		return a.Cost - b.Cost
103 | 	}
104 | 
105 | 	pq := New(-1, itemCostCmp)
106 | 
107 | 	assertPop := func(s string) {
108 | 		t.Helper()
109 | 		got := pq.PopMax()
110 | 		if got.Name != s {
111 | 			t.Errorf("got %v, want %v", got.Name, s)
112 | 		}
113 | 	}
114 | 
115 | 	// Push in decreasing cost order
116 | 	pq.Insert(Item{"joe", 20})
117 | 	pq.Insert(Item{"maxm", 3})
118 | 	pq.Insert(Item{"jabbar", 1})
119 | 	assertPop("joe")
120 | 	assertPop("maxm")
121 | 	assertPop("jabbar")
122 | 
123 | 	// Push in increasing cost order
124 | 	pq.Insert(Item{"x", 1})
125 | 	pq.Insert(Item{"y", 29})
126 | 	pq.Insert(Item{"z", 88})
127 | 	assertPop("z")
128 | 	assertPop("y")
129 | 	assertPop("x")
130 | }
131 | 


--------------------------------------------------------------------------------
/normalize.go:
--------------------------------------------------------------------------------
 1 | package sentencepiece
 2 | 
 3 | import "strings"
 4 | 
 5 | // normalize performs unicode normalization.
 6 | //
 7 | // SentencePiece has a feature to perform configurable unicode normalization on
 8 | // the input text and has some options for adding dummy whitespace prefixes or
 9 | // trimming whitespace. However, the model we're working with has a very simple
10 | // normalizer that does none of this. These options can be added in the future
11 | // if needed.
12 | func normalize(text string) string {
13 | 	return replaceSpacesBySeparator(text)
14 | }
15 | 
16 | const whitespaceSeparator = "▁"
17 | 
18 | // replaceSpacesBySeparator replaces spaces by the whitespace separator used by
19 | // the model.
20 | func replaceSpacesBySeparator(text string) string {
21 | 	return strings.ReplaceAll(text, " ", whitespaceSeparator)
22 | }
23 | 
24 | // replaceSeparatorsBySpace replaces the whitespace separator used by
25 | // the model back with spaces.
26 | func replaceSeparatorsBySpace(text string) string {
27 | 	return strings.ReplaceAll(text, whitespaceSeparator, " ")
28 | }
29 | 


--------------------------------------------------------------------------------
/processor.go:
--------------------------------------------------------------------------------
  1 | package sentencepiece
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io"
  6 | 	"os"
  7 | 	"strconv"
  8 | 	"strings"
  9 | 	"unicode/utf8"
 10 | 
 11 | 	"github.com/eliben/go-sentencepiece/internal/model"
 12 | 	"github.com/eliben/go-sentencepiece/internal/prefixmatcher"
 13 | 	"github.com/eliben/go-sentencepiece/internal/priorityqueue"
 14 | 	"google.golang.org/protobuf/proto"
 15 | )
 16 | 
 17 | const debugEncode = false
 18 | 
 19 | // Processor represents a SentencePiece processor (tokenizer).
 20 | // A Processor converts input text into a sequence of tokens LLMs use, and back.
 21 | // The mapping between token IDs and the text they represent is read from the
 22 | // model proto (provided to the constructor); it's the same between all calls
 23 | // to the Encode method.
 24 | //
 25 | // The term "processor" comes from the original C++ SentencePiece library and
 26 | // its Python bindings.
 27 | type Processor struct {
 28 | 	model *model.ModelProto
 29 | 
 30 | 	pieces   map[string]int
 31 | 	reserved map[string]int
 32 | 
 33 | 	// unknownID is the token identifier of the UNKNOWN piece
 34 | 	unknownID int
 35 | 
 36 | 	// userDefinedMatcher is a prefix matcher for symbols that are of
 37 | 	// "user-defined" type in the model proto.
 38 | 	userDefinedMatcher *prefixmatcher.PrefixMatcher
 39 | 
 40 | 	// byte2Token is a cache of byte values and the tokens they represent
 41 | 	byte2Token map[byte]Token
 42 | 
 43 | 	// idToByte maps IDs to byte values they represent
 44 | 	idToByte map[int]byte
 45 | 
 46 | 	// maxPieceLength is the maximum length of a piece in the model.
 47 | 	// This is used to preallocate a buffer for merging symbols.
 48 | 	maxPieceLength int
 49 | }
 50 | 
 51 | // NewProcessorFromPath creates a new Processor from a file path to the protobuf
 52 | // data.
 53 | func NewProcessorFromPath(protoFile string) (*Processor, error) {
 54 | 	f, err := os.Open(protoFile)
 55 | 	if err != nil {
 56 | 		return nil, fmt.Errorf("unable to read %q: %v", protoFile, err)
 57 | 	}
 58 | 	defer f.Close()
 59 | 	return NewProcessor(f)
 60 | }
 61 | 
 62 | // NewProcessor creates a new Processor from a reader with the protobuf data.
 63 | func NewProcessor(protoReader io.Reader) (*Processor, error) {
 64 | 	b, err := io.ReadAll(protoReader)
 65 | 	if err != nil {
 66 | 		return nil, fmt.Errorf("unable to read protobuf data: %v", err)
 67 | 	}
 68 | 
 69 | 	var mp model.ModelProto
 70 | 	err = proto.Unmarshal(b, &mp)
 71 | 	if err != nil {
 72 | 		return nil, fmt.Errorf("unable to unmarshal protobuf: %v", err)
 73 | 	}
 74 | 
 75 | 	tspec := mp.GetTrainerSpec()
 76 | 	if tspec.GetModelType() != model.TrainerSpec_BPE {
 77 | 		return nil, fmt.Errorf("model type %s not supported", tspec.GetModelType())
 78 | 	}
 79 | 
 80 | 	nspec := mp.GetNormalizerSpec()
 81 | 	if *nspec.AddDummyPrefix || *nspec.RemoveExtraWhitespaces {
 82 | 		return nil, fmt.Errorf("normalizer spec options not supported: %s", nspec)
 83 | 	}
 84 | 
 85 | 	userDefined := make(map[string]bool)
 86 | 	pieces := make(map[string]int)
 87 | 	reserved := make(map[string]int)
 88 | 	byte2Token := make(map[byte]Token)
 89 | 	idToByte := make(map[int]byte)
 90 | 	unkID := -1
 91 | 	maxPieceLength := 0
 92 | 
 93 | 	for i, piece := range mp.GetPieces() {
 94 | 		isNormalPiece := (piece.GetType() == model.ModelProto_SentencePiece_NORMAL ||
 95 | 			piece.GetType() == model.ModelProto_SentencePiece_USER_DEFINED ||
 96 | 			piece.GetType() == model.ModelProto_SentencePiece_UNUSED)
 97 | 
 98 | 		if isNormalPiece {
 99 | 			pieces[piece.GetPiece()] = i
100 | 			maxPieceLength = max(maxPieceLength, len(piece.GetPiece()))
101 | 		} else {
102 | 			reserved[piece.GetPiece()] = i
103 | 		}
104 | 
105 | 		if piece.GetType() == model.ModelProto_SentencePiece_USER_DEFINED {
106 | 			userDefined[piece.GetPiece()] = true
107 | 		} else if piece.GetType() == model.ModelProto_SentencePiece_UNKNOWN {
108 | 			if unkID > 0 {
109 | 				return nil, fmt.Errorf("unk redefined")
110 | 			}
111 | 			unkID = i
112 | 		} else if piece.GetType() == model.ModelProto_SentencePiece_BYTE {
113 | 			if !tspec.GetByteFallback() {
114 | 				return nil, fmt.Errorf("byte piece %q is found although `byte_fallback=false`", piece.GetPiece())
115 | 			}
116 | 			bv := convertHexValue(piece.GetPiece())
117 | 			if bv >= 0 && bv < 256 {
118 | 				byte2Token[byte(bv)] = Token{ID: i, Text: piece.GetPiece()}
119 | 				idToByte[i] = byte(bv)
120 | 			}
121 | 		}
122 | 	}
123 | 
124 | 	if unkID < 0 {
125 | 		return nil, fmt.Errorf("unk symbol is not defined")
126 | 	}
127 | 
128 | 	// In case byte_fallback is specified, make sure that all 256 possible byte
129 | 	// values were found.
130 | 	if tspec.GetByteFallback() {
131 | 		for i := 0; i < 256; i++ {
132 | 			if _, found := byte2Token[byte(i)]; !found {
133 | 				return nil, fmt.Errorf("byte value 0x%02X not found", i)
134 | 			}
135 | 		}
136 | 	}
137 | 
138 | 	return &Processor{
139 | 		model:              &mp,
140 | 		userDefinedMatcher: prefixmatcher.NewFromSet(userDefined),
141 | 		byte2Token:         byte2Token,
142 | 		idToByte:           idToByte,
143 | 		unknownID:          unkID,
144 | 		pieces:             pieces,
145 | 		reserved:           reserved,
146 | 		maxPieceLength:     maxPieceLength,
147 | 	}, nil
148 | }
149 | 
150 | // Encode tokenizes the input text and returns a list of Tokens.
151 | func (proc *Processor) Encode(text string) []Token {
152 | 	text = normalize(text)
153 | 
154 | 	// We begin by having each symbol a single Unicode character (or a
155 | 	// user-defined string), and will iteratively merge them into larger and
156 | 	// larger symbols until we have the final list of tokens.
157 | 	// Since this list of symbols changes a lot, we represent it as a
158 | 	// doubly-linked list in the symList slice. Each element in this slice has
159 | 	// prev/next links to the next "live" symbol in the list; noMerge means this
160 | 	// is a user-defined symbol we're not allowed to merge with neighbors.
161 | 	// After the algorithm is finished, many elements in symList will be "dead"
162 | 	// (unreachable by next/prev links from the first element).
163 | 	// This representation is inspired by the implementation of bpe::Model
164 | 	// in the SentencePiece C++ library.
165 | 
166 | 	type symListElem struct {
167 | 		prev, next int
168 | 		noMerge    bool
169 | 		symbol     string
170 | 	}
171 | 	symList := make([]symListElem, 0, len(text))
172 | 
173 | 	for {
174 | 		// Match the next symbol in text
175 | 		slen, found := proc.symbolMatch(text)
176 | 
177 | 		// Append a list element for this symbol; note that this element will be
178 | 		// at index len(symList), so prev/next are set up accordingly.
179 | 		sym := symListElem{
180 | 			noMerge: found,
181 | 			symbol:  text[:slen],
182 | 			prev:    len(symList) - 1,
183 | 			next:    len(symList) + 1,
184 | 		}
185 | 		symList = append(symList, sym)
186 | 
187 | 		// Advance the text slice to the next symbol; if no more text, we're done.
188 | 		text = text[slen:]
189 | 		if len(text) == 0 {
190 | 			break
191 | 		}
192 | 	}
193 | 
194 | 	if len(symList) == 0 {
195 | 		return nil
196 | 	}
197 | 	symList[len(symList)-1].next = -1
198 | 	nTokens := len(symList)
199 | 
200 | 	debugShowSymList := func(prefix string) {
201 | 		if debugEncode {
202 | 			fmt.Println(prefix)
203 | 			for i, elem := range symList {
204 | 				fmt.Printf("[%3d]: [prev: %3v, next: %3d, noMerge: %v] %q\n", i, elem.prev, elem.next, elem.noMerge, elem.symbol)
205 | 			}
206 | 		}
207 | 	}
208 | 	debugShowSymList("initial")
209 | 
210 | 	// To avoid repeating work, we manage a priority queue of "merge candidates".
211 | 	// Each candidate has pointers to the symList list for the left and right
212 | 	// symbol in the pair, as well as the combined symbol's score.
213 | 	// The priority of merging is determined by this score, with position as
214 | 	// the tie-breaker (earlier pairs are preferred).
215 | 	type mergeCandidate struct {
216 | 		left, right int
217 | 		length      int
218 | 		score       float32
219 | 	}
220 | 
221 | 	mergeQueue := priorityqueue.New(len(symList), func(a, b mergeCandidate) int {
222 | 		if a.score > b.score || (a.score == b.score && a.left < b.left) {
223 | 			return 1
224 | 		}
225 | 		return -1
226 | 	})
227 | 
228 | 	// findMerged looks for x+y in the vocabulary, and returns the
229 | 	// merged piece, its ID and true if found. buf is a reusable buffer used to
230 | 	// merge two strings together without allocations.
231 | 	buf := make([]byte, proc.maxPieceLength)
232 | 	findMerged := func(x, y symListElem) (string, int, bool) {
233 | 		buf = buf[:len(x.symbol)+len(y.symbol)]
234 | 		copy(buf, x.symbol)
235 | 		copy(buf[len(x.symbol):], y.symbol)
236 | 		if id, found := proc.pieces[string(buf)]; found {
237 | 			return proc.model.GetPieces()[id].GetPiece(), id, true
238 | 		}
239 | 		return "", 0, false
240 | 	}
241 | 
242 | 	// suggestNewMergePair is called to potentially add a new mergeCandidate to
243 | 	// mergeQueue. The candidate is added if it's valid, both its parts are
244 | 	// allowed to merge, and it appears in the vocabulary.
245 | 	suggestNewMergePair := func(left, right int) {
246 | 		if left == -1 || right == -1 || symList[left].noMerge || symList[right].noMerge {
247 | 			return
248 | 		}
249 | 
250 | 		if mergedSymbol, id, ok := findMerged(symList[left], symList[right]); ok {
251 | 			mergeQueue.Insert(mergeCandidate{
252 | 				left:   left,
253 | 				right:  right,
254 | 				length: len(mergedSymbol),
255 | 				score:  proc.model.GetPieces()[id].GetScore(),
256 | 			})
257 | 		}
258 | 	}
259 | 
260 | 	// Seed the merge queue with all pairs of symbols from symList
261 | 	for i := 1; i < len(symList); i++ {
262 | 		suggestNewMergePair(i-1, i)
263 | 	}
264 | 
265 | 	// candidateIsDead indicates that a candidate is out of date: one of its
266 | 	// parts was already merged with another symbol, so we don't want to consider
267 | 	// it any more.
268 | 	candidateIsDead := func(candidate mergeCandidate) bool {
269 | 		leftSymbol := symList[candidate.left].symbol
270 | 		rightSymbol := symList[candidate.right].symbol
271 | 		return leftSymbol == "" || rightSymbol == "" || len(leftSymbol)+len(rightSymbol) != candidate.length
272 | 	}
273 | 
274 | 	// Main loop
275 | 	mergeQueueDead := 0
276 | 	for mergeQueue.Len() > 0 {
277 | 		candidate := mergeQueue.PopMax()
278 | 		leftSymbol := symList[candidate.left]
279 | 		rightSymbol := symList[candidate.right]
280 | 
281 | 		if candidateIsDead(candidate) {
282 | 			mergeQueueDead--
283 | 			continue
284 | 		}
285 | 
286 | 		// If there are lots more dead merge candidates than live ones, remove the
287 | 		// dead. This is a relatively expensive operation but it's performed rarely,
288 | 		// and it makes the priority queue smaller - making all subsequent
289 | 		// operations faster.
290 | 		// The factor of 3 was determined empirically.
291 | 		if mergeQueueDead*3 > mergeQueue.Len() {
292 | 			mergeQueue.RemoveFunc(candidateIsDead)
293 | 			mergeQueueDead = 0
294 | 		}
295 | 
296 | 		// Do the merge:
297 | 		// 1. Merge the concatenation of leftSymbol and rightSymbol into leftSymbol
298 | 		mergedSymbol, _, ok := findMerged(leftSymbol, rightSymbol)
299 | 		if !ok {
300 | 			panic("failed to merge symbols")
301 | 		}
302 | 		symList[candidate.left].symbol = mergedSymbol
303 | 		nTokens--
304 | 
305 | 		// 2. Update prev/next pointers
306 | 		symList[candidate.left].next = rightSymbol.next
307 | 		if rightSymbol.next >= 0 {
308 | 			symList[rightSymbol.next].prev = candidate.left
309 | 		}
310 | 
311 | 		// 3. Mark the right element in the pair as outdated (it's been merged
312 | 		//    into the left one).
313 | 		symList[candidate.right].symbol = ""
314 | 		mergeQueueDead++
315 | 
316 | 		// 4. Add merge suggestions for the newly merged symbol with its neighbors
317 | 		suggestNewMergePair(leftSymbol.prev, candidate.left)
318 | 		suggestNewMergePair(candidate.left, rightSymbol.next)
319 | 	}
320 | 
321 | 	// Collect the final list of tokens from the remaining elements of symList.
322 | 	tokens := make([]Token, 0, nTokens)
323 | 	for i := 0; i >= 0; i = symList[i].next {
324 | 		symbol := symList[i].symbol
325 | 		id := proc.symbolToID(symbol)
326 | 
327 | 		if id == proc.unknownID && proc.model.GetTrainerSpec().GetByteFallback() {
328 | 			// Decompose this symbol into bytes, and report each byte as a separate
329 | 			// token.
330 | 			for i := 0; i < len(symbol); i++ {
331 | 				tokens = append(tokens, proc.byte2Token[symbol[i]])
332 | 			}
333 | 		} else {
334 | 			tokens = append(tokens, Token{ID: id, Text: symbol})
335 | 		}
336 | 	}
337 | 
338 | 	return tokens
339 | }
340 | 
341 | // symbolMatch finds the length of the first symbol in text. A symbol is either
342 | // a user-defined symbol from the proto or a single rune. The second return
343 | // value is true iff a user-defined symbol was matched.
344 | func (proc *Processor) symbolMatch(text string) (int, bool) {
345 | 	prefixLen := proc.userDefinedMatcher.FindPrefixLen(text)
346 | 	if prefixLen > 0 {
347 | 		return prefixLen, true
348 | 	}
349 | 	// Not found a user-defined prefix; get the length of next rune.
350 | 	_, rlen := utf8.DecodeRuneInString(text)
351 | 	return rlen, false
352 | }
353 | 
// Textual forms of the special symbols. ModelInfo looks up the BOS, EOS and
// PAD symbols in the vocabulary to report their IDs.
const (
	symbolBOS = "<bos>" // beginning of sentence
	symbolEOS = "<eos>" // end of sentence
	symbolUNK = "<unk>" // unknown symbol
	symbolPAD = "<pad>" // padding
)
360 | 
361 | // symbolToID finds the right ID for the given textual symbol, or returns
362 | // proc.unknownID if the symbol is unknown.
363 | func (proc *Processor) symbolToID(symbol string) int {
364 | 	if id, found := proc.reserved[symbol]; found {
365 | 		return id
366 | 	}
367 | 	if id, found := proc.pieces[symbol]; found {
368 | 		return id
369 | 	}
370 | 	return proc.unknownID
371 | }
372 | 
// convertHexValue converts strings of the form "<0xXY>" to the (unsigned)
// integer value of the hexadecimal number XY. -1 is returned for bad input.
//
// The input must be framed exactly by "<0x" and ">", and what lies between
// must consist solely of hexadecimal digits encoding a single byte (0-255).
// Anything else - a missing delimiter, a sign, no digits at all, or a value
// that does not fit in a byte - is reported as bad input.
func convertHexValue(bv string) int {
	const prefix, suffix = "<0x", ">"

	if len(bv) < len(prefix)+len(suffix) ||
		!strings.HasPrefix(bv, prefix) ||
		!strings.HasSuffix(bv, suffix) {
		return -1
	}
	digits := bv[len(prefix) : len(bv)-len(suffix)]

	// ParseUint (unlike ParseInt) rejects a leading '+' or '-', and the 8-bit
	// size rejects values that cannot be a byte.
	n, err := strconv.ParseUint(digits, 16, 8)
	if err != nil {
		return -1
	}
	return int(n)
}
384 | 
385 | // Decode translates a list of IDs produced by [Encode] back into the string
386 | // it represents.
387 | func (proc *Processor) Decode(ids []int) string {
388 | 	var sb strings.Builder
389 | 
390 | 	for i := 0; i < len(ids); {
391 | 		// Find a run of IDs that represent single bytes starting at i.
392 | 		nextNonByte := i
393 | 		for nextNonByte < len(ids) && proc.isByteID(ids[nextNonByte]) {
394 | 			nextNonByte++
395 | 		}
396 | 		numBytes := nextNonByte - i
397 | 
398 | 		// Handle a run of numBytes IDs, by decoding them into utf8 runes.
399 | 		if numBytes > 0 {
400 | 			buf := make([]byte, 0, numBytes)
401 | 			for bi := i; bi < nextNonByte; bi++ {
402 | 				buf = append(buf, proc.idToByte[ids[bi]])
403 | 			}
404 | 
405 | 			for len(buf) > 0 {
406 | 				// DecodeRune returns utf8.RuneError ('\uFFFD') for bad UTF8 encodings,
407 | 				// and this is exactly what SentencePiece is supposed to emit for them.
408 | 				// So we don't do any special handling for UTF8 decode errors here.
409 | 				r, size := utf8.DecodeRune(buf)
410 | 				sb.WriteRune(r)
411 | 				buf = buf[size:]
412 | 			}
413 | 		}
414 | 
415 | 		if nextNonByte >= len(ids) {
416 | 			break
417 | 		}
418 | 		// Here nextNonByte is the index of an ID that's not a single byte.
419 | 		id := ids[nextNonByte]
420 | 		if proc.isControlID(id) {
421 | 			// Don't emit anything for control IDs
422 | 		} else if id == proc.unknownID {
423 | 			// Special "unk_surface" string for unknown IDs
424 | 			sb.WriteString(proc.model.GetTrainerSpec().GetUnkSurface())
425 | 		} else {
426 | 			piece := proc.model.GetPieces()[id].GetPiece()
427 | 			sb.WriteString(replaceSeparatorsBySpace(piece))
428 | 		}
429 | 		i = nextNonByte + 1
430 | 	}
431 | 
432 | 	return sb.String()
433 | }
434 | 
435 | // DecodeTokens is a convenience wrapper around [Decode], accepting a list of
436 | // tokens as returned by [Encode]. It only uses the ID fields of tokens to
437 | // decode the text.
438 | func (proc *Processor) DecodeTokens(tokens []Token) string {
439 | 	ids := make([]int, len(tokens))
440 | 	for i, t := range tokens {
441 | 		ids[i] = t.ID
442 | 	}
443 | 	return proc.Decode(ids)
444 | }
445 | 
446 | func (proc *Processor) isByteID(id int) bool {
447 | 	return proc.model.GetPieces()[id].GetType() == model.ModelProto_SentencePiece_BYTE
448 | }
449 | 
450 | func (proc *Processor) isControlID(id int) bool {
451 | 	return proc.model.GetPieces()[id].GetType() == model.ModelProto_SentencePiece_CONTROL
452 | }
453 | 
// ModelInfo stores information about the model proto loaded by the processor.
//
// As populated by [Processor.ModelInfo], the BeginningOfSentenceID,
// EndOfSentenceID and PadID fields are -1 when the corresponding symbol does
// not resolve to a control piece in the model.
type ModelInfo struct {
	// VocabularySize is the number of pieces in the model.
	VocabularySize int
	// BeginningOfSentenceID is the ID of the "<bos>" control symbol.
	BeginningOfSentenceID int
	// EndOfSentenceID is the ID of the "<eos>" control symbol.
	EndOfSentenceID int
	// UnknownID is the ID the processor uses for unknown symbols.
	UnknownID int
	// PadID is the ID of the "<pad>" control symbol.
	PadID int
}
462 | 
463 | // ModelInfo returns information about the loaded proto model file.
464 | func (proc *Processor) ModelInfo() *ModelInfo {
465 | 	getControlID := func(symbol string) int {
466 | 		if id := proc.symbolToID(symbol); proc.isControlID(id) {
467 | 			return id
468 | 		}
469 | 		return -1
470 | 	}
471 | 
472 | 	return &ModelInfo{
473 | 		VocabularySize:        len(proc.model.GetPieces()),
474 | 		BeginningOfSentenceID: getControlID(symbolBOS),
475 | 		EndOfSentenceID:       getControlID(symbolEOS),
476 | 		PadID:                 getControlID(symbolPAD),
477 | 		UnknownID:             proc.unknownID,
478 | 	}
479 | }
480 | 


--------------------------------------------------------------------------------
/processor_test.go:
--------------------------------------------------------------------------------
  1 | package sentencepiece
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"os"
  6 | 	"slices"
  7 | 	"testing"
  8 | )
  9 | 
 10 | func createProcessor(t testing.TB) *Processor {
 11 | 	t.Helper()
 12 | 	protoFile := os.Getenv("MODELPATH")
 13 | 	if protoFile == "" {
 14 | 		t.Fatal("Need MODELPATH env var to run tests")
 15 | 	}
 16 | 
 17 | 	proc, err := NewProcessorFromPath(protoFile)
 18 | 	if err != nil {
 19 | 		t.Error(err)
 20 | 	}
 21 | 	return proc
 22 | }
 23 | 
 24 | func TestEncodeIDs(t *testing.T) {
 25 | 	proc := createProcessor(t)
 26 | 
 27 | 	var tests = []struct {
 28 | 		text    string
 29 | 		wantIDs []int
 30 | 	}{
 31 | 		{"hello world", []int{17534, 2134}},
 32 | 		{"12345", []int{235274, 235284, 235304, 235310, 235308}},
 33 | 		{"  ", []int{139}},
 34 | 		{"   ", []int{140}},
 35 | 		{"        ", []int{145}},
 36 | 		{"ҔӌԐڎ", []int{427, 365, 428, 357, 429, 361, 435, 359}},
 37 | 		{" <mask>  <pad>", []int{235248, 4, 139, 235322, 8939, 235313}},
 38 | 		{"<table><th></th></table>", []int{169, 175, 183, 177}},
 39 | 		{"one line\nand another line", []int{785, 2017, 108, 639, 2550, 2017}},
 40 | 		{"Language: English\r\n\r\nCredits: Produced by David Widger\r\n", []int{14357, 235292, 4645, 235316, 108, 235316, 108, 34711, 235292, 99662, 731, 6046, 37303, 1197, 235316, 108}},
 41 | 		{"Bienvenido a este proyecto", []int{176831, 476, 4004, 25431}},
 42 | 		{"अस्मिन् परियोजनायां स्वागतम्", []int{236088, 22740, 212361, 18029, 14480, 19900, 146166, 6751, 235563, 56545, 44071, 235550, 26989}},
 43 | 		{"if allow == true { return x;} else {return x+y;}", []int{648, 2765, 1159, 1382, 612, 2203, 1141, 22505, 1354, 612, 773, 1141, 235340, 235267, 22505}},
 44 | 	}
 45 | 
 46 | 	for _, tt := range tests {
 47 | 		t.Run(tt.text, func(t *testing.T) {
 48 | 			got := proc.Encode(tt.text)
 49 | 
 50 | 			var gotIDs []int
 51 | 			for _, t := range got {
 52 | 				gotIDs = append(gotIDs, t.ID)
 53 | 			}
 54 | 
 55 | 			if !slices.Equal(gotIDs, tt.wantIDs) {
 56 | 				t.Errorf("got  %v\nwant: %v\n", gotIDs, tt.wantIDs)
 57 | 			}
 58 | 		})
 59 | 	}
 60 | }
 61 | 
 62 | func TestProcessorWithText(t *testing.T) {
 63 | 	proc := createProcessor(t)
 64 | 
 65 | 	var tests = []struct {
 66 | 		text       string
 67 | 		wantTokens []Token
 68 | 	}{
 69 | 		{"hi <td> bye",
 70 | 			[]Token{
 71 | 				{544, "hi"},
 72 | 				{235248, "▁"},
 73 | 				{176, "<td>"},
 74 | 				{44788, "▁bye"},
 75 | 			}},
 76 | 		{"hiƻ <td>🤨there ⇲bob, สวัสดี",
 77 | 			[]Token{
 78 | 				{544, "hi"},
 79 | 				{415, "<0xC6>"},
 80 | 				{404, "<0xBB>"},
 81 | 				{235248, "▁"},
 82 | 				{176, "<td>"},
 83 | 				{241847, "🤨"},
 84 | 				{11048, "there"},
 85 | 				{235248, "▁"},
 86 | 				{248372, "⇲"},
 87 | 				{26242, "bob"},
 88 | 				{235269, ","},
 89 | 				{12515, "▁ส"},
 90 | 				{151622, "วัส"},
 91 | 				{28890, "ดี"},
 92 | 			}},
 93 | 	}
 94 | 
 95 | 	for _, tt := range tests {
 96 | 		t.Run(tt.text, func(t *testing.T) {
 97 | 			got := proc.Encode(tt.text)
 98 | 			if !slices.Equal(got, tt.wantTokens) {
 99 | 				t.Errorf("got  %v\nwant: %v\n", got, tt.wantTokens)
100 | 			}
101 | 		})
102 | 	}
103 | }
104 | 
105 | func TestSymbolMatch(t *testing.T) {
106 | 	proc := createProcessor(t)
107 | 
108 | 	var tests = []struct {
109 | 		text      string
110 | 		wantLen   int
111 | 		wantFound bool
112 | 	}{
113 | 		{"<td>", 4, true},
114 | 		{"<s>", 3, true},
115 | 		{"</s>", 4, true},
116 | 		{"<start_of_turn>", 15, true},
117 | 		{"<start_of_turn!", 1, false},
118 | 		{"▁▁", 6, true},
119 | 		{"▁▁▁▁▁▁", 18, true},
120 | 		{"bob", 1, false},
121 | 		{"🤨", 4, false},
122 | 		{"สวัสดี", 3, false},
123 | 	}
124 | 
125 | 	for _, tt := range tests {
126 | 		t.Run(tt.text, func(t *testing.T) {
127 | 			gotLen, gotFound := proc.symbolMatch(tt.text)
128 | 			if gotLen != tt.wantLen || gotFound != tt.wantFound {
129 | 				t.Errorf("got (%v, %v), want (%v, %v)", gotLen, gotFound, tt.wantLen, tt.wantFound)
130 | 			}
131 | 		})
132 | 	}
133 | }
134 | 
135 | func TestConvertHexValue(t *testing.T) {
136 | 	var tests = []struct {
137 | 		in    string
138 | 		wantN int
139 | 	}{
140 | 		{"<0x40>", 64},
141 | 		{"<0x00>", 0},
142 | 		{"<0x1a>", 26},
143 | 		{"<0xF3>", 243},
144 | 
145 | 		{"0x12>", -1},
146 | 		{"<x12>", -1},
147 | 		{"<012>", -1},
148 | 		{"<0xTA>", -1},
149 | 	}
150 | 
151 | 	for _, tt := range tests {
152 | 		t.Run(tt.in, func(t *testing.T) {
153 | 			gotN := convertHexValue(tt.in)
154 | 			if gotN != tt.wantN {
155 | 				t.Errorf("got %v, want %v", gotN, tt.wantN)
156 | 			}
157 | 		})
158 | 	}
159 | }
160 | 
161 | func TestDecoder(t *testing.T) {
162 | 	proc := createProcessor(t)
163 | 
164 | 	var tests = []struct {
165 | 		IDs      []int
166 | 		wantText string
167 | 	}{
168 | 		{[]int{17534, 2134}, "hello world"},
169 | 		{[]int{427, 365, 428, 357, 29422, 1653, 427, 365, 428, 357}, "Ҕӌnever againҔӌ"},
170 | 		{[]int{785, 2017, 108, 639, 2550, 2017}, "one line\nand another line"},
171 | 		{[]int{1001, 1002, 1003, 1004}, "buark}) res"},
172 | 		{[]int{111001, 111002, 111003, 111004}, " Wichita EducaçãoVocabulary天堂"},
173 | 		{[]int{139}, "  "},
174 | 		{[]int{140}, "   "},
175 | 		{[]int{145}, "        "},
176 | 		{[]int{441, 401, 387}, "ส"},
177 | 		{[]int{411, 380}, "£"},
178 | 
179 | 		// control IDs (0, 1, 2)
180 | 		{[]int{2, 411, 380}, "£"},
181 | 		{[]int{1, 2, 411, 380}, "£"},
182 | 		{[]int{2, 411, 380, 0, 1, 2, 0}, "£"},
183 | 
184 | 		// unknown (id=3)
185 | 		{[]int{3, 411, 380}, " ⁇ £"},
186 | 		{[]int{3, 3, 1000, 3}, " ⁇  ⁇ ew ⁇ "},
187 | 
188 | 		// invalid bytes for UTF-8, produce "invalid unicode" runes
189 | 		{[]int{349, 349, 349}, "���"},
190 | 		{[]int{800, 348, 500, 348}, "sed�it�"},
191 | 	}
192 | 
193 | 	for _, tt := range tests {
194 | 		t.Run(fmt.Sprintf("%v", tt.IDs), func(t *testing.T) {
195 | 			got := proc.Decode(tt.IDs)
196 | 			if got != tt.wantText {
197 | 				t.Errorf("got %q\nwant %q\n", got, tt.wantText)
198 | 			}
199 | 		})
200 | 	}
201 | }
202 | 
203 | func TestDecodeTokens(t *testing.T) {
204 | 	proc := createProcessor(t)
205 | 	wantText := "hello   world"
206 | 	tokens := []Token{
207 | 		Token{17534, "xxx"},
208 | 		Token{139, "xxx"},
209 | 		Token{2134, "xxx"}}
210 | 
211 | 	text := proc.DecodeTokens(tokens)
212 | 	if text != wantText {
213 | 		t.Errorf("got %q, want %q", text, wantText)
214 | 	}
215 | }
216 | 
217 | func TestInfo(t *testing.T) {
218 | 	proc := createProcessor(t)
219 | 	info := proc.ModelInfo()
220 | 
221 | 	// Assumes we use the known model file
222 | 	wantVocabSize := 256000
223 | 	wantBOS := 2
224 | 	wantEOS := 1
225 | 	wantPAD := 0
226 | 	wantUNK := 3
227 | 
228 | 	if info.VocabularySize != wantVocabSize {
229 | 		t.Errorf("got %v, want %v", info.VocabularySize, wantVocabSize)
230 | 	}
231 | 	if info.BeginningOfSentenceID != wantBOS {
232 | 		t.Errorf("got %v, want %v", info.BeginningOfSentenceID, wantBOS)
233 | 	}
234 | 	if info.EndOfSentenceID != wantEOS {
235 | 		t.Errorf("got %v, want %v", info.EndOfSentenceID, wantEOS)
236 | 	}
237 | 	if info.PadID != wantPAD {
238 | 		t.Errorf("got %v, want %v", info.PadID, wantPAD)
239 | 	}
240 | 	if info.UnknownID != wantUNK {
241 | 		t.Errorf("got %v, want %v", info.UnknownID, wantUNK)
242 | 	}
243 | }
244 | 


--------------------------------------------------------------------------------
/system_test.go:
--------------------------------------------------------------------------------
  1 | package sentencepiece
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"bytes"
  6 | 	"fmt"
  7 | 	"io/ioutil"
  8 | 	"log"
  9 | 	"os"
 10 | 	"os/exec"
 11 | 	"path/filepath"
 12 | 	"slices"
 13 | 	"strconv"
 14 | 	"testing"
 15 | )
 16 | 
// "System" test for comparing our Processor with the canonical sentencepiece
 18 | // Python package (officially distributed with the original C++ implementation
 19 | // of the algorithm).
 20 | // It also runs Decode for a round-trip test to ensure we get the original
 21 | // text back.
 22 | //
 23 | // This test will only run if python3 is available and is able to successfully
 24 | // load the sentencepiece library. Typically this means that 'go test' will
 25 | // have to run from an activated Python virtual environment where the library
 26 | // was installed.
 27 | 
 28 | func TestVsSentencepiecePython(t *testing.T) {
 29 | 	proc := createProcessor(t)
 30 | 
 31 | 	if _, err := exec.Command("python3", "-c", "import sentencepiece").Output(); err != nil {
 32 | 		t.Skip("This test only runs when python3 with sentencepiece is available")
 33 | 	}
 34 | 	pyProgramPath := filepath.Join("test", "sp-dump-ids.py")
 35 | 
 36 | 	paths, err := filepath.Glob(filepath.Join("test", "*.txt"))
 37 | 	if err != nil {
 38 | 		t.Fatal(err)
 39 | 	}
 40 | 
 41 | 	for _, path := range paths {
 42 | 		_, filename := filepath.Split(path)
 43 | 		testname := filename[:len(filename)-len(filepath.Ext(path))]
 44 | 
 45 | 		t.Run(testname, func(t *testing.T) {
 46 | 			// Step 1: run the Python program to tokenize path into IDs.
 47 | 			pyOut, err := exec.Command("python3", pyProgramPath, path).Output()
 48 | 			if err != nil {
 49 | 				t.Fatalf("while running %v on %v: %v", pyProgramPath, path, err)
 50 | 			}
 51 | 
 52 | 			pyIDs := pyOutToIDs(pyOut)
 53 | 
 54 | 			// Step 2: use our Processor to tokenize path into IDs.
 55 | 			buf, err := ioutil.ReadFile(path)
 56 | 			if err != nil {
 57 | 				log.Fatal(err)
 58 | 			}
 59 | 			text := string(buf)
 60 | 			var goIDs []int
 61 | 			goTokens := proc.Encode(text)
 62 | 			for _, t := range goTokens {
 63 | 				goIDs = append(goIDs, t.ID)
 64 | 			}
 65 | 
 66 | 			// Step 3: compare the two; dump IDs to temp files for debugging in case
 67 | 			// of a mismatch.
 68 | 			if !slices.Equal(pyIDs, goIDs) {
 69 | 				tmppy := dumpIDsToTempFile(testname+"-py-", pyIDs)
 70 | 				tmpgo := dumpIDsToTempFile(testname+"-go-", goIDs)
 71 | 
 72 | 				t.Errorf("IDs mismatch; dumped to %q and %q", tmppy, tmpgo)
 73 | 			}
 74 | 
 75 | 			// Step 4: round-trip Decode to get original text back
 76 | 			newText := proc.Decode(goIDs)
 77 | 			if text != newText {
 78 | 				t.Errorf("text mismatch after Decode")
 79 | 			}
 80 | 		})
 81 | 	}
 82 | }
 83 | 
 84 | // pyOutToIDs takes the entire stdout output of the Python program and parses
 85 | // it into a list of integer IDs.
 86 | func pyOutToIDs(pyOut []byte) []int {
 87 | 	var IDs []int
 88 | 	scanner := bufio.NewScanner(bytes.NewReader(pyOut))
 89 | 	for scanner.Scan() {
 90 | 		i, err := strconv.Atoi(scanner.Text())
 91 | 		if err != nil {
 92 | 			log.Fatal(err)
 93 | 		}
 94 | 		IDs = append(IDs, i)
 95 | 	}
 96 | 	if err := scanner.Err(); err != nil {
 97 | 		log.Fatal(err)
 98 | 	}
 99 | 	return IDs
100 | }
101 | 
// dumpIDsToTempFile dumps the given IDs (one per line) to a temporary file with
// the given prefix, and returns the name of the temporary file.
//
// Failure to create, write or close the file terminates the process through
// log.Fatal, so a returned name always refers to a completely written file.
func dumpIDsToTempFile(prefix string, IDs []int) string {
	tf, err := os.CreateTemp("", prefix)
	if err != nil {
		log.Fatal(err)
	}

	// Buffer the output so that a long ID list does not cost one write per ID.
	w := bufio.NewWriter(tf)
	for _, id := range IDs {
		// A bufio.Writer remembers its first write error and reports it from
		// Flush, so the per-call result does not need checking here.
		fmt.Fprintf(w, "%d\n", id)
	}
	if err := w.Flush(); err != nil {
		tf.Close()
		log.Fatal(err)
	}
	if err := tf.Close(); err != nil {
		log.Fatal(err)
	}
	return tf.Name()
}
116 | 


--------------------------------------------------------------------------------
/test/gocode1.txt:
--------------------------------------------------------------------------------
  1 | var (
  2 | 	file_sentencepiece_model_proto_rawDescOnce sync.Once
  3 | 	file_sentencepiece_model_proto_rawDescData = file_sentencepiece_model_proto_rawDesc
  4 | )
  5 | 
  6 | func file_sentencepiece_model_proto_rawDescGZIP() []byte {
  7 | 	file_sentencepiece_model_proto_rawDescOnce.Do(func() {
  8 | 		file_sentencepiece_model_proto_rawDescData = protoimpl.X.CompressGZIP(file_sentencepiece_model_proto_rawDescData)
  9 | 	})
 10 | 	return file_sentencepiece_model_proto_rawDescData
 11 | }
 12 | 
 13 | var file_sentencepiece_model_proto_enumTypes = make([]protoimpl.EnumInfo, 2)
 14 | var file_sentencepiece_model_proto_msgTypes = make([]protoimpl.MessageInfo, 6)
 15 | var file_sentencepiece_model_proto_goTypes = []interface{}{
 16 | 	(TrainerSpec_ModelType)(0),         // 0: sentencepiece.TrainerSpec.ModelType
 17 | 	(ModelProto_SentencePiece_Type)(0), // 1: sentencepiece.ModelProto.SentencePiece.Type
 18 | 	(*TrainerSpec)(nil),                // 2: sentencepiece.TrainerSpec
 19 | 	(*NormalizerSpec)(nil),             // 3: sentencepiece.NormalizerSpec
 20 | 	(*SelfTestData)(nil),               // 4: sentencepiece.SelfTestData
 21 | 	(*ModelProto)(nil),                 // 5: sentencepiece.ModelProto
 22 | 	(*SelfTestData_Sample)(nil),        // 6: sentencepiece.SelfTestData.Sample
 23 | 	(*ModelProto_SentencePiece)(nil),   // 7: sentencepiece.ModelProto.SentencePiece
 24 | }
 25 | var file_sentencepiece_model_proto_depIdxs = []int32{
 26 | 	0, // 0: sentencepiece.TrainerSpec.model_type:type_name -> sentencepiece.TrainerSpec.ModelType
 27 | 	6, // 1: sentencepiece.SelfTestData.samples:type_name -> sentencepiece.SelfTestData.Sample
 28 | 	7, // 2: sentencepiece.ModelProto.pieces:type_name -> sentencepiece.ModelProto.SentencePiece
 29 | 	2, // 3: sentencepiece.ModelProto.trainer_spec:type_name -> sentencepiece.TrainerSpec
 30 | 	3, // 4: sentencepiece.ModelProto.normalizer_spec:type_name -> sentencepiece.NormalizerSpec
 31 | 	4, // 5: sentencepiece.ModelProto.self_test_data:type_name -> sentencepiece.SelfTestData
 32 | 	3, // 6: sentencepiece.ModelProto.denormalizer_spec:type_name -> sentencepiece.NormalizerSpec
 33 | 	1, // 7: sentencepiece.ModelProto.SentencePiece.type:type_name -> sentencepiece.ModelProto.SentencePiece.Type
 34 | 	8, // [8:8] is the sub-list for method output_type
 35 | 	8, // [8:8] is the sub-list for method input_type
 36 | 	8, // [8:8] is the sub-list for extension type_name
 37 | 	8, // [8:8] is the sub-list for extension extendee
 38 | 	0, // [0:8] is the sub-list for field type_name
 39 | }
 40 | 
 41 | func init() { file_sentencepiece_model_proto_init() }
 42 | func file_sentencepiece_model_proto_init() {
 43 | 	if File_sentencepiece_model_proto != nil {
 44 | 		return
 45 | 	}
 46 | 	if !protoimpl.UnsafeEnabled {
 47 | 		file_sentencepiece_model_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} {
 48 | 			switch v := v.(*TrainerSpec); i {
 49 | 			case 0:
 50 | 				return &v.state
 51 | 			case 1:
 52 | 				return &v.sizeCache
 53 | 			case 2:
 54 | 				return &v.unknownFields
 55 | 			case 3:
 56 | 				return &v.extensionFields
 57 | 			default:
 58 | 				return nil
 59 | 			}
 60 | 		}
 61 | 		file_sentencepiece_model_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} {
 62 | 			switch v := v.(*NormalizerSpec); i {
 63 | 			case 0:
 64 | 				return &v.state
 65 | 			case 1:
 66 | 				return &v.sizeCache
 67 | 			case 2:
 68 | 				return &v.unknownFields
 69 | 			case 3:
 70 | 				return &v.extensionFields
 71 | 			default:
 72 | 				return nil
 73 | 			}
 74 | 		}
 75 | 		file_sentencepiece_model_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} {
 76 | 			switch v := v.(*SelfTestData); i {
 77 | 			case 0:
 78 | 				return &v.state
 79 | 			case 1:
 80 | 				return &v.sizeCache
 81 | 			case 2:
 82 | 				return &v.unknownFields
 83 | 			case 3:
 84 | 				return &v.extensionFields
 85 | 			default:
 86 | 				return nil
 87 | 			}
 88 | 		}
 89 | 		file_sentencepiece_model_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} {
 90 | 			switch v := v.(*ModelProto); i {
 91 | 			case 0:
 92 | 				return &v.state
 93 | 			case 1:
 94 | 				return &v.sizeCache
 95 | 			case 2:
 96 | 				return &v.unknownFields
 97 | 			case 3:
 98 | 				return &v.extensionFields
 99 | 			default:
100 | 				return nil
101 | 			}
102 | 		}
103 | 		file_sentencepiece_model_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} {
104 | 			switch v := v.(*SelfTestData_Sample); i {
105 | 			case 0:
106 | 				return &v.state
107 | 			case 1:
108 | 				return &v.sizeCache
109 | 			case 2:
110 | 				return &v.unknownFields
111 | 			default:
112 | 				return nil
113 | 			}
114 | 		}
115 | 		file_sentencepiece_model_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} {
116 | 			switch v := v.(*ModelProto_SentencePiece); i {
117 | 			case 0:
118 | 				return &v.state
119 | 			case 1:
120 | 				return &v.sizeCache
121 | 			case 2:
122 | 				return &v.unknownFields
123 | 			case 3:
124 | 				return &v.extensionFields
125 | 			default:
126 | 				return nil
127 | 			}
128 | 		}
129 | 	}
130 | 	type x struct{}
131 | 	out := protoimpl.TypeBuilder{
132 | 		File: protoimpl.DescBuilder{
133 | 			GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
134 | 			RawDescriptor: file_sentencepiece_model_proto_rawDesc,
135 | 			NumEnums:      2,
136 | 			NumMessages:   6,
137 | 			NumExtensions: 0,
138 | 			NumServices:   0,
139 | 		},
140 | 		GoTypes:           file_sentencepiece_model_proto_goTypes,
141 | 		DependencyIndexes: file_sentencepiece_model_proto_depIdxs,
142 | 		EnumInfos:         file_sentencepiece_model_proto_enumTypes,
143 | 		MessageInfos:      file_sentencepiece_model_proto_msgTypes,
144 | 	}.Build()
145 | 	File_sentencepiece_model_proto = out.File
146 | 	file_sentencepiece_model_proto_rawDesc = nil
147 | 	file_sentencepiece_model_proto_goTypes = nil
148 | 	file_sentencepiece_model_proto_depIdxs = nil
149 | }
150 | 
151 | 


--------------------------------------------------------------------------------
/test/htmlcode1.txt:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en" data-theme="auto">
  3 | <head>
  4 | 
  5 | <link rel="preconnect" href="https://www.googletagmanager.com">
  6 | <script >(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
  7 |   new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
  8 |   j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
  9 |   'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
 10 |   })(window,document,'script','dataLayer','GTM-W8MVQXG');</script>
 11 |   
 12 | <meta charset="utf-8">
 13 | <meta name="viewport" content="width=device-width, initial-scale=1">
 14 | <meta name="theme-color" content="#00add8">
 15 | <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Material+Icons">
 16 | <link rel="stylesheet" href="/css/styles.css">
 17 | <link rel="icon" href="/images/favicon-gopher.png" sizes="any">
 18 | <link rel="apple-touch-icon" href="/images/favicon-gopher-plain.png"/>
 19 | <link rel="icon" href="/images/favicon-gopher.svg" type="image/svg+xml">
 20 | <link rel="me" href="https://hachyderm.io/@golang">
 21 | 
 22 |   
 23 |   <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
 24 |   new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
 25 |   j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
 26 |   'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
 27 |   })(window,document,'script','dataLayer','GTM-W8MVQXG');</script>
 28 |   
 29 | <script src="/js/site.js"></script>
 30 | <meta name="og:url" content="https://go.dev/">
 31 | <meta name="og:title" content="The Go Programming Language">
 32 | <title>The Go Programming Language</title>
 33 | 
 34 | <meta name="og:description" content="Go is an open source programming language that makes it simple to build secure, scalable systems.">
 35 | <meta name="description" content="Go is an open source programming language that makes it simple to build secure, scalable systems.">
 36 | 
 37 | <meta name="og:image" content="https://go.dev/doc/gopher/gopher5logo.jpg">
 38 | <meta name="twitter:image" content="https://go.dev/doc/gopher/gopherbelly300.jpg">
 39 | <meta name="twitter:card" content="summary">
 40 | <meta name="twitter:site" content="@golang">
 41 | </head>
 42 | <body class="Site">
 43 |   
 44 | <noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-W8MVQXG"
 45 |   height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
 46 |   
 47 | 
 48 | 
 49 | <header class="Site-header js-siteHeader">
 50 |   <div class="Header Header--dark">
 51 |     <nav class="Header-nav">
 52 |       <a href="/">
 53 |         <img
 54 |           class="js-headerLogo Header-logo"
 55 |           src="/images/go-logo-white.svg"
 56 |           alt="Go">
 57 |       </a>
 58 |       <div class="skip-navigation-wrapper">
 59 |         <a class="skip-to-content-link" aria-label="Skip to main content" href="#main-content"> Skip to Main Content </a>
 60 |       </div>
 61 |       <div class="Header-rightContent">
 62 |         <ul class="Header-menu">
 63 |           <li class="Header-menuItem ">
 64 |             <a href="#"  class="js-desktop-menu-hover" aria-label=Why&#32;Go aria-describedby="dropdown-description">
 65 |               Why Go <i class="material-icons" aria-hidden="true">arrow_drop_down</i>
 66 |             </a>
 67 |             <div class="screen-reader-only" id="dropdown-description" hidden>
 68 |               Press Enter to activate/deactivate dropdown
 69 |             </div>
 70 |               <ul class="Header-submenu js-desktop-submenu-hover" aria-label="submenu">
 71 |                   <li class="Header-submenuItem">
 72 |                     <div>
 73 |                         <a href="/solutions/case-studies">
 74 |                           Case Studies
 75 |                           
 76 |                         </a>
 77 |                     </div>
 78 |                     <p>Common problems companies solve with Go</p>
 79 |                   </li>
 80 |                   <li class="Header-submenuItem">
 81 |                     <div>
 82 |                         <a href="/solutions/use-cases">
 83 |                           Use Cases
 84 |                           
 85 |                         </a>
 86 |                     </div>
 87 |                     <p>Stories about how and why companies use Go</p>
 88 |                   </li>
 89 |                   <li class="Header-submenuItem">
 90 |                     <div>
 91 |                         <a href="/security/">
 92 |                           Security
 93 |                           
 94 |                         </a>
 95 |                     </div>
 96 |                     <p>How Go can help keep you secure by default</p>
 97 |                   </li>
 98 |               </ul>
 99 |           </li>
100 |           <li class="Header-menuItem ">
101 |             <a href="/learn/"  aria-label=Learn aria-describedby="dropdown-description">
102 |               Learn 
103 |             </a>
104 |             <div class="screen-reader-only" id="dropdown-description" hidden>
105 |               Press Enter to activate/deactivate dropdown
106 |             </div>
107 |           </li>
108 |           <li class="Header-menuItem ">
109 |             <a href="#"  class="js-desktop-menu-hover" aria-label=Docs aria-describedby="dropdown-description">
110 |               Docs <i class="material-icons" aria-hidden="true">arrow_drop_down</i>
111 |             </a>
112 |             <div class="screen-reader-only" id="dropdown-description" hidden>
113 |               Press Enter to activate/deactivate dropdown
114 |             </div>
115 |               <ul class="Header-submenu js-desktop-submenu-hover" aria-label="submenu">
116 |                   <li class="Header-submenuItem">
117 |                     <div>
118 |                         <a href="/doc/effective_go">
119 |                           Effective Go
120 |                           
121 |                         </a>
122 |                     </div>
123 |                     <p>Tips for writing clear, performant, and idiomatic Go code</p>
124 |                   </li>
125 |                   <li class="Header-submenuItem">
126 |                     <div>
127 |                         <a href="/doc">
128 |                           Go User Manual
129 |                           
130 |                         </a>
131 |                     </div>
132 |                     <p>A complete introduction to building software with Go</p>
133 |                   </li>
134 |                   <li class="Header-submenuItem">
135 |                     <div>
136 |                         <a href="https://pkg.go.dev/std">
137 |                           Standard library
138 |                           
139 |                         </a>
140 |                     </div>
141 |                     <p>Reference documentation for Go&#39;s standard library</p>
142 |                   </li>
143 |                   <li class="Header-submenuItem">
144 |                     <div>
145 |                         <a href="/doc/devel/release">
146 |                           Release Notes
147 |                           
148 |                         </a>
149 |                     </div>
150 |                     <p>Learn what&#39;s new in each Go release</p>
151 |                   </li>
152 |               </ul>
153 |           </li>
154 |           <li class="Header-menuItem ">
155 |             <a href="https://pkg.go.dev"  aria-label=Packages aria-describedby="dropdown-description">
156 |               Packages 
157 |             </a>
158 |             <div class="screen-reader-only" id="dropdown-description" hidden>
159 |               Press Enter to activate/deactivate dropdown
160 |             </div>
161 |           </li>
162 |           <li class="Header-menuItem ">
163 |             <a href="#"  class="js-desktop-menu-hover" aria-label=Community aria-describedby="dropdown-description">
164 |               Community <i class="material-icons" aria-hidden="true">arrow_drop_down</i>
165 |             </a>
166 |             <div class="screen-reader-only" id="dropdown-description" hidden>
167 |               Press Enter to activate/deactivate dropdown
168 |             </div>
169 |               <ul class="Header-submenu js-desktop-submenu-hover" aria-label="submenu">
170 |                   <li class="Header-submenuItem">
171 |                     <div>
172 |                         <a href="/talks/">
173 |                           Recorded Talks
174 |                           
175 |                         </a>
176 |                     </div>
177 |                     <p>Videos from prior events</p>
178 |                   </li>
179 |                   <li class="Header-submenuItem">
180 |                     <div>
181 |                         <a href="https://www.meetup.com/pro/go">
182 |                           Meetups
183 |                            <i class="material-icons">open_in_new</i>
184 |                         </a>
185 |                     </div>
186 |                     <p>Meet other local Go developers</p>
187 |                   </li>
188 |                   <li class="Header-submenuItem">
189 |                     <div>
190 |                         <a href="/wiki/Conferences">
191 |                           Conferences
192 |                            <i class="material-icons">open_in_new</i>
193 |                         </a>
194 |                     </div>
195 |                     <p>Learn and network with Go developers from around the world</p>
196 |                   </li>
197 |                   <li class="Header-submenuItem">
198 |                     <div>
199 |                         <a href="/blog">
200 |                           Go blog
201 |                           
202 |                         </a>
203 |                     </div>
204 |                     <p>The Go project&#39;s official blog.</p>
205 |                   </li>
206 |                   <li class="Header-submenuItem">
207 |                     <div>
208 |                         <a href="/help">
209 |                           Go project
210 |                           
211 |                         </a>
212 |                     </div>
213 |                     <p>Get help and stay informed from Go</p>
214 |                   </li>
215 |                   <li class="Header-submenuItem">
216 |                     <div>
217 |                         Get connected
218 |                     </div>
219 |                     <p></p>
220 |                       <div class="Header-socialIcons">
221 |                         
222 |                         <a class="Header-socialIcon" aria-label="Get connected with google-groups (Opens in new window)" href="https://groups.google.com/g/golang-nuts"><img src="/images/logos/social/google-groups.svg" /></a>
223 |                         <a class="Header-socialIcon" aria-label="Get connected with github (Opens in new window)" href="https://github.com/golang"><img src="/images/logos/social/github.svg" /></a>
224 |                         <a class="Header-socialIcon" aria-label="Get connected with twitter (Opens in new window)" href="https://twitter.com/golang"><img src="/images/logos/social/twitter.svg" /></a>
225 |                         <a class="Header-socialIcon" aria-label="Get connected with reddit (Opens in new window)" href="https://www.reddit.com/r/golang/"><img src="/images/logos/social/reddit.svg" /></a>
226 |                         <a class="Header-socialIcon" aria-label="Get connected with slack (Opens in new window)" href="https://invite.slack.golangbridge.org/"><img src="/images/logos/social/slack.svg" /></a>
227 |                         <a class="Header-socialIcon" aria-label="Get connected with stack-overflow (Opens in new window)" href="https://stackoverflow.com/tags/go"><img src="/images/logos/social/stack-overflow.svg" /></a>
228 |                       </div>
229 |                   </li>
230 |               </ul>
231 |           </li>
232 |         </ul>
233 |         <button class="Header-navOpen js-headerMenuButton Header-navOpen--white" aria-label="Open navigation.">
234 |         </button>
235 |       </div>
236 |     </nav>
237 |     
238 |   </div>
239 | </header>
240 | 


--------------------------------------------------------------------------------
/test/latexcode1.txt:
--------------------------------------------------------------------------------
 1 | \documentclass{article}
 2 | \usepackage{amsmath, amssymb}
 3 | \usepackage{amsfonts}
 4 | \usepackage{amsthm}
 5 | 
 6 | \newtheorem{theorem}{Theorem}
 7 | 
 8 | \begin{document}
 9 | 
10 | \title{Proof of Green's Theorem}
11 | \author{}
12 | \date{}
13 | \maketitle
14 | 
15 | \begin{theorem}[Green's Theorem]
16 | Let \( C \) be a positively oriented, simple closed curve in the plane, and let \( D \) be the region bounded by \( C \). If \( L(x, y) \) and \( M(x, y) \) have continuous partial derivatives on an open region that contains \( D \) and \( C \), then
17 | \[
18 | \oint_C \left( L \, dx + M \, dy \right) = \iint_D \left( \frac{\partial M}{\partial x} - \frac{\partial L}{\partial y} \right) dA.
19 | \]
20 | \end{theorem}
21 | 
22 | \begin{proof}
23 | We will prove Green's Theorem by breaking the region \( D \) into small rectangles and then using the Fundamental Theorem of Calculus.
24 | 
25 | Assume that the region \( D \) is divided into \( m \times n \) small rectangles. For each small rectangle \( R_{ij} \) with vertices \((x_i, y_j)\), \((x_{i+1}, y_j)\), \((x_{i+1}, y_{j+1})\), and \((x_i, y_{j+1})\), we approximate the line integral around the boundary of \( R_{ij} \):
26 | 
27 | \[
28 | \oint_{\partial R_{ij}} \left( L \, dx + M \, dy \right) \approx \left( M(x_{i+1}, y_{j+1}) - M(x_{i}, y_{j+1}) \right)(y_{j+1} - y_j) - \left( L(x_{i+1}, y_{j+1}) - L(x_{i+1}, y_j) \right)(x_{i+1} - x_i).
29 | \]
30 | 
31 | This expression can be rewritten as:
32 | 
33 | \[
34 | \oint_{\partial R_{ij}} \left( L \, dx + M \, dy \right) \approx \left( \frac{\partial M}{\partial x} - \frac{\partial L}{\partial y} \right) \Delta x \Delta y,
35 | \]
36 | where \( \Delta x = x_{i+1} - x_i \) and \( \Delta y = y_{j+1} - y_j \).
37 | 
38 | Summing over all rectangles in the region \( D \), we obtain:
39 | 
40 | \[
41 | \sum_{i,j} \oint_{\partial R_{ij}} \left( L \, dx + M \, dy \right) \approx \sum_{i,j} \left( \frac{\partial M}{\partial x} - \frac{\partial L}{\partial y} \right) \Delta x \Delta y.
42 | \]
43 | 
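44 | On the left-hand side, the contributions along edges shared by adjacent rectangles cancel, because each shared edge is traversed once in each direction, so the sum reduces to the line integral over \( C \); the right-hand side is a Riemann sum that approximates the double integral over \( D \). Taking the limit as \( \Delta x, \Delta y \to 0 \) gives: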
45 | 
46 | \[
47 | \oint_C \left( L \, dx + M \, dy \right) = \iint_D \left( \frac{\partial M}{\partial x} - \frac{\partial L}{\partial y} \right) dA.
48 | \]
49 | 
50 | Thus, Green's Theorem is proved.
51 | \end{proof}
52 | 
53 | \end{document}
54 | 
55 | 


--------------------------------------------------------------------------------
/test/opening-multilang.txt:
--------------------------------------------------------------------------------
 1 | Somewhere in la Mancha, in a place whose name I do not care to remember, a
 2 | gentleman lived not long ago, one of those who has a lance and ancient shield on
 3 | a shelf and keeps a skinny nag and a greyhound for racing.
 4 | 
 5 | En un lugar de la Mancha, de cuyo nombre no quiero acordarme, no ha mucho tiempo
 6 | que vivía un hidalgo de los de lanza en astillero, adarga antigua, rocín flaco y
 7 | galgo corredor.
 8 | 
 9 | 「ラ・マンチャのどこか、名前を覚えたくもない場所で、古い盾と槍を棚に飾り、痩せた馬と猟犬を飼っていた紳士が、そう遠くない昔に住んでいた。」
10 | 
11 | 라 만차 어딘가에서, 이름을 기억하고 싶지 않은 장소에서, 고대 방패와 창을 선반에 두고, 말라깽이 말과 경주용 그레이하운드를 키우는 신사가 얼마 전에 살았다.
12 | 
13 | ला मांचायाः काचित् स्थले, यस्य नाम स्मर्तुम् न इच्छामि, तत्र कदाचित् कश्चन सज्जनः वसति स्म, यस्य शस्त्रं प्राचीनं च कवचं तिष्ठति, तस्य च अश्वः कृशः च श्वा धावनाय अस्ति।
14 | 
15 | Где-то в Ла-Манче, в месте, имя которого я не хочу вспоминать, жил некогда
16 | дворянин, один из тех, кто держал копье и старинный щит на полке, а также худую
17 | лошадь и борзую для охоты.
18 | 


--------------------------------------------------------------------------------
/test/perlcode1.txt:
--------------------------------------------------------------------------------
  1 | sub init_sim	# (re)initializes memory, registers, flags and IO devices; options arrive as -name => value pairs
  2 | {
  3 | 	my $args = 
  4 | 	{
  5 | 		-init_addr	=> 0,
  6 | 		-mem_file	=> undef,
  7 | 		-mem_ref	=> undef,
  8 | 		-device_dir	=> './',
  9 | 		@_,		# caller-supplied options come last, so they override the defaults above
 10 | 	};
 11 | 	# append a slash only when the directory does not already end with one
 12 | 	$args->{-device_dir} .= '/' unless $args->{-device_dir} =~ m{/$};
 13 | 	
 14 | 	# init memory
 15 | 	for (my $i = 0; $i < get_mix_mem_size(); ++$i)
 16 | 	{
 17 | 		$mem[$i] = empty_word();
 18 | 	}
 19 | 	
 20 | 	$rA = empty_word();
 21 | 	$rX = empty_word();
 22 | 	$rJ = empty_word();
 23 | 	$rI[$_] = empty_word()
 24 | 		foreach (1 .. 6);
 25 | 
 26 | 	$f_overflow = 0;
 27 | 	$f_comparison = 0;
 28 | 	$time = 0;
 29 | 	$lc = $args->{-init_addr};
 30 | 	$simulation_ended = 0;
 31 | 	@io_device = ();
 32 | 
 33 | 	# init IO devices
 34 | 	# (units 0-7 are tapes, units 8-15 are disks)
 35 | 	foreach my $n (0 .. 15)
 36 | 	{
 37 | 		if ($n >= 0 and $n <= 7)
 38 | 		{
 39 | 			push(@io_device, {filename => "tape${n}.dev", io_type => "bio", block_size => 100, data => undef});
 40 | 		}
 41 | 		elsif ($n >= 8 and $n <= 15)
 42 | 		{
 43 | 			my $m = $n - 8;
 44 | 			push(@io_device, {filename => "disk${m}.dev", io_type => "bio", block_size => 100, data => undef});
 45 | 		}
 46 | 	}
 47 | 	
 48 | 	push(@io_device, {filename => "cardrd.dev", io_type => "ci", block_size => 16});
 49 | 	push(@io_device, {filename => "cardwr.dev", io_type => "co", block_size => 16});
 50 | 	push(@io_device, {filename => "printer.dev", io_type => "co", block_size => 24});
 51 | 	push(@io_device, {filename => "stdio", io_type => "cio", block_size => 14});
 52 | 	push(@io_device, {filename => "paper.dev", io_type => "ci", block_size => 14});
 53 | 	
 54 | 	foreach my $dev (@io_device)
 55 | 	{
 56 | 		$dev->{filename} = $args->{-device_dir} . $dev->{filename};
 57 | 	}
 58 | 	# remember the options so that the interactive "rst" command can call init_sim again
 59 | 	$saved_mem_file = $args->{-mem_file};
 60 | 	$saved_mem_ref = $args->{-mem_ref};
 61 | 	$saved_init_addr = $args->{-init_addr};
 62 | 	
 63 | 	if (defined $args->{-mem_file})
 64 | 	{
 65 | 		load_memory_from_text_file($args->{-mem_file});
 66 | 	}
 67 | 	elsif (defined $args->{-mem_ref})
 68 | 	{
 69 | 		@mem = @{$args->{-mem_ref}};
 70 | 	}
 71 | 	else
 72 | 	{
 73 | 		warn("No memory file or reference given to the simulator\n");
 74 | 	}
 75 | }
 76 | 
 77 | # Returns the $simulation_ended flag (reset by init_sim, set by step_sim on HLT)
 78 | sub simulation_ended
 79 | {
 80 | 	return $simulation_ended;
 81 | }
 82 | 
 83 | # Returns, as a list, the fields of the memory word at the location counter $lc
 84 | sub fetch_next_instruction
 85 | {
 86 | 	return @{$mem[$lc]};
 87 | }
 88 | 
 89 | 
 90 | # Executes one instruction: the word at $lc. Dies via runtime_error() if $lc is
 91 | # not a legal address or if the opcode has no handler in %opcode_map.
 92 | sub step_sim
 93 | {
 94 | 	address_is_legal($lc)
 95 | 		or runtime_error("location counter out of memory bounds");
 96 | 	
 97 | 	my @word = fetch_next_instruction();
 98 | 	
 99 | 	my $opcode = $word[5];	# sixth field of the word
100 | 	my $F = $word[4];	# fifth field of the word
101 | 	
102 | 	if ($opcode == 5 and $F == 2)		# HLT
103 | 	{
104 | 		$simulation_ended = 1;
105 | 		return;	# $lc is not advanced past the HLT
106 | 	}
107 | 	elsif ($opcode == 0)	# NOP
108 | 	{
109 | 		$lc++;
110 | 		return;
111 | 	}
112 | 	else
113 | 	{
114 | 		# Dispatch the instruction to the appropriate handler,
115 | 		# based on the opcode.
116 | 		#
117 | 		if (defined $opcode_map{$opcode})
118 | 		{
119 | 			my $op_func = $opcode_map{$opcode};
120 | 			$op_func->(@word);			
121 | 			$lc++;	# NOTE(review): applied after the handler returns, so any $lc set by the handler is incremented too - confirm jump handlers expect this
122 | 		}
123 | 		else
124 | 		{
125 | 			runtime_error("illegal opcode: $opcode");
126 | 		}
127 | 	}
128 | }
129 | 
130 | # Returns a reference to the simulator's memory array @mem (a reference, not a copy)
131 | sub get_mem_ref
132 | {
133 | 	return \@mem;
134 | }
135 | 
136 | 
137 | # Simulates the MIX code until a HLT instruction is
138 | # encountered, then rewrites the file of every binary
139 | # device that holds data.
140 | sub run_sim
141 | {
142 | 	# step through the whole program
143 | 	#
144 | 	until (simulation_ended())
145 | 	{
146 | 		step_sim();
147 | 	}
148 | 	
149 | 	# update the binary devices
150 | 	#
151 | 	foreach my $devref (@io_device)
152 | 	{
153 | 		next unless is_binary_device($devref) and defined $devref->{data};
154 | 		
155 | 		my $fh = $devref->{handle};
156 | 		close $fh if defined $fh;
157 | 		# three-argument open: the filename is never parsed for mode or pipe characters
158 | 		unless (open($fh, '>', $devref->{filename}))
159 | 		{	
160 | 			warn "Unable to write device $devref->{filename}\n";
161 | 			next;
162 | 		}
163 | 		
164 | 		foreach my $block_n (keys %{$devref->{data}})	# blocks are written in hash order
165 | 		{
166 | 			print $fh "$block_n\n";
167 | 			
168 | 			for (my $i = 0; $i < $devref->{block_size}; ++$i)
169 | 			{
170 | 				print $fh sprintf("%2s %2s %2s %2s %2s %2s\n", @{$devref->{data}->{$block_n}->[$i]});
171 | 			}
172 | 		}
173 | 		
174 | 		close $fh;
175 | 	}
176 | }
177 | # Interactive debugger: prompts with the location counter and runs commands read from <> until "x"/"q" or end of input
178 | sub interactive_sim
179 | {
180 | 	local $| = 1;
181 | 	my %breakpoints;
182 | 	
183 | 	print "\nWelcome to MIXSim interaction !\n\n";
184 | 	
185 | 	interaction: while (1)
186 | 	{
187 | 		printf "[%4s]> ", $lc;
188 | 		defined(my $command = <>) or last interaction;	# <> yields undef at end of input; leave instead of re-prompting forever
189 | 		chomp($command);
190 | 		
191 | 		# strip leading and trailing whitespace
192 | 		$command =~ s/^\s+//;
193 | 		$command =~ s/\s+$//;
194 | 		
195 | 		my @toks = split('\s+', $command);
196 | 		next if @toks == 0;
197 | 		
198 | 		if ($command eq "s")
199 | 		{
200 | 			step_sim();
201 | 			
202 | 			print "Simulation ended (HLT)\n" if (simulation_ended());
203 | 			
204 | 		}
205 | 		elsif ($command eq "c" or $command eq "cl")
206 | 		{
207 | 			step_loop: while (1)
208 | 			{
209 | 				if (exists $breakpoints{$lc})
210 | 				{
211 | 					print "Breakpoint stop at address $lc\n";
212 | 					last step_loop;
213 | 				}
214 | 				
215 | 				if (simulation_ended())
216 | 				{
217 | 					print "Simulation ended (HLT)\n";
218 | 					last step_loop;
219 | 				}
220 | 				
221 | 				print "$lc\n" if $command eq "cl";
222 | 				step_sim();
223 | 			}
224 | 		}
225 | 		elsif ($command eq "rst")
226 | 		{
227 | 			if (defined $saved_mem_file)
228 | 			{
229 | 				init_sim(-mem_file => $saved_mem_file, -init_addr => $saved_init_addr);
230 | 			}
231 | 			elsif (defined $saved_mem_ref)
232 | 			{
233 | 				init_sim(-mem_ref => $saved_mem_ref, -init_addr => $saved_init_addr);
234 | 			}
235 | 		}
236 | 		elsif ($command eq "r")
237 | 		{
238 | 			print state_dump(), "\n";
239 | 		}
240 | 		elsif ($command eq "sr")
241 | 		{
242 | 			step_sim();
243 | 			print state_dump(), "\n";
244 | 		}
245 | 		elsif ($toks[0] eq "m")
246 | 		{
247 | 			if (@toks == 1)
248 | 			{
249 | 				print memory_dump(\@mem);
250 | 			}
251 | 			elsif (@toks == 2)
252 | 			{
253 | 				my $addr = $toks[1];
254 | 				unless (address_is_legal($addr)) { interactive_error("Illegal address $addr"); next; }	# as in the "b" command: never dereference an illegal address
255 | 				printf("%4s : %2s %2s %2s %2s %2s %2s\n", $addr, @{$mem[$addr]});
256 | 			}
257 | 			else
258 | 			{
259 | 				interactive_error("Illegal m command");
260 | 			}
261 | 		}
262 | 		elsif ($toks[0] eq "b")
263 | 		{
264 | 			if (@toks != 2) 
265 | 			{
266 | 				interactive_error("Illegal b command");
267 | 				next;
268 | 			}
269 | 			
270 | 			my $addr = $toks[1];
271 | 			
272 | 			if (not address_is_legal($addr))  
273 | 			{
274 | 				interactive_error("Illegal address $addr");
275 | 				next;
276 | 			}
277 | 			
278 | 			if (exists $breakpoints{$addr})
279 | 			{
280 | 				delete($breakpoints{$addr});
281 | 				print "Removed breakpoint at $addr\n";
282 | 			}
283 | 			else
284 | 			{
285 | 				$breakpoints{$addr} = 1;
286 | 				print "Set breakpoint at $addr\n";
287 | 			}
288 | 		}
289 | 		elsif ($command eq "bl")
290 | 		{
291 | 			my @bkpt_keys = keys %breakpoints;
292 | 			
293 | 			if (@bkpt_keys == 0)
294 | 			{
295 | 				print "No breakpoints set\n";
296 | 			}
297 | 			else
298 | 			{
299 | 				print "Breakpoints set at:\n";
300 | 				
301 | 				if (@bkpt_keys == 1)
302 | 				{
303 | 					print "$bkpt_keys[0]  ";
304 | 				}
305 | 				else
306 | 				{
307 | 					foreach my $addr (sort {$a <=> $b} @bkpt_keys)
308 | 					{
309 | 						print "$addr  ";
310 | 					}
311 | 				}
312 | 				
313 | 				print "\n";
314 | 			}
315 | 		}
316 | 		elsif ($command eq "br")
317 | 		{
318 | 			%breakpoints = ();
319 | 		}
320 | 		elsif ($command eq "h")
321 | 		{
322 | 			print "\n*** MIXSim interaction help ***\n\n";
323 | 			print "s       \t\t step\n";
324 | 			print "c       \t\t continue until next breakpoint or HLT\n";
325 | 			print "cl      \t\t same as 'c', with an execution trace\n"; 
326 | 			print "rst     \t\t restart simulation (breakpoints remain)\n";
327 | 			print "r       \t\t print contents of registers\n";
328 | 			print "sr      \t\t step and print contents of registers\n";
329 | 			print "m       \t\t print all non-zero memory words\n";
330 | 			print "m <addr>\t\t print a memory word at <addr>\n";
331 | 			print "b <addr>\t\t set/unset a breakpoint at <addr>\n";
332 | 			print "bl      \t\t list all breakpoints\n";
333 | 			print "br      \t\t remove all breakpoints\n";
334 | 			print "h       \t\t show this help\n";
335 | 			print "x or q  \t\t exit interaction\n\n";
336 | 		}
337 | 		elsif ($command eq "x" or $command eq "q")
338 | 		{
339 | 			last interaction;
340 | 		}
341 | 		else
342 | 		{
343 | 			print "Illegal command. Type 'h' for help\n";
344 | 		}
345 | 	}
346 | 	
347 | 	print "\nBye !\n\n";
348 | }
349 | 
350 | 
351 | # Builds and returns a printable dump of all the registers, the
352 | # location counter and the overflow / comparison flags
353 | sub state_dump
354 | {
355 | 	my @parts;
356 | 
357 | 	push @parts, sprintf("rA   : %2s %2s %2s %2s %2s %2s\n", @{$rA});
358 | 	push @parts, sprintf("rX   : %2s %2s %2s %2s %2s %2s\n", @{$rX});
359 | 
360 | 	push @parts, map { sprintf("rI$_  : %2s %2s %2s %2s %2s %2s\n", @{$rI[$_]}) }
361 | 		(1 .. 6);
362 | 	
363 | 	push @parts, "\n";
364 | 	push @parts, sprintf("rJ   : %2s %2s %2s %2s %2s %2s\n", @{$rJ});
365 | 	push @parts, sprintf("lc   : %5s\n", $lc);
366 | 	push @parts, sprintf("ovf  : %2s\n", $f_overflow);
367 | 	return join("", @parts, sprintf("comp : %2s\n", $f_comparison));
368 | }
369 | 
370 | 
371 | # Reports runtime errors - errors that occurred during simulation
372 | # as a result of incorrect machine code. Dies with a message that
373 | # contains the current location counter $lc followed by $msg.
374 | sub runtime_error
375 | {
376 | 	my ($msg) = @_;
377 | 	
378 | 	die("Simulation error at address $lc: $msg\n");
379 | }
380 | 
381 | 
382 | 


--------------------------------------------------------------------------------
/test/pg2000_spanish.txt:
--------------------------------------------------------------------------------
  1 | The Project Gutenberg eBook of Don Quijote
  2 |     
  3 | This ebook is for the use of anyone anywhere in the United States and
  4 | most other parts of the world at no cost and with almost no restrictions
  5 | whatsoever. You may copy it, give it away or re-use it under the terms
  6 | of the Project Gutenberg License included with this ebook or online
  7 | at www.gutenberg.org. If you are not located in the United States,
  8 | you will have to check the laws of the country where you are located
  9 | before using this eBook.
 10 | 
 11 | Title: Don Quijote
 12 | 
 13 | Author: Miguel de Cervantes Saavedra
 14 | 
 15 | Release date: December 1, 1999 [eBook #2000]
 16 |                 Most recently updated: January 17, 2021
 17 | 
 18 | Language: Spanish
 19 | 
 20 | Credits: an anonymous Project Gutenberg volunteer and Joaquin Cuenca Abela
 21 | 
 22 | 
 23 | *** START OF THE PROJECT GUTENBERG EBOOK DON QUIJOTE ***
 24 | 
 25 | 
 26 | 
 27 | 
 28 | El ingenioso hidalgo don Quijote de la Mancha
 29 | 
 30 | 
 31 | 
 32 | por Miguel de Cervantes Saavedra
 33 | 
 34 | 
 35 | 
 36 | 
 37 | 
 38 | El ingenioso hidalgo don Quijote de la Mancha
 39 | 
 40 | 
 41 |   
 42 | Tasa
 43 | 
 44 |   
 45 | Testimonio de las erratas
 46 | 
 47 |   
 48 | El Rey
 49 | 
 50 |   
 51 | Al Duque de Béjar
 52 | 
 53 |   
 54 | Prólogo
 55 | 
 56 |   
 57 | Al libro de don Quijote de la Mancha
 58 | 
 59 | 
 60 | 
 61 | Que trata de la condición y ejercicio del famoso
 62 | hidalgo don Quijote de la Mancha
 63 | 
 64 | Que trata de la primera salida que de su tierra hizo
 65 | el ingenioso don Quijote
 66 | 
 67 | Donde se cuenta la graciosa manera que tuvo don
 68 | Quijote en armarse caballero
 69 | 
 70 | De lo que le sucedió a nuestro caballero cuando salió
 71 | de la venta
 72 | 
 73 | Donde se prosigue la narración de la desgracia de
 74 | nuestro caballero
 75 | 
 76 | Del donoso y grande escrutinio que el cura y el
 77 | barbero hicieron en la librería de nuestro ingenioso hidalgo
 78 | 
 79 | De la segunda salida de nuestro buen caballero don
 80 | Quijote de la Mancha
 81 | 
 82 | Del buen suceso que el valeroso don Quijote tuvo en
 83 | la espantable y jamás imaginada aventura de los molinos de viento, con
 84 | otros sucesos dignos de felice recordación
 85 | 
 86 | Donde se concluye y da fin a la estupenda batalla que
 87 | el gallardo vizcaíno y el valiente manchego tuvieron
 88 | 
 89 | De lo que más le avino a don Quijote con el vizcaíno, y
 90 | del peligro en que se vio con una turba de yangüeses
 91 | 
 92 | De lo que le sucedió a don Quijote con unos
 93 | cabreros
 94 | 
 95 | De lo que contó un cabrero a los que estaban con don
 96 | Quijote
 97 | 
 98 | Donde se da fin al cuento de la pastora Marcela, con
 99 | otros sucesos
100 | 
101 | Donde se ponen los versos desesperados del difunto
102 | pastor, con otros no esperados sucesos
103 | 
104 | Donde se cuenta la desgraciada aventura que se topó
105 | don Quijote en topar con unos desalmados yangüeses
106 | 
107 | De lo que le sucedió al ingenioso hidalgo en la venta
108 | que él imaginaba ser castillo
109 | 
110 | Donde se prosiguen los innumerables trabajos que el
111 | bravo don Quijote y su buen escudero Sancho Panza pasaron en la venta
112 | que, por su mal, pensó que era castillo
113 | 
114 | Donde se cuentan las razones que pasó Sancho Panza
115 | con su señor Don Quijote, con otras aventuras dignas de ser
116 | contadas
117 | 
118 | De las discretas razones que Sancho pasaba con su
119 | amo, y de la aventura que le sucedió con un cuerpo muerto, con otros
120 | acontecimientos famosos
121 | 
122 | De la jamás vista ni oída aventura que con más poco
123 | peligro fue acabada de famoso caballero en el mundo, como la que acabó
124 | el valeroso don Quijote de la Mancha
125 | 
126 | Que trata de la alta aventura y rica ganancia del
127 | yelmo de Mambrino, con otras cosas sucedidas a nuestro invencible
128 | caballero
129 | 
130 | De la libertad que dio don Quijote a muchos
131 | desdichados que, mal de su grado, los llevaban donde no quisieran
132 | ir
133 | 
134 | De lo que le aconteció al famoso don Quijote en
135 | Sierra Morena, que fue una de las más raras aventuras que en esta
136 | verdadera historia se cuentan
137 | 
138 | Donde se prosigue la aventura de la Sierra
139 | Morena
140 | 
141 | Que trata de las estrañas cosas que en Sierra Morena
142 | sucedieron al valiente caballero de la Mancha, y de la imitación que
143 | hizo a la penitencia de Beltenebros
144 | 
145 | Donde se prosiguen las finezas que de enamorado hizo
146 | don Quijote en Sierra Morena
147 | 
148 | De cómo salieron con su intención el cura y el
149 | barbero, con otras cosas dignas de que se cuenten en esta grande
150 | historia
151 | 
152 | Que trata de la nueva y agradable aventura que al
153 | cura y barbero sucedió en la mesma sierra
154 | 
155 | Que trata de la discreción de la hermosa Dorotea,
156 | con otras cosas de mucho gusto y pasatiempo
157 | 
158 | Que trata del gracioso artificio y orden que se tuvo
159 | en sacar a nuestro enamorado caballero de la asperísima penitencia en
160 | que se había puesto
161 | 
162 | De los sabrosos razonamientos que pasaron entre don
163 | Quijote y Sancho Panza, su escudero, con otros sucesos
164 | 
165 | Que trata de lo que sucedió en la venta a toda la
166 | cuadrilla de don Quijote
167 | 
168 | Donde se cuenta la novela del Curioso
169 | impertinente
170 | 


--------------------------------------------------------------------------------
/test/pg41845_telugu.txt:
--------------------------------------------------------------------------------
  1 | The Project Gutenberg eBook of ఓనమాలు
  2 |     
  3 | This ebook is for the use of anyone anywhere in the United States and
  4 | most other parts of the world at no cost and with almost no restrictions
  5 | whatsoever. You may copy it, give it away or re-use it under the terms
  6 | of the Project Gutenberg License included with this ebook or online
  7 | at www.gutenberg.org. If you are not located in the United States,
  8 | you will have to check the laws of the country where you are located
  9 | before using this eBook.
 10 | 
 11 | Title: ఓనమాలు
 12 | 
 13 | Author: Mahidhara Ramamohan Rao
 14 | 
 15 | Release date: January 14, 2013 [eBook #41845]
 16 | 
 17 | Language: Telugu
 18 | 
 19 | Credits: Produced by volunteers at Pustakam.net
 20 | 
 21 | 
 22 | *** START OF THE PROJECT GUTENBERG EBOOK ఓనమాలు ***
 23 | 
 24 | 
 25 | 
 26 | 
 27 | Produced by volunteers at Pustakam.net
 28 | 
 29 | 
 30 | 
 31 | 
 32 | అవంతీ ప్రచురణలు 4.
 33 | 
 34 | 
 35 | 
 36 | 
 37 | ఓనమాలు
 38 | 
 39 | 
 40 | 
 41 | 
 42 | రచన:
 43 | 
 44 | మహీధర రామమోహనరావు
 45 | 
 46 | 
 47 | 
 48 | 
 49 | సోల్ డిస్ట్రిబ్యూటర్లు:
 50 | 
 51 | విశాలాంధ్ర ప్రచురణాలయం,
 52 | 
 53 | విజయవాడ-2
 54 | 
 55 | 
 56 | 
 57 | 
 58 | మొదటి ముద్రణ
 59 | 
 60 | 1956
 61 | 
 62 | 
 63 | 
 64 | 
 65 | వెల
 66 | 
 67 | రెండు రూపాయల పావలా
 68 | 
 69 | 
 70 | 
 71 | 
 72 | అవంతీ ప్రెస్
 73 | 
 74 | రాజమండ్రి
 75 | 
 76 | 
 77 | 
 78 | 
 79 | 1947....
 80 | 
 81 | ....నాటి తెలంగాణా ఒక అగ్నిగుండం.
 82 | 
 83 | దుస్సహమైన జాగీర్దారీ వ్యవస్థను నిర్మూలించగల పోరాటాల్ని ప్రజానీకం సాగిస్తూంది. వాటినన్నింటినీ ఒకే జెండా క్రిందికి తెచ్చి,
 84 | రాజకీయ నాయకత్వం సమకూర్చడానికై ఆంధ్రమహాసభా, కమ్యూనిస్టు పార్టీ సన్నాహాలు సాగిస్తున్నాయి.
 85 | 
 86 | రెండో వైపున – విదేశీ పాలనకూ, సంస్థానాధీశుల నిరంకుశ పాలనకూ వ్యతిరేకంగా జాతీయ ప్రజాతంత్ర పోరాటాలు తెలంగాణాన్ని
 87 | అలుముకొంటున్నాయి.
 88 | 
 89 | ప్రజాతంత్ర హక్కులకై సాగుతున్న ఈ పోరాటాలు ఐక్యతను కూర్చుకొంటూ నిజాము పరిపాలనా యంత్రాన్ని మొదలంట కదిల్చివేస్తున్నాయి.
 90 | 
 91 | ఈ దశలో …
 92 | 
 93 | విచ్ఛిన్నమైపోతున్న జాగీర్దారీ వ్యవస్థను రక్షించగల శక్తి నిజాము ప్రభుత్వానికి లేదని గ్రహించిన భూస్వామ్యవర్గం నూతన
 94 | నాయకత్వం కొరకై వెతుకులాడుతూ జాతీయోద్యమంలో తనకు రక్షణనివ్వగల శక్తుల్ని చూసుకొంది.
 95 | 
 96 | సమాజంలో తనకున్న బలం క్రమంగా క్షీణించి పోతూంటే, కూలిపోతున్న తన అధికారాన్ని పరిరక్షించుకొనేటందుకై మతవాదుల్నీ, రౌడీల్ని
 97 | సమీకరించి విధ్వంసకాండకు పూనుకొంది నిజాము సర్కారు.
 98 | 
 99 | ప్రజానీకానికీ, ప్రతిరోధ శక్తులకూ మధ్య జరిగిన ఈ ఘర్షణలలో తెలంగాణా ఒక అగ్నిగుండమే అయింది.
100 | 
101 | ఆనాటి సంఘర్షణలే నా ఈ నవలకు కథావస్తువు. సుదీర్ఘమైన ఈ నవలలో మొదటి భాగం పాఠకుల ముందుంచుతున్నా. త్వరలోనే
102 | మిగతావీ.
103 | 
104 | విజయవాడ,
105 | 
106 | 20-3-56
107 | 
108 | రచయిత.
109 | 
110 | 
111 | 
112 | 
113 | భూమి కోసం
114 | భుక్తి కోసం
115 | నిగళబంధ
116 | విముక్తి కోసం
117 | నేల కొరిగిన
118 | తెలుగు జోదుల
119 | కిత్తు నంజలులు.
120 | 
121 | కృతజ్ఞత
122 | 
123 | తమ పత్రికలో ధారావాహికగా వెలువడిన ఈ నవలను పుస్తకరూపంలో ప్రచురించుకొనుటకనుమతించిన విశాలాంధ్ర సంపాదకులకు -
124 | 
125 | రచయిత.
126 | 
127 | 
128 | 
129 | 
130 | ఓనమాలు
131 | (మొదటి భాగం)
132 | 
133 | 
134 | 
135 | 
136 | ఒకటో ప్రకరణం.
137 | 
138 | 
139 | అటువంటివాడు ఒక వారం పది రోజులనుంచి పరధ్యానంగా వుంటున్నాడు. ఆతడు దూరదూరంగా వుంటున్నాడనిపించింది. ఆ ఆలోచనతో మనస్సు
140 | కరిగిపోతూంది; హృదయం ఆరాటపడిపోతూంది; అతనిని కదిలించడానికి చేసిన ప్రయత్నాలన్నీ, విఫలం అయ్యాయనిపిస్తూంటే ఎంతో
141 | బాధపడిపోతూంది. ఈ వారం పది రోజులుగా అతనిలో కనిపిస్తున్న ధోరణి ఏమిటో అర్థం కాలేదు. ఏమేమిటో కారణాలు కల్పించుకొంటూంది.
142 | ఆ కారణాలన్నీ ఆమెను మరింత బాధిస్తున్నాయి.
143 | 
144 | అతడు తన ఎరికలో ఇంత గాఢంగా ఆలోచనల్లో మునిగి వుండడం ఎప్పుడూ జరగలేదు. అతడు ఆలోచించవలసిన విషయాలు మాత్రం
145 | పెద్దగా ఏం వున్నాయిగనక. ఆస్తా...సెంటు భూమి లేదు. పన్నుకి పీడించేవాళ్ళింక పుట్టవలిసిందేనని అతడే వేళాకోళంగా
146 | అంటూంటాడు....తల్లా, తండ్రా?...ఆ ఇద్దరూ కూడా ఏనాడో మరణించారు.
147 | 
148 | ...పెళ్ళామా, పిల్లలా?....ఈ మాట ఆలోచనకు వచ్చినప్పుడు సత్తెమ్మ అంత సులభంగా 'కాదు' అనుకోలేకపోయింది.
149 | ఆలోచించగా, ఆలోచించగా అసలు కారణం అక్కడే వున్నట్లు కూడా అనిపించింది. అనిపించడంతో కళ్ళనీళ్లు తిరిగేయి.
150 | 
151 | అతనిని కాదనడానికి తనకున్న హక్కు ఏమిటి? అతని కోసం తాను ఎంతయినా త్యాగం చేసి వుండొచ్చు. ఉండొచ్చునేమిటి? చేసింది.
152 | 
153 | ఊరువాళ్ళ మాటల్ని ఖాతరు చెయ్యలేదు. తల్లి ఏడ్పును లెక్కచెయ్యలేదు. కుల మర్యాదల నాలోచించలేదు. అతని కోసం
154 | ఆత్మార్పణ చేసుకొంది. సమాజంలో ఆడది చేయగల త్యాగానికది పరాకాష్ఠ. అయితేనేం?...
155 | 
156 | అతడు తనకి మగడు కాదు. తనకి మగడు లేడు. వెంకటయ్య కోసం తాను ఎంత తపన పడ్డా, తానో వితంతువు మాత్రమే. అతని
157 | మీద తనకు హక్కు లేదు.
158 | 
159 | తనతో సావాసం చేసేక అతడు ఇతర పడుచుల్ని అంటుకోలేదు. కన్నెత్తి కూడా చూడలేదు. వెంకటయ్య కోసం దార్లుకాచిన పడుచుల్నీ,
160 | అతని మాటకోసం కాట్లాడుకొన్న పడుచుల్నీ ఆమె ఎరుగును. అన్నీ ఎరిగే ఆమె అతనితో నేస్తం చేసింది. తనతో చేరేక అతడు
161 | పూర్తిగా మారిపోయేడు. అతని పరిచయాల విషయంలో తాను పడ్డ జాలికూడా అతనికి నవ్వుతాలయింది. ఆ సంగతినామె ఎరుగును. అతడు
162 | తనదే లోకంగా ఆనందిస్తున్నాడు. తనకేమాత్రం కష్టం కలిగినా గిజగిజలాడి పోతాడు. తన కాళ్ళక్రింద కళ్ళు పరిచేడు.
163 | కళ్ళముందు హృదయం విప్పేడు.
164 | 
165 | 


--------------------------------------------------------------------------------
/test/pg7193_english.txt:
--------------------------------------------------------------------------------
  1 | ﻿The Project Gutenberg eBook of The Adventures of Tom Sawyer, Part 1.
  2 |     
  3 | This ebook is for the use of anyone anywhere in the United States and
  4 | most other parts of the world at no cost and with almost no restrictions
  5 | whatsoever. You may copy it, give it away or re-use it under the terms
  6 | of the Project Gutenberg License included with this ebook or online
  7 | at www.gutenberg.org. If you are not located in the United States,
  8 | you will have to check the laws of the country where you are located
  9 | before using this eBook.
 10 | 
 11 | Title: The Adventures of Tom Sawyer, Part 1.
 12 | 
 13 | Author: Mark Twain
 14 | 
 15 | Release date: June 29, 2004 [eBook #7193]
 16 |                 Most recently updated: December 30, 2020
 17 | 
 18 | Language: English
 19 | 
 20 | Credits: Produced by David Widger
 21 | 
 22 | 
 23 | *** START OF THE PROJECT GUTENBERG EBOOK THE ADVENTURES OF TOM SAWYER, PART 1. ***
 24 | 
 25 | 
 26 | 
 27 | 
 28 | Produced by David Widger
 29 | 
 30 | 
 31 | 
 32 | 
 33 |                    THE ADVENTURES OF TOM SAWYER
 34 |                                 BY
 35 |                             MARK TWAIN
 36 |                      (Samuel Langhorne Clemens)
 37 | 
 38 |                               Part 1
 39 | 
 40 | 
 41 |                            P R E F A C E
 42 | 
 43 | MOST of the adventures recorded in this book really occurred; one or
 44 | two were experiences of my own, the rest those of boys who were
 45 | schoolmates of mine. Huck Finn is drawn from life; Tom Sawyer also, but
 46 | not from an individual--he is a combination of the characteristics of
 47 | three boys whom I knew, and therefore belongs to the composite order of
 48 | architecture.
 49 | 
 50 | The odd superstitions touched upon were all prevalent among children
 51 | and slaves in the West at the period of this story--that is to say,
 52 | thirty or forty years ago.
 53 | 
 54 | Although my book is intended mainly for the entertainment of boys and
 55 | girls, I hope it will not be shunned by men and women on that account,
 56 | for part of my plan has been to try to pleasantly remind adults of what
 57 | they once were themselves, and of how they felt and thought and talked,
 58 | and what queer enterprises they sometimes engaged in.
 59 | 
 60 |                                                             THE AUTHOR.
 61 | 
 62 | HARTFORD, 1876.
 63 | 
 64 | 
 65 | 
 66 |                           T O M   S A W Y E R
 67 | 
 68 | 
 69 | 
 70 | CHAPTER I
 71 | 
 72 | "TOM!"
 73 | 
 74 | No answer.
 75 | 
 76 | "TOM!"
 77 | 
 78 | No answer.
 79 | 
 80 | "What's gone with that boy,  I wonder? You TOM!"
 81 | 
 82 | No answer.
 83 | 
 84 | The old lady pulled her spectacles down and looked over them about the
 85 | room; then she put them up and looked out under them. She seldom or
 86 | never looked THROUGH them for so small a thing as a boy; they were her
 87 | state pair, the pride of her heart, and were built for "style," not
 88 | service--she could have seen through a pair of stove-lids just as well.
 89 | She looked perplexed for a moment, and then said, not fiercely, but
 90 | still loud enough for the furniture to hear:
 91 | 
 92 | "Well, I lay if I get hold of you I'll--"
 93 | 
 94 | She did not finish, for by this time she was bending down and punching
 95 | under the bed with the broom, and so she needed breath to punctuate the
 96 | punches with. She resurrected nothing but the cat.
 97 | 
 98 | "I never did see the beat of that boy!"
 99 | 
100 | She went to the open door and stood in it and looked out among the
101 | tomato vines and "jimpson" weeds that constituted the garden. No Tom.
102 | So she lifted up her voice at an angle calculated for distance and
103 | shouted:
104 | 
105 | "Y-o-u-u TOM!"
106 | 
107 | There was a slight noise behind her and she turned just in time to
108 | seize a small boy by the slack of his roundabout and arrest his flight.
109 | 
110 | "There! I might 'a' thought of that closet. What you been doing in
111 | there?"
112 | 
113 | "Nothing."
114 | 
115 | "Nothing! Look at your hands. And look at your mouth. What IS that
116 | truck?"
117 | 
118 | "I don't know, aunt."
119 | 
120 | "Well, I know. It's jam--that's what it is. Forty times I've said if
121 | you didn't let that jam alone I'd skin you. Hand me that switch."
122 | 
123 | The switch hovered in the air--the peril was desperate--
124 | 
125 | "My! Look behind you, aunt!"
126 | 
127 | The old lady whirled round, and snatched her skirts out of danger. The
128 | lad fled on the instant, scrambled up the high board-fence, and
129 | disappeared over it.
130 | 
131 | His aunt Polly stood surprised a moment, and then broke into a gentle
132 | laugh.
133 | 
134 | "Hang the boy, can't I never learn anything? Ain't he played me tricks
135 | enough like that for me to be looking out for him by this time? But old
136 | fools is the biggest fools there is. Can't learn an old dog new tricks,
137 | as the saying is. But my goodness, he never plays them alike, two days,
138 | and how is a body to know what's coming? He 'pears to know just how
139 | long he can torment me before I get my dander up, and he knows if he
140 | can make out to put me off for a minute or make me laugh, it's all down
141 | again and I can't hit him a lick. I ain't doing my duty by that boy,
142 | and that's the Lord's truth, goodness knows. Spare the rod and spile
143 | the child, as the Good Book says. I'm a laying up sin and suffering for
144 | us both, I know. He's full of the Old Scratch, but laws-a-me! he's my
145 | own dead sister's boy, poor thing, and I ain't got the heart to lash
146 | him, somehow. Every time I let him off, my conscience does hurt me so,
147 | and every time I hit him my old heart most breaks. Well-a-well, man
148 | that is born of woman is of few days and full of trouble, as the
149 | Scripture says, and I reckon it's so. He'll play hookey this evening, *
150 | and [* Southwestern for "afternoon"] I'll just be obleeged to make him
151 | work, to-morrow, to punish him. It's mighty hard to make him work
152 | Saturdays, when all the boys is having holiday, but he hates work more
153 | than he hates anything else, and I've GOT to do some of my duty by him,
154 | or I'll be the ruination of the child."
155 | 
156 | Tom did play hookey, and he had a very good time. He got back home
157 | barely in season to help Jim, the small colored boy, saw next-day's
158 | wood and split the kindlings before supper--at least he was there in
159 | time to tell his adventures to Jim while Jim did three-fourths of the
160 | work. Tom's younger brother (or rather half-brother) Sid was already
161 | through with his part of the work (picking up chips), for he was a
162 | quiet boy, and had no adventurous, troublesome ways.
163 | 
164 | While Tom was eating his supper, and stealing sugar as opportunity
165 | offered, Aunt Polly asked him questions that were full of guile, and
166 | very deep--for she wanted to trap him into damaging revealments. Like
167 | many other simple-hearted souls, it was her pet vanity to believe she
168 | was endowed with a talent for dark and mysterious diplomacy, and she
169 | loved to contemplate her most transparent devices as marvels of low
170 | cunning. Said she:
171 | 
172 | "Tom, it was middling warm in school, warn't it?"
173 | 
174 | "Yes'm."
175 | 
176 | "Powerful warm, warn't it?"
177 | 
178 | "Yes'm."
179 | 
180 | "Didn't you want to go in a-swimming, Tom?"
181 | 
182 | A bit of a scare shot through Tom--a touch of uncomfortable suspicion.
183 | He searched Aunt Polly's face, but it told him nothing. So he said:
184 | 
185 | "No'm--well, not very much."
186 | 
187 | The old lady reached out her hand and felt Tom's shirt, and said:
188 | 
189 | "But you ain't too warm now, though." And it flattered her to reflect
190 | that she had discovered that the shirt was dry without anybody knowing
191 | that that was what she had in her mind. But in spite of her, Tom knew
192 | where the wind lay, now. So he forestalled what might be the next move:
193 | 
194 | "Some of us pumped on our heads--mine's damp yet. See?"
195 | 
196 | Aunt Polly was vexed to think she had overlooked that bit of
197 | circumstantial evidence, and missed a trick. Then she had a new
198 | inspiration:
199 | 
200 | "Tom, you didn't have to undo your shirt collar where I sewed it, to
201 | pump on your head, did you? Unbutton your jacket!"
202 | 
203 | The trouble vanished out of Tom's face. He opened his jacket. His
204 | shirt collar was securely sewed.
205 | 
206 | "Bother! Well, go 'long with you. I'd made sure you'd played hookey
207 | and been a-swimming. But I forgive ye, Tom. I reckon you're a kind of a
208 | singed cat, as the saying is--better'n you look. THIS time."
209 | 
210 | She was half sorry her sagacity had miscarried, and half glad that Tom
211 | had stumbled into obedient conduct for once.
212 | 
213 | But Sidney said:
214 | 
215 | "Well, now, if I didn't think you sewed his collar with white thread,
216 | but it's black."
217 | 
218 | "Why, I did sew it with white! Tom!"
219 | 
220 | But Tom did not wait for the rest. As he went out at the door he said:
221 | 
222 | "Siddy, I'll lick you for that."
223 | 
224 | In a safe place Tom examined two large needles which were thrust into
225 | the lapels of his jacket, and had thread bound about them--one needle
226 | carried white thread and the other black. He said:
227 | 
228 | "She'd never noticed if it hadn't been for Sid. Confound it! sometimes
229 | she sews it with white, and sometimes she sews it with black. I wish to
230 | geeminy she'd stick to one or t'other--I can't keep the run of 'em. But
231 | I bet you I'll lam Sid for that. I'll learn him!"
232 | 
233 | He was not the Model Boy of the village. He knew the model boy very
234 | well though--and loathed him.
235 | 
236 | Within two minutes, or even less, he had forgotten all his troubles.
237 | Not because his troubles were one whit less heavy and bitter to him
238 | than a man's are to a man, but because a new and powerful interest bore
239 | them down and drove them out of his mind for the time--just as men's
240 | misfortunes are forgotten in the excitement of new enterprises. This
241 | new interest was a valued novelty in whistling, which he had just
242 | acquired from a negro, and he was suffering to practise it undisturbed.
243 | It consisted in a peculiar bird-like turn, a sort of liquid warble,
244 | produced by touching the tongue to the roof of the mouth at short
245 | intervals in the midst of the music--the reader probably remembers how
246 | to do it, if he has ever been a boy. Diligence and attention soon gave
247 | him the knack of it, and he strode down the street with his mouth full
248 | of harmony and his soul full of gratitude. He felt much as an
249 | astronomer feels who has discovered a new planet--no doubt, as far as
250 | strong, deep, unalloyed pleasure is concerned, the advantage was with
251 | the boy, not the astronomer.
252 | 
253 | The summer evenings were long. It was not dark, yet. Presently Tom
254 | checked his whistle. A stranger was before him--a boy a shade larger
255 | than himself. A new-comer of any age or either sex was an impressive
256 | curiosity in the poor little shabby village of St. Petersburg. This boy
257 | was well dressed, too--well dressed on a week-day. This was simply
258 | astounding. His cap was a dainty thing, his close-buttoned blue cloth
259 | roundabout was new and natty, and so were his pantaloons. He had shoes
260 | on--and it was only Friday. He even wore a necktie, a bright bit of
261 | ribbon. He had a citified air about him that ate into Tom's vitals. The
262 | more Tom stared at the splendid marvel, the higher he turned up his
263 | nose at his finery and the shabbier and shabbier his own outfit seemed
264 | to him to grow. Neither boy spoke. If one moved, the other moved--but
265 | only sidewise, in a circle; they kept face to face and eye to eye all
266 | the time. Finally Tom said:
267 | 
268 | "I can lick you!"
269 | 
270 | "I'd like to see you try it."
271 | 
272 | "Well, I can do it."
273 | 
274 | "No you can't, either."
275 | 
276 | "Yes I can."
277 | 
278 | "No you can't."
279 | 
280 | "I can."
281 | 
282 | "You can't."
283 | 
284 | "Can!"
285 | 
286 | "Can't!"
287 | 
288 | An uncomfortable pause. Then Tom said:
289 | 
290 | "What's your name?"
291 | 
292 | "'Tisn't any of your business, maybe."
293 | 
294 | "Well I 'low I'll MAKE it my business."
295 | 
296 | "Well why don't you?"
297 | 
298 | "If you say much, I will."
299 | 
300 | "Much--much--MUCH. There now."
301 | 
302 | "Oh, you think you're mighty smart, DON'T you? I could lick you with
303 | one hand tied behind me, if I wanted to."
304 | 
305 | "Well why don't you DO it? You SAY you can do it."
306 | 
307 | "Well I WILL, if you fool with me."
308 | 
309 | "Oh yes--I've seen whole families in the same fix."
310 | 
311 | "Smarty! You think you're SOME, now, DON'T you? Oh, what a hat!"
312 | 
313 | "You can lump that hat if you don't like it. I dare you to knock it
314 | off--and anybody that'll take a dare will suck eggs."
315 | 
316 | "You're a liar!"
317 | 
318 | "You're another."
319 | 
320 | "You're a fighting liar and dasn't take it up."
321 | 
322 | "Aw--take a walk!"
323 | 
324 | "Say--if you give me much more of your sass I'll take and bounce a
325 | rock off'n your head."
326 | 
327 | "Oh, of COURSE you will."
328 | 
329 | "Well I WILL."
330 | 
331 | "Well why don't you DO it then? What do you keep SAYING you will for?
332 | Why don't you DO it? It's because you're afraid."
333 | 
334 | "I AIN'T afraid."
335 | 
336 | "You are."
337 | 
338 | "I ain't."
339 | 
340 | "You are."
341 | 
342 | Another pause, and more eying and sidling around each other. Presently
343 | they were shoulder to shoulder. Tom said:
344 | 
345 | "Get away from here!"
346 | 
347 | "Go away yourself!"
348 | 
349 | "I won't."
350 | 
351 | "I won't either."
352 | 
353 | So they stood, each with a foot placed at an angle as a brace, and
354 | both shoving with might and main, and glowering at each other with
355 | hate. But neither could get an advantage. After struggling till both
356 | were hot and flushed, each relaxed his strain with watchful caution,
357 | and Tom said:
358 | 
359 | "You're a coward and a pup. I'll tell my big brother on you, and he
360 | can thrash you with his little finger, and I'll make him do it, too."
361 | 
362 | "What do I care for your big brother? I've got a brother that's bigger
363 | than he is--and what's more, he can throw him over that fence, too."
364 | [Both brothers were imaginary.]
365 | 
366 | "That's a lie."
367 | 
368 | "YOUR saying so don't make it so."
369 | 
370 | Tom drew a line in the dust with his big toe, and said:
371 | 
372 | "I dare you to step over that, and I'll lick you till you can't stand
373 | up. Anybody that'll take a dare will steal sheep."
374 | 
375 | The new boy stepped over promptly, and said:
376 | 
377 | "Now you said you'd do it, now let's see you do it."
378 | 
379 | "Don't you crowd me now; you better look out."
380 | 
381 | "Well, you SAID you'd do it--why don't you do it?"
382 | 
383 | "By jingo! for two cents I WILL do it."
384 | 
385 | The new boy took two broad coppers out of his pocket and held them out
386 | with derision. Tom struck them to the ground. In an instant both boys
387 | were rolling and tumbling in the dirt, gripped together like cats; and
388 | for the space of a minute they tugged and tore at each other's hair and
389 | clothes, punched and scratched each other's nose, and covered
390 | themselves with dust and glory. Presently the confusion took form, and
391 | through the fog of battle Tom appeared, seated astride the new boy, and
392 | pounding him with his fists. "Holler 'nuff!" said he.
393 | 
394 | The boy only struggled to free himself. He was crying--mainly from rage.
395 | 
396 | "Holler 'nuff!"--and the pounding went on.
397 | 


--------------------------------------------------------------------------------
/test/pycode1.txt:
--------------------------------------------------------------------------------
  1 | class NamedInitializer(Node):
  2 |     __slots__ = ('name', 'expr', 'coord', '__weakref__')
  3 |     def __init__(self, name, expr, coord=None):
  4 |         self.name = name
  5 |         self.expr = expr
  6 |         self.coord = coord
  7 | 
  8 |     def children(self):
  9 |         nodelist = []
 10 |         if self.expr is not None: nodelist.append(("expr", self.expr))
 11 |         for i, child in enumerate(self.name or []):
 12 |             nodelist.append(("name[%d]" % i, child))
 13 |         return tuple(nodelist)
 14 | 
 15 |     def __iter__(self):
 16 |         if self.expr is not None:
 17 |             yield self.expr
 18 |         for child in (self.name or []):
 19 |             yield child
 20 | 
 21 |     attr_names = ()
 22 | 
 23 | class ParamList(Node):
 24 |     __slots__ = ('params', 'coord', '__weakref__')
 25 |     def __init__(self, params, coord=None):
 26 |         self.params = params
 27 |         self.coord = coord
 28 | 
 29 |     def children(self):
 30 |         nodelist = []
 31 |         for i, child in enumerate(self.params or []):
 32 |             nodelist.append(("params[%d]" % i, child))
 33 |         return tuple(nodelist)
 34 | 
 35 |     def __iter__(self):
 36 |         for child in (self.params or []):
 37 |             yield child
 38 | 
 39 |     attr_names = ()
 40 | 
 41 | class PtrDecl(Node):
 42 |     __slots__ = ('quals', 'type', 'coord', '__weakref__')
 43 |     def __init__(self, quals, type, coord=None):
 44 |         self.quals = quals
 45 |         self.type = type
 46 |         self.coord = coord
 47 | 
 48 |     def children(self):
 49 |         nodelist = []
 50 |         if self.type is not None: nodelist.append(("type", self.type))
 51 |         return tuple(nodelist)
 52 | 
 53 |     def __iter__(self):
 54 |         if self.type is not None:
 55 |             yield self.type
 56 | 
 57 |     attr_names = ('quals', )
 58 | 
 59 | class Return(Node):
 60 |     __slots__ = ('expr', 'coord', '__weakref__')
 61 |     def __init__(self, expr, coord=None):
 62 |         self.expr = expr
 63 |         self.coord = coord
 64 | 
 65 |     def children(self):
 66 |         nodelist = []
 67 |         if self.expr is not None: nodelist.append(("expr", self.expr))
 68 |         return tuple(nodelist)
 69 | 
 70 |     def __iter__(self):
 71 |         if self.expr is not None:
 72 |             yield self.expr
 73 | 
 74 |     attr_names = ()
 75 | 
 76 | class StaticAssert(Node):
 77 |     __slots__ = ('cond', 'message', 'coord', '__weakref__')
 78 |     def __init__(self, cond, message, coord=None):
 79 |         self.cond = cond
 80 |         self.message = message
 81 |         self.coord = coord
 82 | 
 83 |     def children(self):
 84 |         nodelist = []
 85 |         if self.cond is not None: nodelist.append(("cond", self.cond))
 86 |         if self.message is not None: nodelist.append(("message", self.message))
 87 |         return tuple(nodelist)
 88 | 
 89 |     def __iter__(self):
 90 |         if self.cond is not None:
 91 |             yield self.cond
 92 |         if self.message is not None:
 93 |             yield self.message
 94 | 
 95 |     attr_names = ()
 96 | 
 97 | class Struct(Node):
 98 |     __slots__ = ('name', 'decls', 'coord', '__weakref__')
 99 |     def __init__(self, name, decls, coord=None):
100 |         self.name = name
101 |         self.decls = decls
102 |         self.coord = coord
103 | 
104 |     def children(self):
105 |         nodelist = []
106 |         for i, child in enumerate(self.decls or []):
107 |             nodelist.append(("decls[%d]" % i, child))
108 |         return tuple(nodelist)
109 | 
110 |     def __iter__(self):
111 |         for child in (self.decls or []):
112 |             yield child
113 | 
114 |     attr_names = ('name', )
115 | 
116 | class StructRef(Node):
117 |     __slots__ = ('name', 'type', 'field', 'coord', '__weakref__')
118 |     def __init__(self, name, type, field, coord=None):
119 |         self.name = name
120 |         self.type = type
121 |         self.field = field
122 |         self.coord = coord
123 | 
124 |     def children(self):
125 |         nodelist = []
126 |         if self.name is not None: nodelist.append(("name", self.name))
127 |         if self.field is not None: nodelist.append(("field", self.field))
128 |         return tuple(nodelist)
129 | 
130 |     def __iter__(self):
131 |         if self.name is not None:
132 |             yield self.name
133 |         if self.field is not None:
134 |             yield self.field
135 | 
136 |     attr_names = ('type', )
137 | 
138 | class Switch(Node):
139 |     __slots__ = ('cond', 'stmt', 'coord', '__weakref__')
140 |     def __init__(self, cond, stmt, coord=None):
141 |         self.cond = cond
142 |         self.stmt = stmt
143 |         self.coord = coord
144 | 
145 |     def children(self):
146 |         nodelist = []
147 |         if self.cond is not None: nodelist.append(("cond", self.cond))
148 |         if self.stmt is not None: nodelist.append(("stmt", self.stmt))
149 |         return tuple(nodelist)
150 | 
151 |     def __iter__(self):
152 |         if self.cond is not None:
153 |             yield self.cond
154 |         if self.stmt is not None:
155 |             yield self.stmt
156 | 
157 |     attr_names = ()
158 | 
159 | class TernaryOp(Node):
160 |     __slots__ = ('cond', 'iftrue', 'iffalse', 'coord', '__weakref__')
161 |     def __init__(self, cond, iftrue, iffalse, coord=None):
162 |         self.cond = cond
163 |         self.iftrue = iftrue
164 |         self.iffalse = iffalse
165 |         self.coord = coord
166 | 
167 |     def children(self):
168 |         nodelist = []
169 |         if self.cond is not None: nodelist.append(("cond", self.cond))
170 |         if self.iftrue is not None: nodelist.append(("iftrue", self.iftrue))
171 |         if self.iffalse is not None: nodelist.append(("iffalse", self.iffalse))
172 |         return tuple(nodelist)
173 | 
174 |     def __iter__(self):
175 |         if self.cond is not None:
176 |             yield self.cond
177 |         if self.iftrue is not None:
178 |             yield self.iftrue
179 |         if self.iffalse is not None:
180 |             yield self.iffalse
181 | 
182 |     attr_names = ()
183 | 
184 | class TypeDecl(Node):
185 |     __slots__ = ('declname', 'quals', 'align', 'type', 'coord', '__weakref__')
186 |     def __init__(self, declname, quals, align, type, coord=None):
187 |         self.declname = declname
188 |         self.quals = quals
189 |         self.align = align
190 |         self.type = type
191 |         self.coord = coord
192 | 
193 |     def children(self):
194 |         nodelist = []
195 |         if self.type is not None: nodelist.append(("type", self.type))
196 |         return tuple(nodelist)
197 | 
198 |     def __iter__(self):
199 |         if self.type is not None:
200 |             yield self.type
201 | 
202 |     attr_names = ('declname', 'quals', 'align', )
203 | 
204 | class Typedef(Node):
205 |     __slots__ = ('name', 'quals', 'storage', 'type', 'coord', '__weakref__')
206 |     def __init__(self, name, quals, storage, type, coord=None):
207 |         self.name = name
208 |         self.quals = quals
209 |         self.storage = storage
210 |         self.type = type
211 |         self.coord = coord
212 | 
213 |     def children(self):
214 |         nodelist = []
215 |         if self.type is not None: nodelist.append(("type", self.type))
216 |         return tuple(nodelist)
217 | 
218 |     def __iter__(self):
219 |         if self.type is not None:
220 |             yield self.type
221 | 
222 |     attr_names = ('name', 'quals', 'storage', )
223 | 
224 | class Typename(Node):
225 |     __slots__ = ('name', 'quals', 'align', 'type', 'coord', '__weakref__')
226 |     def __init__(self, name, quals, align, type, coord=None):
227 |         self.name = name
228 |         self.quals = quals
229 |         self.align = align
230 |         self.type = type
231 |         self.coord = coord
232 | 
233 |     def children(self):
234 |         nodelist = []
235 |         if self.type is not None: nodelist.append(("type", self.type))
236 |         return tuple(nodelist)
237 | 
238 |     def __iter__(self):
239 |         if self.type is not None:
240 |             yield self.type
241 | 
242 |     attr_names = ('name', 'quals', 'align', )
243 | 
244 | class UnaryOp(Node):
245 |     __slots__ = ('op', 'expr', 'coord', '__weakref__')
246 |     def __init__(self, op, expr, coord=None):
247 |         self.op = op
248 |         self.expr = expr
249 |         self.coord = coord
250 | 
251 |     def children(self):
252 |         nodelist = []
253 |         if self.expr is not None: nodelist.append(("expr", self.expr))
254 |         return tuple(nodelist)
255 | 
256 |     def __iter__(self):
257 |         if self.expr is not None:
258 |             yield self.expr
259 | 
260 |     attr_names = ('op', )
261 | 
262 | class Union(Node):
263 |     __slots__ = ('name', 'decls', 'coord', '__weakref__')
264 |     def __init__(self, name, decls, coord=None):
265 |         self.name = name
266 |         self.decls = decls
267 |         self.coord = coord
268 | 
269 |     def children(self):
270 |         nodelist = []
271 |         for i, child in enumerate(self.decls or []):
272 |             nodelist.append(("decls[%d]" % i, child))
273 |         return tuple(nodelist)
274 | 
275 |     def __iter__(self):
276 |         for child in (self.decls or []):
277 |             yield child
278 | 
279 |     attr_names = ('name', )
280 | 
281 | class While(Node):
282 |     __slots__ = ('cond', 'stmt', 'coord', '__weakref__')
283 |     def __init__(self, cond, stmt, coord=None):
284 |         self.cond = cond
285 |         self.stmt = stmt
286 |         self.coord = coord
287 | 
288 |     def children(self):
289 |         nodelist = []
290 |         if self.cond is not None: nodelist.append(("cond", self.cond))
291 |         if self.stmt is not None: nodelist.append(("stmt", self.stmt))
292 |         return tuple(nodelist)
293 | 
294 |     def __iter__(self):
295 |         if self.cond is not None:
296 |             yield self.cond
297 |         if self.stmt is not None:
298 |             yield self.stmt
299 | 
300 |     attr_names = ()
301 | 
302 | class Pragma(Node):
303 |     __slots__ = ('string', 'coord', '__weakref__')
304 |     def __init__(self, string, coord=None):
305 |         self.string = string
306 |         self.coord = coord
307 | 
308 |     def children(self):
309 |         nodelist = []
310 |         return tuple(nodelist)
311 | 
312 |     def __iter__(self):
313 |         return
314 |         yield
315 | 
316 |     attr_names = ('string', )
317 | 


--------------------------------------------------------------------------------
/test/sp-dump-ids.py:
--------------------------------------------------------------------------------
 1 | # Uses the sentencepiece package to tokenize the file provided as a command-line
 2 | # argument; emits all token IDs to stdout, one per line.
 3 | #
 4 | # Requires the MODELPATH env var to be set to the binary proto describing
 5 | # the tokenizer model.
 6 | import sentencepiece as spm
 7 | import os, sys
 8 | 
 9 | with open(sys.argv[1], "r", newline="") as f:
10 |     text = f.read()
11 |     sp = spm.SentencePieceProcessor(model_file=os.getenv("MODELPATH"))
12 |     ids = sp.encode(text)
13 | 
14 |     # Print ids out, one per line
15 |     for id in ids:
16 |         print(id)
17 | 


--------------------------------------------------------------------------------
/token.go:
--------------------------------------------------------------------------------
 1 | package sentencepiece
 2 | 
 3 | import "fmt"
 4 | 
 5 | // Token represents a single token from the input text. ID is a unique token
 6 | // identifier that the model uses in its internal representation. Text is
 7 | // the piece of text this token represents.
 8 | type Token struct {
 9 | 	ID   int    // identifier the model uses for this token
10 | 	Text string // piece of input text this token represents
11 | }
12 | 
13 | // String implements fmt.Stringer. It formats t as Token{ID: <id>, Text: "<text>"},
14 | // rendering Text with %q so it is quoted and escaped per Go string syntax.
15 | func (t Token) String() string {
16 | 	return fmt.Sprintf("Token{ID: %v, Text: %q}", t.ID, t.Text)
17 | }
18 | 


--------------------------------------------------------------------------------