├── .github
└── workflows
│ └── pages.yml
├── .gitignore
├── LICENSE
├── README.md
├── benchmark_test.go
├── doc
└── toklogo2.png
├── example_test.go
├── go.mod
├── go.sum
├── internal
├── cmd
│ ├── dumper
│ │ └── main.go
│ └── wasm
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── assets
│ │ ├── index.html
│ │ ├── script.js
│ │ └── wasm_exec.js
│ │ └── main.go
├── model
│ ├── gen.sh
│ ├── sentencepiece_model.pb.go
│ └── sentencepiece_model.proto
├── prefixmatcher
│ ├── prefixmatcher.go
│ └── prefixmatcher_test.go
└── priorityqueue
│ ├── priorityqueue.go
│ └── priorityqueue_test.go
├── normalize.go
├── processor.go
├── processor_test.go
├── system_test.go
├── test
├── gocode1.txt
├── htmlcode1.txt
├── latexcode1.txt
├── opening-multilang.txt
├── perlcode1.txt
├── pg2000_spanish.txt
├── pg41845_telugu.txt
├── pg7193_english.txt
├── pycode1.txt
├── romeo-juliet-english.txt
└── sp-dump-ids.py
└── token.go
/.github/workflows/pages.yml:
--------------------------------------------------------------------------------
1 | # Simple workflow for deploying static content to GitHub Pages
2 | name: Deploy static content to Pages
3 |
4 | on:
5 | # Runs on pushes targeting the default branch
6 | push:
7 | branches: ["main"]
8 |
9 | # Allows you to run this workflow manually from the Actions tab
10 | workflow_dispatch:
11 |
12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
13 | permissions:
14 | contents: read
15 | pages: write
16 | id-token: write
17 |
18 | # Allow one concurrent deployment
19 | concurrency:
20 | group: "pages"
21 | cancel-in-progress: true
22 |
23 | jobs:
24 | # Single deploy job since we're just deploying
25 | deploy:
26 | environment:
27 | name: github-pages
28 | url: ${{ steps.deployment.outputs.page_url }}
29 | runs-on: ubuntu-latest
30 | steps:
31 | - name: Checkout
32 | uses: actions/checkout@v3
33 |
34 | - name: Set up Go
35 | uses: actions/setup-go@v4
36 | with:
37 | go-version: 1.22.5
38 |
39 | - name: Setup Pages
40 | uses: actions/configure-pages@v2
41 |
42 | - name: Build wasm
43 | run: |
44 | cd internal/cmd/wasm
45 | make build
46 |
47 | - name: Upload artifact
48 | uses: actions/upload-pages-artifact@v1
49 | with:
50 | # Upload
51 | path: 'internal/cmd/wasm/assets'
52 |
53 | - name: Deploy to GitHub Pages
54 | id: deployment
55 | uses: actions/deploy-pages@v1
56 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # If you prefer the allow list template instead of the deny list, see community template:
2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
3 | #
4 | # Binaries for programs and plugins
5 | *.exe
6 | *.exe~
7 | *.dll
8 | *.so
9 | *.dylib
10 |
11 | # Test binary, built with `go test -c`
12 | *.test
13 |
14 | # Output of the go coverage tool, specifically when used with LiteIDE
15 | *.out
16 |
17 | # Dependency directories (remove the comment below to include it)
18 | # vendor/
19 |
20 | # Go workspace file
21 | go.work
22 | go.work.sum
23 |
24 | # env file
25 | .env
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # go-sentencepiece
2 |
3 |
4 |
5 |
6 |
7 | ----
8 |
9 | [![Go Reference](https://pkg.go.dev/badge/github.com/eliben/go-sentencepiece.svg)](https://pkg.go.dev/github.com/eliben/go-sentencepiece)
10 |
11 | This is a pure Go implementation of encoding and decoding text with
12 | the [SentencePiece tokenizer](https://github.com/google/sentencepiece).
13 |
14 | "Encoding" is the operation used to split text into tokens, using
15 | a trained tokenizer model. "Decoding" is the reverse process - converting
16 | a list of tokens into the original text.
17 |
18 | SentencePiece is a general family of tokenizers that is configured
19 | by a protobuf configuration file. This repository currently focuses
20 | on implementing just the functionality required to reproduce the
21 | tokenization of [Gemma models](https://ai.google.dev/gemma) (the same
22 | tokenizer is used for Google's proprietary Gemini family of models).
23 | Specifically, it only implements BPE tokenization since this is what
24 | Gemma uses.
25 |
26 | ## Current status
27 |
28 | This package should be ready to use for encoding text into tokens
29 | using the Gemma tokenizer; it's been reasonably optimized and extensively
30 | tested vs. the [SentencePiece Python bindings](https://pypi.org/project/sentencepiece/)
31 | (see `system_test.go` in this repository).
32 |
33 | If you find any problems or discrepancies, please open an issue.
34 |
35 | ## Tokenizer configuration
36 |
37 | The configuration file for the tokenizer is a protobuf (structured
38 | data, serialized in the [protocol buffer format](https://protobuf.dev/))
39 | that describes a trained tokenizer model; it includes
40 | the complete learned vocabulary used for tokenization, as well as
41 | other configuration information.
42 |
43 | It is not part of this repository. Please fetch it from the
44 | [official Gemma implementation repository](https://github.com/google/gemma_pytorch/tree/main/tokenizer).
45 | `NewProcessor*` constructors will expect to read this file.
46 |
47 | ## Developing
48 |
49 | A protobuf is used to configure the tokenizer. The structure of the
50 | protobuf is described by the `internal/model/sentencepiece_model.proto` file,
51 | which is vendored from https://github.com/google/sentencepiece
52 |
53 | To re-generate the `*.pb.go` file from it:
54 |
55 | ```
56 | $ cd internal/model
57 | $ ./gen.sh
58 | ```
59 |
60 | The configuration protobuf itself is obtained as described in the
61 | [Tokenizer configuration](#tokenizer-configuration) section. All
62 | tests require the `MODELPATH` env var to point to a local
63 | copy of the tokenizer configuration file.
64 |
65 | ## Online demo
66 |
67 | To see an in-browser demo of this tokenizer in action, visit
68 | https://eliben.github.io/go-sentencepiece/
69 |
70 | The Go code is compiled to WebAssembly and loaded from a small
71 | JS program to allow interactive encoding of text.
72 |
--------------------------------------------------------------------------------
/benchmark_test.go:
--------------------------------------------------------------------------------
1 | package sentencepiece
2 |
3 | import (
4 | "io/ioutil"
5 | "path/filepath"
6 | "runtime"
7 | "testing"
8 | )
9 |
10 | func BenchmarkEncoder(b *testing.B) {
11 | buf, err := ioutil.ReadFile(filepath.Join("test", "pg7193_english.txt"))
12 | if err != nil {
13 | b.Fatal(err)
14 | }
15 | sbuf := string(buf)
16 |
17 | proc := createProcessor(b)
18 | b.ResetTimer()
19 | total := 0
20 |
21 | for range b.N {
22 | toks := proc.Encode(sbuf)
23 | total += len(toks)
24 | }
25 | runtime.KeepAlive(total)
26 |
27 | b.ReportMetric(float64(total)/float64(b.Elapsed().Seconds()), "tokens/sec")
28 | }
29 |
30 | func BenchmarkDecoder(b *testing.B) {
31 | buf, err := ioutil.ReadFile(filepath.Join("test", "pg7193_english.txt"))
32 | if err != nil {
33 | b.Fatal(err)
34 | }
35 | sbuf := string(buf)
36 |
37 | proc := createProcessor(b)
38 | toks := proc.Encode(sbuf)
39 |
40 | b.ResetTimer()
41 | total := 0
42 |
43 | for range b.N {
44 | t := proc.DecodeTokens(toks)
45 | total += len(t)
46 | }
47 | runtime.KeepAlive(total)
48 |
49 | b.ReportMetric(float64(len(toks)*b.N)/float64(b.Elapsed().Seconds()), "tokens/sec")
50 | }
51 |
--------------------------------------------------------------------------------
/doc/toklogo2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eliben/go-sentencepiece/dd59fe97df461d1fa84d15c25a51f025156eece1/doc/toklogo2.png
--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
1 | package sentencepiece_test
2 |
3 | import (
4 | "fmt"
5 | "log"
6 | "os"
7 |
8 | "github.com/eliben/go-sentencepiece"
9 | )
10 |
11 | func ExampleEncode() {
12 | protoFile := os.Getenv("MODELPATH")
13 | if protoFile == "" {
14 | log.Println("Need MODELPATH env var to run example")
15 | return
16 | }
17 |
18 | proc, err := sentencepiece.NewProcessorFromPath(protoFile)
19 | if err != nil {
20 | log.Fatal(err)
21 | }
22 |
23 | text := "Encoding produces tokens that LLMs can learn and understand"
24 | tokens := proc.Encode(text)
25 |
26 | for _, token := range tokens {
27 | fmt.Println(token)
28 | }
29 | }
30 |
31 | func ExampleDecode() {
32 | protoFile := os.Getenv("MODELPATH")
33 | if protoFile == "" {
34 | log.Println("Need MODELPATH env var to run example")
35 | return
36 | }
37 |
38 | proc, err := sentencepiece.NewProcessorFromPath(protoFile)
39 | if err != nil {
40 | log.Fatal(err)
41 | }
42 |
43 | ids := []int{17534, 2134}
44 | text := proc.Decode(ids)
45 |
46 | fmt.Println(text)
47 | }
48 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/eliben/go-sentencepiece
2 |
3 | go 1.22.5
4 |
5 | require google.golang.org/protobuf v1.34.2
6 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=
2 | github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
3 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
4 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
5 | google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
6 | google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
7 |
--------------------------------------------------------------------------------
/internal/cmd/dumper/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | // Command dumper is a debugging utility for internal use. It helps explore
4 | // the model proto and compare results with other tools.
5 |
6 | import (
7 | "flag"
8 | "fmt"
9 | "io/ioutil"
10 | "log"
11 | "os"
12 | "unicode"
13 |
14 | "github.com/eliben/go-sentencepiece"
15 | "github.com/eliben/go-sentencepiece/internal/model"
16 | "google.golang.org/protobuf/encoding/prototext"
17 | "google.golang.org/protobuf/proto"
18 | )
19 |
20 | func main() {
21 | fDumpAll := flag.Bool("dumpall", false, "dump entire model proto")
22 | fFindUni := flag.Bool("finduni", false, "find unicode runes not in pieces")
23 | fFindBytes := flag.Bool("findbytes", false, "show all byte pieces with their IDs")
24 | fEncodeFile := flag.String("encodefile", "", "file name to open and encode")
25 | flag.Parse()
26 |
27 | modelPath := os.Getenv("MODELPATH")
28 | if modelPath == "" {
29 | log.Fatal("Need MODELPATH env var to run")
30 | }
31 |
32 | b, err := ioutil.ReadFile(modelPath)
33 | if err != nil {
34 | log.Fatal(err)
35 | }
36 |
37 | var protomodel model.ModelProto
38 | err = proto.Unmarshal(b, &protomodel)
39 | if err != nil {
40 | log.Fatal(err)
41 | }
42 |
43 | if *fDumpAll {
44 | fmt.Println(prototext.Format(&protomodel))
45 | } else if *fFindBytes {
46 | for i, piece := range protomodel.GetPieces() {
47 | if piece.GetType() == model.ModelProto_SentencePiece_BYTE {
48 | fmt.Printf("%5d: %s\n", i, piece.GetPiece())
49 | }
50 | }
51 |
52 | } else if *fFindUni {
53 | pieces := make(map[string]int)
54 | for i, piece := range protomodel.GetPieces() {
55 | pieces[piece.GetPiece()] = i
56 | }
57 |
58 | for r := rune(0); r <= unicode.MaxRune; r++ {
59 | if unicode.IsPrint(r) {
60 | if _, found := pieces[string(r)]; !found {
61 | fmt.Printf("not in pieces: %U %q\n", r, string(r))
62 | }
63 | }
64 | }
65 | } else if *fEncodeFile != "" {
66 | proc, err := sentencepiece.NewProcessorFromPath(modelPath)
67 | if err != nil {
68 | log.Fatal(err)
69 | }
70 |
71 | b, err := ioutil.ReadFile(*fEncodeFile)
72 | if err != nil {
73 | log.Fatal(err)
74 | }
75 |
76 | tokens := proc.Encode(string(b))
77 | for _, t := range tokens {
78 | fmt.Println(t.ID)
79 | }
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/internal/cmd/wasm/.gitignore:
--------------------------------------------------------------------------------
1 | *.wasm
2 | embed_data
3 |
--------------------------------------------------------------------------------
/internal/cmd/wasm/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: build serve clean
2 |
3 | build:
4 | mkdir -p embed_data
5 | wget https://github.com/google/gemma_pytorch/raw/main/tokenizer/tokenizer.model -O embed_data/tokenizer.model
6 | GOOS=js GOARCH=wasm go build -o assets/gospm.wasm main.go
7 |
8 | serve:
9 | go run github.com/eliben/static-server@latest -port 8873 assets
10 |
11 | clean:
12 | rm -rf embed_data assets/gospm.wasm
13 |
--------------------------------------------------------------------------------
/internal/cmd/wasm/assets/index.html:
--------------------------------------------------------------------------------
1 |
2 |
96 |
97 |
98 |
106 |
107 |
108 |
109 |
Text
110 |
111 |
112 |
113 |
Tokens
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
SentencePiece tokenizer
128 |
129 | Enter text in the box - tokenization is done as you type.
130 |
131 |
132 |
133 | This is a SentencePiece
134 | tokenizer implemented in pure Go and compiled to WebAssembly.
135 | The vocabulary and settings are taken from the
136 | Google AI Gemma open model.
137 |
138 |
139 |
140 |
141 |
142 |
--------------------------------------------------------------------------------
/internal/cmd/wasm/assets/script.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
// Page elements used by the demo: the text input, the output area, and the
// two radio buttons that choose what the output area shows.
const TextBox = document.querySelector('#text');
const OutBox = document.querySelector('#tokens');
let radioText = document.querySelector('#showText');
let radioTokens = document.querySelector('#showTokens');

// Re-render whenever the text is edited or the display mode is switched.
TextBox.addEventListener('input', onStateChange);
for (const radio of [radioText, radioTokens]) {
  radio.addEventListener('change', onStateChange);
}

// init performs the first render by triggering a state change.
function init() {
  onStateChange();
}
17 |
18 | //------------------
19 |
// onStateChange re-renders the output area from the current text.
//
// When the "tokens" radio button is checked, the text is converted with
// textToIDs and shown as a bracketed, comma-separated list. Otherwise the
// text is converted with textToPieces and every piece is rendered as its own
// colored span (a newline piece becomes a <br>). Each conversion is timed and
// the elapsed milliseconds are logged to the console.
function onStateChange() {
  const text = TextBox.value;

  if (radioTokens.checked) {
    const t0 = performance.now();
    const ids = textToIDs(text);
    const t1 = performance.now();
    console.log("textToIDs elapsed (ms): ", t1 - t0);
    OutBox.textContent = `[${ids.join(", ")}]`;
    return;
  }

  const t0 = performance.now();
  const pieces = textToPieces(text);
  const t1 = performance.now();
  console.log("textToPieces elapsed (ms): ", t1 - t0);
  console.log(pieces);

  // Builds the DOM node for the piece at position idx. The background hue
  // cycles through 8 colors in steps of 135 degrees: since 360/8 = 45, any
  // multiple of 45 that is not also a multiple of 180 keeps neighbors far
  // apart without repeating within a cycle of 8.
  const makeNode = (idx) => {
    if (pieces[idx] === '\n') {
      return document.createElement('br');
    }
    const hue = (idx % 8) * 135;
    const span = document.createElement('span');
    span.textContent = pieces[idx];
    span.style.lineHeight = 1.5;
    span.style.backgroundColor = `hsl(${hue}, 40%, 70%)`;
    span.style.whiteSpace = 'pre';
    span.style.display = 'inline-block';
    return span;
  };

  OutBox.innerHTML = '';
  for (let idx = 0; idx < pieces.length; idx++) {
    OutBox.appendChild(makeNode(idx));
  }
}
58 |
--------------------------------------------------------------------------------
/internal/cmd/wasm/assets/wasm_exec.js:
--------------------------------------------------------------------------------
1 | // Copyright 2018 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | "use strict";
6 |
7 | (() => {
8 | const enosys = () => {
9 | const err = new Error("not implemented");
10 | err.code = "ENOSYS";
11 | return err;
12 | };
13 |
14 | if (!globalThis.fs) {
15 | let outputBuf = "";
16 | globalThis.fs = {
17 | constants: { O_WRONLY: -1, O_RDWR: -1, O_CREAT: -1, O_TRUNC: -1, O_APPEND: -1, O_EXCL: -1 }, // unused
18 | writeSync(fd, buf) {
19 | outputBuf += decoder.decode(buf);
20 | const nl = outputBuf.lastIndexOf("\n");
21 | if (nl != -1) {
22 | console.log(outputBuf.substring(0, nl));
23 | outputBuf = outputBuf.substring(nl + 1);
24 | }
25 | return buf.length;
26 | },
27 | write(fd, buf, offset, length, position, callback) {
28 | if (offset !== 0 || length !== buf.length || position !== null) {
29 | callback(enosys());
30 | return;
31 | }
32 | const n = this.writeSync(fd, buf);
33 | callback(null, n);
34 | },
35 | chmod(path, mode, callback) { callback(enosys()); },
36 | chown(path, uid, gid, callback) { callback(enosys()); },
37 | close(fd, callback) { callback(enosys()); },
38 | fchmod(fd, mode, callback) { callback(enosys()); },
39 | fchown(fd, uid, gid, callback) { callback(enosys()); },
40 | fstat(fd, callback) { callback(enosys()); },
41 | fsync(fd, callback) { callback(null); },
42 | ftruncate(fd, length, callback) { callback(enosys()); },
43 | lchown(path, uid, gid, callback) { callback(enosys()); },
44 | link(path, link, callback) { callback(enosys()); },
45 | lstat(path, callback) { callback(enosys()); },
46 | mkdir(path, perm, callback) { callback(enosys()); },
47 | open(path, flags, mode, callback) { callback(enosys()); },
48 | read(fd, buffer, offset, length, position, callback) { callback(enosys()); },
49 | readdir(path, callback) { callback(enosys()); },
50 | readlink(path, callback) { callback(enosys()); },
51 | rename(from, to, callback) { callback(enosys()); },
52 | rmdir(path, callback) { callback(enosys()); },
53 | stat(path, callback) { callback(enosys()); },
54 | symlink(path, link, callback) { callback(enosys()); },
55 | truncate(path, length, callback) { callback(enosys()); },
56 | unlink(path, callback) { callback(enosys()); },
57 | utimes(path, atime, mtime, callback) { callback(enosys()); },
58 | };
59 | }
60 |
61 | if (!globalThis.process) {
62 | globalThis.process = {
63 | getuid() { return -1; },
64 | getgid() { return -1; },
65 | geteuid() { return -1; },
66 | getegid() { return -1; },
67 | getgroups() { throw enosys(); },
68 | pid: -1,
69 | ppid: -1,
70 | umask() { throw enosys(); },
71 | cwd() { throw enosys(); },
72 | chdir() { throw enosys(); },
73 | }
74 | }
75 |
76 | if (!globalThis.crypto) {
77 | throw new Error("globalThis.crypto is not available, polyfill required (crypto.getRandomValues only)");
78 | }
79 |
80 | if (!globalThis.performance) {
81 | throw new Error("globalThis.performance is not available, polyfill required (performance.now only)");
82 | }
83 |
84 | if (!globalThis.TextEncoder) {
85 | throw new Error("globalThis.TextEncoder is not available, polyfill required");
86 | }
87 |
88 | if (!globalThis.TextDecoder) {
89 | throw new Error("globalThis.TextDecoder is not available, polyfill required");
90 | }
91 |
92 | const encoder = new TextEncoder("utf-8");
93 | const decoder = new TextDecoder("utf-8");
94 |
95 | globalThis.Go = class {
96 | constructor() {
97 | this.argv = ["js"];
98 | this.env = {};
99 | this.exit = (code) => {
100 | if (code !== 0) {
101 | console.warn("exit code:", code);
102 | }
103 | };
104 | this._exitPromise = new Promise((resolve) => {
105 | this._resolveExitPromise = resolve;
106 | });
107 | this._pendingEvent = null;
108 | this._scheduledTimeouts = new Map();
109 | this._nextCallbackTimeoutID = 1;
110 |
111 | const setInt64 = (addr, v) => {
112 | this.mem.setUint32(addr + 0, v, true);
113 | this.mem.setUint32(addr + 4, Math.floor(v / 4294967296), true);
114 | }
115 |
116 | const setInt32 = (addr, v) => {
117 | this.mem.setUint32(addr + 0, v, true);
118 | }
119 |
120 | const getInt64 = (addr) => {
121 | const low = this.mem.getUint32(addr + 0, true);
122 | const high = this.mem.getInt32(addr + 4, true);
123 | return low + high * 4294967296;
124 | }
125 |
126 | const loadValue = (addr) => {
127 | const f = this.mem.getFloat64(addr, true);
128 | if (f === 0) {
129 | return undefined;
130 | }
131 | if (!isNaN(f)) {
132 | return f;
133 | }
134 |
135 | const id = this.mem.getUint32(addr, true);
136 | return this._values[id];
137 | }
138 |
139 | const storeValue = (addr, v) => {
140 | const nanHead = 0x7FF80000;
141 |
142 | if (typeof v === "number" && v !== 0) {
143 | if (isNaN(v)) {
144 | this.mem.setUint32(addr + 4, nanHead, true);
145 | this.mem.setUint32(addr, 0, true);
146 | return;
147 | }
148 | this.mem.setFloat64(addr, v, true);
149 | return;
150 | }
151 |
152 | if (v === undefined) {
153 | this.mem.setFloat64(addr, 0, true);
154 | return;
155 | }
156 |
157 | let id = this._ids.get(v);
158 | if (id === undefined) {
159 | id = this._idPool.pop();
160 | if (id === undefined) {
161 | id = this._values.length;
162 | }
163 | this._values[id] = v;
164 | this._goRefCounts[id] = 0;
165 | this._ids.set(v, id);
166 | }
167 | this._goRefCounts[id]++;
168 | let typeFlag = 0;
169 | switch (typeof v) {
170 | case "object":
171 | if (v !== null) {
172 | typeFlag = 1;
173 | }
174 | break;
175 | case "string":
176 | typeFlag = 2;
177 | break;
178 | case "symbol":
179 | typeFlag = 3;
180 | break;
181 | case "function":
182 | typeFlag = 4;
183 | break;
184 | }
185 | this.mem.setUint32(addr + 4, nanHead | typeFlag, true);
186 | this.mem.setUint32(addr, id, true);
187 | }
188 |
189 | const loadSlice = (addr) => {
190 | const array = getInt64(addr + 0);
191 | const len = getInt64(addr + 8);
192 | return new Uint8Array(this._inst.exports.mem.buffer, array, len);
193 | }
194 |
195 | const loadSliceOfValues = (addr) => {
196 | const array = getInt64(addr + 0);
197 | const len = getInt64(addr + 8);
198 | const a = new Array(len);
199 | for (let i = 0; i < len; i++) {
200 | a[i] = loadValue(array + i * 8);
201 | }
202 | return a;
203 | }
204 |
205 | const loadString = (addr) => {
206 | const saddr = getInt64(addr + 0);
207 | const len = getInt64(addr + 8);
208 | return decoder.decode(new DataView(this._inst.exports.mem.buffer, saddr, len));
209 | }
210 |
211 | const timeOrigin = Date.now() - performance.now();
212 | this.importObject = {
213 | _gotest: {
214 | add: (a, b) => a + b,
215 | },
216 | gojs: {
217 | // Go's SP does not change as long as no Go code is running. Some operations (e.g. calls, getters and setters)
218 | // may synchronously trigger a Go event handler. This makes Go code get executed in the middle of the imported
219 | // function. A goroutine can switch to a new stack if the current stack is too small (see morestack function).
220 | // This changes the SP, thus we have to update the SP used by the imported function.
221 |
222 | // func wasmExit(code int32)
223 | "runtime.wasmExit": (sp) => {
224 | sp >>>= 0;
225 | const code = this.mem.getInt32(sp + 8, true);
226 | this.exited = true;
227 | delete this._inst;
228 | delete this._values;
229 | delete this._goRefCounts;
230 | delete this._ids;
231 | delete this._idPool;
232 | this.exit(code);
233 | },
234 |
235 | // func wasmWrite(fd uintptr, p unsafe.Pointer, n int32)
236 | "runtime.wasmWrite": (sp) => {
237 | sp >>>= 0;
238 | const fd = getInt64(sp + 8);
239 | const p = getInt64(sp + 16);
240 | const n = this.mem.getInt32(sp + 24, true);
241 | fs.writeSync(fd, new Uint8Array(this._inst.exports.mem.buffer, p, n));
242 | },
243 |
244 | // func resetMemoryDataView()
245 | "runtime.resetMemoryDataView": (sp) => {
246 | sp >>>= 0;
247 | this.mem = new DataView(this._inst.exports.mem.buffer);
248 | },
249 |
250 | // func nanotime1() int64
251 | "runtime.nanotime1": (sp) => {
252 | sp >>>= 0;
253 | setInt64(sp + 8, (timeOrigin + performance.now()) * 1000000);
254 | },
255 |
256 | // func walltime() (sec int64, nsec int32)
257 | "runtime.walltime": (sp) => {
258 | sp >>>= 0;
259 | const msec = (new Date).getTime();
260 | setInt64(sp + 8, msec / 1000);
261 | this.mem.setInt32(sp + 16, (msec % 1000) * 1000000, true);
262 | },
263 |
264 | // func scheduleTimeoutEvent(delay int64) int32
265 | "runtime.scheduleTimeoutEvent": (sp) => {
266 | sp >>>= 0;
267 | const id = this._nextCallbackTimeoutID;
268 | this._nextCallbackTimeoutID++;
269 | this._scheduledTimeouts.set(id, setTimeout(
270 | () => {
271 | this._resume();
272 | while (this._scheduledTimeouts.has(id)) {
273 | // for some reason Go failed to register the timeout event, log and try again
274 | // (temporary workaround for https://github.com/golang/go/issues/28975)
275 | console.warn("scheduleTimeoutEvent: missed timeout event");
276 | this._resume();
277 | }
278 | },
279 | getInt64(sp + 8),
280 | ));
281 | this.mem.setInt32(sp + 16, id, true);
282 | },
283 |
284 | // func clearTimeoutEvent(id int32)
285 | "runtime.clearTimeoutEvent": (sp) => {
286 | sp >>>= 0;
287 | const id = this.mem.getInt32(sp + 8, true);
288 | clearTimeout(this._scheduledTimeouts.get(id));
289 | this._scheduledTimeouts.delete(id);
290 | },
291 |
292 | // func getRandomData(r []byte)
293 | "runtime.getRandomData": (sp) => {
294 | sp >>>= 0;
295 | crypto.getRandomValues(loadSlice(sp + 8));
296 | },
297 |
298 | // func finalizeRef(v ref)
299 | "syscall/js.finalizeRef": (sp) => {
300 | sp >>>= 0;
301 | const id = this.mem.getUint32(sp + 8, true);
302 | this._goRefCounts[id]--;
303 | if (this._goRefCounts[id] === 0) {
304 | const v = this._values[id];
305 | this._values[id] = null;
306 | this._ids.delete(v);
307 | this._idPool.push(id);
308 | }
309 | },
310 |
311 | // func stringVal(value string) ref
312 | "syscall/js.stringVal": (sp) => {
313 | sp >>>= 0;
314 | storeValue(sp + 24, loadString(sp + 8));
315 | },
316 |
317 | // func valueGet(v ref, p string) ref
318 | "syscall/js.valueGet": (sp) => {
319 | sp >>>= 0;
320 | const result = Reflect.get(loadValue(sp + 8), loadString(sp + 16));
321 | sp = this._inst.exports.getsp() >>> 0; // see comment above
322 | storeValue(sp + 32, result);
323 | },
324 |
325 | // func valueSet(v ref, p string, x ref)
326 | "syscall/js.valueSet": (sp) => {
327 | sp >>>= 0;
328 | Reflect.set(loadValue(sp + 8), loadString(sp + 16), loadValue(sp + 32));
329 | },
330 |
331 | // func valueDelete(v ref, p string)
332 | "syscall/js.valueDelete": (sp) => {
333 | sp >>>= 0;
334 | Reflect.deleteProperty(loadValue(sp + 8), loadString(sp + 16));
335 | },
336 |
337 | // func valueIndex(v ref, i int) ref
338 | "syscall/js.valueIndex": (sp) => {
339 | sp >>>= 0;
340 | storeValue(sp + 24, Reflect.get(loadValue(sp + 8), getInt64(sp + 16)));
341 | },
342 |
343 | // valueSetIndex(v ref, i int, x ref)
344 | "syscall/js.valueSetIndex": (sp) => {
345 | sp >>>= 0;
346 | Reflect.set(loadValue(sp + 8), getInt64(sp + 16), loadValue(sp + 24));
347 | },
348 |
349 | // func valueCall(v ref, m string, args []ref) (ref, bool)
350 | "syscall/js.valueCall": (sp) => {
351 | sp >>>= 0;
352 | try {
353 | const v = loadValue(sp + 8);
354 | const m = Reflect.get(v, loadString(sp + 16));
355 | const args = loadSliceOfValues(sp + 32);
356 | const result = Reflect.apply(m, v, args);
357 | sp = this._inst.exports.getsp() >>> 0; // see comment above
358 | storeValue(sp + 56, result);
359 | this.mem.setUint8(sp + 64, 1);
360 | } catch (err) {
361 | sp = this._inst.exports.getsp() >>> 0; // see comment above
362 | storeValue(sp + 56, err);
363 | this.mem.setUint8(sp + 64, 0);
364 | }
365 | },
366 |
367 | // func valueInvoke(v ref, args []ref) (ref, bool)
368 | "syscall/js.valueInvoke": (sp) => {
369 | sp >>>= 0;
370 | try {
371 | const v = loadValue(sp + 8);
372 | const args = loadSliceOfValues(sp + 16);
373 | const result = Reflect.apply(v, undefined, args);
374 | sp = this._inst.exports.getsp() >>> 0; // see comment above
375 | storeValue(sp + 40, result);
376 | this.mem.setUint8(sp + 48, 1);
377 | } catch (err) {
378 | sp = this._inst.exports.getsp() >>> 0; // see comment above
379 | storeValue(sp + 40, err);
380 | this.mem.setUint8(sp + 48, 0);
381 | }
382 | },
383 |
384 | // func valueNew(v ref, args []ref) (ref, bool)
385 | "syscall/js.valueNew": (sp) => {
386 | sp >>>= 0;
387 | try {
388 | const v = loadValue(sp + 8);
389 | const args = loadSliceOfValues(sp + 16);
390 | const result = Reflect.construct(v, args);
391 | sp = this._inst.exports.getsp() >>> 0; // see comment above
392 | storeValue(sp + 40, result);
393 | this.mem.setUint8(sp + 48, 1);
394 | } catch (err) {
395 | sp = this._inst.exports.getsp() >>> 0; // see comment above
396 | storeValue(sp + 40, err);
397 | this.mem.setUint8(sp + 48, 0);
398 | }
399 | },
400 |
401 | // func valueLength(v ref) int
402 | "syscall/js.valueLength": (sp) => {
403 | sp >>>= 0;
404 | setInt64(sp + 16, parseInt(loadValue(sp + 8).length));
405 | },
406 |
407 | // valuePrepareString(v ref) (ref, int)
408 | "syscall/js.valuePrepareString": (sp) => {
409 | sp >>>= 0;
410 | const str = encoder.encode(String(loadValue(sp + 8)));
411 | storeValue(sp + 16, str);
412 | setInt64(sp + 24, str.length);
413 | },
414 |
415 | // valueLoadString(v ref, b []byte)
416 | "syscall/js.valueLoadString": (sp) => {
417 | sp >>>= 0;
418 | const str = loadValue(sp + 8);
419 | loadSlice(sp + 16).set(str);
420 | },
421 |
422 | // func valueInstanceOf(v ref, t ref) bool
423 | "syscall/js.valueInstanceOf": (sp) => {
424 | sp >>>= 0;
425 | this.mem.setUint8(sp + 24, (loadValue(sp + 8) instanceof loadValue(sp + 16)) ? 1 : 0);
426 | },
427 |
428 | // func copyBytesToGo(dst []byte, src ref) (int, bool)
429 | "syscall/js.copyBytesToGo": (sp) => {
430 | sp >>>= 0;
431 | const dst = loadSlice(sp + 8);
432 | const src = loadValue(sp + 32);
433 | if (!(src instanceof Uint8Array || src instanceof Uint8ClampedArray)) {
434 | this.mem.setUint8(sp + 48, 0);
435 | return;
436 | }
437 | const toCopy = src.subarray(0, dst.length);
438 | dst.set(toCopy);
439 | setInt64(sp + 40, toCopy.length);
440 | this.mem.setUint8(sp + 48, 1);
441 | },
442 |
443 | // func copyBytesToJS(dst ref, src []byte) (int, bool)
444 | "syscall/js.copyBytesToJS": (sp) => {
445 | sp >>>= 0;
446 | const dst = loadValue(sp + 8);
447 | const src = loadSlice(sp + 16);
448 | if (!(dst instanceof Uint8Array || dst instanceof Uint8ClampedArray)) {
449 | this.mem.setUint8(sp + 48, 0);
450 | return;
451 | }
452 | const toCopy = src.subarray(0, dst.length);
453 | dst.set(toCopy);
454 | setInt64(sp + 40, toCopy.length);
455 | this.mem.setUint8(sp + 48, 1);
456 | },
457 |
458 | "debug": (value) => {
459 | console.log(value);
460 | },
461 | }
462 | };
463 | }
464 |
465 | async run(instance) {
466 | if (!(instance instanceof WebAssembly.Instance)) {
467 | throw new Error("Go.run: WebAssembly.Instance expected");
468 | }
469 | this._inst = instance;
470 | this.mem = new DataView(this._inst.exports.mem.buffer);
471 | this._values = [ // JS values that Go currently has references to, indexed by reference id
472 | NaN,
473 | 0,
474 | null,
475 | true,
476 | false,
477 | globalThis,
478 | this,
479 | ];
480 | this._goRefCounts = new Array(this._values.length).fill(Infinity); // number of references that Go has to a JS value, indexed by reference id
481 | this._ids = new Map([ // mapping from JS values to reference ids
482 | [0, 1],
483 | [null, 2],
484 | [true, 3],
485 | [false, 4],
486 | [globalThis, 5],
487 | [this, 6],
488 | ]);
489 | this._idPool = []; // unused ids that have been garbage collected
490 | this.exited = false; // whether the Go program has exited
491 |
492 | // Pass command line arguments and environment variables to WebAssembly by writing them to the linear memory.
493 | let offset = 4096;
494 |
495 | const strPtr = (str) => {
496 | const ptr = offset;
497 | const bytes = encoder.encode(str + "\0");
498 | new Uint8Array(this.mem.buffer, offset, bytes.length).set(bytes);
499 | offset += bytes.length;
500 | if (offset % 8 !== 0) {
501 | offset += 8 - (offset % 8);
502 | }
503 | return ptr;
504 | };
505 |
506 | const argc = this.argv.length;
507 |
508 | const argvPtrs = [];
509 | this.argv.forEach((arg) => {
510 | argvPtrs.push(strPtr(arg));
511 | });
512 | argvPtrs.push(0);
513 |
514 | const keys = Object.keys(this.env).sort();
515 | keys.forEach((key) => {
516 | argvPtrs.push(strPtr(`${key}=${this.env[key]}`));
517 | });
518 | argvPtrs.push(0);
519 |
520 | const argv = offset;
521 | argvPtrs.forEach((ptr) => {
522 | this.mem.setUint32(offset, ptr, true);
523 | this.mem.setUint32(offset + 4, 0, true);
524 | offset += 8;
525 | });
526 |
527 | // The linker guarantees global data starts from at least wasmMinDataAddr.
528 | // Keep in sync with cmd/link/internal/ld/data.go:wasmMinDataAddr.
529 | const wasmMinDataAddr = 4096 + 8192;
530 | if (offset >= wasmMinDataAddr) {
531 | throw new Error("total length of command line and environment variables exceeds limit");
532 | }
533 |
534 | this._inst.exports.run(argc, argv);
535 | if (this.exited) {
536 | this._resolveExitPromise();
537 | }
538 | await this._exitPromise;
539 | }
540 |
541 | _resume() {
542 | if (this.exited) {
543 | throw new Error("Go program has already exited");
544 | }
545 | this._inst.exports.resume();
546 | if (this.exited) {
547 | this._resolveExitPromise();
548 | }
549 | }
550 |
551 | _makeFuncWrapper(id) {
552 | const go = this;
553 | return function () {
554 | const event = { id: id, this: this, args: arguments };
555 | go._pendingEvent = event;
556 | go._resume();
557 | return event.result;
558 | };
559 | }
560 | }
561 | })();
562 |
--------------------------------------------------------------------------------
/internal/cmd/wasm/main.go:
--------------------------------------------------------------------------------
1 | //go:build js && wasm
2 |
3 | // Main binary for exposing the go-sentencepiece functionality in the browser
4 | // via WASM. The required functionality is exposed via the syscall/js interface.
5 | // This module should only be built in js && wasm mode.
6 | package main
7 |
8 | import (
9 | _ "embed"
10 | "fmt"
11 | "log"
12 | "strings"
13 | "sync"
14 | "syscall/js"
15 |
16 | "github.com/eliben/go-sentencepiece"
17 | )
18 |
// modelFileData holds the contents of embed_data/tokenizer.model, compiled
// into the binary by the go:embed directive below. It is a string only as a
// byte container; main feeds it to the processor through a strings.Reader.
//
//go:embed embed_data/tokenizer.model
var modelFileData string

// spm is the tokenizer used by the JS-exported functions in this file. It is
// assigned in main before those functions are published on the JS global.
var spm *sentencepiece.Processor
22 |
23 | func main() {
24 | var once sync.Once
25 | once.Do(func() {
26 | var err error
27 | spm, err = sentencepiece.NewProcessor(strings.NewReader(modelFileData))
28 | if err != nil {
29 | log.Fatal(err)
30 | }
31 | fmt.Printf("processor loaded, vocab len=%v\n", spm.ModelInfo().VocabularySize)
32 | })
33 |
34 | js.Global().Set("textToIDs", jsTextToIDs)
35 | js.Global().Set("textToPieces", jsTextToPieces)
36 |
37 | // For the Go code to be usable from JS, the main function has to run forever.
38 | <-make(chan bool)
39 | }
40 |
41 | var jsTextToIDs = js.FuncOf(func(this js.Value, args []js.Value) interface{} {
42 | if len(args) != 1 {
43 | return "expected 1 argument: text to tokenize"
44 | }
45 | txt := args[0].String()
46 | tokens := spm.Encode(txt)
47 |
48 | jsTokens := js.Global().Get("Array").New()
49 | for _, t := range tokens {
50 | jsTokens.Call("push", js.ValueOf(t.ID))
51 | }
52 | return jsTokens
53 | })
54 |
55 | var jsTextToPieces = js.FuncOf(func(this js.Value, args []js.Value) interface{} {
56 | if len(args) != 1 {
57 | return "expected 1 argument: text to tokenize"
58 | }
59 | txt := args[0].String()
60 | tokens := spm.Encode(txt)
61 |
62 | jsTokens := js.Global().Get("Array").New()
63 | for _, t := range tokens {
64 | jsTokens.Call("push", js.ValueOf(t.Text))
65 | }
66 | return jsTokens
67 | })
68 |
--------------------------------------------------------------------------------
/internal/model/gen.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Regenerates the Go bindings for sentencepiece_model.proto in the current
# directory, then lets goimports rewrite the Go files in place.

# Fail on errors, unset variables and failed pipeline stages; echo commands.
set -euxo pipefail

# The M option maps the .proto file to Go package name "model" with an empty
# import path, so the generated code lands next to the .proto file.
protoc \
  --go_out=. \
  --go_opt="Msentencepiece_model.proto=;model" \
  sentencepiece_model.proto

goimports -w .
11 |
12 |
--------------------------------------------------------------------------------
/internal/model/sentencepiece_model.proto:
--------------------------------------------------------------------------------
1 | // Copyright 2024 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
// This schema uses proto2 semantics: the messages below rely on explicit
// `optional` fields, `[default = ...]` values and `extensions` ranges.
syntax = "proto2";

// TODO(taku): Needs to use LITE RUNTIME in OSS release.
option optimize_for = LITE_RUNTIME;

// Proto package that scopes every message declared in this file.
package sentencepiece;
21 |
// TrainerSpec encodes various parameters for SentencePiece training.
// Next id: 55
message TrainerSpec {
  ///////////////////////////////////////////////////////////////////
  // General parameters
  //
  // Input corpus files.
  //  Trainer accepts the following two formats:
  //  A) Monolingual: plain text, one sentence per line.
  //  B) Bilingual:   TSV, source sentence <tab> target sentence
  //  When bilingual data is passed, shared vocabulary model is built.
  //  Note that the input file must be raw corpus, not a preprocessed corpus.
  //  Trainer only loads the first `input_sentence_size` sentences specified
  //  with this parameter.
  repeated string input = 1;

  // Input corpus format:
  //  "text": one-sentence-per-line text format (default)
  //  "tsv":  sentence <tab> freq
  optional string input_format = 7;

  // Output model file prefix.
  // <model_prefix>.model and <model_prefix>.vocab are generated.
  optional string model_prefix = 2;

  // Model type. UNIGRAM is the default.
  enum ModelType {
    UNIGRAM = 1;  // Unigram language model with dynamic algorithm
    BPE = 2;      // Byte Pair Encoding
    WORD = 3;     // Delimited by whitespace.
    CHAR = 4;     // tokenizes into character sequence
  }
  optional ModelType model_type = 3 [default = UNIGRAM];

  // Vocabulary size. 8k is the default size.
  optional int32 vocab_size = 4 [default = 8000];

  // List of the languages this model can accept.
  // Since the model is language-agnostic, this field is used as a reference.
  repeated string accept_language = 5;

  // Size of self-test samples, which are encoded in the model file.
  optional int32 self_test_sample_size = 6 [default = 0];

  // Whether to use DP version of sentencepiece. Use it with TSV input format
  // (requires precomputed word tab counts to work).
  optional bool enable_differential_privacy = 50 [default = false];
  // Set these parameters if you need DP version of sentencepiece.
  // std of noise to add.
  optional float differential_privacy_noise_level = 51 [default = 0.0];
  // Clipping threshold to apply after adding noise. All the words with
  // frequency less than this value are dropped.
  optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];

  ///////////////////////////////////////////////////////////////////
  // Training parameters.
  //
  // Uses characters which cover the corpus with the ratio of
  // `character_coverage`. This parameter determines the set of basic
  // Alphabet of sentence piece.
  // 1.0 - `character_coverage` characters are treated as UNK.
  // See also required_chars field.
  optional float character_coverage = 10 [default = 0.9995];

  // Maximum size of sentences the trainer loads from `input` parameter.
  // Trainer simply loads the `input` files in sequence.
  // It is better to shuffle the input corpus randomly.
  optional uint64 input_sentence_size = 11 [default = 0];
  optional bool shuffle_input_sentence = 19 [default = true];

  // Maximum size of sentences to make seed sentence pieces.
  // Extended suffix array is constructed to extract frequent
  // sub-strings from the corpus. This uses 20N working space,
  // where N is the size of corpus.
  optional int32 mining_sentence_size = 12 [deprecated = true];

  // Maximum size of sentences to train sentence pieces.
  optional int32 training_sentence_size = 13 [deprecated = true];

  // The size of seed sentencepieces.
  // `seed_sentencepiece_size` must be larger than `vocab_size`.
  optional int32 seed_sentencepiece_size = 14 [default = 1000000];

  // In every EM sub-iterations, keeps top
  // `shrinking_factor` * `current sentencepieces size` with respect to
  // the loss of the sentence piece. This value should be smaller than 1.0.
  optional float shrinking_factor = 15 [default = 0.75];

  // The maximum sentence length in byte. The sentences with the length
  // larger than `max_sentence_length` are simply ignored.
  // Longer input tends to bring the following risks:
  //  * Overflow during EM training (unigram language model only)
  //  * Performance drop because of O(n log n) cost in BPE.
  optional int32 max_sentence_length = 18 [default = 4192];

  // Number of threads in the training.
  optional int32 num_threads = 16 [default = 16];

  // Number of EM sub iterations.
  optional int32 num_sub_iterations = 17 [default = 2];

  ///////////////////////////////////////////////////////////////////
  // SentencePiece parameters which control the shapes of sentence piece.
  //
  // Maximum length of sentencepiece.
  optional int32 max_sentencepiece_length = 20 [default = 16];

  // Uses Unicode script to split sentence pieces.
  // When `split_by_unicode_script` is true, we do not allow sentence piece to
  // include multiple Unicode scripts, e.g. "F1" is not a valid piece.
  // Exception: CJ characters (Hiragana/Katakana/Han) are all handled
  // as one script type, since Japanese word can consist of multiple scripts.
  // This exception is always applied regardless of the accept-language
  // parameter.
  optional bool split_by_unicode_script = 21 [default = true];

  // When `split_by_number` is true, put a boundary between number and
  // non-number transition. If we want to treat "F1" as one token, set this
  // flag to be false.
  optional bool split_by_number = 23 [default = true];

  // Use a white space to split sentence pieces.
  // When `split_by_whitespace` is false, we may have the piece containing
  // a white space in the middle. e.g., "in_the".
  optional bool split_by_whitespace = 22 [default = true];

  // Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello =>
  // hello_. When `treat_whitespace_as_suffix` is true,
  // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
  // of sentence.
  optional bool treat_whitespace_as_suffix = 24 [default = false];

  // Allows pieces that only contain whitespaces instead of appearing only as
  // prefix or suffix of other pieces.
  optional bool allow_whitespace_only_pieces = 26 [default = false];

  // Split all digits (0-9) into separate pieces.
  optional bool split_digits = 25 [default = false];

  // Defines the pre-tokenization delimiter.
  // When specified, no piece crossing this delimiter is included
  // in the vocab. Then the delimiter string is virtually ignored
  // during the training. This field allows constraints on the vocabulary
  // selection. Note that this field is available in unigram mode.
  optional string pretokenization_delimiter = 53 [default = ""];

  ///////////////////////////////////////////////////////////////////
  // Vocabulary management
  //
  // Defines control symbols used as an indicator to
  // change the behavior of the decoder. <s> and </s> are pre-defined.
  // We can use this field to encode various meta information,
  // including language indicator in multilingual model.
  // These symbols are not visible to users, but visible to
  // the decoder. Note that when the input sentence contains control symbols,
  // they are not treated as one token, but segmented into normal pieces.
  // Control symbols must be inserted independently from the segmentation.
  repeated string control_symbols = 30;

  // Defines user defined symbols.
  // These symbols are added with extremely high score
  // so they are always treated as one unique symbol in any context.
  // Typical usage of user_defined_symbols is placeholder for named entities.
  repeated string user_defined_symbols = 31;

  // Defines required characters. Each UTF8 character in this string is included
  // in the character set regardless of character_coverage value. Unlike
  // user_defined_symbols, these characters have scores based on the frequency
  // on input sentences, and the model can form subwords using characters
  // in this field.
  optional string required_chars = 36;

  // Decomposes unknown pieces into UTF-8 bytes.
  optional bool byte_fallback = 35 [default = false];

  // When creating the vocabulary file, defines whether or not to additionally
  // output the score for each piece.
  optional bool vocabulary_output_piece_score = 32 [default = true];

  // `vocab_size` is treated as hard limit. Crash if
  // the model can not produce the vocab of size `vocab_size`,
  // When `hard_vocab_limit` is false, vocab_size is treated
  // as soft limit. Note that when model_type=char,
  // always assumes hard_vocab_limit = false.
  optional bool hard_vocab_limit = 33 [default = true];

  // use all symbols for vocab extraction. This flag is valid
  // if model type is either CHAR or WORD
  optional bool use_all_vocab = 34 [default = false];

  ///////////////////////////////////////////////////////////////////
  // Reserved special meta tokens.
  // * -1 is not used.
  // * unk_id must not be -1.
  // Ids must start with 0 and be contiguous.
  optional int32 unk_id = 40 [default = 0];   // <unk>
  optional int32 bos_id = 41 [default = 1];   // <s>
  optional int32 eos_id = 42 [default = 2];   // </s>
  optional int32 pad_id = 43 [default = -1];  // <pad> (padding)
  // Surface strings of the reserved pieces above.
  optional string unk_piece = 45 [default = "<unk>"];
  optional string bos_piece = 46 [default = "<s>"];
  optional string eos_piece = 47 [default = "</s>"];
  optional string pad_piece = 48 [default = "<pad>"];

  // Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
  // since this character can be useful both for user and
  // developer. We can easily figure out that <unk> is emitted.
  optional string unk_surface = 44 [default = " \xE2\x81\x87 "];

  // Increase bit depth to allow unigram model training on large
  // (>10M sentences) corpora. A Side-effect of enabling this flag
  // is increased memory usage.
  optional bool train_extremely_large_corpus = 49 [default = false];

  // Path to a seed sentencepieces file, with one tab-separated
  // seed sentencepiece <tab> frequency per line.
  optional string seed_sentencepieces_file = 54 [default = ""];

  // Customized extensions: the range of field numbers
  // are open to third-party extensions.
  extensions 200 to max;
}
243 |
// NormalizerSpec encodes various parameters for string normalization.
message NormalizerSpec {
  // name of normalization rule.
  optional string name = 1;

  // Pre-compiled normalization rule created by
  // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
  // Usually this field is set by Builder::GetNormalizerSpec() method.
  optional bytes precompiled_charsmap = 2;

  // Adds dummy whitespace at the beginning of text in order to
  // treat "world" in "world" and "hello world" in the same way.
  optional bool add_dummy_prefix = 3 [default = true];

  // Removes leading, trailing, and duplicate internal whitespace.
  optional bool remove_extra_whitespaces = 4 [default = true];

  // Replaces whitespace with meta symbol.
  // This field must be true to train sentence piece model.
  optional bool escape_whitespaces = 5 [default = true];

  // Custom normalization rule file in TSV format.
  // https://github.com/google/sentencepiece/blob/master/doc/normalization.md
  // This field is only used in SentencePieceTrainer::Train() method, which
  // compiles the rule into the binary rule stored in `precompiled_charsmap`.
  optional string normalization_rule_tsv = 6;

  // Customized extensions: the range of field numbers
  // are open to third-party extensions.
  extensions 200 to max;
}
275 |
// Proto to store samples for self-testing.
message SelfTestData {
  // Sample pairs one input string with its expected result string.
  message Sample {
    optional string input = 1;
    optional string expected = 2;
  }
  // All self-test samples carried by this message.
  repeated Sample samples = 1;

  // Customized extensions: the range of field numbers
  // are open to third-party extensions.
  extensions 200 to max;
}
288 |
// ModelProto stores model parameters.
// SentencePieceProcessor is supposed to be self-contained.
// All settings/parameters which may change the behavior must be encoded
// in ModelProto.
message ModelProto {
  // SentencePiece is one vocabulary entry: its text, score and type.
  message SentencePiece {
    enum Type {
      NORMAL = 1;        // normal symbol
      UNKNOWN = 2;       // unknown symbol. only <unk> for now.
      CONTROL = 3;       // control symbols. </s>, <s>, <2ja> etc.
      USER_DEFINED = 4;  // user defined symbols.
                         // Typical usage of USER_DEFINED symbol
                         // is placeholder.
      BYTE = 6;          // byte symbols. Used when `byte_fallback` is true.
      UNUSED = 5;        // this piece is not used.
    }
    optional string piece = 1;  // piece must not be empty.
    optional float score = 2;
    optional Type type = 3 [default = NORMAL];

    // Customized extensions: the range of field numbers
    // are open to third-party extensions.
    extensions 200 to max;
  }

  // Sentence pieces with scores.
  repeated SentencePiece pieces = 1;

  // Spec used to generate this model file.
  optional TrainerSpec trainer_spec = 2;

  // Spec for text normalization.
  optional NormalizerSpec normalizer_spec = 3;

  // Stores sample input and its expected segmentation to verify the model.
  optional SelfTestData self_test_data = 4;

  // Spec for text de-normalization.
  optional NormalizerSpec denormalizer_spec = 5;

  // Customized extensions: the range of field numbers
  // are open to third-party extensions.
  extensions 200 to max;
}
333 |
--------------------------------------------------------------------------------
/internal/prefixmatcher/prefixmatcher.go:
--------------------------------------------------------------------------------
1 | package prefixmatcher
2 |
3 | import (
4 | "unicode/utf8"
5 | )
6 |
// PrefixMatcher helps find longest prefixes. See [PrefixMatcher.FindPrefixLen].
type PrefixMatcher struct {
	root *trieNode
}

// trieNode is one node of the rune-keyed trie behind a PrefixMatcher.
type trieNode struct {
	// children maps the next rune to the node reached by consuming it.
	children map[rune]*trieNode
	// final reports whether the path from the root to this node spells a
	// complete vocabulary word.
	final bool
}

// NewFromSet creates a new [PrefixMatcher] from a set of strings that
// represent the vocabulary. Every key of vocab is added; the bool values are
// not consulted.
func NewFromSet(vocab map[string]bool) *PrefixMatcher {
	pm := &PrefixMatcher{root: newNode()}
	for word := range vocab {
		pm.add(word)
	}
	return pm
}

// FindPrefixLen finds the longest prefix of text that matches a vocabulary
// word, and returns its length in bytes. If 0 is returned, no prefix was
// found. The result never exceeds len(text).
func (pm *PrefixMatcher) FindPrefixLen(text string) int {
	node := pm.root
	maxLen := 0

	for i := 0; i < len(text); {
		// Decode explicitly instead of ranging over text: for an invalid
		// byte the decoder yields utf8.RuneError having consumed 1 byte,
		// whereas utf8.RuneLen(utf8.RuneError) is 3. Using the consumed
		// size keeps the returned length within text.
		r, size := utf8.DecodeRuneInString(text[i:])
		child := node.children[r]
		if child == nil {
			// r not found in this node, so we're done.
			return maxLen
		}
		i += size
		if child.final {
			maxLen = i
		}
		node = child
	}

	return maxLen
}

// add inserts word into the trie, creating nodes as needed, and marks the
// node reached by its last rune as final.
func (pm *PrefixMatcher) add(word string) {
	node := pm.root

	for _, r := range word {
		child := node.children[r]
		if child == nil {
			child = newNode()
			node.children[r] = child
		}
		node = child
	}

	node.final = true
}

// newNode returns a non-final trie node with an empty, writable children map.
func newNode() *trieNode {
	return &trieNode{
		children: make(map[rune]*trieNode),
		final:    false,
	}
}
69 |
--------------------------------------------------------------------------------
/internal/prefixmatcher/prefixmatcher_test.go:
--------------------------------------------------------------------------------
1 | package prefixmatcher
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 | )
7 |
8 | func dumpNode(n *trieNode, prefix string) string {
9 | var s string
10 | if n.final {
11 | s = fmt.Sprintf("%sfinal\n", prefix)
12 | }
13 | for r, c := range n.children {
14 | s += fmt.Sprintf("%s%q ->\n%s", prefix, r, dumpNode(c, prefix+" "))
15 | }
16 | return s
17 | }
18 |
19 | func TestSmallVocab(t *testing.T) {
20 | vocab := map[string]bool{
21 | "ham": true,
22 | "yefet": true,
23 | "hamat": true,
24 | "hamela": true,
25 | "世界": true,
26 |
27 | "▁▁": true,
28 | "▁▁▁": true,
29 | "▁▁▁▁": true,
30 | "▁▁▁▁▁": true,
31 | "▁▁▁▁▁▁": true,
32 | }
33 | pm := NewFromSet(vocab)
34 |
35 | var tests = []struct {
36 | text string
37 | wantLen int
38 | }{
39 | {"zyx", 0},
40 | {"ham", 3},
41 | {"hama", 3},
42 | {"zham", 0},
43 | {"hame", 3},
44 | {"hamy", 3},
45 | {"hamat", 5},
46 | {"hamatar", 5},
47 | {"hamela", 6},
48 | {"hamelar", 6},
49 | {"y", 0},
50 | {"ye", 0},
51 | {"yefet", 5},
52 | {"yefeton", 5},
53 | {"世界", 6},
54 | {"世", 0},
55 | {"世p", 0},
56 | {"世界foo", 6},
57 | {"▁", 0},
58 | {"▁▁", 6},
59 | {"▁▁▁", 9},
60 | {"▁▁▁▁", 12},
61 | {"▁▁▁▁▁", 15},
62 | {"▁▁▁▁▁▁", 18},
63 | {"▁▁▁▁▁▁▁", 18},
64 | {"▁▁▁▁▁▁p", 18},
65 | }
66 |
67 | for _, tt := range tests {
68 | t.Run(tt.text, func(t *testing.T) {
69 | gotLen := pm.FindPrefixLen(tt.text)
70 | if gotLen != tt.wantLen {
71 | t.Errorf("got %v, want %v", gotLen, tt.wantLen)
72 | }
73 | })
74 | }
75 | }
76 |
77 | func TestSingleAndDoubleLetter(t *testing.T) {
78 | vocab := make(map[string]bool)
79 |
80 | for r1 := 'a'; r1 <= 'z'; r1++ {
81 | vocab[string(r1)] = true
82 |
83 | for r2 := 'a'; r2 <= 'z'; r2++ {
84 | vocab[string(r1)+string(r2)] = true
85 | }
86 | }
87 |
88 | pm := NewFromSet(vocab)
89 |
90 | assertLen := func(text string, wantLen int) {
91 | t.Helper()
92 | gotLen := pm.FindPrefixLen(text)
93 | if gotLen != wantLen {
94 | t.Errorf("got %v, want %v", gotLen, wantLen)
95 | }
96 | }
97 |
98 | for r1 := 'a'; r1 <= 'z'; r1++ {
99 | assertLen(string(r1), 1)
100 | for r2 := 'a'; r2 <= 'z'; r2++ {
101 | assertLen(string(r1)+string(r2), 2)
102 | for r3 := 'a'; r3 <= 'z'; r3++ {
103 | assertLen(string(r1)+string(r2)+string(r3), 2)
104 | }
105 | }
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/internal/priorityqueue/priorityqueue.go:
--------------------------------------------------------------------------------
1 | // Package priorityqueue provides a generic priority queue with Insert,
2 | // PopMax, and RemoveFunc operations.
3 | package priorityqueue
4 |
// PriorityQueue is a generic priority queue with a configurable comparison
// function.
type PriorityQueue[T any] struct {
	cmp func(a, b T) int

	// items holds the queue's items as a binary heap.
	// items[0] is a dummy element that's not used. If the queue has N elements,
	// they are stored at indices 1...N (N == len(items)-1)
	// For an element at index i, its parent is at index i/2, and its children
	// are at indices 2i and 2i+1. The root of the heap is at index 1.
	items []T
}

// New creates a new PriorityQueue, configured with a function that
// compares the priorities of two items a and b; it should return a number > 0
// if the priority of a is higher, 0 if the priorities are equal, and a
// number < 0 otherwise.
// sizeHint sets the initial capacity of the queue; -1 means to use the default.
func New[T any](sizeHint int, cmp func(a, b T) int) *PriorityQueue[T] {
	// One extra slot is reserved for the unused dummy element at index 0.
	return &PriorityQueue[T]{cmp: cmp, items: make([]T, 1, max(1, sizeHint+1))}
}

// Len returns the length (number of items) of the priority queue.
func (pq *PriorityQueue[T]) Len() int {
	return len(pq.items) - 1
}

// Insert inserts a new element into the priority queue.
func (pq *PriorityQueue[T]) Insert(elem T) {
	pq.items = append(pq.items, elem)
	pq.siftup(len(pq.items) - 1)
}

// PopMax returns the element with the maximal priority in the queue, and
// removes it from the queue. Warning: to maintain a clean API, PopMax panics
// if the queue is empty. Make sure to check Len() first.
func (pq *PriorityQueue[T]) PopMax() T {
	if len(pq.items) < 2 {
		panic("popping from empty priority queue")
	}
	last := len(pq.items) - 1
	maxItem := pq.items[1]
	pq.items[1] = pq.items[last]

	// Zero the vacated slot before shrinking, so the backing array doesn't
	// keep a stale copy of an element alive (RemoveFunc does the same).
	var zero T
	pq.items[last] = zero
	pq.items = pq.items[:last]

	pq.siftdown(1)
	return maxItem
}

// RemoveFunc removes all elements for which rm returns true.
func (pq *PriorityQueue[T]) RemoveFunc(rm func(T) bool) {
	// This is effectively slices.DeleteFunc, but inlined because we start from index 1.
	i := 1
	for ; i < len(pq.items); i++ {
		if rm(pq.items[i]) {
			break
		}
	}
	if i == len(pq.items) {
		return // nothing to remove
	}
	// i is the first removed slot; compact the surviving elements over it.
	for j := i + 1; j < len(pq.items); j++ {
		if v := pq.items[j]; !rm(v) {
			pq.items[i] = v
			i++
		}
	}
	// Clear the tail.
	clear(pq.items[i:])
	pq.items = pq.items[:i]
	pq.rebuildHeap()
}

// rebuildHeap rebuilds the entire heap from scratch.
func (pq *PriorityQueue[T]) rebuildHeap() {
	// Sift down every node that can have children, from the bottom up.
	for i := len(pq.items) / 2; i >= 1; i-- {
		pq.siftdown(i)
	}
}

// siftup moves the element at index n up the heap until its parent has a
// priority at least as high, or it reaches the root.
func (pq *PriorityQueue[T]) siftup(n int) {
	i := n
	for {
		if i == 1 {
			// Reached root, we're done.
			return
		}
		// p is the index of i's parent
		// if p parent has a higher priority than i, we're done.
		p := i / 2
		if pq.cmp(pq.items[p], pq.items[i]) >= 0 {
			return
		}
		pq.items[i], pq.items[p] = pq.items[p], pq.items[i]
		i = p
	}
}

// siftdown moves the element at index i down the heap, swapping it with its
// highest-priority child until neither child has a higher priority.
func (pq *PriorityQueue[T]) siftdown(i int) {
	for {
		c := 2 * i
		if c >= len(pq.items) {
			return
		}
		// c is not out of bounds, so it's the index of the left child of i

		// Figure out the child index with the maximal priority
		maxChild := c
		if c+1 < len(pq.items) {
			// c+1 is not out of bounds, so it's the index of the right child of i
			if pq.cmp(pq.items[c+1], pq.items[c]) > 0 {
				maxChild = c + 1
			}
		}
		if pq.cmp(pq.items[i], pq.items[maxChild]) >= 0 {
			// i has higher priority than either child, so we're done.
			return
		}

		pq.items[i], pq.items[maxChild] = pq.items[maxChild], pq.items[i]
		i = maxChild
	}
}
126 |
--------------------------------------------------------------------------------
/internal/priorityqueue/priorityqueue_test.go:
--------------------------------------------------------------------------------
1 | package priorityqueue
2 |
3 | import (
4 | "math/rand"
5 | "slices"
6 | "testing"
7 | )
8 |
9 | func TestBasicQueueWithStrings(t *testing.T) {
10 | stringLenCmp := func(a, b string) int {
11 | return len(a) - len(b)
12 | }
13 |
14 | pq := New(-1, stringLenCmp)
15 |
16 | assertPopAndSize := func(s string, n int) {
17 | t.Helper()
18 | got := pq.PopMax()
19 | if got != s {
20 | t.Errorf("got %v, want %v", got, s)
21 | }
22 | if n != pq.Len() {
23 | t.Errorf("got len=%v, want %v", pq.Len(), n)
24 | }
25 | }
26 |
27 | pq.Insert("one")
28 | pq.Insert("four")
29 | pq.Insert("sixteen")
30 | pq.Insert("un")
31 |
32 | // Pop all elements in max order
33 | assertPopAndSize("sixteen", 3)
34 | assertPopAndSize("four", 2)
35 | assertPopAndSize("one", 1)
36 | assertPopAndSize("un", 0)
37 |
38 | // Insert+pop, insert+pop...
39 | pq.Insert("xyz")
40 | assertPopAndSize("xyz", 0)
41 | pq.Insert("foobarbaz")
42 | assertPopAndSize("foobarbaz", 0)
43 | pq.Insert("1")
44 | assertPopAndSize("1", 0)
45 |
46 | // Inserts after popping some
47 | pq.Insert("mercury")
48 | pq.Insert("venus")
49 | assertPopAndSize("mercury", 1)
50 | pq.Insert("jupiter")
51 | assertPopAndSize("jupiter", 1)
52 | pq.Insert("moon")
53 | assertPopAndSize("venus", 1)
54 | assertPopAndSize("moon", 0)
55 |
56 | // Insert two, pop 1, a few times
57 | pq.Insert("mercury")
58 | pq.Insert("venus")
59 | assertPopAndSize("mercury", 1)
60 | pq.Insert("mars")
61 | pq.Insert("jupiter")
62 | assertPopAndSize("jupiter", 2) // contains: venus, mars
63 | pq.Insert("ganimede")
64 | pq.Insert("europa")
65 | assertPopAndSize("ganimede", 3) // contains: venus, mars, europa
66 | pq.Insert("enceladus")
67 | pq.Insert("io")
68 | assertPopAndSize("enceladus", 4)
69 | assertPopAndSize("europa", 3)
70 | assertPopAndSize("venus", 2)
71 | assertPopAndSize("mars", 1)
72 | assertPopAndSize("io", 0)
73 |
74 | // Insert these words in random orders; they should still all pop in the
75 | // expected order by length.
76 | words := []string{"z", "xy", "uvw", "post", "dworb"}
77 | for i := 0; i < 100; i++ {
78 | w := slices.Clone(words)
79 | rand.Shuffle(len(w), func(i, j int) {
80 | w[i], w[j] = w[j], w[i]
81 | })
82 |
83 | for _, word := range w {
84 | pq.Insert(word)
85 | }
86 |
87 | assertPopAndSize("dworb", 4)
88 | assertPopAndSize("post", 3)
89 | assertPopAndSize("uvw", 2)
90 | assertPopAndSize("xy", 1)
91 | assertPopAndSize("z", 0)
92 | }
93 | }
94 |
95 | func TestBasicQueueWithCustomType(t *testing.T) {
96 | type Item struct {
97 | Name string
98 | Cost int
99 | }
100 |
101 | itemCostCmp := func(a, b Item) int {
102 | return a.Cost - b.Cost
103 | }
104 |
105 | pq := New(-1, itemCostCmp)
106 |
107 | assertPop := func(s string) {
108 | t.Helper()
109 | got := pq.PopMax()
110 | if got.Name != s {
111 | t.Errorf("got %v, want %v", got.Name, s)
112 | }
113 | }
114 |
115 | // Push in decreasing cost order
116 | pq.Insert(Item{"joe", 20})
117 | pq.Insert(Item{"maxm", 3})
118 | pq.Insert(Item{"jabbar", 1})
119 | assertPop("joe")
120 | assertPop("maxm")
121 | assertPop("jabbar")
122 |
123 | // Push in increasing cost order
124 | pq.Insert(Item{"x", 1})
125 | pq.Insert(Item{"y", 29})
126 | pq.Insert(Item{"z", 88})
127 | assertPop("z")
128 | assertPop("y")
129 | assertPop("x")
130 | }
131 |
--------------------------------------------------------------------------------
/normalize.go:
--------------------------------------------------------------------------------
1 | package sentencepiece
2 |
3 | import "strings"
4 |
// whitespaceSeparator is the symbol the model uses in place of a space.
const whitespaceSeparator = "▁"

// normalize prepares input text for tokenization.
//
// SentencePiece supports configurable unicode normalization, dummy whitespace
// prefixes and whitespace trimming. The model we're working with uses none of
// these, so the only step performed here is swapping spaces for the model's
// whitespace separator. The other options can be added in the future if
// needed.
func normalize(text string) string {
	normalized := replaceSpacesBySeparator(text)
	return normalized
}

// replaceSpacesBySeparator returns text with every space replaced by the
// whitespace separator used by the model.
func replaceSpacesBySeparator(text string) string {
	const space = " "
	return strings.ReplaceAll(text, space, whitespaceSeparator)
}

// replaceSeparatorsBySpace is the inverse of replaceSpacesBySeparator: it
// returns text with every whitespace separator replaced by a space.
func replaceSeparatorsBySpace(text string) string {
	const space = " "
	return strings.ReplaceAll(text, whitespaceSeparator, space)
}
29 |
--------------------------------------------------------------------------------
/processor.go:
--------------------------------------------------------------------------------
1 | package sentencepiece
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "os"
7 | "strconv"
8 | "strings"
9 | "unicode/utf8"
10 |
11 | "github.com/eliben/go-sentencepiece/internal/model"
12 | "github.com/eliben/go-sentencepiece/internal/prefixmatcher"
13 | "github.com/eliben/go-sentencepiece/internal/priorityqueue"
14 | "google.golang.org/protobuf/proto"
15 | )
16 |
// debugEncode, when true, makes Encode print its symbol list to stdout.
const debugEncode = false
18 |
// Processor represents a SentencePiece processor (tokenizer).
// A Processor converts input text into a sequence of tokens LLMs use, and back.
// The mapping between token IDs and the text they represent is read from the
// model proto (provided to the constructor); it's the same between all calls
// to the Encode method.
//
// The term "processor" comes from the original C++ SentencePiece library and
// its Python bindings.
type Processor struct {
	// model is the parsed model proto this processor was created from.
	model *model.ModelProto

	// pieces maps the text of NORMAL, USER_DEFINED and UNUSED pieces to their
	// IDs (their index in the model's piece list). reserved does the same for
	// pieces of every other type.
	pieces   map[string]int
	reserved map[string]int

	// unknownID is the token identifier of the UNKNOWN piece
	unknownID int

	// userDefinedMatcher is a prefix matcher for symbols that are of
	// "user-defined" type in the model proto.
	userDefinedMatcher *prefixmatcher.PrefixMatcher

	// byte2Token is a cache of byte values and the tokens they represent
	byte2Token map[byte]Token

	// idToByte maps IDs to byte values they represent
	idToByte map[int]byte

	// maxPieceLength is the maximum length (in bytes) of a piece in the model.
	// This is used to preallocate a buffer for merging symbols.
	maxPieceLength int
}
50 |
51 | // NewProcessorFromPath creates a new Processor from a file path to the protobuf
52 | // data.
53 | func NewProcessorFromPath(protoFile string) (*Processor, error) {
54 | f, err := os.Open(protoFile)
55 | if err != nil {
56 | return nil, fmt.Errorf("unable to read %q: %v", protoFile, err)
57 | }
58 | defer f.Close()
59 | return NewProcessor(f)
60 | }
61 |
62 | // NewProcessor creates a new Processor from a reader with the protobuf data.
63 | func NewProcessor(protoReader io.Reader) (*Processor, error) {
64 | b, err := io.ReadAll(protoReader)
65 | if err != nil {
66 | return nil, fmt.Errorf("unable to read protobuf data: %v", err)
67 | }
68 |
69 | var mp model.ModelProto
70 | err = proto.Unmarshal(b, &mp)
71 | if err != nil {
72 | return nil, fmt.Errorf("unable to unmarshal protobuf: %v", err)
73 | }
74 |
75 | tspec := mp.GetTrainerSpec()
76 | if tspec.GetModelType() != model.TrainerSpec_BPE {
77 | return nil, fmt.Errorf("model type %s not supported", tspec.GetModelType())
78 | }
79 |
80 | nspec := mp.GetNormalizerSpec()
81 | if *nspec.AddDummyPrefix || *nspec.RemoveExtraWhitespaces {
82 | return nil, fmt.Errorf("normalizer spec options not supported: %s", nspec)
83 | }
84 |
85 | userDefined := make(map[string]bool)
86 | pieces := make(map[string]int)
87 | reserved := make(map[string]int)
88 | byte2Token := make(map[byte]Token)
89 | idToByte := make(map[int]byte)
90 | unkID := -1
91 | maxPieceLength := 0
92 |
93 | for i, piece := range mp.GetPieces() {
94 | isNormalPiece := (piece.GetType() == model.ModelProto_SentencePiece_NORMAL ||
95 | piece.GetType() == model.ModelProto_SentencePiece_USER_DEFINED ||
96 | piece.GetType() == model.ModelProto_SentencePiece_UNUSED)
97 |
98 | if isNormalPiece {
99 | pieces[piece.GetPiece()] = i
100 | maxPieceLength = max(maxPieceLength, len(piece.GetPiece()))
101 | } else {
102 | reserved[piece.GetPiece()] = i
103 | }
104 |
105 | if piece.GetType() == model.ModelProto_SentencePiece_USER_DEFINED {
106 | userDefined[piece.GetPiece()] = true
107 | } else if piece.GetType() == model.ModelProto_SentencePiece_UNKNOWN {
108 | if unkID > 0 {
109 | return nil, fmt.Errorf("unk redefined")
110 | }
111 | unkID = i
112 | } else if piece.GetType() == model.ModelProto_SentencePiece_BYTE {
113 | if !tspec.GetByteFallback() {
114 | return nil, fmt.Errorf("byte piece %q is found although `byte_fallback=false`", piece.GetPiece())
115 | }
116 | bv := convertHexValue(piece.GetPiece())
117 | if bv >= 0 && bv < 256 {
118 | byte2Token[byte(bv)] = Token{ID: i, Text: piece.GetPiece()}
119 | idToByte[i] = byte(bv)
120 | }
121 | }
122 | }
123 |
124 | if unkID < 0 {
125 | return nil, fmt.Errorf("unk symbol is not defined")
126 | }
127 |
128 | // In case byte_fallback is specified, make sure that all 256 possible byte
129 | // values were found.
130 | if tspec.GetByteFallback() {
131 | for i := 0; i < 256; i++ {
132 | if _, found := byte2Token[byte(i)]; !found {
133 | return nil, fmt.Errorf("byte value 0x%02X not found", i)
134 | }
135 | }
136 | }
137 |
138 | return &Processor{
139 | model: &mp,
140 | userDefinedMatcher: prefixmatcher.NewFromSet(userDefined),
141 | byte2Token: byte2Token,
142 | idToByte: idToByte,
143 | unknownID: unkID,
144 | pieces: pieces,
145 | reserved: reserved,
146 | maxPieceLength: maxPieceLength,
147 | }, nil
148 | }
149 |
150 | // Encode tokenizes the input text and returns a list of Tokens.
151 | func (proc *Processor) Encode(text string) []Token {
152 | text = normalize(text)
153 |
154 | // We begin by having each symbol a single Unicode character (or a
155 | // user-defined string), and will iteratively merge them into larger and
156 | // larger symbols until we have the final list of tokens.
157 | // Since this list of symbols changes a lot, we represent it as a
158 | // doubly-linked list in the symList slice. Each element in this slice has
159 | // prev/next links to the next "live" symbol in the list; noMerge means this
160 | // is a user-defined symbol we're not allowed to merge with neighbors.
161 | // After the algorithm is finished, many elements in symList will be "dead"
162 | // (unreachable by next/prev links from the first element).
163 | // This representation is inspired by the implementation of bpe::Model
164 | // in the SentencePiece C++ library.
165 |
166 | type symListElem struct {
167 | prev, next int
168 | noMerge bool
169 | symbol string
170 | }
171 | symList := make([]symListElem, 0, len(text))
172 |
173 | for {
174 | // Match the next symbol in text
175 | slen, found := proc.symbolMatch(text)
176 |
177 | // Append a list element for this symbol; note that this element will be
178 | // at index len(symList), so prev/next are set up accordingly.
179 | sym := symListElem{
180 | noMerge: found,
181 | symbol: text[:slen],
182 | prev: len(symList) - 1,
183 | next: len(symList) + 1,
184 | }
185 | symList = append(symList, sym)
186 |
187 | // Advance the text slice to the next symbol; if no more text, we're done.
188 | text = text[slen:]
189 | if len(text) == 0 {
190 | break
191 | }
192 | }
193 |
194 | if len(symList) == 0 {
195 | return nil
196 | }
197 | symList[len(symList)-1].next = -1
198 | nTokens := len(symList)
199 |
200 | debugShowSymList := func(prefix string) {
201 | if debugEncode {
202 | fmt.Println(prefix)
203 | for i, elem := range symList {
204 | fmt.Printf("[%3d]: [prev: %3v, next: %3d, noMerge: %v] %q\n", i, elem.prev, elem.next, elem.noMerge, elem.symbol)
205 | }
206 | }
207 | }
208 | debugShowSymList("initial")
209 |
210 | // To avoid repeating work, we manage a priority queue of "merge candidates".
211 | // Each candidate has pointers to the symList list for the left and right
212 | // symbol in the pair, as well as the combined symbol's score.
213 | // The priority of merging is determined by this score, with position as
214 | // the tie-breaker (earlier pairs are preferred).
215 | type mergeCandidate struct {
216 | left, right int
217 | length int
218 | score float32
219 | }
220 |
221 | mergeQueue := priorityqueue.New(len(symList), func(a, b mergeCandidate) int {
222 | if a.score > b.score || (a.score == b.score && a.left < b.left) {
223 | return 1
224 | }
225 | return -1
226 | })
227 |
228 | // findMerged looks for x+y in the vocabulary, and returns the
229 | // merged piece, its ID and true if found. buf is a reusable buffer used to
230 | // merge two strings together without allocations.
231 | buf := make([]byte, proc.maxPieceLength)
232 | findMerged := func(x, y symListElem) (string, int, bool) {
233 | buf = buf[:len(x.symbol)+len(y.symbol)]
234 | copy(buf, x.symbol)
235 | copy(buf[len(x.symbol):], y.symbol)
236 | if id, found := proc.pieces[string(buf)]; found {
237 | return proc.model.GetPieces()[id].GetPiece(), id, true
238 | }
239 | return "", 0, false
240 | }
241 |
242 | // suggestNewMergePair is called to potentially add a new mergeCandidate to
243 | // mergeQueue. The candidate is added if it's valid, both its parts are
244 | // allowed to merge, and it appears in the vocabulary.
245 | suggestNewMergePair := func(left, right int) {
246 | if left == -1 || right == -1 || symList[left].noMerge || symList[right].noMerge {
247 | return
248 | }
249 |
250 | if mergedSymbol, id, ok := findMerged(symList[left], symList[right]); ok {
251 | mergeQueue.Insert(mergeCandidate{
252 | left: left,
253 | right: right,
254 | length: len(mergedSymbol),
255 | score: proc.model.GetPieces()[id].GetScore(),
256 | })
257 | }
258 | }
259 |
260 | // Seed the merge queue with all pairs of symbols from symList
261 | for i := 1; i < len(symList); i++ {
262 | suggestNewMergePair(i-1, i)
263 | }
264 |
265 | // candidateIsDead indicates that a candidate is out of date: one of its
266 | // parts was already merged with another symbol, so we don't want to consider
267 | // it any more.
268 | candidateIsDead := func(candidate mergeCandidate) bool {
269 | leftSymbol := symList[candidate.left].symbol
270 | rightSymbol := symList[candidate.right].symbol
271 | return leftSymbol == "" || rightSymbol == "" || len(leftSymbol)+len(rightSymbol) != candidate.length
272 | }
273 |
274 | // Main loop
275 | mergeQueueDead := 0
276 | for mergeQueue.Len() > 0 {
277 | candidate := mergeQueue.PopMax()
278 | leftSymbol := symList[candidate.left]
279 | rightSymbol := symList[candidate.right]
280 |
281 | if candidateIsDead(candidate) {
282 | mergeQueueDead--
283 | continue
284 | }
285 |
286 | // If there are lots more dead merge candidates than live ones, remove the
287 | // dead. This is a relatively expensive operation but it's performed rarely,
288 | // and it makes the priority queue smaller - making all subsequent
289 | // operations faster.
290 | // The factor of 3 was determined empirically.
291 | if mergeQueueDead*3 > mergeQueue.Len() {
292 | mergeQueue.RemoveFunc(candidateIsDead)
293 | mergeQueueDead = 0
294 | }
295 |
296 | // Do the merge:
297 | // 1. Merge the concatenation of leftSymbol and rightSymbol into leftSymbol
298 | mergedSymbol, _, ok := findMerged(leftSymbol, rightSymbol)
299 | if !ok {
300 | panic("failed to merge symbols")
301 | }
302 | symList[candidate.left].symbol = mergedSymbol
303 | nTokens--
304 |
305 | // 2. Update prev/next pointers
306 | symList[candidate.left].next = rightSymbol.next
307 | if rightSymbol.next >= 0 {
308 | symList[rightSymbol.next].prev = candidate.left
309 | }
310 |
311 | // 3. Mark the right element in the pair as outdated (it's been merged
312 | // into the left one).
313 | symList[candidate.right].symbol = ""
314 | mergeQueueDead++
315 |
316 | // 4. Add merge suggestions for the newly merged symbol with its neighbors
317 | suggestNewMergePair(leftSymbol.prev, candidate.left)
318 | suggestNewMergePair(candidate.left, rightSymbol.next)
319 | }
320 |
321 | // Collect the final list of tokens from the remaining elements of symList.
322 | tokens := make([]Token, 0, nTokens)
323 | for i := 0; i >= 0; i = symList[i].next {
324 | symbol := symList[i].symbol
325 | id := proc.symbolToID(symbol)
326 |
327 | if id == proc.unknownID && proc.model.GetTrainerSpec().GetByteFallback() {
328 | // Decompose this symbol into bytes, and report each byte as a separate
329 | // token.
330 | for i := 0; i < len(symbol); i++ {
331 | tokens = append(tokens, proc.byte2Token[symbol[i]])
332 | }
333 | } else {
334 | tokens = append(tokens, Token{ID: id, Text: symbol})
335 | }
336 | }
337 |
338 | return tokens
339 | }
340 |
341 | // symbolMatch finds the length of the first symbol in text. A symbol is either
342 | // a user-defined symbol from the proto or a single rune. The second return
343 | // value is true iff a user-defined symbol was matched.
344 | func (proc *Processor) symbolMatch(text string) (int, bool) {
345 | prefixLen := proc.userDefinedMatcher.FindPrefixLen(text)
346 | if prefixLen > 0 {
347 | return prefixLen, true
348 | }
349 | // Not found a user-defined prefix; get the length of next rune.
350 | _, rlen := utf8.DecodeRuneInString(text)
351 | return rlen, false
352 | }
353 |
// Text of the special pieces looked up by ModelInfo. These are the
// conventional SentencePiece names for the beginning-of-sentence,
// end-of-sentence, unknown and padding pieces.
const (
	symbolBOS = "<s>"
	symbolEOS = "</s>"
	symbolUNK = "<unk>"
	symbolPAD = "<pad>"
)
360 |
361 | // symbolToID finds the right ID for the given textual symbol, or returns
362 | // proc.unknownID if the symbol is unknown.
363 | func (proc *Processor) symbolToID(symbol string) int {
364 | if id, found := proc.reserved[symbol]; found {
365 | return id
366 | }
367 | if id, found := proc.pieces[symbol]; found {
368 | return id
369 | }
370 | return proc.unknownID
371 | }
372 |
// convertHexValue converts strings of the form "<0xXY>" to the (unsigned)
// integer value of the hexadecimal number XY. -1 is returned for bad input:
// a missing "<0x" prefix or ">" suffix, no digits, a sign, or anything that
// is not a hexadecimal digit.
func convertHexValue(bv string) int {
	const prefix, suffix = "<0x", ">"
	if !strings.HasPrefix(bv, prefix) || !strings.HasSuffix(bv, suffix) {
		return -1
	}
	// prefix contains no '>', so the two cannot overlap and the slice below
	// is always in range.
	digits := bv[len(prefix) : len(bv)-len(suffix)]

	// ParseUint rejects signs; 31 bits keeps the result within a non-negative
	// int on every platform.
	n, err := strconv.ParseUint(digits, 16, 31)
	if err != nil {
		return -1
	}
	return int(n)
}
384 |
385 | // Decode translates a list of IDs produced by [Encode] back into the string
386 | // it represents.
387 | func (proc *Processor) Decode(ids []int) string {
388 | var sb strings.Builder
389 |
390 | for i := 0; i < len(ids); {
391 | // Find a run of IDs that represent single bytes starting at i.
392 | nextNonByte := i
393 | for nextNonByte < len(ids) && proc.isByteID(ids[nextNonByte]) {
394 | nextNonByte++
395 | }
396 | numBytes := nextNonByte - i
397 |
398 | // Handle a run of numBytes IDs, by decoding them into utf8 runes.
399 | if numBytes > 0 {
400 | buf := make([]byte, 0, numBytes)
401 | for bi := i; bi < nextNonByte; bi++ {
402 | buf = append(buf, proc.idToByte[ids[bi]])
403 | }
404 |
405 | for len(buf) > 0 {
406 | // DecodeRune returns utf8.RuneError ('\uFFFD') for bad UTF8 encodings,
407 | // and this is exactly what SentencePiece is supposed to emit for them.
408 | // So we don't do any special handling for UTF8 decode errors here.
409 | r, size := utf8.DecodeRune(buf)
410 | sb.WriteRune(r)
411 | buf = buf[size:]
412 | }
413 | }
414 |
415 | if nextNonByte >= len(ids) {
416 | break
417 | }
418 | // Here nextNonByte is the index of an ID that's not a single byte.
419 | id := ids[nextNonByte]
420 | if proc.isControlID(id) {
421 | // Don't emit anything for control IDs
422 | } else if id == proc.unknownID {
423 | // Special "unk_surface" string for unknown IDs
424 | sb.WriteString(proc.model.GetTrainerSpec().GetUnkSurface())
425 | } else {
426 | piece := proc.model.GetPieces()[id].GetPiece()
427 | sb.WriteString(replaceSeparatorsBySpace(piece))
428 | }
429 | i = nextNonByte + 1
430 | }
431 |
432 | return sb.String()
433 | }
434 |
435 | // DecodeTokens is a convenience wrapper around [Decode], accepting a list of
436 | // tokens as returned by [Encode]. It only uses the ID fields of tokens to
437 | // decode the text.
438 | func (proc *Processor) DecodeTokens(tokens []Token) string {
439 | ids := make([]int, len(tokens))
440 | for i, t := range tokens {
441 | ids[i] = t.ID
442 | }
443 | return proc.Decode(ids)
444 | }
445 |
446 | func (proc *Processor) isByteID(id int) bool {
447 | return proc.model.GetPieces()[id].GetType() == model.ModelProto_SentencePiece_BYTE
448 | }
449 |
450 | func (proc *Processor) isControlID(id int) bool {
451 | return proc.model.GetPieces()[id].GetType() == model.ModelProto_SentencePiece_CONTROL
452 | }
453 |
// ModelInfo stores information about the model proto loaded by the processor.
type ModelInfo struct {
	// VocabularySize is the number of pieces in the model.
	VocabularySize int

	// BeginningOfSentenceID, EndOfSentenceID and PadID are the IDs of the
	// corresponding special pieces, or -1 when the model has no control
	// piece with that text.
	BeginningOfSentenceID int
	EndOfSentenceID       int

	// UnknownID is the ID of the model's UNKNOWN piece.
	UnknownID int
	PadID     int
}
462 |
463 | // ModelInfo returns information about the loaded proto model file.
464 | func (proc *Processor) ModelInfo() *ModelInfo {
465 | getControlID := func(symbol string) int {
466 | if id := proc.symbolToID(symbol); proc.isControlID(id) {
467 | return id
468 | }
469 | return -1
470 | }
471 |
472 | return &ModelInfo{
473 | VocabularySize: len(proc.model.GetPieces()),
474 | BeginningOfSentenceID: getControlID(symbolBOS),
475 | EndOfSentenceID: getControlID(symbolEOS),
476 | PadID: getControlID(symbolPAD),
477 | UnknownID: proc.unknownID,
478 | }
479 | }
480 |
--------------------------------------------------------------------------------
/processor_test.go:
--------------------------------------------------------------------------------
1 | package sentencepiece
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "slices"
7 | "testing"
8 | )
9 |
10 | func createProcessor(t testing.TB) *Processor {
11 | t.Helper()
12 | protoFile := os.Getenv("MODELPATH")
13 | if protoFile == "" {
14 | t.Fatal("Need MODELPATH env var to run tests")
15 | }
16 |
17 | proc, err := NewProcessorFromPath(protoFile)
18 | if err != nil {
19 | t.Error(err)
20 | }
21 | return proc
22 | }
23 |
24 | func TestEncodeIDs(t *testing.T) {
25 | proc := createProcessor(t)
26 |
27 | var tests = []struct {
28 | text string
29 | wantIDs []int
30 | }{
31 | {"hello world", []int{17534, 2134}},
32 | {"12345", []int{235274, 235284, 235304, 235310, 235308}},
33 | {" ", []int{139}},
34 | {" ", []int{140}},
35 | {" ", []int{145}},
36 | {"ҔӌԐڎ", []int{427, 365, 428, 357, 429, 361, 435, 359}},
37 | {" ", []int{235248, 4, 139, 235322, 8939, 235313}},
38 | {"
", 4, true},
114 | {"", 3, true},
115 | {"", 4, true},
116 | {"", 15, true},
117 | {"", 64},
141 | {"<0x00>", 0},
142 | {"<0x1a>", 26},
143 | {"<0xF3>", 243},
144 |
145 | {"0x12>", -1},
146 | {"", -1},
147 | {"<012>", -1},
148 | {"<0xTA>", -1},
149 | }
150 |
151 | for _, tt := range tests {
152 | t.Run(tt.in, func(t *testing.T) {
153 | gotN := convertHexValue(tt.in)
154 | if gotN != tt.wantN {
155 | t.Errorf("got %v, want %v", gotN, tt.wantN)
156 | }
157 | })
158 | }
159 | }
160 |
// TestDecoder checks Decode on fixed ID sequences covering regular pieces,
// byte runs, control IDs, the unknown ID and invalid UTF-8 byte sequences.
// The IDs assume the known model file is loaded (see createProcessor).
func TestDecoder(t *testing.T) {
	proc := createProcessor(t)

	var tests = []struct {
		IDs      []int
		wantText string
	}{
		{[]int{17534, 2134}, "hello world"},
		{[]int{427, 365, 428, 357, 29422, 1653, 427, 365, 428, 357}, "Ҕӌnever againҔӌ"},
		{[]int{785, 2017, 108, 639, 2550, 2017}, "one line\nand another line"},
		{[]int{1001, 1002, 1003, 1004}, "buark}) res"},
		{[]int{111001, 111002, 111003, 111004}, " Wichita EducaçãoVocabulary天堂"},
		{[]int{139}, " "},
		{[]int{140}, " "},
		{[]int{145}, " "},
		{[]int{441, 401, 387}, "ส"},
		{[]int{411, 380}, "£"},

		// control IDs (0, 1, 2)
		{[]int{2, 411, 380}, "£"},
		{[]int{1, 2, 411, 380}, "£"},
		{[]int{2, 411, 380, 0, 1, 2, 0}, "£"},

		// unknown (id=3)
		{[]int{3, 411, 380}, " ⁇ £"},
		{[]int{3, 3, 1000, 3}, " ⁇ ⁇ ew ⁇ "},

		// invalid bytes for UTF-8, produce "invalid unicode" runes
		{[]int{349, 349, 349}, "���"},
		{[]int{800, 348, 500, 348}, "sed�it�"},
	}

	for _, tt := range tests {
		// Name each subtest after its ID list.
		t.Run(fmt.Sprintf("%v", tt.IDs), func(t *testing.T) {
			got := proc.Decode(tt.IDs)
			if got != tt.wantText {
				t.Errorf("got %q\nwant %q\n", got, tt.wantText)
			}
		})
	}
}
202 |
203 | func TestDecodeTokens(t *testing.T) {
204 | proc := createProcessor(t)
205 | wantText := "hello world"
206 | tokens := []Token{
207 | Token{17534, "xxx"},
208 | Token{139, "xxx"},
209 | Token{2134, "xxx"}}
210 |
211 | text := proc.DecodeTokens(tokens)
212 | if text != wantText {
213 | t.Errorf("got %q, want %q", text, wantText)
214 | }
215 | }
216 |
217 | func TestInfo(t *testing.T) {
218 | proc := createProcessor(t)
219 | info := proc.ModelInfo()
220 |
221 | // Assumes we use the known model file
222 | wantVocabSize := 256000
223 | wantBOS := 2
224 | wantEOS := 1
225 | wantPAD := 0
226 | wantUNK := 3
227 |
228 | if info.VocabularySize != wantVocabSize {
229 | t.Errorf("got %v, want %v", info.VocabularySize, wantVocabSize)
230 | }
231 | if info.BeginningOfSentenceID != wantBOS {
232 | t.Errorf("got %v, want %v", info.BeginningOfSentenceID, wantBOS)
233 | }
234 | if info.EndOfSentenceID != wantEOS {
235 | t.Errorf("got %v, want %v", info.EndOfSentenceID, wantEOS)
236 | }
237 | if info.PadID != wantPAD {
238 | t.Errorf("got %v, want %v", info.PadID, wantPAD)
239 | }
240 | if info.UnknownID != wantUNK {
241 | t.Errorf("got %v, want %v", info.UnknownID, wantUNK)
242 | }
243 | }
244 |
--------------------------------------------------------------------------------
/system_test.go:
--------------------------------------------------------------------------------
1 | package sentencepiece
2 |
3 | import (
4 | "bufio"
5 | "bytes"
6 | "fmt"
7 | "io/ioutil"
8 | "log"
9 | "os"
10 | "os/exec"
11 | "path/filepath"
12 | "slices"
13 | "strconv"
14 | "testing"
15 | )
16 |
17 | // "System" test for comparing our Procesor with the canonical sentencepiece
18 | // Python package (officially distributed with the original C++ implementation
19 | // of the algorithm).
20 | // It also runs Decode for a round-trip test to ensure we get the original
21 | // text back.
22 | //
23 | // This test will only run if python3 is available and is able to successfully
24 | // load the sentencepiece library. Typically this means that 'go test' will
25 | // have to run from an activated Python virtual environment where the library
26 | // was installed.
27 |
28 | func TestVsSentencepiecePython(t *testing.T) {
29 | proc := createProcessor(t)
30 |
31 | if _, err := exec.Command("python3", "-c", "import sentencepiece").Output(); err != nil {
32 | t.Skip("This test only runs when python3 with sentencepiece is available")
33 | }
34 | pyProgramPath := filepath.Join("test", "sp-dump-ids.py")
35 |
36 | paths, err := filepath.Glob(filepath.Join("test", "*.txt"))
37 | if err != nil {
38 | t.Fatal(err)
39 | }
40 |
41 | for _, path := range paths {
42 | _, filename := filepath.Split(path)
43 | testname := filename[:len(filename)-len(filepath.Ext(path))]
44 |
45 | t.Run(testname, func(t *testing.T) {
46 | // Step 1: run the Python program to tokenize path into IDs.
47 | pyOut, err := exec.Command("python3", pyProgramPath, path).Output()
48 | if err != nil {
49 | t.Fatalf("while running %v on %v: %v", pyProgramPath, path, err)
50 | }
51 |
52 | pyIDs := pyOutToIDs(pyOut)
53 |
54 | // Step 2: use our Processor to tokenize path into IDs.
55 | buf, err := ioutil.ReadFile(path)
56 | if err != nil {
57 | log.Fatal(err)
58 | }
59 | text := string(buf)
60 | var goIDs []int
61 | goTokens := proc.Encode(text)
62 | for _, t := range goTokens {
63 | goIDs = append(goIDs, t.ID)
64 | }
65 |
66 | // Step 3: compare the two; dump IDs to temp files for debugging in case
67 | // of a mismatch.
68 | if !slices.Equal(pyIDs, goIDs) {
69 | tmppy := dumpIDsToTempFile(testname+"-py-", pyIDs)
70 | tmpgo := dumpIDsToTempFile(testname+"-go-", goIDs)
71 |
72 | t.Errorf("IDs mismatch; dumped to %q and %q", tmppy, tmpgo)
73 | }
74 |
75 | // Step 4: round-trip Decode to get original text back
76 | newText := proc.Decode(goIDs)
77 | if text != newText {
78 | t.Errorf("text mismatch after Decode")
79 | }
80 | })
81 | }
82 | }
83 |
// pyOutToIDs parses the complete stdout of the Python program, which is
// expected to hold one decimal integer per line, into a slice of IDs in the
// order they appear. A line that is not a valid integer, or a scanner error,
// terminates the process via log.Fatal.
func pyOutToIDs(pyOut []byte) []int {
	var parsed []int

	lines := bufio.NewScanner(bytes.NewReader(pyOut))
	for lines.Scan() {
		id, convErr := strconv.Atoi(lines.Text())
		if convErr != nil {
			log.Fatal(convErr)
		}
		parsed = append(parsed, id)
	}

	if scanErr := lines.Err(); scanErr != nil {
		log.Fatal(scanErr)
	}
	return parsed
}
101 |
// dumpIDsToTempFile dumps the given IDs (one per line) to a temporary file with
// the given prefix, and returns the name of the temporary file.
//
// The file is created in the default temporary directory. Any failure to
// create, write or close the file terminates the process via log.Fatal, so a
// returned name always refers to a completely written dump.
func dumpIDsToTempFile(prefix string, IDs []int) string {
	tf, err := os.CreateTemp("", prefix)
	if err != nil {
		log.Fatal(err)
	}

	// Buffer the output so large ID lists do not cost one write per ID.
	w := bufio.NewWriter(tf)
	for _, id := range IDs {
		// Write errors on a bufio.Writer are sticky and are reported by the
		// Flush call below, so the result is not checked here.
		fmt.Fprintf(w, "%d\n", id)
	}
	if err := w.Flush(); err != nil {
		log.Fatal(err)
	}
	if err := tf.Close(); err != nil {
		log.Fatal(err)
	}
	return tf.Name()
}
116 |
--------------------------------------------------------------------------------
/test/gocode1.txt:
--------------------------------------------------------------------------------
1 | var (
2 | file_sentencepiece_model_proto_rawDescOnce sync.Once
3 | file_sentencepiece_model_proto_rawDescData = file_sentencepiece_model_proto_rawDesc
4 | )
5 |
6 | func file_sentencepiece_model_proto_rawDescGZIP() []byte {
7 | file_sentencepiece_model_proto_rawDescOnce.Do(func() {
8 | file_sentencepiece_model_proto_rawDescData = protoimpl.X.CompressGZIP(file_sentencepiece_model_proto_rawDescData)
9 | })
10 | return file_sentencepiece_model_proto_rawDescData
11 | }
12 |
13 | var file_sentencepiece_model_proto_enumTypes = make([]protoimpl.EnumInfo, 2)
14 | var file_sentencepiece_model_proto_msgTypes = make([]protoimpl.MessageInfo, 6)
15 | var file_sentencepiece_model_proto_goTypes = []interface{}{
16 | (TrainerSpec_ModelType)(0), // 0: sentencepiece.TrainerSpec.ModelType
17 | (ModelProto_SentencePiece_Type)(0), // 1: sentencepiece.ModelProto.SentencePiece.Type
18 | (*TrainerSpec)(nil), // 2: sentencepiece.TrainerSpec
19 | (*NormalizerSpec)(nil), // 3: sentencepiece.NormalizerSpec
20 | (*SelfTestData)(nil), // 4: sentencepiece.SelfTestData
21 | (*ModelProto)(nil), // 5: sentencepiece.ModelProto
22 | (*SelfTestData_Sample)(nil), // 6: sentencepiece.SelfTestData.Sample
23 | (*ModelProto_SentencePiece)(nil), // 7: sentencepiece.ModelProto.SentencePiece
24 | }
25 | var file_sentencepiece_model_proto_depIdxs = []int32{
26 | 0, // 0: sentencepiece.TrainerSpec.model_type:type_name -> sentencepiece.TrainerSpec.ModelType
27 | 6, // 1: sentencepiece.SelfTestData.samples:type_name -> sentencepiece.SelfTestData.Sample
28 | 7, // 2: sentencepiece.ModelProto.pieces:type_name -> sentencepiece.ModelProto.SentencePiece
29 | 2, // 3: sentencepiece.ModelProto.trainer_spec:type_name -> sentencepiece.TrainerSpec
30 | 3, // 4: sentencepiece.ModelProto.normalizer_spec:type_name -> sentencepiece.NormalizerSpec
31 | 4, // 5: sentencepiece.ModelProto.self_test_data:type_name -> sentencepiece.SelfTestData
32 | 3, // 6: sentencepiece.ModelProto.denormalizer_spec:type_name -> sentencepiece.NormalizerSpec
33 | 1, // 7: sentencepiece.ModelProto.SentencePiece.type:type_name -> sentencepiece.ModelProto.SentencePiece.Type
34 | 8, // [8:8] is the sub-list for method output_type
35 | 8, // [8:8] is the sub-list for method input_type
36 | 8, // [8:8] is the sub-list for extension type_name
37 | 8, // [8:8] is the sub-list for extension extendee
38 | 0, // [0:8] is the sub-list for field type_name
39 | }
40 |
41 | func init() { file_sentencepiece_model_proto_init() }
42 | func file_sentencepiece_model_proto_init() {
43 | if File_sentencepiece_model_proto != nil {
44 | return
45 | }
46 | if !protoimpl.UnsafeEnabled {
47 | file_sentencepiece_model_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} {
48 | switch v := v.(*TrainerSpec); i {
49 | case 0:
50 | return &v.state
51 | case 1:
52 | return &v.sizeCache
53 | case 2:
54 | return &v.unknownFields
55 | case 3:
56 | return &v.extensionFields
57 | default:
58 | return nil
59 | }
60 | }
61 | file_sentencepiece_model_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} {
62 | switch v := v.(*NormalizerSpec); i {
63 | case 0:
64 | return &v.state
65 | case 1:
66 | return &v.sizeCache
67 | case 2:
68 | return &v.unknownFields
69 | case 3:
70 | return &v.extensionFields
71 | default:
72 | return nil
73 | }
74 | }
75 | file_sentencepiece_model_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} {
76 | switch v := v.(*SelfTestData); i {
77 | case 0:
78 | return &v.state
79 | case 1:
80 | return &v.sizeCache
81 | case 2:
82 | return &v.unknownFields
83 | case 3:
84 | return &v.extensionFields
85 | default:
86 | return nil
87 | }
88 | }
89 | file_sentencepiece_model_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} {
90 | switch v := v.(*ModelProto); i {
91 | case 0:
92 | return &v.state
93 | case 1:
94 | return &v.sizeCache
95 | case 2:
96 | return &v.unknownFields
97 | case 3:
98 | return &v.extensionFields
99 | default:
100 | return nil
101 | }
102 | }
103 | file_sentencepiece_model_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} {
104 | switch v := v.(*SelfTestData_Sample); i {
105 | case 0:
106 | return &v.state
107 | case 1:
108 | return &v.sizeCache
109 | case 2:
110 | return &v.unknownFields
111 | default:
112 | return nil
113 | }
114 | }
115 | file_sentencepiece_model_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} {
116 | switch v := v.(*ModelProto_SentencePiece); i {
117 | case 0:
118 | return &v.state
119 | case 1:
120 | return &v.sizeCache
121 | case 2:
122 | return &v.unknownFields
123 | case 3:
124 | return &v.extensionFields
125 | default:
126 | return nil
127 | }
128 | }
129 | }
130 | type x struct{}
131 | out := protoimpl.TypeBuilder{
132 | File: protoimpl.DescBuilder{
133 | GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
134 | RawDescriptor: file_sentencepiece_model_proto_rawDesc,
135 | NumEnums: 2,
136 | NumMessages: 6,
137 | NumExtensions: 0,
138 | NumServices: 0,
139 | },
140 | GoTypes: file_sentencepiece_model_proto_goTypes,
141 | DependencyIndexes: file_sentencepiece_model_proto_depIdxs,
142 | EnumInfos: file_sentencepiece_model_proto_enumTypes,
143 | MessageInfos: file_sentencepiece_model_proto_msgTypes,
144 | }.Build()
145 | File_sentencepiece_model_proto = out.File
146 | file_sentencepiece_model_proto_rawDesc = nil
147 | file_sentencepiece_model_proto_goTypes = nil
148 | file_sentencepiece_model_proto_depIdxs = nil
149 | }
150 |
151 |
--------------------------------------------------------------------------------
/test/htmlcode1.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
28 |
29 |
30 |
31 |
32 | The Go Programming Language
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
46 |
47 |
48 |
49 |
50 |
51 |
237 |
238 |
239 |
240 |
--------------------------------------------------------------------------------
/test/latexcode1.txt:
--------------------------------------------------------------------------------
1 | \documentclass{article}
2 | \usepackage{amsmath, amssymb}
3 | \usepackage{amsfonts}
4 | \usepackage{amsthm}
5 |
6 | \newtheorem{theorem}{Theorem}
7 |
8 | \begin{document}
9 |
10 | \title{Proof of Green's Theorem}
11 | \author{}
12 | \date{}
13 | \maketitle
14 |
15 | \begin{theorem}[Green's Theorem]
16 | Let \( C \) be a positively oriented, simple closed curve in the plane, and let \( D \) be the region bounded by \( C \). If \( L(x, y) \) and \( M(x, y) \) have continuous partial derivatives on an open region that contains \( D \) and \( C \), then
17 | \[
18 | \oint_C \left( L \, dx + M \, dy \right) = \iint_D \left( \frac{\partial M}{\partial x} - \frac{\partial L}{\partial y} \right) dA.
19 | \]
20 | \end{theorem}
21 |
22 | \begin{proof}
23 | We will prove Green's Theorem by breaking the region \( D \) into small rectangles and then using the Fundamental Theorem of Calculus.
24 |
25 | Assume that the region \( D \) is divided into \( m \times n \) small rectangles. For each small rectangle \( R_{ij} \) with vertices \((x_i, y_j)\), \((x_{i+1}, y_j)\), \((x_{i+1}, y_{j+1})\), and \((x_i, y_{j+1})\), we approximate the line integral around the boundary of \( R_{ij} \):
26 |
27 | \[
28 | \oint_{\partial R_{ij}} \left( L \, dx + M \, dy \right) \approx \left( M(x_{i+1}, y_{j+1}) - M(x_{i}, y_{j+1}) \right)(x_{i+1} - x_i) - \left( L(x_{i+1}, y_{j+1}) - L(x_{i+1}, y_j) \right)(y_{j+1} - y_j).
29 | \]
30 |
31 | This expression can be rewritten as:
32 |
33 | \[
34 | \oint_{\partial R_{ij}} \left( L \, dx + M \, dy \right) \approx \left( \frac{\partial M}{\partial x} - \frac{\partial L}{\partial y} \right) \Delta x \Delta y,
35 | \]
36 | where \( \Delta x = x_{i+1} - x_i \) and \( \Delta y = y_{j+1} - y_j \).
37 |
38 | Summing over all rectangles in the region \( D \), we obtain:
39 |
40 | \[
41 | \sum_{i,j} \oint_{\partial R_{ij}} \left( L \, dx + M \, dy \right) \approx \sum_{i,j} \left( \frac{\partial M}{\partial x} - \frac{\partial L}{\partial y} \right) \Delta x \Delta y.
42 | \]
43 |
44 | The left-hand side of this equation is approximately the line integral over \( C \), and the right-hand side is a Riemann sum that approximates the double integral over \( D \):
45 |
46 | \[
47 | \oint_C \left( L \, dx + M \, dy \right) = \iint_D \left( \frac{\partial M}{\partial x} - \frac{\partial L}{\partial y} \right) dA.
48 | \]
49 |
50 | Thus, Green's Theorem is proved.
51 | \end{proof}
52 |
53 | \end{document}
54 |
55 |
--------------------------------------------------------------------------------
/test/opening-multilang.txt:
--------------------------------------------------------------------------------
1 | Somewhere in la Mancha, in a place whose name I do not care to remember, a
2 | gentleman lived not long ago, one of those who has a lance and ancient shield on
3 | a shelf and keeps a skinny nag and a greyhound for racing.
4 |
5 | En un lugar de la Mancha, de cuyo nombre no quiero acordarme, no ha mucho tiempo
6 | que vivía un hidalgo de los de lanza en astillero, adarga antigua, rocín flaco y
7 | galgo corredor.
8 |
9 | 「ラ・マンチャのどこか、名前を覚えたくもない場所で、古い盾と槍を棚に飾り、痩せた馬と猟犬を飼っていた紳士が、そう遠くない昔に住んでいた。」
10 |
11 | 라 만차 어딘가에서, 이름을 기억하고 싶지 않은 장소에서, 고대 방패와 창을 선반에 두고, 말라깽이 말과 경주용 그레이하운드를 키우는 신사가 얼마 전에 살았다.
12 |
13 | ला मांचायाः काचित् स्थले, यस्य नाम स्मर्तुम् न इच्छामि, तत्र कदाचित् कश्चन सज्जनः वसति स्म, यस्य शस्त्रं प्राचीनं च कवचं तिष्ठति, तस्य च अश्वः कृशः च श्वा धावनाय अस्ति।
14 |
15 | Где-то в Ла-Манче, в месте, имя которого я не хочу вспоминать, жил некогда
16 | дворянин, один из тех, кто держал копье и старинный щит на полке, а также худую
17 | лошадь и борзую для охоты.
18 |
--------------------------------------------------------------------------------
/test/perlcode1.txt:
--------------------------------------------------------------------------------
1 | sub init_sim
2 | {
3 | my $args =
4 | {
5 | -init_addr => 0,
6 | -mem_file => undef,
7 | -mem_ref => undef,
8 | -device_dir => './',
9 | @_,
10 | };
11 |
12 | $args->{-device_dir} .= '/' unless $args->{-device_dir} =~ /\/^/;
13 |
14 | # init memory
15 | for (my $i = 0; $i < get_mix_mem_size(); ++$i)
16 | {
17 | $mem[$i] = empty_word();
18 | }
19 |
20 | $rA = empty_word();
21 | $rX = empty_word();
22 | $rJ = empty_word();
23 | $rI[$_] = empty_word()
24 | foreach (1 .. 6);
25 |
26 | $f_overflow = 0;
27 | $f_comparison = 0;
28 | $time = 0;
29 | $lc = $args->{-init_addr};
30 | $simulation_ended = 0;
31 | @io_device = ();
32 |
33 | # init IO devices
34 | #
35 | foreach my $n (0 .. 15)
36 | {
37 | if ($n >= 0 and $n <= 7)
38 | {
39 | push(@io_device, {filename => "tape${n}.dev", io_type => "bio", block_size => 100, data => undef});
40 | }
41 | elsif ($n >= 8 and $n <= 15)
42 | {
43 | my $m = $n - 8;
44 | push(@io_device, {filename => "disk${m}.dev", io_type => "bio", block_size => 100, data => undef});
45 | }
46 | }
47 |
48 | push(@io_device, {filename => "cardrd.dev", io_type => "ci", block_size => 16});
49 | push(@io_device, {filename => "cardwr.dev", io_type => "co", block_size => 16});
50 | push(@io_device, {filename => "printer.dev", io_type => "co", block_size => 24});
51 | push(@io_device, {filename => "stdio", io_type => "cio", block_size => 14});
52 | push(@io_device, {filename => "paper.dev", io_type => "ci", block_size => 14});
53 |
54 | foreach my $dev (@io_device)
55 | {
56 | $dev->{filename} = $args->{-device_dir} . $dev->{filename};
57 | }
58 |
59 | $saved_mem_file = $args->{-mem_file};
60 | $saved_mem_ref = $args->{-mem_ref};
61 | $saved_init_addr = $args->{-init_addr};
62 |
63 | if (defined $args->{-mem_file})
64 | {
65 | load_memory_from_text_file($args->{-mem_file});
66 | }
67 | elsif (defined $args->{-mem_ref})
68 | {
69 | @mem = @{$args->{-mem_ref}};
70 | }
71 | else
72 | {
73 | warn("No memory file or reference given to the simulator\n");
74 | }
75 | }
76 |
77 |
78 | sub simulation_ended
79 | {
80 | return $simulation_ended;
81 | }
82 |
83 |
84 | sub fetch_next_instruction
85 | {
86 | return @{$mem[$lc]};
87 | }
88 |
89 |
90 | # Executes one instruction
91 | #
92 | sub step_sim
93 | {
94 | address_is_legal($lc)
95 | or runtime_error("location counter out of memory bounds");
96 |
97 | my @word = fetch_next_instruction();
98 |
99 | my $opcode = $word[5];
100 | my $F = $word[4];
101 |
102 | if ($opcode == 5 and $F == 2) # HLT
103 | {
104 | $simulation_ended = 1;
105 | return;
106 | }
107 | elsif ($opcode == 0) # NOP
108 | {
109 | $lc++;
110 | return;
111 | }
112 | else
113 | {
114 | # Dispatch the instruction to the appropriate handler,
115 | # based on the opcode.
116 | #
117 | if (defined $opcode_map{$opcode})
118 | {
119 | my $op_func = $opcode_map{$opcode};
120 | $op_func->(@word);
121 | $lc++;
122 | }
123 | else
124 | {
125 | runtime_error("illegal opcode: $opcode");
126 | }
127 | }
128 | }
129 |
130 |
131 | sub get_mem_ref
132 | {
133 | return \@mem;
134 | }
135 |
136 |
137 | # Simulates the MIX code until a HLT instruction is
138 | # incountered.
139 | #
140 | sub run_sim
141 | {
142 | # step through the whole program
143 | #
144 | until (simulation_ended())
145 | {
146 | step_sim();
147 | }
148 |
149 | # update the binary devices
150 | #
151 | foreach my $devref (@io_device)
152 | {
153 | next unless is_binary_device($devref) and defined $devref->{data};
154 |
155 | my $fh = $devref->{handle};
156 | close $fh if defined $fh;
157 |
158 | unless (open($fh, ">$devref->{filename}"))
159 | {
160 | warn "Unable to write device $devref->{filename}\n";
161 | next;
162 | }
163 |
164 | foreach my $block_n (keys %{$devref->{data}})
165 | {
166 | print $fh "$block_n\n";
167 |
168 | for (my $i = 0; $i < $devref->{block_size}; ++$i)
169 | {
170 | print $fh sprintf("%2s %2s %2s %2s %2s %2s\n", @{$devref->{data}->{$block_n}->[$i]});
171 | }
172 | }
173 |
174 | close $fh;
175 | }
176 | }
177 |
178 | sub interactive_sim
179 | {
180 | local $| = 1;
181 | my %breakpoints;
182 |
183 | print "\nWelcome to MIXSim interaction !\n\n";
184 |
185 | interaction: while (1)
186 | {
187 | printf "[%4s]> ", $lc;
188 | my $command = <>;
189 | chomp($command);
190 |
191 | # strip leading and trailing whitespace
192 | $command =~ s/^\s+//;
193 | $command =~ s/\s+$//;
194 |
195 | my @toks = split('\s+', $command);
196 | next if @toks == 0;
197 |
198 | if ($command eq "s")
199 | {
200 | step_sim();
201 |
202 | print "Simulation ended (HLT)\n" if (simulation_ended());
203 |
204 | }
205 | elsif ($command eq "c" or $command eq "cl")
206 | {
207 | step_loop: while (1)
208 | {
209 | if (exists $breakpoints{$lc})
210 | {
211 | print "Breakpoint stop at address $lc\n";
212 | last step_loop;
213 | }
214 |
215 | if (simulation_ended())
216 | {
217 | print "Simulation ended (HLT)\n" if (simulation_ended());
218 | last step_loop;
219 | }
220 |
221 | print "$lc\n" if $command eq "cl";
222 | step_sim();
223 | }
224 | }
225 | elsif ($command eq "rst")
226 | {
227 | if (defined $saved_mem_file)
228 | {
229 | init_sim(-mem_file => $saved_mem_file, -init_addr => $saved_init_addr);
230 | }
231 | elsif (defined $saved_mem_ref)
232 | {
233 | init_sim(-mem_ref => $saved_mem_ref, -init_addr => $saved_init_addr);
234 | }
235 | }
236 | elsif ($command eq "r")
237 | {
238 | print state_dump(), "\n";
239 | }
240 | elsif ($command eq "sr")
241 | {
242 | step_sim();
243 | print state_dump(), "\n";
244 | }
245 | elsif ($toks[0] eq "m")
246 | {
247 | if (@toks == 1)
248 | {
249 | print memory_dump(\@mem);
250 | }
251 | elsif (@toks == 2)
252 | {
253 | my $addr = $toks[1];
254 | address_is_legal($addr) or interactive_error("Illegal address $addr");
255 | printf("%4s : %2s %2s %2s %2s %2s %2s\n", $addr, @{$mem[$addr]});
256 | }
257 | else
258 | {
259 | interactive_error("Illegal m command");
260 | }
261 | }
262 | elsif ($toks[0] eq "b")
263 | {
264 | if (@toks != 2)
265 | {
266 | interactive_error("Illegal b command");
267 | next;
268 | }
269 |
270 | my $addr = $toks[1];
271 |
272 | if (not address_is_legal($addr))
273 | {
274 | interactive_error("Illegal address $addr");
275 | next;
276 | }
277 |
278 | if (exists $breakpoints{$addr})
279 | {
280 | delete($breakpoints{$addr});
281 | print "Removed breakpoint at $addr\n";
282 | }
283 | else
284 | {
285 | $breakpoints{$addr} = 1;
286 | print "Set breakpoint at $addr\n";
287 | }
288 | }
289 | elsif ($command eq "bl")
290 | {
291 | my @bkpt_keys = keys %breakpoints;
292 |
293 | if (@bkpt_keys == 0)
294 | {
295 | print "No breakpoints set\n";
296 | }
297 | else
298 | {
299 | print "Breakpoints set at:\n";
300 |
301 | if (@bkpt_keys == 1)
302 | {
303 | print "$bkpt_keys[0] ";
304 | }
305 | else
306 | {
307 | foreach my $addr (sort {$a <=> $b} @bkpt_keys)
308 | {
309 | print "$addr ";
310 | }
311 | }
312 |
313 | print "\n";
314 | }
315 | }
316 | elsif ($command eq "br")
317 | {
318 | %breakpoints = ();
319 | }
320 | elsif ($command eq "h")
321 | {
322 | print "\n*** MIXSim interaction help ***\n\n";
323 | print "s \t\t step\n";
324 | print "c \t\t continue until next breakpoint or HLT\n";
325 | print "cl \t\t same as 'c', with an execution trace\n";
326 | print "rst \t\t restart simulation (breakpoints remain)\n";
327 | print "r \t\t print contents of registers\n";
328 | print "sr \t\t step and print contents of registers\n";
329 | print "m \t\t print all non-zero memory words\n";
330 | print "m \t\t print a memory word at \n";
331 | print "b \t\t set/unset a breakpoint at \n";
332 | print "bl \t\t list all breakpoints\n";
333 | print "br \t\t remove all breakpoints\n";
334 | print "h \t\t show this help\n";
335 | print "x or q \t\t exit interaction\n\n";
336 | }
337 | elsif ($command eq "x" or $command eq "q")
338 | {
339 | last interaction;
340 | }
341 | else
342 | {
343 | print "Illegal command. Type 'h' for help\n";
344 | }
345 | }
346 |
347 | print "\nBye !\n\n";
348 | }
349 |
350 |
351 | # Returns a state dump - contents of all the registers
352 | #
353 | sub state_dump
354 | {
355 | my $dump_str = "";
356 |
357 | $dump_str .= sprintf("rA : %2s %2s %2s %2s %2s %2s\n", @{$rA});
358 | $dump_str .= sprintf("rX : %2s %2s %2s %2s %2s %2s\n", @{$rX});
359 |
360 | $dump_str .= sprintf("rI$_ : %2s %2s %2s %2s %2s %2s\n", @{$rI[$_]})
361 | foreach (1 .. 6);
362 |
363 | $dump_str .= "\n";
364 | $dump_str .= sprintf("rJ : %2s %2s %2s %2s %2s %2s\n", @{$rJ});
365 | $dump_str .= sprintf("lc : %5s\n", $lc);
366 | $dump_str .= sprintf("ovf : %2s\n", $f_overflow);
367 | $dump_str .= sprintf("comp : %2s\n", $f_comparison);
368 | }
369 |
370 |
371 | # Reports runtime errors - errors that occured during simulation
372 | # as a result of incorrect machine code. $lc is reported
373 | #
374 | sub runtime_error
375 | {
376 | my ($msg) = @_;
377 |
378 | die("Simulation error at address $lc: $msg\n");
379 | }
380 |
381 |
382 |
--------------------------------------------------------------------------------
/test/pg2000_spanish.txt:
--------------------------------------------------------------------------------
1 | The Project Gutenberg eBook of Don Quijote
2 |
3 | This ebook is for the use of anyone anywhere in the United States and
4 | most other parts of the world at no cost and with almost no restrictions
5 | whatsoever. You may copy it, give it away or re-use it under the terms
6 | of the Project Gutenberg License included with this ebook or online
7 | at www.gutenberg.org. If you are not located in the United States,
8 | you will have to check the laws of the country where you are located
9 | before using this eBook.
10 |
11 | Title: Don Quijote
12 |
13 | Author: Miguel de Cervantes Saavedra
14 |
15 | Release date: December 1, 1999 [eBook #2000]
16 | Most recently updated: January 17, 2021
17 |
18 | Language: Spanish
19 |
20 | Credits: an anonymous Project Gutenberg volunteer and Joaquin Cuenca Abela
21 |
22 |
23 | *** START OF THE PROJECT GUTENBERG EBOOK DON QUIJOTE ***
24 |
25 |
26 |
27 |
28 | El ingenioso hidalgo don Quijote de la Mancha
29 |
30 |
31 |
32 | por Miguel de Cervantes Saavedra
33 |
34 |
35 |
36 |
37 |
38 | El ingenioso hidalgo don Quijote de la Mancha
39 |
40 |
41 |
42 | Tasa
43 |
44 |
45 | Testimonio de las erratas
46 |
47 |
48 | El Rey
49 |
50 |
51 | Al Duque de Béjar
52 |
53 |
54 | Prólogo
55 |
56 |
57 | Al libro de don Quijote de la Mancha
58 |
59 |
60 |
61 | Que trata de la condición y ejercicio del famoso
62 | hidalgo don Quijote de la Mancha
63 |
64 | Que trata de la primera salida que de su tierra hizo
65 | el ingenioso don Quijote
66 |
67 | Donde se cuenta la graciosa manera que tuvo don
68 | Quijote en armarse caballero
69 |
70 | De lo que le sucedió a nuestro caballero cuando salió
71 | de la venta
72 |
73 | Donde se prosigue la narración de la desgracia de
74 | nuestro caballero
75 |
76 | Del donoso y grande escrutinio que el cura y el
77 | barbero hicieron en la librería de nuestro ingenioso hidalgo
78 |
79 | De la segunda salida de nuestro buen caballero don
80 | Quijote de la Mancha
81 |
82 | Del buen suceso que el valeroso don Quijote tuvo en
83 | la espantable y jamás imaginada aventura de los molinos de viento, con
84 | otros sucesos dignos de felice recordación
85 |
86 | Donde se concluye y da fin a la estupenda batalla que
87 | el gallardo vizcaíno y el valiente manchego tuvieron
88 |
89 | De lo que más le avino a don Quijote con el vizcaíno, y
90 | del peligro en que se vio con una turba de yangüeses
91 |
92 | De lo que le sucedió a don Quijote con unos
93 | cabreros
94 |
95 | De lo que contó un cabrero a los que estaban con don
96 | Quijote
97 |
98 | Donde se da fin al cuento de la pastora Marcela, con
99 | otros sucesos
100 |
101 | Donde se ponen los versos desesperados del difunto
102 | pastor, con otros no esperados sucesos
103 |
104 | Donde se cuenta la desgraciada aventura que se topó
105 | don Quijote en topar con unos desalmados yangüeses
106 |
107 | De lo que le sucedió al ingenioso hidalgo en la venta
108 | que él imaginaba ser castillo
109 |
110 | Donde se prosiguen los innumerables trabajos que el
111 | bravo don Quijote y su buen escudero Sancho Panza pasaron en la venta
112 | que, por su mal, pensó que era castillo
113 |
114 | Donde se cuentan las razones que pasó Sancho Panza
115 | con su señor Don Quijote, con otras aventuras dignas de ser
116 | contadas
117 |
118 | De las discretas razones que Sancho pasaba con su
119 | amo, y de la aventura que le sucedió con un cuerpo muerto, con otros
120 | acontecimientos famosos
121 |
122 | De la jamás vista ni oída aventura que con más poco
123 | peligro fue acabada de famoso caballero en el mundo, como la que acabó
124 | el valeroso don Quijote de la Mancha
125 |
126 | Que trata de la alta aventura y rica ganancia del
127 | yelmo de Mambrino, con otras cosas sucedidas a nuestro invencible
128 | caballero
129 |
130 | De la libertad que dio don Quijote a muchos
131 | desdichados que, mal de su grado, los llevaban donde no quisieran
132 | ir
133 |
134 | De lo que le aconteció al famoso don Quijote en
135 | Sierra Morena, que fue una de las más raras aventuras que en esta
136 | verdadera historia se cuentan
137 |
138 | Donde se prosigue la aventura de la Sierra
139 | Morena
140 |
141 | Que trata de las estrañas cosas que en Sierra Morena
142 | sucedieron al valiente caballero de la Mancha, y de la imitación que
143 | hizo a la penitencia de Beltenebros
144 |
145 | Donde se prosiguen las finezas que de enamorado hizo
146 | don Quijote en Sierra Morena
147 |
148 | De cómo salieron con su intención el cura y el
149 | barbero, con otras cosas dignas de que se cuenten en esta grande
150 | historia
151 |
152 | Que trata de la nueva y agradable aventura que al
153 | cura y barbero sucedió en la mesma sierra
154 |
155 | Que trata de la discreción de la hermosa Dorotea,
156 | con otras cosas de mucho gusto y pasatiempo
157 |
158 | Que trata del gracioso artificio y orden que se tuvo
159 | en sacar a nuestro enamorado caballero de la asperísima penitencia en
160 | que se había puesto
161 |
162 | De los sabrosos razonamientos que pasaron entre don
163 | Quijote y Sancho Panza, su escudero, con otros sucesos
164 |
165 | Que trata de lo que sucedió en la venta a toda la
166 | cuadrilla de don Quijote
167 |
168 | Donde se cuenta la novela del Curioso
169 | impertinente
170 |
--------------------------------------------------------------------------------
/test/pg41845_telugu.txt:
--------------------------------------------------------------------------------
1 | The Project Gutenberg eBook of ఓనమాలు
2 |
3 | This ebook is for the use of anyone anywhere in the United States and
4 | most other parts of the world at no cost and with almost no restrictions
5 | whatsoever. You may copy it, give it away or re-use it under the terms
6 | of the Project Gutenberg License included with this ebook or online
7 | at www.gutenberg.org. If you are not located in the United States,
8 | you will have to check the laws of the country where you are located
9 | before using this eBook.
10 |
11 | Title: ఓనమాలు
12 |
13 | Author: Mahidhara Ramamohan Rao
14 |
15 | Release date: January 14, 2013 [eBook #41845]
16 |
17 | Language: Telugu
18 |
19 | Credits: Produced by volunteers at Pustakam.net
20 |
21 |
22 | *** START OF THE PROJECT GUTENBERG EBOOK ఓనమాలు ***
23 |
24 |
25 |
26 |
27 | Produced by volunteers at Pustakam.net
28 |
29 |
30 |
31 |
32 | అవంతీ ప్రచురణలు 4.
33 |
34 |
35 |
36 |
37 | ఓనమాలు
38 |
39 |
40 |
41 |
42 | రచన:
43 |
44 | మహీధర రామమోహనరావు
45 |
46 |
47 |
48 |
49 | సోల్ డిస్ట్రిబ్యూటర్లు:
50 |
51 | విశాలాంధ్ర ప్రచురణాలయం,
52 |
53 | విజయవాడ-2
54 |
55 |
56 |
57 |
58 | మొదటి ముద్రణ
59 |
60 | 1956
61 |
62 |
63 |
64 |
65 | వెల
66 |
67 | రెండు రూపాయల పావలా
68 |
69 |
70 |
71 |
72 | అవంతీ ప్రెస్
73 |
74 | రాజమండ్రి
75 |
76 |
77 |
78 |
79 | 1947....
80 |
81 | ....నాటి తెలంగాణా ఒక అగ్నిగుండం.
82 |
83 | దుస్సహమైన జాగీర్దారీ వ్యవస్థను నిర్మూలించగల పోరాటాల్ని ప్రజానీకం సాగిస్తూంది. వాటినన్నింటినీ ఒకే జెండా క్రిందికి తెచ్చి,
84 | రాజకీయ నాయకత్వం సమకూర్చడానికై ఆంధ్రమహాసభా, కమ్యూనిస్టు పార్టీ సన్నాహాలు సాగిస్తున్నాయి.
85 |
86 | రెండో వైపున – విదేశీ పాలనకూ, సంస్థానాధీశుల నిరంకుశ పాలనకూ వ్యతిరేకంగా జాతీయ ప్రజాతంత్ర పోరాటాలు తెలంగాణాన్ని
87 | అలుముకొంటున్నాయి.
88 |
89 | ప్రజాతంత్ర హక్కులకై సాగుతున్న ఈ పోరాటాలు ఐక్యతను కూర్చుకొంటూ నిజాము పరిపాలనా యంత్రాన్ని మొదలంట కదిల్చివేస్తున్నాయి.
90 |
91 | ఈ దశలో …
92 |
93 | విచ్ఛిన్నమైపోతున్న జాగీర్దారీ వ్యవస్థను రక్షించగల శక్తి నిజాము ప్రభుత్వానికి లేదని గ్రహించిన భూస్వామ్యవర్గం నూతన
94 | నాయకత్వం కొరకై వెతుకులాడుతూ జాతీయోద్యమంలో తనకు రక్షణనివ్వగల శక్తుల్ని చూసుకొంది.
95 |
96 | సమాజంలో తనకున్న బలం క్రమంగా క్షీణించి పోతూంటే, కూలిపోతున్న తన అధికారాన్ని పరిరక్షించుకొనేటందుకై మతవాదుల్నీ, రౌడీల్ని
97 | సమీకరించి విధ్వంసకాండకు పూనుకొంది నిజాము సర్కారు.
98 |
99 | ప్రజానీకానికీ, ప్రతిరోధ శక్తులకూ మధ్య జరిగిన ఈ ఘర్షణలలో తెలంగాణా ఒక అగ్నిగుండమే అయింది.
100 |
101 | ఆనాటి సంఘర్షణలే నా ఈ నవలకు కథావస్తువు. సుదీర్ఘమైన ఈ నవలలో మొదటి భాగం పాఠకుల ముందుంచుతున్నా. త్వరలోనే
102 | మిగతావీ.
103 |
104 | విజయవాడ,
105 |
106 | 20-3-56
107 |
108 | రచయిత.
109 |
110 |
111 |
112 |
113 | భూమి కోసం
114 | భుక్తి కోసం
115 | నిగళబంధ
116 | విముక్తి కోసం
117 | నేల కొరిగిన
118 | తెలుగు జోదుల
119 | కిత్తు నంజలులు.
120 |
121 | కృతజ్ఞత
122 |
123 | తమ పత్రికలో ధారావాహికగా వెలువడిన ఈ నవలను పుస్తకరూపంలో ప్రచురించుకొనుటకనుమతించిన విశాలాంధ్ర సంపాదకులకు -
124 |
125 | రచయిత.
126 |
127 |
128 |
129 |
130 | ఓనమాలు
131 | (మొదటి భాగం)
132 |
133 |
134 |
135 |
136 | ఒకటో ప్రకరణం.
137 |
138 |
139 | అటువంటివాడు ఒక వారం పది రోజులనుంచి పరధ్యానంగా వుంటున్నాడు. ఆతడు దూరదూరంగా వుంటున్నాడనిపించింది. ఆ ఆలోచనతో మనస్సు
140 | కరిగిపోతూంది; హృదయం ఆరాటపడిపోతూంది; అతనిని కదిలించడానికి చేసిన ప్రయత్నాలన్నీ, విఫలం అయ్యాయనిపిస్తూంటే ఎంతో
141 | బాధపడిపోతూంది. ఈ వారం పది రోజులుగా అతనిలో కనిపిస్తున్న ధోరణి ఏమిటో అర్థం కాలేదు. ఏమేమిటో కారణాలు కల్పించుకొంటూంది.
142 | ఆ కారణాలన్నీ ఆమెను మరింత బాధిస్తున్నాయి.
143 |
144 | అతడు తన ఎరికలో ఇంత గాఢంగా ఆలోచనల్లో మునిగి వుండడం ఎప్పుడూ జరగలేదు. అతడు ఆలోచించవలసిన విషయాలు మాత్రం
145 | పెద్దగా ఏం వున్నాయిగనక. ఆస్తా...సెంటు భూమి లేదు. పన్నుకి పీడించేవాళ్ళింక పుట్టవలిసిందేనని అతడే వేళాకోళంగా
146 | అంటూంటాడు....తల్లా, తండ్రా?...ఆ ఇద్దరూ కూడా ఏనాడో మరణించారు.
147 |
148 | ...పెళ్ళామా, పిల్లలా?....ఈ మాట ఆలోచనకు వచ్చినప్పుడు సత్తెమ్మ అంత సులభంగా 'కాదు' అనుకోలేకపోయింది.
149 | ఆలోచించగా, ఆలోచించగా అసలు కారణం అక్కడే వున్నట్లు కూడా అనిపించింది. అనిపించడంతో కళ్ళనీళ్లు తిరిగేయి.
150 |
151 | అతనిని కాదనడానికి తనకున్న హక్కు ఏమిటి? అతని కోసం తాను ఎంతయినా త్యాగం చేసి వుండొచ్చు. ఉండొచ్చునేమిటి? చేసింది.
152 |
153 | ఊరువాళ్ళ మాటల్ని ఖాతరు చెయ్యలేదు. తల్లి ఏడ్పును లెక్కచెయ్యలేదు. కుల మర్యాదల నాలోచించలేదు. అతని కోసం
154 | ఆత్మార్పణ చేసుకొంది. సమాజంలో ఆడది చేయగల త్యాగానికది పరాకాష్ఠ. అయితేనేం?...
155 |
156 | అతడు తనకి మగడు కాదు. తనకి మగడు లేడు. వెంకటయ్య కోసం తాను ఎంత తపన పడ్డా, తానో వితంతువు మాత్రమే. అతని
157 | మీద తనకు హక్కు లేదు.
158 |
159 | తనతో సావాసం చేసేక అతడు ఇతర పడుచుల్ని అంటుకోలేదు. కన్నెత్తి కూడా చూడలేదు. వెంకటయ్య కోసం దార్లుకాచిన పడుచుల్నీ,
160 | అతని మాటకోసం కాట్లాడుకొన్న పడుచుల్నీ ఆమె ఎరుగును. అన్నీ ఎరిగే ఆమె అతనితో నేస్తం చేసింది. తనతో చేరేక అతడు
161 | పూర్తిగా మారిపోయేడు. అతని పరిచయాల విషయంలో తాను పడ్డ జాలికూడా అతనికి నవ్వుతాలయింది. ఆ సంగతినామె ఎరుగును. అతడు
162 | తనదే లోకంగా ఆనందిస్తున్నాడు. తనకేమాత్రం కష్టం కలిగినా గిజగిజలాడి పోతాడు. తన కాళ్ళక్రింద కళ్ళు పరిచేడు.
163 | కళ్ళముందు హృదయం విప్పేడు.
164 |
165 |
--------------------------------------------------------------------------------
/test/pg7193_english.txt:
--------------------------------------------------------------------------------
1 | The Project Gutenberg eBook of The Adventures of Tom Sawyer, Part 1.
2 |
3 | This ebook is for the use of anyone anywhere in the United States and
4 | most other parts of the world at no cost and with almost no restrictions
5 | whatsoever. You may copy it, give it away or re-use it under the terms
6 | of the Project Gutenberg License included with this ebook or online
7 | at www.gutenberg.org. If you are not located in the United States,
8 | you will have to check the laws of the country where you are located
9 | before using this eBook.
10 |
11 | Title: The Adventures of Tom Sawyer, Part 1.
12 |
13 | Author: Mark Twain
14 |
15 | Release date: June 29, 2004 [eBook #7193]
16 | Most recently updated: December 30, 2020
17 |
18 | Language: English
19 |
20 | Credits: Produced by David Widger
21 |
22 |
23 | *** START OF THE PROJECT GUTENBERG EBOOK THE ADVENTURES OF TOM SAWYER, PART 1. ***
24 |
25 |
26 |
27 |
28 | Produced by David Widger
29 |
30 |
31 |
32 |
33 | THE ADVENTURES OF TOM SAWYER
34 | BY
35 | MARK TWAIN
36 | (Samuel Langhorne Clemens)
37 |
38 | Part 1
39 |
40 |
41 | P R E F A C E
42 |
43 | MOST of the adventures recorded in this book really occurred; one or
44 | two were experiences of my own, the rest those of boys who were
45 | schoolmates of mine. Huck Finn is drawn from life; Tom Sawyer also, but
46 | not from an individual--he is a combination of the characteristics of
47 | three boys whom I knew, and therefore belongs to the composite order of
48 | architecture.
49 |
50 | The odd superstitions touched upon were all prevalent among children
51 | and slaves in the West at the period of this story--that is to say,
52 | thirty or forty years ago.
53 |
54 | Although my book is intended mainly for the entertainment of boys and
55 | girls, I hope it will not be shunned by men and women on that account,
56 | for part of my plan has been to try to pleasantly remind adults of what
57 | they once were themselves, and of how they felt and thought and talked,
58 | and what queer enterprises they sometimes engaged in.
59 |
60 | THE AUTHOR.
61 |
62 | HARTFORD, 1876.
63 |
64 |
65 |
66 | T O M S A W Y E R
67 |
68 |
69 |
70 | CHAPTER I
71 |
72 | "TOM!"
73 |
74 | No answer.
75 |
76 | "TOM!"
77 |
78 | No answer.
79 |
80 | "What's gone with that boy, I wonder? You TOM!"
81 |
82 | No answer.
83 |
84 | The old lady pulled her spectacles down and looked over them about the
85 | room; then she put them up and looked out under them. She seldom or
86 | never looked THROUGH them for so small a thing as a boy; they were her
87 | state pair, the pride of her heart, and were built for "style," not
88 | service--she could have seen through a pair of stove-lids just as well.
89 | She looked perplexed for a moment, and then said, not fiercely, but
90 | still loud enough for the furniture to hear:
91 |
92 | "Well, I lay if I get hold of you I'll--"
93 |
94 | She did not finish, for by this time she was bending down and punching
95 | under the bed with the broom, and so she needed breath to punctuate the
96 | punches with. She resurrected nothing but the cat.
97 |
98 | "I never did see the beat of that boy!"
99 |
100 | She went to the open door and stood in it and looked out among the
101 | tomato vines and "jimpson" weeds that constituted the garden. No Tom.
102 | So she lifted up her voice at an angle calculated for distance and
103 | shouted:
104 |
105 | "Y-o-u-u TOM!"
106 |
107 | There was a slight noise behind her and she turned just in time to
108 | seize a small boy by the slack of his roundabout and arrest his flight.
109 |
110 | "There! I might 'a' thought of that closet. What you been doing in
111 | there?"
112 |
113 | "Nothing."
114 |
115 | "Nothing! Look at your hands. And look at your mouth. What IS that
116 | truck?"
117 |
118 | "I don't know, aunt."
119 |
120 | "Well, I know. It's jam--that's what it is. Forty times I've said if
121 | you didn't let that jam alone I'd skin you. Hand me that switch."
122 |
123 | The switch hovered in the air--the peril was desperate--
124 |
125 | "My! Look behind you, aunt!"
126 |
127 | The old lady whirled round, and snatched her skirts out of danger. The
128 | lad fled on the instant, scrambled up the high board-fence, and
129 | disappeared over it.
130 |
131 | His aunt Polly stood surprised a moment, and then broke into a gentle
132 | laugh.
133 |
134 | "Hang the boy, can't I never learn anything? Ain't he played me tricks
135 | enough like that for me to be looking out for him by this time? But old
136 | fools is the biggest fools there is. Can't learn an old dog new tricks,
137 | as the saying is. But my goodness, he never plays them alike, two days,
138 | and how is a body to know what's coming? He 'pears to know just how
139 | long he can torment me before I get my dander up, and he knows if he
140 | can make out to put me off for a minute or make me laugh, it's all down
141 | again and I can't hit him a lick. I ain't doing my duty by that boy,
142 | and that's the Lord's truth, goodness knows. Spare the rod and spile
143 | the child, as the Good Book says. I'm a laying up sin and suffering for
144 | us both, I know. He's full of the Old Scratch, but laws-a-me! he's my
145 | own dead sister's boy, poor thing, and I ain't got the heart to lash
146 | him, somehow. Every time I let him off, my conscience does hurt me so,
147 | and every time I hit him my old heart most breaks. Well-a-well, man
148 | that is born of woman is of few days and full of trouble, as the
149 | Scripture says, and I reckon it's so. He'll play hookey this evening, *
150 | and [* Southwestern for "afternoon"] I'll just be obleeged to make him
151 | work, to-morrow, to punish him. It's mighty hard to make him work
152 | Saturdays, when all the boys is having holiday, but he hates work more
153 | than he hates anything else, and I've GOT to do some of my duty by him,
154 | or I'll be the ruination of the child."
155 |
156 | Tom did play hookey, and he had a very good time. He got back home
157 | barely in season to help Jim, the small colored boy, saw next-day's
158 | wood and split the kindlings before supper--at least he was there in
159 | time to tell his adventures to Jim while Jim did three-fourths of the
160 | work. Tom's younger brother (or rather half-brother) Sid was already
161 | through with his part of the work (picking up chips), for he was a
162 | quiet boy, and had no adventurous, troublesome ways.
163 |
164 | While Tom was eating his supper, and stealing sugar as opportunity
165 | offered, Aunt Polly asked him questions that were full of guile, and
166 | very deep--for she wanted to trap him into damaging revealments. Like
167 | many other simple-hearted souls, it was her pet vanity to believe she
168 | was endowed with a talent for dark and mysterious diplomacy, and she
169 | loved to contemplate her most transparent devices as marvels of low
170 | cunning. Said she:
171 |
172 | "Tom, it was middling warm in school, warn't it?"
173 |
174 | "Yes'm."
175 |
176 | "Powerful warm, warn't it?"
177 |
178 | "Yes'm."
179 |
180 | "Didn't you want to go in a-swimming, Tom?"
181 |
182 | A bit of a scare shot through Tom--a touch of uncomfortable suspicion.
183 | He searched Aunt Polly's face, but it told him nothing. So he said:
184 |
185 | "No'm--well, not very much."
186 |
187 | The old lady reached out her hand and felt Tom's shirt, and said:
188 |
189 | "But you ain't too warm now, though." And it flattered her to reflect
190 | that she had discovered that the shirt was dry without anybody knowing
191 | that that was what she had in her mind. But in spite of her, Tom knew
192 | where the wind lay, now. So he forestalled what might be the next move:
193 |
194 | "Some of us pumped on our heads--mine's damp yet. See?"
195 |
196 | Aunt Polly was vexed to think she had overlooked that bit of
197 | circumstantial evidence, and missed a trick. Then she had a new
198 | inspiration:
199 |
200 | "Tom, you didn't have to undo your shirt collar where I sewed it, to
201 | pump on your head, did you? Unbutton your jacket!"
202 |
203 | The trouble vanished out of Tom's face. He opened his jacket. His
204 | shirt collar was securely sewed.
205 |
206 | "Bother! Well, go 'long with you. I'd made sure you'd played hookey
207 | and been a-swimming. But I forgive ye, Tom. I reckon you're a kind of a
208 | singed cat, as the saying is--better'n you look. THIS time."
209 |
210 | She was half sorry her sagacity had miscarried, and half glad that Tom
211 | had stumbled into obedient conduct for once.
212 |
213 | But Sidney said:
214 |
215 | "Well, now, if I didn't think you sewed his collar with white thread,
216 | but it's black."
217 |
218 | "Why, I did sew it with white! Tom!"
219 |
220 | But Tom did not wait for the rest. As he went out at the door he said:
221 |
222 | "Siddy, I'll lick you for that."
223 |
224 | In a safe place Tom examined two large needles which were thrust into
225 | the lapels of his jacket, and had thread bound about them--one needle
226 | carried white thread and the other black. He said:
227 |
228 | "She'd never noticed if it hadn't been for Sid. Confound it! sometimes
229 | she sews it with white, and sometimes she sews it with black. I wish to
230 | geeminy she'd stick to one or t'other--I can't keep the run of 'em. But
231 | I bet you I'll lam Sid for that. I'll learn him!"
232 |
233 | He was not the Model Boy of the village. He knew the model boy very
234 | well though--and loathed him.
235 |
236 | Within two minutes, or even less, he had forgotten all his troubles.
237 | Not because his troubles were one whit less heavy and bitter to him
238 | than a man's are to a man, but because a new and powerful interest bore
239 | them down and drove them out of his mind for the time--just as men's
240 | misfortunes are forgotten in the excitement of new enterprises. This
241 | new interest was a valued novelty in whistling, which he had just
242 | acquired from a negro, and he was suffering to practise it undisturbed.
243 | It consisted in a peculiar bird-like turn, a sort of liquid warble,
244 | produced by touching the tongue to the roof of the mouth at short
245 | intervals in the midst of the music--the reader probably remembers how
246 | to do it, if he has ever been a boy. Diligence and attention soon gave
247 | him the knack of it, and he strode down the street with his mouth full
248 | of harmony and his soul full of gratitude. He felt much as an
249 | astronomer feels who has discovered a new planet--no doubt, as far as
250 | strong, deep, unalloyed pleasure is concerned, the advantage was with
251 | the boy, not the astronomer.
252 |
253 | The summer evenings were long. It was not dark, yet. Presently Tom
254 | checked his whistle. A stranger was before him--a boy a shade larger
255 | than himself. A new-comer of any age or either sex was an impressive
256 | curiosity in the poor little shabby village of St. Petersburg. This boy
257 | was well dressed, too--well dressed on a week-day. This was simply
258 | astounding. His cap was a dainty thing, his close-buttoned blue cloth
259 | roundabout was new and natty, and so were his pantaloons. He had shoes
260 | on--and it was only Friday. He even wore a necktie, a bright bit of
261 | ribbon. He had a citified air about him that ate into Tom's vitals. The
262 | more Tom stared at the splendid marvel, the higher he turned up his
263 | nose at his finery and the shabbier and shabbier his own outfit seemed
264 | to him to grow. Neither boy spoke. If one moved, the other moved--but
265 | only sidewise, in a circle; they kept face to face and eye to eye all
266 | the time. Finally Tom said:
267 |
268 | "I can lick you!"
269 |
270 | "I'd like to see you try it."
271 |
272 | "Well, I can do it."
273 |
274 | "No you can't, either."
275 |
276 | "Yes I can."
277 |
278 | "No you can't."
279 |
280 | "I can."
281 |
282 | "You can't."
283 |
284 | "Can!"
285 |
286 | "Can't!"
287 |
288 | An uncomfortable pause. Then Tom said:
289 |
290 | "What's your name?"
291 |
292 | "'Tisn't any of your business, maybe."
293 |
294 | "Well I 'low I'll MAKE it my business."
295 |
296 | "Well why don't you?"
297 |
298 | "If you say much, I will."
299 |
300 | "Much--much--MUCH. There now."
301 |
302 | "Oh, you think you're mighty smart, DON'T you? I could lick you with
303 | one hand tied behind me, if I wanted to."
304 |
305 | "Well why don't you DO it? You SAY you can do it."
306 |
307 | "Well I WILL, if you fool with me."
308 |
309 | "Oh yes--I've seen whole families in the same fix."
310 |
311 | "Smarty! You think you're SOME, now, DON'T you? Oh, what a hat!"
312 |
313 | "You can lump that hat if you don't like it. I dare you to knock it
314 | off--and anybody that'll take a dare will suck eggs."
315 |
316 | "You're a liar!"
317 |
318 | "You're another."
319 |
320 | "You're a fighting liar and dasn't take it up."
321 |
322 | "Aw--take a walk!"
323 |
324 | "Say--if you give me much more of your sass I'll take and bounce a
325 | rock off'n your head."
326 |
327 | "Oh, of COURSE you will."
328 |
329 | "Well I WILL."
330 |
331 | "Well why don't you DO it then? What do you keep SAYING you will for?
332 | Why don't you DO it? It's because you're afraid."
333 |
334 | "I AIN'T afraid."
335 |
336 | "You are."
337 |
338 | "I ain't."
339 |
340 | "You are."
341 |
342 | Another pause, and more eying and sidling around each other. Presently
343 | they were shoulder to shoulder. Tom said:
344 |
345 | "Get away from here!"
346 |
347 | "Go away yourself!"
348 |
349 | "I won't."
350 |
351 | "I won't either."
352 |
353 | So they stood, each with a foot placed at an angle as a brace, and
354 | both shoving with might and main, and glowering at each other with
355 | hate. But neither could get an advantage. After struggling till both
356 | were hot and flushed, each relaxed his strain with watchful caution,
357 | and Tom said:
358 |
359 | "You're a coward and a pup. I'll tell my big brother on you, and he
360 | can thrash you with his little finger, and I'll make him do it, too."
361 |
362 | "What do I care for your big brother? I've got a brother that's bigger
363 | than he is--and what's more, he can throw him over that fence, too."
364 | [Both brothers were imaginary.]
365 |
366 | "That's a lie."
367 |
368 | "YOUR saying so don't make it so."
369 |
370 | Tom drew a line in the dust with his big toe, and said:
371 |
372 | "I dare you to step over that, and I'll lick you till you can't stand
373 | up. Anybody that'll take a dare will steal sheep."
374 |
375 | The new boy stepped over promptly, and said:
376 |
377 | "Now you said you'd do it, now let's see you do it."
378 |
379 | "Don't you crowd me now; you better look out."
380 |
381 | "Well, you SAID you'd do it--why don't you do it?"
382 |
383 | "By jingo! for two cents I WILL do it."
384 |
385 | The new boy took two broad coppers out of his pocket and held them out
386 | with derision. Tom struck them to the ground. In an instant both boys
387 | were rolling and tumbling in the dirt, gripped together like cats; and
388 | for the space of a minute they tugged and tore at each other's hair and
389 | clothes, punched and scratched each other's nose, and covered
390 | themselves with dust and glory. Presently the confusion took form, and
391 | through the fog of battle Tom appeared, seated astride the new boy, and
392 | pounding him with his fists. "Holler 'nuff!" said he.
393 |
394 | The boy only struggled to free himself. He was crying--mainly from rage.
395 |
396 | "Holler 'nuff!"--and the pounding went on.
397 |
--------------------------------------------------------------------------------
/test/pycode1.txt:
--------------------------------------------------------------------------------
class NamedInitializer(Node):
    """Node holding an ``expr`` child and a sequence of ``name`` children.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('name', 'expr', 'coord', '__weakref__')

    attr_names = ()

    def __init__(self, name, expr, coord=None):
        self.name, self.expr, self.coord = name, expr, coord

    def children(self):
        """Return (label, child) pairs: ``expr`` first, then each ``name[i]``."""
        pairs = []
        if self.expr is not None:
            pairs.append(("expr", self.expr))
        pairs.extend(
            ("name[%d]" % idx, item) for idx, item in enumerate(self.name or [])
        )
        return tuple(pairs)

    def __iter__(self):
        """Yield ``expr`` (when not None), then every entry of ``name``."""
        if self.expr is not None:
            yield self.expr
        for item in (self.name or []):
            yield item
22 |
class ParamList(Node):
    """Node whose children are the entries of ``params``.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('params', 'coord', '__weakref__')

    attr_names = ()

    def __init__(self, params, coord=None):
        self.params, self.coord = params, coord

    def children(self):
        """Return a tuple of (``params[i]``, child) pairs in list order."""
        return tuple(
            ("params[%d]" % idx, item)
            for idx, item in enumerate(self.params or [])
        )

    def __iter__(self):
        """Yield every entry of ``params``; nothing when it is None or empty."""
        for item in (self.params or []):
            yield item
40 |
class PtrDecl(Node):
    """Node with a single ``type`` child and a ``quals`` attribute.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('quals', 'type', 'coord', '__weakref__')

    attr_names = ('quals', )

    def __init__(self, quals, type, coord=None):
        self.quals, self.type, self.coord = quals, type, coord

    def children(self):
        """Return ``(("type", child),)`` or an empty tuple when type is None."""
        if self.type is None:
            return ()
        return (("type", self.type),)

    def __iter__(self):
        """Yield ``type`` when it is not None."""
        if self.type is None:
            return
        yield self.type
58 |
class Return(Node):
    """Node with a single optional ``expr`` child.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('expr', 'coord', '__weakref__')

    attr_names = ()

    def __init__(self, expr, coord=None):
        self.expr, self.coord = expr, coord

    def children(self):
        """Return ``(("expr", child),)`` or an empty tuple when expr is None."""
        if self.expr is None:
            return ()
        return (("expr", self.expr),)

    def __iter__(self):
        """Yield ``expr`` when it is not None."""
        if self.expr is None:
            return
        yield self.expr
75 |
class StaticAssert(Node):
    """Node with optional ``cond`` and ``message`` children.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('cond', 'message', 'coord', '__weakref__')

    attr_names = ()

    def __init__(self, cond, message, coord=None):
        self.cond, self.message, self.coord = cond, message, coord

    def children(self):
        """Return (label, child) pairs for cond and message, skipping None."""
        pairs = []
        for label in ("cond", "message"):
            node = getattr(self, label)
            if node is not None:
                pairs.append((label, node))
        return tuple(pairs)

    def __iter__(self):
        """Yield cond, then message, skipping whichever is None."""
        for label in ("cond", "message"):
            node = getattr(self, label)
            if node is not None:
                yield node
96 |
class Struct(Node):
    """Node with a ``name`` attribute whose children are the ``decls`` entries.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('name', 'decls', 'coord', '__weakref__')

    attr_names = ('name', )

    def __init__(self, name, decls, coord=None):
        self.name, self.decls, self.coord = name, decls, coord

    def children(self):
        """Return a tuple of (``decls[i]``, child) pairs in list order."""
        return tuple(
            ("decls[%d]" % idx, item)
            for idx, item in enumerate(self.decls or [])
        )

    def __iter__(self):
        """Yield every entry of ``decls``; nothing when it is None or empty."""
        for item in (self.decls or []):
            yield item
115 |
class StructRef(Node):
    """Node with optional ``name`` and ``field`` children and a ``type`` attribute.

    ``type`` is listed in ``attr_names`` and is not reported as a child.
    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('name', 'type', 'field', 'coord', '__weakref__')

    attr_names = ('type', )

    def __init__(self, name, type, field, coord=None):
        self.name, self.type = name, type
        self.field, self.coord = field, coord

    def children(self):
        """Return (label, child) pairs for name and field, skipping None."""
        pairs = []
        for label in ("name", "field"):
            node = getattr(self, label)
            if node is not None:
                pairs.append((label, node))
        return tuple(pairs)

    def __iter__(self):
        """Yield name, then field, skipping whichever is None."""
        for label in ("name", "field"):
            node = getattr(self, label)
            if node is not None:
                yield node
137 |
class Switch(Node):
    """Node with optional ``cond`` and ``stmt`` children.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('cond', 'stmt', 'coord', '__weakref__')

    attr_names = ()

    def __init__(self, cond, stmt, coord=None):
        self.cond, self.stmt, self.coord = cond, stmt, coord

    def children(self):
        """Return (label, child) pairs for cond and stmt, skipping None."""
        pairs = []
        for label in ("cond", "stmt"):
            node = getattr(self, label)
            if node is not None:
                pairs.append((label, node))
        return tuple(pairs)

    def __iter__(self):
        """Yield cond, then stmt, skipping whichever is None."""
        for label in ("cond", "stmt"):
            node = getattr(self, label)
            if node is not None:
                yield node
158 |
class TernaryOp(Node):
    """Node with optional ``cond``, ``iftrue`` and ``iffalse`` children.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('cond', 'iftrue', 'iffalse', 'coord', '__weakref__')

    attr_names = ()

    def __init__(self, cond, iftrue, iffalse, coord=None):
        self.cond, self.iftrue = cond, iftrue
        self.iffalse, self.coord = iffalse, coord

    def children(self):
        """Return (label, child) pairs in cond/iftrue/iffalse order, skipping None."""
        pairs = []
        for label in ("cond", "iftrue", "iffalse"):
            node = getattr(self, label)
            if node is not None:
                pairs.append((label, node))
        return tuple(pairs)

    def __iter__(self):
        """Yield cond, iftrue and iffalse in that order, skipping None."""
        for label in ("cond", "iftrue", "iffalse"):
            node = getattr(self, label)
            if node is not None:
                yield node
183 |
class TypeDecl(Node):
    """Node with a single ``type`` child plus declname/quals/align attributes.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('declname', 'quals', 'align', 'type', 'coord', '__weakref__')

    attr_names = ('declname', 'quals', 'align', )

    def __init__(self, declname, quals, align, type, coord=None):
        self.declname, self.quals, self.align = declname, quals, align
        self.type, self.coord = type, coord

    def children(self):
        """Return ``(("type", child),)`` or an empty tuple when type is None."""
        if self.type is None:
            return ()
        return (("type", self.type),)

    def __iter__(self):
        """Yield ``type`` when it is not None."""
        if self.type is None:
            return
        yield self.type
203 |
class Typedef(Node):
    """Node with a single ``type`` child plus name/quals/storage attributes.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('name', 'quals', 'storage', 'type', 'coord', '__weakref__')

    attr_names = ('name', 'quals', 'storage', )

    def __init__(self, name, quals, storage, type, coord=None):
        self.name, self.quals, self.storage = name, quals, storage
        self.type, self.coord = type, coord

    def children(self):
        """Return ``(("type", child),)`` or an empty tuple when type is None."""
        if self.type is None:
            return ()
        return (("type", self.type),)

    def __iter__(self):
        """Yield ``type`` when it is not None."""
        if self.type is None:
            return
        yield self.type
223 |
class Typename(Node):
    """Node with a single ``type`` child plus name/quals/align attributes.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('name', 'quals', 'align', 'type', 'coord', '__weakref__')

    attr_names = ('name', 'quals', 'align', )

    def __init__(self, name, quals, align, type, coord=None):
        self.name, self.quals, self.align = name, quals, align
        self.type, self.coord = type, coord

    def children(self):
        """Return ``(("type", child),)`` or an empty tuple when type is None."""
        if self.type is None:
            return ()
        return (("type", self.type),)

    def __iter__(self):
        """Yield ``type`` when it is not None."""
        if self.type is None:
            return
        yield self.type
243 |
class UnaryOp(Node):
    """Node with a single optional ``expr`` child and an ``op`` attribute.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('op', 'expr', 'coord', '__weakref__')

    attr_names = ('op', )

    def __init__(self, op, expr, coord=None):
        self.op, self.expr, self.coord = op, expr, coord

    def children(self):
        """Return ``(("expr", child),)`` or an empty tuple when expr is None."""
        if self.expr is None:
            return ()
        return (("expr", self.expr),)

    def __iter__(self):
        """Yield ``expr`` when it is not None."""
        if self.expr is None:
            return
        yield self.expr
261 |
class Union(Node):
    """Node with a ``name`` attribute whose children are the ``decls`` entries.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('name', 'decls', 'coord', '__weakref__')

    attr_names = ('name', )

    def __init__(self, name, decls, coord=None):
        self.name, self.decls, self.coord = name, decls, coord

    def children(self):
        """Return a tuple of (``decls[i]``, child) pairs in list order."""
        return tuple(
            ("decls[%d]" % idx, item)
            for idx, item in enumerate(self.decls or [])
        )

    def __iter__(self):
        """Yield every entry of ``decls``; nothing when it is None or empty."""
        for item in (self.decls or []):
            yield item
280 |
class While(Node):
    """Node with optional ``cond`` and ``stmt`` children.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('cond', 'stmt', 'coord', '__weakref__')

    attr_names = ()

    def __init__(self, cond, stmt, coord=None):
        self.cond, self.stmt, self.coord = cond, stmt, coord

    def children(self):
        """Return (label, child) pairs for cond and stmt, skipping None."""
        pairs = []
        for label in ("cond", "stmt"):
            node = getattr(self, label)
            if node is not None:
                pairs.append((label, node))
        return tuple(pairs)

    def __iter__(self):
        """Yield cond, then stmt, skipping whichever is None."""
        for label in ("cond", "stmt"):
            node = getattr(self, label)
            if node is not None:
                yield node
301 |
class Pragma(Node):
    """Childless node carrying a ``string`` attribute.

    ``coord`` is stored exactly as passed and defaults to None.
    """
    __slots__ = ('string', 'coord', '__weakref__')

    attr_names = ('string', )

    def __init__(self, string, coord=None):
        self.string, self.coord = string, coord

    def children(self):
        """Return an empty tuple: this node has no children."""
        return ()

    def __iter__(self):
        """Generator that finishes immediately without yielding anything."""
        # Delegating to an empty tuple keeps this a generator function.
        yield from ()
317 |
--------------------------------------------------------------------------------
/test/sp-dump-ids.py:
--------------------------------------------------------------------------------
# Uses the sentencepiece package to tokenize the file provided as a command-line
# argument; emits all token IDs to stdout, one per line.
#
# Requires the MODELPATH env var to be set to the binary proto describing
# the tokenizer model.
import sentencepiece as spm
import os, sys

# Fail with a readable message instead of an IndexError when the input path
# is missing.
if len(sys.argv) < 2:
    sys.exit("usage: sp-dump-ids.py <text-file>")

# os.getenv returns None when the variable is unset; check here rather than
# handing None to SentencePieceProcessor.
model_path = os.getenv("MODELPATH")
if not model_path:
    sys.exit("MODELPATH env var must be set to the tokenizer model proto path")

# newline="" turns off newline translation so the text is read with the
# file's own line endings. The encoding is pinned to UTF-8 so decoding does
# not depend on the platform's locale default.
with open(sys.argv[1], "r", newline="", encoding="utf-8") as f:
    text = f.read()

sp = spm.SentencePieceProcessor(model_file=model_path)
ids = sp.encode(text)

# Print ids out, one per line
for token_id in ids:
    print(token_id)
17 |
--------------------------------------------------------------------------------
/token.go:
--------------------------------------------------------------------------------
1 | package sentencepiece
2 |
3 | import "fmt"
4 |
// Token is a single token produced from the input text.
//
// ID is the unique identifier the model uses for the token in its internal
// representation, and Text is the piece of text the token represents.
type Token struct {
	ID   int
	Text string
}

// String renders the token as Token{ID: <id>, Text: <text>}. Text is
// formatted with %q, so it appears double-quoted with special characters
// escaped.
func (tok Token) String() string {
	const layout = "Token{ID: %v, Text: %q}"
	return fmt.Sprintf(layout, tok.ID, tok.Text)
}
16 |
--------------------------------------------------------------------------------