├── .github └── workflows │ └── pages.yml ├── .gitignore ├── LICENSE ├── README.md ├── benchmark_test.go ├── doc └── toklogo2.png ├── example_test.go ├── go.mod ├── go.sum ├── internal ├── cmd │ ├── dumper │ │ └── main.go │ └── wasm │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── assets │ │ ├── index.html │ │ ├── script.js │ │ └── wasm_exec.js │ │ └── main.go ├── model │ ├── gen.sh │ ├── sentencepiece_model.pb.go │ └── sentencepiece_model.proto ├── prefixmatcher │ ├── prefixmatcher.go │ └── prefixmatcher_test.go └── priorityqueue │ ├── priorityqueue.go │ └── priorityqueue_test.go ├── normalize.go ├── processor.go ├── processor_test.go ├── system_test.go ├── test ├── gocode1.txt ├── htmlcode1.txt ├── latexcode1.txt ├── opening-multilang.txt ├── perlcode1.txt ├── pg2000_spanish.txt ├── pg41845_telugu.txt ├── pg7193_english.txt ├── pycode1.txt ├── romeo-juliet-english.txt └── sp-dump-ids.py └── token.go /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | # Simple workflow for deploying static content to GitHub Pages 2 | name: Deploy static content to Pages 3 | 4 | on: 5 | # Runs on pushes targeting the default branch 6 | push: 7 | branches: ["main"] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 13 | permissions: 14 | contents: read 15 | pages: write 16 | id-token: write 17 | 18 | # Allow one concurrent deployment 19 | concurrency: 20 | group: "pages" 21 | cancel-in-progress: true 22 | 23 | jobs: 24 | # Single deploy job since we're just deploying 25 | deploy: 26 | environment: 27 | name: github-pages 28 | url: ${{ steps.deployment.outputs.page_url }} 29 | runs-on: ubuntu-latest 30 | steps: 31 | - name: Checkout 32 | uses: actions/checkout@v3 33 | 34 | - name: Set up Go 35 | uses: actions/setup-go@v4 36 | with: 37 | go-version: 1.22.5 38 | 39 | - name: Setup Pages 40 | uses: 
actions/configure-pages@v2 41 | 42 | - name: Build wasm 43 | run: | 44 | cd internal/cmd/wasm 45 | make build 46 | 47 | - name: Upload artifact 48 | uses: actions/upload-pages-artifact@v1 49 | with: 50 | # Upload 51 | path: 'internal/cmd/wasm/assets' 52 | 53 | - name: Deploy to GitHub Pages 54 | id: deployment 55 | uses: actions/deploy-pages@v1 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | go.work.sum 23 | 24 | # env file 25 | .env 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-sentencepiece 2 | 3 |

4 | Logo 5 |

6 | 7 | ---- 8 | 9 | [![Go Reference](https://pkg.go.dev/badge/github.com/eliben/go-sentencepiece.svg)](https://pkg.go.dev/github.com/eliben/go-sentencepiece) 10 | 11 | This is a pure Go implementation of encoding and decoding text with 12 | the [SentencePiece tokenizer](https://github.com/google/sentencepiece). 13 | 14 | "Encoding" is the operation used to split text into tokens, using 15 | a trained tokenizer model. "Decoding" is the reverse process - converting 16 | a list of tokens into the original text. 17 | 18 | SentencePiece is a general family of tokenizers that is configured 19 | by a protobuf configuration file. This repository currently focuses 20 | on implementing just the functionality required to reproduce the 21 | tokenization of [Gemma models](https://ai.google.dev/gemma) (the same 22 | tokenizer is used for Google's proprietary Gemini family of models). 23 | Specifically, it only implements BPE tokenization since this is what 24 | Gemma uses. 25 | 26 | ## Current status 27 | 28 | This package should be ready to use for encoding text into tokens 29 | using the Gemma tokenizer; it's been reasonably optimized and extensively 30 | tested vs. the [SentencePiece Python bindings](https://pypi.org/project/sentencepiece/) 31 | (see `system_test.go` in this repository). 32 | 33 | If you find any problems or discrepancies, please open an issue. 34 | 35 | ## Tokenizer configuration 36 | 37 | The configuration file for the tokenizer is a protobuf (structured 38 | data, serialized in the [protocol buffer format](https://protobuf.dev/)) 39 | that describes a trained tokenizer model; it includes 40 | the complete learned vocabulary used for tokenization, as well as 41 | other configuration information. 42 | 43 | It is not part of this repository. Please fetch it from the 44 | [official Gemma implementation repository](https://github.com/google/gemma_pytorch/tree/main/tokenizer). 45 | `NewProcessor*` constructors will expect to read this file. 
46 | 47 | ## Developing 48 | 49 | A protobuf is used to configure the tokenizer. The structure of the 50 | protobuf is described by the `internal/model/sentencepiece_model.proto` file, 51 | which is vendored from https://github.com/google/sentencepiece 52 | 53 | To re-generate the `*.pb.go` file from it: 54 | 55 | ``` 56 | $ cd internal/model 57 | $ ./gen.sh 58 | ``` 59 | 60 | The configuration protobuf itself is obtained as described in the 61 | [Tokenizer configuration](#tokenizer-configuration) section. All 62 | tests require the `MODELPATH` env var to point to a local 63 | copy of the tokenizer configuration file. 64 | 65 | ## Online demo 66 | 67 | To see an in-browser demo of this tokenizer in action, visit 68 | https://eliben.github.io/go-sentencepiece/ 69 | 70 | The Go code is compiled to WebAssembly and loaded from a small 71 | JS program to allow interactive encoding of text. 72 | -------------------------------------------------------------------------------- /benchmark_test.go: -------------------------------------------------------------------------------- 1 | package sentencepiece 2 | 3 | import ( 4 | "io/ioutil" 5 | "path/filepath" 6 | "runtime" 7 | "testing" 8 | ) 9 | 10 | func BenchmarkEncoder(b *testing.B) { 11 | buf, err := ioutil.ReadFile(filepath.Join("test", "pg7193_english.txt")) 12 | if err != nil { 13 | b.Fatal(err) 14 | } 15 | sbuf := string(buf) 16 | 17 | proc := createProcessor(b) 18 | b.ResetTimer() 19 | total := 0 20 | 21 | for range b.N { 22 | toks := proc.Encode(sbuf) 23 | total += len(toks) 24 | } 25 | runtime.KeepAlive(total) 26 | 27 | b.ReportMetric(float64(total)/float64(b.Elapsed().Seconds()), "tokens/sec") 28 | } 29 | 30 | func BenchmarkDecoder(b *testing.B) { 31 | buf, err := ioutil.ReadFile(filepath.Join("test", "pg7193_english.txt")) 32 | if err != nil { 33 | b.Fatal(err) 34 | } 35 | sbuf := string(buf) 36 | 37 | proc := createProcessor(b) 38 | toks := proc.Encode(sbuf) 39 | 40 | b.ResetTimer() 41 | total := 0 42 | 43 | for 
range b.N { 44 | t := proc.DecodeTokens(toks) 45 | total += len(t) 46 | } 47 | runtime.KeepAlive(total) 48 | 49 | b.ReportMetric(float64(len(toks)*b.N)/float64(b.Elapsed().Seconds()), "tokens/sec") 50 | } 51 | -------------------------------------------------------------------------------- /doc/toklogo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eliben/go-sentencepiece/dd59fe97df461d1fa84d15c25a51f025156eece1/doc/toklogo2.png -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | package sentencepiece_test 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | 8 | "github.com/eliben/go-sentencepiece" 9 | ) 10 | 11 | func ExampleEncode() { 12 | protoFile := os.Getenv("MODELPATH") 13 | if protoFile == "" { 14 | log.Println("Need MODELPATH env var to run example") 15 | return 16 | } 17 | 18 | proc, err := sentencepiece.NewProcessorFromPath(protoFile) 19 | if err != nil { 20 | log.Fatal(err) 21 | } 22 | 23 | text := "Encoding produces tokens that LLMs can learn and understand" 24 | tokens := proc.Encode(text) 25 | 26 | for _, token := range tokens { 27 | fmt.Println(token) 28 | } 29 | } 30 | 31 | func ExampleDecode() { 32 | protoFile := os.Getenv("MODELPATH") 33 | if protoFile == "" { 34 | log.Println("Need MODELPATH env var to run example") 35 | return 36 | } 37 | 38 | proc, err := sentencepiece.NewProcessorFromPath(protoFile) 39 | if err != nil { 40 | log.Fatal(err) 41 | } 42 | 43 | ids := []int{17534, 2134} 44 | text := proc.Decode(ids) 45 | 46 | fmt.Println(text) 47 | } 48 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/eliben/go-sentencepiece 2 | 3 | go 1.22.5 4 | 5 | require google.golang.org/protobuf v1.34.2 6 | 
-------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= 2 | github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 3 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= 4 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 5 | google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= 6 | google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= 7 | -------------------------------------------------------------------------------- /internal/cmd/dumper/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // Command dumper is a debugging utility for internal use. It helps explore 4 | // the model proto and compare results with other tools. 
5 | 6 | import ( 7 | "flag" 8 | "fmt" 9 | "io/ioutil" 10 | "log" 11 | "os" 12 | "unicode" 13 | 14 | "github.com/eliben/go-sentencepiece" 15 | "github.com/eliben/go-sentencepiece/internal/model" 16 | "google.golang.org/protobuf/encoding/prototext" 17 | "google.golang.org/protobuf/proto" 18 | ) 19 | 20 | func main() { 21 | fDumpAll := flag.Bool("dumpall", false, "dump entire model proto") 22 | fFindUni := flag.Bool("finduni", false, "find unicode runes not in pieces") 23 | fFindBytes := flag.Bool("findbytes", false, "show all byte pieces with their IDs") 24 | fEncodeFile := flag.String("encodefile", "", "file name to open and encode") 25 | flag.Parse() 26 | 27 | modelPath := os.Getenv("MODELPATH") 28 | if modelPath == "" { 29 | log.Fatal("Need MODELPATH env var to run") 30 | } 31 | 32 | b, err := ioutil.ReadFile(modelPath) 33 | if err != nil { 34 | log.Fatal(err) 35 | } 36 | 37 | var protomodel model.ModelProto 38 | err = proto.Unmarshal(b, &protomodel) 39 | if err != nil { 40 | log.Fatal(err) 41 | } 42 | 43 | if *fDumpAll { 44 | fmt.Println(prototext.Format(&protomodel)) 45 | } else if *fFindBytes { 46 | for i, piece := range protomodel.GetPieces() { 47 | if piece.GetType() == model.ModelProto_SentencePiece_BYTE { 48 | fmt.Printf("%5d: %s\n", i, piece.GetPiece()) 49 | } 50 | } 51 | 52 | } else if *fFindUni { 53 | pieces := make(map[string]int) 54 | for i, piece := range protomodel.GetPieces() { 55 | pieces[piece.GetPiece()] = i 56 | } 57 | 58 | for r := rune(0); r <= unicode.MaxRune; r++ { 59 | if unicode.IsPrint(r) { 60 | if _, found := pieces[string(r)]; !found { 61 | fmt.Printf("not in pieces: %U %q\n", r, string(r)) 62 | } 63 | } 64 | } 65 | } else if *fEncodeFile != "" { 66 | proc, err := sentencepiece.NewProcessorFromPath(modelPath) 67 | if err != nil { 68 | log.Fatal(err) 69 | } 70 | 71 | b, err := ioutil.ReadFile(*fEncodeFile) 72 | if err != nil { 73 | log.Fatal(err) 74 | } 75 | 76 | tokens := proc.Encode(string(b)) 77 | for _, t := range tokens { 78 | 
fmt.Println(t.ID) 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /internal/cmd/wasm/.gitignore: -------------------------------------------------------------------------------- 1 | *.wasm 2 | embed_data 3 | -------------------------------------------------------------------------------- /internal/cmd/wasm/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build serve clean 2 | 3 | build: 4 | mkdir -p embed_data 5 | wget https://github.com/google/gemma_pytorch/raw/main/tokenizer/tokenizer.model -O embed_data/tokenizer.model 6 | GOOS=js GOARCH=wasm go build -o assets/gospm.wasm main.go 7 | 8 | serve: 9 | go run github.com/eliben/static-server@latest -port 8873 assets 10 | 11 | clean: 12 | rm -rf embed_data assets/gospm.wasm 13 | -------------------------------------------------------------------------------- /internal/cmd/wasm/assets/index.html: -------------------------------------------------------------------------------- 1 | 2 | 96 | 97 | 98 | 106 | 107 | 108 |
109 |
Text
110 |
111 | 112 |
113 |
Tokens
114 |
115 |
116 |
117 |
118 |
119 | 120 | 121 | 122 | 123 | 124 |
125 |
126 | 140 |
141 | 142 | -------------------------------------------------------------------------------- /internal/cmd/wasm/assets/script.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const TextBox = document.querySelector('#text'); 4 | TextBox.addEventListener('input', onStateChange); 5 | 6 | const OutBox = document.querySelector('#tokens'); 7 | 8 | let radioText = document.querySelector('#showText'); 9 | let radioTokens = document.querySelector('#showTokens'); 10 | radioText.addEventListener('change', onStateChange); 11 | radioTokens.addEventListener('change', onStateChange); 12 | 13 | function init() { 14 | // Trigger a redraw to get started. 15 | onStateChange(); 16 | } 17 | 18 | //------------------ 19 | 20 | function onStateChange() { 21 | const text = TextBox.value; 22 | 23 | if (radioTokens.checked) { 24 | const start = performance.now(); 25 | let tokens = textToIDs(text); 26 | const end = performance.now(); 27 | console.log("textToIDs elapsed (ms): ", end - start); 28 | OutBox.textContent = "[" + tokens.join(", ") + "]"; 29 | } else { 30 | const start = performance.now(); 31 | let pieces = textToPieces(text); 32 | const end = performance.now(); 33 | console.log("textToPieces elapsed (ms): ", end - start); 34 | console.log(pieces); 35 | 36 | OutBox.innerHTML = ''; 37 | // To have different background colors for each piece, we need to 38 | // wrap each piece in a span. The color is cycled between 8 different 39 | // colors, in jumps of 135 degrees to make them sufficiently far apart 40 | // and not repeat for 8 cycles (since 360/8 = 45, we could use any 41 | // multiple of 45 that's not also a multiple of 180). 
42 | for (let i = 0; i < pieces.length; i++) { 43 | if (pieces[i] === '\n') { 44 | OutBox.appendChild(document.createElement('br')); 45 | } else { 46 | let color = i % 8; 47 | let span = document.createElement('span'); 48 | span.textContent = pieces[i]; 49 | span.style.lineHeight = 1.5; 50 | span.style.backgroundColor = `hsl(${color * 135}, 40%, 70%)`; 51 | span.style.whiteSpace = 'pre'; 52 | span.style.display = 'inline-block'; 53 | OutBox.appendChild(span); 54 | } 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /internal/cmd/wasm/assets/wasm_exec.js: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | "use strict"; 6 | 7 | (() => { 8 | const enosys = () => { 9 | const err = new Error("not implemented"); 10 | err.code = "ENOSYS"; 11 | return err; 12 | }; 13 | 14 | if (!globalThis.fs) { 15 | let outputBuf = ""; 16 | globalThis.fs = { 17 | constants: { O_WRONLY: -1, O_RDWR: -1, O_CREAT: -1, O_TRUNC: -1, O_APPEND: -1, O_EXCL: -1 }, // unused 18 | writeSync(fd, buf) { 19 | outputBuf += decoder.decode(buf); 20 | const nl = outputBuf.lastIndexOf("\n"); 21 | if (nl != -1) { 22 | console.log(outputBuf.substring(0, nl)); 23 | outputBuf = outputBuf.substring(nl + 1); 24 | } 25 | return buf.length; 26 | }, 27 | write(fd, buf, offset, length, position, callback) { 28 | if (offset !== 0 || length !== buf.length || position !== null) { 29 | callback(enosys()); 30 | return; 31 | } 32 | const n = this.writeSync(fd, buf); 33 | callback(null, n); 34 | }, 35 | chmod(path, mode, callback) { callback(enosys()); }, 36 | chown(path, uid, gid, callback) { callback(enosys()); }, 37 | close(fd, callback) { callback(enosys()); }, 38 | fchmod(fd, mode, callback) { callback(enosys()); }, 39 | fchown(fd, uid, gid, callback) { 
callback(enosys()); }, 40 | fstat(fd, callback) { callback(enosys()); }, 41 | fsync(fd, callback) { callback(null); }, 42 | ftruncate(fd, length, callback) { callback(enosys()); }, 43 | lchown(path, uid, gid, callback) { callback(enosys()); }, 44 | link(path, link, callback) { callback(enosys()); }, 45 | lstat(path, callback) { callback(enosys()); }, 46 | mkdir(path, perm, callback) { callback(enosys()); }, 47 | open(path, flags, mode, callback) { callback(enosys()); }, 48 | read(fd, buffer, offset, length, position, callback) { callback(enosys()); }, 49 | readdir(path, callback) { callback(enosys()); }, 50 | readlink(path, callback) { callback(enosys()); }, 51 | rename(from, to, callback) { callback(enosys()); }, 52 | rmdir(path, callback) { callback(enosys()); }, 53 | stat(path, callback) { callback(enosys()); }, 54 | symlink(path, link, callback) { callback(enosys()); }, 55 | truncate(path, length, callback) { callback(enosys()); }, 56 | unlink(path, callback) { callback(enosys()); }, 57 | utimes(path, atime, mtime, callback) { callback(enosys()); }, 58 | }; 59 | } 60 | 61 | if (!globalThis.process) { 62 | globalThis.process = { 63 | getuid() { return -1; }, 64 | getgid() { return -1; }, 65 | geteuid() { return -1; }, 66 | getegid() { return -1; }, 67 | getgroups() { throw enosys(); }, 68 | pid: -1, 69 | ppid: -1, 70 | umask() { throw enosys(); }, 71 | cwd() { throw enosys(); }, 72 | chdir() { throw enosys(); }, 73 | } 74 | } 75 | 76 | if (!globalThis.crypto) { 77 | throw new Error("globalThis.crypto is not available, polyfill required (crypto.getRandomValues only)"); 78 | } 79 | 80 | if (!globalThis.performance) { 81 | throw new Error("globalThis.performance is not available, polyfill required (performance.now only)"); 82 | } 83 | 84 | if (!globalThis.TextEncoder) { 85 | throw new Error("globalThis.TextEncoder is not available, polyfill required"); 86 | } 87 | 88 | if (!globalThis.TextDecoder) { 89 | throw new Error("globalThis.TextDecoder is not available, 
polyfill required"); 90 | } 91 | 92 | const encoder = new TextEncoder("utf-8"); 93 | const decoder = new TextDecoder("utf-8"); 94 | 95 | globalThis.Go = class { 96 | constructor() { 97 | this.argv = ["js"]; 98 | this.env = {}; 99 | this.exit = (code) => { 100 | if (code !== 0) { 101 | console.warn("exit code:", code); 102 | } 103 | }; 104 | this._exitPromise = new Promise((resolve) => { 105 | this._resolveExitPromise = resolve; 106 | }); 107 | this._pendingEvent = null; 108 | this._scheduledTimeouts = new Map(); 109 | this._nextCallbackTimeoutID = 1; 110 | 111 | const setInt64 = (addr, v) => { 112 | this.mem.setUint32(addr + 0, v, true); 113 | this.mem.setUint32(addr + 4, Math.floor(v / 4294967296), true); 114 | } 115 | 116 | const setInt32 = (addr, v) => { 117 | this.mem.setUint32(addr + 0, v, true); 118 | } 119 | 120 | const getInt64 = (addr) => { 121 | const low = this.mem.getUint32(addr + 0, true); 122 | const high = this.mem.getInt32(addr + 4, true); 123 | return low + high * 4294967296; 124 | } 125 | 126 | const loadValue = (addr) => { 127 | const f = this.mem.getFloat64(addr, true); 128 | if (f === 0) { 129 | return undefined; 130 | } 131 | if (!isNaN(f)) { 132 | return f; 133 | } 134 | 135 | const id = this.mem.getUint32(addr, true); 136 | return this._values[id]; 137 | } 138 | 139 | const storeValue = (addr, v) => { 140 | const nanHead = 0x7FF80000; 141 | 142 | if (typeof v === "number" && v !== 0) { 143 | if (isNaN(v)) { 144 | this.mem.setUint32(addr + 4, nanHead, true); 145 | this.mem.setUint32(addr, 0, true); 146 | return; 147 | } 148 | this.mem.setFloat64(addr, v, true); 149 | return; 150 | } 151 | 152 | if (v === undefined) { 153 | this.mem.setFloat64(addr, 0, true); 154 | return; 155 | } 156 | 157 | let id = this._ids.get(v); 158 | if (id === undefined) { 159 | id = this._idPool.pop(); 160 | if (id === undefined) { 161 | id = this._values.length; 162 | } 163 | this._values[id] = v; 164 | this._goRefCounts[id] = 0; 165 | this._ids.set(v, id); 166 | } 
167 | this._goRefCounts[id]++; 168 | let typeFlag = 0; 169 | switch (typeof v) { 170 | case "object": 171 | if (v !== null) { 172 | typeFlag = 1; 173 | } 174 | break; 175 | case "string": 176 | typeFlag = 2; 177 | break; 178 | case "symbol": 179 | typeFlag = 3; 180 | break; 181 | case "function": 182 | typeFlag = 4; 183 | break; 184 | } 185 | this.mem.setUint32(addr + 4, nanHead | typeFlag, true); 186 | this.mem.setUint32(addr, id, true); 187 | } 188 | 189 | const loadSlice = (addr) => { 190 | const array = getInt64(addr + 0); 191 | const len = getInt64(addr + 8); 192 | return new Uint8Array(this._inst.exports.mem.buffer, array, len); 193 | } 194 | 195 | const loadSliceOfValues = (addr) => { 196 | const array = getInt64(addr + 0); 197 | const len = getInt64(addr + 8); 198 | const a = new Array(len); 199 | for (let i = 0; i < len; i++) { 200 | a[i] = loadValue(array + i * 8); 201 | } 202 | return a; 203 | } 204 | 205 | const loadString = (addr) => { 206 | const saddr = getInt64(addr + 0); 207 | const len = getInt64(addr + 8); 208 | return decoder.decode(new DataView(this._inst.exports.mem.buffer, saddr, len)); 209 | } 210 | 211 | const timeOrigin = Date.now() - performance.now(); 212 | this.importObject = { 213 | _gotest: { 214 | add: (a, b) => a + b, 215 | }, 216 | gojs: { 217 | // Go's SP does not change as long as no Go code is running. Some operations (e.g. calls, getters and setters) 218 | // may synchronously trigger a Go event handler. This makes Go code get executed in the middle of the imported 219 | // function. A goroutine can switch to a new stack if the current stack is too small (see morestack function). 220 | // This changes the SP, thus we have to update the SP used by the imported function. 
221 | 222 | // func wasmExit(code int32) 223 | "runtime.wasmExit": (sp) => { 224 | sp >>>= 0; 225 | const code = this.mem.getInt32(sp + 8, true); 226 | this.exited = true; 227 | delete this._inst; 228 | delete this._values; 229 | delete this._goRefCounts; 230 | delete this._ids; 231 | delete this._idPool; 232 | this.exit(code); 233 | }, 234 | 235 | // func wasmWrite(fd uintptr, p unsafe.Pointer, n int32) 236 | "runtime.wasmWrite": (sp) => { 237 | sp >>>= 0; 238 | const fd = getInt64(sp + 8); 239 | const p = getInt64(sp + 16); 240 | const n = this.mem.getInt32(sp + 24, true); 241 | fs.writeSync(fd, new Uint8Array(this._inst.exports.mem.buffer, p, n)); 242 | }, 243 | 244 | // func resetMemoryDataView() 245 | "runtime.resetMemoryDataView": (sp) => { 246 | sp >>>= 0; 247 | this.mem = new DataView(this._inst.exports.mem.buffer); 248 | }, 249 | 250 | // func nanotime1() int64 251 | "runtime.nanotime1": (sp) => { 252 | sp >>>= 0; 253 | setInt64(sp + 8, (timeOrigin + performance.now()) * 1000000); 254 | }, 255 | 256 | // func walltime() (sec int64, nsec int32) 257 | "runtime.walltime": (sp) => { 258 | sp >>>= 0; 259 | const msec = (new Date).getTime(); 260 | setInt64(sp + 8, msec / 1000); 261 | this.mem.setInt32(sp + 16, (msec % 1000) * 1000000, true); 262 | }, 263 | 264 | // func scheduleTimeoutEvent(delay int64) int32 265 | "runtime.scheduleTimeoutEvent": (sp) => { 266 | sp >>>= 0; 267 | const id = this._nextCallbackTimeoutID; 268 | this._nextCallbackTimeoutID++; 269 | this._scheduledTimeouts.set(id, setTimeout( 270 | () => { 271 | this._resume(); 272 | while (this._scheduledTimeouts.has(id)) { 273 | // for some reason Go failed to register the timeout event, log and try again 274 | // (temporary workaround for https://github.com/golang/go/issues/28975) 275 | console.warn("scheduleTimeoutEvent: missed timeout event"); 276 | this._resume(); 277 | } 278 | }, 279 | getInt64(sp + 8), 280 | )); 281 | this.mem.setInt32(sp + 16, id, true); 282 | }, 283 | 284 | // func 
clearTimeoutEvent(id int32) 285 | "runtime.clearTimeoutEvent": (sp) => { 286 | sp >>>= 0; 287 | const id = this.mem.getInt32(sp + 8, true); 288 | clearTimeout(this._scheduledTimeouts.get(id)); 289 | this._scheduledTimeouts.delete(id); 290 | }, 291 | 292 | // func getRandomData(r []byte) 293 | "runtime.getRandomData": (sp) => { 294 | sp >>>= 0; 295 | crypto.getRandomValues(loadSlice(sp + 8)); 296 | }, 297 | 298 | // func finalizeRef(v ref) 299 | "syscall/js.finalizeRef": (sp) => { 300 | sp >>>= 0; 301 | const id = this.mem.getUint32(sp + 8, true); 302 | this._goRefCounts[id]--; 303 | if (this._goRefCounts[id] === 0) { 304 | const v = this._values[id]; 305 | this._values[id] = null; 306 | this._ids.delete(v); 307 | this._idPool.push(id); 308 | } 309 | }, 310 | 311 | // func stringVal(value string) ref 312 | "syscall/js.stringVal": (sp) => { 313 | sp >>>= 0; 314 | storeValue(sp + 24, loadString(sp + 8)); 315 | }, 316 | 317 | // func valueGet(v ref, p string) ref 318 | "syscall/js.valueGet": (sp) => { 319 | sp >>>= 0; 320 | const result = Reflect.get(loadValue(sp + 8), loadString(sp + 16)); 321 | sp = this._inst.exports.getsp() >>> 0; // see comment above 322 | storeValue(sp + 32, result); 323 | }, 324 | 325 | // func valueSet(v ref, p string, x ref) 326 | "syscall/js.valueSet": (sp) => { 327 | sp >>>= 0; 328 | Reflect.set(loadValue(sp + 8), loadString(sp + 16), loadValue(sp + 32)); 329 | }, 330 | 331 | // func valueDelete(v ref, p string) 332 | "syscall/js.valueDelete": (sp) => { 333 | sp >>>= 0; 334 | Reflect.deleteProperty(loadValue(sp + 8), loadString(sp + 16)); 335 | }, 336 | 337 | // func valueIndex(v ref, i int) ref 338 | "syscall/js.valueIndex": (sp) => { 339 | sp >>>= 0; 340 | storeValue(sp + 24, Reflect.get(loadValue(sp + 8), getInt64(sp + 16))); 341 | }, 342 | 343 | // valueSetIndex(v ref, i int, x ref) 344 | "syscall/js.valueSetIndex": (sp) => { 345 | sp >>>= 0; 346 | Reflect.set(loadValue(sp + 8), getInt64(sp + 16), loadValue(sp + 24)); 347 | }, 348 | 349 
| // func valueCall(v ref, m string, args []ref) (ref, bool) 350 | "syscall/js.valueCall": (sp) => { 351 | sp >>>= 0; 352 | try { 353 | const v = loadValue(sp + 8); 354 | const m = Reflect.get(v, loadString(sp + 16)); 355 | const args = loadSliceOfValues(sp + 32); 356 | const result = Reflect.apply(m, v, args); 357 | sp = this._inst.exports.getsp() >>> 0; // see comment above 358 | storeValue(sp + 56, result); 359 | this.mem.setUint8(sp + 64, 1); 360 | } catch (err) { 361 | sp = this._inst.exports.getsp() >>> 0; // see comment above 362 | storeValue(sp + 56, err); 363 | this.mem.setUint8(sp + 64, 0); 364 | } 365 | }, 366 | 367 | // func valueInvoke(v ref, args []ref) (ref, bool) 368 | "syscall/js.valueInvoke": (sp) => { 369 | sp >>>= 0; 370 | try { 371 | const v = loadValue(sp + 8); 372 | const args = loadSliceOfValues(sp + 16); 373 | const result = Reflect.apply(v, undefined, args); 374 | sp = this._inst.exports.getsp() >>> 0; // see comment above 375 | storeValue(sp + 40, result); 376 | this.mem.setUint8(sp + 48, 1); 377 | } catch (err) { 378 | sp = this._inst.exports.getsp() >>> 0; // see comment above 379 | storeValue(sp + 40, err); 380 | this.mem.setUint8(sp + 48, 0); 381 | } 382 | }, 383 | 384 | // func valueNew(v ref, args []ref) (ref, bool) 385 | "syscall/js.valueNew": (sp) => { 386 | sp >>>= 0; 387 | try { 388 | const v = loadValue(sp + 8); 389 | const args = loadSliceOfValues(sp + 16); 390 | const result = Reflect.construct(v, args); 391 | sp = this._inst.exports.getsp() >>> 0; // see comment above 392 | storeValue(sp + 40, result); 393 | this.mem.setUint8(sp + 48, 1); 394 | } catch (err) { 395 | sp = this._inst.exports.getsp() >>> 0; // see comment above 396 | storeValue(sp + 40, err); 397 | this.mem.setUint8(sp + 48, 0); 398 | } 399 | }, 400 | 401 | // func valueLength(v ref) int 402 | "syscall/js.valueLength": (sp) => { 403 | sp >>>= 0; 404 | setInt64(sp + 16, parseInt(loadValue(sp + 8).length)); 405 | }, 406 | 407 | // valuePrepareString(v ref) (ref, 
int) 408 | "syscall/js.valuePrepareString": (sp) => { 409 | sp >>>= 0; 410 | const str = encoder.encode(String(loadValue(sp + 8))); 411 | storeValue(sp + 16, str); 412 | setInt64(sp + 24, str.length); 413 | }, 414 | 415 | // valueLoadString(v ref, b []byte) 416 | "syscall/js.valueLoadString": (sp) => { 417 | sp >>>= 0; 418 | const str = loadValue(sp + 8); 419 | loadSlice(sp + 16).set(str); 420 | }, 421 | 422 | // func valueInstanceOf(v ref, t ref) bool 423 | "syscall/js.valueInstanceOf": (sp) => { 424 | sp >>>= 0; 425 | this.mem.setUint8(sp + 24, (loadValue(sp + 8) instanceof loadValue(sp + 16)) ? 1 : 0); 426 | }, 427 | 428 | // func copyBytesToGo(dst []byte, src ref) (int, bool) 429 | "syscall/js.copyBytesToGo": (sp) => { 430 | sp >>>= 0; 431 | const dst = loadSlice(sp + 8); 432 | const src = loadValue(sp + 32); 433 | if (!(src instanceof Uint8Array || src instanceof Uint8ClampedArray)) { 434 | this.mem.setUint8(sp + 48, 0); 435 | return; 436 | } 437 | const toCopy = src.subarray(0, dst.length); 438 | dst.set(toCopy); 439 | setInt64(sp + 40, toCopy.length); 440 | this.mem.setUint8(sp + 48, 1); 441 | }, 442 | 443 | // func copyBytesToJS(dst ref, src []byte) (int, bool) 444 | "syscall/js.copyBytesToJS": (sp) => { 445 | sp >>>= 0; 446 | const dst = loadValue(sp + 8); 447 | const src = loadSlice(sp + 16); 448 | if (!(dst instanceof Uint8Array || dst instanceof Uint8ClampedArray)) { 449 | this.mem.setUint8(sp + 48, 0); 450 | return; 451 | } 452 | const toCopy = src.subarray(0, dst.length); 453 | dst.set(toCopy); 454 | setInt64(sp + 40, toCopy.length); 455 | this.mem.setUint8(sp + 48, 1); 456 | }, 457 | 458 | "debug": (value) => { 459 | console.log(value); 460 | }, 461 | } 462 | }; 463 | } 464 | 465 | async run(instance) { 466 | if (!(instance instanceof WebAssembly.Instance)) { 467 | throw new Error("Go.run: WebAssembly.Instance expected"); 468 | } 469 | this._inst = instance; 470 | this.mem = new DataView(this._inst.exports.mem.buffer); 471 | this._values = [ // JS 
values that Go currently has references to, indexed by reference id 472 | NaN, 473 | 0, 474 | null, 475 | true, 476 | false, 477 | globalThis, 478 | this, 479 | ]; 480 | this._goRefCounts = new Array(this._values.length).fill(Infinity); // number of references that Go has to a JS value, indexed by reference id 481 | this._ids = new Map([ // mapping from JS values to reference ids 482 | [0, 1], 483 | [null, 2], 484 | [true, 3], 485 | [false, 4], 486 | [globalThis, 5], 487 | [this, 6], 488 | ]); 489 | this._idPool = []; // unused ids that have been garbage collected 490 | this.exited = false; // whether the Go program has exited 491 | 492 | // Pass command line arguments and environment variables to WebAssembly by writing them to the linear memory. 493 | let offset = 4096; 494 | 495 | const strPtr = (str) => { 496 | const ptr = offset; 497 | const bytes = encoder.encode(str + "\0"); 498 | new Uint8Array(this.mem.buffer, offset, bytes.length).set(bytes); 499 | offset += bytes.length; 500 | if (offset % 8 !== 0) { 501 | offset += 8 - (offset % 8); 502 | } 503 | return ptr; 504 | }; 505 | 506 | const argc = this.argv.length; 507 | 508 | const argvPtrs = []; 509 | this.argv.forEach((arg) => { 510 | argvPtrs.push(strPtr(arg)); 511 | }); 512 | argvPtrs.push(0); 513 | 514 | const keys = Object.keys(this.env).sort(); 515 | keys.forEach((key) => { 516 | argvPtrs.push(strPtr(`${key}=${this.env[key]}`)); 517 | }); 518 | argvPtrs.push(0); 519 | 520 | const argv = offset; 521 | argvPtrs.forEach((ptr) => { 522 | this.mem.setUint32(offset, ptr, true); 523 | this.mem.setUint32(offset + 4, 0, true); 524 | offset += 8; 525 | }); 526 | 527 | // The linker guarantees global data starts from at least wasmMinDataAddr. 528 | // Keep in sync with cmd/link/internal/ld/data.go:wasmMinDataAddr. 
529 | const wasmMinDataAddr = 4096 + 8192; 530 | if (offset >= wasmMinDataAddr) { 531 | throw new Error("total length of command line and environment variables exceeds limit"); 532 | } 533 | 534 | this._inst.exports.run(argc, argv); 535 | if (this.exited) { 536 | this._resolveExitPromise(); 537 | } 538 | await this._exitPromise; 539 | } 540 | 541 | _resume() { 542 | if (this.exited) { 543 | throw new Error("Go program has already exited"); 544 | } 545 | this._inst.exports.resume(); 546 | if (this.exited) { 547 | this._resolveExitPromise(); 548 | } 549 | } 550 | 551 | _makeFuncWrapper(id) { 552 | const go = this; 553 | return function () { 554 | const event = { id: id, this: this, args: arguments }; 555 | go._pendingEvent = event; 556 | go._resume(); 557 | return event.result; 558 | }; 559 | } 560 | } 561 | })(); 562 | -------------------------------------------------------------------------------- /internal/cmd/wasm/main.go: -------------------------------------------------------------------------------- 1 | //go:build js && wasm 2 | 3 | // Main binary for exposing the go-sentencepiece functionality in the browser 4 | // via WASM. The required functionality is exposed via the syscall/js interface. 5 | // This module should only be built in js && wasm mode. 
6 | package main 7 | 8 | import ( 9 | _ "embed" 10 | "fmt" 11 | "log" 12 | "strings" 13 | "sync" 14 | "syscall/js" 15 | 16 | "github.com/eliben/go-sentencepiece" 17 | ) 18 | 19 | //go:embed embed_data/tokenizer.model 20 | var modelFileData string 21 | var spm *sentencepiece.Processor 22 | 23 | func main() { 24 | var once sync.Once 25 | once.Do(func() { 26 | var err error 27 | spm, err = sentencepiece.NewProcessor(strings.NewReader(modelFileData)) 28 | if err != nil { 29 | log.Fatal(err) 30 | } 31 | fmt.Printf("processor loaded, vocab len=%v\n", spm.ModelInfo().VocabularySize) 32 | }) 33 | 34 | js.Global().Set("textToIDs", jsTextToIDs) 35 | js.Global().Set("textToPieces", jsTextToPieces) 36 | 37 | // For the Go code to be usable from JS, the main function has to run forever. 38 | <-make(chan bool) 39 | } 40 | 41 | var jsTextToIDs = js.FuncOf(func(this js.Value, args []js.Value) interface{} { 42 | if len(args) != 1 { 43 | return "expected 1 argument: text to tokenize" 44 | } 45 | txt := args[0].String() 46 | tokens := spm.Encode(txt) 47 | 48 | jsTokens := js.Global().Get("Array").New() 49 | for _, t := range tokens { 50 | jsTokens.Call("push", js.ValueOf(t.ID)) 51 | } 52 | return jsTokens 53 | }) 54 | 55 | var jsTextToPieces = js.FuncOf(func(this js.Value, args []js.Value) interface{} { 56 | if len(args) != 1 { 57 | return "expected 1 argument: text to tokenize" 58 | } 59 | txt := args[0].String() 60 | tokens := spm.Encode(txt) 61 | 62 | jsTokens := js.Global().Get("Array").New() 63 | for _, t := range tokens { 64 | jsTokens.Call("push", js.ValueOf(t.Text)) 65 | } 66 | return jsTokens 67 | }) 68 | -------------------------------------------------------------------------------- /internal/model/gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o pipefail 4 | set -eux 5 | 6 | protoc \ 7 | --go_out=. \ 8 | --go_opt="Msentencepiece_model.proto=;model" sentencepiece_model.proto 9 | 10 | goimports -w . 
11 | 12 | -------------------------------------------------------------------------------- /internal/model/sentencepiece_model.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2024 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | syntax = "proto2"; 16 | 17 | // TODO(taku): Needs to use LITE RUNTIME in OSS release. 18 | option optimize_for = LITE_RUNTIME; 19 | 20 | package sentencepiece; 21 | 22 | // TrainerSpec encodes various parameters for SentencePiece training. 23 | // Next id: 55 24 | message TrainerSpec { 25 | /////////////////////////////////////////////////////////////////// 26 | // General parameters 27 | // 28 | // Input corpus files. 29 | // Trainer accepts the following two formats: 30 | // A) Monolingual: plain text, one sentence per line. 31 | // B) Bilingual: TSV, source sentence <tab> target sentence 32 | // When bilingual data is passed, shared vocabulary model is built. 33 | // Note that the input file must be raw corpus, not a preprocessed corpus. 34 | // Trainer only loads the first `input_sentence_size` sentences specified 35 | // with this parameter. 36 | repeated string input = 1; 37 | 38 | // Input corpus format: 39 | // "text": one-sentence-per-line text format (default) 40 | // "tsv": sentence <tab> freq 41 | optional string input_format = 7; 42 | 43 | // Output model file prefix. 44 | // <model_name>.model and <model_name>.vocab are generated.
45 | optional string model_prefix = 2; 46 | 47 | // Model type. only have UNIGRAM now. 48 | enum ModelType { 49 | UNIGRAM = 1; // Unigram language model with dynamic algorithm 50 | BPE = 2; // Byte Pair Encoding 51 | WORD = 3; // Delimitered by whitespace. 52 | CHAR = 4; // tokenizes into character sequence 53 | } 54 | optional ModelType model_type = 3 [default = UNIGRAM]; 55 | 56 | // Vocabulary size. 8k is the default size. 57 | optional int32 vocab_size = 4 [default = 8000]; 58 | 59 | // List of the languages this model can accept. 60 | // Since the model is language-agnostic, this field is used as a reference. 61 | repeated string accept_language = 5; 62 | 63 | // Size of self-test samples, which are encoded in the model file. 64 | optional int32 self_test_sample_size = 6 [default = 0]; 65 | 66 | // Whether to use DP version of sentencepiece. Use it with TSV input format 67 | // (requires precomputed word tab counts to work). 68 | optional bool enable_differential_privacy = 50 [default = false]; 69 | // Set these parameters if you need DP version of sentencepiece. 70 | // std of noise to add. 71 | optional float differential_privacy_noise_level = 51 [default = 0.0]; 72 | // Clipping threshold to apply after adding noise. All the words with 73 | // frequency less than this value are dropped. 74 | optional uint64 differential_privacy_clipping_threshold = 52 [default = 0]; 75 | 76 | /////////////////////////////////////////////////////////////////// 77 | // Training parameters. 78 | // 79 | // Uses characters which cover the corpus with the ratio of `chars_coverage`. 80 | // This parameter determines the set of basic Alphabet of sentence piece. 81 | // 1.0 - `chars_coverage` characters are treated as UNK. 82 | // See also required_chars field. 83 | optional float character_coverage = 10 [default = 0.9995]; 84 | 85 | // Maximum size of sentences the trainer loads from `input` parameter. 86 | // Trainer simply loads the `input` files in sequence. 
87 | // It is better to shuffle the input corpus randomly. 88 | optional uint64 input_sentence_size = 11 [default = 0]; 89 | optional bool shuffle_input_sentence = 19 [default = true]; 90 | 91 | // Maximum size of sentences to make seed sentence pieces. 92 | // Extended suffix array is constructed to extract frequent 93 | // sub-strings from the corpus. This uses 20N working space, 94 | // where N is the size of corpus. 95 | optional int32 mining_sentence_size = 12 [deprecated = true]; 96 | 97 | // Maximum size of sentences to train sentence pieces. 98 | optional int32 training_sentence_size = 13 [deprecated = true]; 99 | 100 | // The size of seed sentencepieces. 101 | // `seed_sentencepiece_size` must be larger than `vocab_size`. 102 | optional int32 seed_sentencepiece_size = 14 [default = 1000000]; 103 | 104 | // In every EM sub-iterations, keeps top 105 | // `shrinking_factor` * `current sentencepieces size` with respect to 106 | // the loss of the sentence piece. This value should be smaller than 1.0. 107 | optional float shrinking_factor = 15 [default = 0.75]; 108 | 109 | // The maximum sentence length in byte. The sentences with the length 110 | // larger than `max_sentence_length` is simply ignored. 111 | // Longer input tends to bring the following risks: 112 | // * Overflow during EM training (unigram language model only) 113 | // * Performance drop because of O(n log n) cost in BPE. 114 | optional int32 max_sentence_length = 18 [default = 4192]; 115 | 116 | // Number of threads in the training. 117 | optional int32 num_threads = 16 [default = 16]; 118 | 119 | // Number of EM sub iterations. 120 | optional int32 num_sub_iterations = 17 [default = 2]; 121 | 122 | /////////////////////////////////////////////////////////////////// 123 | // SentencePiece parameters which control the shapes of sentence piece. 124 | // 125 | // Maximum length of sentencepiece. 
126 | optional int32 max_sentencepiece_length = 20 [default = 16]; 127 | 128 | // Uses Unicode script to split sentence pieces. 129 | // When `split_by_unicode_script` is true, we do not allow sentence piece to 130 | // include multiple Unicode scripts, e.g. "F1" is not a valid piece. 131 | // Exception: CJ characters (Hiragana/Katakana/Han) are all handled 132 | // as one script type, since a Japanese word can consist of multiple scripts. 133 | // This exception is always applied regardless of the accept-language 134 | // parameter. 135 | optional bool split_by_unicode_script = 21 [default = true]; 136 | 137 | // When `split_by_number` is true, put a boundary between number and 138 | // non-number transition. If we want to treat "F1" as one token, set this flag 139 | // to be false. 140 | optional bool split_by_number = 23 [default = true]; 141 | 142 | // Use a white space to split sentence pieces. 143 | // When `split_by_whitespace` is false, we may have the piece containing 144 | // a white space in the middle. e.g., "in_the". 145 | optional bool split_by_whitespace = 22 [default = true]; 146 | 147 | // Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello => 148 | // hello_. When `treat_whitespace_as_suffix` is true, 149 | // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end 150 | // of sentence. 151 | optional bool treat_whitespace_as_suffix = 24 [default = false]; 152 | 153 | // Allows pieces that only contain whitespaces instead of appearing only as 154 | // prefix or suffix of other pieces. 155 | optional bool allow_whitespace_only_pieces = 26 [default = false]; 156 | 157 | // Split all digits (0-9) into separate pieces. 158 | optional bool split_digits = 25 [default = false]; 159 | 160 | // Defines the pre-tokenization delimiter. 161 | // When specified, no piece crossing this delimiter is included 162 | // in the vocab. Then the delimiter string is virtually ignored 163 | // during the training.
This field allows constraints on the vocabulary 164 | // selection. Note that this field is available on unigram mode. 165 | optional string pretokenization_delimiter = 53 [ default = ""]; 166 | 167 | /////////////////////////////////////////////////////////////////// 168 | // Vocabulary management 169 | // 170 | // Defines control symbols used as an indicator to 171 | // change the behavior of the decoder. <s> and </s> are pre-defined. 172 | // We can use this field to encode various meta information, 173 | // including language indicator in multilingual model. 174 | // These symbols are not visible to users, but visible to 175 | // the decoder. Note that when the input sentence contains control symbols, 176 | // they are not treated as one token, but segmented into normal pieces. 177 | // Control symbols must be inserted independently from the segmentation. 178 | repeated string control_symbols = 30; 179 | 180 | // Defines user defined symbols. 181 | // These symbols are added with extremely high score 182 | // so they are always treated as one unique symbol in any context. 183 | // Typical usage of user_defined_symbols is placeholder for named entities. 184 | repeated string user_defined_symbols = 31; 185 | 186 | // Defines required characters. Each UTF8 character in this string is included 187 | // in the character set regardless of character_coverage value. Unlike 188 | // user_defined_symbols, these characters have scores based on the frequency 189 | // on input sentences, and the model can form subwords using characters 190 | // in this field. 191 | optional string required_chars = 36; 192 | 193 | // Decomposes unknown pieces into UTF-8 bytes. 194 | optional bool byte_fallback = 35 [default = false]; 195 | 196 | // When creating the vocabulary file, defines whether or not to additionally 197 | // output the score for each piece. 198 | optional bool vocabulary_output_piece_score = 32 [default = true]; 199 | 200 | // `vocab_size` is treated as hard limit.
Crash if 201 | // the model can not produce the vocab of size `vocab_size`, 202 | // When `hard_vocab_limit` is false, vocab_size is treated 203 | // as soft limit. Note that when model_type=char, 204 | // always assumes hard_vocab_limit = false. 205 | optional bool hard_vocab_limit = 33 [default = true]; 206 | 207 | // use all symbols for vocab extraction. This flag is valid 208 | // if model type is either CHAR or WORD 209 | optional bool use_all_vocab = 34 [default = false]; 210 | 211 | /////////////////////////////////////////////////////////////////// 212 | // Reserved special meta tokens. 213 | // * -1 is not used. 214 | // * unk_id must not be -1. 215 | // Ids must start with 0 and be contiguous. 216 | optional int32 unk_id = 40 [default = 0]; // <unk> 217 | optional int32 bos_id = 41 [default = 1]; // <s> 218 | optional int32 eos_id = 42 [default = 2]; // </s> 219 | optional int32 pad_id = 43 [default = -1]; // <pad> (padding) 220 | optional string unk_piece = 45 [default = "<unk>"]; 221 | optional string bos_piece = 46 [default = "<s>"]; 222 | optional string eos_piece = 47 [default = "</s>"]; 223 | optional string pad_piece = 48 [default = "<pad>"]; 224 | 225 | // Encodes <unk> into U+2047 (DOUBLE QUESTION MARK), 226 | // since this character can be useful both for user and 227 | // developer. We can easily figure out that <unk> is emitted. 228 | optional string unk_surface = 44 [default = " \xE2\x81\x87 "]; 229 | 230 | // Increase bit depth to allow unigram model training on large 231 | // (>10M sentences) corpora. A Side-effect of enabling this flag 232 | // is increased memory usage. 233 | optional bool train_extremely_large_corpus = 49 [default = false]; 234 | 235 | // Path to a seed sentencepieces file, with one tab-separated 236 | // seed sentencepiece <tab> frequency per line. 237 | optional string seed_sentencepieces_file = 54 [default = ""]; 238 | 239 | // Customized extensions: the range of field numbers 240 | // are open to third-party extensions.
241 | extensions 200 to max; 242 | } 243 | 244 | // NormalizerSpec encodes various parameters for string normalization. 245 | message NormalizerSpec { 246 | // name of normalization rule. 247 | optional string name = 1; 248 | 249 | // Pre-compiled normalization rule created by 250 | // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method. 251 | // Usually this field is set by Builder::GetNormalizerSpec() method. 252 | optional bytes precompiled_charsmap = 2; 253 | 254 | // Adds dummy whitespace at the beginning of text in order to 255 | // treat "world" in "world" and "hello world" in the same way. 256 | optional bool add_dummy_prefix = 3 [default = true]; 257 | 258 | // Removes leading, trailing, and duplicate internal whitespace. 259 | optional bool remove_extra_whitespaces = 4 [default = true]; 260 | 261 | // Replaces whitespace with meta symbol. 262 | // This field must be true to train sentence piece model. 263 | optional bool escape_whitespaces = 5 [default = true]; 264 | 265 | // Custom normalization rule file in TSV format. 266 | // https://github.com/google/sentencepiece/blob/master/doc/normalization.md 267 | // This field is only used in SentencePieceTrainer::Train() method, which 268 | // compiles the rule into the binary rule stored in `precompiled_charsmap`. 269 | optional string normalization_rule_tsv = 6; 270 | 271 | // Customized extensions: the range of field numbers 272 | // are open to third-party extensions. 273 | extensions 200 to max; 274 | } 275 | 276 | // Proto to store samples for self-testing. 277 | message SelfTestData { 278 | message Sample { 279 | optional string input = 1; 280 | optional string expected = 2; 281 | } 282 | repeated Sample samples = 1; 283 | 284 | // Customized extensions: the range of field numbers 285 | // are open to third-party extensions. 286 | extensions 200 to max; 287 | } 288 | 289 | // ModelProto stores model parameters. 290 | // SentencePieceProcessor is supposed to be self-contained.
291 | // All settings/parameters which may change the behavior must be encoded 292 | // in ModelProto. 293 | message ModelProto { 294 | message SentencePiece { 295 | enum Type { 296 | NORMAL = 1; // normal symbol 297 | UNKNOWN = 2; // unknown symbol. only <unk> for now. 298 | CONTROL = 3; // control symbols. </s>, <s>, <2ja> etc. 299 | USER_DEFINED = 4; // user defined symbols. 300 | // Typical usage of USER_DEFINED symbol 301 | // is placeholder. 302 | BYTE = 6; // byte symbols. Used when `byte_fallback` is true. 303 | UNUSED = 5; // this piece is not used. 304 | } 305 | optional string piece = 1; // piece must not be empty. 306 | optional float score = 2; 307 | optional Type type = 3 [default = NORMAL]; 308 | 309 | // Customized extensions: the range of field numbers 310 | // are open to third-party extensions. 311 | extensions 200 to max; 312 | } 313 | 314 | // Sentence pieces with scores. 315 | repeated SentencePiece pieces = 1; 316 | 317 | // Spec used to generate this model file. 318 | optional TrainerSpec trainer_spec = 2; 319 | 320 | // Spec for text normalization. 321 | optional NormalizerSpec normalizer_spec = 3; 322 | 323 | // Stores sample input and its expected segmentation to verify the model. 324 | optional SelfTestData self_test_data = 4; 325 | 326 | // Spec for text de-normalization. 327 | optional NormalizerSpec denormalizer_spec = 5; 328 | 329 | // Customized extensions: the range of field numbers 330 | // are open to third-party extensions. 331 | extensions 200 to max; 332 | } 333 | -------------------------------------------------------------------------------- /internal/prefixmatcher/prefixmatcher.go: -------------------------------------------------------------------------------- 1 | package prefixmatcher 2 | 3 | import ( 4 | "unicode/utf8" 5 | ) 6 | 7 | // PrefixMatcher helps find longest prefixes. See [PrefixMatcher.FindPrefixLen].
8 | type PrefixMatcher struct { 9 | root *trieNode 10 | } 11 | 12 | type trieNode struct { 13 | children map[rune]*trieNode 14 | final bool 15 | } 16 | 17 | // NewFromSet creates a new [PrefixMatcher] from a set of strings that represent 18 | // the vocabulary. 19 | func NewFromSet(vocab map[string]bool) *PrefixMatcher { 20 | pm := &PrefixMatcher{root: newNode()} 21 | for word := range vocab { 22 | pm.add(word) 23 | } 24 | return pm 25 | } 26 | 27 | // FindPrefixLen finds the longest prefix of text that matches a vocabulary 28 | // word, and returns its length in bytes. If 0 is returned, no prefix was found. 29 | func (pm *PrefixMatcher) FindPrefixLen(text string) int { 30 | node := pm.root 31 | maxLen := 0 32 | 33 | for i, r := range text { 34 | child := node.children[r] 35 | if child == nil { 36 | // r not found in this node, so we're done. 37 | return maxLen 38 | } 39 | if child.final { 40 | maxLen = i + utf8.RuneLen(r) 41 | } 42 | node = child 43 | } 44 | 45 | return maxLen 46 | } 47 | 48 | func (pm *PrefixMatcher) add(word string) { 49 | node := pm.root 50 | 51 | for _, r := range word { 52 | child := node.children[r] 53 | if child == nil { 54 | child = newNode() 55 | node.children[r] = child 56 | } 57 | node = child 58 | } 59 | 60 | node.final = true 61 | } 62 | 63 | func newNode() *trieNode { 64 | return &trieNode{ 65 | children: make(map[rune]*trieNode), 66 | final: false, 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /internal/prefixmatcher/prefixmatcher_test.go: -------------------------------------------------------------------------------- 1 | package prefixmatcher 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func dumpNode(n *trieNode, prefix string) string { 9 | var s string 10 | if n.final { 11 | s = fmt.Sprintf("%sfinal\n", prefix) 12 | } 13 | for r, c := range n.children { 14 | s += fmt.Sprintf("%s%q ->\n%s", prefix, r, dumpNode(c, prefix+" ")) 15 | } 16 | return s 17 | } 18 | 19 | func TestSmallVocab(t
*testing.T) { 20 | vocab := map[string]bool{ 21 | "ham": true, 22 | "yefet": true, 23 | "hamat": true, 24 | "hamela": true, 25 | "世界": true, 26 | 27 | "▁▁": true, 28 | "▁▁▁": true, 29 | "▁▁▁▁": true, 30 | "▁▁▁▁▁": true, 31 | "▁▁▁▁▁▁": true, 32 | } 33 | pm := NewFromSet(vocab) 34 | 35 | var tests = []struct { 36 | text string 37 | wantLen int 38 | }{ 39 | {"zyx", 0}, 40 | {"ham", 3}, 41 | {"hama", 3}, 42 | {"zham", 0}, 43 | {"hame", 3}, 44 | {"hamy", 3}, 45 | {"hamat", 5}, 46 | {"hamatar", 5}, 47 | {"hamela", 6}, 48 | {"hamelar", 6}, 49 | {"y", 0}, 50 | {"ye", 0}, 51 | {"yefet", 5}, 52 | {"yefeton", 5}, 53 | {"世界", 6}, 54 | {"世", 0}, 55 | {"世p", 0}, 56 | {"世界foo", 6}, 57 | {"▁", 0}, 58 | {"▁▁", 6}, 59 | {"▁▁▁", 9}, 60 | {"▁▁▁▁", 12}, 61 | {"▁▁▁▁▁", 15}, 62 | {"▁▁▁▁▁▁", 18}, 63 | {"▁▁▁▁▁▁▁", 18}, 64 | {"▁▁▁▁▁▁p", 18}, 65 | } 66 | 67 | for _, tt := range tests { 68 | t.Run(tt.text, func(t *testing.T) { 69 | gotLen := pm.FindPrefixLen(tt.text) 70 | if gotLen != tt.wantLen { 71 | t.Errorf("got %v, want %v", gotLen, tt.wantLen) 72 | } 73 | }) 74 | } 75 | } 76 | 77 | func TestSingleAndDoubleLetter(t *testing.T) { 78 | vocab := make(map[string]bool) 79 | 80 | for r1 := 'a'; r1 <= 'z'; r1++ { 81 | vocab[string(r1)] = true 82 | 83 | for r2 := 'a'; r2 <= 'z'; r2++ { 84 | vocab[string(r1)+string(r2)] = true 85 | } 86 | } 87 | 88 | pm := NewFromSet(vocab) 89 | 90 | assertLen := func(text string, wantLen int) { 91 | t.Helper() 92 | gotLen := pm.FindPrefixLen(text) 93 | if gotLen != wantLen { 94 | t.Errorf("got %v, want %v", gotLen, wantLen) 95 | } 96 | } 97 | 98 | for r1 := 'a'; r1 <= 'z'; r1++ { 99 | assertLen(string(r1), 1) 100 | for r2 := 'a'; r2 <= 'z'; r2++ { 101 | assertLen(string(r1)+string(r2), 2) 102 | for r3 := 'a'; r3 <= 'z'; r3++ { 103 | assertLen(string(r1)+string(r2)+string(r3), 2) 104 | } 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /internal/priorityqueue/priorityqueue.go: 
// PriorityQueue is a generic priority queue with a configurable comparison
// function. The element for which cmp reports the highest priority is the one
// returned by PopMax.
type PriorityQueue[T any] struct {
	cmp func(a, b T) int

	// items holds the queue's items as a binary heap.
	// items[0] is a dummy element that's not used. If the queue has N elements,
	// they are stored at indices 1...N (N == len(items)-1)
	// For an element at index i, its parent is at index i/2, and its children
	// are at indices 2i and 2i+1. The root of the heap is at index 1.
	items []T
}

// New creates a new PriorityQueue, configured with a function that
// compares the priorities of two items a and b; it should return a number > 0
// if the priority of a is higher, 0 if the priorities are equal, and a
// number < 0 otherwise.
// sizeHint sets the initial capacity of the queue; -1 means to use the default.
func New[T any](sizeHint int, cmp func(a, b T) int) *PriorityQueue[T] {
	// One extra slot is reserved for the unused dummy element at index 0.
	return &PriorityQueue[T]{cmp: cmp, items: make([]T, 1, max(1, sizeHint+1))}
}

// Len returns the length (number of items) of the priority queue.
func (pq *PriorityQueue[T]) Len() int {
	return len(pq.items) - 1
}

// Insert inserts a new element into the priority queue.
func (pq *PriorityQueue[T]) Insert(elem T) {
	pq.items = append(pq.items, elem)
	pq.siftup(len(pq.items) - 1)
}

// PopMax returns the element with the maximal priority in the queue, and
// removes it from the queue. Warning: to maintain a clean API, PopMax panics
// if the queue is empty. Make sure to check Len() first.
func (pq *PriorityQueue[T]) PopMax() T {
	if len(pq.items) < 2 {
		panic("popping from empty priority queue")
	}
	last := len(pq.items) - 1
	maxItem := pq.items[1]
	pq.items[1] = pq.items[last]

	// Zero the vacated slot before shrinking so the backing array does not
	// keep a stale copy of the element alive (RemoveFunc clears its tail for
	// the same reason).
	var zero T
	pq.items[last] = zero
	pq.items = pq.items[:last]

	pq.siftdown(1)
	return maxItem
}

// RemoveFunc removes all elements for which rm returns true.
func (pq *PriorityQueue[T]) RemoveFunc(rm func(T) bool) {
	// This is effectively slices.DeleteFunc, but inlined because we start from index 1.
	i := 1
	for ; i < len(pq.items); i++ {
		if rm(pq.items[i]) {
			break
		}
	}
	if i == len(pq.items) {
		return // nothing to remove
	}
	// i is the first removed slot; compact the surviving elements into it.
	for j := i + 1; j < len(pq.items); j++ {
		if v := pq.items[j]; !rm(v) {
			pq.items[i] = v
			i++
		}
	}
	// Clear the tail.
	clear(pq.items[i:])
	pq.items = pq.items[:i]
	pq.rebuildHeap()
}

// rebuildHeap rebuilds the entire heap from scratch.
func (pq *PriorityQueue[T]) rebuildHeap() {
	// Indices above len/2 have no children, so sifting starts at len/2.
	for i := len(pq.items) / 2; i >= 1; i-- {
		pq.siftdown(i)
	}
}

// siftup moves the element at index n towards the root until its parent has
// a priority that is at least as high.
func (pq *PriorityQueue[T]) siftup(n int) {
	i := n
	for {
		if i == 1 {
			// Reached root, we're done.
			return
		}
		// p is the index of i's parent; if the parent's priority is not lower
		// than i's, we're done.
		p := i / 2
		if pq.cmp(pq.items[p], pq.items[i]) >= 0 {
			return
		}
		pq.items[i], pq.items[p] = pq.items[p], pq.items[i]
		i = p
	}
}

// siftdown moves the element at index i towards the leaves until neither of
// its children has a higher priority.
func (pq *PriorityQueue[T]) siftdown(i int) {
	for {
		c := 2 * i
		if c >= len(pq.items) {
			return
		}
		// c is not out of bounds, so it's the index of the left child of i

		// Figure out the child index with the maximal priority
		maxChild := c
		if c+1 < len(pq.items) {
			// c+1 is not out of bounds, so it's the index of the right child of i
			if pq.cmp(pq.items[c+1], pq.items[c]) > 0 {
				maxChild = c + 1
			}
		}
		if pq.cmp(pq.items[i], pq.items[maxChild]) >= 0 {
			// i's priority is not lower than either child's, so we're done.
			return
		}

		pq.items[i], pq.items[maxChild] = pq.items[maxChild], pq.items[i]
		i = maxChild
	}
}
39 | pq.Insert("xyz") 40 | assertPopAndSize("xyz", 0) 41 | pq.Insert("foobarbaz") 42 | assertPopAndSize("foobarbaz", 0) 43 | pq.Insert("1") 44 | assertPopAndSize("1", 0) 45 | 46 | // Inserts after popping some 47 | pq.Insert("mercury") 48 | pq.Insert("venus") 49 | assertPopAndSize("mercury", 1) 50 | pq.Insert("jupiter") 51 | assertPopAndSize("jupiter", 1) 52 | pq.Insert("moon") 53 | assertPopAndSize("venus", 1) 54 | assertPopAndSize("moon", 0) 55 | 56 | // Insert two, pop 1, a few times 57 | pq.Insert("mercury") 58 | pq.Insert("venus") 59 | assertPopAndSize("mercury", 1) 60 | pq.Insert("mars") 61 | pq.Insert("jupiter") 62 | assertPopAndSize("jupiter", 2) // contains: venus, mars 63 | pq.Insert("ganimede") 64 | pq.Insert("europa") 65 | assertPopAndSize("ganimede", 3) // contains: venus, mars, europa 66 | pq.Insert("enceladus") 67 | pq.Insert("io") 68 | assertPopAndSize("enceladus", 4) 69 | assertPopAndSize("europa", 3) 70 | assertPopAndSize("venus", 2) 71 | assertPopAndSize("mars", 1) 72 | assertPopAndSize("io", 0) 73 | 74 | // Insert these words in random orders; they should still all pop in the 75 | // expected order by length. 
76 | words := []string{"z", "xy", "uvw", "post", "dworb"} 77 | for i := 0; i < 100; i++ { 78 | w := slices.Clone(words) 79 | rand.Shuffle(len(w), func(i, j int) { 80 | w[i], w[j] = w[j], w[i] 81 | }) 82 | 83 | for _, word := range w { 84 | pq.Insert(word) 85 | } 86 | 87 | assertPopAndSize("dworb", 4) 88 | assertPopAndSize("post", 3) 89 | assertPopAndSize("uvw", 2) 90 | assertPopAndSize("xy", 1) 91 | assertPopAndSize("z", 0) 92 | } 93 | } 94 | 95 | func TestBasicQueueWithCustomType(t *testing.T) { 96 | type Item struct { 97 | Name string 98 | Cost int 99 | } 100 | 101 | itemCostCmp := func(a, b Item) int { 102 | return a.Cost - b.Cost 103 | } 104 | 105 | pq := New(-1, itemCostCmp) 106 | 107 | assertPop := func(s string) { 108 | t.Helper() 109 | got := pq.PopMax() 110 | if got.Name != s { 111 | t.Errorf("got %v, want %v", got.Name, s) 112 | } 113 | } 114 | 115 | // Push in decreasing cost order 116 | pq.Insert(Item{"joe", 20}) 117 | pq.Insert(Item{"maxm", 3}) 118 | pq.Insert(Item{"jabbar", 1}) 119 | assertPop("joe") 120 | assertPop("maxm") 121 | assertPop("jabbar") 122 | 123 | // Push in increasing cost order 124 | pq.Insert(Item{"x", 1}) 125 | pq.Insert(Item{"y", 29}) 126 | pq.Insert(Item{"z", 88}) 127 | assertPop("z") 128 | assertPop("y") 129 | assertPop("x") 130 | } 131 | -------------------------------------------------------------------------------- /normalize.go: -------------------------------------------------------------------------------- 1 | package sentencepiece 2 | 3 | import "strings" 4 | 5 | // normalize performs unicode normalization. 6 | // 7 | // SentencePiece has a feature to perform configurable unicode normalization on 8 | // the input text and has some options for adding dummy whitespace prefixes or 9 | // trimming whitespace. However, the model we're working with has a very simple 10 | // normalizer that does none of this. These options can be added in the future 11 | // if needed. 
func normalize(text string) string {
	// The only normalization step performed is swapping spaces for the
	// model's whitespace separator.
	normalized := replaceSpacesBySeparator(text)
	return normalized
}

// whitespaceSeparator is the symbol that stands in for a space character.
const whitespaceSeparator = "▁"

// replaceSpacesBySeparator returns text with every space replaced by
// whitespaceSeparator.
func replaceSpacesBySeparator(text string) string {
	const space = " "
	return strings.Replace(text, space, whitespaceSeparator, -1)
}

// replaceSeparatorsBySpace is the inverse of replaceSpacesBySeparator: it
// returns text with every whitespaceSeparator replaced by a space.
func replaceSeparatorsBySpace(text string) string {
	const space = " "
	return strings.Replace(text, whitespaceSeparator, space, -1)
}
38 | userDefinedMatcher *prefixmatcher.PrefixMatcher 39 | 40 | // byte2Token is a cache of byte values and the tokens they represent 41 | byte2Token map[byte]Token 42 | 43 | // idToByte maps IDs to byte values they represent 44 | idToByte map[int]byte 45 | 46 | // maxPieceLength is the maximum length of a piece in the model. 47 | // This is used to preallocate a buffer for merging symbols. 48 | maxPieceLength int 49 | } 50 | 51 | // NewProcessorFromPath creates a new Processor from a file path to the protobuf 52 | // data. 53 | func NewProcessorFromPath(protoFile string) (*Processor, error) { 54 | f, err := os.Open(protoFile) 55 | if err != nil { 56 | return nil, fmt.Errorf("unable to read %q: %v", protoFile, err) 57 | } 58 | defer f.Close() 59 | return NewProcessor(f) 60 | } 61 | 62 | // NewProcessor creates a new Processor from a reader with the protobuf data. 63 | func NewProcessor(protoReader io.Reader) (*Processor, error) { 64 | b, err := io.ReadAll(protoReader) 65 | if err != nil { 66 | return nil, fmt.Errorf("unable to read protobuf data: %v", err) 67 | } 68 | 69 | var mp model.ModelProto 70 | err = proto.Unmarshal(b, &mp) 71 | if err != nil { 72 | return nil, fmt.Errorf("unable to unmarshal protobuf: %v", err) 73 | } 74 | 75 | tspec := mp.GetTrainerSpec() 76 | if tspec.GetModelType() != model.TrainerSpec_BPE { 77 | return nil, fmt.Errorf("model type %s not supported", tspec.GetModelType()) 78 | } 79 | 80 | nspec := mp.GetNormalizerSpec() 81 | if *nspec.AddDummyPrefix || *nspec.RemoveExtraWhitespaces { 82 | return nil, fmt.Errorf("normalizer spec options not supported: %s", nspec) 83 | } 84 | 85 | userDefined := make(map[string]bool) 86 | pieces := make(map[string]int) 87 | reserved := make(map[string]int) 88 | byte2Token := make(map[byte]Token) 89 | idToByte := make(map[int]byte) 90 | unkID := -1 91 | maxPieceLength := 0 92 | 93 | for i, piece := range mp.GetPieces() { 94 | isNormalPiece := (piece.GetType() == model.ModelProto_SentencePiece_NORMAL || 95 | 
piece.GetType() == model.ModelProto_SentencePiece_USER_DEFINED || 96 | piece.GetType() == model.ModelProto_SentencePiece_UNUSED) 97 | 98 | if isNormalPiece { 99 | pieces[piece.GetPiece()] = i 100 | maxPieceLength = max(maxPieceLength, len(piece.GetPiece())) 101 | } else { 102 | reserved[piece.GetPiece()] = i 103 | } 104 | 105 | if piece.GetType() == model.ModelProto_SentencePiece_USER_DEFINED { 106 | userDefined[piece.GetPiece()] = true 107 | } else if piece.GetType() == model.ModelProto_SentencePiece_UNKNOWN { 108 | if unkID > 0 { 109 | return nil, fmt.Errorf("unk redefined") 110 | } 111 | unkID = i 112 | } else if piece.GetType() == model.ModelProto_SentencePiece_BYTE { 113 | if !tspec.GetByteFallback() { 114 | return nil, fmt.Errorf("byte piece %q is found although `byte_fallback=false`", piece.GetPiece()) 115 | } 116 | bv := convertHexValue(piece.GetPiece()) 117 | if bv >= 0 && bv < 256 { 118 | byte2Token[byte(bv)] = Token{ID: i, Text: piece.GetPiece()} 119 | idToByte[i] = byte(bv) 120 | } 121 | } 122 | } 123 | 124 | if unkID < 0 { 125 | return nil, fmt.Errorf("unk symbol is not defined") 126 | } 127 | 128 | // In case byte_fallback is specified, make sure that all 256 possible byte 129 | // values were found. 130 | if tspec.GetByteFallback() { 131 | for i := 0; i < 256; i++ { 132 | if _, found := byte2Token[byte(i)]; !found { 133 | return nil, fmt.Errorf("byte value 0x%02X not found", i) 134 | } 135 | } 136 | } 137 | 138 | return &Processor{ 139 | model: &mp, 140 | userDefinedMatcher: prefixmatcher.NewFromSet(userDefined), 141 | byte2Token: byte2Token, 142 | idToByte: idToByte, 143 | unknownID: unkID, 144 | pieces: pieces, 145 | reserved: reserved, 146 | maxPieceLength: maxPieceLength, 147 | }, nil 148 | } 149 | 150 | // Encode tokenizes the input text and returns a list of Tokens. 
151 | func (proc *Processor) Encode(text string) []Token { 152 | text = normalize(text) 153 | 154 | // We begin by having each symbol a single Unicode character (or a 155 | // user-defined string), and will iteratively merge them into larger and 156 | // larger symbols until we have the final list of tokens. 157 | // Since this list of symbols changes a lot, we represent it as a 158 | // doubly-linked list in the symList slice. Each element in this slice has 159 | // prev/next links to the next "live" symbol in the list; noMerge means this 160 | // is a user-defined symbol we're not allowed to merge with neighbors. 161 | // After the algorithm is finished, many elements in symList will be "dead" 162 | // (unreachable by next/prev links from the first element). 163 | // This representation is inspired by the implementation of bpe::Model 164 | // in the SentencePiece C++ library. 165 | 166 | type symListElem struct { 167 | prev, next int 168 | noMerge bool 169 | symbol string 170 | } 171 | symList := make([]symListElem, 0, len(text)) 172 | 173 | for { 174 | // Match the next symbol in text 175 | slen, found := proc.symbolMatch(text) 176 | 177 | // Append a list element for this symbol; note that this element will be 178 | // at index len(symList), so prev/next are set up accordingly. 179 | sym := symListElem{ 180 | noMerge: found, 181 | symbol: text[:slen], 182 | prev: len(symList) - 1, 183 | next: len(symList) + 1, 184 | } 185 | symList = append(symList, sym) 186 | 187 | // Advance the text slice to the next symbol; if no more text, we're done. 
188 | text = text[slen:] 189 | if len(text) == 0 { 190 | break 191 | } 192 | } 193 | 194 | if len(symList) == 0 { 195 | return nil 196 | } 197 | symList[len(symList)-1].next = -1 198 | nTokens := len(symList) 199 | 200 | debugShowSymList := func(prefix string) { 201 | if debugEncode { 202 | fmt.Println(prefix) 203 | for i, elem := range symList { 204 | fmt.Printf("[%3d]: [prev: %3v, next: %3d, noMerge: %v] %q\n", i, elem.prev, elem.next, elem.noMerge, elem.symbol) 205 | } 206 | } 207 | } 208 | debugShowSymList("initial") 209 | 210 | // To avoid repeating work, we manage a priority queue of "merge candidates". 211 | // Each candidate has pointers to the symList list for the left and right 212 | // symbol in the pair, as well as the combined symbol's score. 213 | // The priority of merging is determined by this score, with position as 214 | // the tie-breaker (earlier pairs are preferred). 215 | type mergeCandidate struct { 216 | left, right int 217 | length int 218 | score float32 219 | } 220 | 221 | mergeQueue := priorityqueue.New(len(symList), func(a, b mergeCandidate) int { 222 | if a.score > b.score || (a.score == b.score && a.left < b.left) { 223 | return 1 224 | } 225 | return -1 226 | }) 227 | 228 | // findMerged looks for x+y in the vocabulary, and returns the 229 | // merged piece, its ID and true if found. buf is a reusable buffer used to 230 | // merge two strings together without allocations. 231 | buf := make([]byte, proc.maxPieceLength) 232 | findMerged := func(x, y symListElem) (string, int, bool) { 233 | buf = buf[:len(x.symbol)+len(y.symbol)] 234 | copy(buf, x.symbol) 235 | copy(buf[len(x.symbol):], y.symbol) 236 | if id, found := proc.pieces[string(buf)]; found { 237 | return proc.model.GetPieces()[id].GetPiece(), id, true 238 | } 239 | return "", 0, false 240 | } 241 | 242 | // suggestNewMergePair is called to potentially add a new mergeCandidate to 243 | // mergeQueue. 
The candidate is added if it's valid, both its parts are 244 | // allowed to merge, and it appears in the vocabulary. 245 | suggestNewMergePair := func(left, right int) { 246 | if left == -1 || right == -1 || symList[left].noMerge || symList[right].noMerge { 247 | return 248 | } 249 | 250 | if mergedSymbol, id, ok := findMerged(symList[left], symList[right]); ok { 251 | mergeQueue.Insert(mergeCandidate{ 252 | left: left, 253 | right: right, 254 | length: len(mergedSymbol), 255 | score: proc.model.GetPieces()[id].GetScore(), 256 | }) 257 | } 258 | } 259 | 260 | // Seed the merge queue with all pairs of symbols from symList 261 | for i := 1; i < len(symList); i++ { 262 | suggestNewMergePair(i-1, i) 263 | } 264 | 265 | // candidateIsDead indicates that a candidate is out of date: one of its 266 | // parts was already merged with another symbol, so we don't want to consider 267 | // it any more. 268 | candidateIsDead := func(candidate mergeCandidate) bool { 269 | leftSymbol := symList[candidate.left].symbol 270 | rightSymbol := symList[candidate.right].symbol 271 | return leftSymbol == "" || rightSymbol == "" || len(leftSymbol)+len(rightSymbol) != candidate.length 272 | } 273 | 274 | // Main loop 275 | mergeQueueDead := 0 276 | for mergeQueue.Len() > 0 { 277 | candidate := mergeQueue.PopMax() 278 | leftSymbol := symList[candidate.left] 279 | rightSymbol := symList[candidate.right] 280 | 281 | if candidateIsDead(candidate) { 282 | mergeQueueDead-- 283 | continue 284 | } 285 | 286 | // If there are lots more dead merge candidates than live ones, remove the 287 | // dead. This is a relatively expensive operation but it's performed rarely, 288 | // and it makes the priority queue smaller - making all subsequent 289 | // operations faster. 290 | // The factor of 3 was determined empirically. 291 | if mergeQueueDead*3 > mergeQueue.Len() { 292 | mergeQueue.RemoveFunc(candidateIsDead) 293 | mergeQueueDead = 0 294 | } 295 | 296 | // Do the merge: 297 | // 1. 
Merge the concatenation of leftSymbol and rightSymbol into leftSymbol 298 | mergedSymbol, _, ok := findMerged(leftSymbol, rightSymbol) 299 | if !ok { 300 | panic("failed to merge symbols") 301 | } 302 | symList[candidate.left].symbol = mergedSymbol 303 | nTokens-- 304 | 305 | // 2. Update prev/next pointers 306 | symList[candidate.left].next = rightSymbol.next 307 | if rightSymbol.next >= 0 { 308 | symList[rightSymbol.next].prev = candidate.left 309 | } 310 | 311 | // 3. Mark the right element in the pair as outdated (it's been merged 312 | // into the left one). 313 | symList[candidate.right].symbol = "" 314 | mergeQueueDead++ 315 | 316 | // 4. Add merge suggestions for the newly merged symbol with its neighbors 317 | suggestNewMergePair(leftSymbol.prev, candidate.left) 318 | suggestNewMergePair(candidate.left, rightSymbol.next) 319 | } 320 | 321 | // Collect the final list of tokens from the remaining elements of symList. 322 | tokens := make([]Token, 0, nTokens) 323 | for i := 0; i >= 0; i = symList[i].next { 324 | symbol := symList[i].symbol 325 | id := proc.symbolToID(symbol) 326 | 327 | if id == proc.unknownID && proc.model.GetTrainerSpec().GetByteFallback() { 328 | // Decompose this symbol into bytes, and report each byte as a separate 329 | // token. 330 | for i := 0; i < len(symbol); i++ { 331 | tokens = append(tokens, proc.byte2Token[symbol[i]]) 332 | } 333 | } else { 334 | tokens = append(tokens, Token{ID: id, Text: symbol}) 335 | } 336 | } 337 | 338 | return tokens 339 | } 340 | 341 | // symbolMatch finds the length of the first symbol in text. A symbol is either 342 | // a user-defined symbol from the proto or a single rune. The second return 343 | // value is true iff a user-defined symbol was matched. 
// convertHexValue converts strings of the form "<0xXY>" to the (unsigned)
// integer value of the hexadecimal number XY. -1 is returned for bad input:
// a missing "<0x" prefix or ">" suffix, no digits, non-hexadecimal
// characters, a sign, or a value that does not fit in 31 bits.
func convertHexValue(bv string) int {
	const (
		prefix = "<0x"
		suffix = ">"
	)
	// Require both delimiters. Merely trimming them when present would let
	// undelimited input such as "40" through.
	if !strings.HasPrefix(bv, prefix) || !strings.HasSuffix(bv, suffix) {
		return -1
	}
	digits := bv[len(prefix) : len(bv)-len(suffix)]

	// ParseUint rejects a leading sign, so the result is never negative; the
	// 31-bit limit keeps the conversion to int lossless where int is 32 bits.
	n, err := strconv.ParseUint(digits, 16, 31)
	if err != nil {
		return -1
	}
	return int(n)
}
399 | if numBytes > 0 { 400 | buf := make([]byte, 0, numBytes) 401 | for bi := i; bi < nextNonByte; bi++ { 402 | buf = append(buf, proc.idToByte[ids[bi]]) 403 | } 404 | 405 | for len(buf) > 0 { 406 | // DecodeRune returns utf8.RuneError ('\uFFFD') for bad UTF8 encodings, 407 | // and this is exactly what SentencePiece is supposed to emit for them. 408 | // So we don't do any special handling for UTF8 decode errors here. 409 | r, size := utf8.DecodeRune(buf) 410 | sb.WriteRune(r) 411 | buf = buf[size:] 412 | } 413 | } 414 | 415 | if nextNonByte >= len(ids) { 416 | break 417 | } 418 | // Here nextNonByte is the index of an ID that's not a single byte. 419 | id := ids[nextNonByte] 420 | if proc.isControlID(id) { 421 | // Don't emit anything for control IDs 422 | } else if id == proc.unknownID { 423 | // Special "unk_surface" string for unknown IDs 424 | sb.WriteString(proc.model.GetTrainerSpec().GetUnkSurface()) 425 | } else { 426 | piece := proc.model.GetPieces()[id].GetPiece() 427 | sb.WriteString(replaceSeparatorsBySpace(piece)) 428 | } 429 | i = nextNonByte + 1 430 | } 431 | 432 | return sb.String() 433 | } 434 | 435 | // DecodeTokens is a convenience wrapper around [Decode], accepting a list of 436 | // tokens as returned by [Encode]. It only uses the ID fields of tokens to 437 | // decode the text. 438 | func (proc *Processor) DecodeTokens(tokens []Token) string { 439 | ids := make([]int, len(tokens)) 440 | for i, t := range tokens { 441 | ids[i] = t.ID 442 | } 443 | return proc.Decode(ids) 444 | } 445 | 446 | func (proc *Processor) isByteID(id int) bool { 447 | return proc.model.GetPieces()[id].GetType() == model.ModelProto_SentencePiece_BYTE 448 | } 449 | 450 | func (proc *Processor) isControlID(id int) bool { 451 | return proc.model.GetPieces()[id].GetType() == model.ModelProto_SentencePiece_CONTROL 452 | } 453 | 454 | // ModelInfo stores information about the model proto loaded by the processor. 
455 | type ModelInfo struct { 456 | VocabularySize int 457 | BeginningOfSentenceID int 458 | EndOfSentenceID int 459 | UnknownID int 460 | PadID int 461 | } 462 | 463 | // ModelInfo returns information about the loaded proto model file. 464 | func (proc *Processor) ModelInfo() *ModelInfo { 465 | getControlID := func(symbol string) int { 466 | if id := proc.symbolToID(symbol); proc.isControlID(id) { 467 | return id 468 | } 469 | return -1 470 | } 471 | 472 | return &ModelInfo{ 473 | VocabularySize: len(proc.model.GetPieces()), 474 | BeginningOfSentenceID: getControlID(symbolBOS), 475 | EndOfSentenceID: getControlID(symbolEOS), 476 | PadID: getControlID(symbolPAD), 477 | UnknownID: proc.unknownID, 478 | } 479 | } 480 | -------------------------------------------------------------------------------- /processor_test.go: -------------------------------------------------------------------------------- 1 | package sentencepiece 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "slices" 7 | "testing" 8 | ) 9 | 10 | func createProcessor(t testing.TB) *Processor { 11 | t.Helper() 12 | protoFile := os.Getenv("MODELPATH") 13 | if protoFile == "" { 14 | t.Fatal("Need MODELPATH env var to run tests") 15 | } 16 | 17 | proc, err := NewProcessorFromPath(protoFile) 18 | if err != nil { 19 | t.Error(err) 20 | } 21 | return proc 22 | } 23 | 24 | func TestEncodeIDs(t *testing.T) { 25 | proc := createProcessor(t) 26 | 27 | var tests = []struct { 28 | text string 29 | wantIDs []int 30 | }{ 31 | {"hello world", []int{17534, 2134}}, 32 | {"12345", []int{235274, 235284, 235304, 235310, 235308}}, 33 | {" ", []int{139}}, 34 | {" ", []int{140}}, 35 | {" ", []int{145}}, 36 | {"ҔӌԐڎ", []int{427, 365, 428, 357, 429, 361, 435, 359}}, 37 | {" ", []int{235248, 4, 139, 235322, 8939, 235313}}, 38 | {"
", []int{169, 175, 183, 177}}, 39 | {"one line\nand another line", []int{785, 2017, 108, 639, 2550, 2017}}, 40 | {"Language: English\r\n\r\nCredits: Produced by David Widger\r\n", []int{14357, 235292, 4645, 235316, 108, 235316, 108, 34711, 235292, 99662, 731, 6046, 37303, 1197, 235316, 108}}, 41 | {"Bienvenido a este proyecto", []int{176831, 476, 4004, 25431}}, 42 | {"अस्मिन् परियोजनायां स्वागतम्", []int{236088, 22740, 212361, 18029, 14480, 19900, 146166, 6751, 235563, 56545, 44071, 235550, 26989}}, 43 | {"if allow == true { return x;} else {return x+y;}", []int{648, 2765, 1159, 1382, 612, 2203, 1141, 22505, 1354, 612, 773, 1141, 235340, 235267, 22505}}, 44 | } 45 | 46 | for _, tt := range tests { 47 | t.Run(tt.text, func(t *testing.T) { 48 | got := proc.Encode(tt.text) 49 | 50 | var gotIDs []int 51 | for _, t := range got { 52 | gotIDs = append(gotIDs, t.ID) 53 | } 54 | 55 | if !slices.Equal(gotIDs, tt.wantIDs) { 56 | t.Errorf("got %v\nwant: %v\n", gotIDs, tt.wantIDs) 57 | } 58 | }) 59 | } 60 | } 61 | 62 | func TestProcessorWithText(t *testing.T) { 63 | proc := createProcessor(t) 64 | 65 | var tests = []struct { 66 | text string 67 | wantTokens []Token 68 | }{ 69 | {"hi bye", 70 | []Token{ 71 | {544, "hi"}, 72 | {235248, "▁"}, 73 | {176, ""}, 74 | {44788, "▁bye"}, 75 | }}, 76 | {"hiƻ 🤨there ⇲bob, สวัสดี", 77 | []Token{ 78 | {544, "hi"}, 79 | {415, "<0xC6>"}, 80 | {404, "<0xBB>"}, 81 | {235248, "▁"}, 82 | {176, ""}, 83 | {241847, "🤨"}, 84 | {11048, "there"}, 85 | {235248, "▁"}, 86 | {248372, "⇲"}, 87 | {26242, "bob"}, 88 | {235269, ","}, 89 | {12515, "▁ส"}, 90 | {151622, "วัส"}, 91 | {28890, "ดี"}, 92 | }}, 93 | } 94 | 95 | for _, tt := range tests { 96 | t.Run(tt.text, func(t *testing.T) { 97 | got := proc.Encode(tt.text) 98 | if !slices.Equal(got, tt.wantTokens) { 99 | t.Errorf("got %v\nwant: %v\n", got, tt.wantTokens) 100 | } 101 | }) 102 | } 103 | } 104 | 105 | func TestSymbolMatch(t *testing.T) { 106 | proc := createProcessor(t) 107 | 108 | var tests = 
[]struct { 109 | text string 110 | wantLen int 111 | wantFound bool 112 | }{ 113 | {"", 4, true}, 114 | {"", 3, true}, 115 | {"", 4, true}, 116 | {"", 15, true}, 117 | {"", 64}, 141 | {"<0x00>", 0}, 142 | {"<0x1a>", 26}, 143 | {"<0xF3>", 243}, 144 | 145 | {"0x12>", -1}, 146 | {"", -1}, 147 | {"<012>", -1}, 148 | {"<0xTA>", -1}, 149 | } 150 | 151 | for _, tt := range tests { 152 | t.Run(tt.in, func(t *testing.T) { 153 | gotN := convertHexValue(tt.in) 154 | if gotN != tt.wantN { 155 | t.Errorf("got %v, want %v", gotN, tt.wantN) 156 | } 157 | }) 158 | } 159 | } 160 | 161 | func TestDecoder(t *testing.T) { 162 | proc := createProcessor(t) 163 | 164 | var tests = []struct { 165 | IDs []int 166 | wantText string 167 | }{ 168 | {[]int{17534, 2134}, "hello world"}, 169 | {[]int{427, 365, 428, 357, 29422, 1653, 427, 365, 428, 357}, "Ҕӌnever againҔӌ"}, 170 | {[]int{785, 2017, 108, 639, 2550, 2017}, "one line\nand another line"}, 171 | {[]int{1001, 1002, 1003, 1004}, "buark}) res"}, 172 | {[]int{111001, 111002, 111003, 111004}, " Wichita EducaçãoVocabulary天堂"}, 173 | {[]int{139}, " "}, 174 | {[]int{140}, " "}, 175 | {[]int{145}, " "}, 176 | {[]int{441, 401, 387}, "ส"}, 177 | {[]int{411, 380}, "£"}, 178 | 179 | // control IDs (0, 1, 2) 180 | {[]int{2, 411, 380}, "£"}, 181 | {[]int{1, 2, 411, 380}, "£"}, 182 | {[]int{2, 411, 380, 0, 1, 2, 0}, "£"}, 183 | 184 | // unknown (id=3) 185 | {[]int{3, 411, 380}, " ⁇ £"}, 186 | {[]int{3, 3, 1000, 3}, " ⁇ ⁇ ew ⁇ "}, 187 | 188 | // invalid bytes for UTF-8, produce "invalid unicode" runes 189 | {[]int{349, 349, 349}, "���"}, 190 | {[]int{800, 348, 500, 348}, "sed�it�"}, 191 | } 192 | 193 | for _, tt := range tests { 194 | t.Run(fmt.Sprintf("%v", tt.IDs), func(t *testing.T) { 195 | got := proc.Decode(tt.IDs) 196 | if got != tt.wantText { 197 | t.Errorf("got %q\nwant %q\n", got, tt.wantText) 198 | } 199 | }) 200 | } 201 | } 202 | 203 | func TestDecodeTokens(t *testing.T) { 204 | proc := createProcessor(t) 205 | wantText := "hello world" 206 
| tokens := []Token{ 207 | Token{17534, "xxx"}, 208 | Token{139, "xxx"}, 209 | Token{2134, "xxx"}} 210 | 211 | text := proc.DecodeTokens(tokens) 212 | if text != wantText { 213 | t.Errorf("got %q, want %q", text, wantText) 214 | } 215 | } 216 | 217 | func TestInfo(t *testing.T) { 218 | proc := createProcessor(t) 219 | info := proc.ModelInfo() 220 | 221 | // Assumes we use the known model file 222 | wantVocabSize := 256000 223 | wantBOS := 2 224 | wantEOS := 1 225 | wantPAD := 0 226 | wantUNK := 3 227 | 228 | if info.VocabularySize != wantVocabSize { 229 | t.Errorf("got %v, want %v", info.VocabularySize, wantVocabSize) 230 | } 231 | if info.BeginningOfSentenceID != wantBOS { 232 | t.Errorf("got %v, want %v", info.BeginningOfSentenceID, wantBOS) 233 | } 234 | if info.EndOfSentenceID != wantEOS { 235 | t.Errorf("got %v, want %v", info.EndOfSentenceID, wantEOS) 236 | } 237 | if info.PadID != wantPAD { 238 | t.Errorf("got %v, want %v", info.PadID, wantPAD) 239 | } 240 | if info.UnknownID != wantUNK { 241 | t.Errorf("got %v, want %v", info.UnknownID, wantUNK) 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /system_test.go: -------------------------------------------------------------------------------- 1 | package sentencepiece 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "fmt" 7 | "io/ioutil" 8 | "log" 9 | "os" 10 | "os/exec" 11 | "path/filepath" 12 | "slices" 13 | "strconv" 14 | "testing" 15 | ) 16 | 17 | // "System" test for comparing our Processor with the canonical sentencepiece 18 | // Python package (officially distributed with the original C++ implementation 19 | // of the algorithm). 20 | // It also runs Decode for a round-trip test to ensure we get the original 21 | // text back. 22 | // 23 | // This test will only run if python3 is available and is able to successfully 24 | // load the sentencepiece library. 
Typically this means that 'go test' will 25 | // have to run from an activated Python virtual environment where the library 26 | // was installed. 27 | 28 | func TestVsSentencepiecePython(t *testing.T) { 29 | proc := createProcessor(t) 30 | 31 | if _, err := exec.Command("python3", "-c", "import sentencepiece").Output(); err != nil { 32 | t.Skip("This test only runs when python3 with sentencepiece is available") 33 | } 34 | pyProgramPath := filepath.Join("test", "sp-dump-ids.py") 35 | 36 | paths, err := filepath.Glob(filepath.Join("test", "*.txt")) 37 | if err != nil { 38 | t.Fatal(err) 39 | } 40 | 41 | for _, path := range paths { 42 | _, filename := filepath.Split(path) 43 | testname := filename[:len(filename)-len(filepath.Ext(path))] 44 | 45 | t.Run(testname, func(t *testing.T) { 46 | // Step 1: run the Python program to tokenize path into IDs. 47 | pyOut, err := exec.Command("python3", pyProgramPath, path).Output() 48 | if err != nil { 49 | t.Fatalf("while running %v on %v: %v", pyProgramPath, path, err) 50 | } 51 | 52 | pyIDs := pyOutToIDs(pyOut) 53 | 54 | // Step 2: use our Processor to tokenize path into IDs. 55 | buf, err := ioutil.ReadFile(path) 56 | if err != nil { 57 | log.Fatal(err) 58 | } 59 | text := string(buf) 60 | var goIDs []int 61 | goTokens := proc.Encode(text) 62 | for _, t := range goTokens { 63 | goIDs = append(goIDs, t.ID) 64 | } 65 | 66 | // Step 3: compare the two; dump IDs to temp files for debugging in case 67 | // of a mismatch. 
68 | if !slices.Equal(pyIDs, goIDs) { 69 | tmppy := dumpIDsToTempFile(testname+"-py-", pyIDs) 70 | tmpgo := dumpIDsToTempFile(testname+"-go-", goIDs) 71 | 72 | t.Errorf("IDs mismatch; dumped to %q and %q", tmppy, tmpgo) 73 | } 74 | 75 | // Step 4: round-trip Decode to get original text back 76 | newText := proc.Decode(goIDs) 77 | if text != newText { 78 | t.Errorf("text mismatch after Decode") 79 | } 80 | }) 81 | } 82 | } 83 | 84 | // pyOutToIDs takes the entire stdout output of the Python program and parses 85 | // it into a list of integer IDs. 86 | func pyOutToIDs(pyOut []byte) []int { 87 | var IDs []int 88 | scanner := bufio.NewScanner(bytes.NewReader(pyOut)) 89 | for scanner.Scan() { 90 | i, err := strconv.Atoi(scanner.Text()) 91 | if err != nil { 92 | log.Fatal(err) 93 | } 94 | IDs = append(IDs, i) 95 | } 96 | if err := scanner.Err(); err != nil { 97 | log.Fatal(err) 98 | } 99 | return IDs 100 | } 101 | 102 | // dumpIDsToTempFile dumps the given IDs (one per line) to a temporary file with 103 | // the given prefix, and returns the name of the temporary file. 
104 | func dumpIDsToTempFile(prefix string, IDs []int) string { 105 | tf, err := os.CreateTemp("", prefix) 106 | if err != nil { 107 | log.Fatal(err) 108 | } 109 | defer tf.Close() 110 | 111 | for _, id := range IDs { 112 | fmt.Fprintf(tf, "%d\n", id) 113 | } 114 | return tf.Name() 115 | } 116 | -------------------------------------------------------------------------------- /test/gocode1.txt: -------------------------------------------------------------------------------- 1 | var ( 2 | file_sentencepiece_model_proto_rawDescOnce sync.Once 3 | file_sentencepiece_model_proto_rawDescData = file_sentencepiece_model_proto_rawDesc 4 | ) 5 | 6 | func file_sentencepiece_model_proto_rawDescGZIP() []byte { 7 | file_sentencepiece_model_proto_rawDescOnce.Do(func() { 8 | file_sentencepiece_model_proto_rawDescData = protoimpl.X.CompressGZIP(file_sentencepiece_model_proto_rawDescData) 9 | }) 10 | return file_sentencepiece_model_proto_rawDescData 11 | } 12 | 13 | var file_sentencepiece_model_proto_enumTypes = make([]protoimpl.EnumInfo, 2) 14 | var file_sentencepiece_model_proto_msgTypes = make([]protoimpl.MessageInfo, 6) 15 | var file_sentencepiece_model_proto_goTypes = []interface{}{ 16 | (TrainerSpec_ModelType)(0), // 0: sentencepiece.TrainerSpec.ModelType 17 | (ModelProto_SentencePiece_Type)(0), // 1: sentencepiece.ModelProto.SentencePiece.Type 18 | (*TrainerSpec)(nil), // 2: sentencepiece.TrainerSpec 19 | (*NormalizerSpec)(nil), // 3: sentencepiece.NormalizerSpec 20 | (*SelfTestData)(nil), // 4: sentencepiece.SelfTestData 21 | (*ModelProto)(nil), // 5: sentencepiece.ModelProto 22 | (*SelfTestData_Sample)(nil), // 6: sentencepiece.SelfTestData.Sample 23 | (*ModelProto_SentencePiece)(nil), // 7: sentencepiece.ModelProto.SentencePiece 24 | } 25 | var file_sentencepiece_model_proto_depIdxs = []int32{ 26 | 0, // 0: sentencepiece.TrainerSpec.model_type:type_name -> sentencepiece.TrainerSpec.ModelType 27 | 6, // 1: sentencepiece.SelfTestData.samples:type_name -> 
sentencepiece.SelfTestData.Sample 28 | 7, // 2: sentencepiece.ModelProto.pieces:type_name -> sentencepiece.ModelProto.SentencePiece 29 | 2, // 3: sentencepiece.ModelProto.trainer_spec:type_name -> sentencepiece.TrainerSpec 30 | 3, // 4: sentencepiece.ModelProto.normalizer_spec:type_name -> sentencepiece.NormalizerSpec 31 | 4, // 5: sentencepiece.ModelProto.self_test_data:type_name -> sentencepiece.SelfTestData 32 | 3, // 6: sentencepiece.ModelProto.denormalizer_spec:type_name -> sentencepiece.NormalizerSpec 33 | 1, // 7: sentencepiece.ModelProto.SentencePiece.type:type_name -> sentencepiece.ModelProto.SentencePiece.Type 34 | 8, // [8:8] is the sub-list for method output_type 35 | 8, // [8:8] is the sub-list for method input_type 36 | 8, // [8:8] is the sub-list for extension type_name 37 | 8, // [8:8] is the sub-list for extension extendee 38 | 0, // [0:8] is the sub-list for field type_name 39 | } 40 | 41 | func init() { file_sentencepiece_model_proto_init() } 42 | func file_sentencepiece_model_proto_init() { 43 | if File_sentencepiece_model_proto != nil { 44 | return 45 | } 46 | if !protoimpl.UnsafeEnabled { 47 | file_sentencepiece_model_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { 48 | switch v := v.(*TrainerSpec); i { 49 | case 0: 50 | return &v.state 51 | case 1: 52 | return &v.sizeCache 53 | case 2: 54 | return &v.unknownFields 55 | case 3: 56 | return &v.extensionFields 57 | default: 58 | return nil 59 | } 60 | } 61 | file_sentencepiece_model_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { 62 | switch v := v.(*NormalizerSpec); i { 63 | case 0: 64 | return &v.state 65 | case 1: 66 | return &v.sizeCache 67 | case 2: 68 | return &v.unknownFields 69 | case 3: 70 | return &v.extensionFields 71 | default: 72 | return nil 73 | } 74 | } 75 | file_sentencepiece_model_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { 76 | switch v := v.(*SelfTestData); i { 77 | case 0: 78 | return &v.state 79 | case 1: 
80 | return &v.sizeCache 81 | case 2: 82 | return &v.unknownFields 83 | case 3: 84 | return &v.extensionFields 85 | default: 86 | return nil 87 | } 88 | } 89 | file_sentencepiece_model_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { 90 | switch v := v.(*ModelProto); i { 91 | case 0: 92 | return &v.state 93 | case 1: 94 | return &v.sizeCache 95 | case 2: 96 | return &v.unknownFields 97 | case 3: 98 | return &v.extensionFields 99 | default: 100 | return nil 101 | } 102 | } 103 | file_sentencepiece_model_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { 104 | switch v := v.(*SelfTestData_Sample); i { 105 | case 0: 106 | return &v.state 107 | case 1: 108 | return &v.sizeCache 109 | case 2: 110 | return &v.unknownFields 111 | default: 112 | return nil 113 | } 114 | } 115 | file_sentencepiece_model_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { 116 | switch v := v.(*ModelProto_SentencePiece); i { 117 | case 0: 118 | return &v.state 119 | case 1: 120 | return &v.sizeCache 121 | case 2: 122 | return &v.unknownFields 123 | case 3: 124 | return &v.extensionFields 125 | default: 126 | return nil 127 | } 128 | } 129 | } 130 | type x struct{} 131 | out := protoimpl.TypeBuilder{ 132 | File: protoimpl.DescBuilder{ 133 | GoPackagePath: reflect.TypeOf(x{}).PkgPath(), 134 | RawDescriptor: file_sentencepiece_model_proto_rawDesc, 135 | NumEnums: 2, 136 | NumMessages: 6, 137 | NumExtensions: 0, 138 | NumServices: 0, 139 | }, 140 | GoTypes: file_sentencepiece_model_proto_goTypes, 141 | DependencyIndexes: file_sentencepiece_model_proto_depIdxs, 142 | EnumInfos: file_sentencepiece_model_proto_enumTypes, 143 | MessageInfos: file_sentencepiece_model_proto_msgTypes, 144 | }.Build() 145 | File_sentencepiece_model_proto = out.File 146 | file_sentencepiece_model_proto_rawDesc = nil 147 | file_sentencepiece_model_proto_goTypes = nil 148 | file_sentencepiece_model_proto_depIdxs = nil 149 | } 150 | 151 | 
-------------------------------------------------------------------------------- /test/htmlcode1.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 28 | 29 | 30 | 31 | 32 | The Go Programming Language 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 46 | 47 | 48 | 49 | 240 | -------------------------------------------------------------------------------- /test/latexcode1.txt: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{amsmath, amssymb} 3 | \usepackage{amsfonts} 4 | \usepackage{amsthm} 5 | 6 | \newtheorem{theorem}{Theorem} 7 | 8 | \begin{document} 9 | 10 | \title{Proof of Green's Theorem} 11 | \author{} 12 | \date{} 13 | \maketitle 14 | 15 | \begin{theorem}[Green's Theorem] 16 | Let \( C \) be a positively oriented, simple closed curve in the plane, and let \( D \) be the region bounded by \( C \). If \( L(x, y) \) and \( M(x, y) \) have continuous partial derivatives on an open region that contains \( D \) and \( C \), then 17 | \[ 18 | \oint_C \left( L \, dx + M \, dy \right) = \iint_D \left( \frac{\partial M}{\partial x} - \frac{\partial L}{\partial y} \right) dA. 19 | \] 20 | \end{theorem} 21 | 22 | \begin{proof} 23 | We will prove Green's Theorem by breaking the region \( D \) into small rectangles and then using the Fundamental Theorem of Calculus. 24 | 25 | Assume that the region \( D \) is divided into \( m \times n \) small rectangles. 
For each small rectangle \( R_{ij} \) with vertices \((x_i, y_j)\), \((x_{i+1}, y_j)\), \((x_{i+1}, y_{j+1})\), and \((x_i, y_{j+1})\), we approximate the line integral around the boundary of \( R_{ij} \): 26 | 27 | \[ 28 | \oint_{\partial R_{ij}} \left( L \, dx + M \, dy \right) \approx \left( M(x_{i+1}, y_{j+1}) - M(x_{i}, y_{j+1}) \right)(x_{i+1} - x_i) - \left( L(x_{i+1}, y_{j+1}) - L(x_{i+1}, y_j) \right)(y_{j+1} - y_j). 29 | \] 30 | 31 | This expression can be rewritten as: 32 | 33 | \[ 34 | \oint_{\partial R_{ij}} \left( L \, dx + M \, dy \right) \approx \left( \frac{\partial M}{\partial x} - \frac{\partial L}{\partial y} \right) \Delta x \Delta y, 35 | \] 36 | where \( \Delta x = x_{i+1} - x_i \) and \( \Delta y = y_{j+1} - y_j \). 37 | 38 | Summing over all rectangles in the region \( D \), we obtain: 39 | 40 | \[ 41 | \sum_{i,j} \oint_{\partial R_{ij}} \left( L \, dx + M \, dy \right) \approx \sum_{i,j} \left( \frac{\partial M}{\partial x} - \frac{\partial L}{\partial y} \right) \Delta x \Delta y. 42 | \] 43 | 44 | The left-hand side of this equation is approximately the line integral over \( C \), and the right-hand side is a Riemann sum that approximates the double integral over \( D \): 45 | 46 | \[ 47 | \oint_C \left( L \, dx + M \, dy \right) = \iint_D \left( \frac{\partial M}{\partial x} - \frac{\partial L}{\partial y} \right) dA. 48 | \] 49 | 50 | Thus, Green's Theorem is proved. 51 | \end{proof} 52 | 53 | \end{document} 54 | 55 | -------------------------------------------------------------------------------- /test/opening-multilang.txt: -------------------------------------------------------------------------------- 1 | Somewhere in la Mancha, in a place whose name I do not care to remember, a 2 | gentleman lived not long ago, one of those who has a lance and ancient shield on 3 | a shelf and keeps a skinny nag and a greyhound for racing. 
4 | 5 | En un lugar de la Mancha, de cuyo nombre no quiero acordarme, no ha mucho tiempo 6 | que vivía un hidalgo de los de lanza en astillero, adarga antigua, rocín flaco y 7 | galgo corredor. 8 | 9 | 「ラ・マンチャのどこか、名前を覚えたくもない場所で、古い盾と槍を棚に飾り、痩せた馬と猟犬を飼っていた紳士が、そう遠くない昔に住んでいた。」 10 | 11 | 라 만차 어딘가에서, 이름을 기억하고 싶지 않은 장소에서, 고대 방패와 창을 선반에 두고, 말라깽이 말과 경주용 그레이하운드를 키우는 신사가 얼마 전에 살았다. 12 | 13 | ला मांचायाः काचित् स्थले, यस्य नाम स्मर्तुम् न इच्छामि, तत्र कदाचित् कश्चन सज्जनः वसति स्म, यस्य शस्त्रं प्राचीनं च कवचं तिष्ठति, तस्य च अश्वः कृशः च श्वा धावनाय अस्ति। 14 | 15 | Где-то в Ла-Манче, в месте, имя которого я не хочу вспоминать, жил некогда 16 | дворянин, один из тех, кто держал копье и старинный щит на полке, а также худую 17 | лошадь и борзую для охоты. 18 | -------------------------------------------------------------------------------- /test/perlcode1.txt: -------------------------------------------------------------------------------- 1 | sub init_sim 2 | { 3 | my $args = 4 | { 5 | -init_addr => 0, 6 | -mem_file => undef, 7 | -mem_ref => undef, 8 | -device_dir => './', 9 | @_, 10 | }; 11 | 12 | $args->{-device_dir} .= '/' unless $args->{-device_dir} =~ /\/^/; 13 | 14 | # init memory 15 | for (my $i = 0; $i < get_mix_mem_size(); ++$i) 16 | { 17 | $mem[$i] = empty_word(); 18 | } 19 | 20 | $rA = empty_word(); 21 | $rX = empty_word(); 22 | $rJ = empty_word(); 23 | $rI[$_] = empty_word() 24 | foreach (1 .. 6); 25 | 26 | $f_overflow = 0; 27 | $f_comparison = 0; 28 | $time = 0; 29 | $lc = $args->{-init_addr}; 30 | $simulation_ended = 0; 31 | @io_device = (); 32 | 33 | # init IO devices 34 | # 35 | foreach my $n (0 .. 
15) 36 | { 37 | if ($n >= 0 and $n <= 7) 38 | { 39 | push(@io_device, {filename => "tape${n}.dev", io_type => "bio", block_size => 100, data => undef}); 40 | } 41 | elsif ($n >= 8 and $n <= 15) 42 | { 43 | my $m = $n - 8; 44 | push(@io_device, {filename => "disk${m}.dev", io_type => "bio", block_size => 100, data => undef}); 45 | } 46 | } 47 | 48 | push(@io_device, {filename => "cardrd.dev", io_type => "ci", block_size => 16}); 49 | push(@io_device, {filename => "cardwr.dev", io_type => "co", block_size => 16}); 50 | push(@io_device, {filename => "printer.dev", io_type => "co", block_size => 24}); 51 | push(@io_device, {filename => "stdio", io_type => "cio", block_size => 14}); 52 | push(@io_device, {filename => "paper.dev", io_type => "ci", block_size => 14}); 53 | 54 | foreach my $dev (@io_device) 55 | { 56 | $dev->{filename} = $args->{-device_dir} . $dev->{filename}; 57 | } 58 | 59 | $saved_mem_file = $args->{-mem_file}; 60 | $saved_mem_ref = $args->{-mem_ref}; 61 | $saved_init_addr = $args->{-init_addr}; 62 | 63 | if (defined $args->{-mem_file}) 64 | { 65 | load_memory_from_text_file($args->{-mem_file}); 66 | } 67 | elsif (defined $args->{-mem_ref}) 68 | { 69 | @mem = @{$args->{-mem_ref}}; 70 | } 71 | else 72 | { 73 | warn("No memory file or reference given to the simulator\n"); 74 | } 75 | } 76 | 77 | 78 | sub simulation_ended 79 | { 80 | return $simulation_ended; 81 | } 82 | 83 | 84 | sub fetch_next_instruction 85 | { 86 | return @{$mem[$lc]}; 87 | } 88 | 89 | 90 | # Executes one instruction 91 | # 92 | sub step_sim 93 | { 94 | address_is_legal($lc) 95 | or runtime_error("location counter out of memory bounds"); 96 | 97 | my @word = fetch_next_instruction(); 98 | 99 | my $opcode = $word[5]; 100 | my $F = $word[4]; 101 | 102 | if ($opcode == 5 and $F == 2) # HLT 103 | { 104 | $simulation_ended = 1; 105 | return; 106 | } 107 | elsif ($opcode == 0) # NOP 108 | { 109 | $lc++; 110 | return; 111 | } 112 | else 113 | { 114 | # Dispatch the instruction to the 
appropriate handler, 115 | # based on the opcode. 116 | # 117 | if (defined $opcode_map{$opcode}) 118 | { 119 | my $op_func = $opcode_map{$opcode}; 120 | $op_func->(@word); 121 | $lc++; 122 | } 123 | else 124 | { 125 | runtime_error("illegal opcode: $opcode"); 126 | } 127 | } 128 | } 129 | 130 | 131 | sub get_mem_ref 132 | { 133 | return \@mem; 134 | } 135 | 136 | 137 | # Simulates the MIX code until a HLT instruction is 138 | # incountered. 139 | # 140 | sub run_sim 141 | { 142 | # step through the whole program 143 | # 144 | until (simulation_ended()) 145 | { 146 | step_sim(); 147 | } 148 | 149 | # update the binary devices 150 | # 151 | foreach my $devref (@io_device) 152 | { 153 | next unless is_binary_device($devref) and defined $devref->{data}; 154 | 155 | my $fh = $devref->{handle}; 156 | close $fh if defined $fh; 157 | 158 | unless (open($fh, ">$devref->{filename}")) 159 | { 160 | warn "Unable to write device $devref->{filename}\n"; 161 | next; 162 | } 163 | 164 | foreach my $block_n (keys %{$devref->{data}}) 165 | { 166 | print $fh "$block_n\n"; 167 | 168 | for (my $i = 0; $i < $devref->{block_size}; ++$i) 169 | { 170 | print $fh sprintf("%2s %2s %2s %2s %2s %2s\n", @{$devref->{data}->{$block_n}->[$i]}); 171 | } 172 | } 173 | 174 | close $fh; 175 | } 176 | } 177 | 178 | sub interactive_sim 179 | { 180 | local $| = 1; 181 | my %breakpoints; 182 | 183 | print "\nWelcome to MIXSim interaction !\n\n"; 184 | 185 | interaction: while (1) 186 | { 187 | printf "[%4s]> ", $lc; 188 | my $command = <>; 189 | chomp($command); 190 | 191 | # strip leading and trailing whitespace 192 | $command =~ s/^\s+//; 193 | $command =~ s/\s+$//; 194 | 195 | my @toks = split('\s+', $command); 196 | next if @toks == 0; 197 | 198 | if ($command eq "s") 199 | { 200 | step_sim(); 201 | 202 | print "Simulation ended (HLT)\n" if (simulation_ended()); 203 | 204 | } 205 | elsif ($command eq "c" or $command eq "cl") 206 | { 207 | step_loop: while (1) 208 | { 209 | if (exists 
$breakpoints{$lc}) 210 | { 211 | print "Breakpoint stop at address $lc\n"; 212 | last step_loop; 213 | } 214 | 215 | if (simulation_ended()) 216 | { 217 | print "Simulation ended (HLT)\n" if (simulation_ended()); 218 | last step_loop; 219 | } 220 | 221 | print "$lc\n" if $command eq "cl"; 222 | step_sim(); 223 | } 224 | } 225 | elsif ($command eq "rst") 226 | { 227 | if (defined $saved_mem_file) 228 | { 229 | init_sim(-mem_file => $saved_mem_file, -init_addr => $saved_init_addr); 230 | } 231 | elsif (defined $saved_mem_ref) 232 | { 233 | init_sim(-mem_ref => $saved_mem_ref, -init_addr => $saved_init_addr); 234 | } 235 | } 236 | elsif ($command eq "r") 237 | { 238 | print state_dump(), "\n"; 239 | } 240 | elsif ($command eq "sr") 241 | { 242 | step_sim(); 243 | print state_dump(), "\n"; 244 | } 245 | elsif ($toks[0] eq "m") 246 | { 247 | if (@toks == 1) 248 | { 249 | print memory_dump(\@mem); 250 | } 251 | elsif (@toks == 2) 252 | { 253 | my $addr = $toks[1]; 254 | address_is_legal($addr) or interactive_error("Illegal address $addr"); 255 | printf("%4s : %2s %2s %2s %2s %2s %2s\n", $addr, @{$mem[$addr]}); 256 | } 257 | else 258 | { 259 | interactive_error("Illegal m command"); 260 | } 261 | } 262 | elsif ($toks[0] eq "b") 263 | { 264 | if (@toks != 2) 265 | { 266 | interactive_error("Illegal b command"); 267 | next; 268 | } 269 | 270 | my $addr = $toks[1]; 271 | 272 | if (not address_is_legal($addr)) 273 | { 274 | interactive_error("Illegal address $addr"); 275 | next; 276 | } 277 | 278 | if (exists $breakpoints{$addr}) 279 | { 280 | delete($breakpoints{$addr}); 281 | print "Removed breakpoint at $addr\n"; 282 | } 283 | else 284 | { 285 | $breakpoints{$addr} = 1; 286 | print "Set breakpoint at $addr\n"; 287 | } 288 | } 289 | elsif ($command eq "bl") 290 | { 291 | my @bkpt_keys = keys %breakpoints; 292 | 293 | if (@bkpt_keys == 0) 294 | { 295 | print "No breakpoints set\n"; 296 | } 297 | else 298 | { 299 | print "Breakpoints set at:\n"; 300 | 301 | if (@bkpt_keys == 
1) 302 | { 303 | print "$bkpt_keys[0] "; 304 | } 305 | else 306 | { 307 | foreach my $addr (sort {$a <=> $b} @bkpt_keys) 308 | { 309 | print "$addr "; 310 | } 311 | } 312 | 313 | print "\n"; 314 | } 315 | } 316 | elsif ($command eq "br") 317 | { 318 | %breakpoints = (); 319 | } 320 | elsif ($command eq "h") 321 | { 322 | print "\n*** MIXSim interaction help ***\n\n"; 323 | print "s \t\t step\n"; 324 | print "c \t\t continue until next breakpoint or HLT\n"; 325 | print "cl \t\t same as 'c', with an execution trace\n"; 326 | print "rst \t\t restart simulation (breakpoints remain)\n"; 327 | print "r \t\t print contents of registers\n"; 328 | print "sr \t\t step and print contents of registers\n"; 329 | print "m \t\t print all non-zero memory words\n"; 330 | print "m \t\t print a memory word at \n"; 331 | print "b \t\t set/unset a breakpoint at \n"; 332 | print "bl \t\t list all breakpoints\n"; 333 | print "br \t\t remove all breakpoints\n"; 334 | print "h \t\t show this help\n"; 335 | print "x or q \t\t exit interaction\n\n"; 336 | } 337 | elsif ($command eq "x" or $command eq "q") 338 | { 339 | last interaction; 340 | } 341 | else 342 | { 343 | print "Illegal command. Type 'h' for help\n"; 344 | } 345 | } 346 | 347 | print "\nBye !\n\n"; 348 | } 349 | 350 | 351 | # Returns a state dump - contents of all the registers 352 | # 353 | sub state_dump 354 | { 355 | my $dump_str = ""; 356 | 357 | $dump_str .= sprintf("rA : %2s %2s %2s %2s %2s %2s\n", @{$rA}); 358 | $dump_str .= sprintf("rX : %2s %2s %2s %2s %2s %2s\n", @{$rX}); 359 | 360 | $dump_str .= sprintf("rI$_ : %2s %2s %2s %2s %2s %2s\n", @{$rI[$_]}) 361 | foreach (1 .. 
6); 362 | 363 | $dump_str .= "\n"; 364 | $dump_str .= sprintf("rJ : %2s %2s %2s %2s %2s %2s\n", @{$rJ}); 365 | $dump_str .= sprintf("lc : %5s\n", $lc); 366 | $dump_str .= sprintf("ovf : %2s\n", $f_overflow); 367 | $dump_str .= sprintf("comp : %2s\n", $f_comparison); 368 | } 369 | 370 | 371 | # Reports runtime errors - errors that occured during simulation 372 | # as a result of incorrect machine code. $lc is reported 373 | # 374 | sub runtime_error 375 | { 376 | my ($msg) = @_; 377 | 378 | die("Simulation error at address $lc: $msg\n"); 379 | } 380 | 381 | 382 | -------------------------------------------------------------------------------- /test/pg2000_spanish.txt: -------------------------------------------------------------------------------- 1 | The Project Gutenberg eBook of Don Quijote 2 | 3 | This ebook is for the use of anyone anywhere in the United States and 4 | most other parts of the world at no cost and with almost no restrictions 5 | whatsoever. You may copy it, give it away or re-use it under the terms 6 | of the Project Gutenberg License included with this ebook or online 7 | at www.gutenberg.org. If you are not located in the United States, 8 | you will have to check the laws of the country where you are located 9 | before using this eBook. 
10 | 11 | Title: Don Quijote 12 | 13 | Author: Miguel de Cervantes Saavedra 14 | 15 | Release date: December 1, 1999 [eBook #2000] 16 | Most recently updated: January 17, 2021 17 | 18 | Language: Spanish 19 | 20 | Credits: an anonymous Project Gutenberg volunteer and Joaquin Cuenca Abela 21 | 22 | 23 | *** START OF THE PROJECT GUTENBERG EBOOK DON QUIJOTE *** 24 | 25 | 26 | 27 | 28 | El ingenioso hidalgo don Quijote de la Mancha 29 | 30 | 31 | 32 | por Miguel de Cervantes Saavedra 33 | 34 | 35 | 36 | 37 | 38 | El ingenioso hidalgo don Quijote de la Mancha 39 | 40 | 41 | 42 | Tasa 43 | 44 | 45 | Testimonio de las erratas 46 | 47 | 48 | El Rey 49 | 50 | 51 | Al Duque de Béjar 52 | 53 | 54 | Prólogo 55 | 56 | 57 | Al libro de don Quijote de la Mancha 58 | 59 | 60 | 61 | Que trata de la condición y ejercicio del famoso 62 | hidalgo don Quijote de la Mancha 63 | 64 | Que trata de la primera salida que de su tierra hizo 65 | el ingenioso don Quijote 66 | 67 | Donde se cuenta la graciosa manera que tuvo don 68 | Quijote en armarse caballero 69 | 70 | De lo que le sucedió a nuestro caballero cuando salió 71 | de la venta 72 | 73 | Donde se prosigue la narración de la desgracia de 74 | nuestro caballero 75 | 76 | Del donoso y grande escrutinio que el cura y el 77 | barbero hicieron en la librería de nuestro ingenioso hidalgo 78 | 79 | De la segunda salida de nuestro buen caballero don 80 | Quijote de la Mancha 81 | 82 | Del buen suceso que el valeroso don Quijote tuvo en 83 | la espantable y jamás imaginada aventura de los molinos de viento, con 84 | otros sucesos dignos de felice recordación 85 | 86 | Donde se concluye y da fin a la estupenda batalla que 87 | el gallardo vizcaíno y el valiente manchego tuvieron 88 | 89 | De lo que más le avino a don Quijote con el vizcaíno, y 90 | del peligro en que se vio con una turba de yangüeses 91 | 92 | De lo que le sucedió a don Quijote con unos 93 | cabreros 94 | 95 | De lo que contó un cabrero a los que estaban con don 96 | Quijote 
97 | 98 | Donde se da fin al cuento de la pastora Marcela, con 99 | otros sucesos 100 | 101 | Donde se ponen los versos desesperados del difunto 102 | pastor, con otros no esperados sucesos 103 | 104 | Donde se cuenta la desgraciada aventura que se topó 105 | don Quijote en topar con unos desalmados yangüeses 106 | 107 | De lo que le sucedió al ingenioso hidalgo en la venta 108 | que él imaginaba ser castillo 109 | 110 | Donde se prosiguen los innumerables trabajos que el 111 | bravo don Quijote y su buen escudero Sancho Panza pasaron en la venta 112 | que, por su mal, pensó que era castillo 113 | 114 | Donde se cuentan las razones que pasó Sancho Panza 115 | con su señor Don Quijote, con otras aventuras dignas de ser 116 | contadas 117 | 118 | De las discretas razones que Sancho pasaba con su 119 | amo, y de la aventura que le sucedió con un cuerpo muerto, con otros 120 | acontecimientos famosos 121 | 122 | De la jamás vista ni oída aventura que con más poco 123 | peligro fue acabada de famoso caballero en el mundo, como la que acabó 124 | el valeroso don Quijote de la Mancha 125 | 126 | Que trata de la alta aventura y rica ganancia del 127 | yelmo de Mambrino, con otras cosas sucedidas a nuestro invencible 128 | caballero 129 | 130 | De la libertad que dio don Quijote a muchos 131 | desdichados que, mal de su grado, los llevaban donde no quisieran 132 | ir 133 | 134 | De lo que le aconteció al famoso don Quijote en 135 | Sierra Morena, que fue una de las más raras aventuras que en esta 136 | verdadera historia se cuentan 137 | 138 | Donde se prosigue la aventura de la Sierra 139 | Morena 140 | 141 | Que trata de las estrañas cosas que en Sierra Morena 142 | sucedieron al valiente caballero de la Mancha, y de la imitación que 143 | hizo a la penitencia de Beltenebros 144 | 145 | Donde se prosiguen las finezas que de enamorado hizo 146 | don Quijote en Sierra Morena 147 | 148 | De cómo salieron con su intención el cura y el 149 | barbero, con otras cosas dignas de 
que se cuenten en esta grande 150 | historia 151 | 152 | Que trata de la nueva y agradable aventura que al 153 | cura y barbero sucedió en la mesma sierra 154 | 155 | Que trata de la discreción de la hermosa Dorotea, 156 | con otras cosas de mucho gusto y pasatiempo 157 | 158 | Que trata del gracioso artificio y orden que se tuvo 159 | en sacar a nuestro enamorado caballero de la asperísima penitencia en 160 | que se había puesto 161 | 162 | De los sabrosos razonamientos que pasaron entre don 163 | Quijote y Sancho Panza, su escudero, con otros sucesos 164 | 165 | Que trata de lo que sucedió en la venta a toda la 166 | cuadrilla de don Quijote 167 | 168 | Donde se cuenta la novela del Curioso 169 | impertinente 170 | -------------------------------------------------------------------------------- /test/pg41845_telugu.txt: -------------------------------------------------------------------------------- 1 | The Project Gutenberg eBook of ఓనమాలు 2 | 3 | This ebook is for the use of anyone anywhere in the United States and 4 | most other parts of the world at no cost and with almost no restrictions 5 | whatsoever. You may copy it, give it away or re-use it under the terms 6 | of the Project Gutenberg License included with this ebook or online 7 | at www.gutenberg.org. If you are not located in the United States, 8 | you will have to check the laws of the country where you are located 9 | before using this eBook. 10 | 11 | Title: ఓనమాలు 12 | 13 | Author: Mahidhara Ramamohan Rao 14 | 15 | Release date: January 14, 2013 [eBook #41845] 16 | 17 | Language: Telugu 18 | 19 | Credits: Produced by volunteers at Pustakam.net 20 | 21 | 22 | *** START OF THE PROJECT GUTENBERG EBOOK ఓనమాలు *** 23 | 24 | 25 | 26 | 27 | Produced by volunteers at Pustakam.net 28 | 29 | 30 | 31 | 32 | అవంతీ ప్రచురణలు 4. 
33 | 34 | 35 | 36 | 37 | ఓనమాలు 38 | 39 | 40 | 41 | 42 | రచన: 43 | 44 | మహీధర రామమోహనరావు 45 | 46 | 47 | 48 | 49 | సోల్ డిస్ట్రిబ్యూటర్లు: 50 | 51 | విశాలాంధ్ర ప్రచురణాలయం, 52 | 53 | విజయవాడ-2 54 | 55 | 56 | 57 | 58 | మొదటి ముద్రణ 59 | 60 | 1956 61 | 62 | 63 | 64 | 65 | వెల 66 | 67 | రెండు రూపాయల పావలా 68 | 69 | 70 | 71 | 72 | అవంతీ ప్రెస్ 73 | 74 | రాజమండ్రి 75 | 76 | 77 | 78 | 79 | 1947.... 80 | 81 | ....నాటి తెలంగాణా ఒక అగ్నిగుండం. 82 | 83 | దుస్సహమైన జాగీర్దారీ వ్యవస్థను నిర్మూలించగల పోరాటాల్ని ప్రజానీకం సాగిస్తూంది. వాటినన్నింటినీ ఒకే జెండా క్రిందికి తెచ్చి, 84 | రాజకీయ నాయకత్వం సమకూర్చడానికై ఆంధ్రమహాసభా, కమ్యూనిస్టు పార్టీ సన్నాహాలు సాగిస్తున్నాయి. 85 | 86 | రెండో వైపున – విదేశీ పాలనకూ, సంస్థానాధీశుల నిరంకుశ పాలనకూ వ్యతిరేకంగా జాతీయ ప్రజాతంత్ర పోరాటాలు తెలంగాణాన్ని 87 | అలుముకొంటున్నాయి. 88 | 89 | ప్రజాతంత్ర హక్కులకై సాగుతున్న ఈ పోరాటాలు ఐక్యతను కూర్చుకొంటూ నిజాము పరిపాలనా యంత్రాన్ని మొదలంట కదిల్చివేస్తున్నాయి. 90 | 91 | ఈ దశలో … 92 | 93 | విచ్ఛిన్నమైపోతున్న జాగీర్దారీ వ్యవస్థను రక్షించగల శక్తి నిజాము ప్రభుత్వానికి లేదని గ్రహించిన భూస్వామ్యవర్గం నూతన 94 | నాయకత్వం కొరకై వెతుకులాడుతూ జాతీయోద్యమంలో తనకు రక్షణనివ్వగల శక్తుల్ని చూసుకొంది. 95 | 96 | సమాజంలో తనకున్న బలం క్రమంగా క్షీణించి పోతూంటే, కూలిపోతున్న తన అధికారాన్ని పరిరక్షించుకొనేటందుకై మతవాదుల్నీ, రౌడీల్ని 97 | సమీకరించి విధ్వంసకాండకు పూనుకొంది నిజాము సర్కారు. 98 | 99 | ప్రజానీకానికీ, ప్రతిరోధ శక్తులకూ మధ్య జరిగిన ఈ ఘర్షణలలో తెలంగాణా ఒక అగ్నిగుండమే అయింది. 100 | 101 | ఆనాటి సంఘర్షణలే నా ఈ నవలకు కథావస్తువు. సుదీర్ఘమైన ఈ నవలలో మొదటి భాగం పాఠకుల ముందుంచుతున్నా. త్వరలోనే 102 | మిగతావీ. 103 | 104 | విజయవాడ, 105 | 106 | 20-3-56 107 | 108 | రచయిత. 109 | 110 | 111 | 112 | 113 | భూమి కోసం 114 | భుక్తి కోసం 115 | నిగళబంధ 116 | విముక్తి కోసం 117 | నేల కొరిగిన 118 | తెలుగు జోదుల 119 | కిత్తు నంజలులు. 120 | 121 | కృతజ్ఞత 122 | 123 | తమ పత్రికలో ధారావాహికగా వెలువడిన ఈ నవలను పుస్తకరూపంలో ప్రచురించుకొనుటకనుమతించిన విశాలాంధ్ర సంపాదకులకు - 124 | 125 | రచయిత. 
126 | 127 | 128 | 129 | 130 | ఓనమాలు 131 | (మొదటి భాగం) 132 | 133 | 134 | 135 | 136 | ఒకటో ప్రకరణం. 137 | 138 | 139 | అటువంటివాడు ఒక వారం పది రోజులనుంచి పరధ్యానంగా వుంటున్నాడు. ఆతడు దూరదూరంగా వుంటున్నాడనిపించింది. ఆ ఆలోచనతో మనస్సు 140 | కరిగిపోతూంది; హృదయం ఆరాటపడిపోతూంది; అతనిని కదిలించడానికి చేసిన ప్రయత్నాలన్నీ, విఫలం అయ్యాయనిపిస్తూంటే ఎంతో 141 | బాధపడిపోతూంది. ఈ వారం పది రోజులుగా అతనిలో కనిపిస్తున్న ధోరణి ఏమిటో అర్థం కాలేదు. ఏమేమిటో కారణాలు కల్పించుకొంటూంది. 142 | ఆ కారణాలన్నీ ఆమెను మరింత బాధిస్తున్నాయి. 143 | 144 | అతడు తన ఎరికలో ఇంత గాఢంగా ఆలోచనల్లో మునిగి వుండడం ఎప్పుడూ జరగలేదు. అతడు ఆలోచించవలసిన విషయాలు మాత్రం 145 | పెద్దగా ఏం వున్నాయిగనక. ఆస్తా...సెంటు భూమి లేదు. పన్నుకి పీడించేవాళ్ళింక పుట్టవలిసిందేనని అతడే వేళాకోళంగా 146 | అంటూంటాడు....తల్లా, తండ్రా?...ఆ ఇద్దరూ కూడా ఏనాడో మరణించారు. 147 | 148 | ...పెళ్ళామా, పిల్లలా?....ఈ మాట ఆలోచనకు వచ్చినప్పుడు సత్తెమ్మ అంత సులభంగా 'కాదు' అనుకోలేకపోయింది. 149 | ఆలోచించగా, ఆలోచించగా అసలు కారణం అక్కడే వున్నట్లు కూడా అనిపించింది. అనిపించడంతో కళ్ళనీళ్లు తిరిగేయి. 150 | 151 | అతనిని కాదనడానికి తనకున్న హక్కు ఏమిటి? అతని కోసం తాను ఎంతయినా త్యాగం చేసి వుండొచ్చు. ఉండొచ్చునేమిటి? చేసింది. 152 | 153 | ఊరువాళ్ళ మాటల్ని ఖాతరు చెయ్యలేదు. తల్లి ఏడ్పును లెక్కచెయ్యలేదు. కుల మర్యాదల నాలోచించలేదు. అతని కోసం 154 | ఆత్మార్పణ చేసుకొంది. సమాజంలో ఆడది చేయగల త్యాగానికది పరాకాష్ఠ. అయితేనేం?... 155 | 156 | అతడు తనకి మగడు కాదు. తనకి మగడు లేడు. వెంకటయ్య కోసం తాను ఎంత తపన పడ్డా, తానో వితంతువు మాత్రమే. అతని 157 | మీద తనకు హక్కు లేదు. 158 | 159 | తనతో సావాసం చేసేక అతడు ఇతర పడుచుల్ని అంటుకోలేదు. కన్నెత్తి కూడా చూడలేదు. వెంకటయ్య కోసం దార్లుకాచిన పడుచుల్నీ, 160 | అతని మాటకోసం కాట్లాడుకొన్న పడుచుల్నీ ఆమె ఎరుగును. అన్నీ ఎరిగే ఆమె అతనితో నేస్తం చేసింది. తనతో చేరేక అతడు 161 | పూర్తిగా మారిపోయేడు. అతని పరిచయాల విషయంలో తాను పడ్డ జాలికూడా అతనికి నవ్వుతాలయింది. ఆ సంగతినామె ఎరుగును. అతడు 162 | తనదే లోకంగా ఆనందిస్తున్నాడు. తనకేమాత్రం కష్టం కలిగినా గిజగిజలాడి పోతాడు. తన కాళ్ళక్రింద కళ్ళు పరిచేడు. 163 | కళ్ళముందు హృదయం విప్పేడు. 
164 | 165 | -------------------------------------------------------------------------------- /test/pg7193_english.txt: -------------------------------------------------------------------------------- 1 | The Project Gutenberg eBook of The Adventures of Tom Sawyer, Part 1. 2 | 3 | This ebook is for the use of anyone anywhere in the United States and 4 | most other parts of the world at no cost and with almost no restrictions 5 | whatsoever. You may copy it, give it away or re-use it under the terms 6 | of the Project Gutenberg License included with this ebook or online 7 | at www.gutenberg.org. If you are not located in the United States, 8 | you will have to check the laws of the country where you are located 9 | before using this eBook. 10 | 11 | Title: The Adventures of Tom Sawyer, Part 1. 12 | 13 | Author: Mark Twain 14 | 15 | Release date: June 29, 2004 [eBook #7193] 16 | Most recently updated: December 30, 2020 17 | 18 | Language: English 19 | 20 | Credits: Produced by David Widger 21 | 22 | 23 | *** START OF THE PROJECT GUTENBERG EBOOK THE ADVENTURES OF TOM SAWYER, PART 1. *** 24 | 25 | 26 | 27 | 28 | Produced by David Widger 29 | 30 | 31 | 32 | 33 | THE ADVENTURES OF TOM SAWYER 34 | BY 35 | MARK TWAIN 36 | (Samuel Langhorne Clemens) 37 | 38 | Part 1 39 | 40 | 41 | P R E F A C E 42 | 43 | MOST of the adventures recorded in this book really occurred; one or 44 | two were experiences of my own, the rest those of boys who were 45 | schoolmates of mine. Huck Finn is drawn from life; Tom Sawyer also, but 46 | not from an individual--he is a combination of the characteristics of 47 | three boys whom I knew, and therefore belongs to the composite order of 48 | architecture. 49 | 50 | The odd superstitions touched upon were all prevalent among children 51 | and slaves in the West at the period of this story--that is to say, 52 | thirty or forty years ago. 
53 | 54 | Although my book is intended mainly for the entertainment of boys and 55 | girls, I hope it will not be shunned by men and women on that account, 56 | for part of my plan has been to try to pleasantly remind adults of what 57 | they once were themselves, and of how they felt and thought and talked, 58 | and what queer enterprises they sometimes engaged in. 59 | 60 | THE AUTHOR. 61 | 62 | HARTFORD, 1876. 63 | 64 | 65 | 66 | T O M S A W Y E R 67 | 68 | 69 | 70 | CHAPTER I 71 | 72 | "TOM!" 73 | 74 | No answer. 75 | 76 | "TOM!" 77 | 78 | No answer. 79 | 80 | "What's gone with that boy, I wonder? You TOM!" 81 | 82 | No answer. 83 | 84 | The old lady pulled her spectacles down and looked over them about the 85 | room; then she put them up and looked out under them. She seldom or 86 | never looked THROUGH them for so small a thing as a boy; they were her 87 | state pair, the pride of her heart, and were built for "style," not 88 | service--she could have seen through a pair of stove-lids just as well. 89 | She looked perplexed for a moment, and then said, not fiercely, but 90 | still loud enough for the furniture to hear: 91 | 92 | "Well, I lay if I get hold of you I'll--" 93 | 94 | She did not finish, for by this time she was bending down and punching 95 | under the bed with the broom, and so she needed breath to punctuate the 96 | punches with. She resurrected nothing but the cat. 97 | 98 | "I never did see the beat of that boy!" 99 | 100 | She went to the open door and stood in it and looked out among the 101 | tomato vines and "jimpson" weeds that constituted the garden. No Tom. 102 | So she lifted up her voice at an angle calculated for distance and 103 | shouted: 104 | 105 | "Y-o-u-u TOM!" 106 | 107 | There was a slight noise behind her and she turned just in time to 108 | seize a small boy by the slack of his roundabout and arrest his flight. 109 | 110 | "There! I might 'a' thought of that closet. What you been doing in 111 | there?" 
112 | 113 | "Nothing." 114 | 115 | "Nothing! Look at your hands. And look at your mouth. What IS that 116 | truck?" 117 | 118 | "I don't know, aunt." 119 | 120 | "Well, I know. It's jam--that's what it is. Forty times I've said if 121 | you didn't let that jam alone I'd skin you. Hand me that switch." 122 | 123 | The switch hovered in the air--the peril was desperate-- 124 | 125 | "My! Look behind you, aunt!" 126 | 127 | The old lady whirled round, and snatched her skirts out of danger. The 128 | lad fled on the instant, scrambled up the high board-fence, and 129 | disappeared over it. 130 | 131 | His aunt Polly stood surprised a moment, and then broke into a gentle 132 | laugh. 133 | 134 | "Hang the boy, can't I never learn anything? Ain't he played me tricks 135 | enough like that for me to be looking out for him by this time? But old 136 | fools is the biggest fools there is. Can't learn an old dog new tricks, 137 | as the saying is. But my goodness, he never plays them alike, two days, 138 | and how is a body to know what's coming? He 'pears to know just how 139 | long he can torment me before I get my dander up, and he knows if he 140 | can make out to put me off for a minute or make me laugh, it's all down 141 | again and I can't hit him a lick. I ain't doing my duty by that boy, 142 | and that's the Lord's truth, goodness knows. Spare the rod and spile 143 | the child, as the Good Book says. I'm a laying up sin and suffering for 144 | us both, I know. He's full of the Old Scratch, but laws-a-me! he's my 145 | own dead sister's boy, poor thing, and I ain't got the heart to lash 146 | him, somehow. Every time I let him off, my conscience does hurt me so, 147 | and every time I hit him my old heart most breaks. Well-a-well, man 148 | that is born of woman is of few days and full of trouble, as the 149 | Scripture says, and I reckon it's so. 
He'll play hookey this evening, * 150 | and [* Southwestern for "afternoon"] I'll just be obleeged to make him 151 | work, to-morrow, to punish him. It's mighty hard to make him work 152 | Saturdays, when all the boys is having holiday, but he hates work more 153 | than he hates anything else, and I've GOT to do some of my duty by him, 154 | or I'll be the ruination of the child." 155 | 156 | Tom did play hookey, and he had a very good time. He got back home 157 | barely in season to help Jim, the small colored boy, saw next-day's 158 | wood and split the kindlings before supper--at least he was there in 159 | time to tell his adventures to Jim while Jim did three-fourths of the 160 | work. Tom's younger brother (or rather half-brother) Sid was already 161 | through with his part of the work (picking up chips), for he was a 162 | quiet boy, and had no adventurous, troublesome ways. 163 | 164 | While Tom was eating his supper, and stealing sugar as opportunity 165 | offered, Aunt Polly asked him questions that were full of guile, and 166 | very deep--for she wanted to trap him into damaging revealments. Like 167 | many other simple-hearted souls, it was her pet vanity to believe she 168 | was endowed with a talent for dark and mysterious diplomacy, and she 169 | loved to contemplate her most transparent devices as marvels of low 170 | cunning. Said she: 171 | 172 | "Tom, it was middling warm in school, warn't it?" 173 | 174 | "Yes'm." 175 | 176 | "Powerful warm, warn't it?" 177 | 178 | "Yes'm." 179 | 180 | "Didn't you want to go in a-swimming, Tom?" 181 | 182 | A bit of a scare shot through Tom--a touch of uncomfortable suspicion. 183 | He searched Aunt Polly's face, but it told him nothing. So he said: 184 | 185 | "No'm--well, not very much." 186 | 187 | The old lady reached out her hand and felt Tom's shirt, and said: 188 | 189 | "But you ain't too warm now, though." 
And it flattered her to reflect 190 | that she had discovered that the shirt was dry without anybody knowing 191 | that that was what she had in her mind. But in spite of her, Tom knew 192 | where the wind lay, now. So he forestalled what might be the next move: 193 | 194 | "Some of us pumped on our heads--mine's damp yet. See?" 195 | 196 | Aunt Polly was vexed to think she had overlooked that bit of 197 | circumstantial evidence, and missed a trick. Then she had a new 198 | inspiration: 199 | 200 | "Tom, you didn't have to undo your shirt collar where I sewed it, to 201 | pump on your head, did you? Unbutton your jacket!" 202 | 203 | The trouble vanished out of Tom's face. He opened his jacket. His 204 | shirt collar was securely sewed. 205 | 206 | "Bother! Well, go 'long with you. I'd made sure you'd played hookey 207 | and been a-swimming. But I forgive ye, Tom. I reckon you're a kind of a 208 | singed cat, as the saying is--better'n you look. THIS time." 209 | 210 | She was half sorry her sagacity had miscarried, and half glad that Tom 211 | had stumbled into obedient conduct for once. 212 | 213 | But Sidney said: 214 | 215 | "Well, now, if I didn't think you sewed his collar with white thread, 216 | but it's black." 217 | 218 | "Why, I did sew it with white! Tom!" 219 | 220 | But Tom did not wait for the rest. As he went out at the door he said: 221 | 222 | "Siddy, I'll lick you for that." 223 | 224 | In a safe place Tom examined two large needles which were thrust into 225 | the lapels of his jacket, and had thread bound about them--one needle 226 | carried white thread and the other black. He said: 227 | 228 | "She'd never noticed if it hadn't been for Sid. Confound it! sometimes 229 | she sews it with white, and sometimes she sews it with black. I wish to 230 | geeminy she'd stick to one or t'other--I can't keep the run of 'em. But 231 | I bet you I'll lam Sid for that. I'll learn him!" 232 | 233 | He was not the Model Boy of the village. 
He knew the model boy very 234 | well though--and loathed him. 235 | 236 | Within two minutes, or even less, he had forgotten all his troubles. 237 | Not because his troubles were one whit less heavy and bitter to him 238 | than a man's are to a man, but because a new and powerful interest bore 239 | them down and drove them out of his mind for the time--just as men's 240 | misfortunes are forgotten in the excitement of new enterprises. This 241 | new interest was a valued novelty in whistling, which he had just 242 | acquired from a negro, and he was suffering to practise it undisturbed. 243 | It consisted in a peculiar bird-like turn, a sort of liquid warble, 244 | produced by touching the tongue to the roof of the mouth at short 245 | intervals in the midst of the music--the reader probably remembers how 246 | to do it, if he has ever been a boy. Diligence and attention soon gave 247 | him the knack of it, and he strode down the street with his mouth full 248 | of harmony and his soul full of gratitude. He felt much as an 249 | astronomer feels who has discovered a new planet--no doubt, as far as 250 | strong, deep, unalloyed pleasure is concerned, the advantage was with 251 | the boy, not the astronomer. 252 | 253 | The summer evenings were long. It was not dark, yet. Presently Tom 254 | checked his whistle. A stranger was before him--a boy a shade larger 255 | than himself. A new-comer of any age or either sex was an impressive 256 | curiosity in the poor little shabby village of St. Petersburg. This boy 257 | was well dressed, too--well dressed on a week-day. This was simply 258 | astounding. His cap was a dainty thing, his close-buttoned blue cloth 259 | roundabout was new and natty, and so were his pantaloons. He had shoes 260 | on--and it was only Friday. He even wore a necktie, a bright bit of 261 | ribbon. He had a citified air about him that ate into Tom's vitals. 
The 262 | more Tom stared at the splendid marvel, the higher he turned up his 263 | nose at his finery and the shabbier and shabbier his own outfit seemed 264 | to him to grow. Neither boy spoke. If one moved, the other moved--but 265 | only sidewise, in a circle; they kept face to face and eye to eye all 266 | the time. Finally Tom said: 267 | 268 | "I can lick you!" 269 | 270 | "I'd like to see you try it." 271 | 272 | "Well, I can do it." 273 | 274 | "No you can't, either." 275 | 276 | "Yes I can." 277 | 278 | "No you can't." 279 | 280 | "I can." 281 | 282 | "You can't." 283 | 284 | "Can!" 285 | 286 | "Can't!" 287 | 288 | An uncomfortable pause. Then Tom said: 289 | 290 | "What's your name?" 291 | 292 | "'Tisn't any of your business, maybe." 293 | 294 | "Well I 'low I'll MAKE it my business." 295 | 296 | "Well why don't you?" 297 | 298 | "If you say much, I will." 299 | 300 | "Much--much--MUCH. There now." 301 | 302 | "Oh, you think you're mighty smart, DON'T you? I could lick you with 303 | one hand tied behind me, if I wanted to." 304 | 305 | "Well why don't you DO it? You SAY you can do it." 306 | 307 | "Well I WILL, if you fool with me." 308 | 309 | "Oh yes--I've seen whole families in the same fix." 310 | 311 | "Smarty! You think you're SOME, now, DON'T you? Oh, what a hat!" 312 | 313 | "You can lump that hat if you don't like it. I dare you to knock it 314 | off--and anybody that'll take a dare will suck eggs." 315 | 316 | "You're a liar!" 317 | 318 | "You're another." 319 | 320 | "You're a fighting liar and dasn't take it up." 321 | 322 | "Aw--take a walk!" 323 | 324 | "Say--if you give me much more of your sass I'll take and bounce a 325 | rock off'n your head." 326 | 327 | "Oh, of COURSE you will." 328 | 329 | "Well I WILL." 330 | 331 | "Well why don't you DO it then? What do you keep SAYING you will for? 332 | Why don't you DO it? It's because you're afraid." 333 | 334 | "I AIN'T afraid." 335 | 336 | "You are." 337 | 338 | "I ain't." 
339 | 340 | "You are." 341 | 342 | Another pause, and more eying and sidling around each other. Presently 343 | they were shoulder to shoulder. Tom said: 344 | 345 | "Get away from here!" 346 | 347 | "Go away yourself!" 348 | 349 | "I won't." 350 | 351 | "I won't either." 352 | 353 | So they stood, each with a foot placed at an angle as a brace, and 354 | both shoving with might and main, and glowering at each other with 355 | hate. But neither could get an advantage. After struggling till both 356 | were hot and flushed, each relaxed his strain with watchful caution, 357 | and Tom said: 358 | 359 | "You're a coward and a pup. I'll tell my big brother on you, and he 360 | can thrash you with his little finger, and I'll make him do it, too." 361 | 362 | "What do I care for your big brother? I've got a brother that's bigger 363 | than he is--and what's more, he can throw him over that fence, too." 364 | [Both brothers were imaginary.] 365 | 366 | "That's a lie." 367 | 368 | "YOUR saying so don't make it so." 369 | 370 | Tom drew a line in the dust with his big toe, and said: 371 | 372 | "I dare you to step over that, and I'll lick you till you can't stand 373 | up. Anybody that'll take a dare will steal sheep." 374 | 375 | The new boy stepped over promptly, and said: 376 | 377 | "Now you said you'd do it, now let's see you do it." 378 | 379 | "Don't you crowd me now; you better look out." 380 | 381 | "Well, you SAID you'd do it--why don't you do it?" 382 | 383 | "By jingo! for two cents I WILL do it." 384 | 385 | The new boy took two broad coppers out of his pocket and held them out 386 | with derision. Tom struck them to the ground. In an instant both boys 387 | were rolling and tumbling in the dirt, gripped together like cats; and 388 | for the space of a minute they tugged and tore at each other's hair and 389 | clothes, punched and scratched each other's nose, and covered 390 | themselves with dust and glory. 
Presently the confusion took form, and 391 | through the fog of battle Tom appeared, seated astride the new boy, and 392 | pounding him with his fists. "Holler 'nuff!" said he. 393 | 394 | The boy only struggled to free himself. He was crying--mainly from rage. 395 | 396 | "Holler 'nuff!"--and the pounding went on. 397 | -------------------------------------------------------------------------------- /test/pycode1.txt: -------------------------------------------------------------------------------- 1 | class NamedInitializer(Node): 2 | __slots__ = ('name', 'expr', 'coord', '__weakref__') 3 | def __init__(self, name, expr, coord=None): 4 | self.name = name 5 | self.expr = expr 6 | self.coord = coord 7 | 8 | def children(self): 9 | nodelist = [] 10 | if self.expr is not None: nodelist.append(("expr", self.expr)) 11 | for i, child in enumerate(self.name or []): 12 | nodelist.append(("name[%d]" % i, child)) 13 | return tuple(nodelist) 14 | 15 | def __iter__(self): 16 | if self.expr is not None: 17 | yield self.expr 18 | for child in (self.name or []): 19 | yield child 20 | 21 | attr_names = () 22 | 23 | class ParamList(Node): 24 | __slots__ = ('params', 'coord', '__weakref__') 25 | def __init__(self, params, coord=None): 26 | self.params = params 27 | self.coord = coord 28 | 29 | def children(self): 30 | nodelist = [] 31 | for i, child in enumerate(self.params or []): 32 | nodelist.append(("params[%d]" % i, child)) 33 | return tuple(nodelist) 34 | 35 | def __iter__(self): 36 | for child in (self.params or []): 37 | yield child 38 | 39 | attr_names = () 40 | 41 | class PtrDecl(Node): 42 | __slots__ = ('quals', 'type', 'coord', '__weakref__') 43 | def __init__(self, quals, type, coord=None): 44 | self.quals = quals 45 | self.type = type 46 | self.coord = coord 47 | 48 | def children(self): 49 | nodelist = [] 50 | if self.type is not None: nodelist.append(("type", self.type)) 51 | return tuple(nodelist) 52 | 53 | def __iter__(self): 54 | if self.type is not None: 55 | 
yield self.type 56 | 57 | attr_names = ('quals', ) 58 | 59 | class Return(Node): 60 | __slots__ = ('expr', 'coord', '__weakref__') 61 | def __init__(self, expr, coord=None): 62 | self.expr = expr 63 | self.coord = coord 64 | 65 | def children(self): 66 | nodelist = [] 67 | if self.expr is not None: nodelist.append(("expr", self.expr)) 68 | return tuple(nodelist) 69 | 70 | def __iter__(self): 71 | if self.expr is not None: 72 | yield self.expr 73 | 74 | attr_names = () 75 | 76 | class StaticAssert(Node): 77 | __slots__ = ('cond', 'message', 'coord', '__weakref__') 78 | def __init__(self, cond, message, coord=None): 79 | self.cond = cond 80 | self.message = message 81 | self.coord = coord 82 | 83 | def children(self): 84 | nodelist = [] 85 | if self.cond is not None: nodelist.append(("cond", self.cond)) 86 | if self.message is not None: nodelist.append(("message", self.message)) 87 | return tuple(nodelist) 88 | 89 | def __iter__(self): 90 | if self.cond is not None: 91 | yield self.cond 92 | if self.message is not None: 93 | yield self.message 94 | 95 | attr_names = () 96 | 97 | class Struct(Node): 98 | __slots__ = ('name', 'decls', 'coord', '__weakref__') 99 | def __init__(self, name, decls, coord=None): 100 | self.name = name 101 | self.decls = decls 102 | self.coord = coord 103 | 104 | def children(self): 105 | nodelist = [] 106 | for i, child in enumerate(self.decls or []): 107 | nodelist.append(("decls[%d]" % i, child)) 108 | return tuple(nodelist) 109 | 110 | def __iter__(self): 111 | for child in (self.decls or []): 112 | yield child 113 | 114 | attr_names = ('name', ) 115 | 116 | class StructRef(Node): 117 | __slots__ = ('name', 'type', 'field', 'coord', '__weakref__') 118 | def __init__(self, name, type, field, coord=None): 119 | self.name = name 120 | self.type = type 121 | self.field = field 122 | self.coord = coord 123 | 124 | def children(self): 125 | nodelist = [] 126 | if self.name is not None: nodelist.append(("name", self.name)) 127 | if self.field 
is not None: nodelist.append(("field", self.field)) 128 | return tuple(nodelist) 129 | 130 | def __iter__(self): 131 | if self.name is not None: 132 | yield self.name 133 | if self.field is not None: 134 | yield self.field 135 | 136 | attr_names = ('type', ) 137 | 138 | class Switch(Node): 139 | __slots__ = ('cond', 'stmt', 'coord', '__weakref__') 140 | def __init__(self, cond, stmt, coord=None): 141 | self.cond = cond 142 | self.stmt = stmt 143 | self.coord = coord 144 | 145 | def children(self): 146 | nodelist = [] 147 | if self.cond is not None: nodelist.append(("cond", self.cond)) 148 | if self.stmt is not None: nodelist.append(("stmt", self.stmt)) 149 | return tuple(nodelist) 150 | 151 | def __iter__(self): 152 | if self.cond is not None: 153 | yield self.cond 154 | if self.stmt is not None: 155 | yield self.stmt 156 | 157 | attr_names = () 158 | 159 | class TernaryOp(Node): 160 | __slots__ = ('cond', 'iftrue', 'iffalse', 'coord', '__weakref__') 161 | def __init__(self, cond, iftrue, iffalse, coord=None): 162 | self.cond = cond 163 | self.iftrue = iftrue 164 | self.iffalse = iffalse 165 | self.coord = coord 166 | 167 | def children(self): 168 | nodelist = [] 169 | if self.cond is not None: nodelist.append(("cond", self.cond)) 170 | if self.iftrue is not None: nodelist.append(("iftrue", self.iftrue)) 171 | if self.iffalse is not None: nodelist.append(("iffalse", self.iffalse)) 172 | return tuple(nodelist) 173 | 174 | def __iter__(self): 175 | if self.cond is not None: 176 | yield self.cond 177 | if self.iftrue is not None: 178 | yield self.iftrue 179 | if self.iffalse is not None: 180 | yield self.iffalse 181 | 182 | attr_names = () 183 | 184 | class TypeDecl(Node): 185 | __slots__ = ('declname', 'quals', 'align', 'type', 'coord', '__weakref__') 186 | def __init__(self, declname, quals, align, type, coord=None): 187 | self.declname = declname 188 | self.quals = quals 189 | self.align = align 190 | self.type = type 191 | self.coord = coord 192 | 193 | def 
children(self): 194 | nodelist = [] 195 | if self.type is not None: nodelist.append(("type", self.type)) 196 | return tuple(nodelist) 197 | 198 | def __iter__(self): 199 | if self.type is not None: 200 | yield self.type 201 | 202 | attr_names = ('declname', 'quals', 'align', ) 203 | 204 | class Typedef(Node): 205 | __slots__ = ('name', 'quals', 'storage', 'type', 'coord', '__weakref__') 206 | def __init__(self, name, quals, storage, type, coord=None): 207 | self.name = name 208 | self.quals = quals 209 | self.storage = storage 210 | self.type = type 211 | self.coord = coord 212 | 213 | def children(self): 214 | nodelist = [] 215 | if self.type is not None: nodelist.append(("type", self.type)) 216 | return tuple(nodelist) 217 | 218 | def __iter__(self): 219 | if self.type is not None: 220 | yield self.type 221 | 222 | attr_names = ('name', 'quals', 'storage', ) 223 | 224 | class Typename(Node): 225 | __slots__ = ('name', 'quals', 'align', 'type', 'coord', '__weakref__') 226 | def __init__(self, name, quals, align, type, coord=None): 227 | self.name = name 228 | self.quals = quals 229 | self.align = align 230 | self.type = type 231 | self.coord = coord 232 | 233 | def children(self): 234 | nodelist = [] 235 | if self.type is not None: nodelist.append(("type", self.type)) 236 | return tuple(nodelist) 237 | 238 | def __iter__(self): 239 | if self.type is not None: 240 | yield self.type 241 | 242 | attr_names = ('name', 'quals', 'align', ) 243 | 244 | class UnaryOp(Node): 245 | __slots__ = ('op', 'expr', 'coord', '__weakref__') 246 | def __init__(self, op, expr, coord=None): 247 | self.op = op 248 | self.expr = expr 249 | self.coord = coord 250 | 251 | def children(self): 252 | nodelist = [] 253 | if self.expr is not None: nodelist.append(("expr", self.expr)) 254 | return tuple(nodelist) 255 | 256 | def __iter__(self): 257 | if self.expr is not None: 258 | yield self.expr 259 | 260 | attr_names = ('op', ) 261 | 262 | class Union(Node): 263 | __slots__ = ('name', 
'decls', 'coord', '__weakref__') 264 | def __init__(self, name, decls, coord=None): 265 | self.name = name 266 | self.decls = decls 267 | self.coord = coord 268 | 269 | def children(self): 270 | nodelist = [] 271 | for i, child in enumerate(self.decls or []): 272 | nodelist.append(("decls[%d]" % i, child)) 273 | return tuple(nodelist) 274 | 275 | def __iter__(self): 276 | for child in (self.decls or []): 277 | yield child 278 | 279 | attr_names = ('name', ) 280 | 281 | class While(Node): 282 | __slots__ = ('cond', 'stmt', 'coord', '__weakref__') 283 | def __init__(self, cond, stmt, coord=None): 284 | self.cond = cond 285 | self.stmt = stmt 286 | self.coord = coord 287 | 288 | def children(self): 289 | nodelist = [] 290 | if self.cond is not None: nodelist.append(("cond", self.cond)) 291 | if self.stmt is not None: nodelist.append(("stmt", self.stmt)) 292 | return tuple(nodelist) 293 | 294 | def __iter__(self): 295 | if self.cond is not None: 296 | yield self.cond 297 | if self.stmt is not None: 298 | yield self.stmt 299 | 300 | attr_names = () 301 | 302 | class Pragma(Node): 303 | __slots__ = ('string', 'coord', '__weakref__') 304 | def __init__(self, string, coord=None): 305 | self.string = string 306 | self.coord = coord 307 | 308 | def children(self): 309 | nodelist = [] 310 | return tuple(nodelist) 311 | 312 | def __iter__(self): 313 | return 314 | yield 315 | 316 | attr_names = ('string', ) 317 | -------------------------------------------------------------------------------- /test/sp-dump-ids.py: -------------------------------------------------------------------------------- 1 | # Uses the sentencepiece package to tokenize the file provided as a command-line 2 | # argument; emits all token IDs to stdout, one per line. 3 | # 4 | # Requires the MODELPATH env var to be set to the binary proto describing 5 | # the tokenizer model. 
6 | import sentencepiece as spm 7 | import os, sys 8 | 9 | with open(sys.argv[1], "r", newline="") as f: 10 | text = f.read() 11 | sp = spm.SentencePieceProcessor(model_file=os.getenv("MODELPATH")) 12 | ids = sp.encode(text) 13 | 14 | # Print ids out, one per line 15 | for id in ids: 16 | print(id) 17 | -------------------------------------------------------------------------------- /token.go: -------------------------------------------------------------------------------- 1 | package sentencepiece 2 | 3 | import "fmt" 4 | 5 | // Token represents a single token from the input text. ID is a unique token 6 | // identifier that the model uses in its internal representation. Text is 7 | // the piece of text this token represents. 8 | type Token struct { 9 | ID int 10 | Text string 11 | } 12 | 13 | func (t Token) String() string { 14 | return fmt.Sprintf("Token{ID: %v, Text: %q}", t.ID, t.Text) 15 | } 16 | --------------------------------------------------------------------------------