├── VERSION.txt ├── python ├── test │ ├── __init__.py │ ├── botchan.txt │ ├── test_model.model │ └── test_ja_model.model ├── VERSION.txt ├── .gitignore ├── setup.cfg ├── MANIFEST.in ├── build_bundled.sh ├── make_py_wheel.sh ├── make_py_wheel_mac.sh ├── add_new_vocab.ipynb └── setup.py ├── tensorflow ├── .gitignore └── README.md ├── third_party ├── CMakeLists.txt ├── absl │ ├── flags │ │ ├── parse.h │ │ └── flag.h │ ├── container │ │ ├── flat_hash_set.h │ │ └── flat_hash_map.h │ ├── strings │ │ ├── strip.h │ │ ├── numbers.h │ │ ├── str_format.h │ │ ├── match.h │ │ ├── ascii.h │ │ ├── str_cat.h │ │ ├── str_replace.h │ │ ├── str_join.h │ │ └── str_split.h │ └── memory │ │ └── memory.h ├── esaxx │ ├── LICENSE │ └── esa.hxx ├── darts_clone │ └── LICENSE └── protobuf-lite │ ├── LICENSE │ ├── google │ └── protobuf │ │ ├── port.h │ │ ├── stubs │ │ ├── once.h │ │ ├── stl_util.h │ │ ├── time.h │ │ ├── stringprintf.h │ │ ├── hash.h │ │ └── status.h │ │ ├── generated_enum_util.h │ │ ├── has_bits.h │ │ ├── generated_enum_reflection.h │ │ └── port_undef.inc │ ├── statusor.cc │ ├── zero_copy_stream.cc │ ├── implicit_weak_message.cc │ ├── generated_enum_util.cc │ └── status.cc ├── config.h.in ├── sentencepiece.pc.in ├── appveyor.yml ├── src ├── model_factory.h ├── test_main.cc ├── word_model.h ├── char_model.h ├── trainer_factory.h ├── word_model.cc ├── char_model_trainer.h ├── unicode_script.cc ├── freelist_test.cc ├── char_model.cc ├── init.h ├── word_model_trainer.h ├── unicode_script_test.cc ├── filesystem_test.cc ├── model_factory.cc ├── trainer_factory_test.cc ├── bpe_model.h ├── model_factory_test.cc ├── char_model_trainer.cc ├── filesystem.h ├── testharness.cc ├── spm_export_vocab_main.cc ├── pretokenizer_for_training.cc ├── trainer_factory.cc ├── pretokenizer_for_training.h ├── word_model_trainer.cc ├── freelist.h ├── word_model_trainer_test.cc ├── sentencepiece.proto ├── char_model_trainer_test.cc ├── word_model_test.cc ├── pretokenizer_for_training_test.cc ├── 
unigram_model_trainer_test.cc ├── unicode_script.h ├── char_model_test.cc ├── filesystem.cc ├── unigram_model_trainer.h ├── spm_decode_main.cc └── spm_normalize_main.cc ├── .gitignore ├── doc ├── special_symbols.md ├── normalization.md └── options.md ├── data ├── extract_headers.pl └── gen_unicode_scripts_code.pl ├── test.bat ├── CONTRIBUTING.md └── test.sh /VERSION.txt: -------------------------------------------------------------------------------- 1 | 0.1.96 2 | -------------------------------------------------------------------------------- /python/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/VERSION.txt: -------------------------------------------------------------------------------- 1 | 0.1.96 2 | -------------------------------------------------------------------------------- /python/test/botchan.txt: -------------------------------------------------------------------------------- 1 | ../../data/botchan.txt -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | /*.so 2 | /build 3 | /*.pickle 4 | -------------------------------------------------------------------------------- /python/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /tensorflow/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | sdist/ 3 | dist/ 4 | tmp/ 5 | *py[cod] 6 | -------------------------------------------------------------------------------- /third_party/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 
include_directories(absl/strings darts_clone esaxx protobuf-lite) 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /python/test/test_model.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/sentencepiece/master/python/test/test_model.model -------------------------------------------------------------------------------- /python/test/test_ja_model.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/sentencepiece/master/python/test/test_ja_model.model -------------------------------------------------------------------------------- /python/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include test *.py *.model botchan.txt 2 | recursive-include src *.i 3 | include *.md VERSION.* build_bundled.sh 4 | -------------------------------------------------------------------------------- /config.h.in: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H_ 2 | #define CONFIG_H_ 3 | 4 | #define VERSION "@PROJECT_VERSION@" 5 | #define PACKAGE "@PROJECT_NAME@" 6 | #define PACKAGE_STRING "@PROJECT_NAME@" 7 | 8 | 9 | #endif // CONFIG_H_ 10 | -------------------------------------------------------------------------------- /sentencepiece.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@prefix@ 2 | exec_prefix=@exec_prefix@ 3 | libdir=@libdir@ 4 | includedir=@includedir@ 5 | 6 | Name: @PROJECT_NAME@ 7 | Description: Unsupervised text tokenizer and detokenizer for Neural Network-based text generation. 
8 | Version: @PROJECT_VERSION@ 9 | Libs: -L${libdir} -lsentencepiece -lsentencepiece_train @libprotobuf_lite@ @pkgconfiglibs@ 10 | Cflags: -I${includedir} @pkgconfigcflags@ 11 | -------------------------------------------------------------------------------- /python/build_bundled.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | VERSION="$1" 4 | 5 | mkdir bundled 6 | cd bundled 7 | # Try tagged version. Otherwise, use head. 8 | git clone https://github.com/google/sentencepiece.git \ 9 | -b v"${VERSION}" --depth 1 || \ 10 | git clone https://github.com/google/sentencepiece.git --depth 1 11 | 12 | cd sentencepiece 13 | mkdir build 14 | cd build 15 | cmake .. -DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=../.. 16 | make -j $(nproc) 17 | make install 18 | cd ../.. 19 | -------------------------------------------------------------------------------- /third_party/absl/flags/parse.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #ifndef ABSL_FLAGS_PARSE_H_ 16 | #define ABSL_FLAGS_PARSE_H_ 17 | 18 | #include 19 | 20 | namespace absl { 21 | 22 | std::vector ParseCommandLine(int argc, char *argv[]); 23 | } // namespace absl 24 | 25 | #endif // ABSL_FLAGS_PARSE_H_ 26 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | version: '{branch} build {build}' 2 | image: Visual Studio 2019 3 | platform: 4 | - x64 5 | - Win32 6 | configuration: Release 7 | clone_depth: 50 8 | clone_folder: c:\projects\sentencepiece 9 | #init: 10 | # - ps: iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) 11 | #on_finish: 12 | # - ps: $blockRdp = $true; iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) 13 | build_script: 14 | - cmd: call test.bat %platform% 15 | artifacts: 16 | - path: build\sentencepiece*.7z 17 | - path: python\dist\*.whl 18 | deploy: 19 | description: 'SentencePiece Windows release' 20 | provider: GitHub 21 | auth_token: 22 | secure: Aq4jHo/HY6WFFKs1h9cCWfi3U4ZsVTooUEhtgBfcJM6SUhnZdPVazIcKCtiR32kc 23 | draft: false 24 | prerelease: false 25 | on: 26 | branch: master 27 | appveyor_repo_tag: true 28 | -------------------------------------------------------------------------------- /src/model_factory.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef MODEL_FACTORY_H_ 16 | #define MODEL_FACTORY_H_ 17 | 18 | #include 19 | 20 | #include "model_interface.h" 21 | #include "sentencepiece_model.pb.h" 22 | 23 | namespace sentencepiece { 24 | 25 | class ModelFactory { 26 | public: 27 | // Creates Model instance from |model_proto|. 28 | static std::unique_ptr Create(const ModelProto &model_proto); 29 | }; 30 | } // namespace sentencepiece 31 | #endif // MODEL_FACTORY_H_ 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Makefile 2 | Makefile.in 3 | /ar-lib 4 | /mdate-sh 5 | /py-compile 6 | /test-driver 7 | /ylwrap 8 | /build 9 | 10 | /autom4te.cache 11 | /autoscan.log 12 | /autoscan-*.log 13 | /aclocal.m4 14 | /compile 15 | /config.guess 16 | /config.sub 17 | /configure 18 | /configure.scan 19 | /depcomp 20 | /install-sh 21 | /missing 22 | /stamp-h1 23 | /libtool 24 | /config.h 25 | /config.status 26 | /autogen.sh 27 | /ltmain.sh 28 | 29 | CMakeFiles 30 | CMakeCache.txt 31 | config.h 32 | sentencepiece.pc 33 | CPackConfig.cmake 34 | CTestTestfile.cmake 35 | CPackSourceConfig.cmake 36 | DartConfiguration.tcl 37 | 38 | *.o 39 | *.lo 40 | *.a 41 | *.la 42 | *.pyc 43 | 44 | .libs 45 | .deps 46 | 47 | *.m4 48 | *.log 49 | *.trs 50 | 51 | compile_charsmap 52 | 53 | spm_decode 54 | spm_encode 55 | spm_export_vocab 56 | spm_train 57 | spm_normalize 58 | spm_test 59 | 60 | .DS_Store 61 | *.egg-info/ 62 | dist/ 63 | *.swp 64 | *.swo 
65 | *.pyc 66 | 67 | m.model 68 | m.vocab 69 | 70 | cmake_install.cmake 71 | libsentencepiece.so* 72 | libsentencepiece_train.so* 73 | python/bundled 74 | _sentencepiece.*.so 75 | third_party/abseil-cpp 76 | -------------------------------------------------------------------------------- /third_party/absl/container/flat_hash_set.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef ABSL_CONTAINER_FLAT_HASH_SET_ 16 | #define ABSL_CONTAINER_FLAT_HASH_SET_ 17 | 18 | #include 19 | 20 | namespace absl { 21 | 22 | template , 23 | typename Eq = std::equal_to, 24 | typename Allocator = std::allocator> 25 | using flat_hash_set = std::unordered_set; 26 | 27 | } 28 | 29 | #endif // ABSL_CONTAINER_FLAT_HASH_SET_ 30 | -------------------------------------------------------------------------------- /third_party/absl/strings/strip.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_STRIP_H_ 17 | #define ABSL_STRINGS_STRIP_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/match.h" 22 | 23 | namespace absl { 24 | 25 | inline bool ConsumePrefix(absl::string_view *str, absl::string_view expected) { 26 | if (!absl::StartsWith(*str, expected)) return false; 27 | str->remove_prefix(expected.size()); 28 | return true; 29 | } 30 | 31 | } // namespace absl 32 | #endif // ABSL_STRINGS_STRIP_H 33 | -------------------------------------------------------------------------------- /src/test_main.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "init.h" 16 | #include "testharness.h" 17 | 18 | #ifdef OS_WIN 19 | ABSL_FLAG(std::string, test_srcdir, "..\\data", "Data directory."); 20 | #else 21 | ABSL_FLAG(std::string, test_srcdir, "../data", "Data directory."); 22 | #endif 23 | 24 | ABSL_FLAG(std::string, test_tmpdir, "test_tmp", "Temporary directory."); 25 | 26 | int main(int argc, char **argv) { 27 | sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); 28 | sentencepiece::test::RunAllTests(); 29 | return 0; 30 | } 31 | -------------------------------------------------------------------------------- /third_party/absl/container/flat_hash_map.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef ABSL_CONTAINER_FLAT_HASH_MAP_ 16 | #define ABSL_CONTAINER_FLAT_HASH_MAP_ 17 | 18 | #include 19 | 20 | namespace absl { 21 | 22 | template , 23 | typename Eq = std::equal_to, 24 | typename Allocator = std::allocator>> 25 | using flat_hash_map = std::unordered_map; 26 | 27 | } 28 | 29 | #endif // ABSL_CONTAINER_FLAT_HASH_MAP_ 30 | -------------------------------------------------------------------------------- /doc/special_symbols.md: -------------------------------------------------------------------------------- 1 | # Use custom symbols 2 | SentencePiece model supports two types of special symbols. 
3 | 4 | ## Control symbol 5 | Control symbols are used to encode special indicators that let the decoder change its behavior dynamically. 6 | Examples include language indicators in multilingual models. `<s>` and `</s>` are reserved control symbols. 7 | Control symbols must be inserted outside of the SentencePiece segmentation. Developers are responsible for inserting these symbols during data generation and decoding. 8 | 9 | It is guaranteed that control symbols have no corresponding surface strings in the original user input. Control symbols are decoded into empty strings. 10 | 11 | ## User defined symbol 12 | A user-defined symbol is handled as one piece in any context. If this symbol is included in the input text, it is always extracted as one piece. 13 | 14 | ## Specify special symbols in training time 15 | Use the `--control_symbols` and `--user_defined_symbols` flags as follows: 16 | 17 | ``` 18 | % spm_train --control_symbols=<foo>,<bar> --user_defined_symbols=<user1>,<user2> --input=<input file> --model_prefix=<model file> --vocab_size=8000 19 | ``` 20 | -------------------------------------------------------------------------------- /third_party/absl/strings/numbers.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // 16 | #ifndef ABSL_STRINGS_NUMBERS_H_ 17 | #define ABSL_STRINGS_NUMBERS_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/string_view.h" 22 | 23 | namespace absl { 24 | 25 | // TODO(taku): Re-implement this, as it is slow. 26 | template 27 | inline bool SimpleAtoi(absl::string_view s, T *result) { 28 | std::stringstream ss; 29 | return (ss << s.data() && ss >> *result); 30 | } 31 | 32 | } // namespace absl 33 | #endif // ABSL_STRINGS_NUMBERS_H_ 34 | -------------------------------------------------------------------------------- /src/word_model.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef WORD_MODEL_H_ 16 | #define WORD_MODEL_H_ 17 | 18 | #include "model_interface.h" 19 | #include "sentencepiece_model.pb.h" 20 | 21 | namespace sentencepiece { 22 | namespace word { 23 | 24 | // Tokenize text with whitespaces. 
25 | class Model : public ModelInterface { 26 | public: 27 | explicit Model(const ModelProto &model_proto); 28 | ~Model() override; 29 | 30 | EncodeResult Encode(absl::string_view normalized) const override; 31 | }; 32 | } // namespace word 33 | } // namespace sentencepiece 34 | #endif // WORD_MODEL_H_ 35 | -------------------------------------------------------------------------------- /src/char_model.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef CHAR_MODEL_H_ 16 | #define CHAR_MODEL_H_ 17 | 18 | #include "model_interface.h" 19 | #include "sentencepiece_model.pb.h" 20 | 21 | namespace sentencepiece { 22 | namespace character { 23 | 24 | // Tokenize text into character sequence 25 | class Model : public ModelInterface { 26 | public: 27 | explicit Model(const ModelProto &model_proto); 28 | ~Model() override; 29 | 30 | EncodeResult Encode(absl::string_view normalized) const override; 31 | }; 32 | } // namespace character 33 | } // namespace sentencepiece 34 | #endif // CHAR_MODEL_H_ 35 | -------------------------------------------------------------------------------- /third_party/esaxx/LICENSE: -------------------------------------------------------------------------------- 1 | This is the esaxx copyright. 2 | 3 | Copyright (c) 2010 Daisuke Okanohara All Rights Reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person 6 | obtaining a copy of this software and associated documentation 7 | files (the "Software"), to deal in the Software without 8 | restriction, including without limitation the rights to use, 9 | copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the 11 | Software is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | OTHER DEALINGS IN THE SOFTWARE. 25 | -------------------------------------------------------------------------------- /src/trainer_factory.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #ifndef TRAINER_FACTORY_H_ 16 | #define TRAINER_FACTORY_H_ 17 | 18 | #include 19 | 20 | #include "sentencepiece_model.pb.h" 21 | #include "trainer_interface.h" 22 | 23 | namespace sentencepiece { 24 | 25 | class TrainerFactory { 26 | public: 27 | // Creates Trainer instance from |trainer_spec| and |normalizer_spec|. 28 | static std::unique_ptr Create( 29 | const TrainerSpec &trainer_spec, const NormalizerSpec &normalizer_spec, 30 | const NormalizerSpec &denormalizer_spec); 31 | }; 32 | } // namespace sentencepiece 33 | #endif // TRAINER_FACTORY_H_ 34 | -------------------------------------------------------------------------------- /third_party/absl/strings/str_format.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_STR_FORMAT_H 17 | #define ABSL_STRINGS_STR_FORMAT_H 18 | 19 | #include 20 | 21 | #include 22 | 23 | #include "third_party/absl/strings/string_view.h" 24 | 25 | namespace absl { 26 | 27 | template 28 | std::string StrFormat(const char *format, Args const &... 
args) { 29 | const int len = ::snprintf(nullptr, 0, format, args...); 30 | std::string s; 31 | s.resize(len); 32 | ::snprintf(&s[0], s.size() + 1, format, args...); 33 | return s; 34 | } 35 | 36 | } // namespace absl 37 | #endif // ABSL_MEMORY_MEMORY_H_ 38 | -------------------------------------------------------------------------------- /data/extract_headers.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2018 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Extract header files required for build protobuf-lite 18 | # 19 | # usage: ./extract_headers.pl *.cc 20 | 21 | use strict; 22 | use warnings; 23 | 24 | sub Process() { 25 | my $file = shift @_; 26 | if ($file =~ /\.h$/) { 27 | print "$file\n"; 28 | } 29 | return unless open(F, $file); 30 | my @files = (); 31 | while () { 32 | chomp; 33 | if (/\#include <(google\/protobuf\/[^>]+)>/) { 34 | push @files, $1; 35 | } 36 | } 37 | close(F); 38 | for my $file (@files) { 39 | &Process($file); 40 | } 41 | } 42 | 43 | for my $f (@ARGV) { 44 | &Process($f); 45 | } 46 | -------------------------------------------------------------------------------- /src/word_model.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "util.h" 16 | #include "word_model.h" 17 | 18 | namespace sentencepiece { 19 | namespace word { 20 | 21 | Model::Model(const ModelProto &model_proto) { 22 | model_proto_ = &model_proto; 23 | InitializePieces(); 24 | } 25 | 26 | Model::~Model() {} 27 | 28 | EncodeResult Model::Encode(absl::string_view normalized) const { 29 | if (!status().ok() || normalized.empty()) { 30 | return {}; 31 | } 32 | 33 | EncodeResult output; 34 | for (const auto &w : SplitIntoWords(normalized)) { 35 | output.emplace_back(w, PieceToId(w)); 36 | } 37 | 38 | return output; 39 | } 40 | 41 | } // namespace word 42 | } // namespace sentencepiece 43 | -------------------------------------------------------------------------------- /test.bat: -------------------------------------------------------------------------------- 1 | set PLATFORM=%1 2 | if "%PLATFORM%"=="" set PLATFORM=x64 3 | set PLATFORM_PREFIX= 4 | if "%PLATFORM%"=="x64" set PLATFORM_PREFIX=-x64 5 | set _CL_=/utf-8 6 | set PATH=c:\Program Files\Git\usr\bin;c:\MinGW\bin;%PATH% 7 | set CURRENT_PATH=%~dp0 8 | set LIBRARY_PATH=%CURRENT_PATH%build\root 9 | 10 | mkdir build 11 | cd build 12 | 13 | cmake .. -A %PLATFORM% -DSPM_BUILD_TEST=ON -DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=%LIBRARY_PATH% 14 | cmake --build . 
--config Release --target install || goto :error 15 | ctest -C Release || goto :error 16 | cpack || goto :error 17 | 18 | cd ..\python 19 | rem call :BuildPython C:\Python27%PLATFORM_PREFIX% 20 | call :BuildPython C:\Python35%PLATFORM_PREFIX% 21 | call :BuildPython C:\Python36%PLATFORM_PREFIX% 22 | call :BuildPython C:\Python37%PLATFORM_PREFIX% 23 | call :BuildPython C:\Python38%PLATFORM_PREFIX% 24 | call :BuildPython C:\Python39%PLATFORM_PREFIX% 25 | c:\Python38%PLATFORM_PREFIX%\python setup.py sdist || goto :error 26 | exit 27 | 28 | :BuildPython 29 | %1\python -m pip install wheel || goto :error 30 | %1\python setup.py build || goto :error 31 | %1\python setup.py bdist_wheel || goto :error 32 | %1\python setup.py test || goto :error 33 | rmdir /Q /S build 34 | del /S *.pyd 35 | exit /b 36 | 37 | :error 38 | exit /b %errorlevel% 39 | -------------------------------------------------------------------------------- /src/char_model_trainer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef CHAR_MODEL_TRAINER_H_ 16 | #define CHAR_MODEL_TRAINER_H_ 17 | 18 | #include "sentencepiece_model.pb.h" 19 | #include "trainer_interface.h" 20 | 21 | namespace sentencepiece { 22 | namespace character { 23 | 24 | // Trainer class for character model. 
25 | class Trainer : public TrainerInterface { 26 | public: 27 | Trainer(const TrainerSpec &trainer_spec, 28 | const NormalizerSpec &normalizer_spec, 29 | const NormalizerSpec &denormalizer_spec) 30 | : TrainerInterface::TrainerInterface(trainer_spec, normalizer_spec, 31 | denormalizer_spec) {} 32 | 33 | util::Status Train() override; 34 | }; 35 | } // namespace character 36 | } // namespace sentencepiece 37 | #endif // CHAR_MODEL_TRAINER_H_ 38 | -------------------------------------------------------------------------------- /src/unicode_script.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include 16 | 17 | #include "third_party/absl/container/flat_hash_map.h" 18 | #include "unicode_script.h" 19 | #include "unicode_script_map.h" 20 | #include "util.h" 21 | 22 | namespace sentencepiece { 23 | namespace unicode_script { 24 | namespace { 25 | class GetScriptInternal { 26 | public: 27 | GetScriptInternal() { InitTable(&smap_); } 28 | 29 | ScriptType GetScript(char32 c) const { 30 | return port::FindWithDefault(smap_, c, ScriptType::U_Common); 31 | } 32 | 33 | private: 34 | absl::flat_hash_map smap_; 35 | }; 36 | } // namespace 37 | 38 | ScriptType GetScript(char32 c) { 39 | static GetScriptInternal sc; 40 | return sc.GetScript(c); 41 | } 42 | } // namespace unicode_script 43 | } // namespace sentencepiece 44 | -------------------------------------------------------------------------------- /src/freelist_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "freelist.h" 16 | #include "testharness.h" 17 | 18 | namespace sentencepiece { 19 | namespace model { 20 | 21 | TEST(FreeListTest, BasicTest) { 22 | FreeList l(5); 23 | EXPECT_EQ(0, l.size()); 24 | 25 | constexpr size_t kSize = 32; 26 | 27 | for (size_t i = 0; i < kSize; ++i) { 28 | int *n = l.Allocate(); 29 | EXPECT_EQ(0, *n); 30 | *n = i; 31 | } 32 | 33 | EXPECT_EQ(kSize, l.size()); 34 | for (size_t i = 0; i < kSize; ++i) { 35 | EXPECT_EQ(i, *l[i]); 36 | } 37 | 38 | l.Free(); 39 | EXPECT_EQ(0, l.size()); 40 | 41 | // Zero-initialized after `Free`. 42 | for (size_t i = 0; i < kSize; ++i) { 43 | int *n = l.Allocate(); 44 | EXPECT_EQ(0, *n); 45 | } 46 | } 47 | } // namespace model 48 | } // namespace sentencepiece 49 | -------------------------------------------------------------------------------- /third_party/absl/strings/match.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // 16 | #ifndef ABSL_STRINGS_MATCH_H_ 17 | #define ABSL_STRINGS_MATCH_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/string_view.h" 22 | 23 | namespace absl { 24 | 25 | inline bool StartsWith(absl::string_view text, absl::string_view prefix) { 26 | return prefix.empty() || 27 | (text.size() >= prefix.size() && 28 | memcmp(text.data(), prefix.data(), prefix.size()) == 0); 29 | } 30 | 31 | inline bool EndsWith(absl::string_view text, absl::string_view suffix) { 32 | return suffix.empty() || (text.size() >= suffix.size() && 33 | memcmp(text.data() + (text.size() - suffix.size()), 34 | suffix.data(), suffix.size()) == 0); 35 | } 36 | 37 | } // namespace absl 38 | #endif // ABSL_STRINGS_MATCH_H_ 39 | -------------------------------------------------------------------------------- /third_party/absl/strings/ascii.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // 16 | #ifndef ABSL_STRINGS_ASCII_H_ 17 | #define ABSL_STRINGS_ASCII_H_ 18 | 19 | #include 20 | 21 | #include 22 | 23 | #include "third_party/absl/strings/string_view.h" 24 | 25 | namespace absl { 26 | 27 | inline std::string AsciiStrToUpper(absl::string_view value) { 28 | std::string upper_value = std::string(value); 29 | std::transform(upper_value.begin(), upper_value.end(), upper_value.begin(), 30 | ::toupper); 31 | return upper_value; 32 | } 33 | 34 | inline std::string AsciiStrToLower(absl::string_view value) { 35 | std::string lower_value = std::string(value); 36 | std::transform(lower_value.begin(), lower_value.end(), lower_value.begin(), 37 | ::tolower); 38 | return lower_value; 39 | } 40 | } // namespace absl 41 | #endif // ABSL_STRINGS_ASCII_H_ 42 | -------------------------------------------------------------------------------- /src/char_model.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "char_model.h" 16 | #include "util.h" 17 | 18 | namespace sentencepiece { 19 | namespace character { 20 | 21 | Model::Model(const ModelProto &model_proto) { 22 | model_proto_ = &model_proto; 23 | InitializePieces(); 24 | } 25 | 26 | Model::~Model() {} 27 | 28 | EncodeResult Model::Encode(absl::string_view normalized) const { 29 | if (!status().ok() || normalized.empty()) { 30 | return {}; 31 | } 32 | 33 | // Splits the input into character sequence 34 | EncodeResult output; 35 | while (!normalized.empty()) { 36 | const int mblen = matcher_->PrefixMatch(normalized); 37 | absl::string_view w(normalized.data(), mblen); 38 | output.emplace_back(w, PieceToId(w)); 39 | normalized.remove_prefix(mblen); 40 | } 41 | 42 | return output; 43 | } 44 | 45 | } // namespace character 46 | } // namespace sentencepiece 47 | -------------------------------------------------------------------------------- /src/init.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #ifndef INIT_H_ 16 | #define INIT_H_ 17 | 18 | #include "common.h" 19 | #include "third_party/absl/flags/flag.h" 20 | #include "third_party/absl/flags/parse.h" 21 | 22 | ABSL_DECLARE_FLAG(int32, minloglevel); 23 | 24 | namespace sentencepiece { 25 | inline void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, 26 | bool remove_arg = true) { 27 | const auto unused_args = absl::ParseCommandLine(*argc, *argv); 28 | 29 | if (remove_arg) { 30 | char **argv_val = *argv; 31 | *argv = argv_val = argv_val + *argc - unused_args.size(); 32 | std::copy(unused_args.begin(), unused_args.end(), argv_val); 33 | *argc = static_cast(unused_args.size()); 34 | } 35 | 36 | logging::SetMinLogLevel(absl::GetFlag(FLAGS_minloglevel)); 37 | } 38 | } // namespace sentencepiece 39 | 40 | #endif // INIT_H_ 41 | -------------------------------------------------------------------------------- /third_party/darts_clone/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008-2011, Susumu Yata 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | - Neither the name of the nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
9 | 10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 11 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Want to contribute? Great! First, read this page (including the small print at the end). 2 | 3 | ### Before you contribute 4 | Before we can use your code, you must sign the 5 | [Google Individual Contributor License Agreement](https://cla.developers.google.com/about/google-individual) 6 | (CLA), which you can do online. The CLA is necessary mainly because you own the 7 | copyright to your changes even after your contribution becomes part of our 8 | codebase, so we need your permission to use and distribute your code. We also 9 | need to be sure of various other things—for instance, that you'll tell us if you 10 | know that your code infringes on other people's patents. You don't have to sign 11 | the CLA until after you've submitted your code for review and a member has 12 | approved it, but you must do it before we can put your code into our codebase. 
13 | Before you start working on a larger contribution, you should get in touch with 14 | us first through the issue tracker with your idea so that we can help out and 15 | possibly guide you. Coordinating up-front makes it much easier to avoid 16 | frustration later on. 17 | 18 | ### Code reviews 19 | All submissions, including submissions by project members, require review. We 20 | use Github pull requests for this purpose. 21 | 22 | ### The small print 23 | Contributions made by corporations are covered by a different agreement than 24 | the one above, the [Software Grant and Corporate Contributor License Agreement](https://cla.developers.google.com/about/google-corporate). 25 | -------------------------------------------------------------------------------- /src/word_model_trainer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef WORD_MODEL_TRAINER_H_ 16 | #define WORD_MODEL_TRAINER_H_ 17 | 18 | #include "sentencepiece_model.pb.h" 19 | #include "trainer_interface.h" 20 | 21 | namespace sentencepiece { 22 | namespace word { 23 | 24 | // Trainer class for word model. 25 | // 26 | // Word model simply counts the frequency of 27 | // space-delimited tokens, then keep top 28 | // |vocab_size| frequent tokens. 
29 | class Trainer : public TrainerInterface { 30 | public: 31 | Trainer(const TrainerSpec &trainer_spec, 32 | const NormalizerSpec &normalizer_spec, 33 | const NormalizerSpec &denormalizer_spec) 34 | : TrainerInterface::TrainerInterface(trainer_spec, normalizer_spec, 35 | denormalizer_spec) {} 36 | 37 | util::Status Train() override; 38 | }; 39 | } // namespace word 40 | } // namespace sentencepiece 41 | #endif // WORD_MODEL_TRAINER_H_ 42 | -------------------------------------------------------------------------------- /tensorflow/README.md: -------------------------------------------------------------------------------- 1 | # SentencePiece TensorFlow module 2 | 3 | ## WARNING 4 | tf_sentencepiece is going to be deprecated in tensorflow 2.3.0. tf_sentencepiece for tensorflow 2.2.0x is the last release of tf_sentencepiece. Use [tensoflow_text](https://github.com/tensorflow/text) to run sentencepiece on tensorflow. 5 | 6 | Example 7 | ```Python 8 | import tensorflow as tf 9 | import tensorflow_text as text 10 | 11 | model = open('test_model.model', 'rb').read() 12 | s1 = text.SentencepieceTokenizer(model=model) 13 | print(s1.tokenize(['hello world'])) 14 | print(s1.tokenize_with_offsets(['hello world'])) 15 | 16 | s2 = text.SentencepieceTokenizer(model=model, out_type=tf.dtypes.string) 17 | print(s2.tokenize(['hello world'])) 18 | print(s2.tokenize_with_offsets(['hello world'])) 19 | ``` 20 | 21 | ## Introduction 22 | 23 | SentencePiece TensorFlow module implements the encode (text to id/piece) and decode (id/piece to text) operations which are executed lazily on top of TensorFlow's Session mechanism. This module allows to make an end-to-end training/inference computatation graph by directly feeding raw sentences with the tf.placeholder. 24 | The SentencePiece model (model proto) is passed as an attribute of the TensorFlow operation 25 | and embedded into the TensorFlow graph so the model and graph become purely self-contained. 
26 | 27 | ## Build and Install SentencePiece 28 | For Linux (x64), macOS environment: 29 | 30 | ``` 31 | % pip install tf_sentencepiece 32 | ``` 33 | 34 | ## Usage 35 | Use pydoc to see the usage instruction 36 | ``` 37 | % pydoc sentencepiece_processor_ops 38 | ``` 39 | 40 | [Sample code](https://colab.research.google.com/drive/1rQ0tgXmHv02sMO6VdTO0yYaTvc1Yv1yP) 41 | -------------------------------------------------------------------------------- /third_party/absl/strings/str_cat.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // 16 | #ifndef ABSL_STRINGS_STR_CAT_H_ 17 | #define ABSL_STRINGS_STR_CAT_H_ 18 | 19 | #include 20 | #include 21 | 22 | #include "third_party/absl/strings/numbers.h" 23 | #include "third_party/absl/strings/string_view.h" 24 | 25 | namespace absl { 26 | 27 | inline std::string StrCat(int v) { 28 | std::ostringstream os; 29 | os << v; 30 | return os.str(); 31 | } 32 | 33 | inline std::string StrCat(absl::string_view str) { 34 | return std::string(str.data(), str.size()); 35 | } 36 | 37 | template 38 | inline std::string StrCat(absl::string_view first, const T &...rest) { 39 | return StrCat(first) + StrCat(rest...); 40 | } 41 | 42 | template 43 | inline std::string StrCat(int first, const T &...rest) { 44 | return StrCat(first) + StrCat(rest...); 45 | } 46 | 47 | inline void StrAppend(std::string *base, absl::string_view str) { 48 | base->append(str.data(), str.size()); 49 | } 50 | 51 | } // namespace absl 52 | #endif // ABSL_STRINGS_STR_CAT_H_ 53 | -------------------------------------------------------------------------------- /src/unicode_script_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "common.h" 16 | #include "testharness.h" 17 | #include "third_party/absl/strings/string_view.h" 18 | #include "unicode_script.h" 19 | #include "util.h" 20 | 21 | namespace sentencepiece { 22 | namespace unicode_script { 23 | ScriptType GetScriptType(absl::string_view s) { 24 | const auto ut = string_util::UTF8ToUnicodeText(s); 25 | CHECK_EQ(1, ut.size()); 26 | return GetScript(ut[0]); 27 | } 28 | 29 | TEST(UnicodeScript, GetScriptTypeTest) { 30 | EXPECT_EQ(U_Han, GetScriptType("京")); 31 | EXPECT_EQ(U_Han, GetScriptType("太")); 32 | EXPECT_EQ(U_Hiragana, GetScriptType("い")); 33 | EXPECT_EQ(U_Katakana, GetScriptType("グ")); 34 | EXPECT_EQ(U_Common, GetScriptType("ー")); 35 | EXPECT_EQ(U_Latin, GetScriptType("a")); 36 | EXPECT_EQ(U_Latin, GetScriptType("A")); 37 | EXPECT_EQ(U_Common, GetScriptType("0")); 38 | EXPECT_EQ(U_Common, GetScriptType("$")); 39 | EXPECT_EQ(U_Common, GetScriptType("@")); 40 | EXPECT_EQ(U_Common, GetScriptType("-")); 41 | } 42 | } // namespace unicode_script 43 | } // namespace sentencepiece 44 | -------------------------------------------------------------------------------- /data/gen_unicode_scripts_code.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2016 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
# Generate unicode_script_data.h from Unicode Scripts.txt
#
# usage: ./gen_unicode_scripts_code.pl < scripts > unicode_script_data.h
#
# Reads the property file from stdin (or from file arguments) and prints a
# C++ header that defines InitTable(), which assigns a U_<Script> value to
# every listed code point. Input lines that match neither pattern below
# (comments, blank lines) are skipped.
#
# NOTE(review): the emitted signature takes a std::unordered_map; confirm
# that this still matches the map type used by the code that includes the
# generated header.
use strict;
use warnings;

print "#ifndef UNICODE_SCRIPT_DATA_H_\n";
print "#define UNICODE_SCRIPT_DATA_H_\n";
print "namespace sentencepiece {\n";
print "namespace unicode_script {\n";
print "namespace {\n";
# Template arguments restored: they were stripped from this dump.
print "void InitTable(std::unordered_map<char32, ScriptType> *smap) {\n";
print " CHECK_NOTNULL(smap)->clear();\n";

while (<>) {
  chomp;
  if (/^([0-9A-F]+)\s+;\s+(\S+)\s+\#/) {
    # Single code point, e.g. "00AA ; Latin # ...".
    printf(" (*smap)[0x%s] = U_%s;\n", $1, $2);
  } elsif (/^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\S+)\s+\#/) {
    # Inclusive range, e.g. "0041..005A ; Latin # ...".
    printf(" for (char32 c = 0x%s; c <= 0x%s; ++c)\n", $1, $2);
    printf(" (*smap)[c] = U_%s;\n", $3);
  }
}

print "}\n";
print "} // namespace\n";
print "} // namespace unicode_script\n";
print "} // namespace sentencepiece\n";
print "#endif // UNICODE_SCRIPT_DATA_H_\n";
--------------------------------------------------------------------------------
/src/filesystem_test.cc:
--------------------------------------------------------------------------------
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.!
14 | 15 | #include "filesystem.h" 16 | #include "testharness.h" 17 | #include "third_party/absl/strings/str_cat.h" 18 | #include "util.h" 19 | 20 | namespace sentencepiece { 21 | 22 | TEST(UtilTest, FilesystemTest) { 23 | const std::vector kData = { 24 | "This" 25 | "is" 26 | "a" 27 | "test"}; 28 | 29 | { 30 | auto output = filesystem::NewWritableFile( 31 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "test_file")); 32 | for (size_t i = 0; i < kData.size(); ++i) { 33 | output->WriteLine(kData[i]); 34 | } 35 | } 36 | 37 | { 38 | auto input = filesystem::NewReadableFile( 39 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "test_file")); 40 | std::string line; 41 | for (size_t i = 0; i < kData.size(); ++i) { 42 | EXPECT_TRUE(input->ReadLine(&line)); 43 | EXPECT_EQ(kData[i], line); 44 | } 45 | EXPECT_FALSE(input->ReadLine(&line)); 46 | } 47 | } 48 | 49 | TEST(UtilTest, FilesystemInvalidFileTest) { 50 | auto input = filesystem::NewReadableFile("__UNKNOWN__FILE__"); 51 | EXPECT_FALSE(input->status().ok()); 52 | } 53 | 54 | } // namespace sentencepiece 55 | -------------------------------------------------------------------------------- /src/model_factory.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "bpe_model.h" 16 | #include "char_model.h" 17 | #include "model_factory.h" 18 | #include "third_party/absl/memory/memory.h" 19 | #include "unigram_model.h" 20 | #include "word_model.h" 21 | 22 | namespace sentencepiece { 23 | 24 | // Instantiate Model instance from |model_proto| 25 | std::unique_ptr ModelFactory::Create( 26 | const ModelProto& model_proto) { 27 | const auto& trainer_spec = model_proto.trainer_spec(); 28 | 29 | switch (trainer_spec.model_type()) { 30 | case TrainerSpec::UNIGRAM: 31 | return absl::make_unique(model_proto); 32 | break; 33 | case TrainerSpec::BPE: 34 | return absl::make_unique(model_proto); 35 | break; 36 | case TrainerSpec::WORD: 37 | return absl::make_unique(model_proto); 38 | break; 39 | case TrainerSpec::CHAR: 40 | return absl::make_unique(model_proto); 41 | break; 42 | default: 43 | LOG(ERROR) << "Unknown model_type: " << trainer_spec.model_type(); 44 | return nullptr; 45 | break; 46 | } 47 | 48 | return absl::make_unique(model_proto); 49 | } 50 | } // namespace sentencepiece 51 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2008 Google Inc. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. 
nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | Code generated by the Protocol Buffer compiler is owned by the owner 30 | of the input file used when generating it. This code is not 31 | standalone and requires a support library to be linked with it. This 32 | support library is itself covered by the above license. 33 | -------------------------------------------------------------------------------- /src/trainer_factory_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "testharness.h" 16 | #include "trainer_factory.h" 17 | 18 | namespace sentencepiece { 19 | 20 | TEST(TrainerFactoryTest, BasicTest) { 21 | TrainerSpec trainer_spec; 22 | NormalizerSpec normalizer_spec; 23 | NormalizerSpec denormalizer_spec; 24 | 25 | trainer_spec.set_model_prefix("model"); 26 | trainer_spec.add_input("input"); 27 | 28 | { 29 | trainer_spec.set_model_type(TrainerSpec::UNIGRAM); 30 | auto m = TrainerFactory::Create(trainer_spec, normalizer_spec, 31 | denormalizer_spec); 32 | } 33 | 34 | { 35 | trainer_spec.set_model_type(TrainerSpec::BPE); 36 | auto m = TrainerFactory::Create(trainer_spec, normalizer_spec, 37 | denormalizer_spec); 38 | } 39 | 40 | { 41 | trainer_spec.set_model_type(TrainerSpec::WORD); 42 | auto m = TrainerFactory::Create(trainer_spec, normalizer_spec, 43 | denormalizer_spec); 44 | } 45 | 46 | { 47 | trainer_spec.set_model_type(TrainerSpec::CHAR); 48 | auto m = TrainerFactory::Create(trainer_spec, normalizer_spec, 49 | denormalizer_spec); 50 | } 51 | } 52 | } // namespace sentencepiece 53 | -------------------------------------------------------------------------------- /third_party/absl/flags/flag.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef ABSL_FLAGS_FLAG_H_ 16 | #define ABSL_FLAGS_FLAG_H_ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | namespace absl { 24 | namespace internal { 25 | struct FlagFunc; 26 | 27 | void RegisterFlag(const std::string &name, FlagFunc *func); 28 | } // namespace internal 29 | 30 | template 31 | class Flag { 32 | public: 33 | Flag(const char *name, const char *type, const char *help, 34 | const T &defautl_value); 35 | virtual ~Flag(); 36 | const T &value() const; 37 | void set_value(const T &value); 38 | void set_value_as_str(const std::string &value_as_str); 39 | 40 | private: 41 | T value_; 42 | std::unique_ptr func_; 43 | }; 44 | 45 | template 46 | const T &GetFlag(const Flag &flag) { 47 | return flag.value(); 48 | } 49 | 50 | template 51 | void SetFlag(Flag *flag, const V &v) { 52 | const T value(v); 53 | flag->set_value(value); 54 | } 55 | } // namespace absl 56 | 57 | #define ABSL_FLAG(Type, name, defautl_value, help) \ 58 | absl::Flag FLAGS_##name(#name, #Type, help, defautl_value); 59 | 60 | #define ABSL_DECLARE_FLAG(Type, name) extern absl::Flag FLAGS_##name; 61 | 62 | #endif // ABSL_FLAGS_FLAG_H_ 63 | -------------------------------------------------------------------------------- /src/bpe_model.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef BPE_MODEL_H_ 16 | #define BPE_MODEL_H_ 17 | 18 | #include "model_interface.h" 19 | #include "sentencepiece_model.pb.h" 20 | 21 | namespace sentencepiece { 22 | namespace bpe { 23 | 24 | // Segmentation model with BPE (Byte Pair Encoding) 25 | // Details: 26 | // Neural Machine Translation of Rare Words with Subword Units 27 | // https://arxiv.org/abs/1508.07909 28 | // 29 | // https://en.wikipedia.org/wiki/Byte_pair_encoding 30 | class Model : public ModelInterface { 31 | public: 32 | explicit Model(const ModelProto &model_proto); 33 | ~Model() override; 34 | 35 | EncodeResult Encode(absl::string_view normalized) const override { 36 | return SampleEncode(normalized, 0.0); 37 | } 38 | 39 | // Sampling with BPE-dropout: https://arxiv.org/pdf/1910.13267.pdf 40 | // `alpha` is dropout probability in BPE-dropout paper. 41 | // Skips merge operation with `alpha` probability. 42 | // When alpha <= 0.0, no sampling is performed. 43 | EncodeResult SampleEncode(absl::string_view normalized, 44 | float alpha) const override; 45 | 46 | bool IsSampleEncodeAvailable() const override { return true; } 47 | 48 | bool IsNBestEncodeAvailable() const override { return false; } 49 | }; 50 | } // namespace bpe 51 | } // namespace sentencepiece 52 | #endif // BPE_MODEL_H_ 53 | -------------------------------------------------------------------------------- /src/model_factory_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "model_factory.h" 16 | #include "testharness.h" 17 | 18 | namespace sentencepiece { 19 | 20 | TEST(ModelFactoryTest, BasicTest) { 21 | ModelProto model_proto; 22 | 23 | auto *sp1 = model_proto.add_pieces(); 24 | auto *sp2 = model_proto.add_pieces(); 25 | auto *sp3 = model_proto.add_pieces(); 26 | 27 | sp1->set_type(ModelProto::SentencePiece::UNKNOWN); 28 | sp1->set_piece(""); 29 | sp2->set_type(ModelProto::SentencePiece::CONTROL); 30 | sp2->set_piece(""); 31 | sp3->set_type(ModelProto::SentencePiece::CONTROL); 32 | sp3->set_piece(""); 33 | 34 | auto *sp4 = model_proto.add_pieces(); 35 | sp4->set_piece("test"); 36 | sp4->set_score(1.0); 37 | 38 | { 39 | model_proto.mutable_trainer_spec()->set_model_type(TrainerSpec::UNIGRAM); 40 | auto m = ModelFactory::Create(model_proto); 41 | } 42 | 43 | { 44 | model_proto.mutable_trainer_spec()->set_model_type(TrainerSpec::BPE); 45 | auto m = ModelFactory::Create(model_proto); 46 | } 47 | 48 | { 49 | model_proto.mutable_trainer_spec()->set_model_type(TrainerSpec::WORD); 50 | auto m = ModelFactory::Create(model_proto); 51 | } 52 | 53 | { 54 | model_proto.mutable_trainer_spec()->set_model_type(TrainerSpec::CHAR); 55 | auto m = ModelFactory::Create(model_proto); 56 | } 57 | } 58 | } // namespace sentencepiece 59 | -------------------------------------------------------------------------------- /src/char_model_trainer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | 17 | #include "char_model.h" 18 | #include "char_model_trainer.h" 19 | #include "util.h" 20 | 21 | namespace sentencepiece { 22 | namespace character { 23 | 24 | util::Status Trainer::Train() { 25 | RETURN_IF_ERROR(status()); 26 | 27 | CHECK_OR_RETURN(normalizer_spec_.escape_whitespaces()); 28 | CHECK_EQ_OR_RETURN(TrainerSpec::CHAR, trainer_spec_.model_type()); 29 | 30 | RETURN_IF_ERROR(LoadSentences()); 31 | 32 | const int vocab_size = trainer_spec_.vocab_size() - meta_pieces_.size(); 33 | CHECK_GE_OR_RETURN(vocab_size, 0); 34 | 35 | uint64 sum = 0; 36 | for (const auto &it : required_chars_) { 37 | sum += it.second; 38 | } 39 | 40 | const auto logsum = std::log(static_cast(sum)); 41 | 42 | CHECK_OR_RETURN(final_pieces_.empty()); 43 | for (const auto &it : Sorted(required_chars_)) { 44 | if (!trainer_spec_.use_all_vocab() && 45 | final_pieces_.size() == static_cast(vocab_size)) { 46 | break; 47 | } 48 | final_pieces_.emplace_back( 49 | string_util::UnicodeCharToUTF8(it.first), 50 | std::log(static_cast(it.second)) - logsum); 51 | } 52 | 53 | if (trainer_spec_.use_all_vocab()) { 54 | trainer_spec_.set_vocab_size(final_pieces_.size() + meta_pieces_.size()); 55 | } 56 | 57 | return Save(); 58 | } 59 | } // namespace character 60 | } // namespace sentencepiece 61 | -------------------------------------------------------------------------------- 
/python/make_py_wheel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 Google Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.! 15 | set -e # exit immediately on error 16 | set -x # display all commands 17 | 18 | CMAKE_VERSION=3.12.0 19 | 20 | run_docker() { 21 | cd `dirname $0` 22 | docker pull $1 23 | docker run --rm -ti --name py_sentencepiece \ 24 | -v `pwd`/../:/sentencepiece -w /sentencepiece/python \ 25 | -td $1 /bin/bash 26 | docker exec py_sentencepiece bash -c "./make_py_wheel.sh native $2" 27 | docker stop py_sentencepiece 28 | } 29 | 30 | build() { 31 | TRG=$1 32 | rm -fr build 33 | mkdir -p build 34 | cd build 35 | 36 | # Install sentencepiece 37 | cmake ../.. -DSPM_ENABLE_SHARED=OFF 38 | make -j4 39 | make install 40 | cd .. 41 | 42 | for i in /opt/python/* 43 | do 44 | export LD_LIBRARY_PATH=/usr/local/lib:/usr/lib 45 | $i/bin/python setup.py clean 46 | $i/bin/python setup.py bdist 47 | strip build/*/*/*.so 48 | $i/bin/python setup.py bdist_wheel 49 | $i/bin/python setup.py test 50 | rm -fr build 51 | rm -fr *.so 52 | done 53 | 54 | cd dist 55 | for i in *${TRG}.whl 56 | do 57 | auditwheel repair $i 58 | done 59 | 60 | mv -f wheelhouse/*${TRG}.whl . 61 | 62 | cd .. 
63 | rm -fr build 64 | } 65 | 66 | if [ "$1" = "native" ]; then 67 | build $2 68 | elif [ "$#" -eq 1 ]; then 69 | run_docker quay.io/pypa/manylinux2014_${1} ${1} 70 | else 71 | run_docker quay.io/pypa/manylinux2014_i686 i686 72 | run_docker quay.io/pypa/manylinux2014_x86_64 x86_64 73 | fi 74 | -------------------------------------------------------------------------------- /src/filesystem.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #ifndef FILESYSTEM_H_ 16 | #define FILESYSTEM_H_ 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "common.h" 25 | #include "sentencepiece_processor.h" 26 | #include "third_party/absl/strings/string_view.h" 27 | 28 | namespace sentencepiece { 29 | namespace filesystem { 30 | class ReadableFile { 31 | public: 32 | ReadableFile() {} 33 | explicit ReadableFile(absl::string_view filename, bool is_binary = false) {} 34 | virtual ~ReadableFile() {} 35 | 36 | virtual util::Status status() const = 0; 37 | virtual bool ReadLine(std::string *line) = 0; 38 | virtual bool ReadAll(std::string *line) = 0; 39 | }; 40 | 41 | class WritableFile { 42 | public: 43 | WritableFile() {} 44 | explicit WritableFile(absl::string_view filename, bool is_binary = false) {} 45 | virtual ~WritableFile() {} 46 | 47 | virtual util::Status status() const = 0; 48 | virtual bool Write(absl::string_view text) = 0; 49 | virtual bool WriteLine(absl::string_view text) = 0; 50 | }; 51 | 52 | std::unique_ptr NewReadableFile(absl::string_view filename, 53 | bool is_binary = false); 54 | std::unique_ptr NewWritableFile(absl::string_view filename, 55 | bool is_binary = false); 56 | 57 | } // namespace filesystem 58 | } // namespace sentencepiece 59 | #endif // FILESYSTEM_H_ 60 | -------------------------------------------------------------------------------- /src/testharness.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "testharness.h" 16 | 17 | #ifndef OS_WIN 18 | #include 19 | #include 20 | #else 21 | #include 22 | #endif 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | #include "common.h" 29 | #include "third_party/absl/strings/str_cat.h" 30 | #include "util.h" 31 | 32 | namespace sentencepiece { 33 | namespace test { 34 | 35 | namespace { 36 | struct Test { 37 | const char *base; 38 | const char *name; 39 | void (*func)(); 40 | }; 41 | std::vector *tests; 42 | } // namespace 43 | 44 | bool RegisterTest(const char *base, const char *name, void (*func)()) { 45 | if (tests == nullptr) { 46 | tests = new std::vector; 47 | } 48 | Test t; 49 | t.base = base; 50 | t.name = name; 51 | t.func = func; 52 | tests->emplace_back(t); 53 | return true; 54 | } 55 | 56 | int RunAllTests() { 57 | int num = 0; 58 | #ifdef OS_WIN 59 | _mkdir(absl::GetFlag(FLAGS_test_tmpdir).c_str()); 60 | #else 61 | mkdir(absl::GetFlag(FLAGS_test_tmpdir).c_str(), S_IRUSR | S_IWUSR | S_IXUSR); 62 | #endif 63 | 64 | if (tests == nullptr) { 65 | std::cerr << "No tests are found" << std::endl; 66 | return 0; 67 | } 68 | 69 | for (const Test &t : *(tests)) { 70 | std::cerr << "[ RUN ] " << t.base << "." << t.name << std::endl; 71 | (*t.func)(); 72 | std::cerr << "[ OK ] " << t.base << "." 
<< t.name << std::endl; 73 | ++num; 74 | } 75 | std::cerr << "==== PASSED " << num << " tests" << std::endl; 76 | 77 | return 0; 78 | } 79 | } // namespace test 80 | } // namespace sentencepiece 81 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/google/protobuf/port.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // A common header that is included across all protobuf headers. We do our best 32 | // to avoid #defining any macros here; instead we generally put macros in 33 | // port_def.inc and port_undef.inc so they are not visible from outside of 34 | // protobuf. 35 | 36 | #ifndef GOOGLE_PROTOBUF_PORT_H__ 37 | #define GOOGLE_PROTOBUF_PORT_H__ 38 | 39 | 40 | #include 41 | 42 | 43 | #endif // GOOGLE_PROTOBUF_PORT_H__ 44 | -------------------------------------------------------------------------------- /src/spm_export_vocab_main.cc: -------------------------------------------------------------------------------- 1 | 2 | 3 | // Copyright 2016 Google Inc. 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License.
15 | 16 | #include 17 | 18 | #include "common.h" 19 | #include "filesystem.h" 20 | #include "init.h" 21 | #include "sentencepiece_model.pb.h" 22 | #include "sentencepiece_processor.h" 23 | #include "third_party/absl/flags/flag.h" 24 | 25 | ABSL_FLAG(std::string, output, "", "Output filename"); 26 | ABSL_FLAG(std::string, model, "", "input model file name"); 27 | ABSL_FLAG(std::string, output_format, "vocab", 28 | "output format. choose from vocab or syms. vocab outputs pieces " 29 | "and scores, syms outputs pieces and indices."); 30 | 31 | int main(int argc, char *argv[]) { 32 | sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); 33 | 34 | sentencepiece::SentencePieceProcessor sp; 35 | CHECK_OK(sp.Load(absl::GetFlag(FLAGS_model))); 36 | 37 | auto output = 38 | sentencepiece::filesystem::NewWritableFile(absl::GetFlag(FLAGS_output)); 39 | CHECK_OK(output->status()); 40 | 41 | if (absl::GetFlag(FLAGS_output_format) == "vocab") { 42 | for (const auto &piece : sp.model_proto().pieces()) { 43 | std::ostringstream os; 44 | os << piece.piece() << "\t" << piece.score(); 45 | output->WriteLine(os.str()); 46 | } 47 | } else if (absl::GetFlag(FLAGS_output_format) == "syms") { 48 | for (int i = 0; i < sp.model_proto().pieces_size(); i++) { 49 | std::ostringstream os; 50 | os << sp.model_proto().pieces(i).piece() << "\t" << i; 51 | output->WriteLine(os.str()); 52 | } 53 | } else { 54 | LOG(FATAL) << "Unsupported output format: " 55 | << absl::GetFlag(FLAGS_output_format); 56 | } 57 | 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /src/pretokenizer_for_training.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include 15 | 16 | #include "pretokenizer_for_training.h" 17 | #include "third_party/absl/strings/str_replace.h" 18 | 19 | namespace sentencepiece { 20 | namespace pretokenizer { 21 | 22 | namespace { 23 | // TODO(taku): These are defined in trainer_interface.h, but we 24 | // redefine them here explicitly to avoid a dependency on trainer_interface. 25 | // Currently, we have no separated build rules. 26 | const char kWSStr[] = "\xe2\x96\x81"; 27 | const char kUPPBoundaryStr[] = "\t"; 28 | } // namespace 29 | 30 | std::string PretokenizerForTrainingInterface::PreTokenize( 31 | absl::string_view text) const { 32 | return Postprocess(Tokenize(Preprocess(text))); 33 | } 34 | 35 | // static 36 | std::string PretokenizerForTrainingInterface::Preprocess( 37 | absl::string_view text) { 38 | // Replaces kWSStr (U+2581) with a plain space, as the pre-tokenizer may not handle this character. 39 | return absl::StrReplaceAll(text, {{kWSStr, " "}}); 40 | } 41 | 42 | // static 43 | std::string PretokenizerForTrainingInterface::Postprocess( 44 | const SentencePieceText &spt) { 45 | // Inserts kUPPBoundaryStr between adjacent tokens that are not separated by whitespace. 46 | std::string output; 47 | int prev = 0; 48 | for (const auto &piece : spt.pieces()) { 49 | if (prev == piece.begin() && piece.begin() != 0) { 50 | output += kUPPBoundaryStr; 51 | } else { 52 | output.append(piece.begin() - prev, ' '); 53 | } 54 | output += piece.surface(); 55 | prev = piece.end(); 56 | } 57 | 58 | // Restores kWSStr.
59 | return absl::StrReplaceAll(output, {{" ", kWSStr}}); 60 | } 61 | 62 | } // namespace pretokenizer 63 | } // namespace sentencepiece 64 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/statusor.cc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | #include 32 | 33 | #include 34 | 35 | namespace google { 36 | namespace protobuf { 37 | namespace util { 38 | namespace internal { 39 | 40 | void StatusOrHelper::Crash(const Status& status) { 41 | GOOGLE_LOG(FATAL) << "Attempting to fetch value instead of handling error " 42 | << status.ToString(); 43 | } 44 | 45 | } // namespace internal 46 | } // namespace util 47 | } // namespace protobuf 48 | } // namespace google 49 | -------------------------------------------------------------------------------- /src/trainer_factory.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "bpe_model_trainer.h" 16 | #include "char_model_trainer.h" 17 | #include "third_party/absl/memory/memory.h" 18 | #include "trainer_factory.h" 19 | #include "unigram_model_trainer.h" 20 | #include "word_model_trainer.h" 21 | 22 | namespace sentencepiece { 23 | 24 | // Creates the Trainer matching trainer_spec.model_type(), passing it trainer_spec, normalizer_spec and denormalizer_spec. 25 | std::unique_ptr TrainerFactory::Create( 26 | const TrainerSpec &trainer_spec, const NormalizerSpec &normalizer_spec, 27 | const NormalizerSpec &denormalizer_spec) { 28 | switch (trainer_spec.model_type()) { 29 | case TrainerSpec::UNIGRAM: 30 | return absl::make_unique(trainer_spec, normalizer_spec, 31 | denormalizer_spec); 32 | break; 33 | case TrainerSpec::BPE: 34 | return absl::make_unique(trainer_spec, normalizer_spec, 35 | denormalizer_spec); 36 | break; 37 | case TrainerSpec::WORD: 38 | return absl::make_unique(trainer_spec, normalizer_spec, 39 | denormalizer_spec); 40 | break; 41 | case TrainerSpec::CHAR: 42 | return absl::make_unique( 43 | trainer_spec, normalizer_spec, denormalizer_spec); 44 | break; 45 | default: 46 | LOG(FATAL) << "Unknown model_type: " << trainer_spec.model_type(); 47 | break; 48 | } 49 | 50 | return absl::make_unique(trainer_spec, normalizer_spec, 51 | denormalizer_spec); 52 | } 53 | } // namespace sentencepiece 54 | -------------------------------------------------------------------------------- /src/pretokenizer_for_training.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef PRETOKENIZER_FOR_TRAINING_H_ 16 | #define PRETOKENIZER_FOR_TRAINING_H_ 17 | 18 | #include 19 | #include 20 | 21 | #include "common.h" 22 | #include "sentencepiece.pb.h" 23 | #include "sentencepiece_processor.h" 24 | #include "third_party/absl/strings/string_view.h" 25 | 26 | namespace sentencepiece { 27 | namespace pretokenizer { 28 | 29 | class PretokenizerForTrainingInterface { 30 | public: 31 | PretokenizerForTrainingInterface() {} 32 | virtual ~PretokenizerForTrainingInterface() {} 33 | virtual util::Status status() const = 0; 34 | 35 | // Inserts kUPPBoundaryStr between tokens produced by the pre-tokenizer 36 | // when there is no space between them. 37 | // Example1: 38 | // input: 東京です 39 | // segmentation: piece[0] = {0, 6}, piece[1] = {6, 12} 40 | // output: 東京\tです (here kUPPBoundaryStr is \t, i.e. a tab) 41 | // 42 | // Example2: 43 | // input: I love sentencepiece 44 | // segmentation: piece[0] = {0, 1}, piece[1] = {2, 6}, 45 | // piece[2] = {7, 15}, piece[3] = {15, 20} 46 | // output: I love sentence\tpiece 47 | std::string PreTokenize(absl::string_view text) const; 48 | 49 | // Returns the pre-tokenized result. 50 | // Note that the pre-tokenization constraint is specified with 51 | // byte offsets (SentencePiece::begin, SentencePiece::end) into 52 | // the input text.
53 | virtual SentencePieceText Tokenize(absl::string_view text) const = 0; 54 | 55 | private: 56 | static std::string Preprocess(absl::string_view text); 57 | static std::string Postprocess(const SentencePieceText &spt); 58 | }; 59 | 60 | } // namespace pretokenizer 61 | } // namespace sentencepiece 62 | 63 | #endif // PRETOKENIZER_FOR_TRAINING_H_ 64 | -------------------------------------------------------------------------------- /src/word_model_trainer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include 16 | #include 17 | 18 | #include "third_party/absl/container/flat_hash_map.h" 19 | #include "third_party/absl/strings/string_view.h" 20 | #include "util.h" 21 | #include "word_model.h" 22 | #include "word_model_trainer.h" 23 | 24 | namespace sentencepiece { 25 | namespace word { 26 | 27 | util::Status Trainer::Train() { 28 | RETURN_IF_ERROR(status()); 29 | 30 | CHECK_OR_RETURN(normalizer_spec_.escape_whitespaces()); 31 | CHECK_EQ_OR_RETURN(TrainerSpec::WORD, trainer_spec_.model_type()); 32 | 33 | RETURN_IF_ERROR(LoadSentences()); 34 | 35 | absl::flat_hash_map freq; 36 | for (const auto &it : sentences_) { 37 | for (const auto &s : SplitIntoWords(it.first)) { 38 | freq[std::string(s)] += it.second; 39 | } 40 | } 41 | 42 | const int vocab_size = trainer_spec_.vocab_size() - meta_pieces_.size(); 43 | CHECK_GE_OR_RETURN(vocab_size, 0); 44 | 45 | uint64 sum = 0; 46 | for (const auto &it : freq) { 47 | sum += it.second; 48 | } 49 | 50 | const auto logsum = std::log(static_cast(sum)); 51 | 52 | CHECK_OR_RETURN(final_pieces_.empty()); 53 | for (const auto &it : Sorted(freq)) { 54 | if (it.first.find(kUNKStr) != std::string::npos) { 55 | continue; 56 | } 57 | if (!trainer_spec_.use_all_vocab() && 58 | final_pieces_.size() == static_cast(vocab_size)) { 59 | break; 60 | } 61 | final_pieces_.emplace_back( 62 | it.first, std::log(static_cast(it.second)) - logsum); 63 | } 64 | 65 | if (trainer_spec_.use_all_vocab()) { 66 | trainer_spec_.set_vocab_size(final_pieces_.size() + meta_pieces_.size()); 67 | } 68 | 69 | return Save(); 70 | } 71 | } // namespace word 72 | } // namespace sentencepiece 73 | -------------------------------------------------------------------------------- /third_party/absl/strings/str_replace.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 
3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_STR_REPLACE_H_ 17 | #define ABSL_STRINGS_STR_REPLACE_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/string_view.h" 22 | 23 | namespace absl { 24 | // Appends `s` to `*res` with `oldsub` replaced by `newsub` (every occurrence if `replace_all`, otherwise only the first); an empty `oldsub` copies `s` verbatim. 25 | inline void StringReplace(absl::string_view s, absl::string_view oldsub, 26 | absl::string_view newsub, bool replace_all, 27 | std::string *res) { 28 | if (oldsub.empty()) { 29 | res->append(s.data(), s.size()); 30 | return; 31 | } 32 | 33 | absl::string_view::size_type start_pos = 0; 34 | do { 35 | const absl::string_view::size_type pos = s.find(oldsub, start_pos); 36 | if (pos == absl::string_view::npos) { 37 | break; 38 | } 39 | res->append(s.data() + start_pos, pos - start_pos); 40 | res->append(newsub.data(), newsub.size()); 41 | start_pos = pos + oldsub.size(); 42 | } while (replace_all); 43 | res->append(s.data() + start_pos, s.size() - start_pos); 44 | } 45 | // Convenience overload that returns the replaced string by value. 46 | inline std::string StringReplace(absl::string_view s, absl::string_view oldsub, 47 | absl::string_view newsub, bool replace_all) { 48 | std::string ret; 49 | StringReplace(s, oldsub, newsub, replace_all, &ret); 50 | return ret; 51 | } 52 | // Applies each (old, new) pair of `patterns` in order, each pass running over the output of the previous one; returns `s` unchanged when `patterns` is empty. 53 | inline std::string StrReplaceAll( 54 | absl::string_view s, 55 | const std::vector> 56 | &patterns) { 57 | std::string prev(s.data(), s.size()); 58 | std::string result; 59 | for (const auto &it : patterns) { 60 | result.clear(); 61 | StringReplace(prev,
it.first, it.second, true, &result); 62 | prev = result; 63 | } 64 | return prev; // Equals `result` after any pass, and is the untouched input when `patterns` is empty. 65 | } 66 | 67 | } // namespace absl 68 | #endif // ABSL_STRINGS_STR_REPLACE_H_ 69 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/google/protobuf/stubs/once.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | #ifndef GOOGLE_PROTOBUF_STUBS_ONCE_H__ 32 | #define GOOGLE_PROTOBUF_STUBS_ONCE_H__ 33 | 34 | #include 35 | #include 36 | 37 | #include 38 | 39 | namespace google { 40 | namespace protobuf { 41 | namespace internal { 42 | 43 | using once_flag = std::once_flag; 44 | template 45 | void call_once(Args&&... args ) { 46 | std::call_once(std::forward(args)...); 47 | } 48 | 49 | } // namespace internal 50 | } // namespace protobuf 51 | } // namespace google 52 | 53 | #include 54 | 55 | #endif // GOOGLE_PROTOBUF_STUBS_ONCE_H__ 56 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/zero_copy_stream.cc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 
11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // Author: kenton@google.com (Kenton Varda) 32 | // Based on original Protocol Buffers design by 33 | // Sanjay Ghemawat, Jeff Dean, and others. 34 | 35 | #include 36 | 37 | #include 38 | #include 39 | 40 | namespace google { 41 | namespace protobuf { 42 | namespace io { 43 | 44 | 45 | bool ZeroCopyOutputStream::WriteAliasedRaw(const void* /* data */, 46 | int /* size */) { 47 | GOOGLE_LOG(FATAL) << "This ZeroCopyOutputStream doesn't support aliasing. 
" 48 | "Reaching here usually means a ZeroCopyOutputStream " 49 | "implementation bug."; 50 | return false; 51 | } 52 | 53 | } // namespace io 54 | } // namespace protobuf 55 | } // namespace google 56 | -------------------------------------------------------------------------------- /src/freelist.h: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef FREELIST_H_ 16 | #define FREELIST_H_ 17 | 18 | #include 19 | 20 | #include 21 | 22 | namespace sentencepiece { 23 | namespace model { 24 | 25 | // Simple FreeList that allocates a chunk of T at once. 26 | template 27 | class FreeList { 28 | public: 29 | FreeList() = delete; 30 | explicit FreeList(size_t chunk_size) : chunk_size_(chunk_size) {} 31 | virtual ~FreeList() { 32 | for (auto& chunk : freelist_) delete[] chunk; 33 | } 34 | 35 | // `Free` doesn't free the object but reuse the allocated memory chunks. 36 | void Free() { 37 | const int size = std::min(chunk_index_ + 1, freelist_.size()); 38 | for (int i = 0; i < size; ++i) { 39 | T* chunk = freelist_[i]; 40 | memset(static_cast(chunk), 0, sizeof(*chunk) * chunk_size_); 41 | } 42 | chunk_index_ = 0; 43 | element_index_ = 0; 44 | } 45 | 46 | // Returns the number of allocated elements. 
47 | size_t size() const { return chunk_size_ * chunk_index_ + element_index_; } 48 | 49 | // Returns a pointer to the element at `index`. No bounds check is performed. 50 | T* operator[](size_t index) const { 51 | return freelist_[index / chunk_size_] + index % chunk_size_; 52 | } 53 | 54 | // Returns the next unused element, allocating a new zero-filled chunk only when no spare chunk is left. 55 | T* Allocate() { 56 | if (element_index_ >= chunk_size_) { 57 | ++chunk_index_; 58 | element_index_ = 0; 59 | } 60 | 61 | if (chunk_index_ == freelist_.size()) { 62 | T* chunk = new T[chunk_size_]; 63 | memset(static_cast(chunk), 0, sizeof(*chunk) * chunk_size_); 64 | freelist_.push_back(chunk); 65 | } 66 | 67 | T* result = freelist_[chunk_index_] + element_index_; 68 | ++element_index_; 69 | 70 | return result; 71 | } 72 | 73 | private: 74 | std::vector freelist_; 75 | 76 | // Position of the next element Allocate() hands out (one past the last allocated element). 77 | size_t element_index_ = 0; 78 | size_t chunk_index_ = 0; 79 | const size_t chunk_size_ = 0; 80 | }; 81 | } // namespace model 82 | } // namespace sentencepiece 83 | #endif // FREELIST_H_ 84 | -------------------------------------------------------------------------------- /src/word_model_trainer_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.!
14 | 15 | #include 16 | #include 17 | 18 | #include "filesystem.h" 19 | #include "sentencepiece_processor.h" 20 | #include "testharness.h" 21 | #include "third_party/absl/strings/str_cat.h" 22 | #include "third_party/absl/strings/str_join.h" 23 | #include "util.h" 24 | #include "word_model_trainer.h" 25 | 26 | namespace sentencepiece { 27 | namespace word { 28 | namespace { 29 | 30 | // Space symbol (U+2581) 31 | #define WS "\xE2\x96\x81" 32 | // Trains a WORD model on `input`, reloads it, and returns its pieces from index 3 on, joined by spaces. 33 | std::string RunTrainer(const std::vector &input, int size) { 34 | const std::string input_file = 35 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "input"); 36 | const std::string model_prefix = 37 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "model"); 38 | { 39 | auto output = filesystem::NewWritableFile(input_file); 40 | for (const auto &line : input) { 41 | output->WriteLine(line); 42 | } 43 | } 44 | 45 | TrainerSpec trainer_spec; 46 | trainer_spec.set_model_type(TrainerSpec::WORD); 47 | trainer_spec.add_input(input_file); 48 | trainer_spec.set_vocab_size(size - 3); // remove <unk>, <s>, </s> 49 | trainer_spec.set_model_prefix(model_prefix); 50 | 51 | NormalizerSpec normalizer_spec; 52 | normalizer_spec.set_name("identity"); 53 | normalizer_spec.set_add_dummy_prefix(true); 54 | 55 | NormalizerSpec denormalizer_spec; 56 | 57 | Trainer trainer(trainer_spec, normalizer_spec, denormalizer_spec); 58 | EXPECT_TRUE(trainer.Train().ok()); 59 | 60 | SentencePieceProcessor processor; 61 | EXPECT_TRUE(processor.Load(model_prefix + ".model").ok()); 62 | 63 | const auto &model = processor.model_proto(); 64 | std::vector pieces; 65 | 66 | // skip the three leading special pieces (<unk>, <s>, </s>) 67 | for (int i = 3; i < model.pieces_size(); ++i) { 68 | pieces.emplace_back(model.pieces(i).piece()); 69 | } 70 | 71 | return absl::StrJoin(pieces, " "); 72 | } 73 | } // namespace 74 | 75 | TEST(TrainerTest, BasicTest) { 76 | EXPECT_EQ(WS "I " WS "apple " WS "have " WS "pen", 77 | RunTrainer({"I have a pen", "I have an apple", "apple pen"}, 10)); 78 | } 79 | } // namespace word 80 |
} // namespace sentencepiece 81 | -------------------------------------------------------------------------------- /src/sentencepiece.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | syntax = "proto2"; 16 | 17 | // TODO(taku): Needs to use LITE RUNTIME in OSS release. 18 | option optimize_for = LITE_RUNTIME; 19 | 20 | package sentencepiece; 21 | 22 | // SentencePieceText manages a user-facing source sentence, 23 | // postprocessed target sentence, and internal segmentation 24 | // with byte offsets. 25 | message SentencePieceText { 26 | message SentencePiece { 27 | // Internal representation for the decoder. 28 | // - Decoder can use |piece| as a basic token. 29 | // - the piece must be non-empty. 30 | // - A whitespace is replaced with a meta symbol. 31 | // - Concatenation of pieces is not always the same as the |text|. 32 | optional string piece = 1; 33 | 34 | // Vocabulary id. 35 | optional uint32 id = 2; 36 | 37 | // External representation for the client. 38 | // - It is always guaranteed that 39 | // text.substr(begin, end - begin) == surface. 40 | // - Concatenation of surface is always the same as the |text|. 41 | // - |surface| may contain whitespaces. 42 | // - |surface| may be empty if the piece encodes 43 | // a control vocabulary. e.g., , , . 
44 | // - When |surface| is empty, always begin == end. (zero-length span). 45 | optional string surface = 3; 46 | 47 | optional uint32 begin = 4; 48 | optional uint32 end = 5; 49 | 50 | // Customized extensions: the range of field numbers 51 | // are open to third-party extensions. 52 | extensions 200 to max; 53 | } 54 | 55 | // User input or postprocessed text. This should be immutable 56 | // since the byte range in SentencePiece is pointing to a span over this 57 | // text. Meta symbols for whitespaces are not included. 58 | optional string text = 1; 59 | 60 | // A sequence of sentence pieces. 61 | repeated SentencePiece pieces = 2; 62 | 63 | // Score (usually log probability) for MultiSentencePieceText. 64 | optional float score = 3; 65 | 66 | // Customized extensions: the range of field numbers 67 | // are open to third-party extensions. 68 | extensions 200 to max; 69 | } 70 | 71 | message NBestSentencePieceText { 72 | repeated SentencePieceText nbests = 1; 73 | } 74 | -------------------------------------------------------------------------------- /third_party/absl/strings/str_join.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // 16 | #ifndef ABSL_STRINGS_STR_JOIN_H_ 17 | #define ABSL_STRINGS_STR_JOIN_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/string_view.h" 22 | 23 | namespace absl { 24 | namespace { 25 | template 26 | inline size_t Itoa(T val, char *s) { 27 | char *org = s; 28 | // NOTE(review): `val = -val` below overflows when val is the most negative value of a signed T. 29 | if (val < 0) { 30 | *s++ = '-'; 31 | val = -val; 32 | } 33 | char *t = s; 34 | 35 | T mod = 0; 36 | while (val) { 37 | mod = val % 10; 38 | *t++ = static_cast(mod) + '0'; 39 | val /= 10; 40 | } 41 | 42 | if (s == t) { 43 | *t++ = '0'; 44 | } 45 | 46 | *t = '\0'; 47 | std::reverse(s, t); 48 | return static_cast(t - org); 49 | } 50 | } // namespace 51 | // Joins `tokens` into one string, inserting `delim` between consecutive elements. 52 | inline std::string StrJoin(const std::vector &tokens, 53 | absl::string_view delim) { 54 | std::string result; 55 | if (!tokens.empty()) { 56 | result.append(tokens[0]); 57 | } 58 | for (size_t i = 1; i < tokens.size(); ++i) { 59 | result.append(delim.data(), delim.size()); 60 | result.append(tokens[i]); 61 | } 62 | return result; 63 | } 64 | // Same as above for tokens exposing data()/size(); each token is appended by pointer and length. 65 | inline std::string StrJoin(const std::vector &tokens, 66 | absl::string_view delim) { 67 | std::string result; 68 | if (!tokens.empty()) { 69 | result.append(tokens[0].data(), tokens[0].size()); 70 | } 71 | for (size_t i = 1; i < tokens.size(); ++i) { 72 | result.append(delim.data(), delim.size()); 73 | result.append(tokens[i].data(), tokens[i].size()); 74 | } 75 | return result; 76 | } 77 | // Overload for numeric tokens: each one is rendered in decimal by Itoa() into a 32-byte buffer. 78 | inline std::string StrJoin(const std::vector &tokens, 79 | absl::string_view delim) { 80 | std::string result; 81 | char buf[32]; 82 | if (!tokens.empty()) { 83 | const size_t len = Itoa(tokens[0], buf); 84 | result.append(buf, len); 85 | } 86 | for (size_t i = 1; i < tokens.size(); ++i) { 87 | result.append(delim.data(), delim.size()); 88 | const size_t len = Itoa(tokens[i], buf); 89 | result.append(buf, len); 90 | } 91 | return result; 92 | } 93 | 94 | } // namespace absl 95 | #endif // ABSL_STRINGS_STR_JOIN_H_ 96 |
-------------------------------------------------------------------------------- /src/char_model_trainer_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | #include 17 | 18 | #include "char_model_trainer.h" 19 | #include "filesystem.h" 20 | #include "sentencepiece_processor.h" 21 | #include "testharness.h" 22 | #include "third_party/absl/strings/str_cat.h" 23 | #include "third_party/absl/strings/str_join.h" 24 | #include "util.h" 25 | 26 | namespace sentencepiece { 27 | namespace character { 28 | namespace { 29 | 30 | // Space symbol (U+2581) 31 | #define WS "\xE2\x96\x81" 32 | 33 | std::string RunTrainer(const std::vector &input, int size) { 34 | const std::string input_file = 35 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "input"); 36 | const std::string model_prefix = 37 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "model"); 38 | { 39 | auto output = filesystem::NewWritableFile(input_file); 40 | for (const auto &line : input) { 41 | output->WriteLine(line); 42 | } 43 | } 44 | 45 | TrainerSpec trainer_spec; 46 | trainer_spec.set_model_type(TrainerSpec::CHAR); 47 | trainer_spec.add_input(input_file); 48 | trainer_spec.set_vocab_size(size); 49 | trainer_spec.set_model_prefix(model_prefix); 50 | 51 | NormalizerSpec normalizer_spec; 52 | 
normalizer_spec.set_name("identity"); 53 | 54 | NormalizerSpec denormalizer_spec; 55 | 56 | Trainer trainer(trainer_spec, normalizer_spec, denormalizer_spec); 57 | EXPECT_TRUE(trainer.Train().ok()); 58 | 59 | SentencePieceProcessor processor; 60 | EXPECT_TRUE(processor.Load(model_prefix + ".model").ok()); 61 | 62 | const auto &model = processor.model_proto(); 63 | std::vector pieces; 64 | 65 | // remove , , 66 | for (int i = 3; i < model.pieces_size(); ++i) { 67 | pieces.emplace_back(model.pieces(i).piece()); 68 | } 69 | 70 | return absl::StrJoin(pieces, " "); 71 | } 72 | 73 | TEST(TrainerTest, BasicTest) { 74 | EXPECT_EQ(WS " a e p n I h l v", 75 | RunTrainer({"I have a pen", "I have an apple", "apple pen"}, 100)); 76 | EXPECT_EQ(WS " a", // , , , _, a 77 | RunTrainer({"I have a pen", "I have an apple", "apple pen"}, 5)); 78 | } 79 | 80 | } // namespace 81 | } // namespace character 82 | } // namespace sentencepiece 83 | -------------------------------------------------------------------------------- /third_party/absl/memory/memory.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // 16 | // ----------------------------------------------------------------------------- 17 | // File: memory.h 18 | // ----------------------------------------------------------------------------- 19 | // 20 | // This file contains `absl::make_unique`, an early implementation of the 21 | // C++14 `std::make_unique` function template, together with the 22 | // `MakeUniqueResult` trait that selects its overloads: a single object, 23 | // an array of unknown bound, or (rejected) an array of known bound. 24 | // 25 | // `absl::make_unique` is designed to be a drop-in replacement for 26 | // `std::make_unique`, so the eventual switchover is a simple renaming. 27 | #ifndef ABSL_MEMORY_MEMORY_H_ 28 | #define ABSL_MEMORY_MEMORY_H_ 29 | 30 | #include 31 | 32 | namespace absl { 33 | 34 | // Trait to select overloads and return types for make_unique. 35 | template 36 | struct MakeUniqueResult { 37 | using scalar = std::unique_ptr; 38 | }; 39 | template 40 | struct MakeUniqueResult { 41 | using array = std::unique_ptr; 42 | }; 43 | template 44 | struct MakeUniqueResult { 45 | using invalid = void; 46 | }; 47 | 48 | // make_unique(...) is an early implementation of C++14 std::make_unique. 49 | // It is designed to be 100% compatible with std::make_unique so that the 50 | // eventual switchover will be a simple renaming operation. 51 | template 52 | typename MakeUniqueResult::scalar make_unique(Args &&... args) { // NOLINT 53 | return std::unique_ptr( 54 | new T(std::forward(args)...)); // NOLINT(build/c++11) 55 | } 56 | 57 | // Overload for array of unknown bound. 58 | // The allocation of arrays needs to use the array form of new, 59 | // and cannot take element constructor arguments. 60 | template 61 | typename MakeUniqueResult::array make_unique(size_t n) { 62 | return std::unique_ptr(new typename std::remove_extent::type[n]()); 63 | } 64 | 65 | // Reject arrays of known bound. 66 | template 67 | typename MakeUniqueResult::invalid make_unique(Args &&...
/* args */) = 68 | delete; // NOLINT 69 | 70 | } // namespace absl 71 | #endif // ABSL_MEMORY_MEMORY_H_ 72 | -------------------------------------------------------------------------------- /src/word_model_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | 17 | #include "sentencepiece_model.pb.h" 18 | #include "testharness.h" 19 | #include "util.h" 20 | #include "word_model.h" 21 | 22 | namespace sentencepiece { 23 | namespace word { 24 | namespace { 25 | 26 | // Space symbol (U+2581) 27 | #define WS "\xe2\x96\x81" 28 | 29 | ModelProto MakeBaseModelProto() { 30 | ModelProto model_proto; 31 | auto *sp1 = model_proto.add_pieces(); 32 | auto *sp2 = model_proto.add_pieces(); 33 | auto *sp3 = model_proto.add_pieces(); 34 | 35 | sp1->set_type(ModelProto::SentencePiece::UNKNOWN); 36 | sp1->set_piece(""); 37 | sp2->set_type(ModelProto::SentencePiece::CONTROL); 38 | sp2->set_piece(""); 39 | sp3->set_type(ModelProto::SentencePiece::CONTROL); 40 | sp3->set_piece(""); 41 | 42 | return model_proto; 43 | } 44 | 45 | void AddPiece(ModelProto *model_proto, const std::string &piece, 46 | float score = 0.0) { 47 | auto *sp = model_proto->add_pieces(); 48 | sp->set_piece(piece); 49 | sp->set_score(score); 50 | } 51 | 52 | TEST(WordModelTest, EncodeTest) { 53 | ModelProto model_proto = 
MakeBaseModelProto(); 54 | 55 | AddPiece(&model_proto, WS "ab"); 56 | AddPiece(&model_proto, WS "cd"); 57 | AddPiece(&model_proto, WS "abc"); 58 | AddPiece(&model_proto, WS "a", 0.1); 59 | AddPiece(&model_proto, WS "b", 0.2); 60 | AddPiece(&model_proto, WS "c", 0.3); 61 | AddPiece(&model_proto, WS "d", 0.4); 62 | 63 | const Model model(model_proto); 64 | 65 | EncodeResult result; 66 | 67 | result = model.Encode(""); 68 | EXPECT_TRUE(result.empty()); 69 | 70 | result = model.Encode(WS "a" WS "b" WS "c"); 71 | EXPECT_EQ(3, result.size()); 72 | EXPECT_EQ(WS "a", result[0].first); 73 | EXPECT_EQ(WS "b", result[1].first); 74 | EXPECT_EQ(WS "c", result[2].first); 75 | 76 | result = model.Encode(WS "ab" WS "cd" WS "abc"); 77 | EXPECT_EQ(3, result.size()); 78 | EXPECT_EQ(WS "ab", result[0].first); 79 | EXPECT_EQ(WS "cd", result[1].first); 80 | EXPECT_EQ(WS "abc", result[2].first); 81 | } 82 | 83 | TEST(WordModelTest, NotSupportedTest) { 84 | ModelProto model_proto = MakeBaseModelProto(); 85 | const Model model(model_proto); 86 | EXPECT_EQ(NBestEncodeResult(), model.NBestEncode("test", 10)); 87 | EXPECT_EQ(EncodeResult(), model.SampleEncode("test", 0.1)); 88 | } 89 | 90 | } // namespace 91 | } // namespace word 92 | } // namespace sentencepiece 93 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/implicit_weak_message.cc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 
11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | 31 | #include 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include 39 | 40 | namespace google { 41 | namespace protobuf { 42 | namespace internal { 43 | 44 | const char* ImplicitWeakMessage::_InternalParse(const char* ptr, 45 | ParseContext* ctx) { 46 | return ctx->AppendString(ptr, &data_); 47 | } 48 | 49 | ExplicitlyConstructed 50 | implicit_weak_message_default_instance; 51 | internal::once_flag implicit_weak_message_once_init_; 52 | 53 | void InitImplicitWeakMessageDefaultInstance() { 54 | implicit_weak_message_default_instance.DefaultConstruct(); 55 | } 56 | 57 | const ImplicitWeakMessage* ImplicitWeakMessage::default_instance() { 58 | internal::call_once(implicit_weak_message_once_init_, 59 | InitImplicitWeakMessageDefaultInstance); 60 | return &implicit_weak_message_default_instance.get(); 61 | } 62 | 63 | } // namespace internal 64 | } // namespace protobuf 65 | } // namespace google 66 | -------------------------------------------------------------------------------- /third_party/absl/strings/str_split.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // 16 | #ifndef ABSL_STRINGS_STR_SPLIT_H_ 17 | #define ABSL_STRINGS_STR_SPLIT_H_ 18 | 19 | #include 20 | #include 21 | 22 | #include "third_party/absl/strings/string_view.h" 23 | 24 | namespace absl { 25 | namespace internal { 26 | 27 | class Splitter { 28 | public: 29 | Splitter(absl::string_view str, absl::string_view delim, bool allow_empty) { 30 | size_t current_pos = 0; 31 | size_t found_pos = 0; 32 | while ((found_pos = str.find_first_of(delim, current_pos)) != 33 | absl::string_view::npos) { 34 | if ((allow_empty && found_pos >= current_pos) || 35 | (!allow_empty && found_pos > current_pos)) { 36 | result_.push_back(str.substr(current_pos, found_pos - current_pos)); 37 | } 38 | current_pos = found_pos + 1; 39 | } 40 | if (str.size() > current_pos) { 41 | result_.push_back(str.substr(current_pos, str.size() - current_pos)); 42 | } 43 | } 44 | template 45 | operator std::vector() const; 46 | 47 | using const_iterator = std::vector::const_iterator; 48 | const_iterator begin() const { return result_.begin(); } 49 | const_iterator end() const { return result_.end(); } 50 | 51 | private: 52 | std::vector result_; 53 | }; 54 | 55 | template <> 56 | inline Splitter::operator std::vector() const { 57 | std::vector x(result_.size()); 58 | for (size_t i = 0; i < x.size(); ++i) 59 | x[i].assign(result_[i].data(), result_[i].size()); 60 | return x; 61 | } 62 | 63 | template <> 64 | inline Splitter::operator std::vector() const { 65 | return result_; 66 | } 67 | } // namespace internal 68 | 69 | inline constexpr bool AllowEmpty() { return true; }; 70 | 71 | inline internal::Splitter StrSplit(absl::string_view str, 72 | absl::string_view delim, 73 | bool allow_empty = false) { 74 | return internal::Splitter(str, delim, allow_empty); 75 | } 76 | 77 | inline internal::Splitter StrSplit(absl::string_view str, const char c, 78 | bool allow_empty = false) { 79 | char delim[2]; 80 | delim[0] = c; 81 | delim[1] = '\0'; 82 | return internal::Splitter(str, delim, allow_empty); 
83 | } 84 | 85 | } // namespace absl 86 | #endif // ABSL_STRINGS_STR_SPLIT_H_ 87 | -------------------------------------------------------------------------------- /python/make_py_wheel_mac.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 Google Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.! 15 | 16 | set -e # exit immediately on error 17 | set -x # display all commands 18 | 19 | build_python() { 20 | VERSION=$1 21 | URL=$2 22 | INSTALL_PATH="/Library/Frameworks/Python.framework/Versions/${VERSION}/bin" 23 | CURRENT_PATH=${PATH} 24 | 25 | curl -L -o python.pkg ${URL} 26 | sudo installer -pkg python.pkg -target / 27 | 28 | if [ -f "${INSTALL_PATH}/python3" ]; then 29 | ln -s ${INSTALL_PATH}/python3 ${INSTALL_PATH}/python 30 | ln -s ${INSTALL_PATH}/python3-config ${INSTALL_PATH}/python-config 31 | ln -s ${INSTALL_PATH}/pip3 ${INSTALL_PATH}/pip 32 | fi 33 | 34 | export PATH="${INSTALL_PATH}:${CURRENT_PATH}" 35 | ls -l ${INSTALL_PATH} 36 | which python 37 | which pip 38 | python --version 39 | sudo python get-pip.py --no-setuptools --no-wheel --ignore-installed 40 | pip install --upgrade setuptools 41 | pip install wheel 42 | pip install delocate 43 | python setup.py clean 44 | python setup.py bdist_wheel --plat-name=macosx_10_6_x86_64 45 | python setup.py test 46 | delocate-listdeps dist/*.whl 47 | delocate-wheel -w dist/delocated_wheel dist/*.whl 48 | 
export PATH="${CURRENT_PATH}" 49 | 50 | ls -l dist/delocated_wheel 51 | rm -fr build 52 | rm -fr *.so 53 | rm -fr dist/*.whl 54 | rm -fr python.pkg 55 | } 56 | 57 | build() { 58 | cd python 59 | rm -fr build 60 | mkdir -p build 61 | cd build 62 | 63 | # Install sentencepiece 64 | cmake ../.. -DSPM_ENABLE_SHARED=OFF -DSPM_NO_THREADLOCAL=ON 65 | make -j4 VERBOSE=1 66 | make install 67 | cd .. 68 | 69 | mkdir -p dist/delocated_wheel 70 | 71 | # build_python 2.7 https://www.python.org/ftp/python/2.7.15/python-2.7.15-macosx10.6.pkg 72 | # latest pip doesn't support Py3.4 73 | # build_python 3.4 https://www.python.org/ftp/python/3.4.4/python-3.4.4-macosx10.6.pkg 74 | curl -L -O https://bootstrap.pypa.io/pip/3.5/get-pip.py 75 | build_python 3.5 https://www.python.org/ftp/python/3.5.4/python-3.5.4-macosx10.6.pkg 76 | 77 | curl -L -O https://bootstrap.pypa.io/get-pip.py 78 | build_python 3.6 https://www.python.org/ftp/python/3.6.6/python-3.6.6-macosx10.6.pkg 79 | build_python 3.7 https://www.python.org/ftp/python/3.7.9/python-3.7.9-macosx10.9.pkg 80 | build_python 3.8 https://www.python.org/ftp/python/3.8.6/python-3.8.6-macosx10.9.pkg 81 | build_python 3.9 https://www.python.org/ftp/python/3.9.0/python-3.9.0-macosx10.9.pkg 82 | 83 | cd .. 84 | 85 | rm -fr build 86 | } 87 | 88 | build 89 | -------------------------------------------------------------------------------- /src/pretokenizer_for_training_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | #include "pretokenizer_for_training.h" 15 | #include "testharness.h" 16 | #include "third_party/absl/strings/str_cat.h" 17 | #include "trainer_interface.h" 18 | 19 | namespace sentencepiece { 20 | namespace pretokenizer { 21 | 22 | class MockPretokenizer : public PretokenizerForTrainingInterface { 23 | public: 24 | MockPretokenizer() {} 25 | ~MockPretokenizer() {} 26 | 27 | SentencePieceText Tokenize(absl::string_view text) const override { 28 | return spt_; 29 | } 30 | 31 | util::Status status() const override { return util::OkStatus(); } 32 | 33 | void SetOutput(const SentencePieceText &spt) { spt_ = spt; } 34 | 35 | private: 36 | SentencePieceText spt_; 37 | }; 38 | 39 | TEST(PretokenizerForTrainingTest, BaseTest) { 40 | MockPretokenizer mock; 41 | 42 | { 43 | SentencePieceText spt; 44 | spt.set_text("I love sentencepiece"); 45 | auto *p1 = spt.add_pieces(); 46 | p1->set_surface("I"); 47 | p1->set_begin(0); 48 | p1->set_end(1); 49 | 50 | auto *p2 = spt.add_pieces(); 51 | p2->set_surface("love"); 52 | p2->set_begin(2); 53 | p2->set_end(6); 54 | 55 | auto *p3 = spt.add_pieces(); 56 | p3->set_surface("sentence"); 57 | p3->set_begin(7); 58 | p3->set_end(15); 59 | 60 | auto *p4 = spt.add_pieces(); 61 | p4->set_surface("piece"); 62 | p4->set_begin(15); 63 | p4->set_end(20); 64 | 65 | mock.SetOutput(spt); 66 | 67 | EXPECT_EQ(absl::StrCat("I", TrainerInterface::kWSStr, "love", 68 | TrainerInterface::kWSStr, "sentence\tpiece"), 69 | mock.PreTokenize("I love sentencepiece")); 70 | } 71 | 72 | { 73 | SentencePieceText 
spt; 74 | spt.set_text("これはペンです"); 75 | auto *p1 = spt.add_pieces(); 76 | p1->set_surface("これ"); 77 | p1->set_begin(0); 78 | p1->set_end(6); 79 | 80 | auto *p2 = spt.add_pieces(); 81 | p2->set_surface("は"); 82 | p2->set_begin(6); 83 | p2->set_end(9); 84 | 85 | auto *p3 = spt.add_pieces(); 86 | p3->set_surface("ペン"); 87 | p3->set_begin(9); 88 | p3->set_end(15); 89 | 90 | auto *p4 = spt.add_pieces(); 91 | p4->set_surface("です"); 92 | p4->set_begin(15); 93 | p4->set_end(21); 94 | 95 | mock.SetOutput(spt); 96 | 97 | EXPECT_EQ("これ\tは\tペン\tです", mock.PreTokenize("これはペンです")); 98 | } 99 | } 100 | 101 | } // namespace pretokenizer 102 | } // namespace sentencepiece 103 | -------------------------------------------------------------------------------- /doc/normalization.md: -------------------------------------------------------------------------------- 1 | # Use custom normalization rule 2 | By default, SentencePiece normalizes the input sentence with a variant of Unicode 3 | [NFKC](https://en.wikipedia.org/wiki/Unicode_equivalence). 4 | 5 | SentencePiece allows us to define custom normalization rule, which is stored in the model file. 6 | 7 | ## Use pre-defined normalization rule 8 | SentencePiece provides the following pre-defined normalization rule. It is recommended to use one of them unless you have any special reasons. 9 | 10 | * **nmt_nfkc**: [NFKC](https://en.wikipedia.org/wiki/Unicode_equivalence) normalization with some additional normalization around spaces. (default) 11 | * **nfkc**: original NFKC normalization. 12 | * **nmt_nfkc_cf**: nmt_nfkc + [Unicode case folding](https://www.w3.org/International/wiki/Case_folding) (mostly lower casing) 13 | * **nfkc_cf**: nfkc + [Unicode case folding](https://www.w3.org/International/wiki/Case_folding). 14 | * **identity**: no normalization 15 | 16 | You can choose the normalization rule with `--normalization_rule_name` flag. 
17 | ``` 18 | % spm_train --normalization_rule_name=identity --input=<input> --model_prefix=<model_name> --vocab_size=8000 19 | ``` 20 | 21 | NOTE: Due to a limitation of the normalization algorithm, full NFKC normalization is not implemented. [builder.h] describes example character sequences not normalized by our NFKC implementation. 22 | 23 | The difference between **nmt_nfkc** and **nfkc** can be found via the ```diff -u data/nfkc.tsv data/nmt_nfkc.tsv``` command. 24 | 25 | ## Use custom normalization rule 26 | The normalization is performed with user-defined string-to-string mappings and leftmost longest matching. 27 | 28 | You can use a custom normalization rule by preparing a TSV file formatted as follows: 29 | ``` 30 | 41 302 300 1EA6 31 | 41 302 301 1EA4 32 | 41 302 303 1EAA 33 | ... 34 | ``` 35 | In this sample, the UCS4 sequence [41 302 300] (hex) is converted into [1EA6] (hex). When there are ambiguities in the conversions, the longest rule is used. 36 | Note that a tab is used as the delimiter between the source and target sequences, and a space is used as the delimiter between UCS4 characters. We can make the target sequence empty to remove some specific characters from the text. 37 | See [data/nfkc.tsv](../data/nfkc.tsv) as an example. Once a TSV file is prepared, you can specify it with the `--normalization_rule_tsv` flag. 38 | ``` 39 | % spm_train --normalization_rule_tsv=<rule_tsv> --input=<input> --model_prefix=<model_name> --vocab_size=8000 40 | ``` 41 | 42 | `<model_name>.model` embeds the normalization rule, so the same normalization rule is applied whenever `<model_name>.model` is used. 43 | 44 | 45 | ## Command line tool to perform normalization 46 | ``` 47 | % spm_normalize --model=<model_file> file1 file2.. 48 | % spm_normalize --normalization_rule_tsv=custom.tsv file1 file2.. 49 | ``` 50 | The first command line uses the normalization rule embedded in the model file. The second command line uses the normalization rule in the TSV file and is useful for developing a normalization rule interactively.
51 | -------------------------------------------------------------------------------- /src/unigram_model_trainer_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "sentencepiece_model.pb.h" 16 | #include "sentencepiece_processor.h" 17 | #include "sentencepiece_trainer.h" 18 | #include "testharness.h" 19 | #include "third_party/absl/strings/str_cat.h" 20 | #include "third_party/absl/strings/str_join.h" 21 | #include "unigram_model_trainer.h" 22 | #include "util.h" 23 | 24 | namespace sentencepiece { 25 | namespace unigram { 26 | namespace { 27 | 28 | // Space symbol 29 | #define WS "\xe2\x96\x81" 30 | 31 | TEST(UnigramTrainerTest, TrainerModelTest) { 32 | TrainerSpec trainer_spec; 33 | NormalizerSpec normalizer_spec; 34 | const TrainerModel model(trainer_spec, normalizer_spec); 35 | EXPECT_EQ(EncodeResult(), model.Encode("test")); 36 | } 37 | 38 | static constexpr char kTestInputData[] = "wagahaiwa_nekodearu.txt"; 39 | 40 | TEST(UnigramTrainerTest, EndToEndTest) { 41 | const std::string input = 42 | util::JoinPath(absl::GetFlag(FLAGS_test_srcdir), kTestInputData); 43 | 44 | ASSERT_TRUE( 45 | SentencePieceTrainer::Train( 46 | absl::StrCat( 47 | "--model_prefix=", 48 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "tmp_model"), 49 | " --input=", input, 50 | " --vocab_size=8000 
--normalization_rule_name=identity", 51 | " --model_type=unigram --user_defined_symbols=", 52 | " --control_symbols= --max_sentence_length=2048")) 53 | .ok()); 54 | 55 | SentencePieceProcessor sp; 56 | EXPECT_TRUE(sp.Load(util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), 57 | "tmp_model.model")) 58 | .ok()); 59 | EXPECT_EQ(8000, sp.GetPieceSize()); 60 | 61 | const int cid = sp.PieceToId(""); 62 | const int uid = sp.PieceToId(""); 63 | EXPECT_TRUE(sp.IsControl(cid)); 64 | EXPECT_FALSE(sp.IsUnknown(uid)); 65 | 66 | std::vector tok; 67 | 68 | EXPECT_TRUE(sp.Encode("", &tok).ok()); 69 | EXPECT_TRUE(tok.empty()); 70 | 71 | EXPECT_TRUE(sp.Encode("吾輩《わがはい》は猫である。名前はまだ無い。" 72 | "どこで生れたかとんと見当《けんとう》がつかぬ。" 73 | "何でも薄暗いじめじめした所でニャーニャー泣いていた事だ" 74 | "けは記憶している" 75 | "。", 76 | &tok) 77 | .ok()); 78 | // TODO(taku): Temporally disable this test on Windows. 79 | #ifndef OS_WIN 80 | EXPECT_EQ(WS 81 | " 吾輩 《 わが はい 》 は 猫 である 。 名前 はまだ 無い 。 " 82 | "どこ で 生 れた か とん と 見当 《 けん とう 》 が つか ぬ 。 " 83 | "何でも 薄 暗 い じめ じめ した 所で ニャーニャー " 84 | "泣 い ていた 事 だけは 記憶 している 。", 85 | absl::StrJoin(tok, " ")); 86 | #endif 87 | } 88 | 89 | } // namespace 90 | } // namespace unigram 91 | } // namespace sentencepiece 92 | -------------------------------------------------------------------------------- /python/add_new_vocab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### You can add new special tokens to pre-trained sentencepiece model\n", 8 | "#### Run this code in google/sentencepiece/python/" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Load pre-trained sentencepiece model\n", 16 | "Pre-trained model is needed" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "371391" 28 | ] 29 | }, 30 | "execution_count": 1, 31 
| "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "import sentencepiece_model_pb2 as model\n", 37 | "m = model.ModelProto()\n", 38 | "m.ParseFromString(open(\"old.model\", \"rb\").read())" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "### Load the tokens you want to add\n", 46 | "Prepare the list of new tokens you want to add" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "['[UNK]',\n", 58 | " '[PAD]',\n", 59 | " '[CLS]',\n", 60 | " '[SEP]',\n", 61 | " '[MASK]',\n", 62 | " '[EOS]',\n", 63 | " '[DOMAIN]',\n", 64 | " '[SLOT]',\n", 65 | " '[ACTION]']" 66 | ] 67 | }, 68 | "execution_count": 2, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "special_tokens = open(\"special_tokens.txt\", \"r\").read().split(\"\\n\")\n", 75 | "special_tokens" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "### Add new tokens to sentencepiece model" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 3, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "for token in special_tokens:\n", 92 | " new_token = model.ModelProto().SentencePiece()\n", 93 | " new_token.piece = token\n", 94 | " new_token.score = 0\n", 95 | " m.pieces.append(new_token)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Save new sentencepiece model\n", 103 | "Load the new sentencepiece model to your NLP system" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 4, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "with open('new.model', 'wb') as f:\n", 113 | " f.write(m.SerializeToString())" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3", 120 |
"language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.6.10" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 4 138 | } 139 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/google/protobuf/stubs/stl_util.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // from google3/util/gtl/stl_util.h 32 | 33 | #ifndef GOOGLE_PROTOBUF_STUBS_STL_UTIL_H__ 34 | #define GOOGLE_PROTOBUF_STUBS_STL_UTIL_H__ 35 | 36 | #include 37 | 38 | namespace google { 39 | namespace protobuf { 40 | 41 | // Inside Google, this function implements a horrible, disgusting hack in which 42 | // we reach into the string's private implementation and resize it without 43 | // initializing the new bytes. In some cases doing this can significantly 44 | // improve performance. However, since it's totally non-portable it has no 45 | // place in open source code. Feel free to fill this function in with your 46 | // own disgusting hack if you want the perf boost. 47 | inline void STLStringResizeUninitialized(std::string* s, size_t new_size) { 48 | s->resize(new_size); 49 | } 50 | 51 | // Return a mutable char* pointing to a string's internal buffer, 52 | // which may not be null-terminated. Writing through this pointer will 53 | // modify the string. 54 | // 55 | // string_as_array(&str)[i] is valid for 0 <= i < str.size() until the 56 | // next call to a string method that invalidates iterators. 57 | // 58 | // As of 2006-04, there is no standard-blessed way of getting a 59 | // mutable reference to a string's internal buffer. However, issue 530 60 | // (http://www.open-std.org/JTC1/SC22/WG21/docs/lwg-active.html#530) 61 | // proposes this as the method. 
According to Matt Austern, this should 62 | // already work on all current implementations. 63 | inline char* string_as_array(std::string* str) { 64 | // DO NOT USE const_cast(str->data())! See the unittest for why. 65 | return str->empty() ? nullptr : &*str->begin(); 66 | } 67 | 68 | } // namespace protobuf 69 | } // namespace google 70 | 71 | #endif // GOOGLE_PROTOBUF_STUBS_STL_UTIL_H__ 72 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/google/protobuf/generated_enum_util.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | #ifndef GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ 32 | #define GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ 33 | 34 | #include 35 | 36 | #include 37 | #include 38 | 39 | #include 40 | 41 | #ifdef SWIG 42 | #error "You cannot SWIG proto headers" 43 | #endif 44 | 45 | namespace google { 46 | namespace protobuf { 47 | 48 | // This type trait can be used to cause templates to only match proto2 enum 49 | // types. 50 | template 51 | struct is_proto_enum : ::std::false_type {}; 52 | 53 | namespace internal { 54 | 55 | // The table entry format for storing enum name-to-value mapping used with lite 56 | // protos. This struct and the following related functions should only be used 57 | // by protobuf generated code. 58 | struct EnumEntry { 59 | StringPiece name; 60 | int value; 61 | }; 62 | 63 | // Looks up a numeric enum value given the string name. 64 | PROTOBUF_EXPORT bool LookUpEnumValue(const EnumEntry* enums, size_t size, 65 | StringPiece name, int* value); 66 | 67 | // Looks up an enum name given the numeric value. 68 | PROTOBUF_EXPORT int LookUpEnumName(const EnumEntry* enums, 69 | const int* sorted_indices, size_t size, 70 | int value); 71 | 72 | // Initializes the list of enum names in std::string form. 
73 | PROTOBUF_EXPORT bool InitializeEnumStrings( 74 | const EnumEntry* enums, const int* sorted_indices, size_t size, 75 | internal::ExplicitlyConstructed* enum_strings); 76 | 77 | } // namespace internal 78 | } // namespace protobuf 79 | } // namespace google 80 | 81 | #include 82 | 83 | #endif // GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ 84 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/google/protobuf/stubs/time.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | #ifndef GOOGLE_PROTOBUF_STUBS_TIME_H_ 31 | #define GOOGLE_PROTOBUF_STUBS_TIME_H_ 32 | 33 | #include 34 | 35 | #include 36 | 37 | namespace google { 38 | namespace protobuf { 39 | namespace internal { 40 | 41 | struct DateTime { 42 | int year; 43 | int month; 44 | int day; 45 | int hour; 46 | int minute; 47 | int second; 48 | }; 49 | 50 | // Converts a timestamp (seconds elapsed since 1970-01-01T00:00:00, could be 51 | // negative to represent time before 1970-01-01) to DateTime. Returns false 52 | // if the timestamp is not in the range between 0001-01-01T00:00:00 and 53 | // 9999-12-31T23:59:59. 54 | bool PROTOBUF_EXPORT SecondsToDateTime(int64 seconds, DateTime* time); 55 | // Converts DateTime to a timestamp (seconds since 1970-01-01T00:00:00). 56 | // Returns false if the DateTime is not valid or is not in the valid range. 57 | bool PROTOBUF_EXPORT DateTimeToSeconds(const DateTime& time, int64* seconds); 58 | 59 | void PROTOBUF_EXPORT GetCurrentTime(int64* seconds, int32* nanos); 60 | 61 | // Formats a time string in RFC3339 format. 62 | // 63 | // For example, "2015-05-20T13:29:35.120Z". For nanos, 0, 3, 6 or 9 fractional 64 | // digits will be used depending on how many are required to represent the exact 65 | // value. 66 | // 67 | // Note that "nanos" must in the range of [0, 999999999]. 68 | std::string PROTOBUF_EXPORT FormatTime(int64 seconds, int32 nanos); 69 | // Parses a time string. 
This method accepts RFC3339 date/time string with UTC 70 | // offset. For example, "2015-05-20T13:29:35.120-08:00". 71 | bool PROTOBUF_EXPORT ParseTime(const std::string& value, int64* seconds, 72 | int32* nanos); 73 | 74 | } // namespace internal 75 | } // namespace protobuf 76 | } // namespace google 77 | 78 | #include 79 | 80 | #endif // GOOGLE_PROTOBUF_STUBS_TIME_H_ 81 | -------------------------------------------------------------------------------- /src/unicode_script.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #ifndef UNICODE_SCRIPT_H_ 16 | #define UNICODE_SCRIPT_H_ 17 | 18 | #include "common.h" 19 | 20 | namespace sentencepiece { 21 | namespace unicode_script { 22 | enum ScriptType { 23 | U_Adlam, 24 | U_Ahom, 25 | U_Anatolian_Hieroglyphs, 26 | U_Arabic, 27 | U_Armenian, 28 | U_Avestan, 29 | U_Balinese, 30 | U_Bamum, 31 | U_Bassa_Vah, 32 | U_Batak, 33 | U_Bengali, 34 | U_Bhaiksuki, 35 | U_Bopomofo, 36 | U_Brahmi, 37 | U_Braille, 38 | U_Buginese, 39 | U_Buhid, 40 | U_Canadian_Aboriginal, 41 | U_Carian, 42 | U_Caucasian_Albanian, 43 | U_Chakma, 44 | U_Cham, 45 | U_Cherokee, 46 | U_Common, 47 | U_Coptic, 48 | U_Cuneiform, 49 | U_Cypriot, 50 | U_Cyrillic, 51 | U_Deseret, 52 | U_Devanagari, 53 | U_Duployan, 54 | U_Egyptian_Hieroglyphs, 55 | U_Elbasan, 56 | U_Ethiopic, 57 | U_Georgian, 58 | U_Glagolitic, 59 | U_Gothic, 60 | U_Grantha, 61 | U_Greek, 62 | U_Gujarati, 63 | U_Gurmukhi, 64 | U_Han, 65 | U_Hangul, 66 | U_Hanunoo, 67 | U_Hatran, 68 | U_Hebrew, 69 | U_Hiragana, 70 | U_Imperial_Aramaic, 71 | U_Inherited, 72 | U_Inscriptional_Pahlavi, 73 | U_Inscriptional_Parthian, 74 | U_Javanese, 75 | U_Kaithi, 76 | U_Kannada, 77 | U_Katakana, 78 | U_Kayah_Li, 79 | U_Kharoshthi, 80 | U_Khmer, 81 | U_Khojki, 82 | U_Khudawadi, 83 | U_Lao, 84 | U_Latin, 85 | U_Lepcha, 86 | U_Limbu, 87 | U_Linear_A, 88 | U_Linear_B, 89 | U_Lisu, 90 | U_Lycian, 91 | U_Lydian, 92 | U_Mahajani, 93 | U_Malayalam, 94 | U_Mandaic, 95 | U_Manichaean, 96 | U_Marchen, 97 | U_Meetei_Mayek, 98 | U_Mende_Kikakui, 99 | U_Meroitic_Cursive, 100 | U_Meroitic_Hieroglyphs, 101 | U_Miao, 102 | U_Modi, 103 | U_Mongolian, 104 | U_Mro, 105 | U_Multani, 106 | U_Myanmar, 107 | U_Nabataean, 108 | U_New_Tai_Lue, 109 | U_Newa, 110 | U_Nko, 111 | U_Ogham, 112 | U_Ol_Chiki, 113 | U_Old_Hungarian, 114 | U_Old_Italic, 115 | U_Old_North_Arabian, 116 | U_Old_Permic, 117 | U_Old_Persian, 118 | U_Old_South_Arabian, 119 | U_Old_Turkic, 120 | U_Oriya, 121 | U_Osage, 122 | U_Osmanya, 123 | U_Pahawh_Hmong, 124 | U_Palmyrene, 125 | 
U_Pau_Cin_Hau, 126 | U_Phags_Pa, 127 | U_Phoenician, 128 | U_Psalter_Pahlavi, 129 | U_Rejang, 130 | U_Runic, 131 | U_Samaritan, 132 | U_Saurashtra, 133 | U_Sharada, 134 | U_Shavian, 135 | U_Siddham, 136 | U_SignWriting, 137 | U_Sinhala, 138 | U_Sora_Sompeng, 139 | U_Sundanese, 140 | U_Syloti_Nagri, 141 | U_Syriac, 142 | U_Tagalog, 143 | U_Tagbanwa, 144 | U_Tai_Le, 145 | U_Tai_Tham, 146 | U_Tai_Viet, 147 | U_Takri, 148 | U_Tamil, 149 | U_Tangut, 150 | U_Telugu, 151 | U_Thaana, 152 | U_Thai, 153 | U_Tibetan, 154 | U_Tifinagh, 155 | U_Tirhuta, 156 | U_Ugaritic, 157 | U_Vai, 158 | U_Warang_Citi, 159 | U_Yi 160 | }; 161 | 162 | ScriptType GetScript(char32 c); 163 | } // namespace unicode_script 164 | } // namespace sentencepiece 165 | #endif // UNICODE_SCRIPT_H_ 166 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copyright 2018 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.! 16 | 17 | set -e # exit immediately on error 18 | set -x # display all commands 19 | 20 | setup_ubuntu() { 21 | export DEBIAN_FRONTEND=noninteractive 22 | apt-get update 23 | apt-get install -y build-essential cmake git pkg-config python3-pip 24 | pip3 install --upgrade pip 25 | 26 | export PATH="/usr/local/bin:$PATH" 27 | 28 | . 
/etc/os-release 29 | if [ "${VERSION_ID}" = "14.04" ]; then 30 | apt-get install -y cmake3 python-dev 31 | fi 32 | } 33 | 34 | setup_debian() { 35 | setup_ubuntu 36 | } 37 | 38 | setup_fedora() { 39 | dnf update -y 40 | dnf install -y rpm-build gcc-c++ make cmake pkg-config python-pip python-devel 41 | } 42 | 43 | build_generic() { 44 | mkdir -p build 45 | cd build 46 | cmake .. -DSPM_BUILD_TEST=ON 47 | make -j2 48 | make CTEST_OUTPUT_ON_FAILURE=1 test 49 | make package_source 50 | cd .. 51 | } 52 | 53 | build_python() { 54 | cd build 55 | make install 56 | cd .. 57 | export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64:$LD_LIBRARY_PATH 58 | export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig:/usr/local/lib64/pkgconfig 59 | ldconfig -v 60 | cd python 61 | python3 setup.py test 62 | cd .. 63 | } 64 | 65 | build_linux_gcc_coverall_ubuntu() { 66 | setup_debian 67 | apt-get install -y lcov 68 | pip3 install cpp-coveralls 69 | pip3 install 'requests[security]' 70 | build_generic 71 | build_python 72 | mkdir -p build 73 | cd build 74 | cmake .. -DSPM_COVERAGE=ON 75 | make -j2 76 | make coverage 77 | coveralls --exclude-pattern '.*(include|usr|test|third_party|pb|_main).*' --gcov-options '\-lp' --gcov gcov 78 | cd .. 
79 | } 80 | 81 | build_linux_gcc_ubuntu() { 82 | setup_ubuntu 83 | build_generic 84 | build_python 85 | } 86 | 87 | build_linux_gcc_ubuntu_i386() { 88 | setup_ubuntu 89 | build_generic 90 | build_python 91 | } 92 | 93 | build_linux_gcc_debian() { 94 | setup_debian 95 | build_generic 96 | build_python 97 | } 98 | 99 | build_linux_gcc_fedora() { 100 | setup_fedora 101 | build_generic 102 | build_python 103 | } 104 | 105 | build_linux_clang_ubuntu() { 106 | setup_ubuntu 107 | apt-get install -y clang 108 | export CXX="clang++" CC="clang" 109 | build_generic 110 | rm -fr build 111 | } 112 | 113 | build_osx() { 114 | # brew update 115 | # brew install protobuf || brew link --overwrite protobuf 116 | # brew link --overwrite python@2 117 | build_generic 118 | # cd build 119 | # make install 120 | } 121 | 122 | run_docker() { 123 | docker pull "$1" 124 | docker run -e COVERALLS_REPO_TOKEN=${COVERALLS_REPO_TOKEN} --rm -ti --name travis-ci -v `pwd`:/sentencepiece -w /sentencepiece -td "$1" /bin/bash 125 | docker exec travis-ci bash -c "./test.sh native $2" 126 | docker stop travis-ci 127 | } 128 | 129 | ## main 130 | if [ "$#" -ne 2 ]; then 131 | echo "sh test.sh ." 132 | echo "when is native, runs command natively without docker." 133 | exit 134 | fi 135 | 136 | if [ "$1" = "native" ]; then 137 | eval "$2" 138 | else 139 | run_docker $1 $2 140 | fi 141 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/generated_enum_util.cc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 
3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | 31 | #include 32 | 33 | #include 34 | 35 | #include 36 | 37 | namespace google { 38 | namespace protobuf { 39 | namespace internal { 40 | namespace { 41 | 42 | bool EnumCompareByName(const EnumEntry& a, const EnumEntry& b) { 43 | return StringPiece(a.name) < StringPiece(b.name); 44 | } 45 | 46 | // Gets the numeric value of the EnumEntry at the given index, but returns a 47 | // special value for the index -1. This gives a way to use std::lower_bound on a 48 | // sorted array of indices while searching for value that we associate with -1. 49 | int GetValue(const EnumEntry* enums, int i, int target) { 50 | if (i == -1) { 51 | return target; 52 | } else { 53 | return enums[i].value; 54 | } 55 | } 56 | 57 | } // namespace 58 | 59 | bool LookUpEnumValue(const EnumEntry* enums, size_t size, 60 | StringPiece name, int* value) { 61 | EnumEntry target{name, 0}; 62 | auto it = std::lower_bound(enums, enums + size, target, EnumCompareByName); 63 | if (it != enums + size && it->name == name) { 64 | *value = it->value; 65 | return true; 66 | } 67 | return false; 68 | } 69 | 70 | int LookUpEnumName(const EnumEntry* enums, const int* sorted_indices, 71 | size_t size, int value) { 72 | auto comparator = [enums, value](int a, int b) { 73 | return GetValue(enums, a, value) < GetValue(enums, b, value); 74 | }; 75 | auto it = 76 | std::lower_bound(sorted_indices, sorted_indices + size, -1, comparator); 77 | if (it != sorted_indices + size && enums[*it].value == value) { 78 | return it - sorted_indices; 79 | } 80 | return -1; 81 | } 82 | 83 | bool InitializeEnumStrings( 84 | const EnumEntry* enums, const int* sorted_indices, size_t size, 85 | internal::ExplicitlyConstructed* enum_strings) { 86 | for (int i = 0; i < size; ++i) { 87 | enum_strings[i].Construct(enums[sorted_indices[i]].name); 88 | internal::OnShutdownDestroyString(enum_strings[i].get_mutable()); 89 | } 90 | return true; 91 | } 92 | 93 | } // namespace internal 94 | } // namespace protobuf 95 | } // namespace google 
96 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/google/protobuf/stubs/stringprintf.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2012 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // from google3/base/stringprintf.h 32 | // 33 | // Printf variants that place their output in a C++ string. 34 | // 35 | // Usage: 36 | // string result = StringPrintf("%d %s\n", 10, "hello"); 37 | // SStringPrintf(&result, "%d %s\n", 10, "hello"); 38 | // StringAppendF(&result, "%d %s\n", 20, "there"); 39 | 40 | #ifndef GOOGLE_PROTOBUF_STUBS_STRINGPRINTF_H 41 | #define GOOGLE_PROTOBUF_STUBS_STRINGPRINTF_H 42 | 43 | #include 44 | #include 45 | #include 46 | 47 | #include 48 | 49 | #include 50 | 51 | namespace google { 52 | namespace protobuf { 53 | 54 | // Return a C++ string 55 | PROTOBUF_EXPORT extern std::string StringPrintf(const char* format, ...); 56 | 57 | // Store result into a supplied string and return it 58 | PROTOBUF_EXPORT extern const std::string& SStringPrintf(std::string* dst, 59 | const char* format, 60 | ...); 61 | 62 | // Append result to a supplied string 63 | PROTOBUF_EXPORT extern void StringAppendF(std::string* dst, const char* format, 64 | ...); 65 | 66 | // Lower-level routine that takes a va_list and appends to a specified 67 | // string. All other routines are just convenience wrappers around it. 
68 | PROTOBUF_EXPORT extern void StringAppendV(std::string* dst, const char* format, 69 | va_list ap); 70 | 71 | // The max arguments supported by StringPrintfVector 72 | PROTOBUF_EXPORT extern const int kStringPrintfVectorMaxArgs; 73 | 74 | // You can use this version when all your arguments are strings, but 75 | // you don't know how many arguments you'll have at compile time. 76 | // StringPrintfVector will LOG(FATAL) if v.size() > kStringPrintfVectorMaxArgs 77 | PROTOBUF_EXPORT extern std::string StringPrintfVector( 78 | const char* format, const std::vector& v); 79 | 80 | } // namespace protobuf 81 | } // namespace google 82 | 83 | #include 84 | 85 | #endif // GOOGLE_PROTOBUF_STUBS_STRINGPRINTF_H 86 | -------------------------------------------------------------------------------- /src/char_model_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include 16 | 17 | #include "char_model.h" 18 | #include "testharness.h" 19 | #include "util.h" 20 | 21 | namespace sentencepiece { 22 | namespace character { 23 | namespace { 24 | 25 | // Space symbol (U+2581) 26 | #define WS "\xe2\x96\x81" 27 | 28 | ModelProto MakeBaseModelProto() { 29 | ModelProto model_proto; 30 | auto *sp1 = model_proto.add_pieces(); 31 | auto *sp2 = model_proto.add_pieces(); 32 | auto *sp3 = model_proto.add_pieces(); 33 | 34 | sp1->set_type(ModelProto::SentencePiece::UNKNOWN); 35 | sp1->set_piece(""); 36 | sp2->set_type(ModelProto::SentencePiece::CONTROL); 37 | sp2->set_piece(""); 38 | sp3->set_type(ModelProto::SentencePiece::CONTROL); 39 | sp3->set_piece(""); 40 | 41 | return model_proto; 42 | } 43 | 44 | void AddPiece(ModelProto *model_proto, const std::string &piece, 45 | float score = 0.0) { 46 | auto *sp = model_proto->add_pieces(); 47 | sp->set_piece(piece); 48 | sp->set_score(score); 49 | } 50 | 51 | TEST(ModelTest, EncodeTest) { 52 | ModelProto model_proto = MakeBaseModelProto(); 53 | 54 | AddPiece(&model_proto, WS, 0.0); 55 | AddPiece(&model_proto, "a", 0.1); 56 | AddPiece(&model_proto, "b", 0.2); 57 | AddPiece(&model_proto, "c", 0.3); 58 | AddPiece(&model_proto, "d", 0.4); 59 | AddPiece(&model_proto, "ABC", 0.4); 60 | model_proto.mutable_pieces(8)->set_type( 61 | ModelProto::SentencePiece::USER_DEFINED); 62 | 63 | const Model model(model_proto); 64 | 65 | EncodeResult result; 66 | 67 | result = model.Encode(""); 68 | EXPECT_TRUE(result.empty()); 69 | 70 | result = model.Encode(WS "a" WS "b" WS "c"); 71 | EXPECT_EQ(6, result.size()); 72 | EXPECT_EQ(WS, result[0].first); 73 | EXPECT_EQ("a", result[1].first); 74 | EXPECT_EQ(WS, result[2].first); 75 | EXPECT_EQ("b", result[3].first); 76 | EXPECT_EQ(WS, result[4].first); 77 | EXPECT_EQ("c", result[5].first); 78 | 79 | result = model.Encode(WS "ab" WS "cd" WS "abc"); 80 | EXPECT_EQ(10, result.size()); 81 | EXPECT_EQ(WS, result[0].first); 82 | EXPECT_EQ("a", result[1].first); 83 
| EXPECT_EQ("b", result[2].first); 84 | EXPECT_EQ(WS, result[3].first); 85 | EXPECT_EQ("c", result[4].first); 86 | EXPECT_EQ("d", result[5].first); 87 | EXPECT_EQ(WS, result[6].first); 88 | EXPECT_EQ("a", result[7].first); 89 | EXPECT_EQ("b", result[8].first); 90 | EXPECT_EQ("c", result[9].first); 91 | 92 | // makes a broken utf-8 93 | const std::string broken_utf8 = std::string("あ").substr(0, 1); 94 | result = model.Encode(broken_utf8); 95 | EXPECT_EQ(1, result.size()); 96 | EXPECT_EQ(broken_utf8, result[0].first); 97 | 98 | // "ABC" is treated as one piece, as it is USER_DEFINED. 99 | result = model.Encode(WS "abABCcd"); 100 | EXPECT_EQ(6, result.size()); 101 | EXPECT_EQ(WS, result[0].first); 102 | EXPECT_EQ("a", result[1].first); 103 | EXPECT_EQ("b", result[2].first); 104 | EXPECT_EQ("ABC", result[3].first); 105 | EXPECT_EQ("c", result[4].first); 106 | EXPECT_EQ("d", result[5].first); 107 | } 108 | 109 | TEST(CharModelTest, NotSupportedTest) { 110 | ModelProto model_proto = MakeBaseModelProto(); 111 | const Model model(model_proto); 112 | EXPECT_EQ(NBestEncodeResult(), model.NBestEncode("test", 10)); 113 | EXPECT_EQ(EncodeResult(), model.SampleEncode("test", 0.1)); 114 | } 115 | 116 | } // namespace 117 | } // namespace character 118 | } // namespace sentencepiece 119 | -------------------------------------------------------------------------------- /src/filesystem.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | 17 | #include "filesystem.h" 18 | #include "third_party/absl/memory/memory.h" 19 | #include "util.h" 20 | 21 | #if defined(OS_WIN) && defined(UNICODE) && defined(_UNICODE) 22 | #define WPATH(path) (::sentencepiece::win32::Utf8ToWide(path).c_str()) 23 | #else 24 | #define WPATH(path) (path) 25 | #endif 26 | 27 | namespace sentencepiece { 28 | namespace filesystem { 29 | 30 | class PosixReadableFile : public ReadableFile { 31 | public: 32 | PosixReadableFile(absl::string_view filename, bool is_binary = false) 33 | : is_(filename.empty() 34 | ? &std::cin 35 | : new std::ifstream(WPATH(filename.data()), 36 | is_binary ? 
std::ios::binary | std::ios::in 37 | : std::ios::in)) { 38 | if (!*is_) 39 | status_ = util::StatusBuilder(util::StatusCode::kNotFound, GTL_LOC) 40 | << "\"" << filename.data() << "\": " << util::StrError(errno); 41 | } 42 | 43 | ~PosixReadableFile() { 44 | if (is_ != &std::cin) delete is_; 45 | } 46 | 47 | util::Status status() const { return status_; } 48 | 49 | bool ReadLine(std::string *line) { 50 | return static_cast(std::getline(*is_, *line)); 51 | } 52 | 53 | bool ReadAll(std::string *line) { 54 | if (is_ == &std::cin) { 55 | LOG(ERROR) << "ReadAll is not supported for stdin."; 56 | return false; 57 | } 58 | line->assign(std::istreambuf_iterator(*is_), 59 | std::istreambuf_iterator()); 60 | return true; 61 | } 62 | 63 | private: 64 | util::Status status_; 65 | std::istream *is_; 66 | }; 67 | 68 | class PosixWritableFile : public WritableFile { 69 | public: 70 | PosixWritableFile(absl::string_view filename, bool is_binary = false) 71 | : os_(filename.empty() 72 | ? &std::cout 73 | : new std::ofstream(WPATH(filename.data()), 74 | is_binary ? 
std::ios::binary | std::ios::out 75 | : std::ios::out)) { 76 | if (!*os_) 77 | status_ = 78 | util::StatusBuilder(util::StatusCode::kPermissionDenied, GTL_LOC) 79 | << "\"" << filename.data() << "\": " << util::StrError(errno); 80 | } 81 | 82 | ~PosixWritableFile() { 83 | if (os_ != &std::cout) delete os_; 84 | } 85 | 86 | util::Status status() const { return status_; } 87 | 88 | bool Write(absl::string_view text) { 89 | os_->write(text.data(), text.size()); 90 | return os_->good(); 91 | } 92 | 93 | bool WriteLine(absl::string_view text) { return Write(text) && Write("\n"); } 94 | 95 | private: 96 | util::Status status_; 97 | std::ostream *os_; 98 | }; 99 | 100 | using DefaultReadableFile = PosixReadableFile; 101 | using DefaultWritableFile = PosixWritableFile; 102 | 103 | std::unique_ptr NewReadableFile(absl::string_view filename, 104 | bool is_binary) { 105 | return absl::make_unique(filename, is_binary); 106 | } 107 | 108 | std::unique_ptr NewWritableFile(absl::string_view filename, 109 | bool is_binary) { 110 | return absl::make_unique(filename, is_binary); 111 | } 112 | 113 | } // namespace filesystem 114 | } // namespace sentencepiece 115 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/google/protobuf/has_bits.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 
11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | 31 | #ifndef GOOGLE_PROTOBUF_HAS_BITS_H__ 32 | #define GOOGLE_PROTOBUF_HAS_BITS_H__ 33 | 34 | #include 35 | #include 36 | 37 | #include 38 | 39 | #ifdef SWIG 40 | #error "You cannot SWIG proto headers" 41 | #endif 42 | 43 | namespace google { 44 | namespace protobuf { 45 | namespace internal { 46 | 47 | template 48 | class HasBits { 49 | public: 50 | constexpr HasBits() PROTOBUF_ALWAYS_INLINE : has_bits_{} {} 51 | 52 | void Clear() PROTOBUF_ALWAYS_INLINE { 53 | memset(has_bits_, 0, sizeof(has_bits_)); 54 | } 55 | 56 | uint32& operator[](int index) PROTOBUF_ALWAYS_INLINE { 57 | return has_bits_[index]; 58 | } 59 | 60 | const uint32& operator[](int index) const PROTOBUF_ALWAYS_INLINE { 61 | return has_bits_[index]; 62 | } 63 | 64 | bool operator==(const HasBits& rhs) const { 65 | return memcmp(has_bits_, rhs.has_bits_, sizeof(has_bits_)) == 0; 66 | } 67 | 68 | bool operator!=(const HasBits& rhs) const { 69 | return !(*this == rhs); 70 | } 71 | 72 | void Or(const HasBits& rhs) { 73 | for (size_t i = 0; i < doublewords; i++) has_bits_[i] |= rhs[i]; 74 | } 75 | 76 | bool empty() const; 77 | 78 | private: 79 | uint32 has_bits_[doublewords]; 80 | }; 81 | 82 | template <> 83 | inline bool HasBits<1>::empty() const { 84 | return !has_bits_[0]; 85 | } 86 | 87 | template <> 88 | inline bool HasBits<2>::empty() const { 89 | return !(has_bits_[0] | has_bits_[1]); 90 | } 91 | 92 | template <> 93 | inline bool HasBits<3>::empty() const { 94 | return !(has_bits_[0] | has_bits_[1] | has_bits_[2]); 95 | } 96 | 97 | template <> 98 | inline bool HasBits<4>::empty() const { 99 | return !(has_bits_[0] | has_bits_[1] | has_bits_[2] | has_bits_[3]); 100 | } 101 | 102 | template 103 | inline bool HasBits::empty() const { 104 | for (size_t i = 0; i < doublewords; ++i) { 105 | if (has_bits_[i]) return false; 106 | } 107 | return true; 108 | } 109 | 110 | } // namespace internal 111 | } // namespace protobuf 112 | } // namespace google 113 | 114 | #include 115 | 116 | #endif // 
GOOGLE_PROTOBUF_HAS_BITS_H__ 117 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/google/protobuf/generated_enum_reflection.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // Author: jasonh@google.com (Jason Hsueh) 32 | // 33 | // This header is logically internal, but is made public because it is used 34 | // from protocol-compiler-generated code, which may reside in other components. 35 | // It provides reflection support for generated enums, and is included in 36 | // generated .pb.h files and should have minimal dependencies. The methods are 37 | // implemented in generated_message_reflection.cc. 38 | 39 | #ifndef GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__ 40 | #define GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__ 41 | 42 | #include 43 | 44 | #include 45 | #include 46 | #include 47 | 48 | #ifdef SWIG 49 | #error "You cannot SWIG proto headers" 50 | #endif 51 | 52 | #include 53 | 54 | namespace google { 55 | namespace protobuf { 56 | class EnumDescriptor; 57 | } // namespace protobuf 58 | } // namespace google 59 | 60 | namespace google { 61 | namespace protobuf { 62 | 63 | // Returns the EnumDescriptor for enum type E, which must be a 64 | // proto-declared enum type. Code generated by the protocol compiler 65 | // will include specializations of this template for each enum type declared. 
66 | template 67 | const EnumDescriptor* GetEnumDescriptor(); 68 | 69 | namespace internal { 70 | 71 | // Helper for EnumType_Parse functions: try to parse the string 'name' as 72 | // an enum name of the given type, returning true and filling in value on 73 | // success, or returning false and leaving value unchanged on failure. 74 | PROTOBUF_EXPORT bool ParseNamedEnum(const EnumDescriptor* descriptor, 75 | ConstStringParam name, int* value); 76 | 77 | template 78 | bool ParseNamedEnum(const EnumDescriptor* descriptor, ConstStringParam name, 79 | EnumType* value) { 80 | int tmp; 81 | if (!ParseNamedEnum(descriptor, name, &tmp)) return false; 82 | *value = static_cast(tmp); 83 | return true; 84 | } 85 | 86 | // Just a wrapper around printing the name of a value. The main point of this 87 | // function is not to be inlined, so that you can do this without including 88 | // descriptor.h. 89 | PROTOBUF_EXPORT const std::string& NameOfEnum(const EnumDescriptor* descriptor, 90 | int value); 91 | 92 | } // namespace internal 93 | } // namespace protobuf 94 | } // namespace google 95 | 96 | #include 97 | 98 | #endif // GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__ 99 | -------------------------------------------------------------------------------- /src/unigram_model_trainer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef UNIGRAM_MODEL_TRAINER_H_ 16 | #define UNIGRAM_MODEL_TRAINER_H_ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include "sentencepiece_model.pb.h" 24 | #include "third_party/absl/strings/string_view.h" 25 | #include "trainer_interface.h" 26 | #include "unigram_model.h" 27 | #include "util.h" 28 | 29 | namespace sentencepiece { 30 | namespace unigram { 31 | 32 | using string_util::UnicodeText; 33 | 34 | class TrainerModel : public Model { 35 | public: 36 | using SentencePieces = std::vector>; 37 | 38 | TrainerModel() {} 39 | TrainerModel(const ModelProto &model_proto) = delete; 40 | TrainerModel(const TrainerSpec &trainer_spec, 41 | const NormalizerSpec &normalizaiton_spec); 42 | ~TrainerModel() override; 43 | 44 | // Returns the sentencepieces. 45 | // The meta symbols, e.g., are NOT included. 46 | const SentencePieces &GetSentencePieces() const; 47 | 48 | // Sets sentencepieces. The sentencepieces are moved. 49 | // The meta symbols, e.g., are NOT included. 50 | void SetSentencePieces(SentencePieces &&sentencepieces); 51 | 52 | EncodeResult Encode(absl::string_view normalized) const override { 53 | return {}; 54 | } 55 | 56 | private: 57 | SentencePieces sentencepieces_; 58 | TrainerSpec trainer_spec_; 59 | NormalizerSpec normalizer_spec_; 60 | ModelProto model_proto_data_; 61 | }; 62 | 63 | class Trainer : public TrainerInterface { 64 | public: 65 | Trainer(const TrainerSpec &trainer_spec, 66 | const NormalizerSpec &normalizer_spec, 67 | const NormalizerSpec &denormalizer_spec) 68 | : TrainerInterface::TrainerInterface(trainer_spec, normalizer_spec, 69 | denormalizer_spec) {} 70 | 71 | util::Status Train() override; 72 | 73 | private: 74 | FRIEND_TEST(TrainerTest, IsValidSentencePieceTest); 75 | 76 | // Makes seed pieces from the training corpus. 
77 | // The size of seed pieces is determined by seed_sentencepiece_size. 78 | // node_int_type should be of integer type (int32 or int64), 79 | // determined by train_extremely_large_corpus. 80 | template 81 | TrainerModel::SentencePieces MakeSeedSentencePieces() const; 82 | 83 | // Executes the E step of EM and returns expected count. 84 | // The index of return array is the vocab id. 85 | // |objective| is a negative likelihood of the current model. 86 | // |num_token| is the number of total tokens to tokenize 87 | // training corpus. 88 | std::vector RunEStep(const TrainerModel &model, float *objective, 89 | int64 *num_tokens) const; 90 | 91 | // Executes the M step of EM with the expected frequency and 92 | // returns new pieces. 93 | TrainerModel::SentencePieces RunMStep( 94 | const TrainerModel &model, const std::vector &expected) const; 95 | 96 | // Heuristically prunes the current pieces. 97 | // This is called after each EM sub-iteration. 98 | TrainerModel::SentencePieces PruneSentencePieces( 99 | const TrainerModel &model) const; 100 | 101 | // Makes the final sentence pieces by incorporating the required characters 102 | // and control/user defined symbols. 103 | TrainerModel::SentencePieces FinalizeSentencePieces( 104 | const TrainerModel &model) const; 105 | 106 | // When the size of SentencePieces becomes less than desired_vocab_size_, 107 | // break the main training loop. desired_vocab_size_ = 1.1 * vocab_size_ 108 | // for now. 109 | int desired_vocab_size_; 110 | }; 111 | } // namespace unigram 112 | } // namespace sentencepiece 113 | #endif // UNIGRAM_MODEL_TRAINER_H_ 114 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/google/protobuf/stubs/hash.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 
3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | 31 | // Author: kenton@google.com (Kenton Varda) 32 | 33 | #ifndef GOOGLE_PROTOBUF_STUBS_HASH_H__ 34 | #define GOOGLE_PROTOBUF_STUBS_HASH_H__ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | # define GOOGLE_PROTOBUF_HASH_NAMESPACE_DECLARATION_START \ 42 | namespace google { \ 43 | namespace protobuf { 44 | # define GOOGLE_PROTOBUF_HASH_NAMESPACE_DECLARATION_END }} 45 | 46 | namespace google { 47 | namespace protobuf { 48 | 49 | template 50 | struct hash : public std::hash {}; 51 | 52 | template 53 | struct hash { 54 | inline size_t operator()(const Key* key) const { 55 | return reinterpret_cast(key); 56 | } 57 | }; 58 | 59 | // Unlike the old SGI version, the TR1 "hash" does not special-case char*. So, 60 | // we go ahead and provide our own implementation. 61 | template <> 62 | struct hash { 63 | inline size_t operator()(const char* str) const { 64 | size_t result = 0; 65 | for (; *str != '\0'; str++) { 66 | result = 5 * result + static_cast(*str); 67 | } 68 | return result; 69 | } 70 | }; 71 | 72 | template<> 73 | struct hash { 74 | size_t operator()(bool x) const { 75 | return static_cast(x); 76 | } 77 | }; 78 | 79 | template <> 80 | struct hash { 81 | inline size_t operator()(const std::string& key) const { 82 | return hash()(key.c_str()); 83 | } 84 | 85 | static const size_t bucket_size = 4; 86 | static const size_t min_buckets = 8; 87 | inline bool operator()(const std::string& a, const std::string& b) const { 88 | return a < b; 89 | } 90 | }; 91 | 92 | template 93 | struct hash > { 94 | inline size_t operator()(const std::pair& key) const { 95 | size_t first_hash = hash()(key.first); 96 | size_t second_hash = hash()(key.second); 97 | 98 | // FIXME(kenton): What is the best way to compute this hash? I have 99 | // no idea! This seems a bit better than an XOR. 
100 | return first_hash * ((1 << 16) - 1) + second_hash; 101 | } 102 | 103 | static const size_t bucket_size = 4; 104 | static const size_t min_buckets = 8; 105 | inline bool operator()(const std::pair& a, 106 | const std::pair& b) const { 107 | return a < b; 108 | } 109 | }; 110 | 111 | } // namespace protobuf 112 | } // namespace google 113 | 114 | #endif // GOOGLE_PROTOBUF_STUBS_HASH_H__ 115 | -------------------------------------------------------------------------------- /src/spm_decode_main.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include "common.h" 20 | #include "filesystem.h" 21 | #include "init.h" 22 | #include "sentencepiece.pb.h" 23 | #include "sentencepiece_processor.h" 24 | #include "third_party/absl/flags/flag.h" 25 | #include "third_party/absl/strings/str_split.h" 26 | #include "util.h" 27 | 28 | ABSL_FLAG(std::string, model, "", "model file name"); 29 | ABSL_FLAG(std::string, input, "", "input filename"); 30 | ABSL_FLAG(std::string, output, "", "output filename"); 31 | ABSL_FLAG(std::string, input_format, "piece", "choose from piece or id"); 32 | ABSL_FLAG(std::string, output_format, "string", "choose from string or proto"); 33 | ABSL_FLAG(std::string, extra_options, "", 34 | "':' separated encoder extra options, e.g., \"reverse:bos:eos\""); 35 | 36 | int main(int argc, char *argv[]) { 37 | sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); 38 | std::vector rest_args; 39 | 40 | if (absl::GetFlag(FLAGS_input).empty()) { 41 | for (int i = 1; i < argc; ++i) { 42 | rest_args.push_back(std::string(argv[i])); 43 | } 44 | } else { 45 | rest_args.push_back(absl::GetFlag(FLAGS_input)); 46 | } 47 | 48 | if (rest_args.empty()) 49 | rest_args.push_back(""); // empty means that reading from stdin. 
50 | 51 | CHECK(!absl::GetFlag(FLAGS_model).empty()); 52 | 53 | sentencepiece::SentencePieceProcessor sp; 54 | CHECK_OK(sp.Load(absl::GetFlag(FLAGS_model))); 55 | CHECK_OK(sp.SetDecodeExtraOptions(absl::GetFlag(FLAGS_extra_options))); 56 | 57 | auto output = 58 | sentencepiece::filesystem::NewWritableFile(absl::GetFlag(FLAGS_output)); 59 | CHECK_OK(output->status()); 60 | 61 | std::string detok, line; 62 | sentencepiece::SentencePieceText spt; 63 | std::function &pieces)> process; 64 | 65 | auto ToIds = [&](const std::vector &pieces) { 66 | std::vector ids; 67 | ids.reserve(pieces.size()); 68 | for (const auto &s : pieces) { 69 | ids.push_back(atoi(s.c_str())); 70 | } 71 | return ids; 72 | }; 73 | 74 | if (absl::GetFlag(FLAGS_input_format) == "piece") { 75 | if (absl::GetFlag(FLAGS_output_format) == "string") { 76 | process = [&](const std::vector &pieces) { 77 | CHECK_OK(sp.Decode(pieces, &detok)); 78 | output->WriteLine(detok); 79 | }; 80 | } else if (absl::GetFlag(FLAGS_output_format) == "proto") { 81 | process = [&](const std::vector &pieces) { 82 | CHECK_OK(sp.Decode(pieces, &spt)); 83 | }; 84 | } else { 85 | LOG(FATAL) << "Unknown output format: " 86 | << absl::GetFlag(FLAGS_output_format); 87 | } 88 | } else if (absl::GetFlag(FLAGS_input_format) == "id") { 89 | if (absl::GetFlag(FLAGS_output_format) == "string") { 90 | process = [&](const std::vector &pieces) { 91 | CHECK_OK(sp.Decode(ToIds(pieces), &detok)); 92 | output->WriteLine(detok); 93 | }; 94 | } else if (absl::GetFlag(FLAGS_output_format) == "proto") { 95 | process = [&](const std::vector &pieces) { 96 | CHECK_OK(sp.Decode(ToIds(pieces), &spt)); 97 | }; 98 | } else { 99 | LOG(FATAL) << "Unknown output format: " 100 | << absl::GetFlag(FLAGS_output_format); 101 | } 102 | } else { 103 | LOG(FATAL) << "Unknown input format: " << absl::GetFlag(FLAGS_input_format); 104 | } 105 | 106 | for (const auto &filename : rest_args) { 107 | auto input = sentencepiece::filesystem::NewReadableFile(filename); 108 | 
CHECK_OK(input->status()); 109 | while (input->ReadLine(&line)) { 110 | const auto pieces = absl::StrSplit(line, " "); 111 | process(pieces); 112 | } 113 | } 114 | 115 | return 0; 116 | } 117 | -------------------------------------------------------------------------------- /doc/options.md: -------------------------------------------------------------------------------- 1 | # Training options 2 | 3 | The training options for the `spm_train` can be listed using `spm_train --help`. Since the standard `pip install` of sentencepiece does not necessarily install `spm_train`, the options are also listed here. 4 | 5 | ``` 6 | --help (show help) type: bool default: false 7 | --version (show version) type: bool default: false 8 | --minloglevel (Messages logged at a lower level than this don't actually get logged anywhere) type: int default: 0 9 | --input (comma separated list of input sentences) type: std::string default: "" 10 | --input_format (Input format. Supported format is `text` or `tsv`.) type: std::string default: "" 11 | --model_prefix (output model prefix) type: std::string default: "" 12 | --model_type (model algorithm: unigram, bpe, word or char) type: std::string default: "unigram" 13 | --vocab_size (vocabulary size) type: int32 default: 8000 14 | --accept_language (comma-separated list of languages this model can accept) type: std::string default: "" 15 | --self_test_sample_size (the size of self test samples) type: int32 default: 0 16 | --character_coverage (character coverage to determine the minimum symbols) type: double default: 0.9995 17 | --input_sentence_size (maximum size of sentences the trainer loads) type: int32 default: 0 18 | --shuffle_input_sentence (Randomly sample input sentences in advance. 
Valid when --input_sentence_size > 0) type: bool default: true 19 | --seed_sentencepiece_size (the size of seed sentencepieces) type: int32 default: 1000000 20 | --shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss) type: double default: 0.75 21 | --num_threads (number of threads for training) type: int32 default: 16 22 | --num_sub_iterations (number of EM sub-iterations) type: int32 default: 2 23 | --max_sentencepiece_length (maximum length of sentence piece) type: int32 default: 16 24 | --max_sentence_length (maximum length of sentence in byte) type: int32 default: 4192 25 | --split_by_unicode_script (use Unicode script to split sentence pieces) type: bool default: true 26 | --split_by_number (split tokens by numbers (0-9)) type: bool default: true 27 | --split_by_whitespace (use a white space to split sentence pieces) type: bool default: true 28 | --split_digits (split all digits (0-9) into separate pieces) type: bool default: false 29 | --treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.) type: bool default: false 30 | --control_symbols (comma separated list of control symbols) type: std::string default: "" 31 | --user_defined_symbols (comma separated list of user defined symbols) type: std::string default: "" 32 | --required_chars (UTF8 characters in this flag are always used in the character set regardless of --character_coverage) type: std::string default: "" 33 | --byte_fallback (decompose unknown pieces into UTF-8 byte pieces) type: bool default: false 34 | --vocabulary_output_piece_score (Define score in vocab file) type: bool default: true 35 | --normalization_rule_name (Normalization rule name. Choose from nfkc or identity) type: std::string default: "nmt_nfkc" 36 | --normalization_rule_tsv (Normalization rule TSV file. ) type: std::string default: "" 37 | --denormalization_rule_tsv (Denormalization rule TSV file.) 
type: std::string default: "" 38 | --add_dummy_prefix (Add dummy whitespace at the beginning of text) type: bool default: true 39 | --remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace) type: bool default: true 40 | --hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true 41 | --use_all_vocab (If set to true, use all tokens as vocab. Valid for word/char models.) type: bool default: false 42 | --unk_id (Override UNK () id.) type: int32 default: 0 43 | --bos_id (Override BOS () id. Set -1 to disable BOS.) type: int32 default: 1 44 | --eos_id (Override EOS () id. Set -1 to disable EOS.) type: int32 default: 2 45 | --pad_id (Override PAD () id. Set -1 to disable PAD.) type: int32 default: -1 46 | --unk_piece (Override UNK () piece.) type: std::string default: "" 47 | --bos_piece (Override BOS () piece.) type: std::string default: "" 48 | --eos_piece (Override EOS () piece.) type: std::string default: "" 49 | --pad_piece (Override PAD () piece.) type: std::string default: "" 50 | --unk_surface (Dummy surface string for . In decoding is decoded to `unk_surface`.) type: std::string default: " ⁇ " 51 | --train_extremely_large_corpus (Increase bit depth for unigram tokenization.) type: bool default: false 52 | ``` 53 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/google/protobuf/stubs/status.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 
3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | #ifndef GOOGLE_PROTOBUF_STUBS_STATUS_H_ 31 | #define GOOGLE_PROTOBUF_STUBS_STATUS_H_ 32 | 33 | #include 34 | #include 35 | 36 | #include 37 | #include 38 | 39 | #include 40 | 41 | namespace google { 42 | namespace protobuf { 43 | namespace util { 44 | namespace error { 45 | // These values must match error codes defined in google/rpc/code.proto. 46 | enum Code { 47 | OK = 0, 48 | CANCELLED = 1, 49 | UNKNOWN = 2, 50 | INVALID_ARGUMENT = 3, 51 | DEADLINE_EXCEEDED = 4, 52 | NOT_FOUND = 5, 53 | ALREADY_EXISTS = 6, 54 | PERMISSION_DENIED = 7, 55 | UNAUTHENTICATED = 16, 56 | RESOURCE_EXHAUSTED = 8, 57 | FAILED_PRECONDITION = 9, 58 | ABORTED = 10, 59 | OUT_OF_RANGE = 11, 60 | UNIMPLEMENTED = 12, 61 | INTERNAL = 13, 62 | UNAVAILABLE = 14, 63 | DATA_LOSS = 15, 64 | }; 65 | } // namespace error 66 | 67 | class PROTOBUF_EXPORT Status { 68 | public: 69 | // Creates a "successful" status. 70 | Status(); 71 | 72 | // Create a status in the canonical error space with the specified 73 | // code, and error message. If "code == 0", error_message is 74 | // ignored and a Status object identical to Status::OK is 75 | // constructed. 
76 | Status(error::Code error_code, StringPiece error_message); 77 | Status(const Status&); 78 | Status& operator=(const Status& x); 79 | ~Status() {} 80 | 81 | // Some pre-defined Status objects 82 | static const Status OK; // Identical to 0-arg constructor 83 | static const Status CANCELLED; 84 | static const Status UNKNOWN; 85 | 86 | // Accessor 87 | bool ok() const { 88 | return error_code_ == error::OK; 89 | } 90 | int error_code() const { 91 | return error_code_; 92 | } 93 | error::Code code() const { 94 | return error_code_; 95 | } 96 | StringPiece error_message() const { 97 | return error_message_; 98 | } 99 | StringPiece message() const { 100 | return error_message_; 101 | } 102 | 103 | bool operator==(const Status& x) const; 104 | bool operator!=(const Status& x) const { 105 | return !operator==(x); 106 | } 107 | 108 | // Return a combination of the error code name and message. 109 | std::string ToString() const; 110 | 111 | private: 112 | error::Code error_code_; 113 | std::string error_message_; 114 | }; 115 | 116 | // Prints a human-readable representation of 'x' to 'os'. 117 | PROTOBUF_EXPORT std::ostream& operator<<(std::ostream& os, const Status& x); 118 | 119 | } // namespace util 120 | } // namespace protobuf 121 | } // namespace google 122 | 123 | #include 124 | 125 | #endif // GOOGLE_PROTOBUF_STUBS_STATUS_H_ 126 | -------------------------------------------------------------------------------- /src/spm_normalize_main.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "builder.h" 16 | #include "common.h" 17 | #include "filesystem.h" 18 | #include "init.h" 19 | #include "normalizer.h" 20 | #include "sentencepiece.pb.h" 21 | #include "sentencepiece_model.pb.h" 22 | #include "sentencepiece_processor.h" 23 | #include "sentencepiece_trainer.h" 24 | #include "third_party/absl/flags/flag.h" 25 | 26 | ABSL_FLAG(std::string, model, "", "Model file name"); 27 | ABSL_FLAG(bool, use_internal_normalization, false, 28 | "Use NormalizerSpec \"as-is\" to run the normalizer " 29 | "for SentencePiece segmentation"); 30 | ABSL_FLAG(std::string, normalization_rule_name, "", 31 | "Normalization rule name. " 32 | "Choose from nfkc or identity"); 33 | ABSL_FLAG(std::string, normalization_rule_tsv, "", 34 | "Normalization rule TSV file. 
"); 35 | ABSL_FLAG(bool, remove_extra_whitespaces, true, "Remove extra whitespaces"); 36 | ABSL_FLAG(bool, decompile, false, 37 | "Decompile compiled charamap and output it as TSV."); 38 | ABSL_FLAG(std::string, input, "", "Input filename"); 39 | ABSL_FLAG(std::string, output, "", "Output filename"); 40 | 41 | using sentencepiece::ModelProto; 42 | using sentencepiece::NormalizerSpec; 43 | using sentencepiece::SentencePieceProcessor; 44 | using sentencepiece::SentencePieceTrainer; 45 | using sentencepiece::normalizer::Builder; 46 | using sentencepiece::normalizer::Normalizer; 47 | 48 | int main(int argc, char *argv[]) { 49 | sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); 50 | std::vector rest_args; 51 | 52 | if (absl::GetFlag(FLAGS_input).empty()) { 53 | for (int i = 1; i < argc; ++i) { 54 | rest_args.push_back(std::string(argv[i])); 55 | } 56 | } else { 57 | rest_args.push_back(absl::GetFlag(FLAGS_input)); 58 | } 59 | 60 | NormalizerSpec spec; 61 | 62 | if (!absl::GetFlag(FLAGS_model).empty()) { 63 | ModelProto model_proto; 64 | SentencePieceProcessor sp; 65 | CHECK_OK(sp.Load(absl::GetFlag(FLAGS_model))); 66 | spec = sp.model_proto().normalizer_spec(); 67 | } else if (!absl::GetFlag(FLAGS_normalization_rule_tsv).empty()) { 68 | spec.set_normalization_rule_tsv( 69 | absl::GetFlag(FLAGS_normalization_rule_tsv)); 70 | CHECK_OK(SentencePieceTrainer::PopulateNormalizerSpec(&spec)); 71 | } else if (!absl::GetFlag(FLAGS_normalization_rule_name).empty()) { 72 | spec.set_name(absl::GetFlag(FLAGS_normalization_rule_name)); 73 | CHECK_OK(SentencePieceTrainer::PopulateNormalizerSpec(&spec)); 74 | } else { 75 | LOG(FATAL) << "Sets --model, normalization_rule_tsv, or " 76 | "normalization_rule_name flag."; 77 | } 78 | 79 | // Uses the normalizer spec encoded in the model_pb. 80 | if (!absl::GetFlag(FLAGS_use_internal_normalization)) { 81 | spec.set_add_dummy_prefix(false); // do not add dummy prefix. 
82 | spec.set_escape_whitespaces(false); // do not output meta symbol. 83 | spec.set_remove_extra_whitespaces( 84 | absl::GetFlag(FLAGS_remove_extra_whitespaces)); 85 | } 86 | 87 | if (absl::GetFlag(FLAGS_decompile)) { 88 | Builder::CharsMap chars_map; 89 | CHECK_OK( 90 | Builder::DecompileCharsMap(spec.precompiled_charsmap(), &chars_map)); 91 | CHECK_OK(Builder::SaveCharsMap(absl::GetFlag(FLAGS_output), chars_map)); 92 | } else { 93 | const Normalizer normalizer(spec); 94 | auto output = 95 | sentencepiece::filesystem::NewWritableFile(absl::GetFlag(FLAGS_output)); 96 | CHECK_OK(output->status()); 97 | 98 | if (rest_args.empty()) { 99 | rest_args.push_back(""); // empty means that read from stdin. 100 | } 101 | 102 | std::string line; 103 | for (const auto &filename : rest_args) { 104 | auto input = sentencepiece::filesystem::NewReadableFile(filename); 105 | CHECK_OK(input->status()); 106 | while (input->ReadLine(&line)) { 107 | output->WriteLine(normalizer.Normalize(line)); 108 | } 109 | } 110 | } 111 | 112 | return 0; 113 | } 114 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/google/protobuf/port_undef.inc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 
15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // #undefs all macros defined in port_def.inc. See comments in port_def.inc 32 | // for more info. 
33 | 34 | #ifndef PROTOBUF_NAMESPACE 35 | #error "port_undef.inc must be included after port_def.inc" 36 | #endif 37 | #undef PROTOBUF_NAMESPACE 38 | #undef PROTOBUF_NAMESPACE_ID 39 | #undef PROTOBUF_ALWAYS_INLINE 40 | #undef PROTOBUF_COLD 41 | #undef PROTOBUF_NOINLINE 42 | #undef PROTOBUF_SECTION_VARIABLE 43 | #undef PROTOBUF_DEPRECATED 44 | #undef PROTOBUF_DEPRECATED_ENUM 45 | #undef PROTOBUF_DEPRECATED_MSG 46 | #undef PROTOBUF_FUNC_ALIGN 47 | #undef PROTOBUF_RETURNS_NONNULL 48 | #undef PROTOBUF_ATTRIBUTE_REINITIALIZES 49 | #undef PROTOBUF_RTTI 50 | #undef PROTOBUF_VERSION 51 | #undef PROTOBUF_VERSION_SUFFIX 52 | #undef PROTOBUF_FIELD_OFFSET 53 | #undef PROTOBUF_MIN_HEADER_VERSION_FOR_PROTOC 54 | #undef PROTOBUF_MIN_PROTOC_VERSION 55 | #undef PROTOBUF_PREDICT_TRUE 56 | #undef PROTOBUF_PREDICT_FALSE 57 | #undef PROTOBUF_LONGLONG 58 | #undef PROTOBUF_ULONGLONG 59 | #undef PROTOBUF_LL_FORMAT 60 | #undef PROTOBUF_GUARDED_BY 61 | #undef PROTOBUF_FALLTHROUGH_INTENDED 62 | #undef PROTOBUF_EXPORT 63 | #undef PROTOC_EXPORT 64 | #undef PROTOBUF_MUST_USE_RESULT 65 | #undef PROTOBUF_NAMESPACE_OPEN 66 | #undef PROTOBUF_NAMESPACE_CLOSE 67 | #undef PROTOBUF_UNUSED 68 | #undef PROTOBUF_ASSUME 69 | #undef PROTOBUF_EXPORT_TEMPLATE_DECLARE 70 | #undef PROTOBUF_EXPORT_TEMPLATE_DEFINE 71 | #undef PROTOBUF_ALIGNAS 72 | #undef PROTOBUF_FINAL 73 | #undef PROTOBUF_THREAD_LOCAL 74 | #undef PROTOBUF_MESSAGE_OWNED_ARENA_EXPERIMENT 75 | #undef PROTOBUF_DISABLE_MSVC_UNION_WARNING 76 | #undef PROTOBUF_ENABLE_MSVC_UNION_WARNING 77 | #undef PROTOBUF_CONSTINIT 78 | #undef PROTOBUF_MAYBE_CONSTEXPR 79 | #undef PROTOBUF_ATTRIBUTE_NO_DESTROY 80 | 81 | // Restore macro that may have been #undef'd in port_def.inc. 
82 | #ifdef _MSC_VER 83 | #pragma pop_macro("CREATE_NEW") 84 | #pragma pop_macro("DOUBLE_CLICK") 85 | #pragma pop_macro("ERROR") 86 | #pragma pop_macro("ERROR_BUSY") 87 | #pragma pop_macro("ERROR_NOT_FOUND") 88 | #pragma pop_macro("GetMessage") 89 | #pragma pop_macro("IGNORE") 90 | #pragma pop_macro("IN") 91 | #pragma pop_macro("INPUT_KEYBOARD") 92 | #pragma pop_macro("OUT") 93 | #pragma pop_macro("OPTIONAL") 94 | #pragma pop_macro("min") 95 | #pragma pop_macro("max") 96 | #pragma pop_macro("NEAR") 97 | #pragma pop_macro("NO_DATA") 98 | #pragma pop_macro("NO_ERROR") 99 | #pragma pop_macro("REASON_UNKNOWN") 100 | #pragma pop_macro("SERVICE_DISABLED") 101 | #pragma pop_macro("SEVERITY_ERROR") 102 | #pragma pop_macro("STRICT") 103 | #pragma pop_macro("timezone") 104 | #endif 105 | 106 | #if defined(__clang__) || defined(__GNUC__) || defined(_MSC_VER) 107 | #pragma pop_macro("DEBUG") 108 | #pragma pop_macro("TRUE") 109 | #pragma pop_macro("FALSE") 110 | #endif // defined(__clang__) || defined(__GNUC__) || defined(_MSC_VER) 111 | 112 | #if defined(__clang__) 113 | #pragma clang diagnostic pop 114 | #elif defined(__GNUC__) 115 | #pragma GCC diagnostic pop 116 | #endif 117 | -------------------------------------------------------------------------------- /third_party/esaxx/esa.hxx: -------------------------------------------------------------------------------- 1 | /* 2 | * esa.hxx 3 | * Copyright (c) 2010 Daisuke Okanohara All Rights Reserved. 
4 | * 5 | * Permission is hereby granted, free of charge, to any person 6 | * obtaining a copy of this software and associated documentation 7 | * files (the "Software"), to deal in the Software without 8 | * restriction, including without limitation the rights to use, 9 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following 12 | * conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be 15 | * included in all copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | * OTHER DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | #ifndef _ESA_HXX 28 | #define _ESA_HXX 29 | 30 | #include 31 | #include 32 | #include 33 | #include "sais.hxx" 34 | 35 | namespace esaxx_private { 36 | template 37 | index_type suffixtree(string_type T, sarray_type SA, sarray_type L, sarray_type R, sarray_type D, index_type n){ 38 | if (n == 0){ 39 | return 0; 40 | } 41 | sarray_type Psi = L; 42 | Psi[SA[0]] = SA[n-1]; 43 | for (index_type i = 1; i < n; ++i){ 44 | Psi[SA[i]] = SA[i-1]; 45 | } 46 | 47 | // Compare at most 2n log n characters.
Practically fastest 48 | // "Permuted Longest-Common-Prefix Array", Juha Karkkainen, CPM 09 49 | sarray_type PLCP = R; 50 | index_type h = 0; 51 | for (index_type i = 0; i < n; ++i){ 52 | index_type j = Psi[i]; 53 | while (i+h < n && j+h < n && 54 | T[i+h] == T[j+h]){ 55 | ++h; 56 | } 57 | PLCP[i] = h; 58 | if (h > 0) --h; 59 | } 60 | 61 | sarray_type H = L; 62 | for (index_type i = 0; i < n; ++i){ 63 | H[i] = PLCP[SA[i]]; 64 | } 65 | H[0] = -1; 66 | 67 | std::vector > S; 68 | S.push_back(std::make_pair((index_type)-1, (index_type)-1)); 69 | size_t nodeNum = 0; 70 | for (index_type i = 0; ; ++i){ 71 | std::pair cur (i, (i == n) ? -1 : H[i]); 72 | std::pair cand(S.back()); 73 | while (cand.second > cur.second){ 74 | if (i - cand.first > 1){ 75 | L[nodeNum] = cand.first; 76 | R[nodeNum] = i; 77 | D[nodeNum] = cand.second; 78 | ++nodeNum; 79 | } 80 | cur.first = cand.first; 81 | S.pop_back(); 82 | cand = S.back(); 83 | } 84 | if (cand.second < cur.second){ 85 | S.push_back(cur); 86 | } 87 | if (i == n) break; 88 | S.push_back(std::make_pair(i, n - SA[i] + 1)); 89 | } 90 | return nodeNum; 91 | } 92 | } 93 | 94 | /** 95 | * @brief Build an enhanced suffix array of a given string in linear time 96 | * For an input text T, esaxx() builds an enhanced suffix array in linear time. 97 | * i-th internal node is represented as a triple (L[i], R[i], D[i]); 98 | * L[i] and R[i] are the left/right boundaries of the suffix array as SA[L[i]....R[i]-1] 99 | * D[i] is the depth of the internal node 100 | * The number of internal nodes is at most N-1; the actual number is returned via nodeNum 101 | * @param T[0...n-1] The input string.
(random access iterator) 102 | * @param SA[0...n-1] The output suffix array (random access iterator) 103 | * @param L[0...n-1] The output left boundary of internal node (random access iterator) 104 | * @param R[0...n-1] The output right boundary of internal node (random access iterator) 105 | * @param D[0...n-1] The output depth of internal node (random access iterator) 106 | * @param n The length of the input string 107 | * @param k The alphabet size 108 | * @param nodeNum The output number of internal nodes 109 | * @return 0 if succeeded, -1 or -2 otherwise 110 | */ 111 | 112 | template 113 | int esaxx(string_type T, sarray_type SA, sarray_type L, sarray_type R, sarray_type D, 114 | index_type n, index_type k, index_type& nodeNum) { 115 | if ((n < 0) || (k <= 0)) return -1; 116 | int err = saisxx(T, SA, n, k); 117 | if (err != 0){ 118 | return err; 119 | } 120 | nodeNum = esaxx_private::suffixtree(T, SA, L, R, D, n); 121 | return 0; 122 | } 123 | 124 | 125 | #endif // _ESA_HXX 126 | -------------------------------------------------------------------------------- /third_party/protobuf-lite/status.cc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc.
nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | #include 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | namespace google { 38 | namespace protobuf { 39 | namespace util { 40 | namespace error { 41 | inline std::string CodeEnumToString(error::Code code) { 42 | switch (code) { 43 | case OK: 44 | return "OK"; 45 | case CANCELLED: 46 | return "CANCELLED"; 47 | case UNKNOWN: 48 | return "UNKNOWN"; 49 | case INVALID_ARGUMENT: 50 | return "INVALID_ARGUMENT"; 51 | case DEADLINE_EXCEEDED: 52 | return "DEADLINE_EXCEEDED"; 53 | case NOT_FOUND: 54 | return "NOT_FOUND"; 55 | case ALREADY_EXISTS: 56 | return "ALREADY_EXISTS"; 57 | case PERMISSION_DENIED: 58 | return "PERMISSION_DENIED"; 59 | case UNAUTHENTICATED: 60 | return "UNAUTHENTICATED"; 61 | case RESOURCE_EXHAUSTED: 62 | return "RESOURCE_EXHAUSTED"; 63 | case FAILED_PRECONDITION: 64 | return "FAILED_PRECONDITION"; 65 | case ABORTED: 66 | return "ABORTED"; 67 | case OUT_OF_RANGE: 68 | return "OUT_OF_RANGE"; 69 | case UNIMPLEMENTED: 70 | return "UNIMPLEMENTED"; 71 | case INTERNAL: 72 | return "INTERNAL"; 73 | case UNAVAILABLE: 74 | return "UNAVAILABLE"; 75 | case DATA_LOSS: 76 | return "DATA_LOSS"; 77 | } 78 | 79 | // No default clause, clang will abort if a code is missing from 80 | // above switch. 81 | return "UNKNOWN"; 82 | } 83 | } // namespace error. 
84 | 85 | const Status Status::OK = Status(); 86 | const Status Status::CANCELLED = Status(error::CANCELLED, ""); 87 | const Status Status::UNKNOWN = Status(error::UNKNOWN, ""); 88 | 89 | Status::Status() : error_code_(error::OK) { 90 | } 91 | 92 | Status::Status(error::Code error_code, StringPiece error_message) 93 | : error_code_(error_code) { 94 | if (error_code != error::OK) { 95 | error_message_ = error_message.ToString(); 96 | } 97 | } 98 | 99 | Status::Status(const Status& other) 100 | : error_code_(other.error_code_), error_message_(other.error_message_) { 101 | } 102 | 103 | Status& Status::operator=(const Status& other) { 104 | error_code_ = other.error_code_; 105 | error_message_ = other.error_message_; 106 | return *this; 107 | } 108 | 109 | bool Status::operator==(const Status& x) const { 110 | return error_code_ == x.error_code_ && 111 | error_message_ == x.error_message_; 112 | } 113 | 114 | std::string Status::ToString() const { 115 | if (error_code_ == error::OK) { 116 | return "OK"; 117 | } else { 118 | if (error_message_.empty()) { 119 | return error::CodeEnumToString(error_code_); 120 | } else { 121 | return error::CodeEnumToString(error_code_) + ":" + 122 | error_message_; 123 | } 124 | } 125 | } 126 | 127 | std::ostream& operator<<(std::ostream& os, const Status& x) { 128 | os << x.ToString(); 129 | return os; 130 | } 131 | 132 | } // namespace util 133 | } // namespace protobuf 134 | } // namespace google 135 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2018 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.! 16 | 17 | from setuptools import setup, Extension 18 | from setuptools.command.build_ext import build_ext as _build_ext 19 | from setuptools.command.build_py import build_py as _build_py 20 | import codecs 21 | import string 22 | import subprocess 23 | import sys 24 | import os 25 | 26 | sys.path.append(os.path.join('.', 'test')) 27 | 28 | 29 | def long_description(): 30 | with codecs.open('README.md', 'r', 'utf-8') as f: 31 | long_description = f.read() 32 | return long_description 33 | 34 | 35 | def version(): 36 | with codecs.open('VERSION.txt', 'r', 'utf-8') as f: 37 | version = f.read().rstrip() 38 | return version 39 | 40 | 41 | def run_pkg_config(section, pkg_config_path=None): 42 | try: 43 | cmd = 'pkg-config sentencepiece --{}'.format(section) 44 | if pkg_config_path: 45 | cmd = 'env PKG_CONFIG_PATH={} {}'.format(pkg_config_path, cmd) 46 | output = subprocess.check_output(cmd, shell=True) 47 | if sys.version_info >= (3, 0, 0): 48 | output = output.decode('utf-8') 49 | except subprocess.CalledProcessError: 50 | sys.stderr.write('Failed to find sentencepiece pkg-config\n') 51 | sys.exit(1) 52 | return output.strip().split() 53 | 54 | 55 | def is_sentencepiece_installed(): 56 | try: 57 | subprocess.check_call('pkg-config sentencepiece --libs', shell=True) 58 | return True 59 | except subprocess.CalledProcessError: 60 | return False 61 | 62 | 63 | class build_ext(_build_ext): 64 | """Override build_extension to run cmake.""" 65 | 66 | def build_extension(self, ext): 67 | pkg_config_path = None 68 | if not 
is_sentencepiece_installed(): 69 | subprocess.check_call(['./build_bundled.sh', version()]) 70 | pkg_config_path = './bundled/lib/pkgconfig:./bundled/lib64/pkgconfig' 71 | 72 | cflags = ['-std=c++11'] 73 | # Fix compile on some versions of Mac OSX 74 | # See: https://github.com/neulab/xnmt/issues/199 75 | if sys.platform == 'darwin': 76 | cflags.append('-mmacosx-version-min=10.9') 77 | cflags = cflags + run_pkg_config('cflags', pkg_config_path) 78 | libs = run_pkg_config('libs', pkg_config_path) 79 | print('## cflags={}'.format(' '.join(cflags))) 80 | print('## libs={}'.format(' '.join(libs))) 81 | ext.extra_compile_args = cflags 82 | ext.extra_link_args = libs 83 | _build_ext.build_extension(self, ext) 84 | 85 | 86 | if os.name == 'nt': 87 | cflags = ['/MT', '/I..\\build\\root\\include'] 88 | libs = [ 89 | '..\\build\\root\\lib\\sentencepiece.lib', 90 | '..\\build\\root\\lib\\sentencepiece_train.lib' 91 | ] 92 | SENTENCEPIECE_EXT = Extension( 93 | 'sentencepiece._sentencepiece', 94 | sources=['src/sentencepiece/sentencepiece_wrap.cxx'], 95 | extra_compile_args=cflags, 96 | extra_link_args=libs) 97 | cmdclass = {} 98 | else: 99 | SENTENCEPIECE_EXT = Extension( 100 | 'sentencepiece._sentencepiece', 101 | sources=['src/sentencepiece/sentencepiece_wrap.cxx']) 102 | cmdclass = {'build_ext': build_ext} 103 | 104 | setup( 105 | name='sentencepiece', 106 | author='Taku Kudo', 107 | author_email='taku@google.com', 108 | description='SentencePiece python wrapper', 109 | long_description=long_description(), 110 | long_description_content_type='text/markdown', 111 | version=version(), 112 | package_dir={'': 'src'}, 113 | url='https://github.com/google/sentencepiece', 114 | license='Apache', 115 | platforms='Unix', 116 | py_modules=[ 117 | 'sentencepiece/__init__', 'sentencepiece/sentencepiece_model_pb2', 118 | 'sentencepiece/sentencepiece_pb2' 119 | ], 120 | ext_modules=[SENTENCEPIECE_EXT], 121 | cmdclass=cmdclass, 122 | classifiers=[ 123 | 'Development Status :: 5 - 
Production/Stable', 'Environment :: Console', 124 | 'Intended Audience :: Developers', 125 | 'Intended Audience :: Science/Research', 126 | 'License :: OSI Approved :: Apache Software License', 127 | 'Operating System :: Unix', 'Programming Language :: Python', 128 | 'Topic :: Text Processing :: Linguistic', 129 | 'Topic :: Software Development :: Libraries :: Python Modules' 130 | ], 131 | test_suite='sentencepiece_test.suite') 132 | --------------------------------------------------------------------------------