├── inst ├── models │ ├── nl-fr-dekamer.model │ ├── nl.wiki.bpe.vs1000.model │ ├── nl-fr-dekamer-unigram.model │ └── nl.wiki.bpe.vs1000.d25.w2v.bin └── spc-help │ └── spm_train ├── src ├── third_party │ ├── CMakeLists.txt │ ├── absl │ │ ├── flags │ │ │ ├── parse.h │ │ │ └── flag.h │ │ ├── container │ │ │ ├── flat_hash_set.h │ │ │ └── flat_hash_map.h │ │ ├── strings │ │ │ ├── strip.h │ │ │ ├── numbers.h │ │ │ ├── str_format.h │ │ │ ├── match.h │ │ │ ├── ascii.h │ │ │ ├── str_cat.h │ │ │ ├── str_replace.h │ │ │ ├── str_join.h │ │ │ └── str_split.h │ │ └── memory │ │ │ └── memory.h │ ├── esaxx │ │ └── LICENSE │ ├── darts_clone │ │ └── LICENSE │ └── protobuf-lite │ │ ├── LICENSE │ │ ├── google │ │ └── protobuf │ │ │ ├── port.h │ │ │ ├── stubs │ │ │ ├── once.h │ │ │ ├── stl_util.h │ │ │ ├── time.h │ │ │ ├── stringprintf.h │ │ │ ├── hash.h │ │ │ └── status.h │ │ │ ├── generated_enum_util.h │ │ │ ├── has_bits.h │ │ │ └── generated_enum_reflection.h │ │ ├── statusor.cc │ │ ├── zero_copy_stream.cc │ │ ├── implicit_weak_message.cc │ │ └── generated_enum_util.cc ├── config.h ├── sentencepiece │ └── src │ │ ├── model_factory.h │ │ ├── test_main.cc │ │ ├── word_model.h │ │ ├── char_model.h │ │ ├── trainer_factory.h │ │ ├── word_model.cc │ │ ├── char_model_trainer.h │ │ ├── unicode_script.cc │ │ ├── freelist_test.cc │ │ ├── char_model.cc │ │ ├── init.h │ │ ├── word_model_trainer.h │ │ ├── unicode_script_test.cc │ │ ├── filesystem_test.cc │ │ ├── model_factory.cc │ │ ├── trainer_factory_test.cc │ │ ├── bpe_model.h │ │ ├── model_factory_test.cc │ │ ├── char_model_trainer.cc │ │ ├── filesystem.h │ │ ├── testharness.cc │ │ ├── spm_export_vocab_main.cc │ │ ├── pretokenizer_for_training.cc │ │ ├── trainer_factory.cc │ │ ├── pretokenizer_for_training.h │ │ ├── word_model_trainer.cc │ │ ├── freelist.h │ │ ├── word_model_trainer_test.cc │ │ ├── sentencepiece.proto │ │ ├── char_model_trainer_test.cc │ │ ├── word_model_test.cc │ │ ├── pretokenizer_for_training_test.cc │ │ ├── unigram_model_trainer_test.cc │ │ ├── unicode_script.h │ │ ├── filesystem.cc │ │ ├── char_model_test.cc │ │ ├── unigram_model_trainer.h │ │ ├── spm_decode_main.cc │ │ └── spm_normalize_main.cc ├── rcpp_wordpiece.cpp └── Makevars ├── .gitignore ├── R ├── pkg.R ├── utils.R ├── word2vec.R ├── RcppExports.R └── wordpiece.R ├── .Rbuildignore ├── sentencepiece.Rproj ├── NAMESPACE ├── man ├── txt_remove_.Rd ├── sentencepiece_load_model.Rd ├── read_word2vec.Rd ├── sentencepiece_decode.Rd ├── wordpiece_encode.Rd ├── BPEembedder.Rd ├── BPEembed.Rd ├── predict.BPEembed.Rd ├── sentencepiece.Rd ├── sentencepiece_download_model.Rd └── sentencepiece_encode.Rd ├── .github └── workflows │ ├── R-CMD-check.yml │ └── rhub.yaml └── NEWS.md /inst/models/nl-fr-dekamer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bnosac/sentencepiece/HEAD/inst/models/nl-fr-dekamer.model -------------------------------------------------------------------------------- /src/third_party/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(absl/strings darts_clone esaxx protobuf-lite) 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /inst/models/nl.wiki.bpe.vs1000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bnosac/sentencepiece/HEAD/inst/models/nl.wiki.bpe.vs1000.model 
-------------------------------------------------------------------------------- /inst/models/nl-fr-dekamer-unigram.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bnosac/sentencepiece/HEAD/inst/models/nl-fr-dekamer-unigram.model -------------------------------------------------------------------------------- /inst/models/nl.wiki.bpe.vs1000.d25.w2v.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bnosac/sentencepiece/HEAD/inst/models/nl.wiki.bpe.vs1000.d25.w2v.bin -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | inst/extdata 9 | inst/extdata/dekamer.txt 10 | dev 11 | -------------------------------------------------------------------------------- /R/pkg.R: -------------------------------------------------------------------------------- 1 | #' @importFrom Rcpp evalCpp 2 | #' @importFrom utils head capture.output packageVersion 3 | #' @importFrom stats predict 4 | #' @useDynLib sentencepiece 5 | NULL 6 | -------------------------------------------------------------------------------- /src/config.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H_ 2 | #define CONFIG_H_ 3 | 4 | #define VERSION "0.1.84" 5 | #define PACKAGE "sentencepiece" 6 | #define PACKAGE_STRING "sentencepiece" 7 | 8 | 9 | #endif // CONFIG_H_ -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | .github 4 | LICENSE$ 5 | dev 6 | inst/extdata 7 | inst/extdata/dekamer.txt 8 | inst/models/english.model 9 | inst/models/english.vocab 10 | inst/models/nl.wiki.bpe.vs200000.model 11 | inst/models/nl.wiki.bpe.vs200000.vocab 12 | -------------------------------------------------------------------------------- /sentencepiece.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageRoxygenize: rd,collate,namespace 19 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #' @title Remove prefixed underscore 5 | #' @description Remove prefixed underscore unicode character 'LOWER ONE EIGHTH BLOCK' (U+2581) 6 | #' @param x a character vector 7 | #' @param replacement character string how to replace the underscore. Defaults to the empty string. 
8 | #' @return \code{x} where the prefixed underscore is removed 9 | #' @export 10 | #' @examples 11 | #' x <- c("\u2581word", "hello", "_regularunderscore") 12 | #' x 13 | #' txt_remove_(x) 14 | txt_remove_ <- function(x, replacement = ""){ 15 | gsub(pattern = "^\u2581", replacement = replacement, x) 16 | } -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(predict,BPEembed) 4 | S3method(print,BPEembed) 5 | S3method(print,sentencepiece) 6 | export(BPEembed) 7 | export(BPEembedder) 8 | export(read_word2vec) 9 | export(sentencepiece) 10 | export(sentencepiece_decode) 11 | export(sentencepiece_download_model) 12 | export(sentencepiece_encode) 13 | export(sentencepiece_load_model) 14 | export(txt_remove_) 15 | export(wordpiece_encode) 16 | importFrom(Rcpp,evalCpp) 17 | importFrom(stats,predict) 18 | importFrom(utils,capture.output) 19 | importFrom(utils,head) 20 | importFrom(utils,packageVersion) 21 | useDynLib(sentencepiece) 22 | -------------------------------------------------------------------------------- /man/txt_remove_.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{txt_remove_} 4 | \alias{txt_remove_} 5 | \title{Remove prefixed underscore} 6 | \usage{ 7 | txt_remove_(x, replacement = "") 8 | } 9 | \arguments{ 10 | \item{x}{a character vector} 11 | 12 | \item{replacement}{character string how to replace the underscore. Defaults to the empty string.} 13 | } 14 | \value{ 15 | \code{x} where the prefixed underscore is removed 16 | } 17 | \description{ 18 | Remove prefixed underscore unicode character 'LOWER ONE EIGHTH BLOCK' (U+2581) 19 | } 20 | \examples{ 21 | x <- c("\u2581word", "hello", "_regularunderscore") 22 | x 23 | txt_remove_(x) 24 | } 25 | -------------------------------------------------------------------------------- /src/third_party/absl/flags/parse.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef ABSL_FLAGS_PARSE_H_ 16 | #define ABSL_FLAGS_PARSE_H_ 17 | 18 | #include 19 | 20 | namespace absl { 21 | 22 | std::vector ParseCommandLine(int argc, char *argv[]); 23 | } // namespace absl 24 | 25 | #endif // ABSL_FLAGS_PARSE_H_ 26 | -------------------------------------------------------------------------------- /src/third_party/absl/container/flat_hash_set.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef ABSL_CONTAINER_FLAT_HASH_SET_ 16 | #define ABSL_CONTAINER_FLAT_HASH_SET_ 17 | 18 | #include <unordered_set> 19 | 20 | namespace absl { 21 | 22 | template <typename Type, typename Hash = std::hash<Type>, 23 | typename Eq = std::equal_to<Type>, 24 | typename Allocator = std::allocator<Type>> 25 | using flat_hash_set = std::unordered_set<Type, Hash, Eq, Allocator>; 26 | 27 | } 28 | 29 | #endif // ABSL_CONTAINER_FLAT_HASH_SET_ 30 | -------------------------------------------------------------------------------- /src/sentencepiece/src/model_factory.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef MODEL_FACTORY_H_ 16 | #define MODEL_FACTORY_H_ 17 | 18 | #include <memory> 19 | 20 | #include "model_interface.h" 21 | #include "sentencepiece_model.pb.h" 22 | 23 | namespace sentencepiece { 24 | 25 | class ModelFactory { 26 | public: 27 | // Creates Model instance from |model_proto|. 28 | static std::unique_ptr<ModelInterface> Create(const ModelProto &model_proto); 29 | }; 30 | } // namespace sentencepiece 31 | #endif // MODEL_FACTORY_H_ 32 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/strip.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License.
15 | // 16 | #ifndef ABSL_STRINGS_STRIP_H_ 17 | #define ABSL_STRINGS_STRIP_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/match.h" 22 | 23 | namespace absl { 24 | 25 | inline bool ConsumePrefix(absl::string_view *str, absl::string_view expected) { 26 | if (!absl::StartsWith(*str, expected)) return false; 27 | str->remove_prefix(expected.size()); 28 | return true; 29 | } 30 | 31 | } // namespace absl 32 | #endif // ABSL_STRINGS_STRIP_H 33 | -------------------------------------------------------------------------------- /src/third_party/absl/container/flat_hash_map.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef ABSL_CONTAINER_FLAT_HASH_MAP_ 16 | #define ABSL_CONTAINER_FLAT_HASH_MAP_ 17 | 18 | #include 19 | 20 | namespace absl { 21 | 22 | template , 23 | typename Eq = std::equal_to, 24 | typename Allocator = std::allocator>> 25 | using flat_hash_map = std::unordered_map; 26 | 27 | } 28 | 29 | #endif // ABSL_CONTAINER_FLAT_HASH_MAP_ 30 | -------------------------------------------------------------------------------- /src/sentencepiece/src/test_main.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "init.h" 16 | #include "testharness.h" 17 | 18 | #ifdef OS_WIN 19 | ABSL_FLAG(std::string, test_srcdir, "..\\data", "Data directory."); 20 | #else 21 | ABSL_FLAG(std::string, test_srcdir, "../data", "Data directory."); 22 | #endif 23 | 24 | ABSL_FLAG(std::string, test_tmpdir, "test_tmp", "Temporary directory."); 25 | 26 | int main(int argc, char **argv) { 27 | sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); 28 | sentencepiece::test::RunAllTests(); 29 | return 0; 30 | } 31 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/numbers.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_NUMBERS_H_ 17 | #define ABSL_STRINGS_NUMBERS_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/string_view.h" 22 | 23 | namespace absl { 24 | 25 | // TODO(taku): Re-implement this, as it is slow. 26 | template 27 | inline bool SimpleAtoi(absl::string_view s, T *result) { 28 | std::stringstream ss; 29 | return (ss << s.data() && ss >> *result); 30 | } 31 | 32 | } // namespace absl 33 | #endif // ABSL_STRINGS_NUMBERS_H_ 34 | -------------------------------------------------------------------------------- /src/sentencepiece/src/word_model.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef WORD_MODEL_H_ 16 | #define WORD_MODEL_H_ 17 | 18 | #include "model_interface.h" 19 | #include "sentencepiece_model.pb.h" 20 | 21 | namespace sentencepiece { 22 | namespace word { 23 | 24 | // Tokenize text with whitespaces. 25 | class Model : public ModelInterface { 26 | public: 27 | explicit Model(const ModelProto &model_proto); 28 | ~Model() override; 29 | 30 | EncodeResult Encode(absl::string_view normalized) const override; 31 | }; 32 | } // namespace word 33 | } // namespace sentencepiece 34 | #endif // WORD_MODEL_H_ 35 | -------------------------------------------------------------------------------- /src/third_party/esaxx/LICENSE: -------------------------------------------------------------------------------- 1 | This is the esaxx copyright. 2 | 3 | Copyright (c) 2010 Daisuke Okanohara All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person 6 | obtaining a copy of this software and associated documentation 7 | files (the "Software"), to deal in the Software without 8 | restriction, including without limitation the rights to use, 9 | copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the 11 | Software is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | OTHER DEALINGS IN THE SOFTWARE. 25 | -------------------------------------------------------------------------------- /src/sentencepiece/src/char_model.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef CHAR_MODEL_H_ 16 | #define CHAR_MODEL_H_ 17 | 18 | #include "model_interface.h" 19 | #include "sentencepiece_model.pb.h" 20 | 21 | namespace sentencepiece { 22 | namespace character { 23 | 24 | // Tokenize text into character sequence 25 | class Model : public ModelInterface { 26 | public: 27 | explicit Model(const ModelProto &model_proto); 28 | ~Model() override; 29 | 30 | EncodeResult Encode(absl::string_view normalized) const override; 31 | }; 32 | } // namespace character 33 | } // namespace sentencepiece 34 | #endif // CHAR_MODEL_H_ 35 | -------------------------------------------------------------------------------- /src/sentencepiece/src/trainer_factory.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef TRAINER_FACTORY_H_ 16 | #define TRAINER_FACTORY_H_ 17 | 18 | #include 19 | 20 | #include "sentencepiece_model.pb.h" 21 | #include "trainer_interface.h" 22 | 23 | namespace sentencepiece { 24 | 25 | class TrainerFactory { 26 | public: 27 | // Creates Trainer instance from |trainer_spec| and |normalizer_spec|. 28 | static std::unique_ptr Create( 29 | const TrainerSpec &trainer_spec, const NormalizerSpec &normalizer_spec, 30 | const NormalizerSpec &denormalizer_spec); 31 | }; 32 | } // namespace sentencepiece 33 | #endif // TRAINER_FACTORY_H_ 34 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/str_format.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_STR_FORMAT_H 17 | #define ABSL_STRINGS_STR_FORMAT_H 18 | 19 | #include 20 | 21 | #include 22 | 23 | #include "third_party/absl/strings/string_view.h" 24 | 25 | namespace absl { 26 | 27 | template 28 | std::string StrFormat(const char *format, Args const &... args) { 29 | const int len = ::snprintf(nullptr, 0, format, args...); 30 | std::string s; 31 | s.resize(len); 32 | ::snprintf(&s[0], s.size() + 1, format, args...); 33 | return s; 34 | } 35 | 36 | } // namespace absl 37 | #endif // ABSL_MEMORY_MEMORY_H_ 38 | -------------------------------------------------------------------------------- /src/sentencepiece/src/word_model.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "util.h" 16 | #include "word_model.h" 17 | 18 | namespace sentencepiece { 19 | namespace word { 20 | 21 | Model::Model(const ModelProto &model_proto) { 22 | model_proto_ = &model_proto; 23 | InitializePieces(); 24 | } 25 | 26 | Model::~Model() {} 27 | 28 | EncodeResult Model::Encode(absl::string_view normalized) const { 29 | if (!status().ok() || normalized.empty()) { 30 | return {}; 31 | } 32 | 33 | EncodeResult output; 34 | for (const auto &w : SplitIntoWords(normalized)) { 35 | output.emplace_back(w, PieceToId(w)); 36 | } 37 | 38 | return output; 39 | } 40 | 41 | } // namespace word 42 | } // namespace sentencepiece 43 | -------------------------------------------------------------------------------- /man/sentencepiece_load_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentencepiece.R 3 | \name{sentencepiece_load_model} 4 | \alias{sentencepiece_load_model} 5 | \title{Load a Sentencepiece model} 6 | \usage{ 7 | sentencepiece_load_model(file = "sentencepiece.model") 8 | } 9 | \arguments{ 10 | \item{file}{path to the file containing the Sentencepiece model} 11 | } 12 | \value{ 13 | an object of class \code{sentencepiece} which is a list with elements 14 | \itemize{ 15 | \item{model: an Rcpp pointer to the model} 16 | \item{model_path: the path to the model} 17 | \item{vocab_size: the size of the Sentencepiece vocabulary} 18 | \item{vocabulary: the Sentencepiece vocabulary which is a data.frame with columns id and subword} 19 | } 20 | } 21 | \description{ 22 | Load a Sentencepiece model which either was trained with \code{\link{sentencepiece}} or which you have found in the wild. 23 | } 24 | \examples{ 25 | model <- system.file(package = "sentencepiece", "models", "nl-fr-dekamer.model") 26 | model <- sentencepiece_load_model(file = model) 27 | 28 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 29 | "On est d'accord sur le prix de la biere?") 30 | sentencepiece_encode(model, x = txt, type = "subwords") 31 | sentencepiece_encode(model, x = txt, type = "ids") 32 | } 33 | -------------------------------------------------------------------------------- /src/sentencepiece/src/char_model_trainer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef CHAR_MODEL_TRAINER_H_ 16 | #define CHAR_MODEL_TRAINER_H_ 17 | 18 | #include "sentencepiece_model.pb.h" 19 | #include "trainer_interface.h" 20 | 21 | namespace sentencepiece { 22 | namespace character { 23 | 24 | // Trainer class for character model. 
25 | class Trainer : public TrainerInterface { 26 | public: 27 | Trainer(const TrainerSpec &trainer_spec, 28 | const NormalizerSpec &normalizer_spec, 29 | const NormalizerSpec &denormalizer_spec) 30 | : TrainerInterface::TrainerInterface(trainer_spec, normalizer_spec, 31 | denormalizer_spec) {} 32 | 33 | util::Status Train() override; 34 | }; 35 | } // namespace character 36 | } // namespace sentencepiece 37 | #endif // CHAR_MODEL_TRAINER_H_ 38 | -------------------------------------------------------------------------------- /src/sentencepiece/src/unicode_script.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | 17 | #include "third_party/absl/container/flat_hash_map.h" 18 | #include "unicode_script.h" 19 | #include "unicode_script_map.h" 20 | #include "util.h" 21 | 22 | namespace sentencepiece { 23 | namespace unicode_script { 24 | namespace { 25 | class GetScriptInternal { 26 | public: 27 | GetScriptInternal() { InitTable(&smap_); } 28 | 29 | ScriptType GetScript(char32 c) const { 30 | return port::FindWithDefault(smap_, c, ScriptType::U_Common); 31 | } 32 | 33 | private: 34 | absl::flat_hash_map smap_; 35 | }; 36 | } // namespace 37 | 38 | ScriptType GetScript(char32 c) { 39 | static GetScriptInternal sc; 40 | return sc.GetScript(c); 41 | } 42 | } // namespace unicode_script 43 | } // namespace sentencepiece 44 | -------------------------------------------------------------------------------- /src/sentencepiece/src/freelist_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "freelist.h" 16 | #include "testharness.h" 17 | 18 | namespace sentencepiece { 19 | namespace model { 20 | 21 | TEST(FreeListTest, BasicTest) { 22 | FreeList l(5); 23 | EXPECT_EQ(0, l.size()); 24 | 25 | constexpr size_t kSize = 32; 26 | 27 | for (size_t i = 0; i < kSize; ++i) { 28 | int *n = l.Allocate(); 29 | EXPECT_EQ(0, *n); 30 | *n = i; 31 | } 32 | 33 | EXPECT_EQ(kSize, l.size()); 34 | for (size_t i = 0; i < kSize; ++i) { 35 | EXPECT_EQ(i, *l[i]); 36 | } 37 | 38 | l.Free(); 39 | EXPECT_EQ(0, l.size()); 40 | 41 | // Zero-initialized after `Free`. 
42 | for (size_t i = 0; i < kSize; ++i) { 43 | int *n = l.Allocate(); 44 | EXPECT_EQ(0, *n); 45 | } 46 | } 47 | } // namespace model 48 | } // namespace sentencepiece 49 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/match.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_MATCH_H_ 17 | #define ABSL_STRINGS_MATCH_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/string_view.h" 22 | 23 | namespace absl { 24 | 25 | inline bool StartsWith(absl::string_view text, absl::string_view prefix) { 26 | return prefix.empty() || 27 | (text.size() >= prefix.size() && 28 | memcmp(text.data(), prefix.data(), prefix.size()) == 0); 29 | } 30 | 31 | inline bool EndsWith(absl::string_view text, absl::string_view suffix) { 32 | return suffix.empty() || (text.size() >= suffix.size() && 33 | memcmp(text.data() + (text.size() - suffix.size()), 34 | suffix.data(), suffix.size()) == 0); 35 | } 36 | 37 | } // namespace absl 38 | #endif // ABSL_STRINGS_MATCH_H_ 39 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/ascii.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // 16 | #ifndef ABSL_STRINGS_ASCII_H_ 17 | #define ABSL_STRINGS_ASCII_H_ 18 | 19 | #include 20 | 21 | #include 22 | 23 | #include "third_party/absl/strings/string_view.h" 24 | 25 | namespace absl { 26 | 27 | inline std::string AsciiStrToUpper(absl::string_view value) { 28 | std::string upper_value = std::string(value); 29 | std::transform(upper_value.begin(), upper_value.end(), upper_value.begin(), 30 | ::toupper); 31 | return upper_value; 32 | } 33 | 34 | inline std::string AsciiStrToLower(absl::string_view value) { 35 | std::string lower_value = std::string(value); 36 | std::transform(lower_value.begin(), lower_value.end(), lower_value.begin(), 37 | ::tolower); 38 | return lower_value; 39 | } 40 | } // namespace absl 41 | #endif // ABSL_STRINGS_ASCII_H_ 42 | -------------------------------------------------------------------------------- /src/third_party/darts_clone/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008-2011, Susumu Yata 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | - Neither the name of the nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 9 | 10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 11 | -------------------------------------------------------------------------------- /src/sentencepiece/src/char_model.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "char_model.h" 16 | #include "util.h" 17 | 18 | namespace sentencepiece { 19 | namespace character { 20 | 21 | Model::Model(const ModelProto &model_proto) { 22 | model_proto_ = &model_proto; 23 | InitializePieces(); 24 | } 25 | 26 | Model::~Model() {} 27 | 28 | EncodeResult Model::Encode(absl::string_view normalized) const { 29 | if (!status().ok() || normalized.empty()) { 30 | return {}; 31 | } 32 | 33 | // Splits the input into character sequence 34 | EncodeResult output; 35 | while (!normalized.empty()) { 36 | const int mblen = matcher_->PrefixMatch(normalized); 37 | absl::string_view w(normalized.data(), mblen); 38 | output.emplace_back(w, PieceToId(w)); 39 | normalized.remove_prefix(mblen); 40 | } 41 | 42 | return output; 43 | } 44 | 45 | } // namespace character 46 | } // namespace sentencepiece 47 | -------------------------------------------------------------------------------- /src/sentencepiece/src/init.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef INIT_H_ 16 | #define INIT_H_ 17 | 18 | #include "common.h" 19 | #include "third_party/absl/flags/flag.h" 20 | #include "third_party/absl/flags/parse.h" 21 | 22 | ABSL_DECLARE_FLAG(int32, minloglevel); 23 | 24 | namespace sentencepiece { 25 | inline void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, 26 | bool remove_arg = true) { 27 | const auto unused_args = absl::ParseCommandLine(*argc, *argv); 28 | 29 | if (remove_arg) { 30 | char **argv_val = *argv; 31 | *argv = argv_val = argv_val + *argc - unused_args.size(); 32 | std::copy(unused_args.begin(), unused_args.end(), argv_val); 33 | *argc = static_cast(unused_args.size()); 34 | } 35 | 36 | logging::SetMinLogLevel(absl::GetFlag(FLAGS_minloglevel)); 37 | } 38 | } // namespace sentencepiece 39 | 40 | #endif // INIT_H_ 41 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 23 | - {os: ubuntu-latest, r: 'release'} 24 | - {os: ubuntu-latest, r: 'oldrel'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | - {os: ubuntu-latest, r: 'oldrel-2'} 27 | - {os: ubuntu-latest, r: 'oldrel-3'} 28 | 29 | env: 30 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 31 | RSPM: ${{ matrix.config.rspm }} 32 | GITHUB_PAT: ${{ secrets.PAT }} 33 | steps: 34 | - uses: actions/checkout@v3 35 | 36 | - uses: r-lib/actions/setup-pandoc@v2 37 | 
38 | - uses: r-lib/actions/setup-r@v2 39 | with: 40 | r-version: ${{ matrix.config.r }} 41 | http-user-agent: ${{ matrix.config.http-user-agent }} 42 | use-public-rspm: true 43 | 44 | - uses: r-lib/actions/setup-r-dependencies@v2 45 | with: 46 | extra-packages: any::rcmdcheck 47 | needs: check 48 | 49 | - uses: r-lib/actions/check-r-package@v2 50 | with: 51 | upload-snapshots: true -------------------------------------------------------------------------------- /man/read_word2vec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/word2vec.R 3 | \name{read_word2vec} 4 | \alias{read_word2vec} 5 | \title{Read a word2vec embedding file} 6 | \usage{ 7 | read_word2vec( 8 | x, 9 | type = c("txt", "bin"), 10 | n = .Machine$integer.max, 11 | encoding = "UTF-8", 12 | normalize = TRUE 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{path to the file} 17 | 18 | \item{type}{either 'bin' or 'txt' indicating the \code{file} is a binary file or a text file} 19 | 20 | \item{n}{integer, indicating to limit the number of words to read in. Defaults to reading all words.} 21 | 22 | \item{encoding}{encoding to be assumed for the words. Defaults to 'UTF-8'} 23 | 24 | \item{normalize}{logical indicating to normalize the embeddings by dividing by the factor (sqrt(sum(x . x) / length(x))). Defaults to TRUE.} 25 | } 26 | \value{ 27 | a matrix with one row per token containing the embedding of the token 28 | } 29 | \description{ 30 | Read a word2vec embedding file as a dense matrix. This uses \code{\link[word2vec]{read.wordvectors}} from the word2vec package. 31 | } 32 | \examples{ 33 | folder <- system.file(package = "sentencepiece", "models") 34 | embedding <- file.path(folder, "nl.wiki.bpe.vs1000.d25.w2v.bin") 35 | embedding <- read_word2vec(embedding, type = "bin") 36 | head(embedding) 37 | embedding <- file.path(folder, "nl.wiki.bpe.vs1000.d25.w2v.txt") 38 | embedding <- read_word2vec(embedding, type = "txt") 39 | head(embedding, n = 10) 40 | } 41 | \seealso{ 42 | \code{\link[word2vec]{read.wordvectors}} 43 | } 44 | -------------------------------------------------------------------------------- /src/sentencepiece/src/word_model_trainer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef WORD_MODEL_TRAINER_H_ 16 | #define WORD_MODEL_TRAINER_H_ 17 | 18 | #include "sentencepiece_model.pb.h" 19 | #include "trainer_interface.h" 20 | 21 | namespace sentencepiece { 22 | namespace word { 23 | 24 | // Trainer class for word model. 25 | // 26 | // Word model simply counts the frequency of 27 | // space-delimited tokens, then keep top 28 | // |vocab_size| frequent tokens. 
29 | class Trainer : public TrainerInterface { 30 | public: 31 | Trainer(const TrainerSpec &trainer_spec, 32 | const NormalizerSpec &normalizer_spec, 33 | const NormalizerSpec &denormalizer_spec) 34 | : TrainerInterface::TrainerInterface(trainer_spec, normalizer_spec, 35 | denormalizer_spec) {} 36 | 37 | util::Status Train() override; 38 | }; 39 | } // namespace word 40 | } // namespace sentencepiece 41 | #endif // WORD_MODEL_TRAINER_H_ 42 | -------------------------------------------------------------------------------- /R/word2vec.R: -------------------------------------------------------------------------------- 1 | #' @title Read a word2vec embedding file 2 | #' @description Read a word2vec embedding file as a dense matrix. This uses \code{\link[word2vec]{read.wordvectors}} from the word2vec package. 3 | #' @param x path to the file 4 | #' @param type either 'bin' or 'txt' indicating the \code{file} is a binary file or a text file 5 | #' @param n integer, indicating to limit the number of words to read in. Defaults to reading all words. 6 | #' @param normalize logical indicating to normalize the embeddings by dividing by the factor (sqrt(sum(x . x) / length(x))). Defaults to TRUE. 7 | #' @param encoding encoding to be assumed for the words. Defaults to 'UTF-8' 8 | #' @return a matrix with one row per token containing the embedding of the token 9 | #' @seealso \code{\link[word2vec]{read.wordvectors}} 10 | #' @export 11 | #' @examples 12 | #' folder <- system.file(package = "sentencepiece", "models") 13 | #' embedding <- file.path(folder, "nl.wiki.bpe.vs1000.d25.w2v.bin") 14 | #' embedding <- read_word2vec(embedding, type = "bin") 15 | #' head(embedding) 16 | #' embedding <- file.path(folder, "nl.wiki.bpe.vs1000.d25.w2v.txt") 17 | #' embedding <- read_word2vec(embedding, type = "txt") 18 | #' head(embedding, n = 10) 19 | read_word2vec <- function(x, type = c("txt", "bin"), n = .Machine$integer.max, encoding = "UTF-8", normalize = TRUE){ 20 | type <- match.arg(type) 21 | requireNamespace("word2vec") 22 | if(packageVersion("word2vec") < "0.2.0"){ 23 | stop("This requires word2vec package >= 0.2.0") 24 | } 25 | embedding <- word2vec::read.wordvectors(file = x, type = type, n = n, encoding = encoding, normalize = normalize) 26 | embedding 27 | } -------------------------------------------------------------------------------- /man/sentencepiece_decode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentencepiece.R 3 | \name{sentencepiece_decode} 4 | \alias{sentencepiece_decode} 5 | \title{Decode encoded sequences back to text} 6 | \usage{ 7 | sentencepiece_decode(model, x) 8 | } 9 | \arguments{ 10 | \item{model}{an object of class \code{sentencepiece} as returned by \code{\link{sentencepiece_load_model}} or \code{\link{sentencepiece}}} 11 | 12 | \item{x}{an integer vector of Sentencepiece id's or a list of these} 13 | } 14 | \value{ 15 | a character vector of detokenised text or if you encoded with \code{nbest}, a list of these 16 | } 17 | \description{ 18 | Decode a sequence of Sentencepiece ids into text again 19 | } 20 | \examples{ 21 | model <- system.file(package = "sentencepiece", "models", "nl-fr-dekamer.model") 22 | model <- sentencepiece_load_model(file = model) 23 | 24 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 25 | "On est d'accord sur le prix de la biere?") 26 | 27 | x <- sentencepiece_encode(model, x = txt, type = "subwords") 28 | 
sentencepiece_decode(model, x) 29 | x <- sentencepiece_encode(model, x = txt, type = "ids") 30 | sentencepiece_decode(model, x) 31 | 32 | model <- system.file(package = "sentencepiece", "models", 33 | "nl-fr-dekamer-unigram.model") 34 | model <- sentencepiece_load_model(file = model) 35 | x <- sentencepiece_encode(model, x = txt, type = "subwords", nbest = 3) 36 | sentencepiece_decode(model, x) 37 | x <- sentencepiece_encode(model, x = txt, type = "subwords", 38 | nbest = 3, alpha = 0.1) 39 | sentencepiece_decode(model, x) 40 | } 41 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/str_cat.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_STR_CAT_H_ 17 | #define ABSL_STRINGS_STR_CAT_H_ 18 | 19 | #include 20 | #include 21 | 22 | #include "third_party/absl/strings/numbers.h" 23 | #include "third_party/absl/strings/string_view.h" 24 | 25 | namespace absl { 26 | 27 | inline std::string StrCat(int v) { 28 | std::ostringstream os; 29 | os << v; 30 | return os.str(); 31 | } 32 | 33 | inline std::string StrCat(absl::string_view str) { 34 | return std::string(str.data(), str.size()); 35 | } 36 | 37 | template 38 | inline std::string StrCat(absl::string_view first, const T &...rest) { 39 | return StrCat(first) + StrCat(rest...); 40 | } 41 | 42 | template 43 | inline std::string StrCat(int first, const T &...rest) { 44 | return StrCat(first) + StrCat(rest...); 45 | } 46 | 47 | inline void StrAppend(std::string *base, absl::string_view str) { 48 | base->append(str.data(), str.size()); 49 | } 50 | 51 | } // namespace absl 52 | #endif // ABSL_STRINGS_STR_CAT_H_ 53 | -------------------------------------------------------------------------------- /src/sentencepiece/src/unicode_script_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "common.h" 16 | #include "testharness.h" 17 | #include "third_party/absl/strings/string_view.h" 18 | #include "unicode_script.h" 19 | #include "util.h" 20 | 21 | namespace sentencepiece { 22 | namespace unicode_script { 23 | ScriptType GetScriptType(absl::string_view s) { 24 | const auto ut = string_util::UTF8ToUnicodeText(s); 25 | CHECK_EQ(1, ut.size()); 26 | return GetScript(ut[0]); 27 | } 28 | 29 | TEST(UnicodeScript, GetScriptTypeTest) { 30 | EXPECT_EQ(U_Han, GetScriptType("京")); 31 | EXPECT_EQ(U_Han, GetScriptType("太")); 32 | EXPECT_EQ(U_Hiragana, GetScriptType("い")); 33 | EXPECT_EQ(U_Katakana, GetScriptType("グ")); 34 | EXPECT_EQ(U_Common, GetScriptType("ー")); 35 | EXPECT_EQ(U_Latin, GetScriptType("a")); 36 | EXPECT_EQ(U_Latin, GetScriptType("A")); 37 | EXPECT_EQ(U_Common, GetScriptType("0")); 38 | EXPECT_EQ(U_Common, GetScriptType("$")); 39 | EXPECT_EQ(U_Common, GetScriptType("@")); 40 | EXPECT_EQ(U_Common, GetScriptType("-")); 41 | } 42 | } // namespace unicode_script 43 | } // namespace sentencepiece 44 | -------------------------------------------------------------------------------- /src/rcpp_wordpiece.cpp: -------------------------------------------------------------------------------- 1 | #include <Rcpp.h> 2 | 3 | 4 | // [[Rcpp::export]] 5 | Rcpp::StringVector wordpiece_encode_as_subwords(std::string x, std::vector<std::string> vocabulary, std::string unk_token="[UNK]", unsigned int max_input_chars_per_word=100) { 6 | // Tokenizes a piece of text into its word pieces, using a greedy longest-match-first algorithm to perform tokenization 7 | Rcpp::StringVector output_tokens; 8 | unsigned int len = x.length(); 9 | if(len > max_input_chars_per_word){ 10 | output_tokens.push_back(unk_token); 11 | } else{ 12 | unsigned int start = 0; 13 | std::vector<std::string> sub_tokens; 14 | while(start < len){ 15 | unsigned int end = len - 1; 16 | std::string cur_substr = ""; 17 | std::string substr; 18 | while(start <= end){ 19 | substr = x.substr(start, end - start + 1); 20 | // Rcpp::Rcout << substr << ":" << start << "-" << end <<"\n"; 21 | if(start > 0){ 22 | substr = "##" + substr; 23 | } 24 | if(std::find(vocabulary.begin(), vocabulary.end(), substr) != vocabulary.end()){ 25 | cur_substr = substr; 26 | break; 27 | } 28 | if (end > 0) { 29 | end = end - 1; 30 | } else { 31 | break; 32 | } 33 | } 34 | if(cur_substr == ""){ 35 | sub_tokens.push_back(unk_token); 36 | break; 37 | } 38 | sub_tokens.push_back(cur_substr); 39 | start = end + 1; 40 | } 41 | if(sub_tokens.size() == 0){ 42 | output_tokens.push_back(unk_token); 43 | }else{ 44 | for (unsigned int i = 0; i < sub_tokens.size(); i++){ 45 | output_tokens.push_back(sub_tokens[i]); 46 | } 47 | } 48 | } 49 | return output_tokens; 50 | } -------------------------------------------------------------------------------- /src/sentencepiece/src/filesystem_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "filesystem.h" 16 | #include "testharness.h" 17 | #include "third_party/absl/strings/str_cat.h" 18 | #include "util.h" 19 | 20 | namespace sentencepiece { 21 | 22 | TEST(UtilTest, FilesystemTest) { 23 | const std::vector kData = { 24 | "This" 25 | "is" 26 | "a" 27 | "test"}; 28 | 29 | { 30 | auto output = filesystem::NewWritableFile( 31 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "test_file")); 32 | for (size_t i = 0; i < kData.size(); ++i) { 33 | output->WriteLine(kData[i]); 34 | } 35 | } 36 | 37 | { 38 | auto input = filesystem::NewReadableFile( 39 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "test_file")); 40 | std::string line; 41 | for (size_t i = 0; i < kData.size(); ++i) { 42 | EXPECT_TRUE(input->ReadLine(&line)); 43 | EXPECT_EQ(kData[i], line); 44 | } 45 | EXPECT_FALSE(input->ReadLine(&line)); 46 | } 47 | } 48 | 49 | TEST(UtilTest, FilesystemInvalidFileTest) { 50 | auto input = filesystem::NewReadableFile("__UNKNOWN__FILE__"); 51 | EXPECT_FALSE(input->status().ok()); 52 | } 53 | 54 | } // namespace sentencepiece 55 | -------------------------------------------------------------------------------- /src/sentencepiece/src/model_factory.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "bpe_model.h" 16 | #include "char_model.h" 17 | #include "model_factory.h" 18 | #include "third_party/absl/memory/memory.h" 19 | #include "unigram_model.h" 20 | #include "word_model.h" 21 | 22 | namespace sentencepiece { 23 | 24 | // Instantiate Model instance from |model_proto| 25 | std::unique_ptr ModelFactory::Create( 26 | const ModelProto& model_proto) { 27 | const auto& trainer_spec = model_proto.trainer_spec(); 28 | 29 | switch (trainer_spec.model_type()) { 30 | case TrainerSpec::UNIGRAM: 31 | return absl::make_unique(model_proto); 32 | break; 33 | case TrainerSpec::BPE: 34 | return absl::make_unique(model_proto); 35 | break; 36 | case TrainerSpec::WORD: 37 | return absl::make_unique(model_proto); 38 | break; 39 | case TrainerSpec::CHAR: 40 | return absl::make_unique(model_proto); 41 | break; 42 | default: 43 | LOG(ERROR) << "Unknown model_type: " << trainer_spec.model_type(); 44 | return nullptr; 45 | break; 46 | } 47 | 48 | return absl::make_unique(model_proto); 49 | } 50 | } // namespace sentencepiece 51 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2008 Google Inc. All rights reserved. 
2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | Code generated by the Protocol Buffer compiler is owned by the owner 30 | of the input file used when generating it. This code is not 31 | standalone and requires a support library to be linked with it. This 32 | support library is itself covered by the above license. 33 | -------------------------------------------------------------------------------- /src/sentencepiece/src/trainer_factory_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "testharness.h" 16 | #include "trainer_factory.h" 17 | 18 | namespace sentencepiece { 19 | 20 | TEST(TrainerFactoryTest, BasicTest) { 21 | TrainerSpec trainer_spec; 22 | NormalizerSpec normalizer_spec; 23 | NormalizerSpec denormalizer_spec; 24 | 25 | trainer_spec.set_model_prefix("model"); 26 | trainer_spec.add_input("input"); 27 | 28 | { 29 | trainer_spec.set_model_type(TrainerSpec::UNIGRAM); 30 | auto m = TrainerFactory::Create(trainer_spec, normalizer_spec, 31 | denormalizer_spec); 32 | } 33 | 34 | { 35 | trainer_spec.set_model_type(TrainerSpec::BPE); 36 | auto m = TrainerFactory::Create(trainer_spec, normalizer_spec, 37 | denormalizer_spec); 38 | } 39 | 40 | { 41 | trainer_spec.set_model_type(TrainerSpec::WORD); 42 | auto m = TrainerFactory::Create(trainer_spec, normalizer_spec, 43 | denormalizer_spec); 44 | } 45 | 46 | { 47 | trainer_spec.set_model_type(TrainerSpec::CHAR); 48 | auto m = TrainerFactory::Create(trainer_spec, normalizer_spec, 49 | denormalizer_spec); 50 | } 51 | } 52 | } // namespace sentencepiece 53 | -------------------------------------------------------------------------------- /man/wordpiece_encode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wordpiece.R 3 | \name{wordpiece_encode} 4 | \alias{wordpiece_encode} 5 | \title{Wordpiece encoding} 6 | \usage{ 7 | wordpiece_encode( 8 | x, 9 | vocabulary = character(), 10 | type = c("subwords", "ids"), 11 | unk_token = "[UNK]", 12 | max_input_chars_per_word = 100L 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{a character vector with text which can be splitted based on white space to obtain words} 17 | 18 | \item{vocabulary}{a character vector of the vocabulary} 19 | 20 | \item{type}{a character string, either 'subwords' or 'ids' to get the subwords or the corresponding ids of these subwords as defined in the vocabulary of the model. 21 | Defaults to 'subwords'.} 22 | 23 | \item{unk_token}{character string with a value for a token which is not part of the vocabulary. Defaults to '[UNK]'} 24 | 25 | \item{max_input_chars_per_word}{integer. A word which is longer than this specified number of characters will be set to the unknown token.} 26 | } 27 | \value{ 28 | a list of subword tokens 29 | } 30 | \description{ 31 | Wordpiece encoding, usefull for BERT-style tokenisation. 32 | Experimental version mimicing class WordpieceTokenizer from \url{https://github.com/huggingface/transformers/blob/master/src/transformers/models/bert/tokenization_bert.py} 33 | } 34 | \examples{ 35 | wordpiece_encode("unaffable", vocabulary = c("un", "##aff", "##able")) 36 | wordpiece_encode(x = c("unaffable", "unaffableun"), 37 | vocabulary = c("un", "##aff", "##able")) 38 | wordpiece_encode(x = c("unaffable", "unaffableun", "unknown territory"), 39 | vocabulary = c("un", "##aff", "##able", "##un")) 40 | wordpiece_encode(x = c("unaffable", "unaffableun", "unknown territory"), 41 | vocabulary = c("un", "##aff", "##able", "##un"), 42 | type = "ids") 43 | } 44 | -------------------------------------------------------------------------------- /src/third_party/absl/flags/flag.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef ABSL_FLAGS_FLAG_H_ 16 | #define ABSL_FLAGS_FLAG_H_ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | namespace absl { 24 | namespace internal { 25 | struct FlagFunc; 26 | 27 | void RegisterFlag(const std::string &name, FlagFunc *func); 28 | } // namespace internal 29 | 30 | template 31 | class Flag { 32 | public: 33 | Flag(const char *name, const char *type, const char *help, 34 | const T &defautl_value); 35 | virtual ~Flag(); 36 | const T &value() const; 37 | void set_value(const T &value); 38 | void set_value_as_str(const std::string &value_as_str); 39 | 40 | private: 41 | T value_; 42 | std::unique_ptr func_; 43 | }; 44 | 45 | template 46 | const T &GetFlag(const Flag &flag) { 47 | return flag.value(); 48 | } 49 | 50 | template 51 | void SetFlag(Flag *flag, const V &v) { 52 | const T value(v); 53 | flag->set_value(value); 54 | } 55 | } // namespace absl 56 | 57 | #define ABSL_FLAG(Type, name, defautl_value, help) \ 58 | absl::Flag FLAGS_##name(#name, #Type, help, defautl_value); 59 | 60 | #define ABSL_DECLARE_FLAG(Type, name) extern absl::Flag FLAGS_##name; 61 | 62 | #endif // ABSL_FLAGS_FLAG_H_ 63 | -------------------------------------------------------------------------------- /src/sentencepiece/src/bpe_model.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef BPE_MODEL_H_ 16 | #define BPE_MODEL_H_ 17 | 18 | #include "model_interface.h" 19 | #include "sentencepiece_model.pb.h" 20 | 21 | namespace sentencepiece { 22 | namespace bpe { 23 | 24 | // Segmentation model with BPE (Byte Pair Encoding) 25 | // Details: 26 | // Neural Machine Translation of Rare Words with Subword Units 27 | // https://arxiv.org/abs/1508.07909 28 | // 29 | // https://en.wikipedia.org/wiki/Byte_pair_encoding 30 | class Model : public ModelInterface { 31 | public: 32 | explicit Model(const ModelProto &model_proto); 33 | ~Model() override; 34 | 35 | EncodeResult Encode(absl::string_view normalized) const override { 36 | return SampleEncode(normalized, 0.0); 37 | } 38 | 39 | // Sampling with BPE-dropout: https://arxiv.org/pdf/1910.13267.pdf 40 | // `alpha` is dropout probability in BPE-dropout paper. 41 | // Skips merge operation with `alpha` probability. 42 | // When alpha <= 0.0, no sampling is performed. 
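  // Illustrative sketch (not part of the upstream header; `model` and
  // `normalized` are assumed to be an instance of this class and a normalized
  // input string):
  //
  //   EncodeResult a = model.SampleEncode(normalized, 0.1f);  // each merge skipped with probability 0.1
  //   EncodeResult b = model.SampleEncode(normalized, 0.1f);  // may yield a different segmentation than `a`
  //   EncodeResult c = model.Encode(normalized);              // deterministic; equivalent to SampleEncode(normalized, 0.0)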
43 | EncodeResult SampleEncode(absl::string_view normalized, 44 | float alpha) const override; 45 | 46 | bool IsSampleEncodeAvailable() const override { return true; } 47 | 48 | bool IsNBestEncodeAvailable() const override { return false; } 49 | }; 50 | } // namespace bpe 51 | } // namespace sentencepiece 52 | #endif // BPE_MODEL_H_ 53 | -------------------------------------------------------------------------------- /src/sentencepiece/src/model_factory_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "model_factory.h" 16 | #include "testharness.h" 17 | 18 | namespace sentencepiece { 19 | 20 | TEST(ModelFactoryTest, BasicTest) { 21 | ModelProto model_proto; 22 | 23 | auto *sp1 = model_proto.add_pieces(); 24 | auto *sp2 = model_proto.add_pieces(); 25 | auto *sp3 = model_proto.add_pieces(); 26 | 27 | sp1->set_type(ModelProto::SentencePiece::UNKNOWN); 28 | sp1->set_piece(""); 29 | sp2->set_type(ModelProto::SentencePiece::CONTROL); 30 | sp2->set_piece(""); 31 | sp3->set_type(ModelProto::SentencePiece::CONTROL); 32 | sp3->set_piece(""); 33 | 34 | auto *sp4 = model_proto.add_pieces(); 35 | sp4->set_piece("test"); 36 | sp4->set_score(1.0); 37 | 38 | { 39 | model_proto.mutable_trainer_spec()->set_model_type(TrainerSpec::UNIGRAM); 40 | auto m = ModelFactory::Create(model_proto); 41 | } 42 | 43 | { 44 | model_proto.mutable_trainer_spec()->set_model_type(TrainerSpec::BPE); 45 | auto m = ModelFactory::Create(model_proto); 46 | } 47 | 48 | { 49 | model_proto.mutable_trainer_spec()->set_model_type(TrainerSpec::WORD); 50 | auto m = ModelFactory::Create(model_proto); 51 | } 52 | 53 | { 54 | model_proto.mutable_trainer_spec()->set_model_type(TrainerSpec::CHAR); 55 | auto m = ModelFactory::Create(model_proto); 56 | } 57 | } 58 | } // namespace sentencepiece 59 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | spc_train <- function(args) { 5 | .Call('_sentencepiece_spc_train', PACKAGE = 'sentencepiece', args) 6 | } 7 | 8 | spc_load_model <- function(file) { 9 | .Call('_sentencepiece_spc_load_model', PACKAGE = 'sentencepiece', file) 10 | } 11 | 12 | spc_encode_as_subwords <- function(model, x) { 13 | .Call('_sentencepiece_spc_encode_as_subwords', PACKAGE = 'sentencepiece', model, x) 14 | } 15 | 16 | spc_encode_as_ids <- function(model, x) { 17 | .Call('_sentencepiece_spc_encode_as_ids', PACKAGE = 'sentencepiece', model, x) 18 | } 19 | 20 | spc_encode_as_subwords_sample <- function(model, x, nbest_size = -1L, alpha = 1) { 21 | .Call('_sentencepiece_spc_encode_as_subwords_sample', PACKAGE = 'sentencepiece', model, x, nbest_size, alpha) 22 | } 23 | 24 | 
spc_encode_as_ids_sample <- function(model, x, nbest_size = -1L, alpha = 1) { 25 | .Call('_sentencepiece_spc_encode_as_ids_sample', PACKAGE = 'sentencepiece', model, x, nbest_size, alpha) 26 | } 27 | 28 | spc_encode_as_subwords_nbest <- function(model, x, nbest_size = -1L) { 29 | .Call('_sentencepiece_spc_encode_as_subwords_nbest', PACKAGE = 'sentencepiece', model, x, nbest_size) 30 | } 31 | 32 | spc_encode_as_ids_nbest <- function(model, x, nbest_size = -1L) { 33 | .Call('_sentencepiece_spc_encode_as_ids_nbest', PACKAGE = 'sentencepiece', model, x, nbest_size) 34 | } 35 | 36 | spc_decode_ids <- function(model, x) { 37 | .Call('_sentencepiece_spc_decode_ids', PACKAGE = 'sentencepiece', model, x) 38 | } 39 | 40 | spc_decode_subwords <- function(model, x) { 41 | .Call('_sentencepiece_spc_decode_subwords', PACKAGE = 'sentencepiece', model, x) 42 | } 43 | 44 | wordpiece_encode_as_subwords <- function(x, vocabulary, unk_token = "[UNK]", max_input_chars_per_word = 100L) { 45 | .Call('_sentencepiece_wordpiece_encode_as_subwords', PACKAGE = 'sentencepiece', x, vocabulary, unk_token, max_input_chars_per_word) 46 | } 47 | 48 | -------------------------------------------------------------------------------- /src/sentencepiece/src/char_model_trainer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | 17 | #include "char_model.h" 18 | #include "char_model_trainer.h" 19 | #include "util.h" 20 | 21 | namespace sentencepiece { 22 | namespace character { 23 | 24 | util::Status Trainer::Train() { 25 | RETURN_IF_ERROR(status()); 26 | 27 | CHECK_OR_RETURN(normalizer_spec_.escape_whitespaces()); 28 | CHECK_EQ_OR_RETURN(TrainerSpec::CHAR, trainer_spec_.model_type()); 29 | 30 | RETURN_IF_ERROR(LoadSentences()); 31 | 32 | const int vocab_size = trainer_spec_.vocab_size() - meta_pieces_.size(); 33 | CHECK_GE_OR_RETURN(vocab_size, 0); 34 | 35 | uint64 sum = 0; 36 | for (const auto &it : required_chars_) { 37 | sum += it.second; 38 | } 39 | 40 | const auto logsum = static_cast(log(static_cast(sum))); 41 | 42 | CHECK_OR_RETURN(final_pieces_.empty()); 43 | for (const auto &it : Sorted(required_chars_)) { 44 | if (!trainer_spec_.use_all_vocab() && 45 | final_pieces_.size() == static_cast(vocab_size)) { 46 | break; 47 | } 48 | final_pieces_.emplace_back( 49 | string_util::UnicodeCharToUTF8(it.first), 50 | static_cast(log(static_cast(it.second))) - logsum); 51 | } 52 | 53 | if (trainer_spec_.use_all_vocab()) { 54 | trainer_spec_.set_vocab_size(final_pieces_.size() + meta_pieces_.size()); 55 | } 56 | 57 | return Save(); 58 | } 59 | } // namespace character 60 | } // namespace sentencepiece 61 | -------------------------------------------------------------------------------- /src/sentencepiece/src/filesystem.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef FILESYSTEM_H_ 16 | #define FILESYSTEM_H_ 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "common.h" 25 | #include "sentencepiece_processor.h" 26 | #include "third_party/absl/strings/string_view.h" 27 | 28 | namespace sentencepiece { 29 | namespace filesystem { 30 | class ReadableFile { 31 | public: 32 | ReadableFile() {} 33 | explicit ReadableFile(absl::string_view filename, bool is_binary = false) {} 34 | virtual ~ReadableFile() {} 35 | 36 | virtual util::Status status() const = 0; 37 | virtual bool ReadLine(std::string *line) = 0; 38 | virtual bool ReadAll(std::string *line) = 0; 39 | }; 40 | 41 | class WritableFile { 42 | public: 43 | WritableFile() {} 44 | explicit WritableFile(absl::string_view filename, bool is_binary = false) {} 45 | virtual ~WritableFile() {} 46 | 47 | virtual util::Status status() const = 0; 48 | virtual bool Write(absl::string_view text) = 0; 49 | virtual bool WriteLine(absl::string_view text) = 0; 50 | }; 51 | 52 | std::unique_ptr NewReadableFile(absl::string_view filename, 53 | bool is_binary = false); 54 | std::unique_ptr NewWritableFile(absl::string_view filename, 55 | bool is_binary = false); 56 | 57 | } // namespace filesystem 58 | } // namespace sentencepiece 59 | #endif // FILESYSTEM_H_ 60 | -------------------------------------------------------------------------------- /man/BPEembedder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bpemb.R 3 | \name{BPEembedder} 4 | \alias{BPEembedder} 5 | \title{Build a BPEembed model containing a Sentencepiece and Word2vec model} 6 | \usage{ 7 | BPEembedder( 8 | x, 9 | tokenizer = c("bpe", "char", "unigram", "word"), 10 | args = list(vocab_size = 8000, coverage = 0.9999), 11 | ... 12 | ) 13 | } 14 | \arguments{ 15 | \item{x}{a data.frame with columns doc_id and text} 16 | 17 | \item{tokenizer}{character string with the type of sentencepiece tokenizer. Either 'bpe', 'char', 'unigram' or 'word' for Byte Pair Encoding, Character level encoding, 18 | Unigram encoding or pretokenised word encoding. Defaults to 'bpe' (Byte Pair Encoding). 
Passed on to \code{\link{sentencepiece}}} 19 | 20 | \item{args}{a list of arguments passed on to \code{\link{sentencepiece}}} 21 | 22 | \item{...}{arguments passed on to \code{\link[word2vec]{word2vec}} for training a word2vec model} 23 | } 24 | \value{ 25 | an object of class BPEembed which is a list with elements 26 | \itemize{ 27 | \item{model: a sentencepiece model as loaded with \code{\link{sentencepiece_load_model}}} 28 | \item{embedding: a matrix with embeddings as loaded with \code{\link[word2vec]{read.wordvectors}}} 29 | \item{dim: the dimension of the embedding} 30 | \item{n: the number of elements in the vocabulary} 31 | \item{file_sentencepiece: the sentencepiece model file} 32 | \item{file_word2vec: the word2vec embedding file} 33 | } 34 | } 35 | \description{ 36 | Build a sentencepiece model on text and build a matching word2vec model on the sentencepiece vocabulary 37 | } 38 | \examples{ 39 | library(tokenizers.bpe) 40 | data(belgium_parliament, package = "tokenizers.bpe") 41 | x <- subset(belgium_parliament, language \%in\% "dutch") 42 | model <- BPEembedder(x, tokenizer = "bpe", args = list(vocab_size = 1000), 43 | type = "cbow", dim = 20, iter = 10) 44 | model 45 | 46 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.") 47 | values <- predict(model, txt, type = "encode") 48 | } 49 | \seealso{ 50 | \code{\link{sentencepiece}}, \code{\link[word2vec]{word2vec}}, \code{\link{predict.BPEembed}} 51 | } 52 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/port.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // A common header that is included across all protobuf headers. 
We do our best 32 | // to avoid #defining any macros here; instead we generally put macros in 33 | // port_def.inc and port_undef.inc so they are not visible from outside of 34 | // protobuf. 35 | 36 | #ifndef GOOGLE_PROTOBUF_PORT_H__ 37 | #define GOOGLE_PROTOBUF_PORT_H__ 38 | 39 | 40 | #include 41 | 42 | 43 | #endif // GOOGLE_PROTOBUF_PORT_H__ 44 | -------------------------------------------------------------------------------- /src/sentencepiece/src/testharness.cc: -------------------------------------------------------------------------------- 1 | #include 2 | // Copyright 2016 Google Inc. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License.! 15 | 16 | #include "testharness.h" 17 | 18 | #ifndef OS_WIN 19 | #include 20 | #include 21 | #else 22 | #include 23 | #endif 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | #include "common.h" 30 | #include "third_party/absl/strings/str_cat.h" 31 | #include "util.h" 32 | 33 | namespace sentencepiece { 34 | namespace test { 35 | 36 | namespace { 37 | struct Test { 38 | const char *base; 39 | const char *name; 40 | void (*func)(); 41 | }; 42 | std::vector *tests; 43 | } // namespace 44 | 45 | bool RegisterTest(const char *base, const char *name, void (*func)()) { 46 | if (tests == nullptr) { 47 | tests = new std::vector; 48 | } 49 | Test t; 50 | t.base = base; 51 | t.name = name; 52 | t.func = func; 53 | tests->emplace_back(t); 54 | return true; 55 | } 56 | 57 | int RunAllTests() { 58 | int num = 0; 59 | #ifdef OS_WIN 60 | _mkdir(absl::GetFlag(FLAGS_test_tmpdir).c_str()); 61 | #else 62 | mkdir(absl::GetFlag(FLAGS_test_tmpdir).c_str(), S_IRUSR | S_IWUSR | S_IXUSR); 63 | #endif 64 | 65 | if (tests == nullptr) { 66 | Rcpp::Rcerr << "No tests are found" << std::endl; 67 | return 0; 68 | } 69 | 70 | for (const Test &t : *(tests)) { 71 | Rcpp::Rcerr << "[ RUN ] " << t.base << "." << t.name << std::endl; 72 | (*t.func)(); 73 | Rcpp::Rcerr << "[ OK ] " << t.base << "." << t.name << std::endl; 74 | ++num; 75 | } 76 | Rcpp::Rcerr << "==== PASSED " << num << " tests" << std::endl; 77 | 78 | return 0; 79 | } 80 | } // namespace test 81 | } // namespace sentencepiece 82 | -------------------------------------------------------------------------------- /src/sentencepiece/src/spm_export_vocab_main.cc: -------------------------------------------------------------------------------- 1 | 2 | 3 | // Copyright 2016 Google Inc. 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // n// http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | // See the License for the specific language governing permissions and 14 | // limitations under the License.! 15 | 16 | #include 17 | 18 | #include "common.h" 19 | #include "filesystem.h" 20 | #include "init.h" 21 | #include "sentencepiece_model.pb.h" 22 | #include "sentencepiece_processor.h" 23 | #include "third_party/absl/flags/flag.h" 24 | 25 | ABSL_FLAG(std::string, output, "", "Output filename"); 26 | ABSL_FLAG(std::string, model, "", "input model file name"); 27 | ABSL_FLAG(std::string, output_format, "vocab", 28 | "output format. choose from vocab or syms. vocab outputs pieces " 29 | "and scores, syms outputs pieces and indices."); 30 | 31 | int main(int argc, char *argv[]) { 32 | sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); 33 | 34 | sentencepiece::SentencePieceProcessor sp; 35 | CHECK_OK(sp.Load(absl::GetFlag(FLAGS_model))); 36 | 37 | auto output = 38 | sentencepiece::filesystem::NewWritableFile(absl::GetFlag(FLAGS_output)); 39 | CHECK_OK(output->status()); 40 | 41 | if (absl::GetFlag(FLAGS_output_format) == "vocab") { 42 | for (const auto &piece : sp.model_proto().pieces()) { 43 | std::ostringstream os; 44 | os << piece.piece() << "\t" << piece.score(); 45 | output->WriteLine(os.str()); 46 | } 47 | } else if (absl::GetFlag(FLAGS_output_format) == "syms") { 48 | for (int i = 0; i < sp.model_proto().pieces_size(); i++) { 49 | std::ostringstream os; 50 | os << sp.model_proto().pieces(i).piece() << "\t" << i; 51 | output->WriteLine(os.str()); 52 | } 53 | } else { 54 | LOG(FATAL) << "Unsupported output format: " 55 | << absl::GetFlag(FLAGS_output_format); 56 | } 57 | 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/statusor.cc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | #include 32 | 33 | #include 34 | 35 | namespace google { 36 | namespace protobuf { 37 | namespace util { 38 | namespace internal { 39 | 40 | void StatusOrHelper::Crash(const Status& status) { 41 | GOOGLE_LOG(FATAL) << "Attempting to fetch value instead of handling error " 42 | << status.ToString(); 43 | } 44 | 45 | } // namespace internal 46 | } // namespace util 47 | } // namespace protobuf 48 | } // namespace google 49 | -------------------------------------------------------------------------------- /src/sentencepiece/src/pretokenizer_for_training.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | #include 15 | 16 | #include "pretokenizer_for_training.h" 17 | #include "third_party/absl/strings/str_replace.h" 18 | 19 | namespace sentencepiece { 20 | namespace pretokenizer { 21 | 22 | namespace { 23 | // TODO(taku): They are defined in trainer_interface.h but we 24 | // defined them explicitly to avoid the dependency to trainier_interface. 25 | // Currently, we have no separated build rules. 26 | const char kWSStr[] = "\xe2\x96\x81"; 27 | const char kUPPBoundaryStr[] = "\t"; 28 | } // namespace 29 | 30 | std::string PretokenizerForTrainingInterface::PreTokenize( 31 | absl::string_view text) const { 32 | return Postprocess(Tokenize(Preprocess(text))); 33 | } 34 | 35 | // static 36 | std::string PretokenizerForTrainingInterface::Preprocess( 37 | absl::string_view text) { 38 | // Escapes kWSStr (_) as this character may not be processed by pre-tokenizer. 39 | return absl::StrReplaceAll(text, {{kWSStr, " "}}); 40 | } 41 | 42 | // static 43 | std::string PretokenizerForTrainingInterface::Postprocess( 44 | const SentencePieceText &spt) { 45 | // Inserts kUPPBoundaryStr before/after of token boundaries. 46 | std::string output; 47 | int prev = 0; 48 | for (const auto &piece : spt.pieces()) { 49 | if (prev == piece.begin() && piece.begin() != 0) { 50 | output += kUPPBoundaryStr; 51 | } else { 52 | output.append(piece.begin() - prev, ' '); 53 | } 54 | output += piece.surface(); 55 | prev = piece.end(); 56 | } 57 | 58 | // Restores kWSStr. 
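  // Illustrative example (added comment, assumes plain ASCII input): if
  // `output` holds "I love" at this point, the replacement below turns it
  // into "I▁love", i.e. every plain space becomes the U+2581 whitespace
  // symbol kWSStr.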
59 | return absl::StrReplaceAll(output, {{" ", kWSStr}}); 60 | } 61 | 62 | } // namespace pretokenizer 63 | } // namespace sentencepiece 64 | -------------------------------------------------------------------------------- /src/sentencepiece/src/trainer_factory.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "bpe_model_trainer.h" 16 | #include "char_model_trainer.h" 17 | #include "third_party/absl/memory/memory.h" 18 | #include "trainer_factory.h" 19 | #include "unigram_model_trainer.h" 20 | #include "word_model_trainer.h" 21 | 22 | namespace sentencepiece { 23 | 24 | // Instantiate Trainer instance from trainer_spec and normalization_spec 25 | std::unique_ptr<TrainerInterface> TrainerFactory::Create( 26 | const TrainerSpec &trainer_spec, const NormalizerSpec &normalizer_spec, 27 | const NormalizerSpec &denormalizer_spec) { 28 | switch (trainer_spec.model_type()) { 29 | case TrainerSpec::UNIGRAM: 30 | return absl::make_unique<unigram::Trainer>(trainer_spec, normalizer_spec, 31 | denormalizer_spec); 32 | break; 33 | case TrainerSpec::BPE: 34 | return absl::make_unique<bpe::Trainer>(trainer_spec, normalizer_spec, 35 | denormalizer_spec); 36 | break; 37 | case TrainerSpec::WORD: 38 | return absl::make_unique<word::Trainer>(trainer_spec, normalizer_spec, 39 | denormalizer_spec); 40 | break; 41 | case TrainerSpec::CHAR: 42 | return absl::make_unique<character::Trainer>( 43 | trainer_spec, normalizer_spec, denormalizer_spec); 44 | break; 45 | default: 46 | LOG(FATAL) << "Unknown model_type: " << trainer_spec.model_type(); 47 | break; 48 | } 49 | 50 | return absl::make_unique<unigram::Trainer>(trainer_spec, normalizer_spec, 51 | denormalizer_spec); 52 | } 53 | } // namespace sentencepiece 54 | -------------------------------------------------------------------------------- /R/wordpiece.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | #' @title Wordpiece encoding 7 | #' @description Wordpiece encoding, useful for BERT-style tokenisation. 8 | #' Experimental version mimicking class WordpieceTokenizer from \url{https://github.com/huggingface/transformers/blob/master/src/transformers/models/bert/tokenization_bert.py} 9 | #' @param x a character vector with text which can be split based on white space to obtain words 10 | #' @param vocabulary a character vector of the vocabulary 11 | #' @param type a character string, either 'subwords' or 'ids' to get the subwords or the corresponding ids of these subwords as defined in the vocabulary of the model. 12 | #' Defaults to 'subwords'. 13 | #' @param unk_token character string with a value for a token which is not part of the vocabulary. Defaults to '[UNK]' 14 | #' @param max_input_chars_per_word integer. A word which is longer than this specified number of characters will be set to the unknown token.
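#' @details The encoding is greedy longest-match-first: for each white-space separated word, the longest
#' prefix which occurs in \code{vocabulary} is taken, the remainder is matched with the \code{##} continuation
#' prefix and this is repeated until the word is consumed. If at some point no vocabulary entry matches,
#' the remainder of that word is represented by \code{unk_token}. Words longer than
#' \code{max_input_chars_per_word} characters are mapped to \code{unk_token} directly.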
15 | #' @return a list of subword tokens 16 | #' @export 17 | #' @examples 18 | #' wordpiece_encode("unaffable", vocabulary = c("un", "##aff", "##able")) 19 | #' wordpiece_encode(x = c("unaffable", "unaffableun"), 20 | #' vocabulary = c("un", "##aff", "##able")) 21 | #' wordpiece_encode(x = c("unaffable", "unaffableun", "unknown territory"), 22 | #' vocabulary = c("un", "##aff", "##able", "##un")) 23 | #' wordpiece_encode(x = c("unaffable", "unaffableun", "unknown territory"), 24 | #' vocabulary = c("un", "##aff", "##able", "##un"), 25 | #' type = "ids") 26 | wordpiece_encode <- function(x, vocabulary = character(), type = c("subwords", "ids"), unk_token = "[UNK]", max_input_chars_per_word = 100L){ 27 | type <- match.arg(type) 28 | max_input_chars_per_word <- as.integer(max_input_chars_per_word) 29 | unk_token <- as.character(unk_token) 30 | vocabulary <- as.character(vocabulary) 31 | x <- as.character(x) 32 | x <- trimws(x) 33 | x <- strsplit(x, " ") 34 | x <- lapply(x, FUN = function(terms){ 35 | subwords <- lapply(terms, FUN=function(term) wordpiece_encode_as_subwords(term, vocabulary, unk_token, max_input_chars_per_word)) 36 | subwords <- unlist(subwords, use.names = FALSE) 37 | subwords 38 | }) 39 | if(type == "ids"){ 40 | x <- lapply(x, FUN = function(x){ 41 | match(x, vocabulary) - 1L 42 | }) 43 | } 44 | x 45 | } 46 | 47 | -------------------------------------------------------------------------------- /src/sentencepiece/src/pretokenizer_for_training.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef PRETOKENIZER_FOR_TRAINING_H_ 16 | #define PRETOKENIZER_FOR_TRAINING_H_ 17 | 18 | #include 19 | #include 20 | 21 | #include "common.h" 22 | #include "sentencepiece.pb.h" 23 | #include "sentencepiece_processor.h" 24 | #include "third_party/absl/strings/string_view.h" 25 | 26 | namespace sentencepiece { 27 | namespace pretokenizer { 28 | 29 | class PretokenizerForTrainingInterface { 30 | public: 31 | PretokenizerForTrainingInterface() {} 32 | virtual ~PretokenizerForTrainingInterface() {} 33 | virtual util::Status status() const = 0; 34 | 35 | // Puts kUPPBoundaryStr before and after the pre-tokenizer's segmentation 36 | // when there are no spaces between these tokens. 37 | // Example1: 38 | // input: 東京です 39 | // segmentation: piece[0] = {0, 6}, piece[1] = {6, 12} 40 | // output: 東京です (here kUPPBoundaryStr is ) 41 | // 42 | // Example2: 43 | // input: I love sentencepiece 44 | // segmentation: piece[0] = {0, 1}, piece[1] = {2, 6}, 45 | // piece[2] = {7, 15}, piece[3] = {15, 20} 46 | // output: I love sentencepiece. 47 | std::string PreTokenize(absl::string_view text) const; 48 | 49 | // Returns pre-tokenized result. 50 | // Note that the pre-tokenized constraint is specified with the 51 | // byte offsets (SentencePiece::begin, SentencePiece::end) over 52 | // the input text. 
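  //
  // Illustrative note (not from the upstream header; offsets assume ASCII
  // input): for the input "I love", a whitespace pre-tokenizer would return
  // pieces with (begin, end) = (0, 1) and (2, 6); begin/end are byte
  // positions in the original input text, not character counts.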
53 | virtual SentencePieceText Tokenize(absl::string_view text) const = 0; 54 | 55 | private: 56 | static std::string Preprocess(absl::string_view text); 57 | static std::string Postprocess(const SentencePieceText &spt); 58 | }; 59 | 60 | } // namespace pretokenizer 61 | } // namespace sentencepiece 62 | 63 | #endif // PRETOKENIZER_FOR_TRAINING_H_ 64 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | ## CHANGES IN sentencepiece VERSION 0.2.4 2 | 3 | - Drop C++11 from Makevars 4 | - std::iterator replacement in src/third_party/protobuf-lite/google/protobuf/repeated_field.h as std::iterator is deprecated in C++17 5 | 6 | ## CHANGES IN sentencepiece VERSION 0.2.3 7 | 8 | - fix R CMD check warning due to change in version 0.2.2. 9 | - in third_party/protobuf-lite/strutil.cc:506:33: warning: argument to ‘sizeof’ in ‘int snprintf(char*, size_t, const char*, ...)’ call is the same expression as the destination; did you mean to provide an explicit length? [-Wsizeof-pointer-memaccess] 10 | - this part of third_party/protobuf-lite/strutil.cc was not used in sentencepiece 11 | 12 | ## CHANGES IN sentencepiece VERSION 0.2.2 13 | 14 | - use snprintf instead of sprintf to handle the R CMD check deprecating note on M1mac 15 | 16 | ## CHANGES IN sentencepiece VERSION 0.2.1 17 | 18 | - Fix for clang-UBSAN error 19 | 20 | ## CHANGES IN sentencepiece VERSION 0.2 21 | 22 | - Fix wordpiece bug for 1-character words. (@jonthegeek, #4) 23 | - Upgraded to sentencepiece release v0.1.96 24 | 25 | ## CHANGES IN sentencepiece VERSION 0.1.3 26 | 27 | - Fix wordpiece bug for 1-character words. (@jonthegeek, #4) 28 | - Fix Solaris installation issue related to incorrect usage of pointer as a function 29 | - Also download the binary model in sentencepiece_download_model as it can be loaded with word2vec::read.wordvectors 30 | - read_word2vec now uses word2vec::read.wordvectors from word2vec >= 0.2.0 31 | - added BPEembed and predict.BPEembed 32 | - allow subword regularisation by adding nbest and alpha option in sentencepiece_encode and changed sentencepiece_decode accordingly 33 | - Added txt_remove_ 34 | - Upgrade sentencepiece to release v0.1.91 commit a32d7dc6ce6f383a65ad6e1cbe1983f94ab11932 which has subword regularisation for BPE 35 | 36 | ## CHANGES IN sentencepiece VERSION 0.1.2 37 | 38 | - Fix Solaris installation issue which used log of uint64 which is not defined on Solaris 39 | 40 | ## CHANGES IN sentencepiece VERSION 0.1.1 41 | 42 | - Added verbose argument in sentencepiece 43 | 44 | ## CHANGES IN sentencepiece VERSION 0.1.0 45 | 46 | - Initial package based on https://github.com/google/sentencepiece release v0.1.84 commit 2424d82d396b43b2556203c592e48a621ef10f3c 47 | - Third-party code from https://github.com/google/sentencepiece/tree/master/third_party is put in src/absl, src/esaxx, src/darts_clone, src/protobuf-lite 48 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/str_replace.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_STR_REPLACE_H_ 17 | #define ABSL_STRINGS_STR_REPLACE_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/string_view.h" 22 | 23 | namespace absl { 24 | 25 | inline void StringReplace(absl::string_view s, absl::string_view oldsub, 26 | absl::string_view newsub, bool replace_all, 27 | std::string *res) { 28 | if (oldsub.empty()) { 29 | res->append(s.data(), s.size()); 30 | return; 31 | } 32 | 33 | absl::string_view::size_type start_pos = 0; 34 | do { 35 | const absl::string_view::size_type pos = s.find(oldsub, start_pos); 36 | if (pos == absl::string_view::npos) { 37 | break; 38 | } 39 | res->append(s.data() + start_pos, pos - start_pos); 40 | res->append(newsub.data(), newsub.size()); 41 | start_pos = pos + oldsub.size(); 42 | } while (replace_all); 43 | res->append(s.data() + start_pos, s.size() - start_pos); 44 | } 45 | 46 | inline std::string StringReplace(absl::string_view s, absl::string_view oldsub, 47 | absl::string_view newsub, bool replace_all) { 48 | std::string ret; 49 | StringReplace(s, oldsub, newsub, replace_all, &ret); 50 | return ret; 51 | } 52 | 53 | inline std::string StrReplaceAll( 54 | absl::string_view s, 55 | const std::vector> 56 | &patterns) { 57 | std::string prev(s.data(), s.size()); 58 | std::string result; 59 | for (const auto &it : patterns) { 60 | result.clear(); 61 | StringReplace(prev, it.first, it.second, true, &result); 62 | prev = result; 63 | } 64 | return result; 65 | } 66 | 67 | } // namespace absl 68 | #endif // ABSL_STRINGS_STR_REPLACE_H_ 69 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/stubs/once.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | #ifndef GOOGLE_PROTOBUF_STUBS_ONCE_H__ 32 | #define GOOGLE_PROTOBUF_STUBS_ONCE_H__ 33 | 34 | #include 35 | #include 36 | 37 | #include 38 | 39 | namespace google { 40 | namespace protobuf { 41 | namespace internal { 42 | 43 | using once_flag = std::once_flag; 44 | template 45 | void call_once(Args&&... args ) { 46 | std::call_once(std::forward(args)...); 47 | } 48 | 49 | } // namespace internal 50 | } // namespace protobuf 51 | } // namespace google 52 | 53 | #include 54 | 55 | #endif // GOOGLE_PROTOBUF_STUBS_ONCE_H__ 56 | -------------------------------------------------------------------------------- /src/sentencepiece/src/word_model_trainer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include 16 | #include 17 | 18 | #include "third_party/absl/container/flat_hash_map.h" 19 | #include "third_party/absl/strings/string_view.h" 20 | #include "util.h" 21 | #include "word_model.h" 22 | #include "word_model_trainer.h" 23 | 24 | namespace sentencepiece { 25 | namespace word { 26 | 27 | util::Status Trainer::Train() { 28 | RETURN_IF_ERROR(status()); 29 | 30 | CHECK_OR_RETURN(normalizer_spec_.escape_whitespaces()); 31 | CHECK_EQ_OR_RETURN(TrainerSpec::WORD, trainer_spec_.model_type()); 32 | 33 | RETURN_IF_ERROR(LoadSentences()); 34 | 35 | absl::flat_hash_map freq; 36 | for (const auto &it : sentences_) { 37 | for (const auto &s : SplitIntoWords(it.first)) { 38 | freq[std::string(s)] += it.second; 39 | } 40 | } 41 | 42 | const int vocab_size = trainer_spec_.vocab_size() - meta_pieces_.size(); 43 | CHECK_GE_OR_RETURN(vocab_size, 0); 44 | 45 | uint64 sum = 0; 46 | for (const auto &it : freq) { 47 | sum += it.second; 48 | } 49 | 50 | const auto logsum = static_cast(log(static_cast(sum))); 51 | 52 | CHECK_OR_RETURN(final_pieces_.empty()); 53 | for (const auto &it : Sorted(freq)) { 54 | if (it.first.find(kUNKStr) != std::string::npos) { 55 | continue; 56 | } 57 | if (!trainer_spec_.use_all_vocab() && 58 | final_pieces_.size() == static_cast(vocab_size)) { 59 | break; 60 | } 61 | final_pieces_.emplace_back(it.first, static_cast(log(static_cast(it.second))) - logsum); 62 | } 63 | 64 | if (trainer_spec_.use_all_vocab()) { 65 | trainer_spec_.set_vocab_size(final_pieces_.size() + meta_pieces_.size()); 66 | } 67 | 68 | return Save(); 69 | } 70 | } // namespace word 71 | } // namespace sentencepiece 72 | -------------------------------------------------------------------------------- /man/BPEembed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bpemb.R 3 | \name{BPEembed} 4 | \alias{BPEembed} 5 | \title{Tokenise and embed text alongside a Sentencepiece and Word2vec model} 6 | \usage{ 7 | BPEembed( 8 | file_sentencepiece = x$file_model, 9 | file_word2vec = x$glove.bin$file_model, 10 | x, 11 | normalize = TRUE 12 | ) 13 | } 14 | \arguments{ 15 | \item{file_sentencepiece}{the path to the file containing the sentencepiece model} 16 | 17 | \item{file_word2vec}{the path to the file containing the word2vec embeddings} 18 | 19 | \item{x}{the result of a call to \code{\link{sentencepiece_download_model}}. 20 | If this is provided, arguments \code{file_sentencepiece} and \code{file_word2vec} will not be used.} 21 | 22 | \item{normalize}{passed on to \code{\link[word2vec]{read.wordvectors}} to read in \code{file_word2vec}. 
Defaults to \code{TRUE}.} 23 | } 24 | \value{ 25 | an object of class BPEembed which is a list with elements 26 | \itemize{ 27 | \item{model: a sentencepiece model as loaded with \code{\link{sentencepiece_load_model}}} 28 | \item{embedding: a matrix with embeddings as loaded with \code{\link[word2vec]{read.wordvectors}}} 29 | \item{dim: the dimension of the embedding} 30 | \item{n: the number of elements in the vocabulary} 31 | \item{file_sentencepiece: the sentencepiece model file} 32 | \item{file_word2vec: the word2vec embedding file} 33 | } 34 | } 35 | \description{ 36 | Use a sentencepiece model to tokenise text and get the embeddings of these 37 | } 38 | \examples{ 39 | ## 40 | ## Example loading model from disk 41 | ## 42 | folder <- system.file(package = "sentencepiece", "models") 43 | embedding <- file.path(folder, "nl.wiki.bpe.vs1000.d25.w2v.bin") 44 | model <- file.path(folder, "nl.wiki.bpe.vs1000.model") 45 | encoder <- BPEembed(model, embedding) 46 | 47 | ## Do tokenisation with the sentencepiece model + embed these 48 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 49 | "On est d'accord sur le prix de la biere?") 50 | values <- predict(encoder, txt, type = "encode") 51 | str(values) 52 | values 53 | 54 | txt <- rownames(values[[1]]) 55 | predict(encoder, txt, type = "decode") 56 | txt <- lapply(values, FUN = rownames) 57 | predict(encoder, txt, type = "decode") 58 | } 59 | \seealso{ 60 | \code{\link{predict.BPEembed}}, \code{\link{sentencepiece_load_model}}, \code{\link{sentencepiece_download_model}}, \code{\link[word2vec]{read.wordvectors}} 61 | } 62 | -------------------------------------------------------------------------------- /man/predict.BPEembed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bpemb.R 3 | \name{predict.BPEembed} 4 | \alias{predict.BPEembed} 5 | \title{Encode and Decode alongside a BPEembed model} 6 | \usage{ 7 | \method{predict}{BPEembed}(object, newdata, type = c("encode", "decode", "tokenize"), ...) 
8 | } 9 | \arguments{ 10 | \item{object}{an object of class BPEembed as returned by \code{\link{BPEembed}}} 11 | 12 | \item{newdata}{a character vector of text to encode or a character vector of encoded tokens to decode or a list of those} 13 | 14 | \item{type}{character string, either 'encode', 'decode' or 'tokenize'} 15 | 16 | \item{...}{further arguments passed on to the methods} 17 | } 18 | \value{ 19 | \itemize{ 20 | \item{in case type is set to \code{'encode'}: a list of matrices containing embeddings of the text which is tokenised with \code{\link{sentencepiece_encode}}} 21 | \item{in case type is set to \code{'decode'}: a character vector of decoded text as returned by \code{\link{sentencepiece_decode}}} 22 | \item{in case type is set to \code{'tokenize'}: a tokenised \code{\link{sentencepiece_encode}}} 23 | } 24 | } 25 | \description{ 26 | Use the sentencepiece model to either 27 | \itemize{ 28 | \item{encode: tokenise and embed text} 29 | \item{decode: get the untokenised text back of tokenised data} 30 | \item{tokenize: only tokenize alongside the sentencepiece model} 31 | } 32 | } 33 | \examples{ 34 | embedding <- system.file(package = "sentencepiece", "models", 35 | "nl.wiki.bpe.vs1000.d25.w2v.bin") 36 | model <- system.file(package = "sentencepiece", "models", 37 | "nl.wiki.bpe.vs1000.model") 38 | encoder <- BPEembed(model, embedding) 39 | 40 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 41 | "On est d'accord sur le prix de la biere?") 42 | values <- predict(encoder, txt, type = "encode") 43 | str(values) 44 | values 45 | 46 | txt <- rownames(values[[1]]) 47 | predict(encoder, txt, type = "decode") 48 | txt <- lapply(values, FUN = rownames) 49 | predict(encoder, txt, type = "decode") 50 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 51 | "On est d'accord sur le prix de la biere?") 52 | predict(encoder, txt, type = "tokenize", "subwords") 53 | predict(encoder, txt, type = "tokenize", "ids") 54 | } 55 | \seealso{ 56 | \code{\link{BPEembed}}, \code{\link{sentencepiece_decode}}, \code{\link{sentencepiece_encode}} 57 | } 58 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/zero_copy_stream.cc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // Author: kenton@google.com (Kenton Varda) 32 | // Based on original Protocol Buffers design by 33 | // Sanjay Ghemawat, Jeff Dean, and others. 34 | 35 | #include 36 | 37 | #include 38 | #include 39 | 40 | namespace google { 41 | namespace protobuf { 42 | namespace io { 43 | 44 | 45 | bool ZeroCopyOutputStream::WriteAliasedRaw(const void* /* data */, 46 | int /* size */) { 47 | GOOGLE_LOG(FATAL) << "This ZeroCopyOutputStream doesn't support aliasing. " 48 | "Reaching here usually means a ZeroCopyOutputStream " 49 | "implementation bug."; 50 | return false; 51 | } 52 | 53 | } // namespace io 54 | } // namespace protobuf 55 | } // namespace google 56 | -------------------------------------------------------------------------------- /src/sentencepiece/src/freelist.h: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef FREELIST_H_ 16 | #define FREELIST_H_ 17 | 18 | #include 19 | 20 | #include 21 | 22 | namespace sentencepiece { 23 | namespace model { 24 | 25 | // Simple FreeList that allocates a chunk of T at once. 26 | template 27 | class FreeList { 28 | public: 29 | FreeList() = delete; 30 | explicit FreeList(size_t chunk_size) : chunk_size_(chunk_size) {} 31 | virtual ~FreeList() { 32 | for (auto& chunk : freelist_) delete[] chunk; 33 | } 34 | 35 | // `Free` doesn't free the object but reuse the allocated memory chunks. 36 | void Free() { 37 | const int size = std::min(chunk_index_ + 1, freelist_.size()); 38 | for (int i = 0; i < size; ++i) { 39 | T* chunk = freelist_[i]; 40 | memset(static_cast(chunk), 0, sizeof(*chunk) * chunk_size_); 41 | } 42 | chunk_index_ = 0; 43 | element_index_ = 0; 44 | } 45 | 46 | // Returns the number of allocated elements. 47 | size_t size() const { return chunk_size_ * chunk_index_ + element_index_; } 48 | 49 | // Returns the element as an array. 50 | T* operator[](size_t index) const { 51 | return freelist_[index / chunk_size_] + index % chunk_size_; 52 | } 53 | 54 | // Allocates new element. 
55 | T* Allocate() { 56 | if (element_index_ >= chunk_size_) { 57 | ++chunk_index_; 58 | element_index_ = 0; 59 | } 60 | 61 | if (chunk_index_ == freelist_.size()) { 62 | T* chunk = new T[chunk_size_]; 63 | memset(static_cast(chunk), 0, sizeof(*chunk) * chunk_size_); 64 | freelist_.push_back(chunk); 65 | } 66 | 67 | T* result = freelist_[chunk_index_] + element_index_; 68 | ++element_index_; 69 | 70 | return result; 71 | } 72 | 73 | private: 74 | std::vector freelist_; 75 | 76 | // The last element is stored at freelist_[chunk_index_][element_index_] 77 | size_t element_index_ = 0; 78 | size_t chunk_index_ = 0; 79 | const size_t chunk_size_ = 0; 80 | }; 81 | } // namespace model 82 | } // namespace sentencepiece 83 | #endif // FREELIST_H_ 84 | -------------------------------------------------------------------------------- /src/sentencepiece/src/word_model_trainer_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | #include 17 | 18 | #include "filesystem.h" 19 | #include "sentencepiece_processor.h" 20 | #include "testharness.h" 21 | #include "third_party/absl/strings/str_cat.h" 22 | #include "third_party/absl/strings/str_join.h" 23 | #include "util.h" 24 | #include "word_model_trainer.h" 25 | 26 | namespace sentencepiece { 27 | namespace word { 28 | namespace { 29 | 30 | // Space symbol (U+2581) 31 | #define WS "\xE2\x96\x81" 32 | 33 | std::string RunTrainer(const std::vector &input, int size) { 34 | const std::string input_file = 35 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "input"); 36 | const std::string model_prefix = 37 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "model"); 38 | { 39 | auto output = filesystem::NewWritableFile(input_file); 40 | for (const auto &line : input) { 41 | output->WriteLine(line); 42 | } 43 | } 44 | 45 | TrainerSpec trainer_spec; 46 | trainer_spec.set_model_type(TrainerSpec::WORD); 47 | trainer_spec.add_input(input_file); 48 | trainer_spec.set_vocab_size(size - 3); // remove , , 49 | trainer_spec.set_model_prefix(model_prefix); 50 | 51 | NormalizerSpec normalizer_spec; 52 | normalizer_spec.set_name("identity"); 53 | normalizer_spec.set_add_dummy_prefix(true); 54 | 55 | NormalizerSpec denormalizer_spec; 56 | 57 | Trainer trainer(trainer_spec, normalizer_spec, denormalizer_spec); 58 | EXPECT_TRUE(trainer.Train().ok()); 59 | 60 | SentencePieceProcessor processor; 61 | EXPECT_TRUE(processor.Load(model_prefix + ".model").ok()); 62 | 63 | const auto &model = processor.model_proto(); 64 | std::vector pieces; 65 | 66 | // remove , , 67 | for (int i = 3; i < model.pieces_size(); ++i) { 68 | pieces.emplace_back(model.pieces(i).piece()); 69 | } 70 | 71 | return absl::StrJoin(pieces, " "); 72 | } 73 | } // namespace 74 | 75 | TEST(TrainerTest, BasicTest) { 76 | EXPECT_EQ(WS "I " WS "apple " WS "have " WS "pen", 77 | RunTrainer({"I have a pen", "I have an apple", 
"apple pen"}, 10)); 78 | } 79 | } // namespace word 80 | } // namespace sentencepiece 81 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/str_join.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_STR_JOIN_H_ 17 | #define ABSL_STRINGS_STR_JOIN_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/string_view.h" 22 | 23 | namespace absl { 24 | namespace { 25 | template 26 | inline size_t Itoa(T val, char *s) { 27 | char *org = s; 28 | 29 | if (val < 0) { 30 | *s++ = '-'; 31 | val = -val; 32 | } 33 | char *t = s; 34 | 35 | T mod = 0; 36 | while (val) { 37 | mod = val % 10; 38 | *t++ = static_cast(mod) + '0'; 39 | val /= 10; 40 | } 41 | 42 | if (s == t) { 43 | *t++ = '0'; 44 | } 45 | 46 | *t = '\0'; 47 | std::reverse(s, t); 48 | return static_cast(t - org); 49 | } 50 | } // namespace 51 | 52 | inline std::string StrJoin(const std::vector &tokens, 53 | absl::string_view delim) { 54 | std::string result; 55 | if (!tokens.empty()) { 56 | result.append(tokens[0]); 57 | } 58 | for (size_t i = 1; i < tokens.size(); ++i) { 59 | result.append(delim.data(), delim.size()); 60 | result.append(tokens[i]); 61 | } 62 | return result; 63 | } 64 | 65 | inline std::string StrJoin(const std::vector &tokens, 66 | absl::string_view delim) { 67 | std::string result; 68 | if (!tokens.empty()) { 69 | result.append(tokens[0].data(), tokens[0].size()); 70 | } 71 | for (size_t i = 1; i < tokens.size(); ++i) { 72 | result.append(delim.data(), delim.size()); 73 | result.append(tokens[i].data(), tokens[i].size()); 74 | } 75 | return result; 76 | } 77 | 78 | inline std::string StrJoin(const std::vector &tokens, 79 | absl::string_view delim) { 80 | std::string result; 81 | char buf[32]; 82 | if (!tokens.empty()) { 83 | const size_t len = Itoa(tokens[0], buf); 84 | result.append(buf, len); 85 | } 86 | for (size_t i = 1; i < tokens.size(); ++i) { 87 | result.append(delim.data(), delim.size()); 88 | const size_t len = Itoa(tokens[i], buf); 89 | result.append(buf, len); 90 | } 91 | return result; 92 | } 93 | 94 | } // namespace absl 95 | #endif // ABSL_STRINGS_STR_CAT_H_ 96 | -------------------------------------------------------------------------------- /src/sentencepiece/src/sentencepiece.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | syntax = "proto2"; 16 | 17 | // TODO(taku): Needs to use LITE RUNTIME in OSS release. 18 | option optimize_for = LITE_RUNTIME; 19 | 20 | package sentencepiece; 21 | 22 | // SentencePieceText manages a user-facing source sentence, 23 | // postprocessed target sentence, and internal segmentation 24 | // with byte offsets. 25 | message SentencePieceText { 26 | message SentencePiece { 27 | // Internal representation for the decoder. 28 | // - Decoder can use |piece| as a basic token. 29 | // - the piece must be non-empty. 30 | // - A whitespace is replaced with a meta symbol. 31 | // - Concatenation of pieces is not always the same as the |text|. 32 | optional string piece = 1; 33 | 34 | // Vocabulary id. 35 | optional uint32 id = 2; 36 | 37 | // External representation for the client. 38 | // - It is always guaranteed that 39 | // text.substr(begin, end - begin) == surface. 40 | // - Concatenation of surface is always the same as the |text|. 41 | // - |surface| may contain whitespaces. 42 | // - |surface| may be empty if the piece encodes 43 | // a control vocabulary. e.g., , , . 44 | // - When |surface| is empty, always begin == end. (zero-length span). 45 | optional string surface = 3; 46 | 47 | optional uint32 begin = 4; 48 | optional uint32 end = 5; 49 | 50 | // Customized extensions: the range of field numbers 51 | // are open to third-party extensions. 52 | extensions 200 to max; 53 | } 54 | 55 | // User input or postprocessed text. This should be immutable 56 | // since the byte range in SentencePiece is pointing to a span over this 57 | // text. Meta symbols for whitespaces are not included. 58 | optional string text = 1; 59 | 60 | // A sequence of sentence pieces. 61 | repeated SentencePiece pieces = 2; 62 | 63 | // Score (usually log probability) for MultiSentencePieceText. 64 | optional float score = 3; 65 | 66 | // Customized extensions: the range of field numbers 67 | // are open to third-party extensions. 68 | extensions 200 to max; 69 | } 70 | 71 | message NBestSentencePieceText { 72 | repeated SentencePieceText nbests = 1; 73 | } 74 | -------------------------------------------------------------------------------- /src/sentencepiece/src/char_model_trainer_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include 16 | #include 17 | 18 | #include "char_model_trainer.h" 19 | #include "filesystem.h" 20 | #include "sentencepiece_processor.h" 21 | #include "testharness.h" 22 | #include "third_party/absl/strings/str_cat.h" 23 | #include "third_party/absl/strings/str_join.h" 24 | #include "util.h" 25 | 26 | namespace sentencepiece { 27 | namespace character { 28 | namespace { 29 | 30 | // Space symbol (U+2581) 31 | #define WS "\xE2\x96\x81" 32 | 33 | std::string RunTrainer(const std::vector &input, int size) { 34 | const std::string input_file = 35 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "input"); 36 | const std::string model_prefix = 37 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "model"); 38 | { 39 | auto output = filesystem::NewWritableFile(input_file); 40 | for (const auto &line : input) { 41 | output->WriteLine(line); 42 | } 43 | } 44 | 45 | TrainerSpec trainer_spec; 46 | trainer_spec.set_model_type(TrainerSpec::CHAR); 47 | trainer_spec.add_input(input_file); 48 | trainer_spec.set_vocab_size(size); 49 | trainer_spec.set_model_prefix(model_prefix); 50 | 51 | NormalizerSpec normalizer_spec; 52 | normalizer_spec.set_name("identity"); 53 | 54 | NormalizerSpec denormalizer_spec; 55 | 56 | Trainer trainer(trainer_spec, normalizer_spec, denormalizer_spec); 57 | EXPECT_TRUE(trainer.Train().ok()); 58 | 59 | SentencePieceProcessor processor; 60 | EXPECT_TRUE(processor.Load(model_prefix + ".model").ok()); 61 | 62 | const auto &model = processor.model_proto(); 63 | std::vector pieces; 64 | 65 | // remove , , 66 | for (int i = 3; i < model.pieces_size(); ++i) { 67 | pieces.emplace_back(model.pieces(i).piece()); 68 | } 69 | 70 | return absl::StrJoin(pieces, " "); 71 | } 72 | 73 | TEST(TrainerTest, BasicTest) { 74 | EXPECT_EQ(WS " a e p n I h l v", 75 | RunTrainer({"I have a pen", "I have an apple", "apple pen"}, 100)); 76 | EXPECT_EQ(WS " a", // , , , _, a 77 | RunTrainer({"I have a pen", "I have an apple", "apple pen"}, 5)); 78 | } 79 | 80 | } // namespace 81 | } // namespace character 82 | } // namespace sentencepiece 83 | -------------------------------------------------------------------------------- /src/third_party/absl/memory/memory.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | // ----------------------------------------------------------------------------- 17 | // File: string_view.h 18 | // ----------------------------------------------------------------------------- 19 | // 20 | // This file contains the definition of the `absl::string_view` class. A 21 | // `string_view` points to a contiguous span of characters, often part or all of 22 | // another `std::string`, double-quoted std::string literal, character array, or 23 | // even another `string_view`. 
24 | // 25 | // This `absl::string_view` abstraction is designed to be a drop-in 26 | // replacement for the C++17 `std::string_view` abstraction. 27 | #ifndef ABSL_MEMORY_MEMORY_H_ 28 | #define ABSL_MEMORY_MEMORY_H_ 29 | 30 | #include 31 | 32 | namespace absl { 33 | 34 | // Trait to select overloads and return types for MakeUnique. 35 | template 36 | struct MakeUniqueResult { 37 | using scalar = std::unique_ptr; 38 | }; 39 | template 40 | struct MakeUniqueResult { 41 | using array = std::unique_ptr; 42 | }; 43 | template 44 | struct MakeUniqueResult { 45 | using invalid = void; 46 | }; 47 | 48 | // MakeUnique(...) is an early implementation of C++14 std::make_unique. 49 | // It is designed to be 100% compatible with std::make_unique so that the 50 | // eventual switchover will be a simple renaming operation. 51 | template 52 | typename MakeUniqueResult::scalar make_unique(Args &&... args) { // NOLINT 53 | return std::unique_ptr( 54 | new T(std::forward(args)...)); // NOLINT(build/c++11) 55 | } 56 | 57 | // Overload for array of unknown bound. 58 | // The allocation of arrays needs to use the array form of new, 59 | // and cannot take element constructor arguments. 60 | template 61 | typename MakeUniqueResult::array make_unique(size_t n) { 62 | return std::unique_ptr(new typename std::remove_extent::type[n]()); 63 | } 64 | 65 | // Reject arrays of known bound. 66 | template 67 | typename MakeUniqueResult::invalid make_unique(Args &&... /* args */) = 68 | delete; // NOLINT 69 | 70 | } // namespace absl 71 | #endif // ABSL_MEMORY_MEMORY_H_ 72 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/implicit_weak_message.cc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | #include 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include 39 | 40 | namespace google { 41 | namespace protobuf { 42 | namespace internal { 43 | 44 | const char* ImplicitWeakMessage::_InternalParse(const char* ptr, 45 | ParseContext* ctx) { 46 | return ctx->AppendString(ptr, &data_); 47 | } 48 | 49 | ExplicitlyConstructed 50 | implicit_weak_message_default_instance; 51 | internal::once_flag implicit_weak_message_once_init_; 52 | 53 | void InitImplicitWeakMessageDefaultInstance() { 54 | implicit_weak_message_default_instance.DefaultConstruct(); 55 | } 56 | 57 | const ImplicitWeakMessage* ImplicitWeakMessage::default_instance() { 58 | internal::call_once(implicit_weak_message_once_init_, 59 | InitImplicitWeakMessageDefaultInstance); 60 | return &implicit_weak_message_default_instance.get(); 61 | } 62 | 63 | } // namespace internal 64 | } // namespace protobuf 65 | } // namespace google 66 | -------------------------------------------------------------------------------- /src/sentencepiece/src/word_model_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include 16 | 17 | #include "sentencepiece_model.pb.h" 18 | #include "testharness.h" 19 | #include "util.h" 20 | #include "word_model.h" 21 | 22 | namespace sentencepiece { 23 | namespace word { 24 | namespace { 25 | 26 | // Space symbol (U+2581) 27 | #define WS "\xe2\x96\x81" 28 | 29 | ModelProto MakeBaseModelProto() { 30 | ModelProto model_proto; 31 | auto *sp1 = model_proto.add_pieces(); 32 | auto *sp2 = model_proto.add_pieces(); 33 | auto *sp3 = model_proto.add_pieces(); 34 | 35 | sp1->set_type(ModelProto::SentencePiece::UNKNOWN); 36 | sp1->set_piece(""); 37 | sp2->set_type(ModelProto::SentencePiece::CONTROL); 38 | sp2->set_piece(""); 39 | sp3->set_type(ModelProto::SentencePiece::CONTROL); 40 | sp3->set_piece(""); 41 | 42 | return model_proto; 43 | } 44 | 45 | void AddPiece(ModelProto *model_proto, const std::string &piece, 46 | float score = 0.0) { 47 | auto *sp = model_proto->add_pieces(); 48 | sp->set_piece(piece); 49 | sp->set_score(score); 50 | } 51 | 52 | TEST(WordModelTest, EncodeTest) { 53 | ModelProto model_proto = MakeBaseModelProto(); 54 | 55 | AddPiece(&model_proto, WS "ab"); 56 | AddPiece(&model_proto, WS "cd"); 57 | AddPiece(&model_proto, WS "abc"); 58 | AddPiece(&model_proto, WS "a", 0.1); 59 | AddPiece(&model_proto, WS "b", 0.2); 60 | AddPiece(&model_proto, WS "c", 0.3); 61 | AddPiece(&model_proto, WS "d", 0.4); 62 | 63 | const Model model(model_proto); 64 | 65 | EncodeResult result; 66 | 67 | result = model.Encode(""); 68 | EXPECT_TRUE(result.empty()); 69 | 70 | result = model.Encode(WS "a" WS "b" WS "c"); 71 | EXPECT_EQ(3, result.size()); 72 | EXPECT_EQ(WS "a", result[0].first); 73 | EXPECT_EQ(WS "b", result[1].first); 74 | EXPECT_EQ(WS "c", result[2].first); 75 | 76 | result = model.Encode(WS "ab" WS "cd" WS "abc"); 77 | EXPECT_EQ(3, result.size()); 78 | EXPECT_EQ(WS "ab", result[0].first); 79 | EXPECT_EQ(WS "cd", result[1].first); 80 | EXPECT_EQ(WS "abc", result[2].first); 81 | } 82 | 83 | TEST(WordModelTest, NotSupportedTest) { 84 | ModelProto model_proto = MakeBaseModelProto(); 85 | const Model model(model_proto); 86 | EXPECT_EQ(NBestEncodeResult(), model.NBestEncode("test", 10)); 87 | EXPECT_EQ(EncodeResult(), model.SampleEncode("test", 0.1)); 88 | } 89 | 90 | } // namespace 91 | } // namespace word 92 | } // namespace sentencepiece 93 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/str_split.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // 16 | #ifndef ABSL_STRINGS_STR_SPLIT_H_ 17 | #define ABSL_STRINGS_STR_SPLIT_H_ 18 | 19 | #include 20 | #include 21 | 22 | #include "third_party/absl/strings/string_view.h" 23 | 24 | namespace absl { 25 | namespace internal { 26 | 27 | class Splitter { 28 | public: 29 | Splitter(absl::string_view str, absl::string_view delim, bool allow_empty) { 30 | size_t current_pos = 0; 31 | size_t found_pos = 0; 32 | while ((found_pos = str.find_first_of(delim, current_pos)) != 33 | absl::string_view::npos) { 34 | if ((allow_empty && found_pos >= current_pos) || 35 | (!allow_empty && found_pos > current_pos)) { 36 | result_.push_back(str.substr(current_pos, found_pos - current_pos)); 37 | } 38 | current_pos = found_pos + 1; 39 | } 40 | if (str.size() > current_pos) { 41 | result_.push_back(str.substr(current_pos, str.size() - current_pos)); 42 | } 43 | } 44 | template 45 | operator std::vector() const; 46 | 47 | using const_iterator = std::vector::const_iterator; 48 | const_iterator begin() const { return result_.begin(); } 49 | const_iterator end() const { return result_.end(); } 50 | 51 | private: 52 | std::vector result_; 53 | }; 54 | 55 | template <> 56 | inline Splitter::operator std::vector() const { 57 | std::vector x(result_.size()); 58 | for (size_t i = 0; i < x.size(); ++i) 59 | x[i].assign(result_[i].data(), result_[i].size()); 60 | return x; 61 | } 62 | 63 | template <> 64 | inline Splitter::operator std::vector() const { 65 | return result_; 66 | } 67 | } // namespace internal 68 | 69 | inline constexpr bool AllowEmpty() { return true; }; 70 | 71 | inline internal::Splitter StrSplit(absl::string_view str, 72 | absl::string_view delim, 73 | bool allow_empty = false) { 74 | return internal::Splitter(str, delim, allow_empty); 75 | } 76 | 77 | inline internal::Splitter StrSplit(absl::string_view str, const char c, 78 | bool allow_empty = false) { 79 | char delim[2]; 80 | delim[0] = c; 81 | delim[1] = '\0'; 82 | return internal::Splitter(str, delim, allow_empty); 83 | } 84 | 85 | } // namespace absl 86 | #endif // ABSL_STRINGS_STR_SPLIT_H_ 87 | -------------------------------------------------------------------------------- /src/sentencepiece/src/pretokenizer_for_training_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | #include "pretokenizer_for_training.h" 15 | #include "testharness.h" 16 | #include "third_party/absl/strings/str_cat.h" 17 | #include "trainer_interface.h" 18 | 19 | namespace sentencepiece { 20 | namespace pretokenizer { 21 | 22 | class MockPretokenizer : public PretokenizerForTrainingInterface { 23 | public: 24 | MockPretokenizer() {} 25 | ~MockPretokenizer() {} 26 | 27 | SentencePieceText Tokenize(absl::string_view text) const override { 28 | return spt_; 29 | } 30 | 31 | util::Status status() const override { return util::OkStatus(); } 32 | 33 | void SetOutput(const SentencePieceText &spt) { spt_ = spt; } 34 | 35 | private: 36 | SentencePieceText spt_; 37 | }; 38 | 39 | TEST(PretokenizerForTrainingTest, BaseTest) { 40 | MockPretokenizer mock; 41 | 42 | { 43 | SentencePieceText spt; 44 | spt.set_text("I love sentencepiece"); 45 | auto *p1 = spt.add_pieces(); 46 | p1->set_surface("I"); 47 | p1->set_begin(0); 48 | p1->set_end(1); 49 | 50 | auto *p2 = spt.add_pieces(); 51 | p2->set_surface("love"); 52 | p2->set_begin(2); 53 | p2->set_end(6); 54 | 55 | auto *p3 = spt.add_pieces(); 56 | p3->set_surface("sentence"); 57 | p3->set_begin(7); 58 | p3->set_end(15); 59 | 60 | auto *p4 = spt.add_pieces(); 61 | p4->set_surface("piece"); 62 | p4->set_begin(15); 63 | p4->set_end(20); 64 | 65 | mock.SetOutput(spt); 66 | 67 | EXPECT_EQ(absl::StrCat("I", TrainerInterface::kWSStr, "love", 68 | TrainerInterface::kWSStr, "sentence\tpiece"), 69 | mock.PreTokenize("I love sentencepiece")); 70 | } 71 | 72 | { 73 | SentencePieceText spt; 74 | spt.set_text("これはペンです"); 75 | auto *p1 = spt.add_pieces(); 76 | p1->set_surface("これ"); 77 | p1->set_begin(0); 78 | p1->set_end(6); 79 | 80 | auto *p2 = spt.add_pieces(); 81 | p2->set_surface("は"); 82 | p2->set_begin(6); 83 | p2->set_end(9); 84 | 85 | auto *p3 = spt.add_pieces(); 86 | p3->set_surface("ペン"); 87 | p3->set_begin(9); 88 | p3->set_end(15); 89 | 90 | auto *p4 = spt.add_pieces(); 91 | p4->set_surface("です"); 92 | p4->set_begin(15); 93 | p4->set_end(21); 94 | 95 | mock.SetOutput(spt); 96 | 97 | EXPECT_EQ("これ\tは\tペン\tです", mock.PreTokenize("これはペンです")); 98 | } 99 | } 100 | 101 | } // namespace pretokenizer 102 | } // namespace sentencepiece 103 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_LIBS = -pthread 2 | PKG_CPPFLAGS = -D HAVE_PTHREAD=1 -pthread -D_USE_INTERNAL_STRING_VIEW -DSTRICT_R_HEADERS -I. 
-I./sentencepiece -I./sentencepiece/src -I./sentencepiece/src/builtin_pb -I./third_party/absl -I./third_party/darts_clone -I./third_party/esaxx -I./third_party/protobuf-lite 3 | 4 | SOURCES = third_party/protobuf-lite/arena.cc \ 5 | third_party/protobuf-lite/arenastring.cc \ 6 | third_party/protobuf-lite/bytestream.cc \ 7 | third_party/protobuf-lite/coded_stream.cc \ 8 | third_party/protobuf-lite/common.cc \ 9 | third_party/protobuf-lite/extension_set.cc \ 10 | third_party/protobuf-lite/generated_enum_util.cc \ 11 | third_party/protobuf-lite/generated_message_table_driven_lite.cc \ 12 | third_party/protobuf-lite/generated_message_util.cc \ 13 | third_party/protobuf-lite/implicit_weak_message.cc \ 14 | third_party/protobuf-lite/int128.cc \ 15 | third_party/protobuf-lite/io_win32.cc \ 16 | third_party/protobuf-lite/message_lite.cc \ 17 | third_party/protobuf-lite/parse_context.cc \ 18 | third_party/protobuf-lite/repeated_field.cc \ 19 | third_party/protobuf-lite/status.cc \ 20 | third_party/protobuf-lite/statusor.cc \ 21 | third_party/protobuf-lite/stringpiece.cc \ 22 | third_party/protobuf-lite/stringprintf.cc \ 23 | third_party/protobuf-lite/structurally_valid.cc \ 24 | third_party/protobuf-lite/strutil.cc \ 25 | third_party/protobuf-lite/time.cc \ 26 | third_party/protobuf-lite/wire_format_lite.cc \ 27 | third_party/protobuf-lite/zero_copy_stream.cc \ 28 | third_party/protobuf-lite/zero_copy_stream_impl.cc \ 29 | third_party/protobuf-lite/zero_copy_stream_impl_lite.cc \ 30 | third_party/absl/strings/string_view.cc \ 31 | third_party/absl/flags/flag.cc \ 32 | sentencepiece/src/builtin_pb/sentencepiece.pb.cc \ 33 | sentencepiece/src/builtin_pb/sentencepiece_model.pb.cc \ 34 | sentencepiece/src/bpe_model.cc \ 35 | sentencepiece/src/bpe_model_trainer.cc \ 36 | sentencepiece/src/builder.cc \ 37 | sentencepiece/src/char_model.cc \ 38 | sentencepiece/src/char_model_trainer.cc \ 39 | sentencepiece/src/error.cc \ 40 | sentencepiece/src/filesystem.cc \ 41 | sentencepiece/src/model_factory.cc \ 42 | sentencepiece/src/model_interface.cc \ 43 | sentencepiece/src/normalizer.cc \ 44 | sentencepiece/src/pretokenizer_for_training.cc \ 45 | sentencepiece/src/sentencepiece_processor.cc \ 46 | sentencepiece/src/sentencepiece_trainer.cc \ 47 | sentencepiece/src/trainer_factory.cc \ 48 | sentencepiece/src/trainer_interface.cc \ 49 | sentencepiece/src/unicode_script.cc \ 50 | sentencepiece/src/unigram_model.cc \ 51 | sentencepiece/src/unigram_model_trainer.cc \ 52 | sentencepiece/src/util.cc \ 53 | sentencepiece/src/word_model.cc \ 54 | sentencepiece/src/word_model_trainer.cc \ 55 | rcpp_sentencepiece.cpp \ 56 | rcpp_wordpiece.cpp \ 57 | RcppExports.cpp 58 | 59 | OBJ = $(SOURCES:.cc=.o) 60 | OBJECTS = $(OBJ:.cpp=.o) 61 | 62 | .PHONY: all 63 | 64 | all: $(SHLIB); rm -f $(OBJECTS) 65 | #all: $(SHLIB); rm -f rcpp_wordpiece.o; rm -f rcpp_sentencepiece.o; rm -f RcppExports.o 66 | #all: $(SHLIB); rm -f third_party/protobuf-lite/repeated_field.o; 67 | 68 | -------------------------------------------------------------------------------- /src/sentencepiece/src/unigram_model_trainer_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "sentencepiece_model.pb.h" 16 | #include "sentencepiece_processor.h" 17 | #include "sentencepiece_trainer.h" 18 | #include "testharness.h" 19 | #include "third_party/absl/strings/str_cat.h" 20 | #include "third_party/absl/strings/str_join.h" 21 | #include "unigram_model_trainer.h" 22 | #include "util.h" 23 | 24 | namespace sentencepiece { 25 | namespace unigram { 26 | namespace { 27 | 28 | // Space symbol 29 | #define WS "\xe2\x96\x81" 30 | 31 | TEST(UnigramTrainerTest, TrainerModelTest) { 32 | TrainerSpec trainer_spec; 33 | NormalizerSpec normalizer_spec; 34 | const TrainerModel model(trainer_spec, normalizer_spec); 35 | EXPECT_EQ(EncodeResult(), model.Encode("test")); 36 | } 37 | 38 | static constexpr char kTestInputData[] = "wagahaiwa_nekodearu.txt"; 39 | 40 | TEST(UnigramTrainerTest, EndToEndTest) { 41 | const std::string input = 42 | util::JoinPath(absl::GetFlag(FLAGS_test_srcdir), kTestInputData); 43 | 44 | ASSERT_TRUE( 45 | SentencePieceTrainer::Train( 46 | absl::StrCat( 47 | "--model_prefix=", 48 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "tmp_model"), 49 | " --input=", input, 50 | " --vocab_size=8000 --normalization_rule_name=identity", 51 | " --model_type=unigram --user_defined_symbols=", 52 | " --control_symbols= --max_sentence_length=2048")) 53 | .ok()); 54 | 55 | SentencePieceProcessor sp; 56 | EXPECT_TRUE(sp.Load(util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), 57 | "tmp_model.model")) 58 | .ok()); 59 | EXPECT_EQ(8000, sp.GetPieceSize()); 60 | 61 | const int cid = sp.PieceToId(""); 62 | const int uid = sp.PieceToId(""); 63 | EXPECT_TRUE(sp.IsControl(cid)); 64 | EXPECT_FALSE(sp.IsUnknown(uid)); 65 | 66 | std::vector tok; 67 | 68 | EXPECT_TRUE(sp.Encode("", &tok).ok()); 69 | EXPECT_TRUE(tok.empty()); 70 | 71 | EXPECT_TRUE(sp.Encode("吾輩《わがはい》は猫である。名前はまだ無い。" 72 | "どこで生れたかとんと見当《けんとう》がつかぬ。" 73 | "何でも薄暗いじめじめした所でニャーニャー泣いていた事だ" 74 | "けは記憶している" 75 | "。", 76 | &tok) 77 | .ok()); 78 | // TODO(taku): Temporally disable this test on Windows. 79 | #ifndef OS_WIN 80 | EXPECT_EQ(WS 81 | " 吾輩 《 わが はい 》 は 猫 である 。 名前 はまだ 無い 。 " 82 | "どこ で 生 れた か とん と 見当 《 けん とう 》 が つか ぬ 。 " 83 | "何でも 薄 暗 い じめ じめ した 所で ニャーニャー " 84 | "泣 い ていた 事 だけは 記憶 している 。", 85 | absl::StrJoin(tok, " ")); 86 | #endif 87 | } 88 | 89 | } // namespace 90 | } // namespace unigram 91 | } // namespace sentencepiece 92 | -------------------------------------------------------------------------------- /.github/workflows/rhub.yaml: -------------------------------------------------------------------------------- 1 | # R-hub's generic GitHub Actions workflow file. It's canonical location is at 2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml 3 | # You can update this file to a newer version using the rhub2 package: 4 | # 5 | # rhub::rhub_setup() 6 | # 7 | # It is unlikely that you need to modify this file manually. 
8 | 9 | name: R-hub 10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" 11 | 12 | on: 13 | workflow_dispatch: 14 | inputs: 15 | config: 16 | description: 'A comma separated list of R-hub platforms to use.' 17 | type: string 18 | default: 'linux,windows,macos,clang-asan,clang-ubsan,gcc-asan,nold,rchk,ubuntu-clang,valgrind' 19 | name: 20 | description: 'Run name. You can leave this empty now.' 21 | type: string 22 | id: 23 | description: 'Unique ID. You can leave this empty now.' 24 | type: string 25 | 26 | jobs: 27 | 28 | setup: 29 | runs-on: ubuntu-latest 30 | outputs: 31 | containers: ${{ steps.rhub-setup.outputs.containers }} 32 | platforms: ${{ steps.rhub-setup.outputs.platforms }} 33 | 34 | steps: 35 | # NO NEED TO CHECKOUT HERE 36 | - uses: r-hub/actions/setup@v1 37 | with: 38 | config: ${{ github.event.inputs.config }} 39 | id: rhub-setup 40 | 41 | linux-containers: 42 | needs: setup 43 | if: ${{ needs.setup.outputs.containers != '[]' }} 44 | runs-on: ubuntu-latest 45 | name: ${{ matrix.config.label }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | config: ${{ fromJson(needs.setup.outputs.containers) }} 50 | container: 51 | image: ${{ matrix.config.container }} 52 | 53 | steps: 54 | - uses: r-hub/actions/checkout@v1 55 | - uses: r-hub/actions/platform-info@v1 56 | with: 57 | token: ${{ secrets.RHUB_TOKEN }} 58 | job-config: ${{ matrix.config.job-config }} 59 | - uses: r-hub/actions/setup-deps@v1 60 | with: 61 | token: ${{ secrets.RHUB_TOKEN }} 62 | job-config: ${{ matrix.config.job-config }} 63 | - uses: r-hub/actions/run-check@v1 64 | with: 65 | token: ${{ secrets.RHUB_TOKEN }} 66 | job-config: ${{ matrix.config.job-config }} 67 | 68 | other-platforms: 69 | needs: setup 70 | if: ${{ needs.setup.outputs.platforms != '[]' }} 71 | runs-on: ${{ matrix.config.os }} 72 | name: ${{ matrix.config.label }} 73 | strategy: 74 | fail-fast: false 75 | matrix: 76 | config: ${{ fromJson(needs.setup.outputs.platforms) }} 77 | 78 | steps: 79 | - uses: r-hub/actions/checkout@v1 80 | - uses: r-hub/actions/setup-r@v1 81 | with: 82 | job-config: ${{ matrix.config.job-config }} 83 | token: ${{ secrets.RHUB_TOKEN }} 84 | - uses: r-hub/actions/platform-info@v1 85 | with: 86 | token: ${{ secrets.RHUB_TOKEN }} 87 | job-config: ${{ matrix.config.job-config }} 88 | - uses: r-hub/actions/setup-deps@v1 89 | with: 90 | job-config: ${{ matrix.config.job-config }} 91 | token: ${{ secrets.RHUB_TOKEN }} 92 | - uses: r-hub/actions/run-check@v1 93 | with: 94 | job-config: ${{ matrix.config.job-config }} 95 | token: ${{ secrets.RHUB_TOKEN }} 96 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/stubs/stl_util.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 
11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // from google3/util/gtl/stl_util.h 32 | 33 | #ifndef GOOGLE_PROTOBUF_STUBS_STL_UTIL_H__ 34 | #define GOOGLE_PROTOBUF_STUBS_STL_UTIL_H__ 35 | 36 | #include 37 | 38 | namespace google { 39 | namespace protobuf { 40 | 41 | // Inside Google, this function implements a horrible, disgusting hack in which 42 | // we reach into the string's private implementation and resize it without 43 | // initializing the new bytes. In some cases doing this can significantly 44 | // improve performance. However, since it's totally non-portable it has no 45 | // place in open source code. Feel free to fill this function in with your 46 | // own disgusting hack if you want the perf boost. 47 | inline void STLStringResizeUninitialized(std::string* s, size_t new_size) { 48 | s->resize(new_size); 49 | } 50 | 51 | // Return a mutable char* pointing to a string's internal buffer, 52 | // which may not be null-terminated. Writing through this pointer will 53 | // modify the string. 54 | // 55 | // string_as_array(&str)[i] is valid for 0 <= i < str.size() until the 56 | // next call to a string method that invalidates iterators. 57 | // 58 | // As of 2006-04, there is no standard-blessed way of getting a 59 | // mutable reference to a string's internal buffer. However, issue 530 60 | // (http://www.open-std.org/JTC1/SC22/WG21/docs/lwg-active.html#530) 61 | // proposes this as the method. According to Matt Austern, this should 62 | // already work on all current implementations. 63 | inline char* string_as_array(std::string* str) { 64 | // DO NOT USE const_cast(str->data())! See the unittest for why. 65 | return str->empty() ? 
nullptr : &*str->begin(); 66 | } 67 | 68 | } // namespace protobuf 69 | } // namespace google 70 | 71 | #endif // GOOGLE_PROTOBUF_STUBS_STL_UTIL_H__ 72 | -------------------------------------------------------------------------------- /inst/spc-help/spm_train: -------------------------------------------------------------------------------- 1 | sentencepiece 2 | 3 | Usage: spm_train [options] files 4 | 5 | --accept_language (comma-separated list of languages this model can accept) type: string default: 6 | --add_dummy_prefix (Add dummy whitespace at the beginning of text) type: bool default: true 7 | --bos_id (Override BOS (<s>) id. Set -1 to disable BOS.) type: int32 default: 1 8 | --bos_piece (Override BOS (<s>) piece.) type: string default: <s> 9 | --character_coverage (character coverage to determine the minimum symbols) type: double default: 0.9995 10 | --control_symbols (comma separated list of control symbols) type: string default: 11 | --eos_id (Override EOS (</s>) id. Set -1 to disable EOS.) type: int32 default: 2 12 | --eos_piece (Override EOS (</s>) piece.) type: string default: </s> 13 | --hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true 14 | --input (comma separated list of input sentences) type: string default: 15 | --input_format (Input format. Supported format is `text` or `tsv`.) type: string default: 16 | --input_sentence_size (maximum size of sentences the trainer loads) type: int32 default: 0 17 | --max_sentence_length (maximum length of sentence in byte) type: int32 default: 4192 18 | --max_sentencepiece_length (maximum length of sentence piece) type: int32 default: 16 19 | --model_prefix (output model prefix) type: string default: 20 | --model_type (model algorithm: unigram, bpe, word or char) type: string default: unigram 21 | --normalization_rule_name (Normalization rule name. Choose from nfkc or identity) type: string default: nmt_nfkc 22 | --normalization_rule_tsv (Normalization rule TSV file. ) type: string default: 23 | --num_sub_iterations (number of EM sub-iterations) type: int32 default: 2 24 | --num_threads (number of threads for training) type: int32 default: 16 25 | --pad_id (Override PAD (<pad>) id. Set -1 to disable PAD.) type: int32 default: -1 26 | --pad_piece (Override PAD (<pad>) piece.) type: string default: <pad> 27 | --remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace) type: bool default: true 28 | --seed_sentencepiece_size (the size of seed sentencepieces) type: int32 default: 1000000 29 | --self_test_sample_size (the size of self test samples) type: int32 default: 0 30 | --shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss) type: double default: 0.75 31 | --shuffle_input_sentence (Randomly sample input sentences in advance. Valid when --input_sentence_size > 0) type: bool default: true 32 | --split_by_number (split tokens by numbers (0-9)) type: bool default: true 33 | --split_by_unicode_script (use Unicode script to split sentence pieces) type: bool default: true 34 | --split_by_whitespace (use a white space to split sentence pieces) type: bool default: true 35 | --treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.) type: bool default: false 36 | --unk_id (Override UNK (<unk>) id.) type: int32 default: 0 37 | --unk_piece (Override UNK (<unk>) piece.) type: string default: <unk> 38 | --unk_surface (Dummy surface string for <unk>. In decoding <unk> is decoded to `unk_surface`.) type: string default: ⁇ 39 | --use_all_vocab (If set to true, use all tokens as vocab.
Valid for word/char models.) type: bool default: false 40 | --user_defined_symbols (comma separated list of user defined symbols) type: string default: 41 | --vocab_size (vocabulary size) type: int32 default: 8000 42 | 43 | 44 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/generated_enum_util.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | #ifndef GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ 32 | #define GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ 33 | 34 | #include 35 | 36 | #include 37 | #include 38 | 39 | #include 40 | 41 | #ifdef SWIG 42 | #error "You cannot SWIG proto headers" 43 | #endif 44 | 45 | namespace google { 46 | namespace protobuf { 47 | 48 | // This type trait can be used to cause templates to only match proto2 enum 49 | // types. 50 | template 51 | struct is_proto_enum : ::std::false_type {}; 52 | 53 | namespace internal { 54 | 55 | // The table entry format for storing enum name-to-value mapping used with lite 56 | // protos. This struct and the following related functions should only be used 57 | // by protobuf generated code. 58 | struct EnumEntry { 59 | StringPiece name; 60 | int value; 61 | }; 62 | 63 | // Looks up a numeric enum value given the string name. 64 | PROTOBUF_EXPORT bool LookUpEnumValue(const EnumEntry* enums, size_t size, 65 | StringPiece name, int* value); 66 | 67 | // Looks up an enum name given the numeric value. 68 | PROTOBUF_EXPORT int LookUpEnumName(const EnumEntry* enums, 69 | const int* sorted_indices, size_t size, 70 | int value); 71 | 72 | // Initializes the list of enum names in std::string form. 
73 | PROTOBUF_EXPORT bool InitializeEnumStrings( 74 | const EnumEntry* enums, const int* sorted_indices, size_t size, 75 | internal::ExplicitlyConstructed* enum_strings); 76 | 77 | } // namespace internal 78 | } // namespace protobuf 79 | } // namespace google 80 | 81 | #include 82 | 83 | #endif // GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ 84 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/stubs/time.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | #ifndef GOOGLE_PROTOBUF_STUBS_TIME_H_ 31 | #define GOOGLE_PROTOBUF_STUBS_TIME_H_ 32 | 33 | #include 34 | 35 | #include 36 | 37 | namespace google { 38 | namespace protobuf { 39 | namespace internal { 40 | 41 | struct DateTime { 42 | int year; 43 | int month; 44 | int day; 45 | int hour; 46 | int minute; 47 | int second; 48 | }; 49 | 50 | // Converts a timestamp (seconds elapsed since 1970-01-01T00:00:00, could be 51 | // negative to represent time before 1970-01-01) to DateTime. Returns false 52 | // if the timestamp is not in the range between 0001-01-01T00:00:00 and 53 | // 9999-12-31T23:59:59. 54 | bool PROTOBUF_EXPORT SecondsToDateTime(int64 seconds, DateTime* time); 55 | // Converts DateTime to a timestamp (seconds since 1970-01-01T00:00:00). 56 | // Returns false if the DateTime is not valid or is not in the valid range. 57 | bool PROTOBUF_EXPORT DateTimeToSeconds(const DateTime& time, int64* seconds); 58 | 59 | void PROTOBUF_EXPORT GetCurrentTime(int64* seconds, int32* nanos); 60 | 61 | // Formats a time string in RFC3339 format. 62 | // 63 | // For example, "2015-05-20T13:29:35.120Z". 
For nanos, 0, 3, 6 or 9 fractional 64 | // digits will be used depending on how many are required to represent the exact 65 | // value. 66 | // 67 | // Note that "nanos" must in the range of [0, 999999999]. 68 | std::string PROTOBUF_EXPORT FormatTime(int64 seconds, int32 nanos); 69 | // Parses a time string. This method accepts RFC3339 date/time string with UTC 70 | // offset. For example, "2015-05-20T13:29:35.120-08:00". 71 | bool PROTOBUF_EXPORT ParseTime(const std::string& value, int64* seconds, 72 | int32* nanos); 73 | 74 | } // namespace internal 75 | } // namespace protobuf 76 | } // namespace google 77 | 78 | #include 79 | 80 | #endif // GOOGLE_PROTOBUF_STUBS_TIME_H_ 81 | -------------------------------------------------------------------------------- /src/sentencepiece/src/unicode_script.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef UNICODE_SCRIPT_H_ 16 | #define UNICODE_SCRIPT_H_ 17 | 18 | #include "common.h" 19 | 20 | namespace sentencepiece { 21 | namespace unicode_script { 22 | enum ScriptType : int32_t { 23 | U_Adlam, 24 | U_Ahom, 25 | U_Anatolian_Hieroglyphs, 26 | U_Arabic, 27 | U_Armenian, 28 | U_Avestan, 29 | U_Balinese, 30 | U_Bamum, 31 | U_Bassa_Vah, 32 | U_Batak, 33 | U_Bengali, 34 | U_Bhaiksuki, 35 | U_Bopomofo, 36 | U_Brahmi, 37 | U_Braille, 38 | U_Buginese, 39 | U_Buhid, 40 | U_Canadian_Aboriginal, 41 | U_Carian, 42 | U_Caucasian_Albanian, 43 | U_Chakma, 44 | U_Cham, 45 | U_Cherokee, 46 | U_Common, 47 | U_Coptic, 48 | U_Cuneiform, 49 | U_Cypriot, 50 | U_Cyrillic, 51 | U_Deseret, 52 | U_Devanagari, 53 | U_Duployan, 54 | U_Egyptian_Hieroglyphs, 55 | U_Elbasan, 56 | U_Ethiopic, 57 | U_Georgian, 58 | U_Glagolitic, 59 | U_Gothic, 60 | U_Grantha, 61 | U_Greek, 62 | U_Gujarati, 63 | U_Gurmukhi, 64 | U_Han, 65 | U_Hangul, 66 | U_Hanunoo, 67 | U_Hatran, 68 | U_Hebrew, 69 | U_Hiragana, 70 | U_Imperial_Aramaic, 71 | U_Inherited, 72 | U_Inscriptional_Pahlavi, 73 | U_Inscriptional_Parthian, 74 | U_Javanese, 75 | U_Kaithi, 76 | U_Kannada, 77 | U_Katakana, 78 | U_Kayah_Li, 79 | U_Kharoshthi, 80 | U_Khmer, 81 | U_Khojki, 82 | U_Khudawadi, 83 | U_Lao, 84 | U_Latin, 85 | U_Lepcha, 86 | U_Limbu, 87 | U_Linear_A, 88 | U_Linear_B, 89 | U_Lisu, 90 | U_Lycian, 91 | U_Lydian, 92 | U_Mahajani, 93 | U_Malayalam, 94 | U_Mandaic, 95 | U_Manichaean, 96 | U_Marchen, 97 | U_Meetei_Mayek, 98 | U_Mende_Kikakui, 99 | U_Meroitic_Cursive, 100 | U_Meroitic_Hieroglyphs, 101 | U_Miao, 102 | U_Modi, 103 | U_Mongolian, 104 | U_Mro, 105 | U_Multani, 106 | U_Myanmar, 107 | U_Nabataean, 108 | U_New_Tai_Lue, 109 | U_Newa, 110 | U_Nko, 111 | U_Ogham, 112 | U_Ol_Chiki, 113 | U_Old_Hungarian, 114 | U_Old_Italic, 115 | U_Old_North_Arabian, 116 | U_Old_Permic, 117 | U_Old_Persian, 118 | U_Old_South_Arabian, 119 | U_Old_Turkic, 120 | U_Oriya, 121 | U_Osage, 122 | U_Osmanya, 123 | U_Pahawh_Hmong, 124 | U_Palmyrene, 125 | U_Pau_Cin_Hau, 
126 | U_Phags_Pa, 127 | U_Phoenician, 128 | U_Psalter_Pahlavi, 129 | U_Rejang, 130 | U_Runic, 131 | U_Samaritan, 132 | U_Saurashtra, 133 | U_Sharada, 134 | U_Shavian, 135 | U_Siddham, 136 | U_SignWriting, 137 | U_Sinhala, 138 | U_Sora_Sompeng, 139 | U_Sundanese, 140 | U_Syloti_Nagri, 141 | U_Syriac, 142 | U_Tagalog, 143 | U_Tagbanwa, 144 | U_Tai_Le, 145 | U_Tai_Tham, 146 | U_Tai_Viet, 147 | U_Takri, 148 | U_Tamil, 149 | U_Tangut, 150 | U_Telugu, 151 | U_Thaana, 152 | U_Thai, 153 | U_Tibetan, 154 | U_Tifinagh, 155 | U_Tirhuta, 156 | U_Ugaritic, 157 | U_Vai, 158 | U_Warang_Citi, 159 | U_Yi 160 | }; 161 | 162 | ScriptType GetScript(char32 c); 163 | } // namespace unicode_script 164 | } // namespace sentencepiece 165 | #endif // UNICODE_SCRIPT 166 | -------------------------------------------------------------------------------- /man/sentencepiece.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentencepiece.R 3 | \name{sentencepiece} 4 | \alias{sentencepiece} 5 | \title{Construct a Sentencepiece model} 6 | \usage{ 7 | sentencepiece( 8 | x, 9 | type = c("bpe", "char", "unigram", "word"), 10 | vocab_size = 8000, 11 | coverage = 0.9999, 12 | model_prefix = "sentencepiece", 13 | model_dir = tempdir(), 14 | threads = 1L, 15 | args, 16 | verbose = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{a character vector of path(s) to the text files containing training data} 21 | 22 | \item{type}{either one of 'bpe', 'char', 'unigram' or 'word' for Byte Pair Encoding, Character level encoding, 23 | Unigram encoding or pretokenised word encoding. Defaults to 'bpe' (Byte Pair Encoding).} 24 | 25 | \item{vocab_size}{integer indicating the number of tokens in the final vocabulary. Defaults to 8000.} 26 | 27 | \item{coverage}{fraction of characters covered by the model. Must be in the range [0, 1]. A good value to use is about 0.9999.} 28 | 29 | \item{model_prefix}{character string with the name of the model. Defaults to 'sentencepiece'. 30 | When executing the function 2 files will be created in the directory specified by \code{model_dir}, namely 31 | sentencepiece.model with the model and sentencepiece.vocab containing the vocabulary of the model. 32 | You can change the name of the model by providing the \code{model_prefix} argument.} 33 | 34 | \item{model_dir}{directory where the model will be saved. Defaults to the temporary directory (tempdir())} 35 | 36 | \item{threads}{integer indicating number of threads to use when building the model} 37 | 38 | \item{args}{character string with arguments passed on to sentencepiece::SentencePieceTrainer::Train (for expert use only)} 39 | 40 | \item{verbose}{logical indicating to show progress of sentencepiece training. Defaults to \code{FALSE}.} 41 | } 42 | \value{ 43 | an object of class \code{sentencepiece} which is defined at \code{\link{sentencepiece_load_model}} 44 | } 45 | \description{ 46 | Construct a Sentencepiece model on text. 
47 | } 48 | \examples{ 49 | library(tokenizers.bpe) 50 | data(belgium_parliament, package = "tokenizers.bpe") 51 | path <- "traindata.txt" 52 | folder <- getwd() 53 | \dontshow{ 54 | path <- tempfile("traindata_", fileext = ".txt") 55 | folder <- tempdir() 56 | } 57 | writeLines(belgium_parliament$text, con = path) 58 | \dontshow{ 59 | model <- sentencepiece(path, type = "char", vocab_size = 30, model_dir = folder) 60 | model <- sentencepiece(path, type = "unigram", vocab_size = 50, model_dir = folder) 61 | model <- sentencepiece(path, type = "bpe", vocab_size = 200, model_dir = folder) 62 | } 63 | \donttest{ 64 | model <- sentencepiece(path, type = "char", 65 | model_dir = folder, verbose = TRUE) 66 | model <- sentencepiece(path, type = "unigram", vocab_size = 20000, 67 | model_dir = folder, verbose = TRUE) 68 | model <- sentencepiece(path, type = "bpe", vocab_size = 4000, 69 | model_dir = folder, verbose = TRUE) 70 | 71 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 72 | "On est d'accord sur le prix de la biere?") 73 | sentencepiece_encode(model, x = txt, type = "subwords") 74 | sentencepiece_encode(model, x = txt, type = "ids") 75 | 76 | 77 | model <- sentencepiece_load_model(file.path(folder, "sentencepiece.model")) 78 | sentencepiece_encode(model, x = txt, type = "subwords") 79 | sentencepiece_encode(model, x = txt, type = "ids") 80 | } 81 | 82 | \dontshow{ 83 | # clean up for CRAN 84 | file.remove(file.path(folder, "sentencepiece.model")) 85 | file.remove(file.path(folder, "sentencepiece.vocab")) 86 | file.remove(path) 87 | } 88 | } 89 | \seealso{ 90 | \code{\link{sentencepiece_load_model}} 91 | } 92 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/generated_enum_util.cc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | #include 32 | 33 | #include 34 | 35 | #include 36 | 37 | namespace google { 38 | namespace protobuf { 39 | namespace internal { 40 | namespace { 41 | 42 | bool EnumCompareByName(const EnumEntry& a, const EnumEntry& b) { 43 | return StringPiece(a.name) < StringPiece(b.name); 44 | } 45 | 46 | // Gets the numeric value of the EnumEntry at the given index, but returns a 47 | // special value for the index -1. This gives a way to use std::lower_bound on a 48 | // sorted array of indices while searching for value that we associate with -1. 49 | int GetValue(const EnumEntry* enums, int i, int target) { 50 | if (i == -1) { 51 | return target; 52 | } else { 53 | return enums[i].value; 54 | } 55 | } 56 | 57 | } // namespace 58 | 59 | bool LookUpEnumValue(const EnumEntry* enums, size_t size, 60 | StringPiece name, int* value) { 61 | EnumEntry target{name, 0}; 62 | auto it = std::lower_bound(enums, enums + size, target, EnumCompareByName); 63 | if (it != enums + size && it->name == name) { 64 | *value = it->value; 65 | return true; 66 | } 67 | return false; 68 | } 69 | 70 | int LookUpEnumName(const EnumEntry* enums, const int* sorted_indices, 71 | size_t size, int value) { 72 | auto comparator = [enums, value](int a, int b) { 73 | return GetValue(enums, a, value) < GetValue(enums, b, value); 74 | }; 75 | auto it = 76 | std::lower_bound(sorted_indices, sorted_indices + size, -1, comparator); 77 | if (it != sorted_indices + size && enums[*it].value == value) { 78 | return it - sorted_indices; 79 | } 80 | return -1; 81 | } 82 | 83 | bool InitializeEnumStrings( 84 | const EnumEntry* enums, const int* sorted_indices, size_t size, 85 | internal::ExplicitlyConstructed* enum_strings) { 86 | for (int i = 0; i < size; ++i) { 87 | enum_strings[i].Construct(enums[sorted_indices[i]].name); 88 | internal::OnShutdownDestroyString(enum_strings[i].get_mutable()); 89 | } 90 | return true; 91 | } 92 | 93 | } // namespace internal 94 | } // namespace protobuf 95 | } // namespace google 96 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/stubs/stringprintf.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2012 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 
15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // from google3/base/stringprintf.h 32 | // 33 | // Printf variants that place their output in a C++ string. 34 | // 35 | // Usage: 36 | // string result = StringPrintf("%d %s\n", 10, "hello"); 37 | // SStringPrintf(&result, "%d %s\n", 10, "hello"); 38 | // StringAppendF(&result, "%d %s\n", 20, "there"); 39 | 40 | #ifndef GOOGLE_PROTOBUF_STUBS_STRINGPRINTF_H 41 | #define GOOGLE_PROTOBUF_STUBS_STRINGPRINTF_H 42 | 43 | #include 44 | #include 45 | #include 46 | 47 | #include 48 | 49 | #include 50 | 51 | namespace google { 52 | namespace protobuf { 53 | 54 | // Return a C++ string 55 | PROTOBUF_EXPORT extern std::string StringPrintf(const char* format, ...); 56 | 57 | // Store result into a supplied string and return it 58 | PROTOBUF_EXPORT extern const std::string& SStringPrintf(std::string* dst, 59 | const char* format, 60 | ...); 61 | 62 | // Append result to a supplied string 63 | PROTOBUF_EXPORT extern void StringAppendF(std::string* dst, const char* format, 64 | ...); 65 | 66 | // Lower-level routine that takes a va_list and appends to a specified 67 | // string. All other routines are just convenience wrappers around it. 68 | PROTOBUF_EXPORT extern void StringAppendV(std::string* dst, const char* format, 69 | va_list ap); 70 | 71 | // The max arguments supported by StringPrintfVector 72 | PROTOBUF_EXPORT extern const int kStringPrintfVectorMaxArgs; 73 | 74 | // You can use this version when all your arguments are strings, but 75 | // you don't know how many arguments you'll have at compile time. 76 | // StringPrintfVector will LOG(FATAL) if v.size() > kStringPrintfVectorMaxArgs 77 | PROTOBUF_EXPORT extern std::string StringPrintfVector( 78 | const char* format, const std::vector& v); 79 | 80 | } // namespace protobuf 81 | } // namespace google 82 | 83 | #include 84 | 85 | #endif // GOOGLE_PROTOBUF_STUBS_STRINGPRINTF_H 86 | -------------------------------------------------------------------------------- /src/sentencepiece/src/filesystem.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | 17 | #include "filesystem.h" 18 | #include "third_party/absl/memory/memory.h" 19 | #include "util.h" 20 | 21 | #if defined(OS_WIN) && defined(UNICODE) && defined(_UNICODE) 22 | #define WPATH(path) (::sentencepiece::win32::Utf8ToWide(path).c_str()) 23 | #else 24 | #define WPATH(path) (path) 25 | #endif 26 | 27 | namespace sentencepiece { 28 | namespace filesystem { 29 | 30 | class PosixReadableFile : public ReadableFile { 31 | public: 32 | PosixReadableFile(absl::string_view filename, bool is_binary = false) 33 | : is_(filename.empty() 34 | ? &std::cin 35 | : new std::ifstream(WPATH(filename.data()), 36 | is_binary ? std::ios::binary | std::ios::in 37 | : std::ios::in)) { 38 | if (!*is_) 39 | status_ = util::StatusBuilder(util::StatusCode::kNotFound, GTL_LOC) 40 | << "\"" << filename.data() << "\": " << util::StrError(errno); 41 | } 42 | 43 | ~PosixReadableFile() { 44 | if (is_ != &std::cin) delete is_; 45 | } 46 | 47 | util::Status status() const { return status_; } 48 | 49 | bool ReadLine(std::string *line) { 50 | return static_cast(std::getline(*is_, *line)); 51 | } 52 | 53 | bool ReadAll(std::string *line) { 54 | if (is_ == &std::cin) { 55 | LOG(ERROR) << "ReadAll is not supported for stdin."; 56 | return false; 57 | } 58 | line->assign(std::istreambuf_iterator(*is_), 59 | std::istreambuf_iterator()); 60 | return true; 61 | } 62 | 63 | private: 64 | util::Status status_; 65 | std::istream *is_; 66 | }; 67 | 68 | class PosixWritableFile : public WritableFile { 69 | public: 70 | PosixWritableFile(absl::string_view filename, bool is_binary = false) 71 | : os_(new std::ofstream(WPATH(filename.data()), 72 | is_binary ? 
std::ios::binary | std::ios::out 73 | : std::ios::out)) { 74 | if (!*os_) 75 | status_ = 76 | util::StatusBuilder(util::StatusCode::kPermissionDenied, GTL_LOC) 77 | << "\"" << filename.data() << "\": " << util::StrError(errno); 78 | } 79 | 80 | ~PosixWritableFile() { 81 | delete os_; 82 | } 83 | 84 | util::Status status() const { return status_; } 85 | 86 | bool Write(absl::string_view text) { 87 | os_->write(text.data(), text.size()); 88 | return os_->good(); 89 | } 90 | 91 | bool WriteLine(absl::string_view text) { return Write(text) && Write("\n"); } 92 | 93 | private: 94 | util::Status status_; 95 | std::ostream *os_; 96 | }; 97 | 98 | using DefaultReadableFile = PosixReadableFile; 99 | using DefaultWritableFile = PosixWritableFile; 100 | 101 | std::unique_ptr NewReadableFile(absl::string_view filename, 102 | bool is_binary) { 103 | return absl::make_unique(filename, is_binary); 104 | } 105 | 106 | std::unique_ptr NewWritableFile(absl::string_view filename, 107 | bool is_binary) { 108 | return absl::make_unique(filename, is_binary); 109 | } 110 | 111 | } // namespace filesystem 112 | } // namespace sentencepiece 113 | -------------------------------------------------------------------------------- /man/sentencepiece_download_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bpemb.R 3 | \name{sentencepiece_download_model} 4 | \alias{sentencepiece_download_model} 5 | \title{Download a Sentencepiece model} 6 | \usage{ 7 | sentencepiece_download_model( 8 | language, 9 | vocab_size, 10 | dim, 11 | model_dir = system.file(package = "sentencepiece", "models") 12 | ) 13 | } 14 | \arguments{ 15 | \item{language}{a character string with the language name. This can be either a plain language or a wikipedia shorthand. \cr 16 | Possible values can be found by looking at the examples or typing sentencepiece:::.bpemb$languages \cr 17 | If you provide multi it downloads the multilingual model available at \url{https://bpemb.h-its.org/multi/}} 18 | 19 | \item{vocab_size}{integer indicating the number of tokens in the final vocabulary. Defaults to 5000. Possible values depend on the language. To inspect possible values, type sentencepiece:::.bpemb$vocab_sizes and look to your language of your choice.} 20 | 21 | \item{dim}{dimension of the embedding. Either 25, 50, 100, 200 or 300.} 22 | 23 | \item{model_dir}{path to the location where the model will be downloaded to. Defaults to \code{system.file(package = "sentencepiece", "models")}.} 24 | } 25 | \value{ 26 | a list with elements 27 | \itemize{ 28 | \item{language: the provided language} 29 | \item{wikicode: the wikipedia code of the provided language} 30 | \item{file_model: the path to the downloaded Sentencepiece model} 31 | \item{url: the url where the Sentencepiece model was fetched from} 32 | \item{download_failed: logical, indicating if the download failed} 33 | \item{download_message: a character string with possible download failure information} 34 | \item{glove: a list with elements file_model, url, download_failed and download_message indicating the path to the Glove embeddings in txt format. Only present if the dim argument is provided in the function. Otherwise the embeddings will not be downloaded} 35 | \item{glove.bin: a list with elements file_model, url, download_failed and download_message indicating the path to the Glove embeddings in bin format. Only present if the dim argument is provided in the function. 
Otherwise the embeddings will not be downloaded} 36 | } 37 | } 38 | \description{ 39 | Download pretrained models built on Wikipedia 40 | made available at \url{https://bpemb.h-its.org} through \url{https://github.com/bheinzerling/bpemb}. 41 | These models contain Byte Pair Encoded models trained with sentencepiece as well 42 | as Glove embeddings of these Byte Pair subwords. Models for 275 languages are available. 43 | } 44 | \examples{ 45 | path <- getwd() 46 | \dontshow{ 47 | path <- tempdir() 48 | } 49 | \donttest{ 50 | 51 | ## 52 | ## Download only the tokeniser model 53 | ## 54 | dl <- sentencepiece_download_model("Russian", vocab_size = 50000, model_dir = path) 55 | dl <- sentencepiece_download_model("English", vocab_size = 100000, model_dir = path) 56 | dl <- sentencepiece_download_model("French", vocab_size = 25000, model_dir = path) 57 | dl <- sentencepiece_download_model("multi", vocab_size = 320000, model_dir = path) 58 | dl <- sentencepiece_download_model("Vlaams", vocab_size = 1000, model_dir = path) 59 | dl <- sentencepiece_download_model("Dutch", vocab_size = 25000, model_dir = path) 60 | dl <- sentencepiece_download_model("nl", vocab_size = 25000, model_dir = path) 61 | str(dl) 62 | model <- sentencepiece_load_model(dl$file_model) 63 | 64 | ## 65 | ## Download the tokeniser model + Glove embeddings of Byte Pairs 66 | ## 67 | dl <- sentencepiece_download_model("nl", vocab_size = 1000, dim = 50, model_dir = path) 68 | str(dl) 69 | model <- sentencepiece_load_model(dl$file_model) 70 | embedding <- read_word2vec(dl$glove$file_model) 71 | } 72 | 73 | 74 | dl <- sentencepiece_download_model("nl", vocab_size = 1000, dim = 25, 75 | model_dir = tempdir()) 76 | str(dl) 77 | 78 | \dontshow{ 79 | # clean up for CRAN 80 | f <- list.files(tempdir(), pattern = ".vocab$|.model$", full.names = TRUE) 81 | invisible(file.remove(f)) 82 | } 83 | } 84 | \seealso{ 85 | \code{\link{sentencepiece_load_model}} 86 | } 87 | -------------------------------------------------------------------------------- /src/sentencepiece/src/char_model_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
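// char_model_test.cc below holds the unit tests for the character-level model: each
// UTF-8 character (including the U+2581 whitespace meta symbol) is emitted as its own
// piece, USER_DEFINED pieces such as "ABC" are kept as a single token, and
// NBestEncode/SampleEncode return empty results because the char model does not
// support them.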
14 | 15 | #include 16 | 17 | #include "char_model.h" 18 | #include "testharness.h" 19 | #include "util.h" 20 | 21 | namespace sentencepiece { 22 | namespace character { 23 | namespace { 24 | 25 | // Space symbol (U+2581) 26 | #define WS "\xe2\x96\x81" 27 | 28 | ModelProto MakeBaseModelProto() { 29 | ModelProto model_proto; 30 | auto *sp1 = model_proto.add_pieces(); 31 | auto *sp2 = model_proto.add_pieces(); 32 | auto *sp3 = model_proto.add_pieces(); 33 | 34 | sp1->set_type(ModelProto::SentencePiece::UNKNOWN); 35 | sp1->set_piece(""); 36 | sp2->set_type(ModelProto::SentencePiece::CONTROL); 37 | sp2->set_piece(""); 38 | sp3->set_type(ModelProto::SentencePiece::CONTROL); 39 | sp3->set_piece(""); 40 | 41 | return model_proto; 42 | } 43 | 44 | void AddPiece(ModelProto *model_proto, const std::string &piece, 45 | float score = 0.0) { 46 | auto *sp = model_proto->add_pieces(); 47 | sp->set_piece(piece); 48 | sp->set_score(score); 49 | } 50 | 51 | TEST(ModelTest, EncodeTest) { 52 | ModelProto model_proto = MakeBaseModelProto(); 53 | 54 | AddPiece(&model_proto, WS, 0.0); 55 | AddPiece(&model_proto, "a", 0.1); 56 | AddPiece(&model_proto, "b", 0.2); 57 | AddPiece(&model_proto, "c", 0.3); 58 | AddPiece(&model_proto, "d", 0.4); 59 | AddPiece(&model_proto, "ABC", 0.4); 60 | model_proto.mutable_pieces(8)->set_type( 61 | ModelProto::SentencePiece::USER_DEFINED); 62 | 63 | const Model model(model_proto); 64 | 65 | EncodeResult result; 66 | 67 | result = model.Encode(""); 68 | EXPECT_TRUE(result.empty()); 69 | 70 | result = model.Encode(WS "a" WS "b" WS "c"); 71 | EXPECT_EQ(6, result.size()); 72 | EXPECT_EQ(WS, result[0].first); 73 | EXPECT_EQ("a", result[1].first); 74 | EXPECT_EQ(WS, result[2].first); 75 | EXPECT_EQ("b", result[3].first); 76 | EXPECT_EQ(WS, result[4].first); 77 | EXPECT_EQ("c", result[5].first); 78 | 79 | result = model.Encode(WS "ab" WS "cd" WS "abc"); 80 | EXPECT_EQ(10, result.size()); 81 | EXPECT_EQ(WS, result[0].first); 82 | EXPECT_EQ("a", result[1].first); 83 | EXPECT_EQ("b", result[2].first); 84 | EXPECT_EQ(WS, result[3].first); 85 | EXPECT_EQ("c", result[4].first); 86 | EXPECT_EQ("d", result[5].first); 87 | EXPECT_EQ(WS, result[6].first); 88 | EXPECT_EQ("a", result[7].first); 89 | EXPECT_EQ("b", result[8].first); 90 | EXPECT_EQ("c", result[9].first); 91 | 92 | // makes a broken utf-8 93 | const std::string broken_utf8 = std::string("あ").substr(0, 1); 94 | result = model.Encode(broken_utf8); 95 | EXPECT_EQ(1, result.size()); 96 | EXPECT_EQ(broken_utf8, result[0].first); 97 | 98 | // "ABC" is treated as one piece, as it is USER_DEFINED. 
99 | result = model.Encode(WS "abABCcd"); 100 | EXPECT_EQ(6, result.size()); 101 | EXPECT_EQ(WS, result[0].first); 102 | EXPECT_EQ("a", result[1].first); 103 | EXPECT_EQ("b", result[2].first); 104 | EXPECT_EQ("ABC", result[3].first); 105 | EXPECT_EQ("c", result[4].first); 106 | EXPECT_EQ("d", result[5].first); 107 | } 108 | 109 | TEST(CharModelTest, NotSupportedTest) { 110 | ModelProto model_proto = MakeBaseModelProto(); 111 | const Model model(model_proto); 112 | EXPECT_EQ(NBestEncodeResult(), model.NBestEncode("test", 10)); 113 | EXPECT_EQ(EncodeResult(), model.SampleEncode("test", 0.1)); 114 | } 115 | 116 | } // namespace 117 | } // namespace character 118 | } // namespace sentencepiece 119 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/has_bits.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
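// HasBits, defined below, is the small fixed-size array of uint32 "has bits" words that
// generated protobuf messages use to track the presence of optional fields: Clear()
// zeroes all words, operator[] exposes each word for bit manipulation, Or() merges the
// presence bits of another message, and empty() reports whether no bit is set.
// Illustrative use only (hypothetical): HasBits<1> bits; bits[0] |= 0x1u;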
30 | 31 | #ifndef GOOGLE_PROTOBUF_HAS_BITS_H__ 32 | #define GOOGLE_PROTOBUF_HAS_BITS_H__ 33 | 34 | #include 35 | #include 36 | 37 | #include 38 | 39 | #ifdef SWIG 40 | #error "You cannot SWIG proto headers" 41 | #endif 42 | 43 | namespace google { 44 | namespace protobuf { 45 | namespace internal { 46 | 47 | template 48 | class HasBits { 49 | public: 50 | constexpr HasBits() PROTOBUF_ALWAYS_INLINE : has_bits_{} {} 51 | 52 | void Clear() PROTOBUF_ALWAYS_INLINE { 53 | memset(has_bits_, 0, sizeof(has_bits_)); 54 | } 55 | 56 | uint32& operator[](int index) PROTOBUF_ALWAYS_INLINE { 57 | return has_bits_[index]; 58 | } 59 | 60 | const uint32& operator[](int index) const PROTOBUF_ALWAYS_INLINE { 61 | return has_bits_[index]; 62 | } 63 | 64 | bool operator==(const HasBits& rhs) const { 65 | return memcmp(has_bits_, rhs.has_bits_, sizeof(has_bits_)) == 0; 66 | } 67 | 68 | bool operator!=(const HasBits& rhs) const { 69 | return !(*this == rhs); 70 | } 71 | 72 | void Or(const HasBits& rhs) { 73 | for (size_t i = 0; i < doublewords; i++) has_bits_[i] |= rhs[i]; 74 | } 75 | 76 | bool empty() const; 77 | 78 | private: 79 | uint32 has_bits_[doublewords]; 80 | }; 81 | 82 | template <> 83 | inline bool HasBits<1>::empty() const { 84 | return !has_bits_[0]; 85 | } 86 | 87 | template <> 88 | inline bool HasBits<2>::empty() const { 89 | return !(has_bits_[0] | has_bits_[1]); 90 | } 91 | 92 | template <> 93 | inline bool HasBits<3>::empty() const { 94 | return !(has_bits_[0] | has_bits_[1] | has_bits_[2]); 95 | } 96 | 97 | template <> 98 | inline bool HasBits<4>::empty() const { 99 | return !(has_bits_[0] | has_bits_[1] | has_bits_[2] | has_bits_[3]); 100 | } 101 | 102 | template 103 | inline bool HasBits::empty() const { 104 | for (size_t i = 0; i < doublewords; ++i) { 105 | if (has_bits_[i]) return false; 106 | } 107 | return true; 108 | } 109 | 110 | } // namespace internal 111 | } // namespace protobuf 112 | } // namespace google 113 | 114 | #include 115 | 116 | #endif // GOOGLE_PROTOBUF_HAS_BITS_H__ 117 | -------------------------------------------------------------------------------- /man/sentencepiece_encode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentencepiece.R 3 | \name{sentencepiece_encode} 4 | \alias{sentencepiece_encode} 5 | \title{Tokenise text alongside a Sentencepiece model} 6 | \usage{ 7 | sentencepiece_encode( 8 | model, 9 | x, 10 | type = c("subwords", "ids"), 11 | nbest = -1L, 12 | alpha = 0.1 13 | ) 14 | } 15 | \arguments{ 16 | \item{model}{an object of class \code{sentencepiece} as returned by \code{\link{sentencepiece_load_model}} or \code{\link{sentencepiece}}} 17 | 18 | \item{x}{a character vector of text (in UTF-8 Encoding)} 19 | 20 | \item{type}{a character string, either 'subwords' or 'ids' to get the subwords or the corresponding ids of these subwords as defined in the vocabulary of the model. 21 | Defaults to 'subwords'.} 22 | 23 | \item{nbest}{integer indicating the number of segmentations to extract. See the details. The argument is not used if you do not provide a value for it.} 24 | 25 | \item{alpha}{smoothing parameter to perform subword regularisation. Typical values are 0.1, 0.2 or 0.5. See the details. 
The argument is not used if you do not provide a value for it or do not provide a value for \code{nbest}.} 26 | } 27 | \value{ 28 | a list with tokenised text, one for each element of \code{x}, 29 | unless you provide \code{nbest} without providing \code{alpha}, in which case the result is a list of lists of \code{nbest} tokenised texts 30 | } 31 | \description{ 32 | Tokenise text alongside a Sentencepiece model 33 | } 34 | \details{ 35 | If you specify \code{alpha} to perform subword regularisation, keep in mind the following. \cr 36 | When \code{alpha} is 0.0, one segmentation is uniformly sampled from the \code{nbest} candidates or the lattice. 37 | The best Viterbi segmentation is more likely to be sampled when setting larger \code{alpha} values like 0.1. \cr 38 | \itemize{ 39 | \item If you provide a positive value for \code{nbest}, one segmentation is sampled (approximately) from the \code{nbest} candidates. 40 | \item If you provide a negative value for \code{nbest}, one segmentation is sampled from the hypotheses (lattice) according to the generation probabilities, using the forward-filtering and backward-sampling algorithm. 41 | } 42 | \code{nbest} and \code{alpha} correspond respectively to the parameters \code{l} and \code{alpha} 43 | in the paper \url{https://arxiv.org/abs/1804.10959} (where \code{nbest} < 0 means l = infinity).\cr 44 | 45 | If the model is a BPE model, \code{alpha} is the merge probability \code{p} explained in \url{https://arxiv.org/abs/1910.13267}. 46 | In a BPE model, nbest-based sampling is not supported, so the \code{nbest} parameter is ignored, although 47 | it still needs to be provided if you want to make use of \code{alpha}. 48 | } 49 | \examples{ 50 | model <- system.file(package = "sentencepiece", "models", "nl-fr-dekamer.model") 51 | model <- sentencepiece_load_model(file = model) 52 | 53 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 54 | "On est d'accord sur le prix de la biere?") 55 | sentencepiece_encode(model, x = txt, type = "subwords") 56 | sentencepiece_encode(model, x = txt, type = "ids") 57 | 58 | ## Examples using subword regularisation 59 | model <- system.file(package = "sentencepiece", "models", "nl-fr-dekamer-unigram.model") 60 | model <- sentencepiece_load_model(file = model) 61 | 62 | txt <- c("Goed zo", 63 | "On est d'accord") 64 | sentencepiece_encode(model, x = txt, type = "subwords", nbest = 4) 65 | sentencepiece_encode(model, x = txt, type = "ids", nbest = 4) 66 | sentencepiece_encode(model, x = txt, type = "subwords", nbest = 2) 67 | sentencepiece_encode(model, x = txt, type = "ids", nbest = 2) 68 | sentencepiece_encode(model, x = txt, type = "subwords", nbest = 1) 69 | sentencepiece_encode(model, x = txt, type = "ids", nbest = 1) 70 | sentencepiece_encode(model, x = txt, type = "subwords", nbest = 4, alpha = 0.1) 71 | sentencepiece_encode(model, x = txt, type = "ids", nbest = 4, alpha = 0.1) 72 | sentencepiece_encode(model, x = txt, type = "subwords", nbest = -1, alpha = 0.1) 73 | sentencepiece_encode(model, x = txt, type = "ids", nbest = -1, alpha = 0.1) 74 | sentencepiece_encode(model, x = txt, type = "subwords", nbest = -1, alpha = 0) 75 | sentencepiece_encode(model, x = txt, type = "ids", nbest = -1, alpha = 0) 76 | } 77 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/generated_enum_reflection.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc.
All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // Author: jasonh@google.com (Jason Hsueh) 32 | // 33 | // This header is logically internal, but is made public because it is used 34 | // from protocol-compiler-generated code, which may reside in other components. 35 | // It provides reflection support for generated enums, and is included in 36 | // generated .pb.h files and should have minimal dependencies. The methods are 37 | // implemented in generated_message_reflection.cc. 38 | 39 | #ifndef GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__ 40 | #define GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__ 41 | 42 | #include 43 | 44 | #include 45 | #include 46 | #include 47 | 48 | #ifdef SWIG 49 | #error "You cannot SWIG proto headers" 50 | #endif 51 | 52 | #include 53 | 54 | namespace google { 55 | namespace protobuf { 56 | class EnumDescriptor; 57 | } // namespace protobuf 58 | } // namespace google 59 | 60 | namespace google { 61 | namespace protobuf { 62 | 63 | // Returns the EnumDescriptor for enum type E, which must be a 64 | // proto-declared enum type. Code generated by the protocol compiler 65 | // will include specializations of this template for each enum type declared. 66 | template 67 | const EnumDescriptor* GetEnumDescriptor(); 68 | 69 | namespace internal { 70 | 71 | // Helper for EnumType_Parse functions: try to parse the string 'name' as 72 | // an enum name of the given type, returning true and filling in value on 73 | // success, or returning false and leaving value unchanged on failure. 
74 | PROTOBUF_EXPORT bool ParseNamedEnum(const EnumDescriptor* descriptor, 75 | ConstStringParam name, int* value); 76 | 77 | template 78 | bool ParseNamedEnum(const EnumDescriptor* descriptor, ConstStringParam name, 79 | EnumType* value) { 80 | int tmp; 81 | if (!ParseNamedEnum(descriptor, name, &tmp)) return false; 82 | *value = static_cast(tmp); 83 | return true; 84 | } 85 | 86 | // Just a wrapper around printing the name of a value. The main point of this 87 | // function is not to be inlined, so that you can do this without including 88 | // descriptor.h. 89 | PROTOBUF_EXPORT const std::string& NameOfEnum(const EnumDescriptor* descriptor, 90 | int value); 91 | 92 | } // namespace internal 93 | } // namespace protobuf 94 | } // namespace google 95 | 96 | #include 97 | 98 | #endif // GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__ 99 | -------------------------------------------------------------------------------- /src/sentencepiece/src/unigram_model_trainer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef UNIGRAM_MODEL_TRAINER_H_ 16 | #define UNIGRAM_MODEL_TRAINER_H_ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include "sentencepiece_model.pb.h" 24 | #include "third_party/absl/strings/string_view.h" 25 | #include "trainer_interface.h" 26 | #include "unigram_model.h" 27 | #include "util.h" 28 | 29 | namespace sentencepiece { 30 | namespace unigram { 31 | 32 | using string_util::UnicodeText; 33 | 34 | class TrainerModel : public Model { 35 | public: 36 | using SentencePieces = std::vector>; 37 | 38 | TrainerModel() {} 39 | TrainerModel(const ModelProto &model_proto) = delete; 40 | TrainerModel(const TrainerSpec &trainer_spec, 41 | const NormalizerSpec &normalizaiton_spec); 42 | ~TrainerModel() override; 43 | 44 | // Returns the sentencepieces. 45 | // The meta symbols, e.g., are NOT included. 46 | const SentencePieces &GetSentencePieces() const; 47 | 48 | // Sets sentencepieces. The sentencepieces are moved. 49 | // The meta symbols, e.g., are NOT included. 50 | void SetSentencePieces(SentencePieces &&sentencepieces); 51 | 52 | EncodeResult Encode(absl::string_view normalized) const override { 53 | return {}; 54 | } 55 | 56 | private: 57 | SentencePieces sentencepieces_; 58 | TrainerSpec trainer_spec_; 59 | NormalizerSpec normalizer_spec_; 60 | ModelProto model_proto_data_; 61 | }; 62 | 63 | class Trainer : public TrainerInterface { 64 | public: 65 | Trainer(const TrainerSpec &trainer_spec, 66 | const NormalizerSpec &normalizer_spec, 67 | const NormalizerSpec &denormalizer_spec) 68 | : TrainerInterface::TrainerInterface(trainer_spec, normalizer_spec, 69 | denormalizer_spec) {} 70 | 71 | util::Status Train() override; 72 | 73 | private: 74 | // FRIEND_TEST(TrainerTest, IsValidSentencePieceTest); 75 | 76 | // Makes seed pieces from the training corpus. 
77 | // The size of seed pieces is determined by seed_sentencepiece_size. 78 | // node_int_type should be of integer type (int32 or int64), 79 | // determined by train_extremely_large_corpus. 80 | template 81 | TrainerModel::SentencePieces MakeSeedSentencePieces() const; 82 | 83 | // Executes the E step of EM and returns expected count. 84 | // The index of return array is the vocab id. 85 | // |objective| is a negative likelihood of the current model. 86 | // |num_token| is the number of total tokens to tokenize 87 | // training corpus. 88 | std::vector RunEStep(const TrainerModel &model, float *objective, 89 | int64 *num_tokens) const; 90 | 91 | // Executes the M step of EM with the expected frequency and 92 | // returns new pieces. 93 | TrainerModel::SentencePieces RunMStep( 94 | const TrainerModel &model, const std::vector &expected) const; 95 | 96 | // Heuristically prunes the current pieces. 97 | // This is called after each EM sub-iteration. 98 | TrainerModel::SentencePieces PruneSentencePieces( 99 | const TrainerModel &model) const; 100 | 101 | // Makes the final sentence pieces by incorporating the required characters 102 | // and control/user defined symbols. 103 | TrainerModel::SentencePieces FinalizeSentencePieces( 104 | const TrainerModel &model) const; 105 | 106 | // When the size of SentencePieces becomes less than desired_vocab_size_, 107 | // break the main training loop. desired_vocab_size_ = 1.1 * vocab_size_ 108 | // for now. 109 | int desired_vocab_size_; 110 | }; 111 | } // namespace unigram 112 | } // namespace sentencepiece 113 | #endif // UNIGRAM_MODEL_TRAINER_H_ 114 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/stubs/hash.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // Author: kenton@google.com (Kenton Varda) 32 | 33 | #ifndef GOOGLE_PROTOBUF_STUBS_HASH_H__ 34 | #define GOOGLE_PROTOBUF_STUBS_HASH_H__ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | # define GOOGLE_PROTOBUF_HASH_NAMESPACE_DECLARATION_START \ 42 | namespace google { \ 43 | namespace protobuf { 44 | # define GOOGLE_PROTOBUF_HASH_NAMESPACE_DECLARATION_END }} 45 | 46 | namespace google { 47 | namespace protobuf { 48 | 49 | template <typename Key> 50 | struct hash : public std::hash<Key> {}; 51 | 52 | template <typename Key> 53 | struct hash<const Key*> { 54 | inline size_t operator()(const Key* key) const { 55 | return reinterpret_cast<size_t>(key); 56 | } 57 | }; 58 | 59 | // Unlike the old SGI version, the TR1 "hash" does not special-case char*. So, 60 | // we go ahead and provide our own implementation. 61 | template <> 62 | struct hash<const char*> { 63 | inline size_t operator()(const char* str) const { 64 | size_t result = 0; 65 | for (; *str != '\0'; str++) { 66 | result = 5 * result + static_cast<size_t>(*str); 67 | } 68 | return result; 69 | } 70 | }; 71 | 72 | template<> 73 | struct hash<bool> { 74 | size_t operator()(bool x) const { 75 | return static_cast<size_t>(x); 76 | } 77 | }; 78 | 79 | template <> 80 | struct hash<std::string> { 81 | inline size_t operator()(const std::string& key) const { 82 | return hash<const char*>()(key.c_str()); 83 | } 84 | 85 | static const size_t bucket_size = 4; 86 | static const size_t min_buckets = 8; 87 | inline bool operator()(const std::string& a, const std::string& b) const { 88 | return a < b; 89 | } 90 | }; 91 | 92 | template <typename First, typename Second> 93 | struct hash<std::pair<First, Second> > { 94 | inline size_t operator()(const std::pair<First, Second>& key) const { 95 | size_t first_hash = hash<First>()(key.first); 96 | size_t second_hash = hash<Second>()(key.second); 97 | 98 | // FIXME(kenton): What is the best way to compute this hash? I have 99 | // no idea! This seems a bit better than an XOR. 100 | return first_hash * ((1 << 16) - 1) + second_hash; 101 | } 102 | 103 | static const size_t bucket_size = 4; 104 | static const size_t min_buckets = 8; 105 | inline bool operator()(const std::pair<First, Second>& a, 106 | const std::pair<First, Second>& b) const { 107 | return a < b; 108 | } 109 | }; 110 | 111 | } // namespace protobuf 112 | } // namespace google 113 | 114 | #endif // GOOGLE_PROTOBUF_STUBS_HASH_H__ 115 | -------------------------------------------------------------------------------- /src/sentencepiece/src/spm_decode_main.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include "common.h" 20 | #include "filesystem.h" 21 | #include "init.h" 22 | #include "sentencepiece.pb.h" 23 | #include "sentencepiece_processor.h" 24 | #include "third_party/absl/flags/flag.h" 25 | #include "third_party/absl/strings/str_split.h" 26 | #include "util.h" 27 | 28 | ABSL_FLAG(std::string, model, "", "model file name"); 29 | ABSL_FLAG(std::string, input, "", "input filename"); 30 | ABSL_FLAG(std::string, output, "", "output filename"); 31 | ABSL_FLAG(std::string, input_format, "piece", "choose from piece or id"); 32 | ABSL_FLAG(std::string, output_format, "string", "choose from string or proto"); 33 | ABSL_FLAG(std::string, extra_options, "", 34 | "':' separated encoder extra options, e.g., \"reverse:bos:eos\""); 35 | 36 | int main(int argc, char *argv[]) { 37 | sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); 38 | std::vector rest_args; 39 | 40 | if (absl::GetFlag(FLAGS_input).empty()) { 41 | for (int i = 1; i < argc; ++i) { 42 | rest_args.push_back(std::string(argv[i])); 43 | } 44 | } else { 45 | rest_args.push_back(absl::GetFlag(FLAGS_input)); 46 | } 47 | 48 | if (rest_args.empty()) 49 | rest_args.push_back(""); // empty means that reading from stdin. 50 | 51 | CHECK(!absl::GetFlag(FLAGS_model).empty()); 52 | 53 | sentencepiece::SentencePieceProcessor sp; 54 | CHECK_OK(sp.Load(absl::GetFlag(FLAGS_model))); 55 | CHECK_OK(sp.SetDecodeExtraOptions(absl::GetFlag(FLAGS_extra_options))); 56 | 57 | auto output = 58 | sentencepiece::filesystem::NewWritableFile(absl::GetFlag(FLAGS_output)); 59 | CHECK_OK(output->status()); 60 | 61 | std::string detok, line; 62 | sentencepiece::SentencePieceText spt; 63 | std::function &pieces)> process; 64 | 65 | auto ToIds = [&](const std::vector &pieces) { 66 | std::vector ids; 67 | ids.reserve(pieces.size()); 68 | for (const auto &s : pieces) { 69 | ids.push_back(atoi(s.c_str())); 70 | } 71 | return ids; 72 | }; 73 | 74 | if (absl::GetFlag(FLAGS_input_format) == "piece") { 75 | if (absl::GetFlag(FLAGS_output_format) == "string") { 76 | process = [&](const std::vector &pieces) { 77 | CHECK_OK(sp.Decode(pieces, &detok)); 78 | output->WriteLine(detok); 79 | }; 80 | } else if (absl::GetFlag(FLAGS_output_format) == "proto") { 81 | process = [&](const std::vector &pieces) { 82 | CHECK_OK(sp.Decode(pieces, &spt)); 83 | }; 84 | } else { 85 | LOG(FATAL) << "Unknown output format: " 86 | << absl::GetFlag(FLAGS_output_format); 87 | } 88 | } else if (absl::GetFlag(FLAGS_input_format) == "id") { 89 | if (absl::GetFlag(FLAGS_output_format) == "string") { 90 | process = [&](const std::vector &pieces) { 91 | CHECK_OK(sp.Decode(ToIds(pieces), &detok)); 92 | output->WriteLine(detok); 93 | }; 94 | } else if (absl::GetFlag(FLAGS_output_format) == "proto") { 95 | process = [&](const std::vector &pieces) { 96 | CHECK_OK(sp.Decode(ToIds(pieces), &spt)); 97 | }; 98 | } else { 99 | LOG(FATAL) << "Unknown output format: " 100 | << absl::GetFlag(FLAGS_output_format); 101 | } 102 | } else { 103 | LOG(FATAL) << "Unknown input format: " << absl::GetFlag(FLAGS_input_format); 104 | } 105 | 106 | for (const auto &filename : rest_args) { 107 | auto input = sentencepiece::filesystem::NewReadableFile(filename); 108 | CHECK_OK(input->status()); 109 | while (input->ReadLine(&line)) { 110 | const auto pieces = absl::StrSplit(line, " "); 111 | process(pieces); 112 | } 113 | } 114 | 115 | return 0; 
116 | } 117 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/stubs/status.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | #ifndef GOOGLE_PROTOBUF_STUBS_STATUS_H_ 31 | #define GOOGLE_PROTOBUF_STUBS_STATUS_H_ 32 | 33 | #include 34 | #include 35 | 36 | #include 37 | #include 38 | 39 | #include 40 | 41 | namespace google { 42 | namespace protobuf { 43 | namespace util { 44 | namespace error { 45 | // These values must match error codes defined in google/rpc/code.proto. 46 | enum Code { 47 | OK = 0, 48 | CANCELLED = 1, 49 | UNKNOWN = 2, 50 | INVALID_ARGUMENT = 3, 51 | DEADLINE_EXCEEDED = 4, 52 | NOT_FOUND = 5, 53 | ALREADY_EXISTS = 6, 54 | PERMISSION_DENIED = 7, 55 | UNAUTHENTICATED = 16, 56 | RESOURCE_EXHAUSTED = 8, 57 | FAILED_PRECONDITION = 9, 58 | ABORTED = 10, 59 | OUT_OF_RANGE = 11, 60 | UNIMPLEMENTED = 12, 61 | INTERNAL = 13, 62 | UNAVAILABLE = 14, 63 | DATA_LOSS = 15, 64 | }; 65 | } // namespace error 66 | 67 | class PROTOBUF_EXPORT Status { 68 | public: 69 | // Creates a "successful" status. 70 | Status(); 71 | 72 | // Create a status in the canonical error space with the specified 73 | // code, and error message. If "code == 0", error_message is 74 | // ignored and a Status object identical to Status::OK is 75 | // constructed. 
76 | Status(error::Code error_code, StringPiece error_message); 77 | Status(const Status&); 78 | Status& operator=(const Status& x); 79 | ~Status() {} 80 | 81 | // Some pre-defined Status objects 82 | static const Status OK; // Identical to 0-arg constructor 83 | static const Status CANCELLED; 84 | static const Status UNKNOWN; 85 | 86 | // Accessor 87 | bool ok() const { 88 | return error_code_ == error::OK; 89 | } 90 | int error_code() const { 91 | return error_code_; 92 | } 93 | error::Code code() const { 94 | return error_code_; 95 | } 96 | StringPiece error_message() const { 97 | return error_message_; 98 | } 99 | StringPiece message() const { 100 | return error_message_; 101 | } 102 | 103 | bool operator==(const Status& x) const; 104 | bool operator!=(const Status& x) const { 105 | return !operator==(x); 106 | } 107 | 108 | // Return a combination of the error code name and message. 109 | std::string ToString() const; 110 | 111 | private: 112 | error::Code error_code_; 113 | std::string error_message_; 114 | }; 115 | 116 | // Prints a human-readable representation of 'x' to 'os'. 117 | PROTOBUF_EXPORT std::ostream& operator<<(std::ostream& os, const Status& x); 118 | 119 | } // namespace util 120 | } // namespace protobuf 121 | } // namespace google 122 | 123 | #include 124 | 125 | #endif // GOOGLE_PROTOBUF_STUBS_STATUS_H_ 126 | -------------------------------------------------------------------------------- /src/sentencepiece/src/spm_normalize_main.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "builder.h" 16 | #include "common.h" 17 | #include "filesystem.h" 18 | #include "init.h" 19 | #include "normalizer.h" 20 | #include "sentencepiece.pb.h" 21 | #include "sentencepiece_model.pb.h" 22 | #include "sentencepiece_processor.h" 23 | #include "sentencepiece_trainer.h" 24 | #include "third_party/absl/flags/flag.h" 25 | 26 | ABSL_FLAG(std::string, model, "", "Model file name"); 27 | ABSL_FLAG(bool, use_internal_normalization, false, 28 | "Use NormalizerSpec \"as-is\" to run the normalizer " 29 | "for SentencePiece segmentation"); 30 | ABSL_FLAG(std::string, normalization_rule_name, "", 31 | "Normalization rule name. " 32 | "Choose from nfkc or identity"); 33 | ABSL_FLAG(std::string, normalization_rule_tsv, "", 34 | "Normalization rule TSV file. 
"); 35 | ABSL_FLAG(bool, remove_extra_whitespaces, true, "Remove extra whitespaces"); 36 | ABSL_FLAG(bool, decompile, false, 37 | "Decompile compiled charamap and output it as TSV."); 38 | ABSL_FLAG(std::string, input, "", "Input filename"); 39 | ABSL_FLAG(std::string, output, "", "Output filename"); 40 | 41 | using sentencepiece::ModelProto; 42 | using sentencepiece::NormalizerSpec; 43 | using sentencepiece::SentencePieceProcessor; 44 | using sentencepiece::SentencePieceTrainer; 45 | using sentencepiece::normalizer::Builder; 46 | using sentencepiece::normalizer::Normalizer; 47 | 48 | int main(int argc, char *argv[]) { 49 | sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); 50 | std::vector rest_args; 51 | 52 | if (absl::GetFlag(FLAGS_input).empty()) { 53 | for (int i = 1; i < argc; ++i) { 54 | rest_args.push_back(std::string(argv[i])); 55 | } 56 | } else { 57 | rest_args.push_back(absl::GetFlag(FLAGS_input)); 58 | } 59 | 60 | NormalizerSpec spec; 61 | 62 | if (!absl::GetFlag(FLAGS_model).empty()) { 63 | ModelProto model_proto; 64 | SentencePieceProcessor sp; 65 | CHECK_OK(sp.Load(absl::GetFlag(FLAGS_model))); 66 | spec = sp.model_proto().normalizer_spec(); 67 | } else if (!absl::GetFlag(FLAGS_normalization_rule_tsv).empty()) { 68 | spec.set_normalization_rule_tsv( 69 | absl::GetFlag(FLAGS_normalization_rule_tsv)); 70 | CHECK_OK(SentencePieceTrainer::PopulateNormalizerSpec(&spec)); 71 | } else if (!absl::GetFlag(FLAGS_normalization_rule_name).empty()) { 72 | spec.set_name(absl::GetFlag(FLAGS_normalization_rule_name)); 73 | CHECK_OK(SentencePieceTrainer::PopulateNormalizerSpec(&spec)); 74 | } else { 75 | LOG(FATAL) << "Sets --model, normalization_rule_tsv, or " 76 | "normalization_rule_name flag."; 77 | } 78 | 79 | // Uses the normalizer spec encoded in the model_pb. 80 | if (!absl::GetFlag(FLAGS_use_internal_normalization)) { 81 | spec.set_add_dummy_prefix(false); // do not add dummy prefix. 82 | spec.set_escape_whitespaces(false); // do not output meta symbol. 83 | spec.set_remove_extra_whitespaces( 84 | absl::GetFlag(FLAGS_remove_extra_whitespaces)); 85 | } 86 | 87 | if (absl::GetFlag(FLAGS_decompile)) { 88 | Builder::CharsMap chars_map; 89 | CHECK_OK( 90 | Builder::DecompileCharsMap(spec.precompiled_charsmap(), &chars_map)); 91 | CHECK_OK(Builder::SaveCharsMap(absl::GetFlag(FLAGS_output), chars_map)); 92 | } else { 93 | const Normalizer normalizer(spec); 94 | auto output = 95 | sentencepiece::filesystem::NewWritableFile(absl::GetFlag(FLAGS_output)); 96 | CHECK_OK(output->status()); 97 | 98 | if (rest_args.empty()) { 99 | rest_args.push_back(""); // empty means that read from stdin. 100 | } 101 | 102 | std::string line; 103 | for (const auto &filename : rest_args) { 104 | auto input = sentencepiece::filesystem::NewReadableFile(filename); 105 | CHECK_OK(input->status()); 106 | while (input->ReadLine(&line)) { 107 | output->WriteLine(normalizer.Normalize(line)); 108 | } 109 | } 110 | } 111 | 112 | return 0; 113 | } 114 | --------------------------------------------------------------------------------