├── inst ├── models │ ├── nl-fr-dekamer.model │ ├── nl.wiki.bpe.vs1000.model │ ├── nl-fr-dekamer-unigram.model │ └── nl.wiki.bpe.vs1000.d25.w2v.bin └── spc-help │ └── spm_train ├── src ├── third_party │ ├── CMakeLists.txt │ ├── absl │ │ ├── flags │ │ │ ├── parse.h │ │ │ └── flag.h │ │ ├── container │ │ │ ├── flat_hash_set.h │ │ │ └── flat_hash_map.h │ │ ├── strings │ │ │ ├── strip.h │ │ │ ├── numbers.h │ │ │ ├── str_format.h │ │ │ ├── match.h │ │ │ ├── ascii.h │ │ │ ├── str_cat.h │ │ │ ├── str_replace.h │ │ │ ├── str_join.h │ │ │ └── str_split.h │ │ └── memory │ │ │ └── memory.h │ ├── esaxx │ │ └── LICENSE │ ├── darts_clone │ │ └── LICENSE │ └── protobuf-lite │ │ ├── LICENSE │ │ ├── google │ │ └── protobuf │ │ │ ├── port.h │ │ │ ├── stubs │ │ │ ├── once.h │ │ │ ├── stl_util.h │ │ │ ├── time.h │ │ │ ├── stringprintf.h │ │ │ ├── hash.h │ │ │ └── status.h │ │ │ ├── generated_enum_util.h │ │ │ ├── has_bits.h │ │ │ └── generated_enum_reflection.h │ │ ├── statusor.cc │ │ ├── zero_copy_stream.cc │ │ ├── implicit_weak_message.cc │ │ └── generated_enum_util.cc ├── config.h ├── sentencepiece │ └── src │ │ ├── model_factory.h │ │ ├── test_main.cc │ │ ├── word_model.h │ │ ├── char_model.h │ │ ├── trainer_factory.h │ │ ├── word_model.cc │ │ ├── char_model_trainer.h │ │ ├── unicode_script.cc │ │ ├── freelist_test.cc │ │ ├── char_model.cc │ │ ├── init.h │ │ ├── word_model_trainer.h │ │ ├── unicode_script_test.cc │ │ ├── filesystem_test.cc │ │ ├── model_factory.cc │ │ ├── trainer_factory_test.cc │ │ ├── bpe_model.h │ │ ├── model_factory_test.cc │ │ ├── char_model_trainer.cc │ │ ├── filesystem.h │ │ ├── testharness.cc │ │ ├── spm_export_vocab_main.cc │ │ ├── pretokenizer_for_training.cc │ │ ├── trainer_factory.cc │ │ ├── pretokenizer_for_training.h │ │ ├── word_model_trainer.cc │ │ ├── freelist.h │ │ ├── word_model_trainer_test.cc │ │ ├── sentencepiece.proto │ │ ├── char_model_trainer_test.cc │ │ ├── word_model_test.cc │ │ ├── pretokenizer_for_training_test.cc │ │ ├── unigram_model_trainer_test.cc │ │ ├── unicode_script.h │ │ ├── filesystem.cc │ │ ├── char_model_test.cc │ │ ├── unigram_model_trainer.h │ │ ├── spm_decode_main.cc │ │ └── spm_normalize_main.cc ├── rcpp_wordpiece.cpp └── Makevars ├── .gitignore ├── R ├── pkg.R ├── utils.R ├── word2vec.R ├── RcppExports.R └── wordpiece.R ├── .Rbuildignore ├── sentencepiece.Rproj ├── NAMESPACE ├── man ├── txt_remove_.Rd ├── sentencepiece_load_model.Rd ├── read_word2vec.Rd ├── sentencepiece_decode.Rd ├── wordpiece_encode.Rd ├── BPEembedder.Rd ├── BPEembed.Rd ├── predict.BPEembed.Rd ├── sentencepiece.Rd ├── sentencepiece_download_model.Rd └── sentencepiece_encode.Rd ├── .github └── workflows │ ├── R-CMD-check.yml │ └── rhub.yaml └── NEWS.md /inst/models/nl-fr-dekamer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bnosac/sentencepiece/HEAD/inst/models/nl-fr-dekamer.model -------------------------------------------------------------------------------- /src/third_party/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(absl/strings darts_clone esaxx protobuf-lite) 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /inst/models/nl.wiki.bpe.vs1000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bnosac/sentencepiece/HEAD/inst/models/nl.wiki.bpe.vs1000.model 
-------------------------------------------------------------------------------- /inst/models/nl-fr-dekamer-unigram.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bnosac/sentencepiece/HEAD/inst/models/nl-fr-dekamer-unigram.model -------------------------------------------------------------------------------- /inst/models/nl.wiki.bpe.vs1000.d25.w2v.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bnosac/sentencepiece/HEAD/inst/models/nl.wiki.bpe.vs1000.d25.w2v.bin -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | inst/extdata 9 | inst/extdata/dekamer.txt 10 | dev 11 | -------------------------------------------------------------------------------- /R/pkg.R: -------------------------------------------------------------------------------- 1 | #' @importFrom Rcpp evalCpp 2 | #' @importFrom utils head capture.output packageVersion 3 | #' @importFrom stats predict 4 | #' @useDynLib sentencepiece 5 | NULL 6 | -------------------------------------------------------------------------------- /src/config.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H_ 2 | #define CONFIG_H_ 3 | 4 | #define VERSION "0.1.84" 5 | #define PACKAGE "sentencepiece" 6 | #define PACKAGE_STRING "sentencepiece" 7 | 8 | 9 | #endif // CONFIG_H_ -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | .github 4 | LICENSE$ 5 | dev 6 | inst/extdata 7 | inst/extdata/dekamer.txt 8 | inst/models/english.model 9 | inst/models/english.vocab 10 | inst/models/nl.wiki.bpe.vs200000.model 11 | inst/models/nl.wiki.bpe.vs200000.vocab 12 | -------------------------------------------------------------------------------- /sentencepiece.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageRoxygenize: rd,collate,namespace 19 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #' @title Remove prefixed underscore 5 | #' @description Remove prefixed underscore unicode character 'LOWER ONE EIGHTH BLOCK' (U+2581) 6 | #' @param x a character vector 7 | #' @param replacement character string how to replace the underscore. Defaults to the empty string. 
8 | #' @return \code{x} where the prefixed underscore is removed 9 | #' @export 10 | #' @examples 11 | #' x <- c("\u2581word", "hello", "_regularunderscore") 12 | #' x 13 | #' txt_remove_(x) 14 | txt_remove_ <- function(x, replacement = ""){ 15 | gsub(pattern = "^\u2581", replacement = replacement, x) 16 | } -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(predict,BPEembed) 4 | S3method(print,BPEembed) 5 | S3method(print,sentencepiece) 6 | export(BPEembed) 7 | export(BPEembedder) 8 | export(read_word2vec) 9 | export(sentencepiece) 10 | export(sentencepiece_decode) 11 | export(sentencepiece_download_model) 12 | export(sentencepiece_encode) 13 | export(sentencepiece_load_model) 14 | export(txt_remove_) 15 | export(wordpiece_encode) 16 | importFrom(Rcpp,evalCpp) 17 | importFrom(stats,predict) 18 | importFrom(utils,capture.output) 19 | importFrom(utils,head) 20 | importFrom(utils,packageVersion) 21 | useDynLib(sentencepiece) 22 | -------------------------------------------------------------------------------- /man/txt_remove_.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{txt_remove_} 4 | \alias{txt_remove_} 5 | \title{Remove prefixed underscore} 6 | \usage{ 7 | txt_remove_(x, replacement = "") 8 | } 9 | \arguments{ 10 | \item{x}{a character vector} 11 | 12 | \item{replacement}{character string how to replace the underscore. Defaults to the empty string.} 13 | } 14 | \value{ 15 | \code{x} where the prefixed underscore is removed 16 | } 17 | \description{ 18 | Remove prefixed underscore unicode character 'LOWER ONE EIGHTH BLOCK' (U+2581) 19 | } 20 | \examples{ 21 | x <- c("\u2581word", "hello", "_regularunderscore") 22 | x 23 | txt_remove_(x) 24 | } 25 | -------------------------------------------------------------------------------- /src/third_party/absl/flags/parse.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef ABSL_FLAGS_PARSE_H_ 16 | #define ABSL_FLAGS_PARSE_H_ 17 | 18 | #include 19 | 20 | namespace absl { 21 | 22 | std::vector ParseCommandLine(int argc, char *argv[]); 23 | } // namespace absl 24 | 25 | #endif // ABSL_FLAGS_PARSE_H_ 26 | -------------------------------------------------------------------------------- /src/third_party/absl/container/flat_hash_set.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef ABSL_CONTAINER_FLAT_HASH_SET_ 16 | #define ABSL_CONTAINER_FLAT_HASH_SET_ 17 | 18 | #include <unordered_set> 19 | 20 | namespace absl { 21 | 22 | template <typename Type, typename Hash = std::hash<Type>, 23 | typename Eq = std::equal_to<Type>, 24 | typename Allocator = std::allocator<Type>> 25 | using flat_hash_set = std::unordered_set<Type, Hash, Eq, Allocator>; 26 | 27 | } 28 | 29 | #endif // ABSL_CONTAINER_FLAT_HASH_SET_ 30 | -------------------------------------------------------------------------------- /src/sentencepiece/src/model_factory.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef MODEL_FACTORY_H_ 16 | #define MODEL_FACTORY_H_ 17 | 18 | #include <memory> 19 | 20 | #include "model_interface.h" 21 | #include "sentencepiece_model.pb.h" 22 | 23 | namespace sentencepiece { 24 | 25 | class ModelFactory { 26 | public: 27 | // Creates Model instance from |model_proto|. 28 | static std::unique_ptr<ModelInterface> Create(const ModelProto &model_proto); 29 | }; 30 | } // namespace sentencepiece 31 | #endif // MODEL_FACTORY_H_ 32 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/strip.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License.
15 | // 16 | #ifndef ABSL_STRINGS_STRIP_H_ 17 | #define ABSL_STRINGS_STRIP_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/match.h" 22 | 23 | namespace absl { 24 | 25 | inline bool ConsumePrefix(absl::string_view *str, absl::string_view expected) { 26 | if (!absl::StartsWith(*str, expected)) return false; 27 | str->remove_prefix(expected.size()); 28 | return true; 29 | } 30 | 31 | } // namespace absl 32 | #endif // ABSL_STRINGS_STRIP_H 33 | -------------------------------------------------------------------------------- /src/third_party/absl/container/flat_hash_map.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef ABSL_CONTAINER_FLAT_HASH_MAP_ 16 | #define ABSL_CONTAINER_FLAT_HASH_MAP_ 17 | 18 | #include 19 | 20 | namespace absl { 21 | 22 | template , 23 | typename Eq = std::equal_to, 24 | typename Allocator = std::allocator>> 25 | using flat_hash_map = std::unordered_map; 26 | 27 | } 28 | 29 | #endif // ABSL_CONTAINER_FLAT_HASH_MAP_ 30 | -------------------------------------------------------------------------------- /src/sentencepiece/src/test_main.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "init.h" 16 | #include "testharness.h" 17 | 18 | #ifdef OS_WIN 19 | ABSL_FLAG(std::string, test_srcdir, "..\\data", "Data directory."); 20 | #else 21 | ABSL_FLAG(std::string, test_srcdir, "../data", "Data directory."); 22 | #endif 23 | 24 | ABSL_FLAG(std::string, test_tmpdir, "test_tmp", "Temporary directory."); 25 | 26 | int main(int argc, char **argv) { 27 | sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); 28 | sentencepiece::test::RunAllTests(); 29 | return 0; 30 | } 31 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/numbers.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_NUMBERS_H_ 17 | #define ABSL_STRINGS_NUMBERS_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/string_view.h" 22 | 23 | namespace absl { 24 | 25 | // TODO(taku): Re-implement this, as it is slow. 26 | template 27 | inline bool SimpleAtoi(absl::string_view s, T *result) { 28 | std::stringstream ss; 29 | return (ss << s.data() && ss >> *result); 30 | } 31 | 32 | } // namespace absl 33 | #endif // ABSL_STRINGS_NUMBERS_H_ 34 | -------------------------------------------------------------------------------- /src/sentencepiece/src/word_model.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef WORD_MODEL_H_ 16 | #define WORD_MODEL_H_ 17 | 18 | #include "model_interface.h" 19 | #include "sentencepiece_model.pb.h" 20 | 21 | namespace sentencepiece { 22 | namespace word { 23 | 24 | // Tokenize text with whitespaces. 25 | class Model : public ModelInterface { 26 | public: 27 | explicit Model(const ModelProto &model_proto); 28 | ~Model() override; 29 | 30 | EncodeResult Encode(absl::string_view normalized) const override; 31 | }; 32 | } // namespace word 33 | } // namespace sentencepiece 34 | #endif // WORD_MODEL_H_ 35 | -------------------------------------------------------------------------------- /src/third_party/esaxx/LICENSE: -------------------------------------------------------------------------------- 1 | This is the esaxx copyright. 2 | 3 | Copyright (c) 2010 Daisuke Okanohara All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person 6 | obtaining a copy of this software and associated documentation 7 | files (the "Software"), to deal in the Software without 8 | restriction, including without limitation the rights to use, 9 | copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the 11 | Software is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | OTHER DEALINGS IN THE SOFTWARE. 25 | -------------------------------------------------------------------------------- /src/sentencepiece/src/char_model.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef CHAR_MODEL_H_ 16 | #define CHAR_MODEL_H_ 17 | 18 | #include "model_interface.h" 19 | #include "sentencepiece_model.pb.h" 20 | 21 | namespace sentencepiece { 22 | namespace character { 23 | 24 | // Tokenize text into character sequence 25 | class Model : public ModelInterface { 26 | public: 27 | explicit Model(const ModelProto &model_proto); 28 | ~Model() override; 29 | 30 | EncodeResult Encode(absl::string_view normalized) const override; 31 | }; 32 | } // namespace character 33 | } // namespace sentencepiece 34 | #endif // CHAR_MODEL_H_ 35 | -------------------------------------------------------------------------------- /src/sentencepiece/src/trainer_factory.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef TRAINER_FACTORY_H_ 16 | #define TRAINER_FACTORY_H_ 17 | 18 | #include 19 | 20 | #include "sentencepiece_model.pb.h" 21 | #include "trainer_interface.h" 22 | 23 | namespace sentencepiece { 24 | 25 | class TrainerFactory { 26 | public: 27 | // Creates Trainer instance from |trainer_spec| and |normalizer_spec|. 28 | static std::unique_ptr Create( 29 | const TrainerSpec &trainer_spec, const NormalizerSpec &normalizer_spec, 30 | const NormalizerSpec &denormalizer_spec); 31 | }; 32 | } // namespace sentencepiece 33 | #endif // TRAINER_FACTORY_H_ 34 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/str_format.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_STR_FORMAT_H 17 | #define ABSL_STRINGS_STR_FORMAT_H 18 | 19 | #include 20 | 21 | #include 22 | 23 | #include "third_party/absl/strings/string_view.h" 24 | 25 | namespace absl { 26 | 27 | template 28 | std::string StrFormat(const char *format, Args const &... args) { 29 | const int len = ::snprintf(nullptr, 0, format, args...); 30 | std::string s; 31 | s.resize(len); 32 | ::snprintf(&s[0], s.size() + 1, format, args...); 33 | return s; 34 | } 35 | 36 | } // namespace absl 37 | #endif // ABSL_MEMORY_MEMORY_H_ 38 | -------------------------------------------------------------------------------- /src/sentencepiece/src/word_model.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "util.h" 16 | #include "word_model.h" 17 | 18 | namespace sentencepiece { 19 | namespace word { 20 | 21 | Model::Model(const ModelProto &model_proto) { 22 | model_proto_ = &model_proto; 23 | InitializePieces(); 24 | } 25 | 26 | Model::~Model() {} 27 | 28 | EncodeResult Model::Encode(absl::string_view normalized) const { 29 | if (!status().ok() || normalized.empty()) { 30 | return {}; 31 | } 32 | 33 | EncodeResult output; 34 | for (const auto &w : SplitIntoWords(normalized)) { 35 | output.emplace_back(w, PieceToId(w)); 36 | } 37 | 38 | return output; 39 | } 40 | 41 | } // namespace word 42 | } // namespace sentencepiece 43 | -------------------------------------------------------------------------------- /man/sentencepiece_load_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentencepiece.R 3 | \name{sentencepiece_load_model} 4 | \alias{sentencepiece_load_model} 5 | \title{Load a Sentencepiece model} 6 | \usage{ 7 | sentencepiece_load_model(file = "sentencepiece.model") 8 | } 9 | \arguments{ 10 | \item{file}{path to the file containing the Sentencepiece model} 11 | } 12 | \value{ 13 | an object of class \code{sentencepiece} which is a list with elements 14 | \itemize{ 15 | \item{model: an Rcpp pointer to the model} 16 | \item{model_path: the path to the model} 17 | \item{vocab_size: the size of the Sentencepiece vocabulary} 18 | \item{vocabulary: the Sentencepiece vocabulary which is a data.frame with columns id and subword} 19 | } 20 | } 21 | \description{ 22 | Load a Sentencepiece model which either was trained with \code{\link{sentencepiece}} or which you have found in the wild. 23 | } 24 | \examples{ 25 | model <- system.file(package = "sentencepiece", "models", "nl-fr-dekamer.model") 26 | model <- sentencepiece_load_model(file = model) 27 | 28 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 29 | "On est d'accord sur le prix de la biere?") 30 | sentencepiece_encode(model, x = txt, type = "subwords") 31 | sentencepiece_encode(model, x = txt, type = "ids") 32 | } 33 | -------------------------------------------------------------------------------- /src/sentencepiece/src/char_model_trainer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef CHAR_MODEL_TRAINER_H_ 16 | #define CHAR_MODEL_TRAINER_H_ 17 | 18 | #include "sentencepiece_model.pb.h" 19 | #include "trainer_interface.h" 20 | 21 | namespace sentencepiece { 22 | namespace character { 23 | 24 | // Trainer class for character model. 
25 | class Trainer : public TrainerInterface { 26 | public: 27 | Trainer(const TrainerSpec &trainer_spec, 28 | const NormalizerSpec &normalizer_spec, 29 | const NormalizerSpec &denormalizer_spec) 30 | : TrainerInterface::TrainerInterface(trainer_spec, normalizer_spec, 31 | denormalizer_spec) {} 32 | 33 | util::Status Train() override; 34 | }; 35 | } // namespace character 36 | } // namespace sentencepiece 37 | #endif // CHAR_MODEL_TRAINER_H_ 38 | -------------------------------------------------------------------------------- /src/sentencepiece/src/unicode_script.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | 17 | #include "third_party/absl/container/flat_hash_map.h" 18 | #include "unicode_script.h" 19 | #include "unicode_script_map.h" 20 | #include "util.h" 21 | 22 | namespace sentencepiece { 23 | namespace unicode_script { 24 | namespace { 25 | class GetScriptInternal { 26 | public: 27 | GetScriptInternal() { InitTable(&smap_); } 28 | 29 | ScriptType GetScript(char32 c) const { 30 | return port::FindWithDefault(smap_, c, ScriptType::U_Common); 31 | } 32 | 33 | private: 34 | absl::flat_hash_map smap_; 35 | }; 36 | } // namespace 37 | 38 | ScriptType GetScript(char32 c) { 39 | static GetScriptInternal sc; 40 | return sc.GetScript(c); 41 | } 42 | } // namespace unicode_script 43 | } // namespace sentencepiece 44 | -------------------------------------------------------------------------------- /src/sentencepiece/src/freelist_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "freelist.h" 16 | #include "testharness.h" 17 | 18 | namespace sentencepiece { 19 | namespace model { 20 | 21 | TEST(FreeListTest, BasicTest) { 22 | FreeList l(5); 23 | EXPECT_EQ(0, l.size()); 24 | 25 | constexpr size_t kSize = 32; 26 | 27 | for (size_t i = 0; i < kSize; ++i) { 28 | int *n = l.Allocate(); 29 | EXPECT_EQ(0, *n); 30 | *n = i; 31 | } 32 | 33 | EXPECT_EQ(kSize, l.size()); 34 | for (size_t i = 0; i < kSize; ++i) { 35 | EXPECT_EQ(i, *l[i]); 36 | } 37 | 38 | l.Free(); 39 | EXPECT_EQ(0, l.size()); 40 | 41 | // Zero-initialized after `Free`. 
42 | for (size_t i = 0; i < kSize; ++i) { 43 | int *n = l.Allocate(); 44 | EXPECT_EQ(0, *n); 45 | } 46 | } 47 | } // namespace model 48 | } // namespace sentencepiece 49 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/match.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_MATCH_H_ 17 | #define ABSL_STRINGS_MATCH_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/string_view.h" 22 | 23 | namespace absl { 24 | 25 | inline bool StartsWith(absl::string_view text, absl::string_view prefix) { 26 | return prefix.empty() || 27 | (text.size() >= prefix.size() && 28 | memcmp(text.data(), prefix.data(), prefix.size()) == 0); 29 | } 30 | 31 | inline bool EndsWith(absl::string_view text, absl::string_view suffix) { 32 | return suffix.empty() || (text.size() >= suffix.size() && 33 | memcmp(text.data() + (text.size() - suffix.size()), 34 | suffix.data(), suffix.size()) == 0); 35 | } 36 | 37 | } // namespace absl 38 | #endif // ABSL_STRINGS_MATCH_H_ 39 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/ascii.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // 16 | #ifndef ABSL_STRINGS_ASCII_H_ 17 | #define ABSL_STRINGS_ASCII_H_ 18 | 19 | #include 20 | 21 | #include 22 | 23 | #include "third_party/absl/strings/string_view.h" 24 | 25 | namespace absl { 26 | 27 | inline std::string AsciiStrToUpper(absl::string_view value) { 28 | std::string upper_value = std::string(value); 29 | std::transform(upper_value.begin(), upper_value.end(), upper_value.begin(), 30 | ::toupper); 31 | return upper_value; 32 | } 33 | 34 | inline std::string AsciiStrToLower(absl::string_view value) { 35 | std::string lower_value = std::string(value); 36 | std::transform(lower_value.begin(), lower_value.end(), lower_value.begin(), 37 | ::tolower); 38 | return lower_value; 39 | } 40 | } // namespace absl 41 | #endif // ABSL_STRINGS_ASCII_H_ 42 | -------------------------------------------------------------------------------- /src/third_party/darts_clone/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008-2011, Susumu Yata 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | - Neither the name of the nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 9 | 10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 11 | -------------------------------------------------------------------------------- /src/sentencepiece/src/char_model.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "char_model.h" 16 | #include "util.h" 17 | 18 | namespace sentencepiece { 19 | namespace character { 20 | 21 | Model::Model(const ModelProto &model_proto) { 22 | model_proto_ = &model_proto; 23 | InitializePieces(); 24 | } 25 | 26 | Model::~Model() {} 27 | 28 | EncodeResult Model::Encode(absl::string_view normalized) const { 29 | if (!status().ok() || normalized.empty()) { 30 | return {}; 31 | } 32 | 33 | // Splits the input into character sequence 34 | EncodeResult output; 35 | while (!normalized.empty()) { 36 | const int mblen = matcher_->PrefixMatch(normalized); 37 | absl::string_view w(normalized.data(), mblen); 38 | output.emplace_back(w, PieceToId(w)); 39 | normalized.remove_prefix(mblen); 40 | } 41 | 42 | return output; 43 | } 44 | 45 | } // namespace character 46 | } // namespace sentencepiece 47 | -------------------------------------------------------------------------------- /src/sentencepiece/src/init.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef INIT_H_ 16 | #define INIT_H_ 17 | 18 | #include "common.h" 19 | #include "third_party/absl/flags/flag.h" 20 | #include "third_party/absl/flags/parse.h" 21 | 22 | ABSL_DECLARE_FLAG(int32, minloglevel); 23 | 24 | namespace sentencepiece { 25 | inline void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, 26 | bool remove_arg = true) { 27 | const auto unused_args = absl::ParseCommandLine(*argc, *argv); 28 | 29 | if (remove_arg) { 30 | char **argv_val = *argv; 31 | *argv = argv_val = argv_val + *argc - unused_args.size(); 32 | std::copy(unused_args.begin(), unused_args.end(), argv_val); 33 | *argc = static_cast(unused_args.size()); 34 | } 35 | 36 | logging::SetMinLogLevel(absl::GetFlag(FLAGS_minloglevel)); 37 | } 38 | } // namespace sentencepiece 39 | 40 | #endif // INIT_H_ 41 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 23 | - {os: ubuntu-latest, r: 'release'} 24 | - {os: ubuntu-latest, r: 'oldrel'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | - {os: ubuntu-latest, r: 'oldrel-2'} 27 | - {os: ubuntu-latest, r: 'oldrel-3'} 28 | 29 | env: 30 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 31 | RSPM: ${{ matrix.config.rspm }} 32 | GITHUB_PAT: ${{ secrets.PAT }} 33 | steps: 34 | - uses: actions/checkout@v3 35 | 36 | - uses: r-lib/actions/setup-pandoc@v2 37 | 
38 | - uses: r-lib/actions/setup-r@v2 39 | with: 40 | r-version: ${{ matrix.config.r }} 41 | http-user-agent: ${{ matrix.config.http-user-agent }} 42 | use-public-rspm: true 43 | 44 | - uses: r-lib/actions/setup-r-dependencies@v2 45 | with: 46 | extra-packages: any::rcmdcheck 47 | needs: check 48 | 49 | - uses: r-lib/actions/check-r-package@v2 50 | with: 51 | upload-snapshots: true -------------------------------------------------------------------------------- /man/read_word2vec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/word2vec.R 3 | \name{read_word2vec} 4 | \alias{read_word2vec} 5 | \title{Read a word2vec embedding file} 6 | \usage{ 7 | read_word2vec( 8 | x, 9 | type = c("txt", "bin"), 10 | n = .Machine$integer.max, 11 | encoding = "UTF-8", 12 | normalize = TRUE 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{path to the file} 17 | 18 | \item{type}{either 'bin' or 'txt' indicating the \code{file} is a binary file or a text file} 19 | 20 | \item{n}{integer, indicating to limit the number of words to read in. Defaults to reading all words.} 21 | 22 | \item{encoding}{encoding to be assumed for the words. Defaults to 'UTF-8'} 23 | 24 | \item{normalize}{logical indicating to normalize the embeddings by dividing by the factor (sqrt(sum(x . x) / length(x))). Defaults to TRUE.} 25 | } 26 | \value{ 27 | a matrix with one row per token containing the embedding of the token 28 | } 29 | \description{ 30 | Read a word2vec embedding file as a dense matrix. This uses \code{\link[word2vec]{read.wordvectors}} from the word2vec package. 31 | } 32 | \examples{ 33 | folder <- system.file(package = "sentencepiece", "models") 34 | embedding <- file.path(folder, "nl.wiki.bpe.vs1000.d25.w2v.bin") 35 | embedding <- read_word2vec(embedding, type = "bin") 36 | head(embedding) 37 | embedding <- file.path(folder, "nl.wiki.bpe.vs1000.d25.w2v.txt") 38 | embedding <- read_word2vec(embedding, type = "txt") 39 | head(embedding, n = 10) 40 | } 41 | \seealso{ 42 | \code{\link[word2vec]{read.wordvectors}} 43 | } 44 | -------------------------------------------------------------------------------- /src/sentencepiece/src/word_model_trainer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef WORD_MODEL_TRAINER_H_ 16 | #define WORD_MODEL_TRAINER_H_ 17 | 18 | #include "sentencepiece_model.pb.h" 19 | #include "trainer_interface.h" 20 | 21 | namespace sentencepiece { 22 | namespace word { 23 | 24 | // Trainer class for word model. 25 | // 26 | // Word model simply counts the frequency of 27 | // space-delimited tokens, then keep top 28 | // |vocab_size| frequent tokens. 
29 | class Trainer : public TrainerInterface { 30 | public: 31 | Trainer(const TrainerSpec &trainer_spec, 32 | const NormalizerSpec &normalizer_spec, 33 | const NormalizerSpec &denormalizer_spec) 34 | : TrainerInterface::TrainerInterface(trainer_spec, normalizer_spec, 35 | denormalizer_spec) {} 36 | 37 | util::Status Train() override; 38 | }; 39 | } // namespace word 40 | } // namespace sentencepiece 41 | #endif // WORD_MODEL_TRAINER_H_ 42 | -------------------------------------------------------------------------------- /R/word2vec.R: -------------------------------------------------------------------------------- 1 | #' @title Read a word2vec embedding file 2 | #' @description Read a word2vec embedding file as a dense matrix. This uses \code{\link[word2vec]{read.wordvectors}} from the word2vec package. 3 | #' @param x path to the file 4 | #' @param type either 'bin' or 'txt' indicating the \code{file} is a binary file or a text file 5 | #' @param n integer, indicating to limit the number of words to read in. Defaults to reading all words. 6 | #' @param normalize logical indicating to normalize the embeddings by dividing by the factor (sqrt(sum(x . x) / length(x))). Defaults to TRUE. 7 | #' @param encoding encoding to be assumed for the words. Defaults to 'UTF-8' 8 | #' @return a matrix with one row per token containing the embedding of the token 9 | #' @seealso \code{\link[word2vec]{read.wordvectors}} 10 | #' @export 11 | #' @examples 12 | #' folder <- system.file(package = "sentencepiece", "models") 13 | #' embedding <- file.path(folder, "nl.wiki.bpe.vs1000.d25.w2v.bin") 14 | #' embedding <- read_word2vec(embedding, type = "bin") 15 | #' head(embedding) 16 | #' embedding <- file.path(folder, "nl.wiki.bpe.vs1000.d25.w2v.txt") 17 | #' embedding <- read_word2vec(embedding, type = "txt") 18 | #' head(embedding, n = 10) 19 | read_word2vec <- function(x, type = c("txt", "bin"), n = .Machine$integer.max, encoding = "UTF-8", normalize = TRUE){ 20 | type <- match.arg(type) 21 | requireNamespace("word2vec") 22 | if(packageVersion("word2vec") < "0.2.0"){ 23 | stop("This requires word2vec package >= 0.2.0") 24 | } 25 | embedding <- word2vec::read.wordvectors(file = x, type = type, n = n, encoding = encoding, normalize = normalize) 26 | embedding 27 | } -------------------------------------------------------------------------------- /man/sentencepiece_decode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentencepiece.R 3 | \name{sentencepiece_decode} 4 | \alias{sentencepiece_decode} 5 | \title{Decode encoded sequences back to text} 6 | \usage{ 7 | sentencepiece_decode(model, x) 8 | } 9 | \arguments{ 10 | \item{model}{an object of class \code{sentencepiece} as returned by \code{\link{sentencepiece_load_model}} or \code{\link{sentencepiece}}} 11 | 12 | \item{x}{an integer vector of Sentencepiece id's or a list of these} 13 | } 14 | \value{ 15 | a character vector of detokenised text or if you encoded with \code{nbest}, a list of these 16 | } 17 | \description{ 18 | Decode a sequence of Sentencepiece ids into text again 19 | } 20 | \examples{ 21 | model <- system.file(package = "sentencepiece", "models", "nl-fr-dekamer.model") 22 | model <- sentencepiece_load_model(file = model) 23 | 24 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 25 | "On est d'accord sur le prix de la biere?") 26 | 27 | x <- sentencepiece_encode(model, x = txt, type = "subwords") 28 | 
sentencepiece_decode(model, x) 29 | x <- sentencepiece_encode(model, x = txt, type = "ids") 30 | sentencepiece_decode(model, x) 31 | 32 | model <- system.file(package = "sentencepiece", "models", 33 | "nl-fr-dekamer-unigram.model") 34 | model <- sentencepiece_load_model(file = model) 35 | x <- sentencepiece_encode(model, x = txt, type = "subwords", nbest = 3) 36 | sentencepiece_decode(model, x) 37 | x <- sentencepiece_encode(model, x = txt, type = "subwords", 38 | nbest = 3, alpha = 0.1) 39 | sentencepiece_decode(model, x) 40 | } 41 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/str_cat.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_STR_CAT_H_ 17 | #define ABSL_STRINGS_STR_CAT_H_ 18 | 19 | #include 20 | #include 21 | 22 | #include "third_party/absl/strings/numbers.h" 23 | #include "third_party/absl/strings/string_view.h" 24 | 25 | namespace absl { 26 | 27 | inline std::string StrCat(int v) { 28 | std::ostringstream os; 29 | os << v; 30 | return os.str(); 31 | } 32 | 33 | inline std::string StrCat(absl::string_view str) { 34 | return std::string(str.data(), str.size()); 35 | } 36 | 37 | template 38 | inline std::string StrCat(absl::string_view first, const T &...rest) { 39 | return StrCat(first) + StrCat(rest...); 40 | } 41 | 42 | template 43 | inline std::string StrCat(int first, const T &...rest) { 44 | return StrCat(first) + StrCat(rest...); 45 | } 46 | 47 | inline void StrAppend(std::string *base, absl::string_view str) { 48 | base->append(str.data(), str.size()); 49 | } 50 | 51 | } // namespace absl 52 | #endif // ABSL_STRINGS_STR_CAT_H_ 53 | -------------------------------------------------------------------------------- /src/sentencepiece/src/unicode_script_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "common.h" 16 | #include "testharness.h" 17 | #include "third_party/absl/strings/string_view.h" 18 | #include "unicode_script.h" 19 | #include "util.h" 20 | 21 | namespace sentencepiece { 22 | namespace unicode_script { 23 | ScriptType GetScriptType(absl::string_view s) { 24 | const auto ut = string_util::UTF8ToUnicodeText(s); 25 | CHECK_EQ(1, ut.size()); 26 | return GetScript(ut[0]); 27 | } 28 | 29 | TEST(UnicodeScript, GetScriptTypeTest) { 30 | EXPECT_EQ(U_Han, GetScriptType("京")); 31 | EXPECT_EQ(U_Han, GetScriptType("太")); 32 | EXPECT_EQ(U_Hiragana, GetScriptType("い")); 33 | EXPECT_EQ(U_Katakana, GetScriptType("グ")); 34 | EXPECT_EQ(U_Common, GetScriptType("ー")); 35 | EXPECT_EQ(U_Latin, GetScriptType("a")); 36 | EXPECT_EQ(U_Latin, GetScriptType("A")); 37 | EXPECT_EQ(U_Common, GetScriptType("0")); 38 | EXPECT_EQ(U_Common, GetScriptType("$")); 39 | EXPECT_EQ(U_Common, GetScriptType("@")); 40 | EXPECT_EQ(U_Common, GetScriptType("-")); 41 | } 42 | } // namespace unicode_script 43 | } // namespace sentencepiece 44 | -------------------------------------------------------------------------------- /src/rcpp_wordpiece.cpp: -------------------------------------------------------------------------------- 1 | #include <Rcpp.h> 2 | 3 | 4 | // [[Rcpp::export]] 5 | Rcpp::StringVector wordpiece_encode_as_subwords(std::string x, std::vector<std::string> vocabulary, std::string unk_token="[UNK]", unsigned int max_input_chars_per_word=100) { 6 | // Tokenizes a piece of text into its word pieces, using a greedy longest-match-first algorithm to perform tokenization 7 | Rcpp::StringVector output_tokens; 8 | unsigned int len = x.length(); 9 | if(len > max_input_chars_per_word){ 10 | output_tokens.push_back(unk_token); 11 | } else{ 12 | unsigned int start = 0; 13 | std::vector<std::string> sub_tokens; 14 | while(start < len){ 15 | unsigned int end = len - 1; 16 | std::string cur_substr = ""; 17 | std::string substr; 18 | while(start <= end){ 19 | substr = x.substr(start, end - start + 1); 20 | // Rcpp::Rcout << substr << ":" << start << "-" << end <<"\n"; 21 | if(start > 0){ 22 | substr = "##" + substr; 23 | } 24 | if(std::find(vocabulary.begin(), vocabulary.end(), substr) != vocabulary.end()){ 25 | cur_substr = substr; 26 | break; 27 | } 28 | if (end > 0) { 29 | end = end - 1; 30 | } else { 31 | break; 32 | } 33 | } 34 | if(cur_substr == ""){ 35 | sub_tokens.push_back(unk_token); 36 | break; 37 | } 38 | sub_tokens.push_back(cur_substr); 39 | start = end + 1; 40 | } 41 | if(sub_tokens.size() == 0){ 42 | output_tokens.push_back(unk_token); 43 | }else{ 44 | for (unsigned int i = 0; i < sub_tokens.size(); i++){ 45 | output_tokens.push_back(sub_tokens[i]); 46 | } 47 | } 48 | } 49 | return output_tokens; 50 | } -------------------------------------------------------------------------------- /src/sentencepiece/src/filesystem_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "filesystem.h" 16 | #include "testharness.h" 17 | #include "third_party/absl/strings/str_cat.h" 18 | #include "util.h" 19 | 20 | namespace sentencepiece { 21 | 22 | TEST(UtilTest, FilesystemTest) { 23 | const std::vector kData = { 24 | "This" 25 | "is" 26 | "a" 27 | "test"}; 28 | 29 | { 30 | auto output = filesystem::NewWritableFile( 31 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "test_file")); 32 | for (size_t i = 0; i < kData.size(); ++i) { 33 | output->WriteLine(kData[i]); 34 | } 35 | } 36 | 37 | { 38 | auto input = filesystem::NewReadableFile( 39 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "test_file")); 40 | std::string line; 41 | for (size_t i = 0; i < kData.size(); ++i) { 42 | EXPECT_TRUE(input->ReadLine(&line)); 43 | EXPECT_EQ(kData[i], line); 44 | } 45 | EXPECT_FALSE(input->ReadLine(&line)); 46 | } 47 | } 48 | 49 | TEST(UtilTest, FilesystemInvalidFileTest) { 50 | auto input = filesystem::NewReadableFile("__UNKNOWN__FILE__"); 51 | EXPECT_FALSE(input->status().ok()); 52 | } 53 | 54 | } // namespace sentencepiece 55 | -------------------------------------------------------------------------------- /src/sentencepiece/src/model_factory.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "bpe_model.h" 16 | #include "char_model.h" 17 | #include "model_factory.h" 18 | #include "third_party/absl/memory/memory.h" 19 | #include "unigram_model.h" 20 | #include "word_model.h" 21 | 22 | namespace sentencepiece { 23 | 24 | // Instantiate Model instance from |model_proto| 25 | std::unique_ptr ModelFactory::Create( 26 | const ModelProto& model_proto) { 27 | const auto& trainer_spec = model_proto.trainer_spec(); 28 | 29 | switch (trainer_spec.model_type()) { 30 | case TrainerSpec::UNIGRAM: 31 | return absl::make_unique(model_proto); 32 | break; 33 | case TrainerSpec::BPE: 34 | return absl::make_unique(model_proto); 35 | break; 36 | case TrainerSpec::WORD: 37 | return absl::make_unique(model_proto); 38 | break; 39 | case TrainerSpec::CHAR: 40 | return absl::make_unique(model_proto); 41 | break; 42 | default: 43 | LOG(ERROR) << "Unknown model_type: " << trainer_spec.model_type(); 44 | return nullptr; 45 | break; 46 | } 47 | 48 | return absl::make_unique(model_proto); 49 | } 50 | } // namespace sentencepiece 51 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2008 Google Inc. All rights reserved. 
2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | Code generated by the Protocol Buffer compiler is owned by the owner 30 | of the input file used when generating it. This code is not 31 | standalone and requires a support library to be linked with it. This 32 | support library is itself covered by the above license. 33 | -------------------------------------------------------------------------------- /src/sentencepiece/src/trainer_factory_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include "testharness.h" 16 | #include "trainer_factory.h" 17 | 18 | namespace sentencepiece { 19 | 20 | TEST(TrainerFactoryTest, BasicTest) { 21 | TrainerSpec trainer_spec; 22 | NormalizerSpec normalizer_spec; 23 | NormalizerSpec denormalizer_spec; 24 | 25 | trainer_spec.set_model_prefix("model"); 26 | trainer_spec.add_input("input"); 27 | 28 | { 29 | trainer_spec.set_model_type(TrainerSpec::UNIGRAM); 30 | auto m = TrainerFactory::Create(trainer_spec, normalizer_spec, 31 | denormalizer_spec); 32 | } 33 | 34 | { 35 | trainer_spec.set_model_type(TrainerSpec::BPE); 36 | auto m = TrainerFactory::Create(trainer_spec, normalizer_spec, 37 | denormalizer_spec); 38 | } 39 | 40 | { 41 | trainer_spec.set_model_type(TrainerSpec::WORD); 42 | auto m = TrainerFactory::Create(trainer_spec, normalizer_spec, 43 | denormalizer_spec); 44 | } 45 | 46 | { 47 | trainer_spec.set_model_type(TrainerSpec::CHAR); 48 | auto m = TrainerFactory::Create(trainer_spec, normalizer_spec, 49 | denormalizer_spec); 50 | } 51 | } 52 | } // namespace sentencepiece 53 | -------------------------------------------------------------------------------- /man/wordpiece_encode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wordpiece.R 3 | \name{wordpiece_encode} 4 | \alias{wordpiece_encode} 5 | \title{Wordpiece encoding} 6 | \usage{ 7 | wordpiece_encode( 8 | x, 9 | vocabulary = character(), 10 | type = c("subwords", "ids"), 11 | unk_token = "[UNK]", 12 | max_input_chars_per_word = 100L 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{a character vector with text which can be splitted based on white space to obtain words} 17 | 18 | \item{vocabulary}{a character vector of the vocabulary} 19 | 20 | \item{type}{a character string, either 'subwords' or 'ids' to get the subwords or the corresponding ids of these subwords as defined in the vocabulary of the model. 21 | Defaults to 'subwords'.} 22 | 23 | \item{unk_token}{character string with a value for a token which is not part of the vocabulary. Defaults to '[UNK]'} 24 | 25 | \item{max_input_chars_per_word}{integer. A word which is longer than this specified number of characters will be set to the unknown token.} 26 | } 27 | \value{ 28 | a list of subword tokens 29 | } 30 | \description{ 31 | Wordpiece encoding, usefull for BERT-style tokenisation. 32 | Experimental version mimicing class WordpieceTokenizer from \url{https://github.com/huggingface/transformers/blob/master/src/transformers/models/bert/tokenization_bert.py} 33 | } 34 | \examples{ 35 | wordpiece_encode("unaffable", vocabulary = c("un", "##aff", "##able")) 36 | wordpiece_encode(x = c("unaffable", "unaffableun"), 37 | vocabulary = c("un", "##aff", "##able")) 38 | wordpiece_encode(x = c("unaffable", "unaffableun", "unknown territory"), 39 | vocabulary = c("un", "##aff", "##able", "##un")) 40 | wordpiece_encode(x = c("unaffable", "unaffableun", "unknown territory"), 41 | vocabulary = c("un", "##aff", "##able", "##un"), 42 | type = "ids") 43 | } 44 | -------------------------------------------------------------------------------- /src/third_party/absl/flags/flag.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef ABSL_FLAGS_FLAG_H_ 16 | #define ABSL_FLAGS_FLAG_H_ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | namespace absl { 24 | namespace internal { 25 | struct FlagFunc; 26 | 27 | void RegisterFlag(const std::string &name, FlagFunc *func); 28 | } // namespace internal 29 | 30 | template 31 | class Flag { 32 | public: 33 | Flag(const char *name, const char *type, const char *help, 34 | const T &defautl_value); 35 | virtual ~Flag(); 36 | const T &value() const; 37 | void set_value(const T &value); 38 | void set_value_as_str(const std::string &value_as_str); 39 | 40 | private: 41 | T value_; 42 | std::unique_ptr func_; 43 | }; 44 | 45 | template 46 | const T &GetFlag(const Flag &flag) { 47 | return flag.value(); 48 | } 49 | 50 | template 51 | void SetFlag(Flag *flag, const V &v) { 52 | const T value(v); 53 | flag->set_value(value); 54 | } 55 | } // namespace absl 56 | 57 | #define ABSL_FLAG(Type, name, defautl_value, help) \ 58 | absl::Flag FLAGS_##name(#name, #Type, help, defautl_value); 59 | 60 | #define ABSL_DECLARE_FLAG(Type, name) extern absl::Flag FLAGS_##name; 61 | 62 | #endif // ABSL_FLAGS_FLAG_H_ 63 | -------------------------------------------------------------------------------- /src/sentencepiece/src/bpe_model.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef BPE_MODEL_H_ 16 | #define BPE_MODEL_H_ 17 | 18 | #include "model_interface.h" 19 | #include "sentencepiece_model.pb.h" 20 | 21 | namespace sentencepiece { 22 | namespace bpe { 23 | 24 | // Segmentation model with BPE (Byte Pair Encoding) 25 | // Details: 26 | // Neural Machine Translation of Rare Words with Subword Units 27 | // https://arxiv.org/abs/1508.07909 28 | // 29 | // https://en.wikipedia.org/wiki/Byte_pair_encoding 30 | class Model : public ModelInterface { 31 | public: 32 | explicit Model(const ModelProto &model_proto); 33 | ~Model() override; 34 | 35 | EncodeResult Encode(absl::string_view normalized) const override { 36 | return SampleEncode(normalized, 0.0); 37 | } 38 | 39 | // Sampling with BPE-dropout: https://arxiv.org/pdf/1910.13267.pdf 40 | // `alpha` is dropout probability in BPE-dropout paper. 41 | // Skips merge operation with `alpha` probability. 42 | // When alpha <= 0.0, no sampling is performed. 
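  // Illustrative sketch (not part of the upstream header; `model` and
  // `normalized` are assumed to be an instance of this class and a normalized
  // input string):
  //
  //   EncodeResult a = model.SampleEncode(normalized, 0.1f);  // each merge skipped with probability 0.1
  //   EncodeResult b = model.SampleEncode(normalized, 0.1f);  // may yield a different segmentation than `a`
  //   EncodeResult c = model.Encode(normalized);              // deterministic; equivalent to SampleEncode(normalized, 0.0)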
43 | EncodeResult SampleEncode(absl::string_view normalized, 44 | float alpha) const override; 45 | 46 | bool IsSampleEncodeAvailable() const override { return true; } 47 | 48 | bool IsNBestEncodeAvailable() const override { return false; } 49 | }; 50 | } // namespace bpe 51 | } // namespace sentencepiece 52 | #endif // BPE_MODEL_H_ 53 | -------------------------------------------------------------------------------- /src/sentencepiece/src/model_factory_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "model_factory.h" 16 | #include "testharness.h" 17 | 18 | namespace sentencepiece { 19 | 20 | TEST(ModelFactoryTest, BasicTest) { 21 | ModelProto model_proto; 22 | 23 | auto *sp1 = model_proto.add_pieces(); 24 | auto *sp2 = model_proto.add_pieces(); 25 | auto *sp3 = model_proto.add_pieces(); 26 | 27 | sp1->set_type(ModelProto::SentencePiece::UNKNOWN); 28 | sp1->set_piece(""); 29 | sp2->set_type(ModelProto::SentencePiece::CONTROL); 30 | sp2->set_piece(""); 31 | sp3->set_type(ModelProto::SentencePiece::CONTROL); 32 | sp3->set_piece(""); 33 | 34 | auto *sp4 = model_proto.add_pieces(); 35 | sp4->set_piece("test"); 36 | sp4->set_score(1.0); 37 | 38 | { 39 | model_proto.mutable_trainer_spec()->set_model_type(TrainerSpec::UNIGRAM); 40 | auto m = ModelFactory::Create(model_proto); 41 | } 42 | 43 | { 44 | model_proto.mutable_trainer_spec()->set_model_type(TrainerSpec::BPE); 45 | auto m = ModelFactory::Create(model_proto); 46 | } 47 | 48 | { 49 | model_proto.mutable_trainer_spec()->set_model_type(TrainerSpec::WORD); 50 | auto m = ModelFactory::Create(model_proto); 51 | } 52 | 53 | { 54 | model_proto.mutable_trainer_spec()->set_model_type(TrainerSpec::CHAR); 55 | auto m = ModelFactory::Create(model_proto); 56 | } 57 | } 58 | } // namespace sentencepiece 59 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | spc_train <- function(args) { 5 | .Call('_sentencepiece_spc_train', PACKAGE = 'sentencepiece', args) 6 | } 7 | 8 | spc_load_model <- function(file) { 9 | .Call('_sentencepiece_spc_load_model', PACKAGE = 'sentencepiece', file) 10 | } 11 | 12 | spc_encode_as_subwords <- function(model, x) { 13 | .Call('_sentencepiece_spc_encode_as_subwords', PACKAGE = 'sentencepiece', model, x) 14 | } 15 | 16 | spc_encode_as_ids <- function(model, x) { 17 | .Call('_sentencepiece_spc_encode_as_ids', PACKAGE = 'sentencepiece', model, x) 18 | } 19 | 20 | spc_encode_as_subwords_sample <- function(model, x, nbest_size = -1L, alpha = 1) { 21 | .Call('_sentencepiece_spc_encode_as_subwords_sample', PACKAGE = 'sentencepiece', model, x, nbest_size, alpha) 22 | } 23 | 24 | 
spc_encode_as_ids_sample <- function(model, x, nbest_size = -1L, alpha = 1) { 25 | .Call('_sentencepiece_spc_encode_as_ids_sample', PACKAGE = 'sentencepiece', model, x, nbest_size, alpha) 26 | } 27 | 28 | spc_encode_as_subwords_nbest <- function(model, x, nbest_size = -1L) { 29 | .Call('_sentencepiece_spc_encode_as_subwords_nbest', PACKAGE = 'sentencepiece', model, x, nbest_size) 30 | } 31 | 32 | spc_encode_as_ids_nbest <- function(model, x, nbest_size = -1L) { 33 | .Call('_sentencepiece_spc_encode_as_ids_nbest', PACKAGE = 'sentencepiece', model, x, nbest_size) 34 | } 35 | 36 | spc_decode_ids <- function(model, x) { 37 | .Call('_sentencepiece_spc_decode_ids', PACKAGE = 'sentencepiece', model, x) 38 | } 39 | 40 | spc_decode_subwords <- function(model, x) { 41 | .Call('_sentencepiece_spc_decode_subwords', PACKAGE = 'sentencepiece', model, x) 42 | } 43 | 44 | wordpiece_encode_as_subwords <- function(x, vocabulary, unk_token = "[UNK]", max_input_chars_per_word = 100L) { 45 | .Call('_sentencepiece_wordpiece_encode_as_subwords', PACKAGE = 'sentencepiece', x, vocabulary, unk_token, max_input_chars_per_word) 46 | } 47 | 48 | -------------------------------------------------------------------------------- /src/sentencepiece/src/char_model_trainer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | 17 | #include "char_model.h" 18 | #include "char_model_trainer.h" 19 | #include "util.h" 20 | 21 | namespace sentencepiece { 22 | namespace character { 23 | 24 | util::Status Trainer::Train() { 25 | RETURN_IF_ERROR(status()); 26 | 27 | CHECK_OR_RETURN(normalizer_spec_.escape_whitespaces()); 28 | CHECK_EQ_OR_RETURN(TrainerSpec::CHAR, trainer_spec_.model_type()); 29 | 30 | RETURN_IF_ERROR(LoadSentences()); 31 | 32 | const int vocab_size = trainer_spec_.vocab_size() - meta_pieces_.size(); 33 | CHECK_GE_OR_RETURN(vocab_size, 0); 34 | 35 | uint64 sum = 0; 36 | for (const auto &it : required_chars_) { 37 | sum += it.second; 38 | } 39 | 40 | const auto logsum = static_cast(log(static_cast(sum))); 41 | 42 | CHECK_OR_RETURN(final_pieces_.empty()); 43 | for (const auto &it : Sorted(required_chars_)) { 44 | if (!trainer_spec_.use_all_vocab() && 45 | final_pieces_.size() == static_cast(vocab_size)) { 46 | break; 47 | } 48 | final_pieces_.emplace_back( 49 | string_util::UnicodeCharToUTF8(it.first), 50 | static_cast(log(static_cast(it.second))) - logsum); 51 | } 52 | 53 | if (trainer_spec_.use_all_vocab()) { 54 | trainer_spec_.set_vocab_size(final_pieces_.size() + meta_pieces_.size()); 55 | } 56 | 57 | return Save(); 58 | } 59 | } // namespace character 60 | } // namespace sentencepiece 61 | -------------------------------------------------------------------------------- /src/sentencepiece/src/filesystem.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef FILESYSTEM_H_ 16 | #define FILESYSTEM_H_ 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "common.h" 25 | #include "sentencepiece_processor.h" 26 | #include "third_party/absl/strings/string_view.h" 27 | 28 | namespace sentencepiece { 29 | namespace filesystem { 30 | class ReadableFile { 31 | public: 32 | ReadableFile() {} 33 | explicit ReadableFile(absl::string_view filename, bool is_binary = false) {} 34 | virtual ~ReadableFile() {} 35 | 36 | virtual util::Status status() const = 0; 37 | virtual bool ReadLine(std::string *line) = 0; 38 | virtual bool ReadAll(std::string *line) = 0; 39 | }; 40 | 41 | class WritableFile { 42 | public: 43 | WritableFile() {} 44 | explicit WritableFile(absl::string_view filename, bool is_binary = false) {} 45 | virtual ~WritableFile() {} 46 | 47 | virtual util::Status status() const = 0; 48 | virtual bool Write(absl::string_view text) = 0; 49 | virtual bool WriteLine(absl::string_view text) = 0; 50 | }; 51 | 52 | std::unique_ptr NewReadableFile(absl::string_view filename, 53 | bool is_binary = false); 54 | std::unique_ptr NewWritableFile(absl::string_view filename, 55 | bool is_binary = false); 56 | 57 | } // namespace filesystem 58 | } // namespace sentencepiece 59 | #endif // FILESYSTEM_H_ 60 | -------------------------------------------------------------------------------- /man/BPEembedder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bpemb.R 3 | \name{BPEembedder} 4 | \alias{BPEembedder} 5 | \title{Build a BPEembed model containing a Sentencepiece and Word2vec model} 6 | \usage{ 7 | BPEembedder( 8 | x, 9 | tokenizer = c("bpe", "char", "unigram", "word"), 10 | args = list(vocab_size = 8000, coverage = 0.9999), 11 | ... 12 | ) 13 | } 14 | \arguments{ 15 | \item{x}{a data.frame with columns doc_id and text} 16 | 17 | \item{tokenizer}{character string with the type of sentencepiece tokenizer. Either 'bpe', 'char', 'unigram' or 'word' for Byte Pair Encoding, Character level encoding, 18 | Unigram encoding or pretokenised word encoding. Defaults to 'bpe' (Byte Pair Encoding). 
Passed on to \code{\link{sentencepiece}}} 19 | 20 | \item{args}{a list of arguments passed on to \code{\link{sentencepiece}}} 21 | 22 | \item{...}{arguments passed on to \code{\link[word2vec]{word2vec}} for training a word2vec model} 23 | } 24 | \value{ 25 | an object of class BPEembed which is a list with elements 26 | \itemize{ 27 | \item{model: a sentencepiece model as loaded with \code{\link{sentencepiece_load_model}}} 28 | \item{embedding: a matrix with embeddings as loaded with \code{\link[word2vec]{read.wordvectors}}} 29 | \item{dim: the dimension of the embedding} 30 | \item{n: the number of elements in the vocabulary} 31 | \item{file_sentencepiece: the sentencepiece model file} 32 | \item{file_word2vec: the word2vec embedding file} 33 | } 34 | } 35 | \description{ 36 | Build a sentencepiece model on text and build a matching word2vec model on the sentencepiece vocabulary 37 | } 38 | \examples{ 39 | library(tokenizers.bpe) 40 | data(belgium_parliament, package = "tokenizers.bpe") 41 | x <- subset(belgium_parliament, language \%in\% "dutch") 42 | model <- BPEembedder(x, tokenizer = "bpe", args = list(vocab_size = 1000), 43 | type = "cbow", dim = 20, iter = 10) 44 | model 45 | 46 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.") 47 | values <- predict(model, txt, type = "encode") 48 | } 49 | \seealso{ 50 | \code{\link{sentencepiece}}, \code{\link[word2vec]{word2vec}}, \code{\link{predict.BPEembed}} 51 | } 52 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/port.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // A common header that is included across all protobuf headers. 
We do our best 32 | // to avoid #defining any macros here; instead we generally put macros in 33 | // port_def.inc and port_undef.inc so they are not visible from outside of 34 | // protobuf. 35 | 36 | #ifndef GOOGLE_PROTOBUF_PORT_H__ 37 | #define GOOGLE_PROTOBUF_PORT_H__ 38 | 39 | 40 | #include 41 | 42 | 43 | #endif // GOOGLE_PROTOBUF_PORT_H__ 44 | -------------------------------------------------------------------------------- /src/sentencepiece/src/testharness.cc: -------------------------------------------------------------------------------- 1 | #include 2 | // Copyright 2016 Google Inc. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License.! 15 | 16 | #include "testharness.h" 17 | 18 | #ifndef OS_WIN 19 | #include 20 | #include 21 | #else 22 | #include 23 | #endif 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | #include "common.h" 30 | #include "third_party/absl/strings/str_cat.h" 31 | #include "util.h" 32 | 33 | namespace sentencepiece { 34 | namespace test { 35 | 36 | namespace { 37 | struct Test { 38 | const char *base; 39 | const char *name; 40 | void (*func)(); 41 | }; 42 | std::vector *tests; 43 | } // namespace 44 | 45 | bool RegisterTest(const char *base, const char *name, void (*func)()) { 46 | if (tests == nullptr) { 47 | tests = new std::vector; 48 | } 49 | Test t; 50 | t.base = base; 51 | t.name = name; 52 | t.func = func; 53 | tests->emplace_back(t); 54 | return true; 55 | } 56 | 57 | int RunAllTests() { 58 | int num = 0; 59 | #ifdef OS_WIN 60 | _mkdir(absl::GetFlag(FLAGS_test_tmpdir).c_str()); 61 | #else 62 | mkdir(absl::GetFlag(FLAGS_test_tmpdir).c_str(), S_IRUSR | S_IWUSR | S_IXUSR); 63 | #endif 64 | 65 | if (tests == nullptr) { 66 | Rcpp::Rcerr << "No tests are found" << std::endl; 67 | return 0; 68 | } 69 | 70 | for (const Test &t : *(tests)) { 71 | Rcpp::Rcerr << "[ RUN ] " << t.base << "." << t.name << std::endl; 72 | (*t.func)(); 73 | Rcpp::Rcerr << "[ OK ] " << t.base << "." << t.name << std::endl; 74 | ++num; 75 | } 76 | Rcpp::Rcerr << "==== PASSED " << num << " tests" << std::endl; 77 | 78 | return 0; 79 | } 80 | } // namespace test 81 | } // namespace sentencepiece 82 | -------------------------------------------------------------------------------- /src/sentencepiece/src/spm_export_vocab_main.cc: -------------------------------------------------------------------------------- 1 | 2 | 3 | // Copyright 2016 Google Inc. 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // n// http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | // See the License for the specific language governing permissions and 14 | // limitations under the License.! 15 | 16 | #include 17 | 18 | #include "common.h" 19 | #include "filesystem.h" 20 | #include "init.h" 21 | #include "sentencepiece_model.pb.h" 22 | #include "sentencepiece_processor.h" 23 | #include "third_party/absl/flags/flag.h" 24 | 25 | ABSL_FLAG(std::string, output, "", "Output filename"); 26 | ABSL_FLAG(std::string, model, "", "input model file name"); 27 | ABSL_FLAG(std::string, output_format, "vocab", 28 | "output format. choose from vocab or syms. vocab outputs pieces " 29 | "and scores, syms outputs pieces and indices."); 30 | 31 | int main(int argc, char *argv[]) { 32 | sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); 33 | 34 | sentencepiece::SentencePieceProcessor sp; 35 | CHECK_OK(sp.Load(absl::GetFlag(FLAGS_model))); 36 | 37 | auto output = 38 | sentencepiece::filesystem::NewWritableFile(absl::GetFlag(FLAGS_output)); 39 | CHECK_OK(output->status()); 40 | 41 | if (absl::GetFlag(FLAGS_output_format) == "vocab") { 42 | for (const auto &piece : sp.model_proto().pieces()) { 43 | std::ostringstream os; 44 | os << piece.piece() << "\t" << piece.score(); 45 | output->WriteLine(os.str()); 46 | } 47 | } else if (absl::GetFlag(FLAGS_output_format) == "syms") { 48 | for (int i = 0; i < sp.model_proto().pieces_size(); i++) { 49 | std::ostringstream os; 50 | os << sp.model_proto().pieces(i).piece() << "\t" << i; 51 | output->WriteLine(os.str()); 52 | } 53 | } else { 54 | LOG(FATAL) << "Unsupported output format: " 55 | << absl::GetFlag(FLAGS_output_format); 56 | } 57 | 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/statusor.cc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | #include 32 | 33 | #include 34 | 35 | namespace google { 36 | namespace protobuf { 37 | namespace util { 38 | namespace internal { 39 | 40 | void StatusOrHelper::Crash(const Status& status) { 41 | GOOGLE_LOG(FATAL) << "Attempting to fetch value instead of handling error " 42 | << status.ToString(); 43 | } 44 | 45 | } // namespace internal 46 | } // namespace util 47 | } // namespace protobuf 48 | } // namespace google 49 | -------------------------------------------------------------------------------- /src/sentencepiece/src/pretokenizer_for_training.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | #include 15 | 16 | #include "pretokenizer_for_training.h" 17 | #include "third_party/absl/strings/str_replace.h" 18 | 19 | namespace sentencepiece { 20 | namespace pretokenizer { 21 | 22 | namespace { 23 | // TODO(taku): They are defined in trainer_interface.h but we 24 | // defined them explicitly to avoid the dependency to trainier_interface. 25 | // Currently, we have no separated build rules. 26 | const char kWSStr[] = "\xe2\x96\x81"; 27 | const char kUPPBoundaryStr[] = "\t"; 28 | } // namespace 29 | 30 | std::string PretokenizerForTrainingInterface::PreTokenize( 31 | absl::string_view text) const { 32 | return Postprocess(Tokenize(Preprocess(text))); 33 | } 34 | 35 | // static 36 | std::string PretokenizerForTrainingInterface::Preprocess( 37 | absl::string_view text) { 38 | // Escapes kWSStr (_) as this character may not be processed by pre-tokenizer. 39 | return absl::StrReplaceAll(text, {{kWSStr, " "}}); 40 | } 41 | 42 | // static 43 | std::string PretokenizerForTrainingInterface::Postprocess( 44 | const SentencePieceText &spt) { 45 | // Inserts kUPPBoundaryStr before/after of token boundaries. 46 | std::string output; 47 | int prev = 0; 48 | for (const auto &piece : spt.pieces()) { 49 | if (prev == piece.begin() && piece.begin() != 0) { 50 | output += kUPPBoundaryStr; 51 | } else { 52 | output.append(piece.begin() - prev, ' '); 53 | } 54 | output += piece.surface(); 55 | prev = piece.end(); 56 | } 57 | 58 | // Restores kWSStr. 
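  // Illustrative example (added comment, assumes plain ASCII input): if
  // `output` holds "I love" at this point, the replacement below turns it
  // into "I▁love", i.e. every plain space becomes the U+2581 whitespace
  // symbol kWSStr.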
59 | return absl::StrReplaceAll(output, {{" ", kWSStr}}); 60 | } 61 | 62 | } // namespace pretokenizer 63 | } // namespace sentencepiece 64 | -------------------------------------------------------------------------------- /src/sentencepiece/src/trainer_factory.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "bpe_model_trainer.h" 16 | #include "char_model_trainer.h" 17 | #include "third_party/absl/memory/memory.h" 18 | #include "trainer_factory.h" 19 | #include "unigram_model_trainer.h" 20 | #include "word_model_trainer.h" 21 | 22 | namespace sentencepiece { 23 | 24 | // Instantiate Trainer instance from trainer_spec and normalization_spec 25 | std::unique_ptr<TrainerInterface> TrainerFactory::Create( 26 | const TrainerSpec &trainer_spec, const NormalizerSpec &normalizer_spec, 27 | const NormalizerSpec &denormalizer_spec) { 28 | switch (trainer_spec.model_type()) { 29 | case TrainerSpec::UNIGRAM: 30 | return absl::make_unique<unigram::Trainer>(trainer_spec, normalizer_spec, 31 | denormalizer_spec); 32 | break; 33 | case TrainerSpec::BPE: 34 | return absl::make_unique<bpe::Trainer>(trainer_spec, normalizer_spec, 35 | denormalizer_spec); 36 | break; 37 | case TrainerSpec::WORD: 38 | return absl::make_unique<word::Trainer>(trainer_spec, normalizer_spec, 39 | denormalizer_spec); 40 | break; 41 | case TrainerSpec::CHAR: 42 | return absl::make_unique<character::Trainer>( 43 | trainer_spec, normalizer_spec, denormalizer_spec); 44 | break; 45 | default: 46 | LOG(FATAL) << "Unknown model_type: " << trainer_spec.model_type(); 47 | break; 48 | } 49 | 50 | return absl::make_unique<unigram::Trainer>(trainer_spec, normalizer_spec, 51 | denormalizer_spec); 52 | } 53 | } // namespace sentencepiece 54 | -------------------------------------------------------------------------------- /R/wordpiece.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | #' @title Wordpiece encoding 7 | #' @description Wordpiece encoding, useful for BERT-style tokenisation. 8 | #' Experimental version mimicking class WordpieceTokenizer from \url{https://github.com/huggingface/transformers/blob/master/src/transformers/models/bert/tokenization_bert.py} 9 | #' @param x a character vector with text which can be split based on white space to obtain words 10 | #' @param vocabulary a character vector of the vocabulary 11 | #' @param type a character string, either 'subwords' or 'ids' to get the subwords or the corresponding ids of these subwords as defined in the vocabulary of the model. 12 | #' Defaults to 'subwords'. 13 | #' @param unk_token character string with a value for a token which is not part of the vocabulary. Defaults to '[UNK]' 14 | #' @param max_input_chars_per_word integer. A word which is longer than this specified number of characters will be set to the unknown token.
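#' @details The encoding is greedy longest-match-first: for each white-space separated word, the longest
#' prefix which occurs in \code{vocabulary} is taken, the remainder is matched with the \code{##} continuation
#' prefix and this is repeated until the word is consumed. If at some point no vocabulary entry matches,
#' the remainder of that word is represented by \code{unk_token}. Words longer than
#' \code{max_input_chars_per_word} characters are mapped to \code{unk_token} directly.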
15 | #' @return a list of subword tokens 16 | #' @export 17 | #' @examples 18 | #' wordpiece_encode("unaffable", vocabulary = c("un", "##aff", "##able")) 19 | #' wordpiece_encode(x = c("unaffable", "unaffableun"), 20 | #' vocabulary = c("un", "##aff", "##able")) 21 | #' wordpiece_encode(x = c("unaffable", "unaffableun", "unknown territory"), 22 | #' vocabulary = c("un", "##aff", "##able", "##un")) 23 | #' wordpiece_encode(x = c("unaffable", "unaffableun", "unknown territory"), 24 | #' vocabulary = c("un", "##aff", "##able", "##un"), 25 | #' type = "ids") 26 | wordpiece_encode <- function(x, vocabulary = character(), type = c("subwords", "ids"), unk_token = "[UNK]", max_input_chars_per_word = 100L){ 27 | type <- match.arg(type) 28 | max_input_chars_per_word <- as.integer(max_input_chars_per_word) 29 | unk_token <- as.character(unk_token) 30 | vocabulary <- as.character(vocabulary) 31 | x <- as.character(x) 32 | x <- trimws(x) 33 | x <- strsplit(x, " ") 34 | x <- lapply(x, FUN = function(terms){ 35 | subwords <- lapply(terms, FUN=function(term) wordpiece_encode_as_subwords(term, vocabulary, unk_token, max_input_chars_per_word)) 36 | subwords <- unlist(subwords, use.names = FALSE) 37 | subwords 38 | }) 39 | if(type == "ids"){ 40 | x <- lapply(x, FUN = function(x){ 41 | match(x, vocabulary) - 1L 42 | }) 43 | } 44 | x 45 | } 46 | 47 | -------------------------------------------------------------------------------- /src/sentencepiece/src/pretokenizer_for_training.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef PRETOKENIZER_FOR_TRAINING_H_ 16 | #define PRETOKENIZER_FOR_TRAINING_H_ 17 | 18 | #include 19 | #include 20 | 21 | #include "common.h" 22 | #include "sentencepiece.pb.h" 23 | #include "sentencepiece_processor.h" 24 | #include "third_party/absl/strings/string_view.h" 25 | 26 | namespace sentencepiece { 27 | namespace pretokenizer { 28 | 29 | class PretokenizerForTrainingInterface { 30 | public: 31 | PretokenizerForTrainingInterface() {} 32 | virtual ~PretokenizerForTrainingInterface() {} 33 | virtual util::Status status() const = 0; 34 | 35 | // Puts kUPPBoundaryStr before and after the pre-tokenizer's segmentation 36 | // when there are no spaces between these tokens. 37 | // Example1: 38 | // input: 東京です 39 | // segmentation: piece[0] = {0, 6}, piece[1] = {6, 12} 40 | // output: 東京です (here kUPPBoundaryStr is ) 41 | // 42 | // Example2: 43 | // input: I love sentencepiece 44 | // segmentation: piece[0] = {0, 1}, piece[1] = {2, 6}, 45 | // piece[2] = {7, 15}, piece[3] = {15, 20} 46 | // output: I love sentencepiece. 47 | std::string PreTokenize(absl::string_view text) const; 48 | 49 | // Returns pre-tokenized result. 50 | // Note that the pre-tokenized constraint is specified with the 51 | // byte offsets (SentencePiece::begin, SentencePiece::end) over 52 | // the input text. 
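  //
  // Illustrative note (not from the upstream header; offsets assume ASCII
  // input): for the input "I love", a whitespace pre-tokenizer would return
  // pieces with (begin, end) = (0, 1) and (2, 6); begin/end are byte
  // positions in the original input text, not character counts.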
53 | virtual SentencePieceText Tokenize(absl::string_view text) const = 0; 54 | 55 | private: 56 | static std::string Preprocess(absl::string_view text); 57 | static std::string Postprocess(const SentencePieceText &spt); 58 | }; 59 | 60 | } // namespace pretokenizer 61 | } // namespace sentencepiece 62 | 63 | #endif // PRETOKENIZER_FOR_TRAINING_H_ 64 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | ## CHANGES IN sentencepiece VERSION 0.2.4 2 | 3 | - Drop C++11 from Makevars 4 | - std::iterator replacement in src/third_party/protobuf-lite/google/protobuf/repeated_field.h as std::iterator is deprecated in C++17 5 | 6 | ## CHANGES IN sentencepiece VERSION 0.2.3 7 | 8 | - fix R CMD check warning due to change in version 0.2.2. 9 | - in third_party/protobuf-lite/strutil.cc:506:33: warning: argument to ‘sizeof’ in ‘int snprintf(char*, size_t, const char*, ...)’ call is the same expression as the destination; did you mean to provide an explicit length? [-Wsizeof-pointer-memaccess] 10 | - this part of third_party/protobuf-lite/strutil.cc was not used in sentencepiece 11 | 12 | ## CHANGES IN sentencepiece VERSION 0.2.2 13 | 14 | - use snprintf instead of sprintf to handle the R CMD check deprecating note on M1mac 15 | 16 | ## CHANGES IN sentencepiece VERSION 0.2.1 17 | 18 | - Fix for clang-UBSAN error 19 | 20 | ## CHANGES IN sentencepiece VERSION 0.2 21 | 22 | - Fix wordpiece bug for 1-character words. (@jonthegeek, #4) 23 | - Upgraded to sentencepiece release v0.1.96 24 | 25 | ## CHANGES IN sentencepiece VERSION 0.1.3 26 | 27 | - Fix wordpiece bug for 1-character words. (@jonthegeek, #4) 28 | - Fix Solaris installation issue related to incorrect usage of pointer as a function 29 | - Also download the binary model in sentencepiece_download_model as it can be loaded with word2vec::read.wordvectors 30 | - read_word2vec now uses word2vec::read.wordvectors from word2vec >= 0.2.0 31 | - added BPEembed and predict.BPEembed 32 | - allow subword regularisation by adding nbest and alpha option in sentencepiece_encode and changed sentencepiece_decode accordingly 33 | - Added txt_remove_ 34 | - Upgrade sentencepiece to release v0.1.91 commit a32d7dc6ce6f383a65ad6e1cbe1983f94ab11932 which has subword regularisation for BPE 35 | 36 | ## CHANGES IN sentencepiece VERSION 0.1.2 37 | 38 | - Fix Solaris installation issue which used log of uint64 which is not defined on Solaris 39 | 40 | ## CHANGES IN sentencepiece VERSION 0.1.1 41 | 42 | - Added verbose argument in sentencepiece 43 | 44 | ## CHANGES IN sentencepiece VERSION 0.1.0 45 | 46 | - Initial package based on https://github.com/google/sentencepiece release v0.1.84 commit 2424d82d396b43b2556203c592e48a621ef10f3c 47 | - Third-party code from https://github.com/google/sentencepiece/tree/master/third_party is put in src/absl, src/esaxx, src/darts_clone, src/protobuf-lite 48 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/str_replace.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_STR_REPLACE_H_ 17 | #define ABSL_STRINGS_STR_REPLACE_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/string_view.h" 22 | 23 | namespace absl { 24 | 25 | inline void StringReplace(absl::string_view s, absl::string_view oldsub, 26 | absl::string_view newsub, bool replace_all, 27 | std::string *res) { 28 | if (oldsub.empty()) { 29 | res->append(s.data(), s.size()); 30 | return; 31 | } 32 | 33 | absl::string_view::size_type start_pos = 0; 34 | do { 35 | const absl::string_view::size_type pos = s.find(oldsub, start_pos); 36 | if (pos == absl::string_view::npos) { 37 | break; 38 | } 39 | res->append(s.data() + start_pos, pos - start_pos); 40 | res->append(newsub.data(), newsub.size()); 41 | start_pos = pos + oldsub.size(); 42 | } while (replace_all); 43 | res->append(s.data() + start_pos, s.size() - start_pos); 44 | } 45 | 46 | inline std::string StringReplace(absl::string_view s, absl::string_view oldsub, 47 | absl::string_view newsub, bool replace_all) { 48 | std::string ret; 49 | StringReplace(s, oldsub, newsub, replace_all, &ret); 50 | return ret; 51 | } 52 | 53 | inline std::string StrReplaceAll( 54 | absl::string_view s, 55 | const std::vector> 56 | &patterns) { 57 | std::string prev(s.data(), s.size()); 58 | std::string result; 59 | for (const auto &it : patterns) { 60 | result.clear(); 61 | StringReplace(prev, it.first, it.second, true, &result); 62 | prev = result; 63 | } 64 | return result; 65 | } 66 | 67 | } // namespace absl 68 | #endif // ABSL_STRINGS_STR_REPLACE_H_ 69 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/stubs/once.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | #ifndef GOOGLE_PROTOBUF_STUBS_ONCE_H__ 32 | #define GOOGLE_PROTOBUF_STUBS_ONCE_H__ 33 | 34 | #include 35 | #include 36 | 37 | #include 38 | 39 | namespace google { 40 | namespace protobuf { 41 | namespace internal { 42 | 43 | using once_flag = std::once_flag; 44 | template 45 | void call_once(Args&&... args ) { 46 | std::call_once(std::forward(args)...); 47 | } 48 | 49 | } // namespace internal 50 | } // namespace protobuf 51 | } // namespace google 52 | 53 | #include 54 | 55 | #endif // GOOGLE_PROTOBUF_STUBS_ONCE_H__ 56 | -------------------------------------------------------------------------------- /src/sentencepiece/src/word_model_trainer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include 16 | #include 17 | 18 | #include "third_party/absl/container/flat_hash_map.h" 19 | #include "third_party/absl/strings/string_view.h" 20 | #include "util.h" 21 | #include "word_model.h" 22 | #include "word_model_trainer.h" 23 | 24 | namespace sentencepiece { 25 | namespace word { 26 | 27 | util::Status Trainer::Train() { 28 | RETURN_IF_ERROR(status()); 29 | 30 | CHECK_OR_RETURN(normalizer_spec_.escape_whitespaces()); 31 | CHECK_EQ_OR_RETURN(TrainerSpec::WORD, trainer_spec_.model_type()); 32 | 33 | RETURN_IF_ERROR(LoadSentences()); 34 | 35 | absl::flat_hash_map freq; 36 | for (const auto &it : sentences_) { 37 | for (const auto &s : SplitIntoWords(it.first)) { 38 | freq[std::string(s)] += it.second; 39 | } 40 | } 41 | 42 | const int vocab_size = trainer_spec_.vocab_size() - meta_pieces_.size(); 43 | CHECK_GE_OR_RETURN(vocab_size, 0); 44 | 45 | uint64 sum = 0; 46 | for (const auto &it : freq) { 47 | sum += it.second; 48 | } 49 | 50 | const auto logsum = static_cast(log(static_cast(sum))); 51 | 52 | CHECK_OR_RETURN(final_pieces_.empty()); 53 | for (const auto &it : Sorted(freq)) { 54 | if (it.first.find(kUNKStr) != std::string::npos) { 55 | continue; 56 | } 57 | if (!trainer_spec_.use_all_vocab() && 58 | final_pieces_.size() == static_cast(vocab_size)) { 59 | break; 60 | } 61 | final_pieces_.emplace_back(it.first, static_cast(log(static_cast(it.second))) - logsum); 62 | } 63 | 64 | if (trainer_spec_.use_all_vocab()) { 65 | trainer_spec_.set_vocab_size(final_pieces_.size() + meta_pieces_.size()); 66 | } 67 | 68 | return Save(); 69 | } 70 | } // namespace word 71 | } // namespace sentencepiece 72 | -------------------------------------------------------------------------------- /man/BPEembed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bpemb.R 3 | \name{BPEembed} 4 | \alias{BPEembed} 5 | \title{Tokenise and embed text alongside a Sentencepiece and Word2vec model} 6 | \usage{ 7 | BPEembed( 8 | file_sentencepiece = x$file_model, 9 | file_word2vec = x$glove.bin$file_model, 10 | x, 11 | normalize = TRUE 12 | ) 13 | } 14 | \arguments{ 15 | \item{file_sentencepiece}{the path to the file containing the sentencepiece model} 16 | 17 | \item{file_word2vec}{the path to the file containing the word2vec embeddings} 18 | 19 | \item{x}{the result of a call to \code{\link{sentencepiece_download_model}}. 20 | If this is provided, arguments \code{file_sentencepiece} and \code{file_word2vec} will not be used.} 21 | 22 | \item{normalize}{passed on to \code{\link[word2vec]{read.wordvectors}} to read in \code{file_word2vec}. 
Defaults to \code{TRUE}.} 23 | } 24 | \value{ 25 | an object of class BPEembed which is a list with elements 26 | \itemize{ 27 | \item{model: a sentencepiece model as loaded with \code{\link{sentencepiece_load_model}}} 28 | \item{embedding: a matrix with embeddings as loaded with \code{\link[word2vec]{read.wordvectors}}} 29 | \item{dim: the dimension of the embedding} 30 | \item{n: the number of elements in the vocabulary} 31 | \item{file_sentencepiece: the sentencepiece model file} 32 | \item{file_word2vec: the word2vec embedding file} 33 | } 34 | } 35 | \description{ 36 | Use a sentencepiece model to tokenise text and get the embeddings of these 37 | } 38 | \examples{ 39 | ## 40 | ## Example loading model from disk 41 | ## 42 | folder <- system.file(package = "sentencepiece", "models") 43 | embedding <- file.path(folder, "nl.wiki.bpe.vs1000.d25.w2v.bin") 44 | model <- file.path(folder, "nl.wiki.bpe.vs1000.model") 45 | encoder <- BPEembed(model, embedding) 46 | 47 | ## Do tokenisation with the sentencepiece model + embed these 48 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 49 | "On est d'accord sur le prix de la biere?") 50 | values <- predict(encoder, txt, type = "encode") 51 | str(values) 52 | values 53 | 54 | txt <- rownames(values[[1]]) 55 | predict(encoder, txt, type = "decode") 56 | txt <- lapply(values, FUN = rownames) 57 | predict(encoder, txt, type = "decode") 58 | } 59 | \seealso{ 60 | \code{\link{predict.BPEembed}}, \code{\link{sentencepiece_load_model}}, \code{\link{sentencepiece_download_model}}, \code{\link[word2vec]{read.wordvectors}} 61 | } 62 | -------------------------------------------------------------------------------- /man/predict.BPEembed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bpemb.R 3 | \name{predict.BPEembed} 4 | \alias{predict.BPEembed} 5 | \title{Encode and Decode alongside a BPEembed model} 6 | \usage{ 7 | \method{predict}{BPEembed}(object, newdata, type = c("encode", "decode", "tokenize"), ...) 
8 | } 9 | \arguments{ 10 | \item{object}{an object of class BPEembed as returned by \code{\link{BPEembed}}} 11 | 12 | \item{newdata}{a character vector of text to encode or a character vector of encoded tokens to decode or a list of those} 13 | 14 | \item{type}{character string, either 'encode', 'decode' or 'tokenize'} 15 | 16 | \item{...}{further arguments passed on to the methods} 17 | } 18 | \value{ 19 | \itemize{ 20 | \item{in case type is set to \code{'encode'}: a list of matrices containing embeddings of the text which is tokenised with \code{\link{sentencepiece_encode}}} 21 | \item{in case type is set to \code{'decode'}: a character vector of decoded text as returned by \code{\link{sentencepiece_decode}}} 22 | \item{in case type is set to \code{'tokenize'}: a tokenised \code{\link{sentencepiece_encode}}} 23 | } 24 | } 25 | \description{ 26 | Use the sentencepiece model to either 27 | \itemize{ 28 | \item{encode: tokenise and embed text} 29 | \item{decode: get the untokenised text back of tokenised data} 30 | \item{tokenize: only tokenize alongside the sentencepiece model} 31 | } 32 | } 33 | \examples{ 34 | embedding <- system.file(package = "sentencepiece", "models", 35 | "nl.wiki.bpe.vs1000.d25.w2v.bin") 36 | model <- system.file(package = "sentencepiece", "models", 37 | "nl.wiki.bpe.vs1000.model") 38 | encoder <- BPEembed(model, embedding) 39 | 40 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 41 | "On est d'accord sur le prix de la biere?") 42 | values <- predict(encoder, txt, type = "encode") 43 | str(values) 44 | values 45 | 46 | txt <- rownames(values[[1]]) 47 | predict(encoder, txt, type = "decode") 48 | txt <- lapply(values, FUN = rownames) 49 | predict(encoder, txt, type = "decode") 50 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 51 | "On est d'accord sur le prix de la biere?") 52 | predict(encoder, txt, type = "tokenize", "subwords") 53 | predict(encoder, txt, type = "tokenize", "ids") 54 | } 55 | \seealso{ 56 | \code{\link{BPEembed}}, \code{\link{sentencepiece_decode}}, \code{\link{sentencepiece_encode}} 57 | } 58 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/zero_copy_stream.cc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // Author: kenton@google.com (Kenton Varda) 32 | // Based on original Protocol Buffers design by 33 | // Sanjay Ghemawat, Jeff Dean, and others. 34 | 35 | #include 36 | 37 | #include 38 | #include 39 | 40 | namespace google { 41 | namespace protobuf { 42 | namespace io { 43 | 44 | 45 | bool ZeroCopyOutputStream::WriteAliasedRaw(const void* /* data */, 46 | int /* size */) { 47 | GOOGLE_LOG(FATAL) << "This ZeroCopyOutputStream doesn't support aliasing. " 48 | "Reaching here usually means a ZeroCopyOutputStream " 49 | "implementation bug."; 50 | return false; 51 | } 52 | 53 | } // namespace io 54 | } // namespace protobuf 55 | } // namespace google 56 | -------------------------------------------------------------------------------- /src/sentencepiece/src/freelist.h: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef FREELIST_H_ 16 | #define FREELIST_H_ 17 | 18 | #include 19 | 20 | #include 21 | 22 | namespace sentencepiece { 23 | namespace model { 24 | 25 | // Simple FreeList that allocates a chunk of T at once. 26 | template 27 | class FreeList { 28 | public: 29 | FreeList() = delete; 30 | explicit FreeList(size_t chunk_size) : chunk_size_(chunk_size) {} 31 | virtual ~FreeList() { 32 | for (auto& chunk : freelist_) delete[] chunk; 33 | } 34 | 35 | // `Free` doesn't free the object but reuse the allocated memory chunks. 36 | void Free() { 37 | const int size = std::min(chunk_index_ + 1, freelist_.size()); 38 | for (int i = 0; i < size; ++i) { 39 | T* chunk = freelist_[i]; 40 | memset(static_cast(chunk), 0, sizeof(*chunk) * chunk_size_); 41 | } 42 | chunk_index_ = 0; 43 | element_index_ = 0; 44 | } 45 | 46 | // Returns the number of allocated elements. 47 | size_t size() const { return chunk_size_ * chunk_index_ + element_index_; } 48 | 49 | // Returns the element as an array. 50 | T* operator[](size_t index) const { 51 | return freelist_[index / chunk_size_] + index % chunk_size_; 52 | } 53 | 54 | // Allocates new element. 
55 | T* Allocate() { 56 | if (element_index_ >= chunk_size_) { 57 | ++chunk_index_; 58 | element_index_ = 0; 59 | } 60 | 61 | if (chunk_index_ == freelist_.size()) { 62 | T* chunk = new T[chunk_size_]; 63 | memset(static_cast(chunk), 0, sizeof(*chunk) * chunk_size_); 64 | freelist_.push_back(chunk); 65 | } 66 | 67 | T* result = freelist_[chunk_index_] + element_index_; 68 | ++element_index_; 69 | 70 | return result; 71 | } 72 | 73 | private: 74 | std::vector freelist_; 75 | 76 | // The last element is stored at freelist_[chunk_index_][element_index_] 77 | size_t element_index_ = 0; 78 | size_t chunk_index_ = 0; 79 | const size_t chunk_size_ = 0; 80 | }; 81 | } // namespace model 82 | } // namespace sentencepiece 83 | #endif // FREELIST_H_ 84 | -------------------------------------------------------------------------------- /src/sentencepiece/src/word_model_trainer_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | #include 17 | 18 | #include "filesystem.h" 19 | #include "sentencepiece_processor.h" 20 | #include "testharness.h" 21 | #include "third_party/absl/strings/str_cat.h" 22 | #include "third_party/absl/strings/str_join.h" 23 | #include "util.h" 24 | #include "word_model_trainer.h" 25 | 26 | namespace sentencepiece { 27 | namespace word { 28 | namespace { 29 | 30 | // Space symbol (U+2581) 31 | #define WS "\xE2\x96\x81" 32 | 33 | std::string RunTrainer(const std::vector &input, int size) { 34 | const std::string input_file = 35 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "input"); 36 | const std::string model_prefix = 37 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "model"); 38 | { 39 | auto output = filesystem::NewWritableFile(input_file); 40 | for (const auto &line : input) { 41 | output->WriteLine(line); 42 | } 43 | } 44 | 45 | TrainerSpec trainer_spec; 46 | trainer_spec.set_model_type(TrainerSpec::WORD); 47 | trainer_spec.add_input(input_file); 48 | trainer_spec.set_vocab_size(size - 3); // remove , , 49 | trainer_spec.set_model_prefix(model_prefix); 50 | 51 | NormalizerSpec normalizer_spec; 52 | normalizer_spec.set_name("identity"); 53 | normalizer_spec.set_add_dummy_prefix(true); 54 | 55 | NormalizerSpec denormalizer_spec; 56 | 57 | Trainer trainer(trainer_spec, normalizer_spec, denormalizer_spec); 58 | EXPECT_TRUE(trainer.Train().ok()); 59 | 60 | SentencePieceProcessor processor; 61 | EXPECT_TRUE(processor.Load(model_prefix + ".model").ok()); 62 | 63 | const auto &model = processor.model_proto(); 64 | std::vector pieces; 65 | 66 | // remove , , 67 | for (int i = 3; i < model.pieces_size(); ++i) { 68 | pieces.emplace_back(model.pieces(i).piece()); 69 | } 70 | 71 | return absl::StrJoin(pieces, " "); 72 | } 73 | } // namespace 74 | 75 | TEST(TrainerTest, BasicTest) { 76 | EXPECT_EQ(WS "I " WS "apple " WS "have " WS "pen", 77 | RunTrainer({"I have a pen", "I have an apple", 
"apple pen"}, 10)); 78 | } 79 | } // namespace word 80 | } // namespace sentencepiece 81 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/str_join.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | #ifndef ABSL_STRINGS_STR_JOIN_H_ 17 | #define ABSL_STRINGS_STR_JOIN_H_ 18 | 19 | #include 20 | 21 | #include "third_party/absl/strings/string_view.h" 22 | 23 | namespace absl { 24 | namespace { 25 | template 26 | inline size_t Itoa(T val, char *s) { 27 | char *org = s; 28 | 29 | if (val < 0) { 30 | *s++ = '-'; 31 | val = -val; 32 | } 33 | char *t = s; 34 | 35 | T mod = 0; 36 | while (val) { 37 | mod = val % 10; 38 | *t++ = static_cast(mod) + '0'; 39 | val /= 10; 40 | } 41 | 42 | if (s == t) { 43 | *t++ = '0'; 44 | } 45 | 46 | *t = '\0'; 47 | std::reverse(s, t); 48 | return static_cast(t - org); 49 | } 50 | } // namespace 51 | 52 | inline std::string StrJoin(const std::vector &tokens, 53 | absl::string_view delim) { 54 | std::string result; 55 | if (!tokens.empty()) { 56 | result.append(tokens[0]); 57 | } 58 | for (size_t i = 1; i < tokens.size(); ++i) { 59 | result.append(delim.data(), delim.size()); 60 | result.append(tokens[i]); 61 | } 62 | return result; 63 | } 64 | 65 | inline std::string StrJoin(const std::vector &tokens, 66 | absl::string_view delim) { 67 | std::string result; 68 | if (!tokens.empty()) { 69 | result.append(tokens[0].data(), tokens[0].size()); 70 | } 71 | for (size_t i = 1; i < tokens.size(); ++i) { 72 | result.append(delim.data(), delim.size()); 73 | result.append(tokens[i].data(), tokens[i].size()); 74 | } 75 | return result; 76 | } 77 | 78 | inline std::string StrJoin(const std::vector &tokens, 79 | absl::string_view delim) { 80 | std::string result; 81 | char buf[32]; 82 | if (!tokens.empty()) { 83 | const size_t len = Itoa(tokens[0], buf); 84 | result.append(buf, len); 85 | } 86 | for (size_t i = 1; i < tokens.size(); ++i) { 87 | result.append(delim.data(), delim.size()); 88 | const size_t len = Itoa(tokens[i], buf); 89 | result.append(buf, len); 90 | } 91 | return result; 92 | } 93 | 94 | } // namespace absl 95 | #endif // ABSL_STRINGS_STR_CAT_H_ 96 | -------------------------------------------------------------------------------- /src/sentencepiece/src/sentencepiece.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | syntax = "proto2"; 16 | 17 | // TODO(taku): Needs to use LITE RUNTIME in OSS release. 18 | option optimize_for = LITE_RUNTIME; 19 | 20 | package sentencepiece; 21 | 22 | // SentencePieceText manages a user-facing source sentence, 23 | // postprocessed target sentence, and internal segmentation 24 | // with byte offsets. 25 | message SentencePieceText { 26 | message SentencePiece { 27 | // Internal representation for the decoder. 28 | // - Decoder can use |piece| as a basic token. 29 | // - the piece must be non-empty. 30 | // - A whitespace is replaced with a meta symbol. 31 | // - Concatenation of pieces is not always the same as the |text|. 32 | optional string piece = 1; 33 | 34 | // Vocabulary id. 35 | optional uint32 id = 2; 36 | 37 | // External representation for the client. 38 | // - It is always guaranteed that 39 | // text.substr(begin, end - begin) == surface. 40 | // - Concatenation of surface is always the same as the |text|. 41 | // - |surface| may contain whitespaces. 42 | // - |surface| may be empty if the piece encodes 43 | // a control vocabulary. e.g., , , . 44 | // - When |surface| is empty, always begin == end. (zero-length span). 45 | optional string surface = 3; 46 | 47 | optional uint32 begin = 4; 48 | optional uint32 end = 5; 49 | 50 | // Customized extensions: the range of field numbers 51 | // are open to third-party extensions. 52 | extensions 200 to max; 53 | } 54 | 55 | // User input or postprocessed text. This should be immutable 56 | // since the byte range in SentencePiece is pointing to a span over this 57 | // text. Meta symbols for whitespaces are not included. 58 | optional string text = 1; 59 | 60 | // A sequence of sentence pieces. 61 | repeated SentencePiece pieces = 2; 62 | 63 | // Score (usually log probability) for MultiSentencePieceText. 64 | optional float score = 3; 65 | 66 | // Customized extensions: the range of field numbers 67 | // are open to third-party extensions. 68 | extensions 200 to max; 69 | } 70 | 71 | message NBestSentencePieceText { 72 | repeated SentencePieceText nbests = 1; 73 | } 74 | -------------------------------------------------------------------------------- /src/sentencepiece/src/char_model_trainer_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include 16 | #include 17 | 18 | #include "char_model_trainer.h" 19 | #include "filesystem.h" 20 | #include "sentencepiece_processor.h" 21 | #include "testharness.h" 22 | #include "third_party/absl/strings/str_cat.h" 23 | #include "third_party/absl/strings/str_join.h" 24 | #include "util.h" 25 | 26 | namespace sentencepiece { 27 | namespace character { 28 | namespace { 29 | 30 | // Space symbol (U+2581) 31 | #define WS "\xE2\x96\x81" 32 | 33 | std::string RunTrainer(const std::vector &input, int size) { 34 | const std::string input_file = 35 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "input"); 36 | const std::string model_prefix = 37 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "model"); 38 | { 39 | auto output = filesystem::NewWritableFile(input_file); 40 | for (const auto &line : input) { 41 | output->WriteLine(line); 42 | } 43 | } 44 | 45 | TrainerSpec trainer_spec; 46 | trainer_spec.set_model_type(TrainerSpec::CHAR); 47 | trainer_spec.add_input(input_file); 48 | trainer_spec.set_vocab_size(size); 49 | trainer_spec.set_model_prefix(model_prefix); 50 | 51 | NormalizerSpec normalizer_spec; 52 | normalizer_spec.set_name("identity"); 53 | 54 | NormalizerSpec denormalizer_spec; 55 | 56 | Trainer trainer(trainer_spec, normalizer_spec, denormalizer_spec); 57 | EXPECT_TRUE(trainer.Train().ok()); 58 | 59 | SentencePieceProcessor processor; 60 | EXPECT_TRUE(processor.Load(model_prefix + ".model").ok()); 61 | 62 | const auto &model = processor.model_proto(); 63 | std::vector pieces; 64 | 65 | // remove , , 66 | for (int i = 3; i < model.pieces_size(); ++i) { 67 | pieces.emplace_back(model.pieces(i).piece()); 68 | } 69 | 70 | return absl::StrJoin(pieces, " "); 71 | } 72 | 73 | TEST(TrainerTest, BasicTest) { 74 | EXPECT_EQ(WS " a e p n I h l v", 75 | RunTrainer({"I have a pen", "I have an apple", "apple pen"}, 100)); 76 | EXPECT_EQ(WS " a", // , , , _, a 77 | RunTrainer({"I have a pen", "I have an apple", "apple pen"}, 5)); 78 | } 79 | 80 | } // namespace 81 | } // namespace character 82 | } // namespace sentencepiece 83 | -------------------------------------------------------------------------------- /src/third_party/absl/memory/memory.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | // ----------------------------------------------------------------------------- 17 | // File: string_view.h 18 | // ----------------------------------------------------------------------------- 19 | // 20 | // This file contains the definition of the `absl::string_view` class. A 21 | // `string_view` points to a contiguous span of characters, often part or all of 22 | // another `std::string`, double-quoted std::string literal, character array, or 23 | // even another `string_view`. 
24 | // 25 | // This `absl::string_view` abstraction is designed to be a drop-in 26 | // replacement for the C++17 `std::string_view` abstraction. 27 | #ifndef ABSL_MEMORY_MEMORY_H_ 28 | #define ABSL_MEMORY_MEMORY_H_ 29 | 30 | #include 31 | 32 | namespace absl { 33 | 34 | // Trait to select overloads and return types for MakeUnique. 35 | template 36 | struct MakeUniqueResult { 37 | using scalar = std::unique_ptr; 38 | }; 39 | template 40 | struct MakeUniqueResult { 41 | using array = std::unique_ptr; 42 | }; 43 | template 44 | struct MakeUniqueResult { 45 | using invalid = void; 46 | }; 47 | 48 | // MakeUnique(...) is an early implementation of C++14 std::make_unique. 49 | // It is designed to be 100% compatible with std::make_unique so that the 50 | // eventual switchover will be a simple renaming operation. 51 | template 52 | typename MakeUniqueResult::scalar make_unique(Args &&... args) { // NOLINT 53 | return std::unique_ptr( 54 | new T(std::forward(args)...)); // NOLINT(build/c++11) 55 | } 56 | 57 | // Overload for array of unknown bound. 58 | // The allocation of arrays needs to use the array form of new, 59 | // and cannot take element constructor arguments. 60 | template 61 | typename MakeUniqueResult::array make_unique(size_t n) { 62 | return std::unique_ptr(new typename std::remove_extent::type[n]()); 63 | } 64 | 65 | // Reject arrays of known bound. 66 | template 67 | typename MakeUniqueResult::invalid make_unique(Args &&... /* args */) = 68 | delete; // NOLINT 69 | 70 | } // namespace absl 71 | #endif // ABSL_MEMORY_MEMORY_H_ 72 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/implicit_weak_message.cc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | #include 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include 39 | 40 | namespace google { 41 | namespace protobuf { 42 | namespace internal { 43 | 44 | const char* ImplicitWeakMessage::_InternalParse(const char* ptr, 45 | ParseContext* ctx) { 46 | return ctx->AppendString(ptr, &data_); 47 | } 48 | 49 | ExplicitlyConstructed 50 | implicit_weak_message_default_instance; 51 | internal::once_flag implicit_weak_message_once_init_; 52 | 53 | void InitImplicitWeakMessageDefaultInstance() { 54 | implicit_weak_message_default_instance.DefaultConstruct(); 55 | } 56 | 57 | const ImplicitWeakMessage* ImplicitWeakMessage::default_instance() { 58 | internal::call_once(implicit_weak_message_once_init_, 59 | InitImplicitWeakMessageDefaultInstance); 60 | return &implicit_weak_message_default_instance.get(); 61 | } 62 | 63 | } // namespace internal 64 | } // namespace protobuf 65 | } // namespace google 66 | -------------------------------------------------------------------------------- /src/sentencepiece/src/word_model_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | 15 | #include 16 | 17 | #include "sentencepiece_model.pb.h" 18 | #include "testharness.h" 19 | #include "util.h" 20 | #include "word_model.h" 21 | 22 | namespace sentencepiece { 23 | namespace word { 24 | namespace { 25 | 26 | // Space symbol (U+2581) 27 | #define WS "\xe2\x96\x81" 28 | 29 | ModelProto MakeBaseModelProto() { 30 | ModelProto model_proto; 31 | auto *sp1 = model_proto.add_pieces(); 32 | auto *sp2 = model_proto.add_pieces(); 33 | auto *sp3 = model_proto.add_pieces(); 34 | 35 | sp1->set_type(ModelProto::SentencePiece::UNKNOWN); 36 | sp1->set_piece(""); 37 | sp2->set_type(ModelProto::SentencePiece::CONTROL); 38 | sp2->set_piece(""); 39 | sp3->set_type(ModelProto::SentencePiece::CONTROL); 40 | sp3->set_piece(""); 41 | 42 | return model_proto; 43 | } 44 | 45 | void AddPiece(ModelProto *model_proto, const std::string &piece, 46 | float score = 0.0) { 47 | auto *sp = model_proto->add_pieces(); 48 | sp->set_piece(piece); 49 | sp->set_score(score); 50 | } 51 | 52 | TEST(WordModelTest, EncodeTest) { 53 | ModelProto model_proto = MakeBaseModelProto(); 54 | 55 | AddPiece(&model_proto, WS "ab"); 56 | AddPiece(&model_proto, WS "cd"); 57 | AddPiece(&model_proto, WS "abc"); 58 | AddPiece(&model_proto, WS "a", 0.1); 59 | AddPiece(&model_proto, WS "b", 0.2); 60 | AddPiece(&model_proto, WS "c", 0.3); 61 | AddPiece(&model_proto, WS "d", 0.4); 62 | 63 | const Model model(model_proto); 64 | 65 | EncodeResult result; 66 | 67 | result = model.Encode(""); 68 | EXPECT_TRUE(result.empty()); 69 | 70 | result = model.Encode(WS "a" WS "b" WS "c"); 71 | EXPECT_EQ(3, result.size()); 72 | EXPECT_EQ(WS "a", result[0].first); 73 | EXPECT_EQ(WS "b", result[1].first); 74 | EXPECT_EQ(WS "c", result[2].first); 75 | 76 | result = model.Encode(WS "ab" WS "cd" WS "abc"); 77 | EXPECT_EQ(3, result.size()); 78 | EXPECT_EQ(WS "ab", result[0].first); 79 | EXPECT_EQ(WS "cd", result[1].first); 80 | EXPECT_EQ(WS "abc", result[2].first); 81 | } 82 | 83 | TEST(WordModelTest, NotSupportedTest) { 84 | ModelProto model_proto = MakeBaseModelProto(); 85 | const Model model(model_proto); 86 | EXPECT_EQ(NBestEncodeResult(), model.NBestEncode("test", 10)); 87 | EXPECT_EQ(EncodeResult(), model.SampleEncode("test", 0.1)); 88 | } 89 | 90 | } // namespace 91 | } // namespace word 92 | } // namespace sentencepiece 93 | -------------------------------------------------------------------------------- /src/third_party/absl/strings/str_split.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2017 The Abseil Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // 16 | #ifndef ABSL_STRINGS_STR_SPLIT_H_ 17 | #define ABSL_STRINGS_STR_SPLIT_H_ 18 | 19 | #include 20 | #include 21 | 22 | #include "third_party/absl/strings/string_view.h" 23 | 24 | namespace absl { 25 | namespace internal { 26 | 27 | class Splitter { 28 | public: 29 | Splitter(absl::string_view str, absl::string_view delim, bool allow_empty) { 30 | size_t current_pos = 0; 31 | size_t found_pos = 0; 32 | while ((found_pos = str.find_first_of(delim, current_pos)) != 33 | absl::string_view::npos) { 34 | if ((allow_empty && found_pos >= current_pos) || 35 | (!allow_empty && found_pos > current_pos)) { 36 | result_.push_back(str.substr(current_pos, found_pos - current_pos)); 37 | } 38 | current_pos = found_pos + 1; 39 | } 40 | if (str.size() > current_pos) { 41 | result_.push_back(str.substr(current_pos, str.size() - current_pos)); 42 | } 43 | } 44 | template 45 | operator std::vector() const; 46 | 47 | using const_iterator = std::vector::const_iterator; 48 | const_iterator begin() const { return result_.begin(); } 49 | const_iterator end() const { return result_.end(); } 50 | 51 | private: 52 | std::vector result_; 53 | }; 54 | 55 | template <> 56 | inline Splitter::operator std::vector() const { 57 | std::vector x(result_.size()); 58 | for (size_t i = 0; i < x.size(); ++i) 59 | x[i].assign(result_[i].data(), result_[i].size()); 60 | return x; 61 | } 62 | 63 | template <> 64 | inline Splitter::operator std::vector() const { 65 | return result_; 66 | } 67 | } // namespace internal 68 | 69 | inline constexpr bool AllowEmpty() { return true; }; 70 | 71 | inline internal::Splitter StrSplit(absl::string_view str, 72 | absl::string_view delim, 73 | bool allow_empty = false) { 74 | return internal::Splitter(str, delim, allow_empty); 75 | } 76 | 77 | inline internal::Splitter StrSplit(absl::string_view str, const char c, 78 | bool allow_empty = false) { 79 | char delim[2]; 80 | delim[0] = c; 81 | delim[1] = '\0'; 82 | return internal::Splitter(str, delim, allow_empty); 83 | } 84 | 85 | } // namespace absl 86 | #endif // ABSL_STRINGS_STR_SPLIT_H_ 87 | -------------------------------------------------------------------------------- /src/sentencepiece/src/pretokenizer_for_training_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
14 | #include "pretokenizer_for_training.h" 15 | #include "testharness.h" 16 | #include "third_party/absl/strings/str_cat.h" 17 | #include "trainer_interface.h" 18 | 19 | namespace sentencepiece { 20 | namespace pretokenizer { 21 | 22 | class MockPretokenizer : public PretokenizerForTrainingInterface { 23 | public: 24 | MockPretokenizer() {} 25 | ~MockPretokenizer() {} 26 | 27 | SentencePieceText Tokenize(absl::string_view text) const override { 28 | return spt_; 29 | } 30 | 31 | util::Status status() const override { return util::OkStatus(); } 32 | 33 | void SetOutput(const SentencePieceText &spt) { spt_ = spt; } 34 | 35 | private: 36 | SentencePieceText spt_; 37 | }; 38 | 39 | TEST(PretokenizerForTrainingTest, BaseTest) { 40 | MockPretokenizer mock; 41 | 42 | { 43 | SentencePieceText spt; 44 | spt.set_text("I love sentencepiece"); 45 | auto *p1 = spt.add_pieces(); 46 | p1->set_surface("I"); 47 | p1->set_begin(0); 48 | p1->set_end(1); 49 | 50 | auto *p2 = spt.add_pieces(); 51 | p2->set_surface("love"); 52 | p2->set_begin(2); 53 | p2->set_end(6); 54 | 55 | auto *p3 = spt.add_pieces(); 56 | p3->set_surface("sentence"); 57 | p3->set_begin(7); 58 | p3->set_end(15); 59 | 60 | auto *p4 = spt.add_pieces(); 61 | p4->set_surface("piece"); 62 | p4->set_begin(15); 63 | p4->set_end(20); 64 | 65 | mock.SetOutput(spt); 66 | 67 | EXPECT_EQ(absl::StrCat("I", TrainerInterface::kWSStr, "love", 68 | TrainerInterface::kWSStr, "sentence\tpiece"), 69 | mock.PreTokenize("I love sentencepiece")); 70 | } 71 | 72 | { 73 | SentencePieceText spt; 74 | spt.set_text("これはペンです"); 75 | auto *p1 = spt.add_pieces(); 76 | p1->set_surface("これ"); 77 | p1->set_begin(0); 78 | p1->set_end(6); 79 | 80 | auto *p2 = spt.add_pieces(); 81 | p2->set_surface("は"); 82 | p2->set_begin(6); 83 | p2->set_end(9); 84 | 85 | auto *p3 = spt.add_pieces(); 86 | p3->set_surface("ペン"); 87 | p3->set_begin(9); 88 | p3->set_end(15); 89 | 90 | auto *p4 = spt.add_pieces(); 91 | p4->set_surface("です"); 92 | p4->set_begin(15); 93 | p4->set_end(21); 94 | 95 | mock.SetOutput(spt); 96 | 97 | EXPECT_EQ("これ\tは\tペン\tです", mock.PreTokenize("これはペンです")); 98 | } 99 | } 100 | 101 | } // namespace pretokenizer 102 | } // namespace sentencepiece 103 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_LIBS = -pthread 2 | PKG_CPPFLAGS = -D HAVE_PTHREAD=1 -pthread -D_USE_INTERNAL_STRING_VIEW -DSTRICT_R_HEADERS -I. 
-I./sentencepiece -I./sentencepiece/src -I./sentencepiece/src/builtin_pb -I./third_party/absl -I./third_party/darts_clone -I./third_party/esaxx -I./third_party/protobuf-lite 3 | 4 | SOURCES = third_party/protobuf-lite/arena.cc \ 5 | third_party/protobuf-lite/arenastring.cc \ 6 | third_party/protobuf-lite/bytestream.cc \ 7 | third_party/protobuf-lite/coded_stream.cc \ 8 | third_party/protobuf-lite/common.cc \ 9 | third_party/protobuf-lite/extension_set.cc \ 10 | third_party/protobuf-lite/generated_enum_util.cc \ 11 | third_party/protobuf-lite/generated_message_table_driven_lite.cc \ 12 | third_party/protobuf-lite/generated_message_util.cc \ 13 | third_party/protobuf-lite/implicit_weak_message.cc \ 14 | third_party/protobuf-lite/int128.cc \ 15 | third_party/protobuf-lite/io_win32.cc \ 16 | third_party/protobuf-lite/message_lite.cc \ 17 | third_party/protobuf-lite/parse_context.cc \ 18 | third_party/protobuf-lite/repeated_field.cc \ 19 | third_party/protobuf-lite/status.cc \ 20 | third_party/protobuf-lite/statusor.cc \ 21 | third_party/protobuf-lite/stringpiece.cc \ 22 | third_party/protobuf-lite/stringprintf.cc \ 23 | third_party/protobuf-lite/structurally_valid.cc \ 24 | third_party/protobuf-lite/strutil.cc \ 25 | third_party/protobuf-lite/time.cc \ 26 | third_party/protobuf-lite/wire_format_lite.cc \ 27 | third_party/protobuf-lite/zero_copy_stream.cc \ 28 | third_party/protobuf-lite/zero_copy_stream_impl.cc \ 29 | third_party/protobuf-lite/zero_copy_stream_impl_lite.cc \ 30 | third_party/absl/strings/string_view.cc \ 31 | third_party/absl/flags/flag.cc \ 32 | sentencepiece/src/builtin_pb/sentencepiece.pb.cc \ 33 | sentencepiece/src/builtin_pb/sentencepiece_model.pb.cc \ 34 | sentencepiece/src/bpe_model.cc \ 35 | sentencepiece/src/bpe_model_trainer.cc \ 36 | sentencepiece/src/builder.cc \ 37 | sentencepiece/src/char_model.cc \ 38 | sentencepiece/src/char_model_trainer.cc \ 39 | sentencepiece/src/error.cc \ 40 | sentencepiece/src/filesystem.cc \ 41 | sentencepiece/src/model_factory.cc \ 42 | sentencepiece/src/model_interface.cc \ 43 | sentencepiece/src/normalizer.cc \ 44 | sentencepiece/src/pretokenizer_for_training.cc \ 45 | sentencepiece/src/sentencepiece_processor.cc \ 46 | sentencepiece/src/sentencepiece_trainer.cc \ 47 | sentencepiece/src/trainer_factory.cc \ 48 | sentencepiece/src/trainer_interface.cc \ 49 | sentencepiece/src/unicode_script.cc \ 50 | sentencepiece/src/unigram_model.cc \ 51 | sentencepiece/src/unigram_model_trainer.cc \ 52 | sentencepiece/src/util.cc \ 53 | sentencepiece/src/word_model.cc \ 54 | sentencepiece/src/word_model_trainer.cc \ 55 | rcpp_sentencepiece.cpp \ 56 | rcpp_wordpiece.cpp \ 57 | RcppExports.cpp 58 | 59 | OBJ = $(SOURCES:.cc=.o) 60 | OBJECTS = $(OBJ:.cpp=.o) 61 | 62 | .PHONY: all 63 | 64 | all: $(SHLIB); rm -f $(OBJECTS) 65 | #all: $(SHLIB); rm -f rcpp_wordpiece.o; rm -f rcpp_sentencepiece.o; rm -f RcppExports.o 66 | #all: $(SHLIB); rm -f third_party/protobuf-lite/repeated_field.o; 67 | 68 | -------------------------------------------------------------------------------- /src/sentencepiece/src/unigram_model_trainer_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "sentencepiece_model.pb.h" 16 | #include "sentencepiece_processor.h" 17 | #include "sentencepiece_trainer.h" 18 | #include "testharness.h" 19 | #include "third_party/absl/strings/str_cat.h" 20 | #include "third_party/absl/strings/str_join.h" 21 | #include "unigram_model_trainer.h" 22 | #include "util.h" 23 | 24 | namespace sentencepiece { 25 | namespace unigram { 26 | namespace { 27 | 28 | // Space symbol 29 | #define WS "\xe2\x96\x81" 30 | 31 | TEST(UnigramTrainerTest, TrainerModelTest) { 32 | TrainerSpec trainer_spec; 33 | NormalizerSpec normalizer_spec; 34 | const TrainerModel model(trainer_spec, normalizer_spec); 35 | EXPECT_EQ(EncodeResult(), model.Encode("test")); 36 | } 37 | 38 | static constexpr char kTestInputData[] = "wagahaiwa_nekodearu.txt"; 39 | 40 | TEST(UnigramTrainerTest, EndToEndTest) { 41 | const std::string input = 42 | util::JoinPath(absl::GetFlag(FLAGS_test_srcdir), kTestInputData); 43 | 44 | ASSERT_TRUE( 45 | SentencePieceTrainer::Train( 46 | absl::StrCat( 47 | "--model_prefix=", 48 | util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "tmp_model"), 49 | " --input=", input, 50 | " --vocab_size=8000 --normalization_rule_name=identity", 51 | " --model_type=unigram --user_defined_symbols=", 52 | " --control_symbols= --max_sentence_length=2048")) 53 | .ok()); 54 | 55 | SentencePieceProcessor sp; 56 | EXPECT_TRUE(sp.Load(util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), 57 | "tmp_model.model")) 58 | .ok()); 59 | EXPECT_EQ(8000, sp.GetPieceSize()); 60 | 61 | const int cid = sp.PieceToId(""); 62 | const int uid = sp.PieceToId(""); 63 | EXPECT_TRUE(sp.IsControl(cid)); 64 | EXPECT_FALSE(sp.IsUnknown(uid)); 65 | 66 | std::vector tok; 67 | 68 | EXPECT_TRUE(sp.Encode("", &tok).ok()); 69 | EXPECT_TRUE(tok.empty()); 70 | 71 | EXPECT_TRUE(sp.Encode("吾輩《わがはい》は猫である。名前はまだ無い。" 72 | "どこで生れたかとんと見当《けんとう》がつかぬ。" 73 | "何でも薄暗いじめじめした所でニャーニャー泣いていた事だ" 74 | "けは記憶している" 75 | "。", 76 | &tok) 77 | .ok()); 78 | // TODO(taku): Temporally disable this test on Windows. 79 | #ifndef OS_WIN 80 | EXPECT_EQ(WS 81 | " 吾輩 《 わが はい 》 は 猫 である 。 名前 はまだ 無い 。 " 82 | "どこ で 生 れた か とん と 見当 《 けん とう 》 が つか ぬ 。 " 83 | "何でも 薄 暗 い じめ じめ した 所で ニャーニャー " 84 | "泣 い ていた 事 だけは 記憶 している 。", 85 | absl::StrJoin(tok, " ")); 86 | #endif 87 | } 88 | 89 | } // namespace 90 | } // namespace unigram 91 | } // namespace sentencepiece 92 | -------------------------------------------------------------------------------- /.github/workflows/rhub.yaml: -------------------------------------------------------------------------------- 1 | # R-hub's generic GitHub Actions workflow file. It's canonical location is at 2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml 3 | # You can update this file to a newer version using the rhub2 package: 4 | # 5 | # rhub::rhub_setup() 6 | # 7 | # It is unlikely that you need to modify this file manually. 
8 | 9 | name: R-hub 10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" 11 | 12 | on: 13 | workflow_dispatch: 14 | inputs: 15 | config: 16 | description: 'A comma separated list of R-hub platforms to use.' 17 | type: string 18 | default: 'linux,windows,macos,clang-asan,clang-ubsan,gcc-asan,nold,rchk,ubuntu-clang,valgrind' 19 | name: 20 | description: 'Run name. You can leave this empty now.' 21 | type: string 22 | id: 23 | description: 'Unique ID. You can leave this empty now.' 24 | type: string 25 | 26 | jobs: 27 | 28 | setup: 29 | runs-on: ubuntu-latest 30 | outputs: 31 | containers: ${{ steps.rhub-setup.outputs.containers }} 32 | platforms: ${{ steps.rhub-setup.outputs.platforms }} 33 | 34 | steps: 35 | # NO NEED TO CHECKOUT HERE 36 | - uses: r-hub/actions/setup@v1 37 | with: 38 | config: ${{ github.event.inputs.config }} 39 | id: rhub-setup 40 | 41 | linux-containers: 42 | needs: setup 43 | if: ${{ needs.setup.outputs.containers != '[]' }} 44 | runs-on: ubuntu-latest 45 | name: ${{ matrix.config.label }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | config: ${{ fromJson(needs.setup.outputs.containers) }} 50 | container: 51 | image: ${{ matrix.config.container }} 52 | 53 | steps: 54 | - uses: r-hub/actions/checkout@v1 55 | - uses: r-hub/actions/platform-info@v1 56 | with: 57 | token: ${{ secrets.RHUB_TOKEN }} 58 | job-config: ${{ matrix.config.job-config }} 59 | - uses: r-hub/actions/setup-deps@v1 60 | with: 61 | token: ${{ secrets.RHUB_TOKEN }} 62 | job-config: ${{ matrix.config.job-config }} 63 | - uses: r-hub/actions/run-check@v1 64 | with: 65 | token: ${{ secrets.RHUB_TOKEN }} 66 | job-config: ${{ matrix.config.job-config }} 67 | 68 | other-platforms: 69 | needs: setup 70 | if: ${{ needs.setup.outputs.platforms != '[]' }} 71 | runs-on: ${{ matrix.config.os }} 72 | name: ${{ matrix.config.label }} 73 | strategy: 74 | fail-fast: false 75 | matrix: 76 | config: ${{ fromJson(needs.setup.outputs.platforms) }} 77 | 78 | steps: 79 | - uses: r-hub/actions/checkout@v1 80 | - uses: r-hub/actions/setup-r@v1 81 | with: 82 | job-config: ${{ matrix.config.job-config }} 83 | token: ${{ secrets.RHUB_TOKEN }} 84 | - uses: r-hub/actions/platform-info@v1 85 | with: 86 | token: ${{ secrets.RHUB_TOKEN }} 87 | job-config: ${{ matrix.config.job-config }} 88 | - uses: r-hub/actions/setup-deps@v1 89 | with: 90 | job-config: ${{ matrix.config.job-config }} 91 | token: ${{ secrets.RHUB_TOKEN }} 92 | - uses: r-hub/actions/run-check@v1 93 | with: 94 | job-config: ${{ matrix.config.job-config }} 95 | token: ${{ secrets.RHUB_TOKEN }} 96 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/stubs/stl_util.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 
11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // from google3/util/gtl/stl_util.h 32 | 33 | #ifndef GOOGLE_PROTOBUF_STUBS_STL_UTIL_H__ 34 | #define GOOGLE_PROTOBUF_STUBS_STL_UTIL_H__ 35 | 36 | #include 37 | 38 | namespace google { 39 | namespace protobuf { 40 | 41 | // Inside Google, this function implements a horrible, disgusting hack in which 42 | // we reach into the string's private implementation and resize it without 43 | // initializing the new bytes. In some cases doing this can significantly 44 | // improve performance. However, since it's totally non-portable it has no 45 | // place in open source code. Feel free to fill this function in with your 46 | // own disgusting hack if you want the perf boost. 47 | inline void STLStringResizeUninitialized(std::string* s, size_t new_size) { 48 | s->resize(new_size); 49 | } 50 | 51 | // Return a mutable char* pointing to a string's internal buffer, 52 | // which may not be null-terminated. Writing through this pointer will 53 | // modify the string. 54 | // 55 | // string_as_array(&str)[i] is valid for 0 <= i < str.size() until the 56 | // next call to a string method that invalidates iterators. 57 | // 58 | // As of 2006-04, there is no standard-blessed way of getting a 59 | // mutable reference to a string's internal buffer. However, issue 530 60 | // (http://www.open-std.org/JTC1/SC22/WG21/docs/lwg-active.html#530) 61 | // proposes this as the method. According to Matt Austern, this should 62 | // already work on all current implementations. 63 | inline char* string_as_array(std::string* str) { 64 | // DO NOT USE const_cast(str->data())! See the unittest for why. 65 | return str->empty() ? 
nullptr : &*str->begin(); 66 | } 67 | 68 | } // namespace protobuf 69 | } // namespace google 70 | 71 | #endif // GOOGLE_PROTOBUF_STUBS_STL_UTIL_H__ 72 | -------------------------------------------------------------------------------- /inst/spc-help/spm_train: -------------------------------------------------------------------------------- 1 | sentencepiece 2 | 3 | Usage: spm_train [options] files 4 | 5 | --accept_language (comma-separated list of languages this model can accept) type: string default: 6 | --add_dummy_prefix (Add dummy whitespace at the beginning of text) type: bool default: true 7 | --bos_id (Override BOS (<s>) id. Set -1 to disable BOS.) type: int32 default: 1 8 | --bos_piece (Override BOS (<s>) piece.) type: string default: <s> 9 | --character_coverage (character coverage to determine the minimum symbols) type: double default: 0.9995 10 | --control_symbols (comma separated list of control symbols) type: string default: 11 | --eos_id (Override EOS (</s>) id. Set -1 to disable EOS.) type: int32 default: 2 12 | --eos_piece (Override EOS (</s>) piece.) type: string default: </s> 13 | --hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true 14 | --input (comma separated list of input sentences) type: string default: 15 | --input_format (Input format. Supported format is `text` or `tsv`.) type: string default: 16 | --input_sentence_size (maximum size of sentences the trainer loads) type: int32 default: 0 17 | --max_sentence_length (maximum length of sentence in byte) type: int32 default: 4192 18 | --max_sentencepiece_length (maximum length of sentence piece) type: int32 default: 16 19 | --model_prefix (output model prefix) type: string default: 20 | --model_type (model algorithm: unigram, bpe, word or char) type: string default: unigram 21 | --normalization_rule_name (Normalization rule name. Choose from nfkc or identity) type: string default: nmt_nfkc 22 | --normalization_rule_tsv (Normalization rule TSV file. ) type: string default: 23 | --num_sub_iterations (number of EM sub-iterations) type: int32 default: 2 24 | --num_threads (number of threads for training) type: int32 default: 16 25 | --pad_id (Override PAD (<pad>) id. Set -1 to disable PAD.) type: int32 default: -1 26 | --pad_piece (Override PAD (<pad>) piece.) type: string default: <pad> 27 | --remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace) type: bool default: true 28 | --seed_sentencepiece_size (the size of seed sentencepieces) type: int32 default: 1000000 29 | --self_test_sample_size (the size of self test samples) type: int32 default: 0 30 | --shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss) type: double default: 0.75 31 | --shuffle_input_sentence (Randomly sample input sentences in advance. Valid when --input_sentence_size > 0) type: bool default: true 32 | --split_by_number (split tokens by numbers (0-9)) type: bool default: true 33 | --split_by_unicode_script (use Unicode script to split sentence pieces) type: bool default: true 34 | --split_by_whitespace (use a white space to split sentence pieces) type: bool default: true 35 | --treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.) type: bool default: false 36 | --unk_id (Override UNK (<unk>) id.) type: int32 default: 0 37 | --unk_piece (Override UNK (<unk>) piece.) type: string default: <unk> 38 | --unk_surface (Dummy surface string for <unk>. In decoding <unk> is decoded to `unk_surface`.) type: string default: ⁇ 39 | --use_all_vocab (If set to true, use all tokens as vocab.
Valid for word/char models.) type: bool default: false 40 | --user_defined_symbols (comma separated list of user defined symbols) type: string default: 41 | --vocab_size (vocabulary size) type: int32 default: 8000 42 | 43 | 44 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/generated_enum_util.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | #ifndef GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ 32 | #define GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ 33 | 34 | #include 35 | 36 | #include 37 | #include 38 | 39 | #include 40 | 41 | #ifdef SWIG 42 | #error "You cannot SWIG proto headers" 43 | #endif 44 | 45 | namespace google { 46 | namespace protobuf { 47 | 48 | // This type trait can be used to cause templates to only match proto2 enum 49 | // types. 50 | template 51 | struct is_proto_enum : ::std::false_type {}; 52 | 53 | namespace internal { 54 | 55 | // The table entry format for storing enum name-to-value mapping used with lite 56 | // protos. This struct and the following related functions should only be used 57 | // by protobuf generated code. 58 | struct EnumEntry { 59 | StringPiece name; 60 | int value; 61 | }; 62 | 63 | // Looks up a numeric enum value given the string name. 64 | PROTOBUF_EXPORT bool LookUpEnumValue(const EnumEntry* enums, size_t size, 65 | StringPiece name, int* value); 66 | 67 | // Looks up an enum name given the numeric value. 68 | PROTOBUF_EXPORT int LookUpEnumName(const EnumEntry* enums, 69 | const int* sorted_indices, size_t size, 70 | int value); 71 | 72 | // Initializes the list of enum names in std::string form. 
73 | PROTOBUF_EXPORT bool InitializeEnumStrings( 74 | const EnumEntry* enums, const int* sorted_indices, size_t size, 75 | internal::ExplicitlyConstructed* enum_strings); 76 | 77 | } // namespace internal 78 | } // namespace protobuf 79 | } // namespace google 80 | 81 | #include 82 | 83 | #endif // GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ 84 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/stubs/time.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | #ifndef GOOGLE_PROTOBUF_STUBS_TIME_H_ 31 | #define GOOGLE_PROTOBUF_STUBS_TIME_H_ 32 | 33 | #include 34 | 35 | #include 36 | 37 | namespace google { 38 | namespace protobuf { 39 | namespace internal { 40 | 41 | struct DateTime { 42 | int year; 43 | int month; 44 | int day; 45 | int hour; 46 | int minute; 47 | int second; 48 | }; 49 | 50 | // Converts a timestamp (seconds elapsed since 1970-01-01T00:00:00, could be 51 | // negative to represent time before 1970-01-01) to DateTime. Returns false 52 | // if the timestamp is not in the range between 0001-01-01T00:00:00 and 53 | // 9999-12-31T23:59:59. 54 | bool PROTOBUF_EXPORT SecondsToDateTime(int64 seconds, DateTime* time); 55 | // Converts DateTime to a timestamp (seconds since 1970-01-01T00:00:00). 56 | // Returns false if the DateTime is not valid or is not in the valid range. 57 | bool PROTOBUF_EXPORT DateTimeToSeconds(const DateTime& time, int64* seconds); 58 | 59 | void PROTOBUF_EXPORT GetCurrentTime(int64* seconds, int32* nanos); 60 | 61 | // Formats a time string in RFC3339 format. 62 | // 63 | // For example, "2015-05-20T13:29:35.120Z". 
For nanos, 0, 3, 6 or 9 fractional 64 | // digits will be used depending on how many are required to represent the exact 65 | // value. 66 | // 67 | // Note that "nanos" must in the range of [0, 999999999]. 68 | std::string PROTOBUF_EXPORT FormatTime(int64 seconds, int32 nanos); 69 | // Parses a time string. This method accepts RFC3339 date/time string with UTC 70 | // offset. For example, "2015-05-20T13:29:35.120-08:00". 71 | bool PROTOBUF_EXPORT ParseTime(const std::string& value, int64* seconds, 72 | int32* nanos); 73 | 74 | } // namespace internal 75 | } // namespace protobuf 76 | } // namespace google 77 | 78 | #include 79 | 80 | #endif // GOOGLE_PROTOBUF_STUBS_TIME_H_ 81 | -------------------------------------------------------------------------------- /src/sentencepiece/src/unicode_script.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef UNICODE_SCRIPT_H_ 16 | #define UNICODE_SCRIPT_H_ 17 | 18 | #include "common.h" 19 | 20 | namespace sentencepiece { 21 | namespace unicode_script { 22 | enum ScriptType : int32_t { 23 | U_Adlam, 24 | U_Ahom, 25 | U_Anatolian_Hieroglyphs, 26 | U_Arabic, 27 | U_Armenian, 28 | U_Avestan, 29 | U_Balinese, 30 | U_Bamum, 31 | U_Bassa_Vah, 32 | U_Batak, 33 | U_Bengali, 34 | U_Bhaiksuki, 35 | U_Bopomofo, 36 | U_Brahmi, 37 | U_Braille, 38 | U_Buginese, 39 | U_Buhid, 40 | U_Canadian_Aboriginal, 41 | U_Carian, 42 | U_Caucasian_Albanian, 43 | U_Chakma, 44 | U_Cham, 45 | U_Cherokee, 46 | U_Common, 47 | U_Coptic, 48 | U_Cuneiform, 49 | U_Cypriot, 50 | U_Cyrillic, 51 | U_Deseret, 52 | U_Devanagari, 53 | U_Duployan, 54 | U_Egyptian_Hieroglyphs, 55 | U_Elbasan, 56 | U_Ethiopic, 57 | U_Georgian, 58 | U_Glagolitic, 59 | U_Gothic, 60 | U_Grantha, 61 | U_Greek, 62 | U_Gujarati, 63 | U_Gurmukhi, 64 | U_Han, 65 | U_Hangul, 66 | U_Hanunoo, 67 | U_Hatran, 68 | U_Hebrew, 69 | U_Hiragana, 70 | U_Imperial_Aramaic, 71 | U_Inherited, 72 | U_Inscriptional_Pahlavi, 73 | U_Inscriptional_Parthian, 74 | U_Javanese, 75 | U_Kaithi, 76 | U_Kannada, 77 | U_Katakana, 78 | U_Kayah_Li, 79 | U_Kharoshthi, 80 | U_Khmer, 81 | U_Khojki, 82 | U_Khudawadi, 83 | U_Lao, 84 | U_Latin, 85 | U_Lepcha, 86 | U_Limbu, 87 | U_Linear_A, 88 | U_Linear_B, 89 | U_Lisu, 90 | U_Lycian, 91 | U_Lydian, 92 | U_Mahajani, 93 | U_Malayalam, 94 | U_Mandaic, 95 | U_Manichaean, 96 | U_Marchen, 97 | U_Meetei_Mayek, 98 | U_Mende_Kikakui, 99 | U_Meroitic_Cursive, 100 | U_Meroitic_Hieroglyphs, 101 | U_Miao, 102 | U_Modi, 103 | U_Mongolian, 104 | U_Mro, 105 | U_Multani, 106 | U_Myanmar, 107 | U_Nabataean, 108 | U_New_Tai_Lue, 109 | U_Newa, 110 | U_Nko, 111 | U_Ogham, 112 | U_Ol_Chiki, 113 | U_Old_Hungarian, 114 | U_Old_Italic, 115 | U_Old_North_Arabian, 116 | U_Old_Permic, 117 | U_Old_Persian, 118 | U_Old_South_Arabian, 119 | U_Old_Turkic, 120 | U_Oriya, 121 | U_Osage, 122 | U_Osmanya, 123 | U_Pahawh_Hmong, 124 | U_Palmyrene, 125 | U_Pau_Cin_Hau, 
126 | U_Phags_Pa, 127 | U_Phoenician, 128 | U_Psalter_Pahlavi, 129 | U_Rejang, 130 | U_Runic, 131 | U_Samaritan, 132 | U_Saurashtra, 133 | U_Sharada, 134 | U_Shavian, 135 | U_Siddham, 136 | U_SignWriting, 137 | U_Sinhala, 138 | U_Sora_Sompeng, 139 | U_Sundanese, 140 | U_Syloti_Nagri, 141 | U_Syriac, 142 | U_Tagalog, 143 | U_Tagbanwa, 144 | U_Tai_Le, 145 | U_Tai_Tham, 146 | U_Tai_Viet, 147 | U_Takri, 148 | U_Tamil, 149 | U_Tangut, 150 | U_Telugu, 151 | U_Thaana, 152 | U_Thai, 153 | U_Tibetan, 154 | U_Tifinagh, 155 | U_Tirhuta, 156 | U_Ugaritic, 157 | U_Vai, 158 | U_Warang_Citi, 159 | U_Yi 160 | }; 161 | 162 | ScriptType GetScript(char32 c); 163 | } // namespace unicode_script 164 | } // namespace sentencepiece 165 | #endif // UNICODE_SCRIPT 166 | -------------------------------------------------------------------------------- /man/sentencepiece.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentencepiece.R 3 | \name{sentencepiece} 4 | \alias{sentencepiece} 5 | \title{Construct a Sentencepiece model} 6 | \usage{ 7 | sentencepiece( 8 | x, 9 | type = c("bpe", "char", "unigram", "word"), 10 | vocab_size = 8000, 11 | coverage = 0.9999, 12 | model_prefix = "sentencepiece", 13 | model_dir = tempdir(), 14 | threads = 1L, 15 | args, 16 | verbose = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{a character vector of path(s) to the text files containing training data} 21 | 22 | \item{type}{either one of 'bpe', 'char', 'unigram' or 'word' for Byte Pair Encoding, Character level encoding, 23 | Unigram encoding or pretokenised word encoding. Defaults to 'bpe' (Byte Pair Encoding).} 24 | 25 | \item{vocab_size}{integer indicating the number of tokens in the final vocabulary. Defaults to 8000.} 26 | 27 | \item{coverage}{fraction of characters covered by the model. Must be in the range [0, 1]. A good value to use is about 0.9999.} 28 | 29 | \item{model_prefix}{character string with the name of the model. Defaults to 'sentencepiece'. 30 | When executing the function 2 files will be created in the directory specified by \code{model_dir}, namely 31 | sentencepiece.model with the model and sentencepiece.vocab containing the vocabulary of the model. 32 | You can change the name of the model by providing the \code{model_prefix} argument.} 33 | 34 | \item{model_dir}{directory where the model will be saved. Defaults to the temporary directory (tempdir())} 35 | 36 | \item{threads}{integer indicating number of threads to use when building the model} 37 | 38 | \item{args}{character string with arguments passed on to sentencepiece::SentencePieceTrainer::Train (for expert use only)} 39 | 40 | \item{verbose}{logical indicating to show progress of sentencepiece training. Defaults to \code{FALSE}.} 41 | } 42 | \value{ 43 | an object of class \code{sentencepiece} which is defined at \code{\link{sentencepiece_load_model}} 44 | } 45 | \description{ 46 | Construct a Sentencepiece model on text. 
47 | } 48 | \examples{ 49 | library(tokenizers.bpe) 50 | data(belgium_parliament, package = "tokenizers.bpe") 51 | path <- "traindata.txt" 52 | folder <- getwd() 53 | \dontshow{ 54 | path <- tempfile("traindata_", fileext = ".txt") 55 | folder <- tempdir() 56 | } 57 | writeLines(belgium_parliament$text, con = path) 58 | \dontshow{ 59 | model <- sentencepiece(path, type = "char", vocab_size = 30, model_dir = folder) 60 | model <- sentencepiece(path, type = "unigram", vocab_size = 50, model_dir = folder) 61 | model <- sentencepiece(path, type = "bpe", vocab_size = 200, model_dir = folder) 62 | } 63 | \donttest{ 64 | model <- sentencepiece(path, type = "char", 65 | model_dir = folder, verbose = TRUE) 66 | model <- sentencepiece(path, type = "unigram", vocab_size = 20000, 67 | model_dir = folder, verbose = TRUE) 68 | model <- sentencepiece(path, type = "bpe", vocab_size = 4000, 69 | model_dir = folder, verbose = TRUE) 70 | 71 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 72 | "On est d'accord sur le prix de la biere?") 73 | sentencepiece_encode(model, x = txt, type = "subwords") 74 | sentencepiece_encode(model, x = txt, type = "ids") 75 | 76 | 77 | model <- sentencepiece_load_model(file.path(folder, "sentencepiece.model")) 78 | sentencepiece_encode(model, x = txt, type = "subwords") 79 | sentencepiece_encode(model, x = txt, type = "ids") 80 | } 81 | 82 | \dontshow{ 83 | # clean up for CRAN 84 | file.remove(file.path(folder, "sentencepiece.model")) 85 | file.remove(file.path(folder, "sentencepiece.vocab")) 86 | file.remove(path) 87 | } 88 | } 89 | \seealso{ 90 | \code{\link{sentencepiece_load_model}} 91 | } 92 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/generated_enum_util.cc: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | #include 32 | 33 | #include 34 | 35 | #include 36 | 37 | namespace google { 38 | namespace protobuf { 39 | namespace internal { 40 | namespace { 41 | 42 | bool EnumCompareByName(const EnumEntry& a, const EnumEntry& b) { 43 | return StringPiece(a.name) < StringPiece(b.name); 44 | } 45 | 46 | // Gets the numeric value of the EnumEntry at the given index, but returns a 47 | // special value for the index -1. This gives a way to use std::lower_bound on a 48 | // sorted array of indices while searching for value that we associate with -1. 49 | int GetValue(const EnumEntry* enums, int i, int target) { 50 | if (i == -1) { 51 | return target; 52 | } else { 53 | return enums[i].value; 54 | } 55 | } 56 | 57 | } // namespace 58 | 59 | bool LookUpEnumValue(const EnumEntry* enums, size_t size, 60 | StringPiece name, int* value) { 61 | EnumEntry target{name, 0}; 62 | auto it = std::lower_bound(enums, enums + size, target, EnumCompareByName); 63 | if (it != enums + size && it->name == name) { 64 | *value = it->value; 65 | return true; 66 | } 67 | return false; 68 | } 69 | 70 | int LookUpEnumName(const EnumEntry* enums, const int* sorted_indices, 71 | size_t size, int value) { 72 | auto comparator = [enums, value](int a, int b) { 73 | return GetValue(enums, a, value) < GetValue(enums, b, value); 74 | }; 75 | auto it = 76 | std::lower_bound(sorted_indices, sorted_indices + size, -1, comparator); 77 | if (it != sorted_indices + size && enums[*it].value == value) { 78 | return it - sorted_indices; 79 | } 80 | return -1; 81 | } 82 | 83 | bool InitializeEnumStrings( 84 | const EnumEntry* enums, const int* sorted_indices, size_t size, 85 | internal::ExplicitlyConstructed* enum_strings) { 86 | for (int i = 0; i < size; ++i) { 87 | enum_strings[i].Construct(enums[sorted_indices[i]].name); 88 | internal::OnShutdownDestroyString(enum_strings[i].get_mutable()); 89 | } 90 | return true; 91 | } 92 | 93 | } // namespace internal 94 | } // namespace protobuf 95 | } // namespace google 96 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/stubs/stringprintf.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2012 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 
15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // from google3/base/stringprintf.h 32 | // 33 | // Printf variants that place their output in a C++ string. 34 | // 35 | // Usage: 36 | // string result = StringPrintf("%d %s\n", 10, "hello"); 37 | // SStringPrintf(&result, "%d %s\n", 10, "hello"); 38 | // StringAppendF(&result, "%d %s\n", 20, "there"); 39 | 40 | #ifndef GOOGLE_PROTOBUF_STUBS_STRINGPRINTF_H 41 | #define GOOGLE_PROTOBUF_STUBS_STRINGPRINTF_H 42 | 43 | #include 44 | #include 45 | #include 46 | 47 | #include 48 | 49 | #include 50 | 51 | namespace google { 52 | namespace protobuf { 53 | 54 | // Return a C++ string 55 | PROTOBUF_EXPORT extern std::string StringPrintf(const char* format, ...); 56 | 57 | // Store result into a supplied string and return it 58 | PROTOBUF_EXPORT extern const std::string& SStringPrintf(std::string* dst, 59 | const char* format, 60 | ...); 61 | 62 | // Append result to a supplied string 63 | PROTOBUF_EXPORT extern void StringAppendF(std::string* dst, const char* format, 64 | ...); 65 | 66 | // Lower-level routine that takes a va_list and appends to a specified 67 | // string. All other routines are just convenience wrappers around it. 68 | PROTOBUF_EXPORT extern void StringAppendV(std::string* dst, const char* format, 69 | va_list ap); 70 | 71 | // The max arguments supported by StringPrintfVector 72 | PROTOBUF_EXPORT extern const int kStringPrintfVectorMaxArgs; 73 | 74 | // You can use this version when all your arguments are strings, but 75 | // you don't know how many arguments you'll have at compile time. 76 | // StringPrintfVector will LOG(FATAL) if v.size() > kStringPrintfVectorMaxArgs 77 | PROTOBUF_EXPORT extern std::string StringPrintfVector( 78 | const char* format, const std::vector& v); 79 | 80 | } // namespace protobuf 81 | } // namespace google 82 | 83 | #include 84 | 85 | #endif // GOOGLE_PROTOBUF_STUBS_STRINGPRINTF_H 86 | -------------------------------------------------------------------------------- /src/sentencepiece/src/filesystem.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | 17 | #include "filesystem.h" 18 | #include "third_party/absl/memory/memory.h" 19 | #include "util.h" 20 | 21 | #if defined(OS_WIN) && defined(UNICODE) && defined(_UNICODE) 22 | #define WPATH(path) (::sentencepiece::win32::Utf8ToWide(path).c_str()) 23 | #else 24 | #define WPATH(path) (path) 25 | #endif 26 | 27 | namespace sentencepiece { 28 | namespace filesystem { 29 | 30 | class PosixReadableFile : public ReadableFile { 31 | public: 32 | PosixReadableFile(absl::string_view filename, bool is_binary = false) 33 | : is_(filename.empty() 34 | ? &std::cin 35 | : new std::ifstream(WPATH(filename.data()), 36 | is_binary ? std::ios::binary | std::ios::in 37 | : std::ios::in)) { 38 | if (!*is_) 39 | status_ = util::StatusBuilder(util::StatusCode::kNotFound, GTL_LOC) 40 | << "\"" << filename.data() << "\": " << util::StrError(errno); 41 | } 42 | 43 | ~PosixReadableFile() { 44 | if (is_ != &std::cin) delete is_; 45 | } 46 | 47 | util::Status status() const { return status_; } 48 | 49 | bool ReadLine(std::string *line) { 50 | return static_cast(std::getline(*is_, *line)); 51 | } 52 | 53 | bool ReadAll(std::string *line) { 54 | if (is_ == &std::cin) { 55 | LOG(ERROR) << "ReadAll is not supported for stdin."; 56 | return false; 57 | } 58 | line->assign(std::istreambuf_iterator(*is_), 59 | std::istreambuf_iterator()); 60 | return true; 61 | } 62 | 63 | private: 64 | util::Status status_; 65 | std::istream *is_; 66 | }; 67 | 68 | class PosixWritableFile : public WritableFile { 69 | public: 70 | PosixWritableFile(absl::string_view filename, bool is_binary = false) 71 | : os_(new std::ofstream(WPATH(filename.data()), 72 | is_binary ? 
std::ios::binary | std::ios::out 73 | : std::ios::out)) { 74 | if (!*os_) 75 | status_ = 76 | util::StatusBuilder(util::StatusCode::kPermissionDenied, GTL_LOC) 77 | << "\"" << filename.data() << "\": " << util::StrError(errno); 78 | } 79 | 80 | ~PosixWritableFile() { 81 | delete os_; 82 | } 83 | 84 | util::Status status() const { return status_; } 85 | 86 | bool Write(absl::string_view text) { 87 | os_->write(text.data(), text.size()); 88 | return os_->good(); 89 | } 90 | 91 | bool WriteLine(absl::string_view text) { return Write(text) && Write("\n"); } 92 | 93 | private: 94 | util::Status status_; 95 | std::ostream *os_; 96 | }; 97 | 98 | using DefaultReadableFile = PosixReadableFile; 99 | using DefaultWritableFile = PosixWritableFile; 100 | 101 | std::unique_ptr NewReadableFile(absl::string_view filename, 102 | bool is_binary) { 103 | return absl::make_unique(filename, is_binary); 104 | } 105 | 106 | std::unique_ptr NewWritableFile(absl::string_view filename, 107 | bool is_binary) { 108 | return absl::make_unique(filename, is_binary); 109 | } 110 | 111 | } // namespace filesystem 112 | } // namespace sentencepiece 113 | -------------------------------------------------------------------------------- /man/sentencepiece_download_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bpemb.R 3 | \name{sentencepiece_download_model} 4 | \alias{sentencepiece_download_model} 5 | \title{Download a Sentencepiece model} 6 | \usage{ 7 | sentencepiece_download_model( 8 | language, 9 | vocab_size, 10 | dim, 11 | model_dir = system.file(package = "sentencepiece", "models") 12 | ) 13 | } 14 | \arguments{ 15 | \item{language}{a character string with the language name. This can be either a plain language or a wikipedia shorthand. \cr 16 | Possible values can be found by looking at the examples or typing sentencepiece:::.bpemb$languages \cr 17 | If you provide multi it downloads the multilingual model available at \url{https://bpemb.h-its.org/multi/}} 18 | 19 | \item{vocab_size}{integer indicating the number of tokens in the final vocabulary. Defaults to 5000. Possible values depend on the language. To inspect possible values, type sentencepiece:::.bpemb$vocab_sizes and look to your language of your choice.} 20 | 21 | \item{dim}{dimension of the embedding. Either 25, 50, 100, 200 or 300.} 22 | 23 | \item{model_dir}{path to the location where the model will be downloaded to. Defaults to \code{system.file(package = "sentencepiece", "models")}.} 24 | } 25 | \value{ 26 | a list with elements 27 | \itemize{ 28 | \item{language: the provided language} 29 | \item{wikicode: the wikipedia code of the provided language} 30 | \item{file_model: the path to the downloaded Sentencepiece model} 31 | \item{url: the url where the Sentencepiece model was fetched from} 32 | \item{download_failed: logical, indicating if the download failed} 33 | \item{download_message: a character string with possible download failure information} 34 | \item{glove: a list with elements file_model, url, download_failed and download_message indicating the path to the Glove embeddings in txt format. Only present if the dim argument is provided in the function. Otherwise the embeddings will not be downloaded} 35 | \item{glove.bin: a list with elements file_model, url, download_failed and download_message indicating the path to the Glove embeddings in bin format. Only present if the dim argument is provided in the function. 
Otherwise the embeddings will not be downloaded} 36 | } 37 | } 38 | \description{ 39 | Download pretrained models built on Wikipedia 40 | made available at \url{https://bpemb.h-its.org} through \url{https://github.com/bheinzerling/bpemb}. 41 | These models contain Byte Pair Encoded models trained with sentencepiece as well 42 | as Glove embeddings of these Byte Pair subwords. Models for 275 languages are available. 43 | } 44 | \examples{ 45 | path <- getwd() 46 | \dontshow{ 47 | path <- tempdir() 48 | } 49 | \donttest{ 50 | 51 | ## 52 | ## Download only the tokeniser model 53 | ## 54 | dl <- sentencepiece_download_model("Russian", vocab_size = 50000, model_dir = path) 55 | dl <- sentencepiece_download_model("English", vocab_size = 100000, model_dir = path) 56 | dl <- sentencepiece_download_model("French", vocab_size = 25000, model_dir = path) 57 | dl <- sentencepiece_download_model("multi", vocab_size = 320000, model_dir = path) 58 | dl <- sentencepiece_download_model("Vlaams", vocab_size = 1000, model_dir = path) 59 | dl <- sentencepiece_download_model("Dutch", vocab_size = 25000, model_dir = path) 60 | dl <- sentencepiece_download_model("nl", vocab_size = 25000, model_dir = path) 61 | str(dl) 62 | model <- sentencepiece_load_model(dl$file_model) 63 | 64 | ## 65 | ## Download the tokeniser model + Glove embeddings of Byte Pairs 66 | ## 67 | dl <- sentencepiece_download_model("nl", vocab_size = 1000, dim = 50, model_dir = path) 68 | str(dl) 69 | model <- sentencepiece_load_model(dl$file_model) 70 | embedding <- read_word2vec(dl$glove$file_model) 71 | } 72 | 73 | 74 | dl <- sentencepiece_download_model("nl", vocab_size = 1000, dim = 25, 75 | model_dir = tempdir()) 76 | str(dl) 77 | 78 | \dontshow{ 79 | # clean up for CRAN 80 | f <- list.files(tempdir(), pattern = ".vocab$|.model$", full.names = TRUE) 81 | invisible(file.remove(f)) 82 | } 83 | } 84 | \seealso{ 85 | \code{\link{sentencepiece_load_model}} 86 | } 87 | -------------------------------------------------------------------------------- /src/sentencepiece/src/char_model_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 
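// char_model_test.cc below holds the unit tests for the character-level model: each
// UTF-8 character (including the U+2581 whitespace meta symbol) is emitted as its own
// piece, USER_DEFINED pieces such as "ABC" are kept as a single token, and
// NBestEncode/SampleEncode return empty results because the char model does not
// support them.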
14 | 15 | #include 16 | 17 | #include "char_model.h" 18 | #include "testharness.h" 19 | #include "util.h" 20 | 21 | namespace sentencepiece { 22 | namespace character { 23 | namespace { 24 | 25 | // Space symbol (U+2581) 26 | #define WS "\xe2\x96\x81" 27 | 28 | ModelProto MakeBaseModelProto() { 29 | ModelProto model_proto; 30 | auto *sp1 = model_proto.add_pieces(); 31 | auto *sp2 = model_proto.add_pieces(); 32 | auto *sp3 = model_proto.add_pieces(); 33 | 34 | sp1->set_type(ModelProto::SentencePiece::UNKNOWN); 35 | sp1->set_piece(""); 36 | sp2->set_type(ModelProto::SentencePiece::CONTROL); 37 | sp2->set_piece(""); 38 | sp3->set_type(ModelProto::SentencePiece::CONTROL); 39 | sp3->set_piece(""); 40 | 41 | return model_proto; 42 | } 43 | 44 | void AddPiece(ModelProto *model_proto, const std::string &piece, 45 | float score = 0.0) { 46 | auto *sp = model_proto->add_pieces(); 47 | sp->set_piece(piece); 48 | sp->set_score(score); 49 | } 50 | 51 | TEST(ModelTest, EncodeTest) { 52 | ModelProto model_proto = MakeBaseModelProto(); 53 | 54 | AddPiece(&model_proto, WS, 0.0); 55 | AddPiece(&model_proto, "a", 0.1); 56 | AddPiece(&model_proto, "b", 0.2); 57 | AddPiece(&model_proto, "c", 0.3); 58 | AddPiece(&model_proto, "d", 0.4); 59 | AddPiece(&model_proto, "ABC", 0.4); 60 | model_proto.mutable_pieces(8)->set_type( 61 | ModelProto::SentencePiece::USER_DEFINED); 62 | 63 | const Model model(model_proto); 64 | 65 | EncodeResult result; 66 | 67 | result = model.Encode(""); 68 | EXPECT_TRUE(result.empty()); 69 | 70 | result = model.Encode(WS "a" WS "b" WS "c"); 71 | EXPECT_EQ(6, result.size()); 72 | EXPECT_EQ(WS, result[0].first); 73 | EXPECT_EQ("a", result[1].first); 74 | EXPECT_EQ(WS, result[2].first); 75 | EXPECT_EQ("b", result[3].first); 76 | EXPECT_EQ(WS, result[4].first); 77 | EXPECT_EQ("c", result[5].first); 78 | 79 | result = model.Encode(WS "ab" WS "cd" WS "abc"); 80 | EXPECT_EQ(10, result.size()); 81 | EXPECT_EQ(WS, result[0].first); 82 | EXPECT_EQ("a", result[1].first); 83 | EXPECT_EQ("b", result[2].first); 84 | EXPECT_EQ(WS, result[3].first); 85 | EXPECT_EQ("c", result[4].first); 86 | EXPECT_EQ("d", result[5].first); 87 | EXPECT_EQ(WS, result[6].first); 88 | EXPECT_EQ("a", result[7].first); 89 | EXPECT_EQ("b", result[8].first); 90 | EXPECT_EQ("c", result[9].first); 91 | 92 | // makes a broken utf-8 93 | const std::string broken_utf8 = std::string("あ").substr(0, 1); 94 | result = model.Encode(broken_utf8); 95 | EXPECT_EQ(1, result.size()); 96 | EXPECT_EQ(broken_utf8, result[0].first); 97 | 98 | // "ABC" is treated as one piece, as it is USER_DEFINED. 
99 | result = model.Encode(WS "abABCcd"); 100 | EXPECT_EQ(6, result.size()); 101 | EXPECT_EQ(WS, result[0].first); 102 | EXPECT_EQ("a", result[1].first); 103 | EXPECT_EQ("b", result[2].first); 104 | EXPECT_EQ("ABC", result[3].first); 105 | EXPECT_EQ("c", result[4].first); 106 | EXPECT_EQ("d", result[5].first); 107 | } 108 | 109 | TEST(CharModelTest, NotSupportedTest) { 110 | ModelProto model_proto = MakeBaseModelProto(); 111 | const Model model(model_proto); 112 | EXPECT_EQ(NBestEncodeResult(), model.NBestEncode("test", 10)); 113 | EXPECT_EQ(EncodeResult(), model.SampleEncode("test", 0.1)); 114 | } 115 | 116 | } // namespace 117 | } // namespace character 118 | } // namespace sentencepiece 119 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/has_bits.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
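// HasBits, defined below, is the small fixed-size array of uint32 "has bits" words that
// generated protobuf messages use to track the presence of optional fields: Clear()
// zeroes all words, operator[] exposes each word for bit manipulation, Or() merges the
// presence bits of another message, and empty() reports whether no bit is set.
// Illustrative use only (hypothetical): HasBits<1> bits; bits[0] |= 0x1u;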
30 | 31 | #ifndef GOOGLE_PROTOBUF_HAS_BITS_H__ 32 | #define GOOGLE_PROTOBUF_HAS_BITS_H__ 33 | 34 | #include 35 | #include 36 | 37 | #include 38 | 39 | #ifdef SWIG 40 | #error "You cannot SWIG proto headers" 41 | #endif 42 | 43 | namespace google { 44 | namespace protobuf { 45 | namespace internal { 46 | 47 | template 48 | class HasBits { 49 | public: 50 | constexpr HasBits() PROTOBUF_ALWAYS_INLINE : has_bits_{} {} 51 | 52 | void Clear() PROTOBUF_ALWAYS_INLINE { 53 | memset(has_bits_, 0, sizeof(has_bits_)); 54 | } 55 | 56 | uint32& operator[](int index) PROTOBUF_ALWAYS_INLINE { 57 | return has_bits_[index]; 58 | } 59 | 60 | const uint32& operator[](int index) const PROTOBUF_ALWAYS_INLINE { 61 | return has_bits_[index]; 62 | } 63 | 64 | bool operator==(const HasBits& rhs) const { 65 | return memcmp(has_bits_, rhs.has_bits_, sizeof(has_bits_)) == 0; 66 | } 67 | 68 | bool operator!=(const HasBits& rhs) const { 69 | return !(*this == rhs); 70 | } 71 | 72 | void Or(const HasBits& rhs) { 73 | for (size_t i = 0; i < doublewords; i++) has_bits_[i] |= rhs[i]; 74 | } 75 | 76 | bool empty() const; 77 | 78 | private: 79 | uint32 has_bits_[doublewords]; 80 | }; 81 | 82 | template <> 83 | inline bool HasBits<1>::empty() const { 84 | return !has_bits_[0]; 85 | } 86 | 87 | template <> 88 | inline bool HasBits<2>::empty() const { 89 | return !(has_bits_[0] | has_bits_[1]); 90 | } 91 | 92 | template <> 93 | inline bool HasBits<3>::empty() const { 94 | return !(has_bits_[0] | has_bits_[1] | has_bits_[2]); 95 | } 96 | 97 | template <> 98 | inline bool HasBits<4>::empty() const { 99 | return !(has_bits_[0] | has_bits_[1] | has_bits_[2] | has_bits_[3]); 100 | } 101 | 102 | template 103 | inline bool HasBits::empty() const { 104 | for (size_t i = 0; i < doublewords; ++i) { 105 | if (has_bits_[i]) return false; 106 | } 107 | return true; 108 | } 109 | 110 | } // namespace internal 111 | } // namespace protobuf 112 | } // namespace google 113 | 114 | #include 115 | 116 | #endif // GOOGLE_PROTOBUF_HAS_BITS_H__ 117 | -------------------------------------------------------------------------------- /man/sentencepiece_encode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentencepiece.R 3 | \name{sentencepiece_encode} 4 | \alias{sentencepiece_encode} 5 | \title{Tokenise text alongside a Sentencepiece model} 6 | \usage{ 7 | sentencepiece_encode( 8 | model, 9 | x, 10 | type = c("subwords", "ids"), 11 | nbest = -1L, 12 | alpha = 0.1 13 | ) 14 | } 15 | \arguments{ 16 | \item{model}{an object of class \code{sentencepiece} as returned by \code{\link{sentencepiece_load_model}} or \code{\link{sentencepiece}}} 17 | 18 | \item{x}{a character vector of text (in UTF-8 Encoding)} 19 | 20 | \item{type}{a character string, either 'subwords' or 'ids' to get the subwords or the corresponding ids of these subwords as defined in the vocabulary of the model. 21 | Defaults to 'subwords'.} 22 | 23 | \item{nbest}{integer indicating the number of segmentations to extract. See the details. The argument is not used if you do not provide a value for it.} 24 | 25 | \item{alpha}{smoothing parameter to perform subword regularisation. Typical values are 0.1, 0.2 or 0.5. See the details. 
The argument is not used if you do not provide a value for it or do not provide a value for \code{nbest}.} 26 | } 27 | \value{ 28 | a list with tokenised text, one for each element of \code{x}, 29 | unless you provide \code{nbest} without providing \code{alpha}, in which case the result is a list of lists of \code{nbest} tokenised texts 30 | } 31 | \description{ 32 | Tokenise text alongside a Sentencepiece model 33 | } 34 | \details{ 35 | If you specify \code{alpha} to perform subword regularisation, keep in mind the following. \cr 36 | When \code{alpha} is 0.0, one segmentation is uniformly sampled from the \code{nbest} candidates or the lattice. 37 | The best Viterbi segmentation is more likely to be sampled when setting larger \code{alpha} values like 0.1. \cr 38 | \itemize{ 39 | \item If you provide a positive value for \code{nbest}, one segmentation is sampled (approximately) from the \code{nbest} candidates. 40 | \item If you provide a negative value for \code{nbest}, one segmentation is sampled from the hypotheses (lattice) according to the generation probabilities, using the forward-filtering and backward-sampling algorithm. 41 | } 42 | \code{nbest} and \code{alpha} correspond respectively to the parameters \code{l} and \code{alpha} 43 | in the paper \url{https://arxiv.org/abs/1804.10959} (where \code{nbest} < 0 means l = infinity).\cr 44 | 45 | If the model is a BPE model, \code{alpha} is the merge probability \code{p} explained in \url{https://arxiv.org/abs/1910.13267}. 46 | In a BPE model, nbest-based sampling is not supported, so the \code{nbest} parameter is ignored, although 47 | it still needs to be provided if you want to make use of \code{alpha}. 48 | } 49 | \examples{ 50 | model <- system.file(package = "sentencepiece", "models", "nl-fr-dekamer.model") 51 | model <- sentencepiece_load_model(file = model) 52 | 53 | txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.", 54 | "On est d'accord sur le prix de la biere?") 55 | sentencepiece_encode(model, x = txt, type = "subwords") 56 | sentencepiece_encode(model, x = txt, type = "ids") 57 | 58 | ## Examples using subword regularisation 59 | model <- system.file(package = "sentencepiece", "models", "nl-fr-dekamer-unigram.model") 60 | model <- sentencepiece_load_model(file = model) 61 | 62 | txt <- c("Goed zo", 63 | "On est d'accord") 64 | sentencepiece_encode(model, x = txt, type = "subwords", nbest = 4) 65 | sentencepiece_encode(model, x = txt, type = "ids", nbest = 4) 66 | sentencepiece_encode(model, x = txt, type = "subwords", nbest = 2) 67 | sentencepiece_encode(model, x = txt, type = "ids", nbest = 2) 68 | sentencepiece_encode(model, x = txt, type = "subwords", nbest = 1) 69 | sentencepiece_encode(model, x = txt, type = "ids", nbest = 1) 70 | sentencepiece_encode(model, x = txt, type = "subwords", nbest = 4, alpha = 0.1) 71 | sentencepiece_encode(model, x = txt, type = "ids", nbest = 4, alpha = 0.1) 72 | sentencepiece_encode(model, x = txt, type = "subwords", nbest = -1, alpha = 0.1) 73 | sentencepiece_encode(model, x = txt, type = "ids", nbest = -1, alpha = 0.1) 74 | sentencepiece_encode(model, x = txt, type = "subwords", nbest = -1, alpha = 0) 75 | sentencepiece_encode(model, x = txt, type = "ids", nbest = -1, alpha = 0) 76 | } 77 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/generated_enum_reflection.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc.
All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // Author: jasonh@google.com (Jason Hsueh) 32 | // 33 | // This header is logically internal, but is made public because it is used 34 | // from protocol-compiler-generated code, which may reside in other components. 35 | // It provides reflection support for generated enums, and is included in 36 | // generated .pb.h files and should have minimal dependencies. The methods are 37 | // implemented in generated_message_reflection.cc. 38 | 39 | #ifndef GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__ 40 | #define GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__ 41 | 42 | #include 43 | 44 | #include 45 | #include 46 | #include 47 | 48 | #ifdef SWIG 49 | #error "You cannot SWIG proto headers" 50 | #endif 51 | 52 | #include 53 | 54 | namespace google { 55 | namespace protobuf { 56 | class EnumDescriptor; 57 | } // namespace protobuf 58 | } // namespace google 59 | 60 | namespace google { 61 | namespace protobuf { 62 | 63 | // Returns the EnumDescriptor for enum type E, which must be a 64 | // proto-declared enum type. Code generated by the protocol compiler 65 | // will include specializations of this template for each enum type declared. 66 | template 67 | const EnumDescriptor* GetEnumDescriptor(); 68 | 69 | namespace internal { 70 | 71 | // Helper for EnumType_Parse functions: try to parse the string 'name' as 72 | // an enum name of the given type, returning true and filling in value on 73 | // success, or returning false and leaving value unchanged on failure. 
74 | PROTOBUF_EXPORT bool ParseNamedEnum(const EnumDescriptor* descriptor, 75 | ConstStringParam name, int* value); 76 | 77 | template 78 | bool ParseNamedEnum(const EnumDescriptor* descriptor, ConstStringParam name, 79 | EnumType* value) { 80 | int tmp; 81 | if (!ParseNamedEnum(descriptor, name, &tmp)) return false; 82 | *value = static_cast(tmp); 83 | return true; 84 | } 85 | 86 | // Just a wrapper around printing the name of a value. The main point of this 87 | // function is not to be inlined, so that you can do this without including 88 | // descriptor.h. 89 | PROTOBUF_EXPORT const std::string& NameOfEnum(const EnumDescriptor* descriptor, 90 | int value); 91 | 92 | } // namespace internal 93 | } // namespace protobuf 94 | } // namespace google 95 | 96 | #include 97 | 98 | #endif // GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__ 99 | -------------------------------------------------------------------------------- /src/sentencepiece/src/unigram_model_trainer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #ifndef UNIGRAM_MODEL_TRAINER_H_ 16 | #define UNIGRAM_MODEL_TRAINER_H_ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include "sentencepiece_model.pb.h" 24 | #include "third_party/absl/strings/string_view.h" 25 | #include "trainer_interface.h" 26 | #include "unigram_model.h" 27 | #include "util.h" 28 | 29 | namespace sentencepiece { 30 | namespace unigram { 31 | 32 | using string_util::UnicodeText; 33 | 34 | class TrainerModel : public Model { 35 | public: 36 | using SentencePieces = std::vector>; 37 | 38 | TrainerModel() {} 39 | TrainerModel(const ModelProto &model_proto) = delete; 40 | TrainerModel(const TrainerSpec &trainer_spec, 41 | const NormalizerSpec &normalizaiton_spec); 42 | ~TrainerModel() override; 43 | 44 | // Returns the sentencepieces. 45 | // The meta symbols, e.g., are NOT included. 46 | const SentencePieces &GetSentencePieces() const; 47 | 48 | // Sets sentencepieces. The sentencepieces are moved. 49 | // The meta symbols, e.g., are NOT included. 50 | void SetSentencePieces(SentencePieces &&sentencepieces); 51 | 52 | EncodeResult Encode(absl::string_view normalized) const override { 53 | return {}; 54 | } 55 | 56 | private: 57 | SentencePieces sentencepieces_; 58 | TrainerSpec trainer_spec_; 59 | NormalizerSpec normalizer_spec_; 60 | ModelProto model_proto_data_; 61 | }; 62 | 63 | class Trainer : public TrainerInterface { 64 | public: 65 | Trainer(const TrainerSpec &trainer_spec, 66 | const NormalizerSpec &normalizer_spec, 67 | const NormalizerSpec &denormalizer_spec) 68 | : TrainerInterface::TrainerInterface(trainer_spec, normalizer_spec, 69 | denormalizer_spec) {} 70 | 71 | util::Status Train() override; 72 | 73 | private: 74 | // FRIEND_TEST(TrainerTest, IsValidSentencePieceTest); 75 | 76 | // Makes seed pieces from the training corpus. 
77 | // The size of seed pieces is determined by seed_sentencepiece_size. 78 | // node_int_type should be of integer type (int32 or int64), 79 | // determined by train_extremely_large_corpus. 80 | template 81 | TrainerModel::SentencePieces MakeSeedSentencePieces() const; 82 | 83 | // Executes the E step of EM and returns expected count. 84 | // The index of return array is the vocab id. 85 | // |objective| is a negative likelihood of the current model. 86 | // |num_token| is the number of total tokens to tokenize 87 | // training corpus. 88 | std::vector RunEStep(const TrainerModel &model, float *objective, 89 | int64 *num_tokens) const; 90 | 91 | // Executes the M step of EM with the expected frequency and 92 | // returns new pieces. 93 | TrainerModel::SentencePieces RunMStep( 94 | const TrainerModel &model, const std::vector &expected) const; 95 | 96 | // Heuristically prunes the current pieces. 97 | // This is called after each EM sub-iteration. 98 | TrainerModel::SentencePieces PruneSentencePieces( 99 | const TrainerModel &model) const; 100 | 101 | // Makes the final sentence pieces by incorporating the required characters 102 | // and control/user defined symbols. 103 | TrainerModel::SentencePieces FinalizeSentencePieces( 104 | const TrainerModel &model) const; 105 | 106 | // When the size of SentencePieces becomes less than desired_vocab_size_, 107 | // break the main training loop. desired_vocab_size_ = 1.1 * vocab_size_ 108 | // for now. 109 | int desired_vocab_size_; 110 | }; 111 | } // namespace unigram 112 | } // namespace sentencepiece 113 | #endif // UNIGRAM_MODEL_TRAINER_H_ 114 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/stubs/hash.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | // Author: kenton@google.com (Kenton Varda) 32 | 33 | #ifndef GOOGLE_PROTOBUF_STUBS_HASH_H__ 34 | #define GOOGLE_PROTOBUF_STUBS_HASH_H__ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | # define GOOGLE_PROTOBUF_HASH_NAMESPACE_DECLARATION_START \ 42 | namespace google { \ 43 | namespace protobuf { 44 | # define GOOGLE_PROTOBUF_HASH_NAMESPACE_DECLARATION_END }} 45 | 46 | namespace google { 47 | namespace protobuf { 48 | 49 | template <typename Key> 50 | struct hash : public std::hash<Key> {}; 51 | 52 | template <typename Key> 53 | struct hash<const Key*> { 54 | inline size_t operator()(const Key* key) const { 55 | return reinterpret_cast<size_t>(key); 56 | } 57 | }; 58 | 59 | // Unlike the old SGI version, the TR1 "hash" does not special-case char*. So, 60 | // we go ahead and provide our own implementation. 61 | template <> 62 | struct hash<const char*> { 63 | inline size_t operator()(const char* str) const { 64 | size_t result = 0; 65 | for (; *str != '\0'; str++) { 66 | result = 5 * result + static_cast<size_t>(*str); 67 | } 68 | return result; 69 | } 70 | }; 71 | 72 | template<> 73 | struct hash<bool> { 74 | size_t operator()(bool x) const { 75 | return static_cast<size_t>(x); 76 | } 77 | }; 78 | 79 | template <> 80 | struct hash<std::string> { 81 | inline size_t operator()(const std::string& key) const { 82 | return hash<const char*>()(key.c_str()); 83 | } 84 | 85 | static const size_t bucket_size = 4; 86 | static const size_t min_buckets = 8; 87 | inline bool operator()(const std::string& a, const std::string& b) const { 88 | return a < b; 89 | } 90 | }; 91 | 92 | template <typename First, typename Second> 93 | struct hash<std::pair<First, Second> > { 94 | inline size_t operator()(const std::pair<First, Second>& key) const { 95 | size_t first_hash = hash<First>()(key.first); 96 | size_t second_hash = hash<Second>()(key.second); 97 | 98 | // FIXME(kenton): What is the best way to compute this hash? I have 99 | // no idea! This seems a bit better than an XOR. 100 | return first_hash * ((1 << 16) - 1) + second_hash; 101 | } 102 | 103 | static const size_t bucket_size = 4; 104 | static const size_t min_buckets = 8; 105 | inline bool operator()(const std::pair<First, Second>& a, 106 | const std::pair<First, Second>& b) const { 107 | return a < b; 108 | } 109 | }; 110 | 111 | } // namespace protobuf 112 | } // namespace google 113 | 114 | #endif // GOOGLE_PROTOBUF_STUBS_HASH_H__ 115 | -------------------------------------------------------------------------------- /src/sentencepiece/src/spm_decode_main.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include "common.h" 20 | #include "filesystem.h" 21 | #include "init.h" 22 | #include "sentencepiece.pb.h" 23 | #include "sentencepiece_processor.h" 24 | #include "third_party/absl/flags/flag.h" 25 | #include "third_party/absl/strings/str_split.h" 26 | #include "util.h" 27 | 28 | ABSL_FLAG(std::string, model, "", "model file name"); 29 | ABSL_FLAG(std::string, input, "", "input filename"); 30 | ABSL_FLAG(std::string, output, "", "output filename"); 31 | ABSL_FLAG(std::string, input_format, "piece", "choose from piece or id"); 32 | ABSL_FLAG(std::string, output_format, "string", "choose from string or proto"); 33 | ABSL_FLAG(std::string, extra_options, "", 34 | "':' separated encoder extra options, e.g., \"reverse:bos:eos\""); 35 | 36 | int main(int argc, char *argv[]) { 37 | sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); 38 | std::vector rest_args; 39 | 40 | if (absl::GetFlag(FLAGS_input).empty()) { 41 | for (int i = 1; i < argc; ++i) { 42 | rest_args.push_back(std::string(argv[i])); 43 | } 44 | } else { 45 | rest_args.push_back(absl::GetFlag(FLAGS_input)); 46 | } 47 | 48 | if (rest_args.empty()) 49 | rest_args.push_back(""); // empty means that reading from stdin. 50 | 51 | CHECK(!absl::GetFlag(FLAGS_model).empty()); 52 | 53 | sentencepiece::SentencePieceProcessor sp; 54 | CHECK_OK(sp.Load(absl::GetFlag(FLAGS_model))); 55 | CHECK_OK(sp.SetDecodeExtraOptions(absl::GetFlag(FLAGS_extra_options))); 56 | 57 | auto output = 58 | sentencepiece::filesystem::NewWritableFile(absl::GetFlag(FLAGS_output)); 59 | CHECK_OK(output->status()); 60 | 61 | std::string detok, line; 62 | sentencepiece::SentencePieceText spt; 63 | std::function &pieces)> process; 64 | 65 | auto ToIds = [&](const std::vector &pieces) { 66 | std::vector ids; 67 | ids.reserve(pieces.size()); 68 | for (const auto &s : pieces) { 69 | ids.push_back(atoi(s.c_str())); 70 | } 71 | return ids; 72 | }; 73 | 74 | if (absl::GetFlag(FLAGS_input_format) == "piece") { 75 | if (absl::GetFlag(FLAGS_output_format) == "string") { 76 | process = [&](const std::vector &pieces) { 77 | CHECK_OK(sp.Decode(pieces, &detok)); 78 | output->WriteLine(detok); 79 | }; 80 | } else if (absl::GetFlag(FLAGS_output_format) == "proto") { 81 | process = [&](const std::vector &pieces) { 82 | CHECK_OK(sp.Decode(pieces, &spt)); 83 | }; 84 | } else { 85 | LOG(FATAL) << "Unknown output format: " 86 | << absl::GetFlag(FLAGS_output_format); 87 | } 88 | } else if (absl::GetFlag(FLAGS_input_format) == "id") { 89 | if (absl::GetFlag(FLAGS_output_format) == "string") { 90 | process = [&](const std::vector &pieces) { 91 | CHECK_OK(sp.Decode(ToIds(pieces), &detok)); 92 | output->WriteLine(detok); 93 | }; 94 | } else if (absl::GetFlag(FLAGS_output_format) == "proto") { 95 | process = [&](const std::vector &pieces) { 96 | CHECK_OK(sp.Decode(ToIds(pieces), &spt)); 97 | }; 98 | } else { 99 | LOG(FATAL) << "Unknown output format: " 100 | << absl::GetFlag(FLAGS_output_format); 101 | } 102 | } else { 103 | LOG(FATAL) << "Unknown input format: " << absl::GetFlag(FLAGS_input_format); 104 | } 105 | 106 | for (const auto &filename : rest_args) { 107 | auto input = sentencepiece::filesystem::NewReadableFile(filename); 108 | CHECK_OK(input->status()); 109 | while (input->ReadLine(&line)) { 110 | const auto pieces = absl::StrSplit(line, " "); 111 | process(pieces); 112 | } 113 | } 114 | 115 | return 0; 
116 | } 117 | -------------------------------------------------------------------------------- /src/third_party/protobuf-lite/google/protobuf/stubs/status.h: -------------------------------------------------------------------------------- 1 | // Protocol Buffers - Google's data interchange format 2 | // Copyright 2008 Google Inc. All rights reserved. 3 | // https://developers.google.com/protocol-buffers/ 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are 7 | // met: 8 | // 9 | // * Redistributions of source code must retain the above copyright 10 | // notice, this list of conditions and the following disclaimer. 11 | // * Redistributions in binary form must reproduce the above 12 | // copyright notice, this list of conditions and the following disclaimer 13 | // in the documentation and/or other materials provided with the 14 | // distribution. 15 | // * Neither the name of Google Inc. nor the names of its 16 | // contributors may be used to endorse or promote products derived from 17 | // this software without specific prior written permission. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | #ifndef GOOGLE_PROTOBUF_STUBS_STATUS_H_ 31 | #define GOOGLE_PROTOBUF_STUBS_STATUS_H_ 32 | 33 | #include 34 | #include 35 | 36 | #include 37 | #include 38 | 39 | #include 40 | 41 | namespace google { 42 | namespace protobuf { 43 | namespace util { 44 | namespace error { 45 | // These values must match error codes defined in google/rpc/code.proto. 46 | enum Code { 47 | OK = 0, 48 | CANCELLED = 1, 49 | UNKNOWN = 2, 50 | INVALID_ARGUMENT = 3, 51 | DEADLINE_EXCEEDED = 4, 52 | NOT_FOUND = 5, 53 | ALREADY_EXISTS = 6, 54 | PERMISSION_DENIED = 7, 55 | UNAUTHENTICATED = 16, 56 | RESOURCE_EXHAUSTED = 8, 57 | FAILED_PRECONDITION = 9, 58 | ABORTED = 10, 59 | OUT_OF_RANGE = 11, 60 | UNIMPLEMENTED = 12, 61 | INTERNAL = 13, 62 | UNAVAILABLE = 14, 63 | DATA_LOSS = 15, 64 | }; 65 | } // namespace error 66 | 67 | class PROTOBUF_EXPORT Status { 68 | public: 69 | // Creates a "successful" status. 70 | Status(); 71 | 72 | // Create a status in the canonical error space with the specified 73 | // code, and error message. If "code == 0", error_message is 74 | // ignored and a Status object identical to Status::OK is 75 | // constructed. 
76 | Status(error::Code error_code, StringPiece error_message); 77 | Status(const Status&); 78 | Status& operator=(const Status& x); 79 | ~Status() {} 80 | 81 | // Some pre-defined Status objects 82 | static const Status OK; // Identical to 0-arg constructor 83 | static const Status CANCELLED; 84 | static const Status UNKNOWN; 85 | 86 | // Accessor 87 | bool ok() const { 88 | return error_code_ == error::OK; 89 | } 90 | int error_code() const { 91 | return error_code_; 92 | } 93 | error::Code code() const { 94 | return error_code_; 95 | } 96 | StringPiece error_message() const { 97 | return error_message_; 98 | } 99 | StringPiece message() const { 100 | return error_message_; 101 | } 102 | 103 | bool operator==(const Status& x) const; 104 | bool operator!=(const Status& x) const { 105 | return !operator==(x); 106 | } 107 | 108 | // Return a combination of the error code name and message. 109 | std::string ToString() const; 110 | 111 | private: 112 | error::Code error_code_; 113 | std::string error_message_; 114 | }; 115 | 116 | // Prints a human-readable representation of 'x' to 'os'. 117 | PROTOBUF_EXPORT std::ostream& operator<<(std::ostream& os, const Status& x); 118 | 119 | } // namespace util 120 | } // namespace protobuf 121 | } // namespace google 122 | 123 | #include 124 | 125 | #endif // GOOGLE_PROTOBUF_STUBS_STATUS_H_ 126 | -------------------------------------------------------------------------------- /src/sentencepiece/src/spm_normalize_main.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.! 14 | 15 | #include "builder.h" 16 | #include "common.h" 17 | #include "filesystem.h" 18 | #include "init.h" 19 | #include "normalizer.h" 20 | #include "sentencepiece.pb.h" 21 | #include "sentencepiece_model.pb.h" 22 | #include "sentencepiece_processor.h" 23 | #include "sentencepiece_trainer.h" 24 | #include "third_party/absl/flags/flag.h" 25 | 26 | ABSL_FLAG(std::string, model, "", "Model file name"); 27 | ABSL_FLAG(bool, use_internal_normalization, false, 28 | "Use NormalizerSpec \"as-is\" to run the normalizer " 29 | "for SentencePiece segmentation"); 30 | ABSL_FLAG(std::string, normalization_rule_name, "", 31 | "Normalization rule name. " 32 | "Choose from nfkc or identity"); 33 | ABSL_FLAG(std::string, normalization_rule_tsv, "", 34 | "Normalization rule TSV file. 
"); 35 | ABSL_FLAG(bool, remove_extra_whitespaces, true, "Remove extra whitespaces"); 36 | ABSL_FLAG(bool, decompile, false, 37 | "Decompile compiled charamap and output it as TSV."); 38 | ABSL_FLAG(std::string, input, "", "Input filename"); 39 | ABSL_FLAG(std::string, output, "", "Output filename"); 40 | 41 | using sentencepiece::ModelProto; 42 | using sentencepiece::NormalizerSpec; 43 | using sentencepiece::SentencePieceProcessor; 44 | using sentencepiece::SentencePieceTrainer; 45 | using sentencepiece::normalizer::Builder; 46 | using sentencepiece::normalizer::Normalizer; 47 | 48 | int main(int argc, char *argv[]) { 49 | sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); 50 | std::vector rest_args; 51 | 52 | if (absl::GetFlag(FLAGS_input).empty()) { 53 | for (int i = 1; i < argc; ++i) { 54 | rest_args.push_back(std::string(argv[i])); 55 | } 56 | } else { 57 | rest_args.push_back(absl::GetFlag(FLAGS_input)); 58 | } 59 | 60 | NormalizerSpec spec; 61 | 62 | if (!absl::GetFlag(FLAGS_model).empty()) { 63 | ModelProto model_proto; 64 | SentencePieceProcessor sp; 65 | CHECK_OK(sp.Load(absl::GetFlag(FLAGS_model))); 66 | spec = sp.model_proto().normalizer_spec(); 67 | } else if (!absl::GetFlag(FLAGS_normalization_rule_tsv).empty()) { 68 | spec.set_normalization_rule_tsv( 69 | absl::GetFlag(FLAGS_normalization_rule_tsv)); 70 | CHECK_OK(SentencePieceTrainer::PopulateNormalizerSpec(&spec)); 71 | } else if (!absl::GetFlag(FLAGS_normalization_rule_name).empty()) { 72 | spec.set_name(absl::GetFlag(FLAGS_normalization_rule_name)); 73 | CHECK_OK(SentencePieceTrainer::PopulateNormalizerSpec(&spec)); 74 | } else { 75 | LOG(FATAL) << "Sets --model, normalization_rule_tsv, or " 76 | "normalization_rule_name flag."; 77 | } 78 | 79 | // Uses the normalizer spec encoded in the model_pb. 80 | if (!absl::GetFlag(FLAGS_use_internal_normalization)) { 81 | spec.set_add_dummy_prefix(false); // do not add dummy prefix. 82 | spec.set_escape_whitespaces(false); // do not output meta symbol. 83 | spec.set_remove_extra_whitespaces( 84 | absl::GetFlag(FLAGS_remove_extra_whitespaces)); 85 | } 86 | 87 | if (absl::GetFlag(FLAGS_decompile)) { 88 | Builder::CharsMap chars_map; 89 | CHECK_OK( 90 | Builder::DecompileCharsMap(spec.precompiled_charsmap(), &chars_map)); 91 | CHECK_OK(Builder::SaveCharsMap(absl::GetFlag(FLAGS_output), chars_map)); 92 | } else { 93 | const Normalizer normalizer(spec); 94 | auto output = 95 | sentencepiece::filesystem::NewWritableFile(absl::GetFlag(FLAGS_output)); 96 | CHECK_OK(output->status()); 97 | 98 | if (rest_args.empty()) { 99 | rest_args.push_back(""); // empty means that read from stdin. 100 | } 101 | 102 | std::string line; 103 | for (const auto &filename : rest_args) { 104 | auto input = sentencepiece::filesystem::NewReadableFile(filename); 105 | CHECK_OK(input->status()); 106 | while (input->ReadLine(&line)) { 107 | output->WriteLine(normalizer.Normalize(line)); 108 | } 109 | } 110 | } 111 | 112 | return 0; 113 | } 114 | --------------------------------------------------------------------------------