├── .Rbuildignore ├── .github └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── .vscode └── settings.json ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── LICENSE.note ├── NAMESPACE ├── NEWS.md ├── R ├── decoders.R ├── encoding.R ├── extendr-wrappers.R ├── models.R ├── normalizers.R ├── post_processors.R ├── pre_tokenizers.R ├── tokenizer.R └── trainers.R ├── README.Rmd ├── README.md ├── configure ├── configure.win ├── cran-comments.md ├── inst ├── AUTHORS └── po │ └── fr │ └── LC_MESSAGES │ └── R-tok.mo ├── man ├── decoder_byte_level.Rd ├── encoding.Rd ├── model_bpe.Rd ├── model_unigram.Rd ├── model_wordpiece.Rd ├── normalizer_nfc.Rd ├── normalizer_nfkc.Rd ├── pre_tokenizer.Rd ├── pre_tokenizer_byte_level.Rd ├── pre_tokenizer_whitespace.Rd ├── processor_byte_level.Rd ├── tok_decoder.Rd ├── tok_model.Rd ├── tok_normalizer.Rd ├── tok_processor.Rd ├── tok_trainer.Rd ├── tokenizer.Rd ├── trainer_bpe.Rd ├── trainer_unigram.Rd └── trainer_wordpiece.Rd ├── po ├── R-fr.po └── R-tok.pot ├── src ├── .gitignore ├── Makevars.in ├── Makevars.ucrt ├── Makevars.win.in ├── entrypoint.c ├── rust │ ├── Cargo.lock │ ├── Cargo.toml │ ├── src │ │ ├── decoders.rs │ │ ├── lib.rs │ │ ├── models.rs │ │ ├── normalizers.rs │ │ ├── post_processors.rs │ │ ├── pre_tokenizers.rs │ │ ├── tokenizer.rs │ │ └── trainers.rs │ ├── vendor-config.toml │ └── vendor.tar.xz └── tok-win.def ├── tests ├── testthat.R └── testthat │ ├── _snaps │ └── encoding.md │ ├── assets │ └── tokenizer.json │ ├── test-decoders.R │ ├── test-encoding.R │ ├── test-message-translations.R │ ├── test-models.R │ ├── test-normalizers.R │ ├── test-post_processors.R │ ├── test-pre_tokenizers.R │ ├── test-tokenizer.R │ └── test-trainers.R ├── tok.Rproj └── tools ├── config.R ├── msrv.R ├── patch.R ├── update_authors.R └── vendor.sh /.Rbuildignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/.Rbuildignore -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/.github/workflows/R-CMD-check.yaml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/.gitignore -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/.vscode/settings.json -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/DESCRIPTION -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2020 2 | COPYRIGHT HOLDER: Andy Thomason, Claus O. Wilke 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/LICENSE.md -------------------------------------------------------------------------------- /LICENSE.note: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/LICENSE.note -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/NAMESPACE -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/NEWS.md -------------------------------------------------------------------------------- /R/decoders.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/R/decoders.R -------------------------------------------------------------------------------- /R/encoding.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/R/encoding.R -------------------------------------------------------------------------------- /R/extendr-wrappers.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/R/extendr-wrappers.R -------------------------------------------------------------------------------- /R/models.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/R/models.R -------------------------------------------------------------------------------- /R/normalizers.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/R/normalizers.R -------------------------------------------------------------------------------- /R/post_processors.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/R/post_processors.R -------------------------------------------------------------------------------- /R/pre_tokenizers.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/R/pre_tokenizers.R -------------------------------------------------------------------------------- /R/tokenizer.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/R/tokenizer.R -------------------------------------------------------------------------------- /R/trainers.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/R/trainers.R -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/README.Rmd -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/README.md -------------------------------------------------------------------------------- /configure: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/configure -------------------------------------------------------------------------------- /configure.win: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/configure.win -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/cran-comments.md -------------------------------------------------------------------------------- /inst/AUTHORS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/inst/AUTHORS -------------------------------------------------------------------------------- /inst/po/fr/LC_MESSAGES/R-tok.mo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/inst/po/fr/LC_MESSAGES/R-tok.mo -------------------------------------------------------------------------------- /man/decoder_byte_level.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/decoder_byte_level.Rd -------------------------------------------------------------------------------- /man/encoding.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/encoding.Rd -------------------------------------------------------------------------------- /man/model_bpe.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/model_bpe.Rd -------------------------------------------------------------------------------- /man/model_unigram.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/model_unigram.Rd -------------------------------------------------------------------------------- /man/model_wordpiece.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/model_wordpiece.Rd -------------------------------------------------------------------------------- /man/normalizer_nfc.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/normalizer_nfc.Rd -------------------------------------------------------------------------------- /man/normalizer_nfkc.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/normalizer_nfkc.Rd -------------------------------------------------------------------------------- /man/pre_tokenizer.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/pre_tokenizer.Rd -------------------------------------------------------------------------------- /man/pre_tokenizer_byte_level.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/pre_tokenizer_byte_level.Rd -------------------------------------------------------------------------------- /man/pre_tokenizer_whitespace.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/pre_tokenizer_whitespace.Rd -------------------------------------------------------------------------------- /man/processor_byte_level.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/processor_byte_level.Rd -------------------------------------------------------------------------------- /man/tok_decoder.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/tok_decoder.Rd -------------------------------------------------------------------------------- /man/tok_model.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/tok_model.Rd -------------------------------------------------------------------------------- /man/tok_normalizer.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/tok_normalizer.Rd -------------------------------------------------------------------------------- /man/tok_processor.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/tok_processor.Rd -------------------------------------------------------------------------------- /man/tok_trainer.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/tok_trainer.Rd -------------------------------------------------------------------------------- /man/tokenizer.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/tokenizer.Rd -------------------------------------------------------------------------------- /man/trainer_bpe.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/trainer_bpe.Rd -------------------------------------------------------------------------------- /man/trainer_unigram.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/trainer_unigram.Rd -------------------------------------------------------------------------------- /man/trainer_wordpiece.Rd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/man/trainer_wordpiece.Rd -------------------------------------------------------------------------------- /po/R-fr.po: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/po/R-fr.po -------------------------------------------------------------------------------- /po/R-tok.pot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/po/R-tok.pot -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/.gitignore -------------------------------------------------------------------------------- /src/Makevars.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/Makevars.in -------------------------------------------------------------------------------- /src/Makevars.ucrt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/Makevars.ucrt -------------------------------------------------------------------------------- /src/Makevars.win.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/Makevars.win.in -------------------------------------------------------------------------------- /src/entrypoint.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/entrypoint.c -------------------------------------------------------------------------------- /src/rust/Cargo.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/rust/Cargo.lock -------------------------------------------------------------------------------- /src/rust/Cargo.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/rust/Cargo.toml -------------------------------------------------------------------------------- /src/rust/src/decoders.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/rust/src/decoders.rs -------------------------------------------------------------------------------- /src/rust/src/lib.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/rust/src/lib.rs -------------------------------------------------------------------------------- /src/rust/src/models.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/rust/src/models.rs -------------------------------------------------------------------------------- /src/rust/src/normalizers.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/rust/src/normalizers.rs -------------------------------------------------------------------------------- /src/rust/src/post_processors.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/rust/src/post_processors.rs -------------------------------------------------------------------------------- /src/rust/src/pre_tokenizers.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/rust/src/pre_tokenizers.rs -------------------------------------------------------------------------------- /src/rust/src/tokenizer.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/rust/src/tokenizer.rs -------------------------------------------------------------------------------- /src/rust/src/trainers.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/rust/src/trainers.rs -------------------------------------------------------------------------------- /src/rust/vendor-config.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/rust/vendor-config.toml -------------------------------------------------------------------------------- /src/rust/vendor.tar.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/src/rust/vendor.tar.xz -------------------------------------------------------------------------------- /src/tok-win.def: -------------------------------------------------------------------------------- 1 | EXPORTS 2 | R_init_tok 3 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tests/testthat.R -------------------------------------------------------------------------------- /tests/testthat/_snaps/encoding.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tests/testthat/_snaps/encoding.md -------------------------------------------------------------------------------- /tests/testthat/assets/tokenizer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tests/testthat/assets/tokenizer.json -------------------------------------------------------------------------------- /tests/testthat/test-decoders.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tests/testthat/test-decoders.R -------------------------------------------------------------------------------- /tests/testthat/test-encoding.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tests/testthat/test-encoding.R -------------------------------------------------------------------------------- /tests/testthat/test-message-translations.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tests/testthat/test-message-translations.R -------------------------------------------------------------------------------- /tests/testthat/test-models.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tests/testthat/test-models.R -------------------------------------------------------------------------------- /tests/testthat/test-normalizers.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tests/testthat/test-normalizers.R -------------------------------------------------------------------------------- /tests/testthat/test-post_processors.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tests/testthat/test-post_processors.R -------------------------------------------------------------------------------- /tests/testthat/test-pre_tokenizers.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tests/testthat/test-pre_tokenizers.R -------------------------------------------------------------------------------- /tests/testthat/test-tokenizer.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tests/testthat/test-tokenizer.R -------------------------------------------------------------------------------- /tests/testthat/test-trainers.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tests/testthat/test-trainers.R -------------------------------------------------------------------------------- /tok.Rproj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tok.Rproj -------------------------------------------------------------------------------- /tools/config.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tools/config.R -------------------------------------------------------------------------------- /tools/msrv.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tools/msrv.R -------------------------------------------------------------------------------- /tools/patch.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tools/patch.R -------------------------------------------------------------------------------- /tools/update_authors.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tools/update_authors.R -------------------------------------------------------------------------------- /tools/vendor.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/tok/HEAD/tools/vendor.sh --------------------------------------------------------------------------------