├── .github └── workflows │ └── go-ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── encodings └── encodings.go ├── go.mod ├── go.sum ├── gotokenizers.go ├── models ├── bpemodel │ ├── bpemodel.go │ ├── bpemodel_test.go │ ├── mergemap.go │ ├── mergemap_test.go │ ├── testdata │ │ └── merges.txt │ ├── word.go │ ├── word_test.go │ ├── wordcache.go │ └── wordmerge.go ├── models.go └── wordpiecemodel │ ├── wordpiecemodel.go │ └── wordpiecemodel_test.go ├── normalizedstring ├── normalizedstring.go ├── normalizedstring_test.go ├── range.go └── range_test.go ├── normalizers ├── bertnormalizer │ ├── bertnormalizer.go │ └── bertnormalizer_test.go ├── lowercasenormalizer │ ├── lowercasenormalizer.go │ └── lowercasenormalizer_test.go ├── normalizers.go ├── sequencenormalizer │ ├── sequencenormalizer.go │ └── sequencenormalizer_test.go └── stripnormalizer │ ├── stripnormalizer.go │ └── stripnormalizer_test.go ├── pretokenizedstring ├── pretokenizedstring.go └── split.go ├── pretokenizers ├── bertpretokenizer │ ├── bertpretokenizer.go │ └── bertpretokenizer_test.go ├── bytelevelpretokenizer │ ├── bytelevelpretokenizer.go │ └── bytelevelpretokenizer_test.go ├── metaspacepretokenizer │ ├── metaspacepretokenizer.go │ └── metaspacepretokenizer_test.go ├── pretokenizers.go ├── runedelimiterpretokenizer │ ├── runedelimiterpretokenizer.go │ └── runedelimiterpretokenizer_test.go ├── whitespacepretokenizer │ ├── whitespacepretokenizer.go │ └── whitespacepretokenizer_test.go └── whitespacesplitpretokenizer │ ├── whitespacesplitpretokenizer.go │ └── whitespacesplitpretokenizer_test.go ├── splitpattern ├── func.go ├── func_test.go ├── invertedpattern.go ├── invertedpattern_test.go ├── regexp.go ├── regexp2.go ├── regexp2_test.go ├── regexp_test.go ├── rune.go ├── rune_test.go ├── splitpattern.go ├── splitpattern_test.go ├── string.go └── string_test.go ├── strutils ├── strutils.go └── strutils_test.go └── vocabulary ├── testdata └── vocab.json ├── vocabulary.go └── vocabulary_test.go /.github/workflows/go-ci.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/.github/workflows/go-ci.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/README.md -------------------------------------------------------------------------------- /encodings/encodings.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/encodings/encodings.go -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/go.mod -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/go.sum -------------------------------------------------------------------------------- /gotokenizers.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/gotokenizers.go -------------------------------------------------------------------------------- /models/bpemodel/bpemodel.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/models/bpemodel/bpemodel.go -------------------------------------------------------------------------------- /models/bpemodel/bpemodel_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/models/bpemodel/bpemodel_test.go -------------------------------------------------------------------------------- /models/bpemodel/mergemap.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/models/bpemodel/mergemap.go -------------------------------------------------------------------------------- /models/bpemodel/mergemap_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/models/bpemodel/mergemap_test.go -------------------------------------------------------------------------------- /models/bpemodel/testdata/merges.txt: -------------------------------------------------------------------------------- 1 | #version: 0.2 2 | ab cd 3 | efg hij 4 | klmn opqr 5 | -------------------------------------------------------------------------------- /models/bpemodel/word.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/models/bpemodel/word.go -------------------------------------------------------------------------------- /models/bpemodel/word_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/models/bpemodel/word_test.go -------------------------------------------------------------------------------- /models/bpemodel/wordcache.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/models/bpemodel/wordcache.go -------------------------------------------------------------------------------- /models/bpemodel/wordmerge.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/models/bpemodel/wordmerge.go -------------------------------------------------------------------------------- /models/models.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/models/models.go -------------------------------------------------------------------------------- /models/wordpiecemodel/wordpiecemodel.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/models/wordpiecemodel/wordpiecemodel.go -------------------------------------------------------------------------------- /models/wordpiecemodel/wordpiecemodel_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/models/wordpiecemodel/wordpiecemodel_test.go -------------------------------------------------------------------------------- /normalizedstring/normalizedstring.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/normalizedstring/normalizedstring.go -------------------------------------------------------------------------------- /normalizedstring/normalizedstring_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/normalizedstring/normalizedstring_test.go -------------------------------------------------------------------------------- /normalizedstring/range.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/normalizedstring/range.go -------------------------------------------------------------------------------- /normalizedstring/range_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/normalizedstring/range_test.go -------------------------------------------------------------------------------- /normalizers/bertnormalizer/bertnormalizer.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/normalizers/bertnormalizer/bertnormalizer.go -------------------------------------------------------------------------------- /normalizers/bertnormalizer/bertnormalizer_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/normalizers/bertnormalizer/bertnormalizer_test.go -------------------------------------------------------------------------------- /normalizers/lowercasenormalizer/lowercasenormalizer.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/normalizers/lowercasenormalizer/lowercasenormalizer.go -------------------------------------------------------------------------------- /normalizers/lowercasenormalizer/lowercasenormalizer_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/normalizers/lowercasenormalizer/lowercasenormalizer_test.go -------------------------------------------------------------------------------- /normalizers/normalizers.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/normalizers/normalizers.go -------------------------------------------------------------------------------- /normalizers/sequencenormalizer/sequencenormalizer.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/normalizers/sequencenormalizer/sequencenormalizer.go -------------------------------------------------------------------------------- /normalizers/sequencenormalizer/sequencenormalizer_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/normalizers/sequencenormalizer/sequencenormalizer_test.go -------------------------------------------------------------------------------- /normalizers/stripnormalizer/stripnormalizer.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/normalizers/stripnormalizer/stripnormalizer.go -------------------------------------------------------------------------------- /normalizers/stripnormalizer/stripnormalizer_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/normalizers/stripnormalizer/stripnormalizer_test.go -------------------------------------------------------------------------------- /pretokenizedstring/pretokenizedstring.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizedstring/pretokenizedstring.go -------------------------------------------------------------------------------- /pretokenizedstring/split.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizedstring/split.go -------------------------------------------------------------------------------- /pretokenizers/bertpretokenizer/bertpretokenizer.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizers/bertpretokenizer/bertpretokenizer.go -------------------------------------------------------------------------------- /pretokenizers/bertpretokenizer/bertpretokenizer_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizers/bertpretokenizer/bertpretokenizer_test.go -------------------------------------------------------------------------------- /pretokenizers/bytelevelpretokenizer/bytelevelpretokenizer.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizers/bytelevelpretokenizer/bytelevelpretokenizer.go -------------------------------------------------------------------------------- /pretokenizers/bytelevelpretokenizer/bytelevelpretokenizer_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizers/bytelevelpretokenizer/bytelevelpretokenizer_test.go -------------------------------------------------------------------------------- /pretokenizers/metaspacepretokenizer/metaspacepretokenizer.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizers/metaspacepretokenizer/metaspacepretokenizer.go -------------------------------------------------------------------------------- /pretokenizers/metaspacepretokenizer/metaspacepretokenizer_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizers/metaspacepretokenizer/metaspacepretokenizer_test.go -------------------------------------------------------------------------------- /pretokenizers/pretokenizers.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizers/pretokenizers.go -------------------------------------------------------------------------------- /pretokenizers/runedelimiterpretokenizer/runedelimiterpretokenizer.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizers/runedelimiterpretokenizer/runedelimiterpretokenizer.go -------------------------------------------------------------------------------- /pretokenizers/runedelimiterpretokenizer/runedelimiterpretokenizer_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizers/runedelimiterpretokenizer/runedelimiterpretokenizer_test.go -------------------------------------------------------------------------------- /pretokenizers/whitespacepretokenizer/whitespacepretokenizer.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizers/whitespacepretokenizer/whitespacepretokenizer.go -------------------------------------------------------------------------------- /pretokenizers/whitespacepretokenizer/whitespacepretokenizer_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizers/whitespacepretokenizer/whitespacepretokenizer_test.go -------------------------------------------------------------------------------- /pretokenizers/whitespacesplitpretokenizer/whitespacesplitpretokenizer.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizers/whitespacesplitpretokenizer/whitespacesplitpretokenizer.go -------------------------------------------------------------------------------- /pretokenizers/whitespacesplitpretokenizer/whitespacesplitpretokenizer_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/pretokenizers/whitespacesplitpretokenizer/whitespacesplitpretokenizer_test.go -------------------------------------------------------------------------------- /splitpattern/func.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/func.go -------------------------------------------------------------------------------- /splitpattern/func_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/func_test.go -------------------------------------------------------------------------------- /splitpattern/invertedpattern.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/invertedpattern.go -------------------------------------------------------------------------------- /splitpattern/invertedpattern_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/invertedpattern_test.go -------------------------------------------------------------------------------- /splitpattern/regexp.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/regexp.go -------------------------------------------------------------------------------- /splitpattern/regexp2.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/regexp2.go -------------------------------------------------------------------------------- /splitpattern/regexp2_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/regexp2_test.go -------------------------------------------------------------------------------- /splitpattern/regexp_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/regexp_test.go -------------------------------------------------------------------------------- /splitpattern/rune.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/rune.go -------------------------------------------------------------------------------- /splitpattern/rune_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/rune_test.go -------------------------------------------------------------------------------- /splitpattern/splitpattern.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/splitpattern.go -------------------------------------------------------------------------------- /splitpattern/splitpattern_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/splitpattern_test.go -------------------------------------------------------------------------------- /splitpattern/string.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/string.go -------------------------------------------------------------------------------- /splitpattern/string_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/splitpattern/string_test.go -------------------------------------------------------------------------------- /strutils/strutils.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/strutils/strutils.go -------------------------------------------------------------------------------- /strutils/strutils_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/strutils/strutils_test.go -------------------------------------------------------------------------------- /vocabulary/testdata/vocab.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/vocabulary/testdata/vocab.json -------------------------------------------------------------------------------- /vocabulary/vocabulary.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/vocabulary/vocabulary.go -------------------------------------------------------------------------------- /vocabulary/vocabulary_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpodyssey/gotokenizers/HEAD/vocabulary/vocabulary_test.go --------------------------------------------------------------------------------