├── .gitignore
├── CMakeLists.txt
├── README.ja-JP.md
├── README.ko-KR.md
├── README.md
├── README.zh-CN.md
├── assets
│   ├── Banner_dark.png
│   ├── Banner_light.png
│   ├── BatchTest_dark.png
│   ├── BatchTest_light.png
│   ├── FlashTokenizer_main_dark.png
│   ├── FlashTokenizer_main_light.png
│   ├── TokenizerPerformanceBar_dark.jpg
│   ├── TokenizerPerformanceBar_light.jpg
│   ├── TokenizerPerformanceGraph_dark.png
│   ├── TokenizerPerformanceGraph_light.png
│   ├── WrongAnswer_dark.png
│   ├── WrongAnswer_light.png
│   ├── comp_accuracy_dark.png
│   ├── comp_accuracy_light.png
│   ├── comp_speed_dark.png
│   ├── comp_speed_light.png
│   ├── flashtokenizer-logo.png
│   ├── logos_dark.png
│   └── logos_light.png
├── data_loader.py
├── dataset
│   ├── BPE
│   │   ├── config.json
│   │   ├── gpt2.bin
│   │   ├── gpt2.i2w
│   │   ├── merges.txt
│   │   ├── tokenizer.json
│   │   ├── tokenizer_config.json
│   │   └── vocab.json
│   ├── DATA.md
│   ├── DATA.zh-CN.md
│   ├── GPT2
│   │   ├── merges.txt
│   │   └── vocab.json
│   ├── README.md
│   ├── config
│   │   ├── KR-BERT
│   │   │   ├── config.json
│   │   │   ├── tokenizer_config.json
│   │   │   └── vocab.txt
│   │   ├── bert-base-cased.bin
│   │   ├── bert-base-cased
│   │   │   ├── config.json
│   │   │   ├── tokenizer.json
│   │   │   ├── tokenizer_config.json
│   │   │   └── vocab.txt
│   │   ├── bert-base-chinese.bin
│   │   ├── bert-base-chinese
│   │   │   ├── config.json
│   │   │   ├── tokenizer.json
│   │   │   ├── tokenizer_config.json
│   │   │   └── vocab.txt
│   │   ├── bert-base-japanese.bin
│   │   ├── bert-base-japanese
│   │   │   ├── config.json
│   │   │   ├── tokenizer_config.json
│   │   │   └── vocab.txt
│   │   ├── bert-base-multilingual-cased.bin
│   │   ├── bert-base-multilingual-cased
│   │   │   ├── special_tokens_map.json
│   │   │   ├── tokenizer.json
│   │   │   ├── tokenizer_config.json
│   │   │   └── vocab.txt
│   │   ├── bert-base-multilingual-uncased
│   │   │   ├── config.json
│   │   │   ├── tokenizer.json
│   │   │   ├── tokenizer_config.json
│   │   │   └── vocab.txt
│   │   ├── bert-base-swedish-cased.bin
│   │   ├── bert-base-swedish-cased
│   │   │   ├── added_tokens.json
│   │   │   ├── config.json
│   │   │   ├── special_tokens_map.json
│   │   │   ├── tokenizer_config.json
│   │   │   └── vocab.txt
│   │   ├── bert-base-uncased.bin
│   │   ├── bert-base-uncased
│   │   │   ├── config.json
│   │   │   ├── tokenizer.json
│   │   │   ├── tokenizer_config.json
│   │   │   └── vocab.txt
│   │   ├── chinese-macbert-large.bin
│   │   ├── chinese-macbert-large
│   │   │   ├── added_tokens.json
│   │   │   ├── config.json
│   │   │   ├── tokenizer.json
│   │   │   ├── tokenizer_config.json
│   │   │   └── vocab.txt
│   │   ├── kcbert-base.bin
│   │   ├── kcbert-base
│   │   │   ├── special_tokens_map.json
│   │   │   ├── tokenizer.json
│   │   │   ├── tokenizer_config.json
│   │   │   └── vocab.txt
│   │   ├── llmlingua-2-bert-base-multilingual-cased-meetingbank.bin
│   │   ├── llmlingua-2-bert-base-multilingual-cased-meetingbank
│   │   │   ├── special_tokens_map.json
│   │   │   ├── tokenizer.json
│   │   │   ├── tokenizer_config.json
│   │   │   └── vocab.txt
│   │   ├── splade.bin
│   │   └── splade
│   │       ├── tokenizer.json
│   │       ├── tokenizer_config.json
│   │       └── vocab.txt
│   └── data
│       ├── .gitkeep
│       ├── download.bat
│       └── download.sh
├── docs
│   ├── BUILD_whl.md
│   ├── README.md
│   ├── SETUP.md
│   ├── SETUP_zh-CN.md
│   ├── UPLOAD.md
│   └── whitepaper
│       └── .gitkeep
├── loader.h
├── main.cpp
├── perftest
│   ├── PERFTEST.md
│   ├── PERFTEST.zh-CN.md
│   ├── accuracy_test.py
│   ├── bert_tokenizer
│   │   ├── __init__.py
│   │   ├── bert_tokenizer_blingfire.py
│   │   ├── bert_tokenizer_flash.py
│   │   ├── bert_tokenizer_huggingface.py
│   │   ├── bert_tokenizer_original.py
│   │   ├── bert_tokenizer_paddlenlp.py
│   │   ├── bert_tokenizer_rust.py
│   │   ├── bert_tokenizer_tensorflow.py
│   │   └── config.py
│   ├── metrics.py
│   ├── performance_test.py
│   ├── performance_test_mt.py
│   ├── setup.sh
│   └── show_parquet.py
├── prj
│   ├── CMakeLists.txt
│   ├── MANIFEST.in
│   ├── README.md
│   ├── USAGE.md
│   ├── USAGE.zh-CN.md
│   ├── include
│   │   ├── basic_tokenizer.h
│   │   ├── bert_tokenizer.h
│   │   ├── bpe_tokenizer.h
│   │   ├── charmap.h
│   │   ├── debugging.h
│   │   ├── defines.h
│   │   ├── env.h
│   │   ├── functions.h
│   │   ├── json.hpp
│   │   ├── robin_hood.h
│   │   ├── thread_pool.h
│   │   ├── trie.h
│   │   ├── version.h
│   │   ├── vocab.h
│   │   ├── wordpiece_tokenizer.h
│   │   └── wordpiecebackward_tokenizer.h
│   ├── pyproject.toml
│   ├── python
│   │   ├── flash_tokenizer
│   │   │   ├── __init__.py
│   │   │   ├── batch_encoding.py
│   │   │   ├── bert_tokenizer_flash.py
│   │   │   ├── bert_tokenizer_original.py
│   │   │   └── config.py
│   │   ├── src
│   │   │   └── bindings.cpp
│   │   └── test_tokenizer.py
│   └── src
│       └── bert_tokenizer.cpp
├── requirements.txt
├── sample
│   ├── sample01.py
│   ├── sample02.py
│   ├── sample03.py
│   ├── sample04.py
│   ├── sample_bpe.py
│   ├── tokenizer_config.json
│   └── vocab.txt
└── update_version.py

/.gitignore: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/.gitignore
/CMakeLists.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/CMakeLists.txt
/README.ja-JP.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/README.ja-JP.md
/README.ko-KR.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/README.ko-KR.md
/README.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/README.md
/README.zh-CN.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/README.zh-CN.md
/assets/Banner_dark.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/Banner_dark.png
/assets/Banner_light.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/Banner_light.png
/assets/BatchTest_dark.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/BatchTest_dark.png
/assets/BatchTest_light.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/BatchTest_light.png
/assets/FlashTokenizer_main_dark.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/FlashTokenizer_main_dark.png
/assets/FlashTokenizer_main_light.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/FlashTokenizer_main_light.png
/assets/TokenizerPerformanceBar_dark.jpg: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/TokenizerPerformanceBar_dark.jpg
/assets/TokenizerPerformanceBar_light.jpg: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/TokenizerPerformanceBar_light.jpg
/assets/TokenizerPerformanceGraph_dark.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/TokenizerPerformanceGraph_dark.png
/assets/TokenizerPerformanceGraph_light.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/TokenizerPerformanceGraph_light.png
/assets/WrongAnswer_dark.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/WrongAnswer_dark.png
/assets/WrongAnswer_light.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/WrongAnswer_light.png
/assets/comp_accuracy_dark.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/comp_accuracy_dark.png
/assets/comp_accuracy_light.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/comp_accuracy_light.png
/assets/comp_speed_dark.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/comp_speed_dark.png
/assets/comp_speed_light.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/comp_speed_light.png
/assets/flashtokenizer-logo.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/flashtokenizer-logo.png
/assets/logos_dark.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/logos_dark.png
/assets/logos_light.png: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/assets/logos_light.png
/data_loader.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/data_loader.py
/dataset/BPE/config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/BPE/config.json
/dataset/BPE/gpt2.bin: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/BPE/gpt2.bin
/dataset/BPE/gpt2.i2w: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/BPE/gpt2.i2w
/dataset/BPE/merges.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/BPE/merges.txt
/dataset/BPE/tokenizer.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/BPE/tokenizer.json
/dataset/BPE/tokenizer_config.json: {"model_max_length": 1024}
/dataset/BPE/vocab.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/BPE/vocab.json
/dataset/DATA.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/DATA.md
/dataset/DATA.zh-CN.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/DATA.zh-CN.md
/dataset/GPT2/merges.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/GPT2/merges.txt
/dataset/GPT2/vocab.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/GPT2/vocab.json
/dataset/README.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/README.md
/dataset/config/KR-BERT/config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/KR-BERT/config.json
/dataset/config/KR-BERT/tokenizer_config.json: {"do_lower_case": false}
/dataset/config/KR-BERT/vocab.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/KR-BERT/vocab.txt
/dataset/config/bert-base-cased.bin: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-cased.bin
/dataset/config/bert-base-cased/config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-cased/config.json
/dataset/config/bert-base-cased/tokenizer.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-cased/tokenizer.json
/dataset/config/bert-base-cased/tokenizer_config.json: {"do_lower_case": false, "model_max_length": 512}
/dataset/config/bert-base-cased/vocab.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-cased/vocab.txt
/dataset/config/bert-base-chinese.bin: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-chinese.bin
/dataset/config/bert-base-chinese/config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-chinese/config.json
/dataset/config/bert-base-chinese/tokenizer.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-chinese/tokenizer.json
/dataset/config/bert-base-chinese/tokenizer_config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-chinese/tokenizer_config.json
/dataset/config/bert-base-chinese/vocab.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-chinese/vocab.txt
/dataset/config/bert-base-japanese.bin: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-japanese.bin
/dataset/config/bert-base-japanese/config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-japanese/config.json
/dataset/config/bert-base-japanese/tokenizer_config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-japanese/tokenizer_config.json
/dataset/config/bert-base-japanese/vocab.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-japanese/vocab.txt
/dataset/config/bert-base-multilingual-cased.bin: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-multilingual-cased.bin
/dataset/config/bert-base-multilingual-cased/special_tokens_map.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-multilingual-cased/special_tokens_map.json
/dataset/config/bert-base-multilingual-cased/tokenizer.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-multilingual-cased/tokenizer.json
/dataset/config/bert-base-multilingual-cased/tokenizer_config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-multilingual-cased/tokenizer_config.json
/dataset/config/bert-base-multilingual-cased/vocab.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-multilingual-cased/vocab.txt
/dataset/config/bert-base-multilingual-uncased/config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-multilingual-uncased/config.json
/dataset/config/bert-base-multilingual-uncased/tokenizer.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-multilingual-uncased/tokenizer.json
/dataset/config/bert-base-multilingual-uncased/tokenizer_config.json: {"do_lower_case": true, "model_max_length": 512}
/dataset/config/bert-base-multilingual-uncased/vocab.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-multilingual-uncased/vocab.txt
/dataset/config/bert-base-swedish-cased.bin: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-swedish-cased.bin
/dataset/config/bert-base-swedish-cased/added_tokens.json: {}
/dataset/config/bert-base-swedish-cased/config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-swedish-cased/config.json
/dataset/config/bert-base-swedish-cased/special_tokens_map.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-swedish-cased/special_tokens_map.json
/dataset/config/bert-base-swedish-cased/tokenizer_config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-swedish-cased/tokenizer_config.json
/dataset/config/bert-base-swedish-cased/vocab.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-swedish-cased/vocab.txt
/dataset/config/bert-base-uncased.bin: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-uncased.bin
/dataset/config/bert-base-uncased/config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-uncased/config.json
/dataset/config/bert-base-uncased/tokenizer.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-uncased/tokenizer.json
/dataset/config/bert-base-uncased/tokenizer_config.json: {"do_lower_case": true, "model_max_length": 512}
/dataset/config/bert-base-uncased/vocab.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/bert-base-uncased/vocab.txt
/dataset/config/chinese-macbert-large.bin: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/chinese-macbert-large.bin
/dataset/config/chinese-macbert-large/added_tokens.json: {}
/dataset/config/chinese-macbert-large/config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/chinese-macbert-large/config.json
/dataset/config/chinese-macbert-large/tokenizer.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/chinese-macbert-large/tokenizer.json
/dataset/config/chinese-macbert-large/tokenizer_config.json: {"init_inputs": []}
/dataset/config/chinese-macbert-large/vocab.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/chinese-macbert-large/vocab.txt
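
Each entry in this listing maps a repository path to its raw file on GitHub, and the small tokenizer_config.json files are shown inline (e.g. {"do_lower_case": true, "model_max_length": 512} for bert-base-uncased). As a minimal, hedged illustration of how one of these listed files can be pulled down for inspection with nothing but the Python standard library (the URL scheme is copied from the entries above; network access is assumed, and the printed values depend on whatever is at HEAD):

```python
# Sketch only -- not part of the repository. Fetch one of the raw files listed
# above and inspect it; nothing here is specific to flash-tokenizer's own API.
import urllib.request

BASE = "https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD"
url = BASE + "/dataset/config/bert-base-uncased/vocab.txt"

# A WordPiece vocab.txt holds one token per line; the line index is the token id.
with urllib.request.urlopen(url) as resp:
    vocab = resp.read().decode("utf-8").splitlines()

print(f"{len(vocab)} tokens, first five: {vocab[:5]}")
```
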
/dataset/config/kcbert-base.bin: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/kcbert-base.bin
/dataset/config/kcbert-base/special_tokens_map.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/kcbert-base/special_tokens_map.json
/dataset/config/kcbert-base/tokenizer.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/kcbert-base/tokenizer.json
/dataset/config/kcbert-base/tokenizer_config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/kcbert-base/tokenizer_config.json
/dataset/config/kcbert-base/vocab.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/kcbert-base/vocab.txt
/dataset/config/llmlingua-2-bert-base-multilingual-cased-meetingbank.bin: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/llmlingua-2-bert-base-multilingual-cased-meetingbank.bin
/dataset/config/llmlingua-2-bert-base-multilingual-cased-meetingbank/special_tokens_map.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/llmlingua-2-bert-base-multilingual-cased-meetingbank/special_tokens_map.json
/dataset/config/llmlingua-2-bert-base-multilingual-cased-meetingbank/tokenizer.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/llmlingua-2-bert-base-multilingual-cased-meetingbank/tokenizer.json
/dataset/config/llmlingua-2-bert-base-multilingual-cased-meetingbank/tokenizer_config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/llmlingua-2-bert-base-multilingual-cased-meetingbank/tokenizer_config.json
/dataset/config/llmlingua-2-bert-base-multilingual-cased-meetingbank/vocab.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/llmlingua-2-bert-base-multilingual-cased-meetingbank/vocab.txt
/dataset/config/splade.bin: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/splade.bin
/dataset/config/splade/tokenizer.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/splade/tokenizer.json
/dataset/config/splade/tokenizer_config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/splade/tokenizer_config.json
/dataset/config/splade/vocab.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/config/splade/vocab.txt
/dataset/data/.gitkeep: (empty)
/dataset/data/download.bat: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/data/download.bat
/dataset/data/download.sh: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/dataset/data/download.sh
/docs/BUILD_whl.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/docs/BUILD_whl.md
/docs/README.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/docs/README.md
/docs/SETUP.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/docs/SETUP.md
/docs/SETUP_zh-CN.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/docs/SETUP_zh-CN.md
/docs/UPLOAD.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/docs/UPLOAD.md
/docs/whitepaper/.gitkeep: (empty)
/loader.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/loader.h
/main.cpp: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/main.cpp
/perftest/PERFTEST.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/PERFTEST.md
/perftest/PERFTEST.zh-CN.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/PERFTEST.zh-CN.md
/perftest/accuracy_test.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/accuracy_test.py
/perftest/bert_tokenizer/__init__.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/bert_tokenizer/__init__.py
/perftest/bert_tokenizer/bert_tokenizer_blingfire.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/bert_tokenizer/bert_tokenizer_blingfire.py
/perftest/bert_tokenizer/bert_tokenizer_flash.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/bert_tokenizer/bert_tokenizer_flash.py
/perftest/bert_tokenizer/bert_tokenizer_huggingface.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/bert_tokenizer/bert_tokenizer_huggingface.py
/perftest/bert_tokenizer/bert_tokenizer_original.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/bert_tokenizer/bert_tokenizer_original.py
/perftest/bert_tokenizer/bert_tokenizer_paddlenlp.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/bert_tokenizer/bert_tokenizer_paddlenlp.py
/perftest/bert_tokenizer/bert_tokenizer_rust.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/bert_tokenizer/bert_tokenizer_rust.py
/perftest/bert_tokenizer/bert_tokenizer_tensorflow.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/bert_tokenizer/bert_tokenizer_tensorflow.py
/perftest/bert_tokenizer/config.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/bert_tokenizer/config.py
/perftest/metrics.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/metrics.py
/perftest/performance_test.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/performance_test.py
/perftest/performance_test_mt.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/performance_test_mt.py
/perftest/setup.sh: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/setup.sh
/perftest/show_parquet.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/perftest/show_parquet.py
/prj/CMakeLists.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/CMakeLists.txt
/prj/MANIFEST.in: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/MANIFEST.in
/prj/README.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/README.md
/prj/USAGE.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/USAGE.md
/prj/USAGE.zh-CN.md: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/USAGE.zh-CN.md
/prj/include/basic_tokenizer.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/basic_tokenizer.h
/prj/include/bert_tokenizer.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/bert_tokenizer.h
/prj/include/bpe_tokenizer.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/bpe_tokenizer.h
/prj/include/charmap.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/charmap.h
/prj/include/debugging.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/debugging.h
/prj/include/defines.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/defines.h
/prj/include/env.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/env.h
/prj/include/functions.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/functions.h
/prj/include/json.hpp: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/json.hpp
/prj/include/robin_hood.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/robin_hood.h
/prj/include/thread_pool.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/thread_pool.h
/prj/include/trie.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/trie.h
/prj/include/version.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/version.h
/prj/include/vocab.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/vocab.h
/prj/include/wordpiece_tokenizer.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/wordpiece_tokenizer.h
/prj/include/wordpiecebackward_tokenizer.h: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/include/wordpiecebackward_tokenizer.h
/prj/pyproject.toml: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/pyproject.toml
/prj/python/flash_tokenizer/__init__.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/python/flash_tokenizer/__init__.py
/prj/python/flash_tokenizer/batch_encoding.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/python/flash_tokenizer/batch_encoding.py
/prj/python/flash_tokenizer/bert_tokenizer_flash.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/python/flash_tokenizer/bert_tokenizer_flash.py
/prj/python/flash_tokenizer/bert_tokenizer_original.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/python/flash_tokenizer/bert_tokenizer_original.py
/prj/python/flash_tokenizer/config.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/python/flash_tokenizer/config.py
/prj/python/src/bindings.cpp: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/python/src/bindings.cpp
/prj/python/test_tokenizer.py: print("FlashBertTokenizer")
/prj/src/bert_tokenizer.cpp: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/prj/src/bert_tokenizer.cpp
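
The header names just listed (basic_tokenizer.h, trie.h, wordpiece_tokenizer.h, wordpiecebackward_tokenizer.h) point at the core of BERT tokenization: basic whitespace/punctuation splitting followed by WordPiece. As a rough illustration of the general greedy longest-match-first WordPiece algorithm those names refer to — a textbook reference sketch, not the repository's trie-accelerated C++ implementation — here is a short Python version:

```python
# Reference sketch of plain greedy WordPiece; flash-tokenizer's actual code
# lives in prj/include/wordpiece_tokenizer.h and is far more optimized.
def wordpiece(word: str, vocab: set, unk: str = "[UNK]", max_chars: int = 200) -> list:
    if len(word) > max_chars:
        return [unk]
    pieces, start = [], 0
    while start < len(word):
        end, cur = len(word), None
        # Longest-match-first: shrink the candidate until it appears in the vocabulary.
        while start < end:
            piece = word[start:end]
            if start > 0:
                piece = "##" + piece  # continuation pieces carry the '##' prefix
            if piece in vocab:
                cur = piece
                break
            end -= 1
        if cur is None:
            return [unk]  # no sub-piece matched: the whole word becomes [UNK]
        pieces.append(cur)
        start = end
    return pieces

vocab = {"un", "##aff", "##able", "play", "##ing"}
print(wordpiece("unaffable", vocab))  # ['un', '##aff', '##able']
print(wordpiece("playing", vocab))    # ['play', '##ing']
```
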
/requirements.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/requirements.txt
/sample/sample01.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/sample/sample01.py
/sample/sample02.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/sample/sample02.py
/sample/sample03.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/sample/sample03.py
/sample/sample04.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/sample/sample04.py
/sample/sample_bpe.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/sample/sample_bpe.py
/sample/tokenizer_config.json: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/sample/tokenizer_config.json
/sample/vocab.txt: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/sample/vocab.txt
/update_version.py: https://raw.githubusercontent.com/NLPOptimize/flash-tokenizer/HEAD/update_version.py
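
The sample/ scripts above are the quickest way to see the Python package in action. Purely as a hedged sketch of what such a script might look like — the class name is taken from the print statement in prj/python/test_tokenizer.py, while the constructor arguments and the encode call are assumptions rather than the documented API (see sample/sample01.py and prj/USAGE.md for the real usage):

```python
# Hypothetical usage sketch only; argument names and the encode() signature are
# assumed, not copied from flash-tokenizer's documentation.
from flash_tokenizer import FlashBertTokenizer  # class name hinted by prj/python/test_tokenizer.py

# sample/vocab.txt and sample/tokenizer_config.json ship with the repository.
tokenizer = FlashBertTokenizer("sample/vocab.txt", do_lower_case=True)  # arguments assumed

ids = tokenizer.encode("FlashTokenizer is a fast BERT tokenizer.")  # call shape assumed
print(ids)
```
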