├── .github ├── conda │ ├── bld.bat │ ├── build.sh │ └── meta.yaml ├── stale.yml └── workflows │ ├── CI.yml │ ├── build_documentation.yml │ ├── build_pr_documentation.yml │ ├── delete_doc_comment.yml │ ├── delete_doc_comment_trigger.yml │ ├── docs-check.yml │ ├── node-release.yml │ ├── node.yml │ ├── python-release.yml │ ├── python.yml │ ├── rust-release.yml │ ├── rust.yml │ ├── stale.yml │ ├── trufflehog.yml │ └── upload_pr_documentation.yml ├── .gitignore ├── CITATION.cff ├── LICENSE ├── README.md ├── RELEASE.md ├── bindings ├── node │ ├── .cargo │ │ └── config.toml │ ├── .editorconfig │ ├── .eslintrc.yml │ ├── .gitattributes │ ├── .gitignore │ ├── .prettierignore │ ├── .taplo.toml │ ├── .yarn │ │ └── releases │ │ │ └── yarn-3.5.1.cjs │ ├── .yarnrc.yml │ ├── Cargo.toml │ ├── LICENSE │ ├── Makefile │ ├── README.md │ ├── build.rs │ ├── examples │ │ └── documentation │ │ │ ├── pipeline.test.ts │ │ │ └── quicktour.test.ts │ ├── index.d.ts │ ├── index.js │ ├── jest.config.js │ ├── lib │ │ └── bindings │ │ │ ├── __mocks__ │ │ │ ├── merges.txt │ │ │ ├── vocab.json │ │ │ └── vocab.txt │ │ │ ├── decoders.test.ts │ │ │ ├── encoding.test.ts │ │ │ ├── models.test.ts │ │ │ ├── normalizers.test.ts │ │ │ ├── post-processors.test.ts │ │ │ ├── pre-tokenizers.test.ts │ │ │ ├── tokenizer.test.ts │ │ │ └── utils.test.ts │ ├── npm │ │ ├── android-arm-eabi │ │ │ ├── README.md │ │ │ └── package.json │ │ ├── android-arm64 │ │ │ ├── README.md │ │ │ └── package.json │ │ ├── darwin-arm64 │ │ │ ├── README.md │ │ │ └── package.json │ │ ├── darwin-x64 │ │ │ ├── README.md │ │ │ └── package.json │ │ ├── freebsd-x64 │ │ │ ├── README.md │ │ │ └── package.json │ │ ├── linux-arm-gnueabihf │ │ │ ├── README.md │ │ │ └── package.json │ │ ├── linux-arm64-gnu │ │ │ ├── README.md │ │ │ └── package.json │ │ ├── linux-arm64-musl │ │ │ ├── README.md │ │ │ └── package.json │ │ ├── linux-x64-gnu │ │ │ ├── README.md │ │ │ └── package.json │ │ ├── linux-x64-musl │ │ │ ├── README.md │ │ │ └── package.json │ │ ├── win32-arm64-msvc │ │ │ ├── README.md │ │ │ └── package.json │ │ ├── win32-ia32-msvc │ │ │ ├── README.md │ │ │ └── package.json │ │ └── win32-x64-msvc │ │ │ ├── README.md │ │ │ └── package.json │ ├── package.json │ ├── rustfmt.toml │ ├── src │ │ ├── arc_rwlock_serde.rs │ │ ├── decoders.rs │ │ ├── encoding.rs │ │ ├── lib.rs │ │ ├── models.rs │ │ ├── normalizers.rs │ │ ├── pre_tokenizers.rs │ │ ├── processors.rs │ │ ├── tasks │ │ │ ├── mod.rs │ │ │ ├── models.rs │ │ │ └── tokenizer.rs │ │ ├── tokenizer.rs │ │ ├── trainers.rs │ │ └── utils.rs │ ├── tsconfig.json │ ├── types.ts │ └── yarn.lock └── python │ ├── .cargo │ └── config.toml │ ├── .gitignore │ ├── CHANGELOG.md │ ├── Cargo.toml │ ├── MANIFEST.in │ ├── Makefile │ ├── README.md │ ├── benches │ └── test_tiktoken.py │ ├── conftest.py │ ├── examples │ ├── custom_components.py │ ├── example.py │ ├── train_bert_wordpiece.py │ ├── train_bytelevel_bpe.py │ ├── train_with_datasets.py │ └── using_the_visualizer.ipynb │ ├── py_src │ └── tokenizers │ │ ├── __init__.py │ │ ├── __init__.pyi │ │ ├── decoders │ │ ├── __init__.py │ │ └── __init__.pyi │ │ ├── implementations │ │ ├── __init__.py │ │ ├── base_tokenizer.py │ │ ├── bert_wordpiece.py │ │ ├── byte_level_bpe.py │ │ ├── char_level_bpe.py │ │ ├── sentencepiece_bpe.py │ │ └── sentencepiece_unigram.py │ │ ├── models │ │ ├── __init__.py │ │ └── __init__.pyi │ │ ├── normalizers │ │ ├── __init__.py │ │ └── __init__.pyi │ │ ├── pre_tokenizers │ │ ├── __init__.py │ │ └── __init__.pyi │ │ ├── processors │ │ ├── __init__.py │ │ └── 
__init__.pyi │ │ ├── tools │ │ ├── __init__.py │ │ ├── visualizer-styles.css │ │ └── visualizer.py │ │ └── trainers │ │ ├── __init__.py │ │ └── __init__.pyi │ ├── pyproject.toml │ ├── rust-toolchain │ ├── scripts │ ├── convert.py │ ├── sentencepiece_extractor.py │ └── spm_parity_check.py │ ├── setup.cfg │ ├── src │ ├── decoders.rs │ ├── encoding.rs │ ├── error.rs │ ├── lib.rs │ ├── models.rs │ ├── normalizers.rs │ ├── pre_tokenizers.rs │ ├── processors.rs │ ├── token.rs │ ├── tokenizer.rs │ ├── trainers.rs │ └── utils │ │ ├── iterators.rs │ │ ├── mod.rs │ │ ├── normalization.rs │ │ ├── pretokenization.rs │ │ ├── regex.rs │ │ └── serde_pyo3.rs │ ├── stub.py │ ├── test.txt │ └── tests │ ├── __init__.py │ ├── bindings │ ├── __init__.py │ ├── test_decoders.py │ ├── test_encoding.py │ ├── test_models.py │ ├── test_normalizers.py │ ├── test_pre_tokenizers.py │ ├── test_processors.py │ ├── test_tokenizer.py │ └── test_trainers.py │ ├── documentation │ ├── __init__.py │ ├── test_pipeline.py │ ├── test_quicktour.py │ └── test_tutorial_train_from_iterators.py │ ├── implementations │ ├── __init__.py │ ├── test_base_tokenizer.py │ ├── test_bert_wordpiece.py │ ├── test_byte_level_bpe.py │ ├── test_char_bpe.py │ └── test_sentencepiece.py │ ├── test_serialization.py │ └── utils.py ├── docs ├── Makefile ├── README.md ├── source-doc-builder │ ├── _toctree.yml │ ├── api │ │ ├── added-tokens.mdx │ │ ├── decoders.mdx │ │ ├── encode-inputs.mdx │ │ ├── encoding.mdx │ │ ├── input-sequences.mdx │ │ ├── models.mdx │ │ ├── normalizers.mdx │ │ ├── post-processors.mdx │ │ ├── pre-tokenizers.mdx │ │ ├── tokenizer.mdx │ │ ├── trainers.mdx │ │ └── visualizer.mdx │ ├── components.mdx │ ├── index.mdx │ ├── installation.mdx │ ├── pipeline.mdx │ ├── quicktour.mdx │ └── training_from_memory.mdx └── source │ ├── _ext │ ├── entities.py │ ├── rust_doc.py │ └── toctree_tags.py │ ├── _static │ ├── css │ │ ├── Calibre-Light.ttf │ │ ├── Calibre-Medium.otf │ │ ├── Calibre-Regular.otf │ │ ├── Calibre-Thin.otf │ │ ├── code-snippets.css │ │ └── huggingface.css │ └── js │ │ └── custom.js │ ├── api │ ├── node.inc │ ├── python.inc │ ├── reference.rst │ └── rust.inc │ ├── components.rst │ ├── conf.py │ ├── entities.inc │ ├── index.rst │ ├── installation │ ├── main.rst │ ├── node.inc │ ├── python.inc │ └── rust.inc │ ├── pipeline.rst │ ├── quicktour.rst │ └── tutorials │ └── python │ └── training_from_memory.rst └── tokenizers ├── CHANGELOG.md ├── Cargo.toml ├── LICENSE ├── Makefile ├── README.md ├── README.tpl ├── benches ├── bert_benchmark.rs ├── bpe_benchmark.rs ├── common │ └── mod.rs ├── layout_benchmark.rs ├── llama3_benchmark.rs └── unigram_benchmark.rs ├── examples ├── encode_batch.rs ├── serialization.rs └── unstable_wasm │ ├── .gitignore │ ├── Cargo.toml │ ├── README.md │ ├── src │ ├── lib.rs │ └── utils.rs │ ├── tests │ └── web.rs │ └── www │ ├── .bin │ └── create-wasm-app.js │ ├── .gitignore │ ├── .travis.yml │ ├── LICENSE-APACHE │ ├── LICENSE-MIT │ ├── README.md │ ├── bootstrap.js │ ├── index.html │ ├── index.js │ ├── package-lock.json │ ├── package.json │ └── webpack.config.js ├── rust-toolchain ├── src ├── decoders │ ├── bpe.rs │ ├── byte_fallback.rs │ ├── ctc.rs │ ├── fuse.rs │ ├── mod.rs │ ├── sequence.rs │ ├── strip.rs │ └── wordpiece.rs ├── lib.rs ├── models │ ├── bpe │ │ ├── mod.rs │ │ ├── model.rs │ │ ├── serialization.rs │ │ ├── trainer.rs │ │ └── word.rs │ ├── mod.rs │ ├── unigram │ │ ├── lattice.rs │ │ ├── mod.rs │ │ ├── model.rs │ │ ├── serialization.rs │ │ ├── trainer.rs │ │ └── trie.rs │ ├── wordlevel │ │ ├── 
mod.rs │ │ ├── serialization.rs │ │ └── trainer.rs │ └── wordpiece │ │ ├── mod.rs │ │ ├── serialization.rs │ │ └── trainer.rs ├── normalizers │ ├── bert.rs │ ├── byte_level.rs │ ├── mod.rs │ ├── precompiled.rs │ ├── prepend.rs │ ├── replace.rs │ ├── strip.rs │ ├── unicode.rs │ └── utils.rs ├── pre_tokenizers │ ├── bert.rs │ ├── byte_level.rs │ ├── delimiter.rs │ ├── digits.rs │ ├── fixed_length.rs │ ├── metaspace.rs │ ├── mod.rs │ ├── punctuation.rs │ ├── sequence.rs │ ├── split.rs │ ├── unicode_scripts │ │ ├── mod.rs │ │ ├── pre_tokenizer.rs │ │ └── scripts.rs │ └── whitespace.rs ├── processors │ ├── bert.rs │ ├── mod.rs │ ├── roberta.rs │ ├── sequence.rs │ └── template.rs ├── tokenizer │ ├── added_vocabulary.rs │ ├── encoding.rs │ ├── mod.rs │ ├── normalizer.rs │ ├── pattern.rs │ ├── pre_tokenizer.rs │ └── serialization.rs └── utils │ ├── cache.rs │ ├── fancy.rs │ ├── from_pretrained.rs │ ├── iter.rs │ ├── mod.rs │ ├── onig.rs │ ├── padding.rs │ ├── parallelism.rs │ ├── progress.rs │ └── truncation.rs └── tests ├── added_tokens.rs ├── common └── mod.rs ├── documentation.rs ├── from_pretrained.rs ├── offsets.rs ├── serialization.rs ├── stream.rs ├── training.rs └── unigram.rs /.github/conda/bld.bat: -------------------------------------------------------------------------------- 1 | cd bindings\python 2 | %PYTHON% -m pip install . --prefix=%PREFIX% 3 | -------------------------------------------------------------------------------- /.github/conda/build.sh: -------------------------------------------------------------------------------- 1 | cd bindings/python 2 | $PYTHON -m pip install . --prefix=$PREFIX 3 | -------------------------------------------------------------------------------- /.github/conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "tokenizers" %} 2 | 3 | package: 4 | name: "{{ name|lower }}" 5 | version: "{{ TOKENIZERS_VERSION }}" 6 | 7 | source: 8 | path: ../../ 9 | 10 | requirements: 11 | host: 12 | - pip 13 | - python x.x 14 | - setuptools 15 | - setuptools-rust 16 | - pkg-config 17 | - openssl 18 | - maturin 19 | 20 | run: 21 | - python x.x 22 | 23 | test: 24 | imports: 25 | - tokenizers 26 | - tokenizers.models 27 | 28 | about: 29 | home: https://huggingface.co/docs/tokenizers 30 | license: Apache License 2.0 31 | license_file: LICENSE 32 | summary: "💥 Fast State-of-the-Art Tokenizers optimized for Research and Production" 33 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: wontfix 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. 
Set to `false` to disable 17 | closeComment: false 18 | -------------------------------------------------------------------------------- /.github/workflows/build_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - doc-builder* 8 | - v*-release 9 | - use_templates 10 | 11 | jobs: 12 | build: 13 | uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main 14 | with: 15 | commit_sha: ${{ github.sha }} 16 | package: tokenizers 17 | path_to_docs: tokenizers/docs/source-doc-builder/ 18 | package_path: tokenizers/bindings/python/ 19 | install_rust: true 20 | secrets: 21 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 22 | -------------------------------------------------------------------------------- /.github/workflows/build_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build PR Documentation 2 | 3 | on: 4 | pull_request: 5 | 6 | concurrency: 7 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 8 | cancel-in-progress: true 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.event.pull_request.head.sha }} 15 | pr_number: ${{ github.event.number }} 16 | package: tokenizers 17 | path_to_docs: tokenizers/docs/source-doc-builder/ 18 | package_path: tokenizers/bindings/python/ 19 | install_rust: true 20 | -------------------------------------------------------------------------------- /.github/workflows/delete_doc_comment.yml: -------------------------------------------------------------------------------- 1 | name: Delete doc comment 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Delete doc comment trigger"] 6 | types: 7 | - completed 8 | 9 | jobs: 10 | delete: 11 | uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main 12 | secrets: 13 | comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/delete_doc_comment_trigger.yml: -------------------------------------------------------------------------------- 1 | name: Delete doc comment trigger 2 | 3 | on: 4 | pull_request: 5 | types: [ closed ] 6 | 7 | 8 | jobs: 9 | delete: 10 | uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main 11 | with: 12 | pr_number: ${{ github.event.number }} -------------------------------------------------------------------------------- /.github/workflows/docs-check.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout repository 14 | uses: actions/checkout@v4 15 | 16 | - name: Install Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: 3.12 20 | 21 | - name: Install dependencies 22 | run: pip install sphinx sphinx_rtd_theme setuptools-rust 23 | 24 | - name: Install Rust 25 | uses: dtolnay/rust-toolchain@stable 26 | 27 | - name: Build tokenizers 28 | working-directory: ./bindings/python 29 | run: pip install -e . 
30 | 31 | - name: Build documentation 32 | working-directory: ./docs 33 | run: make clean && make html_all O="-W --keep-going" 34 | 35 | - name: Upload built doc 36 | uses: actions/upload-artifact@v4 37 | with: 38 | name: documentation 39 | path: ./docs/build/* 40 | -------------------------------------------------------------------------------- /.github/workflows/node.yml: -------------------------------------------------------------------------------- 1 | name: Node 2 | on: 3 | push: 4 | branches: 5 | - main 6 | paths-ignore: 7 | - bindings/python/** 8 | pull_request: 9 | paths-ignore: 10 | - bindings/python/** 11 | 12 | jobs: 13 | build_and_test: 14 | name: Check everything builds 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v4 19 | 20 | - name: Install Rust 21 | uses: dtolnay/rust-toolchain@stable 22 | with: 23 | components: rustfmt, clippy 24 | 25 | # Necessary for now for the cargo cache: https://github.com/actions/cache/issues/133#issuecomment-599102035 26 | - run: sudo chown -R $(whoami):$(id -ng) ~/.cargo/ 27 | 28 | - name: Cache Cargo Registry 29 | uses: actions/cache@v4 30 | with: 31 | path: ~/.cargo/registry 32 | key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} 33 | 34 | - name: Install Node 35 | uses: actions/setup-node@v4 36 | with: 37 | node-version: latest 38 | - name: Install dependencies 39 | working-directory: ./bindings/node 40 | run: yarn install 41 | 42 | - name: Build all 43 | working-directory: ./bindings/node 44 | run: yarn build 45 | 46 | - name: Lint Rust formatting 47 | uses: actions-rs/cargo@v1 48 | with: 49 | command: fmt 50 | args: --manifest-path ./bindings/node/Cargo.toml -- --check 51 | 52 | - name: Lint Rust with Clippy 53 | uses: actions-rs/cargo@v1 54 | with: 55 | command: clippy 56 | args: --manifest-path ./bindings/node/Cargo.toml --all-targets --all-features -- -D warnings 57 | 58 | - name: Lint TS 59 | working-directory: ./bindings/node 60 | run: yarn lint 61 | 62 | - name: Run JS tests 63 | working-directory: ./bindings/node 64 | run: make test 65 | -------------------------------------------------------------------------------- /.github/workflows/rust-release.yml: -------------------------------------------------------------------------------- 1 | name: Rust Release 2 | 3 | env: 4 | CRATES_TOKEN: ${{ secrets.CRATES_TOKEN }} 5 | 6 | on: 7 | push: 8 | tags: 9 | - v* 10 | 11 | jobs: 12 | rust_publish: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@v4 17 | 18 | - name: Install Rust 19 | uses: dtolnay/rust-toolchain@stable 20 | 21 | - name: Cache Cargo Registry 22 | uses: actions/cache@v4 23 | with: 24 | path: ~/.cargo/registry 25 | key: ubuntu-latest-cargo-registry-${{ hashFiles('**/Cargo.toml') }} 26 | 27 | - name: Publish package rust 28 | working-directory: ./tokenizers 29 | if: ${{ !contains(github.ref, 'rc') }} 30 | run: cargo publish --token ${CRATES_TOKEN} 31 | 32 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: 'Close stale issues and PRs' 2 | on: 3 | schedule: 4 | - cron: '30 1 * * *' 5 | 6 | jobs: 7 | stale: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/stale@v9 11 | with: 12 | stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.' 
13 | days-before-stale: 30 14 | days-before-close: 5 15 | -------------------------------------------------------------------------------- /.github/workflows/trufflehog.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | 4 | name: Secret Leaks 5 | 6 | jobs: 7 | trufflehog: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout code 11 | uses: actions/checkout@v4 12 | with: 13 | fetch-depth: 0 14 | - name: Secret Scanning 15 | uses: trufflesecurity/trufflehog@853e1e8d249fd1e29d0fcc7280d29b03df3d643d 16 | with: 17 | # exclude buggy postgres detector that is causing false positives and not relevant to our codebase 18 | extra_args: --results=verified,unknown --exclude-detectors=postgres 19 | -------------------------------------------------------------------------------- /.github/workflows/upload_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Upload PR Documentation 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Build PR Documentation"] 6 | types: 7 | - completed 8 | 9 | jobs: 10 | build: 11 | uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main 12 | with: 13 | package_name: tokenizers 14 | secrets: 15 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 16 | comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *~ 3 | 4 | .vim 5 | .env 6 | target 7 | .idea 8 | **/Cargo.lock 9 | 10 | /data 11 | tokenizers/data 12 | bindings/python/tests/data 13 | docs/build/ 14 | docs/make.bat 15 | 16 | __pycache__ 17 | pip-wheel-metadata 18 | *.egg-info 19 | *.so 20 | /bindings/python/examples/.ipynb_checkpoints 21 | /bindings/python/build 22 | /bindings/python/dist 23 | 24 | .vscode 25 | *.code-workspace 26 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: HuggingFace's Tokenizers 6 | message: >- 7 | Fast State-of-the-Art Tokenizers optimized for Research 8 | and Production. 9 | type: software 10 | authors: 11 | - given-names: Anthony 12 | family-names: Moi 13 | email: m.anthony.moi@gmail.com 14 | affiliation: HuggingFace 15 | - given-names: Nicolas 16 | family-names: Patry 17 | affiliation: HuggingFace 18 | repository-code: 'https://github.com/huggingface/tokenizers' 19 | url: 'https://github.com/huggingface/tokenizers' 20 | repository: 'https://huggingface.co' 21 | abstract: >- 22 | Fast State-of-the-Art Tokenizers optimized for Research 23 | and Production. 
24 | keywords: 25 | - Rust 26 | - Tokenizer 27 | - NLP 28 | license: Apache-2.0 29 | commit: 37372b6 30 | version: 0.13.4 31 | date-released: '2023-04-05' 32 | -------------------------------------------------------------------------------- /bindings/node/.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.aarch64-unknown-linux-musl] 2 | linker = "aarch64-linux-musl-gcc" 3 | rustflags = ["-C", "target-feature=-crt-static"] 4 | -------------------------------------------------------------------------------- /bindings/node/.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig helps developers define and maintain consistent 2 | # coding styles between different editors or IDEs 3 | # http://editorconfig.org 4 | root = true 5 | 6 | [*] 7 | indent_style = space 8 | indent_size = 2 9 | end_of_line = lf 10 | charset = utf-8 11 | trim_trailing_whitespace = true 12 | insert_final_newline = true 13 | 14 | [*.md] 15 | trim_trailing_whitespace = false 16 | -------------------------------------------------------------------------------- /bindings/node/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | 5 | *.ts text eol=lf merge=union 6 | *.tsx text eol=lf merge=union 7 | *.rs text eol=lf merge=union 8 | *.js text eol=lf merge=union 9 | *.json text eol=lf merge=union 10 | *.debug text eol=lf merge=union 11 | 12 | # Generated codes 13 | index.js linguist-detectable=false 14 | index.d.ts linguist-detectable=false -------------------------------------------------------------------------------- /bindings/node/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/node 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=node 4 | 5 | ### Node ### 6 | # Logs 7 | logs 8 | *.log 9 | npm-debug.log* 10 | yarn-debug.log* 11 | yarn-error.log* 12 | lerna-debug.log* 13 | 14 | # Diagnostic reports (https://nodejs.org/api/report.html) 15 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 16 | 17 | # Runtime data 18 | pids 19 | *.pid 20 | *.seed 21 | *.pid.lock 22 | 23 | # Directory for instrumented libs generated by jscoverage/JSCover 24 | lib-cov 25 | 26 | # Coverage directory used by tools like istanbul 27 | coverage 28 | *.lcov 29 | 30 | # nyc test coverage 31 | .nyc_output 32 | 33 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 34 | .grunt 35 | 36 | # Bower dependency directory (https://bower.io/) 37 | bower_components 38 | 39 | # node-waf configuration 40 | .lock-wscript 41 | 42 | # Compiled binary addons (https://nodejs.org/api/addons.html) 43 | build/Release 44 | 45 | # Dependency directories 46 | node_modules/ 47 | jspm_packages/ 48 | 49 | # TypeScript v1 declaration files 50 | typings/ 51 | 52 | # TypeScript cache 53 | *.tsbuildinfo 54 | 55 | # Optional npm cache directory 56 | .npm 57 | 58 | # Optional eslint cache 59 | .eslintcache 60 | 61 | # Microbundle cache 62 | .rpt2_cache/ 63 | .rts2_cache_cjs/ 64 | .rts2_cache_es/ 65 | .rts2_cache_umd/ 66 | 67 | # Optional REPL history 68 | .node_repl_history 69 | 70 | # Output of 'npm pack' 71 | *.tgz 72 | 73 | # Yarn Integrity file 74 | .yarn-integrity 75 | 76 | # dotenv environment variables file 77 | .env 78 | .env.test 79 | 80 | # parcel-bundler cache 
(https://parceljs.org/) 81 | .cache 82 | 83 | # Next.js build output 84 | .next 85 | 86 | # Nuxt.js build / generate output 87 | .nuxt 88 | dist 89 | 90 | # Gatsby files 91 | .cache/ 92 | # Comment in the public line in if your project uses Gatsby and not Next.js 93 | # https://nextjs.org/blog/next-9-1#public-directory-support 94 | # public 95 | 96 | # vuepress build output 97 | .vuepress/dist 98 | 99 | # Serverless directories 100 | .serverless/ 101 | 102 | # FuseBox cache 103 | .fusebox/ 104 | 105 | # DynamoDB Local files 106 | .dynamodb/ 107 | 108 | # TernJS port file 109 | .tern-port 110 | 111 | # Stores VSCode versions used for testing VSCode extensions 112 | .vscode-test 113 | 114 | # End of https://www.toptal.com/developers/gitignore/api/node 115 | 116 | 117 | #Added by cargo 118 | 119 | /target 120 | Cargo.lock 121 | 122 | *.node 123 | .pnp.* 124 | .yarn/* 125 | !.yarn/patches 126 | !.yarn/plugins 127 | !.yarn/releases 128 | !.yarn/sdks 129 | !.yarn/versions -------------------------------------------------------------------------------- /bindings/node/.prettierignore: -------------------------------------------------------------------------------- 1 | target 2 | .yarn -------------------------------------------------------------------------------- /bindings/node/.taplo.toml: -------------------------------------------------------------------------------- 1 | exclude = ["node_modules/**/*.toml"] 2 | 3 | # https://taplo.tamasfe.dev/configuration/formatter-options.html 4 | [formatting] 5 | align_entries = true 6 | indent_tables = true 7 | reorder_keys = true 8 | -------------------------------------------------------------------------------- /bindings/node/.yarnrc.yml: -------------------------------------------------------------------------------- 1 | nodeLinker: node-modules 2 | 3 | npmAuditRegistry: 'https://registry.npmjs.org' 4 | 5 | yarnPath: .yarn/releases/yarn-3.5.1.cjs 6 | -------------------------------------------------------------------------------- /bindings/node/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Nicolas Patry <nicolas@huggingface.co>"] 3 | edition = "2021" 4 | name = "node" 5 | version = "0.21.4-dev.0" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [lib] 10 | crate-type = ["cdylib"] 11 | 12 | [dependencies] 13 | napi = "2" 14 | napi-derive = "2" 15 | serde = { version = "1.0.163", features = ["derive"] } 16 | tokenizers = { path = "../../tokenizers/" } 17 | ahash = { version = "0.8.11", features = ["serde"] } 18 | 19 | [build-dependencies] 20 | napi-build = "2" 21 | 22 | [profile.release] 23 | lto = true 24 | -------------------------------------------------------------------------------- /bindings/node/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 N-API for Rust 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial 
portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /bindings/node/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style check-style test 2 | 3 | DATA_DIR = data 4 | 5 | dir_guard=@mkdir -p $(@D) 6 | 7 | # Format source code automatically 8 | style: 9 | npm run lint 10 | 11 | # Check the source code is formatted correctly 12 | check-style: 13 | npm run lint-check 14 | 15 | TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json 16 | 17 | # Launch the test suite 18 | test: $(TESTS_RESOURCES) 19 | npm run test 20 | 21 | $(DATA_DIR)/big.txt : 22 | $(dir_guard) 23 | wget https://norvig.com/big.txt -O $@ 24 | 25 | $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt 26 | head -100 $(DATA_DIR)/big.txt > $@ 27 | 28 | $(DATA_DIR)/roberta.json : 29 | $(dir_guard) 30 | wget https://huggingface.co/roberta-large/raw/main/tokenizer.json -O $@ 31 | 32 | $(DATA_DIR)/tokenizer-wiki.json : 33 | $(dir_guard) 34 | wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@ 35 | 36 | $(DATA_DIR)/bert-wiki.json : 37 | $(dir_guard) 38 | wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json -O $@ 39 | -------------------------------------------------------------------------------- /bindings/node/README.md: -------------------------------------------------------------------------------- 1 | <p align="center"> 2 | <br> 3 | <img src="https://huggingface.co/landing/assets/tokenizers/tokenizers-logo.png" width="600"/> 4 | <br> 5 | <p> 6 | <p align="center"> 7 | <a href="https://badge.fury.io/js/tokenizers"> 8 | <img alt="Build" src="https://badge.fury.io/js/tokenizers.svg"> 9 | </a> 10 | <a href="https://github.com/huggingface/tokenizers/blob/master/LICENSE"> 11 | <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/tokenizers.svg?color=blue"> 12 | </a> 13 | </p> 14 | <br> 15 | 16 | NodeJS implementation of today's most used tokenizers, with a focus on performance and 17 | versatility. Bindings over the [Rust](https://github.com/huggingface/tokenizers/tree/master/tokenizers) implementation. 18 | If you are interested in the High-level design, you can go check it there. 19 | 20 | ## Main features 21 | 22 | - Train new vocabularies and tokenize using 4 pre-made tokenizers (Bert WordPiece and the 3 23 | most common BPE versions). 24 | - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes 25 | less than 20 seconds to tokenize a GB of text on a server's CPU. 26 | - Easy to use, but also extremely versatile. 27 | - Designed for research and production. 28 | - Normalization comes with alignments tracking. It's always possible to get the part of the 29 | original sentence that corresponds to a given token. 30 | - Does all the pre-processing: Truncate, Pad, add the special tokens your model needs. 
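As a rough sketch of the alignment tracking mentioned above (the `tokenizer.json` path and the `showAlignments` helper are illustrative assumptions, not part of the library), each token's offsets can be used to recover the exact slice of the input it came from:

```ts
import { Tokenizer } from "tokenizers";

// Minimal sketch: map every token back to the span of the original text
// it was produced from, using the offsets tracked by the Encoding.
async function showAlignments(path: string, text: string) {
  const tokenizer = await Tokenizer.fromFile(path); // e.g. a local "tokenizer.json" (assumed to exist)
  const encoding = await tokenizer.encode(text);

  const tokens = encoding.getTokens();
  const offsets = encoding.getOffsets();

  tokens.forEach((token, i) => {
    const [start, end] = offsets[i];
    // text.slice(start, end) is the original substring behind this token.
    console.log(`${token} -> "${text.slice(start, end)}" [${start}, ${end}]`);
  });
}

showAlignments("tokenizer.json", "Who is John?");
```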
31 | 32 | ## Installation 33 | 34 | ```bash 35 | npm install tokenizers@latest 36 | ``` 37 | 38 | ## Basic example 39 | 40 | ```ts 41 | import { Tokenizer } from "tokenizers"; 42 | 43 | const tokenizer = await Tokenizer.fromFile("tokenizer.json"); 44 | const wpEncoded = await tokenizer.encode("Who is John?"); 45 | 46 | console.log(wpEncoded.getLength()); 47 | console.log(wpEncoded.getTokens()); 48 | console.log(wpEncoded.getIds()); 49 | console.log(wpEncoded.getAttentionMask()); 50 | console.log(wpEncoded.getOffsets()); 51 | console.log(wpEncoded.getOverflowing()); 52 | console.log(wpEncoded.getSpecialTokensMask()); 53 | console.log(wpEncoded.getTypeIds()); 54 | console.log(wpEncoded.getWordIds()); 55 | ``` 56 | 57 | ## License 58 | 59 | [Apache License 2.0](../../LICENSE) 60 | -------------------------------------------------------------------------------- /bindings/node/build.rs: -------------------------------------------------------------------------------- 1 | extern crate napi_build; 2 | 3 | fn main() { 4 | napi_build::setup(); 5 | } 6 | -------------------------------------------------------------------------------- /bindings/node/lib/bindings/__mocks__/merges.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/bindings/node/lib/bindings/__mocks__/merges.txt -------------------------------------------------------------------------------- /bindings/node/lib/bindings/__mocks__/vocab.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /bindings/node/lib/bindings/__mocks__/vocab.txt: -------------------------------------------------------------------------------- 1 | my 2 | name 3 | is 4 | jo 5 | ##hn 6 | what 7 | yours 8 | pair 9 | [UNK] 10 | -------------------------------------------------------------------------------- /bindings/node/lib/bindings/models.test.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-empty-function */ 2 | /* eslint-disable @typescript-eslint/no-explicit-any */ 3 | 4 | import { BPE, Unigram, WordPiece } from '../../' 5 | 6 | const MOCKS_DIR = __dirname + '/__mocks__' 7 | 8 | describe('WordPiece', () => { 9 | describe('fromFile', () => { 10 | it('throws if called with only one argument', () => { 11 | expect(() => (WordPiece as any).fromFile()).toThrow( 12 | 'Failed to convert JavaScript value `Undefined` into rust type `String`', 13 | ) 14 | }) 15 | 16 | it('throws if called with 2 arguments without a callback as third argument', () => { 17 | expect(() => (WordPiece as any).fromFile({})).toThrow( 18 | 'Failed to convert JavaScript value `Object {}` into rust type `String`', 19 | ) 20 | }) 21 | 22 | it('has its callback called with the loaded model', async () => { 23 | const model = await WordPiece.fromFile(`${MOCKS_DIR}/vocab.txt`) 24 | expect(model).toBeDefined() 25 | }) 26 | }) 27 | }) 28 | 29 | describe('BPE', () => { 30 | describe('fromFile', () => { 31 | it('has its callback called with the loaded model', async () => { 32 | const model = await BPE.fromFile(`${MOCKS_DIR}/vocab.json`, `${MOCKS_DIR}/merges.txt`) 33 | expect(model).toBeDefined() 34 | }) 35 | 36 | it('has its callback called with the loaded model', async () => { 37 | const model = await BPE.fromFile(`${MOCKS_DIR}/vocab.json`, `${MOCKS_DIR}/merges.txt`, {}) 38 | 
expect(model).toBeDefined() 39 | }) 40 | }) 41 | describe('When initialized from memory', () => { 42 | it('returns the loaded Model', () => { 43 | const bpe = BPE.init({ a: 0, b: 1, ab: 2 }, [['a', 'b']]) 44 | // expect(bpe.constructor.name).toEqual("Model"); 45 | expect(bpe.constructor.name).toEqual('BPE') 46 | }) 47 | }) 48 | }) 49 | 50 | describe('Unigram', () => { 51 | it('can be initialized from memory', () => { 52 | const unigram = Unigram.init( 53 | [ 54 | ['<unk>', 0], 55 | ['Hello', -1], 56 | ['there', -2], 57 | ], 58 | { 59 | unkId: 0, 60 | }, 61 | ) 62 | expect(unigram.constructor.name).toEqual('Unigram') 63 | }) 64 | }) 65 | -------------------------------------------------------------------------------- /bindings/node/lib/bindings/normalizers.test.ts: -------------------------------------------------------------------------------- 1 | import { prependNormalizer, stripAccentsNormalizer, stripNormalizer } from '../../' 2 | 3 | describe('stripNormalizer', () => { 4 | it('instantiates with no parameters', () => { 5 | const normalizer = stripNormalizer() 6 | expect(normalizer.constructor.name).toEqual('Normalizer') 7 | }) 8 | 9 | it('accepts `undefined` as first parameter', () => { 10 | expect(stripNormalizer(undefined)).toBeDefined() 11 | }) 12 | 13 | it('accepts `undefined` as second parameter', () => { 14 | expect(stripNormalizer(false, undefined)).toBeDefined() 15 | }) 16 | 17 | it('instantiates with one parameter', () => { 18 | const normalizer = stripNormalizer(false) 19 | expect(normalizer.constructor.name).toEqual('Normalizer') 20 | }) 21 | 22 | it('instantiates with two parameters', () => { 23 | const normalizer = stripNormalizer(false, true) 24 | expect(normalizer.constructor.name).toEqual('Normalizer') 25 | }) 26 | 27 | it('prepend instantiates with one parameter', () => { 28 | const normalizer = prependNormalizer('_') 29 | expect(normalizer.constructor.name).toEqual('Normalizer') 30 | expect(normalizer.normalizeString('Hello')).toEqual('_Hello') 31 | }) 32 | 33 | it('can normalize strings', () => { 34 | const normalizer = stripNormalizer() 35 | expect(normalizer.normalizeString(' Hello there ')).toEqual('Hello there') 36 | }) 37 | }) 38 | 39 | describe('stripAccentsNormalizer', () => { 40 | it('initialize', () => { 41 | const normalizer = stripAccentsNormalizer() 42 | expect(normalizer.constructor.name).toEqual('Normalizer') 43 | }) 44 | }) 45 | -------------------------------------------------------------------------------- /bindings/node/lib/bindings/post-processors.test.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-explicit-any */ 2 | 3 | import { bertProcessing, byteLevelProcessing, robertaProcessing, sequenceProcessing, templateProcessing } from '../../' 4 | 5 | describe('bertProcessing', () => { 6 | it('instantiates correctly with only two parameters', () => { 7 | const processor = bertProcessing(['sep', 1], ['cls', 2]) 8 | expect(processor.constructor.name).toEqual('Processor') 9 | }) 10 | 11 | it('throws if only one argument is provided', () => { 12 | expect(() => (bertProcessing as any)(['sep', 1])).toThrow('Given napi value is not an array') 13 | }) 14 | 15 | it('throws if arguments are malformed', () => { 16 | expect(() => (bertProcessing as any)(['sep', '1'], ['cls', '2'])).toThrow( 17 | 'Failed to convert napi value String into rust type `u32`', 18 | ) 19 | expect(() => (bertProcessing as any)(['sep'], ['cls'])).toThrow('Array length < 2') 20 | }) 21 | }) 22 | 23 | 
describe('byteLevelProcessing', () => { 24 | it('instantiates correctly without any parameter', () => { 25 | const processor = byteLevelProcessing() 26 | expect(processor.constructor.name).toEqual('Processor') 27 | }) 28 | 29 | it('accepts `undefined` as first parameter', () => { 30 | expect(byteLevelProcessing(undefined)).toBeDefined() 31 | }) 32 | 33 | it('accepts `boolean` as first parameter', () => { 34 | expect(byteLevelProcessing(true)).toBeDefined() 35 | }) 36 | }) 37 | 38 | describe('robertaProcessing', () => { 39 | it('instantiates correctly with only two parameters', () => { 40 | const processor = robertaProcessing(['sep', 1], ['cls', 2]) 41 | expect(processor.constructor.name).toEqual('Processor') 42 | }) 43 | 44 | it('accepts `undefined` as third and fourth parameters', () => { 45 | expect(robertaProcessing(['sep', 1], ['cls', 2], undefined, undefined)).toBeDefined() 46 | }) 47 | 48 | it('accepts `boolean` as third and fourth parameter', () => { 49 | expect(robertaProcessing(['sep', 1], ['cls', 2], true, true)).toBeDefined() 50 | }) 51 | }) 52 | 53 | describe('templateProcessing', () => { 54 | it('instantiates correctly with only a single template', () => { 55 | const processor = templateProcessing('$A $A') 56 | expect(processor.constructor.name).toEqual('Processor') 57 | }) 58 | 59 | it('throws if special tokens are missing', () => { 60 | expect(() => templateProcessing('[CLS] $A [SEP]')).toThrow('Missing SpecialToken(s) with id(s)') 61 | }) 62 | 63 | it('instantiates correctly with both templates', () => { 64 | const processor = templateProcessing('[CLS] $A [SEP]', '[CLS] $A [SEP] $B:1 [SEP]:1', [ 65 | ['[CLS]', 1], 66 | ['[SEP]', 2], 67 | ]) 68 | expect(processor.constructor.name).toEqual('Processor') 69 | }) 70 | }) 71 | 72 | describe('sequenceProcessing', () => { 73 | it('accepts `PostProcessor[]` as first parameter', () => { 74 | const template = templateProcessing('[CLS] $A [SEP]', '[CLS] $A [SEP] $B:1 [SEP]:1', [ 75 | ['[CLS]', 1], 76 | ['[SEP]', 2], 77 | ]) 78 | const bytelevel = byteLevelProcessing(true) 79 | expect(sequenceProcessing([bytelevel, template])).toBeDefined() 80 | }) 81 | }) 82 | -------------------------------------------------------------------------------- /bindings/node/lib/bindings/pre-tokenizers.test.ts: -------------------------------------------------------------------------------- 1 | import { 2 | byteLevelPreTokenizer, 3 | metaspacePreTokenizer, 4 | punctuationPreTokenizer, 5 | sequencePreTokenizer, 6 | splitPreTokenizer, 7 | whitespaceSplitPreTokenizer, 8 | } from '../../' 9 | 10 | describe('byteLevelPreTokenizer', () => { 11 | it('instantiates correctly', () => { 12 | const processor = byteLevelPreTokenizer() 13 | expect(processor.constructor.name).toEqual('PreTokenizer') 14 | }) 15 | }) 16 | 17 | describe('metaspacePreTokenizer', () => { 18 | it('instantiates correctly without any parameter', () => { 19 | const processor = metaspacePreTokenizer() 20 | expect(processor.constructor.name).toEqual('PreTokenizer') 21 | }) 22 | 23 | it('accepts `undefined` as first parameter', () => { 24 | expect(metaspacePreTokenizer(undefined)).toBeDefined() 25 | }) 26 | 27 | it('accepts `undefined` as second parameter', () => { 28 | expect(metaspacePreTokenizer('t', undefined)).toBeDefined() 29 | }) 30 | 31 | it('can pre-tokenize strings', () => { 32 | const pretok = metaspacePreTokenizer() 33 | expect(pretok.preTokenizeString('Hello there friend')).toEqual([ 34 | ['▁Hello', [0, 5]], 35 | ['▁there', [5, 11]], 36 | ['▁friend', [11, 18]], 37 | ]) 38 | }) 39 | }) 
40 | 41 | describe('punctuationPreTokenizer', () => { 42 | it('instantiates correctly without any parameter', () => { 43 | const processor = punctuationPreTokenizer() 44 | expect(processor.constructor.name).toEqual('PreTokenizer') 45 | }) 46 | 47 | it('instantiates correctly with non-default split delimeter', () => { 48 | const processor = punctuationPreTokenizer('removed') 49 | expect(processor.constructor.name).toEqual('PreTokenizer') 50 | }) 51 | }) 52 | 53 | describe('splitPreTokenizer', () => { 54 | it('instantiates correctly with invert parameter', () => { 55 | const processor = splitPreTokenizer(' ', 'mergedWithPrevious', false) 56 | expect(processor.constructor.name).toEqual('PreTokenizer') 57 | }) 58 | }) 59 | 60 | describe('sequencePreTokenizer', () => { 61 | it('instantiates correctly', () => { 62 | const punctuation = punctuationPreTokenizer() 63 | const whitespace = whitespaceSplitPreTokenizer() 64 | const sequence2 = sequencePreTokenizer([]) 65 | expect(sequence2.constructor.name).toEqual('PreTokenizer') 66 | const sequence3 = sequencePreTokenizer([punctuation, whitespace]) 67 | expect(sequence3.constructor.name).toEqual('PreTokenizer') 68 | }) 69 | }) 70 | -------------------------------------------------------------------------------- /bindings/node/npm/android-arm-eabi/README.md: -------------------------------------------------------------------------------- 1 | # `tokenizers-android-arm-eabi` 2 | 3 | This is the **armv7-linux-androideabi** binary for `tokenizers` 4 | -------------------------------------------------------------------------------- /bindings/node/npm/android-arm-eabi/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tokenizers-android-arm-eabi", 3 | "version": "0.13.4-rc1", 4 | "os": [ 5 | "android" 6 | ], 7 | "cpu": [ 8 | "arm" 9 | ], 10 | "main": "tokenizers.android-arm-eabi.node", 11 | "files": [ 12 | "tokenizers.android-arm-eabi.node" 13 | ], 14 | "description": "Tokenizers platform specific bindings", 15 | "keywords": [ 16 | "napi-rs", 17 | "NAPI", 18 | "N-API", 19 | "Rust", 20 | "node-addon", 21 | "node-addon-api" 22 | ], 23 | "license": "MIT", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "publishConfig": { 28 | "registry": "https://registry.npmjs.org/", 29 | "access": "public" 30 | }, 31 | "repository": "tokenizers" 32 | } -------------------------------------------------------------------------------- /bindings/node/npm/android-arm64/README.md: -------------------------------------------------------------------------------- 1 | # `tokenizers-android-arm64` 2 | 3 | This is the **aarch64-linux-android** binary for `tokenizers` 4 | -------------------------------------------------------------------------------- /bindings/node/npm/android-arm64/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tokenizers-android-arm64", 3 | "version": "0.13.4-rc1", 4 | "os": [ 5 | "android" 6 | ], 7 | "cpu": [ 8 | "arm64" 9 | ], 10 | "main": "tokenizers.android-arm64.node", 11 | "files": [ 12 | "tokenizers.android-arm64.node" 13 | ], 14 | "description": "Tokenizers platform specific bindings", 15 | "keywords": [ 16 | "napi-rs", 17 | "NAPI", 18 | "N-API", 19 | "Rust", 20 | "node-addon", 21 | "node-addon-api" 22 | ], 23 | "license": "MIT", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "publishConfig": { 28 | "registry": "https://registry.npmjs.org/", 29 | "access": "public" 30 | }, 31 | "repository": "tokenizers" 32 | } 
-------------------------------------------------------------------------------- /bindings/node/npm/darwin-arm64/README.md: -------------------------------------------------------------------------------- 1 | # `tokenizers-darwin-arm64` 2 | 3 | This is the **aarch64-apple-darwin** binary for `tokenizers` 4 | -------------------------------------------------------------------------------- /bindings/node/npm/darwin-arm64/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tokenizers-darwin-arm64", 3 | "version": "0.13.4-rc1", 4 | "os": [ 5 | "darwin" 6 | ], 7 | "cpu": [ 8 | "arm64" 9 | ], 10 | "main": "tokenizers.darwin-arm64.node", 11 | "files": [ 12 | "tokenizers.darwin-arm64.node" 13 | ], 14 | "description": "Tokenizers platform specific bindings", 15 | "keywords": [ 16 | "napi-rs", 17 | "NAPI", 18 | "N-API", 19 | "Rust", 20 | "node-addon", 21 | "node-addon-api" 22 | ], 23 | "license": "MIT", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "publishConfig": { 28 | "registry": "https://registry.npmjs.org/", 29 | "access": "public" 30 | }, 31 | "repository": "tokenizers" 32 | } -------------------------------------------------------------------------------- /bindings/node/npm/darwin-x64/README.md: -------------------------------------------------------------------------------- 1 | # `tokenizers-darwin-x64` 2 | 3 | This is the **x86_64-apple-darwin** binary for `tokenizers` 4 | -------------------------------------------------------------------------------- /bindings/node/npm/darwin-x64/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tokenizers-darwin-x64", 3 | "version": "0.13.4-rc1", 4 | "os": [ 5 | "darwin" 6 | ], 7 | "cpu": [ 8 | "x64" 9 | ], 10 | "main": "tokenizers.darwin-x64.node", 11 | "files": [ 12 | "tokenizers.darwin-x64.node" 13 | ], 14 | "description": "Tokenizers platform specific bindings", 15 | "keywords": [ 16 | "napi-rs", 17 | "NAPI", 18 | "N-API", 19 | "Rust", 20 | "node-addon", 21 | "node-addon-api" 22 | ], 23 | "license": "MIT", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "publishConfig": { 28 | "registry": "https://registry.npmjs.org/", 29 | "access": "public" 30 | }, 31 | "repository": "tokenizers" 32 | } -------------------------------------------------------------------------------- /bindings/node/npm/freebsd-x64/README.md: -------------------------------------------------------------------------------- 1 | # `tokenizers-freebsd-x64` 2 | 3 | This is the **x86_64-unknown-freebsd** binary for `tokenizers` 4 | -------------------------------------------------------------------------------- /bindings/node/npm/freebsd-x64/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tokenizers-freebsd-x64", 3 | "version": "0.13.4-rc1", 4 | "os": [ 5 | "freebsd" 6 | ], 7 | "cpu": [ 8 | "x64" 9 | ], 10 | "main": "tokenizers.freebsd-x64.node", 11 | "files": [ 12 | "tokenizers.freebsd-x64.node" 13 | ], 14 | "description": "Tokenizers platform specific bindings", 15 | "keywords": [ 16 | "napi-rs", 17 | "NAPI", 18 | "N-API", 19 | "Rust", 20 | "node-addon", 21 | "node-addon-api" 22 | ], 23 | "license": "MIT", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "publishConfig": { 28 | "registry": "https://registry.npmjs.org/", 29 | "access": "public" 30 | }, 31 | "repository": "tokenizers" 32 | } -------------------------------------------------------------------------------- 
/bindings/node/npm/linux-arm-gnueabihf/README.md: -------------------------------------------------------------------------------- 1 | # `tokenizers-linux-arm-gnueabihf` 2 | 3 | This is the **armv7-unknown-linux-gnueabihf** binary for `tokenizers` 4 | -------------------------------------------------------------------------------- /bindings/node/npm/linux-arm-gnueabihf/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tokenizers-linux-arm-gnueabihf", 3 | "version": "0.13.4-rc1", 4 | "os": [ 5 | "linux" 6 | ], 7 | "cpu": [ 8 | "arm" 9 | ], 10 | "main": "tokenizers.linux-arm-gnueabihf.node", 11 | "files": [ 12 | "tokenizers.linux-arm-gnueabihf.node" 13 | ], 14 | "description": "Tokenizers platform specific bindings", 15 | "keywords": [ 16 | "napi-rs", 17 | "NAPI", 18 | "N-API", 19 | "Rust", 20 | "node-addon", 21 | "node-addon-api" 22 | ], 23 | "license": "MIT", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "publishConfig": { 28 | "registry": "https://registry.npmjs.org/", 29 | "access": "public" 30 | }, 31 | "repository": "tokenizers" 32 | } -------------------------------------------------------------------------------- /bindings/node/npm/linux-arm64-gnu/README.md: -------------------------------------------------------------------------------- 1 | # `tokenizers-linux-arm64-gnu` 2 | 3 | This is the **aarch64-unknown-linux-gnu** binary for `tokenizers` 4 | -------------------------------------------------------------------------------- /bindings/node/npm/linux-arm64-gnu/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tokenizers-linux-arm64-gnu", 3 | "version": "0.13.4-rc1", 4 | "os": [ 5 | "linux" 6 | ], 7 | "cpu": [ 8 | "arm64" 9 | ], 10 | "main": "tokenizers.linux-arm64-gnu.node", 11 | "files": [ 12 | "tokenizers.linux-arm64-gnu.node" 13 | ], 14 | "description": "Tokenizers platform specific bindings", 15 | "keywords": [ 16 | "napi-rs", 17 | "NAPI", 18 | "N-API", 19 | "Rust", 20 | "node-addon", 21 | "node-addon-api" 22 | ], 23 | "license": "MIT", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "publishConfig": { 28 | "registry": "https://registry.npmjs.org/", 29 | "access": "public" 30 | }, 31 | "repository": "tokenizers", 32 | "libc": [ 33 | "glibc" 34 | ] 35 | } -------------------------------------------------------------------------------- /bindings/node/npm/linux-arm64-musl/README.md: -------------------------------------------------------------------------------- 1 | # `tokenizers-linux-arm64-musl` 2 | 3 | This is the **aarch64-unknown-linux-musl** binary for `tokenizers` 4 | -------------------------------------------------------------------------------- /bindings/node/npm/linux-arm64-musl/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tokenizers-linux-arm64-musl", 3 | "version": "0.13.4-rc1", 4 | "os": [ 5 | "linux" 6 | ], 7 | "cpu": [ 8 | "arm64" 9 | ], 10 | "main": "tokenizers.linux-arm64-musl.node", 11 | "files": [ 12 | "tokenizers.linux-arm64-musl.node" 13 | ], 14 | "description": "Tokenizers platform specific bindings", 15 | "keywords": [ 16 | "napi-rs", 17 | "NAPI", 18 | "N-API", 19 | "Rust", 20 | "node-addon", 21 | "node-addon-api" 22 | ], 23 | "license": "MIT", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "publishConfig": { 28 | "registry": "https://registry.npmjs.org/", 29 | "access": "public" 30 | }, 31 | "repository": "tokenizers", 32 | "libc": [ 33 | "musl" 34 
| ] 35 | } -------------------------------------------------------------------------------- /bindings/node/npm/linux-x64-gnu/README.md: -------------------------------------------------------------------------------- 1 | # `tokenizers-linux-x64-gnu` 2 | 3 | This is the **x86_64-unknown-linux-gnu** binary for `tokenizers` 4 | -------------------------------------------------------------------------------- /bindings/node/npm/linux-x64-gnu/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tokenizers-linux-x64-gnu", 3 | "version": "0.13.4-rc1", 4 | "os": [ 5 | "linux" 6 | ], 7 | "cpu": [ 8 | "x64" 9 | ], 10 | "main": "tokenizers.linux-x64-gnu.node", 11 | "files": [ 12 | "tokenizers.linux-x64-gnu.node" 13 | ], 14 | "description": "Tokenizers platform specific bindings", 15 | "keywords": [ 16 | "napi-rs", 17 | "NAPI", 18 | "N-API", 19 | "Rust", 20 | "node-addon", 21 | "node-addon-api" 22 | ], 23 | "license": "MIT", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "publishConfig": { 28 | "registry": "https://registry.npmjs.org/", 29 | "access": "public" 30 | }, 31 | "repository": "tokenizers", 32 | "libc": [ 33 | "glibc" 34 | ] 35 | } -------------------------------------------------------------------------------- /bindings/node/npm/linux-x64-musl/README.md: -------------------------------------------------------------------------------- 1 | # `tokenizers-linux-x64-musl` 2 | 3 | This is the **x86_64-unknown-linux-musl** binary for `tokenizers` 4 | -------------------------------------------------------------------------------- /bindings/node/npm/linux-x64-musl/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tokenizers-linux-x64-musl", 3 | "version": "0.13.4-rc1", 4 | "os": [ 5 | "linux" 6 | ], 7 | "cpu": [ 8 | "x64" 9 | ], 10 | "main": "tokenizers.linux-x64-musl.node", 11 | "files": [ 12 | "tokenizers.linux-x64-musl.node" 13 | ], 14 | "description": "Tokenizers platform specific bindings", 15 | "keywords": [ 16 | "napi-rs", 17 | "NAPI", 18 | "N-API", 19 | "Rust", 20 | "node-addon", 21 | "node-addon-api" 22 | ], 23 | "license": "MIT", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "publishConfig": { 28 | "registry": "https://registry.npmjs.org/", 29 | "access": "public" 30 | }, 31 | "repository": "tokenizers", 32 | "libc": [ 33 | "musl" 34 | ] 35 | } -------------------------------------------------------------------------------- /bindings/node/npm/win32-arm64-msvc/README.md: -------------------------------------------------------------------------------- 1 | # `tokenizers-win32-arm64-msvc` 2 | 3 | This is the **aarch64-pc-windows-msvc** binary for `tokenizers` 4 | -------------------------------------------------------------------------------- /bindings/node/npm/win32-arm64-msvc/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tokenizers-win32-arm64-msvc", 3 | "version": "0.13.4-rc1", 4 | "os": [ 5 | "win32" 6 | ], 7 | "cpu": [ 8 | "arm64" 9 | ], 10 | "main": "tokenizers.win32-arm64-msvc.node", 11 | "files": [ 12 | "tokenizers.win32-arm64-msvc.node" 13 | ], 14 | "description": "Tokenizers platform specific bindings", 15 | "keywords": [ 16 | "napi-rs", 17 | "NAPI", 18 | "N-API", 19 | "Rust", 20 | "node-addon", 21 | "node-addon-api" 22 | ], 23 | "license": "MIT", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "publishConfig": { 28 | "registry": "https://registry.npmjs.org/", 29 | "access": 
"public" 30 | }, 31 | "repository": "tokenizers" 32 | } -------------------------------------------------------------------------------- /bindings/node/npm/win32-ia32-msvc/README.md: -------------------------------------------------------------------------------- 1 | # `tokenizers-win32-ia32-msvc` 2 | 3 | This is the **i686-pc-windows-msvc** binary for `tokenizers` 4 | -------------------------------------------------------------------------------- /bindings/node/npm/win32-ia32-msvc/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tokenizers-win32-ia32-msvc", 3 | "version": "0.13.4-rc1", 4 | "os": [ 5 | "win32" 6 | ], 7 | "cpu": [ 8 | "ia32" 9 | ], 10 | "main": "tokenizers.win32-ia32-msvc.node", 11 | "files": [ 12 | "tokenizers.win32-ia32-msvc.node" 13 | ], 14 | "description": "Tokenizers platform specific bindings", 15 | "keywords": [ 16 | "napi-rs", 17 | "NAPI", 18 | "N-API", 19 | "Rust", 20 | "node-addon", 21 | "node-addon-api" 22 | ], 23 | "license": "MIT", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "publishConfig": { 28 | "registry": "https://registry.npmjs.org/", 29 | "access": "public" 30 | }, 31 | "repository": "tokenizers" 32 | } -------------------------------------------------------------------------------- /bindings/node/npm/win32-x64-msvc/README.md: -------------------------------------------------------------------------------- 1 | # `tokenizers-win32-x64-msvc` 2 | 3 | This is the **x86_64-pc-windows-msvc** binary for `tokenizers` 4 | -------------------------------------------------------------------------------- /bindings/node/npm/win32-x64-msvc/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tokenizers-win32-x64-msvc", 3 | "version": "0.13.4-rc1", 4 | "os": [ 5 | "win32" 6 | ], 7 | "cpu": [ 8 | "x64" 9 | ], 10 | "main": "tokenizers.win32-x64-msvc.node", 11 | "files": [ 12 | "tokenizers.win32-x64-msvc.node" 13 | ], 14 | "description": "Tokenizers platform specific bindings", 15 | "keywords": [ 16 | "napi-rs", 17 | "NAPI", 18 | "N-API", 19 | "Rust", 20 | "node-addon", 21 | "node-addon-api" 22 | ], 23 | "license": "MIT", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "publishConfig": { 28 | "registry": "https://registry.npmjs.org/", 29 | "access": "public" 30 | }, 31 | "repository": "tokenizers" 32 | } -------------------------------------------------------------------------------- /bindings/node/rustfmt.toml: -------------------------------------------------------------------------------- 1 | tab_spaces = 2 2 | -------------------------------------------------------------------------------- /bindings/node/src/arc_rwlock_serde.rs: -------------------------------------------------------------------------------- 1 | use serde::de::Deserializer; 2 | use serde::ser::Serializer; 3 | use serde::{Deserialize, Serialize}; 4 | use std::sync::{Arc, RwLock}; 5 | 6 | pub fn serialize<S, T>(val: &Option<Arc<RwLock<T>>>, s: S) -> Result<S::Ok, S::Error> 7 | where 8 | S: Serializer, 9 | T: Serialize, 10 | { 11 | T::serialize(&*(val.clone().unwrap()).read().unwrap(), s) 12 | } 13 | 14 | pub fn deserialize<'de, D, T>(d: D) -> Result<Option<Arc<RwLock<T>>>, D::Error> 15 | where 16 | D: Deserializer<'de>, 17 | T: Deserialize<'de>, 18 | { 19 | Ok(Some(Arc::new(RwLock::new(T::deserialize(d)?)))) 20 | } 21 | -------------------------------------------------------------------------------- /bindings/node/src/lib.rs: 
-------------------------------------------------------------------------------- 1 | #![deny(clippy::all)] 2 | 3 | pub const VERSION: &str = env!("CARGO_PKG_VERSION"); 4 | 5 | mod arc_rwlock_serde; 6 | pub mod decoders; 7 | pub mod encoding; 8 | pub mod models; 9 | pub mod normalizers; 10 | pub mod pre_tokenizers; 11 | pub mod processors; 12 | pub mod tasks; 13 | pub mod tokenizer; 14 | pub mod trainers; 15 | pub mod utils; 16 | -------------------------------------------------------------------------------- /bindings/node/src/tasks/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod models; 2 | pub mod tokenizer; 3 | -------------------------------------------------------------------------------- /bindings/node/src/tasks/models.rs: -------------------------------------------------------------------------------- 1 | extern crate tokenizers as tk; 2 | 3 | use crate::models::Model; 4 | use napi::bindgen_prelude::*; 5 | use std::sync::{Arc, RwLock}; 6 | use tokenizers::models::bpe::{BpeBuilder, BPE}; 7 | use tokenizers::models::wordlevel::{WordLevel, WordLevelBuilder}; 8 | use tokenizers::models::wordpiece::{WordPiece, WordPieceBuilder}; 9 | 10 | pub struct BPEFromFilesTask { 11 | pub(crate) builder: Option<BpeBuilder>, 12 | } 13 | 14 | impl Task for BPEFromFilesTask { 15 | type Output = BPE; 16 | type JsValue = Model; 17 | 18 | fn compute(&mut self) -> Result<Self::Output> { 19 | self 20 | .builder 21 | .take() 22 | .ok_or(Error::from_reason("Empty builder".to_string()))? 23 | .build() 24 | .map_err(|e| Error::from_reason(format!("{e}"))) 25 | } 26 | 27 | fn resolve(&mut self, _env: Env, output: Self::Output) -> Result<Self::JsValue> { 28 | Ok(Model { 29 | model: Some(Arc::new(RwLock::new(output.into()))), 30 | }) 31 | } 32 | } 33 | 34 | pub struct WordPieceFromFilesTask { 35 | pub(crate) builder: Option<WordPieceBuilder>, 36 | } 37 | 38 | impl Task for WordPieceFromFilesTask { 39 | type Output = WordPiece; 40 | type JsValue = Model; 41 | 42 | fn compute(&mut self) -> Result<Self::Output> { 43 | self 44 | .builder 45 | .take() 46 | .ok_or(Error::from_reason("Empty builder".to_string()))? 47 | .build() 48 | .map_err(|e| Error::from_reason(format!("{e}"))) 49 | } 50 | 51 | fn resolve(&mut self, _env: Env, output: Self::Output) -> Result<Self::JsValue> { 52 | Ok(Model { 53 | model: Some(Arc::new(RwLock::new(output.into()))), 54 | }) 55 | } 56 | } 57 | pub struct WordLevelFromFilesTask { 58 | pub(crate) builder: Option<WordLevelBuilder>, 59 | } 60 | 61 | impl Task for WordLevelFromFilesTask { 62 | type Output = WordLevel; 63 | type JsValue = Model; 64 | 65 | fn compute(&mut self) -> Result<Self::Output> { 66 | self 67 | .builder 68 | .take() 69 | .ok_or(Error::from_reason("Empty builder".to_string()))? 
70 | .build() 71 | .map_err(|e| Error::from_reason(format!("{e}"))) 72 | } 73 | 74 | fn resolve(&mut self, _env: Env, output: Self::Output) -> Result<Self::JsValue> { 75 | Ok(Model { 76 | model: Some(Arc::new(RwLock::new(output.into()))), 77 | }) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /bindings/node/src/trainers.rs: -------------------------------------------------------------------------------- 1 | use crate::models::Model; 2 | use napi_derive::napi; 3 | use std::sync::{Arc, RwLock}; 4 | use tokenizers as tk; 5 | use tokenizers::models::TrainerWrapper; 6 | 7 | #[napi] 8 | pub struct Trainer { 9 | trainer: Option<Arc<RwLock<TrainerWrapper>>>, 10 | } 11 | 12 | impl From<TrainerWrapper> for Trainer { 13 | fn from(trainer: TrainerWrapper) -> Self { 14 | Self { 15 | trainer: Some(Arc::new(RwLock::new(trainer))), 16 | } 17 | } 18 | } 19 | 20 | impl tk::Trainer for Trainer { 21 | type Model = Model; 22 | 23 | fn should_show_progress(&self) -> bool { 24 | self 25 | .trainer 26 | .as_ref() 27 | .expect("Uninitialized Trainer") 28 | .read() 29 | .unwrap() 30 | .should_show_progress() 31 | } 32 | 33 | fn train(&self, model: &mut Self::Model) -> tk::Result<Vec<tk::AddedToken>> { 34 | let special_tokens = self 35 | .trainer 36 | .as_ref() 37 | .ok_or("Uninitialized Trainer")? 38 | .read() 39 | .unwrap() 40 | .train( 41 | &mut model 42 | .model 43 | .as_ref() 44 | .ok_or("Uninitialized Model")? 45 | .write() 46 | .unwrap(), 47 | )?; 48 | 49 | Ok(special_tokens) 50 | } 51 | 52 | fn feed<I, S, F>(&mut self, iterator: I, process: F) -> tk::Result<()> 53 | where 54 | I: Iterator<Item = S> + Send, 55 | S: AsRef<str> + Send, 56 | F: Fn(&str) -> tk::Result<Vec<String>> + Sync, 57 | { 58 | self 59 | .trainer 60 | .as_ref() 61 | .ok_or("Uninitialized Trainer")? 
62 | .write() 63 | .unwrap() 64 | .feed(iterator, process) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /bindings/node/src/utils.rs: -------------------------------------------------------------------------------- 1 | use napi::bindgen_prelude::*; 2 | use napi_derive::napi; 3 | use tokenizers as tk; 4 | use tokenizers::Encoding; 5 | 6 | use crate::encoding::JsEncoding; 7 | 8 | #[napi] 9 | pub fn slice(s: String, begin_index: Option<i32>, end_index: Option<i32>) -> Result<String> { 10 | let len = s.chars().count(); 11 | 12 | let get_index = |x: i32| -> usize { 13 | if x >= 0 { 14 | x as usize 15 | } else { 16 | (len as i32 + x) as usize 17 | } 18 | }; 19 | 20 | let begin_index = get_index(begin_index.unwrap_or(0)); 21 | let end_index = get_index(end_index.unwrap_or(len as i32)); 22 | 23 | if let Some(slice) = tk::tokenizer::normalizer::get_range_of(&s, begin_index..end_index) { 24 | Ok(slice.to_string()) 25 | } else { 26 | Err(Error::new( 27 | Status::GenericFailure, 28 | "Error in offsets".to_string(), 29 | )) 30 | } 31 | } 32 | 33 | #[napi] 34 | pub fn merge_encodings( 35 | encodings: Vec<&JsEncoding>, 36 | growing_offsets: Option<bool>, 37 | ) -> Result<JsEncoding> { 38 | let growing_offsets = growing_offsets.unwrap_or(false); 39 | 40 | let encodings: Vec<_> = encodings 41 | .into_iter() 42 | .map(|enc| enc.encoding.to_owned().unwrap()) 43 | .collect(); 44 | 45 | let new_encoding = Encoding::merge(encodings, growing_offsets); 46 | let js_encoding = JsEncoding { 47 | encoding: Some(new_encoding), 48 | }; 49 | 50 | Ok(js_encoding) 51 | } 52 | -------------------------------------------------------------------------------- /bindings/node/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2018", 4 | "strict": true, 5 | "moduleResolution": "node", 6 | "module": "CommonJS", 7 | "noUnusedLocals": true, 8 | "noUnusedParameters": true, 9 | "esModuleInterop": true, 10 | "allowSyntheticDefaultImports": true 11 | }, 12 | "include": ["."], 13 | "exclude": ["node_modules"] 14 | } 15 | -------------------------------------------------------------------------------- /bindings/node/types.ts: -------------------------------------------------------------------------------- 1 | export type TextInputSequence = string 2 | export type PreTokenizedInputSequence = string[] 3 | export type InputSequence = TextInputSequence | PreTokenizedInputSequence 4 | 5 | export type TextEncodeInput = TextInputSequence | [TextInputSequence, TextInputSequence] 6 | export type PreTokenizedEncodeInput = PreTokenizedInputSequence | [PreTokenizedInputSequence, PreTokenizedInputSequence] 7 | export type EncodeInput = TextEncodeInput | PreTokenizedEncodeInput 8 | -------------------------------------------------------------------------------- /bindings/python/.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.x86_64-apple-darwin] 2 | rustflags = [ 3 | "-C", "link-arg=-undefined", 4 | "-C", "link-arg=dynamic_lookup", 5 | "-C", "link-arg=-mmacosx-version-min=10.11", 6 | ] 7 | 8 | [target.aarch64-apple-darwin] 9 | rustflags = [ 10 | "-C", "link-arg=-undefined", 11 | "-C", "link-arg=dynamic_lookup", 12 | "-C", "link-arg=-mmacosx-version-min=10.11", 13 | ] 14 | -------------------------------------------------------------------------------- /bindings/python/.gitignore: 
-------------------------------------------------------------------------------- 1 | data 2 | -------------------------------------------------------------------------------- /bindings/python/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tokenizers-python" 3 | version = "0.21.4-dev.0" 4 | authors = ["Anthony MOI <m.anthony.moi@gmail.com>"] 5 | edition = "2021" 6 | 7 | [lib] 8 | name = "tokenizers" 9 | crate-type = ["cdylib"] 10 | 11 | [dependencies] 12 | rayon = "1.10" 13 | serde = { version = "1.0", features = ["rc", "derive"] } 14 | serde_json = "1.0" 15 | libc = "0.2" 16 | env_logger = "0.11" 17 | pyo3 = { version = "0.25", features = ["abi3", "abi3-py39", "py-clone"] } 18 | numpy = "0.25" 19 | ndarray = "0.16" 20 | itertools = "0.14" 21 | ahash = { version = "0.8.11", features = ["serde"] } 22 | 23 | [dependencies.tokenizers] 24 | path = "../../tokenizers" 25 | 26 | [dev-dependencies] 27 | tempfile = "3.10" 28 | pyo3 = { version = "0.25", features = ["auto-initialize"] } 29 | 30 | [features] 31 | default = ["pyo3/extension-module"] 32 | -------------------------------------------------------------------------------- /bindings/python/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include Cargo.toml 2 | include pyproject.toml 3 | include rust-toolchain 4 | include ../../LICENSE 5 | recursive-include src * 6 | recursive-include tokenizers-lib * 7 | recursive-exclude tokenizers-lib/target * 8 | -------------------------------------------------------------------------------- /bindings/python/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style check-style test 2 | 3 | DATA_DIR = data 4 | 5 | dir_guard=@mkdir -p $(@D) 6 | check_dirs := examples py_src/tokenizers tests 7 | 8 | # Format source code automatically 9 | style: 10 | python stub.py 11 | ruff check $(check_dirs) --fix 12 | ruff format $(check_dirs) 13 | 14 | # Check the source code is formatted correctly 15 | check-style: 16 | python stub.py --check 17 | ruff check $(check_dirs) 18 | ruff format --check $(check_dirs) 19 | 20 | TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json 21 | 22 | # Launch the test suite 23 | test: $(TESTS_RESOURCES) 24 | pip install pytest requests setuptools_rust numpy pyarrow datasets 25 | python -m pytest -s -v tests 26 | cargo test --no-default-features 27 | 28 | $(DATA_DIR)/big.txt : 29 | $(dir_guard) 30 | wget https://norvig.com/big.txt -O $@ 31 | 32 | $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt 33 | head -100 $(DATA_DIR)/big.txt > $@ 34 | 35 | $(DATA_DIR)/roberta.json : 36 | $(dir_guard) 37 | wget https://huggingface.co/roberta-large/raw/main/tokenizer.json -O $@ 38 | -------------------------------------------------------------------------------- /bindings/python/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def pytest_addoption(parser): 5 | parser.addoption("--runslow", action="store_true", default=False, help="run slow tests") 6 | 7 | 8 | def pytest_configure(config): 9 | config.addinivalue_line("markers", "slow: mark test as slow to run") 10 | 11 | 12 | def pytest_collection_modifyitems(config, items): 13 | if config.getoption("--runslow"): 14 | # --runslow given in cli: do not skip slow tests 15 | return 16 | skip_slow = pytest.mark.skip(reason="need --runslow option to run") 17 | for item in items: 18 | if "slow" in item.keywords: 
19 | item.add_marker(skip_slow) 20 | -------------------------------------------------------------------------------- /bindings/python/examples/train_bert_wordpiece.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | 4 | from tokenizers import BertWordPieceTokenizer 5 | 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument( 9 | "--files", 10 | default=None, 11 | metavar="path", 12 | type=str, 13 | required=True, 14 | help="The files to use as training; accept '**/*.txt' type of patterns \ 15 | if enclosed in quotes", 16 | ) 17 | parser.add_argument( 18 | "--out", 19 | default="./", 20 | type=str, 21 | help="Path to the output directory, where the files will be saved", 22 | ) 23 | parser.add_argument("--name", default="bert-wordpiece", type=str, help="The name of the output vocab files") 24 | args = parser.parse_args() 25 | 26 | files = glob.glob(args.files) 27 | if not files: 28 | print(f"File does not exist: {args.files}") 29 | exit(1) 30 | 31 | 32 | # Initialize an empty tokenizer 33 | tokenizer = BertWordPieceTokenizer( 34 | clean_text=True, 35 | handle_chinese_chars=True, 36 | strip_accents=True, 37 | lowercase=True, 38 | ) 39 | 40 | # And then train 41 | tokenizer.train( 42 | files, 43 | vocab_size=10000, 44 | min_frequency=2, 45 | show_progress=True, 46 | special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"], 47 | limit_alphabet=1000, 48 | wordpieces_prefix="##", 49 | ) 50 | 51 | # Save the files 52 | tokenizer.save_model(args.out, args.name) 53 | -------------------------------------------------------------------------------- /bindings/python/examples/train_bytelevel_bpe.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | from os.path import join 4 | 5 | from tokenizers import ByteLevelBPETokenizer 6 | 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument( 10 | "--files", 11 | default=None, 12 | metavar="path", 13 | type=str, 14 | required=True, 15 | help="The files to use as training; accept '**/*.txt' type of patterns \ 16 | if enclosed in quotes", 17 | ) 18 | parser.add_argument( 19 | "--out", 20 | default="./", 21 | type=str, 22 | help="Path to the output directory, where the files will be saved", 23 | ) 24 | parser.add_argument("--name", default="bpe-bytelevel", type=str, help="The name of the output vocab files") 25 | args = parser.parse_args() 26 | 27 | files = glob.glob(args.files) 28 | if not files: 29 | print(f"File does not exist: {args.files}") 30 | exit(1) 31 | 32 | 33 | # Initialize an empty tokenizer 34 | tokenizer = ByteLevelBPETokenizer(add_prefix_space=True) 35 | 36 | # And then train 37 | tokenizer.train( 38 | files, 39 | vocab_size=10000, 40 | min_frequency=2, 41 | show_progress=True, 42 | special_tokens=["<s>", "<pad>", "</s>"], 43 | ) 44 | 45 | # Save the files 46 | tokenizer.save_model(args.out, args.name) 47 | 48 | # Restoring model from learned vocab/merges 49 | tokenizer = ByteLevelBPETokenizer( 50 | join(args.out, "{}-vocab.json".format(args.name)), 51 | join(args.out, "{}-merges.txt".format(args.name)), 52 | add_prefix_space=True, 53 | ) 54 | 55 | # Test encoding 56 | print(tokenizer.encode("Training ByteLevel BPE is very easy").tokens) 57 | -------------------------------------------------------------------------------- /bindings/python/examples/train_with_datasets.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | 
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers 4 | 5 | 6 | # Build a tokenizer 7 | bpe_tokenizer = Tokenizer(models.BPE()) 8 | bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace() 9 | bpe_tokenizer.normalizer = normalizers.Lowercase() 10 | 11 | # Initialize a dataset 12 | dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train") 13 | 14 | 15 | # Build an iterator over this dataset 16 | def batch_iterator(): 17 | batch_size = 1000 18 | for batch in dataset.iter(batch_size=batch_size): 19 | yield batch["text"] 20 | 21 | 22 | # And finally train 23 | bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset)) 24 | -------------------------------------------------------------------------------- /bindings/python/py_src/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import List, Tuple, Union 3 | 4 | 5 | Offsets = Tuple[int, int] 6 | 7 | TextInputSequence = str 8 | """A :obj:`str` that represents an input sequence """ 9 | 10 | PreTokenizedInputSequence = Union[List[str], Tuple[str]] 11 | """A pre-tokenized input sequence. Can be one of: 12 | 13 | - A :obj:`List` of :obj:`str` 14 | - A :obj:`Tuple` of :obj:`str` 15 | """ 16 | 17 | TextEncodeInput = Union[ 18 | TextInputSequence, 19 | Tuple[TextInputSequence, TextInputSequence], 20 | List[TextInputSequence], 21 | ] 22 | """Represents a textual input for encoding. Can be either: 23 | 24 | - A single sequence: :data:`~tokenizers.TextInputSequence` 25 | - A pair of sequences: 26 | 27 | - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence` 28 | - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2 29 | """ 30 | 31 | PreTokenizedEncodeInput = Union[ 32 | PreTokenizedInputSequence, 33 | Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence], 34 | List[PreTokenizedInputSequence], 35 | ] 36 | """Represents a pre-tokenized input for encoding. Can be either: 37 | 38 | - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence` 39 | - A pair of sequences: 40 | 41 | - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence` 42 | - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2 43 | """ 44 | 45 | InputSequence = Union[TextInputSequence, PreTokenizedInputSequence] 46 | """Represents all the possible types of input sequences for encoding. Can be: 47 | 48 | - When ``is_pretokenized=False``: :data:`~TextInputSequence` 49 | - When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence` 50 | """ 51 | 52 | EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput] 53 | """Represents all the possible types of input for encoding. 
Can be: 54 | 55 | - When ``is_pretokenized=False``: :data:`~TextEncodeInput` 56 | - When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput` 57 | """ 58 | 59 | 60 | class OffsetReferential(Enum): 61 | ORIGINAL = "original" 62 | NORMALIZED = "normalized" 63 | 64 | 65 | class OffsetType(Enum): 66 | BYTE = "byte" 67 | CHAR = "char" 68 | 69 | 70 | class SplitDelimiterBehavior(Enum): 71 | REMOVED = "removed" 72 | ISOLATED = "isolated" 73 | MERGED_WITH_PREVIOUS = "merged_with_previous" 74 | MERGED_WITH_NEXT = "merged_with_next" 75 | CONTIGUOUS = "contiguous" 76 | 77 | 78 | from .tokenizers import ( 79 | AddedToken, 80 | Encoding, 81 | NormalizedString, 82 | PreTokenizedString, 83 | Regex, 84 | Token, 85 | Tokenizer, 86 | decoders, 87 | models, 88 | normalizers, 89 | pre_tokenizers, 90 | processors, 91 | trainers, 92 | __version__, 93 | ) 94 | from .implementations import ( 95 | BertWordPieceTokenizer, 96 | ByteLevelBPETokenizer, 97 | CharBPETokenizer, 98 | SentencePieceBPETokenizer, 99 | SentencePieceUnigramTokenizer, 100 | ) 101 | -------------------------------------------------------------------------------- /bindings/python/py_src/tokenizers/decoders/__init__.py: -------------------------------------------------------------------------------- 1 | from .. import decoders 2 | 3 | 4 | Decoder = decoders.Decoder 5 | ByteLevel = decoders.ByteLevel 6 | Replace = decoders.Replace 7 | WordPiece = decoders.WordPiece 8 | ByteFallback = decoders.ByteFallback 9 | Fuse = decoders.Fuse 10 | Strip = decoders.Strip 11 | Metaspace = decoders.Metaspace 12 | BPEDecoder = decoders.BPEDecoder 13 | CTC = decoders.CTC 14 | Sequence = decoders.Sequence 15 | DecodeStream = decoders.DecodeStream 16 | -------------------------------------------------------------------------------- /bindings/python/py_src/tokenizers/implementations/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_tokenizer import BaseTokenizer 2 | from .bert_wordpiece import BertWordPieceTokenizer 3 | from .byte_level_bpe import ByteLevelBPETokenizer 4 | from .char_level_bpe import CharBPETokenizer 5 | from .sentencepiece_bpe import SentencePieceBPETokenizer 6 | from .sentencepiece_unigram import SentencePieceUnigramTokenizer 7 | -------------------------------------------------------------------------------- /bindings/python/py_src/tokenizers/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Generated content DO NOT EDIT 2 | from .. import models 3 | 4 | Model = models.Model 5 | BPE = models.BPE 6 | Unigram = models.Unigram 7 | WordLevel = models.WordLevel 8 | WordPiece = models.WordPiece 9 | -------------------------------------------------------------------------------- /bindings/python/py_src/tokenizers/normalizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .. 
import normalizers 2 | 3 | 4 | Normalizer = normalizers.Normalizer 5 | BertNormalizer = normalizers.BertNormalizer 6 | NFD = normalizers.NFD 7 | NFKD = normalizers.NFKD 8 | NFC = normalizers.NFC 9 | NFKC = normalizers.NFKC 10 | Sequence = normalizers.Sequence 11 | Lowercase = normalizers.Lowercase 12 | Prepend = normalizers.Prepend 13 | Strip = normalizers.Strip 14 | StripAccents = normalizers.StripAccents 15 | Nmt = normalizers.Nmt 16 | Precompiled = normalizers.Precompiled 17 | Replace = normalizers.Replace 18 | ByteLevel = normalizers.ByteLevel 19 | 20 | NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD} 21 | 22 | 23 | def unicode_normalizer_from_str(normalizer: str) -> Normalizer: 24 | if normalizer not in NORMALIZERS: 25 | raise ValueError( 26 | "{} is not a known unicode normalizer. Available are {}".format(normalizer, NORMALIZERS.keys()) 27 | ) 28 | 29 | return NORMALIZERS[normalizer]() 30 | -------------------------------------------------------------------------------- /bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | # Generated content DO NOT EDIT 2 | from .. import pre_tokenizers 3 | 4 | PreTokenizer = pre_tokenizers.PreTokenizer 5 | BertPreTokenizer = pre_tokenizers.BertPreTokenizer 6 | ByteLevel = pre_tokenizers.ByteLevel 7 | CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit 8 | Digits = pre_tokenizers.Digits 9 | FixedLength = pre_tokenizers.FixedLength 10 | Metaspace = pre_tokenizers.Metaspace 11 | Punctuation = pre_tokenizers.Punctuation 12 | Sequence = pre_tokenizers.Sequence 13 | Split = pre_tokenizers.Split 14 | UnicodeScripts = pre_tokenizers.UnicodeScripts 15 | Whitespace = pre_tokenizers.Whitespace 16 | WhitespaceSplit = pre_tokenizers.WhitespaceSplit 17 | -------------------------------------------------------------------------------- /bindings/python/py_src/tokenizers/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # Generated content DO NOT EDIT 2 | from .. import processors 3 | 4 | PostProcessor = processors.PostProcessor 5 | BertProcessing = processors.BertProcessing 6 | ByteLevel = processors.ByteLevel 7 | RobertaProcessing = processors.RobertaProcessing 8 | Sequence = processors.Sequence 9 | TemplateProcessing = processors.TemplateProcessing 10 | -------------------------------------------------------------------------------- /bindings/python/py_src/tokenizers/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .visualizer import Annotation, EncodingVisualizer 2 | -------------------------------------------------------------------------------- /bindings/python/py_src/tokenizers/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | # Generated content DO NOT EDIT 2 | from .. 
import trainers 3 | 4 | Trainer = trainers.Trainer 5 | BpeTrainer = trainers.BpeTrainer 6 | UnigramTrainer = trainers.UnigramTrainer 7 | WordLevelTrainer = trainers.WordLevelTrainer 8 | WordPieceTrainer = trainers.WordPieceTrainer 9 | -------------------------------------------------------------------------------- /bindings/python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "tokenizers" 3 | requires-python = ">=3.9" 4 | authors = [ 5 | { name = "Nicolas Patry", email = "patry.nicolas@protonmail.com" }, 6 | { name = "Anthony Moi", email = "anthony@huggingface.co" }, 7 | ] 8 | classifiers = [ 9 | "Development Status :: 5 - Production/Stable", 10 | "Intended Audience :: Developers", 11 | "Intended Audience :: Education", 12 | "Intended Audience :: Science/Research", 13 | "License :: OSI Approved :: Apache Software License", 14 | "Operating System :: OS Independent", 15 | "Programming Language :: Python :: 3", 16 | "Programming Language :: Python :: 3.9", 17 | "Programming Language :: Python :: 3.10", 18 | "Programming Language :: Python :: 3.11", 19 | "Programming Language :: Python :: 3.12", 20 | "Programming Language :: Python :: 3.13", 21 | "Programming Language :: Python :: 3 :: Only", 22 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 23 | ] 24 | keywords = ["NLP", "tokenizer", "BPE", "transformer", "deep learning"] 25 | dynamic = ["description", "license", "readme", "version"] 26 | dependencies = ["huggingface_hub>=0.16.4,<1.0"] 27 | 28 | [project.urls] 29 | Homepage = "https://github.com/huggingface/tokenizers" 30 | Source = "https://github.com/huggingface/tokenizers" 31 | 32 | 33 | [project.optional-dependencies] 34 | testing = ["pytest", "requests", "numpy", "datasets", "black==22.3", "ruff"] 35 | docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"] 36 | dev = ["tokenizers[testing]"] 37 | 38 | 39 | [build-system] 40 | requires = ["maturin>=1.0,<2.0"] 41 | build-backend = "maturin" 42 | 43 | [tool.maturin] 44 | python-source = "py_src" 45 | module-name = "tokenizers.tokenizers" 46 | bindings = "pyo3" 47 | features = ["pyo3/extension-module"] 48 | 49 | [tool.black] 50 | line-length = 119 51 | target-version = ["py35"] 52 | 53 | [tool.ruff] 54 | line-length = 119 55 | target-version = "py311" 56 | lint.ignore = [ 57 | # a == None in tests vs is None. 58 | "E711", 59 | # a == False in tests vs is False. 60 | "E712", 61 | # try.. import except.. pattern without using the lib. 
62 | "F401", 63 | # Raw type equality is required in asserts 64 | "E721", 65 | # Import order 66 | "E402", 67 | # Fixtures unused import 68 | "F811", 69 | ] 70 | -------------------------------------------------------------------------------- /bindings/python/rust-toolchain: -------------------------------------------------------------------------------- 1 | stable 2 | -------------------------------------------------------------------------------- /bindings/python/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = transformers 7 | known_third_party = 8 | absl 9 | conllu 10 | datasets 11 | elasticsearch 12 | fairseq 13 | faiss-cpu 14 | fastprogress 15 | fire 16 | fugashi 17 | git 18 | h5py 19 | matplotlib 20 | nltk 21 | numpy 22 | packaging 23 | pandas 24 | PIL 25 | psutil 26 | pytest 27 | pytorch_lightning 28 | rouge_score 29 | sacrebleu 30 | seqeval 31 | sklearn 32 | streamlit 33 | tensorboardX 34 | tensorflow 35 | tensorflow_datasets 36 | timeout_decorator 37 | torch 38 | torchaudio 39 | torchtext 40 | torchvision 41 | torch_xla 42 | tqdm 43 | 44 | line_length = 119 45 | lines_after_imports = 2 46 | multi_line_output = 3 47 | use_parentheses = True 48 | 49 | [flake8] 50 | ignore = E203, E501, E741, W503, W605 51 | max-line-length = 119 52 | 53 | [tool:pytest] 54 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS 55 | -------------------------------------------------------------------------------- /bindings/python/src/error.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions; 2 | use pyo3::prelude::*; 3 | use pyo3::type_object::PyTypeInfo; 4 | use std::ffi::CString; 5 | use std::fmt::{Display, Formatter, Result as FmtResult}; 6 | use tokenizers::tokenizer::Result; 7 | 8 | #[derive(Debug)] 9 | pub struct PyError(pub String); 10 | impl PyError { 11 | #[allow(dead_code)] 12 | pub fn from(s: &str) -> Self { 13 | PyError(String::from(s)) 14 | } 15 | pub fn into_pyerr<T: PyTypeInfo>(self) -> PyErr { 16 | PyErr::new::<T, _>(format!("{self}")) 17 | } 18 | } 19 | impl Display for PyError { 20 | fn fmt(&self, fmt: &mut Formatter) -> FmtResult { 21 | write!(fmt, "{}", self.0) 22 | } 23 | } 24 | impl std::error::Error for PyError {} 25 | 26 | pub struct ToPyResult<T>(pub Result<T>); 27 | impl<T> From<ToPyResult<T>> for PyResult<T> { 28 | fn from(v: ToPyResult<T>) -> Self { 29 | v.0.map_err(|e| exceptions::PyException::new_err(format!("{e}"))) 30 | } 31 | } 32 | impl<T> ToPyResult<T> { 33 | pub fn into_py(self) -> PyResult<T> { 34 | self.into() 35 | } 36 | } 37 | 38 | pub(crate) fn deprecation_warning(py: Python<'_>, version: &str, message: &str) -> PyResult<()> { 39 | let deprecation_warning = py.import("builtins")?.getattr("DeprecationWarning")?; 40 | let full_message = format!("Deprecated in {version}: {message}"); 41 | pyo3::PyErr::warn(py, &deprecation_warning, &CString::new(full_message)?, 0) 42 | } 43 | -------------------------------------------------------------------------------- /bindings/python/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | #![allow(clippy::upper_case_acronyms)] 3 | // Many false positives with pyo3 it seems &str, and &PyAny get flagged 4 | #![allow(clippy::borrow_deref_ref)] 5 | 6 | extern crate tokenizers as tk; 7 | 8 | mod 
decoders; 9 | mod encoding; 10 | mod error; 11 | mod models; 12 | mod normalizers; 13 | mod pre_tokenizers; 14 | mod processors; 15 | mod token; 16 | mod tokenizer; 17 | mod trainers; 18 | mod utils; 19 | 20 | use pyo3::prelude::*; 21 | use pyo3::wrap_pymodule; 22 | 23 | pub const VERSION: &str = env!("CARGO_PKG_VERSION"); 24 | 25 | // For users using multiprocessing in python, it is quite easy to fork the process running 26 | // tokenizers, ending up with a deadlock because we internally make use of multithreading. So 27 | // we register a callback to be called in the event of a fork so that we can warn the user. 28 | #[cfg(target_family = "unix")] 29 | static mut REGISTERED_FORK_CALLBACK: bool = false; 30 | #[cfg(target_family = "unix")] 31 | extern "C" fn child_after_fork() { 32 | use tk::parallelism::*; 33 | if has_parallelism_been_used() && !is_parallelism_configured() { 34 | eprintln!( 35 | "huggingface/tokenizers: The current process just got forked, after parallelism has \ 36 | already been used. Disabling parallelism to avoid deadlocks..." 37 | ); 38 | eprintln!("To disable this warning, you can either:"); 39 | eprintln!( 40 | "\t- Avoid using `tokenizers` before the fork if possible\n\ 41 | \t- Explicitly set the environment variable {ENV_VARIABLE}=(true | false)" 42 | ); 43 | set_parallelism(false); 44 | } 45 | } 46 | 47 | /// Tokenizers Module 48 | #[pymodule] 49 | pub fn tokenizers(m: &Bound<'_, PyModule>) -> PyResult<()> { 50 | let _ = env_logger::try_init_from_env("TOKENIZERS_LOG"); 51 | 52 | // Register the fork callback 53 | #[cfg(target_family = "unix")] 54 | unsafe { 55 | if !REGISTERED_FORK_CALLBACK { 56 | libc::pthread_atfork(None, None, Some(child_after_fork)); 57 | REGISTERED_FORK_CALLBACK = true; 58 | } 59 | } 60 | 61 | m.add_class::<tokenizer::PyTokenizer>()?; 62 | m.add_class::<tokenizer::PyAddedToken>()?; 63 | m.add_class::<token::PyToken>()?; 64 | m.add_class::<encoding::PyEncoding>()?; 65 | m.add_class::<utils::PyRegex>()?; 66 | m.add_class::<utils::PyNormalizedString>()?; 67 | m.add_class::<utils::PyPreTokenizedString>()?; 68 | m.add_wrapped(wrap_pymodule!(models::models))?; 69 | m.add_wrapped(wrap_pymodule!(pre_tokenizers::pre_tokenizers))?; 70 | m.add_wrapped(wrap_pymodule!(decoders::decoders))?; 71 | m.add_wrapped(wrap_pymodule!(processors::processors))?; 72 | m.add_wrapped(wrap_pymodule!(normalizers::normalizers))?; 73 | m.add_wrapped(wrap_pymodule!(trainers::trainers))?; 74 | m.add("__version__", env!("CARGO_PKG_VERSION"))?; 75 | Ok(()) 76 | } 77 | -------------------------------------------------------------------------------- /bindings/python/src/token.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use tk::Token; 3 | 4 | #[pyclass(module = "tokenizers", name = "Token")] 5 | #[derive(Clone)] 6 | pub struct PyToken { 7 | token: Token, 8 | } 9 | impl From<Token> for PyToken { 10 | fn from(token: Token) -> Self { 11 | Self { token } 12 | } 13 | } 14 | impl From<PyToken> for Token { 15 | fn from(token: PyToken) -> Self { 16 | token.token 17 | } 18 | } 19 | 20 | #[pymethods] 21 | impl PyToken { 22 | #[new] 23 | #[pyo3(text_signature = None)] 24 | fn new(id: u32, value: String, offsets: (usize, usize)) -> PyToken { 25 | Token::new(id, value, offsets).into() 26 | } 27 | 28 | #[getter] 29 | fn get_id(&self) -> u32 { 30 | self.token.id 31 | } 32 | 33 | #[getter] 34 | fn get_value(&self) -> &str { 35 | &self.token.value 36 | } 37 | 38 | #[getter] 39 | fn get_offsets(&self) -> (usize, usize) { 40 | 
self.token.offsets 41 | } 42 | 43 | fn as_tuple(&self) -> (u32, &str, (usize, usize)) { 44 | (self.token.id, &self.token.value, self.token.offsets) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /bindings/python/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | use std::marker::PhantomData; 2 | use std::sync::{Arc, Mutex}; 3 | 4 | mod iterators; 5 | mod normalization; 6 | mod pretokenization; 7 | mod regex; 8 | pub mod serde_pyo3; 9 | 10 | pub use iterators::*; 11 | pub use normalization::*; 12 | pub use pretokenization::*; 13 | pub use regex::*; 14 | 15 | // RefMut utils 16 | 17 | pub trait DestroyPtr { 18 | fn destroy(&mut self); 19 | } 20 | 21 | pub struct RefMutGuard<'r, T: DestroyPtr> { 22 | content: T, 23 | r: PhantomData<&'r mut T>, 24 | } 25 | impl<T: DestroyPtr> RefMutGuard<'_, T> { 26 | pub fn new(content: T) -> Self { 27 | Self { 28 | content, 29 | r: PhantomData, 30 | } 31 | } 32 | 33 | pub fn get(&self) -> &T { 34 | &self.content 35 | } 36 | } 37 | 38 | impl<T: DestroyPtr> Drop for RefMutGuard<'_, T> { 39 | fn drop(&mut self) { 40 | self.content.destroy() 41 | } 42 | } 43 | 44 | #[derive(Clone)] 45 | pub struct RefMutContainer<T> { 46 | inner: Arc<Mutex<Option<*mut T>>>, 47 | } 48 | impl<T> RefMutContainer<T> { 49 | pub fn new(content: &mut T) -> Self { 50 | Self { 51 | inner: Arc::new(Mutex::new(Some(content))), 52 | } 53 | } 54 | 55 | pub fn map<F: FnOnce(&T) -> U, U>(&self, f: F) -> Option<U> { 56 | let lock = self.inner.lock().unwrap(); 57 | let ptr = lock.as_ref()?; 58 | Some(f(unsafe { ptr.as_ref().unwrap() })) 59 | } 60 | 61 | pub fn map_mut<F: FnOnce(&mut T) -> U, U>(&mut self, f: F) -> Option<U> { 62 | let lock = self.inner.lock().unwrap(); 63 | let ptr = lock.as_ref()?; 64 | Some(f(unsafe { ptr.as_mut().unwrap() })) 65 | } 66 | } 67 | 68 | impl<T> DestroyPtr for RefMutContainer<T> { 69 | fn destroy(&mut self) { 70 | self.inner.lock().unwrap().take(); 71 | } 72 | } 73 | 74 | unsafe impl<T: Send> Send for RefMutContainer<T> {} 75 | unsafe impl<T: Sync> Sync for RefMutContainer<T> {} 76 | -------------------------------------------------------------------------------- /bindings/python/src/utils/regex.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions; 2 | use pyo3::prelude::*; 3 | use tk::utils::SysRegex; 4 | 5 | /// Instantiate a new Regex with the given pattern 6 | #[pyclass(module = "tokenizers", name = "Regex")] 7 | pub struct PyRegex { 8 | pub inner: SysRegex, 9 | pub pattern: String, 10 | } 11 | 12 | #[pymethods] 13 | impl PyRegex { 14 | #[new] 15 | #[pyo3(text_signature = "(self, pattern)")] 16 | fn new(s: &str) -> PyResult<Self> { 17 | Ok(Self { 18 | inner: SysRegex::new(s) 19 | .map_err(|e| exceptions::PyException::new_err(e.to_string().to_owned()))?, 20 | pattern: s.to_owned(), 21 | }) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /bindings/python/test.txt: -------------------------------------------------------------------------------- 1 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 2 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 3 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 4 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 5 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 6 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 7 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 8 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 9 | <DOCUMENT> 
\test{bla} thisisatest </DOCUMENT> 10 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 11 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 12 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 13 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 14 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 15 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 16 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 17 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 18 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 19 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 20 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 21 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 22 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 23 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 24 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 25 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 26 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 27 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 28 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 29 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 30 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 31 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 32 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 33 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 34 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 35 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 36 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT> 37 | -------------------------------------------------------------------------------- /bindings/python/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/bindings/python/tests/__init__.py -------------------------------------------------------------------------------- /bindings/python/tests/bindings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/bindings/python/tests/bindings/__init__.py -------------------------------------------------------------------------------- /bindings/python/tests/documentation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/bindings/python/tests/documentation/__init__.py -------------------------------------------------------------------------------- /bindings/python/tests/implementations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/bindings/python/tests/implementations/__init__.py -------------------------------------------------------------------------------- /bindings/python/tests/implementations/test_base_tokenizer.py: -------------------------------------------------------------------------------- 1 | from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors 2 | from tokenizers.implementations import BaseTokenizer 3 | 4 | 5 | class TestBaseTokenizer: 6 | def test_get_set_components(self): 7 | toki = Tokenizer(models.BPE()) 8 | toki.normalizer = normalizers.NFC() 9 | toki.pre_tokenizer = pre_tokenizers.ByteLevel() 10 | toki.post_processor = processors.BertProcessing(("A", 0), ("B", 1)) 11 | toki.decoder = decoders.ByteLevel() 12 | 13 | tokenizer = BaseTokenizer(toki) 14 
| 15 | assert isinstance(tokenizer.model, models.BPE) 16 | assert isinstance(tokenizer.normalizer, normalizers.NFC) 17 | assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.ByteLevel) 18 | assert isinstance(tokenizer.post_processor, processors.BertProcessing) 19 | assert isinstance(tokenizer.decoder, decoders.ByteLevel) 20 | 21 | tokenizer.model = models.Unigram() 22 | assert isinstance(tokenizer.model, models.Unigram) 23 | tokenizer.normalizer = normalizers.NFD() 24 | assert isinstance(tokenizer.normalizer, normalizers.NFD) 25 | tokenizer.pre_tokenizer = pre_tokenizers.Whitespace() 26 | assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.Whitespace) 27 | tokenizer.post_processor = processors.ByteLevel() 28 | assert isinstance(tokenizer.post_processor, processors.ByteLevel) 29 | tokenizer.decoder = decoders.WordPiece() 30 | assert isinstance(tokenizer.decoder, decoders.WordPiece) 31 | -------------------------------------------------------------------------------- /bindings/python/tests/implementations/test_bert_wordpiece.py: -------------------------------------------------------------------------------- 1 | from tokenizers import BertWordPieceTokenizer 2 | 3 | from ..utils import bert_files, data_dir, multiprocessing_with_parallelism 4 | 5 | 6 | class TestBertWordPieceTokenizer: 7 | def test_basic_encode(self, bert_files): 8 | tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"]) 9 | 10 | # Encode with special tokens by default 11 | output = tokenizer.encode("My name is John", "pair") 12 | assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102] 13 | assert output.tokens == [ 14 | "[CLS]", 15 | "my", 16 | "name", 17 | "is", 18 | "john", 19 | "[SEP]", 20 | "pair", 21 | "[SEP]", 22 | ] 23 | assert output.offsets == [ 24 | (0, 0), 25 | (0, 2), 26 | (3, 7), 27 | (8, 10), 28 | (11, 15), 29 | (0, 0), 30 | (0, 4), 31 | (0, 0), 32 | ] 33 | assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1] 34 | 35 | # Can encode without the special tokens 36 | output = tokenizer.encode("My name is John", "pair", add_special_tokens=False) 37 | assert output.ids == [2026, 2171, 2003, 2198, 3940] 38 | assert output.tokens == ["my", "name", "is", "john", "pair"] 39 | assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)] 40 | assert output.type_ids == [0, 0, 0, 0, 1] 41 | 42 | def test_multiprocessing_with_parallelism(self, bert_files): 43 | tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"]) 44 | multiprocessing_with_parallelism(tokenizer, False) 45 | multiprocessing_with_parallelism(tokenizer, True) 46 | 47 | def test_train_from_iterator(self): 48 | text = ["A first sentence", "Another sentence", "And a last one"] 49 | tokenizer = BertWordPieceTokenizer() 50 | tokenizer.train_from_iterator(text, show_progress=False) 51 | 52 | output = tokenizer.encode("A sentence") 53 | assert output.tokens == ["a", "sentence"] 54 | -------------------------------------------------------------------------------- /bindings/python/tests/implementations/test_byte_level_bpe.py: -------------------------------------------------------------------------------- 1 | from tokenizers import ByteLevelBPETokenizer 2 | 3 | from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files 4 | 5 | 6 | class TestByteLevelBPE: 7 | def test_basic_encode(self, roberta_files): 8 | tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"]) 9 | output = tokenizer.encode("The quick brown fox jumps over the lazy dog") 10 | 11 | assert output.ids == 
[133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335] 12 | assert output.tokens == [ 13 | "The", 14 | "Ġquick", 15 | "Ġbrown", 16 | "Ġfox", 17 | "Ġjumps", 18 | "Ġover", 19 | "Ġthe", 20 | "Ġlazy", 21 | "Ġdog", 22 | ] 23 | assert output.offsets == [ 24 | (0, 3), 25 | (3, 9), 26 | (9, 15), 27 | (15, 19), 28 | (19, 25), 29 | (25, 30), 30 | (30, 34), 31 | (34, 39), 32 | (39, 43), 33 | ] 34 | 35 | def test_add_prefix_space(self, roberta_files): 36 | tokenizer = ByteLevelBPETokenizer.from_file( 37 | roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True 38 | ) 39 | output = tokenizer.encode("The quick brown fox jumps over the lazy dog") 40 | 41 | assert output.ids == [20, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335] 42 | assert output.tokens == [ 43 | "ĠThe", 44 | "Ġquick", 45 | "Ġbrown", 46 | "Ġfox", 47 | "Ġjumps", 48 | "Ġover", 49 | "Ġthe", 50 | "Ġlazy", 51 | "Ġdog", 52 | ] 53 | assert output.offsets == [ 54 | (0, 3), 55 | (3, 9), 56 | (9, 15), 57 | (15, 19), 58 | (19, 25), 59 | (25, 30), 60 | (30, 34), 61 | (34, 39), 62 | (39, 43), 63 | ] 64 | 65 | def test_lowerspace(self, roberta_files): 66 | tokenizer = ByteLevelBPETokenizer.from_file( 67 | roberta_files["vocab"], 68 | roberta_files["merges"], 69 | add_prefix_space=True, 70 | lowercase=True, 71 | ) 72 | output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog") 73 | 74 | assert output.ids == [5, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335] 75 | assert output.tokens == [ 76 | "Ġthe", 77 | "Ġquick", 78 | "Ġbrown", 79 | "Ġfox", 80 | "Ġjumps", 81 | "Ġover", 82 | "Ġthe", 83 | "Ġlazy", 84 | "Ġdog", 85 | ] 86 | 87 | def test_multiprocessing_with_parallelism(self, roberta_files): 88 | tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"]) 89 | multiprocessing_with_parallelism(tokenizer, False) 90 | multiprocessing_with_parallelism(tokenizer, True) 91 | 92 | def test_train_from_iterator(self): 93 | text = ["A first sentence", "Another sentence", "And a last one"] 94 | tokenizer = ByteLevelBPETokenizer() 95 | tokenizer.train_from_iterator(text, show_progress=False) 96 | 97 | output = tokenizer.encode("A sentence") 98 | assert output.tokens == ["A", "Ġsentence"] 99 | -------------------------------------------------------------------------------- /bindings/python/tests/implementations/test_char_bpe.py: -------------------------------------------------------------------------------- 1 | from tokenizers import CharBPETokenizer 2 | 3 | from ..utils import data_dir, multiprocessing_with_parallelism, openai_files 4 | 5 | 6 | class TestCharBPETokenizer: 7 | def test_basic_encode(self, openai_files): 8 | tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"]) 9 | 10 | output = tokenizer.encode("My name is John", "pair") 11 | assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688] 12 | assert output.tokens == [ 13 | "<unk>", 14 | "y</w>", 15 | "name</w>", 16 | "is</w>", 17 | "<unk>", 18 | "o", 19 | "hn</w>", 20 | "pair</w>", 21 | ] 22 | assert output.offsets == [ 23 | (0, 1), 24 | (1, 2), 25 | (3, 7), 26 | (8, 10), 27 | (11, 12), 28 | (12, 13), 29 | (13, 15), 30 | (0, 4), 31 | ] 32 | assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1] 33 | 34 | def test_lowercase(self, openai_files): 35 | tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"], lowercase=True) 36 | output = tokenizer.encode("My name is John", "pair", add_special_tokens=False) 37 | assert output.ids == [547, 1362, 544, 2476, 2688] 38 | assert output.tokens == 
["my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"] 39 | assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)] 40 | assert output.type_ids == [0, 0, 0, 0, 1] 41 | 42 | def test_decoding(self, openai_files): 43 | tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"], lowercase=True) 44 | decoded = tokenizer.decode(tokenizer.encode("my name is john").ids) 45 | assert decoded == "my name is john" 46 | 47 | def test_multiprocessing_with_parallelism(self, openai_files): 48 | tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"]) 49 | multiprocessing_with_parallelism(tokenizer, False) 50 | multiprocessing_with_parallelism(tokenizer, True) 51 | 52 | def test_train_from_iterator(self): 53 | text = ["A first sentence", "Another sentence", "And a last one"] 54 | tokenizer = CharBPETokenizer() 55 | tokenizer.train_from_iterator(text, show_progress=False) 56 | 57 | output = tokenizer.encode("A sentence") 58 | assert output.tokens == ["A</w>", "sentence</w>"] 59 | -------------------------------------------------------------------------------- /bindings/python/tests/implementations/test_sentencepiece.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer 4 | 5 | 6 | class TestSentencePieceBPE: 7 | def test_train_from_iterator(self): 8 | text = ["A first sentence", "Another sentence", "And a last one"] 9 | tokenizer = SentencePieceBPETokenizer() 10 | tokenizer.train_from_iterator(text, show_progress=False) 11 | 12 | output = tokenizer.encode("A sentence") 13 | assert output.tokens == ["▁A", "▁sentence"] 14 | 15 | 16 | class TestSentencePieceUnigram: 17 | def test_train(self, tmpdir): 18 | p = tmpdir.mkdir("tmpdir").join("file.txt") 19 | p.write("A first sentence\nAnother sentence\nAnd a last one") 20 | 21 | tokenizer = SentencePieceUnigramTokenizer() 22 | tokenizer.train(files=str(p), show_progress=False) 23 | 24 | output = tokenizer.encode("A sentence") 25 | assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"] 26 | 27 | with pytest.raises(Exception) as excinfo: 28 | _ = tokenizer.encode("A sentence 🤗") 29 | assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing" 30 | 31 | def test_train_with_unk_token(self, tmpdir): 32 | p = tmpdir.mkdir("tmpdir").join("file.txt") 33 | p.write("A first sentence\nAnother sentence\nAnd a last one") 34 | 35 | tokenizer = SentencePieceUnigramTokenizer() 36 | tokenizer.train(files=str(p), show_progress=False, special_tokens=["<unk>"], unk_token="<unk>") 37 | output = tokenizer.encode("A sentence 🤗") 38 | assert output.ids[-1] == 0 39 | assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"] 40 | 41 | def test_train_from_iterator(self): 42 | text = ["A first sentence", "Another sentence", "And a last one"] 43 | tokenizer = SentencePieceUnigramTokenizer() 44 | tokenizer.train_from_iterator(text, show_progress=False) 45 | 46 | output = tokenizer.encode("A sentence") 47 | assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"] 48 | 49 | with pytest.raises(Exception) as excinfo: 50 | _ = tokenizer.encode("A sentence 🤗") 51 | assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing" 52 | 53 | def test_train_from_iterator_with_unk_token(self): 54 | text = ["A first sentence", "Another sentence", "And a last one"] 55 | tokenizer = SentencePieceUnigramTokenizer() 56 | 
tokenizer.train_from_iterator( 57 | text, vocab_size=100, show_progress=False, special_tokens=["<unk>"], unk_token="<unk>" 58 | ) 59 | output = tokenizer.encode("A sentence 🤗") 60 | assert output.ids[-1] == 0 61 | assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"] 62 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for those with `?=` 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | BUILDDIR ?= build 9 | SOURCEDIR = source 10 | 11 | # Put it first so that "make" without argument is like "make html_all". 12 | html_all: 13 | @echo "Generating doc for Rust" 14 | @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)/rust" $(SPHINXOPTS) $(O) -t rust 15 | @echo "Generating doc for Python" 16 | @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)/python" $(SPHINXOPTS) $(O) -t python 17 | @echo "Generating doc for Node.js" 18 | @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)/node" $(SPHINXOPTS) $(O) -t node 19 | 20 | .PHONY: html_all Makefile 21 | 22 | # Catch-all target: route all unknown targets to Sphinx using the new 23 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 24 | %: Makefile 25 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 26 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ## Requirements 2 | 3 | In order to generate the documentation, it is necessary to have a Python environment with the 4 | following: 5 | ```python 6 | pip install sphinx sphinx_rtd_theme setuptools_rust 7 | ``` 8 | 9 | It is also necessary to have the `tokenizers` library in this same environment, for Sphinx to 10 | generate all the API Reference and links properly. If you want to visualize the documentation with 11 | some modifications made to the Python bindings, make sure you build it from source. 
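For instance, a minimal sketch of such a from-source build of the Python bindings (assuming a Rust toolchain and an activated virtual environment; `maturin` is the build backend declared in `bindings/python/pyproject.toml`) could look like:

```bash
# Compile the Rust extension and install the resulting `tokenizers` package
# into the currently activated virtual environment.
cd bindings/python
pip install maturin
maturin develop --release
```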
12 | 13 | ## Building the documentation 14 | 15 | Once everything is setup, you can build the documentation automatically for all the languages 16 | using the following command in the `/docs` folder: 17 | 18 | ```bash 19 | make html_all 20 | ``` 21 | 22 | If you want to build only for a specific language, you can use: 23 | 24 | ```bash 25 | make html O="-t python" 26 | ``` 27 | 28 | (Replacing `python` by the target language among `rust`, `node`, and `python`) 29 | 30 | 31 | **NOTE** 32 | 33 | If you are making any structural change to the documentation, it is recommended to clean the build 34 | directory before rebuilding: 35 | 36 | ```bash 37 | make clean && make html_all 38 | ``` 39 | -------------------------------------------------------------------------------- /docs/source-doc-builder/_toctree.yml: -------------------------------------------------------------------------------- 1 | - sections: 2 | - local: index 3 | title: 🤗 Tokenizers 4 | - local: quicktour 5 | title: Quicktour 6 | - local: installation 7 | title: Installation 8 | - local: pipeline 9 | title: The tokenization pipeline 10 | - local: components 11 | title: Components 12 | - local: training_from_memory 13 | title: Training from memory 14 | title: Getting started 15 | - sections: 16 | - local: api/input-sequences 17 | title: Input Sequences 18 | - local: api/encode-inputs 19 | title: Encode Inputs 20 | - local: api/tokenizer 21 | title: Tokenizer 22 | - local: api/encoding 23 | title: Encoding 24 | - local: api/added-tokens 25 | title: Added Tokens 26 | - local: api/models 27 | title: Models 28 | - local: api/normalizers 29 | title: Normalizers 30 | - local: api/pre-tokenizers 31 | title: Pre-tokenizers 32 | - local: api/post-processors 33 | title: Post-processors 34 | - local: api/trainers 35 | title: Trainers 36 | - local: api/decoders 37 | title: Decoders 38 | - local: api/visualizer 39 | title: Visualizer 40 | title: API 41 | -------------------------------------------------------------------------------- /docs/source-doc-builder/api/added-tokens.mdx: -------------------------------------------------------------------------------- 1 | # Added Tokens 2 | 3 | <tokenizerslangcontent> 4 | <python> 5 | ## AddedToken 6 | 7 | [[autodoc]] tokenizers.AddedToken 8 | - content 9 | - lstrip 10 | - normalized 11 | - rstrip 12 | - single_word 13 | </python> 14 | <rust> 15 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website. 16 | </rust> 17 | <node> 18 | The node API has not been documented yet. 19 | </node> 20 | </tokenizerslangcontent> -------------------------------------------------------------------------------- /docs/source-doc-builder/api/decoders.mdx: -------------------------------------------------------------------------------- 1 | # Decoders 2 | 3 | <tokenizerslangcontent> 4 | <python> 5 | ## BPEDecoder 6 | 7 | [[autodoc]] tokenizers.decoders.BPEDecoder 8 | 9 | ## ByteLevel 10 | 11 | [[autodoc]] tokenizers.decoders.ByteLevel 12 | 13 | ## CTC 14 | 15 | [[autodoc]] tokenizers.decoders.CTC 16 | 17 | ## Metaspace 18 | 19 | [[autodoc]] tokenizers.decoders.Metaspace 20 | 21 | ## WordPiece 22 | 23 | [[autodoc]] tokenizers.decoders.WordPiece 24 | </python> 25 | <rust> 26 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website. 27 | </rust> 28 | <node> 29 | The node API has not been documented yet. 
30 | </node> 31 | </tokenizerslangcontent> -------------------------------------------------------------------------------- /docs/source-doc-builder/api/encode-inputs.mdx: -------------------------------------------------------------------------------- 1 | # Encode Inputs 2 | 3 | <tokenizerslangcontent> 4 | <python> 5 | These types represent all the different kinds of input that a [`~tokenizers.Tokenizer`] accepts 6 | when using [`~tokenizers.Tokenizer.encode_batch`]. 7 | 8 | ## TextEncodeInput[[[[tokenizers.TextEncodeInput]]]] 9 | 10 | <code>tokenizers.TextEncodeInput</code> 11 | 12 | Represents a textual input for encoding. Can be either: 13 | - A single sequence: [TextInputSequence](/docs/tokenizers/api/input-sequences#tokenizers.TextInputSequence) 14 | - A pair of sequences: 15 | - A Tuple of [TextInputSequence](/docs/tokenizers/api/input-sequences#tokenizers.TextInputSequence) 16 | - Or a List of [TextInputSequence](/docs/tokenizers/api/input-sequences#tokenizers.TextInputSequence) of size 2 17 | 18 | alias of `Union[str, Tuple[str, str], List[str]]`. 19 | 20 | ## PreTokenizedEncodeInput[[[[tokenizers.PreTokenizedEncodeInput]]]] 21 | 22 | <code>tokenizers.PreTokenizedEncodeInput</code> 23 | 24 | Represents a pre-tokenized input for encoding. Can be either: 25 | - A single sequence: [PreTokenizedInputSequence](/docs/tokenizers/api/input-sequences#tokenizers.PreTokenizedInputSequence) 26 | - A pair of sequences: 27 | - A Tuple of [PreTokenizedInputSequence](/docs/tokenizers/api/input-sequences#tokenizers.PreTokenizedInputSequence) 28 | - Or a List of [PreTokenizedInputSequence](/docs/tokenizers/api/input-sequences#tokenizers.PreTokenizedInputSequence) of size 2 29 | 30 | alias of `Union[List[str], Tuple[str], Tuple[Union[List[str], Tuple[str]], Union[List[str], Tuple[str]]], List[Union[List[str], Tuple[str]]]]`. 31 | 32 | ## EncodeInput[[[[tokenizers.EncodeInput]]]] 33 | 34 | <code>tokenizers.EncodeInput</code> 35 | 36 | Represents all the possible types of input for encoding. Can be: 37 | - When `is_pretokenized=False`: [TextEncodeInput](#tokenizers.TextEncodeInput) 38 | - When `is_pretokenized=True`: [PreTokenizedEncodeInput](#tokenizers.PreTokenizedEncodeInput) 39 | 40 | alias of `Union[str, Tuple[str, str], List[str], Tuple[str], Tuple[Union[List[str], Tuple[str]], Union[List[str], Tuple[str]]], List[Union[List[str], Tuple[str]]]]`. 41 | </python> 42 | <rust> 43 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website. 44 | </rust> 45 | <node> 46 | The node API has not been documented yet. 47 | </node> 48 | </tokenizerslangcontent> -------------------------------------------------------------------------------- /docs/source-doc-builder/api/encoding.mdx: -------------------------------------------------------------------------------- 1 | # Encoding 2 | 3 | <tokenizerslangcontent> 4 | <python> 5 | ## Encoding 6 | 7 | [[autodoc]] tokenizers.Encoding 8 | - all 9 | - attention_mask 10 | - ids 11 | - n_sequences 12 | - offsets 13 | - overflowing 14 | - sequence_ids 15 | - special_tokens_mask 16 | - tokens 17 | - type_ids 18 | - word_ids 19 | - words 20 | </python> 21 | <rust> 22 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website. 23 | </rust> 24 | <node> 25 | The node API has not been documented yet. 
26 | </node> 27 | </tokenizerslangcontent> -------------------------------------------------------------------------------- /docs/source-doc-builder/api/input-sequences.mdx: -------------------------------------------------------------------------------- 1 | # Input Sequences 2 | 3 | <tokenizerslangcontent> 4 | <python> 5 | These types represent all the different kinds of sequence that can be used as input of a Tokenizer. 6 | Globally, any sequence can be either a string or a list of strings, according to the operating 7 | mode of the tokenizer: `raw text` vs `pre-tokenized`. 8 | 9 | ## TextInputSequence[[tokenizers.TextInputSequence]] 10 | 11 | <code>tokenizers.TextInputSequence</code> 12 | 13 | A `str` that represents an input sequence 14 | 15 | ## PreTokenizedInputSequence[[tokenizers.PreTokenizedInputSequence]] 16 | 17 | <code>tokenizers.PreTokenizedInputSequence</code> 18 | 19 | A pre-tokenized input sequence. Can be one of: 20 | - A `List` of `str` 21 | - A `Tuple` of `str` 22 | 23 | alias of `Union[List[str], Tuple[str]]`. 24 | 25 | ## InputSequence[[tokenizers.InputSequence]] 26 | 27 | <code>tokenizers.InputSequence</code> 28 | 29 | Represents all the possible types of input sequences for encoding. Can be: 30 | - When `is_pretokenized=False`: [TextInputSequence](#tokenizers.TextInputSequence) 31 | - When `is_pretokenized=True`: [PreTokenizedInputSequence](#tokenizers.PreTokenizedInputSequence) 32 | 33 | alias of `Union[str, List[str], Tuple[str]]`. 34 | </python> 35 | <rust> 36 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website. 37 | </rust> 38 | <node> 39 | The node API has not been documented yet. 40 | </node> 41 | </tokenizerslangcontent> -------------------------------------------------------------------------------- /docs/source-doc-builder/api/models.mdx: -------------------------------------------------------------------------------- 1 | # Models 2 | 3 | <tokenizerslangcontent> 4 | <python> 5 | ## BPE 6 | 7 | [[autodoc]] tokenizers.models.BPE 8 | 9 | ## Model 10 | 11 | [[autodoc]] tokenizers.models.Model 12 | 13 | ## Unigram 14 | 15 | [[autodoc]] tokenizers.models.Unigram 16 | 17 | ## WordLevel 18 | 19 | [[autodoc]] tokenizers.models.WordLevel 20 | 21 | ## WordPiece 22 | 23 | [[autodoc]] tokenizers.models.WordPiece 24 | </python> 25 | <rust> 26 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website. 27 | </rust> 28 | <node> 29 | The node API has not been documented yet. 
30 | </node> 31 | </tokenizerslangcontent> -------------------------------------------------------------------------------- /docs/source-doc-builder/api/normalizers.mdx: -------------------------------------------------------------------------------- 1 | # Normalizers 2 | 3 | <tokenizerslangcontent> 4 | <python> 5 | ## BertNormalizer 6 | 7 | [[autodoc]] tokenizers.normalizers.BertNormalizer 8 | 9 | ## Lowercase 10 | 11 | [[autodoc]] tokenizers.normalizers.Lowercase 12 | 13 | ## NFC 14 | 15 | [[autodoc]] tokenizers.normalizers.NFC 16 | 17 | ## NFD 18 | 19 | [[autodoc]] tokenizers.normalizers.NFD 20 | 21 | ## NFKC 22 | 23 | [[autodoc]] tokenizers.normalizers.NFKC 24 | 25 | ## NFKD 26 | 27 | [[autodoc]] tokenizers.normalizers.NFKD 28 | 29 | ## Nmt 30 | 31 | [[autodoc]] tokenizers.normalizers.Nmt 32 | 33 | ## Normalizer 34 | 35 | [[autodoc]] tokenizers.normalizers.Normalizer 36 | 37 | ## Precompiled 38 | 39 | [[autodoc]] tokenizers.normalizers.Precompiled 40 | 41 | ## Replace 42 | 43 | [[autodoc]] tokenizers.normalizers.Replace 44 | 45 | ## Sequence 46 | 47 | [[autodoc]] tokenizers.normalizers.Sequence 48 | 49 | ## Strip 50 | 51 | [[autodoc]] tokenizers.normalizers.Strip 52 | 53 | ## StripAccents 54 | 55 | [[autodoc]] tokenizers.normalizers.StripAccents 56 | </python> 57 | <rust> 58 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website. 59 | </rust> 60 | <node> 61 | The node API has not been documented yet. 62 | </node> 63 | </tokenizerslangcontent> -------------------------------------------------------------------------------- /docs/source-doc-builder/api/post-processors.mdx: -------------------------------------------------------------------------------- 1 | # Post-processors 2 | 3 | <tokenizerslangcontent> 4 | <python> 5 | ## BertProcessing 6 | 7 | [[autodoc]] tokenizers.processors.BertProcessing 8 | 9 | ## ByteLevel 10 | 11 | [[autodoc]] tokenizers.processors.ByteLevel 12 | 13 | ## RobertaProcessing 14 | 15 | [[autodoc]] tokenizers.processors.RobertaProcessing 16 | 17 | ## TemplateProcessing 18 | 19 | [[autodoc]] tokenizers.processors.TemplateProcessing 20 | </python> 21 | <rust> 22 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website. 23 | </rust> 24 | <node> 25 | The node API has not been documented yet. 
26 | </node> 27 | </tokenizerslangcontent> -------------------------------------------------------------------------------- /docs/source-doc-builder/api/pre-tokenizers.mdx: -------------------------------------------------------------------------------- 1 | # Pre-tokenizers 2 | 3 | <tokenizerslangcontent> 4 | <python> 5 | ## BertPreTokenizer 6 | 7 | [[autodoc]] tokenizers.pre_tokenizers.BertPreTokenizer 8 | 9 | ## ByteLevel 10 | 11 | [[autodoc]] tokenizers.pre_tokenizers.ByteLevel 12 | 13 | ## CharDelimiterSplit 14 | 15 | [[autodoc]] tokenizers.pre_tokenizers.CharDelimiterSplit 16 | 17 | ## Digits 18 | 19 | [[autodoc]] tokenizers.pre_tokenizers.Digits 20 | 21 | ## Metaspace 22 | 23 | [[autodoc]] tokenizers.pre_tokenizers.Metaspace 24 | 25 | ## PreTokenizer 26 | 27 | [[autodoc]] tokenizers.pre_tokenizers.PreTokenizer 28 | 29 | ## Punctuation 30 | 31 | [[autodoc]] tokenizers.pre_tokenizers.Punctuation 32 | 33 | ## Sequence 34 | 35 | [[autodoc]] tokenizers.pre_tokenizers.Sequence 36 | 37 | ## Split 38 | 39 | [[autodoc]] tokenizers.pre_tokenizers.Split 40 | 41 | ## UnicodeScripts 42 | 43 | [[autodoc]] tokenizers.pre_tokenizers.UnicodeScripts 44 | 45 | ## Whitespace 46 | 47 | [[autodoc]] tokenizers.pre_tokenizers.Whitespace 48 | 49 | ## WhitespaceSplit 50 | 51 | [[autodoc]] tokenizers.pre_tokenizers.WhitespaceSplit 52 | </python> 53 | <rust> 54 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website. 55 | </rust> 56 | <node> 57 | The node API has not been documented yet. 58 | </node> 59 | </tokenizerslangcontent> -------------------------------------------------------------------------------- /docs/source-doc-builder/api/tokenizer.mdx: -------------------------------------------------------------------------------- 1 | # Tokenizer 2 | 3 | <tokenizerslangcontent> 4 | <python> 5 | ## Tokenizer 6 | 7 | [[autodoc]] tokenizers.Tokenizer 8 | - all 9 | - decoder 10 | - model 11 | - normalizer 12 | - padding 13 | - post_processor 14 | - pre_tokenizer 15 | - truncation 16 | </python> 17 | <rust> 18 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website. 19 | </rust> 20 | <node> 21 | The node API has not been documented yet. 22 | </node> 23 | </tokenizerslangcontent> -------------------------------------------------------------------------------- /docs/source-doc-builder/api/trainers.mdx: -------------------------------------------------------------------------------- 1 | # Trainers 2 | 3 | <tokenizerslangcontent> 4 | <python> 5 | ## BpeTrainer 6 | 7 | [[autodoc]] tokenizers.trainers.BpeTrainer 8 | 9 | ## UnigramTrainer 10 | 11 | [[autodoc]] tokenizers.trainers.UnigramTrainer 12 | 13 | ## WordLevelTrainer 14 | 15 | [[autodoc]] tokenizers.trainers.WordLevelTrainer 16 | 17 | ## WordPieceTrainer 18 | 19 | [[autodoc]] tokenizers.trainers.WordPieceTrainer 20 | </python> 21 | <rust> 22 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website. 23 | </rust> 24 | <node> 25 | The node API has not been documented yet. 
26 | </node> 27 | </tokenizerslangcontent> -------------------------------------------------------------------------------- /docs/source-doc-builder/api/visualizer.mdx: -------------------------------------------------------------------------------- 1 | # Visualizer 2 | 3 | <tokenizerslangcontent> 4 | <python> 5 | ## Annotation 6 | 7 | [[autodoc]] tokenizers.tools.Annotation 8 | 9 | ## EncodingVisualizer 10 | 11 | [[autodoc]] tokenizers.tools.EncodingVisualizer 12 | - __call__ 13 | </python> 14 | <rust> 15 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website. 16 | </rust> 17 | <node> 18 | The node API has not been documented yet. 19 | </node> 20 | </tokenizerslangcontent> -------------------------------------------------------------------------------- /docs/source-doc-builder/index.mdx: -------------------------------------------------------------------------------- 1 | <!-- DISABLE-FRONTMATTER-SECTIONS --> 2 | 3 | # Tokenizers 4 | 5 | Fast State-of-the-art tokenizers, optimized for both research and 6 | production 7 | 8 | [🤗 Tokenizers](https://github.com/huggingface/tokenizers) provides an 9 | implementation of today's most used tokenizers, with a focus on 10 | performance and versatility. These tokenizers are also used in [🤗 Transformers](https://github.com/huggingface/transformers). 11 | 12 | # Main features: 13 | 14 | - Train new vocabularies and tokenize, using today's most used tokenizers. 15 | - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes less than 20 seconds to tokenize a GB of text on a server's CPU. 16 | - Easy to use, but also extremely versatile. 17 | - Designed for both research and production. 18 | - Full alignment tracking. Even with destructive normalization, it's always possible to get the part of the original sentence that corresponds to any token. 19 | - Does all the pre-processing: Truncation, Padding, add the special tokens your model needs. 20 | -------------------------------------------------------------------------------- /docs/source-doc-builder/installation.mdx: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | <tokenizerslangcontent> 4 | <python> 5 | 🤗 Tokenizers is tested on Python 3.5+. 6 | 7 | You should install 🤗 Tokenizers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're 8 | unfamiliar with Python virtual environments, check out the [user 9 | guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). 10 | Create a virtual environment with the version of Python you're going to 11 | use and activate it. 12 | 13 | ## Installation with pip 14 | 15 | 🤗 Tokenizers can be installed using pip as follows: 16 | 17 | ```bash 18 | pip install tokenizers 19 | ``` 20 | 21 | ## Installation from sources 22 | 23 | To use this method, you need to have the Rust language installed. You 24 | can follow [the official 25 | guide](https://www.rust-lang.org/learn/get-started) for more 26 | information. 
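Right after the `pip install tokenizers` step above, and before deciding whether you need a source build at all, a quick smoke test can confirm that the bindings import and run. This snippet is only an illustrative sketch (it is not part of these docs), and the `[UNK]` token name is an arbitrary choice:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Instantiating a Tokenizer proves the compiled extension loads correctly.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
print(type(tokenizer))  # <class 'tokenizers.Tokenizer'>
```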
27 | 28 | If you are using a unix based OS, the installation should be as simple 29 | as running: 30 | 31 | ```bash 32 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 33 | ``` 34 | 35 | Or you can easily update it with the following command: 36 | 37 | ```bash 38 | rustup update 39 | ``` 40 | 41 | Once rust is installed, we can start retrieving the sources for 🤗 42 | Tokenizers: 43 | 44 | ```bash 45 | git clone https://github.com/huggingface/tokenizers 46 | ``` 47 | 48 | Then we go into the python bindings folder: 49 | 50 | ```bash 51 | cd tokenizers/bindings/python 52 | ``` 53 | 54 | At this point you should have your [virtual environment]() already 55 | activated. In order to compile 🤗 Tokenizers, you need to: 56 | 57 | ```bash 58 | pip install -e . 59 | ``` 60 | 61 | </python> 62 | <rust> 63 | ## Crates.io 64 | 65 | 🤗 Tokenizers is available on [crates.io](https://crates.io/crates/tokenizers). 66 | 67 | You just need to add it to your `Cargo.toml`: 68 | 69 | ```bash 70 | cargo add tokenizers 71 | ``` 72 | </rust> 73 | <node> 74 | ## Installation with npm 75 | 76 | You can simply install 🤗 Tokenizers with npm using: 77 | 78 | ```bash 79 | npm install tokenizers 80 | ``` 81 | </node> 82 | </tokenizerslangcontent> 83 | -------------------------------------------------------------------------------- /docs/source/_ext/rust_doc.py: -------------------------------------------------------------------------------- 1 | from docutils import nodes 2 | 3 | import sphinx 4 | from sphinx.locale import _ 5 | 6 | from conf import rust_version 7 | 8 | logger = sphinx.util.logging.getLogger(__name__) 9 | 10 | 11 | class RustRef: 12 | def __call__(self, name, rawtext, text, lineno, inliner, options={}, content=[]): 13 | doctype = name.split("_")[1] 14 | parts = text.split("::") 15 | 16 | if text.startswith("~"): 17 | title = parts[-1] 18 | parts[0] = parts[0][1:] 19 | else: 20 | content = text 21 | link = self.base_link() 22 | 23 | if doctype == "struct": 24 | l, title = self.make_struct_link(parts, title) 25 | if doctype == "func": 26 | l, title = self.make_func_link(parts, title) 27 | if doctype == "meth": 28 | l, title = self.make_meth_link(parts, title) 29 | if doctype == "trait": 30 | l, title = self.make_trait_link(parts, title) 31 | link += l 32 | 33 | node = nodes.reference(internal=False, refuri=link, text=title) 34 | wrapper = nodes.literal(classes=["xref"]) 35 | wrapper += node 36 | 37 | return [wrapper], [] 38 | 39 | def base_link(self): 40 | return f"https://docs.rs/tokenizers/{rust_version}" 41 | 42 | def make_struct_link(self, parts, title): 43 | link = "" 44 | struct_name = parts[-1] 45 | path = parts[:-1] 46 | 47 | for p in path: 48 | link += f"/{p}" 49 | link += f"/struct.{struct_name}.html" 50 | 51 | return link, title 52 | 53 | def make_func_link(self, parts, title): 54 | link = "" 55 | fn_name = parts[-1] 56 | 57 | path = parts[:-1] 58 | for p in path: 59 | link += f"/{p}" 60 | link += f"/fn.{fn_name}.html" 61 | 62 | return link, title 63 | 64 | def make_meth_link(self, parts, title): 65 | meth_name = parts[-1] 66 | if meth_name.endswith("()"): 67 | meth_name = meth_name[:-2] 68 | 69 | link, title = self.make_struct_link(parts[:-1], title) 70 | link += f"#method.{meth_name}" 71 | 72 | if not title.endswith(")"): 73 | title += "()" 74 | 75 | return link, title 76 | 77 | def make_trait_link(self, parts, title): 78 | link = "" 79 | trait_name = parts[-1] 80 | 81 | path = parts[:-1] 82 | for p in path: 83 | link += f"/{p}" 84 | link += f"/trait.{trait_name}.html" 85 | 86 | 
return link, title 87 | 88 | 89 | def setup(app): 90 | app.add_role("rust_struct", RustRef()) 91 | app.add_role("rust_func", RustRef()) 92 | app.add_role("rust_meth", RustRef()) 93 | app.add_role("rust_trait", RustRef()) 94 | 95 | return { 96 | "version": "0.1", 97 | "parallel_read_safe": True, 98 | "parallel_write_safe": True, 99 | } 100 | -------------------------------------------------------------------------------- /docs/source/_ext/toctree_tags.py: -------------------------------------------------------------------------------- 1 | import re 2 | from sphinx.directives.other import TocTree 3 | 4 | 5 | class TocTreeTags(TocTree): 6 | hasPat = re.compile(r"^\s*:(.+):(.+)$") 7 | 8 | def filter_entries(self, entries): 9 | filtered = [] 10 | for e in entries: 11 | m = self.hasPat.match(e) 12 | if m != None: 13 | if self.env.app.tags.has(m.groups()[0]): 14 | filtered.append(m.groups()[1]) 15 | else: 16 | filtered.append(e) 17 | return filtered 18 | 19 | def run(self): 20 | self.content = self.filter_entries(self.content) 21 | return super().run() 22 | 23 | 24 | def setup(app): 25 | app.add_directive("toctree-tags", TocTreeTags) 26 | 27 | return { 28 | "version": "0.1", 29 | } 30 | -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Light.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/docs/source/_static/css/Calibre-Light.ttf -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Medium.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/docs/source/_static/css/Calibre-Medium.otf -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Regular.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/docs/source/_static/css/Calibre-Regular.otf -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Thin.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/docs/source/_static/css/Calibre-Thin.otf -------------------------------------------------------------------------------- /docs/source/_static/css/code-snippets.css: -------------------------------------------------------------------------------- 1 | 2 | .highlight .c1, .highlight .sd{ 3 | color: #999 4 | } 5 | 6 | .highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc, .highlight .kt { 7 | color: #FB8D68; 8 | } 9 | 10 | .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow, .highlight .kd, .highlight .kr, .highlight .s { 11 | color: #6670FF; 12 | } 13 | 14 | .highlight .gp { 15 | color: #FB8D68; 16 | } 17 | -------------------------------------------------------------------------------- /docs/source/api/node.inc: -------------------------------------------------------------------------------- 1 | Documentation 2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 3 | 4 | The node API has not 
been documented yet. 5 | -------------------------------------------------------------------------------- /docs/source/api/python.inc: -------------------------------------------------------------------------------- 1 | Input sequences 2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 3 | 4 | These types represent all the different kinds of sequence that can be used as input of a Tokenizer. 5 | Globally, any sequence can be either a string or a list of strings, according to the operating 6 | mode of the tokenizer: ``raw text`` vs ``pre-tokenized``. 7 | 8 | .. autodata:: tokenizers.TextInputSequence 9 | 10 | .. autodata:: tokenizers.PreTokenizedInputSequence 11 | 12 | .. autodata:: tokenizers.InputSequence 13 | 14 | 15 | Encode inputs 16 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 17 | 18 | These types represent all the different kinds of input that a :class:`~tokenizers.Tokenizer` accepts 19 | when using :meth:`~tokenizers.Tokenizer.encode_batch`. 20 | 21 | .. autodata:: tokenizers.TextEncodeInput 22 | 23 | .. autodata:: tokenizers.PreTokenizedEncodeInput 24 | 25 | .. autodata:: tokenizers.EncodeInput 26 | 27 | 28 | Tokenizer 29 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 30 | 31 | .. autoclass:: tokenizers.Tokenizer 32 | :members: 33 | 34 | 35 | Encoding 36 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 37 | 38 | .. autoclass:: tokenizers.Encoding 39 | :members: 40 | 41 | 42 | Added Tokens 43 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 44 | 45 | .. autoclass:: tokenizers.AddedToken 46 | :members: 47 | 48 | 49 | Models 50 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 51 | 52 | .. automodule:: tokenizers.models 53 | :members: 54 | 55 | Normalizers 56 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 57 | 58 | .. automodule:: tokenizers.normalizers 59 | :members: 60 | 61 | 62 | Pre-tokenizers 63 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 64 | 65 | .. automodule:: tokenizers.pre_tokenizers 66 | :members: 67 | 68 | 69 | Post-processor 70 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 71 | 72 | .. automodule:: tokenizers.processors 73 | :members: 74 | 75 | 76 | Trainers 77 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 78 | 79 | .. automodule:: tokenizers.trainers 80 | :members: 81 | 82 | Decoders 83 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 84 | 85 | .. automodule:: tokenizers.decoders 86 | :members: 87 | 88 | Visualizer 89 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 90 | 91 | .. autoclass:: tokenizers.tools.Annotation 92 | :members: 93 | 94 | .. autoclass:: tokenizers.tools.EncodingVisualizer 95 | :members: __call__ 96 | -------------------------------------------------------------------------------- /docs/source/api/reference.rst: -------------------------------------------------------------------------------- 1 | .. only:: python 2 | 3 | .. include:: python.inc 4 | 5 | .. only:: rust 6 | 7 | .. 
include:: rust.inc 8 | 9 | .. only:: node 10 | 11 | .. include:: node.inc 12 | -------------------------------------------------------------------------------- /docs/source/api/rust.inc: -------------------------------------------------------------------------------- 1 | Documentation 2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 3 | 4 | The Rust API Reference is available directly on the `Docs.rs <https://docs.rs/tokenizers>`__ 5 | website. 6 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("./_ext")) 17 | sys.path.insert(0, os.path.abspath(".")) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "tokenizers" 23 | copyright = "2020, huggingface" 24 | author = "huggingface" 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = "" 28 | 29 | # -- Custom information ------------------------------------------------------ 30 | 31 | # The possible values for languages (used by `_ext/entities`) 32 | languages = ["node", "rust", "python"] 33 | 34 | # This defines the version used to generate links to docs.rs 35 | rust_version = "latest" 36 | 37 | # -- General configuration --------------------------------------------------- 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon", "entities", "rust_doc", "toctree_tags"] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ["_templates"] 46 | 47 | # List of patterns, relative to source directory, that match files and 48 | # directories to ignore when looking for source files. 49 | # This pattern also affects html_static_path and html_extra_path. 50 | exclude_patterns = [] 51 | 52 | # -- Options for HTML output ------------------------------------------------- 53 | 54 | # The theme to use for HTML and HTML Help pages. See the documentation for 55 | # a list of builtin themes. 56 | # 57 | html_theme = "sphinx_rtd_theme" 58 | 59 | # Theme options are theme-specific and customize the look and feel of a theme 60 | # further. For a list of options available for each theme, see the 61 | # documentation. 62 | # 63 | html_theme_options = {"analytics_id": "UA-83738774-2"} 64 | 65 | # Add any paths that contain custom static files (such as style sheets) here, 66 | # relative to this directory. They are copied after the builtin static files, 67 | # so a file named "default.css" will overwrite the builtin "default.css". 
68 | html_static_path = ["_static"] 69 | 70 | 71 | def setup(app): 72 | for language in languages: 73 | if not tags.has(language): 74 | exclude_patterns.append(f"tutorials/{language}/*") 75 | 76 | app.add_css_file("css/huggingface.css") 77 | app.add_css_file("css/code-snippets.css") 78 | app.add_js_file("js/custom.js") 79 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Tokenizers 2 | ==================================================================================================== 3 | 4 | Fast State-of-the-art tokenizers, optimized for both research and production 5 | 6 | `🤗 Tokenizers`_ provides an implementation of today's most used tokenizers, with 7 | a focus on performance and versatility. These tokenizers are also used in 8 | `🤗 Transformers`_. 9 | 10 | .. _🤗 Tokenizers: https://github.com/huggingface/tokenizers 11 | .. _🤗 Transformers: https://github.com/huggingface/transformers 12 | 13 | Main features: 14 | ---------------------------------------------------------------------------------------------------- 15 | 16 | - Train new vocabularies and tokenize, using today's most used tokenizers. 17 | - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes 18 | less than 20 seconds to tokenize a GB of text on a server's CPU. 19 | - Easy to use, but also extremely versatile. 20 | - Designed for both research and production. 21 | - Full alignment tracking. Even with destructive normalization, it's always possible to get 22 | the part of the original sentence that corresponds to any token. 23 | - Does all the pre-processing: Truncation, Padding, add the special tokens your model needs. 24 | 25 | 26 | .. toctree:: 27 | :maxdepth: 2 28 | :caption: Getting Started 29 | 30 | quicktour 31 | installation/main 32 | pipeline 33 | components 34 | 35 | .. toctree-tags:: 36 | :maxdepth: 3 37 | :caption: Using 🤗 Tokenizers 38 | :glob: 39 | 40 | :python:tutorials/python/* 41 | 42 | .. toctree:: 43 | :maxdepth: 3 44 | :caption: API Reference 45 | 46 | api/reference 47 | 48 | .. include:: entities.inc 49 | -------------------------------------------------------------------------------- /docs/source/installation/main.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ==================================================================================================== 3 | 4 | .. only:: python 5 | 6 | .. include:: python.inc 7 | 8 | .. only:: rust 9 | 10 | .. include:: rust.inc 11 | 12 | .. only:: node 13 | 14 | .. include:: node.inc 15 | 16 | -------------------------------------------------------------------------------- /docs/source/installation/node.inc: -------------------------------------------------------------------------------- 1 | Installation with npm 2 | ---------------------------------------------------------------------------------------------------- 3 | 4 | You can simply install 🤗 Tokenizers with npm using:: 5 | 6 | npm install tokenizers 7 | -------------------------------------------------------------------------------- /docs/source/installation/python.inc: -------------------------------------------------------------------------------- 1 | 🤗 Tokenizers is tested on Python 3.5+. 2 | 3 | You should install 🤗 Tokenizers in a 4 | `virtual environment <https://docs.python.org/3/library/venv.html>`_. 
If you're unfamiliar with 5 | Python virtual environments, check out the 6 | `user guide <https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/>`__. 7 | Create a virtual environment with the version of Python you're going to use and activate it. 8 | 9 | Installation with pip 10 | ---------------------------------------------------------------------------------------------------- 11 | 12 | 🤗 Tokenizers can be installed using pip as follows:: 13 | 14 | pip install tokenizers 15 | 16 | 17 | Installation from sources 18 | ---------------------------------------------------------------------------------------------------- 19 | 20 | To use this method, you need to have the Rust language installed. You can follow 21 | `the official guide <https://www.rust-lang.org/learn/get-started>`__ for more information. 22 | 23 | If you are using a unix based OS, the installation should be as simple as running:: 24 | 25 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 26 | 27 | Or you can easily update it with the following command:: 28 | 29 | rustup update 30 | 31 | Once rust is installed, we can start retrieving the sources for 🤗 Tokenizers:: 32 | 33 | git clone https://github.com/huggingface/tokenizers 34 | 35 | Then we go into the python bindings folder:: 36 | 37 | cd tokenizers/bindings/python 38 | 39 | At this point you should have your `virtual environment`_ already activated. In order to 40 | compile 🤗 Tokenizers, you need to:: 41 | 42 | pip install -e . 43 | -------------------------------------------------------------------------------- /docs/source/installation/rust.inc: -------------------------------------------------------------------------------- 1 | Crates.io 2 | ---------------------------------------------------------------------------------------------------- 3 | 4 | 🤗 Tokenizers is available on `crates.io <https://crates.io/crates/tokenizers>`__. 5 | 6 | You just need to add it to your :obj:`Cargo.toml`:: 7 | 8 | tokenizers = "0.10" 9 | -------------------------------------------------------------------------------- /tokenizers/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Anthony MOI <m.anthony.moi@gmail.com>", "Nicolas Patry <patry.nicolas@protonmail.com>"] 3 | edition = "2018" 4 | name = "tokenizers" 5 | version = "0.21.4-dev.0" 6 | homepage = "https://github.com/huggingface/tokenizers" 7 | repository = "https://github.com/huggingface/tokenizers" 8 | documentation = "https://docs.rs/tokenizers/" 9 | license = "Apache-2.0" 10 | keywords = ["tokenizer", "NLP", "huggingface", "BPE", "WordPiece"] 11 | readme = "./README.md" 12 | description = """ 13 | Provides an implementation of today's most used tokenizers, 14 | with a focus on performances and versatility. 
15 | """ 16 | exclude = [ "rust-toolchain", "target/*", "Cargo.lock", "benches/*.txt", "benches/*.json", "data/*" ] 17 | 18 | [lib] 19 | name = "tokenizers" 20 | path = "src/lib.rs" 21 | bench = false 22 | 23 | [[bench]] 24 | name = "bpe_benchmark" 25 | harness = false 26 | 27 | [[bench]] 28 | name = "bert_benchmark" 29 | harness = false 30 | 31 | [[bench]] 32 | name = "layout_benchmark" 33 | harness = false 34 | 35 | [[bench]] 36 | name = "unigram_benchmark" 37 | harness = false 38 | 39 | [[bench]] 40 | name = "llama3_benchmark" 41 | harness = false 42 | 43 | [dependencies] 44 | rand = "0.9" 45 | onig = { version = "6.5.1", default-features = false, optional = true } 46 | regex = "1.10" 47 | regex-syntax = "0.8" 48 | rayon = "1.10" 49 | rayon-cond = "0.4" 50 | serde = { version = "1.0", features = [ "derive" ] } 51 | serde_json = "1.0" 52 | unicode-normalization-alignments = "0.1" 53 | unicode_categories = "0.1" 54 | unicode-segmentation = "1.11" 55 | indicatif = {version = "0.17", optional = true} 56 | itertools = "0.14" 57 | log = "0.4" 58 | derive_builder = "0.20" 59 | spm_precompiled = "0.1.3" 60 | hf-hub = { version = "0.4.1", features = ["ureq"], default-features = false, optional = true } 61 | aho-corasick = "1.1" 62 | paste = "1.0.14" 63 | macro_rules_attribute = "0.2.0" 64 | thiserror = "2" 65 | fancy-regex = { version = "0.14", optional = true} 66 | getrandom = { version = "0.3" } 67 | esaxx-rs = { version = "0.1.10", default-features = false, features=[]} 68 | monostate = "0.1.12" 69 | ahash = { version = "0.8.11", features = ["serde"] } 70 | dary_heap = { version = "0.3.6", features = ["serde"] } 71 | compact_str = { version = "0.9", features = ["serde"] } 72 | 73 | [features] 74 | default = ["progressbar", "onig", "esaxx_fast"] 75 | esaxx_fast = ["esaxx-rs/cpp"] 76 | progressbar = ["indicatif"] 77 | http = ["hf-hub"] 78 | unstable_wasm = ["fancy-regex", "getrandom/wasm_js"] 79 | rustls-tls = ["hf-hub?/rustls-tls"] 80 | 81 | [dev-dependencies] 82 | criterion = "0.6" 83 | tempfile = "3.10" 84 | assert_approx_eq = "1.1" 85 | tracing = "0.1" 86 | tracing-subscriber = "0.3.18" 87 | 88 | [profile.release] 89 | lto = "fat" 90 | 91 | [[example]] 92 | name = "encode_batch" 93 | required-features = ["http"] 94 | 95 | -------------------------------------------------------------------------------- /tokenizers/LICENSE: -------------------------------------------------------------------------------- 1 | ../LICENSE -------------------------------------------------------------------------------- /tokenizers/Makefile: -------------------------------------------------------------------------------- 1 | DATA_DIR = data 2 | BENCHMARK_DIR = benches 3 | TESTS_DIR = tests 4 | 5 | dir_guard=@mkdir -p $(@D) 6 | 7 | SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/llama-3-tokenizer.json 8 | BENCHMARK_RESOURCES = $(SHARED_RESOURCES) 9 | TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json 10 | 11 | .PHONY : build 12 | build : 13 | cargo build --all-targets 14 | 15 | .PHONY : release 16 | release : 17 | cargo build --release 18 | 19 | .PHONY : format 20 | format : 21 | cargo fmt -- 22 | 23 | .PHONY : lint 24 | lint : 25 | cargo fmt -- --check 26 | cargo fmt -- $(BENCHMARK_DIR)/*.rs --check 27 | cargo clippy 
--all-targets --all-features -- -D warnings 28 | 29 | .PHONY : test 30 | test : $(TESTS_RESOURCES) 31 | cargo test 32 | 33 | .PHONY : doc 34 | doc : 35 | cargo doc 36 | 37 | .PHONY : publish 38 | publish : 39 | cargo publish 40 | 41 | .PHONY : all-checks 42 | all-checks : lint test doc 43 | 44 | .PHONY : bench 45 | bench : $(BENCHMARK_RESOURCES) 46 | cargo bench -- --verbose 47 | 48 | $(DATA_DIR)/gpt2-% : 49 | $(dir_guard) 50 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-$* -O $@ 51 | 52 | $(DATA_DIR)/bert-% : 53 | $(dir_guard) 54 | wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-$* -O $@ 55 | 56 | $(DATA_DIR)/unigram% : 57 | $(dir_guard) 58 | wget https://huggingface.co/Narsil/small/raw/main/unigram$* -O $@ 59 | 60 | $(DATA_DIR)/albert-base-v1-tokenizer.json : 61 | $(dir_guard) 62 | wget https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json -O $@ 63 | 64 | $(DATA_DIR)/tokenizer-llama3.json : 65 | $(dir_guard) 66 | wget https://huggingface.co/Narsil/llama-tokenizer/resolve/main/tokenizer.json -O $@ 67 | 68 | $(DATA_DIR)/big.txt : 69 | $(dir_guard) 70 | wget https://norvig.com/big.txt -O $@ 71 | 72 | $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt 73 | head -100 $(DATA_DIR)/big.txt > $@ 74 | 75 | $(DATA_DIR)/roberta.json : 76 | $(dir_guard) 77 | wget https://huggingface.co/Narsil/small/raw/main/roberta.json -O $@ 78 | 79 | $(DATA_DIR)/tokenizer-wiki.json : 80 | $(dir_guard) 81 | wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@ 82 | 83 | $(DATA_DIR)/bert-wiki.json : 84 | $(dir_guard) 85 | wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json -O $@ 86 | 87 | $(DATA_DIR)/llama-3-tokenizer.json : 88 | $(dir_guard) 89 | wget https://huggingface.co/hf-internal-testing/llama3-tokenizer/resolve/main/tokenizer.json -O $@ 90 | -------------------------------------------------------------------------------- /tokenizers/README.tpl: -------------------------------------------------------------------------------- 1 | <p align="center"> 2 | <br> 3 | <img src="https://huggingface.co/landing/assets/tokenizers/tokenizers-logo.png" width="600"/> 4 | <br> 5 | <p> 6 | <p align="center"> 7 | <img alt="Build" src="https://github.com/huggingface/tokenizers/workflows/Rust/badge.svg"> 8 | <a href="https://github.com/huggingface/tokenizers/blob/master/LICENSE"> 9 | <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/tokenizers.svg?color=blue"> 10 | </a> 11 | <a href="https://docs.rs/tokenizers/"> 12 | <img alt="Doc" src="https://docs.rs/tokenizers/badge.svg"> 13 | </a> 14 | </p> 15 | <br> 16 | 17 | 18 | {{readme}} -------------------------------------------------------------------------------- /tokenizers/benches/common/mod.rs: -------------------------------------------------------------------------------- 1 | use std::time::{Duration, Instant}; 2 | 3 | use std::hint::black_box; 4 | 5 | use tokenizers::{ 6 | Decoder, EncodeInput, Model, Normalizer, PostProcessor, PreTokenizer, TokenizerImpl, Trainer, 7 | }; 8 | 9 | #[allow(dead_code)] 10 | pub fn iter_bench_encode<M, N, PT, PP, D>( 11 | iters: u64, 12 | tokenizer: &TokenizerImpl<M, N, PT, PP, D>, 13 | lines: &[EncodeInput], 14 | ) -> Duration 15 | where 16 | M: Model, 17 | N: Normalizer, 18 | PT: PreTokenizer, 19 | PP: PostProcessor, 20 | D: Decoder, 21 | { 22 | let mut duration = Duration::new(0, 0); 23 | for _i in 0..iters { 24 | for line in lines { 25 | let input = line.clone(); 26 | let start = 
Instant::now(); 27 | let _ = black_box(tokenizer.encode(input, false)); 28 | duration = duration.checked_add(start.elapsed()).unwrap(); 29 | } 30 | } 31 | duration 32 | } 33 | 34 | #[allow(dead_code)] 35 | pub fn iter_bench_encode_batch<M, N, PT, PP, D>( 36 | iters: u64, 37 | tokenizer: &TokenizerImpl<M, N, PT, PP, D>, 38 | batches: &[Vec<EncodeInput>], 39 | ) -> Duration 40 | where 41 | M: Model + Send + Sync, 42 | N: Normalizer + Send + Sync, 43 | PT: PreTokenizer + Send + Sync, 44 | PP: PostProcessor + Send + Sync, 45 | D: Decoder + Send + Sync, 46 | { 47 | let mut duration = Duration::new(0, 0); 48 | for _i in 0..iters { 49 | for batch in batches { 50 | let batch = batch.clone(); 51 | let start = Instant::now(); 52 | let _ = black_box(tokenizer.encode_batch(batch, false)); 53 | duration = duration.checked_add(start.elapsed()).unwrap(); 54 | } 55 | } 56 | duration 57 | } 58 | 59 | #[allow(dead_code)] 60 | pub fn iter_bench_train<T, M, N, PT, PP, D>( 61 | iters: u64, 62 | tokenizer: &mut TokenizerImpl<M, N, PT, PP, D>, 63 | trainer: &mut T, 64 | files: Vec<String>, 65 | ) -> Duration 66 | where 67 | T: Trainer<Model = M> + Sync, 68 | M: Model + Send + Sync, 69 | N: Normalizer + Send + Sync, 70 | PT: PreTokenizer + Send + Sync, 71 | PP: PostProcessor + Send + Sync, 72 | D: Decoder + Send + Sync, 73 | { 74 | let mut duration = Duration::new(0, 0); 75 | for _i in 0..iters { 76 | let start = Instant::now(); 77 | tokenizer.train_from_files(trainer, files.clone()).unwrap(); 78 | duration = duration.checked_add(start.elapsed()).unwrap(); 79 | } 80 | duration 81 | } 82 | -------------------------------------------------------------------------------- /tokenizers/benches/layout_benchmark.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate criterion; 3 | 4 | use std::fs::File; 5 | use std::io::{BufRead, BufReader}; 6 | use std::path::Path; 7 | use std::time::{Duration, Instant}; 8 | 9 | use criterion::Criterion; 10 | use std::hint::black_box; 11 | use tokenizers::processors::template::TemplateProcessing; 12 | use tokenizers::{EncodeInput, Encoding, PostProcessor, Tokenizer}; 13 | 14 | /// Simple TemplateProcessing 15 | fn create_processor() -> TemplateProcessing { 16 | TemplateProcessing::builder() 17 | .try_single("[CLS]:0 $A:0 [SEP]:0") 18 | .unwrap() 19 | .try_pair("[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1") 20 | .unwrap() 21 | .special_tokens(vec![("[CLS]", 0), ("[SEP]", 1)]) 22 | .build() 23 | .unwrap() 24 | } 25 | 26 | pub fn bench_layout(c: &mut Criterion) { 27 | let processor = create_processor(); 28 | let tokenizer = Tokenizer::from_file("data/albert-base-v1-tokenizer.json").unwrap(); 29 | let mut encodeds: Vec<Encoding> = vec![]; 30 | for line in BufReader::new(File::open(Path::new("data/big.txt")).unwrap()).lines() { 31 | let line: EncodeInput = line.unwrap().into(); 32 | 33 | let encoded: Encoding = tokenizer.encode(line, false).unwrap(); 34 | encodeds.push(encoded); 35 | } 36 | 37 | c.bench_function("TemplateProcessing single encode", |b| { 38 | b.iter_custom(|iters| { 39 | let mut duration = Duration::new(0, 0); 40 | for i in 0..iters as usize { 41 | let encoded_index = i % encodeds.len(); 42 | let encoded: Encoding = encodeds[encoded_index].clone(); 43 | 44 | let start = Instant::now(); 45 | let _ = black_box(processor.process(encoded, None, false)); 46 | duration = duration.checked_add(start.elapsed()).unwrap(); 47 | } 48 | duration 49 | }) 50 | }); 51 | c.bench_function("TemplateProcessing pair encode", |b| { 52 | 
b.iter_custom(|iters| { 53 | let mut duration = Duration::new(0, 0); 54 | for i in 0..iters as usize { 55 | let encoded_index = i % encodeds.len(); 56 | let encoded: Encoding = encodeds[encoded_index].clone(); 57 | 58 | let encoded_index2 = (i + 1) % encodeds.len(); 59 | let pair: Encoding = encodeds[encoded_index2].clone(); 60 | 61 | let start = Instant::now(); 62 | let _ = black_box(processor.process(encoded, Some(pair), false)); 63 | duration = duration.checked_add(start.elapsed()).unwrap(); 64 | } 65 | duration 66 | }) 67 | }); 68 | } 69 | 70 | criterion_group! { 71 | name = layout_benches; 72 | config = Criterion::default().sample_size(20); 73 | targets = bench_layout 74 | } 75 | 76 | criterion_main!(layout_benches); 77 | -------------------------------------------------------------------------------- /tokenizers/benches/llama3_benchmark.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate criterion; 3 | 4 | mod common; 5 | 6 | use common::{iter_bench_encode, iter_bench_encode_batch, iter_bench_train}; 7 | use criterion::{Criterion, Throughput}; 8 | use std::hint::black_box; 9 | use tokenizers::{ 10 | models::{bpe::BpeTrainerBuilder, TrainerWrapper}, 11 | EncodeInput, Tokenizer, 12 | }; 13 | 14 | static BATCH_SIZE: usize = 1_000; 15 | 16 | pub fn llama3(c: &mut Criterion) { 17 | let data = std::fs::read_to_string("data/big.txt").unwrap(); 18 | let mut group = c.benchmark_group("llama3-encode"); 19 | group.throughput(Throughput::Bytes(data.len() as u64)); 20 | let mut lines: Vec<EncodeInput> = vec![]; 21 | let mut batches: Vec<Vec<EncodeInput>> = vec![vec![]]; 22 | for line in data.lines() { 23 | let line: EncodeInput = line.into(); 24 | lines.push(line.clone()); 25 | if batches.last().unwrap().len() >= BATCH_SIZE { 26 | batches.push(vec![]); 27 | } 28 | batches.last_mut().unwrap().push(line); 29 | } 30 | let tokenizer = Tokenizer::from_file("data/llama-3-tokenizer.json").unwrap(); 31 | group.bench_function("llama3-offsets", |b| { 32 | let data: Vec<_> = data.lines().collect(); 33 | let add_special_tokens = false; 34 | b.iter(|| { 35 | tokenizer 36 | .encode_batch_char_offsets(black_box(data.clone()), add_special_tokens) 37 | .unwrap() 38 | }) 39 | }); 40 | group.bench_function("llama3-encode", |b| { 41 | b.iter_custom(|iters| iter_bench_encode(iters, &tokenizer, &lines)) 42 | }); 43 | group.bench_function("llama3-batch", |b| { 44 | b.iter_custom(|iters| iter_bench_encode_batch(iters, &tokenizer, &batches)) 45 | }); 46 | let mut trainer: TrainerWrapper = BpeTrainerBuilder::default() 47 | .show_progress(false) 48 | .build() 49 | .into(); 50 | let mut tokenizer = Tokenizer::from_file("data/llama-3-tokenizer.json").unwrap(); 51 | group.bench_function("BPE Train vocabulary (big)", |b| { 52 | b.iter_custom(|iters| { 53 | iter_bench_train( 54 | iters, 55 | &mut tokenizer, 56 | &mut trainer, 57 | vec!["data/big.txt".to_string()], 58 | ) 59 | }) 60 | }); 61 | group.finish(); 62 | } 63 | 64 | criterion_group! 
{ 65 | name = llama_3; 66 | config = Criterion::default().sample_size(10); 67 | targets = llama3 68 | } 69 | 70 | criterion_main!(llama_3); 71 | -------------------------------------------------------------------------------- /tokenizers/benches/unigram_benchmark.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate criterion; 3 | 4 | mod common; 5 | 6 | use common::iter_bench_train; 7 | 8 | use criterion::{Criterion, Throughput}; 9 | use tokenizers::models::unigram::{Unigram, UnigramTrainerBuilder}; 10 | use tokenizers::models::TrainerWrapper; 11 | use tokenizers::pre_tokenizers::whitespace::Whitespace; 12 | use tokenizers::Tokenizer; 13 | 14 | // pub fn bench_train(c: &mut Criterion) { 15 | // let trainer = UnigramTrainer::builder() 16 | // .show_progress(false) 17 | // .unk_token(Some("<UNK>".into())) 18 | // .build() 19 | // .unwrap(); 20 | // 21 | // let mut model = Unigram::default(); 22 | // 23 | // let content = read_to_string("data/big.txt").unwrap(); 24 | // c.bench_function("Unigram Train vocabulary (medium)", |b| { 25 | // b.iter_custom(|iters| { 26 | // let mut duration = Duration::new(0, 0); 27 | // for _i in 0..iters { 28 | // let sentences = sentences.clone(); 29 | // let start = Instant::now(); 30 | // trainer.do_train(sentences, &mut model).unwrap(); 31 | // duration = duration.checked_add(start.elapsed()).unwrap(); 32 | // } 33 | // duration 34 | // }) 35 | // }); 36 | // } 37 | fn bench_train(c: &mut Criterion) { 38 | let mut trainer: TrainerWrapper = UnigramTrainerBuilder::default() 39 | .show_progress(false) 40 | .build() 41 | .unwrap() 42 | .into(); 43 | let mut tokenizer = Tokenizer::new(Unigram::default()).into_inner(); 44 | tokenizer.with_pre_tokenizer(Some(Whitespace {})); 45 | let mut group = c.benchmark_group("unigram-train-large"); 46 | let data = std::fs::read_to_string("data/big.txt").unwrap(); 47 | group.throughput(Throughput::Bytes(data.len() as u64)); 48 | group.bench_function("BPE Train vocabulary (big)", |b| { 49 | b.iter_custom(|iters| { 50 | iter_bench_train( 51 | iters, 52 | &mut tokenizer, 53 | &mut trainer, 54 | vec!["data/big.txt".to_string()], 55 | ) 56 | }) 57 | }); 58 | } 59 | 60 | criterion_group! 
{ 61 | name = benches_train; 62 | config = Criterion::default().sample_size(10); 63 | targets = bench_train 64 | } 65 | 66 | criterion_main!(benches_train); 67 | -------------------------------------------------------------------------------- /tokenizers/examples/encode_batch.rs: -------------------------------------------------------------------------------- 1 | use tokenizers::Tokenizer; 2 | 3 | fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> { 4 | let tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None)?; 5 | 6 | let data = std::fs::read_to_string("data/big.txt")?; 7 | let data: Vec<_> = data.lines().collect(); 8 | let add_special_tokens = false; 9 | tokenizer.encode_batch_char_offsets(data, add_special_tokens)?; 10 | Ok(()) 11 | } 12 | -------------------------------------------------------------------------------- /tokenizers/examples/serialization.rs: -------------------------------------------------------------------------------- 1 | use tokenizers::models::wordpiece::WordPiece; 2 | use tokenizers::{AddedToken, Tokenizer}; 3 | 4 | fn main() { 5 | let start = std::time::Instant::now(); 6 | let mut tokenizer = Tokenizer::new(WordPiece::default()); 7 | 8 | // Mix special and not special 9 | // You can make sure ids are in order, and special status is correct. 10 | let tokens: Vec<_> = (0..120_000) 11 | .map(|i| AddedToken::from(format!("[SPECIAL_{i}]"), i % 2 == 0)) 12 | .collect(); 13 | tokenizer.add_tokens(&tokens); 14 | tokenizer.save("_tok.json", true).unwrap(); 15 | println!("Save took {:?}", start.elapsed()); 16 | let start = std::time::Instant::now(); 17 | let _tok = Tokenizer::from_file("_tok.json").unwrap(); 18 | println!("Took {:?}", start.elapsed()); 19 | std::fs::remove_file("_tok.json").unwrap(); 20 | } 21 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | Cargo.lock 4 | bin/ 5 | pkg/ 6 | wasm-pack.log 7 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "unstable_wasm" 3 | version = "0.1.0" 4 | authors = ["Nicolas Patry"] 5 | edition = "2018" 6 | 7 | [lib] 8 | crate-type = ["cdylib", "rlib"] 9 | 10 | [features] 11 | default = ["console_error_panic_hook"] 12 | 13 | [dependencies] 14 | wasm-bindgen = "0.2.63" 15 | 16 | # The `console_error_panic_hook` crate provides better debugging of panics by 17 | # logging them with `console.error`. This is great for development, but requires 18 | # all the `std::fmt` and `std::panicking` infrastructure, so isn't great for 19 | # code size when deploying. 20 | console_error_panic_hook = { version = "0.1.6", optional = true } 21 | 22 | # `wee_alloc` is a tiny allocator for wasm that is only ~1K in code size 23 | # compared to the default allocator's ~10K. It is slower than the default 24 | # allocator, however. 25 | # 26 | # Unfortunately, `wee_alloc` requires nightly Rust when targeting wasm for now. 27 | wee_alloc = { version = "0.4.5", optional = true } 28 | 29 | tokenizers = { path = "../../", default-features=false, features = ["unstable_wasm"]} 30 | 31 | [dev-dependencies] 32 | wasm-bindgen-test = "0.3.13" 33 | 34 | [profile.release] 35 | # Tell `rustc` to optimize for small code size. 
36 | opt-level = "s" 37 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/README.md: -------------------------------------------------------------------------------- 1 | <div align="center"> 2 | 3 | <h1><code>wasm-pack-template</code></h1> 4 | 5 | <strong>A template for kick starting a Rust and WebAssembly project using <a href="https://github.com/rustwasm/wasm-pack">wasm-pack</a>.</strong> 6 | 7 | <p> 8 | <a href="https://travis-ci.org/rustwasm/wasm-pack-template"><img src="https://img.shields.io/travis/rustwasm/wasm-pack-template.svg?style=flat-square" alt="Build Status" /></a> 9 | </p> 10 | 11 | <h3> 12 | <a href="https://rustwasm.github.io/docs/wasm-pack/tutorials/npm-browser-packages/index.html">Tutorial</a> 13 | <span> | </span> 14 | <a href="https://discordapp.com/channels/442252698964721669/443151097398296587">Chat</a> 15 | </h3> 16 | 17 | <sub>Built with 🦀🕸 by <a href="https://rustwasm.github.io/">The Rust and WebAssembly Working Group</a></sub> 18 | </div> 19 | 20 | ## About 21 | 22 | 23 | This is an example project showing off a very basic use case for `wasm` tokenizers 24 | usage. 25 | 26 | [**📚 Read this template tutorial! 📚**][template-docs] 27 | 28 | This template is designed for compiling Rust libraries into WebAssembly and 29 | publishing the resulting package to NPM. 30 | 31 | Be sure to check out [other `wasm-pack` tutorials online][tutorials] for other 32 | templates and usages of `wasm-pack`. 33 | 34 | [tutorials]: https://rustwasm.github.io/docs/wasm-pack/tutorials/index.html 35 | [template-docs]: https://rustwasm.github.io/docs/wasm-pack/tutorials/npm-browser-packages/index.html 36 | 37 | ## 🚴 Usage 38 | 39 | ### 🐑 Use `cargo generate` to Clone this Template 40 | 41 | [Learn more about `cargo generate` here.](https://github.com/ashleygwilliams/cargo-generate) 42 | 43 | ``` 44 | cargo generate --git https://github.com/rustwasm/wasm-pack-template.git --name my-project 45 | cd my-project 46 | ``` 47 | 48 | ### 🛠️ Build with `wasm-pack build` 49 | 50 | ``` 51 | wasm-pack build 52 | ``` 53 | 54 | ### 🔬 Test in Headless Browsers with `wasm-pack test` 55 | 56 | ``` 57 | wasm-pack test --headless --firefox 58 | ``` 59 | 60 | ### 🎁 Publish to NPM with `wasm-pack publish` 61 | 62 | ``` 63 | wasm-pack publish 64 | ``` 65 | 66 | ## 🔋 Batteries Included 67 | 68 | * [`wasm-bindgen`](https://github.com/rustwasm/wasm-bindgen) for communicating 69 | between WebAssembly and JavaScript. 70 | * [`console_error_panic_hook`](https://github.com/rustwasm/console_error_panic_hook) 71 | for logging panic messages to the developer console. 72 | * [`wee_alloc`](https://github.com/rustwasm/wee_alloc), an allocator optimized 73 | for small code size. 74 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod utils; 2 | use tokenizers::models::bpe::{Vocab, BPE}; 3 | use tokenizers::Tokenizer; 4 | 5 | use wasm_bindgen::prelude::*; 6 | 7 | // When the `wee_alloc` feature is enabled, use `wee_alloc` as the global 8 | // allocator. 
9 | #[cfg(feature = "wee_alloc")] 10 | #[global_allocator] 11 | static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT; 12 | 13 | #[wasm_bindgen] 14 | pub fn tokenize(string: &str) -> Vec<u32> { 15 | let vocab: Vocab = vec![ 16 | ("a".to_string(), 0), 17 | ("##b".to_string(), 1), 18 | ("##c".to_string(), 2), 19 | ("ab".to_string(), 3), 20 | ("abc".to_string(), 4), 21 | ] 22 | .into_iter() 23 | .collect(); 24 | 25 | let merges = vec![ 26 | ("a".to_string(), "##b".to_string()), 27 | ("ab".to_string(), "##c".to_string()), 28 | ]; 29 | 30 | let bpe = BPE::builder() 31 | .vocab_and_merges(vocab, merges) 32 | .unk_token("[UNK]".to_string()) 33 | .continuing_subword_prefix("##".to_string()) 34 | .build() 35 | .unwrap(); 36 | let tokenizer = Tokenizer::new(bpe); 37 | tokenizer 38 | .encode(string, false) 39 | .unwrap() 40 | .get_ids() 41 | .into_iter() 42 | .cloned() 43 | .collect() 44 | } 45 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/src/utils.rs: -------------------------------------------------------------------------------- 1 | pub fn set_panic_hook() { 2 | // When the `console_error_panic_hook` feature is enabled, we can call the 3 | // `set_panic_hook` function at least once during initialization, and then 4 | // we will get better error messages if our code ever panics. 5 | // 6 | // For more details see 7 | // https://github.com/rustwasm/console_error_panic_hook#readme 8 | #[cfg(feature = "console_error_panic_hook")] 9 | console_error_panic_hook::set_once(); 10 | } 11 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/tests/web.rs: -------------------------------------------------------------------------------- 1 | //! Test suite for the Web and headless browsers. 
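//! (Typically run in a headless browser via `wasm-pack test --headless --firefox`,
//! as the example's README above describes.)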
2 | 3 | #![cfg(target_arch = "wasm32")] 4 | 5 | extern crate wasm_bindgen_test; 6 | use wasm_bindgen_test::*; 7 | 8 | wasm_bindgen_test_configure!(run_in_browser); 9 | 10 | #[wasm_bindgen_test] 11 | fn pass() { 12 | assert_eq!(1 + 1, 2); 13 | } 14 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/www/.bin/create-wasm-app.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const { spawn } = require("child_process"); 4 | const fs = require("fs"); 5 | 6 | let folderName = '.'; 7 | 8 | if (process.argv.length >= 3) { 9 | folderName = process.argv[2]; 10 | if (!fs.existsSync(folderName)) { 11 | fs.mkdirSync(folderName); 12 | } 13 | } 14 | 15 | const clone = spawn("git", ["clone", "https://github.com/rustwasm/create-wasm-app.git", folderName]); 16 | 17 | clone.on("close", code => { 18 | if (code !== 0) { 19 | console.error("cloning the template failed!") 20 | process.exit(code); 21 | } else { 22 | console.log("🦀 Rust + 🕸 Wasm = ❤"); 23 | } 24 | }); 25 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/www/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist 3 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/www/.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: "10" 3 | 4 | script: 5 | - ./node_modules/.bin/webpack 6 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/www/LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) [year] [name] 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 
26 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/www/README.md: -------------------------------------------------------------------------------- 1 | <div align="center"> 2 | 3 | <h1><code>create-wasm-app</code></h1> 4 | 5 | <strong>An <code>npm init</code> template for kick starting a project that uses NPM packages containing Rust-generated WebAssembly and bundles them with Webpack.</strong> 6 | 7 | <p> 8 | <a href="https://travis-ci.org/rustwasm/create-wasm-app"><img src="https://img.shields.io/travis/rustwasm/create-wasm-app.svg?style=flat-square" alt="Build Status" /></a> 9 | </p> 10 | 11 | <h3> 12 | <a href="#usage">Usage</a> 13 | <span> | </span> 14 | <a href="https://discordapp.com/channels/442252698964721669/443151097398296587">Chat</a> 15 | </h3> 16 | 17 | <sub>Built with 🦀🕸 by <a href="https://rustwasm.github.io/">The Rust and WebAssembly Working Group</a></sub> 18 | </div> 19 | 20 | ## About 21 | 22 | This template is designed for depending on NPM packages that contain 23 | Rust-generated WebAssembly and using them to create a Website. 24 | 25 | * Want to create an NPM package with Rust and WebAssembly? [Check out 26 | `wasm-pack-template`.](https://github.com/rustwasm/wasm-pack-template) 27 | * Want to make a monorepo-style Website without publishing to NPM? Check out 28 | [`rust-webpack-template`](https://github.com/rustwasm/rust-webpack-template) 29 | and/or 30 | [`rust-parcel-template`](https://github.com/rustwasm/rust-parcel-template). 31 | 32 | ## 🚴 Usage 33 | 34 | ``` 35 | npm init wasm-app 36 | ``` 37 | 38 | ## 🔋 Batteries Included 39 | 40 | - `.gitignore`: ignores `node_modules` 41 | - `LICENSE-APACHE` and `LICENSE-MIT`: most Rust projects are licensed this way, so these are included for you 42 | - `README.md`: the file you are reading now! 43 | - `index.html`: a bare bones html document that includes the webpack bundle 44 | - `index.js`: example js file with a comment showing how to import and use a wasm pkg 45 | - `package.json` and `package-lock.json`: 46 | - pulls in devDependencies for using webpack: 47 | - [`webpack`](https://www.npmjs.com/package/webpack) 48 | - [`webpack-cli`](https://www.npmjs.com/package/webpack-cli) 49 | - [`webpack-dev-server`](https://www.npmjs.com/package/webpack-dev-server) 50 | - defines a `start` script to run `webpack-dev-server` 51 | - `webpack.config.js`: configuration file for bundling your js with webpack 52 | 53 | ## License 54 | 55 | Licensed under either of 56 | 57 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 58 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 59 | 60 | at your option. 61 | 62 | ### Contribution 63 | 64 | Unless you explicitly state otherwise, any contribution intentionally 65 | submitted for inclusion in the work by you, as defined in the Apache-2.0 66 | license, shall be dual licensed as above, without any additional terms or 67 | conditions. 68 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/www/bootstrap.js: -------------------------------------------------------------------------------- 1 | // A dependency graph that contains any wasm must all be imported 2 | // asynchronously. This `bootstrap.js` file does the single async import, so 3 | // that no one else needs to worry about it again. 
4 | import("./index.js") 5 | .catch(e => console.error("Error importing `index.js`:", e)); 6 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/www/index.html: -------------------------------------------------------------------------------- 1 | <!DOCTYPE html> 2 | <html> 3 | <head> 4 | <meta charset="utf-8"> 5 | <title>Hello wasm-pack!</title> 6 | </head> 7 | <body> 8 | <noscript>This page contains webassembly and javascript content, please enable javascript in your browser.</noscript> 9 | <script src="./bootstrap.js"></script> 10 | </body> 11 | </html> 12 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/www/index.js: -------------------------------------------------------------------------------- 1 | import * as wasm from "unstable_wasm"; 2 | 3 | console.log(wasm.tokenize("ab")); 4 | console.log(wasm.tokenize("abc")); 5 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/www/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "create-wasm-app", 3 | "version": "0.1.0", 4 | "description": "create an app to consume rust-generated wasm packages", 5 | "main": "index.js", 6 | "bin": { 7 | "create-wasm-app": ".bin/create-wasm-app.js" 8 | }, 9 | "scripts": { 10 | "build": "webpack --config webpack.config.js", 11 | "start": "NODE_OPTIONS=--openssl-legacy-provider webpack-dev-server" 12 | }, 13 | "repository": { 14 | "type": "git", 15 | "url": "git+https://github.com/rustwasm/create-wasm-app.git" 16 | }, 17 | "keywords": ["webassembly", "wasm", "rust", "webpack"], 18 | "author": "Ashley Williams <ashley666ashley@gmail.com>", 19 | "license": "(MIT OR Apache-2.0)", 20 | "bugs": { 21 | "url": "https://github.com/rustwasm/create-wasm-app/issues" 22 | }, 23 | "homepage": "https://github.com/rustwasm/create-wasm-app#readme", 24 | "devDependencies": { 25 | "copy-webpack-plugin": "^11.0.0", 26 | "webpack": "^5.75.0", 27 | "webpack-cli": "^5.0.1", 28 | "webpack-dev-server": "^5.2.1" 29 | }, 30 | "dependencies": { 31 | "unstable_wasm": "file:../pkg" 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /tokenizers/examples/unstable_wasm/www/webpack.config.js: -------------------------------------------------------------------------------- 1 | const CopyWebpackPlugin = require("copy-webpack-plugin"); 2 | const path = require('path'); 3 | 4 | module.exports = { 5 | entry: "./bootstrap.js", 6 | output: { 7 | path: path.resolve(__dirname, "dist"), 8 | filename: "bootstrap.js", 9 | }, 10 | mode: "development", 11 | plugins: [ 12 | new CopyWebpackPlugin(['index.html']) 13 | ], 14 | }; 15 | -------------------------------------------------------------------------------- /tokenizers/rust-toolchain: -------------------------------------------------------------------------------- 1 | stable 2 | -------------------------------------------------------------------------------- /tokenizers/src/decoders/bpe.rs: -------------------------------------------------------------------------------- 1 | use crate::tokenizer::{Decoder, Result}; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | #[derive(Deserialize, Clone, Debug, Serialize)] 6 | /// Allows decoding Original BPE by joining all the tokens and then replacing 7 | /// the suffix used to identify end-of-words by whitespaces 8 | #[serde(tag = "type")] 9 | 
#[non_exhaustive] 10 | pub struct BPEDecoder { 11 | pub suffix: String, 12 | } 13 | 14 | impl BPEDecoder { 15 | pub fn new(suffix: String) -> Self { 16 | Self { suffix } 17 | } 18 | } 19 | 20 | impl Default for BPEDecoder { 21 | fn default() -> Self { 22 | Self::new("</w>".into()) 23 | } 24 | } 25 | 26 | impl Decoder for BPEDecoder { 27 | fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>> { 28 | let n = tokens.len() - 1; 29 | Ok(tokens 30 | .into_iter() 31 | .enumerate() 32 | .map(|(i, token)| { 33 | let replacement = if i == n { "" } else { " " }; 34 | token.replace(&self.suffix, replacement) 35 | }) 36 | .collect()) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tokenizers/src/decoders/fuse.rs: -------------------------------------------------------------------------------- 1 | use crate::tokenizer::{Decoder, Result}; 2 | use monostate::MustBe; 3 | use serde::{Deserialize, Serialize}; 4 | 5 | #[derive(Clone, Debug, Serialize, Deserialize, Default)] 6 | /// Fuse simply fuses all tokens into one big string. 7 | /// It's usually the last decoding step anyway, but this 8 | /// decoder exists incase some decoders need to happen after that 9 | /// step 10 | #[non_exhaustive] 11 | pub struct Fuse { 12 | #[serde(rename = "type")] 13 | type_: MustBe!("Fuse"), 14 | } 15 | 16 | impl Fuse { 17 | pub fn new() -> Self { 18 | Self { 19 | type_: MustBe!("Fuse"), 20 | } 21 | } 22 | } 23 | 24 | impl Decoder for Fuse { 25 | fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>> { 26 | let new_string = tokens.join(""); 27 | Ok(vec![new_string]) 28 | } 29 | } 30 | 31 | #[cfg(test)] 32 | mod tests { 33 | use super::*; 34 | 35 | #[test] 36 | fn decode() { 37 | let decoder = Fuse::new(); 38 | let res = decoder 39 | .decode_chain(vec!["Hey".into(), " friend!".into()]) 40 | .unwrap(); 41 | assert_eq!(res, vec!["Hey friend!"]); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /tokenizers/src/decoders/sequence.rs: -------------------------------------------------------------------------------- 1 | use crate::decoders::DecoderWrapper; 2 | use crate::tokenizer::{Decoder, Result}; 3 | use crate::utils::macro_rules_attribute; 4 | use serde::{Deserialize, Serialize}; 5 | 6 | #[derive(Clone, Debug)] 7 | #[macro_rules_attribute(impl_serde_type!)] 8 | pub struct Sequence { 9 | decoders: Vec<DecoderWrapper>, 10 | } 11 | 12 | impl Sequence { 13 | pub fn new(decoders: Vec<DecoderWrapper>) -> Self { 14 | Self { decoders } 15 | } 16 | 17 | pub fn get_decoders(&self) -> &[DecoderWrapper] { 18 | &self.decoders 19 | } 20 | 21 | pub fn get_decoders_mut(&mut self) -> &mut [DecoderWrapper] { 22 | &mut self.decoders 23 | } 24 | } 25 | 26 | impl Decoder for Sequence { 27 | fn decode_chain(&self, mut tokens: Vec<String>) -> Result<Vec<String>> { 28 | for decoder in &self.decoders { 29 | tokens = decoder.decode_chain(tokens)?; 30 | } 31 | Ok(tokens) 32 | } 33 | } 34 | 35 | #[cfg(test)] 36 | mod tests { 37 | use super::*; 38 | use crate::decoders::ctc::CTC; 39 | use crate::pre_tokenizers::metaspace::Metaspace; 40 | 41 | #[test] 42 | fn sequence_basic() { 43 | let decoders = vec![ 44 | DecoderWrapper::CTC(CTC::default()), 45 | DecoderWrapper::Metaspace(Metaspace::default()), 46 | ]; 47 | let decoder = Sequence::new(decoders); 48 | let tokens: Vec<String> = vec!["▁", "▁", "H", "H", "i", "i", "▁", "y", "o", "u"] 49 | .into_iter() 50 | .map(|s| s.to_string()) 51 | .collect(); 52 | let out_tokens = 
decoder.decode(tokens).unwrap(); 53 | assert_eq!(out_tokens, "Hi you"); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /tokenizers/src/decoders/strip.rs: -------------------------------------------------------------------------------- 1 | use crate::tokenizer::{Decoder, Result}; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | #[derive(Deserialize, Clone, Debug, Serialize, Default)] 6 | /// Strip is a simple trick which converts tokens looking like `<0x61>` 7 | /// to pure bytes, and attempts to make them into a string. If the tokens 8 | /// cannot be decoded you will get � instead for each inconvertible byte token 9 | #[serde(tag = "type")] 10 | #[non_exhaustive] 11 | pub struct Strip { 12 | pub content: char, 13 | pub start: usize, 14 | pub stop: usize, 15 | } 16 | 17 | impl Strip { 18 | pub fn new(content: char, start: usize, stop: usize) -> Self { 19 | Self { 20 | content, 21 | start, 22 | stop, 23 | } 24 | } 25 | } 26 | 27 | impl Decoder for Strip { 28 | fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>> { 29 | Ok(tokens 30 | .into_iter() 31 | .map(|token| { 32 | let chars: Vec<char> = token.chars().collect(); 33 | 34 | let mut start_cut = 0; 35 | for (i, &c) in chars.iter().enumerate().take(self.start) { 36 | if c == self.content { 37 | start_cut = i + 1; 38 | continue; 39 | } else { 40 | break; 41 | } 42 | } 43 | 44 | let mut stop_cut = chars.len(); 45 | for i in 0..self.stop { 46 | let index = chars.len() - i - 1; 47 | if chars[index] == self.content { 48 | stop_cut = index; 49 | continue; 50 | } else { 51 | break; 52 | } 53 | } 54 | 55 | let new_token: String = chars[start_cut..stop_cut].iter().collect(); 56 | new_token 57 | }) 58 | .collect()) 59 | } 60 | } 61 | 62 | #[cfg(test)] 63 | mod tests { 64 | use super::*; 65 | 66 | #[test] 67 | fn decode() { 68 | let decoder = Strip::new('H', 1, 0); 69 | let res = decoder 70 | .decode_chain(vec!["Hey".into(), " friend!".into(), "HHH".into()]) 71 | .unwrap(); 72 | assert_eq!(res, vec!["ey", " friend!", "HH"]); 73 | 74 | let decoder = Strip::new('y', 0, 1); 75 | let res = decoder 76 | .decode_chain(vec!["Hey".into(), " friend!".into()]) 77 | .unwrap(); 78 | assert_eq!(res, vec!["He", " friend!"]); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /tokenizers/src/decoders/wordpiece.rs: -------------------------------------------------------------------------------- 1 | use crate::tokenizer::{Decoder, Result}; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | #[derive(Deserialize, Clone, Debug, Serialize)] 6 | /// The WordPiece decoder takes care of decoding a list of wordpiece tokens 7 | /// back into a readable string. 8 | #[serde(tag = "type")] 9 | #[non_exhaustive] 10 | pub struct WordPiece { 11 | /// The prefix to be used for continuing subwords 12 | pub prefix: String, 13 | /// Whether to cleanup some tokenization artifacts (spaces before punctuation, ...) 
14 | pub cleanup: bool, 15 | } 16 | 17 | impl WordPiece { 18 | pub fn new(prefix: String, cleanup: bool) -> Self { 19 | Self { prefix, cleanup } 20 | } 21 | } 22 | 23 | impl Default for WordPiece { 24 | fn default() -> Self { 25 | Self { 26 | prefix: "##".to_owned(), 27 | cleanup: true, 28 | } 29 | } 30 | } 31 | pub fn cleanup(dirty_input: &str) -> String { 32 | dirty_input 33 | .replace(" .", ".") 34 | .replace(" ?", "?") 35 | .replace(" !", "!") 36 | .replace(" ,", ",") 37 | .replace(" ' ", "'") 38 | .replace(" n't", "n't") 39 | .replace(" 'm", "'m") 40 | .replace(" do not", " don't") 41 | .replace(" 's", "'s") 42 | .replace(" 've", "'ve") 43 | .replace(" 're", "'re") 44 | } 45 | 46 | impl Decoder for WordPiece { 47 | fn decode_chain(&self, mut tokens: Vec<String>) -> Result<Vec<String>> { 48 | for (i, token) in tokens.iter_mut().enumerate() { 49 | if i != 0 { 50 | if let Some(tk) = token.strip_prefix(&self.prefix) { 51 | *token = tk.to_string(); 52 | } else { 53 | *token = format!(" {token}"); 54 | } 55 | } 56 | if self.cleanup { 57 | *token = cleanup(token); 58 | } 59 | } 60 | Ok(tokens) 61 | } 62 | } 63 | 64 | #[cfg(test)] 65 | mod tests { 66 | use super::*; 67 | 68 | #[test] 69 | fn wordpiece_decoder() { 70 | let decoder = WordPiece::new("##".to_string(), false); 71 | 72 | assert_eq!( 73 | decoder 74 | .decode(vec![ 75 | "##uelo".to_string(), 76 | "Ara".to_string(), 77 | "##új".to_string(), 78 | "##o".to_string(), 79 | "No".to_string(), 80 | "##guera".to_string() 81 | ]) 82 | .unwrap(), 83 | "##uelo Araújo Noguera" 84 | ); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /tokenizers/src/models/bpe/mod.rs: -------------------------------------------------------------------------------- 1 | //! [Byte Pair Encoding](https://www.aclweb.org/anthology/P16-1162/) model. 2 | use std::{iter, mem}; 3 | 4 | mod model; 5 | mod serialization; 6 | pub mod trainer; 7 | mod word; 8 | 9 | type Pair = (u32, u32); 10 | 11 | /// Errors that can be encountered while using or constructing a `BPE` model. 12 | #[derive(thiserror::Error, Debug)] 13 | pub enum Error { 14 | /// An error encountered while reading files mainly. 15 | #[error("IoError: {0}")] 16 | Io(#[from] std::io::Error), 17 | /// An error forwarded from Serde, while parsing JSON 18 | #[error("JsonError: {0}")] 19 | JsonError(#[from] serde_json::Error), 20 | /// When the vocab.json file is in the wrong format 21 | #[error("Bad vocabulary json file")] 22 | BadVocabulary, 23 | /// When the merges.txt file is in the wrong format. This error holds the line 24 | /// number of the line that caused the error. 25 | #[error("Merges text file invalid at line {0}")] 26 | BadMerges(usize), 27 | /// If a token found in merges, is not in the vocab 28 | #[error("Token `{0}` out of vocabulary")] 29 | MergeTokenOutOfVocabulary(String), 30 | /// If the provided unk token is out of vocabulary 31 | #[error("Unk token `{0}` not found in the vocabulary")] 32 | UnkTokenOutOfVocabulary(String), 33 | /// Dropout not between 0 and 1. 
34 | #[error("Dropout should be between 0 and 1, inclusive")] 35 | InvalidDropout, 36 | } 37 | 38 | /// Provides access to the `FirstLastIterator` to any Iterator 39 | pub(crate) trait WithFirstLastIterator: Iterator + Sized { 40 | fn with_first_and_last(self) -> FirstLastIterator<Self>; 41 | } 42 | 43 | impl<I> WithFirstLastIterator for I 44 | where 45 | I: Iterator, 46 | { 47 | fn with_first_and_last(self) -> FirstLastIterator<Self> { 48 | FirstLastIterator { 49 | first: true, 50 | iter: self.peekable(), 51 | } 52 | } 53 | } 54 | 55 | /// Provides information about whether an item is the first and/or the last of the iterator 56 | pub(crate) struct FirstLastIterator<I> 57 | where 58 | I: Iterator, 59 | { 60 | first: bool, 61 | iter: iter::Peekable<I>, 62 | } 63 | 64 | impl<I> Iterator for FirstLastIterator<I> 65 | where 66 | I: Iterator, 67 | { 68 | /// (is_first, is_last, item) 69 | type Item = (bool, bool, I::Item); 70 | 71 | fn next(&mut self) -> Option<Self::Item> { 72 | let first = mem::replace(&mut self.first, false); 73 | self.iter 74 | .next() 75 | .map(|e| (first, self.iter.peek().is_none(), e)) 76 | } 77 | } 78 | 79 | // Re-export 80 | pub use model::*; 81 | pub use trainer::*; 82 | use word::*; 83 | -------------------------------------------------------------------------------- /tokenizers/src/models/unigram/mod.rs: -------------------------------------------------------------------------------- 1 | //! [Unigram](https://arxiv.org/abs/1804.10959) model. 2 | mod lattice; 3 | mod model; 4 | mod serialization; 5 | mod trainer; 6 | mod trie; 7 | 8 | pub use lattice::*; 9 | pub use model::*; 10 | pub use trainer::*; 11 | -------------------------------------------------------------------------------- /tokenizers/src/models/unigram/trie.rs: -------------------------------------------------------------------------------- 1 | use ahash::AHashMap; 2 | use std::hash::Hash; 3 | 4 | #[derive(Default)] 5 | pub struct TrieBuilder<Label> { 6 | trie: Trie<Label>, 7 | } 8 | 9 | impl<Label: Eq + Hash + Copy> TrieBuilder<Label> { 10 | pub fn push(&mut self, element: &[Label]) { 11 | self.trie.push(element); 12 | } 13 | 14 | pub fn build(self) -> Trie<Label> { 15 | self.trie 16 | } 17 | } 18 | 19 | #[derive(Clone)] 20 | pub struct Trie<Label> { 21 | root: Node<Label>, 22 | } 23 | 24 | impl<Label: Eq + Hash + Copy> Trie<Label> { 25 | pub fn push(&mut self, element: &[Label]) { 26 | let mut node = &mut self.root; 27 | for label in element.iter() { 28 | node = node.children.entry(*label).or_default(); 29 | } 30 | node.is_leaf = true; 31 | } 32 | 33 | pub fn common_prefix_search<T>(&self, iterator: T) -> TrieIterator<Label, T> 34 | where 35 | T: Iterator<Item = Label>, 36 | { 37 | TrieIterator { 38 | node: &self.root, 39 | prefix: vec![], 40 | iterator, 41 | } 42 | } 43 | } 44 | 45 | pub struct TrieIterator<'a, Label, T> { 46 | node: &'a Node<Label>, 47 | prefix: Vec<Label>, 48 | iterator: T, 49 | } 50 | 51 | impl<Label, T> Iterator for TrieIterator<'_, Label, T> 52 | where 53 | Label: Eq + Hash + Copy, 54 | T: Iterator<Item = Label>, 55 | { 56 | type Item = Vec<Label>; 57 | fn next(&mut self) -> Option<Self::Item> { 58 | loop { 59 | let label = self.iterator.next()?; 60 | self.prefix.push(label); 61 | let child = self.node.children.get(&label)?; 62 | self.node = child; 63 | if self.node.is_leaf { 64 | return Some(self.prefix.clone()); 65 | } 66 | } 67 | } 68 | } 69 | 70 | impl<Label> Default for Trie<Label> { 71 | fn default() -> Self { 72 | Self { 73 | root: Node::default(), 74 | } 75 | } 76 | } 77 
| 78 | #[derive(Clone)] 79 | pub struct Node<Label> { 80 | is_leaf: bool, 81 | children: AHashMap<Label, Node<Label>>, 82 | } 83 | 84 | impl<Label> Default for Node<Label> { 85 | fn default() -> Self { 86 | Self { 87 | is_leaf: false, 88 | children: AHashMap::new(), 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /tokenizers/src/normalizers/precompiled.rs: -------------------------------------------------------------------------------- 1 | use crate::tokenizer::{NormalizedString, Normalizer, Result}; 2 | pub use spm_precompiled::Precompiled; 3 | use std::cmp::Ordering; 4 | use unicode_segmentation::UnicodeSegmentation; 5 | 6 | fn replace(transformations: &mut Vec<(char, isize)>, old_part: &str, new_part: &str) { 7 | let old_count = old_part.chars().count() as isize; 8 | let new_count = new_part.chars().count() as isize; 9 | let diff = new_count - old_count; 10 | 11 | // If we are just replacing characters, all changes should be == 0 12 | transformations.extend(new_part.chars().map(|c| (c, 0))); 13 | 14 | match diff.cmp(&0) { 15 | // If we are adding some characters, the last DIFF characters should be == 1 16 | Ordering::Greater => { 17 | transformations 18 | .iter_mut() 19 | .rev() 20 | .take(diff as usize) 21 | .for_each(|(_, cs)| *cs = 1); 22 | } 23 | // If we are removing some characters, the last one should include the diff 24 | Ordering::Less => { 25 | if let Some((_, cs)) = transformations.last_mut() { 26 | *cs += diff; 27 | } 28 | } 29 | _ => {} 30 | } 31 | } 32 | 33 | impl Normalizer for Precompiled { 34 | fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { 35 | let mut transformations = Vec::with_capacity(normalized.get().len()); 36 | // Future reader. From @Narsil. 37 | // Yes, this is weird, 38 | // Yes, this seems broken 39 | // No, I don't know why Google did this. 40 | // If you question this code, check this normalizer against 41 | // XNLI database (all languages) with Unigram model against 42 | // Mbart, XLMRoberta *AND* Marian. If you don't get 100% or 43 | // break a single test. 44 | // You don't pass. 
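        // Descriptive note on the loop below: the input is walked one grapheme at a
        // time. Graphemes shorter than 6 bytes are first looked up as a whole in the
        // precompiled table (`self.transform`); on a miss, each character is looked up
        // individually, and untouched characters are recorded with a size change of 0.
        // The collected `(char, isize)` pairs are applied to the `NormalizedString`
        // only if something actually changed.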
45 | let mut modified = false; 46 | normalized.get().graphemes(true).for_each(|grapheme| { 47 | if grapheme.len() < 6 { 48 | if let Some(norm) = self.transform(grapheme) { 49 | modified = true; 50 | replace(&mut transformations, grapheme, norm); 51 | return; 52 | } 53 | } 54 | for (char_index, c) in grapheme.char_indices() { 55 | let part = &grapheme[char_index..char_index + c.len_utf8()]; 56 | if let Some(norm) = self.transform(part) { 57 | modified = true; 58 | replace(&mut transformations, part, norm); 59 | } else { 60 | transformations.push((c, 0)); 61 | } 62 | } 63 | }); 64 | if modified { 65 | normalized.transform(transformations, 0); 66 | } 67 | Ok(()) 68 | } 69 | } 70 | 71 | #[cfg(test)] 72 | mod tests { 73 | use super::*; 74 | 75 | #[test] 76 | fn expansion_followed_by_removal() { 77 | // Simulate transformations from "™\x1eg" to "TMg" 78 | let mut transformations = vec![]; 79 | 80 | let mut n = NormalizedString::from("™\x1eg"); 81 | replace(&mut transformations, "™", "TM"); 82 | replace(&mut transformations, "\x1e", ""); 83 | transformations.push(('g', 0)); 84 | 85 | n.transform(transformations, 0); 86 | 87 | assert_eq!(n.get(), "TMg"); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /tokenizers/src/normalizers/prepend.rs: -------------------------------------------------------------------------------- 1 | use crate::tokenizer::{NormalizedString, Normalizer, Result}; 2 | use serde::{Deserialize, Serialize}; 3 | 4 | #[derive(Clone, Debug, Deserialize, Serialize)] 5 | #[serde(tag = "type")] 6 | pub struct Prepend { 7 | pub prepend: String, 8 | } 9 | 10 | impl Prepend { 11 | pub fn new(prepend: String) -> Self { 12 | Self { prepend } 13 | } 14 | } 15 | 16 | impl Normalizer for Prepend { 17 | /// Strip the normalized string inplace 18 | fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { 19 | if !normalized.is_empty() { 20 | normalized.prepend(&self.prepend); 21 | } 22 | Ok(()) 23 | } 24 | } 25 | 26 | #[cfg(test)] 27 | mod tests { 28 | use super::*; 29 | 30 | #[test] 31 | fn test_prepend() { 32 | let original = "Hello"; 33 | let normalized = "▁Hello"; 34 | assert_ne!(original, normalized); 35 | let mut n = NormalizedString::from(original); 36 | let prepend = Prepend::new("▁".to_string()); 37 | prepend.normalize(&mut n).unwrap(); 38 | assert_eq!(&n.get(), &normalized); 39 | assert_eq!( 40 | n, 41 | NormalizedString::new( 42 | original.to_string(), 43 | normalized.to_string(), 44 | vec![ 45 | (0, 1), 46 | (0, 1), 47 | (0, 1), 48 | (0, 1), 49 | (1, 2), 50 | (2, 3), 51 | (3, 4), 52 | (4, 5) 53 | ], 54 | 0 55 | ) 56 | ); 57 | assert_eq!( 58 | n.alignments_original(), 59 | vec![(0, 4), (4, 5), (5, 6), (6, 7), (7, 8)] 60 | ); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /tokenizers/src/normalizers/unicode.rs: -------------------------------------------------------------------------------- 1 | use crate::tokenizer::{NormalizedString, Normalizer, Result}; 2 | use crate::utils::macro_rules_attribute; 3 | 4 | #[derive(Default, Copy, Clone, Debug)] 5 | #[macro_rules_attribute(impl_serde_type!)] 6 | pub struct NFD; 7 | impl Normalizer for NFD { 8 | fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { 9 | normalized.nfd(); 10 | Ok(()) 11 | } 12 | } 13 | 14 | #[derive(Default, Copy, Clone, Debug)] 15 | #[macro_rules_attribute(impl_serde_type!)] 16 | pub struct NFKD; 17 | impl Normalizer for NFKD { 18 | fn normalize(&self, normalized: &mut NormalizedString) 
-> Result<()> { 19 | normalized.nfkd(); 20 | Ok(()) 21 | } 22 | } 23 | 24 | #[derive(Default, Copy, Clone, Debug)] 25 | #[macro_rules_attribute(impl_serde_type!)] 26 | pub struct NFC; 27 | impl Normalizer for NFC { 28 | fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { 29 | normalized.nfc(); 30 | Ok(()) 31 | } 32 | } 33 | 34 | #[derive(Default, Copy, Clone, Debug)] 35 | #[macro_rules_attribute(impl_serde_type!)] 36 | pub struct NFKC; 37 | impl Normalizer for NFKC { 38 | fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { 39 | normalized.nfkc(); 40 | Ok(()) 41 | } 42 | } 43 | 44 | fn do_nmt(normalized: &mut NormalizedString) { 45 | // Ascii Control characters 46 | normalized 47 | .filter(|c| { 48 | !matches!( 49 | c as u32, 50 | 0x0001..=0x0008 | 51 | 0x000B | 52 | 0x000E..=0x001F | 53 | 0x007F | 54 | 0x008F | 55 | 0x009F 56 | ) 57 | }) 58 | // Other code points considered as whitespace. 59 | .map(|c| match c as u32 { 60 | 0x0009 => ' ', 61 | 0x000A => ' ', 62 | 0x000C => ' ', 63 | 0x000D => ' ', 64 | 0x1680 => ' ', 65 | 0x200B..=0x200F => ' ', 66 | 0x2028 => ' ', 67 | 0x2029 => ' ', 68 | 0x2581 => ' ', 69 | 0xFEFF => ' ', 70 | 0xFFFD => ' ', 71 | _ => c, 72 | }); 73 | } 74 | 75 | #[derive(Default, Copy, Clone, Debug)] 76 | #[macro_rules_attribute(impl_serde_type!)] 77 | pub struct Nmt; 78 | impl Normalizer for Nmt { 79 | fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { 80 | do_nmt(normalized); 81 | Ok(()) 82 | } 83 | } 84 | 85 | #[cfg(test)] 86 | mod tests { 87 | use super::*; 88 | 89 | #[test] 90 | fn test_nfkc() { 91 | let original = "\u{fb01}".to_string(); 92 | let normalized = "fi".to_string(); 93 | let mut n = NormalizedString::from(original.clone()); 94 | NFKC.normalize(&mut n).unwrap(); 95 | 96 | assert_eq!( 97 | n, 98 | NormalizedString::new(original, normalized, vec![(0, 3), (0, 3)], 0) 99 | ); 100 | 101 | assert_eq!(n.alignments_original(), vec![(0, 2), (0, 2), (0, 2)]); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /tokenizers/src/normalizers/utils.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | use crate::normalizers::NormalizerWrapper; 4 | use crate::tokenizer::{NormalizedString, Normalizer, Result}; 5 | use crate::utils::macro_rules_attribute; 6 | 7 | #[derive(Clone, Deserialize, Debug, Serialize)] 8 | #[serde(tag = "type")] 9 | /// Allows concatenating multiple other Normalizer as a Sequence. 10 | /// All the normalizers run in sequence in the given order against the same NormalizedString. 
11 | pub struct Sequence { 12 | normalizers: Vec<NormalizerWrapper>, 13 | } 14 | 15 | impl Sequence { 16 | pub fn new(normalizers: Vec<NormalizerWrapper>) -> Self { 17 | Self { normalizers } 18 | } 19 | } 20 | 21 | impl AsRef<[NormalizerWrapper]> for Sequence { 22 | fn as_ref(&self) -> &[NormalizerWrapper] { 23 | &self.normalizers 24 | } 25 | } 26 | 27 | impl AsMut<[NormalizerWrapper]> for Sequence { 28 | fn as_mut(&mut self) -> &mut [NormalizerWrapper] { 29 | &mut self.normalizers 30 | } 31 | } 32 | 33 | impl IntoIterator for Sequence { 34 | type Item = NormalizerWrapper; 35 | type IntoIter = std::vec::IntoIter<Self::Item>; 36 | 37 | fn into_iter(self) -> Self::IntoIter { 38 | self.normalizers.into_iter() 39 | } 40 | } 41 | 42 | impl Normalizer for Sequence { 43 | fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { 44 | for normalizer in &self.normalizers { 45 | normalizer.normalize(normalized)?; 46 | } 47 | Ok(()) 48 | } 49 | } 50 | 51 | /// Lowercases the input 52 | #[derive(Copy, Clone, Debug)] 53 | #[macro_rules_attribute(impl_serde_type!)] 54 | pub struct Lowercase; 55 | impl Normalizer for Lowercase { 56 | fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { 57 | normalized.lowercase(); 58 | Ok(()) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /tokenizers/src/pre_tokenizers/bert.rs: -------------------------------------------------------------------------------- 1 | use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; 2 | use crate::utils::macro_rules_attribute; 3 | use unicode_categories::UnicodeCategories; 4 | 5 | fn is_bert_punc(x: char) -> bool { 6 | char::is_ascii_punctuation(&x) || x.is_punctuation() 7 | } 8 | 9 | #[derive(Copy, Clone, Debug, PartialEq, Eq)] 10 | #[macro_rules_attribute(impl_serde_type!)] 11 | pub struct BertPreTokenizer; 12 | 13 | impl PreTokenizer for BertPreTokenizer { 14 | fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> { 15 | pretokenized.split(|_, s| s.split(char::is_whitespace, SplitDelimiterBehavior::Removed))?; 16 | pretokenized.split(|_, s| s.split(is_bert_punc, SplitDelimiterBehavior::Isolated)) 17 | } 18 | } 19 | 20 | #[cfg(test)] 21 | mod tests { 22 | use super::*; 23 | use crate::{NormalizedString, OffsetReferential, OffsetType}; 24 | 25 | #[test] 26 | fn basic() { 27 | let pretok = BertPreTokenizer; 28 | let mut pretokenized: PreTokenizedString = "Hey friend! 
How are you?!?".into(); 29 | pretok.pre_tokenize(&mut pretokenized).unwrap(); 30 | assert_eq!( 31 | pretokenized 32 | .get_splits(OffsetReferential::Original, OffsetType::Byte) 33 | .into_iter() 34 | .map(|(s, o, _)| (s, o)) 35 | .collect::<Vec<_>>(), 36 | vec![ 37 | ("Hey", (0, 3)), 38 | ("friend", (4, 10)), 39 | ("!", (10, 11)), 40 | ("How", (16, 19)), 41 | ("are", (20, 23)), 42 | ("you", (24, 27)), 43 | ("?", (27, 28)), 44 | ("!", (28, 29)), 45 | ("?", (29, 30)), 46 | ] 47 | ); 48 | } 49 | 50 | #[test] 51 | fn chinese_chars() { 52 | let mut n = NormalizedString::from("野口里佳 Noguchi Rika"); 53 | n.transform( 54 | n.get().to_owned().chars().flat_map(|c| { 55 | if (c as usize) > 0x4E00 { 56 | vec![(' ', 0), (c, 1), (' ', 1)] 57 | } else { 58 | vec![(c, 0)] 59 | } 60 | }), 61 | 0, 62 | ); 63 | let mut pretokenized = n.into(); 64 | let pretok = BertPreTokenizer; 65 | pretok.pre_tokenize(&mut pretokenized).unwrap(); 66 | assert_eq!( 67 | pretokenized 68 | .get_splits(OffsetReferential::Original, OffsetType::Byte) 69 | .into_iter() 70 | .map(|(s, o, _)| (s, o)) 71 | .collect::<Vec<_>>(), 72 | vec![ 73 | ("野", (0, 3)), 74 | ("口", (3, 6)), 75 | ("里", (6, 9)), 76 | ("佳", (9, 12)), 77 | ("Noguchi", (13, 20)), 78 | ("Rika", (21, 25)) 79 | ] 80 | ); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /tokenizers/src/pre_tokenizers/delimiter.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; 4 | use crate::utils::macro_rules_attribute; 5 | 6 | #[derive(Copy, Clone, Debug, PartialEq, Eq)] 7 | #[non_exhaustive] 8 | #[macro_rules_attribute(impl_serde_type!)] 9 | pub struct CharDelimiterSplit { 10 | pub delimiter: char, 11 | } 12 | 13 | impl CharDelimiterSplit { 14 | pub fn new(delimiter: char) -> Self { 15 | Self { delimiter } 16 | } 17 | } 18 | 19 | impl PreTokenizer for CharDelimiterSplit { 20 | fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> { 21 | // TODO: Maybe add the option to specify the behavior 22 | pretokenized.split(|_, normalized| { 23 | normalized.split(self.delimiter, SplitDelimiterBehavior::Removed) 24 | }) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /tokenizers/src/pre_tokenizers/punctuation.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; 4 | use crate::utils::macro_rules_attribute; 5 | use unicode_categories::UnicodeCategories; 6 | 7 | fn is_punc(x: char) -> bool { 8 | char::is_ascii_punctuation(&x) || x.is_punctuation() 9 | } 10 | 11 | #[derive(Copy, Clone, Debug, PartialEq, Eq)] 12 | #[macro_rules_attribute(impl_serde_type!)] 13 | pub struct Punctuation { 14 | #[serde(default = "default_split")] 15 | pub behavior: SplitDelimiterBehavior, 16 | } 17 | 18 | fn default_split() -> SplitDelimiterBehavior { 19 | SplitDelimiterBehavior::Isolated 20 | } 21 | 22 | impl Punctuation { 23 | pub fn new(behavior: SplitDelimiterBehavior) -> Self { 24 | Self { behavior } 25 | } 26 | } 27 | 28 | impl Default for Punctuation { 29 | fn default() -> Self { 30 | Self::new(SplitDelimiterBehavior::Isolated) 31 | } 32 | } 33 | 34 | impl PreTokenizer for Punctuation { 35 | fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> 
Result<()> { 36 | pretokenized.split(|_, s| s.split(is_punc, self.behavior)) 37 | } 38 | } 39 | 40 | #[cfg(test)] 41 | mod tests { 42 | use super::*; 43 | use crate::{OffsetReferential, OffsetType}; 44 | 45 | #[test] 46 | fn punctuation_basic() { 47 | let pretok = Punctuation::default(); 48 | let mut pretokenized: PreTokenizedString = "Hey friend! How are you?!?".into(); 49 | pretok.pre_tokenize(&mut pretokenized).unwrap(); 50 | assert_eq!( 51 | pretokenized 52 | .get_splits(OffsetReferential::Original, OffsetType::Byte) 53 | .into_iter() 54 | .map(|(s, o, _)| (s, o)) 55 | .collect::<Vec<_>>(), 56 | vec![ 57 | ("Hey friend", (0, 10)), 58 | ("!", (10, 11)), 59 | (" How are you", (11, 27)), 60 | ("?", (27, 28)), 61 | ("!", (28, 29)), 62 | ("?", (29, 30)), 63 | ] 64 | ); 65 | } 66 | 67 | #[test] 68 | fn deserialization() { 69 | let punctuation: Punctuation = serde_json::from_str(r#"{"type": "Punctuation"}"#).unwrap(); 70 | assert_eq!(punctuation, Punctuation::default()); 71 | assert_eq!( 72 | punctuation, 73 | Punctuation::new(SplitDelimiterBehavior::Isolated) 74 | ); 75 | } 76 | 77 | #[test] 78 | #[should_panic] 79 | fn deserialization_erroneous() { 80 | let _punctuation: Punctuation = 81 | serde_json::from_str(r#"{"type": "WhitespaceSplit"}"#).unwrap(); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /tokenizers/src/pre_tokenizers/sequence.rs: -------------------------------------------------------------------------------- 1 | use crate::pre_tokenizers::PreTokenizerWrapper; 2 | use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result}; 3 | use crate::utils::macro_rules_attribute; 4 | use serde::{Deserialize, Serialize}; 5 | 6 | #[derive(Clone, Debug, PartialEq)] 7 | #[macro_rules_attribute(impl_serde_type!)] 8 | pub struct Sequence { 9 | pretokenizers: Vec<PreTokenizerWrapper>, 10 | } 11 | 12 | impl Sequence { 13 | pub fn new(pretokenizers: Vec<PreTokenizerWrapper>) -> Self { 14 | Self { pretokenizers } 15 | } 16 | } 17 | 18 | impl AsRef<[PreTokenizerWrapper]> for Sequence { 19 | fn as_ref(&self) -> &[PreTokenizerWrapper] { 20 | &self.pretokenizers 21 | } 22 | } 23 | 24 | impl AsMut<[PreTokenizerWrapper]> for Sequence { 25 | fn as_mut(&mut self) -> &mut [PreTokenizerWrapper] { 26 | &mut self.pretokenizers 27 | } 28 | } 29 | 30 | impl IntoIterator for Sequence { 31 | type Item = PreTokenizerWrapper; 32 | type IntoIter = std::vec::IntoIter<Self::Item>; 33 | 34 | fn into_iter(self) -> Self::IntoIter { 35 | self.pretokenizers.into_iter() 36 | } 37 | } 38 | 39 | impl PreTokenizer for Sequence { 40 | fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> { 41 | for pretokenizer in &self.pretokenizers { 42 | pretokenizer.pre_tokenize(pretokenized)?; 43 | } 44 | Ok(()) 45 | } 46 | } 47 | 48 | #[cfg(test)] 49 | mod tests { 50 | use super::*; 51 | use crate::pre_tokenizers::{punctuation::Punctuation, whitespace::WhitespaceSplit}; 52 | use crate::{OffsetReferential, OffsetType}; 53 | 54 | #[test] 55 | fn sequence_basic() { 56 | let pretokenizers = vec![ 57 | PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit), 58 | PreTokenizerWrapper::Punctuation(Punctuation::default()), 59 | ]; 60 | let pretok = Sequence::new(pretokenizers); 61 | let mut pretokenized: PreTokenizedString = "Hey friend! 
How are you?!?".into(); 62 | pretok.pre_tokenize(&mut pretokenized).unwrap(); 63 | assert_eq!( 64 | pretokenized 65 | .get_splits(OffsetReferential::Original, OffsetType::Byte) 66 | .into_iter() 67 | .map(|(s, o, _)| (s, o)) 68 | .collect::<Vec<_>>(), 69 | vec![ 70 | ("Hey", (0, 3)), 71 | ("friend", (4, 10)), 72 | ("!", (10, 11)), 73 | ("How", (16, 19)), 74 | ("are", (20, 23)), 75 | ("you", (24, 27)), 76 | ("?", (27, 28)), 77 | ("!", (28, 29)), 78 | ("?", (29, 30)), 79 | ] 80 | ); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /tokenizers/src/pre_tokenizers/unicode_scripts/mod.rs: -------------------------------------------------------------------------------- 1 | mod pre_tokenizer; 2 | mod scripts; 3 | 4 | // Re-export the PreTokenizer 5 | pub use pre_tokenizer::UnicodeScripts; 6 | -------------------------------------------------------------------------------- /tokenizers/src/utils/fancy.rs: -------------------------------------------------------------------------------- 1 | use crate::tokenizer::pattern::Pattern; 2 | use crate::Offsets; 3 | use fancy_regex::Regex; 4 | use std::error::Error; 5 | 6 | #[derive(Debug)] 7 | pub struct SysRegex { 8 | regex: Regex, 9 | } 10 | 11 | impl SysRegex { 12 | pub fn find_iter<'r, 't>(&'r self, inside: &'t str) -> Matches<'r, 't> { 13 | Matches(self.regex.find_iter(inside)) 14 | } 15 | 16 | pub fn new(regex_str: &str) -> Result<Self, Box<dyn Error + Send + Sync + 'static>> { 17 | Ok(Self { 18 | regex: Regex::new(regex_str)?, 19 | }) 20 | } 21 | } 22 | 23 | pub struct Matches<'r, 't>(fancy_regex::Matches<'r, 't>); 24 | 25 | impl Iterator for Matches<'_, '_> { 26 | type Item = (usize, usize); 27 | 28 | fn next(&mut self) -> Option<Self::Item> { 29 | match self.0.next() { 30 | Some(Ok(mat)) => Some((mat.start(), mat.end())), 31 | // stop if an error is encountered 32 | None | Some(Err(_)) => None, 33 | } 34 | } 35 | } 36 | 37 | impl Pattern for &Regex { 38 | fn find_matches( 39 | &self, 40 | inside: &str, 41 | ) -> Result<Vec<(Offsets, bool)>, Box<dyn Error + Send + Sync + 'static>> { 42 | if inside.is_empty() { 43 | return Ok(vec![((0, 0), false)]); 44 | } 45 | 46 | let mut prev = 0; 47 | let mut splits = Vec::with_capacity(inside.len()); 48 | for match_ in self.find_iter(inside) { 49 | let match_ = match_?; 50 | let start = match_.start(); 51 | let end = match_.end(); 52 | if prev != start { 53 | splits.push(((prev, start), false)); 54 | } 55 | splits.push(((start, end), true)); 56 | prev = end; 57 | } 58 | if prev != inside.len() { 59 | splits.push(((prev, inside.len()), false)) 60 | } 61 | Ok(splits) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /tokenizers/src/utils/from_pretrained.rs: -------------------------------------------------------------------------------- 1 | use crate::Result; 2 | use hf_hub::{api::sync::ApiBuilder, Repo, RepoType}; 3 | use std::collections::HashMap; 4 | use std::path::PathBuf; 5 | 6 | /// Defines the additional parameters available for the `from_pretrained` function 7 | #[derive(Debug, Clone)] 8 | pub struct FromPretrainedParameters { 9 | pub revision: String, 10 | pub user_agent: HashMap<String, String>, 11 | pub token: Option<String>, 12 | } 13 | 14 | impl Default for FromPretrainedParameters { 15 | fn default() -> Self { 16 | Self { 17 | revision: "main".into(), 18 | user_agent: HashMap::new(), 19 | token: None, 20 | } 21 | } 22 | } 23 | 24 | /// Downloads and cache the identified tokenizer if it exists on 25 | 
/// the Hugging Face Hub, and returns a local path to the file 26 | pub fn from_pretrained<S: AsRef<str>>( 27 | identifier: S, 28 | params: Option<FromPretrainedParameters>, 29 | ) -> Result<PathBuf> { 30 | let identifier: String = identifier.as_ref().to_string(); 31 | 32 | let valid_chars = ['-', '_', '.', '/']; 33 | let is_valid_char = |x: char| x.is_alphanumeric() || valid_chars.contains(&x); 34 | 35 | let valid = identifier.chars().all(is_valid_char); 36 | let valid_chars_stringified = valid_chars 37 | .iter() 38 | .fold(vec![], |mut buf, x| { 39 | buf.push(format!("'{x}'")); 40 | buf 41 | }) 42 | .join(", "); // "'/', '-', '_', '.'" 43 | if !valid { 44 | return Err(format!( 45 | "Model \"{identifier}\" contains invalid characters, expected only alphanumeric or {valid_chars_stringified}" 46 | ) 47 | .into()); 48 | } 49 | let params = params.unwrap_or_default(); 50 | 51 | let revision = ¶ms.revision; 52 | let valid_revision = revision.chars().all(is_valid_char); 53 | if !valid_revision { 54 | return Err(format!( 55 | "Revision \"{revision}\" contains invalid characters, expected only alphanumeric or {valid_chars_stringified}" 56 | ) 57 | .into()); 58 | } 59 | 60 | let mut builder = ApiBuilder::from_env(); 61 | if let Some(token) = params.token { 62 | builder = builder.with_token(Some(token)); 63 | } 64 | let api = builder.build()?; 65 | let repo = Repo::with_revision(identifier, RepoType::Model, params.revision); 66 | let api = api.repo(repo); 67 | Ok(api.get("tokenizer.json")?) 68 | } 69 | -------------------------------------------------------------------------------- /tokenizers/src/utils/iter.rs: -------------------------------------------------------------------------------- 1 | //! This comes from the Rust libcore and is duplicated here because it is not exported 2 | //! (cf <https://github.com/rust-lang/rust/blob/25091ed9b7739e12466fb2490baa1e8a2815121c/src/libcore/iter/adapters/mod.rs#L2664>) 3 | //! We are now using the version from <https://stackoverflow.com/questions/44544323/how-to-unzip-a-sequence-of-resulta-b-e-to-a-veca-vecb-and-stop-on-f> 4 | //! because the one from the libcore seems to cause overflowing stacks in some cases 5 | //! It also contains a lines_with_ending that copies std::io::BufRead but keeps line endings. 6 | use std::io::BufRead; 7 | 8 | pub struct ResultShunt<I, E> { 9 | iter: I, 10 | error: Option<E>, 11 | } 12 | 13 | impl<I, T, E> ResultShunt<I, E> 14 | where 15 | I: Iterator<Item = Result<T, E>>, 16 | { 17 | /// Process the given iterator as if it yielded a `T` instead of a 18 | /// `Result<T, _>`. Any errors will stop the inner iterator and 19 | /// the overall result will be an error. 20 | pub fn process<F, U>(iter: I, mut f: F) -> Result<U, E> 21 | where 22 | F: FnMut(&mut Self) -> U, 23 | { 24 | let mut shunt = ResultShunt::new(iter); 25 | let value = f(shunt.by_ref()); 26 | shunt.reconstruct(value) 27 | } 28 | 29 | fn new(iter: I) -> Self { 30 | ResultShunt { iter, error: None } 31 | } 32 | 33 | /// Consume the adapter and rebuild a `Result` value. This should 34 | /// *always* be called, otherwise any potential error would be 35 | /// lost. 
36 | fn reconstruct<U>(self, val: U) -> Result<U, E> { 37 | match self.error { 38 | None => Ok(val), 39 | Some(e) => Err(e), 40 | } 41 | } 42 | } 43 | 44 | impl<I, T, E> Iterator for ResultShunt<I, E> 45 | where 46 | I: Iterator<Item = Result<T, E>>, 47 | { 48 | type Item = T; 49 | 50 | fn next(&mut self) -> Option<Self::Item> { 51 | match self.iter.next() { 52 | Some(Ok(v)) => Some(v), 53 | Some(Err(e)) => { 54 | self.error = Some(e); 55 | None 56 | } 57 | None => None, 58 | } 59 | } 60 | } 61 | 62 | /// Copied from std::io::BufRead but keep newline characters. 63 | #[derive(Debug)] 64 | pub struct Lines<B> { 65 | buf: B, 66 | } 67 | 68 | pub trait LinesWithEnding<B> { 69 | fn lines_with_ending(self) -> Lines<B>; 70 | } 71 | 72 | impl<B> LinesWithEnding<B> for B 73 | where 74 | B: BufRead, 75 | { 76 | fn lines_with_ending(self) -> Lines<B> { 77 | Lines::<B> { buf: self } 78 | } 79 | } 80 | impl<B: BufRead> Iterator for Lines<B> { 81 | type Item = std::io::Result<String>; 82 | 83 | fn next(&mut self) -> Option<Self::Item> { 84 | let mut buf = String::new(); 85 | match self.buf.read_line(&mut buf) { 86 | Ok(0) => None, 87 | Ok(_n) => { 88 | // if buf.ends_with('\n') { 89 | // buf.pop(); 90 | // if buf.ends_with('\r') { 91 | // buf.pop(); 92 | // } 93 | // } 94 | Some(Ok(buf)) 95 | } 96 | Err(e) => Some(Err(e)), 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /tokenizers/src/utils/onig.rs: -------------------------------------------------------------------------------- 1 | use crate::tokenizer::pattern::Pattern; 2 | use crate::{Offsets, Result}; 3 | use onig::Regex; 4 | use std::error::Error; 5 | 6 | #[derive(Debug)] 7 | pub struct SysRegex { 8 | regex: Regex, 9 | } 10 | 11 | impl SysRegex { 12 | pub fn find_iter<'r, 't>(&'r self, inside: &'t str) -> onig::FindMatches<'r, 't> { 13 | self.regex.find_iter(inside) 14 | } 15 | 16 | pub fn new( 17 | regex_str: &str, 18 | ) -> std::result::Result<Self, Box<dyn Error + Send + Sync + 'static>> { 19 | Ok(Self { 20 | regex: Regex::new(regex_str)?, 21 | }) 22 | } 23 | } 24 | 25 | impl Pattern for &Regex { 26 | fn find_matches(&self, inside: &str) -> Result<Vec<(Offsets, bool)>> { 27 | if inside.is_empty() { 28 | return Ok(vec![((0, 0), false)]); 29 | } 30 | 31 | let mut prev = 0; 32 | let mut splits = Vec::with_capacity(inside.len()); 33 | for (start, end) in self.find_iter(inside) { 34 | if prev != start { 35 | splits.push(((prev, start), false)); 36 | } 37 | splits.push(((start, end), true)); 38 | prev = end; 39 | } 40 | if prev != inside.len() { 41 | splits.push(((prev, inside.len()), false)) 42 | } 43 | Ok(splits) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /tokenizers/src/utils/progress.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "progressbar")] 2 | pub(crate) use indicatif::{ProgressBar, ProgressStyle}; 3 | 4 | #[cfg(not(feature = "progressbar"))] 5 | mod progressbar { 6 | use std::borrow::Cow; 7 | pub struct ProgressBar; 8 | impl ProgressBar { 9 | pub fn new(_length: u64) -> Self { 10 | Self {} 11 | } 12 | 13 | pub fn set_length(&self, _length: u64) {} 14 | pub fn set_message(&self, _message: impl Into<Cow<'static, str>>) {} 15 | pub fn finish(&self) {} 16 | pub fn reset(&self) {} 17 | pub fn inc(&self, _inc: u64) {} 18 | pub fn set_style(&self, _style: ProgressStyle) {} 19 | } 20 | 21 | pub struct ProgressStyle {} 22 | impl ProgressStyle { 23 | pub fn default_bar() -> 
Self { 24 | Self {} 25 | } 26 | pub fn template(self, _template: &str) -> Result<Self, String> { 27 | Ok(self) 28 | } 29 | } 30 | } 31 | #[cfg(not(feature = "progressbar"))] 32 | pub(crate) use progressbar::{ProgressBar, ProgressStyle}; 33 | -------------------------------------------------------------------------------- /tokenizers/tests/common/mod.rs: -------------------------------------------------------------------------------- 1 | use tokenizers::decoders::wordpiece::WordPiece as WordPieceDecoder; 2 | use tokenizers::models::bpe::BPE; 3 | use tokenizers::models::wordpiece::WordPiece; 4 | use tokenizers::normalizers::bert::BertNormalizer; 5 | use tokenizers::pre_tokenizers::bert::BertPreTokenizer; 6 | use tokenizers::pre_tokenizers::byte_level::ByteLevel; 7 | use tokenizers::processors::bert::BertProcessing; 8 | use tokenizers::tokenizer::{Model, Tokenizer}; 9 | 10 | #[allow(dead_code)] 11 | pub fn get_empty() -> Tokenizer { 12 | Tokenizer::new(BPE::default()) 13 | } 14 | 15 | #[allow(dead_code)] 16 | pub fn get_byte_level_bpe() -> BPE { 17 | BPE::from_file("data/gpt2-vocab.json", "data/gpt2-merges.txt") 18 | .build() 19 | .expect("Files not found, run `make test` to download these files") 20 | } 21 | 22 | #[allow(dead_code)] 23 | pub fn get_byte_level(add_prefix_space: bool, trim_offsets: bool) -> Tokenizer { 24 | let mut tokenizer = Tokenizer::new(get_byte_level_bpe()); 25 | tokenizer 26 | .with_pre_tokenizer(Some( 27 | ByteLevel::default().add_prefix_space(add_prefix_space), 28 | )) 29 | .with_decoder(Some(ByteLevel::default())) 30 | .with_post_processor(Some(ByteLevel::default().trim_offsets(trim_offsets))); 31 | 32 | tokenizer 33 | } 34 | 35 | #[allow(dead_code)] 36 | pub fn get_bert_wordpiece() -> WordPiece { 37 | WordPiece::from_file("data/bert-base-uncased-vocab.txt") 38 | .build() 39 | .expect("Files not found, run `make test` to download these files") 40 | } 41 | 42 | #[allow(dead_code)] 43 | pub fn get_bert() -> Tokenizer { 44 | let mut tokenizer = Tokenizer::new(get_bert_wordpiece()); 45 | let sep = tokenizer.get_model().token_to_id("[SEP]").unwrap(); 46 | let cls = tokenizer.get_model().token_to_id("[CLS]").unwrap(); 47 | tokenizer 48 | .with_normalizer(Some(BertNormalizer::default())) 49 | .with_pre_tokenizer(Some(BertPreTokenizer)) 50 | .with_decoder(Some(WordPieceDecoder::default())) 51 | .with_post_processor(Some(BertProcessing::new( 52 | (String::from("[SEP]"), sep), 53 | (String::from("[CLS]"), cls), 54 | ))); 55 | 56 | tokenizer 57 | } 58 | -------------------------------------------------------------------------------- /tokenizers/tests/from_pretrained.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "http")] 2 | use tokenizers::{FromPretrainedParameters, Result, Tokenizer}; 3 | 4 | #[test] 5 | fn test_from_pretrained() -> Result<()> { 6 | let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None)?; 7 | let encoding = tokenizer.encode("Hey there dear friend!", false)?; 8 | assert_eq!( 9 | encoding.get_tokens(), 10 | &["Hey", "there", "dear", "friend", "!"] 11 | ); 12 | Ok(()) 13 | } 14 | 15 | #[test] 16 | fn test_from_pretrained_revision() -> Result<()> { 17 | let tokenizer = Tokenizer::from_pretrained("anthony/tokenizers-test", None)?; 18 | let encoding = tokenizer.encode("Hey there dear friend!", false)?; 19 | assert_eq!( 20 | encoding.get_tokens(), 21 | &["hey", "there", "dear", "friend", "!"] 22 | ); 23 | 24 | let tokenizer = Tokenizer::from_pretrained( 25 | "anthony/tokenizers-test", 26 | 
Some(FromPretrainedParameters { 27 | revision: "gpt-2".to_string(), 28 | ..Default::default() 29 | }), 30 | )?; 31 | let encoding = tokenizer.encode("Hey there dear friend!", false)?; 32 | assert_eq!( 33 | encoding.get_tokens(), 34 | &["Hey", "Ġthere", "Ġdear", "Ġfriend", "!"] 35 | ); 36 | 37 | Ok(()) 38 | } 39 | 40 | #[test] 41 | fn test_from_pretrained_invalid_model() { 42 | let tokenizer = Tokenizer::from_pretrained("docs?", None); 43 | assert!(tokenizer.is_err()); 44 | } 45 | 46 | #[test] 47 | fn test_from_pretrained_invalid_revision() { 48 | let tokenizer = Tokenizer::from_pretrained( 49 | "bert-base-cased", 50 | Some(FromPretrainedParameters { 51 | revision: "gpt?".to_string(), 52 | ..Default::default() 53 | }), 54 | ); 55 | assert!(tokenizer.is_err()); 56 | } 57 | -------------------------------------------------------------------------------- /tokenizers/tests/training.rs: -------------------------------------------------------------------------------- 1 | use tokenizers::models::bpe::BPE; 2 | use tokenizers::pre_tokenizers::whitespace::Whitespace; 3 | use tokenizers::{DecoderWrapper, NormalizerWrapper, PostProcessorWrapper, PreTokenizerWrapper}; 4 | use tokenizers::{Model, Tokenizer, TokenizerBuilder}; 5 | 6 | #[test] 7 | fn bpe_values_after_training() { 8 | let mut tokenizer = TokenizerBuilder::< 9 | BPE, 10 | NormalizerWrapper, 11 | PreTokenizerWrapper, 12 | PostProcessorWrapper, 13 | DecoderWrapper, 14 | >::default() 15 | .with_model( 16 | BPE::builder() 17 | .unk_token("[UNK]".to_string()) 18 | .dropout(0.1) 19 | .build() 20 | .unwrap(), 21 | ) 22 | .build() 23 | .unwrap(); 24 | let mut trainer = tokenizer.get_model().get_trainer(); 25 | tokenizer 26 | .train_from_files(&mut trainer, vec!["./data/small.txt".to_string()]) 27 | .unwrap(); 28 | assert_eq!(tokenizer.get_model().dropout, Some(0.1)); 29 | assert_eq!(tokenizer.get_model().unk_token, Some("[UNK]".to_string())); 30 | } 31 | 32 | #[test] 33 | fn bpe_continuing_subword_prefix_error() { 34 | let mut tokenizer = TokenizerBuilder::< 35 | BPE, 36 | NormalizerWrapper, 37 | PreTokenizerWrapper, 38 | PostProcessorWrapper, 39 | DecoderWrapper, 40 | >::default() 41 | .with_model( 42 | BPE::builder() 43 | .unk_token("[UNK]".to_string()) 44 | .continuing_subword_prefix("##".to_string()) 45 | .build() 46 | .unwrap(), 47 | ) 48 | .with_pre_tokenizer(Some(PreTokenizerWrapper::Whitespace(Whitespace {}))) 49 | .build() 50 | .unwrap(); 51 | let mut trainer = tokenizer.get_model().get_trainer(); 52 | tokenizer 53 | .train_from_files(&mut trainer, vec!["./data/small.txt".to_string()]) 54 | .unwrap(); 55 | tokenizer.save("tokenizer.json", true).unwrap(); 56 | let tokenizer = Tokenizer::from_file("tokenizer.json").unwrap(); 57 | assert_eq!(tokenizer.get_vocab_size(false), 1526); 58 | 59 | std::fs::remove_file("tokenizer.json").unwrap(); 60 | } 61 | --------------------------------------------------------------------------------
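
To close, a minimal usage sketch (not a file from the repository): assuming a `tokenizer.json` serialized by a run such as the training test above, it loads the tokenizer and encodes a sentence using only calls that already appear in this dump (`Tokenizer::from_file`, `encode`, `get_tokens`, `get_ids`).

```rust
// Hypothetical stand-alone sketch; `tokenizer.json` is an assumed artifact
// (e.g. produced by the training test above), not a file shipped with the crate.
use tokenizers::Tokenizer;

fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    // Deserialize a previously saved tokenizer from disk.
    let tokenizer = Tokenizer::from_file("tokenizer.json")?;

    // Encode a sentence without adding special tokens, mirroring the tests above.
    let encoding = tokenizer.encode("Hey there dear friend!", false)?;

    println!("tokens: {:?}", encoding.get_tokens());
    println!("ids:    {:?}", encoding.get_ids());
    Ok(())
}
```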