├── .github
    ├── conda
    │   ├── bld.bat
    │   ├── build.sh
    │   └── meta.yaml
    ├── stale.yml
    └── workflows
    │   ├── CI.yml
    │   ├── build_documentation.yml
    │   ├── build_pr_documentation.yml
    │   ├── delete_doc_comment.yml
    │   ├── delete_doc_comment_trigger.yml
    │   ├── docs-check.yml
    │   ├── node-release.yml
    │   ├── node.yml
    │   ├── python-release.yml
    │   ├── python.yml
    │   ├── rust-release.yml
    │   ├── rust.yml
    │   ├── stale.yml
    │   ├── trufflehog.yml
    │   └── upload_pr_documentation.yml
├── .gitignore
├── CITATION.cff
├── LICENSE
├── README.md
├── RELEASE.md
├── bindings
    ├── node
    │   ├── .cargo
    │   │   └── config.toml
    │   ├── .editorconfig
    │   ├── .eslintrc.yml
    │   ├── .gitattributes
    │   ├── .gitignore
    │   ├── .prettierignore
    │   ├── .taplo.toml
    │   ├── .yarn
    │   │   └── releases
    │   │   │   └── yarn-3.5.1.cjs
    │   ├── .yarnrc.yml
    │   ├── Cargo.toml
    │   ├── LICENSE
    │   ├── Makefile
    │   ├── README.md
    │   ├── build.rs
    │   ├── examples
    │   │   └── documentation
    │   │   │   ├── pipeline.test.ts
    │   │   │   └── quicktour.test.ts
    │   ├── index.d.ts
    │   ├── index.js
    │   ├── jest.config.js
    │   ├── lib
    │   │   └── bindings
    │   │   │   ├── __mocks__
    │   │   │       ├── merges.txt
    │   │   │       ├── vocab.json
    │   │   │       └── vocab.txt
    │   │   │   ├── decoders.test.ts
    │   │   │   ├── encoding.test.ts
    │   │   │   ├── models.test.ts
    │   │   │   ├── normalizers.test.ts
    │   │   │   ├── post-processors.test.ts
    │   │   │   ├── pre-tokenizers.test.ts
    │   │   │   ├── tokenizer.test.ts
    │   │   │   └── utils.test.ts
    │   ├── npm
    │   │   ├── android-arm-eabi
    │   │   │   ├── README.md
    │   │   │   └── package.json
    │   │   ├── android-arm64
    │   │   │   ├── README.md
    │   │   │   └── package.json
    │   │   ├── darwin-arm64
    │   │   │   ├── README.md
    │   │   │   └── package.json
    │   │   ├── darwin-x64
    │   │   │   ├── README.md
    │   │   │   └── package.json
    │   │   ├── freebsd-x64
    │   │   │   ├── README.md
    │   │   │   └── package.json
    │   │   ├── linux-arm-gnueabihf
    │   │   │   ├── README.md
    │   │   │   └── package.json
    │   │   ├── linux-arm64-gnu
    │   │   │   ├── README.md
    │   │   │   └── package.json
    │   │   ├── linux-arm64-musl
    │   │   │   ├── README.md
    │   │   │   └── package.json
    │   │   ├── linux-x64-gnu
    │   │   │   ├── README.md
    │   │   │   └── package.json
    │   │   ├── linux-x64-musl
    │   │   │   ├── README.md
    │   │   │   └── package.json
    │   │   ├── win32-arm64-msvc
    │   │   │   ├── README.md
    │   │   │   └── package.json
    │   │   ├── win32-ia32-msvc
    │   │   │   ├── README.md
    │   │   │   └── package.json
    │   │   └── win32-x64-msvc
    │   │   │   ├── README.md
    │   │   │   └── package.json
    │   ├── package.json
    │   ├── rustfmt.toml
    │   ├── src
    │   │   ├── arc_rwlock_serde.rs
    │   │   ├── decoders.rs
    │   │   ├── encoding.rs
    │   │   ├── lib.rs
    │   │   ├── models.rs
    │   │   ├── normalizers.rs
    │   │   ├── pre_tokenizers.rs
    │   │   ├── processors.rs
    │   │   ├── tasks
    │   │   │   ├── mod.rs
    │   │   │   ├── models.rs
    │   │   │   └── tokenizer.rs
    │   │   ├── tokenizer.rs
    │   │   ├── trainers.rs
    │   │   └── utils.rs
    │   ├── tsconfig.json
    │   ├── types.ts
    │   └── yarn.lock
    └── python
    │   ├── .cargo
    │       └── config.toml
    │   ├── .gitignore
    │   ├── CHANGELOG.md
    │   ├── Cargo.toml
    │   ├── MANIFEST.in
    │   ├── Makefile
    │   ├── README.md
    │   ├── benches
    │       └── test_tiktoken.py
    │   ├── conftest.py
    │   ├── examples
    │       ├── custom_components.py
    │       ├── example.py
    │       ├── train_bert_wordpiece.py
    │       ├── train_bytelevel_bpe.py
    │       ├── train_with_datasets.py
    │       └── using_the_visualizer.ipynb
    │   ├── py_src
    │       └── tokenizers
    │       │   ├── __init__.py
    │       │   ├── __init__.pyi
    │       │   ├── decoders
    │       │       ├── __init__.py
    │       │       └── __init__.pyi
    │       │   ├── implementations
    │       │       ├── __init__.py
    │       │       ├── base_tokenizer.py
    │       │       ├── bert_wordpiece.py
    │       │       ├── byte_level_bpe.py
    │       │       ├── char_level_bpe.py
    │       │       ├── sentencepiece_bpe.py
    │       │       └── sentencepiece_unigram.py
    │       │   ├── models
    │       │       ├── __init__.py
    │       │       └── __init__.pyi
    │       │   ├── normalizers
    │       │       ├── __init__.py
    │       │       └── __init__.pyi
    │       │   ├── pre_tokenizers
    │       │       ├── __init__.py
    │       │       └── __init__.pyi
    │       │   ├── processors
    │       │       ├── __init__.py
    │       │       └── __init__.pyi
    │       │   ├── tools
    │       │       ├── __init__.py
    │       │       ├── visualizer-styles.css
    │       │       └── visualizer.py
    │       │   └── trainers
    │       │       ├── __init__.py
    │       │       └── __init__.pyi
    │   ├── pyproject.toml
    │   ├── rust-toolchain
    │   ├── scripts
    │       ├── convert.py
    │       ├── sentencepiece_extractor.py
    │       └── spm_parity_check.py
    │   ├── setup.cfg
    │   ├── src
    │       ├── decoders.rs
    │       ├── encoding.rs
    │       ├── error.rs
    │       ├── lib.rs
    │       ├── models.rs
    │       ├── normalizers.rs
    │       ├── pre_tokenizers.rs
    │       ├── processors.rs
    │       ├── token.rs
    │       ├── tokenizer.rs
    │       ├── trainers.rs
    │       └── utils
    │       │   ├── iterators.rs
    │       │   ├── mod.rs
    │       │   ├── normalization.rs
    │       │   ├── pretokenization.rs
    │       │   ├── regex.rs
    │       │   └── serde_pyo3.rs
    │   ├── stub.py
    │   ├── test.txt
    │   └── tests
    │       ├── __init__.py
    │       ├── bindings
    │           ├── __init__.py
    │           ├── test_decoders.py
    │           ├── test_encoding.py
    │           ├── test_models.py
    │           ├── test_normalizers.py
    │           ├── test_pre_tokenizers.py
    │           ├── test_processors.py
    │           ├── test_tokenizer.py
    │           └── test_trainers.py
    │       ├── documentation
    │           ├── __init__.py
    │           ├── test_pipeline.py
    │           ├── test_quicktour.py
    │           └── test_tutorial_train_from_iterators.py
    │       ├── implementations
    │           ├── __init__.py
    │           ├── test_base_tokenizer.py
    │           ├── test_bert_wordpiece.py
    │           ├── test_byte_level_bpe.py
    │           ├── test_char_bpe.py
    │           └── test_sentencepiece.py
    │       ├── test_serialization.py
    │       └── utils.py
├── docs
    ├── Makefile
    ├── README.md
    ├── source-doc-builder
    │   ├── _toctree.yml
    │   ├── api
    │   │   ├── added-tokens.mdx
    │   │   ├── decoders.mdx
    │   │   ├── encode-inputs.mdx
    │   │   ├── encoding.mdx
    │   │   ├── input-sequences.mdx
    │   │   ├── models.mdx
    │   │   ├── normalizers.mdx
    │   │   ├── post-processors.mdx
    │   │   ├── pre-tokenizers.mdx
    │   │   ├── tokenizer.mdx
    │   │   ├── trainers.mdx
    │   │   └── visualizer.mdx
    │   ├── components.mdx
    │   ├── index.mdx
    │   ├── installation.mdx
    │   ├── pipeline.mdx
    │   ├── quicktour.mdx
    │   └── training_from_memory.mdx
    └── source
    │   ├── _ext
    │       ├── entities.py
    │       ├── rust_doc.py
    │       └── toctree_tags.py
    │   ├── _static
    │       ├── css
    │       │   ├── Calibre-Light.ttf
    │       │   ├── Calibre-Medium.otf
    │       │   ├── Calibre-Regular.otf
    │       │   ├── Calibre-Thin.otf
    │       │   ├── code-snippets.css
    │       │   └── huggingface.css
    │       └── js
    │       │   └── custom.js
    │   ├── api
    │       ├── node.inc
    │       ├── python.inc
    │       ├── reference.rst
    │       └── rust.inc
    │   ├── components.rst
    │   ├── conf.py
    │   ├── entities.inc
    │   ├── index.rst
    │   ├── installation
    │       ├── main.rst
    │       ├── node.inc
    │       ├── python.inc
    │       └── rust.inc
    │   ├── pipeline.rst
    │   ├── quicktour.rst
    │   └── tutorials
    │       └── python
    │           └── training_from_memory.rst
└── tokenizers
    ├── CHANGELOG.md
    ├── Cargo.toml
    ├── LICENSE
    ├── Makefile
    ├── README.md
    ├── README.tpl
    ├── benches
        ├── bert_benchmark.rs
        ├── bpe_benchmark.rs
        ├── common
        │   └── mod.rs
        ├── layout_benchmark.rs
        ├── llama3_benchmark.rs
        └── unigram_benchmark.rs
    ├── examples
        ├── encode_batch.rs
        ├── serialization.rs
        └── unstable_wasm
        │   ├── .gitignore
        │   ├── Cargo.toml
        │   ├── README.md
        │   ├── src
        │       ├── lib.rs
        │       └── utils.rs
        │   ├── tests
        │       └── web.rs
        │   └── www
        │       ├── .bin
        │           └── create-wasm-app.js
        │       ├── .gitignore
        │       ├── .travis.yml
        │       ├── LICENSE-APACHE
        │       ├── LICENSE-MIT
        │       ├── README.md
        │       ├── bootstrap.js
        │       ├── index.html
        │       ├── index.js
        │       ├── package-lock.json
        │       ├── package.json
        │       └── webpack.config.js
    ├── rust-toolchain
    ├── src
        ├── decoders
        │   ├── bpe.rs
        │   ├── byte_fallback.rs
        │   ├── ctc.rs
        │   ├── fuse.rs
        │   ├── mod.rs
        │   ├── sequence.rs
        │   ├── strip.rs
        │   └── wordpiece.rs
        ├── lib.rs
        ├── models
        │   ├── bpe
        │   │   ├── mod.rs
        │   │   ├── model.rs
        │   │   ├── serialization.rs
        │   │   ├── trainer.rs
        │   │   └── word.rs
        │   ├── mod.rs
        │   ├── unigram
        │   │   ├── lattice.rs
        │   │   ├── mod.rs
        │   │   ├── model.rs
        │   │   ├── serialization.rs
        │   │   ├── trainer.rs
        │   │   └── trie.rs
        │   ├── wordlevel
        │   │   ├── mod.rs
        │   │   ├── serialization.rs
        │   │   └── trainer.rs
        │   └── wordpiece
        │   │   ├── mod.rs
        │   │   ├── serialization.rs
        │   │   └── trainer.rs
        ├── normalizers
        │   ├── bert.rs
        │   ├── byte_level.rs
        │   ├── mod.rs
        │   ├── precompiled.rs
        │   ├── prepend.rs
        │   ├── replace.rs
        │   ├── strip.rs
        │   ├── unicode.rs
        │   └── utils.rs
        ├── pre_tokenizers
        │   ├── bert.rs
        │   ├── byte_level.rs
        │   ├── delimiter.rs
        │   ├── digits.rs
        │   ├── fixed_length.rs
        │   ├── metaspace.rs
        │   ├── mod.rs
        │   ├── punctuation.rs
        │   ├── sequence.rs
        │   ├── split.rs
        │   ├── unicode_scripts
        │   │   ├── mod.rs
        │   │   ├── pre_tokenizer.rs
        │   │   └── scripts.rs
        │   └── whitespace.rs
        ├── processors
        │   ├── bert.rs
        │   ├── mod.rs
        │   ├── roberta.rs
        │   ├── sequence.rs
        │   └── template.rs
        ├── tokenizer
        │   ├── added_vocabulary.rs
        │   ├── encoding.rs
        │   ├── mod.rs
        │   ├── normalizer.rs
        │   ├── pattern.rs
        │   ├── pre_tokenizer.rs
        │   └── serialization.rs
        └── utils
        │   ├── cache.rs
        │   ├── fancy.rs
        │   ├── from_pretrained.rs
        │   ├── iter.rs
        │   ├── mod.rs
        │   ├── onig.rs
        │   ├── padding.rs
        │   ├── parallelism.rs
        │   ├── progress.rs
        │   └── truncation.rs
    └── tests
        ├── added_tokens.rs
        ├── common
            └── mod.rs
        ├── documentation.rs
        ├── from_pretrained.rs
        ├── offsets.rs
        ├── serialization.rs
        ├── stream.rs
        ├── training.rs
        └── unigram.rs


/.github/conda/bld.bat:
--------------------------------------------------------------------------------
1 | cd bindings\python
2 | %PYTHON% -m pip install . --prefix=%PREFIX%
3 | 


--------------------------------------------------------------------------------
/.github/conda/build.sh:
--------------------------------------------------------------------------------
1 | cd bindings/python
2 | $PYTHON -m pip install . --prefix=$PREFIX
3 | 


--------------------------------------------------------------------------------
/.github/conda/meta.yaml:
--------------------------------------------------------------------------------
 1 | {% set name = "tokenizers" %}
 2 | 
 3 | package:
 4 |   name: "{{ name|lower }}"
 5 |   version: "{{ TOKENIZERS_VERSION }}"
 6 | 
 7 | source:
 8 |   path: ../../
 9 | 
10 | requirements:
11 |   host:
12 |     - pip
13 |     - python x.x
14 |     - setuptools
15 |     - setuptools-rust
16 |     - pkg-config
17 |     - openssl
18 |     - maturin
19 | 
20 |   run:
21 |     - python x.x
22 | 
23 | test:
24 |   imports:
25 |     - tokenizers
26 |     - tokenizers.models
27 | 
28 | about:
29 |   home: https://huggingface.co/docs/tokenizers
30 |   license: Apache License 2.0
31 |   license_file: LICENSE
32 |   summary: "💥 Fast State-of-the-Art Tokenizers optimized for Research and Production"
33 | 


--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
 1 | # Number of days of inactivity before an issue becomes stale
 2 | daysUntilStale: 60
 3 | # Number of days of inactivity before a stale issue is closed
 4 | daysUntilClose: 7
 5 | # Issues with these labels will never be considered stale
 6 | exemptLabels:
 7 |   - pinned
 8 |   - security
 9 | # Label to use when marking an issue as stale
10 | staleLabel: wontfix
11 | # Comment to post when marking an issue as stale. Set to `false` to disable
12 | markComment: >
13 |   This issue has been automatically marked as stale because it has not had
14 |   recent activity. It will be closed if no further activity occurs. Thank you
15 |   for your contributions.
16 | # Comment to post when closing a stale issue. Set to `false` to disable
17 | closeComment: false
18 | 


--------------------------------------------------------------------------------
/.github/workflows/build_documentation.yml:
--------------------------------------------------------------------------------
 1 | name: Build documentation
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |       - doc-builder*
 8 |       - v*-release
 9 |       - use_templates
10 | 
11 | jobs:
12 |   build:
13 |     uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
14 |     with:
15 |       commit_sha: ${{ github.sha }}
16 |       package: tokenizers
17 |       path_to_docs: tokenizers/docs/source-doc-builder/
18 |       package_path: tokenizers/bindings/python/
19 |       install_rust: true
20 |     secrets:
21 |       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
22 | 


--------------------------------------------------------------------------------
/.github/workflows/build_pr_documentation.yml:
--------------------------------------------------------------------------------
 1 | name: Build PR Documentation
 2 | 
 3 | on:
 4 |   pull_request:
 5 | 
 6 | concurrency:
 7 |   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
 8 |   cancel-in-progress: true
 9 | 
10 | jobs:
11 |   build:
12 |     uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
13 |     with:
14 |       commit_sha: ${{ github.event.pull_request.head.sha }}
15 |       pr_number: ${{ github.event.number }}
16 |       package: tokenizers
17 |       path_to_docs: tokenizers/docs/source-doc-builder/
18 |       package_path: tokenizers/bindings/python/
19 |       install_rust: true
20 | 


--------------------------------------------------------------------------------
/.github/workflows/delete_doc_comment.yml:
--------------------------------------------------------------------------------
 1 | name: Delete doc comment
 2 | 
 3 | on:
 4 |   workflow_run:
 5 |     workflows: ["Delete doc comment trigger"]
 6 |     types:
 7 |       - completed
 8 | 
 9 | jobs:
10 |   delete:
11 |     uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
12 |     secrets:
13 |       comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}


--------------------------------------------------------------------------------
/.github/workflows/delete_doc_comment_trigger.yml:
--------------------------------------------------------------------------------
 1 | name: Delete doc comment trigger
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     types: [ closed ]
 6 | 
 7 | 
 8 | jobs:
 9 |   delete:
10 |     uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main
11 |     with:
12 |       pr_number: ${{ github.event.number }}


--------------------------------------------------------------------------------
/.github/workflows/docs-check.yml:
--------------------------------------------------------------------------------
 1 | name: Documentation
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |   pull_request:
 8 | 
 9 | jobs:
10 |   build:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Checkout repository
14 |         uses: actions/checkout@v4
15 | 
16 |       - name: Install Python
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version: 3.12
20 | 
21 |       - name: Install dependencies
22 |         run: pip install sphinx sphinx_rtd_theme setuptools-rust
23 | 
24 |       - name: Install Rust
25 |         uses: dtolnay/rust-toolchain@stable
26 | 
27 |       - name: Build tokenizers
28 |         working-directory: ./bindings/python
29 |         run: pip install -e .
30 | 
31 |       - name: Build documentation
32 |         working-directory: ./docs
33 |         run: make clean && make html_all O="-W --keep-going"
34 | 
35 |       - name: Upload built doc
36 |         uses: actions/upload-artifact@v4
37 |         with:
38 |           name: documentation
39 |           path: ./docs/build/*
40 | 


--------------------------------------------------------------------------------
/.github/workflows/node.yml:
--------------------------------------------------------------------------------
 1 | name: Node
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - main
 6 |     paths-ignore:
 7 |       - bindings/python/**
 8 |   pull_request:
 9 |     paths-ignore:
10 |       - bindings/python/**
11 | 
12 | jobs:
13 |   build_and_test:
14 |     name: Check everything builds
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |       - name: Checkout repository
18 |         uses: actions/checkout@v4
19 | 
20 |       - name: Install Rust
21 |         uses: dtolnay/rust-toolchain@stable
22 |         with:
23 |           components: rustfmt, clippy
24 | 
25 |       # Necessary for now for the cargo cache: https://github.com/actions/cache/issues/133#issuecomment-599102035
26 |       - run: sudo chown -R $(whoami):$(id -ng) ~/.cargo/
27 | 
28 |       - name: Cache Cargo Registry
29 |         uses: actions/cache@v4
30 |         with:
31 |           path: ~/.cargo/registry
32 |           key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
33 | 
34 |       - name: Install Node
35 |         uses: actions/setup-node@v4
36 |         with:
37 |           node-version: latest
38 |       - name: Install dependencies
39 |         working-directory: ./bindings/node
40 |         run: yarn install
41 | 
42 |       - name: Build all
43 |         working-directory: ./bindings/node
44 |         run: yarn build
45 | 
46 |       - name: Lint Rust formatting
47 |         uses: actions-rs/cargo@v1
48 |         with:
49 |           command: fmt
50 |           args: --manifest-path ./bindings/node/Cargo.toml -- --check
51 | 
52 |       - name: Lint Rust with Clippy
53 |         uses: actions-rs/cargo@v1
54 |         with:
55 |           command: clippy
56 |           args: --manifest-path ./bindings/node/Cargo.toml --all-targets --all-features -- -D warnings
57 | 
58 |       - name: Lint TS
59 |         working-directory: ./bindings/node
60 |         run: yarn lint
61 | 
62 |       - name: Run JS tests
63 |         working-directory: ./bindings/node
64 |         run: make test
65 | 


--------------------------------------------------------------------------------
/.github/workflows/rust-release.yml:
--------------------------------------------------------------------------------
 1 | name: Rust Release
 2 | 
 3 | env:
 4 |   CRATES_TOKEN: ${{ secrets.CRATES_TOKEN }}
 5 | 
 6 | on:
 7 |   push:
 8 |     tags:
 9 |       - v*
10 | 
11 | jobs:
12 |   rust_publish:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - name: Checkout repository
16 |         uses: actions/checkout@v4
17 | 
18 |       - name: Install Rust
19 |         uses: dtolnay/rust-toolchain@stable
20 | 
21 |       - name: Cache Cargo Registry
22 |         uses: actions/cache@v4
23 |         with:
24 |           path: ~/.cargo/registry
25 |           key: ubuntu-latest-cargo-registry-${{ hashFiles('**/Cargo.toml') }}
26 | 
27 |       - name: Publish package rust
28 |         working-directory: ./tokenizers
29 |         if: ${{ !contains(github.ref, 'rc') }}
30 |         run: cargo publish --token ${CRATES_TOKEN}
31 | 
32 | 


--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
 1 | name: 'Close stale issues and PRs'
 2 | on:
 3 |   schedule:
 4 |     - cron: '30 1 * * *'
 5 | 
 6 | jobs:
 7 |   stale:
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |       - uses: actions/stale@v9
11 |         with:
12 |           stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
13 |           days-before-stale: 30
14 |           days-before-close: 5
15 | 


--------------------------------------------------------------------------------
/.github/workflows/trufflehog.yml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 | 
 4 | name: Secret Leaks
 5 | 
 6 | jobs:
 7 |   trufflehog:
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |     - name: Checkout code
11 |       uses: actions/checkout@v4
12 |       with:
13 |         fetch-depth: 0
14 |     - name: Secret Scanning
15 |       uses: trufflesecurity/trufflehog@853e1e8d249fd1e29d0fcc7280d29b03df3d643d
16 |       with:
17 |         # exclude buggy postgres detector that is causing false positives and not relevant to our codebase
18 |         extra_args: --results=verified,unknown --exclude-detectors=postgres
19 | 


--------------------------------------------------------------------------------
/.github/workflows/upload_pr_documentation.yml:
--------------------------------------------------------------------------------
 1 | name: Upload PR Documentation
 2 | 
 3 | on:
 4 |   workflow_run:
 5 |     workflows: ["Build PR Documentation"]
 6 |     types:
 7 |       - completed
 8 | 
 9 | jobs:
10 |   build:
11 |     uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
12 |     with:
13 |       package_name: tokenizers
14 |     secrets:
15 |       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
16 |       comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | *~
 3 | 
 4 | .vim
 5 | .env
 6 | target
 7 | .idea
 8 | **/Cargo.lock
 9 | 
10 | /data
11 | tokenizers/data
12 | bindings/python/tests/data
13 | docs/build/
14 | docs/make.bat
15 | 
16 | __pycache__
17 | pip-wheel-metadata
18 | *.egg-info
19 | *.so
20 | /bindings/python/examples/.ipynb_checkpoints
21 | /bindings/python/build
22 | /bindings/python/dist
23 | 
24 | .vscode
25 | *.code-workspace
26 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | # This CITATION.cff file was generated with cffinit.
 2 | # Visit https://bit.ly/cffinit to generate yours today!
 3 | 
 4 | cff-version: 1.2.0
 5 | title: HuggingFace's Tokenizers
 6 | message: >-
 7 |   Fast State-of-the-Art Tokenizers optimized for Research
 8 |   and Production.
 9 | type: software
10 | authors:
11 |   - given-names: Anthony
12 |     family-names: Moi
13 |     email: m.anthony.moi@gmail.com
14 |     affiliation: HuggingFace
15 |   - given-names: Nicolas
16 |     family-names: Patry
17 |     affiliation: HuggingFace
18 | repository-code: 'https://github.com/huggingface/tokenizers'
19 | url: 'https://github.com/huggingface/tokenizers'
20 | repository: 'https://huggingface.co'
21 | abstract: >-
22 |   Fast State-of-the-Art Tokenizers optimized for Research
23 |   and Production.
24 | keywords:
25 |   - Rust
26 |   - Tokenizer
27 |   - NLP
28 | license: Apache-2.0
29 | commit: 37372b6
30 | version: 0.13.4
31 | date-released: '2023-04-05'
32 | 


--------------------------------------------------------------------------------
/bindings/node/.cargo/config.toml:
--------------------------------------------------------------------------------
1 | [target.aarch64-unknown-linux-musl]
2 | linker = "aarch64-linux-musl-gcc"
3 | rustflags = ["-C", "target-feature=-crt-static"]
4 | 


--------------------------------------------------------------------------------
/bindings/node/.editorconfig:
--------------------------------------------------------------------------------
 1 | # EditorConfig helps developers define and maintain consistent
 2 | # coding styles between different editors or IDEs
 3 | # http://editorconfig.org
 4 | root = true
 5 | 
 6 | [*]
 7 | indent_style = space
 8 | indent_size = 2
 9 | end_of_line = lf
10 | charset = utf-8
11 | trim_trailing_whitespace = true
12 | insert_final_newline = true
13 | 
14 | [*.md]
15 | trim_trailing_whitespace = false
16 | 


--------------------------------------------------------------------------------
/bindings/node/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | *        text=auto
 3 | 
 4 | 
 5 | *.ts    text eol=lf merge=union 
 6 | *.tsx   text eol=lf merge=union 
 7 | *.rs    text eol=lf merge=union 
 8 | *.js    text eol=lf merge=union 
 9 | *.json  text eol=lf merge=union 
10 | *.debug text eol=lf merge=union 
11 | 
12 | # Generated codes
13 | index.js linguist-detectable=false
14 | index.d.ts linguist-detectable=false 


--------------------------------------------------------------------------------
/bindings/node/.gitignore:
--------------------------------------------------------------------------------
  1 | 
  2 | # Created by https://www.toptal.com/developers/gitignore/api/node
  3 | # Edit at https://www.toptal.com/developers/gitignore?templates=node
  4 | 
  5 | ### Node ###
  6 | # Logs
  7 | logs
  8 | *.log
  9 | npm-debug.log*
 10 | yarn-debug.log*
 11 | yarn-error.log*
 12 | lerna-debug.log*
 13 | 
 14 | # Diagnostic reports (https://nodejs.org/api/report.html)
 15 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
 16 | 
 17 | # Runtime data
 18 | pids
 19 | *.pid
 20 | *.seed
 21 | *.pid.lock
 22 | 
 23 | # Directory for instrumented libs generated by jscoverage/JSCover
 24 | lib-cov
 25 | 
 26 | # Coverage directory used by tools like istanbul
 27 | coverage
 28 | *.lcov
 29 | 
 30 | # nyc test coverage
 31 | .nyc_output
 32 | 
 33 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
 34 | .grunt
 35 | 
 36 | # Bower dependency directory (https://bower.io/)
 37 | bower_components
 38 | 
 39 | # node-waf configuration
 40 | .lock-wscript
 41 | 
 42 | # Compiled binary addons (https://nodejs.org/api/addons.html)
 43 | build/Release
 44 | 
 45 | # Dependency directories
 46 | node_modules/
 47 | jspm_packages/
 48 | 
 49 | # TypeScript v1 declaration files
 50 | typings/
 51 | 
 52 | # TypeScript cache
 53 | *.tsbuildinfo
 54 | 
 55 | # Optional npm cache directory
 56 | .npm
 57 | 
 58 | # Optional eslint cache
 59 | .eslintcache
 60 | 
 61 | # Microbundle cache
 62 | .rpt2_cache/
 63 | .rts2_cache_cjs/
 64 | .rts2_cache_es/
 65 | .rts2_cache_umd/
 66 | 
 67 | # Optional REPL history
 68 | .node_repl_history
 69 | 
 70 | # Output of 'npm pack'
 71 | *.tgz
 72 | 
 73 | # Yarn Integrity file
 74 | .yarn-integrity
 75 | 
 76 | # dotenv environment variables file
 77 | .env
 78 | .env.test
 79 | 
 80 | # parcel-bundler cache (https://parceljs.org/)
 81 | .cache
 82 | 
 83 | # Next.js build output
 84 | .next
 85 | 
 86 | # Nuxt.js build / generate output
 87 | .nuxt
 88 | dist
 89 | 
 90 | # Gatsby files
 91 | .cache/
 92 | # Comment in the public line if your project uses Gatsby and not Next.js
 93 | # https://nextjs.org/blog/next-9-1#public-directory-support
 94 | # public
 95 | 
 96 | # vuepress build output
 97 | .vuepress/dist
 98 | 
 99 | # Serverless directories
100 | .serverless/
101 | 
102 | # FuseBox cache
103 | .fusebox/
104 | 
105 | # DynamoDB Local files
106 | .dynamodb/
107 | 
108 | # TernJS port file
109 | .tern-port
110 | 
111 | # Stores VSCode versions used for testing VSCode extensions
112 | .vscode-test
113 | 
114 | # End of https://www.toptal.com/developers/gitignore/api/node
115 | 
116 | 
117 | #Added by cargo
118 | 
119 | /target
120 | Cargo.lock
121 | 
122 | *.node
123 | .pnp.*
124 | .yarn/*
125 | !.yarn/patches
126 | !.yarn/plugins
127 | !.yarn/releases
128 | !.yarn/sdks
129 | !.yarn/versions


--------------------------------------------------------------------------------
/bindings/node/.prettierignore:
--------------------------------------------------------------------------------
1 | target
2 | .yarn


--------------------------------------------------------------------------------
/bindings/node/.taplo.toml:
--------------------------------------------------------------------------------
1 | exclude = ["node_modules/**/*.toml"]
2 | 
3 | # https://taplo.tamasfe.dev/configuration/formatter-options.html
4 | [formatting]
5 | align_entries = true
6 | indent_tables = true
7 | reorder_keys  = true
8 | 


--------------------------------------------------------------------------------
/bindings/node/.yarnrc.yml:
--------------------------------------------------------------------------------
1 | nodeLinker: node-modules
2 | 
3 | npmAuditRegistry: 'https://registry.npmjs.org'
4 | 
5 | yarnPath: .yarn/releases/yarn-3.5.1.cjs
6 | 


--------------------------------------------------------------------------------
/bindings/node/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | authors = ["Nicolas Patry <nicolas@huggingface.co>"]
 3 | edition = "2021"
 4 | name    = "node"
 5 | version = "0.21.4-dev.0"
 6 | 
 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 8 | 
 9 | [lib]
10 | crate-type = ["cdylib"]
11 | 
12 | [dependencies]
13 | napi        = "2"
14 | napi-derive = "2"
15 | serde       = { version = "1.0.163", features = ["derive"] }
16 | tokenizers  = { path = "../../tokenizers/" }
17 | ahash = { version = "0.8.11", features = ["serde"] }
18 | 
19 | [build-dependencies]
20 | napi-build = "2"
21 | 
22 | [profile.release]
23 | lto = true
24 | 


--------------------------------------------------------------------------------
/bindings/node/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 N-API for Rust
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/bindings/node/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: style check-style test
 2 | 
 3 | DATA_DIR = data
 4 | 
 5 | dir_guard=@mkdir -p $(@D)
 6 | 
 7 | # Format source code automatically
 8 | style:
 9 | 	npm run lint
10 | 
11 | # Check the source code is formatted correctly
12 | check-style:
13 | 	npm run lint-check
14 | 
15 | TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json
16 | 
17 | # Launch the test suite
18 | test: $(TESTS_RESOURCES)
19 | 	npm run test
20 | 
21 | $(DATA_DIR)/big.txt :
22 | 	$(dir_guard)
23 | 	wget https://norvig.com/big.txt -O $@
24 | 
25 | $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
26 | 	head -100 $(DATA_DIR)/big.txt > $@
27 | 
28 | $(DATA_DIR)/roberta.json :
29 | 	$(dir_guard)
30 | 	wget https://huggingface.co/roberta-large/raw/main/tokenizer.json -O $@
31 | 
32 | $(DATA_DIR)/tokenizer-wiki.json :
33 | 	$(dir_guard)
34 | 	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@
35 | 
36 | $(DATA_DIR)/bert-wiki.json :
37 | 	$(dir_guard)
38 | 	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json -O $@
39 | 


--------------------------------------------------------------------------------
/bindings/node/README.md:
--------------------------------------------------------------------------------
 1 | <p align="center">
 2 |   <br>
 3 |   <img src="https://huggingface.co/landing/assets/tokenizers/tokenizers-logo.png" width="600"/>
 4 |   <br>
 5 | </p>
 6 | <p align="center">
 7 |   <a href="https://badge.fury.io/js/tokenizers">
 8 |     <img alt="Build" src="https://badge.fury.io/js/tokenizers.svg">
 9 |   </a>
10 |   <a href="https://github.com/huggingface/tokenizers/blob/master/LICENSE">
11 |     <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/tokenizers.svg?color=blue">
12 |   </a>
13 | </p>
14 | <br>
15 | 
16 | NodeJS implementation of today's most used tokenizers, with a focus on performance and
17 | versatility. Bindings over the [Rust](https://github.com/huggingface/tokenizers/tree/master/tokenizers) implementation.
18 | If you are interested in the High-level design, you can go check it there.
19 | 
20 | ## Main features
21 | 
22 |  - Train new vocabularies and tokenize using 4 pre-made tokenizers (Bert WordPiece and the 3
23 |    most common BPE versions).
24 |  - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes
25 |    less than 20 seconds to tokenize a GB of text on a server's CPU.
26 |  - Easy to use, but also extremely versatile.
27 |  - Designed for research and production.
28 |  - Normalization comes with alignments tracking. It's always possible to get the part of the
29 |    original sentence that corresponds to a given token.
30 |  - Does all the pre-processing: Truncate, Pad, add the special tokens your model needs.
31 | 
32 | ## Installation
33 | 
34 | ```bash
35 | npm install tokenizers@latest
36 | ```
37 | 
38 | ## Basic example
39 | 
40 | ```ts
41 | import { Tokenizer } from "tokenizers";
42 | 
43 | const tokenizer = await Tokenizer.fromFile("tokenizer.json");
44 | const wpEncoded = await tokenizer.encode("Who is John?");
45 | 
46 | console.log(wpEncoded.getLength());
47 | console.log(wpEncoded.getTokens());
48 | console.log(wpEncoded.getIds());
49 | console.log(wpEncoded.getAttentionMask());
50 | console.log(wpEncoded.getOffsets());
51 | console.log(wpEncoded.getOverflowing());
52 | console.log(wpEncoded.getSpecialTokensMask());
53 | console.log(wpEncoded.getTypeIds());
54 | console.log(wpEncoded.getWordIds());
55 | ```
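
Since normalization keeps track of alignments (see the features above), the offsets returned by `getOffsets()` can be used to recover the exact slice of the original input behind each token. A minimal sketch, reusing the same `tokenizer.json` and the accessors already shown in the example above (variable names here are illustrative):

```ts
import { Tokenizer } from "tokenizers";

const tokenizer = await Tokenizer.fromFile("tokenizer.json");
const input = "Who is John?";
const encoded = await tokenizer.encode(input);

// Pair each token with the span of the original input it was produced from.
const tokens = encoded.getTokens();
const offsets = encoded.getOffsets();
tokens.forEach((token, i) => {
  const [start, end] = offsets[i];
  console.log(`${token} -> "${input.slice(start, end)}"`);
});
```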
56 | 
57 | ## License
58 | 
59 | [Apache License 2.0](../../LICENSE)
60 | 


--------------------------------------------------------------------------------
/bindings/node/build.rs:
--------------------------------------------------------------------------------
1 | extern crate napi_build;
2 | 
3 | fn main() {
4 |   napi_build::setup();
5 | }
6 | 


--------------------------------------------------------------------------------
/bindings/node/lib/bindings/__mocks__/merges.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/bindings/node/lib/bindings/__mocks__/merges.txt


--------------------------------------------------------------------------------
/bindings/node/lib/bindings/__mocks__/vocab.json:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/bindings/node/lib/bindings/__mocks__/vocab.txt:
--------------------------------------------------------------------------------
 1 | my
 2 | name
 3 | is
 4 | jo
 5 | ##hn
 6 | what
 7 | yours
 8 | pair
 9 | [UNK]
10 | 


--------------------------------------------------------------------------------
/bindings/node/lib/bindings/models.test.ts:
--------------------------------------------------------------------------------
 1 | /* eslint-disable @typescript-eslint/no-empty-function */
 2 | /* eslint-disable @typescript-eslint/no-explicit-any */
 3 | 
 4 | import { BPE, Unigram, WordPiece } from '../../'
 5 | 
 6 | const MOCKS_DIR = __dirname + '/__mocks__'
 7 | 
 8 | describe('WordPiece', () => {
 9 |   describe('fromFile', () => {
10 |     it('throws if called with only one argument', () => {
11 |       expect(() => (WordPiece as any).fromFile()).toThrow(
12 |         'Failed to convert JavaScript value `Undefined` into rust type `String`',
13 |       )
14 |     })
15 | 
16 |     it('throws if called with 2 arguments without a callback as third argument', () => {
17 |       expect(() => (WordPiece as any).fromFile({})).toThrow(
18 |         'Failed to convert JavaScript value `Object {}` into rust type `String`',
19 |       )
20 |     })
21 | 
22 |     it('has its callback called with the loaded model', async () => {
23 |       const model = await WordPiece.fromFile(`${MOCKS_DIR}/vocab.txt`)
24 |       expect(model).toBeDefined()
25 |     })
26 |   })
27 | })
28 | 
29 | describe('BPE', () => {
30 |   describe('fromFile', () => {
31 |     it('has its callback called with the loaded model', async () => {
32 |       const model = await BPE.fromFile(`${MOCKS_DIR}/vocab.json`, `${MOCKS_DIR}/merges.txt`)
33 |       expect(model).toBeDefined()
34 |     })
35 | 
36 |     it('has its callback called with the loaded model', async () => {
37 |       const model = await BPE.fromFile(`${MOCKS_DIR}/vocab.json`, `${MOCKS_DIR}/merges.txt`, {})
38 |       expect(model).toBeDefined()
39 |     })
40 |   })
41 |   describe('When initialized from memory', () => {
42 |     it('returns the loaded Model', () => {
43 |       const bpe = BPE.init({ a: 0, b: 1, ab: 2 }, [['a', 'b']])
44 |       // expect(bpe.constructor.name).toEqual("Model");
45 |       expect(bpe.constructor.name).toEqual('BPE')
46 |     })
47 |   })
48 | })
49 | 
50 | describe('Unigram', () => {
51 |   it('can be initialized from memory', () => {
52 |     const unigram = Unigram.init(
53 |       [
54 |         ['<unk>', 0],
55 |         ['Hello', -1],
56 |         ['there', -2],
57 |       ],
58 |       {
59 |         unkId: 0,
60 |       },
61 |     )
62 |     expect(unigram.constructor.name).toEqual('Unigram')
63 |   })
64 | })
65 | 


--------------------------------------------------------------------------------
/bindings/node/lib/bindings/normalizers.test.ts:
--------------------------------------------------------------------------------
 1 | import { prependNormalizer, stripAccentsNormalizer, stripNormalizer } from '../../'
 2 | 
 3 | describe('stripNormalizer', () => {
 4 |   it('instantiates with no parameters', () => {
 5 |     const normalizer = stripNormalizer()
 6 |     expect(normalizer.constructor.name).toEqual('Normalizer')
 7 |   })
 8 | 
 9 |   it('accepts `undefined` as first parameter', () => {
10 |     expect(stripNormalizer(undefined)).toBeDefined()
11 |   })
12 | 
13 |   it('accepts `undefined` as second parameter', () => {
14 |     expect(stripNormalizer(false, undefined)).toBeDefined()
15 |   })
16 | 
17 |   it('instantiates with one parameter', () => {
18 |     const normalizer = stripNormalizer(false)
19 |     expect(normalizer.constructor.name).toEqual('Normalizer')
20 |   })
21 | 
22 |   it('instantiates with two parameters', () => {
23 |     const normalizer = stripNormalizer(false, true)
24 |     expect(normalizer.constructor.name).toEqual('Normalizer')
25 |   })
26 | 
27 |   it('prepend instantiates with one parameter', () => {
28 |     const normalizer = prependNormalizer('_')
29 |     expect(normalizer.constructor.name).toEqual('Normalizer')
30 |     expect(normalizer.normalizeString('Hello')).toEqual('_Hello')
31 |   })
32 | 
33 |   it('can normalize strings', () => {
34 |     const normalizer = stripNormalizer()
35 |     expect(normalizer.normalizeString('     Hello there   ')).toEqual('Hello there')
36 |   })
37 | })
38 | 
39 | describe('stripAccentsNormalizer', () => {
40 |   it('initialize', () => {
41 |     const normalizer = stripAccentsNormalizer()
42 |     expect(normalizer.constructor.name).toEqual('Normalizer')
43 |   })
44 | })
45 | 


--------------------------------------------------------------------------------
/bindings/node/lib/bindings/post-processors.test.ts:
--------------------------------------------------------------------------------
 1 | /* eslint-disable @typescript-eslint/no-explicit-any */
 2 | 
 3 | import { bertProcessing, byteLevelProcessing, robertaProcessing, sequenceProcessing, templateProcessing } from '../../'
 4 | 
 5 | describe('bertProcessing', () => {
 6 |   it('instantiates correctly with only two parameters', () => {
 7 |     const processor = bertProcessing(['sep', 1], ['cls', 2])
 8 |     expect(processor.constructor.name).toEqual('Processor')
 9 |   })
10 | 
11 |   it('throws if only one argument is provided', () => {
12 |     expect(() => (bertProcessing as any)(['sep', 1])).toThrow('Given napi value is not an array')
13 |   })
14 | 
15 |   it('throws if arguments are malformed', () => {
16 |     expect(() => (bertProcessing as any)(['sep', '1'], ['cls', '2'])).toThrow(
17 |       'Failed to convert napi value String into rust type `u32`',
18 |     )
19 |     expect(() => (bertProcessing as any)(['sep'], ['cls'])).toThrow('Array length < 2')
20 |   })
21 | })
22 | 
23 | describe('byteLevelProcessing', () => {
24 |   it('instantiates correctly without any parameter', () => {
25 |     const processor = byteLevelProcessing()
26 |     expect(processor.constructor.name).toEqual('Processor')
27 |   })
28 | 
29 |   it('accepts `undefined` as first parameter', () => {
30 |     expect(byteLevelProcessing(undefined)).toBeDefined()
31 |   })
32 | 
33 |   it('accepts `boolean` as first parameter', () => {
34 |     expect(byteLevelProcessing(true)).toBeDefined()
35 |   })
36 | })
37 | 
38 | describe('robertaProcessing', () => {
39 |   it('instantiates correctly with only two parameters', () => {
40 |     const processor = robertaProcessing(['sep', 1], ['cls', 2])
41 |     expect(processor.constructor.name).toEqual('Processor')
42 |   })
43 | 
44 |   it('accepts `undefined` as third and fourth parameters', () => {
45 |     expect(robertaProcessing(['sep', 1], ['cls', 2], undefined, undefined)).toBeDefined()
46 |   })
47 | 
48 |   it('accepts `boolean` as third and fourth parameter', () => {
49 |     expect(robertaProcessing(['sep', 1], ['cls', 2], true, true)).toBeDefined()
50 |   })
51 | })
52 | 
53 | describe('templateProcessing', () => {
54 |   it('instantiates correctly with only a single template', () => {
55 |     const processor = templateProcessing('$A $A')
56 |     expect(processor.constructor.name).toEqual('Processor')
57 |   })
58 | 
59 |   it('throws if special tokens are missing', () => {
60 |     expect(() => templateProcessing('[CLS] $A [SEP]')).toThrow('Missing SpecialToken(s) with id(s)')
61 |   })
62 | 
63 |   it('instantiates correctly with both templates', () => {
64 |     const processor = templateProcessing('[CLS] $A [SEP]', '[CLS] $A [SEP] $B:1 [SEP]:1', [
65 |       ['[CLS]', 1],
66 |       ['[SEP]', 2],
67 |     ])
68 |     expect(processor.constructor.name).toEqual('Processor')
69 |   })
70 | })
71 | 
72 | describe('sequenceProcessing', () => {
73 |   it('accepts `PostProcessor[]` as first parameter', () => {
74 |     const template = templateProcessing('[CLS] $A [SEP]', '[CLS] $A [SEP] $B:1 [SEP]:1', [
75 |       ['[CLS]', 1],
76 |       ['[SEP]', 2],
77 |     ])
78 |     const bytelevel = byteLevelProcessing(true)
79 |     expect(sequenceProcessing([bytelevel, template])).toBeDefined()
80 |   })
81 | })
82 | 


--------------------------------------------------------------------------------
/bindings/node/lib/bindings/pre-tokenizers.test.ts:
--------------------------------------------------------------------------------
 1 | import {
 2 |   byteLevelPreTokenizer,
 3 |   metaspacePreTokenizer,
 4 |   punctuationPreTokenizer,
 5 |   sequencePreTokenizer,
 6 |   splitPreTokenizer,
 7 |   whitespaceSplitPreTokenizer,
 8 | } from '../../'
 9 | 
10 | describe('byteLevelPreTokenizer', () => {
11 |   it('instantiates correctly', () => {
12 |     const processor = byteLevelPreTokenizer()
13 |     expect(processor.constructor.name).toEqual('PreTokenizer')
14 |   })
15 | })
16 | 
17 | describe('metaspacePreTokenizer', () => {
18 |   it('instantiates correctly without any parameter', () => {
19 |     const processor = metaspacePreTokenizer()
20 |     expect(processor.constructor.name).toEqual('PreTokenizer')
21 |   })
22 | 
23 |   it('accepts `undefined` as first parameter', () => {
24 |     expect(metaspacePreTokenizer(undefined)).toBeDefined()
25 |   })
26 | 
27 |   it('accepts `undefined` as second parameter', () => {
28 |     expect(metaspacePreTokenizer('t', undefined)).toBeDefined()
29 |   })
30 | 
31 |   it('can pre-tokenize strings', () => {
32 |     const pretok = metaspacePreTokenizer()
33 |     expect(pretok.preTokenizeString('Hello there friend')).toEqual([
34 |       ['▁Hello', [0, 5]],
35 |       ['▁there', [5, 11]],
36 |       ['▁friend', [11, 18]],
37 |     ])
38 |   })
39 | })
40 | 
41 | describe('punctuationPreTokenizer', () => {
42 |   it('instantiates correctly without any parameter', () => {
43 |     const processor = punctuationPreTokenizer()
44 |     expect(processor.constructor.name).toEqual('PreTokenizer')
45 |   })
46 | 
47 |   it('instantiates correctly with non-default split delimiter', () => {
48 |     const processor = punctuationPreTokenizer('removed')
49 |     expect(processor.constructor.name).toEqual('PreTokenizer')
50 |   })
51 | })
52 | 
53 | describe('splitPreTokenizer', () => {
54 |   it('instantiates correctly with invert parameter', () => {
55 |     const processor = splitPreTokenizer(' ', 'mergedWithPrevious', false)
56 |     expect(processor.constructor.name).toEqual('PreTokenizer')
57 |   })
58 | })
59 | 
60 | describe('sequencePreTokenizer', () => {
61 |   it('instantiates correctly', () => {
62 |     const punctuation = punctuationPreTokenizer()
63 |     const whitespace = whitespaceSplitPreTokenizer()
64 |     const sequence2 = sequencePreTokenizer([])
65 |     expect(sequence2.constructor.name).toEqual('PreTokenizer')
66 |     const sequence3 = sequencePreTokenizer([punctuation, whitespace])
67 |     expect(sequence3.constructor.name).toEqual('PreTokenizer')
68 |   })
69 | })
70 | 


--------------------------------------------------------------------------------
/bindings/node/npm/android-arm-eabi/README.md:
--------------------------------------------------------------------------------
1 | # `tokenizers-android-arm-eabi`
2 | 
3 | This is the **armv7-linux-androideabi** binary for `tokenizers`
4 | 


--------------------------------------------------------------------------------
/bindings/node/npm/android-arm-eabi/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tokenizers-android-arm-eabi",
 3 |   "version": "0.13.4-rc1",
 4 |   "os": [
 5 |     "android"
 6 |   ],
 7 |   "cpu": [
 8 |     "arm"
 9 |   ],
10 |   "main": "tokenizers.android-arm-eabi.node",
11 |   "files": [
12 |     "tokenizers.android-arm-eabi.node"
13 |   ],
14 |   "description": "Tokenizers platform specific bindings",
15 |   "keywords": [
16 |     "napi-rs",
17 |     "NAPI",
18 |     "N-API",
19 |     "Rust",
20 |     "node-addon",
21 |     "node-addon-api"
22 |   ],
23 |   "license": "MIT",
24 |   "engines": {
25 |     "node": ">= 10"
26 |   },
27 |   "publishConfig": {
28 |     "registry": "https://registry.npmjs.org/",
29 |     "access": "public"
30 |   },
31 |   "repository": "tokenizers"
32 | }


--------------------------------------------------------------------------------
/bindings/node/npm/android-arm64/README.md:
--------------------------------------------------------------------------------
1 | # `tokenizers-android-arm64`
2 | 
3 | This is the **aarch64-linux-android** binary for `tokenizers`
4 | 


--------------------------------------------------------------------------------
/bindings/node/npm/android-arm64/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tokenizers-android-arm64",
 3 |   "version": "0.13.4-rc1",
 4 |   "os": [
 5 |     "android"
 6 |   ],
 7 |   "cpu": [
 8 |     "arm64"
 9 |   ],
10 |   "main": "tokenizers.android-arm64.node",
11 |   "files": [
12 |     "tokenizers.android-arm64.node"
13 |   ],
14 |   "description": "Tokenizers platform specific bindings",
15 |   "keywords": [
16 |     "napi-rs",
17 |     "NAPI",
18 |     "N-API",
19 |     "Rust",
20 |     "node-addon",
21 |     "node-addon-api"
22 |   ],
23 |   "license": "MIT",
24 |   "engines": {
25 |     "node": ">= 10"
26 |   },
27 |   "publishConfig": {
28 |     "registry": "https://registry.npmjs.org/",
29 |     "access": "public"
30 |   },
31 |   "repository": "tokenizers"
32 | }


--------------------------------------------------------------------------------
/bindings/node/npm/darwin-arm64/README.md:
--------------------------------------------------------------------------------
1 | # `tokenizers-darwin-arm64`
2 | 
3 | This is the **aarch64-apple-darwin** binary for `tokenizers`
4 | 


--------------------------------------------------------------------------------
/bindings/node/npm/darwin-arm64/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tokenizers-darwin-arm64",
 3 |   "version": "0.13.4-rc1",
 4 |   "os": [
 5 |     "darwin"
 6 |   ],
 7 |   "cpu": [
 8 |     "arm64"
 9 |   ],
10 |   "main": "tokenizers.darwin-arm64.node",
11 |   "files": [
12 |     "tokenizers.darwin-arm64.node"
13 |   ],
14 |   "description": "Tokenizers platform specific bindings",
15 |   "keywords": [
16 |     "napi-rs",
17 |     "NAPI",
18 |     "N-API",
19 |     "Rust",
20 |     "node-addon",
21 |     "node-addon-api"
22 |   ],
23 |   "license": "MIT",
24 |   "engines": {
25 |     "node": ">= 10"
26 |   },
27 |   "publishConfig": {
28 |     "registry": "https://registry.npmjs.org/",
29 |     "access": "public"
30 |   },
31 |   "repository": "tokenizers"
32 | }


--------------------------------------------------------------------------------
/bindings/node/npm/darwin-x64/README.md:
--------------------------------------------------------------------------------
1 | # `tokenizers-darwin-x64`
2 | 
3 | This is the **x86_64-apple-darwin** binary for `tokenizers`
4 | 


--------------------------------------------------------------------------------
/bindings/node/npm/darwin-x64/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tokenizers-darwin-x64",
 3 |   "version": "0.13.4-rc1",
 4 |   "os": [
 5 |     "darwin"
 6 |   ],
 7 |   "cpu": [
 8 |     "x64"
 9 |   ],
10 |   "main": "tokenizers.darwin-x64.node",
11 |   "files": [
12 |     "tokenizers.darwin-x64.node"
13 |   ],
14 |   "description": "Tokenizers platform specific bindings",
15 |   "keywords": [
16 |     "napi-rs",
17 |     "NAPI",
18 |     "N-API",
19 |     "Rust",
20 |     "node-addon",
21 |     "node-addon-api"
22 |   ],
23 |   "license": "MIT",
24 |   "engines": {
25 |     "node": ">= 10"
26 |   },
27 |   "publishConfig": {
28 |     "registry": "https://registry.npmjs.org/",
29 |     "access": "public"
30 |   },
31 |   "repository": "tokenizers"
32 | }


--------------------------------------------------------------------------------
/bindings/node/npm/freebsd-x64/README.md:
--------------------------------------------------------------------------------
1 | # `tokenizers-freebsd-x64`
2 | 
3 | This is the **x86_64-unknown-freebsd** binary for `tokenizers`
4 | 


--------------------------------------------------------------------------------
/bindings/node/npm/freebsd-x64/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tokenizers-freebsd-x64",
 3 |   "version": "0.13.4-rc1",
 4 |   "os": [
 5 |     "freebsd"
 6 |   ],
 7 |   "cpu": [
 8 |     "x64"
 9 |   ],
10 |   "main": "tokenizers.freebsd-x64.node",
11 |   "files": [
12 |     "tokenizers.freebsd-x64.node"
13 |   ],
14 |   "description": "Tokenizers platform specific bindings",
15 |   "keywords": [
16 |     "napi-rs",
17 |     "NAPI",
18 |     "N-API",
19 |     "Rust",
20 |     "node-addon",
21 |     "node-addon-api"
22 |   ],
23 |   "license": "MIT",
24 |   "engines": {
25 |     "node": ">= 10"
26 |   },
27 |   "publishConfig": {
28 |     "registry": "https://registry.npmjs.org/",
29 |     "access": "public"
30 |   },
31 |   "repository": "tokenizers"
32 | }


--------------------------------------------------------------------------------
/bindings/node/npm/linux-arm-gnueabihf/README.md:
--------------------------------------------------------------------------------
1 | # `tokenizers-linux-arm-gnueabihf`
2 | 
3 | This is the **armv7-unknown-linux-gnueabihf** binary for `tokenizers`
4 | 


--------------------------------------------------------------------------------
/bindings/node/npm/linux-arm-gnueabihf/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tokenizers-linux-arm-gnueabihf",
 3 |   "version": "0.13.4-rc1",
 4 |   "os": [
 5 |     "linux"
 6 |   ],
 7 |   "cpu": [
 8 |     "arm"
 9 |   ],
10 |   "main": "tokenizers.linux-arm-gnueabihf.node",
11 |   "files": [
12 |     "tokenizers.linux-arm-gnueabihf.node"
13 |   ],
14 |   "description": "Tokenizers platform specific bindings",
15 |   "keywords": [
16 |     "napi-rs",
17 |     "NAPI",
18 |     "N-API",
19 |     "Rust",
20 |     "node-addon",
21 |     "node-addon-api"
22 |   ],
23 |   "license": "MIT",
24 |   "engines": {
25 |     "node": ">= 10"
26 |   },
27 |   "publishConfig": {
28 |     "registry": "https://registry.npmjs.org/",
29 |     "access": "public"
30 |   },
31 |   "repository": "tokenizers"
32 | }


--------------------------------------------------------------------------------
/bindings/node/npm/linux-arm64-gnu/README.md:
--------------------------------------------------------------------------------
1 | # `tokenizers-linux-arm64-gnu`
2 | 
3 | This is the **aarch64-unknown-linux-gnu** binary for `tokenizers`
4 | 


--------------------------------------------------------------------------------
/bindings/node/npm/linux-arm64-gnu/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tokenizers-linux-arm64-gnu",
 3 |   "version": "0.13.4-rc1",
 4 |   "os": [
 5 |     "linux"
 6 |   ],
 7 |   "cpu": [
 8 |     "arm64"
 9 |   ],
10 |   "main": "tokenizers.linux-arm64-gnu.node",
11 |   "files": [
12 |     "tokenizers.linux-arm64-gnu.node"
13 |   ],
14 |   "description": "Tokenizers platform specific bindings",
15 |   "keywords": [
16 |     "napi-rs",
17 |     "NAPI",
18 |     "N-API",
19 |     "Rust",
20 |     "node-addon",
21 |     "node-addon-api"
22 |   ],
23 |   "license": "MIT",
24 |   "engines": {
25 |     "node": ">= 10"
26 |   },
27 |   "publishConfig": {
28 |     "registry": "https://registry.npmjs.org/",
29 |     "access": "public"
30 |   },
31 |   "repository": "tokenizers",
32 |   "libc": [
33 |     "glibc"
34 |   ]
35 | }


--------------------------------------------------------------------------------
/bindings/node/npm/linux-arm64-musl/README.md:
--------------------------------------------------------------------------------
1 | # `tokenizers-linux-arm64-musl`
2 | 
3 | This is the **aarch64-unknown-linux-musl** binary for `tokenizers`
4 | 


--------------------------------------------------------------------------------
/bindings/node/npm/linux-arm64-musl/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tokenizers-linux-arm64-musl",
 3 |   "version": "0.13.4-rc1",
 4 |   "os": [
 5 |     "linux"
 6 |   ],
 7 |   "cpu": [
 8 |     "arm64"
 9 |   ],
10 |   "main": "tokenizers.linux-arm64-musl.node",
11 |   "files": [
12 |     "tokenizers.linux-arm64-musl.node"
13 |   ],
14 |   "description": "Tokenizers platform specific bindings",
15 |   "keywords": [
16 |     "napi-rs",
17 |     "NAPI",
18 |     "N-API",
19 |     "Rust",
20 |     "node-addon",
21 |     "node-addon-api"
22 |   ],
23 |   "license": "MIT",
24 |   "engines": {
25 |     "node": ">= 10"
26 |   },
27 |   "publishConfig": {
28 |     "registry": "https://registry.npmjs.org/",
29 |     "access": "public"
30 |   },
31 |   "repository": "tokenizers",
32 |   "libc": [
33 |     "musl"
34 |   ]
35 | }


--------------------------------------------------------------------------------
/bindings/node/npm/linux-x64-gnu/README.md:
--------------------------------------------------------------------------------
1 | # `tokenizers-linux-x64-gnu`
2 | 
3 | This is the **x86_64-unknown-linux-gnu** binary for `tokenizers`
4 | 


--------------------------------------------------------------------------------
/bindings/node/npm/linux-x64-gnu/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tokenizers-linux-x64-gnu",
 3 |   "version": "0.13.4-rc1",
 4 |   "os": [
 5 |     "linux"
 6 |   ],
 7 |   "cpu": [
 8 |     "x64"
 9 |   ],
10 |   "main": "tokenizers.linux-x64-gnu.node",
11 |   "files": [
12 |     "tokenizers.linux-x64-gnu.node"
13 |   ],
14 |   "description": "Tokenizers platform specific bindings",
15 |   "keywords": [
16 |     "napi-rs",
17 |     "NAPI",
18 |     "N-API",
19 |     "Rust",
20 |     "node-addon",
21 |     "node-addon-api"
22 |   ],
23 |   "license": "MIT",
24 |   "engines": {
25 |     "node": ">= 10"
26 |   },
27 |   "publishConfig": {
28 |     "registry": "https://registry.npmjs.org/",
29 |     "access": "public"
30 |   },
31 |   "repository": "tokenizers",
32 |   "libc": [
33 |     "glibc"
34 |   ]
35 | }


--------------------------------------------------------------------------------
/bindings/node/npm/linux-x64-musl/README.md:
--------------------------------------------------------------------------------
1 | # `tokenizers-linux-x64-musl`
2 | 
3 | This is the **x86_64-unknown-linux-musl** binary for `tokenizers`
4 | 


--------------------------------------------------------------------------------
/bindings/node/npm/linux-x64-musl/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tokenizers-linux-x64-musl",
 3 |   "version": "0.13.4-rc1",
 4 |   "os": [
 5 |     "linux"
 6 |   ],
 7 |   "cpu": [
 8 |     "x64"
 9 |   ],
10 |   "main": "tokenizers.linux-x64-musl.node",
11 |   "files": [
12 |     "tokenizers.linux-x64-musl.node"
13 |   ],
14 |   "description": "Tokenizers platform specific bindings",
15 |   "keywords": [
16 |     "napi-rs",
17 |     "NAPI",
18 |     "N-API",
19 |     "Rust",
20 |     "node-addon",
21 |     "node-addon-api"
22 |   ],
23 |   "license": "MIT",
24 |   "engines": {
25 |     "node": ">= 10"
26 |   },
27 |   "publishConfig": {
28 |     "registry": "https://registry.npmjs.org/",
29 |     "access": "public"
30 |   },
31 |   "repository": "tokenizers",
32 |   "libc": [
33 |     "musl"
34 |   ]
35 | }


--------------------------------------------------------------------------------
/bindings/node/npm/win32-arm64-msvc/README.md:
--------------------------------------------------------------------------------
1 | # `tokenizers-win32-arm64-msvc`
2 | 
3 | This is the **aarch64-pc-windows-msvc** binary for `tokenizers`
4 | 


--------------------------------------------------------------------------------
/bindings/node/npm/win32-arm64-msvc/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tokenizers-win32-arm64-msvc",
 3 |   "version": "0.13.4-rc1",
 4 |   "os": [
 5 |     "win32"
 6 |   ],
 7 |   "cpu": [
 8 |     "arm64"
 9 |   ],
10 |   "main": "tokenizers.win32-arm64-msvc.node",
11 |   "files": [
12 |     "tokenizers.win32-arm64-msvc.node"
13 |   ],
14 |   "description": "Tokenizers platform specific bindings",
15 |   "keywords": [
16 |     "napi-rs",
17 |     "NAPI",
18 |     "N-API",
19 |     "Rust",
20 |     "node-addon",
21 |     "node-addon-api"
22 |   ],
23 |   "license": "MIT",
24 |   "engines": {
25 |     "node": ">= 10"
26 |   },
27 |   "publishConfig": {
28 |     "registry": "https://registry.npmjs.org/",
29 |     "access": "public"
30 |   },
31 |   "repository": "tokenizers"
32 | }


--------------------------------------------------------------------------------
/bindings/node/npm/win32-ia32-msvc/README.md:
--------------------------------------------------------------------------------
1 | # `tokenizers-win32-ia32-msvc`
2 | 
3 | This is the **i686-pc-windows-msvc** binary for `tokenizers`
4 | 


--------------------------------------------------------------------------------
/bindings/node/npm/win32-ia32-msvc/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tokenizers-win32-ia32-msvc",
 3 |   "version": "0.13.4-rc1",
 4 |   "os": [
 5 |     "win32"
 6 |   ],
 7 |   "cpu": [
 8 |     "ia32"
 9 |   ],
10 |   "main": "tokenizers.win32-ia32-msvc.node",
11 |   "files": [
12 |     "tokenizers.win32-ia32-msvc.node"
13 |   ],
14 |   "description": "Tokenizers platform specific bindings",
15 |   "keywords": [
16 |     "napi-rs",
17 |     "NAPI",
18 |     "N-API",
19 |     "Rust",
20 |     "node-addon",
21 |     "node-addon-api"
22 |   ],
23 |   "license": "MIT",
24 |   "engines": {
25 |     "node": ">= 10"
26 |   },
27 |   "publishConfig": {
28 |     "registry": "https://registry.npmjs.org/",
29 |     "access": "public"
30 |   },
31 |   "repository": "tokenizers"
32 | }


--------------------------------------------------------------------------------
/bindings/node/npm/win32-x64-msvc/README.md:
--------------------------------------------------------------------------------
1 | # `tokenizers-win32-x64-msvc`
2 | 
3 | This is the **x86_64-pc-windows-msvc** binary for `tokenizers`
4 | 


--------------------------------------------------------------------------------
/bindings/node/npm/win32-x64-msvc/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tokenizers-win32-x64-msvc",
 3 |   "version": "0.13.4-rc1",
 4 |   "os": [
 5 |     "win32"
 6 |   ],
 7 |   "cpu": [
 8 |     "x64"
 9 |   ],
10 |   "main": "tokenizers.win32-x64-msvc.node",
11 |   "files": [
12 |     "tokenizers.win32-x64-msvc.node"
13 |   ],
14 |   "description": "Tokenizers platform specific bindings",
15 |   "keywords": [
16 |     "napi-rs",
17 |     "NAPI",
18 |     "N-API",
19 |     "Rust",
20 |     "node-addon",
21 |     "node-addon-api"
22 |   ],
23 |   "license": "MIT",
24 |   "engines": {
25 |     "node": ">= 10"
26 |   },
27 |   "publishConfig": {
28 |     "registry": "https://registry.npmjs.org/",
29 |     "access": "public"
30 |   },
31 |   "repository": "tokenizers"
32 | }


--------------------------------------------------------------------------------
/bindings/node/rustfmt.toml:
--------------------------------------------------------------------------------
1 | tab_spaces = 2
2 | 


--------------------------------------------------------------------------------
/bindings/node/src/arc_rwlock_serde.rs:
--------------------------------------------------------------------------------
 1 | use serde::de::Deserializer;
 2 | use serde::ser::Serializer;
 3 | use serde::{Deserialize, Serialize};
 4 | use std::sync::{Arc, RwLock};
 5 | 
 6 | pub fn serialize<S, T>(val: &Option<Arc<RwLock<T>>>, s: S) -> Result<S::Ok, S::Error>
 7 | where
 8 |   S: Serializer,
 9 |   T: Serialize,
10 | {
11 |   T::serialize(&*(val.clone().unwrap()).read().unwrap(), s)
12 | }
13 | 
14 | pub fn deserialize<'de, D, T>(d: D) -> Result<Option<Arc<RwLock<T>>>, D::Error>
15 | where
16 |   D: Deserializer<'de>,
17 |   T: Deserialize<'de>,
18 | {
19 |   Ok(Some(Arc::new(RwLock::new(T::deserialize(d)?))))
20 | }
21 | 


--------------------------------------------------------------------------------
/bindings/node/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![deny(clippy::all)]
 2 | 
 3 | pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 4 | 
 5 | mod arc_rwlock_serde;
 6 | pub mod decoders;
 7 | pub mod encoding;
 8 | pub mod models;
 9 | pub mod normalizers;
10 | pub mod pre_tokenizers;
11 | pub mod processors;
12 | pub mod tasks;
13 | pub mod tokenizer;
14 | pub mod trainers;
15 | pub mod utils;
16 | 


--------------------------------------------------------------------------------
/bindings/node/src/tasks/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod models;
2 | pub mod tokenizer;
3 | 


--------------------------------------------------------------------------------
/bindings/node/src/tasks/models.rs:
--------------------------------------------------------------------------------
 1 | extern crate tokenizers as tk;
 2 | 
 3 | use crate::models::Model;
 4 | use napi::bindgen_prelude::*;
 5 | use std::sync::{Arc, RwLock};
 6 | use tokenizers::models::bpe::{BpeBuilder, BPE};
 7 | use tokenizers::models::wordlevel::{WordLevel, WordLevelBuilder};
 8 | use tokenizers::models::wordpiece::{WordPiece, WordPieceBuilder};
 9 | 
10 | pub struct BPEFromFilesTask {
11 |   pub(crate) builder: Option<BpeBuilder>,
12 | }
13 | 
14 | impl Task for BPEFromFilesTask {
15 |   type Output = BPE;
16 |   type JsValue = Model;
17 | 
18 |   fn compute(&mut self) -> Result<Self::Output> {
19 |     self
20 |       .builder
21 |       .take()
22 |       .ok_or(Error::from_reason("Empty builder".to_string()))?
23 |       .build()
24 |       .map_err(|e| Error::from_reason(format!("{e}")))
25 |   }
26 | 
27 |   fn resolve(&mut self, _env: Env, output: Self::Output) -> Result<Self::JsValue> {
28 |     Ok(Model {
29 |       model: Some(Arc::new(RwLock::new(output.into()))),
30 |     })
31 |   }
32 | }
33 | 
34 | pub struct WordPieceFromFilesTask {
35 |   pub(crate) builder: Option<WordPieceBuilder>,
36 | }
37 | 
38 | impl Task for WordPieceFromFilesTask {
39 |   type Output = WordPiece;
40 |   type JsValue = Model;
41 | 
42 |   fn compute(&mut self) -> Result<Self::Output> {
43 |     self
44 |       .builder
45 |       .take()
46 |       .ok_or(Error::from_reason("Empty builder".to_string()))?
47 |       .build()
48 |       .map_err(|e| Error::from_reason(format!("{e}")))
49 |   }
50 | 
51 |   fn resolve(&mut self, _env: Env, output: Self::Output) -> Result<Self::JsValue> {
52 |     Ok(Model {
53 |       model: Some(Arc::new(RwLock::new(output.into()))),
54 |     })
55 |   }
56 | }
57 | pub struct WordLevelFromFilesTask {
58 |   pub(crate) builder: Option<WordLevelBuilder>,
59 | }
60 | 
61 | impl Task for WordLevelFromFilesTask {
62 |   type Output = WordLevel;
63 |   type JsValue = Model;
64 | 
65 |   fn compute(&mut self) -> Result<Self::Output> {
66 |     self
67 |       .builder
68 |       .take()
69 |       .ok_or(Error::from_reason("Empty builder".to_string()))?
70 |       .build()
71 |       .map_err(|e| Error::from_reason(format!("{e}")))
72 |   }
73 | 
74 |   fn resolve(&mut self, _env: Env, output: Self::Output) -> Result<Self::JsValue> {
75 |     Ok(Model {
76 |       model: Some(Arc::new(RwLock::new(output.into()))),
77 |     })
78 |   }
79 | }
80 | 


--------------------------------------------------------------------------------
/bindings/node/src/trainers.rs:
--------------------------------------------------------------------------------
 1 | use crate::models::Model;
 2 | use napi_derive::napi;
 3 | use std::sync::{Arc, RwLock};
 4 | use tokenizers as tk;
 5 | use tokenizers::models::TrainerWrapper;
 6 | 
 7 | #[napi]
 8 | pub struct Trainer {
 9 |   trainer: Option<Arc<RwLock<TrainerWrapper>>>,
10 | }
11 | 
12 | impl From<TrainerWrapper> for Trainer {
13 |   fn from(trainer: TrainerWrapper) -> Self {
14 |     Self {
15 |       trainer: Some(Arc::new(RwLock::new(trainer))),
16 |     }
17 |   }
18 | }
19 | 
20 | impl tk::Trainer for Trainer {
21 |   type Model = Model;
22 | 
23 |   fn should_show_progress(&self) -> bool {
24 |     self
25 |       .trainer
26 |       .as_ref()
27 |       .expect("Uninitialized Trainer")
28 |       .read()
29 |       .unwrap()
30 |       .should_show_progress()
31 |   }
32 | 
33 |   fn train(&self, model: &mut Self::Model) -> tk::Result<Vec<tk::AddedToken>> {
34 |     let special_tokens = self
35 |       .trainer
36 |       .as_ref()
37 |       .ok_or("Uninitialized Trainer")?
38 |       .read()
39 |       .unwrap()
40 |       .train(
41 |         &mut model
42 |           .model
43 |           .as_ref()
44 |           .ok_or("Uninitialized Model")?
45 |           .write()
46 |           .unwrap(),
47 |       )?;
48 | 
49 |     Ok(special_tokens)
50 |   }
51 | 
52 |   fn feed<I, S, F>(&mut self, iterator: I, process: F) -> tk::Result<()>
53 |   where
54 |     I: Iterator<Item = S> + Send,
55 |     S: AsRef<str> + Send,
56 |     F: Fn(&str) -> tk::Result<Vec<String>> + Sync,
57 |   {
58 |     self
59 |       .trainer
60 |       .as_ref()
61 |       .ok_or("Uninitialized Trainer")?
62 |       .write()
63 |       .unwrap()
64 |       .feed(iterator, process)
65 |   }
66 | }
67 | 


--------------------------------------------------------------------------------
/bindings/node/src/utils.rs:
--------------------------------------------------------------------------------
 1 | use napi::bindgen_prelude::*;
 2 | use napi_derive::napi;
 3 | use tokenizers as tk;
 4 | use tokenizers::Encoding;
 5 | 
 6 | use crate::encoding::JsEncoding;
 7 | 
 8 | #[napi]
 9 | pub fn slice(s: String, begin_index: Option<i32>, end_index: Option<i32>) -> Result<String> {
10 |   let len = s.chars().count();
11 | 
12 |   let get_index = |x: i32| -> usize {
13 |     if x >= 0 {
14 |       x as usize
15 |     } else {
16 |       (len as i32 + x) as usize
17 |     }
18 |   };
19 | 
20 |   let begin_index = get_index(begin_index.unwrap_or(0));
21 |   let end_index = get_index(end_index.unwrap_or(len as i32));
22 | 
23 |   if let Some(slice) = tk::tokenizer::normalizer::get_range_of(&s, begin_index..end_index) {
24 |     Ok(slice.to_string())
25 |   } else {
26 |     Err(Error::new(
27 |       Status::GenericFailure,
28 |       "Error in offsets".to_string(),
29 |     ))
30 |   }
31 | }
32 | 
33 | #[napi]
34 | pub fn merge_encodings(
35 |   encodings: Vec<&JsEncoding>,
36 |   growing_offsets: Option<bool>,
37 | ) -> Result<JsEncoding> {
38 |   let growing_offsets = growing_offsets.unwrap_or(false);
39 | 
40 |   let encodings: Vec<_> = encodings
41 |     .into_iter()
42 |     .map(|enc| enc.encoding.to_owned().unwrap())
43 |     .collect();
44 | 
45 |   let new_encoding = Encoding::merge(encodings, growing_offsets);
46 |   let js_encoding = JsEncoding {
47 |     encoding: Some(new_encoding),
48 |   };
49 | 
50 |   Ok(js_encoding)
51 | }
52 | 


--------------------------------------------------------------------------------
/bindings/node/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "target": "ES2018",
 4 |     "strict": true,
 5 |     "moduleResolution": "node",
 6 |     "module": "CommonJS",
 7 |     "noUnusedLocals": true,
 8 |     "noUnusedParameters": true,
 9 |     "esModuleInterop": true,
10 |     "allowSyntheticDefaultImports": true
11 |   },
12 |   "include": ["."],
13 |   "exclude": ["node_modules"]
14 | }
15 | 


--------------------------------------------------------------------------------
/bindings/node/types.ts:
--------------------------------------------------------------------------------
1 | export type TextInputSequence = string
2 | export type PreTokenizedInputSequence = string[]
3 | export type InputSequence = TextInputSequence | PreTokenizedInputSequence
4 | 
5 | export type TextEncodeInput = TextInputSequence | [TextInputSequence, TextInputSequence]
6 | export type PreTokenizedEncodeInput = PreTokenizedInputSequence | [PreTokenizedInputSequence, PreTokenizedInputSequence]
7 | export type EncodeInput = TextEncodeInput | PreTokenizedEncodeInput
8 | 


--------------------------------------------------------------------------------
/bindings/python/.cargo/config.toml:
--------------------------------------------------------------------------------
 1 | [target.x86_64-apple-darwin]
 2 | rustflags = [
 3 |   "-C", "link-arg=-undefined",
 4 |   "-C", "link-arg=dynamic_lookup",
 5 |   "-C", "link-arg=-mmacosx-version-min=10.11",
 6 | ]
 7 | 
 8 | [target.aarch64-apple-darwin]
 9 | rustflags = [
10 |   "-C", "link-arg=-undefined",
11 |   "-C", "link-arg=dynamic_lookup",
12 |   "-C", "link-arg=-mmacosx-version-min=10.11",
13 | ]
14 | 


--------------------------------------------------------------------------------
/bindings/python/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | 


--------------------------------------------------------------------------------
/bindings/python/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "tokenizers-python"
 3 | version = "0.21.4-dev.0"
 4 | authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 5 | edition = "2021"
 6 | 
 7 | [lib]
 8 | name = "tokenizers"
 9 | crate-type = ["cdylib"]
10 | 
11 | [dependencies]
12 | rayon = "1.10"
13 | serde = { version = "1.0", features = ["rc", "derive"] }
14 | serde_json = "1.0"
15 | libc = "0.2"
16 | env_logger = "0.11"
17 | pyo3 = { version = "0.25", features = ["abi3", "abi3-py39", "py-clone"] }
18 | numpy = "0.25"
19 | ndarray = "0.16"
20 | itertools = "0.14"
21 | ahash = { version = "0.8.11", features = ["serde"] }
22 | 
23 | [dependencies.tokenizers]
24 | path = "../../tokenizers"
25 | 
26 | [dev-dependencies]
27 | tempfile = "3.10"
28 | pyo3 = { version = "0.25", features = ["auto-initialize"] }
29 | 
30 | [features]
31 | default = ["pyo3/extension-module"]
32 | 


--------------------------------------------------------------------------------
/bindings/python/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include Cargo.toml
2 | include pyproject.toml
3 | include rust-toolchain
4 | include ../../LICENSE
5 | recursive-include src *
6 | recursive-include tokenizers-lib *
7 | recursive-exclude tokenizers-lib/target *
8 | 


--------------------------------------------------------------------------------
/bindings/python/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: style check-style test
 2 | 
 3 | DATA_DIR = data
 4 | 
 5 | dir_guard=@mkdir -p $(@D)
 6 | check_dirs := examples py_src/tokenizers tests
 7 | 
 8 | # Format source code automatically
 9 | style:
10 | 	python stub.py
11 | 	ruff check $(check_dirs) --fix
12 | 	ruff format $(check_dirs)
13 | 
14 | # Check the source code is formatted correctly
15 | check-style:
16 | 	python stub.py --check
17 | 	ruff check $(check_dirs)
18 | 	ruff format --check $(check_dirs)
19 | 
20 | TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
21 | 
22 | # Launch the test suite
23 | test: $(TESTS_RESOURCES)
24 | 	pip install pytest requests setuptools_rust numpy pyarrow datasets
25 | 	python -m pytest -s -v tests
26 | 	cargo test --no-default-features
27 | 
28 | $(DATA_DIR)/big.txt :
29 | 	$(dir_guard)
30 | 	wget https://norvig.com/big.txt -O $@
31 | 
32 | $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
33 | 	head -100 $(DATA_DIR)/big.txt > $@
34 | 
35 | $(DATA_DIR)/roberta.json :
36 | 	$(dir_guard)
37 | 	wget https://huggingface.co/roberta-large/raw/main/tokenizer.json -O $@
38 | 


--------------------------------------------------------------------------------
/bindings/python/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | 
 4 | def pytest_addoption(parser):
 5 |     parser.addoption("--runslow", action="store_true", default=False, help="run slow tests")
 6 | 
 7 | 
 8 | def pytest_configure(config):
 9 |     config.addinivalue_line("markers", "slow: mark test as slow to run")
10 | 
11 | 
12 | def pytest_collection_modifyitems(config, items):
13 |     if config.getoption("--runslow"):
14 |         # --runslow given in cli: do not skip slow tests
15 |         return
16 |     skip_slow = pytest.mark.skip(reason="need --runslow option to run")
17 |     for item in items:
18 |         if "slow" in item.keywords:
19 |             item.add_marker(skip_slow)
20 | 
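
Note (illustrative, not part of the repository file above): the conftest registers a `slow` marker and skips such tests unless `--runslow` is passed. A minimal sketch of how a test opts in:

import pytest


@pytest.mark.slow
def test_train_on_large_corpus():
    # Collected but skipped by default; runs only with `pytest --runslow`.
    assert True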


--------------------------------------------------------------------------------
/bindings/python/examples/train_bert_wordpiece.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import glob
 3 | 
 4 | from tokenizers import BertWordPieceTokenizer
 5 | 
 6 | 
 7 | parser = argparse.ArgumentParser()
 8 | parser.add_argument(
 9 |     "--files",
10 |     default=None,
11 |     metavar="path",
12 |     type=str,
13 |     required=True,
14 |     help="The files to use as training; accept '**/*.txt' type of patterns \
15 |                           if enclosed in quotes",
16 | )
17 | parser.add_argument(
18 |     "--out",
19 |     default="./",
20 |     type=str,
21 |     help="Path to the output directory, where the files will be saved",
22 | )
23 | parser.add_argument("--name", default="bert-wordpiece", type=str, help="The name of the output vocab files")
24 | args = parser.parse_args()
25 | 
26 | files = glob.glob(args.files)
27 | if not files:
28 |     print(f"File does not exist: {args.files}")
29 |     exit(1)
30 | 
31 | 
32 | # Initialize an empty tokenizer
33 | tokenizer = BertWordPieceTokenizer(
34 |     clean_text=True,
35 |     handle_chinese_chars=True,
36 |     strip_accents=True,
37 |     lowercase=True,
38 | )
39 | 
40 | # And then train
41 | tokenizer.train(
42 |     files,
43 |     vocab_size=10000,
44 |     min_frequency=2,
45 |     show_progress=True,
46 |     special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
47 |     limit_alphabet=1000,
48 |     wordpieces_prefix="##",
49 | )
50 | 
51 | # Save the files
52 | tokenizer.save_model(args.out, args.name)
53 | 


--------------------------------------------------------------------------------
/bindings/python/examples/train_bytelevel_bpe.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import glob
 3 | from os.path import join
 4 | 
 5 | from tokenizers import ByteLevelBPETokenizer
 6 | 
 7 | 
 8 | parser = argparse.ArgumentParser()
 9 | parser.add_argument(
10 |     "--files",
11 |     default=None,
12 |     metavar="path",
13 |     type=str,
14 |     required=True,
15 |     help="The files to use as training; accept '**/*.txt' type of patterns \
16 |                           if enclosed in quotes",
17 | )
18 | parser.add_argument(
19 |     "--out",
20 |     default="./",
21 |     type=str,
22 |     help="Path to the output directory, where the files will be saved",
23 | )
24 | parser.add_argument("--name", default="bpe-bytelevel", type=str, help="The name of the output vocab files")
25 | args = parser.parse_args()
26 | 
27 | files = glob.glob(args.files)
28 | if not files:
29 |     print(f"File does not exist: {args.files}")
30 |     exit(1)
31 | 
32 | 
33 | # Initialize an empty tokenizer
34 | tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
35 | 
36 | # And then train
37 | tokenizer.train(
38 |     files,
39 |     vocab_size=10000,
40 |     min_frequency=2,
41 |     show_progress=True,
42 |     special_tokens=["<s>", "<pad>", "</s>"],
43 | )
44 | 
45 | # Save the files
46 | tokenizer.save_model(args.out, args.name)
47 | 
48 | # Restoring model from learned vocab/merges
49 | tokenizer = ByteLevelBPETokenizer(
50 |     join(args.out, "{}-vocab.json".format(args.name)),
51 |     join(args.out, "{}-merges.txt".format(args.name)),
52 |     add_prefix_space=True,
53 | )
54 | 
55 | # Test encoding
56 | print(tokenizer.encode("Training ByteLevel BPE is very easy").tokens)
57 | 


--------------------------------------------------------------------------------
/bindings/python/examples/train_with_datasets.py:
--------------------------------------------------------------------------------
 1 | import datasets
 2 | 
 3 | from tokenizers import Tokenizer, models, normalizers, pre_tokenizers
 4 | 
 5 | 
 6 | # Build a tokenizer
 7 | bpe_tokenizer = Tokenizer(models.BPE())
 8 | bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
 9 | bpe_tokenizer.normalizer = normalizers.Lowercase()
10 | 
11 | # Initialize a dataset
12 | dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
13 | 
14 | 
15 | # Build an iterator over this dataset
16 | def batch_iterator():
17 |     batch_size = 1000
18 |     for batch in dataset.iter(batch_size=batch_size):
19 |         yield batch["text"]
20 | 
21 | 
22 | # And finally train
23 | bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset))
24 | 
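
As a hedged follow-up to the script above (not part of the repository file): once training finishes, the tokenizer can be serialized and reloaded. The file name below is an assumption, not something the example defines.

# Continues the script above: persist the trained tokenizer and reload it.
bpe_tokenizer.save("bpe-wikitext.json")  # illustrative file name
reloaded = Tokenizer.from_file("bpe-wikitext.json")
print(reloaded.encode("hello world").tokens)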


--------------------------------------------------------------------------------
/bindings/python/py_src/tokenizers/__init__.py:
--------------------------------------------------------------------------------
  1 | from enum import Enum
  2 | from typing import List, Tuple, Union
  3 | 
  4 | 
  5 | Offsets = Tuple[int, int]
  6 | 
  7 | TextInputSequence = str
  8 | """A :obj:`str` that represents an input sequence """
  9 | 
 10 | PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 11 | """A pre-tokenized input sequence. Can be one of:
 12 | 
 13 |     - A :obj:`List` of :obj:`str`
 14 |     - A :obj:`Tuple` of :obj:`str`
 15 | """
 16 | 
 17 | TextEncodeInput = Union[
 18 |     TextInputSequence,
 19 |     Tuple[TextInputSequence, TextInputSequence],
 20 |     List[TextInputSequence],
 21 | ]
 22 | """Represents a textual input for encoding. Can be either:
 23 | 
 24 |     - A single sequence: :data:`~tokenizers.TextInputSequence`
 25 |     - A pair of sequences:
 26 | 
 27 |       - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
 28 |       - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
 29 | """
 30 | 
 31 | PreTokenizedEncodeInput = Union[
 32 |     PreTokenizedInputSequence,
 33 |     Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 34 |     List[PreTokenizedInputSequence],
 35 | ]
 36 | """Represents a pre-tokenized input for encoding. Can be either:
 37 | 
 38 |     - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
 39 |     - A pair of sequences:
 40 | 
 41 |       - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
 42 |       - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
 43 | """
 44 | 
 45 | InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
 46 | """Represents all the possible types of input sequences for encoding. Can be:
 47 | 
 48 |     - When ``is_pretokenized=False``: :data:`~TextInputSequence`
 49 |     - When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
 50 | """
 51 | 
 52 | EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
 53 | """Represents all the possible types of input for encoding. Can be:
 54 | 
 55 |     - When ``is_pretokenized=False``: :data:`~TextEncodeInput`
 56 |     - When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
 57 | """
 58 | 
 59 | 
 60 | class OffsetReferential(Enum):
 61 |     ORIGINAL = "original"
 62 |     NORMALIZED = "normalized"
 63 | 
 64 | 
 65 | class OffsetType(Enum):
 66 |     BYTE = "byte"
 67 |     CHAR = "char"
 68 | 
 69 | 
 70 | class SplitDelimiterBehavior(Enum):
 71 |     REMOVED = "removed"
 72 |     ISOLATED = "isolated"
 73 |     MERGED_WITH_PREVIOUS = "merged_with_previous"
 74 |     MERGED_WITH_NEXT = "merged_with_next"
 75 |     CONTIGUOUS = "contiguous"
 76 | 
 77 | 
 78 | from .tokenizers import (
 79 |     AddedToken,
 80 |     Encoding,
 81 |     NormalizedString,
 82 |     PreTokenizedString,
 83 |     Regex,
 84 |     Token,
 85 |     Tokenizer,
 86 |     decoders,
 87 |     models,
 88 |     normalizers,
 89 |     pre_tokenizers,
 90 |     processors,
 91 |     trainers,
 92 |     __version__,
 93 | )
 94 | from .implementations import (
 95 |     BertWordPieceTokenizer,
 96 |     ByteLevelBPETokenizer,
 97 |     CharBPETokenizer,
 98 |     SentencePieceBPETokenizer,
 99 |     SentencePieceUnigramTokenizer,
100 | )
101 | 
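
A minimal sketch (illustrative, not part of the file above) of how the input-type aliases map onto `Tokenizer.encode`; the tiny tokenizer built here is an assumption for the sake of a runnable example:

from tokenizers import Tokenizer, models, pre_tokenizers

tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.add_tokens(["hello", "world"])

# TextInputSequence: a plain str
print(tokenizer.encode("hello world").tokens)
# PreTokenizedInputSequence: a list of str, flagged with is_pretokenized=True
print(tokenizer.encode(["hello", "world"], is_pretokenized=True).tokens)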


--------------------------------------------------------------------------------
/bindings/python/py_src/tokenizers/decoders/__init__.py:
--------------------------------------------------------------------------------
 1 | from .. import decoders
 2 | 
 3 | 
 4 | Decoder = decoders.Decoder
 5 | ByteLevel = decoders.ByteLevel
 6 | Replace = decoders.Replace
 7 | WordPiece = decoders.WordPiece
 8 | ByteFallback = decoders.ByteFallback
 9 | Fuse = decoders.Fuse
10 | Strip = decoders.Strip
11 | Metaspace = decoders.Metaspace
12 | BPEDecoder = decoders.BPEDecoder
13 | CTC = decoders.CTC
14 | Sequence = decoders.Sequence
15 | DecodeStream = decoders.DecodeStream
16 | 
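
For reference, a small illustrative use of one of the decoders re-exported above (not part of the file itself):

from tokenizers.decoders import WordPiece

# Join WordPiece pieces back into text, stripping the "##" continuation prefix.
decoder = WordPiece(prefix="##")
print(decoder.decode(["to", "##ken", "##izer"]))  # -> "tokenizer"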


--------------------------------------------------------------------------------
/bindings/python/py_src/tokenizers/implementations/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_tokenizer import BaseTokenizer
2 | from .bert_wordpiece import BertWordPieceTokenizer
3 | from .byte_level_bpe import ByteLevelBPETokenizer
4 | from .char_level_bpe import CharBPETokenizer
5 | from .sentencepiece_bpe import SentencePieceBPETokenizer
6 | from .sentencepiece_unigram import SentencePieceUnigramTokenizer
7 | 


--------------------------------------------------------------------------------
/bindings/python/py_src/tokenizers/models/__init__.py:
--------------------------------------------------------------------------------
1 | # Generated content DO NOT EDIT
2 | from .. import models
3 | 
4 | Model = models.Model
5 | BPE = models.BPE
6 | Unigram = models.Unigram
7 | WordLevel = models.WordLevel
8 | WordPiece = models.WordPiece
9 | 


--------------------------------------------------------------------------------
/bindings/python/py_src/tokenizers/normalizers/__init__.py:
--------------------------------------------------------------------------------
 1 | from .. import normalizers
 2 | 
 3 | 
 4 | Normalizer = normalizers.Normalizer
 5 | BertNormalizer = normalizers.BertNormalizer
 6 | NFD = normalizers.NFD
 7 | NFKD = normalizers.NFKD
 8 | NFC = normalizers.NFC
 9 | NFKC = normalizers.NFKC
10 | Sequence = normalizers.Sequence
11 | Lowercase = normalizers.Lowercase
12 | Prepend = normalizers.Prepend
13 | Strip = normalizers.Strip
14 | StripAccents = normalizers.StripAccents
15 | Nmt = normalizers.Nmt
16 | Precompiled = normalizers.Precompiled
17 | Replace = normalizers.Replace
18 | ByteLevel = normalizers.ByteLevel
19 | 
20 | NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
21 | 
22 | 
23 | def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
24 |     if normalizer not in NORMALIZERS:
25 |         raise ValueError(
26 |             "{} is not a known unicode normalizer. Available are {}".format(normalizer, NORMALIZERS.keys())
27 |         )
28 | 
29 |     return NORMALIZERS[normalizer]()
30 | 
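
An illustrative call to the helper defined above (not part of the file itself):

from tokenizers.normalizers import unicode_normalizer_from_str

normalizer = unicode_normalizer_from_str("nfkc")
# NFKC folds compatibility characters, e.g. the "ﬁ" ligature becomes "fi".
print(normalizer.normalize_str("ﬁne tuning"))  # -> "fine tuning"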


--------------------------------------------------------------------------------
/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py:
--------------------------------------------------------------------------------
 1 | # Generated content DO NOT EDIT
 2 | from .. import pre_tokenizers
 3 | 
 4 | PreTokenizer = pre_tokenizers.PreTokenizer
 5 | BertPreTokenizer = pre_tokenizers.BertPreTokenizer
 6 | ByteLevel = pre_tokenizers.ByteLevel
 7 | CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
 8 | Digits = pre_tokenizers.Digits
 9 | FixedLength = pre_tokenizers.FixedLength
10 | Metaspace = pre_tokenizers.Metaspace
11 | Punctuation = pre_tokenizers.Punctuation
12 | Sequence = pre_tokenizers.Sequence
13 | Split = pre_tokenizers.Split
14 | UnicodeScripts = pre_tokenizers.UnicodeScripts
15 | Whitespace = pre_tokenizers.Whitespace
16 | WhitespaceSplit = pre_tokenizers.WhitespaceSplit
17 | 
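
An illustrative look (not part of the file above) at what a pre-tokenizer produces before the model runs:

from tokenizers.pre_tokenizers import Whitespace

# Returns (piece, (start, end)) offsets into the original string.
print(Whitespace().pre_tokenize_str("Hello, world!"))
# [('Hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13))]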


--------------------------------------------------------------------------------
/bindings/python/py_src/tokenizers/processors/__init__.py:
--------------------------------------------------------------------------------
 1 | # Generated content DO NOT EDIT
 2 | from .. import processors
 3 | 
 4 | PostProcessor = processors.PostProcessor
 5 | BertProcessing = processors.BertProcessing
 6 | ByteLevel = processors.ByteLevel
 7 | RobertaProcessing = processors.RobertaProcessing
 8 | Sequence = processors.Sequence
 9 | TemplateProcessing = processors.TemplateProcessing
10 | 
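
A minimal sketch (illustrative, not part of the file above) of `TemplateProcessing`, the most configurable post-processor listed here; the special-token ids are assumptions:

from tokenizers.processors import TemplateProcessing

post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 101), ("[SEP]", 102)],  # ids are illustrative
)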


--------------------------------------------------------------------------------
/bindings/python/py_src/tokenizers/tools/__init__.py:
--------------------------------------------------------------------------------
1 | from .visualizer import Annotation, EncodingVisualizer
2 | 


--------------------------------------------------------------------------------
/bindings/python/py_src/tokenizers/trainers/__init__.py:
--------------------------------------------------------------------------------
1 | # Generated content DO NOT EDIT
2 | from .. import trainers
3 | 
4 | Trainer = trainers.Trainer
5 | BpeTrainer = trainers.BpeTrainer
6 | UnigramTrainer = trainers.UnigramTrainer
7 | WordLevelTrainer = trainers.WordLevelTrainer
8 | WordPieceTrainer = trainers.WordPieceTrainer
9 | 


--------------------------------------------------------------------------------
/bindings/python/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "tokenizers"
 3 | requires-python = ">=3.9"
 4 | authors = [
 5 |   { name = "Nicolas Patry", email = "patry.nicolas@protonmail.com" },
 6 |   { name = "Anthony Moi", email = "anthony@huggingface.co" },
 7 | ]
 8 | classifiers = [
 9 |   "Development Status :: 5 - Production/Stable",
10 |   "Intended Audience :: Developers",
11 |   "Intended Audience :: Education",
12 |   "Intended Audience :: Science/Research",
13 |   "License :: OSI Approved :: Apache Software License",
14 |   "Operating System :: OS Independent",
15 |   "Programming Language :: Python :: 3",
16 |   "Programming Language :: Python :: 3.9",
17 |   "Programming Language :: Python :: 3.10",
18 |   "Programming Language :: Python :: 3.11",
19 |   "Programming Language :: Python :: 3.12",
20 |   "Programming Language :: Python :: 3.13",
21 |   "Programming Language :: Python :: 3 :: Only",
22 |   "Topic :: Scientific/Engineering :: Artificial Intelligence",
23 | ]
24 | keywords = ["NLP", "tokenizer", "BPE", "transformer", "deep learning"]
25 | dynamic = ["description", "license", "readme", "version"]
26 | dependencies = ["huggingface_hub>=0.16.4,<1.0"]
27 | 
28 | [project.urls]
29 | Homepage = "https://github.com/huggingface/tokenizers"
30 | Source = "https://github.com/huggingface/tokenizers"
31 | 
32 | 
33 | [project.optional-dependencies]
34 | testing = ["pytest", "requests", "numpy", "datasets", "black==22.3", "ruff"]
35 | docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
36 | dev = ["tokenizers[testing]"]
37 | 
38 | 
39 | [build-system]
40 | requires = ["maturin>=1.0,<2.0"]
41 | build-backend = "maturin"
42 | 
43 | [tool.maturin]
44 | python-source = "py_src"
45 | module-name = "tokenizers.tokenizers"
46 | bindings = "pyo3"
47 | features = ["pyo3/extension-module"]
48 | 
49 | [tool.black]
50 | line-length = 119
51 | target-version = ["py35"]
52 | 
53 | [tool.ruff]
54 | line-length = 119
55 | target-version = "py311"
56 | lint.ignore = [
57 |   # a == None in tests vs is None.
58 |   "E711",
59 |   # a == False in tests vs is False.
60 |   "E712",
61 |   # try.. import except.. pattern without using the lib.
62 |   "F401",
63 |   # Raw type equality is required in asserts
64 |   "E721",
65 |   # Import order
66 |   "E402",
67 |   # Fixtures unused import
68 |   "F811",
69 | ]
70 | 


--------------------------------------------------------------------------------
/bindings/python/rust-toolchain:
--------------------------------------------------------------------------------
1 | stable
2 | 


--------------------------------------------------------------------------------
/bindings/python/setup.cfg:
--------------------------------------------------------------------------------
 1 | [isort]
 2 | default_section = FIRSTPARTY
 3 | ensure_newline_before_comments = True
 4 | force_grid_wrap = 0
 5 | include_trailing_comma = True
 6 | known_first_party = transformers
 7 | known_third_party =
 8 |     absl
 9 |     conllu
10 |     datasets
11 |     elasticsearch
12 |     fairseq
13 |     faiss-cpu
14 |     fastprogress
15 |     fire
16 |     fugashi
17 |     git
18 |     h5py
19 |     matplotlib
20 |     nltk
21 |     numpy
22 |     packaging
23 |     pandas
24 |     PIL
25 |     psutil
26 |     pytest
27 |     pytorch_lightning
28 |     rouge_score
29 |     sacrebleu
30 |     seqeval
31 |     sklearn
32 |     streamlit
33 |     tensorboardX
34 |     tensorflow
35 |     tensorflow_datasets
36 |     timeout_decorator
37 |     torch
38 |     torchaudio
39 |     torchtext
40 |     torchvision
41 |     torch_xla
42 |     tqdm
43 | 
44 | line_length = 119
45 | lines_after_imports = 2
46 | multi_line_output = 3
47 | use_parentheses = True
48 | 
49 | [flake8]
50 | ignore = E203, E501, E741, W503, W605
51 | max-line-length = 119
52 | 
53 | [tool:pytest]
54 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS
55 | 


--------------------------------------------------------------------------------
/bindings/python/src/error.rs:
--------------------------------------------------------------------------------
 1 | use pyo3::exceptions;
 2 | use pyo3::prelude::*;
 3 | use pyo3::type_object::PyTypeInfo;
 4 | use std::ffi::CString;
 5 | use std::fmt::{Display, Formatter, Result as FmtResult};
 6 | use tokenizers::tokenizer::Result;
 7 | 
 8 | #[derive(Debug)]
 9 | pub struct PyError(pub String);
10 | impl PyError {
11 |     #[allow(dead_code)]
12 |     pub fn from(s: &str) -> Self {
13 |         PyError(String::from(s))
14 |     }
15 |     pub fn into_pyerr<T: PyTypeInfo>(self) -> PyErr {
16 |         PyErr::new::<T, _>(format!("{self}"))
17 |     }
18 | }
19 | impl Display for PyError {
20 |     fn fmt(&self, fmt: &mut Formatter) -> FmtResult {
21 |         write!(fmt, "{}", self.0)
22 |     }
23 | }
24 | impl std::error::Error for PyError {}
25 | 
26 | pub struct ToPyResult<T>(pub Result<T>);
27 | impl<T> From<ToPyResult<T>> for PyResult<T> {
28 |     fn from(v: ToPyResult<T>) -> Self {
29 |         v.0.map_err(|e| exceptions::PyException::new_err(format!("{e}")))
30 |     }
31 | }
32 | impl<T> ToPyResult<T> {
33 |     pub fn into_py(self) -> PyResult<T> {
34 |         self.into()
35 |     }
36 | }
37 | 
38 | pub(crate) fn deprecation_warning(py: Python<'_>, version: &str, message: &str) -> PyResult<()> {
39 |     let deprecation_warning = py.import("builtins")?.getattr("DeprecationWarning")?;
40 |     let full_message = format!("Deprecated in {version}: {message}");
41 |     pyo3::PyErr::warn(py, &deprecation_warning, &CString::new(full_message)?, 0)
42 | }
43 | 


--------------------------------------------------------------------------------
/bindings/python/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![warn(clippy::all)]
 2 | #![allow(clippy::upper_case_acronyms)]
 3 | // Many false positives with pyo3 it seems &str, and &PyAny get flagged
 4 | #![allow(clippy::borrow_deref_ref)]
 5 | 
 6 | extern crate tokenizers as tk;
 7 | 
 8 | mod decoders;
 9 | mod encoding;
10 | mod error;
11 | mod models;
12 | mod normalizers;
13 | mod pre_tokenizers;
14 | mod processors;
15 | mod token;
16 | mod tokenizer;
17 | mod trainers;
18 | mod utils;
19 | 
20 | use pyo3::prelude::*;
21 | use pyo3::wrap_pymodule;
22 | 
23 | pub const VERSION: &str = env!("CARGO_PKG_VERSION");
24 | 
25 | // For users using multiprocessing in python, it is quite easy to fork the process running
26 | // tokenizers, ending up with a deadlock because we internally make use of multithreading. So
27 | // we register a callback to be called in the event of a fork so that we can warn the user.
28 | #[cfg(target_family = "unix")]
29 | static mut REGISTERED_FORK_CALLBACK: bool = false;
30 | #[cfg(target_family = "unix")]
31 | extern "C" fn child_after_fork() {
32 |     use tk::parallelism::*;
33 |     if has_parallelism_been_used() && !is_parallelism_configured() {
34 |         eprintln!(
35 |             "huggingface/tokenizers: The current process just got forked, after parallelism has \
36 |             already been used. Disabling parallelism to avoid deadlocks..."
37 |         );
38 |         eprintln!("To disable this warning, you can either:");
39 |         eprintln!(
40 |             "\t- Avoid using `tokenizers` before the fork if possible\n\
41 |             \t- Explicitly set the environment variable {ENV_VARIABLE}=(true | false)"
42 |         );
43 |         set_parallelism(false);
44 |     }
45 | }
46 | 
47 | /// Tokenizers Module
48 | #[pymodule]
49 | pub fn tokenizers(m: &Bound<'_, PyModule>) -> PyResult<()> {
50 |     let _ = env_logger::try_init_from_env("TOKENIZERS_LOG");
51 | 
52 |     // Register the fork callback
53 |     #[cfg(target_family = "unix")]
54 |     unsafe {
55 |         if !REGISTERED_FORK_CALLBACK {
56 |             libc::pthread_atfork(None, None, Some(child_after_fork));
57 |             REGISTERED_FORK_CALLBACK = true;
58 |         }
59 |     }
60 | 
61 |     m.add_class::<tokenizer::PyTokenizer>()?;
62 |     m.add_class::<tokenizer::PyAddedToken>()?;
63 |     m.add_class::<token::PyToken>()?;
64 |     m.add_class::<encoding::PyEncoding>()?;
65 |     m.add_class::<utils::PyRegex>()?;
66 |     m.add_class::<utils::PyNormalizedString>()?;
67 |     m.add_class::<utils::PyPreTokenizedString>()?;
68 |     m.add_wrapped(wrap_pymodule!(models::models))?;
69 |     m.add_wrapped(wrap_pymodule!(pre_tokenizers::pre_tokenizers))?;
70 |     m.add_wrapped(wrap_pymodule!(decoders::decoders))?;
71 |     m.add_wrapped(wrap_pymodule!(processors::processors))?;
72 |     m.add_wrapped(wrap_pymodule!(normalizers::normalizers))?;
73 |     m.add_wrapped(wrap_pymodule!(trainers::trainers))?;
74 |     m.add("__version__", env!("CARGO_PKG_VERSION"))?;
75 |     Ok(())
76 | }
77 | 


--------------------------------------------------------------------------------
/bindings/python/src/token.rs:
--------------------------------------------------------------------------------
 1 | use pyo3::prelude::*;
 2 | use tk::Token;
 3 | 
 4 | #[pyclass(module = "tokenizers", name = "Token")]
 5 | #[derive(Clone)]
 6 | pub struct PyToken {
 7 |     token: Token,
 8 | }
 9 | impl From<Token> for PyToken {
10 |     fn from(token: Token) -> Self {
11 |         Self { token }
12 |     }
13 | }
14 | impl From<PyToken> for Token {
15 |     fn from(token: PyToken) -> Self {
16 |         token.token
17 |     }
18 | }
19 | 
20 | #[pymethods]
21 | impl PyToken {
22 |     #[new]
23 |     #[pyo3(text_signature = None)]
24 |     fn new(id: u32, value: String, offsets: (usize, usize)) -> PyToken {
25 |         Token::new(id, value, offsets).into()
26 |     }
27 | 
28 |     #[getter]
29 |     fn get_id(&self) -> u32 {
30 |         self.token.id
31 |     }
32 | 
33 |     #[getter]
34 |     fn get_value(&self) -> &str {
35 |         &self.token.value
36 |     }
37 | 
38 |     #[getter]
39 |     fn get_offsets(&self) -> (usize, usize) {
40 |         self.token.offsets
41 |     }
42 | 
43 |     fn as_tuple(&self) -> (u32, &str, (usize, usize)) {
44 |         (self.token.id, &self.token.value, self.token.offsets)
45 |     }
46 | }
47 | 


--------------------------------------------------------------------------------
/bindings/python/src/utils/mod.rs:
--------------------------------------------------------------------------------
 1 | use std::marker::PhantomData;
 2 | use std::sync::{Arc, Mutex};
 3 | 
 4 | mod iterators;
 5 | mod normalization;
 6 | mod pretokenization;
 7 | mod regex;
 8 | pub mod serde_pyo3;
 9 | 
10 | pub use iterators::*;
11 | pub use normalization::*;
12 | pub use pretokenization::*;
13 | pub use regex::*;
14 | 
15 | // RefMut utils
16 | 
17 | pub trait DestroyPtr {
18 |     fn destroy(&mut self);
19 | }
20 | 
21 | pub struct RefMutGuard<'r, T: DestroyPtr> {
22 |     content: T,
23 |     r: PhantomData<&'r mut T>,
24 | }
25 | impl<T: DestroyPtr> RefMutGuard<'_, T> {
26 |     pub fn new(content: T) -> Self {
27 |         Self {
28 |             content,
29 |             r: PhantomData,
30 |         }
31 |     }
32 | 
33 |     pub fn get(&self) -> &T {
34 |         &self.content
35 |     }
36 | }
37 | 
38 | impl<T: DestroyPtr> Drop for RefMutGuard<'_, T> {
39 |     fn drop(&mut self) {
40 |         self.content.destroy()
41 |     }
42 | }
43 | 
44 | #[derive(Clone)]
45 | pub struct RefMutContainer<T> {
46 |     inner: Arc<Mutex<Option<*mut T>>>,
47 | }
48 | impl<T> RefMutContainer<T> {
49 |     pub fn new(content: &mut T) -> Self {
50 |         Self {
51 |             inner: Arc::new(Mutex::new(Some(content))),
52 |         }
53 |     }
54 | 
55 |     pub fn map<F: FnOnce(&T) -> U, U>(&self, f: F) -> Option<U> {
56 |         let lock = self.inner.lock().unwrap();
57 |         let ptr = lock.as_ref()?;
58 |         Some(f(unsafe { ptr.as_ref().unwrap() }))
59 |     }
60 | 
61 |     pub fn map_mut<F: FnOnce(&mut T) -> U, U>(&mut self, f: F) -> Option<U> {
62 |         let lock = self.inner.lock().unwrap();
63 |         let ptr = lock.as_ref()?;
64 |         Some(f(unsafe { ptr.as_mut().unwrap() }))
65 |     }
66 | }
67 | 
68 | impl<T> DestroyPtr for RefMutContainer<T> {
69 |     fn destroy(&mut self) {
70 |         self.inner.lock().unwrap().take();
71 |     }
72 | }
73 | 
74 | unsafe impl<T: Send> Send for RefMutContainer<T> {}
75 | unsafe impl<T: Sync> Sync for RefMutContainer<T> {}
76 | 


--------------------------------------------------------------------------------
/bindings/python/src/utils/regex.rs:
--------------------------------------------------------------------------------
 1 | use pyo3::exceptions;
 2 | use pyo3::prelude::*;
 3 | use tk::utils::SysRegex;
 4 | 
 5 | /// Instantiate a new Regex with the given pattern
 6 | #[pyclass(module = "tokenizers", name = "Regex")]
 7 | pub struct PyRegex {
 8 |     pub inner: SysRegex,
 9 |     pub pattern: String,
10 | }
11 | 
12 | #[pymethods]
13 | impl PyRegex {
14 |     #[new]
15 |     #[pyo3(text_signature = "(self, pattern)")]
16 |     fn new(s: &str) -> PyResult<Self> {
17 |         Ok(Self {
18 |             inner: SysRegex::new(s)
19 |                 .map_err(|e| exceptions::PyException::new_err(e.to_string().to_owned()))?,
20 |             pattern: s.to_owned(),
21 |         })
22 |     }
23 | }
24 | 


--------------------------------------------------------------------------------
/bindings/python/test.txt:
--------------------------------------------------------------------------------
 1 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
 2 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
 3 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
 4 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
 5 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
 6 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
 7 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
 8 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
 9 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
10 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
11 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
12 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
13 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
14 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
15 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
16 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
17 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
18 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
19 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
20 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
21 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
22 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
23 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
24 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
25 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
26 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
27 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
28 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
29 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
30 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
31 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
32 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
33 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
34 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
35 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
36 | <DOCUMENT> \test{bla} thisisatest </DOCUMENT>
37 | 


--------------------------------------------------------------------------------
/bindings/python/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/bindings/python/tests/__init__.py


--------------------------------------------------------------------------------
/bindings/python/tests/bindings/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/bindings/python/tests/bindings/__init__.py


--------------------------------------------------------------------------------
/bindings/python/tests/documentation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/bindings/python/tests/documentation/__init__.py


--------------------------------------------------------------------------------
/bindings/python/tests/implementations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/bindings/python/tests/implementations/__init__.py


--------------------------------------------------------------------------------
/bindings/python/tests/implementations/test_base_tokenizer.py:
--------------------------------------------------------------------------------
 1 | from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors
 2 | from tokenizers.implementations import BaseTokenizer
 3 | 
 4 | 
 5 | class TestBaseTokenizer:
 6 |     def test_get_set_components(self):
 7 |         toki = Tokenizer(models.BPE())
 8 |         toki.normalizer = normalizers.NFC()
 9 |         toki.pre_tokenizer = pre_tokenizers.ByteLevel()
10 |         toki.post_processor = processors.BertProcessing(("A", 0), ("B", 1))
11 |         toki.decoder = decoders.ByteLevel()
12 | 
13 |         tokenizer = BaseTokenizer(toki)
14 | 
15 |         assert isinstance(tokenizer.model, models.BPE)
16 |         assert isinstance(tokenizer.normalizer, normalizers.NFC)
17 |         assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.ByteLevel)
18 |         assert isinstance(tokenizer.post_processor, processors.BertProcessing)
19 |         assert isinstance(tokenizer.decoder, decoders.ByteLevel)
20 | 
21 |         tokenizer.model = models.Unigram()
22 |         assert isinstance(tokenizer.model, models.Unigram)
23 |         tokenizer.normalizer = normalizers.NFD()
24 |         assert isinstance(tokenizer.normalizer, normalizers.NFD)
25 |         tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
26 |         assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.Whitespace)
27 |         tokenizer.post_processor = processors.ByteLevel()
28 |         assert isinstance(tokenizer.post_processor, processors.ByteLevel)
29 |         tokenizer.decoder = decoders.WordPiece()
30 |         assert isinstance(tokenizer.decoder, decoders.WordPiece)
31 | 


--------------------------------------------------------------------------------
/bindings/python/tests/implementations/test_bert_wordpiece.py:
--------------------------------------------------------------------------------
 1 | from tokenizers import BertWordPieceTokenizer
 2 | 
 3 | from ..utils import bert_files, data_dir, multiprocessing_with_parallelism
 4 | 
 5 | 
 6 | class TestBertWordPieceTokenizer:
 7 |     def test_basic_encode(self, bert_files):
 8 |         tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
 9 | 
10 |         # Encode with special tokens by default
11 |         output = tokenizer.encode("My name is John", "pair")
12 |         assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
13 |         assert output.tokens == [
14 |             "[CLS]",
15 |             "my",
16 |             "name",
17 |             "is",
18 |             "john",
19 |             "[SEP]",
20 |             "pair",
21 |             "[SEP]",
22 |         ]
23 |         assert output.offsets == [
24 |             (0, 0),
25 |             (0, 2),
26 |             (3, 7),
27 |             (8, 10),
28 |             (11, 15),
29 |             (0, 0),
30 |             (0, 4),
31 |             (0, 0),
32 |         ]
33 |         assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]
34 | 
35 |         # Can encode without the special tokens
36 |         output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
37 |         assert output.ids == [2026, 2171, 2003, 2198, 3940]
38 |         assert output.tokens == ["my", "name", "is", "john", "pair"]
39 |         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
40 |         assert output.type_ids == [0, 0, 0, 0, 1]
41 | 
42 |     def test_multiprocessing_with_parallelism(self, bert_files):
43 |         tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
44 |         multiprocessing_with_parallelism(tokenizer, False)
45 |         multiprocessing_with_parallelism(tokenizer, True)
46 | 
47 |     def test_train_from_iterator(self):
48 |         text = ["A first sentence", "Another sentence", "And a last one"]
49 |         tokenizer = BertWordPieceTokenizer()
50 |         tokenizer.train_from_iterator(text, show_progress=False)
51 | 
52 |         output = tokenizer.encode("A sentence")
53 |         assert output.tokens == ["a", "sentence"]
54 | 


--------------------------------------------------------------------------------
/bindings/python/tests/implementations/test_byte_level_bpe.py:
--------------------------------------------------------------------------------
 1 | from tokenizers import ByteLevelBPETokenizer
 2 | 
 3 | from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files
 4 | 
 5 | 
 6 | class TestByteLevelBPE:
 7 |     def test_basic_encode(self, roberta_files):
 8 |         tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
 9 |         output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
10 | 
11 |         assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
12 |         assert output.tokens == [
13 |             "The",
14 |             "Ġquick",
15 |             "Ġbrown",
16 |             "Ġfox",
17 |             "Ġjumps",
18 |             "Ġover",
19 |             "Ġthe",
20 |             "Ġlazy",
21 |             "Ġdog",
22 |         ]
23 |         assert output.offsets == [
24 |             (0, 3),
25 |             (3, 9),
26 |             (9, 15),
27 |             (15, 19),
28 |             (19, 25),
29 |             (25, 30),
30 |             (30, 34),
31 |             (34, 39),
32 |             (39, 43),
33 |         ]
34 | 
35 |     def test_add_prefix_space(self, roberta_files):
36 |         tokenizer = ByteLevelBPETokenizer.from_file(
37 |             roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True
38 |         )
39 |         output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
40 | 
41 |         assert output.ids == [20, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
42 |         assert output.tokens == [
43 |             "ĠThe",
44 |             "Ġquick",
45 |             "Ġbrown",
46 |             "Ġfox",
47 |             "Ġjumps",
48 |             "Ġover",
49 |             "Ġthe",
50 |             "Ġlazy",
51 |             "Ġdog",
52 |         ]
53 |         assert output.offsets == [
54 |             (0, 3),
55 |             (3, 9),
56 |             (9, 15),
57 |             (15, 19),
58 |             (19, 25),
59 |             (25, 30),
60 |             (30, 34),
61 |             (34, 39),
62 |             (39, 43),
63 |         ]
64 | 
65 |     def test_lowerspace(self, roberta_files):
66 |         tokenizer = ByteLevelBPETokenizer.from_file(
67 |             roberta_files["vocab"],
68 |             roberta_files["merges"],
69 |             add_prefix_space=True,
70 |             lowercase=True,
71 |         )
72 |         output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")
73 | 
74 |         assert output.ids == [5, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
75 |         assert output.tokens == [
76 |             "Ġthe",
77 |             "Ġquick",
78 |             "Ġbrown",
79 |             "Ġfox",
80 |             "Ġjumps",
81 |             "Ġover",
82 |             "Ġthe",
83 |             "Ġlazy",
84 |             "Ġdog",
85 |         ]
86 | 
87 |     def test_multiprocessing_with_parallelism(self, roberta_files):
88 |         tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
89 |         multiprocessing_with_parallelism(tokenizer, False)
90 |         multiprocessing_with_parallelism(tokenizer, True)
91 | 
92 |     def test_train_from_iterator(self):
93 |         text = ["A first sentence", "Another sentence", "And a last one"]
94 |         tokenizer = ByteLevelBPETokenizer()
95 |         tokenizer.train_from_iterator(text, show_progress=False)
96 | 
97 |         output = tokenizer.encode("A sentence")
98 |         assert output.tokens == ["A", "Ġsentence"]
99 | 


--------------------------------------------------------------------------------
/bindings/python/tests/implementations/test_char_bpe.py:
--------------------------------------------------------------------------------
 1 | from tokenizers import CharBPETokenizer
 2 | 
 3 | from ..utils import data_dir, multiprocessing_with_parallelism, openai_files
 4 | 
 5 | 
 6 | class TestCharBPETokenizer:
 7 |     def test_basic_encode(self, openai_files):
 8 |         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"])
 9 | 
10 |         output = tokenizer.encode("My name is John", "pair")
11 |         assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
12 |         assert output.tokens == [
13 |             "<unk>",
14 |             "y</w>",
15 |             "name</w>",
16 |             "is</w>",
17 |             "<unk>",
18 |             "o",
19 |             "hn</w>",
20 |             "pair</w>",
21 |         ]
22 |         assert output.offsets == [
23 |             (0, 1),
24 |             (1, 2),
25 |             (3, 7),
26 |             (8, 10),
27 |             (11, 12),
28 |             (12, 13),
29 |             (13, 15),
30 |             (0, 4),
31 |         ]
32 |         assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]
33 | 
34 |     def test_lowercase(self, openai_files):
35 |         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"], lowercase=True)
36 |         output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
37 |         assert output.ids == [547, 1362, 544, 2476, 2688]
38 |         assert output.tokens == ["my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"]
39 |         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
40 |         assert output.type_ids == [0, 0, 0, 0, 1]
41 | 
42 |     def test_decoding(self, openai_files):
43 |         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"], lowercase=True)
44 |         decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
45 |         assert decoded == "my name is john"
46 | 
47 |     def test_multiprocessing_with_parallelism(self, openai_files):
48 |         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"])
49 |         multiprocessing_with_parallelism(tokenizer, False)
50 |         multiprocessing_with_parallelism(tokenizer, True)
51 | 
52 |     def test_train_from_iterator(self):
53 |         text = ["A first sentence", "Another sentence", "And a last one"]
54 |         tokenizer = CharBPETokenizer()
55 |         tokenizer.train_from_iterator(text, show_progress=False)
56 | 
57 |         output = tokenizer.encode("A sentence")
58 |         assert output.tokens == ["A</w>", "sentence</w>"]
59 | 


--------------------------------------------------------------------------------
/bindings/python/tests/implementations/test_sentencepiece.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
 4 | 
 5 | 
 6 | class TestSentencePieceBPE:
 7 |     def test_train_from_iterator(self):
 8 |         text = ["A first sentence", "Another sentence", "And a last one"]
 9 |         tokenizer = SentencePieceBPETokenizer()
10 |         tokenizer.train_from_iterator(text, show_progress=False)
11 | 
12 |         output = tokenizer.encode("A sentence")
13 |         assert output.tokens == ["▁A", "▁sentence"]
14 | 
15 | 
16 | class TestSentencePieceUnigram:
17 |     def test_train(self, tmpdir):
18 |         p = tmpdir.mkdir("tmpdir").join("file.txt")
19 |         p.write("A first sentence\nAnother sentence\nAnd a last one")
20 | 
21 |         tokenizer = SentencePieceUnigramTokenizer()
22 |         tokenizer.train(files=str(p), show_progress=False)
23 | 
24 |         output = tokenizer.encode("A sentence")
25 |         assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]
26 | 
27 |         with pytest.raises(Exception) as excinfo:
28 |             _ = tokenizer.encode("A sentence 🤗")
29 |         assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing"
30 | 
31 |     def test_train_with_unk_token(self, tmpdir):
32 |         p = tmpdir.mkdir("tmpdir").join("file.txt")
33 |         p.write("A first sentence\nAnother sentence\nAnd a last one")
34 | 
35 |         tokenizer = SentencePieceUnigramTokenizer()
36 |         tokenizer.train(files=str(p), show_progress=False, special_tokens=["<unk>"], unk_token="<unk>")
37 |         output = tokenizer.encode("A sentence 🤗")
38 |         assert output.ids[-1] == 0
39 |         assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]
40 | 
41 |     def test_train_from_iterator(self):
42 |         text = ["A first sentence", "Another sentence", "And a last one"]
43 |         tokenizer = SentencePieceUnigramTokenizer()
44 |         tokenizer.train_from_iterator(text, show_progress=False)
45 | 
46 |         output = tokenizer.encode("A sentence")
47 |         assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]
48 | 
49 |         with pytest.raises(Exception) as excinfo:
50 |             _ = tokenizer.encode("A sentence 🤗")
51 |         assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing"
52 | 
53 |     def test_train_from_iterator_with_unk_token(self):
54 |         text = ["A first sentence", "Another sentence", "And a last one"]
55 |         tokenizer = SentencePieceUnigramTokenizer()
56 |         tokenizer.train_from_iterator(
57 |             text, vocab_size=100, show_progress=False, special_tokens=["<unk>"], unk_token="<unk>"
58 |         )
59 |         output = tokenizer.encode("A sentence 🤗")
60 |         assert output.ids[-1] == 0
61 |         assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]
62 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for those with `?=`
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | BUILDDIR      ?= build
 9 | SOURCEDIR      = source
10 | 
11 | # Put it first so that "make" without argument is like "make html_all".
12 | html_all:
13 | 	@echo "Generating doc for Rust"
14 | 	@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)/rust" $(SPHINXOPTS) $(O) -t rust
15 | 	@echo "Generating doc for Python"
16 | 	@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)/python" $(SPHINXOPTS) $(O) -t python
17 | 	@echo "Generating doc for Node.js"
18 | 	@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)/node" $(SPHINXOPTS) $(O) -t node
19 | 
20 | .PHONY: html_all Makefile
21 | 
22 | # Catch-all target: route all unknown targets to Sphinx using the new
23 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
24 | %: Makefile
25 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
26 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | ## Requirements
 2 | 
 3 | In order to generate the documentation, it is necessary to have a Python environment with the
 4 | following:
 5 | ```bash
 6 | pip install sphinx sphinx_rtd_theme setuptools_rust
 7 | ```
 8 | 
 9 | It is also necessary to have the `tokenizers` library installed in this same environment, so that
10 | Sphinx can generate the API reference and links properly. If you want to preview the documentation
11 | with local modifications to the Python bindings, make sure you build them from source.
12 | 
13 | ## Building the documentation
14 | 
15 | Once everything is set up, you can build the documentation for all the languages at once
16 | using the following command in the `/docs` folder:
17 | 
18 | ```bash
19 | make html_all
20 | ```
21 | 
22 | If you want to build only for a specific language, you can use:
23 | 
24 | ```bash
25 | make html O="-t python"
26 | ```
27 | 
28 | (Replace `python` with the target language: one of `rust`, `node`, or `python`.)
29 | 
30 | 
31 | **NOTE**
32 | 
33 | If you are making any structural change to the documentation, it is recommended to clean the build
34 | directory before rebuilding:
35 | 
36 | ```bash
37 | make clean && make html_all
38 | ```
39 | 


--------------------------------------------------------------------------------
/docs/source-doc-builder/_toctree.yml:
--------------------------------------------------------------------------------
 1 | - sections: 
 2 |   - local: index
 3 |     title: 🤗 Tokenizers
 4 |   - local: quicktour
 5 |     title: Quicktour
 6 |   - local: installation
 7 |     title: Installation
 8 |   - local: pipeline
 9 |     title: The tokenization pipeline
10 |   - local: components
11 |     title: Components
12 |   - local: training_from_memory
13 |     title: Training from memory
14 |   title: Getting started
15 | - sections:
16 |   - local: api/input-sequences
17 |     title: Input Sequences
18 |   - local: api/encode-inputs
19 |     title: Encode Inputs
20 |   - local: api/tokenizer
21 |     title: Tokenizer
22 |   - local: api/encoding
23 |     title: Encoding
24 |   - local: api/added-tokens
25 |     title: Added Tokens
26 |   - local: api/models
27 |     title: Models
28 |   - local: api/normalizers
29 |     title: Normalizers
30 |   - local: api/pre-tokenizers
31 |     title: Pre-tokenizers
32 |   - local: api/post-processors
33 |     title: Post-processors
34 |   - local: api/trainers
35 |     title: Trainers
36 |   - local: api/decoders
37 |     title: Decoders
38 |   - local: api/visualizer
39 |     title: Visualizer
40 |   title: API
41 | 


--------------------------------------------------------------------------------
/docs/source-doc-builder/api/added-tokens.mdx:
--------------------------------------------------------------------------------
 1 | # Added Tokens
 2 | 
 3 | <tokenizerslangcontent>
 4 | <python>
 5 | ## AddedToken
 6 | 
 7 | [[autodoc]] tokenizers.AddedToken
 8 |     - content
 9 |     - lstrip
10 |     - normalized
11 |     - rstrip
12 |     - single_word
13 | </python>
14 | <rust>
15 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website.
16 | </rust>
17 | <node>
18 | The node API has not been documented yet.
19 | </node>
20 | </tokenizerslangcontent>
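
A quick illustrative sketch (annotation, not part of the file above): the flags below are exactly the documented attributes, and the `[MASK]` token is an arbitrary example.

```python
from tokenizers import AddedToken

# Sketch only: configure a special token and inspect the documented attributes.
mask = AddedToken("[MASK]", single_word=True, lstrip=True, rstrip=False, normalized=False)
print(mask.content, mask.single_word, mask.lstrip, mask.rstrip, mask.normalized)
# Tokens built this way can be passed to Tokenizer.add_tokens / add_special_tokens.
```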


--------------------------------------------------------------------------------
/docs/source-doc-builder/api/decoders.mdx:
--------------------------------------------------------------------------------
 1 | # Decoders
 2 | 
 3 | <tokenizerslangcontent>
 4 | <python>
 5 | ## BPEDecoder
 6 | 
 7 | [[autodoc]] tokenizers.decoders.BPEDecoder
 8 | 
 9 | ## ByteLevel
10 | 
11 | [[autodoc]] tokenizers.decoders.ByteLevel
12 | 
13 | ## CTC
14 | 
15 | [[autodoc]] tokenizers.decoders.CTC
16 | 
17 | ## Metaspace
18 | 
19 | [[autodoc]] tokenizers.decoders.Metaspace
20 | 
21 | ## WordPiece
22 | 
23 | [[autodoc]] tokenizers.decoders.WordPiece
24 | </python>
25 | <rust>
26 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website.
27 | </rust>
28 | <node>
29 | The node API has not been documented yet.
30 | </node>
31 | </tokenizerslangcontent>
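
A small usage sketch (annotation, not part of the file above): every decoder exposes a `decode` method that turns a list of tokens back into a string; the token list here is an arbitrary example.

```python
from tokenizers import decoders

# WordPiece decoding merges continuation tokens marked with the "##" prefix.
wordpiece = decoders.WordPiece(prefix="##")
print(wordpiece.decode(["my", "name", "is", "john", "##ny"]))  # "my name is johnny"
```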


--------------------------------------------------------------------------------
/docs/source-doc-builder/api/encode-inputs.mdx:
--------------------------------------------------------------------------------
 1 | # Encode Inputs
 2 | 
 3 | <tokenizerslangcontent>
 4 | <python>
 5 | These types represent all the different kinds of input that a [`~tokenizers.Tokenizer`] accepts
 6 | when using [`~tokenizers.Tokenizer.encode_batch`].
 7 | 
 8 | ## TextEncodeInput[[[[tokenizers.TextEncodeInput]]]]
 9 | 
10 | <code>tokenizers.TextEncodeInput</code>
11 | 
12 | Represents a textual input for encoding. Can be either:
13 | - A single sequence: [TextInputSequence](/docs/tokenizers/api/input-sequences#tokenizers.TextInputSequence)
14 | - A pair of sequences:
15 |   - A Tuple of [TextInputSequence](/docs/tokenizers/api/input-sequences#tokenizers.TextInputSequence)
16 |   - Or a List of [TextInputSequence](/docs/tokenizers/api/input-sequences#tokenizers.TextInputSequence) of size 2
17 | 
18 | alias of `Union[str, Tuple[str, str], List[str]]`.
19 | 
20 | ## PreTokenizedEncodeInput[[[[tokenizers.PreTokenizedEncodeInput]]]]
21 | 
22 | <code>tokenizers.PreTokenizedEncodeInput</code>
23 | 
24 | Represents a pre-tokenized input for encoding. Can be either:
25 | - A single sequence: [PreTokenizedInputSequence](/docs/tokenizers/api/input-sequences#tokenizers.PreTokenizedInputSequence)
26 | - A pair of sequences:
27 |   - A Tuple of [PreTokenizedInputSequence](/docs/tokenizers/api/input-sequences#tokenizers.PreTokenizedInputSequence)
28 |   - Or a List of [PreTokenizedInputSequence](/docs/tokenizers/api/input-sequences#tokenizers.PreTokenizedInputSequence) of size 2
29 | 
30 | alias of `Union[List[str], Tuple[str], Tuple[Union[List[str], Tuple[str]], Union[List[str], Tuple[str]]], List[Union[List[str], Tuple[str]]]]`.
31 | 
32 | ## EncodeInput[[[[tokenizers.EncodeInput]]]]
33 | 
34 | <code>tokenizers.EncodeInput</code>
35 | 
36 | Represents all the possible types of input for encoding. Can be:
37 | - When `is_pretokenized=False`: [TextEncodeInput](#tokenizers.TextEncodeInput)
38 | - When `is_pretokenized=True`: [PreTokenizedEncodeInput](#tokenizers.PreTokenizedEncodeInput)
39 | 
40 | alias of `Union[str, Tuple[str, str], List[str], Tuple[str], Tuple[Union[List[str], Tuple[str]], Union[List[str], Tuple[str]]], List[Union[List[str], Tuple[str]]]]`.
41 | </python>
42 | <rust>
43 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website.
44 | </rust>
45 | <node>
46 | The node API has not been documented yet.
47 | </node>
48 | </tokenizerslangcontent>
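
A short sketch of these input shapes in practice (annotation, not part of the file above; the `tokenizer.json` path is a hypothetical, previously trained tokenizer):

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

# TextEncodeInput: single sequences and (sequence_a, sequence_b) pairs.
tokenizer.encode_batch(["My name is John"])
tokenizer.encode_batch([("My name is John", "pair")])

# PreTokenizedEncodeInput: inputs that are already split into words.
tokenizer.encode_batch([["My", "name", "is", "John"]], is_pretokenized=True)
```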


--------------------------------------------------------------------------------
/docs/source-doc-builder/api/encoding.mdx:
--------------------------------------------------------------------------------
 1 | # Encoding
 2 | 
 3 | <tokenizerslangcontent>
 4 | <python>
 5 | ## Encoding
 6 | 
 7 | [[autodoc]] tokenizers.Encoding
 8 |     - all
 9 |     - attention_mask
10 |     - ids
11 |     - n_sequences
12 |     - offsets
13 |     - overflowing
14 |     - sequence_ids
15 |     - special_tokens_mask
16 |     - tokens
17 |     - type_ids
18 |     - word_ids
19 |     - words
20 | </python>
21 | <rust>
22 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website.
23 | </rust>
24 | <node>
25 | The node API has not been documented yet.
26 | </node>
27 | </tokenizerslangcontent>
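
For illustration (annotation, not part of the file above; reuses the hypothetical `tokenizer.json` from the other examples), the attributes listed here are read off the object returned by `Tokenizer.encode`:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path
encoding = tokenizer.encode("My name is John", "pair")

print(encoding.tokens, encoding.ids, encoding.type_ids)
print(encoding.offsets)         # (start, end) character offsets into the original text
print(encoding.attention_mask)  # 1 for real tokens, 0 for padding
```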


--------------------------------------------------------------------------------
/docs/source-doc-builder/api/input-sequences.mdx:
--------------------------------------------------------------------------------
 1 | # Input Sequences
 2 | 
 3 | <tokenizerslangcontent>
 4 | <python>
 5 | These types represent all the different kinds of sequence that can be used as input to a Tokenizer.
 6 | In general, any sequence can be either a string or a list of strings, depending on the operating
 7 | mode of the tokenizer: `raw text` vs `pre-tokenized`.
 8 | 
 9 | ## TextInputSequence[[tokenizers.TextInputSequence]]
10 | 
11 | <code>tokenizers.TextInputSequence</code>
12 | 
13 | A `str` that represents an input sequence
14 | 
15 | ## PreTokenizedInputSequence[[tokenizers.PreTokenizedInputSequence]]
16 | 
17 | <code>tokenizers.PreTokenizedInputSequence</code>
18 | 
19 | A pre-tokenized input sequence. Can be one of:
20 | - A `List` of `str`
21 | - A `Tuple` of `str`
22 | 
23 | alias of `Union[List[str], Tuple[str]]`.
24 | 
25 | ## InputSequence[[tokenizers.InputSequence]]
26 | 
27 | <code>tokenizers.InputSequence</code>
28 | 
29 | Represents all the possible types of input sequences for encoding. Can be:
30 | - When `is_pretokenized=False`: [TextInputSequence](#tokenizers.TextInputSequence)
31 | - When `is_pretokenized=True`: [PreTokenizedInputSequence](#tokenizers.PreTokenizedInputSequence)
32 | 
33 | alias of `Union[str, List[str], Tuple[str]]`.
34 | </python>
35 | <rust>
36 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website.
37 | </rust>
38 | <node>
39 | The node API has not been documented yet.
40 | </node>
41 | </tokenizerslangcontent>
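
For illustration (annotation, not part of the file above; the `tokenizer.json` path is hypothetical), the two operating modes look like this with `Tokenizer.encode`:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

tokenizer.encode("My name is John")                                   # TextInputSequence
tokenizer.encode(["My", "name", "is", "John"], is_pretokenized=True)  # PreTokenizedInputSequence
```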


--------------------------------------------------------------------------------
/docs/source-doc-builder/api/models.mdx:
--------------------------------------------------------------------------------
 1 | # Models
 2 | 
 3 | <tokenizerslangcontent>
 4 | <python>
 5 | ## BPE
 6 | 
 7 | [[autodoc]] tokenizers.models.BPE
 8 | 
 9 | ## Model
10 | 
11 | [[autodoc]] tokenizers.models.Model
12 | 
13 | ## Unigram
14 | 
15 | [[autodoc]] tokenizers.models.Unigram
16 | 
17 | ## WordLevel
18 | 
19 | [[autodoc]] tokenizers.models.WordLevel
20 | 
21 | ## WordPiece
22 | 
23 | [[autodoc]] tokenizers.models.WordPiece
24 | </python>
25 | <rust>
26 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website.
27 | </rust>
28 | <node>
29 | The node API has not been documented yet.
30 | </node>
31 | </tokenizerslangcontent>
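
A minimal sketch (annotation, not part of the file above): a model maps pre-tokenized words to ids, so it is always wrapped in a `Tokenizer`; the tiny in-memory vocabulary is an arbitrary example.

```python
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

vocab = {"[UNK]": 0, "my": 1, "name": 2}
tokenizer = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
print(tokenizer.encode("my name").tokens)  # ['my', 'name']
```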


--------------------------------------------------------------------------------
/docs/source-doc-builder/api/normalizers.mdx:
--------------------------------------------------------------------------------
 1 | # Normalizers
 2 | 
 3 | <tokenizerslangcontent>
 4 | <python>
 5 | ## BertNormalizer
 6 | 
 7 | [[autodoc]] tokenizers.normalizers.BertNormalizer
 8 | 
 9 | ## Lowercase
10 | 
11 | [[autodoc]] tokenizers.normalizers.Lowercase
12 | 
13 | ## NFC
14 | 
15 | [[autodoc]] tokenizers.normalizers.NFC
16 | 
17 | ## NFD
18 | 
19 | [[autodoc]] tokenizers.normalizers.NFD
20 | 
21 | ## NFKC
22 | 
23 | [[autodoc]] tokenizers.normalizers.NFKC
24 | 
25 | ## NFKD
26 | 
27 | [[autodoc]] tokenizers.normalizers.NFKD
28 | 
29 | ## Nmt
30 | 
31 | [[autodoc]] tokenizers.normalizers.Nmt
32 | 
33 | ## Normalizer
34 | 
35 | [[autodoc]] tokenizers.normalizers.Normalizer
36 | 
37 | ## Precompiled
38 | 
39 | [[autodoc]] tokenizers.normalizers.Precompiled
40 | 
41 | ## Replace
42 | 
43 | [[autodoc]] tokenizers.normalizers.Replace
44 | 
45 | ## Sequence
46 | 
47 | [[autodoc]] tokenizers.normalizers.Sequence
48 | 
49 | ## Strip
50 | 
51 | [[autodoc]] tokenizers.normalizers.Strip
52 | 
53 | ## StripAccents
54 | 
55 | [[autodoc]] tokenizers.normalizers.StripAccents
56 | </python>
57 | <rust>
58 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website.
59 | </rust>
60 | <node>
61 | The node API has not been documented yet.
62 | </node>
63 | </tokenizerslangcontent>
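
A minimal sketch of composing the normalizers listed above (annotation, not part of the file above); `normalize_str` is used only to show the effect on a plain string:

```python
from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents

normalizer = normalizers.Sequence([NFD(), StripAccents(), Lowercase()])
print(normalizer.normalize_str("Héllò hôw are ü?"))  # "hello how are u?"
```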


--------------------------------------------------------------------------------
/docs/source-doc-builder/api/post-processors.mdx:
--------------------------------------------------------------------------------
 1 | # Post-processors
 2 | 
 3 | <tokenizerslangcontent>
 4 | <python>
 5 | ## BertProcessing
 6 | 
 7 | [[autodoc]] tokenizers.processors.BertProcessing
 8 | 
 9 | ## ByteLevel
10 | 
11 | [[autodoc]] tokenizers.processors.ByteLevel
12 | 
13 | ## RobertaProcessing
14 | 
15 | [[autodoc]] tokenizers.processors.RobertaProcessing
16 | 
17 | ## TemplateProcessing
18 | 
19 | [[autodoc]] tokenizers.processors.TemplateProcessing
20 | </python>
21 | <rust>
22 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website.
23 | </rust>
24 | <node>
25 | The node API has not been documented yet.
26 | </node>
27 | </tokenizerslangcontent>
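
As an illustration of the most flexible of these (annotation, not part of the file above), a `TemplateProcessing` configured for BERT-style sequences; the special-token ids are arbitrary placeholders:

```python
from tokenizers.processors import TemplateProcessing

post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
# Assign it with: tokenizer.post_processor = post_processor
```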


--------------------------------------------------------------------------------
/docs/source-doc-builder/api/pre-tokenizers.mdx:
--------------------------------------------------------------------------------
 1 | # Pre-tokenizers
 2 | 
 3 | <tokenizerslangcontent>
 4 | <python>
 5 | ## BertPreTokenizer
 6 | 
 7 | [[autodoc]] tokenizers.pre_tokenizers.BertPreTokenizer
 8 | 
 9 | ## ByteLevel
10 | 
11 | [[autodoc]] tokenizers.pre_tokenizers.ByteLevel
12 | 
13 | ## CharDelimiterSplit
14 | 
15 | [[autodoc]] tokenizers.pre_tokenizers.CharDelimiterSplit
16 | 
17 | ## Digits
18 | 
19 | [[autodoc]] tokenizers.pre_tokenizers.Digits
20 | 
21 | ## Metaspace
22 | 
23 | [[autodoc]] tokenizers.pre_tokenizers.Metaspace
24 | 
25 | ## PreTokenizer
26 | 
27 | [[autodoc]] tokenizers.pre_tokenizers.PreTokenizer
28 | 
29 | ## Punctuation
30 | 
31 | [[autodoc]] tokenizers.pre_tokenizers.Punctuation
32 | 
33 | ## Sequence
34 | 
35 | [[autodoc]] tokenizers.pre_tokenizers.Sequence
36 | 
37 | ## Split
38 | 
39 | [[autodoc]] tokenizers.pre_tokenizers.Split
40 | 
41 | ## UnicodeScripts
42 | 
43 | [[autodoc]] tokenizers.pre_tokenizers.UnicodeScripts
44 | 
45 | ## Whitespace
46 | 
47 | [[autodoc]] tokenizers.pre_tokenizers.Whitespace
48 | 
49 | ## WhitespaceSplit
50 | 
51 | [[autodoc]] tokenizers.pre_tokenizers.WhitespaceSplit
52 | </python>
53 | <rust>
54 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website.
55 | </rust>
56 | <node>
57 | The node API has not been documented yet.
58 | </node>
59 | </tokenizerslangcontent>
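
A quick sketch (annotation, not part of the file above) of how a pre-tokenizer splits a raw string; `pre_tokenize_str` returns the pieces together with their character offsets:

```python
from tokenizers.pre_tokenizers import Whitespace

print(Whitespace().pre_tokenize_str("Hello there!"))
# [('Hello', (0, 5)), ('there', (6, 11)), ('!', (11, 12))]
```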


--------------------------------------------------------------------------------
/docs/source-doc-builder/api/tokenizer.mdx:
--------------------------------------------------------------------------------
 1 | # Tokenizer
 2 | 
 3 | <tokenizerslangcontent>
 4 | <python>
 5 | ## Tokenizer
 6 | 
 7 | [[autodoc]] tokenizers.Tokenizer
 8 |     - all
 9 |     - decoder
10 |     - model
11 |     - normalizer
12 |     - padding
13 |     - post_processor
14 |     - pre_tokenizer
15 |     - truncation
16 | </python>
17 | <rust>
18 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website.
19 | </rust>
20 | <node>
21 | The node API has not been documented yet.
22 | </node>
23 | </tokenizerslangcontent>
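
A small sketch of the attributes listed above (annotation, not part of the file above); the model and pre-tokenizer choices are arbitrary:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.enable_padding(pad_id=0, pad_token="[PAD]")

print(tokenizer.padding)     # dict of the current padding parameters, or None when disabled
print(tokenizer.truncation)  # None until enable_truncation(...) is called
```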


--------------------------------------------------------------------------------
/docs/source-doc-builder/api/trainers.mdx:
--------------------------------------------------------------------------------
 1 | # Trainers
 2 | 
 3 | <tokenizerslangcontent>
 4 | <python>
 5 | ## BpeTrainer
 6 | 
 7 | [[autodoc]] tokenizers.trainers.BpeTrainer
 8 | 
 9 | ## UnigramTrainer
10 | 
11 | [[autodoc]] tokenizers.trainers.UnigramTrainer
12 | 
13 | ## WordLevelTrainer
14 | 
15 | [[autodoc]] tokenizers.trainers.WordLevelTrainer
16 | 
17 | ## WordPieceTrainer
18 | 
19 | [[autodoc]] tokenizers.trainers.WordPieceTrainer
20 | </python>
21 | <rust>
22 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website.
23 | </rust>
24 | <node>
25 | The node API has not been documented yet.
26 | </node>
27 | </tokenizerslangcontent>
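
For context (annotation, not part of the file above): a trainer is passed to `Tokenizer.train` or `Tokenizer.train_from_iterator`; the vocabulary size and special tokens below are arbitrary.

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(vocab_size=1000, special_tokens=["[UNK]", "[PAD]"])
tokenizer.train_from_iterator(["A first sentence", "Another sentence"], trainer=trainer)
```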


--------------------------------------------------------------------------------
/docs/source-doc-builder/api/visualizer.mdx:
--------------------------------------------------------------------------------
 1 | # Visualizer
 2 | 
 3 | <tokenizerslangcontent>
 4 | <python>
 5 | ## Annotation
 6 | 
 7 | [[autodoc]] tokenizers.tools.Annotation
 8 | 
 9 | ## EncodingVisualizer
10 | 
11 | [[autodoc]] tokenizers.tools.EncodingVisualizer
12 |     -  __call__
13 | </python>
14 | <rust>
15 | The Rust API Reference is available directly on the [Docs.rs](https://docs.rs/tokenizers/latest/tokenizers/) website.
16 | </rust>
17 | <node>
18 | The node API has not been documented yet.
19 | </node>
20 | </tokenizerslangcontent>
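
A usage sketch (annotation, not part of the file above; the `tokenizer.json` path is hypothetical). With `default_to_notebook=False` the call returns the generated HTML instead of displaying it inline:

```python
from tokenizers import Tokenizer
from tokenizers.tools import Annotation, EncodingVisualizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path
visualizer = EncodingVisualizer(tokenizer, default_to_notebook=False)
html = visualizer("My name is John", annotations=[Annotation(11, 15, "person")])
```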


--------------------------------------------------------------------------------
/docs/source-doc-builder/index.mdx:
--------------------------------------------------------------------------------
 1 | <!-- DISABLE-FRONTMATTER-SECTIONS -->
 2 | 
 3 | # Tokenizers
 4 | 
 5 | Fast State-of-the-art tokenizers, optimized for both research and
 6 | production
 7 | 
 8 | [🤗 Tokenizers](https://github.com/huggingface/tokenizers) provides an
 9 | implementation of today's most used tokenizers, with a focus on
10 | performance and versatility. These tokenizers are also used in [🤗 Transformers](https://github.com/huggingface/transformers).
11 | 
12 | # Main features:
13 | 
14 | - Train new vocabularies and tokenize, using today's most used tokenizers.
15 | - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes less than 20 seconds to tokenize a GB of text on a server's CPU.
16 | - Easy to use, but also extremely versatile.
17 | - Designed for both research and production.
18 | - Full alignment tracking. Even with destructive normalization, it's always possible to get the part of the original sentence that corresponds to any token.
19 | - Does all the pre-processing: truncation, padding, and adding the special tokens your model needs.
20 | 


--------------------------------------------------------------------------------
/docs/source-doc-builder/installation.mdx:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | <tokenizerslangcontent>
 4 | <python>
 5 | 🤗 Tokenizers is tested on Python 3.5+.
 6 | 
 7 | You should install 🤗 Tokenizers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're
 8 | unfamiliar with Python virtual environments, check out the [user
 9 | guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
10 | Create a virtual environment with the version of Python you're going to
11 | use and activate it.
12 | 
13 | ## Installation with pip
14 | 
15 | 🤗 Tokenizers can be installed using pip as follows:
16 | 
17 | ```bash
18 | pip install tokenizers
19 | ```
20 | 
21 | ## Installation from sources
22 | 
23 | To use this method, you need to have the Rust language installed. You
24 | can follow [the official
25 | guide](https://www.rust-lang.org/learn/get-started) for more
26 | information.
27 | 
28 | If you are using a Unix-based OS, installing Rust should be as simple
29 | as running:
30 | 
31 | ```bash
32 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
33 | ```
34 | 
35 | Or you can easily update it with the following command:
36 | 
37 | ```bash
38 | rustup update
39 | ```
40 | 
41 | Once Rust is installed, we can start retrieving the sources for 🤗
42 | Tokenizers:
43 | 
44 | ```bash
45 | git clone https://github.com/huggingface/tokenizers
46 | ```
47 | 
48 | Then we go into the Python bindings folder:
49 | 
50 | ```bash
51 | cd tokenizers/bindings/python
52 | ```
53 | 
54 | At this point you should have your [virtual environment](https://docs.python.org/3/library/venv.html) already
55 | activated. In order to compile 🤗 Tokenizers, you need to:
56 | 
57 | ```bash
58 | pip install -e .
59 | ```
60 | 
61 | </python>
62 | <rust>
63 | ## Crates.io
64 | 
65 | 🤗 Tokenizers is available on [crates.io](https://crates.io/crates/tokenizers).
66 | 
67 | You just need to add it to your `Cargo.toml`:
68 | 
69 | ```bash
70 | cargo add tokenizers
71 | ```
72 | </rust>
73 | <node>
74 | ## Installation with npm
75 | 
76 | You can simply install 🤗 Tokenizers with npm using:
77 | 
78 | ```bash
79 | npm install tokenizers
80 | ```
81 | </node>
82 | </tokenizerslangcontent>
83 | 


--------------------------------------------------------------------------------
/docs/source/_ext/rust_doc.py:
--------------------------------------------------------------------------------
  1 | from docutils import nodes
  2 | 
  3 | import sphinx
  4 | from sphinx.locale import _
  5 | 
  6 | from conf import rust_version
  7 | 
  8 | logger = sphinx.util.logging.getLogger(__name__)
  9 | 
 10 | 
 11 | class RustRef:
 12 |     def __call__(self, name, rawtext, text, lineno, inliner, options={}, content=[]):
 13 |         doctype = name.split("_")[1]
 14 |         parts = text.split("::")
 15 | 
 16 |         if text.startswith("~"):
 17 |             title = parts[-1]
 18 |             parts[0] = parts[0][1:]
 19 |         else:
 20 |             title = text
 21 |         link = self.base_link()
 22 | 
 23 |         if doctype == "struct":
 24 |             l, title = self.make_struct_link(parts, title)
 25 |         if doctype == "func":
 26 |             l, title = self.make_func_link(parts, title)
 27 |         if doctype == "meth":
 28 |             l, title = self.make_meth_link(parts, title)
 29 |         if doctype == "trait":
 30 |             l, title = self.make_trait_link(parts, title)
 31 |         link += l
 32 | 
 33 |         node = nodes.reference(internal=False, refuri=link, text=title)
 34 |         wrapper = nodes.literal(classes=["xref"])
 35 |         wrapper += node
 36 | 
 37 |         return [wrapper], []
 38 | 
 39 |     def base_link(self):
 40 |         return f"https://docs.rs/tokenizers/{rust_version}"
 41 | 
 42 |     def make_struct_link(self, parts, title):
 43 |         link = ""
 44 |         struct_name = parts[-1]
 45 |         path = parts[:-1]
 46 | 
 47 |         for p in path:
 48 |             link += f"/{p}"
 49 |         link += f"/struct.{struct_name}.html"
 50 | 
 51 |         return link, title
 52 | 
 53 |     def make_func_link(self, parts, title):
 54 |         link = ""
 55 |         fn_name = parts[-1]
 56 | 
 57 |         path = parts[:-1]
 58 |         for p in path:
 59 |             link += f"/{p}"
 60 |         link += f"/fn.{fn_name}.html"
 61 | 
 62 |         return link, title
 63 | 
 64 |     def make_meth_link(self, parts, title):
 65 |         meth_name = parts[-1]
 66 |         if meth_name.endswith("()"):
 67 |             meth_name = meth_name[:-2]
 68 | 
 69 |         link, title = self.make_struct_link(parts[:-1], title)
 70 |         link += f"#method.{meth_name}"
 71 | 
 72 |         if not title.endswith(")"):
 73 |             title += "()"
 74 | 
 75 |         return link, title
 76 | 
 77 |     def make_trait_link(self, parts, title):
 78 |         link = ""
 79 |         trait_name = parts[-1]
 80 | 
 81 |         path = parts[:-1]
 82 |         for p in path:
 83 |             link += f"/{p}"
 84 |         link += f"/trait.{trait_name}.html"
 85 | 
 86 |         return link, title
 87 | 
 88 | 
 89 | def setup(app):
 90 |     app.add_role("rust_struct", RustRef())
 91 |     app.add_role("rust_func", RustRef())
 92 |     app.add_role("rust_meth", RustRef())
 93 |     app.add_role("rust_trait", RustRef())
 94 | 
 95 |     return {
 96 |         "version": "0.1",
 97 |         "parallel_read_safe": True,
 98 |         "parallel_write_safe": True,
 99 |     }
100 | 
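
For context, a standalone sketch (not part of the file) of the URL shape these helpers produce, assuming `rust_version = "latest"` as set in `conf.py`:

```python
# Mirrors RustRef.make_struct_link for a reference like `~tokenizers::tokenizer::Tokenizer`.
parts = ["tokenizers", "tokenizer", "Tokenizer"]
link = "https://docs.rs/tokenizers/latest"
link += "".join(f"/{p}" for p in parts[:-1]) + f"/struct.{parts[-1]}.html"
print(link)  # https://docs.rs/tokenizers/latest/tokenizers/tokenizer/struct.Tokenizer.html
```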


--------------------------------------------------------------------------------
/docs/source/_ext/toctree_tags.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from sphinx.directives.other import TocTree
 3 | 
 4 | 
 5 | class TocTreeTags(TocTree):
 6 |     hasPat = re.compile("^\s*:(.+):(.+)$")
 7 | 
 8 |     def filter_entries(self, entries):
 9 |         filtered = []
10 |         for e in entries:
11 |             m = self.hasPat.match(e)
12 |             if m != None:
13 |                 if self.env.app.tags.has(m.groups()[0]):
14 |                     filtered.append(m.groups()[1])
15 |             else:
16 |                 filtered.append(e)
17 |         return filtered
18 | 
19 |     def run(self):
20 |         self.content = self.filter_entries(self.content)
21 |         return super().run()
22 | 
23 | 
24 | def setup(app):
25 |     app.add_directive("toctree-tags", TocTreeTags)
26 | 
27 |     return {
28 |         "version": "0.1",
29 |     }
30 | 
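
A standalone sketch (not part of the file) of the filtering logic above, assuming the Sphinx build was run with the `python` tag; the entries mimic the `:python:tutorials/python/*` form used in `index.rst`:

```python
import re

has_pat = re.compile(r"^\s*:(.+):(.+)$")
active_tags = {"python"}  # stands in for self.env.app.tags

entries = ["quicktour", ":python:tutorials/python/*", ":rust:tutorials/rust/*"]
filtered = []
for entry in entries:
    m = has_pat.match(entry)
    if m is not None:
        if m.group(1) in active_tags:
            filtered.append(m.group(2))
    else:
        filtered.append(entry)

print(filtered)  # ['quicktour', 'tutorials/python/*']
```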


--------------------------------------------------------------------------------
/docs/source/_static/css/Calibre-Light.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/docs/source/_static/css/Calibre-Light.ttf


--------------------------------------------------------------------------------
/docs/source/_static/css/Calibre-Medium.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/docs/source/_static/css/Calibre-Medium.otf


--------------------------------------------------------------------------------
/docs/source/_static/css/Calibre-Regular.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/docs/source/_static/css/Calibre-Regular.otf


--------------------------------------------------------------------------------
/docs/source/_static/css/Calibre-Thin.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/tokenizers/dd4fc3df1a8a7cd135eecca2158db018d85f94f1/docs/source/_static/css/Calibre-Thin.otf


--------------------------------------------------------------------------------
/docs/source/_static/css/code-snippets.css:
--------------------------------------------------------------------------------
 1 | 
 2 | .highlight .c1, .highlight .sd{
 3 |     color: #999
 4 | }
 5 | 
 6 | .highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc, .highlight .kt {
 7 |     color: #FB8D68;
 8 | }
 9 | 
10 | .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow, .highlight .kd, .highlight .kr, .highlight .s {
11 |     color: #6670FF;
12 | }
13 | 
14 | .highlight .gp {
15 |     color: #FB8D68;
16 | }
17 | 


--------------------------------------------------------------------------------
/docs/source/api/node.inc:
--------------------------------------------------------------------------------
1 | Documentation
2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3 | 
4 | The node API has not been documented yet.
5 | 


--------------------------------------------------------------------------------
/docs/source/api/python.inc:
--------------------------------------------------------------------------------
 1 | Input sequences
 2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 3 | 
 4 | These types represent all the different kinds of sequence that can be used as input to a Tokenizer.
 5 | In general, any sequence can be either a string or a list of strings, depending on the operating
 6 | mode of the tokenizer: ``raw text`` vs ``pre-tokenized``.
 7 | 
 8 | .. autodata:: tokenizers.TextInputSequence
 9 | 
10 | .. autodata:: tokenizers.PreTokenizedInputSequence
11 | 
12 | .. autodata:: tokenizers.InputSequence
13 | 
14 | 
15 | Encode inputs
16 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
17 | 
18 | These types represent all the different kinds of input that a :class:`~tokenizers.Tokenizer` accepts
19 | when using :meth:`~tokenizers.Tokenizer.encode_batch`.
20 | 
21 | .. autodata:: tokenizers.TextEncodeInput
22 | 
23 | .. autodata:: tokenizers.PreTokenizedEncodeInput
24 | 
25 | .. autodata:: tokenizers.EncodeInput
26 | 
27 | 
28 | Tokenizer
29 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
30 | 
31 | .. autoclass:: tokenizers.Tokenizer
32 |     :members:
33 | 
34 | 
35 | Encoding
36 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
37 | 
38 | .. autoclass:: tokenizers.Encoding
39 |     :members:
40 | 
41 | 
42 | Added Tokens
43 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
44 | 
45 | .. autoclass:: tokenizers.AddedToken
46 |     :members:
47 | 
48 | 
49 | Models
50 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
51 | 
52 | .. automodule:: tokenizers.models
53 |     :members:
54 | 
55 | Normalizers
56 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
57 | 
58 | .. automodule:: tokenizers.normalizers
59 |     :members:
60 | 
61 | 
62 | Pre-tokenizers
63 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
64 | 
65 | .. automodule:: tokenizers.pre_tokenizers
66 |     :members:
67 | 
68 | 
69 | Post-processor
70 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
71 | 
72 | .. automodule:: tokenizers.processors
73 |     :members:
74 | 
75 | 
76 | Trainers
77 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
78 | 
79 | .. automodule:: tokenizers.trainers
80 |     :members:
81 | 
82 | Decoders
83 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
84 | 
85 | .. automodule:: tokenizers.decoders
86 |     :members:
87 | 
88 | Visualizer
89 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
90 | 
91 | .. autoclass:: tokenizers.tools.Annotation
92 |     :members:
93 | 
94 | .. autoclass:: tokenizers.tools.EncodingVisualizer
95 |     :members: __call__
96 | 


--------------------------------------------------------------------------------
/docs/source/api/reference.rst:
--------------------------------------------------------------------------------
 1 | .. only:: python
 2 | 
 3 |     .. include:: python.inc
 4 | 
 5 | .. only:: rust
 6 | 
 7 |     .. include:: rust.inc
 8 | 
 9 | .. only:: node
10 | 
11 |     .. include:: node.inc
12 | 


--------------------------------------------------------------------------------
/docs/source/api/rust.inc:
--------------------------------------------------------------------------------
1 | Documentation
2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3 | 
4 | The Rust API Reference is available directly on the `Docs.rs <https://docs.rs/tokenizers>`__
5 | website.
6 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | 
16 | sys.path.insert(0, os.path.abspath("./_ext"))
17 | sys.path.insert(0, os.path.abspath("."))
18 | 
19 | 
20 | # -- Project information -----------------------------------------------------
21 | 
22 | project = "tokenizers"
23 | copyright = "2020, huggingface"
24 | author = "huggingface"
25 | 
26 | # The full version, including alpha/beta/rc tags
27 | release = ""
28 | 
29 | # -- Custom information ------------------------------------------------------
30 | 
31 | # The possible values for languages (used by `_ext/entities`)
32 | languages = ["node", "rust", "python"]
33 | 
34 | # This defines the version used to generate links to docs.rs
35 | rust_version = "latest"
36 | 
37 | # -- General configuration ---------------------------------------------------
38 | 
39 | # Add any Sphinx extension module names here, as strings. They can be
40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
41 | # ones.
42 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon", "entities", "rust_doc", "toctree_tags"]
43 | 
44 | # Add any paths that contain templates here, relative to this directory.
45 | templates_path = ["_templates"]
46 | 
47 | # List of patterns, relative to source directory, that match files and
48 | # directories to ignore when looking for source files.
49 | # This pattern also affects html_static_path and html_extra_path.
50 | exclude_patterns = []
51 | 
52 | # -- Options for HTML output -------------------------------------------------
53 | 
54 | # The theme to use for HTML and HTML Help pages.  See the documentation for
55 | # a list of builtin themes.
56 | #
57 | html_theme = "sphinx_rtd_theme"
58 | 
59 | # Theme options are theme-specific and customize the look and feel of a theme
60 | # further.  For a list of options available for each theme, see the
61 | # documentation.
62 | #
63 | html_theme_options = {"analytics_id": "UA-83738774-2"}
64 | 
65 | # Add any paths that contain custom static files (such as style sheets) here,
66 | # relative to this directory. They are copied after the builtin static files,
67 | # so a file named "default.css" will overwrite the builtin "default.css".
68 | html_static_path = ["_static"]
69 | 
70 | 
71 | def setup(app):
72 |     for language in languages:
73 |         if not tags.has(language):
74 |             exclude_patterns.append(f"tutorials/{language}/*")
75 | 
76 |     app.add_css_file("css/huggingface.css")
77 |     app.add_css_file("css/code-snippets.css")
78 |     app.add_js_file("js/custom.js")
79 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | Tokenizers
 2 | ====================================================================================================
 3 | 
 4 | Fast State-of-the-art tokenizers, optimized for both research and production
 5 | 
 6 | `🤗 Tokenizers`_ provides an implementation of today's most used tokenizers, with
 7 | a focus on performance and versatility. These tokenizers are also used in
 8 | `🤗 Transformers`_.
 9 | 
10 | .. _🤗 Tokenizers: https://github.com/huggingface/tokenizers
11 | .. _🤗 Transformers: https://github.com/huggingface/transformers
12 | 
13 | Main features:
14 | ----------------------------------------------------------------------------------------------------
15 | 
16 |  - Train new vocabularies and tokenize, using today's most used tokenizers.
17 |  - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes
18 |    less than 20 seconds to tokenize a GB of text on a server's CPU.
19 |  - Easy to use, but also extremely versatile.
20 |  - Designed for both research and production.
21 |  - Full alignment tracking. Even with destructive normalization, it's always possible to get
22 |    the part of the original sentence that corresponds to any token.
23 |  - Does all the pre-processing: truncation, padding, and adding the special tokens your model needs.
24 | 
25 | 
26 | .. toctree::
27 |     :maxdepth: 2
28 |     :caption: Getting Started
29 | 
30 |     quicktour
31 |     installation/main
32 |     pipeline
33 |     components
34 | 
35 | .. toctree-tags::
36 |     :maxdepth: 3
37 |     :caption: Using 🤗 Tokenizers
38 |     :glob:
39 | 
40 |     :python:tutorials/python/*
41 | 
42 | .. toctree::
43 |     :maxdepth: 3
44 |     :caption: API Reference
45 | 
46 |     api/reference
47 | 
48 | .. include:: entities.inc
49 | 


--------------------------------------------------------------------------------
/docs/source/installation/main.rst:
--------------------------------------------------------------------------------
 1 | Installation
 2 | ====================================================================================================
 3 | 
 4 | .. only:: python
 5 | 
 6 |     .. include:: python.inc
 7 | 
 8 | .. only:: rust
 9 | 
10 |     .. include:: rust.inc
11 | 
12 | .. only:: node
13 | 
14 |     .. include:: node.inc
15 | 
16 | 


--------------------------------------------------------------------------------
/docs/source/installation/node.inc:
--------------------------------------------------------------------------------
1 | Installation with npm
2 | ----------------------------------------------------------------------------------------------------
3 | 
4 | You can simply install 🤗 Tokenizers with npm using::
5 | 
6 |     npm install tokenizers
7 | 


--------------------------------------------------------------------------------
/docs/source/installation/python.inc:
--------------------------------------------------------------------------------
 1 | 🤗 Tokenizers is tested on Python 3.5+.
 2 | 
 3 | You should install 🤗 Tokenizers in a
 4 | `virtual environment <https://docs.python.org/3/library/venv.html>`_. If you're unfamiliar with
 5 | Python virtual environments, check out the
 6 | `user guide <https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/>`__.
 7 | Create a virtual environment with the version of Python you're going to use and activate it.
 8 | 
 9 | Installation with pip
10 | ----------------------------------------------------------------------------------------------------
11 | 
12 | 🤗 Tokenizers can be installed using pip as follows::
13 | 
14 |     pip install tokenizers
15 | 
16 | 
17 | Installation from sources
18 | ----------------------------------------------------------------------------------------------------
19 | 
20 | To use this method, you need to have the Rust language installed. You can follow
21 | `the official guide <https://www.rust-lang.org/learn/get-started>`__ for more information.
22 | 
23 | If you are using a Unix-based OS, installing Rust should be as simple as running::
24 | 
25 |     curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
26 | 
27 | Or you can easily update it with the following command::
28 | 
29 |     rustup update
30 | 
31 | Once Rust is installed, we can start retrieving the sources for 🤗 Tokenizers::
32 | 
33 |     git clone https://github.com/huggingface/tokenizers
34 | 
35 | Then we go into the Python bindings folder::
36 | 
37 |     cd tokenizers/bindings/python
38 | 
39 | At this point you should have your `virtual environment`_ already activated. In order to
40 | compile 🤗 Tokenizers, you need to::
41 | 
42 |     pip install -e .
43 | 


--------------------------------------------------------------------------------
/docs/source/installation/rust.inc:
--------------------------------------------------------------------------------
1 | Crates.io
2 | ----------------------------------------------------------------------------------------------------
3 | 
4 | 🤗 Tokenizers is available on `crates.io <https://crates.io/crates/tokenizers>`__.
5 | 
6 | You just need to add it to your :obj:`Cargo.toml`::
7 | 
8 |     tokenizers = "0.10"
9 | 


--------------------------------------------------------------------------------
/tokenizers/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | authors = ["Anthony MOI <m.anthony.moi@gmail.com>", "Nicolas Patry <patry.nicolas@protonmail.com>"]
 3 | edition = "2018"
 4 | name = "tokenizers"
 5 | version = "0.21.4-dev.0"
 6 | homepage = "https://github.com/huggingface/tokenizers"
 7 | repository = "https://github.com/huggingface/tokenizers"
 8 | documentation = "https://docs.rs/tokenizers/"
 9 | license = "Apache-2.0"
10 | keywords = ["tokenizer", "NLP", "huggingface", "BPE", "WordPiece"]
11 | readme = "./README.md"
12 | description = """
13 | Provides an implementation of today's most used tokenizers,
14 | with a focus on performance and versatility.
15 | """
16 | exclude = [ "rust-toolchain", "target/*", "Cargo.lock", "benches/*.txt", "benches/*.json", "data/*" ]
17 | 
18 | [lib]
19 | name = "tokenizers"
20 | path = "src/lib.rs"
21 | bench = false
22 | 
23 | [[bench]]
24 | name = "bpe_benchmark"
25 | harness = false
26 | 
27 | [[bench]]
28 | name = "bert_benchmark"
29 | harness = false
30 | 
31 | [[bench]]
32 | name = "layout_benchmark"
33 | harness = false
34 | 
35 | [[bench]]
36 | name = "unigram_benchmark"
37 | harness = false
38 | 
39 | [[bench]]
40 | name = "llama3_benchmark"
41 | harness = false
42 | 
43 | [dependencies]
44 | rand = "0.9"
45 | onig = { version = "6.5.1", default-features = false, optional = true }
46 | regex = "1.10"
47 | regex-syntax = "0.8"
48 | rayon = "1.10"
49 | rayon-cond = "0.4"
50 | serde = { version = "1.0", features = [ "derive" ] }
51 | serde_json = "1.0"
52 | unicode-normalization-alignments = "0.1"
53 | unicode_categories = "0.1"
54 | unicode-segmentation = "1.11"
55 | indicatif = {version = "0.17", optional = true}
56 | itertools = "0.14"
57 | log = "0.4"
58 | derive_builder = "0.20"
59 | spm_precompiled = "0.1.3"
60 | hf-hub = { version = "0.4.1", features = ["ureq"], default-features = false, optional = true }
61 | aho-corasick = "1.1"
62 | paste = "1.0.14"
63 | macro_rules_attribute = "0.2.0"
64 | thiserror = "2"
65 | fancy-regex = { version = "0.14", optional = true}
66 | getrandom = { version = "0.3" }
67 | esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
68 | monostate = "0.1.12"
69 | ahash = { version = "0.8.11", features = ["serde"] }
70 | dary_heap = { version = "0.3.6", features = ["serde"] }
71 | compact_str = { version = "0.9", features = ["serde"] }
72 | 
73 | [features]
74 | default = ["progressbar", "onig", "esaxx_fast"]
75 | esaxx_fast = ["esaxx-rs/cpp"]
76 | progressbar = ["indicatif"]
77 | http = ["hf-hub"]
78 | unstable_wasm = ["fancy-regex", "getrandom/wasm_js"]
79 | rustls-tls = ["hf-hub?/rustls-tls"]
80 | 
81 | [dev-dependencies]
82 | criterion = "0.6"
83 | tempfile = "3.10"
84 | assert_approx_eq = "1.1"
85 | tracing = "0.1"
86 | tracing-subscriber = "0.3.18"
87 | 
88 | [profile.release]
89 | lto = "fat"
90 | 
91 | [[example]]
92 | name = "encode_batch"
93 | required-features = ["http"]
94 | 
95 | 


--------------------------------------------------------------------------------
/tokenizers/LICENSE:
--------------------------------------------------------------------------------
1 | ../LICENSE


--------------------------------------------------------------------------------
/tokenizers/Makefile:
--------------------------------------------------------------------------------
 1 | DATA_DIR = data
 2 | BENCHMARK_DIR = benches
 3 | TESTS_DIR = tests
 4 | 
 5 | dir_guard=@mkdir -p $(@D)
 6 | 
 7 | SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt $(DATA_DIR)/albert-base-v1-tokenizer.json  $(DATA_DIR)/llama-3-tokenizer.json
 8 | BENCHMARK_RESOURCES = $(SHARED_RESOURCES)
 9 | TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json
10 | 
11 | .PHONY : build
12 | build :
13 | 	cargo build --all-targets
14 | 
15 | .PHONY : release
16 | release :
17 | 	cargo build --release
18 | 
19 | .PHONY : format
20 | format :
21 | 	cargo fmt --
22 | 
23 | .PHONY : lint
24 | lint :
25 | 	cargo fmt -- --check
26 | 	cargo fmt -- $(BENCHMARK_DIR)/*.rs --check
27 | 	cargo clippy --all-targets --all-features -- -D warnings
28 | 
29 | .PHONY : test
30 | test : $(TESTS_RESOURCES)
31 | 	cargo test
32 | 
33 | .PHONY : doc
34 | doc :
35 | 	cargo doc
36 | 
37 | .PHONY : publish
38 | publish :
39 | 	cargo publish
40 | 
41 | .PHONY : all-checks
42 | all-checks : lint test doc
43 | 
44 | .PHONY : bench
45 | bench : $(BENCHMARK_RESOURCES)
46 | 	cargo bench -- --verbose
47 | 
48 | $(DATA_DIR)/gpt2-% :
49 | 	$(dir_guard)
50 | 	wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-$* -O $@
51 | 
52 | $(DATA_DIR)/bert-% :
53 | 	$(dir_guard)
54 | 	wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-$* -O $@
55 | 
56 | $(DATA_DIR)/unigram% :
57 | 	$(dir_guard)
58 | 	wget https://huggingface.co/Narsil/small/raw/main/unigram$* -O $@
59 | 
60 | $(DATA_DIR)/albert-base-v1-tokenizer.json :
61 | 	$(dir_guard)
62 | 	wget https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json -O $@
63 | 
64 | $(DATA_DIR)/tokenizer-llama3.json :
65 | 	$(dir_guard)
66 | 	wget https://huggingface.co/Narsil/llama-tokenizer/resolve/main/tokenizer.json -O $@
67 | 
68 | $(DATA_DIR)/big.txt :
69 | 	$(dir_guard)
70 | 	wget https://norvig.com/big.txt -O $@
71 | 
72 | $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
73 | 	head -100 $(DATA_DIR)/big.txt > $@
74 | 
75 | $(DATA_DIR)/roberta.json :
76 | 	$(dir_guard)
77 | 	wget https://huggingface.co/Narsil/small/raw/main/roberta.json -O $@
78 | 
79 | $(DATA_DIR)/tokenizer-wiki.json :
80 | 	$(dir_guard)
81 | 	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@
82 | 
83 | $(DATA_DIR)/bert-wiki.json :
84 | 	$(dir_guard)
85 | 	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json -O $@
86 | 
87 | $(DATA_DIR)/llama-3-tokenizer.json :
88 | 	$(dir_guard)
89 | 	wget https://huggingface.co/hf-internal-testing/llama3-tokenizer/resolve/main/tokenizer.json -O $@
90 | 


--------------------------------------------------------------------------------
/tokenizers/README.tpl:
--------------------------------------------------------------------------------
 1 | <p align="center">
 2 |     <br>
 3 |     <img src="https://huggingface.co/landing/assets/tokenizers/tokenizers-logo.png" width="600"/>
 4 |     <br>
 5 | </p>
 6 | <p align="center">
 7 |     <img alt="Build" src="https://github.com/huggingface/tokenizers/workflows/Rust/badge.svg">
 8 |     <a href="https://github.com/huggingface/tokenizers/blob/master/LICENSE">
 9 |         <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/tokenizers.svg?color=blue">
10 |     </a>
11 |     <a href="https://docs.rs/tokenizers/">
12 |         <img alt="Doc" src="https://docs.rs/tokenizers/badge.svg">
13 |     </a>
14 | </p>
15 | <br>
16 | 
17 | 
18 | {{readme}}


--------------------------------------------------------------------------------
/tokenizers/benches/common/mod.rs:
--------------------------------------------------------------------------------
 1 | use std::time::{Duration, Instant};
 2 | 
 3 | use std::hint::black_box;
 4 | 
 5 | use tokenizers::{
 6 |     Decoder, EncodeInput, Model, Normalizer, PostProcessor, PreTokenizer, TokenizerImpl, Trainer,
 7 | };
 8 | 
 9 | #[allow(dead_code)]
10 | pub fn iter_bench_encode<M, N, PT, PP, D>(
11 |     iters: u64,
12 |     tokenizer: &TokenizerImpl<M, N, PT, PP, D>,
13 |     lines: &[EncodeInput],
14 | ) -> Duration
15 | where
16 |     M: Model,
17 |     N: Normalizer,
18 |     PT: PreTokenizer,
19 |     PP: PostProcessor,
20 |     D: Decoder,
21 | {
22 |     let mut duration = Duration::new(0, 0);
23 |     for _i in 0..iters {
24 |         for line in lines {
25 |             let input = line.clone();
26 |             let start = Instant::now();
27 |             let _ = black_box(tokenizer.encode(input, false));
28 |             duration = duration.checked_add(start.elapsed()).unwrap();
29 |         }
30 |     }
31 |     duration
32 | }
33 | 
34 | #[allow(dead_code)]
35 | pub fn iter_bench_encode_batch<M, N, PT, PP, D>(
36 |     iters: u64,
37 |     tokenizer: &TokenizerImpl<M, N, PT, PP, D>,
38 |     batches: &[Vec<EncodeInput>],
39 | ) -> Duration
40 | where
41 |     M: Model + Send + Sync,
42 |     N: Normalizer + Send + Sync,
43 |     PT: PreTokenizer + Send + Sync,
44 |     PP: PostProcessor + Send + Sync,
45 |     D: Decoder + Send + Sync,
46 | {
47 |     let mut duration = Duration::new(0, 0);
48 |     for _i in 0..iters {
49 |         for batch in batches {
50 |             let batch = batch.clone();
51 |             let start = Instant::now();
52 |             let _ = black_box(tokenizer.encode_batch(batch, false));
53 |             duration = duration.checked_add(start.elapsed()).unwrap();
54 |         }
55 |     }
56 |     duration
57 | }
58 | 
59 | #[allow(dead_code)]
60 | pub fn iter_bench_train<T, M, N, PT, PP, D>(
61 |     iters: u64,
62 |     tokenizer: &mut TokenizerImpl<M, N, PT, PP, D>,
63 |     trainer: &mut T,
64 |     files: Vec<String>,
65 | ) -> Duration
66 | where
67 |     T: Trainer<Model = M> + Sync,
68 |     M: Model + Send + Sync,
69 |     N: Normalizer + Send + Sync,
70 |     PT: PreTokenizer + Send + Sync,
71 |     PP: PostProcessor + Send + Sync,
72 |     D: Decoder + Send + Sync,
73 | {
74 |     let mut duration = Duration::new(0, 0);
75 |     for _i in 0..iters {
76 |         let start = Instant::now();
77 |         tokenizer.train_from_files(trainer, files.clone()).unwrap();
78 |         duration = duration.checked_add(start.elapsed()).unwrap();
79 |     }
80 |     duration
81 | }
82 | 


--------------------------------------------------------------------------------
/tokenizers/benches/layout_benchmark.rs:
--------------------------------------------------------------------------------
 1 | #[macro_use]
 2 | extern crate criterion;
 3 | 
 4 | use std::fs::File;
 5 | use std::io::{BufRead, BufReader};
 6 | use std::path::Path;
 7 | use std::time::{Duration, Instant};
 8 | 
 9 | use criterion::Criterion;
10 | use std::hint::black_box;
11 | use tokenizers::processors::template::TemplateProcessing;
12 | use tokenizers::{EncodeInput, Encoding, PostProcessor, Tokenizer};
13 | 
14 | /// Simple TemplateProcessing
15 | fn create_processor() -> TemplateProcessing {
16 |     TemplateProcessing::builder()
17 |         .try_single("[CLS]:0 $A:0 [SEP]:0")
18 |         .unwrap()
19 |         .try_pair("[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1")
20 |         .unwrap()
21 |         .special_tokens(vec![("[CLS]", 0), ("[SEP]", 1)])
22 |         .build()
23 |         .unwrap()
24 | }
25 | 
26 | pub fn bench_layout(c: &mut Criterion) {
27 |     let processor = create_processor();
28 |     let tokenizer = Tokenizer::from_file("data/albert-base-v1-tokenizer.json").unwrap();
29 |     let mut encodeds: Vec<Encoding> = vec![];
30 |     for line in BufReader::new(File::open(Path::new("data/big.txt")).unwrap()).lines() {
31 |         let line: EncodeInput = line.unwrap().into();
32 | 
33 |         let encoded: Encoding = tokenizer.encode(line, false).unwrap();
34 |         encodeds.push(encoded);
35 |     }
36 | 
37 |     c.bench_function("TemplateProcessing single encode", |b| {
38 |         b.iter_custom(|iters| {
39 |             let mut duration = Duration::new(0, 0);
40 |             for i in 0..iters as usize {
41 |                 let encoded_index = i % encodeds.len();
42 |                 let encoded: Encoding = encodeds[encoded_index].clone();
43 | 
44 |                 let start = Instant::now();
45 |                 let _ = black_box(processor.process(encoded, None, false));
46 |                 duration = duration.checked_add(start.elapsed()).unwrap();
47 |             }
48 |             duration
49 |         })
50 |     });
51 |     c.bench_function("TemplateProcessing pair encode", |b| {
52 |         b.iter_custom(|iters| {
53 |             let mut duration = Duration::new(0, 0);
54 |             for i in 0..iters as usize {
55 |                 let encoded_index = i % encodeds.len();
56 |                 let encoded: Encoding = encodeds[encoded_index].clone();
57 | 
58 |                 let encoded_index2 = (i + 1) % encodeds.len();
59 |                 let pair: Encoding = encodeds[encoded_index2].clone();
60 | 
61 |                 let start = Instant::now();
62 |                 let _ = black_box(processor.process(encoded, Some(pair), false));
63 |                 duration = duration.checked_add(start.elapsed()).unwrap();
64 |             }
65 |             duration
66 |         })
67 |     });
68 | }
69 | 
70 | criterion_group! {
71 |     name = layout_benches;
72 |     config = Criterion::default().sample_size(20);
73 |     targets = bench_layout
74 | }
75 | 
76 | criterion_main!(layout_benches);
77 | 


--------------------------------------------------------------------------------
/tokenizers/benches/llama3_benchmark.rs:
--------------------------------------------------------------------------------
 1 | #[macro_use]
 2 | extern crate criterion;
 3 | 
 4 | mod common;
 5 | 
 6 | use common::{iter_bench_encode, iter_bench_encode_batch, iter_bench_train};
 7 | use criterion::{Criterion, Throughput};
 8 | use std::hint::black_box;
 9 | use tokenizers::{
10 |     models::{bpe::BpeTrainerBuilder, TrainerWrapper},
11 |     EncodeInput, Tokenizer,
12 | };
13 | 
14 | static BATCH_SIZE: usize = 1_000;
15 | 
16 | pub fn llama3(c: &mut Criterion) {
17 |     let data = std::fs::read_to_string("data/big.txt").unwrap();
18 |     let mut group = c.benchmark_group("llama3-encode");
19 |     group.throughput(Throughput::Bytes(data.len() as u64));
20 |     let mut lines: Vec<EncodeInput> = vec![];
21 |     let mut batches: Vec<Vec<EncodeInput>> = vec![vec![]];
22 |     for line in data.lines() {
23 |         let line: EncodeInput = line.into();
24 |         lines.push(line.clone());
25 |         if batches.last().unwrap().len() >= BATCH_SIZE {
26 |             batches.push(vec![]);
27 |         }
28 |         batches.last_mut().unwrap().push(line);
29 |     }
30 |     let tokenizer = Tokenizer::from_file("data/llama-3-tokenizer.json").unwrap();
31 |     group.bench_function("llama3-offsets", |b| {
32 |         let data: Vec<_> = data.lines().collect();
33 |         let add_special_tokens = false;
34 |         b.iter(|| {
35 |             tokenizer
36 |                 .encode_batch_char_offsets(black_box(data.clone()), add_special_tokens)
37 |                 .unwrap()
38 |         })
39 |     });
40 |     group.bench_function("llama3-encode", |b| {
41 |         b.iter_custom(|iters| iter_bench_encode(iters, &tokenizer, &lines))
42 |     });
43 |     group.bench_function("llama3-batch", |b| {
44 |         b.iter_custom(|iters| iter_bench_encode_batch(iters, &tokenizer, &batches))
45 |     });
46 |     let mut trainer: TrainerWrapper = BpeTrainerBuilder::default()
47 |         .show_progress(false)
48 |         .build()
49 |         .into();
50 |     let mut tokenizer = Tokenizer::from_file("data/llama-3-tokenizer.json").unwrap();
51 |     group.bench_function("BPE Train vocabulary (big)", |b| {
52 |         b.iter_custom(|iters| {
53 |             iter_bench_train(
54 |                 iters,
55 |                 &mut tokenizer,
56 |                 &mut trainer,
57 |                 vec!["data/big.txt".to_string()],
58 |             )
59 |         })
60 |     });
61 |     group.finish();
62 | }
63 | 
64 | criterion_group! {
65 |     name = llama_3;
66 |     config = Criterion::default().sample_size(10);
67 |     targets = llama3
68 | }
69 | 
70 | criterion_main!(llama_3);
71 | 


--------------------------------------------------------------------------------
/tokenizers/benches/unigram_benchmark.rs:
--------------------------------------------------------------------------------
 1 | #[macro_use]
 2 | extern crate criterion;
 3 | 
 4 | mod common;
 5 | 
 6 | use common::iter_bench_train;
 7 | 
 8 | use criterion::{Criterion, Throughput};
 9 | use tokenizers::models::unigram::{Unigram, UnigramTrainerBuilder};
10 | use tokenizers::models::TrainerWrapper;
11 | use tokenizers::pre_tokenizers::whitespace::Whitespace;
12 | use tokenizers::Tokenizer;
13 | 
14 | // pub fn bench_train(c: &mut Criterion) {
15 | //     let trainer = UnigramTrainer::builder()
16 | //         .show_progress(false)
17 | //         .unk_token(Some("<UNK>".into()))
18 | //         .build()
19 | //         .unwrap();
20 | //
21 | //     let mut model = Unigram::default();
22 | //
23 | //     let content = read_to_string("data/big.txt").unwrap();
24 | //     c.bench_function("Unigram Train vocabulary (medium)", |b| {
25 | //         b.iter_custom(|iters| {
26 | //             let mut duration = Duration::new(0, 0);
27 | //             for _i in 0..iters {
28 | //                 let sentences = sentences.clone();
29 | //                 let start = Instant::now();
30 | //                 trainer.do_train(sentences, &mut model).unwrap();
31 | //                 duration = duration.checked_add(start.elapsed()).unwrap();
32 | //             }
33 | //             duration
34 | //         })
35 | //     });
36 | // }
37 | fn bench_train(c: &mut Criterion) {
38 |     let mut trainer: TrainerWrapper = UnigramTrainerBuilder::default()
39 |         .show_progress(false)
40 |         .build()
41 |         .unwrap()
42 |         .into();
43 |     let mut tokenizer = Tokenizer::new(Unigram::default()).into_inner();
44 |     tokenizer.with_pre_tokenizer(Some(Whitespace {}));
45 |     let mut group = c.benchmark_group("unigram-train-large");
46 |     let data = std::fs::read_to_string("data/big.txt").unwrap();
47 |     group.throughput(Throughput::Bytes(data.len() as u64));
48 |     group.bench_function("Unigram Train vocabulary (big)", |b| {
49 |         b.iter_custom(|iters| {
50 |             iter_bench_train(
51 |                 iters,
52 |                 &mut tokenizer,
53 |                 &mut trainer,
54 |                 vec!["data/big.txt".to_string()],
55 |             )
56 |         })
57 |     });
58 | }
59 | 
60 | criterion_group! {
61 |     name = benches_train;
62 |     config = Criterion::default().sample_size(10);
63 |     targets = bench_train
64 | }
65 | 
66 | criterion_main!(benches_train);
67 | 


--------------------------------------------------------------------------------
/tokenizers/examples/encode_batch.rs:
--------------------------------------------------------------------------------
 1 | use tokenizers::Tokenizer;
 2 | 
 3 | fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
 4 |     let tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None)?;
 5 | 
 6 |     let data = std::fs::read_to_string("data/big.txt")?;
 7 |     let data: Vec<_> = data.lines().collect();
 8 |     let add_special_tokens = false;
 9 |     tokenizer.encode_batch_char_offsets(data, add_special_tokens)?;
10 |     Ok(())
11 | }
12 | 


--------------------------------------------------------------------------------
/tokenizers/examples/serialization.rs:
--------------------------------------------------------------------------------
 1 | use tokenizers::models::wordpiece::WordPiece;
 2 | use tokenizers::{AddedToken, Tokenizer};
 3 | 
 4 | fn main() {
 5 |     let start = std::time::Instant::now();
 6 |     let mut tokenizer = Tokenizer::new(WordPiece::default());
 7 | 
 8 |     // Mix special and non-special tokens
 9 |     // so we can check that ids stay in order and that the special status is preserved.
10 |     let tokens: Vec<_> = (0..120_000)
11 |         .map(|i| AddedToken::from(format!("[SPECIAL_{i}]"), i % 2 == 0))
12 |         .collect();
13 |     tokenizer.add_tokens(&tokens);
14 |     tokenizer.save("_tok.json", true).unwrap();
15 |     println!("Save took {:?}", start.elapsed());
16 |     let start = std::time::Instant::now();
17 |     let _tok = Tokenizer::from_file("_tok.json").unwrap();
18 |     println!("Took {:?}", start.elapsed());
19 |     std::fs::remove_file("_tok.json").unwrap();
20 | }
21 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | **/*.rs.bk
3 | Cargo.lock
4 | bin/
5 | pkg/
6 | wasm-pack.log
7 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "unstable_wasm"
 3 | version = "0.1.0"
 4 | authors = ["Nicolas Patry"]
 5 | edition = "2018"
 6 | 
 7 | [lib]
 8 | crate-type = ["cdylib", "rlib"]
 9 | 
10 | [features]
11 | default = ["console_error_panic_hook"]
12 | 
13 | [dependencies]
14 | wasm-bindgen = "0.2.63"
15 | 
16 | # The `console_error_panic_hook` crate provides better debugging of panics by
17 | # logging them with `console.error`. This is great for development, but requires
18 | # all the `std::fmt` and `std::panicking` infrastructure, so isn't great for
19 | # code size when deploying.
20 | console_error_panic_hook = { version = "0.1.6", optional = true }
21 | 
22 | # `wee_alloc` is a tiny allocator for wasm that is only ~1K in code size
23 | # compared to the default allocator's ~10K. It is slower than the default
24 | # allocator, however.
25 | #
26 | # Unfortunately, `wee_alloc` requires nightly Rust when targeting wasm for now.
27 | wee_alloc = { version = "0.4.5", optional = true }
28 | 
29 | tokenizers = { path = "../../", default-features=false, features = ["unstable_wasm"]}
30 | 
31 | [dev-dependencies]
32 | wasm-bindgen-test = "0.3.13"
33 | 
34 | [profile.release]
35 | # Tell `rustc` to optimize for small code size.
36 | opt-level = "s"
37 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/README.md:
--------------------------------------------------------------------------------
 1 | <div align="center">
 2 | 
 3 |   <h1><code>wasm-pack-template</code></h1>
 4 | 
 5 |   <strong>A template for kick-starting a Rust and WebAssembly project using <a href="https://github.com/rustwasm/wasm-pack">wasm-pack</a>.</strong>
 6 | 
 7 |   <p>
 8 |     <a href="https://travis-ci.org/rustwasm/wasm-pack-template"><img src="https://img.shields.io/travis/rustwasm/wasm-pack-template.svg?style=flat-square" alt="Build Status" /></a>
 9 |   </p>
10 | 
11 |   <h3>
12 |     <a href="https://rustwasm.github.io/docs/wasm-pack/tutorials/npm-browser-packages/index.html">Tutorial</a>
13 |     <span> | </span>
14 |     <a href="https://discordapp.com/channels/442252698964721669/443151097398296587">Chat</a>
15 |   </h3>
16 | 
17 |   <sub>Built with 🦀🕸 by <a href="https://rustwasm.github.io/">The Rust and WebAssembly Working Group</a></sub>
18 | </div>
19 | 
20 | ## About
21 | 
22 | 
23 | This is an example project showing off a very basic use case of using `tokenizers`
24 | from `wasm`.
25 | 
26 | [**📚 Read this template tutorial! 📚**][template-docs]
27 | 
28 | This template is designed for compiling Rust libraries into WebAssembly and
29 | publishing the resulting package to NPM.
30 | 
31 | Be sure to check out [other `wasm-pack` tutorials online][tutorials] for other
32 | templates and usages of `wasm-pack`.
33 | 
34 | [tutorials]: https://rustwasm.github.io/docs/wasm-pack/tutorials/index.html
35 | [template-docs]: https://rustwasm.github.io/docs/wasm-pack/tutorials/npm-browser-packages/index.html
36 | 
37 | ## 🚴 Usage
38 | 
39 | ### 🐑 Use `cargo generate` to Clone this Template
40 | 
41 | [Learn more about `cargo generate` here.](https://github.com/ashleygwilliams/cargo-generate)
42 | 
43 | ```
44 | cargo generate --git https://github.com/rustwasm/wasm-pack-template.git --name my-project
45 | cd my-project
46 | ```
47 | 
48 | ### 🛠️ Build with `wasm-pack build`
49 | 
50 | ```
51 | wasm-pack build
52 | ```
53 | 
54 | ### 🔬 Test in Headless Browsers with `wasm-pack test`
55 | 
56 | ```
57 | wasm-pack test --headless --firefox
58 | ```
59 | 
60 | ### 🎁 Publish to NPM with `wasm-pack publish`
61 | 
62 | ```
63 | wasm-pack publish
64 | ```
65 | 
66 | ## 🔋 Batteries Included
67 | 
68 | * [`wasm-bindgen`](https://github.com/rustwasm/wasm-bindgen) for communicating
69 |   between WebAssembly and JavaScript.
70 | * [`console_error_panic_hook`](https://github.com/rustwasm/console_error_panic_hook)
71 |   for logging panic messages to the developer console.
72 | * [`wee_alloc`](https://github.com/rustwasm/wee_alloc), an allocator optimized
73 |   for small code size.
74 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/src/lib.rs:
--------------------------------------------------------------------------------
 1 | mod utils;
 2 | use tokenizers::models::bpe::{Vocab, BPE};
 3 | use tokenizers::Tokenizer;
 4 | 
 5 | use wasm_bindgen::prelude::*;
 6 | 
 7 | // When the `wee_alloc` feature is enabled, use `wee_alloc` as the global
 8 | // allocator.
 9 | #[cfg(feature = "wee_alloc")]
10 | #[global_allocator]
11 | static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT;
12 | 
13 | #[wasm_bindgen]
14 | pub fn tokenize(string: &str) -> Vec<u32> {
15 |     let vocab: Vocab = vec![
16 |         ("a".to_string(), 0),
17 |         ("##b".to_string(), 1),
18 |         ("##c".to_string(), 2),
19 |         ("ab".to_string(), 3),
20 |         ("abc".to_string(), 4),
21 |     ]
22 |     .into_iter()
23 |     .collect();
24 | 
25 |     let merges = vec![
26 |         ("a".to_string(), "##b".to_string()),
27 |         ("ab".to_string(), "##c".to_string()),
28 |     ];
29 | 
30 |     let bpe = BPE::builder()
31 |         .vocab_and_merges(vocab, merges)
32 |         .unk_token("[UNK]".to_string())
33 |         .continuing_subword_prefix("##".to_string())
34 |         .build()
35 |         .unwrap();
36 |     let tokenizer = Tokenizer::new(bpe);
37 |     tokenizer
38 |         .encode(string, false)
39 |         .unwrap()
40 |         .get_ids()
41 |         .into_iter()
42 |         .cloned()
43 |         .collect()
44 | }
45 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/src/utils.rs:
--------------------------------------------------------------------------------
 1 | pub fn set_panic_hook() {
 2 |     // When the `console_error_panic_hook` feature is enabled, we can call the
 3 |     // `set_panic_hook` function at least once during initialization, and then
 4 |     // we will get better error messages if our code ever panics.
 5 |     //
 6 |     // For more details see
 7 |     // https://github.com/rustwasm/console_error_panic_hook#readme
 8 |     #[cfg(feature = "console_error_panic_hook")]
 9 |     console_error_panic_hook::set_once();
10 | }
11 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/tests/web.rs:
--------------------------------------------------------------------------------
 1 | //! Test suite for the Web and headless browsers.
 2 | 
 3 | #![cfg(target_arch = "wasm32")]
 4 | 
 5 | extern crate wasm_bindgen_test;
 6 | use wasm_bindgen_test::*;
 7 | 
 8 | wasm_bindgen_test_configure!(run_in_browser);
 9 | 
10 | #[wasm_bindgen_test]
11 | fn pass() {
12 |     assert_eq!(1 + 1, 2);
13 | }
14 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/www/.bin/create-wasm-app.js:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env node
 2 | 
 3 | const { spawn } = require("child_process");
 4 | const fs = require("fs");
 5 | 
 6 | let folderName = '.';
 7 | 
 8 | if (process.argv.length >= 3) {
 9 |   folderName = process.argv[2];
10 |   if (!fs.existsSync(folderName)) {
11 |     fs.mkdirSync(folderName);
12 |   }
13 | }
14 | 
15 | const clone = spawn("git", ["clone", "https://github.com/rustwasm/create-wasm-app.git", folderName]);
16 | 
17 | clone.on("close", code => {
18 |   if (code !== 0) {
19 |     console.error("cloning the template failed!")
20 |     process.exit(code);
21 |   } else {
22 |     console.log("🦀 Rust + 🕸 Wasm = ❤");
23 |   }
24 | });
25 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/www/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | dist
3 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/www/.travis.yml:
--------------------------------------------------------------------------------
1 | language: node_js
2 | node_js: "10"
3 | 
4 | script:
5 |   - ./node_modules/.bin/webpack
6 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/www/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | Copyright (c) [year] [name]
 2 | 
 3 | Permission is hereby granted, free of charge, to any
 4 | person obtaining a copy of this software and associated
 5 | documentation files (the "Software"), to deal in the
 6 | Software without restriction, including without
 7 | limitation the rights to use, copy, modify, merge,
 8 | publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software
10 | is furnished to do so, subject to the following
11 | conditions:
12 | 
13 | The above copyright notice and this permission notice
14 | shall be included in all copies or substantial portions
15 | of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | DEALINGS IN THE SOFTWARE.
26 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/www/README.md:
--------------------------------------------------------------------------------
 1 | <div align="center">
 2 | 
 3 |   <h1><code>create-wasm-app</code></h1>
 4 | 
 5 |   <strong>An <code>npm init</code> template for kick-starting a project that uses NPM packages containing Rust-generated WebAssembly and bundles them with Webpack.</strong>
 6 | 
 7 |   <p>
 8 |     <a href="https://travis-ci.org/rustwasm/create-wasm-app"><img src="https://img.shields.io/travis/rustwasm/create-wasm-app.svg?style=flat-square" alt="Build Status" /></a>
 9 |   </p>
10 | 
11 |   <h3>
12 |     <a href="#usage">Usage</a>
13 |     <span> | </span>
14 |     <a href="https://discordapp.com/channels/442252698964721669/443151097398296587">Chat</a>
15 |   </h3>
16 | 
17 |   <sub>Built with 🦀🕸 by <a href="https://rustwasm.github.io/">The Rust and WebAssembly Working Group</a></sub>
18 | </div>
19 | 
20 | ## About
21 | 
22 | This template is designed for depending on NPM packages that contain
23 | Rust-generated WebAssembly and using them to create a Website.
24 | 
25 | * Want to create an NPM package with Rust and WebAssembly? [Check out
26 |   `wasm-pack-template`.](https://github.com/rustwasm/wasm-pack-template)
27 | * Want to make a monorepo-style Website without publishing to NPM? Check out
28 |   [`rust-webpack-template`](https://github.com/rustwasm/rust-webpack-template)
29 |   and/or
30 |   [`rust-parcel-template`](https://github.com/rustwasm/rust-parcel-template).
31 | 
32 | ## 🚴 Usage
33 | 
34 | ```
35 | npm init wasm-app
36 | ```
37 | 
38 | ## 🔋 Batteries Included
39 | 
40 | - `.gitignore`: ignores `node_modules`
41 | - `LICENSE-APACHE` and `LICENSE-MIT`: most Rust projects are licensed this way, so these are included for you
42 | - `README.md`: the file you are reading now!
43 | - `index.html`: a bare bones html document that includes the webpack bundle
44 | - `index.js`: example js file with a comment showing how to import and use a wasm pkg
45 | - `package.json` and `package-lock.json`:
46 |   - pulls in devDependencies for using webpack:
47 |       - [`webpack`](https://www.npmjs.com/package/webpack)
48 |       - [`webpack-cli`](https://www.npmjs.com/package/webpack-cli)
49 |       - [`webpack-dev-server`](https://www.npmjs.com/package/webpack-dev-server)
50 |   - defines a `start` script to run `webpack-dev-server`
51 | - `webpack.config.js`: configuration file for bundling your js with webpack
52 | 
53 | ## License
54 | 
55 | Licensed under either of
56 | 
57 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
58 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
59 | 
60 | at your option.
61 | 
62 | ### Contribution
63 | 
64 | Unless you explicitly state otherwise, any contribution intentionally
65 | submitted for inclusion in the work by you, as defined in the Apache-2.0
66 | license, shall be dual licensed as above, without any additional terms or
67 | conditions.
68 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/www/bootstrap.js:
--------------------------------------------------------------------------------
1 | // Any dependency graph that contains wasm must be imported
2 | // asynchronously. This `bootstrap.js` file does the single async import, so
3 | // that no one else needs to worry about it again.
4 | import("./index.js")
5 |   .catch(e => console.error("Error importing `index.js`:", e));
6 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/www/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta charset="utf-8">
 5 |     <title>Hello wasm-pack!</title>
 6 |   </head>
 7 |   <body>
 8 |     <noscript>This page contains WebAssembly and JavaScript content; please enable JavaScript in your browser.</noscript>
 9 |     <script src="./bootstrap.js"></script>
10 |   </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/www/index.js:
--------------------------------------------------------------------------------
1 | import * as wasm from "unstable_wasm";
2 | 
3 | console.log(wasm.tokenize("ab"));
4 | console.log(wasm.tokenize("abc"));
5 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/www/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "create-wasm-app",
 3 |     "version": "0.1.0",
 4 |     "description": "create an app to consume rust-generated wasm packages",
 5 |     "main": "index.js",
 6 |     "bin": {
 7 |         "create-wasm-app": ".bin/create-wasm-app.js"
 8 |     },
 9 |     "scripts": {
10 |         "build": "webpack --config webpack.config.js",
11 |         "start": "NODE_OPTIONS=--openssl-legacy-provider webpack-dev-server"
12 |     },
13 |     "repository": {
14 |         "type": "git",
15 |         "url": "git+https://github.com/rustwasm/create-wasm-app.git"
16 |     },
17 |     "keywords": ["webassembly", "wasm", "rust", "webpack"],
18 |     "author": "Ashley Williams <ashley666ashley@gmail.com>",
19 |     "license": "(MIT OR Apache-2.0)",
20 |     "bugs": {
21 |         "url": "https://github.com/rustwasm/create-wasm-app/issues"
22 |     },
23 |     "homepage": "https://github.com/rustwasm/create-wasm-app#readme",
24 |     "devDependencies": {
25 |         "copy-webpack-plugin": "^11.0.0",
26 |         "webpack": "^5.75.0",
27 |         "webpack-cli": "^5.0.1",
28 |         "webpack-dev-server": "^5.2.1"
29 |     },
30 |     "dependencies": {
31 |         "unstable_wasm": "file:../pkg"
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/tokenizers/examples/unstable_wasm/www/webpack.config.js:
--------------------------------------------------------------------------------
 1 | const CopyWebpackPlugin = require("copy-webpack-plugin");
 2 | const path = require('path');
 3 | 
 4 | module.exports = {
 5 |   entry: "./bootstrap.js",
 6 |   output: {
 7 |     path: path.resolve(__dirname, "dist"),
 8 |     filename: "bootstrap.js",
 9 |   },
10 |   mode: "development",
11 |   plugins: [
12 |     new CopyWebpackPlugin({ patterns: ['index.html'] })
13 |   ],
14 | };
15 | 


--------------------------------------------------------------------------------
/tokenizers/rust-toolchain:
--------------------------------------------------------------------------------
1 | stable
2 | 


--------------------------------------------------------------------------------
/tokenizers/src/decoders/bpe.rs:
--------------------------------------------------------------------------------
 1 | use crate::tokenizer::{Decoder, Result};
 2 | 
 3 | use serde::{Deserialize, Serialize};
 4 | 
 5 | #[derive(Deserialize, Clone, Debug, Serialize)]
 6 | /// Allows decoding the original BPE output by joining all the tokens and then replacing
 7 | /// the suffix used to identify end-of-word tokens with whitespace
 8 | #[serde(tag = "type")]
 9 | #[non_exhaustive]
10 | pub struct BPEDecoder {
11 |     pub suffix: String,
12 | }
13 | 
14 | impl BPEDecoder {
15 |     pub fn new(suffix: String) -> Self {
16 |         Self { suffix }
17 |     }
18 | }
19 | 
20 | impl Default for BPEDecoder {
21 |     fn default() -> Self {
22 |         Self::new("</w>".into())
23 |     }
24 | }
25 | 
26 | impl Decoder for BPEDecoder {
27 |     fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>> {
28 |         let n = tokens.len() - 1;
29 |         Ok(tokens
30 |             .into_iter()
31 |             .enumerate()
32 |             .map(|(i, token)| {
33 |                 let replacement = if i == n { "" } else { " " };
34 |                 token.replace(&self.suffix, replacement)
35 |             })
36 |             .collect())
37 |     }
38 | }
39 | 
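For reference, a minimal sketch of this decoder with its default `</w>` suffix, assuming the usual crate-root re-exports (`Decoder`, `Result`):

```rust
use tokenizers::decoders::bpe::BPEDecoder;
use tokenizers::Decoder;

fn main() -> tokenizers::Result<()> {
    // Default suffix is "</w>", marking end-of-word tokens.
    let decoder = BPEDecoder::default();
    let tokens = vec![
        "Hello</w>".to_string(),
        "my</w>".to_string(),
        "friend</w>".to_string(),
    ];
    // The suffix becomes a separating space and is dropped on the last token.
    assert_eq!(decoder.decode(tokens)?, "Hello my friend");
    Ok(())
}
```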


--------------------------------------------------------------------------------
/tokenizers/src/decoders/fuse.rs:
--------------------------------------------------------------------------------
 1 | use crate::tokenizer::{Decoder, Result};
 2 | use monostate::MustBe;
 3 | use serde::{Deserialize, Serialize};
 4 | 
 5 | #[derive(Clone, Debug, Serialize, Deserialize, Default)]
 6 | /// Fuse simply fuses all tokens into one big string.
 7 | /// It's usually the last decoding step anyway, but this
 8 | /// decoder exists in case some decoders need to happen after that
 9 | /// step
10 | #[non_exhaustive]
11 | pub struct Fuse {
12 |     #[serde(rename = "type")]
13 |     type_: MustBe!("Fuse"),
14 | }
15 | 
16 | impl Fuse {
17 |     pub fn new() -> Self {
18 |         Self {
19 |             type_: MustBe!("Fuse"),
20 |         }
21 |     }
22 | }
23 | 
24 | impl Decoder for Fuse {
25 |     fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>> {
26 |         let new_string = tokens.join("");
27 |         Ok(vec![new_string])
28 |     }
29 | }
30 | 
31 | #[cfg(test)]
32 | mod tests {
33 |     use super::*;
34 | 
35 |     #[test]
36 |     fn decode() {
37 |         let decoder = Fuse::new();
38 |         let res = decoder
39 |             .decode_chain(vec!["Hey".into(), " friend!".into()])
40 |             .unwrap();
41 |         assert_eq!(res, vec!["Hey friend!"]);
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/tokenizers/src/decoders/sequence.rs:
--------------------------------------------------------------------------------
 1 | use crate::decoders::DecoderWrapper;
 2 | use crate::tokenizer::{Decoder, Result};
 3 | use crate::utils::macro_rules_attribute;
 4 | use serde::{Deserialize, Serialize};
 5 | 
 6 | #[derive(Clone, Debug)]
 7 | #[macro_rules_attribute(impl_serde_type!)]
 8 | pub struct Sequence {
 9 |     decoders: Vec<DecoderWrapper>,
10 | }
11 | 
12 | impl Sequence {
13 |     pub fn new(decoders: Vec<DecoderWrapper>) -> Self {
14 |         Self { decoders }
15 |     }
16 | 
17 |     pub fn get_decoders(&self) -> &[DecoderWrapper] {
18 |         &self.decoders
19 |     }
20 | 
21 |     pub fn get_decoders_mut(&mut self) -> &mut [DecoderWrapper] {
22 |         &mut self.decoders
23 |     }
24 | }
25 | 
26 | impl Decoder for Sequence {
27 |     fn decode_chain(&self, mut tokens: Vec<String>) -> Result<Vec<String>> {
28 |         for decoder in &self.decoders {
29 |             tokens = decoder.decode_chain(tokens)?;
30 |         }
31 |         Ok(tokens)
32 |     }
33 | }
34 | 
35 | #[cfg(test)]
36 | mod tests {
37 |     use super::*;
38 |     use crate::decoders::ctc::CTC;
39 |     use crate::pre_tokenizers::metaspace::Metaspace;
40 | 
41 |     #[test]
42 |     fn sequence_basic() {
43 |         let decoders = vec![
44 |             DecoderWrapper::CTC(CTC::default()),
45 |             DecoderWrapper::Metaspace(Metaspace::default()),
46 |         ];
47 |         let decoder = Sequence::new(decoders);
48 |         let tokens: Vec<String> = vec!["▁", "▁", "H", "H", "i", "i", "▁", "y", "o", "u"]
49 |             .into_iter()
50 |             .map(|s| s.to_string())
51 |             .collect();
52 |         let out_tokens = decoder.decode(tokens).unwrap();
53 |         assert_eq!(out_tokens, "Hi you");
54 |     }
55 | }
56 | 


--------------------------------------------------------------------------------
/tokenizers/src/decoders/strip.rs:
--------------------------------------------------------------------------------
 1 | use crate::tokenizer::{Decoder, Result};
 2 | 
 3 | use serde::{Deserialize, Serialize};
 4 | 
 5 | #[derive(Deserialize, Clone, Debug, Serialize, Default)]
 6 | /// Strip removes up to `start` occurrences of the `content` character from the
 7 | /// beginning of each token and up to `stop` occurrences from its end, leaving
 8 | /// the rest of the token untouched
 9 | #[serde(tag = "type")]
10 | #[non_exhaustive]
11 | pub struct Strip {
12 |     pub content: char,
13 |     pub start: usize,
14 |     pub stop: usize,
15 | }
16 | 
17 | impl Strip {
18 |     pub fn new(content: char, start: usize, stop: usize) -> Self {
19 |         Self {
20 |             content,
21 |             start,
22 |             stop,
23 |         }
24 |     }
25 | }
26 | 
27 | impl Decoder for Strip {
28 |     fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>> {
29 |         Ok(tokens
30 |             .into_iter()
31 |             .map(|token| {
32 |                 let chars: Vec<char> = token.chars().collect();
33 | 
34 |                 let mut start_cut = 0;
35 |                 for (i, &c) in chars.iter().enumerate().take(self.start) {
36 |                     if c == self.content {
37 |                         start_cut = i + 1;
38 |                         continue;
39 |                     } else {
40 |                         break;
41 |                     }
42 |                 }
43 | 
44 |                 let mut stop_cut = chars.len();
45 |                 for i in 0..self.stop {
46 |                     let index = chars.len() - i - 1;
47 |                     if chars[index] == self.content {
48 |                         stop_cut = index;
49 |                         continue;
50 |                     } else {
51 |                         break;
52 |                     }
53 |                 }
54 | 
55 |                 let new_token: String = chars[start_cut..stop_cut].iter().collect();
56 |                 new_token
57 |             })
58 |             .collect())
59 |     }
60 | }
61 | 
62 | #[cfg(test)]
63 | mod tests {
64 |     use super::*;
65 | 
66 |     #[test]
67 |     fn decode() {
68 |         let decoder = Strip::new('H', 1, 0);
69 |         let res = decoder
70 |             .decode_chain(vec!["Hey".into(), " friend!".into(), "HHH".into()])
71 |             .unwrap();
72 |         assert_eq!(res, vec!["ey", " friend!", "HH"]);
73 | 
74 |         let decoder = Strip::new('y', 0, 1);
75 |         let res = decoder
76 |             .decode_chain(vec!["Hey".into(), " friend!".into()])
77 |             .unwrap();
78 |         assert_eq!(res, vec!["He", " friend!"]);
79 |     }
80 | }
81 | 


--------------------------------------------------------------------------------
/tokenizers/src/decoders/wordpiece.rs:
--------------------------------------------------------------------------------
 1 | use crate::tokenizer::{Decoder, Result};
 2 | 
 3 | use serde::{Deserialize, Serialize};
 4 | 
 5 | #[derive(Deserialize, Clone, Debug, Serialize)]
 6 | /// The WordPiece decoder takes care of decoding a list of wordpiece tokens
 7 | /// back into a readable string.
 8 | #[serde(tag = "type")]
 9 | #[non_exhaustive]
10 | pub struct WordPiece {
11 |     /// The prefix to be used for continuing subwords
12 |     pub prefix: String,
13 |     /// Whether to clean up some tokenization artifacts (spaces before punctuation, ...)
14 |     pub cleanup: bool,
15 | }
16 | 
17 | impl WordPiece {
18 |     pub fn new(prefix: String, cleanup: bool) -> Self {
19 |         Self { prefix, cleanup }
20 |     }
21 | }
22 | 
23 | impl Default for WordPiece {
24 |     fn default() -> Self {
25 |         Self {
26 |             prefix: "##".to_owned(),
27 |             cleanup: true,
28 |         }
29 |     }
30 | }
31 | pub fn cleanup(dirty_input: &str) -> String {
32 |     dirty_input
33 |         .replace(" .", ".")
34 |         .replace(" ?", "?")
35 |         .replace(" !", "!")
36 |         .replace(" ,", ",")
37 |         .replace(" ' ", "'")
38 |         .replace(" n't", "n't")
39 |         .replace(" 'm", "'m")
40 |         .replace(" do not", " don't")
41 |         .replace(" 's", "'s")
42 |         .replace(" 've", "'ve")
43 |         .replace(" 're", "'re")
44 | }
45 | 
46 | impl Decoder for WordPiece {
47 |     fn decode_chain(&self, mut tokens: Vec<String>) -> Result<Vec<String>> {
48 |         for (i, token) in tokens.iter_mut().enumerate() {
49 |             if i != 0 {
50 |                 if let Some(tk) = token.strip_prefix(&self.prefix) {
51 |                     *token = tk.to_string();
52 |                 } else {
53 |                     *token = format!(" {token}");
54 |                 }
55 |             }
56 |             if self.cleanup {
57 |                 *token = cleanup(token);
58 |             }
59 |         }
60 |         Ok(tokens)
61 |     }
62 | }
63 | 
64 | #[cfg(test)]
65 | mod tests {
66 |     use super::*;
67 | 
68 |     #[test]
69 |     fn wordpiece_decoder() {
70 |         let decoder = WordPiece::new("##".to_string(), false);
71 | 
72 |         assert_eq!(
73 |             decoder
74 |                 .decode(vec![
75 |                     "##uelo".to_string(),
76 |                     "Ara".to_string(),
77 |                     "##új".to_string(),
78 |                     "##o".to_string(),
79 |                     "No".to_string(),
80 |                     "##guera".to_string()
81 |                 ])
82 |                 .unwrap(),
83 |             "##uelo Araújo Noguera"
84 |         );
85 |     }
86 | }
87 | 
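A minimal sketch of the default configuration (prefix `##`, `cleanup` enabled), following directly from `decode_chain` and `cleanup` above:

```rust
use tokenizers::decoders::wordpiece::WordPiece;
use tokenizers::Decoder;

fn main() -> tokenizers::Result<()> {
    let decoder = WordPiece::default();
    let tokens: Vec<String> = vec!["Hey".into(), "friend".into(), "##s".into(), "!".into()];
    // "##s" is glued onto the previous token; `cleanup` removes the space before "!".
    assert_eq!(decoder.decode(tokens)?, "Hey friends!");
    Ok(())
}
```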


--------------------------------------------------------------------------------
/tokenizers/src/models/bpe/mod.rs:
--------------------------------------------------------------------------------
 1 | //! [Byte Pair Encoding](https://www.aclweb.org/anthology/P16-1162/) model.
 2 | use std::{iter, mem};
 3 | 
 4 | mod model;
 5 | mod serialization;
 6 | pub mod trainer;
 7 | mod word;
 8 | 
 9 | type Pair = (u32, u32);
10 | 
11 | /// Errors that can be encountered while using or constructing a `BPE` model.
12 | #[derive(thiserror::Error, Debug)]
13 | pub enum Error {
14 |     /// An error encountered mainly while reading files.
15 |     #[error("IoError: {0}")]
16 |     Io(#[from] std::io::Error),
17 |     /// An error forwarded from Serde, while parsing JSON
18 |     #[error("JsonError: {0}")]
19 |     JsonError(#[from] serde_json::Error),
20 |     /// When the vocab.json file is in the wrong format
21 |     #[error("Bad vocabulary json file")]
22 |     BadVocabulary,
23 |     /// When the merges.txt file is in the wrong format. This error holds the line
24 |     /// number of the line that caused the error.
25 |     #[error("Merges text file invalid at line {0}")]
26 |     BadMerges(usize),
27 |     /// If a token found in merges is not in the vocab
28 |     #[error("Token `{0}` out of vocabulary")]
29 |     MergeTokenOutOfVocabulary(String),
30 |     /// If the provided unk token is out of vocabulary
31 |     #[error("Unk token `{0}` not found in the vocabulary")]
32 |     UnkTokenOutOfVocabulary(String),
33 |     /// Dropout not between 0 and 1.
34 |     #[error("Dropout should be between 0 and 1, inclusive")]
35 |     InvalidDropout,
36 | }
37 | 
38 | /// Provides access to the `FirstLastIterator` for any Iterator
39 | pub(crate) trait WithFirstLastIterator: Iterator + Sized {
40 |     fn with_first_and_last(self) -> FirstLastIterator<Self>;
41 | }
42 | 
43 | impl<I> WithFirstLastIterator for I
44 | where
45 |     I: Iterator,
46 | {
47 |     fn with_first_and_last(self) -> FirstLastIterator<Self> {
48 |         FirstLastIterator {
49 |             first: true,
50 |             iter: self.peekable(),
51 |         }
52 |     }
53 | }
54 | 
55 | /// Provides information about whether an item is the first and/or the last of the iterator
56 | pub(crate) struct FirstLastIterator<I>
57 | where
58 |     I: Iterator,
59 | {
60 |     first: bool,
61 |     iter: iter::Peekable<I>,
62 | }
63 | 
64 | impl<I> Iterator for FirstLastIterator<I>
65 | where
66 |     I: Iterator,
67 | {
68 |     /// (is_first, is_last, item)
69 |     type Item = (bool, bool, I::Item);
70 | 
71 |     fn next(&mut self) -> Option<Self::Item> {
72 |         let first = mem::replace(&mut self.first, false);
73 |         self.iter
74 |             .next()
75 |             .map(|e| (first, self.iter.peek().is_none(), e))
76 |     }
77 | }
78 | 
79 | // Re-export
80 | pub use model::*;
81 | pub use trainer::*;
82 | use word::*;
83 | 
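Since `WithFirstLastIterator` is `pub(crate)`, it cannot be demonstrated from outside the crate; a hypothetical in-crate test sketches what the adapter yields:

```rust
// Hypothetical test living next to the module above (illustrative only).
#[cfg(test)]
mod first_last_sketch {
    use super::WithFirstLastIterator;

    #[test]
    fn flags() {
        let tagged: Vec<(bool, bool, char)> = "abc".chars().with_first_and_last().collect();
        assert_eq!(
            tagged,
            vec![(true, false, 'a'), (false, false, 'b'), (false, true, 'c')]
        );
    }
}
```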


--------------------------------------------------------------------------------
/tokenizers/src/models/unigram/mod.rs:
--------------------------------------------------------------------------------
 1 | //! [Unigram](https://arxiv.org/abs/1804.10959) model.
 2 | mod lattice;
 3 | mod model;
 4 | mod serialization;
 5 | mod trainer;
 6 | mod trie;
 7 | 
 8 | pub use lattice::*;
 9 | pub use model::*;
10 | pub use trainer::*;
11 | 


--------------------------------------------------------------------------------
/tokenizers/src/models/unigram/trie.rs:
--------------------------------------------------------------------------------
 1 | use ahash::AHashMap;
 2 | use std::hash::Hash;
 3 | 
 4 | #[derive(Default)]
 5 | pub struct TrieBuilder<Label> {
 6 |     trie: Trie<Label>,
 7 | }
 8 | 
 9 | impl<Label: Eq + Hash + Copy> TrieBuilder<Label> {
10 |     pub fn push(&mut self, element: &[Label]) {
11 |         self.trie.push(element);
12 |     }
13 | 
14 |     pub fn build(self) -> Trie<Label> {
15 |         self.trie
16 |     }
17 | }
18 | 
19 | #[derive(Clone)]
20 | pub struct Trie<Label> {
21 |     root: Node<Label>,
22 | }
23 | 
24 | impl<Label: Eq + Hash + Copy> Trie<Label> {
25 |     pub fn push(&mut self, element: &[Label]) {
26 |         let mut node = &mut self.root;
27 |         for label in element.iter() {
28 |             node = node.children.entry(*label).or_default();
29 |         }
30 |         node.is_leaf = true;
31 |     }
32 | 
33 |     pub fn common_prefix_search<T>(&self, iterator: T) -> TrieIterator<Label, T>
34 |     where
35 |         T: Iterator<Item = Label>,
36 |     {
37 |         TrieIterator {
38 |             node: &self.root,
39 |             prefix: vec![],
40 |             iterator,
41 |         }
42 |     }
43 | }
44 | 
45 | pub struct TrieIterator<'a, Label, T> {
46 |     node: &'a Node<Label>,
47 |     prefix: Vec<Label>,
48 |     iterator: T,
49 | }
50 | 
51 | impl<Label, T> Iterator for TrieIterator<'_, Label, T>
52 | where
53 |     Label: Eq + Hash + Copy,
54 |     T: Iterator<Item = Label>,
55 | {
56 |     type Item = Vec<Label>;
57 |     fn next(&mut self) -> Option<Self::Item> {
58 |         loop {
59 |             let label = self.iterator.next()?;
60 |             self.prefix.push(label);
61 |             let child = self.node.children.get(&label)?;
62 |             self.node = child;
63 |             if self.node.is_leaf {
64 |                 return Some(self.prefix.clone());
65 |             }
66 |         }
67 |     }
68 | }
69 | 
70 | impl<Label> Default for Trie<Label> {
71 |     fn default() -> Self {
72 |         Self {
73 |             root: Node::default(),
74 |         }
75 |     }
76 | }
77 | 
78 | #[derive(Clone)]
79 | pub struct Node<Label> {
80 |     is_leaf: bool,
81 |     children: AHashMap<Label, Node<Label>>,
82 | }
83 | 
84 | impl<Label> Default for Node<Label> {
85 |     fn default() -> Self {
86 |         Self {
87 |             is_leaf: false,
88 |             children: AHashMap::new(),
89 |         }
90 |     }
91 | }
92 | 
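The module ships no tests here, so a hypothetical in-module test sketches how the trie is used for common-prefix lookups:

```rust
// Hypothetical test placed inside trie.rs (illustrative only).
#[cfg(test)]
mod trie_sketch {
    use super::*;

    #[test]
    fn common_prefixes() {
        let mut builder = TrieBuilder::default();
        builder.push(&['a', 'b']);
        builder.push(&['a', 'b', 'c']);
        let trie = builder.build();
        // Every inserted entry that is a prefix of "abcd".
        let hits: Vec<Vec<char>> = trie.common_prefix_search("abcd".chars()).collect();
        assert_eq!(hits, vec![vec!['a', 'b'], vec!['a', 'b', 'c']]);
    }
}
```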


--------------------------------------------------------------------------------
/tokenizers/src/normalizers/precompiled.rs:
--------------------------------------------------------------------------------
 1 | use crate::tokenizer::{NormalizedString, Normalizer, Result};
 2 | pub use spm_precompiled::Precompiled;
 3 | use std::cmp::Ordering;
 4 | use unicode_segmentation::UnicodeSegmentation;
 5 | 
 6 | fn replace(transformations: &mut Vec<(char, isize)>, old_part: &str, new_part: &str) {
 7 |     let old_count = old_part.chars().count() as isize;
 8 |     let new_count = new_part.chars().count() as isize;
 9 |     let diff = new_count - old_count;
10 | 
11 |     // If we are just replacing characters, all changes should be == 0
12 |     transformations.extend(new_part.chars().map(|c| (c, 0)));
13 | 
14 |     match diff.cmp(&0) {
15 |         // If we are adding some characters, the last DIFF characters should be == 1
16 |         Ordering::Greater => {
17 |             transformations
18 |                 .iter_mut()
19 |                 .rev()
20 |                 .take(diff as usize)
21 |                 .for_each(|(_, cs)| *cs = 1);
22 |         }
23 |         // If we are removing some characters, the last one should include the diff
24 |         Ordering::Less => {
25 |             if let Some((_, cs)) = transformations.last_mut() {
26 |                 *cs += diff;
27 |             }
28 |         }
29 |         _ => {}
30 |     }
31 | }
32 | 
33 | impl Normalizer for Precompiled {
34 |     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
35 |         let mut transformations = Vec::with_capacity(normalized.get().len());
36 |         // Future reader. From @Narsil.
37 |         // Yes, this is weird,
38 |         // Yes, this seems broken
39 |         // No, I don't know why Google did this.
40 |         // If you question this code, check this normalizer against
41 |         // XNLI database (all languages) with Unigram model against
42 | // Mbart, XLMRoberta *AND* Marian. If you don't get 100%, or you
43 | // break a single test,
44 | // you don't pass.
45 |         let mut modified = false;
46 |         normalized.get().graphemes(true).for_each(|grapheme| {
47 |             if grapheme.len() < 6 {
48 |                 if let Some(norm) = self.transform(grapheme) {
49 |                     modified = true;
50 |                     replace(&mut transformations, grapheme, norm);
51 |                     return;
52 |                 }
53 |             }
54 |             for (char_index, c) in grapheme.char_indices() {
55 |                 let part = &grapheme[char_index..char_index + c.len_utf8()];
56 |                 if let Some(norm) = self.transform(part) {
57 |                     modified = true;
58 |                     replace(&mut transformations, part, norm);
59 |                 } else {
60 |                     transformations.push((c, 0));
61 |                 }
62 |             }
63 |         });
64 |         if modified {
65 |             normalized.transform(transformations, 0);
66 |         }
67 |         Ok(())
68 |     }
69 | }
70 | 
71 | #[cfg(test)]
72 | mod tests {
73 |     use super::*;
74 | 
75 |     #[test]
76 |     fn expansion_followed_by_removal() {
77 |         // Simulate transformations from "™\x1eg" to "TMg"
78 |         let mut transformations = vec![];
79 | 
80 |         let mut n = NormalizedString::from("™\x1eg");
81 |         replace(&mut transformations, "™", "TM");
82 |         replace(&mut transformations, "\x1e", "");
83 |         transformations.push(('g', 0));
84 | 
85 |         n.transform(transformations, 0);
86 | 
87 |         assert_eq!(n.get(), "TMg");
88 |     }
89 | }
90 | 


--------------------------------------------------------------------------------
/tokenizers/src/normalizers/prepend.rs:
--------------------------------------------------------------------------------
 1 | use crate::tokenizer::{NormalizedString, Normalizer, Result};
 2 | use serde::{Deserialize, Serialize};
 3 | 
 4 | #[derive(Clone, Debug, Deserialize, Serialize)]
 5 | #[serde(tag = "type")]
 6 | pub struct Prepend {
 7 |     pub prepend: String,
 8 | }
 9 | 
10 | impl Prepend {
11 |     pub fn new(prepend: String) -> Self {
12 |         Self { prepend }
13 |     }
14 | }
15 | 
16 | impl Normalizer for Prepend {
17 |     /// Prepend `self.prepend` to the normalized string in place
18 |     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
19 |         if !normalized.is_empty() {
20 |             normalized.prepend(&self.prepend);
21 |         }
22 |         Ok(())
23 |     }
24 | }
25 | 
26 | #[cfg(test)]
27 | mod tests {
28 |     use super::*;
29 | 
30 |     #[test]
31 |     fn test_prepend() {
32 |         let original = "Hello";
33 |         let normalized = "▁Hello";
34 |         assert_ne!(original, normalized);
35 |         let mut n = NormalizedString::from(original);
36 |         let prepend = Prepend::new("▁".to_string());
37 |         prepend.normalize(&mut n).unwrap();
38 |         assert_eq!(&n.get(), &normalized);
39 |         assert_eq!(
40 |             n,
41 |             NormalizedString::new(
42 |                 original.to_string(),
43 |                 normalized.to_string(),
44 |                 vec![
45 |                     (0, 1),
46 |                     (0, 1),
47 |                     (0, 1),
48 |                     (0, 1),
49 |                     (1, 2),
50 |                     (2, 3),
51 |                     (3, 4),
52 |                     (4, 5)
53 |                 ],
54 |                 0
55 |             )
56 |         );
57 |         assert_eq!(
58 |             n.alignments_original(),
59 |             vec![(0, 4), (4, 5), (5, 6), (6, 7), (7, 8)]
60 |         );
61 |     }
62 | }
63 | 


--------------------------------------------------------------------------------
/tokenizers/src/normalizers/unicode.rs:
--------------------------------------------------------------------------------
  1 | use crate::tokenizer::{NormalizedString, Normalizer, Result};
  2 | use crate::utils::macro_rules_attribute;
  3 | 
  4 | #[derive(Default, Copy, Clone, Debug)]
  5 | #[macro_rules_attribute(impl_serde_type!)]
  6 | pub struct NFD;
  7 | impl Normalizer for NFD {
  8 |     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
  9 |         normalized.nfd();
 10 |         Ok(())
 11 |     }
 12 | }
 13 | 
 14 | #[derive(Default, Copy, Clone, Debug)]
 15 | #[macro_rules_attribute(impl_serde_type!)]
 16 | pub struct NFKD;
 17 | impl Normalizer for NFKD {
 18 |     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
 19 |         normalized.nfkd();
 20 |         Ok(())
 21 |     }
 22 | }
 23 | 
 24 | #[derive(Default, Copy, Clone, Debug)]
 25 | #[macro_rules_attribute(impl_serde_type!)]
 26 | pub struct NFC;
 27 | impl Normalizer for NFC {
 28 |     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
 29 |         normalized.nfc();
 30 |         Ok(())
 31 |     }
 32 | }
 33 | 
 34 | #[derive(Default, Copy, Clone, Debug)]
 35 | #[macro_rules_attribute(impl_serde_type!)]
 36 | pub struct NFKC;
 37 | impl Normalizer for NFKC {
 38 |     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
 39 |         normalized.nfkc();
 40 |         Ok(())
 41 |     }
 42 | }
 43 | 
 44 | fn do_nmt(normalized: &mut NormalizedString) {
 45 |     // ASCII control characters
 46 |     normalized
 47 |         .filter(|c| {
 48 |             !matches!(
 49 |                 c as u32,
 50 |                 0x0001..=0x0008 |
 51 |                 0x000B |
 52 |                 0x000E..=0x001F |
 53 |                 0x007F |
 54 |                 0x008F |
 55 |                 0x009F
 56 |             )
 57 |         })
 58 |         // Other code points considered as whitespace.
 59 |         .map(|c| match c as u32 {
 60 |             0x0009 => ' ',
 61 |             0x000A => ' ',
 62 |             0x000C => ' ',
 63 |             0x000D => ' ',
 64 |             0x1680 => ' ',
 65 |             0x200B..=0x200F => ' ',
 66 |             0x2028 => ' ',
 67 |             0x2029 => ' ',
 68 |             0x2581 => ' ',
 69 |             0xFEFF => ' ',
 70 |             0xFFFD => ' ',
 71 |             _ => c,
 72 |         });
 73 | }
 74 | 
 75 | #[derive(Default, Copy, Clone, Debug)]
 76 | #[macro_rules_attribute(impl_serde_type!)]
 77 | pub struct Nmt;
 78 | impl Normalizer for Nmt {
 79 |     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
 80 |         do_nmt(normalized);
 81 |         Ok(())
 82 |     }
 83 | }
 84 | 
 85 | #[cfg(test)]
 86 | mod tests {
 87 |     use super::*;
 88 | 
 89 |     #[test]
 90 |     fn test_nfkc() {
 91 |         let original = "\u{fb01}".to_string();
 92 |         let normalized = "fi".to_string();
 93 |         let mut n = NormalizedString::from(original.clone());
 94 |         NFKC.normalize(&mut n).unwrap();
 95 | 
 96 |         assert_eq!(
 97 |             n,
 98 |             NormalizedString::new(original, normalized, vec![(0, 3), (0, 3)], 0)
 99 |         );
100 | 
101 |         assert_eq!(n.alignments_original(), vec![(0, 2), (0, 2), (0, 2)]);
102 |     }
103 | }
104 | 
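`Nmt` is the only normalizer in this file without a test. A minimal sketch of what one could look like, added as a hypothetical case in the `tests` module above and following the same `NormalizedString`/`Normalizer` pattern as `test_nfkc` (the input string and expected output are illustrative, derived from `do_nmt`):

    #[test]
    fn test_nmt_sketch() {
        // Tab and zero-width space are mapped to plain spaces by `do_nmt`,
        // while the 0x0001 control character is filtered out entirely.
        let mut n = NormalizedString::from("a\tb\u{200B}c\u{0001}d");
        Nmt.normalize(&mut n).unwrap();
        assert_eq!(n.get(), "a b cd");
    }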


--------------------------------------------------------------------------------
/tokenizers/src/normalizers/utils.rs:
--------------------------------------------------------------------------------
 1 | use serde::{Deserialize, Serialize};
 2 | 
 3 | use crate::normalizers::NormalizerWrapper;
 4 | use crate::tokenizer::{NormalizedString, Normalizer, Result};
 5 | use crate::utils::macro_rules_attribute;
 6 | 
 7 | #[derive(Clone, Deserialize, Debug, Serialize)]
 8 | #[serde(tag = "type")]
 9 | /// Allows concatenating multiple other Normalizers as a Sequence.
10 | /// All the normalizers run in sequence, in the given order, against the same NormalizedString.
11 | pub struct Sequence {
12 |     normalizers: Vec<NormalizerWrapper>,
13 | }
14 | 
15 | impl Sequence {
16 |     pub fn new(normalizers: Vec<NormalizerWrapper>) -> Self {
17 |         Self { normalizers }
18 |     }
19 | }
20 | 
21 | impl AsRef<[NormalizerWrapper]> for Sequence {
22 |     fn as_ref(&self) -> &[NormalizerWrapper] {
23 |         &self.normalizers
24 |     }
25 | }
26 | 
27 | impl AsMut<[NormalizerWrapper]> for Sequence {
28 |     fn as_mut(&mut self) -> &mut [NormalizerWrapper] {
29 |         &mut self.normalizers
30 |     }
31 | }
32 | 
33 | impl IntoIterator for Sequence {
34 |     type Item = NormalizerWrapper;
35 |     type IntoIter = std::vec::IntoIter<Self::Item>;
36 | 
37 |     fn into_iter(self) -> Self::IntoIter {
38 |         self.normalizers.into_iter()
39 |     }
40 | }
41 | 
42 | impl Normalizer for Sequence {
43 |     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
44 |         for normalizer in &self.normalizers {
45 |             normalizer.normalize(normalized)?;
46 |         }
47 |         Ok(())
48 |     }
49 | }
50 | 
51 | /// Lowercases the input
52 | #[derive(Copy, Clone, Debug)]
53 | #[macro_rules_attribute(impl_serde_type!)]
54 | pub struct Lowercase;
55 | impl Normalizer for Lowercase {
56 |     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
57 |         normalized.lowercase();
58 |         Ok(())
59 |     }
60 | }
61 | 
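A minimal sketch of composing normalizers with `Sequence`, written as a hypothetical test for this file. It assumes `NormalizerWrapper` exposes one variant per normalizer, named after the wrapped type, mirroring the `PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit)` construction used in the pre-tokenizer `Sequence` test further down:

    #[cfg(test)]
    mod tests {
        use super::*;
        use crate::normalizers::unicode::NFKC;

        #[test]
        fn sequence_sketch() {
            // NFKC first ("ﬁ" becomes "fi"), then lowercasing, both applied
            // in order to the same NormalizedString.
            let sequence = Sequence::new(vec![
                NormalizerWrapper::NFKC(NFKC),
                NormalizerWrapper::Lowercase(Lowercase),
            ]);
            let mut n = NormalizedString::from("ﬁNE");
            sequence.normalize(&mut n).unwrap();
            assert_eq!(n.get(), "fine");
        }
    }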


--------------------------------------------------------------------------------
/tokenizers/src/pre_tokenizers/bert.rs:
--------------------------------------------------------------------------------
 1 | use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 2 | use crate::utils::macro_rules_attribute;
 3 | use unicode_categories::UnicodeCategories;
 4 | 
 5 | fn is_bert_punc(x: char) -> bool {
 6 |     char::is_ascii_punctuation(&x) || x.is_punctuation()
 7 | }
 8 | 
 9 | #[derive(Copy, Clone, Debug, PartialEq, Eq)]
10 | #[macro_rules_attribute(impl_serde_type!)]
11 | pub struct BertPreTokenizer;
12 | 
13 | impl PreTokenizer for BertPreTokenizer {
14 |     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
15 |         pretokenized.split(|_, s| s.split(char::is_whitespace, SplitDelimiterBehavior::Removed))?;
16 |         pretokenized.split(|_, s| s.split(is_bert_punc, SplitDelimiterBehavior::Isolated))
17 |     }
18 | }
19 | 
20 | #[cfg(test)]
21 | mod tests {
22 |     use super::*;
23 |     use crate::{NormalizedString, OffsetReferential, OffsetType};
24 | 
25 |     #[test]
26 |     fn basic() {
27 |         let pretok = BertPreTokenizer;
28 |         let mut pretokenized: PreTokenizedString = "Hey friend!     How are you?!?".into();
29 |         pretok.pre_tokenize(&mut pretokenized).unwrap();
30 |         assert_eq!(
31 |             pretokenized
32 |                 .get_splits(OffsetReferential::Original, OffsetType::Byte)
33 |                 .into_iter()
34 |                 .map(|(s, o, _)| (s, o))
35 |                 .collect::<Vec<_>>(),
36 |             vec![
37 |                 ("Hey", (0, 3)),
38 |                 ("friend", (4, 10)),
39 |                 ("!", (10, 11)),
40 |                 ("How", (16, 19)),
41 |                 ("are", (20, 23)),
42 |                 ("you", (24, 27)),
43 |                 ("?", (27, 28)),
44 |                 ("!", (28, 29)),
45 |                 ("?", (29, 30)),
46 |             ]
47 |         );
48 |     }
49 | 
50 |     #[test]
51 |     fn chinese_chars() {
52 |         let mut n = NormalizedString::from("野口里佳 Noguchi Rika");
53 |         n.transform(
54 |             n.get().to_owned().chars().flat_map(|c| {
55 |                 if (c as usize) > 0x4E00 {
56 |                     vec![(' ', 0), (c, 1), (' ', 1)]
57 |                 } else {
58 |                     vec![(c, 0)]
59 |                 }
60 |             }),
61 |             0,
62 |         );
63 |         let mut pretokenized = n.into();
64 |         let pretok = BertPreTokenizer;
65 |         pretok.pre_tokenize(&mut pretokenized).unwrap();
66 |         assert_eq!(
67 |             pretokenized
68 |                 .get_splits(OffsetReferential::Original, OffsetType::Byte)
69 |                 .into_iter()
70 |                 .map(|(s, o, _)| (s, o))
71 |                 .collect::<Vec<_>>(),
72 |             vec![
73 |                 ("野", (0, 3)),
74 |                 ("口", (3, 6)),
75 |                 ("里", (6, 9)),
76 |                 ("佳", (9, 12)),
77 |                 ("Noguchi", (13, 20)),
78 |                 ("Rika", (21, 25))
79 |             ]
80 |         );
81 |     }
82 | }
83 | 


--------------------------------------------------------------------------------
/tokenizers/src/pre_tokenizers/delimiter.rs:
--------------------------------------------------------------------------------
 1 | use serde::{Deserialize, Serialize};
 2 | 
 3 | use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 4 | use crate::utils::macro_rules_attribute;
 5 | 
 6 | #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 7 | #[non_exhaustive]
 8 | #[macro_rules_attribute(impl_serde_type!)]
 9 | pub struct CharDelimiterSplit {
10 |     pub delimiter: char,
11 | }
12 | 
13 | impl CharDelimiterSplit {
14 |     pub fn new(delimiter: char) -> Self {
15 |         Self { delimiter }
16 |     }
17 | }
18 | 
19 | impl PreTokenizer for CharDelimiterSplit {
20 |     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
21 |         // TODO: Maybe add the option to specify the behavior
22 |         pretokenized.split(|_, normalized| {
23 |             normalized.split(self.delimiter, SplitDelimiterBehavior::Removed)
24 |         })
25 |     }
26 | }
27 | 
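`CharDelimiterSplit` ships without a test in this file. A hypothetical one, mirroring the structure of the other pre-tokenizer tests (the input string and expected offsets are illustrative):

    #[cfg(test)]
    mod tests {
        use super::*;
        use crate::{OffsetReferential, OffsetType};

        #[test]
        fn char_delimiter_split_sketch() {
            // With SplitDelimiterBehavior::Removed the delimiter itself disappears
            // and empty pieces are dropped, so only "hey" and "friend" remain.
            let pretok = CharDelimiterSplit::new('_');
            let mut pretokenized: PreTokenizedString = "hey_friend_".into();
            pretok.pre_tokenize(&mut pretokenized).unwrap();
            assert_eq!(
                pretokenized
                    .get_splits(OffsetReferential::Original, OffsetType::Byte)
                    .into_iter()
                    .map(|(s, o, _)| (s, o))
                    .collect::<Vec<_>>(),
                vec![("hey", (0, 3)), ("friend", (4, 10))]
            );
        }
    }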


--------------------------------------------------------------------------------
/tokenizers/src/pre_tokenizers/punctuation.rs:
--------------------------------------------------------------------------------
 1 | use serde::{Deserialize, Serialize};
 2 | 
 3 | use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 4 | use crate::utils::macro_rules_attribute;
 5 | use unicode_categories::UnicodeCategories;
 6 | 
 7 | fn is_punc(x: char) -> bool {
 8 |     char::is_ascii_punctuation(&x) || x.is_punctuation()
 9 | }
10 | 
11 | #[derive(Copy, Clone, Debug, PartialEq, Eq)]
12 | #[macro_rules_attribute(impl_serde_type!)]
13 | pub struct Punctuation {
14 |     #[serde(default = "default_split")]
15 |     pub behavior: SplitDelimiterBehavior,
16 | }
17 | 
18 | fn default_split() -> SplitDelimiterBehavior {
19 |     SplitDelimiterBehavior::Isolated
20 | }
21 | 
22 | impl Punctuation {
23 |     pub fn new(behavior: SplitDelimiterBehavior) -> Self {
24 |         Self { behavior }
25 |     }
26 | }
27 | 
28 | impl Default for Punctuation {
29 |     fn default() -> Self {
30 |         Self::new(SplitDelimiterBehavior::Isolated)
31 |     }
32 | }
33 | 
34 | impl PreTokenizer for Punctuation {
35 |     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
36 |         pretokenized.split(|_, s| s.split(is_punc, self.behavior))
37 |     }
38 | }
39 | 
40 | #[cfg(test)]
41 | mod tests {
42 |     use super::*;
43 |     use crate::{OffsetReferential, OffsetType};
44 | 
45 |     #[test]
46 |     fn punctuation_basic() {
47 |         let pretok = Punctuation::default();
48 |         let mut pretokenized: PreTokenizedString = "Hey friend!     How are you?!?".into();
49 |         pretok.pre_tokenize(&mut pretokenized).unwrap();
50 |         assert_eq!(
51 |             pretokenized
52 |                 .get_splits(OffsetReferential::Original, OffsetType::Byte)
53 |                 .into_iter()
54 |                 .map(|(s, o, _)| (s, o))
55 |                 .collect::<Vec<_>>(),
56 |             vec![
57 |                 ("Hey friend", (0, 10)),
58 |                 ("!", (10, 11)),
59 |                 ("     How are you", (11, 27)),
60 |                 ("?", (27, 28)),
61 |                 ("!", (28, 29)),
62 |                 ("?", (29, 30)),
63 |             ]
64 |         );
65 |     }
66 | 
67 |     #[test]
68 |     fn deserialization() {
69 |         let punctuation: Punctuation = serde_json::from_str(r#"{"type": "Punctuation"}"#).unwrap();
70 |         assert_eq!(punctuation, Punctuation::default());
71 |         assert_eq!(
72 |             punctuation,
73 |             Punctuation::new(SplitDelimiterBehavior::Isolated)
74 |         );
75 |     }
76 | 
77 |     #[test]
78 |     #[should_panic]
79 |     fn deserialization_erroneous() {
80 |         let _punctuation: Punctuation =
81 |             serde_json::from_str(r#"{"type": "WhitespaceSplit"}"#).unwrap();
82 |     }
83 | }
84 | 
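The only knob on `Punctuation` is its `SplitDelimiterBehavior`. A hypothetical addition to the tests module above, showing `Removed` instead of the default `Isolated` (the expected values are illustrative, derived from the `punctuation_basic` case):

    #[test]
    fn punctuation_removed_sketch() {
        // With Removed, the punctuation marks are dropped and only the
        // surrounding text pieces survive (empty pieces are filtered out).
        let pretok = Punctuation::new(SplitDelimiterBehavior::Removed);
        let mut pretokenized: PreTokenizedString = "Hey friend!     How are you?!?".into();
        pretok.pre_tokenize(&mut pretokenized).unwrap();
        assert_eq!(
            pretokenized
                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                .into_iter()
                .map(|(s, o, _)| (s, o))
                .collect::<Vec<_>>(),
            vec![("Hey friend", (0, 10)), ("     How are you", (11, 27))]
        );
    }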


--------------------------------------------------------------------------------
/tokenizers/src/pre_tokenizers/sequence.rs:
--------------------------------------------------------------------------------
 1 | use crate::pre_tokenizers::PreTokenizerWrapper;
 2 | use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result};
 3 | use crate::utils::macro_rules_attribute;
 4 | use serde::{Deserialize, Serialize};
 5 | 
 6 | #[derive(Clone, Debug, PartialEq)]
 7 | #[macro_rules_attribute(impl_serde_type!)]
 8 | pub struct Sequence {
 9 |     pretokenizers: Vec<PreTokenizerWrapper>,
10 | }
11 | 
12 | impl Sequence {
13 |     pub fn new(pretokenizers: Vec<PreTokenizerWrapper>) -> Self {
14 |         Self { pretokenizers }
15 |     }
16 | }
17 | 
18 | impl AsRef<[PreTokenizerWrapper]> for Sequence {
19 |     fn as_ref(&self) -> &[PreTokenizerWrapper] {
20 |         &self.pretokenizers
21 |     }
22 | }
23 | 
24 | impl AsMut<[PreTokenizerWrapper]> for Sequence {
25 |     fn as_mut(&mut self) -> &mut [PreTokenizerWrapper] {
26 |         &mut self.pretokenizers
27 |     }
28 | }
29 | 
30 | impl IntoIterator for Sequence {
31 |     type Item = PreTokenizerWrapper;
32 |     type IntoIter = std::vec::IntoIter<Self::Item>;
33 | 
34 |     fn into_iter(self) -> Self::IntoIter {
35 |         self.pretokenizers.into_iter()
36 |     }
37 | }
38 | 
39 | impl PreTokenizer for Sequence {
40 |     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
41 |         for pretokenizer in &self.pretokenizers {
42 |             pretokenizer.pre_tokenize(pretokenized)?;
43 |         }
44 |         Ok(())
45 |     }
46 | }
47 | 
48 | #[cfg(test)]
49 | mod tests {
50 |     use super::*;
51 |     use crate::pre_tokenizers::{punctuation::Punctuation, whitespace::WhitespaceSplit};
52 |     use crate::{OffsetReferential, OffsetType};
53 | 
54 |     #[test]
55 |     fn sequence_basic() {
56 |         let pretokenizers = vec![
57 |             PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit),
58 |             PreTokenizerWrapper::Punctuation(Punctuation::default()),
59 |         ];
60 |         let pretok = Sequence::new(pretokenizers);
61 |         let mut pretokenized: PreTokenizedString = "Hey friend!     How are you?!?".into();
62 |         pretok.pre_tokenize(&mut pretokenized).unwrap();
63 |         assert_eq!(
64 |             pretokenized
65 |                 .get_splits(OffsetReferential::Original, OffsetType::Byte)
66 |                 .into_iter()
67 |                 .map(|(s, o, _)| (s, o))
68 |                 .collect::<Vec<_>>(),
69 |             vec![
70 |                 ("Hey", (0, 3)),
71 |                 ("friend", (4, 10)),
72 |                 ("!", (10, 11)),
73 |                 ("How", (16, 19)),
74 |                 ("are", (20, 23)),
75 |                 ("you", (24, 27)),
76 |                 ("?", (27, 28)),
77 |                 ("!", (28, 29)),
78 |                 ("?", (29, 30)),
79 |             ]
80 |         );
81 |     }
82 | }
83 | 


--------------------------------------------------------------------------------
/tokenizers/src/pre_tokenizers/unicode_scripts/mod.rs:
--------------------------------------------------------------------------------
1 | mod pre_tokenizer;
2 | mod scripts;
3 | 
4 | // Re-export the PreTokenizer
5 | pub use pre_tokenizer::UnicodeScripts;
6 | 


--------------------------------------------------------------------------------
/tokenizers/src/utils/fancy.rs:
--------------------------------------------------------------------------------
 1 | use crate::tokenizer::pattern::Pattern;
 2 | use crate::Offsets;
 3 | use fancy_regex::Regex;
 4 | use std::error::Error;
 5 | 
 6 | #[derive(Debug)]
 7 | pub struct SysRegex {
 8 |     regex: Regex,
 9 | }
10 | 
11 | impl SysRegex {
12 |     pub fn find_iter<'r, 't>(&'r self, inside: &'t str) -> Matches<'r, 't> {
13 |         Matches(self.regex.find_iter(inside))
14 |     }
15 | 
16 |     pub fn new(regex_str: &str) -> Result<Self, Box<dyn Error + Send + Sync + 'static>> {
17 |         Ok(Self {
18 |             regex: Regex::new(regex_str)?,
19 |         })
20 |     }
21 | }
22 | 
23 | pub struct Matches<'r, 't>(fancy_regex::Matches<'r, 't>);
24 | 
25 | impl Iterator for Matches<'_, '_> {
26 |     type Item = (usize, usize);
27 | 
28 |     fn next(&mut self) -> Option<Self::Item> {
29 |         match self.0.next() {
30 |             Some(Ok(mat)) => Some((mat.start(), mat.end())),
31 |             // stop if an error is encountered
32 |             None | Some(Err(_)) => None,
33 |         }
34 |     }
35 | }
36 | 
37 | impl Pattern for &Regex {
38 |     fn find_matches(
39 |         &self,
40 |         inside: &str,
41 |     ) -> Result<Vec<(Offsets, bool)>, Box<dyn Error + Send + Sync + 'static>> {
42 |         if inside.is_empty() {
43 |             return Ok(vec![((0, 0), false)]);
44 |         }
45 | 
46 |         let mut prev = 0;
47 |         let mut splits = Vec::with_capacity(inside.len());
48 |         for match_ in self.find_iter(inside) {
49 |             let match_ = match_?;
50 |             let start = match_.start();
51 |             let end = match_.end();
52 |             if prev != start {
53 |                 splits.push(((prev, start), false));
54 |             }
55 |             splits.push(((start, end), true));
56 |             prev = end;
57 |         }
58 |         if prev != inside.len() {
59 |             splits.push(((prev, inside.len()), false))
60 |         }
61 |         Ok(splits)
62 |     }
63 | }
64 | 
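This file and `utils/onig.rs` below expose the same `SysRegex` surface (`new` plus `find_iter` yielding byte offsets) over different regex engines, so the rest of the crate can stay engine-agnostic. A small illustrative sketch, as a hypothetical test for this module (the pattern, input, and expected offsets are mine):

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn sys_regex_sketch() {
            let re = SysRegex::new(r"\d+").unwrap();
            // Each item is the (start, end) byte range of a non-overlapping match;
            // an error from fancy_regex simply stops the iteration.
            let offsets: Vec<(usize, usize)> = re.find_iter("ab12cd345").collect();
            assert_eq!(offsets, vec![(2, 4), (6, 9)]);
        }
    }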


--------------------------------------------------------------------------------
/tokenizers/src/utils/from_pretrained.rs:
--------------------------------------------------------------------------------
 1 | use crate::Result;
 2 | use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
 3 | use std::collections::HashMap;
 4 | use std::path::PathBuf;
 5 | 
 6 | /// Defines the additional parameters available for the `from_pretrained` function
 7 | #[derive(Debug, Clone)]
 8 | pub struct FromPretrainedParameters {
 9 |     pub revision: String,
10 |     pub user_agent: HashMap<String, String>,
11 |     pub token: Option<String>,
12 | }
13 | 
14 | impl Default for FromPretrainedParameters {
15 |     fn default() -> Self {
16 |         Self {
17 |             revision: "main".into(),
18 |             user_agent: HashMap::new(),
19 |             token: None,
20 |         }
21 |     }
22 | }
23 | 
24 | /// Downloads and caches the identified tokenizer if it exists on
25 | /// the Hugging Face Hub, and returns a local path to the file.
26 | pub fn from_pretrained<S: AsRef<str>>(
27 |     identifier: S,
28 |     params: Option<FromPretrainedParameters>,
29 | ) -> Result<PathBuf> {
30 |     let identifier: String = identifier.as_ref().to_string();
31 | 
32 |     let valid_chars = ['-', '_', '.', '/'];
33 |     let is_valid_char = |x: char| x.is_alphanumeric() || valid_chars.contains(&x);
34 | 
35 |     let valid = identifier.chars().all(is_valid_char);
36 |     let valid_chars_stringified = valid_chars
37 |         .iter()
38 |         .fold(vec![], |mut buf, x| {
39 |             buf.push(format!("'{x}'"));
40 |             buf
41 |         })
42 |         .join(", "); // "'/', '-', '_', '.'"
43 |     if !valid {
44 |         return Err(format!(
45 |             "Model \"{identifier}\" contains invalid characters, expected only alphanumeric or {valid_chars_stringified}"
46 |         )
47 |         .into());
48 |     }
49 |     let params = params.unwrap_or_default();
50 | 
51 |     let revision = &params.revision;
52 |     let valid_revision = revision.chars().all(is_valid_char);
53 |     if !valid_revision {
54 |         return Err(format!(
55 |             "Revision \"{revision}\" contains invalid characters, expected only alphanumeric or {valid_chars_stringified}"
56 |         )
57 |         .into());
58 |     }
59 | 
60 |     let mut builder = ApiBuilder::from_env();
61 |     if let Some(token) = params.token {
62 |         builder = builder.with_token(Some(token));
63 |     }
64 |     let api = builder.build()?;
65 |     let repo = Repo::with_revision(identifier, RepoType::Model, params.revision);
66 |     let api = api.repo(repo);
67 |     Ok(api.get("tokenizer.json")?)
68 | }
69 | 
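A sketch of how this helper combines with `Tokenizer::from_file`; the identifier, the pinned revision, and the assumption that `Tokenizer` is reachable as `crate::Tokenizer` are illustrative, and the higher-level `Tokenizer::from_pretrained` used in the integration tests presumably wraps this same flow:

    use crate::Tokenizer;

    fn load_pinned_tokenizer() -> Result<Tokenizer> {
        let params = FromPretrainedParameters {
            revision: "main".into(),
            ..Default::default()
        };
        // Resolves and caches tokenizer.json locally, then builds the tokenizer from it.
        let path = from_pretrained("bert-base-cased", Some(params))?;
        Tokenizer::from_file(path)
    }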


--------------------------------------------------------------------------------
/tokenizers/src/utils/iter.rs:
--------------------------------------------------------------------------------
  1 | //! This comes from the Rust libcore and is duplicated here because it is not exported
  2 | //! (cf <https://github.com/rust-lang/rust/blob/25091ed9b7739e12466fb2490baa1e8a2815121c/src/libcore/iter/adapters/mod.rs#L2664>)
  3 | //! We are now using the version from <https://stackoverflow.com/questions/44544323/how-to-unzip-a-sequence-of-resulta-b-e-to-a-veca-vecb-and-stop-on-f>
  4 | //! because the one from libcore seems to cause stack overflows in some cases.
  5 | //! It also contains a `lines_with_ending` adapter that copies std::io::BufRead::lines but keeps the line endings.
  6 | use std::io::BufRead;
  7 | 
  8 | pub struct ResultShunt<I, E> {
  9 |     iter: I,
 10 |     error: Option<E>,
 11 | }
 12 | 
 13 | impl<I, T, E> ResultShunt<I, E>
 14 | where
 15 |     I: Iterator<Item = Result<T, E>>,
 16 | {
 17 |     /// Process the given iterator as if it yielded a `T` instead of a
 18 |     /// `Result<T, _>`. Any errors will stop the inner iterator and
 19 |     /// the overall result will be an error.
 20 |     pub fn process<F, U>(iter: I, mut f: F) -> Result<U, E>
 21 |     where
 22 |         F: FnMut(&mut Self) -> U,
 23 |     {
 24 |         let mut shunt = ResultShunt::new(iter);
 25 |         let value = f(shunt.by_ref());
 26 |         shunt.reconstruct(value)
 27 |     }
 28 | 
 29 |     fn new(iter: I) -> Self {
 30 |         ResultShunt { iter, error: None }
 31 |     }
 32 | 
 33 |     /// Consume the adapter and rebuild a `Result` value. This should
 34 |     /// *always* be called, otherwise any potential error would be
 35 |     /// lost.
 36 |     fn reconstruct<U>(self, val: U) -> Result<U, E> {
 37 |         match self.error {
 38 |             None => Ok(val),
 39 |             Some(e) => Err(e),
 40 |         }
 41 |     }
 42 | }
 43 | 
 44 | impl<I, T, E> Iterator for ResultShunt<I, E>
 45 | where
 46 |     I: Iterator<Item = Result<T, E>>,
 47 | {
 48 |     type Item = T;
 49 | 
 50 |     fn next(&mut self) -> Option<Self::Item> {
 51 |         match self.iter.next() {
 52 |             Some(Ok(v)) => Some(v),
 53 |             Some(Err(e)) => {
 54 |                 self.error = Some(e);
 55 |                 None
 56 |             }
 57 |             None => None,
 58 |         }
 59 |     }
 60 | }
 61 | 
 62 | /// Copied from std::io::BufRead::lines but keeps the newline characters.
 63 | #[derive(Debug)]
 64 | pub struct Lines<B> {
 65 |     buf: B,
 66 | }
 67 | 
 68 | pub trait LinesWithEnding<B> {
 69 |     fn lines_with_ending(self) -> Lines<B>;
 70 | }
 71 | 
 72 | impl<B> LinesWithEnding<B> for B
 73 | where
 74 |     B: BufRead,
 75 | {
 76 |     fn lines_with_ending(self) -> Lines<B> {
 77 |         Lines::<B> { buf: self }
 78 |     }
 79 | }
 80 | impl<B: BufRead> Iterator for Lines<B> {
 81 |     type Item = std::io::Result<String>;
 82 | 
 83 |     fn next(&mut self) -> Option<Self::Item> {
 84 |         let mut buf = String::new();
 85 |         match self.buf.read_line(&mut buf) {
 86 |             Ok(0) => None,
 87 |             Ok(_n) => {
 88 |                 // if buf.ends_with('\n') {
 89 |                 //     buf.pop();
 90 |                 //     if buf.ends_with('\r') {
 91 |                 //         buf.pop();
 92 |                 //     }
 93 |                 // }
 94 |                 Some(Ok(buf))
 95 |             }
 96 |             Err(e) => Some(Err(e)),
 97 |         }
 98 |     }
 99 | }
100 | 
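A hypothetical test for `lines_with_ending`, showing the difference from `BufRead::lines` (the `Cursor` input and expected output are illustrative):

    #[cfg(test)]
    mod tests {
        use super::*;
        use std::io::Cursor;

        #[test]
        fn lines_with_ending_sketch() {
            // Unlike BufRead::lines, the '\n' / '\r\n' terminators are kept,
            // so concatenating the items reproduces the original input.
            let lines: Vec<String> = Cursor::new("a\nb\r\nc")
                .lines_with_ending()
                .collect::<std::io::Result<_>>()
                .unwrap();
            assert_eq!(lines, vec!["a\n", "b\r\n", "c"]);
        }
    }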


--------------------------------------------------------------------------------
/tokenizers/src/utils/onig.rs:
--------------------------------------------------------------------------------
 1 | use crate::tokenizer::pattern::Pattern;
 2 | use crate::{Offsets, Result};
 3 | use onig::Regex;
 4 | use std::error::Error;
 5 | 
 6 | #[derive(Debug)]
 7 | pub struct SysRegex {
 8 |     regex: Regex,
 9 | }
10 | 
11 | impl SysRegex {
12 |     pub fn find_iter<'r, 't>(&'r self, inside: &'t str) -> onig::FindMatches<'r, 't> {
13 |         self.regex.find_iter(inside)
14 |     }
15 | 
16 |     pub fn new(
17 |         regex_str: &str,
18 |     ) -> std::result::Result<Self, Box<dyn Error + Send + Sync + 'static>> {
19 |         Ok(Self {
20 |             regex: Regex::new(regex_str)?,
21 |         })
22 |     }
23 | }
24 | 
25 | impl Pattern for &Regex {
26 |     fn find_matches(&self, inside: &str) -> Result<Vec<(Offsets, bool)>> {
27 |         if inside.is_empty() {
28 |             return Ok(vec![((0, 0), false)]);
29 |         }
30 | 
31 |         let mut prev = 0;
32 |         let mut splits = Vec::with_capacity(inside.len());
33 |         for (start, end) in self.find_iter(inside) {
34 |             if prev != start {
35 |                 splits.push(((prev, start), false));
36 |             }
37 |             splits.push(((start, end), true));
38 |             prev = end;
39 |         }
40 |         if prev != inside.len() {
41 |             splits.push(((prev, inside.len()), false))
42 |         }
43 |         Ok(splits)
44 |     }
45 | }
46 | 


--------------------------------------------------------------------------------
/tokenizers/src/utils/progress.rs:
--------------------------------------------------------------------------------
 1 | #[cfg(feature = "progressbar")]
 2 | pub(crate) use indicatif::{ProgressBar, ProgressStyle};
 3 | 
 4 | #[cfg(not(feature = "progressbar"))]
 5 | mod progressbar {
 6 |     use std::borrow::Cow;
 7 |     pub struct ProgressBar;
 8 |     impl ProgressBar {
 9 |         pub fn new(_length: u64) -> Self {
10 |             Self {}
11 |         }
12 | 
13 |         pub fn set_length(&self, _length: u64) {}
14 |         pub fn set_message(&self, _message: impl Into<Cow<'static, str>>) {}
15 |         pub fn finish(&self) {}
16 |         pub fn reset(&self) {}
17 |         pub fn inc(&self, _inc: u64) {}
18 |         pub fn set_style(&self, _style: ProgressStyle) {}
19 |     }
20 | 
21 |     pub struct ProgressStyle {}
22 |     impl ProgressStyle {
23 |         pub fn default_bar() -> Self {
24 |             Self {}
25 |         }
26 |         pub fn template(self, _template: &str) -> Result<Self, String> {
27 |             Ok(self)
28 |         }
29 |     }
30 | }
31 | #[cfg(not(feature = "progressbar"))]
32 | pub(crate) use progressbar::{ProgressBar, ProgressStyle};
33 | 
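The no-op stub lets call sites stay free of `#[cfg]` checks: the same code compiles whether or not the `progressbar` feature pulls in indicatif. An illustrative call site (not from the source), restricted to methods both the stub above and indicatif provide:

    fn report_progress() {
        let pb = ProgressBar::new(100);
        pb.set_message("Tokenize words");
        pb.inc(10);
        pb.finish();
    }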


--------------------------------------------------------------------------------
/tokenizers/tests/common/mod.rs:
--------------------------------------------------------------------------------
 1 | use tokenizers::decoders::wordpiece::WordPiece as WordPieceDecoder;
 2 | use tokenizers::models::bpe::BPE;
 3 | use tokenizers::models::wordpiece::WordPiece;
 4 | use tokenizers::normalizers::bert::BertNormalizer;
 5 | use tokenizers::pre_tokenizers::bert::BertPreTokenizer;
 6 | use tokenizers::pre_tokenizers::byte_level::ByteLevel;
 7 | use tokenizers::processors::bert::BertProcessing;
 8 | use tokenizers::tokenizer::{Model, Tokenizer};
 9 | 
10 | #[allow(dead_code)]
11 | pub fn get_empty() -> Tokenizer {
12 |     Tokenizer::new(BPE::default())
13 | }
14 | 
15 | #[allow(dead_code)]
16 | pub fn get_byte_level_bpe() -> BPE {
17 |     BPE::from_file("data/gpt2-vocab.json", "data/gpt2-merges.txt")
18 |         .build()
19 |         .expect("Files not found, run `make test` to download these files")
20 | }
21 | 
22 | #[allow(dead_code)]
23 | pub fn get_byte_level(add_prefix_space: bool, trim_offsets: bool) -> Tokenizer {
24 |     let mut tokenizer = Tokenizer::new(get_byte_level_bpe());
25 |     tokenizer
26 |         .with_pre_tokenizer(Some(
27 |             ByteLevel::default().add_prefix_space(add_prefix_space),
28 |         ))
29 |         .with_decoder(Some(ByteLevel::default()))
30 |         .with_post_processor(Some(ByteLevel::default().trim_offsets(trim_offsets)));
31 | 
32 |     tokenizer
33 | }
34 | 
35 | #[allow(dead_code)]
36 | pub fn get_bert_wordpiece() -> WordPiece {
37 |     WordPiece::from_file("data/bert-base-uncased-vocab.txt")
38 |         .build()
39 |         .expect("Files not found, run `make test` to download these files")
40 | }
41 | 
42 | #[allow(dead_code)]
43 | pub fn get_bert() -> Tokenizer {
44 |     let mut tokenizer = Tokenizer::new(get_bert_wordpiece());
45 |     let sep = tokenizer.get_model().token_to_id("[SEP]").unwrap();
46 |     let cls = tokenizer.get_model().token_to_id("[CLS]").unwrap();
47 |     tokenizer
48 |         .with_normalizer(Some(BertNormalizer::default()))
49 |         .with_pre_tokenizer(Some(BertPreTokenizer))
50 |         .with_decoder(Some(WordPieceDecoder::default()))
51 |         .with_post_processor(Some(BertProcessing::new(
52 |             (String::from("[SEP]"), sep),
53 |             (String::from("[CLS]"), cls),
54 |         )));
55 | 
56 |     tokenizer
57 | }
58 | 


--------------------------------------------------------------------------------
/tokenizers/tests/from_pretrained.rs:
--------------------------------------------------------------------------------
 1 | #![cfg(feature = "http")]
 2 | use tokenizers::{FromPretrainedParameters, Result, Tokenizer};
 3 | 
 4 | #[test]
 5 | fn test_from_pretrained() -> Result<()> {
 6 |     let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None)?;
 7 |     let encoding = tokenizer.encode("Hey there dear friend!", false)?;
 8 |     assert_eq!(
 9 |         encoding.get_tokens(),
10 |         &["Hey", "there", "dear", "friend", "!"]
11 |     );
12 |     Ok(())
13 | }
14 | 
15 | #[test]
16 | fn test_from_pretrained_revision() -> Result<()> {
17 |     let tokenizer = Tokenizer::from_pretrained("anthony/tokenizers-test", None)?;
18 |     let encoding = tokenizer.encode("Hey there dear friend!", false)?;
19 |     assert_eq!(
20 |         encoding.get_tokens(),
21 |         &["hey", "there", "dear", "friend", "!"]
22 |     );
23 | 
24 |     let tokenizer = Tokenizer::from_pretrained(
25 |         "anthony/tokenizers-test",
26 |         Some(FromPretrainedParameters {
27 |             revision: "gpt-2".to_string(),
28 |             ..Default::default()
29 |         }),
30 |     )?;
31 |     let encoding = tokenizer.encode("Hey there dear friend!", false)?;
32 |     assert_eq!(
33 |         encoding.get_tokens(),
34 |         &["Hey", "Ġthere", "Ġdear", "Ġfriend", "!"]
35 |     );
36 | 
37 |     Ok(())
38 | }
39 | 
40 | #[test]
41 | fn test_from_pretrained_invalid_model() {
42 |     let tokenizer = Tokenizer::from_pretrained("docs?", None);
43 |     assert!(tokenizer.is_err());
44 | }
45 | 
46 | #[test]
47 | fn test_from_pretrained_invalid_revision() {
48 |     let tokenizer = Tokenizer::from_pretrained(
49 |         "bert-base-cased",
50 |         Some(FromPretrainedParameters {
51 |             revision: "gpt?".to_string(),
52 |             ..Default::default()
53 |         }),
54 |     );
55 |     assert!(tokenizer.is_err());
56 | }
57 | 


--------------------------------------------------------------------------------
/tokenizers/tests/training.rs:
--------------------------------------------------------------------------------
 1 | use tokenizers::models::bpe::BPE;
 2 | use tokenizers::pre_tokenizers::whitespace::Whitespace;
 3 | use tokenizers::{DecoderWrapper, NormalizerWrapper, PostProcessorWrapper, PreTokenizerWrapper};
 4 | use tokenizers::{Model, Tokenizer, TokenizerBuilder};
 5 | 
 6 | #[test]
 7 | fn bpe_values_after_training() {
 8 |     let mut tokenizer = TokenizerBuilder::<
 9 |         BPE,
10 |         NormalizerWrapper,
11 |         PreTokenizerWrapper,
12 |         PostProcessorWrapper,
13 |         DecoderWrapper,
14 |     >::default()
15 |     .with_model(
16 |         BPE::builder()
17 |             .unk_token("[UNK]".to_string())
18 |             .dropout(0.1)
19 |             .build()
20 |             .unwrap(),
21 |     )
22 |     .build()
23 |     .unwrap();
24 |     let mut trainer = tokenizer.get_model().get_trainer();
25 |     tokenizer
26 |         .train_from_files(&mut trainer, vec!["./data/small.txt".to_string()])
27 |         .unwrap();
28 |     assert_eq!(tokenizer.get_model().dropout, Some(0.1));
29 |     assert_eq!(tokenizer.get_model().unk_token, Some("[UNK]".to_string()));
30 | }
31 | 
32 | #[test]
33 | fn bpe_continuing_subword_prefix_error() {
34 |     let mut tokenizer = TokenizerBuilder::<
35 |         BPE,
36 |         NormalizerWrapper,
37 |         PreTokenizerWrapper,
38 |         PostProcessorWrapper,
39 |         DecoderWrapper,
40 |     >::default()
41 |     .with_model(
42 |         BPE::builder()
43 |             .unk_token("[UNK]".to_string())
44 |             .continuing_subword_prefix("##".to_string())
45 |             .build()
46 |             .unwrap(),
47 |     )
48 |     .with_pre_tokenizer(Some(PreTokenizerWrapper::Whitespace(Whitespace {})))
49 |     .build()
50 |     .unwrap();
51 |     let mut trainer = tokenizer.get_model().get_trainer();
52 |     tokenizer
53 |         .train_from_files(&mut trainer, vec!["./data/small.txt".to_string()])
54 |         .unwrap();
55 |     tokenizer.save("tokenizer.json", true).unwrap();
56 |     let tokenizer = Tokenizer::from_file("tokenizer.json").unwrap();
57 |     assert_eq!(tokenizer.get_vocab_size(false), 1526);
58 | 
59 |     std::fs::remove_file("tokenizer.json").unwrap();
60 | }
61 | 


--------------------------------------------------------------------------------