├── .cargo └── config ├── .dockerignore ├── .github ├── FUNDING.yml └── workflows │ ├── main.yml │ └── manylinux_build.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── Cargo.toml ├── LICENSE ├── README.md ├── demo ├── .cargo-ok ├── .gitignore ├── Cargo.toml ├── LICENSE_APACHE ├── LICENSE_MIT ├── README.md ├── src │ ├── lib.rs │ └── utils.rs ├── tests │ └── web.rs └── www │ ├── .babelrc │ ├── .bin │ └── create-wasm-app.js │ ├── .eslintrc.js │ ├── .gitignore │ ├── LICENSE-APACHE │ ├── LICENSE-MIT │ ├── README.md │ ├── package-lock.json │ ├── package.json │ ├── src │ ├── index.html │ └── index.tsx │ ├── tsconfig.json │ ├── types │ └── react-linto │ │ └── index.d.ts │ └── webpack.config.js ├── dockerfiles ├── centos │ └── Dockerfile └── ci │ ├── centos │ └── Dockerfile │ └── manylinux │ └── Dockerfile ├── img └── demo.png ├── note ├── algorithm.md └── blog_post.md ├── python ├── .gitignore ├── Cargo.toml ├── Makefile ├── README.md ├── poetry.lock ├── pyproject.toml ├── setup.cfg ├── src │ └── lib.rs ├── tests │ ├── __init__.py │ └── test_main.py ├── tokenizations │ ├── __init__.py │ ├── __init__.pyi │ └── py.typed └── tox.ini └── src ├── lib.rs └── tests.rs /.cargo/config: -------------------------------------------------------------------------------- 1 | [target.x86_64-apple-darwin] 2 | rustflags = [ 3 | "-C", "link-arg=-undefined", 4 | "-C", "link-arg=dynamic_lookup", 5 | ] -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: tamuhey 4 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: 
-------------------------------------------------------------------------------- 1 | name: Test and Deploy 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - 'README.md' 7 | 8 | jobs: 9 | test: 10 | if: contains(github.event.head_commit.message, '[skip ci]') == false 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | python-version: [3.7, 3.8, 3.9] 15 | os: [macos-latest, windows-latest, ubuntu-latest] 16 | steps: 17 | - uses: actions/checkout@v1 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v1 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | 23 | - name: Install latest stable 24 | uses: actions-rs/toolchain@v1 25 | with: 26 | toolchain: stable 27 | override: true 28 | components: rustfmt, clippy 29 | 30 | - name: Lint with RustFmt 31 | uses: actions-rs/cargo@v1 32 | with: 33 | command: fmt 34 | 35 | - name: Lint with Clippy 36 | uses: actions-rs/cargo@v1 37 | with: 38 | command: clippy 39 | args: --all-targets --all-features 40 | 41 | - name: Test with cargo 42 | uses: actions-rs/cargo@v1.0.1 43 | with: 44 | command: test 45 | toolchain: stable 46 | 47 | - name: Install dependencies with pip 48 | working-directory: python 49 | run: | 50 | python -m pip install --upgrade pip 51 | pip install poetry maturin 52 | poetry install 53 | 54 | - name: Build python package 55 | working-directory: python 56 | run: poetry run maturin develop 57 | 58 | - name: Test with pytest 59 | working-directory: python 60 | run: poetry run pytest tests 61 | 62 | publish-rust: 63 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/rust/') 64 | needs: test 65 | runs-on: ubuntu-latest 66 | steps: 67 | - uses: actions/checkout@v1 68 | - uses: actions-rs/toolchain@v1 69 | with: 70 | toolchain: stable 71 | override: true 72 | - name: Publish to creates.io 73 | run: | 74 | cargo login ${{ secrets.CRATES_PASS }} 75 | cargo publish 76 | 77 | publish-python-wheels: 78 | if: github.event_name == 'push' && 
startsWith(github.event.ref, 'refs/tags/python/') 79 | needs: test 80 | runs-on: ${{ matrix.os }} 81 | strategy: 82 | matrix: 83 | python-version: [3.7, 3.8, 3.9] 84 | # ubuntu wheel is built in `manylinux_build.yml` 85 | os: [macos-latest, windows-latest] 86 | 87 | steps: 88 | - uses: actions/checkout@v1 89 | - uses: actions/setup-python@v1 90 | with: 91 | python-version: ${{ matrix.python-version }} 92 | - uses: actions-rs/toolchain@v1 93 | with: 94 | toolchain: stable 95 | override: true 96 | 97 | - name: Install publishment tool 98 | working-directory: python 99 | run: | 100 | python -m pip install --upgrade pip 101 | pip install maturin twine 102 | 103 | - name: Build 104 | working-directory: python 105 | run: maturin build --no-sdist --release --strip -i python 106 | 107 | - name: Publish test pypi 108 | working-directory: python 109 | run: twine upload target/wheels/*whl --repository-url https://test.pypi.org/legacy/ -u ${{ secrets.TEST_PYPI_USER }} -p ${{ secrets.TEST_PYPI_PASS }} 110 | 111 | - name: Publish pypi 112 | working-directory: python 113 | run: twine upload target/wheels/*whl -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASS }} 114 | 115 | publish-python-sdist: 116 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/') 117 | needs: test 118 | runs-on: ubuntu-latest 119 | steps: 120 | - uses: actions/checkout@v1 121 | - uses: actions/setup-python@v1 122 | with: 123 | python-version: 3.7 124 | - uses: actions-rs/toolchain@v1 125 | with: 126 | toolchain: stable 127 | override: true 128 | 129 | - name: Install publishment tool 130 | working-directory: python 131 | run: | 132 | python -m pip install --upgrade pip 133 | pip install maturin twine 134 | 135 | - name: Build sdist 136 | working-directory: python 137 | run: maturin sdist 138 | 139 | - name: Publish test pypi 140 | working-directory: python 141 | run: | 142 | twine upload target/wheels/*.tar.gz --repository-url https://test.pypi.org/legacy/ -u ${{ 
secrets.TEST_PYPI_USER }} -p ${{ secrets.TEST_PYPI_PASS }} 143 | 144 | - name: Publish pypi 145 | working-directory: python 146 | run: | 147 | twine upload target/wheels/*.tar.gz -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASS }} 148 | 149 | -------------------------------------------------------------------------------- /.github/workflows/manylinux_build.yml: -------------------------------------------------------------------------------- 1 | name: build manylinux 2 | 3 | on: 4 | push: 5 | 6 | jobs: 7 | build: 8 | if: contains(github.event.head_commit.message, '[skip ci]') == false 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | python-version: 13 | - 3.7 14 | - 3.8 15 | - 3.9 16 | container: 17 | image: quay.io/pypa/manylinux2010_x86_64 18 | env: 19 | PATH: /root/.cargo/bin:/root/.local/bin:/opt/python/cp36-cp36m/bin:/opt/python/cp37-cp37m/bin:/opt/python/cp38-cp38/bin:/opt/rh/devtoolset-2/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/python/cp35-cp35m/bin:/opt/python/cp36-cp36m/bin:/opt/python/cp37-cp37m/bin:/opt/python/cp38-cp38/bin:/opt/python/cp39-cp39/bin:/opt/rh/devtoolset-8/root/usr/bin 20 | options: --user root 21 | env: 22 | HOME: /root 23 | PYTHON: python${{ matrix.python-version }} 24 | steps: 25 | - uses: actions/checkout@v1 26 | - name: Install rust 27 | run: | 28 | curl --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal 29 | - name: Test rust lib 30 | run: cargo test 31 | 32 | - name: Install dependencies with pip 33 | working-directory: python 34 | run: | 35 | $PYTHON -m pip install --upgrade pip 36 | $PYTHON -m venv .venv 37 | $PYTHON -m pip install poetry maturin 38 | poetry install 39 | poetry run which python 40 | 41 | - name: Build python package 42 | working-directory: python 43 | run: poetry run maturin develop 44 | 45 | - name: Test with pytest 46 | working-directory: python 47 | run: poetry run pytest tests 48 | 49 | - name: Install publishment tool 50 | if: 
github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/') 51 | working-directory: python 52 | run: $PYTHON -m pip install twine auditwheel 53 | 54 | - name: Build 55 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/') 56 | working-directory: python 57 | run: | 58 | maturin build --no-sdist --release --strip -i $PYTHON 59 | find target/ -type f -name "*whl" -exec $PYTHON -m auditwheel repair {} \; 60 | 61 | - name: Publish test pypi 62 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/') 63 | working-directory: python 64 | run: | 65 | twine upload target/wheels/*whl --repository-url https://test.pypi.org/legacy/ -u ${{ secrets.TEST_PYPI_USER }} -p ${{ secrets.TEST_PYPI_PASS }} 66 | 67 | - name: Publish pypi 68 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/') 69 | working-directory: python 70 | run: | 71 | twine upload target/wheels/*whl -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASS }} 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | Cargo.lock 4 | .venv 5 | .vscode/ 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: rustfmt 5 | name: rustfmt 6 | language: system 7 | entry: cargo fmt --all -- --check 8 | types: [file, rust] 9 | - id: check 10 | name: check 11 | language: system 12 | files: '[.]rs$' 13 | entry: cargo clippy --all-targets 14 | pass_filenames: false 15 | 16 | - repo: https://github.com/pre-commit/pre-commit-hooks 17 | rev: v2.4.0 18 | hooks: 19 | - id: check-added-large-files 20 | args: ['--maxkb=1000'] 21 | - id: check-merge-conflict 22 | - id: check-symlinks 23 | - id: 
flake8 24 |         exclude: scripts/ 25 |         args: ["--config", "python/setup.cfg"] 26 | 27 |   - repo: git@github.com:humitos/mirrors-autoflake.git 28 |     rev: v1.1 29 |     hooks: 30 |       - id: autoflake 31 |         args: ['--in-place', '--remove-all-unused-imports', '--remove-unused-variables'] 32 | 33 |   - repo: https://github.com/pre-commit/mirrors-isort 34 |     rev: v4.3.21 35 |     hooks: 36 |       - id: isort 37 | 38 |   - repo: https://github.com/ambv/black 39 |     rev: stable 40 |     hooks: 41 |       - id: black 42 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Publishing flow 2 | 3 | ## Rust 4 | 5 | 1. Fix version in `Cargo.toml` (e.g. 0.1.0) 6 | 1. Git tag version with prefix `rust/` (e.g. `git tag rust/0.1.0`) 7 | 1. Push tag to master 8 | 1. CI automatically publishes crates to crates.io after testing 9 | 10 | ## Python 11 | 12 | 1. Fix version in `python/pyproject.toml`, `python/Cargo.toml`, `python/src/lib.rs` 13 |     - Easily done with [pyversionup](https://github.com/tamuhey/pyversionup): `versionup 0.1.0` 14 | 1. Git tag version with prefix `python/` 15 | 1. Push tag to master 16 | 1.
CI automatically publishes the package to PyPI after testing 17 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tokenizations" 3 | version = "0.4.0" 4 | license = "MIT" 5 | description = "Tokenizations alignments library" 6 | homepage = "https://github.com/tamuhey/tokenizations" 7 | repository = "https://github.com/tamuhey/tokenizations" 8 | keywords = ["nlp", "text", "algorithm"] 9 | authors = ["Yohei Tamura "] 10 | readme = "README.md" 11 | documentation = "https://docs.rs/tokenizations" 12 | 13 | [dependencies] 14 | unicode-normalization = "0.1.8" 15 | seqdiff = "0.2" 16 | 17 | [dev-dependencies] 18 | quickcheck = "0.9" 19 | quickcheck_macros = "0.9" 20 | 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 tamuhey 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ⚠ MOVED 2 | 3 | This repository was moved to [https://github.com/explosion/tokenizations](https://github.com/explosion/tokenizations), and is now maintained by Explosion, spaCy's authors. 4 | 5 | # Robust and Fast tokenizations alignment library for Rust and Python 6 | [![crates.io](https://img.shields.io/crates/v/tokenizations.svg)](https://crates.io/crates/tokenizations) 7 | [![pypi](https://img.shields.io/pypi/v/pytokenizations.svg)](https://pypi.org/project/pytokenizations/) 8 | [![Actions Status](https://github.com/tamuhey/tokenizations/workflows/Test/badge.svg)](https://github.com/tamuhey/tokenizations/actions) 9 | 10 | ![sample](./img/demo.png) 11 | 12 | Demo: [demo](https://tamuhey.github.io/tokenizations/) 13 | Rust document: [docs.rs](https://docs.rs/tokenizations) 14 | Blog post: [How to calculate the alignment between BERT and spaCy tokens effectively and robustly](https://gist.github.com/tamuhey/af6cbb44a703423556c32798e1e1b704) 15 | -------------------------------------------------------------------------------- /demo/.cargo-ok: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tamuhey/tokenizations/bcb27e901c2a9413b45dfd030c6ff8764d753188/demo/.cargo-ok -------------------------------------------------------------------------------- /demo/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | Cargo.lock 4 | bin/ 5 | pkg/ 6 | wasm-pack.log 7 |
-------------------------------------------------------------------------------- /demo/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "demo" 3 | version = "0.1.0" 4 | authors = ["Yohei Tamura "] 5 | edition = "2018" 6 | 7 | [lib] 8 | crate-type = ["cdylib", "rlib"] 9 | 10 | [features] 11 | default = ["console_error_panic_hook"] 12 | 13 | [dependencies] 14 | wasm-bindgen = {version = "0.2", features= ["serde-serialize"]} 15 | tokenizations = "0.2.2" 16 | js-sys = "0.3.37" 17 | 18 | # The `console_error_panic_hook` crate provides better debugging of panics by 19 | # logging them with `console.error`. This is great for development, but requires 20 | # all the `std::fmt` and `std::panicking` infrastructure, so isn't great for 21 | # code size when deploying. 22 | console_error_panic_hook = { version = "0.1.1", optional = true } 23 | 24 | # `wee_alloc` is a tiny allocator for wasm that is only ~1K in code size 25 | # compared to the default allocator's ~10K. It is slower than the default 26 | # allocator, however. 27 | # 28 | # Unfortunately, `wee_alloc` requires nightly Rust when targeting wasm for now. 29 | wee_alloc = { version = "0.4.2", optional = true } 30 | 31 | [dev-dependencies] 32 | wasm-bindgen-test = "0.2" 33 | 34 | [profile.release] 35 | # Tell `rustc` to optimize for small code size. 36 | opt-level = "s" 37 | -------------------------------------------------------------------------------- /demo/LICENSE_APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 
175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /demo/LICENSE_MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 Yohei Tamura 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

wasm-pack-template

4 | 5 | A template for kick starting a Rust and WebAssembly project using wasm-pack. 6 | 7 |

8 | Build Status 9 |

10 | 11 |

12 | Tutorial 13 | | 14 | Chat 15 |

16 | 17 | Built with 🦀🕸 by The Rust and WebAssembly Working Group 18 |
19 | 20 | ## About 21 | 22 | [**📚 Read this template tutorial! 📚**][template-docs] 23 | 24 | This template is designed for compiling Rust libraries into WebAssembly and 25 | publishing the resulting package to NPM. 26 | 27 | Be sure to check out [other `wasm-pack` tutorials online][tutorials] for other 28 | templates and usages of `wasm-pack`. 29 | 30 | [tutorials]: https://rustwasm.github.io/docs/wasm-pack/tutorials/index.html 31 | [template-docs]: https://rustwasm.github.io/docs/wasm-pack/tutorials/npm-browser-packages/index.html 32 | 33 | ## 🚴 Usage 34 | 35 | ### 🐑 Use `cargo generate` to Clone this Template 36 | 37 | [Learn more about `cargo generate` here.](https://github.com/ashleygwilliams/cargo-generate) 38 | 39 | ``` 40 | cargo generate --git https://github.com/rustwasm/wasm-pack-template.git --name my-project 41 | cd my-project 42 | ``` 43 | 44 | ### 🛠️ Build with `wasm-pack build` 45 | 46 | ``` 47 | wasm-pack build 48 | ``` 49 | 50 | ### 🔬 Test in Headless Browsers with `wasm-pack test` 51 | 52 | ``` 53 | wasm-pack test --headless --firefox 54 | ``` 55 | 56 | ### 🎁 Publish to NPM with `wasm-pack publish` 57 | 58 | ``` 59 | wasm-pack publish 60 | ``` 61 | 62 | ## 🔋 Batteries Included 63 | 64 | * [`wasm-bindgen`](https://github.com/rustwasm/wasm-bindgen) for communicating 65 | between WebAssembly and JavaScript. 66 | * [`console_error_panic_hook`](https://github.com/rustwasm/console_error_panic_hook) 67 | for logging panic messages to the developer console. 68 | * [`wee_alloc`](https://github.com/rustwasm/wee_alloc), an allocator optimized 69 | for small code size. 
70 | -------------------------------------------------------------------------------- /demo/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod utils; 2 | 3 | use js_sys; 4 | use tokenizations; 5 | use wasm_bindgen::prelude::*; 6 | use wasm_bindgen::JsCast; 7 | 8 | // When the `wee_alloc` feature is enabled, use `wee_alloc` as the global 9 | // allocator. 10 | #[cfg(feature = "wee_alloc")] 11 | #[global_allocator] 12 | static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT; 13 | 14 | #[wasm_bindgen] 15 | extern "C" { 16 | fn alert(s: &str); 17 | } 18 | 19 | fn as_vecstring(s: js_sys::Array) -> Vec { 20 | s.iter().map(|v| v.as_string().unwrap()).collect::>() 21 | } 22 | 23 | #[wasm_bindgen] 24 | pub fn get_alignment(s: js_sys::Array, t: js_sys::Array) -> JsValue { 25 | let s = as_vecstring(s); 26 | let t = as_vecstring(t); 27 | let ret = tokenizations::get_alignments(&s, &t); 28 | JsValue::from_serde(&ret).unwrap() 29 | } 30 | -------------------------------------------------------------------------------- /demo/src/utils.rs: -------------------------------------------------------------------------------- 1 | pub fn set_panic_hook() { 2 | // When the `console_error_panic_hook` feature is enabled, we can call the 3 | // `set_panic_hook` function at least once during initialization, and then 4 | // we will get better error messages if our code ever panics. 5 | // 6 | // For more details see 7 | // https://github.com/rustwasm/console_error_panic_hook#readme 8 | #[cfg(feature = "console_error_panic_hook")] 9 | console_error_panic_hook::set_once(); 10 | } 11 | -------------------------------------------------------------------------------- /demo/tests/web.rs: -------------------------------------------------------------------------------- 1 | //! Test suite for the Web and headless browsers. 
2 | 3 | #![cfg(target_arch = "wasm32")] 4 | 5 | extern crate wasm_bindgen_test; 6 | use wasm_bindgen_test::*; 7 | 8 | wasm_bindgen_test_configure!(run_in_browser); 9 | 10 | #[wasm_bindgen_test] 11 | fn pass() { 12 | assert_eq!(1 + 1, 2); 13 | } 14 | -------------------------------------------------------------------------------- /demo/www/.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | "@babel/preset-env", 4 | "@babel/preset-react" 5 | ] 6 | } -------------------------------------------------------------------------------- /demo/www/.bin/create-wasm-app.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const { spawn } = require("child_process"); 4 | const fs = require("fs"); 5 | 6 | let folderName = '.'; 7 | 8 | if (process.argv.length >= 3) { 9 | folderName = process.argv[2]; 10 | if (!fs.existsSync(folderName)) { 11 | fs.mkdirSync(folderName); 12 | } 13 | } 14 | 15 | const clone = spawn("git", ["clone", "https://github.com/rustwasm/create-wasm-app.git", folderName]); 16 | 17 | clone.on("close", code => { 18 | if (code !== 0) { 19 | console.error("cloning the template failed!") 20 | process.exit(code); 21 | } else { 22 | console.log("🦀 Rust + 🕸 Wasm = ❤"); 23 | } 24 | }); 25 | -------------------------------------------------------------------------------- /demo/www/.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | root: true, 3 | parser: '@typescript-eslint/parser', 4 | plugins: [ 5 | '@typescript-eslint', 6 | "react-hooks", 7 | "prettier", 8 | ], 9 | parserOptions: { 10 | tsconfigRootDir: __dirname, 11 | project: ['./tsconfig.json'], 12 | }, 13 | extends: [ 14 | 'plugin:@typescript-eslint/recommended-requiring-type-checking', 15 | "plugin:react/recommended", 16 | "prettier", 17 | "prettier/@typescript-eslint", 18 | "prettier/react", 19 | ], 20 | rules: { 21 
| "react/jsx-props-no-spreading": "off", 22 | "no-underscore-dangle": "off", 23 | "prettier/prettier": "error", 24 | "react-hooks/rules-of-hooks": "error", // Checks rules of Hooks 25 | "react-hooks/exhaustive-deps": "warn", // Checks effect dependencies 26 | "react/prop-types": "off", 27 | } 28 | }; -------------------------------------------------------------------------------- /demo/www/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist 3 | -------------------------------------------------------------------------------- /demo/www/LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /demo/www/LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) [year] [name] 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /demo/www/README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

create-wasm-app

4 | 5 | An npm init template for kick starting a project that uses NPM packages containing Rust-generated WebAssembly and bundles them with Webpack. 6 | 7 |

8 | Build Status 9 |

10 | 11 |

12 | Usage 13 | | 14 | Chat 15 |

16 | 17 | Built with 🦀🕸 by The Rust and WebAssembly Working Group 18 |
19 | 20 | ## About 21 | 22 | This template is designed for depending on NPM packages that contain 23 | Rust-generated WebAssembly and using them to create a Website. 24 | 25 | * Want to create an NPM package with Rust and WebAssembly? [Check out 26 | `wasm-pack-template`.](https://github.com/rustwasm/wasm-pack-template) 27 | * Want to make a monorepo-style Website without publishing to NPM? Check out 28 | [`rust-webpack-template`](https://github.com/rustwasm/rust-webpack-template) 29 | and/or 30 | [`rust-parcel-template`](https://github.com/rustwasm/rust-parcel-template). 31 | 32 | ## 🚴 Usage 33 | 34 | ``` 35 | npm init wasm-app 36 | ``` 37 | 38 | ## 🔋 Batteries Included 39 | 40 | - `.gitignore`: ignores `node_modules` 41 | - `LICENSE-APACHE` and `LICENSE-MIT`: most Rust projects are licensed this way, so these are included for you 42 | - `README.md`: the file you are reading now! 43 | - `index.html`: a bare bones html document that includes the webpack bundle 44 | - `index.js`: example js file with a comment showing how to import and use a wasm pkg 45 | - `package.json` and `package-lock.json`: 46 | - pulls in devDependencies for using webpack: 47 | - [`webpack`](https://www.npmjs.com/package/webpack) 48 | - [`webpack-cli`](https://www.npmjs.com/package/webpack-cli) 49 | - [`webpack-dev-server`](https://www.npmjs.com/package/webpack-dev-server) 50 | - defines a `start` script to run `webpack-dev-server` 51 | - `webpack.config.js`: configuration file for bundling your js with webpack 52 | 53 | ## License 54 | 55 | Licensed under either of 56 | 57 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 58 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 59 | 60 | at your option. 
61 | 62 | ### Contribution 63 | 64 | Unless you explicitly state otherwise, any contribution intentionally 65 | submitted for inclusion in the work by you, as defined in the Apache-2.0 66 | license, shall be dual licensed as above, without any additional terms or 67 | conditions. 68 | -------------------------------------------------------------------------------- /demo/www/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "create-wasm-app", 3 | "version": "0.1.0", 4 | "description": "create an app to consume rust-generated wasm packages", 5 | "main": "index.js", 6 | "bin": { 7 | "create-wasm-app": ".bin/create-wasm-app.js" 8 | }, 9 | "scripts": { 10 | "start": "webpack-dev-server", 11 | "build": "webpack --mode production", 12 | "predeploy": "npm run build", 13 | "deploy": "gh-pages -d dist" 14 | }, 15 | "repository": { 16 | "type": "git", 17 | "url": "git+https://github.com/rustwasm/create-wasm-app.git" 18 | }, 19 | "keywords": [ 20 | "webassembly", 21 | "wasm", 22 | "rust", 23 | "webpack" 24 | ], 25 | "author": "Ashley Williams ", 26 | "license": "(MIT OR Apache-2.0)", 27 | "bugs": { 28 | "url": "https://github.com/rustwasm/create-wasm-app/issues" 29 | }, 30 | "homepage": "https://github.com/rustwasm/create-wasm-app#readme", 31 | "devDependencies": { 32 | "@babel/core": "^7.9.0", 33 | "@babel/preset-env": "^7.9.5", 34 | "@babel/preset-react": "^7.9.4", 35 | "@types/react": "^16.9.34", 36 | "@types/react-dom": "^16.9.6", 37 | "babel-loader": "^8.1.0", 38 | "copy-webpack-plugin": "^5.0.0", 39 | "eslint": "^6.8.0", 40 | "eslint-config-prettier": "^6.10.1", 41 | "eslint-plugin-import": "^2.20.2", 42 | "eslint-plugin-jsx-a11y": "^6.2.3", 43 | "eslint-plugin-prettier": "^3.1.2", 44 | "eslint-plugin-react": "^7.19.0", 45 | "eslint-plugin-react-hooks": "^3.0.0", 46 | "gh-pages": "^2.2.0", 47 | "hello-wasm-pack": "^0.1.0", 48 | "html-loader": "^1.1.0", 49 | "html-webpack-plugin": "^4.2.0", 50 | "prettier": 
"^2.0.2", 51 | "source-map-loader": "^0.2.4", 52 | "ts-loader": "^7.0.1", 53 | "webpack": "^4.29.3", 54 | "webpack-cli": "^3.1.0", 55 | "webpack-dev-server": "^3.1.5" 56 | }, 57 | "dependencies": { 58 | "@material-ui/core": "^4.9.11", 59 | "@material-ui/icons": "^4.9.1", 60 | "@material-ui/styles": "^4.10.0", 61 | "react": "^16.13.1", 62 | "react-dom": "^16.13.1", 63 | "react-lineto": "^3.1.4", 64 | "react-scripts": "3.4.1", 65 | "tokenization": "file:../pkg", 66 | "typescript": "~3.7.2" 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /demo/www/src/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | tokenization 7 | 8 | 9 | 10 |
11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /demo/www/src/index.tsx: -------------------------------------------------------------------------------- 1 | import React, { useState, useEffect } from "react"; 2 | import ReactDOM from "react-dom"; 3 | import GitHub from "@material-ui/icons/GitHub"; 4 | import { makeStyles, createStyles, ThemeProvider } from "@material-ui/styles"; 5 | import { createMuiTheme, Theme } from "@material-ui/core/styles"; 6 | import Container from "@material-ui/core/Container"; 7 | import Paper from "@material-ui/core/Paper"; 8 | import Grid from "@material-ui/core/Grid"; 9 | import TextField from "@material-ui/core/TextField"; 10 | import Typography from "@material-ui/core/Typography"; 11 | import Box from "@material-ui/core/Box"; 12 | import Link from "@material-ui/core/Link"; 13 | import LineTo from "react-lineto"; 14 | 15 | const repoURL = "https://github.com/tamuhey/tokenizations"; 16 | const repoWWWURL = "https://github.com/tamuhey/tokenizations/tree/master/demo"; 17 | const tryParse = (input: string): [string[], boolean] => { 18 | try { 19 | const tokens = JSON.parse(input); 20 | return [tokens, false]; 21 | } catch { 22 | return [[], true]; 23 | } 24 | }; 25 | 26 | const useStyles = makeStyles((theme: Theme) => 27 | createStyles({ 28 | textField: { 29 | fontSize: "1.3rem", 30 | }, 31 | tokenBox: { 32 | padding: 10, 33 | border: "1px solid black", 34 | borderRadius: 10, 35 | }, 36 | tokensContainer: { 37 | display: "flex", 38 | padding: theme.spacing(3), 39 | margin: theme.spacing(3), 40 | backgroundColor: theme.palette.background.paper, 41 | }, 42 | titleBox: { 43 | display: "flex", 44 | justifyContent: "center", 45 | margin: 3, 46 | alignItems: "baseline", 47 | }, 48 | githubIcon: { 49 | color: "black", 50 | marginLeft: 20, 51 | }, 52 | gridContainer: { 53 | padding: 30, 54 | }, 55 | container: { 56 | marginTop: 20, 57 | }, 58 | }) 59 | ); 60 | 61 | interface InputProps { 62 | 
text: string; 63 | setText: (text: string) => void; 64 | error: boolean; 65 | } 66 | 67 | const theme = createMuiTheme(); 68 | const Index = () => ( 69 | 70 | 71 | 72 | ); 73 | 74 | const App = () => { 75 | const [inputA, setInputA] = useState(`["John", "Johånson", "'s", "house"]`); 76 | const [inputB, setInputB] = useState( 77 | `["john", "johan", "##son", "s", "house"]` 78 | ); 79 | const [tokensA, errorA] = tryParse(inputA); 80 | const [tokensB, errorB] = tryParse(inputB); 81 | const [tokenization, setTokenization] = useState(null); 82 | const loadWasm = async () => setTokenization(await import("tokenization")); 83 | const classes = useStyles(); 84 | 85 | useEffect(() => { 86 | loadWasm(); 87 | }); 88 | const [a2b]: number[][][] = tokenization 89 | ? tokenization.get_alignment(tokensA, tokensB) 90 | : [[], []]; 91 | console.log(a2b); 92 | return ( 93 | 94 | 95 | 96 | Tokenizations Demo 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | Tokenizations is a token alignment 105 | library for Rust and Python. Feel free to change the below texts. 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 |
115 | 116 | 117 | {tokensA.map((token, i) => ( 118 | 119 | {token} 120 | 121 | ))} 122 | 123 | 124 | 125 | 126 | {tokensB.map((token, i) => { 127 | return ( 128 | 129 | {token} 130 | 131 | ); 132 | })} 133 | 134 | 135 |
136 | {a2b.map((l, i) => { 137 | return l.map((j) => ( 138 | 148 | )); 149 | })} 150 | 151 | 152 | This page is built with React and Wasm. The source is{" "} 153 | here. 154 | 155 | 156 |
157 |
158 |
159 | ); 160 | }; 161 | 162 | const Input = ({ text, setText, error }: InputProps) => { 163 | const classes = useStyles(); 164 | return ( 165 | setText(e.target.value)} 168 | error={error} 169 | fullWidth 170 | InputProps={{ 171 | classes: { 172 | input: classes.textField, 173 | }, 174 | }} 175 | helperText={error ? "Invalid JSON array" : ""} 176 | /> 177 | ); 178 | }; 179 | 180 | ReactDOM.render(, document.getElementById("container")); 181 | -------------------------------------------------------------------------------- /demo/www/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "outDir": "./dist/", 4 | "sourceMap": true, 5 | "noImplicitAny": true, 6 | "module": "ESNext", 7 | "target": "es6", 8 | "jsx": "react", 9 | "esModuleInterop": true, 10 | "moduleResolution": "node", 11 | "typeRoots": [ 12 | "types" 13 | ] 14 | } 15 | } -------------------------------------------------------------------------------- /demo/www/types/react-linto/index.d.ts: -------------------------------------------------------------------------------- 1 | declare module "react-lineto" { 2 | import { Component, PureComponent } from "react"; 3 | 4 | /** 5 | * Orientation type for 'Stepped' lines 6 | */ 7 | type Orientation = "h" | "v"; 8 | 9 | /** 10 | * Delay 11 | */ 12 | type Delay = number | boolean; 13 | 14 | /** 15 | * Anchor type 16 | */ 17 | type Anchor = string; 18 | 19 | /** 20 | * Coordinate type 21 | */ 22 | type Coordinate = { x: number } | { y: number }; 23 | 24 | /** 25 | * Coordinates type 26 | */ 27 | type Coordinates = { 28 | x: number; 29 | y: number; 30 | }; 31 | 32 | /** 33 | * Line coordinates 34 | */ 35 | interface LineCoordinates { 36 | /** 37 | * First X coordinate 38 | */ 39 | x0: number; 40 | /** 41 | * Second X coordinate 42 | */ 43 | x1: number; 44 | /** 45 | * First Y coordinate 46 | */ 47 | y0: number; 48 | /** 49 | * Second Y coordinate 50 | */ 51 | y1: number; 52 | } 53 | 54 | /** 
55 | * Base props for all components 56 | */ 57 | interface BaseProps { 58 | /** 59 | * Border color, Example: #f00, red, etc. 60 | */ 61 | borderColor?: string; 62 | /** 63 | * Border style, Example: solid, dashed, etc. 64 | */ 65 | borderStyle?: string; 66 | /** 67 | * Border width (px) 68 | */ 69 | borderWidth?: number; 70 | /** 71 | * Desired CSS className for the rendered element 72 | */ 73 | className?: string; 74 | /** 75 | * Z-index offset 76 | */ 77 | zIndex?: number; 78 | /** 79 | * CSS class name of the desired container 80 | */ 81 | within?: string; 82 | } 83 | 84 | /** 85 | * Common props for 'LineTo' and 'SteppedLineTo' components 86 | */ 87 | interface LineToCommonProps extends BaseProps { 88 | /** 89 | * Force render after delay (ms) 90 | */ 91 | delay?: Delay; 92 | /** 93 | * Anchor for starting point (Format: "x y") 94 | */ 95 | fromAnchor?: Anchor; 96 | /** 97 | * CSS class name of the first element 98 | */ 99 | from: string; 100 | /** 101 | * Anchor for ending point (Format: 'x y") 102 | */ 103 | toAnchor?: Anchor; 104 | /** 105 | * CSS class name of the second element 106 | */ 107 | to: string; 108 | } 109 | 110 | /** 111 | * Common props for 'Line' and 'SteppedLine' components 112 | */ 113 | interface LineCommonProps extends BaseProps, LineCoordinates {} 114 | 115 | /** 116 | * Props for 'Stepped' components 117 | */ 118 | interface SteppedProps { 119 | /** 120 | * "h" for horizontal, "v" for vertical 121 | */ 122 | orientation?: Orientation; 123 | } 124 | 125 | /** 126 | * Props of 'LineTo' component 127 | */ 128 | export interface LineToProps extends LineToCommonProps {} 129 | 130 | /** 131 | * Props of 'SteppedLineTo' component 132 | */ 133 | export interface SteppedLineToProps extends LineToProps, SteppedProps {} 134 | 135 | /** 136 | * Props of 'Line' component 137 | */ 138 | export interface LineProps extends LineCommonProps {} 139 | 140 | /** 141 | * Props of 'SteppedLine' component 142 | */ 143 | export interface SteppedLineProps 
extends LineProps, SteppedProps {} 144 | 145 | /** 146 | * Draw line between two DOM elements. 147 | */ 148 | export default class LineTo< 149 | P extends LineToProps = LineToProps 150 | > extends Component> { 151 | /** 152 | * Forced update after delay (MS) 153 | */ 154 | deferUpdate: (delay: number) => void; 155 | 156 | /** 157 | * Parse delay prop 158 | */ 159 | parseDelay: (delay?: Delay) => number; 160 | 161 | /** 162 | * Parse anchor given as percentage 163 | */ 164 | parseAnchorPercent: (value: string) => number; 165 | 166 | /** 167 | * Parse anchor given as text 168 | */ 169 | parseAnchorText: (value: string) => Coordinate; 170 | 171 | /** 172 | * Parse anchor prop 173 | */ 174 | parseAnchor: (value?: Anchor) => Coordinates; 175 | 176 | /** 177 | * Detect coordinates 178 | */ 179 | detect: () => LineCoordinates; 180 | 181 | /** 182 | * Find element by class 183 | */ 184 | findElement: (className: string) => Element; 185 | } 186 | 187 | /** 188 | * Draw stepped line between two DOM elements. 189 | */ 190 | export class SteppedLineTo extends LineTo {} 191 | 192 | /** 193 | * Draw line using pixel coordinates (relative to viewport). 194 | */ 195 | export class Line extends PureComponent { 196 | /** 197 | * Find element by class 198 | */ 199 | findElement: (className: string) => Element; 200 | } 201 | 202 | /** 203 | * Draw stepped line using pixel coordinates (relative to viewport). 
204 | */ 205 | export class SteppedLine extends PureComponent { 206 | /** 207 | * Render vertically 208 | */ 209 | renderVertical: () => React.ReactNode; 210 | 211 | /** 212 | * Render horizontally 213 | */ 214 | renderHorizontal: () => React.ReactNode; 215 | } 216 | } 217 | -------------------------------------------------------------------------------- /demo/www/webpack.config.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | const HtmlWebPackPlugin = require("html-webpack-plugin"); 3 | 4 | const src = path.resolve(__dirname, "src"); 5 | const dist = path.resolve(__dirname, "dist"); 6 | 7 | module.exports = { 8 | mode: "development", 9 | entry: src + "/index.tsx", 10 | output: { 11 | path: dist, 12 | filename: "bundle.js", 13 | }, 14 | module: { 15 | rules: [ 16 | { 17 | test: /\.ts(x?)$/, 18 | exclude: /node_modules/, 19 | use: { 20 | loader: "ts-loader", 21 | }, 22 | }, 23 | { 24 | enforce: "pre", 25 | test: /\.js$/, 26 | loader: "source-map-loader", 27 | }, 28 | ], 29 | }, 30 | resolve: { 31 | extensions: [".js", ".jsx", ".ts", "tsx"], 32 | }, 33 | plugins: [ 34 | new HtmlWebPackPlugin({ 35 | template: src + "/index.html", 36 | filename: "index.html", 37 | }), 38 | ], 39 | }; 40 | -------------------------------------------------------------------------------- /dockerfiles/centos/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM centos:7 2 | ENV HOME /root 3 | ENV PATH $HOME/.pyenv/bin:$HOME/.pyenv/shims:$HOME/.cargo/bin:$HOME/.local/bin:$PATH 4 | RUN yum update -y && yum install -y git gcc make zlib-devel && \ 5 | curl https://pyenv.run | bash 6 | # pyenv prequisits. 
see https://github.com/pyenv/pyenv/wiki/common-build-problems 7 | RUN yum install @development zlib-devel bzip2 bzip2-devel readline-devel sqlite sqlite-devel openssl-devel xz xz-devel libffi-devel findutils -y 8 | RUN pyenv install 3.8.2 && \ 9 | pyenv install 3.7.6 && \ 10 | pyenv install 3.6.10 && \ 11 | pyenv install 3.5.4 && \ 12 | pyenv global 3.8.2 3.7.6 3.6.10 3.5.4 13 | -------------------------------------------------------------------------------- /dockerfiles/ci/centos/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.pkg.github.com/tamuhey/tokenizations/centos7-python:0 2 | -------------------------------------------------------------------------------- /dockerfiles/ci/manylinux/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/pypa/manylinux1_x86_64:2020-03-07-9c5ba95 2 | ENV PATH /root/.cargo/bin:/root/.local/bin:/opt/python/cp36-cp36m/bin:/opt/python/cp37-cp37m/bin:/opt/python/cp38-cp38/bin:/opt/rh/devtoolset-2/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/python/cp35-cp35m/bin:/opt/python/cp36-cp36m/bin:/opt/python/cp37-cp37m/bin:/opt/python/cp38-cp38/bin -------------------------------------------------------------------------------- /img/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tamuhey/tokenizations/bcb27e901c2a9413b45dfd030c6ff8764d753188/img/demo.png -------------------------------------------------------------------------------- /note/algorithm.md: -------------------------------------------------------------------------------- 1 | # Algorithm 2 | 3 | Let $A = a_{11}a_{12}..a_{1k_1},a_{21}..a_{Nk_N}$ and $B = b_{11}b_{12}..b_{1l_1},b_{21}..b_{Ml_M}$ be tokens of length N and M respectively. Each token $A_i$ in $A$ and $B_j$ in $B$ have length $k_i$ and $l_j$ respectively. 
4 | The *alignment* $AL_{AB}$ of $A$ to $B$ is such that $ \forall j \in AL_{AB,i} => B_j \cap A_i $. ($t \cap s$ means t partially matches s.) 5 | For example, $a=["f","o","o"], b=["fo","o"] => AL_{AB} = [[1],[1],[2]], AL_{BA} = [[1, 2], [3]]$. 6 | The goal of this algorithm is to find such $AL_{AB}$ and $AL_{BA}$ 7 | 8 | 1. Normalize tokens in the unicode normalization form "NFKD", then lowercase all characters. 9 | 2. Concatenate all tokens $A$ and $B$ to generate $TA$ and $TB$ respectively 10 | 3. Calculate shortest path on edit graph of $TA$ and $TB$ 11 | 4. Get character mapping $C_{AB}$ and $C_{BA}$ from the edit graph 12 | 5. Get $AL_{AB}$ and $AL_{BA}$ from the character alignments $C_{AB}$ and $C_{BA}$ 13 | 14 | Details: 15 | 16 | 1. Normalize tokens in the unicode normalization form "NFKD" 17 | 18 | To compare the token positions, we must compare each characters in tokens. Because the two tokenizations may be partially different, we normalize them in "NFKD" and lowercase them first. 19 | 20 | 2. Concatenate all tokens $A$ and $B$ to generate $TA$ and $TB$ respectively 21 | 22 | Before calculating the edit graph, we combine tokens into text. For example, if we have tokens `["Foo", "bar"]`, we concatenate them into one text `Foobar`. 23 | 24 | 3. Calculate shortest path on edit graph from $TA$ and $TB$ 25 | 26 | We calculate the shortest path on edit graph from texts $TA$ and $TB$ to get character map between them. The path can be calculated, for example, by [Myers' algorighm](http://www.xmailserver.org/diff2.pdf) 27 | 28 | 4. Get character alignments $C_{AB}$ and $C_{BA}$ from the edit graph 29 | 30 | Let $TA_i$ and $TB_j$ be the i-th and j-th character in the text $TA$ and $TB$, respectively. $C_{AB}$ is a mapping from $TA$ to $TB$ such that $C_{AB},i \neq -1 \land C_{AB,i} = j \Rightarrow TA_i = TA_j$. For example, $TA = f0o, TB = fboo$ then $C_{AB} = [1,-1,3], C_{BA} = [1,-1,3,-1]$. 
31 | We can calculate $C_{AB}$ and $C_{BA}$ from the shortest path on the edit graph. If there exists a diagonal edge $(i-1,j-1) -> (i, j)$ in the path, then $C_{AB,i} = j$ and $C_{BA,j} = i$. If there is no diagonal edge into $(i, j)$ for any $j$, then $C_{AB,i} = -1$. 32 | 33 | 5. Get $AL_{AB}$ and $AL_{BA}$ from the character alignments $C_{AB}$ and $C_{BA}$ 34 | 35 | Now we can calculate the desired $AL_{AB}$ and $AL_{BA}$ from the previously calculated $C_{AB}$ and $C_{BA}$. -------------------------------------------------------------------------------- /note/blog_post.md: -------------------------------------------------------------------------------- 1 | # How to calculate the alignment between BERT and spaCy tokens effectively and robustly 2 | 3 | [![image](https://user-images.githubusercontent.com/24998666/82346698-c22c9c80-9a31-11ea-8ac2-709af9227060.png)](https://tamuhey.github.io/tokenizations/) 4 | 5 | site: https://tamuhey.github.io/tokenizations/ 6 | 7 | Natural Language Processing (NLP) has made great progress in recent years because of neural networks, which allow us to solve various tasks with end-to-end architectures. 8 | However, many NLP systems still require language-specific pre- and post-processing, especially in tokenization. 9 | In this article, I describe an algorithm which simplifies the calculation of the correspondence between tokens (e.g. BERT vs. spaCy), one such process. 10 | And I introduce Python and Rust libraries that implement this algorithm. 11 | 12 | Here are the library and demo site links: 13 | 14 | - repo: https://github.com/tamuhey/tokenizations 15 | - demo: https://tamuhey.github.io/tokenizations/ 16 | 17 | # What is "alignment" of tokens and Why is it necessary? 18 | 19 | Suppose we want to combine a BERT-based named entity recognition (NER) model with a rule-based NER model built on top of spaCy.
20 | Although BERT's NER exhibits [extremely high performance](http://nlpprogress.com/english/named_entity_recognition.html), 21 | it is usually combined with rule-based approaches for practical purposes. 22 | In such cases, what often bothers us is that the tokens of spaCy and BERT are different, even if the input sentences are the same. 23 | For example, let's say the input sentence is "John Johanson 's house"; BERT tokenizes this sentence like `["john", "johan", "##son", "'", "s", "house"]` and spaCy tokenizes it like `["John", "Johanson", "'s", "house"]`. 24 | In order to combine the outputs, we need to calculate the correspondence between the two different token sequences. 25 | This correspondence is the "alignment". 26 | 27 | # How to calculate the alignment? 28 | 29 | First, let's sort out the problem. 30 | Looking at the previous example, it can be said that two different token sequences have the following characteristics: 31 | 32 | 1. Split at different offsets 33 | 2. Normalized (e.g. lowercase, unicode normalization, dropping accents...) 34 | 3. Added noise (meta symbol '#' in the previous case) 35 | 36 | If the token sequences differ only in *1.*, it can be easily solved, because we just need to compare the letters in order from the beginning. 37 | In fact, `spacy.gold.align`, which [I implemented previously](https://github.com/explosion/spaCy/pull/4526), is based on this algorithm. 38 | 39 | However, when the features *2.* and *3.* are taken into account, the problem suddenly becomes more difficult. 40 | If you want to deal with the previous example, it can be solved relatively easily by lowercasing (e.g. A -> a) and removing meta symbols (e.g. "#" -> ""), but this depends on each tokenizer and isn't a general-purpose method. 41 | Of course, we want a generic implementation that **works for any tokenizer**. 42 | 43 | Let's think about how to deal with *2.* and *3.*.
44 | 45 | ## Normalization 46 | 47 | In order to compare letters, we first need to normalize the input tokens. 48 | This is because even though two letters may look the same, the underlying data may be different. 49 | There are a variety of normalization methods used in NLP. For example: 50 | 51 | - [Unicode normalizations](https://unicode.org/faq/normalization.html) 52 | - Dropping accents ("å" -> "a") 53 | - Lowercasing ("A" -> "a") 54 | 55 | Unicode normalizations are defined in the Unicode Standard. 56 | There are 4 types of Unicode normalizations: NFC, NFD, NFKC, NFKD. 57 | Of these, in NFKD, letters are decomposed based on compatibility, 58 | and the number of letter types is the smallest and the probability 59 | of matching is the highest among the four methods. (see the [Unicode document](https://unicode.org/faq/normalization.html) for details). 60 | For example, you can detect that the letter "a" is a part of "å" with NFKD, but not with NFKC. 61 | 62 | ![](https://user-images.githubusercontent.com/24998666/81841036-c87bce00-9584-11ea-9d8a-e53689f0de7b.png) 63 | 64 | Thus, we first normalize the input tokens in NFKD form. 65 | Then, we lowercase all letters because lowercasing is also often used in NLP. 66 | 67 | ## Compare noisy texts 68 | 69 | Now we can compare almost all tokens thanks to NFKD and lowercasing, but they still contain some noise (e.g. "#"), 70 | so we cannot completely compare all letters in the tokens. 71 | How can we properly ignore the noise and compare all letters? 72 | I racked my brain for a few days trying to solve this problem. 73 | 74 | Then, I came up with a solution based on a tool that I use every day. 75 | It is **diff**. 76 | diff is a tool that compares two texts and outputs the mismatches.
77 | It is built into `git` as `git diff`, and you can display the character-level correspondence as follows: 78 | 79 | ![image](https://user-images.githubusercontent.com/24998666/81947250-4ac6c980-963b-11ea-86ad-589bc3dad891.png) 80 | 81 | In our case, what we want to know is the agreement part, not the difference, but these are pretty much the same thing. 82 | So, what kind of algorithm is `diff` based on? 83 | 84 | According to the [git diff documentation](https://git-scm.com/docs/git-diff), it is based on [Myers' algorithm](http://www.xmailserver.org/diff2.pdf). 85 | Myers' algorithm is one of the dynamic programming methods that computes the shortest path on what is called an edit graph. 86 | It works very fast, especially if the difference between the two inputs is small. 87 | For now, what we want to compare are almost identical, so we can get the correspondence of the letters very quickly. 88 | 89 | In short, it turns out that Myers' algorithm helps us to get the correspondence of the letters in two sequences of tokens, while properly ignoring some noise. 90 | 91 | ## Overview of the algorithm 92 | 93 | The considerations so far have shown that suitable normalizations and a character-based diff give us a generic method for computing 94 | the alignment of two token sequences. 95 | Let's summarize the specific steps briefly. 96 | 97 | Let `tokens_a` and `tokens_b` be token sequences of type `List[str]` to be compared. For example, `tokens_a = ["foo", "bar", "baz"]`. 98 | 99 | 1. Normalize all tokens with `NFKD` and lowercasing. 100 | 101 | For example, `"Foo" -> "foo"` 102 | 103 | 2. Concatenate the tokens into one string and let the results be `cat_a` and `cat_b` respectively. 104 | 105 | For example, `cat_a = "".join(tokens_a)` in Python. 106 | 107 | 3. Get the character based diff between the strings `cat_a` and `cat_b`. 108 | 109 | The character based diff can be calculated with [Myers' algorithm](http://www.xmailserver.org/diff2.pdf). 110 | 111 | 4.
Convert the character-based diff to a token-based diff. 112 | 113 | This is relatively easy to calculate because we know the mapping between the characters and tokens from step 2. 114 | 115 | # Implementation 116 | 117 | [Here is the repository](https://github.com/tamuhey/tokenizations) that implements this algorithm. 118 | This library, `tokenizations`, is implemented in **Rust** and provides a **Python** binding. 119 | 120 | For example, you can use the Python library as follows: 121 | 122 | ```Python 123 | # `$ pip install pytokenizations` to install the package 124 | import tokenizations 125 | 126 | tokens_a = ["John", "Johanson", "'s", "house"] 127 | tokens_b = ["john", "johan", "##son", "'", "s", "house"] 128 | a2b, b2a = tokenizations.get_alignments(tokens_a, tokens_b) 129 | 130 | for i in range(len(tokens_a)): 131 | print(tokens_a[i]) 132 | for j in a2b[i]: 133 | print(" ", tokens_b[j]) 134 | ``` 135 | 136 | ``` 137 | John 138 | john 139 | Johanson 140 | johan 141 | ##son 142 | 's 143 | ' 144 | s 145 | house 146 | house 147 | ``` 148 | 149 | # Conclusion 150 | 151 | In this article, I introduced an algorithm to align two token sequences that are produced by two different tokenizers. 152 | The title mentions spaCy and BERT, but this algorithm can be applied to any tokenizers. 153 | Also, it can be useful for applying NLP methods to noisy texts which contain HTML tags, for example: 154 | remove the tags, apply the methods, then calculate the alignment for the output and original text. 155 | Here are the links to the library and demo.
156 | 157 | - repo: https://github.com/tamuhey/tokenizations 158 | - demo: https://tamuhey.github.io/tokenizations/ -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | ### https://raw.github.com/github/gitignore/499ae899e7b54e701e878759f73d9092302fd07a/Python.gitignore 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # celery beat schedule file 96 | celerybeat-schedule 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ 122 | .dmypy.json 123 | dmypy.json 124 | 125 | # Pyre type checker 126 | .pyre/ 127 | 128 | .vscode/ 129 | -------------------------------------------------------------------------------- /python/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pytokenizations" 3 | version = "0.8.2" 4 | authors = ["Yohei Tamura "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | tokenizations = "^0.4.0" 11 | 12 | 13 | [lib] 14 | name = "tokenizations" 15 | crate-type = ["cdylib"] 16 | 17 | [dependencies.pyo3] 18 | version = "^0.13.0" 19 | features = ["extension-module"] 20 | -------------------------------------------------------------------------------- /python/Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | poetry run maturin build 3 | develop: 4 | poetry run maturin develop 5 | test: develop 6 | poetry run pytest tests 7 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # pytokenizations 2 | 3 | ## Installation 4 | 5 | ```bash 6 | 
$ pip install pytokenizations 7 | ``` 8 | 9 | ### Install from source 10 | 11 | This library uses [maturin](https://github.com/PyO3/maturin) to build. 12 | 13 | ```console 14 | $ git clone https://github.com/tamuhey/tokenizations 15 | $ cd python 16 | $ pip install maturin 17 | $ maturin build 18 | ``` 19 | 20 | Now wheel is built in `python/target/wheels` directory. You can install it with `pip install *whl`. 21 | 22 | # Usage 23 | 24 | See the [README.md](https://github.com/tamuhey/tokenizations#usage-python) 25 | -------------------------------------------------------------------------------- /python/poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "atomicwrites" 3 | version = "1.3.0" 4 | description = "Atomic file writes." 5 | category = "dev" 6 | optional = false 7 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 8 | 9 | [[package]] 10 | name = "attrs" 11 | version = "19.3.0" 12 | description = "Classes Without Boilerplate" 13 | category = "dev" 14 | optional = false 15 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 16 | 17 | [package.extras] 18 | azure-pipelines = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "pytest-azurepipelines"] 19 | dev = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "sphinx", "pre-commit"] 20 | docs = ["sphinx", "zope.interface"] 21 | tests = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"] 22 | 23 | [[package]] 24 | name = "bleach" 25 | version = "3.3.0" 26 | description = "An easy safelist-based HTML-sanitizing tool." 
27 | category = "dev" 28 | optional = false 29 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 30 | 31 | [package.dependencies] 32 | packaging = "*" 33 | six = ">=1.9.0" 34 | webencodings = "*" 35 | 36 | [[package]] 37 | name = "certifi" 38 | version = "2019.11.28" 39 | description = "Python package for providing Mozilla's CA Bundle." 40 | category = "dev" 41 | optional = false 42 | python-versions = "*" 43 | 44 | [[package]] 45 | name = "cffi" 46 | version = "1.14.4" 47 | description = "Foreign Function Interface for Python calling C code." 48 | category = "dev" 49 | optional = false 50 | python-versions = "*" 51 | 52 | [package.dependencies] 53 | pycparser = "*" 54 | 55 | [[package]] 56 | name = "chardet" 57 | version = "3.0.4" 58 | description = "Universal encoding detector for Python 2 and 3" 59 | category = "dev" 60 | optional = false 61 | python-versions = "*" 62 | 63 | [[package]] 64 | name = "colorama" 65 | version = "0.4.3" 66 | description = "Cross-platform colored terminal text." 67 | category = "dev" 68 | optional = false 69 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 70 | 71 | [[package]] 72 | name = "cryptography" 73 | version = "3.3.2" 74 | description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
75 | category = "dev" 76 | optional = false 77 | python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*" 78 | 79 | [package.dependencies] 80 | cffi = ">=1.12" 81 | six = ">=1.4.1" 82 | 83 | [package.extras] 84 | docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] 85 | docstest = ["doc8", "pyenchant (>=1.6.11)", "twine (>=1.12.0)", "sphinxcontrib-spelling (>=4.0.1)"] 86 | pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] 87 | ssh = ["bcrypt (>=3.1.5)"] 88 | test = ["pytest (>=3.6.0,!=3.9.0,!=3.9.1,!=3.9.2)", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] 89 | 90 | [[package]] 91 | name = "docutils" 92 | version = "0.16" 93 | description = "Docutils -- Python Documentation Utilities" 94 | category = "dev" 95 | optional = false 96 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 97 | 98 | [[package]] 99 | name = "hypothesis" 100 | version = "6.3.0" 101 | description = "A library for property-based testing" 102 | category = "dev" 103 | optional = false 104 | python-versions = ">=3.6" 105 | 106 | [package.dependencies] 107 | attrs = ">=19.2.0" 108 | sortedcontainers = ">=2.1.0,<3.0.0" 109 | 110 | [package.extras] 111 | all = ["black (>=19.10b0)", "click (>=7.0)", "django (>=2.2)", "dpcontracts (>=0.4)", "lark-parser (>=0.6.5)", "libcst (>=0.3.16)", "numpy (>=1.9.0)", "pandas (>=0.25)", "pytest (>=4.6)", "python-dateutil (>=1.4)", "pytz (>=2014.1)", "redis (>=3.0.0)", "importlib-resources (>=3.3.0)", "importlib-metadata", "backports.zoneinfo (>=0.2.1)", "tzdata (>=2020.4)"] 112 | cli = ["click (>=7.0)", "black (>=19.10b0)"] 113 | codemods = ["libcst (>=0.3.16)"] 114 | dateutil = ["python-dateutil (>=1.4)"] 115 | django = ["pytz (>=2014.1)", "django (>=2.2)"] 116 | dpcontracts = ["dpcontracts (>=0.4)"] 117 | ghostwriter = ["black (>=19.10b0)"] 118 | lark = ["lark-parser (>=0.6.5)"] 119 | numpy = ["numpy (>=1.9.0)"] 120 | pandas = ["pandas (>=0.25)"] 121 | pytest = ["pytest 
(>=4.6)"] 122 | pytz = ["pytz (>=2014.1)"] 123 | redis = ["redis (>=3.0.0)"] 124 | zoneinfo = ["importlib-resources (>=3.3.0)", "backports.zoneinfo (>=0.2.1)", "tzdata (>=2020.4)"] 125 | 126 | [[package]] 127 | name = "idna" 128 | version = "2.9" 129 | description = "Internationalized Domain Names in Applications (IDNA)" 130 | category = "dev" 131 | optional = false 132 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 133 | 134 | [[package]] 135 | name = "importlib-metadata" 136 | version = "1.5.0" 137 | description = "Read metadata from Python packages" 138 | category = "dev" 139 | optional = false 140 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" 141 | 142 | [package.dependencies] 143 | zipp = ">=0.5" 144 | 145 | [package.extras] 146 | docs = ["sphinx", "rst.linker"] 147 | testing = ["packaging", "importlib-resources"] 148 | 149 | [[package]] 150 | name = "iniconfig" 151 | version = "1.0.0" 152 | description = "iniconfig: brain-dead simple config-ini parsing" 153 | category = "dev" 154 | optional = false 155 | python-versions = "*" 156 | 157 | [[package]] 158 | name = "jeepney" 159 | version = "0.6.0" 160 | description = "Low-level, pure Python DBus protocol wrapper." 161 | category = "dev" 162 | optional = false 163 | python-versions = ">=3.6" 164 | 165 | [package.extras] 166 | test = ["pytest", "pytest-trio", "pytest-asyncio", "testpath", "trio"] 167 | 168 | [[package]] 169 | name = "keyring" 170 | version = "22.0.1" 171 | description = "Store and access your passwords safely." 
172 | category = "dev" 173 | optional = false 174 | python-versions = ">=3.6" 175 | 176 | [package.dependencies] 177 | importlib-metadata = {version = ">=1", markers = "python_version < \"3.8\""} 178 | jeepney = {version = ">=0.4.2", markers = "sys_platform == \"linux\""} 179 | pywin32-ctypes = {version = "<0.1.0 || >0.1.0,<0.1.1 || >0.1.1", markers = "sys_platform == \"win32\""} 180 | SecretStorage = {version = ">=3.2", markers = "sys_platform == \"linux\""} 181 | 182 | [package.extras] 183 | docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] 184 | testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "pytest-enabler", "pytest-black (>=0.3.7)", "pytest-mypy"] 185 | 186 | [[package]] 187 | name = "packaging" 188 | version = "20.3" 189 | description = "Core utilities for Python packages" 190 | category = "dev" 191 | optional = false 192 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 193 | 194 | [package.dependencies] 195 | pyparsing = ">=2.0.2" 196 | six = "*" 197 | 198 | [[package]] 199 | name = "pkginfo" 200 | version = "1.5.0.1" 201 | description = "Query metadatdata from sdists / bdists / installed packages." 
202 | category = "dev" 203 | optional = false 204 | python-versions = "*" 205 | 206 | [package.extras] 207 | testing = ["nose", "coverage"] 208 | 209 | [[package]] 210 | name = "pluggy" 211 | version = "0.13.1" 212 | description = "plugin and hook calling mechanisms for python" 213 | category = "dev" 214 | optional = false 215 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 216 | 217 | [package.dependencies] 218 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 219 | 220 | [package.extras] 221 | dev = ["pre-commit", "tox"] 222 | 223 | [[package]] 224 | name = "py" 225 | version = "1.9.0" 226 | description = "library with cross-python path, ini-parsing, io, code, log facilities" 227 | category = "dev" 228 | optional = false 229 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 230 | 231 | [[package]] 232 | name = "pycparser" 233 | version = "2.20" 234 | description = "C parser in Python" 235 | category = "dev" 236 | optional = false 237 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 238 | 239 | [[package]] 240 | name = "pygments" 241 | version = "2.6.1" 242 | description = "Pygments is a syntax highlighting package written in Python." 
243 | category = "dev" 244 | optional = false 245 | python-versions = ">=3.5" 246 | 247 | [[package]] 248 | name = "pyparsing" 249 | version = "2.4.6" 250 | description = "Python parsing module" 251 | category = "dev" 252 | optional = false 253 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 254 | 255 | [[package]] 256 | name = "pytest" 257 | version = "6.2.2" 258 | description = "pytest: simple powerful testing with Python" 259 | category = "dev" 260 | optional = false 261 | python-versions = ">=3.6" 262 | 263 | [package.dependencies] 264 | atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} 265 | attrs = ">=19.2.0" 266 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 267 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 268 | iniconfig = "*" 269 | packaging = "*" 270 | pluggy = ">=0.12,<1.0.0a1" 271 | py = ">=1.8.2" 272 | toml = "*" 273 | 274 | [package.extras] 275 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] 276 | 277 | [[package]] 278 | name = "pywin32-ctypes" 279 | version = "0.2.0" 280 | description = "" 281 | category = "dev" 282 | optional = false 283 | python-versions = "*" 284 | 285 | [[package]] 286 | name = "readme-renderer" 287 | version = "24.0" 288 | description = "readme_renderer is a library for rendering \"readme\" descriptions for Warehouse" 289 | category = "dev" 290 | optional = false 291 | python-versions = "*" 292 | 293 | [package.dependencies] 294 | bleach = ">=2.1.0" 295 | docutils = ">=0.13.1" 296 | Pygments = "*" 297 | six = "*" 298 | 299 | [package.extras] 300 | md = ["cmarkgfm (>=0.2.0)"] 301 | 302 | [[package]] 303 | name = "requests" 304 | version = "2.23.0" 305 | description = "Python HTTP for Humans." 
306 | category = "dev" 307 | optional = false 308 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 309 | 310 | [package.dependencies] 311 | certifi = ">=2017.4.17" 312 | chardet = ">=3.0.2,<4" 313 | idna = ">=2.5,<3" 314 | urllib3 = ">=1.21.1,<1.25.0 || >1.25.0,<1.25.1 || >1.25.1,<1.26" 315 | 316 | [package.extras] 317 | security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"] 318 | socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] 319 | 320 | [[package]] 321 | name = "requests-toolbelt" 322 | version = "0.9.1" 323 | description = "A utility belt for advanced users of python-requests" 324 | category = "dev" 325 | optional = false 326 | python-versions = "*" 327 | 328 | [package.dependencies] 329 | requests = ">=2.0.1,<3.0.0" 330 | 331 | [[package]] 332 | name = "rfc3986" 333 | version = "1.4.0" 334 | description = "Validating URI References per RFC 3986" 335 | category = "dev" 336 | optional = false 337 | python-versions = "*" 338 | 339 | [package.extras] 340 | idna2008 = ["idna"] 341 | 342 | [[package]] 343 | name = "secretstorage" 344 | version = "3.3.0" 345 | description = "Python bindings to FreeDesktop.org Secret Service API" 346 | category = "dev" 347 | optional = false 348 | python-versions = ">=3.6" 349 | 350 | [package.dependencies] 351 | cryptography = ">=2.0" 352 | jeepney = ">=0.6" 353 | 354 | [[package]] 355 | name = "six" 356 | version = "1.14.0" 357 | description = "Python 2 and 3 compatibility utilities" 358 | category = "dev" 359 | optional = false 360 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 361 | 362 | [[package]] 363 | name = "sortedcontainers" 364 | version = "2.1.0" 365 | description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" 366 | category = "dev" 367 | optional = false 368 | python-versions = "*" 369 | 370 | [[package]] 371 | name = "toml" 372 | version = "0.10.1" 373 | description = "Python Library for Tom's Obvious, Minimal Language" 374 | category = "dev" 375 | optional = 
false 376 | python-versions = "*" 377 | 378 | [[package]] 379 | name = "tqdm" 380 | version = "4.43.0" 381 | description = "Fast, Extensible Progress Meter" 382 | category = "dev" 383 | optional = false 384 | python-versions = ">=2.6, !=3.0.*, !=3.1.*" 385 | 386 | [package.extras] 387 | dev = ["py-make (>=0.1.0)", "twine", "argopt", "pydoc-markdown"] 388 | 389 | [[package]] 390 | name = "twine" 391 | version = "3.3.0" 392 | description = "Collection of utilities for publishing packages on PyPI" 393 | category = "dev" 394 | optional = false 395 | python-versions = ">=3.6" 396 | 397 | [package.dependencies] 398 | colorama = ">=0.4.3" 399 | importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} 400 | keyring = ">=15.1" 401 | pkginfo = ">=1.4.2" 402 | readme-renderer = ">=21.0" 403 | requests = ">=2.20" 404 | requests-toolbelt = ">=0.8.0,<0.9.0 || >0.9.0" 405 | rfc3986 = ">=1.4.0" 406 | tqdm = ">=4.14" 407 | 408 | [[package]] 409 | name = "urllib3" 410 | version = "1.22" 411 | description = "HTTP library with thread-safe connection pooling, file post, and more." 
412 | category = "dev" 413 | optional = false 414 | python-versions = "*" 415 | 416 | [package.extras] 417 | secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] 418 | socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] 419 | 420 | [[package]] 421 | name = "webencodings" 422 | version = "0.5.1" 423 | description = "Character encoding aliases for legacy web content" 424 | category = "dev" 425 | optional = false 426 | python-versions = "*" 427 | 428 | [[package]] 429 | name = "zipp" 430 | version = "1.2.0" 431 | description = "Backport of pathlib-compatible object wrapper for zip files" 432 | category = "dev" 433 | optional = false 434 | python-versions = ">=2.7" 435 | 436 | [package.extras] 437 | docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] 438 | testing = ["pathlib2", "unittest2", "jaraco.itertools", "func-timeout"] 439 | 440 | [metadata] 441 | lock-version = "1.1" 442 | python-versions = ">=3.7" 443 | content-hash = "e50473e3834448d00d11c65e82d12dd0f2165eaf0a40c97ecb0607a8fbc8b114" 444 | 445 | [metadata.files] 446 | atomicwrites = [ 447 | {file = "atomicwrites-1.3.0-py2.py3-none-any.whl", hash = "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4"}, 448 | {file = "atomicwrites-1.3.0.tar.gz", hash = "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"}, 449 | ] 450 | attrs = [ 451 | {file = "attrs-19.3.0-py2.py3-none-any.whl", hash = "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c"}, 452 | {file = "attrs-19.3.0.tar.gz", hash = "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"}, 453 | ] 454 | bleach = [ 455 | {file = "bleach-3.3.0-py2.py3-none-any.whl", hash = "sha256:6123ddc1052673e52bab52cdc955bcb57a015264a1c57d37bea2f6b817af0125"}, 456 | {file = "bleach-3.3.0.tar.gz", hash = "sha256:98b3170739e5e83dd9dc19633f074727ad848cbedb6026708c8ac2d3b697a433"}, 457 | ] 458 | certifi = [ 459 | {file = 
"certifi-2019.11.28-py2.py3-none-any.whl", hash = "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3"}, 460 | {file = "certifi-2019.11.28.tar.gz", hash = "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"}, 461 | ] 462 | cffi = [ 463 | {file = "cffi-1.14.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ebb253464a5d0482b191274f1c8bf00e33f7e0b9c66405fbffc61ed2c839c775"}, 464 | {file = "cffi-1.14.4-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:2c24d61263f511551f740d1a065eb0212db1dbbbbd241db758f5244281590c06"}, 465 | {file = "cffi-1.14.4-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9f7a31251289b2ab6d4012f6e83e58bc3b96bd151f5b5262467f4bb6b34a7c26"}, 466 | {file = "cffi-1.14.4-cp27-cp27m-win32.whl", hash = "sha256:5cf4be6c304ad0b6602f5c4e90e2f59b47653ac1ed9c662ed379fe48a8f26b0c"}, 467 | {file = "cffi-1.14.4-cp27-cp27m-win_amd64.whl", hash = "sha256:f60567825f791c6f8a592f3c6e3bd93dd2934e3f9dac189308426bd76b00ef3b"}, 468 | {file = "cffi-1.14.4-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:c6332685306b6417a91b1ff9fae889b3ba65c2292d64bd9245c093b1b284809d"}, 469 | {file = "cffi-1.14.4-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:d9efd8b7a3ef378dd61a1e77367f1924375befc2eba06168b6ebfa903a5e59ca"}, 470 | {file = "cffi-1.14.4-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:51a8b381b16ddd370178a65360ebe15fbc1c71cf6f584613a7ea08bfad946698"}, 471 | {file = "cffi-1.14.4-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:1d2c4994f515e5b485fd6d3a73d05526aa0fcf248eb135996b088d25dfa1865b"}, 472 | {file = "cffi-1.14.4-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:af5c59122a011049aad5dd87424b8e65a80e4a6477419c0c1015f73fb5ea0293"}, 473 | {file = "cffi-1.14.4-cp35-cp35m-win32.whl", hash = "sha256:594234691ac0e9b770aee9fcdb8fa02c22e43e5c619456efd0d6c2bf276f3eb2"}, 474 | {file = "cffi-1.14.4-cp35-cp35m-win_amd64.whl", hash = "sha256:64081b3f8f6f3c3de6191ec89d7dc6c86a8a43911f7ecb422c60e90c70be41c7"}, 475 | {file = 
"cffi-1.14.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f803eaa94c2fcda012c047e62bc7a51b0bdabda1cad7a92a522694ea2d76e49f"}, 476 | {file = "cffi-1.14.4-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:105abaf8a6075dc96c1fe5ae7aae073f4696f2905fde6aeada4c9d2926752362"}, 477 | {file = "cffi-1.14.4-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:0638c3ae1a0edfb77c6765d487fee624d2b1ee1bdfeffc1f0b58c64d149e7eec"}, 478 | {file = "cffi-1.14.4-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:7c6b1dece89874d9541fc974917b631406233ea0440d0bdfbb8e03bf39a49b3b"}, 479 | {file = "cffi-1.14.4-cp36-cp36m-win32.whl", hash = "sha256:155136b51fd733fa94e1c2ea5211dcd4c8879869008fc811648f16541bf99668"}, 480 | {file = "cffi-1.14.4-cp36-cp36m-win_amd64.whl", hash = "sha256:6bc25fc545a6b3d57b5f8618e59fc13d3a3a68431e8ca5fd4c13241cd70d0009"}, 481 | {file = "cffi-1.14.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a7711edca4dcef1a75257b50a2fbfe92a65187c47dab5a0f1b9b332c5919a3fb"}, 482 | {file = "cffi-1.14.4-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:00e28066507bfc3fe865a31f325c8391a1ac2916219340f87dfad602c3e48e5d"}, 483 | {file = "cffi-1.14.4-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:798caa2a2384b1cbe8a2a139d80734c9db54f9cc155c99d7cc92441a23871c03"}, 484 | {file = "cffi-1.14.4-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:a5ed8c05548b54b998b9498753fb9cadbfd92ee88e884641377d8a8b291bcc01"}, 485 | {file = "cffi-1.14.4-cp37-cp37m-win32.whl", hash = "sha256:00a1ba5e2e95684448de9b89888ccd02c98d512064b4cb987d48f4b40aa0421e"}, 486 | {file = "cffi-1.14.4-cp37-cp37m-win_amd64.whl", hash = "sha256:9cc46bc107224ff5b6d04369e7c595acb700c3613ad7bcf2e2012f62ece80c35"}, 487 | {file = "cffi-1.14.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:df5169c4396adc04f9b0a05f13c074df878b6052430e03f50e68adf3a57aa28d"}, 488 | {file = "cffi-1.14.4-cp38-cp38-manylinux1_i686.whl", hash = "sha256:9ffb888f19d54a4d4dfd4b3f29bc2c16aa4972f1c2ab9c4ab09b8ab8685b9c2b"}, 489 | {file = 
"cffi-1.14.4-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8d6603078baf4e11edc4168a514c5ce5b3ba6e3e9c374298cb88437957960a53"}, 490 | {file = "cffi-1.14.4-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:d5ff0621c88ce83a28a10d2ce719b2ee85635e85c515f12bac99a95306da4b2e"}, 491 | {file = "cffi-1.14.4-cp38-cp38-win32.whl", hash = "sha256:b4e248d1087abf9f4c10f3c398896c87ce82a9856494a7155823eb45a892395d"}, 492 | {file = "cffi-1.14.4-cp38-cp38-win_amd64.whl", hash = "sha256:ec80dc47f54e6e9a78181ce05feb71a0353854cc26999db963695f950b5fb375"}, 493 | {file = "cffi-1.14.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:840793c68105fe031f34d6a086eaea153a0cd5c491cde82a74b420edd0a2b909"}, 494 | {file = "cffi-1.14.4-cp39-cp39-manylinux1_i686.whl", hash = "sha256:b18e0a9ef57d2b41f5c68beefa32317d286c3d6ac0484efd10d6e07491bb95dd"}, 495 | {file = "cffi-1.14.4-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:045d792900a75e8b1e1b0ab6787dd733a8190ffcf80e8c8ceb2fb10a29ff238a"}, 496 | {file = "cffi-1.14.4-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:7ef7d4ced6b325e92eb4d3502946c78c5367bc416398d387b39591532536734e"}, 497 | {file = "cffi-1.14.4-cp39-cp39-win32.whl", hash = "sha256:ba4e9e0ae13fc41c6b23299545e5ef73055213e466bd107953e4a013a5ddd7e3"}, 498 | {file = "cffi-1.14.4-cp39-cp39-win_amd64.whl", hash = "sha256:f032b34669220030f905152045dfa27741ce1a6db3324a5bc0b96b6c7420c87b"}, 499 | {file = "cffi-1.14.4.tar.gz", hash = "sha256:1a465cbe98a7fd391d47dce4b8f7e5b921e6cd805ef421d04f5f66ba8f06086c"}, 500 | ] 501 | chardet = [ 502 | {file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"}, 503 | {file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"}, 504 | ] 505 | colorama = [ 506 | {file = "colorama-0.4.3-py2.py3-none-any.whl", hash = "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff"}, 507 | {file = "colorama-0.4.3.tar.gz", 
hash = "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1"}, 508 | ] 509 | cryptography = [ 510 | {file = "cryptography-3.3.2-cp27-cp27m-macosx_10_10_x86_64.whl", hash = "sha256:541dd758ad49b45920dda3b5b48c968f8b2533d8981bcdb43002798d8f7a89ed"}, 511 | {file = "cryptography-3.3.2-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:49570438e60f19243e7e0d504527dd5fe9b4b967b5a1ff21cc12b57602dd85d3"}, 512 | {file = "cryptography-3.3.2-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:a9a4ac9648d39ce71c2f63fe7dc6db144b9fa567ddfc48b9fde1b54483d26042"}, 513 | {file = "cryptography-3.3.2-cp27-cp27m-win32.whl", hash = "sha256:aa4969f24d536ae2268c902b2c3d62ab464b5a66bcb247630d208a79a8098e9b"}, 514 | {file = "cryptography-3.3.2-cp27-cp27m-win_amd64.whl", hash = "sha256:1bd0ccb0a1ed775cd7e2144fe46df9dc03eefd722bbcf587b3e0616ea4a81eff"}, 515 | {file = "cryptography-3.3.2-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:e18e6ab84dfb0ab997faf8cca25a86ff15dfea4027b986322026cc99e0a892da"}, 516 | {file = "cryptography-3.3.2-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:c7390f9b2119b2b43160abb34f63277a638504ef8df99f11cb52c1fda66a2e6f"}, 517 | {file = "cryptography-3.3.2-cp36-abi3-macosx_10_10_x86_64.whl", hash = "sha256:0d7b69674b738068fa6ffade5c962ecd14969690585aaca0a1b1fc9058938a72"}, 518 | {file = "cryptography-3.3.2-cp36-abi3-manylinux1_x86_64.whl", hash = "sha256:922f9602d67c15ade470c11d616f2b2364950602e370c76f0c94c94ae672742e"}, 519 | {file = "cryptography-3.3.2-cp36-abi3-manylinux2010_x86_64.whl", hash = "sha256:a0f0b96c572fc9f25c3f4ddbf4688b9b38c69836713fb255f4a2715d93cbaf44"}, 520 | {file = "cryptography-3.3.2-cp36-abi3-manylinux2014_aarch64.whl", hash = "sha256:a777c096a49d80f9d2979695b835b0f9c9edab73b59e4ceb51f19724dda887ed"}, 521 | {file = "cryptography-3.3.2-cp36-abi3-win32.whl", hash = "sha256:3c284fc1e504e88e51c428db9c9274f2da9f73fdf5d7e13a36b8ecb039af6e6c"}, 522 | {file = "cryptography-3.3.2-cp36-abi3-win_amd64.whl", hash = 
"sha256:7951a966613c4211b6612b0352f5bf29989955ee592c4a885d8c7d0f830d0433"}, 523 | {file = "cryptography-3.3.2.tar.gz", hash = "sha256:5a60d3780149e13b7a6ff7ad6526b38846354d11a15e21068e57073e29e19bed"}, 524 | ] 525 | docutils = [ 526 | {file = "docutils-0.16-py2.py3-none-any.whl", hash = "sha256:0c5b78adfbf7762415433f5515cd5c9e762339e23369dbe8000d84a4bf4ab3af"}, 527 | {file = "docutils-0.16.tar.gz", hash = "sha256:c2de3a60e9e7d07be26b7f2b00ca0309c207e06c100f9cc2a94931fc75a478fc"}, 528 | ] 529 | hypothesis = [ 530 | {file = "hypothesis-6.3.0-py3-none-any.whl", hash = "sha256:9340fd2d183b3f092b47f5a593d88ca6ed7bf3b635ff08c0dd404645d52ff5a8"}, 531 | {file = "hypothesis-6.3.0.tar.gz", hash = "sha256:3458e095724535179a1f8362f3498fd59e16bd1d22ea6d1ab190ca3103567f70"}, 532 | ] 533 | idna = [ 534 | {file = "idna-2.9-py2.py3-none-any.whl", hash = "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa"}, 535 | {file = "idna-2.9.tar.gz", hash = "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb"}, 536 | ] 537 | importlib-metadata = [ 538 | {file = "importlib_metadata-1.5.0-py2.py3-none-any.whl", hash = "sha256:b97607a1a18a5100839aec1dc26a1ea17ee0d93b20b0f008d80a5a050afb200b"}, 539 | {file = "importlib_metadata-1.5.0.tar.gz", hash = "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302"}, 540 | ] 541 | iniconfig = [ 542 | {file = "iniconfig-1.0.0.tar.gz", hash = "sha256:aa0b40f50a00e72323cb5d41302f9c6165728fd764ac8822aa3fff00a40d56b4"}, 543 | ] 544 | jeepney = [ 545 | {file = "jeepney-0.6.0-py3-none-any.whl", hash = "sha256:aec56c0eb1691a841795111e184e13cad504f7703b9a64f63020816afa79a8ae"}, 546 | {file = "jeepney-0.6.0.tar.gz", hash = "sha256:7d59b6622675ca9e993a6bd38de845051d315f8b0c72cca3aef733a20b648657"}, 547 | ] 548 | keyring = [ 549 | {file = "keyring-22.0.1-py3-none-any.whl", hash = "sha256:9f44660a5d4931bdc14c08a1d01ef30b18a7a8147380710d8c9f9531e1f6c3c0"}, 550 | {file = "keyring-22.0.1.tar.gz", hash = 
"sha256:9acb3e1452edbb7544822b12fd25459078769e560fa51f418b6d00afaa6178df"}, 551 | ] 552 | packaging = [ 553 | {file = "packaging-20.3-py2.py3-none-any.whl", hash = "sha256:82f77b9bee21c1bafbf35a84905d604d5d1223801d639cf3ed140bd651c08752"}, 554 | {file = "packaging-20.3.tar.gz", hash = "sha256:3c292b474fda1671ec57d46d739d072bfd495a4f51ad01a055121d81e952b7a3"}, 555 | ] 556 | pkginfo = [ 557 | {file = "pkginfo-1.5.0.1-py2.py3-none-any.whl", hash = "sha256:a6d9e40ca61ad3ebd0b72fbadd4fba16e4c0e4df0428c041e01e06eb6ee71f32"}, 558 | {file = "pkginfo-1.5.0.1.tar.gz", hash = "sha256:7424f2c8511c186cd5424bbf31045b77435b37a8d604990b79d4e70d741148bb"}, 559 | ] 560 | pluggy = [ 561 | {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, 562 | {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, 563 | ] 564 | py = [ 565 | {file = "py-1.9.0-py2.py3-none-any.whl", hash = "sha256:366389d1db726cd2fcfc79732e75410e5fe4d31db13692115529d34069a043c2"}, 566 | {file = "py-1.9.0.tar.gz", hash = "sha256:9ca6883ce56b4e8da7e79ac18787889fa5206c79dcc67fb065376cd2fe03f342"}, 567 | ] 568 | pycparser = [ 569 | {file = "pycparser-2.20-py2.py3-none-any.whl", hash = "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"}, 570 | {file = "pycparser-2.20.tar.gz", hash = "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0"}, 571 | ] 572 | pygments = [ 573 | {file = "Pygments-2.6.1-py3-none-any.whl", hash = "sha256:ff7a40b4860b727ab48fad6360eb351cc1b33cbf9b15a0f689ca5353e9463324"}, 574 | {file = "Pygments-2.6.1.tar.gz", hash = "sha256:647344a061c249a3b74e230c739f434d7ea4d8b1d5f3721bc0f3558049b38f44"}, 575 | ] 576 | pyparsing = [ 577 | {file = "pyparsing-2.4.6-py2.py3-none-any.whl", hash = "sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec"}, 578 | {file = "pyparsing-2.4.6.tar.gz", hash = 
"sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f"}, 579 | ] 580 | pytest = [ 581 | {file = "pytest-6.2.2-py3-none-any.whl", hash = "sha256:b574b57423e818210672e07ca1fa90aaf194a4f63f3ab909a2c67ebb22913839"}, 582 | {file = "pytest-6.2.2.tar.gz", hash = "sha256:9d1edf9e7d0b84d72ea3dbcdfd22b35fb543a5e8f2a60092dd578936bf63d7f9"}, 583 | ] 584 | pywin32-ctypes = [ 585 | {file = "pywin32-ctypes-0.2.0.tar.gz", hash = "sha256:24ffc3b341d457d48e8922352130cf2644024a4ff09762a2261fd34c36ee5942"}, 586 | {file = "pywin32_ctypes-0.2.0-py2.py3-none-any.whl", hash = "sha256:9dc2d991b3479cc2df15930958b674a48a227d5361d413827a4cfd0b5876fc98"}, 587 | ] 588 | readme-renderer = [ 589 | {file = "readme_renderer-24.0-py2.py3-none-any.whl", hash = "sha256:c8532b79afc0375a85f10433eca157d6b50f7d6990f337fa498c96cd4bfc203d"}, 590 | {file = "readme_renderer-24.0.tar.gz", hash = "sha256:bb16f55b259f27f75f640acf5e00cf897845a8b3e4731b5c1a436e4b8529202f"}, 591 | ] 592 | requests = [ 593 | {file = "requests-2.23.0-py2.7.egg", hash = "sha256:5d2d0ffbb515f39417009a46c14256291061ac01ba8f875b90cad137de83beb4"}, 594 | {file = "requests-2.23.0-py2.py3-none-any.whl", hash = "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee"}, 595 | {file = "requests-2.23.0.tar.gz", hash = "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"}, 596 | ] 597 | requests-toolbelt = [ 598 | {file = "requests-toolbelt-0.9.1.tar.gz", hash = "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0"}, 599 | {file = "requests_toolbelt-0.9.1-py2.py3-none-any.whl", hash = "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f"}, 600 | ] 601 | rfc3986 = [ 602 | {file = "rfc3986-1.4.0-py2.py3-none-any.whl", hash = "sha256:af9147e9aceda37c91a05f4deb128d4b4b49d6b199775fd2d2927768abdc8f50"}, 603 | {file = "rfc3986-1.4.0.tar.gz", hash = "sha256:112398da31a3344dc25dbf477d8df6cb34f9278a94fee2625d89e4514be8bb9d"}, 604 | ] 605 | secretstorage = [ 
606 | {file = "SecretStorage-3.3.0-py3-none-any.whl", hash = "sha256:5c36f6537a523ec5f969ef9fad61c98eb9e017bc601d811e53aa25bece64892f"}, 607 | {file = "SecretStorage-3.3.0.tar.gz", hash = "sha256:30cfdef28829dad64d6ea1ed08f8eff6aa115a77068926bcc9f5225d5a3246aa"}, 608 | ] 609 | six = [ 610 | {file = "six-1.14.0-py2.py3-none-any.whl", hash = "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c"}, 611 | {file = "six-1.14.0.tar.gz", hash = "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a"}, 612 | ] 613 | sortedcontainers = [ 614 | {file = "sortedcontainers-2.1.0-py2.py3-none-any.whl", hash = "sha256:d9e96492dd51fae31e60837736b38fe42a187b5404c16606ff7ee7cd582d4c60"}, 615 | {file = "sortedcontainers-2.1.0.tar.gz", hash = "sha256:974e9a32f56b17c1bac2aebd9dcf197f3eb9cd30553c5852a3187ad162e1a03a"}, 616 | ] 617 | toml = [ 618 | {file = "toml-0.10.1-py2.py3-none-any.whl", hash = "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88"}, 619 | {file = "toml-0.10.1.tar.gz", hash = "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f"}, 620 | ] 621 | tqdm = [ 622 | {file = "tqdm-4.43.0-py2.py3-none-any.whl", hash = "sha256:0d8b5afb66e23d80433102e9bd8b5c8b65d34c2a2255b2de58d97bd2ea8170fd"}, 623 | {file = "tqdm-4.43.0.tar.gz", hash = "sha256:f35fb121bafa030bd94e74fcfd44f3c2830039a2ddef7fc87ef1c2d205237b24"}, 624 | ] 625 | twine = [ 626 | {file = "twine-3.3.0-py3-none-any.whl", hash = "sha256:2f6942ec2a17417e19d2dd372fc4faa424c87ee9ce49b4e20c427eb00a0f3f41"}, 627 | {file = "twine-3.3.0.tar.gz", hash = "sha256:fcffa8fc37e8083a5be0728371f299598870ee1eccc94e9a25cef7b1dcfa8297"}, 628 | ] 629 | urllib3 = [ 630 | {file = "urllib3-1.22-py2.py3-none-any.whl", hash = "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b"}, 631 | {file = "urllib3-1.22.tar.gz", hash = "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"}, 632 | ] 633 | webencodings = [ 634 | {file = 
"webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, 635 | {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, 636 | ] 637 | zipp = [ 638 | {file = "zipp-1.2.0-py2.py3-none-any.whl", hash = "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921"}, 639 | {file = "zipp-1.2.0.tar.gz", hash = "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1"}, 640 | ] 641 | -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ "maturin",] 3 | build-backend = "maturin" 4 | 5 | [tool.versionup] 6 | tag = true 7 | commit = true 8 | files = [ "src/lib.rs", "Cargo.toml",] 9 | tag_prefix = "python/" 10 | 11 | [tool.poetry] 12 | name = "pytokenizations" 13 | version = "0.8.2" 14 | description = "" 15 | authors = [ "Yohei Tamura ",] 16 | [[tool.poetry.packages]] 17 | include = "tokenizations" 18 | 19 | [tool.poetry.dependencies] 20 | python = ">=3.7" 21 | 22 | [tool.poetry.dev-dependencies] 23 | pytest = "^6.2.2" 24 | hypothesis = "^6.3.0" 25 | twine = "^3.3.0" 26 | -------------------------------------------------------------------------------- /python/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | exclude = .git, __pycache__, build, scripts, .venv, .tox, .hypothesis, .nox, outputs 4 | doctests = False 5 | ignore = E203,W503,E501 6 | -------------------------------------------------------------------------------- /python/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::deprecated)] 2 | use pyo3::prelude::*; 3 | use tokenizations::{get_alignments, get_charmap, Alignment, CharMap}; 4 | 5 | 
#[pymodule] 6 | fn tokenizations(_py: Python, m: &PyModule) -> PyResult<()> { 7 | m.add("__version__", "0.8.2")?; 8 | 9 | #[pyfn(m, "get_alignments")] 10 | pub fn get_alignments_py( 11 | _py: Python, 12 | a: Vec<&str>, 13 | b: Vec<&str>, 14 | ) -> PyResult<(Alignment, Alignment)> { 15 | Ok(get_alignments(&a, &b)) 16 | } 17 | 18 | #[pyfn(m, "get_charmap")] 19 | pub fn get_charmap_py(_py: Python, a: &str, b: &str) -> PyResult<(CharMap, CharMap)> { 20 | Ok(get_charmap(a, b)) 21 | } 22 | 23 | Ok(()) 24 | } 25 | -------------------------------------------------------------------------------- /python/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tamuhey/tokenizations/bcb27e901c2a9413b45dfd030c6ff8764d753188/python/tests/__init__.py -------------------------------------------------------------------------------- /python/tests/test_main.py: -------------------------------------------------------------------------------- 1 | from tokenizations import get_original_spans 2 | import pytest 3 | import tokenizations 4 | from hypothesis import given 5 | from hypothesis import strategies as st 6 | 7 | 8 | @given(st.lists(st.text()), st.lists(st.text())) 9 | def test_random(a, b): 10 | tokenizations.get_alignments(a, b) 11 | 12 | 13 | @given(st.lists(st.text())) 14 | def test_equality(a): 15 | a2b, b2a = tokenizations.get_alignments(a, a) 16 | assert a2b == b2a 17 | assert a2b == [[i] if len(aa) else [] for i, aa in enumerate(a)] 18 | 19 | 20 | @pytest.mark.parametrize( 21 | "input_,expected", 22 | [ 23 | ((["fo", "o"], ["foo"]), ([[0], [0]], [[0, 1]])), 24 | ((["fø", "o"], ["foo"]), ([[0], [0]], [[0, 1]])), 25 | ((["New", "York"], ["New York"]), ([[0], [0]], [[0, 1]])), 26 | ( 27 | (["今日は", "\t", "いい", "天気だ", "。"], ["今日", "は", "いい", "天気", "た", "。"]), 28 | ([[0, 1], [], [2], [3, 4], [5]], [[0], [0], [2], [3], [3], [4]]), 29 | ), 30 | ], 31 | ) 32 | def test_get_alignments(input_, expected): 33 | 
output = tokenizations.get_alignments(*input_) 34 | assert output == expected 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "input_,expected", [(("foo", "fo0"), ([[0], [1], []], [[0], [1], []]))] 39 | ) 40 | def test_get_charmap(input_, expected): 41 | assert tokenizations.get_charmap(*input_) == expected 42 | 43 | 44 | @given(st.text(), st.text()) 45 | def test_random_charmap(a, b): 46 | tokenizations.get_charmap(a, b) 47 | 48 | 49 | @given(st.text()) 50 | def test_equality_charmap(a): 51 | a2b, b2a = tokenizations.get_charmap(a, a) 52 | assert a2b == b2a 53 | assert a2b == [[x] for x in range(len(a))] 54 | 55 | 56 | VERSION_DEPRECATE_WARN_GET_ORIGINAL_SPANS = "0.7" 57 | VERSION_DEPRECATE_ERR_GET_ORIGINAL_SPANS = "0.8" 58 | 59 | 60 | @pytest.mark.skipif( 61 | not ( 62 | VERSION_DEPRECATE_WARN_GET_ORIGINAL_SPANS 63 | <= tokenizations.__version__ 64 | < VERSION_DEPRECATE_ERR_GET_ORIGINAL_SPANS 65 | ), 66 | reason="deprecation check", 67 | ) 68 | def test_warn_get_original_spans(): 69 | with pytest.warns(DeprecationWarning): 70 | get_original_spans([], "") 71 | 72 | 73 | @pytest.mark.skipif( 74 | tokenizations.__version__ < VERSION_DEPRECATE_ERR_GET_ORIGINAL_SPANS, 75 | reason="deprecation error check", 76 | ) 77 | def test_error_get_original_spans(): 78 | with pytest.raises(ValueError): 79 | get_original_spans([], "") 80 | -------------------------------------------------------------------------------- /python/tokenizations/__init__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from .tokenizations import ( 3 | get_alignments, 4 | get_charmap, 5 | __version__, 6 | ) 7 | 8 | 9 | def get_original_spans(tokens, original_text): 10 | raise ValueError( 11 | f"{get_original_spans.__name__} was deprecated. Please use `textspan.get_original_spans` instead." 
12 | ) 13 | 14 | 15 | __all__ = ["get_charmap", "get_alignments", "get_original_spans", "__version__"] 16 | -------------------------------------------------------------------------------- /python/tokenizations/__init__.pyi: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional, Sequence, Tuple 3 | 4 | def get_alignments( 5 | a: Sequence[str], b: Sequence[str] 6 | ) -> Tuple[list[list[int]], list[list[int]]]: ... 7 | def get_charmap(a: str, b: str) -> Tuple[list[list[int]], list[list[int]]]: ... 8 | def get_original_spans( 9 | tokens: Sequence[str], original_text: str 10 | ) -> list[Optional[Tuple[int, int]]]: ... 11 | 12 | -------------------------------------------------------------------------------- /python/tokenizations/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tamuhey/tokenizations/bcb27e901c2a9413b45dfd030c6ff8764d753188/python/tokenizations/py.typed -------------------------------------------------------------------------------- /python/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | isolated_build = true 3 | envlist = py37, py38, py39 4 | 5 | [testenv] 6 | whitelist_externals = 7 | poetry 8 | maturin 9 | commands = 10 | poetry install -v 11 | maturin develop 12 | poetry run pytest tests 13 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![deny(warnings)] 2 | //! Tokenizations alignment functions. 
3 | #[cfg(test)] 4 | mod tests; 5 | #[cfg(test)] 6 | extern crate quickcheck; 7 | #[cfg(test)] 8 | extern crate quickcheck_macros; 9 | extern crate seqdiff; 10 | extern crate unicode_normalization; 11 | use seqdiff::Diff; 12 | use unicode_normalization::UnicodeNormalization; 13 | 14 | pub type Alignment = Vec>; 15 | pub type CharMap = Vec>; 16 | 17 | fn normalize(text: &str) -> String { 18 | text.to_lowercase().nfkd().collect() 19 | } 20 | 21 | fn get_char2token>(tokens: &[T]) -> Vec { 22 | let token_lengths = tokens 23 | .iter() 24 | .map(|s| s.as_ref().chars().count()) 25 | .collect::>(); 26 | let mut ret = vec![0; token_lengths.iter().sum()]; 27 | let mut cur = 0; 28 | for (i, &l) in token_lengths.iter().enumerate() { 29 | for _ in 0..l { 30 | ret[cur] = i; 31 | cur += 1; 32 | } 33 | } 34 | ret 35 | } 36 | 37 | // Returns tokenization alignment from ta to tb. 38 | fn get_alignment( 39 | num_tokens: usize, 40 | a2b: &[Option], 41 | ac2t: &[usize], 42 | bc2t: &[usize], 43 | ) -> Vec> { 44 | let mut at2bt = vec![vec![]; num_tokens]; 45 | for (ti, a2bi) in ac2t.iter().zip(a2b) { 46 | if let Some(i) = a2bi { 47 | if let Some(j) = at2bt[*ti].last() { 48 | if *j == bc2t[*i] { 49 | continue; 50 | } 51 | } 52 | at2bt[*ti].push(bc2t[*i]) 53 | } 54 | } 55 | at2bt 56 | } 57 | 58 | /// Returns the tokenizations alignments `a2b` (from `a` to `b`) and `b2a` (from `b` to `a`) based on the shortest edit script (SES). 
59 | /// 60 | /// # Examples 61 | /// 62 | /// ``` 63 | /// use tokenizations::get_alignments; 64 | /// 65 | /// let a = vec!["New York"]; 66 | /// let b = vec!["New", "York"]; 67 | /// // calculate the two alignments `a2b` and `b2a` at the same time 68 | /// let (a2b, b2a) = get_alignments(&a, &b); 69 | /// 70 | /// // `a2b[i]` is a set that holds indices `j`s of `b` such that `a[i]` corresponds to `b[j]` 71 | /// assert_eq!(a2b, vec![[0, 1]]); 72 | /// // `b2a` is the inverse of `a2b` 73 | /// assert_eq!(b2a, vec![[0], [0]]); 74 | /// 75 | /// // `get_alignments` can be applied to noisy tokens. 76 | /// let a = vec!["à", "la", "gorge"]; 77 | /// let b = vec!["a", "la", "gorge"]; // dropped accent 78 | /// let (a2b, b2a) = get_alignments(&a, &b); 79 | /// assert_eq!(a2b, vec![[0], [1], [2]]); 80 | /// assert_eq!(a2b, vec![[0], [1], [2]]); 81 | /// ``` 82 | pub fn get_alignments>(a: &[S], b: &[S]) -> (Alignment, Alignment) { 83 | let a: Vec = a.iter().map(|x| normalize(x.as_ref())).collect(); 84 | let b: Vec = b.iter().map(|x| normalize(x.as_ref())).collect(); 85 | let ac2t = get_char2token(&a); 86 | let bc2t = get_char2token(&b); 87 | let (a2b, b2a) = seqdiff::diff( 88 | &a.join("").chars().collect::>(), 89 | &b.join("").chars().collect::>(), 90 | ); 91 | let at2bt = get_alignment(a.len(), &a2b, &ac2t, &bc2t); 92 | let bt2at = get_alignment(b.len(), &b2a, &bc2t, &ac2t); 93 | (at2bt, bt2at) 94 | } 95 | 96 | /// Returns the character mappings `c_a2b` (from `a` to `b`) and `c_b2a` (from `b` to `a`) based on the shortest edit script (SES). 97 | /// 98 | /// `a` and `b` can be noisy. For example, `bar` and `bår` can be properly compared. 
99 | /// 100 | /// # Examples 101 | /// 102 | /// Basic usage: 103 | /// 104 | /// ``` 105 | /// use tokenizations::get_charmap; 106 | /// let a = "bar"; 107 | /// let b = "bår"; 108 | /// let (c_a2b, c_b2a) = get_charmap(a, b); 109 | /// assert_eq!(c_a2b, vec![vec![0], vec![1], vec![2]]); 110 | /// assert_eq!(c_b2a, vec![vec![0], vec![1], vec![2]]); 111 | /// ``` 112 | pub fn get_charmap(a: &str, b: &str) -> (CharMap, CharMap) { 113 | let at: Vec = a.chars().map(|x| x.to_string()).collect(); 114 | let bt: Vec = b.chars().map(|x| x.to_string()).collect(); 115 | get_alignments(&at, &bt) 116 | } 117 | 118 | // Deprecated functions: 119 | 120 | fn _get_charmap(a: &str, b: &str) -> (Diff, Diff) { 121 | let at: Vec = a.chars().map(|x| x.to_string()).collect(); 122 | let bt: Vec = b.chars().map(|x| x.to_string()).collect(); 123 | let (a2b, b2a) = get_alignments(&at, &bt); 124 | let c_a2b: Diff = a2b.into_iter().map(|x| x.into_iter().next()).collect(); 125 | let c_b2a: Diff = b2a.into_iter().map(|x| x.into_iter().next()).collect(); 126 | (c_a2b, c_b2a) 127 | } 128 | 129 | fn get_span_indices>(tokens: &[S]) -> Vec<(usize, usize)> { 130 | tokens 131 | .iter() 132 | .scan(0, |state, token| { 133 | let l = *state; 134 | let r = l + token.as_ref().chars().count(); 135 | *state = r; 136 | Some((l, r)) 137 | }) 138 | .collect() 139 | } 140 | 141 | fn join>(tokens: &[S]) -> String { 142 | let mut text = "".to_owned(); 143 | for token in tokens.iter() { 144 | text.push_str(token.as_ref()); 145 | } 146 | text 147 | } 148 | 149 | #[deprecated(since = "0.5.0", note = "please use `textspan::align_spans` instead")] 150 | pub fn get_original_spans>( 151 | tokens: &[S], 152 | original_text: &str, 153 | ) -> Vec> { 154 | let spans = get_span_indices(tokens); 155 | let text = join(tokens); 156 | let (a2b, b2a) = _get_charmap(&text, original_text); 157 | 158 | let mut ret = vec![]; 159 | for (l, r) in spans { 160 | // get the leftmost corresponding char 161 | let mut origl = None; 162 | for 
&x in a2b[l..r].iter() { 163 | if x != None { 164 | origl = x; 165 | break; 166 | } 167 | } 168 | // get the rightmost corresponding char 169 | let mut origr = None; 170 | for x in a2b[l..r].iter().rev() { 171 | if let Some(j) = x { 172 | origr = Some(j + 1); 173 | break; 174 | } 175 | } 176 | // edge case: a token with empty string 177 | if l == r { 178 | if l >= a2b.len() { 179 | origl = Some(b2a.len()); 180 | } else { 181 | origl = a2b[l]; 182 | } 183 | origr = origl; 184 | } 185 | ret.push(match (origl, origr) { 186 | (Some(l), Some(r)) => Some((l, r)), 187 | (None, None) => None, 188 | _ => unreachable!( 189 | "Internal error occured in get_original_span\ntokens: {:?}\noriginal_text: {:?}", 190 | tokens.iter().map(|x| x.as_ref()).collect::>(), 191 | original_text 192 | ), 193 | }) 194 | } 195 | ret 196 | } 197 | -------------------------------------------------------------------------------- /src/tests.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | #[test] 4 | fn test_get_alignment() { 5 | let testcases = vec![ 6 | ( 7 | (vec!["fあo①が", "bar"], vec!["fあo1かb", "ar"]), 8 | (vec![vec![0], vec![0, 1]], vec![vec![0, 1], vec![1]]), 9 | ), 10 | ( 11 | (vec!["New York"], vec!["New", "York"]), 12 | (vec![vec![0, 1]], vec![vec![0], vec![0]]), 13 | ), 14 | ( 15 | (vec!["A'B"], vec!["A", "B"]), 16 | (vec![vec![0, 1]], vec![vec![0], vec![0]]), 17 | ), 18 | ( 19 | (vec!["A'b"], vec!["a", "b"]), 20 | (vec![vec![0, 1]], vec![vec![0], vec![0]]), 21 | ), 22 | ( 23 | (vec![""], vec!["", ""]), 24 | (vec![vec![]], vec![vec![], vec![]]), 25 | ), 26 | ( 27 | (vec!["à", "la", "gorge"], vec!["a", "la", "gorge"]), 28 | ( 29 | vec![vec![0], vec![1], vec![2]], 30 | vec![vec![0], vec![1], vec![2]], 31 | ), 32 | ), 33 | ]; 34 | for (input, expected) in testcases { 35 | assert_eq!(get_alignments(&input.0, &input.1), expected); 36 | } 37 | } 38 | 39 | #[test] 40 | fn test_get_char2token() { 41 | let testcases = vec![(vec!["a", 
"bc"], vec![0, 1, 1])]; 42 | for (input, expected) in testcases.into_iter() { 43 | assert_eq!(get_char2token(&input), expected); 44 | } 45 | } 46 | #[test] 47 | fn test_get_charmap() { 48 | let testcases = vec![ 49 | ("å", "å", vec![vec![0, 1]], vec![vec![0], vec![0]]), 50 | ( 51 | "あがさ", 52 | "あかさ", 53 | vec![vec![0], vec![1], vec![2]], 54 | vec![vec![0], vec![1], vec![2]], 55 | ), 56 | ("", "a", vec![], vec![vec![]]), 57 | ("", "", vec![], vec![]), 58 | ( 59 | "å\tb", 60 | "a b", 61 | vec![vec![0], vec![], vec![2]], 62 | vec![vec![0], vec![], vec![2]], 63 | ), 64 | ( 65 | "a\tb", 66 | "a b", 67 | vec![vec![0], vec![], vec![2]], 68 | vec![vec![0], vec![], vec![2]], 69 | ), 70 | ( 71 | "2000", 72 | "2000", 73 | vec![vec![0], vec![1], vec![2], vec![3]], 74 | vec![vec![0], vec![1], vec![2], vec![3]], 75 | ), 76 | ("¨", "", vec![vec![]], vec![]), 77 | ( 78 | "hello``world``", 79 | "Hello \"world\"", 80 | vec![ 81 | vec![0], 82 | vec![1], 83 | vec![2], 84 | vec![3], 85 | vec![4], 86 | vec![], 87 | vec![], 88 | vec![7], 89 | vec![8], 90 | vec![9], 91 | vec![10], 92 | vec![11], 93 | vec![], 94 | vec![], 95 | ], 96 | vec![ 97 | vec![0], 98 | vec![1], 99 | vec![2], 100 | vec![3], 101 | vec![4], 102 | vec![], 103 | vec![], 104 | vec![7], 105 | vec![8], 106 | vec![9], 107 | vec![10], 108 | vec![11], 109 | vec![], 110 | ], 111 | ), 112 | ]; 113 | for (a, b, e_a2b, e_b2a) in testcases { 114 | let (a2b, b2a) = get_charmap(a, b); 115 | assert_eq!(a2b.len(), a.chars().count(), "a2b {:?}", a2b); 116 | assert_eq!(b2a.len(), b.chars().count(), "b2a {:?}", b2a); 117 | assert_eq!( 118 | a2b, e_a2b, 119 | "check a2b 120 | a: {:?} 121 | b: {:?} 122 | ", 123 | a, b 124 | ); 125 | assert_eq!( 126 | b2a, e_b2a, 127 | "check b2a 128 | a: {:?} 129 | b: {:?} 130 | ", 131 | a, b 132 | ); 133 | } 134 | } 135 | --------------------------------------------------------------------------------