├── demo
├── .cargo-ok
├── www
│ ├── .gitignore
│ ├── .babelrc
│ ├── src
│ │ ├── index.html
│ │ └── index.tsx
│ ├── tsconfig.json
│ ├── .bin
│ │ └── create-wasm-app.js
│ ├── .eslintrc.js
│ ├── webpack.config.js
│ ├── LICENSE-MIT
│ ├── package.json
│ ├── README.md
│ ├── types
│ │ └── react-linto
│ │ │ └── index.d.ts
│ └── LICENSE-APACHE
├── .gitignore
├── tests
│ └── web.rs
├── src
│ ├── utils.rs
│ └── lib.rs
├── LICENSE_MIT
├── Cargo.toml
├── README.md
└── LICENSE_APACHE
├── .dockerignore
├── python
├── tests
│ ├── __init__.py
│ ├── test_bench.py
│ └── test_main.py
├── tokenizations
│ ├── py.typed
│ ├── __init__.pyi
│ └── __init__.py
├── pytest.ini
├── Makefile
├── setup.cfg
├── tox.ini
├── Cargo.toml
├── README.md
├── pyproject.toml
├── src
│ └── lib.rs
└── .gitignore
├── .gitignore
├── img
└── demo.png
├── .github
├── FUNDING.yml
└── workflows
│ ├── manylinux_build.yml
│ └── main.yml
├── dockerfiles
├── ci
│ ├── centos
│ │ └── Dockerfile
│ └── manylinux
│ │ └── Dockerfile
└── centos
│ └── Dockerfile
├── .cargo
└── config
├── Cargo.toml
├── CONTRIBUTING.md
├── LICENSE
├── benches
└── main.rs
├── README.md
├── note
├── algorithm.md
└── blog_post.md
└── src
├── tests.rs
└── lib.rs
/demo/.cargo-ok:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | /target/
2 |
--------------------------------------------------------------------------------
/python/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/tokenizations/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/demo/www/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | dist
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | **/*.rs.bk
3 | Cargo.lock
4 | .venv
5 | .vscode/
6 |
--------------------------------------------------------------------------------
/img/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/tokenizations/HEAD/img/demo.png
--------------------------------------------------------------------------------
/python/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = --benchmark-skip
3 | testpaths = tests
4 |
--------------------------------------------------------------------------------
/demo/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | **/*.rs.bk
3 | Cargo.lock
4 | bin/
5 | pkg/
6 | wasm-pack.log
7 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: tamuhey
4 |
--------------------------------------------------------------------------------
/dockerfiles/ci/centos/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM docker.pkg.github.com/tamuhey/tokenizations/centos7-python:0
2 |
--------------------------------------------------------------------------------
/demo/www/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 | "presets": [
3 | "@babel/preset-env",
4 | "@babel/preset-react"
5 | ]
6 | }
--------------------------------------------------------------------------------
/.cargo/config:
--------------------------------------------------------------------------------
1 | [target.x86_64-apple-darwin]
2 | rustflags = [
3 | "-C", "link-arg=-undefined",
4 | "-C", "link-arg=dynamic_lookup",
5 | ]
--------------------------------------------------------------------------------
/python/Makefile:
--------------------------------------------------------------------------------
1 | build:
2 | poetry run maturin build
3 | develop:
4 | poetry run maturin develop
5 | test: develop
6 | poetry run pytest tests
7 |
--------------------------------------------------------------------------------
/python/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | exclude = .git, __pycache__, build, scripts, .venv, .tox, .hypothesis, .nox, outputs
4 | doctests = False
5 | ignore = E203,W503,E501
6 |
--------------------------------------------------------------------------------
/python/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | isolated_build = true
3 | envlist = py37, py38, py39
4 |
5 | [testenv]
6 | whitelist_externals =
7 | poetry
8 | maturin
9 | commands =
10 | poetry install -v
11 | maturin develop
12 | poetry run pytest tests
13 |
--------------------------------------------------------------------------------
/demo/tests/web.rs:
--------------------------------------------------------------------------------
1 | //! Test suite for the Web and headless browsers.
2 |
3 | #![cfg(target_arch = "wasm32")]
4 |
5 | extern crate wasm_bindgen_test;
6 | use wasm_bindgen_test::*;
7 |
8 | wasm_bindgen_test_configure!(run_in_browser);
9 |
10 | #[wasm_bindgen_test]
11 | fn pass() {
12 | assert_eq!(1 + 1, 2);
13 | }
14 |
--------------------------------------------------------------------------------
/demo/www/src/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | tokenization
7 |
8 |
9 |
10 |
11 | This page contains webassembly and javascript content, please enable javascript in your browser.
12 |
13 |
14 |
--------------------------------------------------------------------------------
/demo/www/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "outDir": "./dist/",
4 | "sourceMap": true,
5 | "noImplicitAny": true,
6 | "module": "ESNext",
7 | "target": "es6",
8 | "jsx": "react",
9 | "esModuleInterop": true,
10 | "moduleResolution": "node",
11 | "typeRoots": [
12 | "types"
13 | ]
14 | }
15 | }
--------------------------------------------------------------------------------
/dockerfiles/ci/manylinux/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM quay.io/pypa/manylinux1_x86_64:2020-03-07-9c5ba95
2 | ENV PATH /root/.cargo/bin:/root/.local/bin:/opt/python/cp36-cp36m/bin:/opt/python/cp37-cp37m/bin:/opt/python/cp38-cp38/bin:/opt/rh/devtoolset-2/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/python/cp35-cp35m/bin:/opt/python/cp36-cp36m/bin:/opt/python/cp37-cp37m/bin:/opt/python/cp38-cp38/bin
--------------------------------------------------------------------------------
/python/tokenizations/__init__.pyi:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Optional, Sequence, Tuple
3 |
4 | def get_alignments(
5 | a: Sequence[str], b: Sequence[str]
6 | ) -> Tuple[list[list[int]], list[list[int]]]: ...
7 | def get_charmap(a: str, b: str) -> Tuple[list[list[int]], list[list[int]]]: ...
8 | def get_original_spans(
9 | tokens: Sequence[str], original_text: str
10 | ) -> list[Optional[Tuple[int, int]]]: ...
11 |
12 |
--------------------------------------------------------------------------------
/python/tokenizations/__init__.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from .tokenizations import (
3 | get_alignments,
4 | get_charmap,
5 | __version__,
6 | )
7 |
8 |
def get_original_spans(tokens, original_text):
    """Placeholder for the removed API that points callers to its replacement.

    Neither ``tokens`` nor ``original_text`` is used.

    Raises:
        ValueError: always, with a message naming
            ``textspan.get_original_spans`` as the replacement.
    """
    error = ValueError(
        f"{get_original_spans.__name__} was deprecated. Please use `textspan.get_original_spans` instead."
    )
    raise error
13 |
14 |
15 | __all__ = ["get_charmap", "get_alignments", "get_original_spans", "__version__"]
16 |
--------------------------------------------------------------------------------
/python/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "pytokenizations"
3 | version = "0.8.4"
4 | authors = ["Yohei Tamura "]
5 | edition = "2018"
6 |
7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
8 |
9 | [dependencies]
10 | tokenizations = "0.4.2"
11 |
12 | [lib]
13 | name = "tokenizations"
14 | crate-type = ["cdylib"]
15 |
16 | [dependencies.pyo3]
17 | version = "^0.16.5"
18 | features = ["extension-module"]
19 |
--------------------------------------------------------------------------------
/demo/src/utils.rs:
--------------------------------------------------------------------------------
/// Installs the `console_error_panic_hook` panic hook when the
/// `console_error_panic_hook` Cargo feature is enabled; otherwise this
/// function does nothing.
///
/// Call it at least once during initialization to get better error messages
/// if the code ever panics. For more details see
/// <https://github.com/rustwasm/console_error_panic_hook#readme>.
pub fn set_panic_hook() {
    #[cfg(feature = "console_error_panic_hook")]
    {
        console_error_panic_hook::set_once();
    }
}
11 |
--------------------------------------------------------------------------------
/python/tests/test_bench.py:
--------------------------------------------------------------------------------
1 | """Benchmark"""
2 | import tokenizations
3 | import pytest
4 |
5 |
6 | @pytest.mark.benchmark(warmup=True, group="short", disable_gc=True)
7 | def test_short(benchmark):
8 | args = ["今日は", "\t", "いい", "天気だ", "。"], ["今日", "は", "いい", "天気", "た", "。"]
9 | benchmark(tokenizations.get_alignments, *args)
10 |
11 |
12 | @pytest.mark.benchmark(warmup=True, group="long", disable_gc=True)
13 | def test_long(benchmark):
14 | a = list("abcde") * 1000
15 | b = list("abbde") * 1000
16 | benchmark(tokenizations.get_alignments, a, b)
17 |
--------------------------------------------------------------------------------
/python/README.md:
--------------------------------------------------------------------------------
1 | # pytokenizations
2 |
3 | ## Installation
4 |
5 | ```bash
6 | $ pip install pytokenizations
7 | ```
8 |
9 | ### Install from source
10 |
11 | This library uses [maturin](https://github.com/PyO3/maturin) to build.
12 |
13 | ```console
14 | $ git clone https://github.com/tamuhey/tokenizations
15 | $ cd tokenizations/python
16 | $ pip install maturin
17 | $ maturin build
18 | ```
19 |
20 | Now the wheel is built in the `python/target/wheels` directory. You can install it with `pip install *whl`.
21 |
22 | # Usage
23 |
24 | See the [README.md](https://github.com/tamuhey/tokenizations#usage-python)
25 |
--------------------------------------------------------------------------------
/python/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [ "maturin",]
3 | build-backend = "maturin"
4 |
5 | [tool.versionup]
6 | tag = true
7 | commit = true
8 | files = [ "src/lib.rs", "Cargo.toml",]
9 | tag_prefix = "python/"
10 |
11 | [tool.poetry]
12 | name = "pytokenizations"
13 | version = "0.8.4"
14 | description = ""
15 | authors = [ "Yohei Tamura ",]
16 | [[tool.poetry.packages]]
17 | include = "tokenizations"
18 |
19 | [tool.poetry.dependencies]
20 | python = ">=3.7"
21 |
22 | [tool.poetry.dev-dependencies]
23 | pytest = "^6.2.2"
24 | hypothesis = "^6.3.0"
25 | twine = "^3.3.0"
26 | pytest-benchmark = "^3.4.1"
27 |
--------------------------------------------------------------------------------
/demo/www/.bin/create-wasm-app.js:
--------------------------------------------------------------------------------
#!/usr/bin/env node

// Clones the rustwasm `create-wasm-app` template into the directory given as
// the first command-line argument (default: the current directory).

const { spawn } = require("child_process");
const fs = require("fs");

let folderName = '.';

// Create the target directory up front when one was requested and is missing.
if (process.argv.length >= 3) {
  folderName = process.argv[2];
  if (!fs.existsSync(folderName)) {
    fs.mkdirSync(folderName);
  }
}

const clone = spawn("git", ["clone", "https://github.com/rustwasm/create-wasm-app.git", folderName]);

// A failure to start the process (e.g. `git` is not installed) is reported
// through the "error" event; without a listener Node throws an unhandled error.
clone.on("error", err => {
  console.error("cloning the template failed!", err.message);
  process.exit(1);
});

clone.on("close", code => {
  if (code !== 0) {
    console.error("cloning the template failed!")
    // `code` is null when git was terminated by a signal; still exit non-zero.
    process.exit(code === null ? 1 : code);
  } else {
    console.log("🦀 Rust + 🕸 Wasm = ❤");
  }
});
25 |
--------------------------------------------------------------------------------
/dockerfiles/centos/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM centos:7
2 | ENV HOME /root
3 | ENV PATH $HOME/.pyenv/bin:$HOME/.pyenv/shims:$HOME/.cargo/bin:$HOME/.local/bin:$PATH
4 | RUN yum update -y && yum install -y git gcc make zlib-devel && \
5 | curl https://pyenv.run | bash
6 | # pyenv prerequisites. See https://github.com/pyenv/pyenv/wiki/common-build-problems
7 | RUN yum install @development zlib-devel bzip2 bzip2-devel readline-devel sqlite sqlite-devel openssl-devel xz xz-devel libffi-devel findutils -y
8 | RUN pyenv install 3.8.2 && \
9 | pyenv install 3.7.6 && \
10 | pyenv install 3.6.10 && \
11 | pyenv install 3.5.4 && \
12 | pyenv global 3.8.2 3.7.6 3.6.10 3.5.4
13 |
--------------------------------------------------------------------------------
/python/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![allow(clippy::deprecated)]
2 | use pyo3::prelude::*;
3 | use tokenizations::{get_alignments, get_charmap, Alignment, CharMap};
4 |
5 | #[pymodule]
6 | fn tokenizations(_py: Python, m: &PyModule) -> PyResult<()> {
7 | m.add("__version__", "0.8.4")?;
8 |
9 | #[pyfn(m, "get_alignments")]
10 | pub fn get_alignments_py(
11 | _py: Python,
12 | a: Vec<&str>,
13 | b: Vec<&str>,
14 | ) -> PyResult<(Alignment, Alignment)> {
15 | Ok(get_alignments(&a, &b))
16 | }
17 |
18 | #[pyfn(m, "get_charmap")]
19 | pub fn get_charmap_py(_py: Python, a: &str, b: &str) -> PyResult<(CharMap, CharMap)> {
20 | Ok(get_charmap(a, b))
21 | }
22 |
23 | Ok(())
24 | }
25 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "tokenizations"
3 | version = "0.4.2"
4 | license = "MIT"
5 | edition = "2018"
6 | description = "Tokenizations alignments library"
7 | homepage = "https://github.com/tamuhey/tokenizations"
8 | repository = "https://github.com/tamuhey/tokenizations"
9 | keywords = ["nlp", "text", "algorithm"]
10 | authors = ["Yohei Tamura "]
11 | readme = "README.md"
12 | documentation = "https://docs.rs/tokenizations"
13 |
14 | [dependencies]
15 | unicode-normalization = "0.1.17"
16 | seqdiff = "0.3"
17 |
18 | [dev-dependencies]
19 | quickcheck = "1"
20 | quickcheck_macros = "1"
21 | criterion = "0.3"
22 |
23 | [[bench]]
24 | name = "main"
25 | harness = false
26 |
27 | [lib]
28 | bench = false
29 |
30 | [profile.bench]
31 | debug = 2
32 |
--------------------------------------------------------------------------------
/demo/src/lib.rs:
--------------------------------------------------------------------------------
1 | mod utils;
2 |
3 | use js_sys;
4 | use tokenizations;
5 | use wasm_bindgen::prelude::*;
6 | use wasm_bindgen::JsCast;
7 |
8 | // When the `wee_alloc` feature is enabled, use `wee_alloc` as the global
9 | // allocator.
10 | #[cfg(feature = "wee_alloc")]
11 | #[global_allocator]
12 | static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT;
13 |
14 | #[wasm_bindgen]
15 | extern "C" {
16 | fn alert(s: &str);
17 | }
18 |
19 | fn as_vecstring(s: js_sys::Array) -> Vec {
20 | s.iter().map(|v| v.as_string().unwrap()).collect::>()
21 | }
22 |
23 | #[wasm_bindgen]
24 | pub fn get_alignment(s: js_sys::Array, t: js_sys::Array) -> JsValue {
25 | let s = as_vecstring(s);
26 | let t = as_vecstring(t);
27 | let ret = tokenizations::get_alignments(&s, &t);
28 | JsValue::from_serde(&ret).unwrap()
29 | }
30 |
--------------------------------------------------------------------------------
/demo/www/.eslintrc.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | root: true,
3 | parser: '@typescript-eslint/parser',
4 | plugins: [
5 | '@typescript-eslint',
6 | "react-hooks",
7 | "prettier",
8 | ],
9 | parserOptions: {
10 | tsconfigRootDir: __dirname,
11 | project: ['./tsconfig.json'],
12 | },
13 | extends: [
14 | 'plugin:@typescript-eslint/recommended-requiring-type-checking',
15 | "plugin:react/recommended",
16 | "prettier",
17 | "prettier/@typescript-eslint",
18 | "prettier/react",
19 | ],
20 | rules: {
21 | "react/jsx-props-no-spreading": "off",
22 | "no-underscore-dangle": "off",
23 | "prettier/prettier": "error",
24 | "react-hooks/rules-of-hooks": "error", // Checks rules of Hooks
25 | "react-hooks/exhaustive-deps": "warn", // Checks effect dependencies
26 | "react/prop-types": "off",
27 | }
28 | };
--------------------------------------------------------------------------------
/demo/www/webpack.config.js:
--------------------------------------------------------------------------------
1 | const path = require("path");
2 | const HtmlWebPackPlugin = require("html-webpack-plugin");
3 |
4 | const src = path.resolve(__dirname, "src");
5 | const dist = path.resolve(__dirname, "dist");
6 |
7 | module.exports = {
8 | mode: "development",
9 | entry: src + "/index.tsx",
10 | output: {
11 | path: dist,
12 | filename: "bundle.js",
13 | },
14 | module: {
15 | rules: [
16 | {
17 | test: /\.ts(x?)$/,
18 | exclude: /node_modules/,
19 | use: {
20 | loader: "ts-loader",
21 | },
22 | },
23 | {
24 | enforce: "pre",
25 | test: /\.js$/,
26 | loader: "source-map-loader",
27 | },
28 | ],
29 | },
30 | resolve: {
31 | extensions: [".js", ".jsx", ".ts", "tsx"],
32 | },
33 | plugins: [
34 | new HtmlWebPackPlugin({
35 | template: src + "/index.html",
36 | filename: "index.html",
37 | }),
38 | ],
39 | };
40 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # For Maintainer
2 |
3 | ## Publishing flow
4 |
5 | Testing and publishing are done automatically by GitHub Actions.
6 | The workflow definitions are located under the `.github` directory.
7 |
8 | ### Rust
9 |
10 | 1. Get a token from crates.io and store it as the `CRATES_PASS` secret via the settings page of this repository
11 | - A token can be issued in the "API Access" section of the [crates.io account settings page](https://crates.io/me)
12 | 1. Update the version in `Cargo.toml` (e.g. 0.1.0)
13 | 1. Tag the release in Git with the prefix `rust/` (e.g. `git tag rust/0.1.0`)
14 | 1. Push the tag to master
15 | 1. CI automatically publishes the new crate to crates.io after testing
16 |
17 | ### Python
18 |
19 | 1. Store the PyPI user name and password as the `PYPI_USER` and `PYPI_PASS` secrets, respectively, via the settings page of this repository.
20 | 1. Update the version in `python/pyproject.toml`, `python/Cargo.toml`, and `python/src/lib.rs`
21 | - This is easily done with [pyversionup](https://github.com/tamuhey/pyversionup): e.g. `versionup 0.1.0`
22 | 1. Tag the release in Git with the prefix `python/`
23 | 1. Push the tag to master
24 | 1. CI automatically publishes the package to PyPI after testing
25 |
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 tamuhey
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/demo/www/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | Copyright (c) [year] [name]
2 |
3 | Permission is hereby granted, free of charge, to any
4 | person obtaining a copy of this software and associated
5 | documentation files (the "Software"), to deal in the
6 | Software without restriction, including without
7 | limitation the rights to use, copy, modify, merge,
8 | publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software
10 | is furnished to do so, subject to the following
11 | conditions:
12 |
13 | The above copyright notice and this permission notice
14 | shall be included in all copies or substantial portions
15 | of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | DEALINGS IN THE SOFTWARE.
26 |
--------------------------------------------------------------------------------
/demo/LICENSE_MIT:
--------------------------------------------------------------------------------
1 | Copyright (c) 2018 Yohei Tamura
2 |
3 | Permission is hereby granted, free of charge, to any
4 | person obtaining a copy of this software and associated
5 | documentation files (the "Software"), to deal in the
6 | Software without restriction, including without
7 | limitation the rights to use, copy, modify, merge,
8 | publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software
10 | is furnished to do so, subject to the following
11 | conditions:
12 |
13 | The above copyright notice and this permission notice
14 | shall be included in all copies or substantial portions
15 | of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | DEALINGS IN THE SOFTWARE.
26 |
--------------------------------------------------------------------------------
/demo/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "demo"
3 | version = "0.1.0"
4 | authors = ["Yohei Tamura "]
5 | edition = "2018"
6 |
7 | [lib]
8 | crate-type = ["cdylib", "rlib"]
9 |
10 | [features]
11 | default = ["console_error_panic_hook"]
12 |
13 | [dependencies]
14 | wasm-bindgen = {version = "0.2", features= ["serde-serialize"]}
15 | tokenizations = "0.2.2"
16 | js-sys = "0.3.37"
17 |
18 | # The `console_error_panic_hook` crate provides better debugging of panics by
19 | # logging them with `console.error`. This is great for development, but requires
20 | # all the `std::fmt` and `std::panicking` infrastructure, so isn't great for
21 | # code size when deploying.
22 | console_error_panic_hook = { version = "0.1.1", optional = true }
23 |
24 | # `wee_alloc` is a tiny allocator for wasm that is only ~1K in code size
25 | # compared to the default allocator's ~10K. It is slower than the default
26 | # allocator, however.
27 | #
28 | # Unfortunately, `wee_alloc` requires nightly Rust when targeting wasm for now.
29 | wee_alloc = { version = "0.4.2", optional = true }
30 |
31 | [dev-dependencies]
32 | wasm-bindgen-test = "0.2"
33 |
34 | [profile.release]
35 | # Tell `rustc` to optimize for small code size.
36 | opt-level = "s"
37 |
--------------------------------------------------------------------------------
/benches/main.rs:
--------------------------------------------------------------------------------
1 | use criterion::{black_box, criterion_group, criterion_main, Criterion};
2 | use tokenizations;
3 |
4 | fn get_alignments(c: &mut Criterion) {
5 | let mut group = c.benchmark_group("get_alignments");
6 | let s = black_box(vec![
7 | "asd",
8 | "asdfasdf",
9 | "asdfa",
10 | "-02 t",
11 | "q2-0t",
12 | "q -q24t0-q4t2",
13 | ]);
14 | let t = black_box(vec![
15 | "asd",
16 | "afasdf",
17 | "0sdfa",
18 | "-02t",
19 | "q2---0t",
20 | "q --:あh4t0-q4t2",
21 | ]);
22 | let u = black_box(vec![
23 | "zzz",
24 | "zzzzzz",
25 | "ppppp",
26 | "pppp",
27 | "ppppppp",
28 | "ppppppppppppppp",
29 | ]);
30 |
31 | group.bench_function("handmade short", |b| {
32 | b.iter(|| tokenizations::get_alignments(&s, &t))
33 | });
34 |
35 | let n = black_box(100);
36 | let s_long = s.repeat(n);
37 | let t_long = t.repeat(n);
38 | let u_long = u.repeat(n);
39 | group.bench_function("handmade long", |b| {
40 | b.iter(|| tokenizations::get_alignments(&s_long, &t_long))
41 | });
42 |
43 | group.bench_function("identical short", |b| {
44 | b.iter(|| tokenizations::get_alignments(&s, &s))
45 | });
46 |
47 | group.bench_function("identical long", |b| {
48 | b.iter(|| tokenizations::get_alignments(&s_long, &s_long))
49 | });
50 |
51 | group.bench_function("completely different short", |b| {
52 | b.iter(|| tokenizations::get_alignments(&s, &u))
53 | });
54 |
55 | group.bench_function("completely different long", |b| {
56 | b.iter(|| tokenizations::get_alignments(&s_long, &u_long))
57 | });
58 | group.finish()
59 | }
60 |
61 | criterion_group!(benches, get_alignments);
62 | criterion_main!(benches);
63 |
--------------------------------------------------------------------------------
/demo/www/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "create-wasm-app",
3 | "version": "0.1.0",
4 | "description": "create an app to consume rust-generated wasm packages",
5 | "main": "index.js",
6 | "bin": {
7 | "create-wasm-app": ".bin/create-wasm-app.js"
8 | },
9 | "scripts": {
10 | "start": "webpack-dev-server",
11 | "build": "webpack --mode production",
12 | "predeploy": "npm run build",
13 | "deploy": "gh-pages -d dist"
14 | },
15 | "repository": {
16 | "type": "git",
17 | "url": "git+https://github.com/rustwasm/create-wasm-app.git"
18 | },
19 | "keywords": [
20 | "webassembly",
21 | "wasm",
22 | "rust",
23 | "webpack"
24 | ],
25 | "author": "Ashley Williams ",
26 | "license": "(MIT OR Apache-2.0)",
27 | "bugs": {
28 | "url": "https://github.com/rustwasm/create-wasm-app/issues"
29 | },
30 | "homepage": "https://github.com/rustwasm/create-wasm-app#readme",
31 | "devDependencies": {
32 | "@babel/core": "^7.9.0",
33 | "@babel/preset-env": "^7.9.5",
34 | "@babel/preset-react": "^7.9.4",
35 | "@types/react": "^16.9.34",
36 | "@types/react-dom": "^16.9.6",
37 | "babel-loader": "^8.1.0",
38 | "copy-webpack-plugin": "^5.0.0",
39 | "eslint": "^6.8.0",
40 | "eslint-config-prettier": "^6.10.1",
41 | "eslint-plugin-import": "^2.20.2",
42 | "eslint-plugin-jsx-a11y": "^6.2.3",
43 | "eslint-plugin-prettier": "^3.1.2",
44 | "eslint-plugin-react": "^7.19.0",
45 | "eslint-plugin-react-hooks": "^3.0.0",
46 | "gh-pages": "^2.2.0",
47 | "hello-wasm-pack": "^0.1.0",
48 | "html-loader": "^1.1.0",
49 | "html-webpack-plugin": "^4.2.0",
50 | "prettier": "^2.0.2",
51 | "source-map-loader": "^0.2.4",
52 | "ts-loader": "^7.0.1",
53 | "webpack": "^4.29.3",
54 | "webpack-cli": "^3.1.0",
55 | "webpack-dev-server": "^3.1.5"
56 | },
57 | "dependencies": {
58 | "@material-ui/core": "^4.9.11",
59 | "@material-ui/icons": "^4.9.1",
60 | "@material-ui/styles": "^4.10.0",
61 | "react": "^16.13.1",
62 | "react-dom": "^16.13.1",
63 | "react-lineto": "^3.1.4",
64 | "react-scripts": "3.4.1",
65 | "tokenization": "file:../pkg",
66 | "typescript": "~3.7.2"
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/demo/README.md:
--------------------------------------------------------------------------------
1 |
19 |
20 | ## About
21 |
22 | [**📚 Read this template tutorial! 📚**][template-docs]
23 |
24 | This template is designed for compiling Rust libraries into WebAssembly and
25 | publishing the resulting package to NPM.
26 |
27 | Be sure to check out [other `wasm-pack` tutorials online][tutorials] for other
28 | templates and usages of `wasm-pack`.
29 |
30 | [tutorials]: https://rustwasm.github.io/docs/wasm-pack/tutorials/index.html
31 | [template-docs]: https://rustwasm.github.io/docs/wasm-pack/tutorials/npm-browser-packages/index.html
32 |
33 | ## 🚴 Usage
34 |
35 | ### 🐑 Use `cargo generate` to Clone this Template
36 |
37 | [Learn more about `cargo generate` here.](https://github.com/ashleygwilliams/cargo-generate)
38 |
39 | ```
40 | cargo generate --git https://github.com/rustwasm/wasm-pack-template.git --name my-project
41 | cd my-project
42 | ```
43 |
44 | ### 🛠️ Build with `wasm-pack build`
45 |
46 | ```
47 | wasm-pack build
48 | ```
49 |
50 | ### 🔬 Test in Headless Browsers with `wasm-pack test`
51 |
52 | ```
53 | wasm-pack test --headless --firefox
54 | ```
55 |
56 | ### 🎁 Publish to NPM with `wasm-pack publish`
57 |
58 | ```
59 | wasm-pack publish
60 | ```
61 |
62 | ## 🔋 Batteries Included
63 |
64 | * [`wasm-bindgen`](https://github.com/rustwasm/wasm-bindgen) for communicating
65 | between WebAssembly and JavaScript.
66 | * [`console_error_panic_hook`](https://github.com/rustwasm/console_error_panic_hook)
67 | for logging panic messages to the developer console.
68 | * [`wee_alloc`](https://github.com/rustwasm/wee_alloc), an allocator optimized
69 | for small code size.
70 |
--------------------------------------------------------------------------------
/python/tests/test_main.py:
--------------------------------------------------------------------------------
1 | from tokenizations import get_original_spans
2 | import pytest
3 | import tokenizations
4 | from hypothesis import given
5 | from hypothesis import strategies as st
6 |
7 |
8 | @given(st.lists(st.text()), st.lists(st.text()))
9 | def test_random(a, b):
10 | tokenizations.get_alignments(a, b)
11 |
12 |
13 | @given(st.lists(st.text()))
14 | def test_equality(a):
15 | a2b, b2a = tokenizations.get_alignments(a, a)
16 | assert a2b == b2a
17 | assert a2b == [[i] if len(aa) else [] for i, aa in enumerate(a)]
18 |
19 |
20 | @pytest.mark.parametrize(
21 | "input_,expected",
22 | [
23 | ((["fo", "o"], ["foo"]), ([[0], [0]], [[0, 1]])),
24 | ((["fø", "o"], ["foo"]), ([[0], [0]], [[0, 1]])),
25 | ((["New", "York"], ["New York"]), ([[0], [0]], [[0, 1]])),
26 | (
27 | (["今日は", "\t", "いい", "天気だ", "。"], ["今日", "は", "いい", "天気", "た", "。"]),
28 | ([[0, 1], [], [2], [3, 4], [5]], [[0], [0], [2], [3], [3], [4]]),
29 | ),
30 | ],
31 | )
32 | def test_get_alignments(input_, expected):
33 | output = tokenizations.get_alignments(*input_)
34 | assert output == expected
35 |
36 |
37 | @pytest.mark.parametrize(
38 | "input_,expected", [(("foo", "fo0"), ([[0], [1], []], [[0], [1], []]))]
39 | )
40 | def test_get_charmap(input_, expected):
41 | assert tokenizations.get_charmap(*input_) == expected
42 |
43 |
44 | @given(st.text(), st.text())
45 | def test_random_charmap(a, b):
46 | tokenizations.get_charmap(a, b)
47 |
48 |
49 | @given(st.text())
50 | def test_equality_charmap(a):
51 | a2b, b2a = tokenizations.get_charmap(a, a)
52 | assert a2b == b2a
53 | assert a2b == [[x] for x in range(len(a))]
54 |
55 |
56 | VERSION_DEPRECATE_WARN_GET_ORIGINAL_SPANS = "0.7"
57 | VERSION_DEPRECATE_ERR_GET_ORIGINAL_SPANS = "0.8"
58 |
59 |
60 | @pytest.mark.skipif(
61 | not (
62 | VERSION_DEPRECATE_WARN_GET_ORIGINAL_SPANS
63 | <= tokenizations.__version__
64 | < VERSION_DEPRECATE_ERR_GET_ORIGINAL_SPANS
65 | ),
66 | reason="deprecation check",
67 | )
68 | def test_warn_get_original_spans():
69 | with pytest.warns(DeprecationWarning):
70 | get_original_spans([], "")
71 |
72 |
73 | @pytest.mark.skipif(
74 | tokenizations.__version__ < VERSION_DEPRECATE_ERR_GET_ORIGINAL_SPANS,
75 | reason="deprecation error check",
76 | )
77 | def test_error_get_original_spans():
78 | with pytest.raises(ValueError):
79 | get_original_spans([], "")
80 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Robust and Fast tokenizations alignment library for Rust and Python
2 | [crates.io](https://crates.io/crates/tokenizations)
3 | [PyPI](https://pypi.org/project/pytokenizations/)
4 | [Actions Status](https://github.com/explosion/tokenizations/actions)
5 |
6 | 
7 |
8 | Demo: [demo](https://tamuhey.github.io/tokenizations/)
9 | Rust document: [docs.rs](https://docs.rs/tokenizations)
10 | Blog post: [How to calculate the alignment between BERT and spaCy tokens effectively and robustly](https://gist.github.com/tamuhey/af6cbb44a703423556c32798e1e1b704)
11 |
12 | ## Usage (Python)
13 |
14 | - Installation
15 |
16 | ```bash
17 | $ pip install -U pip # update pip
18 | $ pip install pytokenizations
19 | ```
20 |
21 | - Or, install from source
22 |
23 | This library uses [maturin](https://github.com/PyO3/maturin) to build the wheel.
24 |
25 | ```console
26 | $ git clone https://github.com/tamuhey/tokenizations
27 | $ cd tokenizations/python
28 | $ pip install maturin
29 | $ maturin build
30 | ```
31 |
32 | Now the wheel is created in the `python/target/wheels` directory, and you can install it with `pip install *whl`.
33 |
34 | ### `get_alignments`
35 |
36 | ```python
37 | def get_alignments(a: Sequence[str], b: Sequence[str]) -> Tuple[List[List[int]], List[List[int]]]: ...
38 | ```
39 |
40 | Returns alignment mappings for two different tokenizations:
41 |
42 | ```python
43 | >>> tokens_a = ["å", "BC"]
44 | >>> tokens_b = ["abc"] # the accent is dropped (å -> a) and the letters are lowercased(BC -> bc)
45 | >>> a2b, b2a = tokenizations.get_alignments(tokens_a, tokens_b)
46 | >>> print(a2b)
47 | [[0], [0]]
48 | >>> print(b2a)
49 | [[0, 1]]
50 | ```
51 |
52 | `a2b[i]` is a list representing the alignment from `tokens_a` to `tokens_b`.
53 |
54 | ## Usage (Rust)
55 |
56 | See here: [docs.rs](https://docs.rs/tokenizations)
57 |
58 | ## Related
59 |
60 | - [Algorithm overview](./note/algorithm.md)
61 | - [Blog post](./note/blog_post.md)
62 | - [seqdiff](https://github.com/tamuhey/seqdiff) is used for the diff process.
63 | - [textspan](https://github.com/tamuhey/textspan)
64 | - [explosion/spacy-alignments: 💫 A spaCy package for Yohei Tamura's Rust tokenizations library](https://github.com/explosion/spacy-alignments)
65 | - Python bindings for this library, maintained by Explosion, the authors of spaCy. If you have difficulty installing pytokenizations, please try this package instead.
66 |
--------------------------------------------------------------------------------
/python/.gitignore:
--------------------------------------------------------------------------------
1 | ### https://raw.github.com/github/gitignore/499ae899e7b54e701e878759f73d9092302fd07a/Python.gitignore
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | pip-wheel-metadata/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # celery beat schedule file
96 | celerybeat-schedule
97 |
98 | # SageMath parsed files
99 | *.sage.py
100 |
101 | # Environments
102 | .env
103 | .venv
104 | env/
105 | venv/
106 | ENV/
107 | env.bak/
108 | venv.bak/
109 |
110 | # Spyder project settings
111 | .spyderproject
112 | .spyproject
113 |
114 | # Rope project settings
115 | .ropeproject
116 |
117 | # mkdocs documentation
118 | /site
119 |
120 | # mypy
121 | .mypy_cache/
122 | .dmypy.json
123 | dmypy.json
124 |
125 | # Pyre type checker
126 | .pyre/
127 |
128 | .vscode/
129 | .benchmarks
130 | poetry.lock
131 |
--------------------------------------------------------------------------------
/note/algorithm.md:
--------------------------------------------------------------------------------
1 | # Algorithm
2 |
3 | Let $A = a_{11}a_{12}..a_{1k_1},a_{21}..a_{Nk_N}$ and $B = b_{11}b_{12}..b_{1l_1},b_{21}..b_{Ml_M}$ be tokens of length N and M respectively. Each token $A_i$ in $A$ and $B_j$ in $B$ have length $k_i$ and $l_j$ respectively.
4 | The *alignment* $AL_{AB}$ of $A$ to $B$ is such that $ \forall j \in AL_{AB,i} => B_j \cap A_i $. ($t \cap s$ means t partially matches s.)
5 | For example, $a=["f","o","o"], b=["fo","o"] => AL_{AB} = [[1],[1],[2]], AL_{BA} = [[1, 2], [3]]$.
6 | The goal of this algorithm is to find such $AL_{AB}$ and $AL_{BA}$
7 |
8 | 1. Normalize tokens in the unicode normalization form "NFKD", then lowercase all characters.
9 | 2. Concatenate all tokens $A$ and $B$ to generate $TA$ and $TB$ respectively
10 | 3. Calculate shortest path on edit graph of $TA$ and $TB$
11 | 4. Get character mapping $C_{AB}$ and $C_{BA}$ from the edit graph
12 | 5. Get $AL_{AB}$ and $AL_{BA}$ from the character alignments $C_{AB}$ and $C_{BA}$
13 |
14 | Details:
15 |
16 | 1. Normalize tokens in the unicode normalization form "NFKD"
17 |
18 | To compare the token positions, we must compare the characters in the tokens. Because the two tokenizations may differ in surface form (for example, in accents or letter case), we normalize them to "NFKD" and lowercase them first.
19 |
20 | 2. Concatenate all tokens $A$ and $B$ to generate $TA$ and $TB$ respectively
21 |
22 | Before calculating the edit graph, we combine tokens into text. For example, if we have tokens `["Foo", "bar"]`, we concatenate them into one text `Foobar`.
23 |
24 | 3. Calculate shortest path on edit graph from $TA$ and $TB$
25 |
26 | We calculate the shortest path on the edit graph of the texts $TA$ and $TB$ to get the character map between them. The path can be calculated, for example, by [Myers' algorithm](http://www.xmailserver.org/diff2.pdf).
27 |
28 | 4. Get character alignments $C_{AB}$ and $C_{BA}$ from the edit graph
29 |
30 | Let $TA_i$ and $TB_j$ be the i-th and j-th characters of the texts $TA$ and $TB$, respectively. $C_{AB}$ is a mapping from $TA$ to $TB$ such that $C_{AB,i} \neq -1 \land C_{AB,i} = j \Rightarrow TA_i = TB_j$. For example, if $TA = f0o$ and $TB = fboo$, then $C_{AB} = [1,-1,3]$ and $C_{BA} = [1,-1,3,-1]$.
31 | We can calculate $C_{AB}$ and $C_{BA}$ from the shortest path on the edit graph. If the path contains a diagonal edge $(i-1,j-1) \to (i, j)$, then $C_{AB,i} = j$ and $C_{BA,j} = i$. If the path contains no diagonal edge into $(i, j)$ for any $j$, then $C_{AB,i} = -1$ (and likewise $C_{BA,j} = -1$ when no diagonal edge enters $(i, j)$ for any $i$).
32 |
33 | 5. Get $AL_{AB}$ and $AL_{BA}$ from the character alignments $C_{AB}$ and $C_{BA}$
34 |
35 | Now we can calculate the desired $AL_{AB}$ and $AL_{BA}$ from the previous calculated $C_{AB}$ and $C_{BA}$.
--------------------------------------------------------------------------------
/demo/www/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
create-wasm-app
4 |
5 |
An npm init template for kick starting a project that uses NPM packages containing Rust-generated WebAssembly and bundles them with Webpack.
6 |
7 |
8 |
9 |
10 |
11 |
12 | Usage
13 | |
14 | Chat
15 |
16 |
17 |
Built with 🦀🕸 by The Rust and WebAssembly Working Group
18 |
19 |
20 | ## About
21 |
22 | This template is designed for depending on NPM packages that contain
23 | Rust-generated WebAssembly and using them to create a Website.
24 |
25 | * Want to create an NPM package with Rust and WebAssembly? [Check out
26 | `wasm-pack-template`.](https://github.com/rustwasm/wasm-pack-template)
27 | * Want to make a monorepo-style Website without publishing to NPM? Check out
28 | [`rust-webpack-template`](https://github.com/rustwasm/rust-webpack-template)
29 | and/or
30 | [`rust-parcel-template`](https://github.com/rustwasm/rust-parcel-template).
31 |
32 | ## 🚴 Usage
33 |
34 | ```
35 | npm init wasm-app
36 | ```
37 |
38 | ## 🔋 Batteries Included
39 |
40 | - `.gitignore`: ignores `node_modules`
41 | - `LICENSE-APACHE` and `LICENSE-MIT`: most Rust projects are licensed this way, so these are included for you
42 | - `README.md`: the file you are reading now!
43 | - `index.html`: a bare bones html document that includes the webpack bundle
44 | - `index.js`: example js file with a comment showing how to import and use a wasm pkg
45 | - `package.json` and `package-lock.json`:
46 | - pulls in devDependencies for using webpack:
47 | - [`webpack`](https://www.npmjs.com/package/webpack)
48 | - [`webpack-cli`](https://www.npmjs.com/package/webpack-cli)
49 | - [`webpack-dev-server`](https://www.npmjs.com/package/webpack-dev-server)
50 | - defines a `start` script to run `webpack-dev-server`
51 | - `webpack.config.js`: configuration file for bundling your js with webpack
52 |
53 | ## License
54 |
55 | Licensed under either of
56 |
57 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
58 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
59 |
60 | at your option.
61 |
62 | ### Contribution
63 |
64 | Unless you explicitly state otherwise, any contribution intentionally
65 | submitted for inclusion in the work by you, as defined in the Apache-2.0
66 | license, shall be dual licensed as above, without any additional terms or
67 | conditions.
68 |
--------------------------------------------------------------------------------
/src/tests.rs:
--------------------------------------------------------------------------------
1 | use crate::*;
2 |
3 | #[test]
4 | fn test_get_alignment() {
5 | let testcases = vec![
6 | (
7 | (vec!["fあo①が", "bar"], vec!["fあo1かb", "ar"]),
8 | (vec![vec![0], vec![0, 1]], vec![vec![0, 1], vec![1]]),
9 | ),
10 | (
11 | (vec!["New York"], vec!["New", "York"]),
12 | (vec![vec![0, 1]], vec![vec![0], vec![0]]),
13 | ),
14 | (
15 | (vec!["A'B"], vec!["A", "B"]),
16 | (vec![vec![0, 1]], vec![vec![0], vec![0]]),
17 | ),
18 | (
19 | (vec!["A'b"], vec!["a", "b"]),
20 | (vec![vec![0, 1]], vec![vec![0], vec![0]]),
21 | ),
22 | (
23 | (vec![""], vec!["", ""]),
24 | (vec![vec![]], vec![vec![], vec![]]),
25 | ),
26 | (
27 | (vec!["à", "la", "gorge"], vec!["a", "la", "gorge"]),
28 | (
29 | vec![vec![0], vec![1], vec![2]],
30 | vec![vec![0], vec![1], vec![2]],
31 | ),
32 | ),
33 | ];
34 | for (input, expected) in testcases {
35 | assert_eq!(get_alignments(&input.0, &input.1), expected);
36 | }
37 | }
38 |
39 | #[test]
40 | fn test_get_char2token() {
41 | let testcases = vec![(vec!["a", "bc"], vec![0, 1, 1])];
42 | for (input, expected) in testcases.into_iter() {
43 | assert_eq!(get_char2token(&input), expected);
44 | }
45 | }
46 | #[test]
47 | fn test_get_charmap() {
48 | let testcases = vec![
49 | ("å", "å", vec![vec![0, 1]], vec![vec![0], vec![0]]),
50 | (
51 | "あがさ",
52 | "あかさ",
53 | vec![vec![0], vec![1], vec![2]],
54 | vec![vec![0], vec![1], vec![2]],
55 | ),
56 | ("", "a", vec![], vec![vec![]]),
57 | ("", "", vec![], vec![]),
58 | (
59 | "å\tb",
60 | "a b",
61 | vec![vec![0], vec![], vec![2]],
62 | vec![vec![0], vec![], vec![2]],
63 | ),
64 | (
65 | "a\tb",
66 | "a b",
67 | vec![vec![0], vec![], vec![2]],
68 | vec![vec![0], vec![], vec![2]],
69 | ),
70 | (
71 | "2000",
72 | "2000",
73 | vec![vec![0], vec![1], vec![2], vec![3]],
74 | vec![vec![0], vec![1], vec![2], vec![3]],
75 | ),
76 | ("¨", "", vec![vec![]], vec![]),
77 | (
78 | "hello``world``",
79 | "Hello \"world\"",
80 | vec![
81 | vec![0],
82 | vec![1],
83 | vec![2],
84 | vec![3],
85 | vec![4],
86 | vec![],
87 | vec![],
88 | vec![7],
89 | vec![8],
90 | vec![9],
91 | vec![10],
92 | vec![11],
93 | vec![],
94 | vec![],
95 | ],
96 | vec![
97 | vec![0],
98 | vec![1],
99 | vec![2],
100 | vec![3],
101 | vec![4],
102 | vec![],
103 | vec![],
104 | vec![7],
105 | vec![8],
106 | vec![9],
107 | vec![10],
108 | vec![11],
109 | vec![],
110 | ],
111 | ),
112 | ];
113 | for (a, b, e_a2b, e_b2a) in testcases {
114 | let (a2b, b2a) = get_charmap(a, b);
115 | assert_eq!(a2b.len(), a.chars().count(), "a2b {:?}", a2b);
116 | assert_eq!(b2a.len(), b.chars().count(), "b2a {:?}", b2a);
117 | assert_eq!(
118 | a2b, e_a2b,
119 | "check a2b
120 | a: {:?}
121 | b: {:?}
122 | ",
123 | a, b
124 | );
125 | assert_eq!(
126 | b2a, e_b2a,
127 | "check b2a
128 | a: {:?}
129 | b: {:?}
130 | ",
131 | a, b
132 | );
133 | }
134 | }
135 |
--------------------------------------------------------------------------------
/.github/workflows/manylinux_build.yml:
--------------------------------------------------------------------------------
1 | name: build manylinux
2 |
3 | on:
4 | push:
5 |
6 | jobs:
7 | build:
8 | if: contains(github.event.head_commit.message, '[skip ci]') == false
9 | runs-on: ubuntu-latest
10 | strategy:
11 | matrix:
12 | python-version:
13 | - 3.7
14 | - 3.8
15 | - 3.9
16 | container:
17 | image: quay.io/pypa/manylinux2010_x86_64
18 | env:
19 | PATH: /root/.cargo/bin:/root/.local/bin:/opt/python/cp36-cp36m/bin:/opt/python/cp37-cp37m/bin:/opt/python/cp38-cp38/bin:/opt/rh/devtoolset-2/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/python/cp35-cp35m/bin:/opt/python/cp36-cp36m/bin:/opt/python/cp37-cp37m/bin:/opt/python/cp38-cp38/bin:/opt/python/cp39-cp39/bin:/opt/rh/devtoolset-8/root/usr/bin
20 | options: --user root
21 | env:
22 | HOME: /root
23 | PYTHON: python${{ matrix.python-version }}
24 | steps:
25 | - uses: actions/checkout@v1
26 | - name: Install rust
27 | run: |
28 | curl --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal
29 | - name: Test rust lib
30 | run: cargo test
31 |
32 | - name: Install dependencies with pip
33 | working-directory: python
34 | run: |
35 | $PYTHON -m pip install --upgrade pip
36 | $PYTHON -m venv .venv
37 | $PYTHON -m pip install poetry maturin
38 | poetry install
39 | poetry run which python
40 |
41 | - name: Build python package
42 | working-directory: python
43 | run: poetry run maturin develop
44 |
45 | - name: Test with pytest
46 | working-directory: python
47 | run: poetry run pytest
48 |
49 | - name: Install publishment tool
50 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/')
51 | working-directory: python
52 | run: $PYTHON -m pip install twine auditwheel
53 |
54 | - name: Build
55 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/')
56 | working-directory: python
57 | run: |
58 | maturin build --no-sdist --release --strip -i $PYTHON
59 | find target/ -type f -name "*whl" -exec $PYTHON -m auditwheel repair {} \;
60 |
61 | - name: Publish test pypi
62 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/')
63 | working-directory: python
64 | run: |
65 | twine upload target/wheels/*whl --repository-url https://test.pypi.org/legacy/ -u ${{ secrets.TEST_PYPI_USER }} -p ${{ secrets.TEST_PYPI_PASS }}
66 |
67 | - name: Publish pypi
68 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/')
69 | working-directory: python
70 | run: |
71 | twine upload target/wheels/*whl -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASS }}
72 |
73 | build-aarch64:
74 | strategy:
75 | matrix:
76 | pyver: [cp37-cp37m, cp38-cp38, cp39-cp39]
77 | runs-on: ubuntu-latest
78 | env:
79 | py: /opt/python/${{ matrix.pyver }}/bin/python
80 | img: quay.io/pypa/manylinux2014_aarch64
81 | steps:
82 | - name: Checkout
83 | uses: actions/checkout@v2
84 | - name: Set up QEMU
85 | id: qemu
86 | uses: docker/setup-qemu-action@v1
87 | - name: Build Wheel
88 | run: |
89 | docker run --rm -v ${{ github.workspace }}:/ws:rw --workdir=/ws \
90 | ${{ env.img }} \
91 | bash -exc '${{ env.py }} -m venv .env && \
92 | source .env/bin/activate && \
93 | curl --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal && \
94 | source $HOME/.cargo/env && \
95 | cargo test && \
96 | pip install --upgrade pip && \
97 | pip install poetry maturin && \
98 | cd python && \
99 | poetry install && \
100 | poetry run which python && \
101 | poetry run maturin develop && \
102 | poetry run pytest && \
103 | maturin build --no-sdist --release --strip -i ${{ env.py }} && \
104 | for WHL in target/wheels/*whl; do auditwheel repair ${WHL}; done && \
105 | twine upload target/wheels/*manylinux2014_aarch64.whl --repository-url https://test.pypi.org/legacy/ -u ${{ secrets.TEST_PYPI_USER }} -p ${{ secrets.TEST_PYPI_PASS }} && \
106 | twine upload target/wheels/*manylinux2014_aarch64.whl -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASS }} && \
107 | deactivate'
108 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: Test and Deploy
2 |
3 | on:
4 | push:
5 | paths-ignore:
6 | - 'README.md'
7 |
8 | jobs:
9 | test:
10 | if: contains(github.event.head_commit.message, '[skip ci]') == false
11 | runs-on: ${{ matrix.os }}
12 | strategy:
13 | matrix:
14 | python-version: [3.7, 3.8, 3.9]
15 | os: [macos-latest, windows-latest, ubuntu-latest]
16 | steps:
17 | - uses: actions/checkout@v1
18 | - name: Set up Python ${{ matrix.python-version }}
19 | uses: actions/setup-python@v1
20 | with:
21 | python-version: ${{ matrix.python-version }}
22 |
23 | - name: Install latest stable
24 | uses: actions-rs/toolchain@v1
25 | with:
26 | toolchain: stable
27 | override: true
28 | components: rustfmt, clippy
29 |
30 | - name: Lint with RustFmt
31 | uses: actions-rs/cargo@v1
32 | with:
33 | command: fmt
34 |
35 | - name: Lint with Clippy
36 | uses: actions-rs/cargo@v1
37 | with:
38 | command: clippy
39 | args: --all-targets --all-features
40 |
41 | - name: Test with cargo
42 | uses: actions-rs/cargo@v1.0.1
43 | with:
44 | command: test
45 | toolchain: stable
46 |
47 | - name: Install dependencies with pip
48 | working-directory: python
49 | run: |
50 | python -m pip install --upgrade pip
51 | pip install poetry maturin
52 | poetry install
53 |
54 | - name: Build python package
55 | working-directory: python
56 | run: poetry run maturin develop
57 |
58 | - name: Test with pytest
59 | working-directory: python
60 | run: poetry run pytest
61 |
62 | publish-rust:
63 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/rust/')
64 | needs: test
65 | runs-on: ubuntu-latest
66 | steps:
67 | - uses: actions/checkout@v1
68 | - uses: actions-rs/toolchain@v1
69 | with:
70 | toolchain: stable
71 | override: true
72 | - name: Publish to creates.io
73 | run: |
74 | cargo login ${{ secrets.CRATES_PASS }}
75 | cargo publish
76 |
77 | publish-python-wheels:
78 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/')
79 | needs: test
80 | runs-on: ${{ matrix.os }}
81 | strategy:
82 | matrix:
83 | python-version: [3.7, 3.8, 3.9]
84 | # ubuntu wheel is built in `manylinux_build.yml`
85 | os: [macos-latest, windows-latest]
86 |
87 | steps:
88 | - uses: actions/checkout@v1
89 | - uses: actions/setup-python@v1
90 | with:
91 | python-version: ${{ matrix.python-version }}
92 | - uses: actions-rs/toolchain@v1
93 | with:
94 | toolchain: stable
95 | override: true
96 |
97 | - name: Install publishment tool
98 | working-directory: python
99 | run: |
100 | python -m pip install --upgrade pip
101 | pip install maturin twine
102 |
103 | - name: Build
104 | working-directory: python
105 | run: maturin build --no-sdist --release --strip -i python
106 |
107 | - name: Publish test pypi
108 | working-directory: python
109 | run: twine upload target/wheels/*whl --repository-url https://test.pypi.org/legacy/ -u ${{ secrets.TEST_PYPI_USER }} -p ${{ secrets.TEST_PYPI_PASS }}
110 |
111 | - name: Publish pypi
112 | working-directory: python
113 | run: twine upload target/wheels/*whl -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASS }}
114 |
115 | publish-python-sdist:
116 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/')
117 | needs: test
118 | runs-on: ubuntu-latest
119 | steps:
120 | - uses: actions/checkout@v1
121 | - uses: actions/setup-python@v1
122 | with:
123 | python-version: 3.7
124 | - uses: actions-rs/toolchain@v1
125 | with:
126 | toolchain: stable
127 | override: true
128 |
129 | - name: Install publishment tool
130 | working-directory: python
131 | run: |
132 | python -m pip install --upgrade pip
133 | pip install maturin twine
134 |
135 | - name: Build sdist
136 | working-directory: python
137 | run: maturin sdist
138 |
139 | - name: Publish test pypi
140 | working-directory: python
141 | run: |
142 | twine upload target/wheels/*.tar.gz --repository-url https://test.pypi.org/legacy/ -u ${{ secrets.TEST_PYPI_USER }} -p ${{ secrets.TEST_PYPI_PASS }}
143 |
144 | - name: Publish pypi
145 | working-directory: python
146 | run: |
147 | twine upload target/wheels/*.tar.gz -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASS }}
148 |
149 |
--------------------------------------------------------------------------------
/demo/www/types/react-linto/index.d.ts:
--------------------------------------------------------------------------------
1 | declare module "react-lineto" {
2 | import { Component, PureComponent } from "react";
3 |
4 | /**
5 | * Orientation type for 'Stepped' lines
6 | */
7 | type Orientation = "h" | "v";
8 |
9 | /**
10 | * Delay
11 | */
12 | type Delay = number | boolean;
13 |
14 | /**
15 | * Anchor type
16 | */
17 | type Anchor = string;
18 |
19 | /**
20 | * Coordinate type
21 | */
22 | type Coordinate = { x: number } | { y: number };
23 |
24 | /**
25 | * Coordinates type
26 | */
27 | type Coordinates = {
28 | x: number;
29 | y: number;
30 | };
31 |
32 | /**
33 | * Line coordinates
34 | */
35 | interface LineCoordinates {
36 | /**
37 | * First X coordinate
38 | */
39 | x0: number;
40 | /**
41 | * Second X coordinate
42 | */
43 | x1: number;
44 | /**
45 | * First Y coordinate
46 | */
47 | y0: number;
48 | /**
49 | * Second Y coordinate
50 | */
51 | y1: number;
52 | }
53 |
54 | /**
55 | * Base props for all components
56 | */
57 | interface BaseProps {
58 | /**
59 | * Border color, Example: #f00, red, etc.
60 | */
61 | borderColor?: string;
62 | /**
63 | * Border style, Example: solid, dashed, etc.
64 | */
65 | borderStyle?: string;
66 | /**
67 | * Border width (px)
68 | */
69 | borderWidth?: number;
70 | /**
71 | * Desired CSS className for the rendered element
72 | */
73 | className?: string;
74 | /**
75 | * Z-index offset
76 | */
77 | zIndex?: number;
78 | /**
79 | * CSS class name of the desired container
80 | */
81 | within?: string;
82 | }
83 |
84 | /**
85 | * Common props for 'LineTo' and 'SteppedLineTo' components
86 | */
87 | interface LineToCommonProps extends BaseProps {
88 | /**
89 | * Force render after delay (ms)
90 | */
91 | delay?: Delay;
92 | /**
93 | * Anchor for starting point (Format: "x y")
94 | */
95 | fromAnchor?: Anchor;
96 | /**
97 | * CSS class name of the first element
98 | */
99 | from: string;
100 | /**
101 | * Anchor for ending point (Format: "x y")
102 | */
103 | toAnchor?: Anchor;
104 | /**
105 | * CSS class name of the second element
106 | */
107 | to: string;
108 | }
109 |
110 | /**
111 | * Common props for 'Line' and 'SteppedLine' components
112 | */
113 | interface LineCommonProps extends BaseProps, LineCoordinates {}
114 |
115 | /**
116 | * Props for 'Stepped' components
117 | */
118 | interface SteppedProps {
119 | /**
120 | * "h" for horizontal, "v" for vertical
121 | */
122 | orientation?: Orientation;
123 | }
124 |
125 | /**
126 | * Props of 'LineTo' component
127 | */
128 | export interface LineToProps extends LineToCommonProps {}
129 |
130 | /**
131 | * Props of 'SteppedLineTo' component
132 | */
133 | export interface SteppedLineToProps extends LineToProps, SteppedProps {}
134 |
135 | /**
136 | * Props of 'Line' component
137 | */
138 | export interface LineProps extends LineCommonProps {}
139 |
140 | /**
141 | * Props of 'SteppedLine' component
142 | */
143 | export interface SteppedLineProps extends LineProps, SteppedProps {}
144 |
145 | /**
146 | * Draw line between two DOM elements.
147 | */
148 | export default class LineTo<
149 | P extends LineToProps = LineToProps
150 | > extends Component> {
151 | /**
152 | * Forced update after delay (MS)
153 | */
154 | deferUpdate: (delay: number) => void;
155 |
156 | /**
157 | * Parse delay prop
158 | */
159 | parseDelay: (delay?: Delay) => number;
160 |
161 | /**
162 | * Parse anchor given as percentage
163 | */
164 | parseAnchorPercent: (value: string) => number;
165 |
166 | /**
167 | * Parse anchor given as text
168 | */
169 | parseAnchorText: (value: string) => Coordinate;
170 |
171 | /**
172 | * Parse anchor prop
173 | */
174 | parseAnchor: (value?: Anchor) => Coordinates;
175 |
176 | /**
177 | * Detect coordinates
178 | */
179 | detect: () => LineCoordinates;
180 |
181 | /**
182 | * Find element by class
183 | */
184 | findElement: (className: string) => Element;
185 | }
186 |
187 | /**
188 | * Draw stepped line between two DOM elements.
189 | */
190 | export class SteppedLineTo extends LineTo {}
191 |
192 | /**
193 | * Draw line using pixel coordinates (relative to viewport).
194 | */
195 | export class Line extends PureComponent {
196 | /**
197 | * Find element by class
198 | */
199 | findElement: (className: string) => Element;
200 | }
201 |
202 | /**
203 | * Draw stepped line using pixel coordinates (relative to viewport).
204 | */
205 | export class SteppedLine extends PureComponent {
206 | /**
207 | * Render vertically
208 | */
209 | renderVertical: () => React.ReactNode;
210 |
211 | /**
212 | * Render horizontally
213 | */
214 | renderHorizontal: () => React.ReactNode;
215 | }
216 | }
217 |
--------------------------------------------------------------------------------
/demo/www/src/index.tsx:
--------------------------------------------------------------------------------
1 | import React, { useState, useEffect } from "react";
2 | import ReactDOM from "react-dom";
3 | import GitHub from "@material-ui/icons/GitHub";
4 | import { makeStyles, createStyles, ThemeProvider } from "@material-ui/styles";
5 | import { createMuiTheme, Theme } from "@material-ui/core/styles";
6 | import Container from "@material-ui/core/Container";
7 | import Paper from "@material-ui/core/Paper";
8 | import Grid from "@material-ui/core/Grid";
9 | import TextField from "@material-ui/core/TextField";
10 | import Typography from "@material-ui/core/Typography";
11 | import Box from "@material-ui/core/Box";
12 | import Link from "@material-ui/core/Link";
13 | import LineTo from "react-lineto";
14 |
15 | const repoURL = "https://github.com/tamuhey/tokenizations";
16 | const repoWWWURL = "https://github.com/tamuhey/tokenizations/tree/master/demo";
17 | const tryParse = (input: string): [string[], boolean] => {
18 | try {
19 | const tokens = JSON.parse(input);
20 | return [tokens, false];
21 | } catch {
22 | return [[], true];
23 | }
24 | };
25 |
26 | const useStyles = makeStyles((theme: Theme) =>
27 | createStyles({
28 | textField: {
29 | fontSize: "1.3rem",
30 | },
31 | tokenBox: {
32 | padding: 10,
33 | border: "1px solid black",
34 | borderRadius: 10,
35 | },
36 | tokensContainer: {
37 | display: "flex",
38 | padding: theme.spacing(3),
39 | margin: theme.spacing(3),
40 | backgroundColor: theme.palette.background.paper,
41 | },
42 | titleBox: {
43 | display: "flex",
44 | justifyContent: "center",
45 | margin: 3,
46 | alignItems: "baseline",
47 | },
48 | githubIcon: {
49 | color: "black",
50 | marginLeft: 20,
51 | },
52 | gridContainer: {
53 | padding: 30,
54 | },
55 | container: {
56 | marginTop: 20,
57 | },
58 | })
59 | );
60 |
61 | interface InputProps {
62 | text: string;
63 | setText: (text: string) => void;
64 | error: boolean;
65 | }
66 |
67 | const theme = createMuiTheme();
68 | const Index = () => (
69 |
70 |
71 |
72 | );
73 |
74 | const App = () => {
75 | const [inputA, setInputA] = useState(`["John", "Johånson", "'s", "house"]`);
76 | const [inputB, setInputB] = useState(
77 | `["john", "johan", "##son", "s", "house"]`
78 | );
79 | const [tokensA, errorA] = tryParse(inputA);
80 | const [tokensB, errorB] = tryParse(inputB);
81 | const [tokenization, setTokenization] = useState(null);
82 | const loadWasm = async () => setTokenization(await import("tokenization"));
83 | const classes = useStyles();
84 |
85 | useEffect(() => {
86 | loadWasm();
87 | });
88 | const [a2b]: number[][][] = tokenization
89 | ? tokenization.get_alignment(tokensA, tokensB)
90 | : [[], []];
91 | console.log(a2b);
92 | return (
93 |
94 |
95 |
96 | Tokenizations Demo
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 | Tokenizations is a token alignment
105 | library for Rust and Python. Feel free to change the below texts.
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 | {tokensA.map((token, i) => (
118 |
119 | {token}
120 |
121 | ))}
122 |
123 |
124 |
125 |
126 | {tokensB.map((token, i) => {
127 | return (
128 |
129 | {token}
130 |
131 | );
132 | })}
133 |
134 |
135 |
136 | {a2b.map((l, i) => {
137 | return l.map((j) => (
138 |
148 | ));
149 | })}
150 |
151 |
152 | This page is built with React and Wasm. The source is{" "}
153 | here.
154 |
155 |
156 |
157 |
158 |
159 | );
160 | };
161 |
// Text-field component for editing one of the token arrays.
//
// Forwards `e.target.value` to `setText` on change, passes `error` through to
// the field, applies `classes.textField` to the inner input, and shows the
// helper text "Invalid JSON array" only while `error` is true.
//
// NOTE(review): the opening JSX tag and its first attributes appear to have been
// stripped from this copy of the file (inner lines 165-167 are incomplete).
162 | const Input = ({ text, setText, error }: InputProps) => {
163 | const classes = useStyles();
164 | return (
165 | setText(e.target.value)}
168 | error={error}
169 | fullWidth
170 | InputProps={{
171 | classes: {
172 | input: classes.textField,
173 | },
174 | }}
175 | helperText={error ? "Invalid JSON array" : ""}
176 | />
177 | );
178 | };
179 |
// Mounts the application into the DOM element with id "container".
// NOTE(review): the element argument appears to have been stripped from this
// copy of the file — confirm against the real source.
180 | ReactDOM.render( , document.getElementById("container"));
181 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![deny(warnings)]
2 | //! Tokenizations alignment functions.
3 | #[cfg(test)]
4 | mod tests;
5 | #[cfg(test)]
6 | extern crate quickcheck;
7 | #[cfg(test)]
8 | extern crate quickcheck_macros;
9 | extern crate seqdiff;
10 | extern crate unicode_normalization;
11 | use seqdiff::Diff;
12 | use unicode_normalization::UnicodeNormalization;
13 |
/// Token-level alignment: `alignment[i]` lists the indices of the tokens in the
/// other sequence that token `i` corresponds to.
pub type Alignment = Vec<Vec<usize>>;
/// Character-level mapping with the same layout as [`Alignment`], where every
/// "token" is a single character.
pub type CharMap = Vec<Vec<usize>>;
16 |
17 | fn normalize(text: &str) -> String {
18 | text.to_lowercase().nfkd().collect()
19 | }
20 |
/// Maps every character position of the concatenated tokens to its token index.
///
/// The result has one entry per `char` (Unicode scalar value, not byte) of all
/// `tokens` joined together; entry `k` is the index of the token that contains
/// the `k`-th character. Empty tokens contribute no entries.
fn get_char2token<T: AsRef<str>>(tokens: &[T]) -> Vec<usize> {
    tokens
        .iter()
        .enumerate()
        .flat_map(|(i, token)| std::iter::repeat(i).take(token.as_ref().chars().count()))
        .collect()
}
36 |
// Returns the token alignment from sequence `a` to sequence `b`.
//
// * `num_tokens` - number of tokens in `a` (length of the returned vector).
// * `a2b` - for each character of `a`, the index of the matching character of
//   `b`, or `None` if that character has no counterpart.
// * `ac2t` / `bc2t` - character-index to token-index tables for `a` and `b`.
//
// Entry `i` of the result lists the `b` token indices reached from the
// characters of `a` token `i`. A token index is not pushed again when it equals
// the last index already recorded for that token.
fn get_alignment(
    num_tokens: usize,
    a2b: &[Option<usize>],
    ac2t: &[usize],
    bc2t: &[usize],
) -> Vec<Vec<usize>> {
    let mut at2bt = vec![Vec::new(); num_tokens];
    for (&a_token, a2bi) in ac2t.iter().zip(a2b) {
        if let Some(b_char) = *a2bi {
            let b_token = bc2t[b_char];
            // Consecutive characters of one token usually land in the same
            // `b` token; record each such token only once.
            if at2bt[a_token].last() != Some(&b_token) {
                at2bt[a_token].push(b_token);
            }
        }
    }
    at2bt
}
57 |
58 | /// Returns the tokenizations alignments `a2b` (from `a` to `b`) and `b2a` (from `b` to `a`) based on the shortest edit script (SES).
59 | ///
60 | /// # Examples
61 | ///
62 | /// ```
63 | /// use tokenizations::get_alignments;
64 | ///
65 | /// let a = vec!["New York"];
66 | /// let b = vec!["New", "York"];
67 | /// // calculate the two alignments `a2b` and `b2a` at the same time
68 | /// let (a2b, b2a) = get_alignments(&a, &b);
69 | ///
70 | /// // `a2b[i]` is a set that holds indices `j`s of `b` such that `a[i]` corresponds to `b[j]`
71 | /// assert_eq!(a2b, vec![[0, 1]]);
72 | /// // `b2a` is the inverse of `a2b`
73 | /// assert_eq!(b2a, vec![[0], [0]]);
74 | ///
75 | /// // `get_alignments` can be applied to noisy tokens.
76 | /// let a = vec!["à", "la", "gorge"];
77 | /// let b = vec!["a", "la", "gorge"]; // dropped accent
78 | /// let (a2b, b2a) = get_alignments(&a, &b);
79 | /// assert_eq!(a2b, vec![[0], [1], [2]]);
80 | /// assert_eq!(a2b, vec![[0], [1], [2]]);
81 | /// ```
82 | pub fn get_alignments>(a: &[S], b: &[S]) -> (Alignment, Alignment) {
83 | let a: Vec = a.iter().map(|x| normalize(x.as_ref())).collect();
84 | let b: Vec = b.iter().map(|x| normalize(x.as_ref())).collect();
85 | let ac2t = get_char2token(&a);
86 | let bc2t = get_char2token(&b);
87 | let (a2b, b2a) = seqdiff::diff(
88 | &a.join("").chars().collect::>(),
89 | &b.join("").chars().collect::>(),
90 | );
91 | let at2bt = get_alignment(a.len(), &a2b, &ac2t, &bc2t);
92 | let bt2at = get_alignment(b.len(), &b2a, &bc2t, &ac2t);
93 | (at2bt, bt2at)
94 | }
95 |
96 | /// Returns the character mappings `c_a2b` (from `a` to `b`) and `c_b2a` (from `b` to `a`) based on the shortest edit script (SES).
97 | ///
98 | /// `a` and `b` can be noisy. For example, `bar` and `bår` can be properly compared.
99 | ///
100 | /// # Examples
101 | ///
102 | /// Basic usage:
103 | ///
104 | /// ```
105 | /// use tokenizations::get_charmap;
106 | /// let a = "bar";
107 | /// let b = "bår";
108 | /// let (c_a2b, c_b2a) = get_charmap(a, b);
109 | /// assert_eq!(c_a2b, vec![vec![0], vec![1], vec![2]]);
110 | /// assert_eq!(c_b2a, vec![vec![0], vec![1], vec![2]]);
111 | /// ```
112 | pub fn get_charmap(a: &str, b: &str) -> (CharMap, CharMap) {
113 | let at: Vec = a.chars().map(|x| x.to_string()).collect();
114 | let bt: Vec = b.chars().map(|x| x.to_string()).collect();
115 | get_alignments(&at, &bt)
116 | }
117 |
118 | // Deprecated functions:
119 |
120 | fn _get_charmap(a: &str, b: &str) -> (Diff, Diff) {
121 | let at: Vec = a.chars().map(|x| x.to_string()).collect();
122 | let bt: Vec = b.chars().map(|x| x.to_string()).collect();
123 | let (a2b, b2a) = get_alignments(&at, &bt);
124 | let c_a2b: Diff = a2b.into_iter().map(|x| x.into_iter().next()).collect();
125 | let c_b2a: Diff = b2a.into_iter().map(|x| x.into_iter().next()).collect();
126 | (c_a2b, c_b2a)
127 | }
128 |
/// Returns the half-open character span `(start, end)` of every token within the
/// concatenation of `tokens`.
///
/// Offsets are counted in `char`s, not bytes. An empty token yields a span whose
/// start equals its end.
fn get_span_indices<S: AsRef<str>>(tokens: &[S]) -> Vec<(usize, usize)> {
    let mut start = 0;
    tokens
        .iter()
        .map(|token| {
            let end = start + token.as_ref().chars().count();
            let span = (start, end);
            start = end;
            span
        })
        .collect()
}
140 |
/// Concatenates all `tokens` into a single `String` without any separator.
fn join<S: AsRef<str>>(tokens: &[S]) -> String {
    // Size the buffer up front so appending never reallocates.
    let total_len = tokens.iter().map(|token| token.as_ref().len()).sum();
    let mut text = String::with_capacity(total_len);
    for token in tokens {
        text.push_str(token.as_ref());
    }
    text
}
148 |
149 | #[deprecated(since = "0.5.0", note = "please use `textspan::align_spans` instead")]
150 | pub fn get_original_spans>(
151 | tokens: &[S],
152 | original_text: &str,
153 | ) -> Vec> {
154 | let spans = get_span_indices(tokens);
155 | let text = join(tokens);
156 | let (a2b, b2a) = _get_charmap(&text, original_text);
157 |
158 | let mut ret = vec![];
159 | for (l, r) in spans {
160 | // get the leftmost corresponding char
161 | let mut origl = None;
162 | for &x in a2b[l..r].iter() {
163 | if x != None {
164 | origl = x;
165 | break;
166 | }
167 | }
168 | // get the rightmost corresponding char
169 | let mut origr = a2b[l..r].iter().rev().flatten().next().map(|j| j + 1);
170 | // edge case: a token with empty string
171 | if l == r {
172 | if l >= a2b.len() {
173 | origl = Some(b2a.len());
174 | } else {
175 | origl = a2b[l];
176 | }
177 | origr = origl;
178 | }
179 | ret.push(match (origl, origr) {
180 | (Some(l), Some(r)) => Some((l, r)),
181 | (None, None) => None,
182 | _ => unreachable!(
183 | "Internal error occured in get_original_span\ntokens: {:?}\noriginal_text: {:?}",
184 | tokens.iter().map(|x| x.as_ref()).collect::>(),
185 | original_text
186 | ),
187 | })
188 | }
189 | ret
190 | }
191 |
--------------------------------------------------------------------------------
/note/blog_post.md:
--------------------------------------------------------------------------------
1 | # How to calculate the alignment between BERT and spaCy tokens effectively and robustly
2 |
3 | [](https://tamuhey.github.io/tokenizations/)
4 |
5 | site: https://tamuhey.github.io/tokenizations/
6 |
7 | Natural Language Processing (NLP) has made great progress in recent years because of neural networks, which allows us to solve various tasks with end-to-end architecture.
8 | However, many NLP systems still require language-specific pre- and post-processing, especially for tokenization.
9 | In this article, I describe an algorithm that simplifies one such process: calculating the correspondence between tokens produced by different tokenizers (e.g. BERT vs. spaCy).
10 | And I introduce Python and Rust libraries that implement this algorithm.
11 |
12 | Here is the library and the demo site links:
13 |
14 | - repo: https://github.com/tamuhey/tokenizations
15 | - demo: https://tamuhey.github.io/tokenizations/
16 |
17 | # What is "alignment" of tokens and Why is it necessary?
18 |
19 | Suppose we want to combine a BERT-based named entity recognition (NER) model with a rule-based NER model built on top of spaCy.
20 | Although BERT's NER exhibits [extremely high performance](http://nlpprogress.com/english/named_entity_recognition.html),
21 | it is usually combined with rule-based approaches for practical purposes.
22 | In such cases, what often bothers us is that tokens of spaCy and BERT are different, even if the input sentences are the same.
23 | For example, let's say the input sentence is "John Johanson 's house"; BERT tokenizes this sentence like `["john", "johan", "##son", "'", "s", "house"]` and spaCy tokenizes it like `["John", "Johanson", "'s", "house"]`.
24 | In order to combine the outputs, we need to calculate the correspondence between the two different token sequences.
25 | This correspondence is the "alignment".
26 |
27 | # How to calculate the alignment?
28 |
29 | First, let's sort out the problem.
30 | Looking at the previous example, it can be said that two different token sequences have the following characteristics:
31 |
32 | 1. Split at different offsets
33 | 2. Normalized (e.g. lowercase, unicode normalization, dropping accents...)
34 | 3. Added noise (meta symbol '#' in the previous case)
35 |
36 | If the token sequences differ only in *1.*, it can be easily solved, because we just need to compare the letters in order from the beginning.
37 | In fact, `spacy.gold.align`, which [I implemented previously](https://github.com/explosion/spaCy/pull/4526), is based on this algorithm.
38 |
39 | However, when the features *2.* and *3.* are taken into account, the problem suddenly becomes more difficult.
40 | If you only want to deal with the previous example, it is relatively easy to solve by lowercasing (e.g. A -> a) and removing meta symbols (e.g. "#" -> ""), but this depends on each tokenizer and isn't a general-purpose method.
41 | Of course, we want a generic implementation that **works for any tokenizers**.
42 |
43 | Let's think about how to deal with *2.* and *3.*.
44 |
45 | ## Normalization
46 |
47 | In order to compare letters, we need to normalize the input tokens at first.
48 | This is because even though two letters may look the same, the underlying data may be different.
49 | There are a variety of normalization methods used in NLP. For example:
50 |
51 | - [Unicode normalizations](https://unicode.org/faq/normalization.html)
52 | - Dropping accents ("å" -> "a")
53 | - Lowercasing ("A" -> "a")
54 |
55 | Unicode normalizations are defined in Unicode Standard.
56 | There are 4 types of Unicode normalizations: NFC, NFD, NFKC, NFKD.
57 | Of these, NFKD decomposes letters based on compatibility,
58 | so the number of distinct letter types is the smallest and the probability
59 | of matching is the highest among the four methods (see the [Unicode document](https://unicode.org/faq/normalization.html) for details).
60 | For example, you can detect the letter "a" is a part of "å" with NFKD, but not with NFKC.
61 |
62 | 
63 |
64 | Thus, we first normalize the input tokens in NFKD form.
65 | Then, we lowercase all letters because lowercasing is also often used in NLP.
66 |
67 | ## Compare noisy texts
68 |
69 | Now we can compare almost all tokens thanks to NFKD and lowercasing, but they still contain some noise (e.g. "#"),
70 | so we cannot completely compare all letters in tokens.
71 | How to properly ignore the noises and compare all letters?
72 | I racked my brain for a few days trying to solve this problem.
73 |
74 | Then, I came up with a solution based on a tool that I use every day.
75 | It is **diff**.
76 | diff is a tool that compares two texts and outputs the mismatches.
77 | It is built into `git` as `git diff`, and you can display the character-level correspondence as follows:
78 |
79 | 
80 |
81 | In our case, what we want to know is the agreement part, not the difference, but these are pretty much the same thing.
82 | So, what kind of algorithm is `diff` based on?
83 |
84 | According to the [git diff documentation](https://git-scm.com/docs/git-diff), it is based on [Myers' algorithm](http://www.xmailserver.org/diff2.pdf).
85 | Myers' algorithm is one of the dynamic programming methods that computes the shortest path of what is called edit graph.
86 | It works very fast especially if the difference of the two inputs is small.
87 | For now, what we want to compare are almost identical, so we can get the correspondence of the letters very quickly.
88 |
89 | In short, it turns out that Myers' algorithm helps us to get the correspondence of the letters in two sequences of tokens, while properly ignoring some noise.
90 |
91 | ## Overview of the algorithm
92 |
93 | The considerations so far have shown that suitable normalizations and a character-based diff give us a generic method for computing
94 | the alignment of two token sequences.
95 | Let's summarize the specific steps briefly.
96 |
97 | Let `tokens_a` and `tokens_b` be token sequences of type `List[str]` to be compared. For example, `tokens_a = ["foo", "bar", "baz"]`.
98 |
99 | 1. Normalize all tokens with `NFKD` and lowercasing.
100 |
101 | For example, `"Foo" -> "foo"`
102 |
103 | 2. Concatenate the tokens into one string and let the results be `cat_a` and `cat_b` respectively.
104 |
105 | For example, `cat_a = "".join(tokens_a)` in Python.
106 |
107 | 3. Get the character based diff between the strings `cat_a` and `cat_b`.
108 |
109 | The character based diff can be calculated with [Myers' algorithm](http://www.xmailserver.org/diff2.pdf).
110 |
111 | 4. Convert the character-based diff to a token-based diff.
112 |
113 | This is relatively easy to calculate because we know the mapping between the characters and tokens in step 2.
114 |
115 | # Implementation
116 |
117 | [Here is the repository](https://github.com/tamuhey/tokenizations) that implements this algorithm.
118 | This library, `tokenizations`, is implemented with **Rust** and provides a **Python** binding.
119 |
120 | For example, you can use the Python library as follows:
121 |
122 | ```Python
123 | # `$ pip install pytokenizations` to install the package
124 | import tokenizations
125 |
126 | tokens_a = ["John", "Johanson", "'s", "house"]
127 | tokens_b = ["john", "johan", "##son", "'", "s", "house"]
128 | a2b, b2a = tokenizations.get_alignments(tokens_a, tokens_b)
129 |
130 | for i in range(len(tokens_a)):
131 | print(tokens_a[i])
132 | for j in a2b[i]:
133 | print(" ", tokens_b[j])
134 | ```
135 |
136 | ```
137 | John
138 | john
139 | Johanson
140 | johan
141 | ##son
142 | 's
143 | '
144 | s
145 | house
146 | house
147 | ```
148 |
149 | # Conclusion
150 |
151 | In this article, I introduced an algorithm to align two token sequences that are produced by two different tokenizers.
152 | The title mentions spaCy and BERT, but this algorithm can be applied to any tokenizers.
153 | Also, it can be useful for applying NLP methods to noisy texts that contain, for example, HTML tags:
154 | remove the tags, apply the methods, then calculate the alignment for the output and original text.
155 | Here are the links to the library and demo.
156 |
157 | - repo: https://github.com/tamuhey/tokenizations
158 | - demo: https://tamuhey.github.io/tokenizations/
159 |
--------------------------------------------------------------------------------
/demo/LICENSE_APACHE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
--------------------------------------------------------------------------------
/demo/www/LICENSE-APACHE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------