├── python ├── tests │ ├── __init__.py │ └── test_main.py ├── textspan │ ├── py.typed │ ├── __init__.py │ └── __init__.pyi ├── Makefile ├── Cargo.toml ├── pyproject.toml ├── .gitignore ├── src │ └── lib.rs └── poetry.lock ├── .gitignore ├── rustfmt.toml ├── .github ├── FUNDING.yml └── workflows │ ├── manylinux_build.yml │ └── main.yml ├── .cargo └── config ├── Cargo.toml ├── LICENSE ├── README.md └── src └── lib.rs /python/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/textspan/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | .python-version 4 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | format_code_in_doc_comments = true 2 | wrap_comments = true -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: tamuhey 4 | -------------------------------------------------------------------------------- /.cargo/config: -------------------------------------------------------------------------------- 1 | [target.x86_64-apple-darwin] 2 | rustflags = [ 3 | "-C", "link-arg=-undefined", 4 | "-C", "link-arg=dynamic_lookup", 5 | ] -------------------------------------------------------------------------------- /python/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build test 2 | SO = 
textspan/textspan.cpython-38-darwin.so 3 | ${SO}: src/lib.rs 4 | poetry run maturin develop 5 | test: ${SO} 6 | poetry run pytest tests 7 | build: src/lib.rs textspan/* 8 | poetry run maturin build 9 | -------------------------------------------------------------------------------- /python/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pytextspan" 3 | version = "0.5.7" 4 | authors = ["Yohei Tamura "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | textspanrs = { package = "textspan", version = "0.5.2"} 11 | 12 | [lib] 13 | name = "textspan" 14 | crate-type = ["cdylib"] 15 | 16 | [dependencies.pyo3] 17 | version = "0.17.3" 18 | features = ["extension-module"] 19 | -------------------------------------------------------------------------------- /python/textspan/__init__.py: -------------------------------------------------------------------------------- 1 | from textspan.textspan import ( 2 | get_original_spans, 3 | align_spans_by_mapping, 4 | align_spans, 5 | remove_span_overlaps, 6 | remove_span_overlaps_idx, 7 | lift_spans_index, 8 | lift_span_index, 9 | ) 10 | 11 | __all__ = [ 12 | "get_original_spans", 13 | "align_spans_by_mapping", 14 | "align_spans", 15 | "remove_span_overlaps", 16 | "remove_span_overlaps_idx", 17 | "lift_span_index", 18 | "lift_spans_index", 19 | ] 20 | -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ "maturin",] 3 | build-backend = "maturin" 4 | 5 | [tool.poetry] 6 | name = "pytextspan" 7 | version = "0.5.7" 8 | description = "" 9 | authors = [ "Yohei Tamura ",] 10 | license = "MIT" 11 | [[tool.poetry.packages]] 12 | include = "textspan" 13 | 14 | [tool.versionup] 15 | tag = true 16 | commit = 
true 17 | files = [ "src/lib.rs", "Cargo.toml",] 18 | tag_prefix = "python/" 19 | 20 | [tool.poetry.dependencies] 21 | python = "^3.7" 22 | 23 | [tool.poetry.dev-dependencies] 24 | pytest = "^7.2" 25 | hypothesis = "^6.61.0" 26 | pydoc-md = "^0.1.0" 27 | 28 | [tool.pytest.ini_options] 29 | addopts = "--doctest-glob='*pyi'" 30 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "textspan" 3 | version = "0.5.2" 4 | license = "MIT" 5 | authors = ["Yohei Tamura "] 6 | edition = "2018" 7 | description = "Text span utility" 8 | homepage = "https://github.com/tamuhey/textspan" 9 | repository = "https://github.com/tamuhey/textspan" 10 | keywords = ["nlp", "text", "algorithm"] 11 | readme = "README.md" 12 | documentation = "https://docs.rs/textspan" 13 | 14 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 15 | 16 | [dependencies] 17 | 18 | tokenizations = "0.4.2" 19 | 20 | [dev-dependencies] 21 | quickcheck = "1.0" 22 | quickcheck_macros = "1.0" 23 | proptest = "1.0" 24 | rstest = "0.7.0" 25 | -------------------------------------------------------------------------------- /python/textspan/__init__.pyi: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | def align_spans( 4 | spans: List[Tuple[int, int]], text: str, original_text: str, 5 | ) -> List[List[Tuple[int, int]]]: ... 6 | def align_spans_by_mapping( 7 | spans: List[Tuple[int, int]], mapping: List[List[int]], 8 | ) -> List[List[Tuple[int, int]]]: ... 9 | def get_original_spans( 10 | tokens: List[str], original_text: str, 11 | ) -> List[List[Tuple[int, int]]]: ... 12 | def remove_span_overlaps(tokens: List[Tuple[int, int]]) -> List[Tuple[int, int]]: ... 13 | def remove_span_overlaps_idx(tokens: List[Tuple[int, int]]) -> List[int]: ... 
14 | def lift_span_index(span: Tuple[int, int], target_spans: List[Tuple[int, int]]) -> Tuple[Tuple[int, bool], Tuple[int, bool]]: ... 15 | def lift_spans_index(spans: List[Tuple[int, int]], target_spans: List[Tuple[int, int]]) -> List[Tuple[Tuple[int, bool], Tuple[int, bool]]]: ... 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Yohei Tamura 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /python/tests/test_main.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import strategies as st, given 3 | import textspan 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "spans,text,original_text,expected", 8 | [([(0, 1), (3, 7)], "foobarbaz", "foo bar baz", [[(0, 1)], [(4, 7), (8, 9)]])], 9 | ) 10 | def test_align_spans(spans, text, original_text, expected): 11 | assert textspan.align_spans(spans, text, original_text) == expected 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "spans,mapping,expected", 16 | [ 17 | ( 18 | [(0, 2), (2, 5)], 19 | [[0], [1], [2, 3], [], [5, 7]], 20 | [[(0, 2)], [(2, 4), (5, 6), (7, 8)]], 21 | ) 22 | ], 23 | ) 24 | def test_align_spans_by_mapping(spans, mapping, expected): 25 | assert textspan.align_spans_by_mapping(spans, mapping) == expected 26 | 27 | 28 | @given(st.lists(st.text()), st.text()) 29 | def test_random_get_original_spans(tokens, text): 30 | textspan.get_original_spans(tokens, text) 31 | ret = textspan.get_original_spans(tokens, "".join(tokens)) 32 | assert all(x is not None for x in ret) 33 | 34 | 35 | @pytest.mark.parametrize( 36 | "tokens,text,expected", 37 | [ 38 | (["Hello", "world"], "Hello world", [[(0, 5)], [(6, 11)]]), 39 | (["hello", "``world``"], 'Hello "world"', [[(0, 5)], [(7, 12)]]), 40 | ], 41 | ) 42 | def test_random_get_original_spans2(tokens, text, expected): 43 | ret = textspan.get_original_spans(tokens, text) 44 | assert ret == expected, (tokens, text) 45 | 46 | 47 | @pytest.mark.parametrize( 48 | "span, spans, expected", 49 | [ 50 | ((3, 10), [(0, 3), (3, 4), (4, 9), (9, 12)], ((1, True), (4, False))), 51 | ((0, 18), [(0, 13), (13, 18)], ((0, True), (2, True))), 52 | ], 53 | ) 54 | def test_lift_span_index(span, spans, expected): 55 | assert textspan.lift_span_index(span, spans) == expected 56 | assert textspan.lift_spans_index([span], spans) == 
[expected] 57 | 58 | 59 | def test_remove_span_overlaps(): 60 | spans = [(0, 2), (0, 3), (2, 4), (5, 7)] 61 | assert textspan.remove_span_overlaps(spans) == [(0, 3), (5, 7)] 62 | assert textspan.remove_span_overlaps_idx(spans) == [1, 3] 63 | -------------------------------------------------------------------------------- /.github/workflows/manylinux_build.yml: -------------------------------------------------------------------------------- 1 | name: build manylinux 2 | 3 | on: 4 | push: 5 | 6 | jobs: 7 | build: 8 | if: contains(github.event.head_commit.message, '[skip ci]') == false 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] 13 | container: 14 | image: quay.io/pypa/manylinux2014_x86_64 15 | env: 16 | PATH: /root/.cargo/bin:/root/.local/bin:/opt/python/cp36-cp36m/bin:/opt/python/cp37-cp37m/bin:/opt/python/cp38-cp38/bin:/opt/rh/devtoolset-2/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/python/cp35-cp35m/bin:/opt/python/cp36-cp36m/bin:/opt/python/cp37-cp37m/bin:/opt/python/cp38-cp38/bin:/opt/python/cp39-cp39/bin:/opt/python/cp310-cp310/bin:/opt/python/cp310-cp310/bin:/opt/rh/devtoolset-8/root/usr/bin:/opt/_internal/cpython-3.11.1/bin:/opt/_internal/cpython-3.10.1/bin 17 | options: --user root 18 | env: 19 | HOME: /root 20 | PYTHON: python${{ matrix.python-version }} 21 | steps: 22 | - uses: actions/checkout@v1 23 | - name: Install rust 24 | run: | 25 | curl --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal 26 | - name: install gcc 27 | run: yum -y install gcc 28 | - name: Test rust lib 29 | run: cargo test 30 | 31 | - name: Install dependencies with pip 32 | working-directory: python 33 | run: | 34 | $PYTHON -m pip install --upgrade pip 35 | $PYTHON -m venv .venv 36 | $PYTHON -m pip install poetry maturin 37 | poetry install 38 | poetry run which python 39 | 40 | - name: Build python package 41 | working-directory: python 42 | run: poetry run 
maturin develop 43 | 44 | - name: Test with pytest 45 | working-directory: python 46 | run: poetry run pytest tests 47 | 48 | - name: Install publishment tool 49 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/') 50 | working-directory: python 51 | run: $PYTHON -m pip install twine auditwheel 52 | 53 | - name: Build 54 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/') 55 | working-directory: python 56 | run: | 57 | maturin build --release --strip -i $PYTHON 58 | find target/ -type f -name "*whl" -exec $PYTHON -m auditwheel repair {} \; 59 | 60 | - name: Publish test pypi 61 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/') 62 | working-directory: python 63 | run: | 64 | twine upload target/wheels/*whl --repository-url https://test.pypi.org/legacy/ -u ${{ secrets.TEST_PYPI_USER }} -p ${{ secrets.TEST_PYPI_PASS }} 65 | 66 | - name: Publish pypi 67 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/') 68 | working-directory: python 69 | run: | 70 | twine upload target/wheels/*whl -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASS }} 71 | -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | ### https://raw.github.com/github/gitignore/499ae899e7b54e701e878759f73d9092302fd07a/Python.gitignore 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python 
script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 
93 | #Pipfile.lock 94 | 95 | # celery beat schedule file 96 | celerybeat-schedule 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ 122 | .dmypy.json 123 | dmypy.json 124 | 125 | # Pyre type checker 126 | .pyre/ 127 | 128 | 129 | ### https://raw.github.com/github/gitignore/499ae899e7b54e701e878759f73d9092302fd07a/Rust.gitignore 130 | 131 | # Generated by Cargo 132 | # will have compiled files and executables 133 | /target/ 134 | 135 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 136 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 137 | Cargo.lock 138 | 139 | # These are backup files generated by rustfmt 140 | **/*.rs.bk 141 | 142 | 143 | ### https://raw.github.com/github/gitignore/499ae899e7b54e701e878759f73d9092302fd07a/Rust.gitignore 144 | 145 | # Generated by Cargo 146 | # will have compiled files and executables 147 | /target/ 148 | 149 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 150 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 151 | Cargo.lock 152 | 153 | # These are backup files generated by rustfmt 154 | **/*.rs.bk 155 | 156 | /.venv37 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text span utilities for Rust and Python 2 | 3 | [![creates.io](https://img.shields.io/crates/v/textspan.svg)](https://crates.io/crates/textspan) 4 | [![pypi](https://img.shields.io/pypi/v/pytextspan.svg)](https://pypi.org/project/pytextspan/) 5 
| [![Action Status](https://github.com/tamuhey/textspan/workflows/Test%20and%20Deploy/badge.svg)](https://github.com/tamuhey/textspan/actions) 6 | 7 | - Rust doc: https://docs.rs/textspan 8 | 9 | 10 | ## Usage (Python) 11 | 12 | Install: `pip install pytextspan` 13 | 14 | ### `align_spans` 15 | 16 | ```python 17 | def align_spans(spans: List[Tuple[int, int]], text: str, original_text: str) -> List[List[Tuple[int, int]]]: ... 18 | ``` 19 | 20 | Converts the spans defined in `text` to those defined in `original_text`. 21 | 22 | This is useful, for example, when you want to get the spans in the 23 | original text of spans obtained in the normalized text. 24 | 25 | ```python 26 | >>> import textspan 27 | >>> spans = [(0, 3), (3, 6)]; 28 | >>> text = "foobarbaz"; 29 | >>> original_text = "FOo.BåR baZ"; 30 | >>> textspan.align_spans(spans, text, original_text) 31 | [[(0, 3)], [(4, 7)]] 32 | ``` 33 | 34 | ### `align_spans_by_mapping` 35 | 36 | ```python 37 | def align_spans_by_mapping(spans: List[Tuple[int, int]], mapping: List[List[int]]) -> List[List[Tuple[int, int]]]: ... 38 | ``` 39 | 40 | Converts the spans by the given `mapping`. 41 | 42 | Generally speaking, the character correspondence between two texts is not 43 | necessarily surjective, not injective, not even a mathematical map - 44 | some character in `textA` may not have a correspondence in `textB`, 45 | or may have multiple correspondences in `textB`. Thus, you should 46 | provide `mapping` as `List[List[int]]`. 47 | 48 | ```python 49 | >>> import textspan 50 | >>> spans = [(0, 2), (3, 4)] 51 | >>> mapping = [[0, 1], [], [2], [4, 5, 6]] 52 | >>> textspan.align_spans_by_mapping(spans, mapping) 53 | [[(0, 2)], [(4, 7)]] 54 | ``` 55 | 56 | ### `get_original_spans` 57 | 58 | ```python 59 | def get_original_spans(tokens: List[str], original_text: str) -> List[List[Tuple[int, int]]]: ... 
60 | ``` 61 | 62 | Returns the span indices of `original_text` from the tokens based on the shortest edit script (SES). 63 | 64 | This is useful, for example, when you want to get the spans in the 65 | original text of tokens obtained in the normalized text. 66 | 67 | ```python 68 | >>> import textspan 69 | >>> tokens = ["foo", "bar"] 70 | >>> textspan.get_original_spans(tokens, "FO.o BåR") 71 | [[(0, 2), (3, 4)], [(6, 9)]] 72 | ``` 73 | 74 | ### `lift_span_index` 75 | 76 | ```python 77 | def lift_span_index(span: Tuple[int, int], target_spans: List[Tuple[int, int]]) -> Tuple[Tuple[int, bool], Tuple[int, bool]]: ... 78 | ``` 79 | 80 | Examples: 81 | 82 | ```python 83 | >>> import textspan 84 | >>> spans = [(0, 3), (3, 4), (4, 9), (9, 12)] 85 | >>> assert textspan.lift_span_index((3, 10), spans) == ((1, True), (4, False)) 86 | ``` 87 | 88 | ### `lift_spans_index` 89 | 90 | ```python 91 | def lift_spans_index(spans: List[Tuple[int, int]], target_spans: List[Tuple[int, int]]) -> List[Tuple[Tuple[int, bool], Tuple[int, bool]]]: ... 92 | ``` 93 | 94 | ### `remove_span_overlaps` 95 | 96 | ```python 97 | def remove_span_overlaps(tokens: List[Tuple[int, int]]) -> List[Tuple[int, int]]: ... 98 | ``` 99 | 100 | Removes overlapping spans from the given `spans`. 101 | 102 | First, the longest spans are kept - if two spans overlap, the 103 | first span is kept. If two spans overlap and their start 104 | positions are the same, the longer span is kept. 105 | 106 | ```python 107 | >>> import textspan 108 | >>> spans = [(0, 2), (0, 3), (2, 4), (5, 7)] 109 | >>> assert textspan.remove_span_overlaps(spans) == [(0, 3), (5, 7)] 110 | ``` 111 | 112 | ### `remove_span_overlaps_idx` 113 | 114 | ```python 115 | def remove_span_overlaps_idx(tokens: List[Tuple[int, int]]) -> List[int]: ... 116 | ``` 117 | 118 | Removes overlapping spans from the given `spans` and returns the indices of the remaining spans. 
119 | 120 | First, the longest spans are kept - if two spans overlap, the 121 | first span is kept. If two spans overlap and their start 122 | positions are the same, the longer span is kept. 123 | 124 | ```python 125 | >>> import textspan 126 | >>> spans = [(0, 2), (0, 3), (2, 4), (5, 7)] 127 | >>> assert textspan.remove_span_overlaps_idx(spans) == [1, 3] 128 | ``` 129 | 130 | 131 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Test and Deploy 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - 'README.md' 7 | 8 | jobs: 9 | test: 10 | if: contains(github.event.head_commit.message, '[skip ci]') == false 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] 15 | os: [macos-latest, windows-latest, ubuntu-latest] 16 | steps: 17 | - uses: actions/checkout@v1 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v1 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | 23 | - name: Install latest stable 24 | uses: actions-rs/toolchain@v1 25 | with: 26 | toolchain: stable 27 | override: true 28 | components: rustfmt, clippy 29 | 30 | - name: Lint with RustFmt 31 | uses: actions-rs/cargo@v1 32 | with: 33 | command: fmt 34 | 35 | - name: Lint with Clippy 36 | uses: actions-rs/cargo@v1 37 | with: 38 | command: clippy 39 | args: --all-targets --all-features 40 | 41 | - name: Test with cargo 42 | uses: actions-rs/cargo@v1.0.1 43 | with: 44 | command: test 45 | toolchain: stable 46 | 47 | - name: Install dependencies with pip 48 | working-directory: python 49 | run: | 50 | python -m pip install --upgrade pip 51 | pip install poetry maturin 52 | poetry install 53 | 54 | - name: Build python package 55 | working-directory: python 56 | run: poetry run maturin develop 57 | 58 | - name: Test 
with pytest 59 | working-directory: python 60 | run: poetry run pytest tests 61 | 62 | publish-rust: 63 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/rust/') 64 | needs: test 65 | runs-on: ubuntu-latest 66 | steps: 67 | - uses: actions/checkout@v1 68 | - uses: actions-rs/toolchain@v1 69 | with: 70 | toolchain: stable 71 | override: true 72 | - name: Publish to creates.io 73 | run: | 74 | cargo login ${{ secrets.CRATES_PASS }} 75 | cargo publish 76 | 77 | publish-python-wheels: 78 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/') 79 | needs: test 80 | runs-on: ${{ matrix.os }} 81 | strategy: 82 | matrix: 83 | python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] 84 | # ubuntu wheel is built in `manylinux_build.yml` 85 | os: [macos-latest, windows-latest] 86 | 87 | steps: 88 | - uses: actions/checkout@v1 89 | - uses: actions/setup-python@v1 90 | with: 91 | python-version: ${{ matrix.python-version }} 92 | - uses: actions-rs/toolchain@v1 93 | with: 94 | toolchain: stable 95 | override: true 96 | 97 | - name: Install publishment tool 98 | working-directory: python 99 | run: | 100 | python -m pip install --upgrade pip 101 | pip install maturin twine 102 | 103 | - name: Build 104 | working-directory: python 105 | run: maturin build --release --strip -i python 106 | 107 | - name: Publish test pypi 108 | working-directory: python 109 | run: twine upload target/wheels/*whl --repository-url https://test.pypi.org/legacy/ -u ${{ secrets.TEST_PYPI_USER }} -p ${{ secrets.TEST_PYPI_PASS }} 110 | 111 | - name: Publish pypi 112 | working-directory: python 113 | run: twine upload target/wheels/*whl -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASS }} 114 | 115 | publish-python-sdist: 116 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/python/') 117 | needs: test 118 | runs-on: ubuntu-latest 119 | steps: 120 | - uses: actions/checkout@v1 121 | - uses: actions/setup-python@v1 122 | 
with: 123 | python-version: 3.7 124 | - uses: actions-rs/toolchain@v1 125 | with: 126 | toolchain: stable 127 | override: true 128 | 129 | - name: Install publishment tool 130 | working-directory: python 131 | run: | 132 | python -m pip install --upgrade pip 133 | pip install maturin twine 134 | 135 | - name: Build sdist 136 | working-directory: python 137 | run: maturin sdist 138 | 139 | - name: Publish test pypi 140 | working-directory: python 141 | run: | 142 | twine upload target/wheels/*.tar.gz --repository-url https://test.pypi.org/legacy/ -u ${{ secrets.TEST_PYPI_USER }} -p ${{ secrets.TEST_PYPI_PASS }} 143 | 144 | - name: Publish pypi 145 | working-directory: python 146 | run: | 147 | twine upload target/wheels/*.tar.gz -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASS }} 148 | 149 | -------------------------------------------------------------------------------- /python/src/lib.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use textspanrs::Span; 3 | /// Python bindings for the `textspan` Rust crate. 4 | #[pymodule] 5 | fn textspan(_py: Python, m: &PyModule) -> PyResult<()> { 6 | m.add("__version__", "0.5.7")?; 7 | 8 | /// Converts the spans defined in `text` to those defined in `original_text`. 9 | /// 10 | /// This is useful, for example, when you want to get the spans in the 11 | /// original text of spans obtained in the normalized text. 12 | /// 13 | /// Examples: 14 | /// 15 | /// >>> import textspan 16 | /// >>> spans = [(0, 3), (3, 6)]; 17 | /// >>> text = "foobarbaz"; 18 | /// >>> original_text = "FOo.BåR baZ"; 19 | /// >>> textspan.align_spans(spans, text, original_text) 20 | /// [[(0, 3)], [(4, 7)]] 21 | #[pyfn(m)] 22 | #[pyo3(text_signature = "(spans, text, original_text)")] 23 | pub fn align_spans( 24 | _py: Python, 25 | spans: Vec<Span>, 26 | text: &str, 27 | original_text: &str, 28 | ) -> PyResult<Vec<Vec<Span>>> { 29 | Ok(textspanrs::align_spans(&spans, text, original_text)) 30 | } 31 | 32 | /// Converts the spans by the given `mapping`. 
33 | /// 34 | /// Generally speaking, the character correspondence between two texts is not 35 | /// necessarily surjective, not injective, not even a mathematical map - 36 | /// some character in `textA` may not have a correspondence in `textB`, 37 | /// or may have multiple correspondences in `textB`. Thus, you should 38 | /// provide `mapping` as `List[List[int]]`. 39 | /// 40 | /// Examples: 41 | /// >>> import textspan 42 | /// >>> spans = [(0, 2), (3, 4)] 43 | /// >>> mapping = [[0, 1], [], [2], [4, 5, 6]] 44 | /// >>> textspan.align_spans_by_mapping(spans, mapping) 45 | /// [[(0, 2)], [(4, 7)]] 46 | #[pyfn(m)] 47 | #[pyo3(text_signature = "(spans, mapping)")] 48 | pub fn align_spans_by_mapping( 49 | _py: Python, 50 | spans: Vec<Span>, 51 | mapping: Vec<Vec<usize>>, 52 | ) -> PyResult<Vec<Vec<Span>>> { 53 | Ok(textspanrs::align_spans_by_mapping(&spans, &mapping)) 54 | } 55 | 56 | /// Returns the span indices of `original_text` from the tokens based on the shortest edit script (SES). 57 | /// 58 | /// This is useful, for example, when you want to get the spans in the 59 | /// original text of tokens obtained in the normalized text. 60 | /// 61 | /// Examples: 62 | /// >>> import textspan 63 | /// >>> tokens = ["foo", "bar"] 64 | /// >>> textspan.get_original_spans(tokens, "FO.o BåR") 65 | /// [[(0, 2), (3, 4)], [(6, 9)]] 66 | /// 67 | #[pyfn(m)] 68 | #[pyo3(text_signature = "(tokens, original_text)")] 69 | pub fn get_original_spans( 70 | _py: Python, 71 | tokens: Vec<&str>, 72 | original_text: &str, 73 | ) -> PyResult<Vec<Vec<Span>>> { 74 | Ok(textspanrs::get_original_spans(&tokens, original_text)) 75 | } 76 | 77 | /// Removes overlapping spans from the given `spans`. 78 | /// 79 | /// First, the longest spans are kept - if two spans overlap, the 80 | /// first span is kept. If two spans overlap and their start 81 | /// positions are the same, the longer span is kept. 
82 | /// 83 | /// Examples: 84 | /// >>> import textspan 85 | /// >>> spans = [(0, 2), (0, 3), (2, 4), (5, 7)] 86 | /// >>> assert textspan.remove_span_overlaps(spans) == [(0, 3), (5, 7)] 87 | /// 88 | /// 89 | #[pyfn(m)] 90 | #[pyo3(text_signature = "(spans)")] 91 | pub fn remove_span_overlaps(_py: Python, spans: Vec<Span>) -> PyResult<Vec<Span>> { 92 | Ok(textspanrs::remove_span_overlaps(&spans)) 93 | } 94 | 95 | /// Removes overlapping spans from the given `spans` and returns the indices of the remaining spans. 96 | /// 97 | /// First, the longest spans are kept - if two spans overlap, the 98 | /// first span is kept. If two spans overlap and their start 99 | /// positions are the same, the longer span is kept. 100 | /// 101 | /// Examples: 102 | /// >>> import textspan 103 | /// >>> spans = [(0, 2), (0, 3), (2, 4), (5, 7)] 104 | /// >>> assert textspan.remove_span_overlaps_idx(spans) == [1, 3] 105 | /// 106 | /// 107 | #[pyfn(m)] 108 | #[pyo3(text_signature = "(spans)")] 109 | pub fn remove_span_overlaps_idx(_py: Python, spans: Vec<Span>) -> PyResult<Vec<usize>> { 110 | Ok(textspanrs::remove_span_overlaps_idx(&spans)) 111 | } 112 | /// Flattens a `Result` whose `Ok` and `Err` carry the same type into `(value, is_ok)`. 113 | fn to_tuple<T>(x: Result<T, T>) -> (T, bool) { 114 | match x { 115 | Ok(x) => (x, true), 116 | Err(x) => (x, false), 117 | } 118 | } 119 | 120 | /// Examples: 121 | /// >>> import textspan 122 | /// >>> spans = [(0, 3), (3, 4), (4, 9), (9, 12)] 123 | /// >>> assert textspan.lift_span_index((3, 10), spans) == ((1, True), (4, False)) 124 | #[pyfn(m)] 125 | #[pyo3(text_signature = "(span, target_spans)")] 126 | pub fn lift_span_index( 127 | _py: Python, 128 | span: Span, 129 | target_spans: Vec<Span>, 130 | ) -> PyResult<((usize, bool), (usize, bool))> { 131 | let (l, r) = textspanrs::lift_span_index(span, &target_spans); 132 | Ok((to_tuple(l), to_tuple(r))) 133 | } 134 | /// Applies the same lifting as `lift_span_index` to every span in `spans`. 
135 | #[pyfn(m)] 136 | #[pyo3(text_signature = "(spans, target_spans)")] 137 | pub fn lift_spans_index( 138 | _py: Python, 139 | spans: Vec<Span>, 140 | target_spans: Vec<Span>, 141 | ) -> PyResult<Vec<((usize, bool), (usize, bool))>> { 142 | 
Ok(textspanrs::lift_spans_index(&spans, &target_spans) 143 | .into_iter() 144 | .map(|(l, r)| (to_tuple(l), to_tuple(r))) 145 | .collect()) 146 | } 147 | 148 | Ok(()) 149 | } 150 | -------------------------------------------------------------------------------- /python/poetry.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Poetry and should not be changed by hand. 2 | 3 | [[package]] 4 | name = "attrs" 5 | version = "22.2.0" 6 | description = "Classes Without Boilerplate" 7 | category = "dev" 8 | optional = false 9 | python-versions = ">=3.6" 10 | files = [ 11 | {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, 12 | {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, 13 | ] 14 | 15 | [package.extras] 16 | cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] 17 | dev = ["attrs[docs,tests]"] 18 | docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] 19 | tests = ["attrs[tests-no-zope]", "zope.interface"] 20 | tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] 21 | 22 | [[package]] 23 | name = "colorama" 24 | version = "0.4.6" 25 | description = "Cross-platform colored terminal text." 
26 | category = "dev" 27 | optional = false 28 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 29 | files = [ 30 | {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, 31 | {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, 32 | ] 33 | 34 | [[package]] 35 | name = "docstring-parser" 36 | version = "0.7.3" 37 | description = "" 38 | category = "dev" 39 | optional = false 40 | python-versions = "~=3.5" 41 | files = [ 42 | {file = "docstring_parser-0.7.3.tar.gz", hash = "sha256:cde5fbf8b846433dfbde1e0f96b7f909336a634d5df34a38cb75050c7346734a"}, 43 | ] 44 | 45 | [[package]] 46 | name = "exceptiongroup" 47 | version = "1.1.0" 48 | description = "Backport of PEP 654 (exception groups)" 49 | category = "dev" 50 | optional = false 51 | python-versions = ">=3.7" 52 | files = [ 53 | {file = "exceptiongroup-1.1.0-py3-none-any.whl", hash = "sha256:327cbda3da756e2de031a3107b81ab7b3770a602c4d16ca618298c526f4bec1e"}, 54 | {file = "exceptiongroup-1.1.0.tar.gz", hash = "sha256:bcb67d800a4497e1b404c2dd44fca47d3b7a5e5433dbab67f96c1a685cdfdf23"}, 55 | ] 56 | 57 | [package.extras] 58 | test = ["pytest (>=6)"] 59 | 60 | [[package]] 61 | name = "fire" 62 | version = "0.3.1" 63 | description = "A library for automatically generating command line interfaces." 
64 | category = "dev" 65 | optional = false 66 | python-versions = "*" 67 | files = [ 68 | {file = "fire-0.3.1.tar.gz", hash = "sha256:9736a16227c3d469e5d2d296bce5b4d8fa8d7851e953bda327a455fc2994307f"}, 69 | ] 70 | 71 | [package.dependencies] 72 | six = "*" 73 | termcolor = "*" 74 | 75 | [[package]] 76 | name = "hypothesis" 77 | version = "6.61.0" 78 | description = "A library for property-based testing" 79 | category = "dev" 80 | optional = false 81 | python-versions = ">=3.7" 82 | files = [ 83 | {file = "hypothesis-6.61.0-py3-none-any.whl", hash = "sha256:7bb22d22e35db99d5724bbf5bdc686b46add94a0f228bf1be249c47ec46b9c7f"}, 84 | {file = "hypothesis-6.61.0.tar.gz", hash = "sha256:fbf7da30aea839d88898f74bcc027f0f997060498a8a7605880688c8a2166215"}, 85 | ] 86 | 87 | [package.dependencies] 88 | attrs = ">=19.2.0" 89 | exceptiongroup = {version = ">=1.0.0", markers = "python_version < \"3.11\""} 90 | sortedcontainers = ">=2.1.0,<3.0.0" 91 | 92 | [package.extras] 93 | all = ["backports.zoneinfo (>=0.2.1)", "black (>=19.10b0)", "click (>=7.0)", "django (>=3.2)", "dpcontracts (>=0.4)", "importlib-metadata (>=3.6)", "lark (>=0.10.1)", "libcst (>=0.3.16)", "numpy (>=1.9.0)", "pandas (>=1.0)", "pytest (>=4.6)", "python-dateutil (>=1.4)", "pytz (>=2014.1)", "redis (>=3.0.0)", "rich (>=9.0.0)", "tzdata (>=2022.7)"] 94 | cli = ["black (>=19.10b0)", "click (>=7.0)", "rich (>=9.0.0)"] 95 | codemods = ["libcst (>=0.3.16)"] 96 | dateutil = ["python-dateutil (>=1.4)"] 97 | django = ["django (>=3.2)"] 98 | dpcontracts = ["dpcontracts (>=0.4)"] 99 | ghostwriter = ["black (>=19.10b0)"] 100 | lark = ["lark (>=0.10.1)"] 101 | numpy = ["numpy (>=1.9.0)"] 102 | pandas = ["pandas (>=1.0)"] 103 | pytest = ["pytest (>=4.6)"] 104 | pytz = ["pytz (>=2014.1)"] 105 | redis = ["redis (>=3.0.0)"] 106 | zoneinfo = ["backports.zoneinfo (>=0.2.1)", "tzdata (>=2022.7)"] 107 | 108 | [[package]] 109 | name = "importlib-metadata" 110 | version = "6.0.0" 111 | description = "Read metadata from Python 
packages" 112 | category = "dev" 113 | optional = false 114 | python-versions = ">=3.7" 115 | files = [ 116 | {file = "importlib_metadata-6.0.0-py3-none-any.whl", hash = "sha256:7efb448ec9a5e313a57655d35aa54cd3e01b7e1fbcf72dce1bf06119420f5bad"}, 117 | {file = "importlib_metadata-6.0.0.tar.gz", hash = "sha256:e354bedeb60efa6affdcc8ae121b73544a7aa74156d047311948f6d711cd378d"}, 118 | ] 119 | 120 | [package.dependencies] 121 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 122 | zipp = ">=0.5" 123 | 124 | [package.extras] 125 | docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] 126 | perf = ["ipython"] 127 | testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] 128 | 129 | [[package]] 130 | name = "iniconfig" 131 | version = "1.1.1" 132 | description = "iniconfig: brain-dead simple config-ini parsing" 133 | category = "dev" 134 | optional = false 135 | python-versions = "*" 136 | files = [ 137 | {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, 138 | {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, 139 | ] 140 | 141 | [[package]] 142 | name = "loguru" 143 | version = "0.5.3" 144 | description = "Python logging made (stupidly) simple" 145 | category = "dev" 146 | optional = false 147 | python-versions = ">=3.5" 148 | files = [ 149 | {file = "loguru-0.5.3-py3-none-any.whl", hash = "sha256:f8087ac396b5ee5f67c963b495d615ebbceac2796379599820e324419d53667c"}, 150 | {file = "loguru-0.5.3.tar.gz", hash = "sha256:b28e72ac7a98be3d28ad28570299a393dfcd32e5e3f6a353dec94675767b6319"}, 151 | ] 152 | 153 | 
[package.dependencies] 154 | colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""} 155 | win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} 156 | 157 | [package.extras] 158 | dev = ["Sphinx (>=2.2.1)", "black (>=19.10b0)", "codecov (>=2.0.15)", "colorama (>=0.3.4)", "flake8 (>=3.7.7)", "isort (>=5.1.1)", "pytest (>=4.6.2)", "pytest-cov (>=2.7.1)", "sphinx-autobuild (>=0.7.1)", "sphinx-rtd-theme (>=0.4.3)", "tox (>=3.9.0)", "tox-travis (>=0.12)"] 159 | 160 | [[package]] 161 | name = "packaging" 162 | version = "22.0" 163 | description = "Core utilities for Python packages" 164 | category = "dev" 165 | optional = false 166 | python-versions = ">=3.7" 167 | files = [ 168 | {file = "packaging-22.0-py3-none-any.whl", hash = "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3"}, 169 | {file = "packaging-22.0.tar.gz", hash = "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3"}, 170 | ] 171 | 172 | [[package]] 173 | name = "pluggy" 174 | version = "1.0.0" 175 | description = "plugin and hook calling mechanisms for python" 176 | category = "dev" 177 | optional = false 178 | python-versions = ">=3.6" 179 | files = [ 180 | {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, 181 | {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, 182 | ] 183 | 184 | [package.dependencies] 185 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 186 | 187 | [package.extras] 188 | dev = ["pre-commit", "tox"] 189 | testing = ["pytest", "pytest-benchmark"] 190 | 191 | [[package]] 192 | name = "pydoc-md" 193 | version = "0.1.0" 194 | description = "" 195 | category = "dev" 196 | optional = false 197 | python-versions = ">=3.6,<4.0" 198 | files = [ 199 | {file = "pydoc-md-0.1.0.tar.gz", hash = 
"sha256:5b64025c83d9230438bab6019ce570876a15a77e7f6bb83e0d1320ed34492700"}, 200 | {file = "pydoc_md-0.1.0-py3-none-any.whl", hash = "sha256:b2bcf3df7aa51d0bb40db9a67e5607caba64b7cdb1d791f40d2d7ef372449f85"}, 201 | ] 202 | 203 | [package.dependencies] 204 | docstring_parser = ">=0.7.2,<0.8.0" 205 | fire = ">=0.3.1,<0.4.0" 206 | loguru = ">=0.5.1,<0.6.0" 207 | 208 | [[package]] 209 | name = "pytest" 210 | version = "7.2.0" 211 | description = "pytest: simple powerful testing with Python" 212 | category = "dev" 213 | optional = false 214 | python-versions = ">=3.7" 215 | files = [ 216 | {file = "pytest-7.2.0-py3-none-any.whl", hash = "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"}, 217 | {file = "pytest-7.2.0.tar.gz", hash = "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59"}, 218 | ] 219 | 220 | [package.dependencies] 221 | attrs = ">=19.2.0" 222 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 223 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} 224 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 225 | iniconfig = "*" 226 | packaging = "*" 227 | pluggy = ">=0.12,<2.0" 228 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} 229 | 230 | [package.extras] 231 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] 232 | 233 | [[package]] 234 | name = "six" 235 | version = "1.16.0" 236 | description = "Python 2 and 3 compatibility utilities" 237 | category = "dev" 238 | optional = false 239 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 240 | files = [ 241 | {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, 242 | {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, 243 | ] 244 | 245 | [[package]] 246 | name = 
"sortedcontainers" 247 | version = "2.4.0" 248 | description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" 249 | category = "dev" 250 | optional = false 251 | python-versions = "*" 252 | files = [ 253 | {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, 254 | {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, 255 | ] 256 | 257 | [[package]] 258 | name = "termcolor" 259 | version = "2.1.1" 260 | description = "ANSI color formatting for output in terminal" 261 | category = "dev" 262 | optional = false 263 | python-versions = ">=3.7" 264 | files = [ 265 | {file = "termcolor-2.1.1-py3-none-any.whl", hash = "sha256:fa852e957f97252205e105dd55bbc23b419a70fec0085708fc0515e399f304fd"}, 266 | {file = "termcolor-2.1.1.tar.gz", hash = "sha256:67cee2009adc6449c650f6bcf3bdeed00c8ba53a8cda5362733c53e0a39fb70b"}, 267 | ] 268 | 269 | [package.extras] 270 | tests = ["pytest", "pytest-cov"] 271 | 272 | [[package]] 273 | name = "tomli" 274 | version = "2.0.1" 275 | description = "A lil' TOML parser" 276 | category = "dev" 277 | optional = false 278 | python-versions = ">=3.7" 279 | files = [ 280 | {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, 281 | {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, 282 | ] 283 | 284 | [[package]] 285 | name = "typing-extensions" 286 | version = "4.4.0" 287 | description = "Backported and Experimental Type Hints for Python 3.7+" 288 | category = "dev" 289 | optional = false 290 | python-versions = ">=3.7" 291 | files = [ 292 | {file = "typing_extensions-4.4.0-py3-none-any.whl", hash = "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"}, 293 | {file = "typing_extensions-4.4.0.tar.gz", hash = 
"sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa"}, 294 | ] 295 | 296 | [[package]] 297 | name = "win32-setctime" 298 | version = "1.1.0" 299 | description = "A small Python utility to set file creation time on Windows" 300 | category = "dev" 301 | optional = false 302 | python-versions = ">=3.5" 303 | files = [ 304 | {file = "win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad"}, 305 | {file = "win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2"}, 306 | ] 307 | 308 | [package.extras] 309 | dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"] 310 | 311 | [[package]] 312 | name = "zipp" 313 | version = "3.11.0" 314 | description = "Backport of pathlib-compatible object wrapper for zip files" 315 | category = "dev" 316 | optional = false 317 | python-versions = ">=3.7" 318 | files = [ 319 | {file = "zipp-3.11.0-py3-none-any.whl", hash = "sha256:83a28fcb75844b5c0cdaf5aa4003c2d728c77e05f5aeabe8e95e56727005fbaa"}, 320 | {file = "zipp-3.11.0.tar.gz", hash = "sha256:a7a22e05929290a67401440b39690ae6563279bced5f314609d9d03798f56766"}, 321 | ] 322 | 323 | [package.extras] 324 | docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)"] 325 | testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] 326 | 327 | [metadata] 328 | lock-version = "2.0" 329 | python-versions = "^3.7" 330 | content-hash = "89365fe557cd87888c4381f25176c19c8bae28c217c89aaac2db05b33cc85f79" 331 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![deny(clippy::all)] 2 | #[cfg(test)] 3 | 
extern crate quickcheck; 4 | #[cfg(test)] 5 | #[macro_use(quickcheck)] 6 | extern crate quickcheck_macros; 7 | use std::borrow::Borrow; 8 | use std::convert::AsRef; 9 | 10 | pub type Span = (usize, usize); 11 | 12 | fn get_span_indices>(tokens: &[S]) -> Vec { 13 | tokens 14 | .iter() 15 | .scan(0, |state, token| { 16 | let l = *state; 17 | let r = l + token.borrow().chars().count(); 18 | *state = r; 19 | Some((l, r)) 20 | }) 21 | .collect() 22 | } 23 | 24 | /// Returns the span indices of `original_text` from the tokens based on the 25 | /// shortest edit script (SES). 26 | /// 27 | /// This is useful, for example, when you want to get the spans in the original 28 | /// text of tokens obtained in the normalized text. 29 | /// 30 | /// # Examples 31 | /// 32 | /// ``` 33 | /// let tokens = vec!["a", "la", "gorge"]; 34 | /// let original_text = "à LA gorge"; 35 | /// let spans = textspan::get_original_spans(&tokens, original_text); 36 | /// assert_eq!(spans, vec![vec![(0, 1)], vec![(3, 5)], vec![(9, 14)]]); 37 | /// ``` 38 | pub fn get_original_spans>(tokens: &[S], original_text: &str) -> Vec> { 39 | let spans = get_span_indices(tokens); 40 | let text = tokens.join(""); 41 | align_spans(&spans, &text, original_text) 42 | } 43 | 44 | /// Converts the spans defined in `text` to those defined in `original_text`. 45 | /// 46 | /// This is useful, for example, when you want to get the spans in the original 47 | /// text of spans obtained in the normalized text. 
48 | /// 49 | /// # Examples 50 | /// 51 | /// ``` 52 | /// let spans = [(0, 3), (3, 6)]; 53 | /// let text = "foobarbaz"; 54 | /// let original_text = "FOo.BåR baZ"; 55 | /// assert_eq!( 56 | /// textspan::align_spans(&spans, text, original_text), 57 | /// [[(0, 3)], [(4, 7)]] 58 | /// ) 59 | /// ``` 60 | pub fn align_spans(spans: &[Span], text: &str, original_text: &str) -> Vec> { 61 | let (mapping, _) = tokenizations::get_charmap(text, original_text); 62 | align_spans_by_mapping(spans, &mapping) 63 | } 64 | 65 | /// Converts the spans by the given `mapping`. 66 | /// Generally speaking, the character correspondence between two texts is not 67 | /// necessarily surjective, not injective, not even a methematical map - some 68 | /// character in `textA` may not have a correspondence in `textB`, or may have 69 | /// multiple correspondences in `textB`. Thus, `mapping` should be provided as 70 | /// `Vec>`. 71 | /// 72 | /// # Examples 73 | /// 74 | /// ``` 75 | /// let spans = [(0, 2), (3, 4)]; 76 | /// let mapping = [vec![0, 1], vec![], vec![2], vec![4, 5, 6]]; 77 | /// assert_eq!( 78 | /// textspan::align_spans_by_mapping(&spans, &mapping), 79 | /// [[(0, 2)], [(4, 7)]] 80 | /// ) 81 | /// ``` 82 | pub fn align_spans_by_mapping>(spans: &[Span], mapping: &[T]) -> Vec> { 83 | let mut ret = vec![]; 84 | for &(start, end) in spans { 85 | let mut l = None; 86 | let mut r = None; 87 | let mut prevy: Option = None; 88 | let mut pret = vec![]; 89 | for item in mapping.iter().take(end).skip(start) { 90 | for &y in item.as_ref() { 91 | if prevy.is_some() && prevy.unwrap() + 1 < y { 92 | pret.push((l.unwrap(), r.unwrap())); 93 | l = None; 94 | } else { 95 | r = Some(y + 1); 96 | } 97 | if l.is_none() { 98 | l = Some(y); 99 | r = Some(y + 1); 100 | } 101 | prevy = Some(y); 102 | } 103 | } 104 | if let Some(l) = l { 105 | pret.push((l, r.unwrap())); 106 | } 107 | ret.push(pret) 108 | } 109 | ret 110 | } 111 | 112 | /// Remove overlapping spans from given `spans`. 
113 | /// First, longest spans are remained - if the two spans are overlapped, the 114 | /// first span will be remained. If the two spans are overlapped and their start 115 | /// positions are same, the longer span will be remained. 116 | /// 117 | /// # Example 118 | /// 119 | /// ``` 120 | /// use textspan::remove_span_overlaps; 121 | /// let spans = [(0, 2), (0, 3), (2, 4), (5, 7)]; 122 | /// let ret = remove_span_overlaps(&spans); 123 | /// assert_eq!(ret, [(0, 3), (5, 7)]); 124 | /// ``` 125 | pub fn remove_span_overlaps(spans: &[Span]) -> Vec { 126 | let mut spans = spans.to_vec(); 127 | spans.sort_by_key(|x| (x.0, !0 - x.1)); // to take first longest spans 128 | let mut ret = vec![]; 129 | let mut cur = 0; 130 | for &(l, r) in &spans { 131 | if l < cur { 132 | continue; 133 | } 134 | ret.push((l, r)); 135 | cur = r; 136 | } 137 | ret 138 | } 139 | 140 | /// Remove overlapping spans from given `spans`, and returns remained span indices. 141 | /// First, longest spans are remained - if the two spans are overlapped, the 142 | /// first span will be remained. If the two spans are overlapped and their start 143 | /// positions are same, the longer span will be remained. 144 | /// 145 | /// # Example 146 | /// 147 | /// ``` 148 | /// use textspan::remove_span_overlaps_idx; 149 | /// let spans = [(0, 2), (0, 3), (2, 4), (5, 7)]; 150 | /// let ret = remove_span_overlaps_idx(&spans); 151 | /// assert_eq!(ret, [1, 3]); 152 | /// ``` 153 | pub fn remove_span_overlaps_idx(spans: &[Span]) -> Vec { 154 | let mut indices: Vec<_> = (0..spans.len()).collect(); 155 | indices.sort_by_key(|&i| { 156 | let (l, r) = spans[i]; 157 | (l, !0 - r) 158 | }); 159 | let mut ret = vec![]; 160 | let mut cur = 0; 161 | for i in indices { 162 | let (l, r) = spans[i]; 163 | if l < cur { 164 | continue; 165 | } 166 | ret.push(i); 167 | cur = r; 168 | } 169 | ret 170 | } 171 | 172 | /// Convert `span` indices to `target_spans` based indices. 
173 | /// Expects `target_spans` is sorted and not overlapping. 174 | /// 175 | /// # Example 176 | /// 177 | /// ``` 178 | /// use textspan::lift_span_index; 179 | /// let target_spans = [(0, 3), (3, 4), (4, 9), (9, 12)]; 180 | /// assert_eq!(lift_span_index((0, 3), &target_spans), (Ok(0), Ok(1))); 181 | /// assert_eq!(lift_span_index((0, 4), &target_spans), (Ok(0), Ok(2))); 182 | /// assert_eq!(lift_span_index((1, 4), &target_spans), (Err(0), Ok(2))); 183 | /// assert_eq!(lift_span_index((1, 5), &target_spans), (Err(0), Err(3))); 184 | /// assert_eq!(lift_span_index((1, 9), &target_spans), (Err(0), Ok(3))); 185 | /// assert_eq!(lift_span_index((0, 9), &target_spans), (Ok(0), Ok(3))); 186 | /// assert_eq!(lift_span_index((1, 13), &target_spans), (Err(0), Err(4))); 187 | /// 188 | /// let target_spans = [(3, 4), (4, 9), (9, 12)]; 189 | /// assert_eq!(lift_span_index((0, 9), &target_spans), (Err(0), Ok(2))); 190 | /// 191 | /// assert_eq!(lift_span_index((0, 0), &[(0, 0)]), (Ok(0), Ok(1))); 192 | /// assert_eq!(lift_span_index((0, 0), &[]), (Err(0), Err(0))); 193 | pub fn lift_span_index( 194 | span: Span, 195 | target_spans: &[Span], 196 | ) -> (Result, Result) { 197 | if target_spans.is_empty() { 198 | return (Err(0), Err(0)); 199 | } 200 | let (l, r) = span; 201 | // i = max i where l >= ri-1 202 | // if li == l Ok, else Err 203 | let li = { 204 | if target_spans[0].0 == l { 205 | Ok(0) 206 | } else if target_spans[0].1 > l { 207 | Err(0) 208 | } else { 209 | let mut ok = target_spans.len(); 210 | let mut ng = 0; 211 | while ok - ng > 1 { 212 | let m = (ok + ng) / 2; 213 | if target_spans[m].1 > l { 214 | ok = m; 215 | } else { 216 | ng = m; 217 | } 218 | } 219 | if ok < target_spans.len() && target_spans[ok].0 == l { 220 | Ok(ok) 221 | } else { 222 | Err(ok) 223 | } 224 | } 225 | }; 226 | // i = min(r <= l_i) 227 | // if ri-1 == r Ok, else Err 228 | let ri = { 229 | if target_spans[0].1 == r { 230 | Ok(1) 231 | } else if target_spans[0].0 >= r { 232 | Err(0) 233 
| } else { 234 | let mut ok = target_spans.len(); 235 | let mut ng = 0; 236 | while ok - ng > 1 { 237 | let m = (ok + ng) / 2; 238 | if target_spans[m].0 >= r { 239 | ok = m; 240 | } else { 241 | ng = m; 242 | } 243 | } 244 | if ok > 0 && target_spans[ok - 1].1 == r { 245 | Ok(ok) 246 | } else { 247 | Err(ok) 248 | } 249 | } 250 | }; 251 | (li, ri) 252 | } 253 | 254 | /// Convert `spans` indices on `target_spans` 255 | /// 256 | /// # Example 257 | /// 258 | /// ``` 259 | /// use textspan::lift_spans_index; 260 | /// let target_spans = [(3, 5), (5, 9), (11, 15)]; 261 | /// 262 | /// assert_eq!(lift_spans_index(&[(3, 9)], &target_spans), &[(Ok(0), Ok(2))]); 263 | /// ``` 264 | pub fn lift_spans_index( 265 | spans: &[Span], 266 | target_spans: &[Span], 267 | ) -> Vec<(Result, Result)> { 268 | let mut ret = vec![]; 269 | let mut cur = 0usize; 270 | for &(l, r) in spans { 271 | // i = argmin(l < ri) 272 | while cur < target_spans.len() && target_spans[cur].1 <= l { 273 | cur += 1; 274 | } 275 | let li = if cur < target_spans.len() && target_spans[cur].0 == l { 276 | Ok(cur) 277 | } else { 278 | Err(cur) 279 | }; 280 | // i = argmin(r <= l_i) 281 | let mut cur = cur; 282 | while cur < target_spans.len() && target_spans[cur].0 < r { 283 | cur += 1; 284 | } 285 | let ri = if cur > 0 && target_spans[cur - 1].1 == r { 286 | Ok(cur) 287 | } else { 288 | Err(cur) 289 | }; 290 | ret.push((li, ri)); 291 | } 292 | ret 293 | } 294 | 295 | #[cfg(test)] 296 | mod tests { 297 | use super::*; 298 | use proptest::collection as pc; 299 | use proptest::prelude::*; 300 | use proptest::strategy::Strategy; 301 | use rstest::*; 302 | #[quickcheck] 303 | fn test_lift_spans_index(spans: Vec, target_spans: Vec) { 304 | let sanitize = |spans: Vec| { 305 | let mut v = vec![]; 306 | for (l, r) in spans { 307 | if l == r { 308 | continue; 309 | } 310 | if l > r { 311 | v.push((r, l)); 312 | } else { 313 | v.push((l, r)); 314 | } 315 | } 316 | let mut spans = remove_span_overlaps(&v); 317 | 
spans.sort_unstable(); 318 | spans 319 | }; 320 | let spans = sanitize(spans); 321 | let target_spans = sanitize(target_spans); 322 | assert_eq!( 323 | lift_spans_index(&spans, &target_spans), 324 | spans 325 | .iter() 326 | .cloned() 327 | .map(|span| lift_span_index(span, &target_spans)) 328 | .collect::>(), 329 | "\nspans: {:?}\ntarget_spans: {:?}\n", 330 | spans, 331 | target_spans 332 | ); 333 | } 334 | #[quickcheck] 335 | fn remove_span_overlaps_quick(spans: Vec) { 336 | let new_spans = remove_span_overlaps(&spans); 337 | let mut cur = 0; 338 | for &(l, r) in &new_spans { 339 | assert!(l >= cur); 340 | cur = r; 341 | } 342 | let indices = remove_span_overlaps_idx(&spans); 343 | let new_spans2: Vec<_> = indices.into_iter().map(|i| spans[i]).collect(); 344 | assert_eq!(new_spans, new_spans2); 345 | } 346 | #[test] 347 | fn align_spans_handmade() { 348 | for (case, expected) in vec![ 349 | ((vec![], "", ""), vec![]), 350 | ( 351 | (vec![(1, 4)], "foobar", "foo.bar"), 352 | vec![vec![(1, 3), (4, 5)]], 353 | ), 354 | ((vec![(0, 1)], "foo", "oo"), vec![vec![]]), 355 | ((vec![(0, 3)], "foo", "fo0o"), vec![vec![(0, 2), (3, 4)]]), 356 | ] 357 | .iter() 358 | { 359 | let (spans, text, original_text) = case; 360 | assert_eq!(align_spans(spans, text, original_text), *expected); 361 | } 362 | } 363 | 364 | #[test] 365 | fn align_spans_by_mapping_handmade() { 366 | for (case, expected) in vec![ 367 | ((vec![], vec![]), vec![]), 368 | ( 369 | ( 370 | vec![(1, 3), (4, 6)], 371 | vec![ 372 | vec![0], 373 | vec![1], 374 | vec![2], 375 | vec![5], 376 | vec![6], 377 | vec![9], 378 | vec![10], 379 | ], 380 | ), 381 | vec![vec![(1, 3)], vec![(6, 7), (9, 10)]], 382 | ), 383 | ] 384 | .iter() 385 | { 386 | let (spans, mapping) = case; 387 | assert_eq!(align_spans_by_mapping(spans, mapping), *expected); 388 | } 389 | } 390 | 391 | fn cases_align_spans_by_mapping( 392 | max_length: usize, 393 | ) -> impl Strategy>)> { 394 | pc::vec((0..4usize, 0..5usize), 0..max_length) 395 | 
.prop_map(|v| { 396 | v.iter() 397 | .scan(0, |s, (d, n)| { 398 | *s += d; 399 | let v: Vec<_> = (*s..(*s + n)).collect(); 400 | if *n > 0 { 401 | *s += n - 1; 402 | } 403 | Some(v) 404 | }) 405 | .collect() 406 | }) 407 | .prop_flat_map(|v: Vec>| { 408 | let l = v.len(); 409 | ((0..=l, 0..=l), Just(v)) 410 | }) 411 | } 412 | 413 | fn check_align(span: Span, mapping: &[Vec], ret: &[Vec]) { 414 | let (start, end) = span; 415 | if start >= end { 416 | assert_eq!(ret[0], vec![]) 417 | } else { 418 | if ret[0].is_empty() { 419 | assert_eq!(mapping[start], vec![]); 420 | assert_eq!(mapping[end - 1], vec![]); 421 | return; 422 | } 423 | let mut cur = None; 424 | for spans in ret { 425 | for (start, end) in spans { 426 | if let Some(_cur) = cur { 427 | assert!(start - _cur > 0); 428 | } 429 | cur = Some(end); 430 | } 431 | } 432 | let rev = |x: usize| mapping.iter().position(|y| y.contains(&x)).unwrap(); 433 | let l = rev(ret[0][0].0); 434 | assert!( 435 | mapping[start].is_empty() || mapping[l].iter().any(|x| mapping[start].contains(x)), 436 | "compare start. 437 | ret: {:?} 438 | l : {} 439 | ", 440 | ret, 441 | l 442 | ); 443 | 444 | let r = rev(ret[ret.len() - 1][ret[0].len() - 1].1 - 1); 445 | assert!( 446 | mapping[end - 1].is_empty() 447 | || mapping[end - 1].iter().any(|x| mapping[r].contains(x)), 448 | "compare end 449 | ret: {:?} 450 | r : {} 451 | ", 452 | ret, 453 | r 454 | ); 455 | } 456 | } 457 | proptest! 
{ 458 | #[test] 459 | fn align_spans_by_mapping_proptest((span, mapping) in cases_align_spans_by_mapping(1000)) { 460 | let ret = align_spans_by_mapping(&[span], &mapping); 461 | check_align(span, &mapping, &ret); 462 | } 463 | } 464 | 465 | #[quickcheck] 466 | fn get_original_spans_for_clean_text_quickcheck(tokens: Vec) -> bool { 467 | let spans = get_span_indices(&tokens); 468 | let output = get_original_spans(&tokens, &tokens.join("")) 469 | .iter() 470 | .scan(0, |s, x| { 471 | if let Some(&p) = x.first() { 472 | *s = p.1; 473 | Some(p) 474 | } else { 475 | Some((*s, *s)) 476 | } 477 | }) 478 | .collect::>(); 479 | spans == output 480 | } 481 | 482 | #[rstest(input, expected, 483 | case( 484 | (vec!["fあo①が", "bar"], "fあo1かbar"), 485 | vec![vec![(0, 5)], vec![(5, 8)]], 486 | ), 487 | case((vec!["New York"], "NewYork"), vec![vec![(0, 7)]]), 488 | case( 489 | (vec!["A'B", "", ""], "A B"), 490 | vec![vec![(0, 1), (2, 3)], vec![], vec![]], 491 | ), 492 | case( 493 | (vec!["A'b", ""], "a b"), 494 | vec![vec![(0, 1), (2, 3)], vec![]], 495 | ), 496 | case((vec!["", "", ""], ""), vec![vec![], vec![], vec![]]), 497 | case( 498 | (vec!["hello", "``world``"], "Hello \"world\""), 499 | vec![vec![(0, 5)], vec![(7, 12)]], 500 | ), 501 | case( 502 | (vec!["à", " ", "", "la", "gorge", ""], "a lagorge"), 503 | vec![ 504 | vec![(0, 1)], 505 | vec![(5, 6)], 506 | vec![], 507 | vec![(6, 8)], 508 | vec![(8, 13)], 509 | vec![], 510 | ], 511 | ), 512 | )] 513 | fn hm_get_original_spans(input: (Vec<&str>, &str), expected: Vec>) { 514 | assert_eq!( 515 | get_original_spans(&input.0, input.1), 516 | expected, 517 | "{:?}", 518 | input 519 | ); 520 | } 521 | } 522 | --------------------------------------------------------------------------------