├── .github └── workflows │ ├── build.yml │ └── deploy.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── build.rs ├── cortex.yaml ├── images └── logo.png ├── poetry.lock ├── pyproject.toml ├── pysubstringsearch ├── __init__.py ├── py.typed └── pysubstringsearch.pyi ├── src ├── lib.rs └── libsais │ ├── libsais.c │ └── libsais.h └── tests ├── __init__.py └── test_pysubstringsearch.py /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | lint: 7 | if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags') 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout 11 | uses: actions/checkout@v3 12 | - name: Install latest rust 13 | uses: actions-rs/toolchain@v1 14 | with: 15 | toolchain: stable 16 | profile: minimal 17 | override: true 18 | components: clippy 19 | - name: Lint with clippy 20 | uses: actions-rs/cargo@v1 21 | with: 22 | command: clippy 23 | args: --all-targets --all-features 24 | test: 25 | runs-on: ${{ matrix.os }} 26 | needs: lint 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | python-version: 31 | - '3.7' 32 | - '3.8' 33 | - '3.9' 34 | - '3.10' 35 | - '3.11' 36 | os: 37 | - ubuntu-latest 38 | - macos-latest 39 | - windows-latest 40 | steps: 41 | - name: Checkout 42 | uses: actions/checkout@v3 43 | - name: Set up Python ${{ matrix.python-version }} 44 | uses: actions/setup-python@v3 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | - name: Install Poetry 48 | uses: abatilo/actions-poetry@v2.1.3 49 | - name: Install Rust 50 | uses: actions-rs/toolchain@v1 51 | with: 52 | profile: minimal 53 | toolchain: stable 54 | override: true 55 | - name: Install dependencies 56 | run: poetry install 57 | - name: Build Python package 58 | run: poetry run maturin develop 59 | - name: Test 60 | run: poetry run pytest -Werror tests 61 | 
-------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | on: 3 | release: 4 | types: 5 | - released 6 | jobs: 7 | deploy: 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: 13 | - '3.7' 14 | - '3.8' 15 | - '3.9' 16 | - '3.10' 17 | - '3.11' 18 | os: 19 | - ubuntu-latest 20 | - macos-latest 21 | - windows-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v3 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install Rust 30 | uses: actions-rs/toolchain@v1 31 | with: 32 | profile: minimal 33 | toolchain: stable 34 | override: true 35 | - name: Install Cross-compilers (macOS) 36 | if: matrix.os == 'macos-latest' 37 | run: | 38 | rustup target add x86_64-apple-darwin 39 | rustup target add aarch64-apple-darwin 40 | - name: Publish Package 41 | uses: PyO3/maturin-action@v1 42 | with: 43 | command: publish 44 | args: --username=__token__ ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.7' && '' || '--no-sdist' }} --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }} 45 | env: 46 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 47 | if: matrix.os != 'macos-latest' 48 | - name: Publish macOS (x86_64) Package 49 | if: matrix.os == 'macos-latest' 50 | uses: PyO3/maturin-action@v1 51 | with: 52 | command: publish 53 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=x86_64-apple-darwin --no-sdist 54 | env: 55 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 56 | - name: Publish macOS (arm64) Package 57 | if: matrix.os == 'macos-latest' 58 | uses: PyO3/maturin-action@v1 59 | with: 60 | command: publish 61 | args: --username=__token__ --interpreter=python${{ 
matrix.python-version }} --target=aarch64-apple-darwin --no-sdist 62 | env: 63 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include 
Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | .gitignore 140 | .gitignore 141 | 142 | # Generated by Cargo 143 | # will have compiled files and executables 144 | debug/ 145 | target/ 146 | 147 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 148 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 149 | Cargo.lock 150 | 151 | # These are backup files generated by rustfmt 152 | **/*.rs.bk 153 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pysubstringsearch" 3 | version = "0.7.1" 4 | authors = ["Gal Ben David "] 5 | edition = "2021" 6 | description = "A Python library written in Rust that searches for substrings quickly using a Suffix Array" 7 | readme = "README.md" 8 | repository = "https://github.com/intsights/pysubstringsearch" 9 | homepage = 
"https://github.com/intsights/pysubstringsearch" 10 | license = "MIT" 11 | keywords = [ 12 | "substring", 13 | "pattern", 14 | "search", 15 | "suffix", 16 | "array", 17 | "rust", 18 | "pyo3" 19 | ] 20 | 21 | [package.metadata.maturin] 22 | 23 | [lib] 24 | name = "pysubstringsearch" 25 | crate-type = ["cdylib"] 26 | 27 | [dependencies] 28 | ahash = "0.7" 29 | bstr = "0.2" 30 | byteorder = "1" 31 | memchr = "2" 32 | parking_lot = "0.12" 33 | rayon = "1" 34 | 35 | [dependencies.pyo3] 36 | version = "0.16.4" 37 | features = ["extension-module"] 38 | 39 | [build-dependencies] 40 | cc = { version = "1.0", features = ["parallel"] } 41 | 42 | [profile.release] 43 | lto = true 44 | panic = "abort" 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Gal Ben David 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Logo 4 | 5 |

6 | A Python library written in Rust that searches for substrings quickly using a Suffix Array 7 |

8 |

9 | 10 | ![license](https://img.shields.io/badge/MIT-License-blue) 11 | ![Python](https://img.shields.io/badge/Python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue) 12 | ![Build](https://github.com/Intsights/PySubstringSearch/workflows/Build/badge.svg) 13 | [![PyPi](https://img.shields.io/pypi/v/PySubstringSearch.svg)](https://pypi.org/project/PySubstringSearch/) 14 | 15 | ## Table of Contents 16 | 17 | - [Table of Contents](#table-of-contents) 18 | - [About The Project](#about-the-project) 19 | - [Built With](#built-with) 20 | - [Performance](#performance) 21 | - [500MB File](#500mb-file) 22 | - [7500MB File](#7500mb-file) 23 | - [Installation](#installation) 24 | - [Usage](#usage) 25 | - [License](#license) 26 | - [Contact](#contact) 27 | 28 | 29 | ## About The Project 30 | 31 | PySubstringSearch is a library designed to search over an index file for substring patterns. In order to achieve speed and efficiency, the library is written in Rust. For string indexing, the library uses the [libsais](https://github.com/IlyaGrebnov/libsais) suffix array construction library. The index created consists of the original text and a 32-bit suffix array struct. To get around the limitations of the Suffix Array Construction implementation, the library uses a proprietary container protocol to hold the original text and index in chunks of 512MB. 32 | 33 | The module implements two methods for searching: 34 | - `search` - Find different entries with the same substring concurrently. Concurrency increases as the index file grows in size with multiple inner chunks. 
35 | - `search_multiple` - same as `search` but accepts multiple substrings in a single call 36 | 37 | 38 | ### Built With 39 | 40 | * [libsais](https://github.com/IlyaGrebnov/libsais) 41 | 42 | 43 | ### Performance 44 | 45 | #### 500MB File 46 | | Library | Function | Time | #Results | Improvement Factor | 47 | | ------------- | ------------- | ------------- | ------------- | ------------- | 48 | | [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('google', '500mb').run().as_string.split('\n') | 47.2ms | 5943 | 1.0x | 49 | | [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('google') | 497µs | 5943 | 95x | 50 | | [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_two', '500mb').run().as_string.split('\n') | 44.7ms | 159 | 1.0x | 51 | | [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_two') | 14.9µs | 159 | 3000x | 52 | 53 | #### 7500MB File 54 | | Library | Function | Time | #Results | Improvement Factor | 55 | | ------------- | ------------- | ------------- | ------------- | ------------- | 56 | | [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('google', '6000mb').run().as_string.split('\n') | 900ms | 62834 | 1.0x | 57 | | [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('google') | 10.1ms | 62834 | 89.1x | 58 | | [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_two', '6000mb').run().as_string.split('\n') | 820ms | 0 | 1.0x | 59 | | [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_two') | 200µs | 0 | 4100x | 60 | 61 | 62 | ### Installation 63 | 64 | ```sh 65 | pip3 install PySubstringSearch 66 | ``` 67 | 68 | 69 | ## Usage 70 | 71 | Create an index 72 | ```python 73 | import pysubstringsearch 74 | 75 | # creating a new index file 76 | # if a file with this name already exists, it will be overwritten 77 | writer = pysubstringsearch.Writer( 78 | 
index_file_path='output.idx', 79 | ) 80 | 81 | # adding entries to the new index 82 | writer.add_entry('some short string') 83 | writer.add_entry('another but now a longer string') 84 | writer.add_entry('more text to add') 85 | 86 | # adding entries from file lines 87 | writer.add_entries_from_file_lines('input_file.txt') 88 | 89 | # making sure the data is dumped to the file 90 | writer.finalize() 91 | ``` 92 | 93 | Search a substring within an index 94 | ```python 95 | import pysubstringsearch 96 | 97 | # opening an index file for searching 98 | reader = pysubstringsearch.Reader( 99 | index_file_path='output.idx', 100 | ) 101 | 102 | # lookup for a substring 103 | reader.search('short') 104 | >>> ['some short string'] 105 | 106 | # lookup for a substring 107 | reader.search('string') 108 | >>> ['some short string', 'another but now a longer string'] 109 | 110 | # lookup for multiple substrings 111 | reader.search_multiple( 112 | [ 113 | 'short', 114 | 'longer', 115 | ], 116 | ) 117 | >>> ['some short string', 'another but now a longer string'] 118 | ``` 119 | 120 | 121 | 122 | ## License 123 | 124 | Distributed under the MIT License. See `LICENSE` for more information. 
125 | 126 | 127 | ## Contact 128 | 129 | Gal Ben David - gal@intsights.com 130 | 131 | Project Link: [https://github.com/Intsights/PySubstringSearch](https://github.com/Intsights/PySubstringSearch) 132 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | println!("cargo:rerun-if-changed=libsais.c"); 3 | 4 | let src = [ 5 | "src/libsais/libsais.c", 6 | ]; 7 | let mut builder = cc::Build::new(); 8 | let build = builder 9 | .files(src.iter()); 10 | build.compile("libsais"); 11 | } 12 | -------------------------------------------------------------------------------- /cortex.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | info: 3 | title: Pysubstringsearch 4 | description: Python library for fast substring/pattern search written in Rust leveraging 5 | Suffix Array Algorithm 6 | x-cortex-git: 7 | github: 8 | alias: intsightsorg 9 | repository: Intsights/PySubstringSearch 10 | x-cortex-tag: pysubstringsearch 11 | x-cortex-type: service 12 | x-cortex-domain-parents: 13 | - tag: threatintel-platform-delivery 14 | x-cortex-groups: 15 | - exposure:external-ship 16 | - target:library 17 | openapi: 3.0.1 18 | servers: 19 | - url: "/" 20 | -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PySubstringSearch/7456989e6172b7f0ad563a33ad54b6d9d44f79de/images/logo.png -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "colorama" 3 | version = "0.4.6" 4 | description = "Cross-platform colored terminal text." 
5 | category = "dev" 6 | optional = false 7 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 8 | 9 | [[package]] 10 | name = "exceptiongroup" 11 | version = "1.2.0" 12 | description = "Backport of PEP 654 (exception groups)" 13 | category = "dev" 14 | optional = false 15 | python-versions = ">=3.7" 16 | 17 | [package.extras] 18 | test = ["pytest (>=6)"] 19 | 20 | [[package]] 21 | name = "gitdb" 22 | version = "4.0.11" 23 | description = "Git Object Database" 24 | category = "dev" 25 | optional = false 26 | python-versions = ">=3.7" 27 | 28 | [package.dependencies] 29 | smmap = ">=3.0.1,<6" 30 | 31 | [[package]] 32 | name = "gitpython" 33 | version = "3.1.41" 34 | description = "GitPython is a Python library used to interact with Git repositories" 35 | category = "dev" 36 | optional = false 37 | python-versions = ">=3.7" 38 | 39 | [package.dependencies] 40 | gitdb = ">=4.0.1,<5" 41 | typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} 42 | 43 | [package.extras] 44 | test = ["black", "coverage", "ddt (>=1.1.1,!=1.4.3)", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "sumtypes", "mock"] 45 | 46 | [[package]] 47 | name = "importlib-metadata" 48 | version = "6.7.0" 49 | description = "Read metadata from Python packages" 50 | category = "dev" 51 | optional = false 52 | python-versions = ">=3.7" 53 | 54 | [package.dependencies] 55 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 56 | zipp = ">=0.5" 57 | 58 | [package.extras] 59 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"] 60 | perf = ["ipython"] 61 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-ruff", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", 
"importlib-resources (>=1.3)"] 62 | 63 | [[package]] 64 | name = "iniconfig" 65 | version = "2.0.0" 66 | description = "brain-dead simple config-ini parsing" 67 | category = "dev" 68 | optional = false 69 | python-versions = ">=3.7" 70 | 71 | [[package]] 72 | name = "maturin" 73 | version = "1.4.0" 74 | description = "Build and publish crates with pyo3, rust-cpython and cffi bindings as well as rust binaries as python packages" 75 | category = "dev" 76 | optional = false 77 | python-versions = ">=3.7" 78 | 79 | [package.dependencies] 80 | tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} 81 | 82 | [package.extras] 83 | zig = ["ziglang (>=0.10.0,<0.11.0)"] 84 | patchelf = ["patchelf"] 85 | 86 | [[package]] 87 | name = "packaging" 88 | version = "23.2" 89 | description = "Core utilities for Python packages" 90 | category = "dev" 91 | optional = false 92 | python-versions = ">=3.7" 93 | 94 | [[package]] 95 | name = "pluggy" 96 | version = "1.2.0" 97 | description = "plugin and hook calling mechanisms for python" 98 | category = "dev" 99 | optional = false 100 | python-versions = ">=3.7" 101 | 102 | [package.dependencies] 103 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 104 | 105 | [package.extras] 106 | dev = ["pre-commit", "tox"] 107 | testing = ["pytest", "pytest-benchmark"] 108 | 109 | [[package]] 110 | name = "pytest" 111 | version = "7.4.4" 112 | description = "pytest: simple powerful testing with Python" 113 | category = "dev" 114 | optional = false 115 | python-versions = ">=3.7" 116 | 117 | [package.dependencies] 118 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 119 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} 120 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 121 | iniconfig = "*" 122 | packaging = "*" 123 | pluggy = ">=0.12,<2.0" 124 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} 125 | 
126 | [package.extras] 127 | testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] 128 | 129 | [[package]] 130 | name = "pytest-runner" 131 | version = "6.0.1" 132 | description = "Invoke py.test as distutils command with dependency resolution" 133 | category = "dev" 134 | optional = false 135 | python-versions = ">=3.7" 136 | 137 | [package.extras] 138 | docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"] 139 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-virtualenv", "types-setuptools", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] 140 | 141 | [[package]] 142 | name = "smmap" 143 | version = "5.0.1" 144 | description = "A pure Python implementation of a sliding window memory map manager" 145 | category = "dev" 146 | optional = false 147 | python-versions = ">=3.7" 148 | 149 | [[package]] 150 | name = "tomli" 151 | version = "2.0.1" 152 | description = "A lil' TOML parser" 153 | category = "dev" 154 | optional = false 155 | python-versions = ">=3.7" 156 | 157 | [[package]] 158 | name = "typing-extensions" 159 | version = "4.7.1" 160 | description = "Backported and Experimental Type Hints for Python 3.7+" 161 | category = "dev" 162 | optional = false 163 | python-versions = ">=3.7" 164 | 165 | [[package]] 166 | name = "zipp" 167 | version = "3.15.0" 168 | description = "Backport of pathlib-compatible object wrapper for zip files" 169 | category = "dev" 170 | optional = false 171 | python-versions = ">=3.7" 172 | 173 | [package.extras] 174 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"] 175 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "jaraco.functools", "more-itertools", "big-o", 
"pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "pytest-flake8"] 176 | 177 | [metadata] 178 | lock-version = "1.1" 179 | python-versions = "^3.7" 180 | content-hash = "73c215113a1a0f3275a327a924fa4b01076299e805acbc7e197a5820a0908f1d" 181 | 182 | [metadata.files] 183 | colorama = [] 184 | exceptiongroup = [] 185 | gitdb = [] 186 | gitpython = [] 187 | importlib-metadata = [] 188 | iniconfig = [] 189 | maturin = [] 190 | packaging = [] 191 | pluggy = [] 192 | pytest = [] 193 | pytest-runner = [] 194 | smmap = [] 195 | tomli = [] 196 | typing-extensions = [] 197 | zipp = [] 198 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=0.12,<0.13"] 3 | build-backend = "maturin" 4 | 5 | [tool.maturin] 6 | sdist-include = [ 7 | "src/*", 8 | "Cargo.toml", 9 | "pysubstringsearch/*.py", 10 | "pysubstringsearch/*.pyi" 11 | ] 12 | 13 | [tool.poetry] 14 | name = "pysubstringsearch" 15 | version = "0.7.1" 16 | authors = ["Gal Ben David "] 17 | description = "A Python library written in Rust that searches for substrings quickly using a Suffix Array" 18 | readme = "README.md" 19 | repository = "https://github.com/intsights/pysubstringsearch" 20 | homepage = "https://github.com/intsights/pysubstringsearch" 21 | license = "MIT" 22 | keywords = [ 23 | "substring", 24 | "pattern", 25 | "search", 26 | "suffix", 27 | "array", 28 | "rust", 29 | "pyo3" 30 | ] 31 | classifiers = [ 32 | "License :: OSI Approved :: MIT License", 33 | "Operating System :: MacOS", 34 | "Operating System :: Microsoft", 35 | "Operating System :: POSIX :: Linux", 36 | "Programming Language :: Python :: 3.7", 37 | "Programming Language :: Python :: 3.8", 38 | "Programming Language :: Python :: 3.9", 39 | "Programming Language :: Python :: 3.10", 40 | "Programming Language :: Python :: 3.11", 41 | "Programming Language :: Rust", 42 | ] 43 | 44 | 
[tool.poetry.dependencies] 45 | python = "^3.7" 46 | 47 | [tool.poetry.dev-dependencies] 48 | pytest = "*" 49 | gitpython = "*" 50 | wheel = "*" 51 | pytest-runner = "*" 52 | maturin = "*" 53 | 54 | [tool.pytest.ini_options] 55 | minversion = "6.0" 56 | addopts = [ 57 | "--tb=native", 58 | "--pythonwarnings=all", 59 | ] 60 | testpaths = [ 61 | "tests", 62 | ] 63 | -------------------------------------------------------------------------------- /pysubstringsearch/__init__.py: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | from . import pysubstringsearch 4 | 5 | 6 | class Writer: 7 | def __init__( 8 | self, 9 | index_file_path: str, 10 | max_chunk_len: typing.Optional[int] = None, 11 | ) -> None: 12 | self.writer = pysubstringsearch.Writer( 13 | index_file_path=index_file_path, 14 | max_chunk_len=max_chunk_len, 15 | ) 16 | 17 | def add_entries_from_file_lines( 18 | self, 19 | input_file_path: str, 20 | ) -> None: 21 | self.writer.add_entries_from_file_lines( 22 | input_file_path=input_file_path, 23 | ) 24 | 25 | def add_entry( 26 | self, 27 | text: str, 28 | ) -> None: 29 | self.writer.add_entry( 30 | text=text, 31 | ) 32 | 33 | def dump_data( 34 | self, 35 | ) -> None: 36 | self.writer.dump_data() 37 | 38 | def finalize( 39 | self, 40 | ) -> None: 41 | self.writer.finalize() 42 | 43 | 44 | class Reader: 45 | def __init__( 46 | self, 47 | index_file_path: str, 48 | ) -> None: 49 | self.reader = pysubstringsearch.Reader( 50 | index_file_path=index_file_path, 51 | ) 52 | 53 | def search( 54 | self, 55 | substring: str, 56 | ) -> typing.List[str]: 57 | return self.reader.search( 58 | substring=substring, 59 | ) 60 | 61 | def search_multiple( 62 | self, 63 | substrings: typing.List[str], 64 | ) -> typing.List[str]: 65 | results = [] 66 | for substring in substrings: 67 | results.extend( 68 | self.search( 69 | substring=substring, 70 | ), 71 | ) 72 | 73 | return results 74 | 
-------------------------------------------------------------------------------- /pysubstringsearch/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PySubstringSearch/7456989e6172b7f0ad563a33ad54b6d9d44f79de/pysubstringsearch/py.typed -------------------------------------------------------------------------------- /pysubstringsearch/pysubstringsearch.pyi: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | 4 | class Writer: 5 | def __init__( 6 | self, 7 | index_file_path: str, 8 | max_chunk_len: typing.Optional[int] = None, 9 | ) -> None: ... 10 | 11 | def add_entries_from_file_lines( 12 | self, 13 | input_file_path: str, 14 | ) -> None: ... 15 | 16 | def add_entry( 17 | self, 18 | text: str, 19 | ) -> None: ... 20 | 21 | def dump_data( 22 | self, 23 | ) -> None: ... 24 | 25 | def finalize( 26 | self, 27 | ) -> None: ... 28 | 29 | 30 | class Reader: 31 | def __init__( 32 | self, 33 | index_file_path: str, 34 | ) -> None: ... 35 | 36 | def search( 37 | self, 38 | substring: str, 39 | ) -> typing.List[str]: ... 40 | 41 | def search_multiple( 42 | self, 43 | substrings: typing.List[str], 44 | ) -> typing.List[str]: ... 
45 | -------------------------------------------------------------------------------- /src/lib.rs: --------------------------------------------------------------------------------
use ahash::AHashSet;
use bstr::io::BufReadExt;
use byteorder::{ReadBytesExt, WriteBytesExt, ByteOrder, LittleEndian};
use memchr::memmem;
use parking_lot::Mutex;
use pyo3::exceptions;
use pyo3::prelude::*;
use rayon::prelude::*;
use std::fs::File;
use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
use std::str;
use std::sync::Arc;

// Index file layout, repeated once per chunk (all integers little-endian):
//
//   u32   length of the chunk data in bytes
//   [u8]  chunk data: the entries, each terminated by b'\n'
//   u32   length of the suffix array in bytes (4 * data length)
//   [i32] suffix array of the chunk data

/// Chunk size, in bytes, used when the caller does not choose one (512 MiB).
const DEFAULT_MAX_CHUNK_LEN: usize = 512 * 1024 * 1024;

/// Largest chunk, in bytes, that the index format can describe: the byte
/// length of a chunk's suffix array (4 bytes per data byte) is stored as a
/// `u32`. This bound also keeps the chunk length within the `i32` that
/// `libsais` accepts.
const MAX_CHUNK_LEN: usize = (u32::MAX / 4) as usize;

extern "C" {
    pub fn libsais(
        data: *const u8,
        suffix_array: *mut i32,
        data_len: i32,
        suffix_array_extra_space: i32,
        symbol_frequency_table: *mut i32,
    ) -> i32;
}

/// Builds the error reported whenever the index file contents are inconsistent.
fn corrupted_index_error() -> std::io::Error {
    std::io::Error::new(
        std::io::ErrorKind::InvalidData,
        "index file is corrupted",
    )
}

/// Builds the suffix array of `buffer` with `libsais`.
///
/// Fails with `ValueError` when `buffer` is longer than `libsais` can address
/// and with `RuntimeError` when `libsais` reports a failure, instead of
/// silently returning a zero-filled or partial array.
fn construct_suffix_array(
    buffer: &[u8],
) -> PyResult<Vec<i32>> {
    if buffer.len() > i32::MAX as usize {
        return Err(exceptions::PyValueError::new_err("chunk is too big to index"));
    }

    let mut suffix_array = vec![0; buffer.len()];

    // SAFETY: `suffix_array` holds exactly `buffer.len()` elements, which is
    // the output size `libsais` needs when no extra space is requested, and
    // the frequency table is documented as optional (NULL).
    let return_code = unsafe {
        libsais(
            buffer.as_ptr(),
            suffix_array.as_mut_ptr(),
            buffer.len() as i32,
            0,
            std::ptr::null_mut::<i32>(),
        )
    };
    if return_code != 0 {
        return Err(
            exceptions::PyRuntimeError::new_err(
                format!("suffix array construction failed (libsais returned {})", return_code),
            )
        );
    }

    Ok(suffix_array)
}

/// Builds an index file out of text entries.
///
/// Entries are accumulated in memory and written out chunk by chunk; every
/// chunk carries its own suffix array.
#[pyclass]
struct Writer {
    index_file: BufWriter<File>,
    buffer: Vec<u8>,
    // The chunk limit is kept explicitly: `Vec::capacity` may be larger than
    // what was requested and changes whenever the buffer reallocates.
    max_chunk_len: usize,
}

#[pymethods]
impl Writer {
    /// Creates (or truncates) `index_file_path`.
    ///
    /// `max_chunk_len` is the maximum number of bytes held by one chunk and
    /// defaults to 512 MiB. A `ValueError` is raised when it is zero or too
    /// large for the index format.
    #[new]
    fn new(
        index_file_path: &str,
        max_chunk_len: Option<usize>,
    ) -> PyResult<Self> {
        let max_chunk_len = max_chunk_len.unwrap_or(DEFAULT_MAX_CHUNK_LEN);
        // Validate before touching the file system so a bad argument does not
        // truncate an existing index.
        if max_chunk_len == 0 || max_chunk_len > MAX_CHUNK_LEN {
            return Err(
                exceptions::PyValueError::new_err(
                    format!("max_chunk_len must be between 1 and {}", MAX_CHUNK_LEN),
                )
            );
        }

        let index_file = File::create(index_file_path)?;
        let index_file = BufWriter::new(index_file);

        Ok(
            Writer {
                index_file,
                buffer: Vec::with_capacity(max_chunk_len),
                max_chunk_len,
            }
        )
    }

    /// Adds every line of `input_file_path` as a separate entry.
    fn add_entries_from_file_lines(
        &mut self,
        input_file_path: &str,
    ) -> PyResult<()> {
        let input_file = File::open(input_file_path)?;
        let input_file_reader = BufReader::new(input_file);
        input_file_reader.for_byte_line(
            |line| {
                self.push_entry(line)?;

                Ok(true)
            }
        )?;

        Ok(())
    }

    /// Adds a single entry.
    ///
    /// Raises `ValueError` when the entry cannot fit into one chunk.
    fn add_entry(
        &mut self,
        text: &str,
    ) -> PyResult<()> {
        self.push_entry(text.as_bytes())
    }

    /// Writes the buffered entries and their suffix array as one chunk.
    ///
    /// Does nothing when no entries are buffered.
    fn dump_data(
        &mut self,
    ) -> PyResult<()> {
        if self.buffer.is_empty() {
            return Ok(());
        }
        if self.buffer.len() > MAX_CHUNK_LEN {
            // The u32 length fields below would silently wrap around.
            return Err(exceptions::PyValueError::new_err("chunk is too big to index"));
        }

        // Build the suffix array before writing anything, so a failure does
        // not leave a half-written chunk in the index file.
        let suffix_array = construct_suffix_array(&self.buffer)?;

        self.index_file.write_u32::<LittleEndian>(self.buffer.len() as u32)?;
        self.index_file.write_all(&self.buffer)?;

        self.index_file.write_u32::<LittleEndian>((suffix_array.len() * 4) as u32)?;
        for suffix in suffix_array {
            self.index_file.write_i32::<LittleEndian>(suffix)?;
        }

        self.buffer.clear();

        Ok(())
    }

    /// Writes any buffered entries and flushes the index file.
    fn finalize(
        &mut self,
    ) -> PyResult<()> {
        if !self.buffer.is_empty() {
            self.dump_data()?;
        }
        self.index_file.flush()?;

        Ok(())
    }
}

impl Writer {
    /// Appends `entry` plus its terminating newline to the current chunk,
    /// dumping the chunk first when the entry would not fit.
    fn push_entry(
        &mut self,
        entry: &[u8],
    ) -> PyResult<()> {
        // Every entry is stored together with one b'\n' terminator.
        let stored_len = entry.len() + 1;
        if stored_len > self.max_chunk_len {
            return Err(exceptions::PyValueError::new_err("entry is too big"));
        }

        if self.buffer.len() + stored_len > self.max_chunk_len {
            self.dump_data()?;
        }
        self.buffer.extend_from_slice(entry);
        self.buffer.push(b'\n');

        Ok(())
    }
}

impl Drop for Writer {
    fn drop(
        &mut self,
    ) {
        // A destructor has no way to raise, and panicking here could abort
        // the process; report the failure instead.
        if let Err(error) = self.finalize() {
            eprintln!("pysubstringsearch: failed to finalize the index file: {}", error);
        }
    }
}

/// One chunk of the index: the entries are held in memory while the suffix
/// array stays on disk, between `suffixes_file_start` and `suffixes_file_end`.
struct SubIndex {
    data: Vec<u8>,
    index_file: BufReader<File>,
    suffixes_file_start: usize,
    suffixes_file_end: usize,
    finder: memmem::Finder<'static>,
    finder_rev: memmem::FinderRev<'static>,
}

impl SubIndex {
    /// Converts a suffix array element into an offset into `data`, rejecting
    /// values that a valid suffix array cannot contain.
    fn checked_data_index(
        &self,
        data_index: i32,
    ) -> std::io::Result<usize> {
        if data_index < 0 || data_index as usize >= self.data.len() {
            return Err(corrupted_index_error());
        }

        Ok(data_index as usize)
    }

    /// Reads the suffix array element stored at file offset `position`.
    fn read_suffix(
        &mut self,
        position: usize,
    ) -> std::io::Result<usize> {
        self.index_file.seek(SeekFrom::Start(position as u64))?;
        let data_index = self.index_file.read_i32::<LittleEndian>()?;

        self.checked_data_index(data_index)
    }

    /// Binary-searches the on-disk suffix array, starting at file offset
    /// `left_anchor`, for a suffix that begins with `needle`.
    ///
    /// Returns the file offset of the first such suffix when `leftmost` is
    /// true and of the last one otherwise, or `None` when nothing matches.
    fn find_match_boundary(
        &mut self,
        needle: &[u8],
        mut left_anchor: usize,
        leftmost: bool,
    ) -> std::io::Result<Option<usize>> {
        let mut boundary = None;
        // Offsets never underflow: the suffix array is preceded by at least
        // the two 4-byte length fields of its chunk.
        let mut right_anchor = self.suffixes_file_end - 4;

        while left_anchor <= right_anchor {
            let middle_anchor = left_anchor + ((right_anchor - left_anchor) / 4 / 2 * 4);
            let data_index = self.read_suffix(middle_anchor)?;

            let suffix = &self.data[data_index..];
            let ordering = if suffix.starts_with(needle) {
                std::cmp::Ordering::Equal
            } else {
                needle.cmp(suffix)
            };
            match ordering {
                std::cmp::Ordering::Equal => {
                    boundary = Some(middle_anchor);
                    if leftmost {
                        right_anchor = middle_anchor - 4;
                    } else {
                        left_anchor = middle_anchor + 4;
                    }
                },
                std::cmp::Ordering::Less => right_anchor = middle_anchor - 4,
                std::cmp::Ordering::Greater => left_anchor = middle_anchor + 4,
            };
        }

        Ok(boundary)
    }

    /// Appends to `lines` every entry of this chunk that contains `needle`,
    /// skipping entries whose start offset is already in `seen_line_starts`.
    fn collect_lines(
        &mut self,
        needle: &[u8],
        seen_line_starts: &mut AHashSet<usize>,
        lines: &mut Vec<String>,
    ) -> std::io::Result<()> {
        let search_start = self.suffixes_file_start;
        let first_match = match self.find_match_boundary(needle, search_start, true)? {
            Some(position) => position,
            None => return Ok(()),
        };
        // The matching suffixes are contiguous and start at `first_match`.
        let last_match = self
            .find_match_boundary(needle, first_match, false)?
            .unwrap_or(first_match);

        let mut suffixes = vec![0; last_match - first_match + 4];
        self.index_file.seek(SeekFrom::Start(first_match as u64))?;
        self.index_file.read_exact(&mut suffixes)?;

        for suffix in suffixes.chunks_exact(4) {
            let data_index = self.checked_data_index(LittleEndian::read_i32(suffix))?;

            let line_start = match self.finder_rev.rfind(&self.data[..data_index]) {
                Some(previous_nl_pos) => previous_nl_pos + 1,
                None => 0,
            };
            if !seen_line_starts.insert(line_start) {
                continue;
            }

            let line_end = match self.finder.find(&self.data[data_index..]) {
                Some(next_nl_pos) => data_index + next_nl_pos,
                // No terminator follows: the entry runs to the end of the data.
                None => self.data.len(),
            };

            let line = &self.data[line_start..line_end];
            lines.push(
                match str::from_utf8(line) {
                    Ok(text) => text.to_owned(),
                    // Entries added from raw file lines are not guaranteed to
                    // be valid UTF-8; decode them lossily rather than build
                    // an invalid `str`.
                    Err(_) => String::from_utf8_lossy(line).into_owned(),
                }
            );
        }

        Ok(())
    }

    /// Returns the entries of this chunk that contain at least one of
    /// `needles`; an entry matching several needles is returned once.
    fn find_lines(
        &mut self,
        needles: &[&[u8]],
    ) -> std::io::Result<Vec<String>> {
        let mut seen_line_starts = AHashSet::new();
        let mut lines = Vec::new();
        for needle in needles {
            self.collect_lines(needle, &mut seen_line_starts, &mut lines)?;
        }

        Ok(lines)
    }
}

/// Searches an index file produced by `Writer`.
#[pyclass]
struct Reader {
    sub_indexes: Vec<SubIndex>,
}

#[pymethods]
impl Reader {
    /// Opens `index_file_path` and loads the entries of every chunk into
    /// memory; the suffix arrays are left on disk.
    #[new]
    fn new(
        index_file_path: &str,
    ) -> PyResult<Self> {
        let index_file = File::open(index_file_path)?;
        let mut index_file = BufReader::new(index_file);
        let index_file_metadata = std::fs::metadata(index_file_path)?;
        let index_file_len = index_file_metadata.len();
        let mut bytes_read = 0;

        let mut sub_indexes = Vec::new();

        while bytes_read < index_file_len {
            let data_file_len = index_file.read_u32::<LittleEndian>()?;
            let mut data = vec![0; data_file_len as usize];
            index_file.read_exact(&mut data)?;

            let suffixes_file_len = index_file.read_u32::<LittleEndian>()? as usize;
            let suffixes_file_start = index_file.seek(SeekFrom::Current(0))? as usize;
            let suffixes_file_end = suffixes_file_start + suffixes_file_len;
            // A chunk holds one 4-byte suffix per data byte and must lie
            // entirely inside the file.
            if suffixes_file_len as u64 != data_file_len as u64 * 4
                || suffixes_file_end as u64 > index_file_len
            {
                return Err(corrupted_index_error().into());
            }
            index_file.seek(SeekFrom::Current(suffixes_file_len as i64))?;

            bytes_read += 4 + 4 + data_file_len as u64 + suffixes_file_len as u64;

            sub_indexes.push(
                SubIndex {
                    data,
                    // Every chunk gets its own handle so that chunks can be
                    // searched in parallel without sharing a file position.
                    index_file: BufReader::new(File::open(index_file_path)?),
                    suffixes_file_start,
                    suffixes_file_end,
                    finder: memmem::Finder::new(b"\n"),
                    finder_rev: memmem::FinderRev::new(b"\n"),
                }
            );
        }

        Ok(Reader { sub_indexes })
    }

    /// Returns every entry that contains `substring`.
    fn search(
        &mut self,
        substring: &str,
    ) -> PyResult<Vec<String>> {
        self.search_needles(&[substring.as_bytes()])
    }

    /// Returns every entry that contains at least one of `substrings`.
    ///
    /// An entry that matches several substrings is returned once.
    fn search_multiple(
        &mut self,
        substrings: Vec<String>,
    ) -> PyResult<Vec<String>> {
        let needles: Vec<&[u8]> = substrings
            .iter()
            .map(|substring| substring.as_bytes())
            .collect();

        self.search_needles(&needles)
    }
}

impl Reader {
    /// Searches all chunks in parallel and gathers their matches.
    ///
    /// I/O failures and corrupted chunks are returned as errors instead of
    /// panicking inside the worker threads.
    fn search_needles(
        &mut self,
        needles: &[&[u8]],
    ) -> PyResult<Vec<String>> {
        let results = Arc::new(Mutex::new(Vec::new()));

        self.sub_indexes.par_iter_mut().try_for_each(
            |sub_index| -> std::io::Result<()> {
                let lines = sub_index.find_lines(needles)?;
                results.lock().extend(lines);

                Ok(())
            }
        )?;

        let results = std::mem::take(&mut *results.lock());

        Ok(results)
    }
}

#[pymodule]
fn pysubstringsearch(
    _py: Python,
    m: &PyModule,
) -> PyResult<()> {
    m.add_class::<Writer>()?;
    m.add_class::<Reader>()?;

    Ok(())
}
-------------------------------------------------------------------------------- /src/libsais/libsais.h: -------------------------------------------------------------------------------- 1 | /*-- 2 | 3 | This file is a part of libsais, a library for linear time 4 | suffix array and burrows wheeler transform construction. 5 | 6 | Copyright (c) 2021-2022 Ilya Grebnov 7 | 8 | Licensed under the Apache License, Version 2.0 (the "License"); 9 | you may not use this file except in compliance with the License. 10 | You may obtain a copy of the License at 11 | 12 | http://www.apache.org/licenses/LICENSE-2.0 13 | 14 | Unless required by applicable law or agreed to in writing, software 15 | distributed under the License is distributed on an "AS IS" BASIS, 16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | See the License for the specific language governing permissions and 18 | limitations under the License. 19 | 20 | Please see the file LICENSE for full copyright information. 21 | 22 | --*/ 23 | 24 | #ifndef LIBSAIS_H 25 | #define LIBSAIS_H 1 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | #include <stdint.h> 32 | 33 | /** 34 | * Creates the libsais context that allows reusing allocated memory with each libsais operation. 35 | * In multi-threaded environments, use one context per thread for parallel executions.
36 | * @return the libsais context, NULL otherwise. 37 | */ 38 | void * libsais_create_ctx(void); 39 | 40 | #if defined(_OPENMP) 41 | /** 42 | * Creates the libsais context that allows reusing allocated memory with each parallel libsais operation using OpenMP. 43 | * In multi-threaded environments, use one context per thread for parallel executions. 44 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). 45 | * @return the libsais context, NULL otherwise. 46 | */ 47 | void * libsais_create_ctx_omp(int32_t threads); 48 | #endif 49 | 50 | /** 51 | * Destroys the libsais context and frees previously allocated memory. 52 | * @param ctx The libsais context (can be NULL). 53 | */ 54 | void libsais_free_ctx(void * ctx); 55 | 56 | /** 57 | * Constructs the suffix array of a given string. 58 | * @param T [0..n-1] The input string. 59 | * @param SA [0..n-1+fs] The output array of suffixes. 60 | * @param n The length of the given string. 61 | * @param fs The extra space available at the end of SA array (0 should be enough for most cases). 62 | * @param freq [0..255] The output symbol frequency table (can be NULL). 63 | * @return 0 if no error occurred, -1 or -2 otherwise. 64 | */ 65 | int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); 66 | 67 | /** 68 | * Constructs the suffix array of a given integer array. 69 | * Note, during construction input array will be modified, but restored at the end if no errors occurred. 70 | * @param T [0..n-1] The input integer array. 71 | * @param SA [0..n-1+fs] The output array of suffixes. 72 | * @param n The length of the integer array. 73 | * @param k The alphabet size of the input integer array. 74 | * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance). 75 | * @return 0 if no error occurred, -1 or -2 otherwise.
76 | */ 77 | int32_t libsais_int(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs); 78 | 79 | /** 80 | * Constructs the suffix array of a given string using libsais context. 81 | * @param ctx The libsais context. 82 | * @param T [0..n-1] The input string. 83 | * @param SA [0..n-1+fs] The output array of suffixes. 84 | * @param n The length of the given string. 85 | * @param fs The extra space available at the end of SA array (0 should be enough for most cases). 86 | * @param freq [0..255] The output symbol frequency table (can be NULL). 87 | * @return 0 if no error occurred, -1 or -2 otherwise. 88 | */ 89 | int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); 90 | 91 | #if defined(_OPENMP) 92 | /** 93 | * Constructs the suffix array of a given string in parallel using OpenMP. 94 | * @param T [0..n-1] The input string. 95 | * @param SA [0..n-1+fs] The output array of suffixes. 96 | * @param n The length of the given string. 97 | * @param fs The extra space available at the end of SA array (0 should be enough for most cases). 98 | * @param freq [0..255] The output symbol frequency table (can be NULL). 99 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). 100 | * @return 0 if no error occurred, -1 or -2 otherwise. 101 | */ 102 | int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads); 103 | 104 | /** 105 | * Constructs the suffix array of a given integer array in parallel using OpenMP. 106 | * Note, during construction input array will be modified, but restored at the end if no errors occurred. 107 | * @param T [0..n-1] The input integer array. 108 | * @param SA [0..n-1+fs] The output array of suffixes. 109 | * @param n The length of the integer array. 110 | * @param k The alphabet size of the input integer array. 
111 | * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance). 112 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). 113 | * @return 0 if no error occurred, -1 or -2 otherwise. 114 | */ 115 | int32_t libsais_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads); 116 | #endif 117 | 118 | /** 119 | * Constructs the burrows-wheeler transformed string of a given string. 120 | * @param T [0..n-1] The input string. 121 | * @param U [0..n-1] The output string (can be T). 122 | * @param A [0..n-1+fs] The temporary array. 123 | * @param n The length of the given string. 124 | * @param fs The extra space available at the end of A array (0 should be enough for most cases). 125 | * @param freq [0..255] The output symbol frequency table (can be NULL). 126 | * @return The primary index if no error occurred, -1 or -2 otherwise. 127 | */ 128 | int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); 129 | 130 | /** 131 | * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes. 132 | * @param T [0..n-1] The input string. 133 | * @param U [0..n-1] The output string (can be T). 134 | * @param A [0..n-1+fs] The temporary array. 135 | * @param n The length of the given string. 136 | * @param fs The extra space available at the end of A array (0 should be enough for most cases). 137 | * @param freq [0..255] The output symbol frequency table (can be NULL). 138 | * @param r The sampling rate for auxiliary indexes (must be power of 2). 139 | * @param I [0..(n-1)/r] The output auxiliary indexes. 140 | * @return 0 if no error occurred, -1 or -2 otherwise. 
141 | */ 142 | int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); 143 | 144 | /** 145 | * Constructs the burrows-wheeler transformed string of a given string using libsais context. 146 | * @param ctx The libsais context. 147 | * @param T [0..n-1] The input string. 148 | * @param U [0..n-1] The output string (can be T). 149 | * @param A [0..n-1+fs] The temporary array. 150 | * @param n The length of the given string. 151 | * @param fs The extra space available at the end of A array (0 should be enough for most cases). 152 | * @param freq [0..255] The output symbol frequency table (can be NULL). 153 | * @return The primary index if no error occurred, -1 or -2 otherwise. 154 | */ 155 | int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); 156 | 157 | /** 158 | * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes using libsais context. 159 | * @param ctx The libsais context. 160 | * @param T [0..n-1] The input string. 161 | * @param U [0..n-1] The output string (can be T). 162 | * @param A [0..n-1+fs] The temporary array. 163 | * @param n The length of the given string. 164 | * @param fs The extra space available at the end of A array (0 should be enough for most cases). 165 | * @param freq [0..255] The output symbol frequency table (can be NULL). 166 | * @param r The sampling rate for auxiliary indexes (must be power of 2). 167 | * @param I [0..(n-1)/r] The output auxiliary indexes. 168 | * @return 0 if no error occurred, -1 or -2 otherwise. 169 | */ 170 | int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); 171 | 172 | #if defined(_OPENMP) 173 | /** 174 | * Constructs the burrows-wheeler transformed string of a given string in parallel using OpenMP. 
175 | * @param T [0..n-1] The input string. 176 | * @param U [0..n-1] The output string (can be T). 177 | * @param A [0..n-1+fs] The temporary array. 178 | * @param n The length of the given string. 179 | * @param fs The extra space available at the end of A array (0 should be enough for most cases). 180 | * @param freq [0..255] The output symbol frequency table (can be NULL). 181 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). 182 | * @return The primary index if no error occurred, -1 or -2 otherwise. 183 | */ 184 | int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads); 185 | 186 | /** 187 | * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes in parallel using OpenMP. 188 | * @param T [0..n-1] The input string. 189 | * @param U [0..n-1] The output string (can be T). 190 | * @param A [0..n-1+fs] The temporary array. 191 | * @param n The length of the given string. 192 | * @param fs The extra space available at the end of A array (0 should be enough for most cases). 193 | * @param freq [0..255] The output symbol frequency table (can be NULL). 194 | * @param r The sampling rate for auxiliary indexes (must be power of 2). 195 | * @param I [0..(n-1)/r] The output auxiliary indexes. 196 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). 197 | * @return 0 if no error occurred, -1 or -2 otherwise. 198 | */ 199 | int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads); 200 | #endif 201 | 202 | /** 203 | * Creates the libsais reverse BWT context that allows reusing allocated memory with each libsais_unbwt_* operation. 204 | * In multi-threaded environments, use one context per thread for parallel executions. 205 | * @return the libsais context, NULL otherwise. 
206 | */ 207 | void * libsais_unbwt_create_ctx(void); 208 | 209 | #if defined(_OPENMP) 210 | /** 211 | * Creates the libsais reverse BWT context that allows reusing allocated memory with each parallel libsais_unbwt_* operation using OpenMP. 212 | * In multi-threaded environments, use one context per thread for parallel executions. 213 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). 214 | * @return the libsais context, NULL otherwise. 215 | */ 216 | void * libsais_unbwt_create_ctx_omp(int32_t threads); 217 | #endif 218 | 219 | /** 220 | * Destroys the libsais reverse BWT context and frees previously allocated memory. 221 | * @param ctx The libsais context (can be NULL). 222 | */ 223 | void libsais_unbwt_free_ctx(void * ctx); 224 | 225 | /** 226 | * Constructs the original string from a given burrows-wheeler transformed string with primary index. 227 | * @param T [0..n-1] The input string. 228 | * @param U [0..n-1] The output string (can be T). 229 | * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). 230 | * @param n The length of the given string. 231 | * @param freq [0..255] The input symbol frequency table (can be NULL). 232 | * @param i The primary index. 233 | * @return 0 if no error occurred, -1 or -2 otherwise. 234 | */ 235 | int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); 236 | 237 | /** 238 | * Constructs the original string from a given burrows-wheeler transformed string with primary index using libsais reverse BWT context. 239 | * @param ctx The libsais reverse BWT context. 240 | * @param T [0..n-1] The input string. 241 | * @param U [0..n-1] The output string (can be T). 242 | * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). 243 | * @param n The length of the given string. 244 | * @param freq [0..255] The input symbol frequency table (can be NULL). 245 | * @param i The primary index.
246 | * @return 0 if no error occurred, -1 or -2 otherwise. 247 | */ 248 | int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); 249 | 250 | /** 251 | * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes. 252 | * @param T [0..n-1] The input string. 253 | * @param U [0..n-1] The output string (can be T). 254 | * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). 255 | * @param n The length of the given string. 256 | * @param freq [0..255] The input symbol frequency table (can be NULL). 257 | * @param r The sampling rate for auxiliary indexes (must be power of 2). 258 | * @param I [0..(n-1)/r] The input auxiliary indexes. 259 | * @return 0 if no error occurred, -1 or -2 otherwise. 260 | */ 261 | int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); 262 | 263 | /** 264 | * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes using libsais reverse BWT context. 265 | * @param ctx The libsais reverse BWT context. 266 | * @param T [0..n-1] The input string. 267 | * @param U [0..n-1] The output string (can be T). 268 | * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). 269 | * @param n The length of the given string. 270 | * @param freq [0..255] The input symbol frequency table (can be NULL). 271 | * @param r The sampling rate for auxiliary indexes (must be power of 2). 272 | * @param I [0..(n-1)/r] The input auxiliary indexes. 273 | * @return 0 if no error occurred, -1 or -2 otherwise. 
274 | */ 275 | int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); 276 | 277 | #if defined(_OPENMP) 278 | /** 279 | * Constructs the original string from a given burrows-wheeler transformed string with primary index in parallel using OpenMP. 280 | * @param T [0..n-1] The input string. 281 | * @param U [0..n-1] The output string (can be T). 282 | * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). 283 | * @param n The length of the given string. 284 | * @param freq [0..255] The input symbol frequency table (can be NULL). 285 | * @param i The primary index. 286 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). 287 | * @return 0 if no error occurred, -1 or -2 otherwise. 288 | */ 289 | int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads); 290 | 291 | /** 292 | * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes in parallel using OpenMP. 293 | * @param T [0..n-1] The input string. 294 | * @param U [0..n-1] The output string (can be T). 295 | * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). 296 | * @param n The length of the given string. 297 | * @param freq [0..255] The input symbol frequency table (can be NULL). 298 | * @param r The sampling rate for auxiliary indexes (must be power of 2). 299 | * @param I [0..(n-1)/r] The input auxiliary indexes. 300 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). 301 | * @return 0 if no error occurred, -1 or -2 otherwise. 
302 | */ 303 | int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads); 304 | #endif 305 | 306 | #ifdef __cplusplus 307 | } 308 | #endif 309 | 310 | #endif 311 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PySubstringSearch/7456989e6172b7f0ad563a33ad54b6d9d44f79de/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_pysubstringsearch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | import pysubstringsearch 6 | 7 | 8 | class PySubstringSearchTestCase( 9 | unittest.TestCase, 10 | ): 11 | def assert_substring_search( 12 | self, 13 | strings, 14 | substring, 15 | expected_results, 16 | ): 17 | try: 18 | with tempfile.TemporaryDirectory() as tmp_directory: 19 | index_file_path = f'{tmp_directory}/output.idx' 20 | writer = pysubstringsearch.Writer( 21 | index_file_path=index_file_path, 22 | ) 23 | for string in strings: 24 | writer.add_entry( 25 | text=string, 26 | ) 27 | writer.finalize() 28 | 29 | reader = pysubstringsearch.Reader( 30 | index_file_path=index_file_path, 31 | ) 32 | self.assertCountEqual( 33 | first=reader.search( 34 | substring=substring, 35 | ), 36 | second=expected_results, 37 | ) 38 | 39 | try: 40 | os.unlink( 41 | path=index_file_path, 42 | ) 43 | except Exception: 44 | pass 45 | except PermissionError: 46 | pass 47 | 48 | def test_file_not_found( 49 | self, 50 | ): 51 | with self.assertRaises( 52 | expected_exception=FileNotFoundError, 53 | ): 54 | pysubstringsearch.Reader( 55 | index_file_path='missing_index_file_path', 56 | ) 57 | 58 | def test_sanity( 59 | self, 60 | ): 61 | strings = [ 62 | 'one', 63 | 'two', 64 | 
'three', 65 | 'four', 66 | 'five', 67 | 'six', 68 | 'seven', 69 | 'eight', 70 | 'nine', 71 | 'ten', 72 | ] 73 | 74 | self.assert_substring_search( 75 | strings=strings, 76 | substring='four', 77 | expected_results=[ 78 | 'four', 79 | ], 80 | ) 81 | 82 | self.assert_substring_search( 83 | strings=strings, 84 | substring='f', 85 | expected_results=[ 86 | 'four', 87 | 'five', 88 | ], 89 | ) 90 | 91 | self.assert_substring_search( 92 | strings=strings, 93 | substring='our', 94 | expected_results=[ 95 | 'four', 96 | ], 97 | ) 98 | 99 | self.assert_substring_search( 100 | strings=strings, 101 | substring='aaa', 102 | expected_results=[], 103 | ) 104 | 105 | def test_edgecases( 106 | self, 107 | ): 108 | strings = [ 109 | 'one', 110 | 'two', 111 | 'three', 112 | 'four', 113 | 'five', 114 | 'six', 115 | 'seven', 116 | 'eight', 117 | 'nine', 118 | 'ten', 119 | 'tenten', 120 | ] 121 | 122 | self.assert_substring_search( 123 | strings=strings, 124 | substring='none', 125 | expected_results=[], 126 | ) 127 | 128 | self.assert_substring_search( 129 | strings=strings, 130 | substring='one', 131 | expected_results=[ 132 | 'one', 133 | ], 134 | ) 135 | 136 | self.assert_substring_search( 137 | strings=strings, 138 | substring='onet', 139 | expected_results=[], 140 | ) 141 | 142 | self.assert_substring_search( 143 | strings=strings, 144 | substring='ten', 145 | expected_results=[ 146 | 'ten', 147 | 'tenten', 148 | ], 149 | ) 150 | 151 | def test_unicode( 152 | self, 153 | ): 154 | strings = [ 155 | 'رجعوني عنيك لأيامي اللي راحوا', 156 | 'علموني أندم على الماضي وجراحه', 157 | 'اللي شفته قبل ما تشوفك عنيه', 158 | 'عمر ضايع يحسبوه إزاي عليّ', 159 | 'انت عمري اللي ابتدي بنورك صباحه', 160 | 'قد ايه من عمري قبلك راح وعدّى', 161 | 'يا حبيبي قد ايه من عمري راح', 162 | 'ولا شاف القلب قبلك فرحة واحدة', 163 | 'ولا داق في الدنيا غير طعم الجراح', 164 | 'ابتديت دلوقت بس أحب عمري', 165 | 'ابتديت دلوقت اخاف لا العمر يجري', 166 | 'كل فرحه اشتاقها من قبلك خيالي', 167 | 'التقاها في نور عنيك قلبي 
وفكري', 168 | 'يا حياة قلبي يا أغلى من حياتي', 169 | 'ليه ما قابلتش هواك يا حبيبي بدري', 170 | 'اللي شفته قبل ما تشوفك عنيه', 171 | 'عمر ضايع يحسبوه إزاي عليّ', 172 | 'انت عمري اللي ابتدي بنورك صباحه', 173 | 'الليالي الحلوه والشوق والمحبة', 174 | 'من زمان والقلب شايلهم عشانك', 175 | 'دوق معايا الحب دوق حبه بحبه', 176 | 'من حنان قلبي اللي طال شوقه لحنانك', 177 | 'هات عنيك تسرح في دنيتهم عنيه', 178 | 'هات ايديك ترتاح للمستهم ايديه', 179 | ] 180 | 181 | self.assert_substring_search( 182 | strings=strings, 183 | substring='زمان', 184 | expected_results=[ 185 | 'من زمان والقلب شايلهم عشانك', 186 | ], 187 | ) 188 | 189 | self.assert_substring_search( 190 | strings=strings, 191 | substring='في', 192 | expected_results=[ 193 | 'هات عنيك تسرح في دنيتهم عنيه', 194 | 'التقاها في نور عنيك قلبي وفكري', 195 | 'ولا داق في الدنيا غير طعم الجراح', 196 | ], 197 | ) 198 | 199 | self.assert_substring_search( 200 | strings=strings, 201 | substring='حنان', 202 | expected_results=[ 203 | 'من حنان قلبي اللي طال شوقه لحنانك', 204 | ], 205 | ) 206 | 207 | self.assert_substring_search( 208 | strings=strings, 209 | substring='none', 210 | expected_results=[], 211 | ) 212 | 213 | def test_multiple_words_string( 214 | self, 215 | ): 216 | strings = [ 217 | 'some short string', 218 | 'another but now a longer string', 219 | 'more text to add', 220 | ] 221 | 222 | self.assert_substring_search( 223 | strings=strings, 224 | substring='short', 225 | expected_results=[ 226 | 'some short string', 227 | ], 228 | ) 229 | 230 | def test_short_string( 231 | self, 232 | ): 233 | strings = [ 234 | 'ab', 235 | ] 236 | self.assert_substring_search( 237 | strings=strings, 238 | substring='a', 239 | expected_results=[ 240 | 'ab', 241 | ], 242 | ) 243 | 244 | def test_multiple_strings( 245 | self, 246 | ): 247 | try: 248 | with tempfile.TemporaryDirectory() as tmp_directory: 249 | index_file_path = f'{tmp_directory}/output.idx' 250 | writer = pysubstringsearch.Writer( 251 | index_file_path=index_file_path, 252 | 
) 253 | for string in [ 254 | 'one', 255 | 'two', 256 | 'three', 257 | 'four', 258 | 'five', 259 | 'six', 260 | 'seven', 261 | 'eight', 262 | 'nine', 263 | 'ten', 264 | 'tenten', 265 | ]: 266 | writer.add_entry( 267 | text=string, 268 | ) 269 | writer.finalize() 270 | 271 | reader = pysubstringsearch.Reader( 272 | index_file_path=index_file_path, 273 | ) 274 | self.assertCountEqual( 275 | first=reader.search_multiple( 276 | substrings=[ 277 | 'ee', 278 | 'ven', 279 | ], 280 | ), 281 | second=[ 282 | 'three', 283 | 'seven', 284 | ], 285 | ) 286 | 287 | try: 288 | os.unlink( 289 | path=index_file_path, 290 | ) 291 | except Exception: 292 | pass 293 | except PermissionError: 294 | pass 295 | --------------------------------------------------------------------------------