├── .github
└── workflows
│ ├── build.yml
│ └── deploy.yml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── README.md
├── build.rs
├── cortex.yaml
├── images
└── logo.png
├── poetry.lock
├── pyproject.toml
├── pysubstringsearch
├── __init__.py
├── py.typed
└── pysubstringsearch.pyi
├── src
├── lib.rs
└── libsais
│ ├── libsais.c
│ └── libsais.h
└── tests
├── __init__.py
└── test_pysubstringsearch.py
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
name: Build
on:
  - push
  - pull_request
jobs:
  lint:
    # Lint only on branch pushes; pull_request events and tag pushes skip it.
    if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags')
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Install latest rust
        uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          profile: minimal
          override: true
          components: clippy
      - name: Lint with clippy
        uses: actions-rs/cargo@v1
        with:
          command: clippy
          args: --all-targets --all-features
  test:
    runs-on: ${{ matrix.os }}
    needs: lint
    # A job is skipped by default when a job it `needs` was skipped, which
    # made the whole matrix a no-op for pull_request events. Run unless the
    # workflow was cancelled or lint actually failed.
    if: ${{ !cancelled() && needs.lint.result != 'failure' }}
    strategy:
      fail-fast: false
      matrix:
        python-version:
          - '3.7'
          - '3.8'
          - '3.9'
          - '3.10'
          - '3.11'
        os:
          - ubuntu-latest
          - macos-latest
          - windows-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        # Same major version as the deploy workflow.
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Poetry
        uses: abatilo/actions-poetry@v2.1.3
      - name: Install Rust
        uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: stable
          override: true
      - name: Install dependencies
        run: poetry install
      - name: Build Python package
        run: poetry run maturin develop
      - name: Test
        run: poetry run pytest -Werror tests
61 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
name: Deploy
on:
  release:
    types:
      - released
jobs:
  deploy:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        python-version:
          - '3.7'
          - '3.8'
          - '3.9'
          - '3.10'
          - '3.11'
        os:
          - ubuntu-latest
          - macos-latest
          - windows-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Rust
        uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: stable
          override: true
      - name: Install Cross-compilers (macOS)
        if: matrix.os == 'macos-latest'
        run: |
          rustup target add x86_64-apple-darwin
          rustup target add aarch64-apple-darwin
      - name: Publish Package
        if: matrix.os != 'macos-latest'
        uses: PyO3/maturin-action@v1
        with:
          command: publish
          # The sdist is uploaded by exactly one matrix cell (ubuntu / 3.7);
          # every other cell passes --no-sdist. The condition is written in
          # its negated form because `cond && '' || 'x'` always evaluates to
          # 'x' (the empty string is falsy), which meant no cell ever
          # published the sdist.
          args: --username=__token__ ${{ (matrix.os != 'ubuntu-latest' || matrix.python-version != '3.7') && '--no-sdist' || '' }} --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }}
        env:
          MATURIN_PASSWORD: ${{ secrets.pypi_password }}
      - name: Publish macOS (x86_64) Package
        if: matrix.os == 'macos-latest'
        uses: PyO3/maturin-action@v1
        with:
          command: publish
          args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=x86_64-apple-darwin --no-sdist
        env:
          MATURIN_PASSWORD: ${{ secrets.pypi_password }}
      - name: Publish macOS (arm64) Package
        if: matrix.os == 'macos-latest'
        uses: PyO3/maturin-action@v1
        with:
          command: publish
          args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-apple-darwin --no-sdist
        env:
          MATURIN_PASSWORD: ${{ secrets.pypi_password }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
139 | .gitignore
140 | .gitignore
141 |
142 | # Generated by Cargo
143 | # will have compiled files and executables
144 | debug/
145 | target/
146 |
147 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
148 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
149 | Cargo.lock
150 |
151 | # These are backup files generated by rustfmt
152 | **/*.rs.bk
153 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "pysubstringsearch"
3 | version = "0.7.1"
4 | authors = ["Gal Ben David <gal@intsights.com>"]
5 | edition = "2021"
6 | description = "A Python library written in Rust that searches for substrings quickly using a Suffix Array"
7 | readme = "README.md"
8 | repository = "https://github.com/intsights/pysubstringsearch"
9 | homepage = "https://github.com/intsights/pysubstringsearch"
10 | license = "MIT"
11 | keywords = [
12 | "substring",
13 | "pattern",
14 | "search",
15 | "suffix",
16 | "array",
17 | "rust",
18 | "pyo3"
19 | ]
20 |
21 | [package.metadata.maturin]
22 |
23 | [lib]
24 | name = "pysubstringsearch"
25 | crate-type = ["cdylib"]
26 |
27 | [dependencies]
28 | ahash = "0.7"
29 | bstr = "0.2"
30 | byteorder = "1"
31 | memchr = "2"
32 | parking_lot = "0.12"
33 | rayon = "1"
34 |
35 | [dependencies.pyo3]
36 | version = "0.16.4"
37 | features = ["extension-module"]
38 |
39 | [build-dependencies]
40 | cc = { version = "1.0", features = ["parallel"] }
41 |
42 | [profile.release]
43 | lto = true
44 | panic = "abort"
45 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Gal Ben David
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | A Python library written in Rust that searches for substrings quickly using a Suffix Array
7 |
8 |
9 |
10 | 
11 | 
12 | 
13 | [](https://pypi.org/project/PySubstringSearch/)
14 |
15 | ## Table of Contents
16 |
17 | - [Table of Contents](#table-of-contents)
18 | - [About The Project](#about-the-project)
19 | - [Built With](#built-with)
20 | - [Performance](#performance)
21 | - [500MB File](#500mb-file)
22 | - [7500MB File](#7500mb-file)
23 | - [Installation](#installation)
24 | - [Usage](#usage)
25 | - [License](#license)
26 | - [Contact](#contact)
27 |
28 |
29 | ## About The Project
30 |
31 | PySubstringSearch is a library designed to search over an index file for substring patterns. In order to achieve speed and efficiency, the library is written in Rust. For string indexing, the library uses the [libsais](https://github.com/IlyaGrebnov/libsais) suffix array construction library. The index created consists of the original text and a 32-bit suffix array struct. To get around the limitations of the Suffix Array Construction implementation, the library uses a proprietary container protocol to hold the original text and index in chunks of 512MB.
32 |
33 | The module implements two methods for searching:
34 | - `search` - Find different entries with the same substring concurrently. Concurrency increases as the index file grows in size with multiple inner chunks.
35 | - `search_multiple` - same as `search` but accepts multiple substrings in a single call
36 |
37 |
38 | ### Built With
39 |
40 | * [libsais](https://github.com/IlyaGrebnov/libsais)
41 |
42 |
43 | ### Performance
44 |
45 | #### 500MB File
46 | | Library | Function | Time | #Results | Improvement Factor |
47 | | ------------- | ------------- | ------------- | ------------- | ------------- |
48 | | [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('google', '500mb').run().as_string.split('\n') | 47.2ms | 5943 | 1.0x |
49 | | [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('google') | 497µs | 5943 | 95x |
50 | | [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_two', '500mb').run().as_string.split('\n') | 44.7ms | 159 | 1.0x |
51 | | [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_two') | 14.9µs | 159 | 3000x |
52 |
53 | #### 7500MB File
54 | | Library | Function | Time | #Results | Improvement Factor |
55 | | ------------- | ------------- | ------------- | ------------- | ------------- |
56 | | [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('google', '6000mb').run().as_string.split('\n') | 900ms | 62834 | 1.0x |
57 | | [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('google') | 10.1ms | 62834 | 89.1x |
58 | | [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_two', '6000mb').run().as_string.split('\n') | 820ms | 0 | 1.0x |
59 | | [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_two') | 200µs | 0 | 4100x |
60 |
61 |
62 | ### Installation
63 |
64 | ```sh
65 | pip3 install PySubstringSearch
66 | ```
67 |
68 |
69 | ## Usage
70 |
71 | Create an index
72 | ```python
73 | import pysubstringsearch
74 |
75 | # creating a new index file
76 | # if a file with this name already exists, it will be overwritten
77 | writer = pysubstringsearch.Writer(
78 | index_file_path='output.idx',
79 | )
80 |
81 | # adding entries to the new index
82 | writer.add_entry('some short string')
83 | writer.add_entry('another but now a longer string')
84 | writer.add_entry('more text to add')
85 |
86 | # adding entries from file lines
87 | writer.add_entries_from_file_lines('input_file.txt')
88 |
89 | # making sure the data is dumped to the file
90 | writer.finalize()
91 | ```
92 |
93 | Search a substring within an index
94 | ```python
95 | import pysubstringsearch
96 |
97 | # opening an index file for searching
98 | reader = pysubstringsearch.Reader(
99 | index_file_path='output.idx',
100 | )
101 |
102 | # lookup for a substring
103 | reader.search('short')
104 | >>> ['some short string']
105 |
106 | # lookup for a substring
107 | reader.search('string')
108 | >>> ['some short string', 'another but now a longer string']
109 |
110 | # lookup for multiple substrings
111 | reader.search_multiple(
112 | [
113 | 'short',
114 | 'longer',
115 | ],
116 | )
117 | >>> ['some short string', 'another but now a longer string']
118 | ```
119 |
120 |
121 |
122 | ## License
123 |
124 | Distributed under the MIT License. See `LICENSE` for more information.
125 |
126 |
127 | ## Contact
128 |
129 | Gal Ben David - gal@intsights.com
130 |
131 | Project Link: [https://github.com/Intsights/PySubstringSearch](https://github.com/Intsights/PySubstringSearch)
132 |
--------------------------------------------------------------------------------
/build.rs:
--------------------------------------------------------------------------------
1 | fn main() {
2 | println!("cargo:rerun-if-changed=libsais.c");
3 |
4 | let src = [
5 | "src/libsais/libsais.c",
6 | ];
7 | let mut builder = cc::Build::new();
8 | let build = builder
9 | .files(src.iter());
10 | build.compile("libsais");
11 | }
12 |
--------------------------------------------------------------------------------
/cortex.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | info:
3 | title: Pysubstringsearch
4 | description: Python library for fast substring/pattern search written in Rust leveraging
5 | Suffix Array Algorithm
6 | x-cortex-git:
7 | github:
8 | alias: intsightsorg
9 | repository: Intsights/PySubstringSearch
10 | x-cortex-tag: pysubstringsearch
11 | x-cortex-type: service
12 | x-cortex-domain-parents:
13 | - tag: threatintel-platform-delivery
14 | x-cortex-groups:
15 | - exposure:external-ship
16 | - target:library
17 | openapi: 3.0.1
18 | servers:
19 | - url: "/"
20 |
--------------------------------------------------------------------------------
/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PySubstringSearch/7456989e6172b7f0ad563a33ad54b6d9d44f79de/images/logo.png
--------------------------------------------------------------------------------
/poetry.lock:
--------------------------------------------------------------------------------
1 | [[package]]
2 | name = "colorama"
3 | version = "0.4.6"
4 | description = "Cross-platform colored terminal text."
5 | category = "dev"
6 | optional = false
7 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
8 |
9 | [[package]]
10 | name = "exceptiongroup"
11 | version = "1.2.0"
12 | description = "Backport of PEP 654 (exception groups)"
13 | category = "dev"
14 | optional = false
15 | python-versions = ">=3.7"
16 |
17 | [package.extras]
18 | test = ["pytest (>=6)"]
19 |
20 | [[package]]
21 | name = "gitdb"
22 | version = "4.0.11"
23 | description = "Git Object Database"
24 | category = "dev"
25 | optional = false
26 | python-versions = ">=3.7"
27 |
28 | [package.dependencies]
29 | smmap = ">=3.0.1,<6"
30 |
31 | [[package]]
32 | name = "gitpython"
33 | version = "3.1.41"
34 | description = "GitPython is a Python library used to interact with Git repositories"
35 | category = "dev"
36 | optional = false
37 | python-versions = ">=3.7"
38 |
39 | [package.dependencies]
40 | gitdb = ">=4.0.1,<5"
41 | typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""}
42 |
43 | [package.extras]
44 | test = ["black", "coverage", "ddt (>=1.1.1,!=1.4.3)", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "sumtypes", "mock"]
45 |
46 | [[package]]
47 | name = "importlib-metadata"
48 | version = "6.7.0"
49 | description = "Read metadata from Python packages"
50 | category = "dev"
51 | optional = false
52 | python-versions = ">=3.7"
53 |
54 | [package.dependencies]
55 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""}
56 | zipp = ">=0.5"
57 |
58 | [package.extras]
59 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"]
60 | perf = ["ipython"]
61 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-ruff", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"]
62 |
63 | [[package]]
64 | name = "iniconfig"
65 | version = "2.0.0"
66 | description = "brain-dead simple config-ini parsing"
67 | category = "dev"
68 | optional = false
69 | python-versions = ">=3.7"
70 |
71 | [[package]]
72 | name = "maturin"
73 | version = "1.4.0"
74 | description = "Build and publish crates with pyo3, rust-cpython and cffi bindings as well as rust binaries as python packages"
75 | category = "dev"
76 | optional = false
77 | python-versions = ">=3.7"
78 |
79 | [package.dependencies]
80 | tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
81 |
82 | [package.extras]
83 | zig = ["ziglang (>=0.10.0,<0.11.0)"]
84 | patchelf = ["patchelf"]
85 |
86 | [[package]]
87 | name = "packaging"
88 | version = "23.2"
89 | description = "Core utilities for Python packages"
90 | category = "dev"
91 | optional = false
92 | python-versions = ">=3.7"
93 |
94 | [[package]]
95 | name = "pluggy"
96 | version = "1.2.0"
97 | description = "plugin and hook calling mechanisms for python"
98 | category = "dev"
99 | optional = false
100 | python-versions = ">=3.7"
101 |
102 | [package.dependencies]
103 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""}
104 |
105 | [package.extras]
106 | dev = ["pre-commit", "tox"]
107 | testing = ["pytest", "pytest-benchmark"]
108 |
109 | [[package]]
110 | name = "pytest"
111 | version = "7.4.4"
112 | description = "pytest: simple powerful testing with Python"
113 | category = "dev"
114 | optional = false
115 | python-versions = ">=3.7"
116 |
117 | [package.dependencies]
118 | colorama = {version = "*", markers = "sys_platform == \"win32\""}
119 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
120 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""}
121 | iniconfig = "*"
122 | packaging = "*"
123 | pluggy = ">=0.12,<2.0"
124 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
125 |
126 | [package.extras]
127 | testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
128 |
129 | [[package]]
130 | name = "pytest-runner"
131 | version = "6.0.1"
132 | description = "Invoke py.test as distutils command with dependency resolution"
133 | category = "dev"
134 | optional = false
135 | python-versions = ">=3.7"
136 |
137 | [package.extras]
138 | docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"]
139 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-virtualenv", "types-setuptools", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"]
140 |
141 | [[package]]
142 | name = "smmap"
143 | version = "5.0.1"
144 | description = "A pure Python implementation of a sliding window memory map manager"
145 | category = "dev"
146 | optional = false
147 | python-versions = ">=3.7"
148 |
149 | [[package]]
150 | name = "tomli"
151 | version = "2.0.1"
152 | description = "A lil' TOML parser"
153 | category = "dev"
154 | optional = false
155 | python-versions = ">=3.7"
156 |
157 | [[package]]
158 | name = "typing-extensions"
159 | version = "4.7.1"
160 | description = "Backported and Experimental Type Hints for Python 3.7+"
161 | category = "dev"
162 | optional = false
163 | python-versions = ">=3.7"
164 |
165 | [[package]]
166 | name = "zipp"
167 | version = "3.15.0"
168 | description = "Backport of pathlib-compatible object wrapper for zip files"
169 | category = "dev"
170 | optional = false
171 | python-versions = ">=3.7"
172 |
173 | [package.extras]
174 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"]
175 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "jaraco.functools", "more-itertools", "big-o", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "pytest-flake8"]
176 |
177 | [metadata]
178 | lock-version = "1.1"
179 | python-versions = "^3.7"
180 | content-hash = "73c215113a1a0f3275a327a924fa4b01076299e805acbc7e197a5820a0908f1d"
181 |
182 | [metadata.files]
183 | colorama = []
184 | exceptiongroup = []
185 | gitdb = []
186 | gitpython = []
187 | importlib-metadata = []
188 | iniconfig = []
189 | maturin = []
190 | packaging = []
191 | pluggy = []
192 | pytest = []
193 | pytest-runner = []
194 | smmap = []
195 | tomli = []
196 | typing-extensions = []
197 | zipp = []
198 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["maturin>=0.12,<0.13"]
3 | build-backend = "maturin"
4 |
5 | [tool.maturin]
6 | sdist-include = [
7 | "src/*",
8 | "Cargo.toml",
9 | "pysubstringsearch/*.py",
10 | "pysubstringsearch/*.pyi"
11 | ]
12 |
13 | [tool.poetry]
14 | name = "pysubstringsearch"
15 | version = "0.7.1"
16 | authors = ["Gal Ben David <gal@intsights.com>"]
17 | description = "A Python library written in Rust that searches for substrings quickly using a Suffix Array"
18 | readme = "README.md"
19 | repository = "https://github.com/intsights/pysubstringsearch"
20 | homepage = "https://github.com/intsights/pysubstringsearch"
21 | license = "MIT"
22 | keywords = [
23 | "substring",
24 | "pattern",
25 | "search",
26 | "suffix",
27 | "array",
28 | "rust",
29 | "pyo3"
30 | ]
31 | classifiers = [
32 | "License :: OSI Approved :: MIT License",
33 | "Operating System :: MacOS",
34 | "Operating System :: Microsoft",
35 | "Operating System :: POSIX :: Linux",
36 | "Programming Language :: Python :: 3.7",
37 | "Programming Language :: Python :: 3.8",
38 | "Programming Language :: Python :: 3.9",
39 | "Programming Language :: Python :: 3.10",
40 | "Programming Language :: Python :: 3.11",
41 | "Programming Language :: Rust",
42 | ]
43 |
44 | [tool.poetry.dependencies]
45 | python = "^3.7"
46 |
47 | [tool.poetry.dev-dependencies]
48 | pytest = "*"
49 | gitpython = "*"
50 | wheel = "*"
51 | pytest-runner = "*"
52 | maturin = "*"
53 |
54 | [tool.pytest.ini_options]
55 | minversion = "6.0"
56 | addopts = [
57 | "--tb=native",
58 | "--pythonwarnings=all",
59 | ]
60 | testpaths = [
61 | "tests",
62 | ]
63 |
--------------------------------------------------------------------------------
/pysubstringsearch/__init__.py:
--------------------------------------------------------------------------------
1 | import typing
2 |
3 | from . import pysubstringsearch
4 |
5 |
class Writer:
    """Python-level wrapper that forwards every call to the native ``Writer``.

    The wrapped native object is stored in ``self.writer``.
    """

    def __init__(self, index_file_path: str, max_chunk_len: typing.Optional[int] = None) -> None:
        """Create the native writer for ``index_file_path``.

        ``max_chunk_len`` is passed through unchanged (``None`` by default).
        """
        self.writer = pysubstringsearch.Writer(
            index_file_path=index_file_path,
            max_chunk_len=max_chunk_len,
        )

    def add_entries_from_file_lines(self, input_file_path: str) -> None:
        """Forward ``input_file_path`` to the native writer's method of the same name."""
        self.writer.add_entries_from_file_lines(input_file_path=input_file_path)

    def add_entry(self, text: str) -> None:
        """Forward a single entry ``text`` to the native writer."""
        self.writer.add_entry(text=text)

    def dump_data(self) -> None:
        """Call ``dump_data`` on the native writer."""
        self.writer.dump_data()

    def finalize(self) -> None:
        """Call ``finalize`` on the native writer."""
        self.writer.finalize()
42 |
43 |
class Reader:
    """Python-level wrapper around the native ``Reader``.

    The wrapped native object is stored in ``self.reader``.
    """

    def __init__(self, index_file_path: str) -> None:
        """Create the native reader for ``index_file_path``."""
        self.reader = pysubstringsearch.Reader(index_file_path=index_file_path)

    def search(self, substring: str) -> typing.List[str]:
        """Return whatever the native reader's ``search`` yields for ``substring``."""
        return self.reader.search(substring=substring)

    def search_multiple(self, substrings: typing.List[str]) -> typing.List[str]:
        """Run ``search`` once per substring and concatenate the results in order.

        Results are not merged: an entry returned for several substrings
        appears once per substring that returned it.
        """
        return [
            entry
            for needle in substrings
            for entry in self.search(substring=needle)
        ]
74 |
--------------------------------------------------------------------------------
/pysubstringsearch/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PySubstringSearch/7456989e6172b7f0ad563a33ad54b6d9d44f79de/pysubstringsearch/py.typed
--------------------------------------------------------------------------------
/pysubstringsearch/pysubstringsearch.pyi:
--------------------------------------------------------------------------------
1 | import typing
2 |
3 |
class Writer:
    # Type stub: signatures only, no behavior is defined here.
    def __init__(self, index_file_path: str, max_chunk_len: typing.Optional[int] = None) -> None: ...

    def add_entries_from_file_lines(self, input_file_path: str) -> None: ...

    def add_entry(self, text: str) -> None: ...

    def dump_data(self) -> None: ...

    def finalize(self) -> None: ...
28 |
29 |
class Reader:
    # Type stub: signatures only, no behavior is defined here.
    def __init__(self, index_file_path: str) -> None: ...

    def search(self, substring: str) -> typing.List[str]: ...

    def search_multiple(self, substrings: typing.List[str]) -> typing.List[str]: ...
45 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | use ahash::AHashSet;
2 | use bstr::io::BufReadExt;
3 | use byteorder::{ReadBytesExt, WriteBytesExt, ByteOrder, LittleEndian};
4 | use memchr::memmem;
5 | use parking_lot::Mutex;
6 | use pyo3::exceptions;
7 | use pyo3::prelude::*;
8 | use rayon::prelude::*;
9 | use std::fs::File;
10 | use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
11 | use std::str;
12 | use std::sync::Arc;
13 |
extern "C" {
    /// FFI declaration of the `libsais` entry point compiled by `build.rs`
    /// from `src/libsais/libsais.c`.
    pub fn libsais(
        data: *const u8,
        suffix_array: *mut i32,
        data_len: i32,
        suffix_array_extra_space: i32,
        symbol_frequency_table: *mut i32,
    ) -> i32;
}

/// Builds the suffix array for `buffer` by calling the C `libsais` routine.
///
/// Returns a vector holding one `i32` slot per input byte, filled in by
/// `libsais`.
fn construct_suffix_array(
    buffer: &[u8],
) -> Vec<i32> {
    let mut suffix_array = vec![0; buffer.len()];

    // NOTE(review): the status code returned by `libsais` is discarded, and
    // `buffer.len() as i32` truncates for inputs longer than `i32::MAX`
    // bytes. Callers are presumably expected to keep chunks below that size;
    // confirm against the writer's chunk limit.
    unsafe {
        libsais(
            buffer.as_ptr(),
            suffix_array.as_mut_ptr(),
            buffer.len() as i32,
            // No extra space after the suffix array, no frequency table.
            0,
            std::ptr::null_mut::<i32>(),
        );
    }

    suffix_array
}
41 |
42 | #[pyclass]
43 | struct Writer {
44 | index_file: BufWriter,
45 | buffer: Vec,
46 | }
47 |
48 | #[pymethods]
49 | impl Writer {
50 | #[new]
51 | fn new(
52 | index_file_path: &str,
53 | max_chunk_len: Option,
54 | ) -> PyResult {
55 | let index_file = File::create(index_file_path)?;
56 | let index_file = BufWriter::new(index_file);
57 | let max_chunk_len = max_chunk_len.unwrap_or(512 * 1024 * 1024);
58 |
59 | Ok(
60 | Writer {
61 | index_file,
62 | buffer: Vec::with_capacity(max_chunk_len),
63 | }
64 | )
65 | }
66 |
67 | fn add_entries_from_file_lines(
68 | &mut self,
69 | input_file_path: &str,
70 | ) -> PyResult<()> {
71 | let input_file = File::open(input_file_path)?;
72 | let input_file_reader = BufReader::new(input_file);
73 | input_file_reader.for_byte_line(
74 | |line| {
75 | if self.buffer.len() + line.len() + 1 > self.buffer.capacity() {
76 | self.dump_data()?;
77 | }
78 | self.buffer.extend_from_slice(line);
79 | self.buffer.push(b'\n');
80 |
81 | Ok(true)
82 | }
83 | )?;
84 |
85 | Ok(())
86 | }
87 |
88 | fn add_entry(
89 | &mut self,
90 | text: &str,
91 | ) -> PyResult<()> {
92 | if text.len() > self.buffer.capacity() {
93 | return Err(exceptions::PyValueError::new_err("entry is too big"));
94 | }
95 |
96 | if self.buffer.len() + text.len() + 1 > self.buffer.capacity() {
97 | self.dump_data()?;
98 | }
99 | self.buffer.extend_from_slice(text.as_bytes());
100 | self.buffer.push(b'\n');
101 |
102 | Ok(())
103 | }
104 |
105 | fn dump_data(
106 | &mut self,
107 | ) -> PyResult<()> {
108 | if self.buffer.is_empty() {
109 | return Ok(());
110 | }
111 |
112 | self.index_file.write_u32::(self.buffer.len() as u32)?;
113 | self.index_file.write_all(&self.buffer)?;
114 |
115 | let suffix_array = construct_suffix_array(&self.buffer);
116 | self.index_file.write_u32::((suffix_array.len() * 4) as u32)?;
117 | for suffix in suffix_array {
118 | self.index_file.write_i32::(suffix)?;
119 | }
120 |
121 | self.buffer.clear();
122 |
123 | Ok(())
124 | }
125 |
126 | fn finalize(
127 | &mut self,
128 | ) -> PyResult<()> {
129 | if !self.buffer.is_empty() {
130 | self.dump_data()?;
131 | }
132 | self.index_file.flush()?;
133 |
134 | Ok(())
135 | }
136 | }
137 |
138 | impl Drop for Writer {
139 | fn drop(
140 | &mut self,
141 | ) {
142 | self.finalize().unwrap();
143 | }
144 | }
145 |
146 | struct SubIndex {
147 | data: Vec,
148 | index_file: BufReader,
149 | suffixes_file_start: usize,
150 | suffixes_file_end: usize,
151 | finder: memmem::Finder<'static>,
152 | finder_rev: memmem::FinderRev<'static>,
153 | }
154 |
155 | #[pyclass]
156 | struct Reader {
157 | sub_indexes: Vec,
158 | }
159 |
160 | #[pymethods]
161 | impl Reader {
162 |     #[new]
163 |     fn new(
164 |         index_file_path: &str,
165 |     ) -> PyResult<Self> {
166 |         let index_file = File::open(index_file_path)?;
167 |         let mut index_file = BufReader::new(index_file);
168 |         let index_file_metadata = std::fs::metadata(index_file_path)?;
169 |         let index_file_len = index_file_metadata.len();
170 |         let mut bytes_read = 0;
171 |         // Walk the file one sub-index at a time: u32 text length, text, u32 suffix byte length, suffixes.
172 |         let mut sub_indexes = Vec::new();
173 |
174 |         while bytes_read < index_file_len {
175 |             let data_file_len = index_file.read_u32::<LittleEndian>()?;
176 |             let mut data = vec![0; data_file_len as usize];
177 |             index_file.read_exact(&mut data)?;
178 |             // The suffix array itself is skipped here; only its byte range in the file is recorded.
179 |             let suffixes_file_len = index_file.read_u32::<LittleEndian>()? as usize;
180 |             let suffixes_file_start = index_file.seek(SeekFrom::Current(0))? as usize;
181 |             let suffixes_file_end = suffixes_file_start + suffixes_file_len;
182 |             index_file.seek(SeekFrom::Current(suffixes_file_len as i64))?;
183 |             // Two 4-byte length headers plus both payloads.
184 |             bytes_read += 4 + 4 + data_file_len as u64 + suffixes_file_len as u64;
185 |
186 |             sub_indexes.push(
187 |                 SubIndex {
188 |                     data,
189 |                     index_file: BufReader::new(File::open(index_file_path)?),
190 |                     suffixes_file_start,
191 |                     suffixes_file_end,
192 |                     finder: memmem::Finder::new(b"\n"),
193 |                     finder_rev: memmem::FinderRev::new(b"\n"),
194 |                 }
195 |             );
196 |         }
197 |
198 |         Ok(Reader { sub_indexes })
199 |     }
200 |     /// Returns the stored lines that contain `substring`, collected from every sub-index.
201 |     fn search(
202 |         &mut self,
203 |         substring: &str,
204 |     ) -> PyResult<Vec<&str>> {
205 |         let results = Arc::new(Mutex::new(Vec::new()));
206 |         // Each sub-index is visited through `par_iter_mut`; its hits are appended to `results`.
207 |         self.sub_indexes.par_iter_mut().for_each(
208 |             |sub_index| {
209 |                 let mut start_of_indices = None;
210 |                 let mut end_of_indices = None;
211 |                 // Pass 1: binary search over the 4-byte suffix entries for the first suffix with this prefix.
212 |                 let mut left_anchor = sub_index.suffixes_file_start;
213 |                 let mut right_anchor = sub_index.suffixes_file_end - 4;
214 |                 while left_anchor <= right_anchor {
215 |                     let middle_anchor = left_anchor + ((right_anchor - left_anchor) / 4 / 2 * 4);
216 |                     sub_index.index_file.seek(SeekFrom::Start(middle_anchor as u64)).unwrap();
217 |                     let data_index = sub_index.index_file.read_i32::<LittleEndian>().unwrap();
218 |
219 |                     let line = &sub_index.data[data_index as usize..];
220 |                     if line.starts_with(substring.as_bytes()) {
221 |                         start_of_indices = Some(middle_anchor);
222 |                         right_anchor = middle_anchor - 4;
223 |                     } else {
224 |                         match substring.as_bytes().cmp(line) {
225 |                             std::cmp::Ordering::Less => right_anchor = middle_anchor - 4,
226 |                             std::cmp::Ordering::Greater => left_anchor = middle_anchor + 4,
227 |                             std::cmp::Ordering::Equal => {}, // equality is a prefix match, handled above
228 |                         };
229 |                     }
230 |                 }
231 |                 if start_of_indices.is_none() {
232 |                     return;
233 |                 }
234 |                 // Pass 2: continue from pass 1's `left_anchor` to find the last suffix with this prefix.
235 |                 let mut right_anchor = sub_index.suffixes_file_end - 4;
236 |                 while left_anchor <= right_anchor {
237 |                     let middle_anchor = left_anchor + ((right_anchor - left_anchor) / 4 / 2 * 4);
238 |                     sub_index.index_file.seek(SeekFrom::Start(middle_anchor as u64)).unwrap();
239 |                     let data_index = sub_index.index_file.read_i32::<LittleEndian>().unwrap();
240 |
241 |                     let line = &sub_index.data[data_index as usize..];
242 |                     if line.starts_with(substring.as_bytes()) {
243 |                         end_of_indices = Some(middle_anchor);
244 |                         left_anchor = middle_anchor + 4;
245 |                     } else {
246 |                         match substring.as_bytes().cmp(line) {
247 |                             std::cmp::Ordering::Less => right_anchor = middle_anchor - 4,
248 |                             std::cmp::Ordering::Greater => left_anchor = middle_anchor + 4,
249 |                             std::cmp::Ordering::Equal => {}, // equality is a prefix match, handled above
250 |                         };
251 |                     }
252 |                 }
253 |
254 |                 let start_of_indices = start_of_indices.unwrap();
255 |                 let end_of_indices = end_of_indices.unwrap();
256 |                 // Both bounds are offsets of matching entries (inclusive), hence the extra 4 bytes.
257 |                 let mut suffixes = vec![0; end_of_indices - start_of_indices + 4];
258 |
259 |                 sub_index.index_file.seek(SeekFrom::Start(start_of_indices as u64)).unwrap();
260 |                 sub_index.index_file.read_exact(&mut suffixes).unwrap();
261 |                 // Expand each hit to its enclosing line; `matches_ranges` keeps one result per line start.
262 |                 let mut matches_ranges = AHashSet::new();
263 |                 let mut local_results = Vec::with_capacity((end_of_indices - start_of_indices + 4) / 4);
264 |                 for suffix in suffixes.chunks_mut(4) {
265 |                     let data_index = LittleEndian::read_i32(suffix);
266 |                     let line_head = match sub_index.finder.find(&sub_index.data[data_index as usize..]) { // despite the name: offset of the line's terminating newline
267 |                         Some(next_nl_pos) => data_index as usize + next_nl_pos,
268 |                         None => sub_index.data.len() - 1,
269 |                     };
270 |                     let line_tail = match sub_index.finder_rev.rfind(&sub_index.data[..data_index as usize]) { // despite the name: offset of the line's first byte
271 |                         Some(previous_nl_pos) => previous_nl_pos + 1,
272 |                         None => 0,
273 |                     };
274 |                     if matches_ranges.insert(line_tail) {
275 |                         let line = unsafe { str::from_utf8_unchecked(&sub_index.data[line_tail..line_head]) }; // NOTE(review): assumes the bytes read from disk are valid UTF-8 - confirm
276 |                         local_results.push(line);
277 |                     }
278 |                 }
279 |
280 |                 results.lock().extend(local_results);
281 |             }
282 |         );
283 |
284 |         let results = results.lock().to_vec();
285 |
286 |         Ok(results)
287 |     }
288 | }
289 |
290 | #[pymodule]
291 | fn pysubstringsearch(
292 |     _py: Python,
293 |     m: &PyModule,
294 | ) -> PyResult<()> {
295 |     m.add_class::<Writer>()?;
296 |     m.add_class::<Reader>()?;
297 |     // Both classes are now registered on the module (`pysubstringsearch.Writer` / `.Reader`).
298 |     Ok(())
299 | }
300 |
--------------------------------------------------------------------------------
/src/libsais/libsais.h:
--------------------------------------------------------------------------------
1 | /*--
2 |
3 | This file is a part of libsais, a library for linear time
4 | suffix array and burrows wheeler transform construction.
5 |
6 | Copyright (c) 2021-2022 Ilya Grebnov
7 |
8 | Licensed under the Apache License, Version 2.0 (the "License");
9 | you may not use this file except in compliance with the License.
10 | You may obtain a copy of the License at
11 |
12 | http://www.apache.org/licenses/LICENSE-2.0
13 |
14 | Unless required by applicable law or agreed to in writing, software
15 | distributed under the License is distributed on an "AS IS" BASIS,
16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | See the License for the specific language governing permissions and
18 | limitations under the License.
19 |
20 | Please see the file LICENSE for full copyright information.
21 |
22 | --*/
23 |
24 | #ifndef LIBSAIS_H
25 | #define LIBSAIS_H 1
26 |
27 | #ifdef __cplusplus
28 | extern "C" {
29 | #endif
30 |
31 | #include <stdint.h>
32 |
33 | /**
34 | * Creates the libsais context that allows reusing allocated memory with each libsais operation.
35 | * In multi-threaded environments, use one context per thread for parallel executions.
36 | * @return the libsais context, NULL otherwise.
37 | */
38 | void * libsais_create_ctx(void);
39 |
40 | #if defined(_OPENMP)
41 | /**
42 | * Creates the libsais context that allows reusing allocated memory with each parallel libsais operation using OpenMP.
43 | * In multi-threaded environments, use one context per thread for parallel executions.
44 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
45 | * @return the libsais context, NULL otherwise.
46 | */
47 | void * libsais_create_ctx_omp(int32_t threads);
48 | #endif
49 |
50 | /**
51 | * Destroys the libsais context and frees previously allocated memory.
52 | * @param ctx The libsais context (can be NULL).
53 | */
54 | void libsais_free_ctx(void * ctx);
55 |
56 | /**
57 | * Constructs the suffix array of a given string.
58 | * @param T [0..n-1] The input string.
59 | * @param SA [0..n-1+fs] The output array of suffixes.
60 | * @param n The length of the given string.
61 | * @param fs The extra space available at the end of SA array (0 should be enough for most cases).
62 | * @param freq [0..255] The output symbol frequency table (can be NULL).
63 | * @return 0 if no error occurred, -1 or -2 otherwise.
64 | */
65 | int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq);
66 |
67 | /**
68 | * Constructs the suffix array of a given integer array.
69 | * Note, during construction input array will be modified, but restored at the end if no errors occurred.
70 | * @param T [0..n-1] The input integer array.
71 | * @param SA [0..n-1+fs] The output array of suffixes.
72 | * @param n The length of the integer array.
73 | * @param k The alphabet size of the input integer array.
74 | * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance).
75 | * @return 0 if no error occurred, -1 or -2 otherwise.
76 | */
77 | int32_t libsais_int(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs);
78 |
79 | /**
80 | * Constructs the suffix array of a given string using libsais context.
81 | * @param ctx The libsais context.
82 | * @param T [0..n-1] The input string.
83 | * @param SA [0..n-1+fs] The output array of suffixes.
84 | * @param n The length of the given string.
85 | * @param fs The extra space available at the end of SA array (0 should be enough for most cases).
86 | * @param freq [0..255] The output symbol frequency table (can be NULL).
87 | * @return 0 if no error occurred, -1 or -2 otherwise.
88 | */
89 | int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq);
90 |
91 | #if defined(_OPENMP)
92 | /**
93 | * Constructs the suffix array of a given string in parallel using OpenMP.
94 | * @param T [0..n-1] The input string.
95 | * @param SA [0..n-1+fs] The output array of suffixes.
96 | * @param n The length of the given string.
97 | * @param fs The extra space available at the end of SA array (0 should be enough for most cases).
98 | * @param freq [0..255] The output symbol frequency table (can be NULL).
99 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
100 | * @return 0 if no error occurred, -1 or -2 otherwise.
101 | */
102 | int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads);
103 |
104 | /**
105 | * Constructs the suffix array of a given integer array in parallel using OpenMP.
106 | * Note, during construction input array will be modified, but restored at the end if no errors occurred.
107 | * @param T [0..n-1] The input integer array.
108 | * @param SA [0..n-1+fs] The output array of suffixes.
109 | * @param n The length of the integer array.
110 | * @param k The alphabet size of the input integer array.
111 | * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance).
112 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
113 | * @return 0 if no error occurred, -1 or -2 otherwise.
114 | */
115 | int32_t libsais_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads);
116 | #endif
117 |
118 | /**
119 | * Constructs the burrows-wheeler transformed string of a given string.
120 | * @param T [0..n-1] The input string.
121 | * @param U [0..n-1] The output string (can be T).
122 | * @param A [0..n-1+fs] The temporary array.
123 | * @param n The length of the given string.
124 | * @param fs The extra space available at the end of A array (0 should be enough for most cases).
125 | * @param freq [0..255] The output symbol frequency table (can be NULL).
126 | * @return The primary index if no error occurred, -1 or -2 otherwise.
127 | */
128 | int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq);
129 |
130 | /**
131 | * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes.
132 | * @param T [0..n-1] The input string.
133 | * @param U [0..n-1] The output string (can be T).
134 | * @param A [0..n-1+fs] The temporary array.
135 | * @param n The length of the given string.
136 | * @param fs The extra space available at the end of A array (0 should be enough for most cases).
137 | * @param freq [0..255] The output symbol frequency table (can be NULL).
138 | * @param r The sampling rate for auxiliary indexes (must be power of 2).
139 | * @param I [0..(n-1)/r] The output auxiliary indexes.
140 | * @return 0 if no error occurred, -1 or -2 otherwise.
141 | */
142 | int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I);
143 |
144 | /**
145 | * Constructs the burrows-wheeler transformed string of a given string using libsais context.
146 | * @param ctx The libsais context.
147 | * @param T [0..n-1] The input string.
148 | * @param U [0..n-1] The output string (can be T).
149 | * @param A [0..n-1+fs] The temporary array.
150 | * @param n The length of the given string.
151 | * @param fs The extra space available at the end of A array (0 should be enough for most cases).
152 | * @param freq [0..255] The output symbol frequency table (can be NULL).
153 | * @return The primary index if no error occurred, -1 or -2 otherwise.
154 | */
155 | int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq);
156 |
157 | /**
158 | * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes using libsais context.
159 | * @param ctx The libsais context.
160 | * @param T [0..n-1] The input string.
161 | * @param U [0..n-1] The output string (can be T).
162 | * @param A [0..n-1+fs] The temporary array.
163 | * @param n The length of the given string.
164 | * @param fs The extra space available at the end of A array (0 should be enough for most cases).
165 | * @param freq [0..255] The output symbol frequency table (can be NULL).
166 | * @param r The sampling rate for auxiliary indexes (must be power of 2).
167 | * @param I [0..(n-1)/r] The output auxiliary indexes.
168 | * @return 0 if no error occurred, -1 or -2 otherwise.
169 | */
170 | int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I);
171 |
172 | #if defined(_OPENMP)
173 | /**
174 | * Constructs the burrows-wheeler transformed string of a given string in parallel using OpenMP.
175 | * @param T [0..n-1] The input string.
176 | * @param U [0..n-1] The output string (can be T).
177 | * @param A [0..n-1+fs] The temporary array.
178 | * @param n The length of the given string.
179 | * @param fs The extra space available at the end of A array (0 should be enough for most cases).
180 | * @param freq [0..255] The output symbol frequency table (can be NULL).
181 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
182 | * @return The primary index if no error occurred, -1 or -2 otherwise.
183 | */
184 | int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads);
185 |
186 | /**
187 | * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes in parallel using OpenMP.
188 | * @param T [0..n-1] The input string.
189 | * @param U [0..n-1] The output string (can be T).
190 | * @param A [0..n-1+fs] The temporary array.
191 | * @param n The length of the given string.
192 | * @param fs The extra space available at the end of A array (0 should be enough for most cases).
193 | * @param freq [0..255] The output symbol frequency table (can be NULL).
194 | * @param r The sampling rate for auxiliary indexes (must be power of 2).
195 | * @param I [0..(n-1)/r] The output auxiliary indexes.
196 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
197 | * @return 0 if no error occurred, -1 or -2 otherwise.
198 | */
199 | int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads);
200 | #endif
201 |
202 | /**
203 | * Creates the libsais reverse BWT context that allows reusing allocated memory with each libsais_unbwt_* operation.
204 | * In multi-threaded environments, use one context per thread for parallel executions.
205 | * @return the libsais context, NULL otherwise.
206 | */
207 | void * libsais_unbwt_create_ctx(void);
208 |
209 | #if defined(_OPENMP)
210 | /**
211 | * Creates the libsais reverse BWT context that allows reusing allocated memory with each parallel libsais_unbwt_* operation using OpenMP.
212 | * In multi-threaded environments, use one context per thread for parallel executions.
213 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
214 | * @return the libsais context, NULL otherwise.
215 | */
216 | void * libsais_unbwt_create_ctx_omp(int32_t threads);
217 | #endif
218 |
219 | /**
220 | * Destroys the libsais reverse BWT context and frees previously allocated memory.
221 | * @param ctx The libsais context (can be NULL).
222 | */
223 | void libsais_unbwt_free_ctx(void * ctx);
224 |
225 | /**
226 | * Constructs the original string from a given burrows-wheeler transformed string with primary index.
227 | * @param T [0..n-1] The input string.
228 | * @param U [0..n-1] The output string (can be T).
229 | * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
230 | * @param n The length of the given string.
231 | * @param freq [0..255] The input symbol frequency table (can be NULL).
232 | * @param i The primary index.
233 | * @return 0 if no error occurred, -1 or -2 otherwise.
234 | */
235 | int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i);
236 |
237 | /**
238 | * Constructs the original string from a given burrows-wheeler transformed string with primary index using libsais reverse BWT context.
239 | * @param ctx The libsais reverse BWT context.
240 | * @param T [0..n-1] The input string.
241 | * @param U [0..n-1] The output string (can be T).
242 | * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
243 | * @param n The length of the given string.
244 | * @param freq [0..255] The input symbol frequency table (can be NULL).
245 | * @param i The primary index.
246 | * @return 0 if no error occurred, -1 or -2 otherwise.
247 | */
248 | int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i);
249 |
250 | /**
251 | * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes.
252 | * @param T [0..n-1] The input string.
253 | * @param U [0..n-1] The output string (can be T).
254 | * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
255 | * @param n The length of the given string.
256 | * @param freq [0..255] The input symbol frequency table (can be NULL).
257 | * @param r The sampling rate for auxiliary indexes (must be power of 2).
258 | * @param I [0..(n-1)/r] The input auxiliary indexes.
259 | * @return 0 if no error occurred, -1 or -2 otherwise.
260 | */
261 | int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I);
262 |
263 | /**
264 | * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes using libsais reverse BWT context.
265 | * @param ctx The libsais reverse BWT context.
266 | * @param T [0..n-1] The input string.
267 | * @param U [0..n-1] The output string (can be T).
268 | * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
269 | * @param n The length of the given string.
270 | * @param freq [0..255] The input symbol frequency table (can be NULL).
271 | * @param r The sampling rate for auxiliary indexes (must be power of 2).
272 | * @param I [0..(n-1)/r] The input auxiliary indexes.
273 | * @return 0 if no error occurred, -1 or -2 otherwise.
274 | */
275 | int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I);
276 |
277 | #if defined(_OPENMP)
278 | /**
279 | * Constructs the original string from a given burrows-wheeler transformed string with primary index in parallel using OpenMP.
280 | * @param T [0..n-1] The input string.
281 | * @param U [0..n-1] The output string (can be T).
282 | * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
283 | * @param n The length of the given string.
284 | * @param freq [0..255] The input symbol frequency table (can be NULL).
285 | * @param i The primary index.
286 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
287 | * @return 0 if no error occurred, -1 or -2 otherwise.
288 | */
289 | int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads);
290 |
291 | /**
292 | * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes in parallel using OpenMP.
293 | * @param T [0..n-1] The input string.
294 | * @param U [0..n-1] The output string (can be T).
295 | * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
296 | * @param n The length of the given string.
297 | * @param freq [0..255] The input symbol frequency table (can be NULL).
298 | * @param r The sampling rate for auxiliary indexes (must be power of 2).
299 | * @param I [0..(n-1)/r] The input auxiliary indexes.
300 | * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
301 | * @return 0 if no error occurred, -1 or -2 otherwise.
302 | */
303 | int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads);
304 | #endif
305 |
306 | #ifdef __cplusplus
307 | }
308 | #endif
309 |
310 | #endif
311 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PySubstringSearch/7456989e6172b7f0ad563a33ad54b6d9d44f79de/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_pysubstringsearch.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | import unittest
4 |
5 | import pysubstringsearch
6 |
7 |
8 | class PySubstringSearchTestCase(
9 | unittest.TestCase,
10 | ):
11 | def assert_substring_search(
12 | self,
13 | strings,
14 | substring,
15 | expected_results,
16 | ):
17 | try:
18 | with tempfile.TemporaryDirectory() as tmp_directory:
19 | index_file_path = f'{tmp_directory}/output.idx'
20 | writer = pysubstringsearch.Writer(
21 | index_file_path=index_file_path,
22 | )
23 | for string in strings:
24 | writer.add_entry(
25 | text=string,
26 | )
27 | writer.finalize()
28 |
29 | reader = pysubstringsearch.Reader(
30 | index_file_path=index_file_path,
31 | )
32 | self.assertCountEqual(
33 | first=reader.search(
34 | substring=substring,
35 | ),
36 | second=expected_results,
37 | )
38 |
39 | try:
40 | os.unlink(
41 | path=index_file_path,
42 | )
43 | except Exception:
44 | pass
45 | except PermissionError:
46 | pass
47 |
48 | def test_file_not_found(
49 | self,
50 | ):
51 | with self.assertRaises(
52 | expected_exception=FileNotFoundError,
53 | ):
54 | pysubstringsearch.Reader(
55 | index_file_path='missing_index_file_path',
56 | )
57 |
58 | def test_sanity(
59 | self,
60 | ):
61 | strings = [
62 | 'one',
63 | 'two',
64 | 'three',
65 | 'four',
66 | 'five',
67 | 'six',
68 | 'seven',
69 | 'eight',
70 | 'nine',
71 | 'ten',
72 | ]
73 |
74 | self.assert_substring_search(
75 | strings=strings,
76 | substring='four',
77 | expected_results=[
78 | 'four',
79 | ],
80 | )
81 |
82 | self.assert_substring_search(
83 | strings=strings,
84 | substring='f',
85 | expected_results=[
86 | 'four',
87 | 'five',
88 | ],
89 | )
90 |
91 | self.assert_substring_search(
92 | strings=strings,
93 | substring='our',
94 | expected_results=[
95 | 'four',
96 | ],
97 | )
98 |
99 | self.assert_substring_search(
100 | strings=strings,
101 | substring='aaa',
102 | expected_results=[],
103 | )
104 |
105 | def test_edgecases(
106 | self,
107 | ):
108 | strings = [
109 | 'one',
110 | 'two',
111 | 'three',
112 | 'four',
113 | 'five',
114 | 'six',
115 | 'seven',
116 | 'eight',
117 | 'nine',
118 | 'ten',
119 | 'tenten',
120 | ]
121 |
122 | self.assert_substring_search(
123 | strings=strings,
124 | substring='none',
125 | expected_results=[],
126 | )
127 |
128 | self.assert_substring_search(
129 | strings=strings,
130 | substring='one',
131 | expected_results=[
132 | 'one',
133 | ],
134 | )
135 |
136 | self.assert_substring_search(
137 | strings=strings,
138 | substring='onet',
139 | expected_results=[],
140 | )
141 |
142 | self.assert_substring_search(
143 | strings=strings,
144 | substring='ten',
145 | expected_results=[
146 | 'ten',
147 | 'tenten',
148 | ],
149 | )
150 |
151 | def test_unicode(
152 | self,
153 | ):
154 | strings = [
155 | 'رجعوني عنيك لأيامي اللي راحوا',
156 | 'علموني أندم على الماضي وجراحه',
157 | 'اللي شفته قبل ما تشوفك عنيه',
158 | 'عمر ضايع يحسبوه إزاي عليّ',
159 | 'انت عمري اللي ابتدي بنورك صباحه',
160 | 'قد ايه من عمري قبلك راح وعدّى',
161 | 'يا حبيبي قد ايه من عمري راح',
162 | 'ولا شاف القلب قبلك فرحة واحدة',
163 | 'ولا داق في الدنيا غير طعم الجراح',
164 | 'ابتديت دلوقت بس أحب عمري',
165 | 'ابتديت دلوقت اخاف لا العمر يجري',
166 | 'كل فرحه اشتاقها من قبلك خيالي',
167 | 'التقاها في نور عنيك قلبي وفكري',
168 | 'يا حياة قلبي يا أغلى من حياتي',
169 | 'ليه ما قابلتش هواك يا حبيبي بدري',
170 | 'اللي شفته قبل ما تشوفك عنيه',
171 | 'عمر ضايع يحسبوه إزاي عليّ',
172 | 'انت عمري اللي ابتدي بنورك صباحه',
173 | 'الليالي الحلوه والشوق والمحبة',
174 | 'من زمان والقلب شايلهم عشانك',
175 | 'دوق معايا الحب دوق حبه بحبه',
176 | 'من حنان قلبي اللي طال شوقه لحنانك',
177 | 'هات عنيك تسرح في دنيتهم عنيه',
178 | 'هات ايديك ترتاح للمستهم ايديه',
179 | ]
180 |
181 | self.assert_substring_search(
182 | strings=strings,
183 | substring='زمان',
184 | expected_results=[
185 | 'من زمان والقلب شايلهم عشانك',
186 | ],
187 | )
188 |
189 | self.assert_substring_search(
190 | strings=strings,
191 | substring='في',
192 | expected_results=[
193 | 'هات عنيك تسرح في دنيتهم عنيه',
194 | 'التقاها في نور عنيك قلبي وفكري',
195 | 'ولا داق في الدنيا غير طعم الجراح',
196 | ],
197 | )
198 |
199 | self.assert_substring_search(
200 | strings=strings,
201 | substring='حنان',
202 | expected_results=[
203 | 'من حنان قلبي اللي طال شوقه لحنانك',
204 | ],
205 | )
206 |
207 | self.assert_substring_search(
208 | strings=strings,
209 | substring='none',
210 | expected_results=[],
211 | )
212 |
213 | def test_multiple_words_string(
214 | self,
215 | ):
216 | strings = [
217 | 'some short string',
218 | 'another but now a longer string',
219 | 'more text to add',
220 | ]
221 |
222 | self.assert_substring_search(
223 | strings=strings,
224 | substring='short',
225 | expected_results=[
226 | 'some short string',
227 | ],
228 | )
229 |
230 | def test_short_string(
231 | self,
232 | ):
233 | strings = [
234 | 'ab',
235 | ]
236 | self.assert_substring_search(
237 | strings=strings,
238 | substring='a',
239 | expected_results=[
240 | 'ab',
241 | ],
242 | )
243 |
244 | def test_multiple_strings(
245 | self,
246 | ):
247 | try:
248 | with tempfile.TemporaryDirectory() as tmp_directory:
249 | index_file_path = f'{tmp_directory}/output.idx'
250 | writer = pysubstringsearch.Writer(
251 | index_file_path=index_file_path,
252 | )
253 | for string in [
254 | 'one',
255 | 'two',
256 | 'three',
257 | 'four',
258 | 'five',
259 | 'six',
260 | 'seven',
261 | 'eight',
262 | 'nine',
263 | 'ten',
264 | 'tenten',
265 | ]:
266 | writer.add_entry(
267 | text=string,
268 | )
269 | writer.finalize()
270 |
271 | reader = pysubstringsearch.Reader(
272 | index_file_path=index_file_path,
273 | )
274 | self.assertCountEqual(
275 | first=reader.search_multiple(
276 | substrings=[
277 | 'ee',
278 | 'ven',
279 | ],
280 | ),
281 | second=[
282 | 'three',
283 | 'seven',
284 | ],
285 | )
286 |
287 | try:
288 | os.unlink(
289 | path=index_file_path,
290 | )
291 | except Exception:
292 | pass
293 | except PermissionError:
294 | pass
295 |
--------------------------------------------------------------------------------