├── .github ├── dependabot.yml └── workflows │ ├── build-and-publish.yml │ └── tests.yml ├── .gitignore ├── .gitmodules ├── .well-known └── funding-manifest-urls ├── AUTHORS.rst ├── CHANGES.rst ├── LICENSE ├── MANIFEST.in ├── README.rst ├── bench ├── __init__.py ├── speed.py └── words100k.txt.gz ├── docs ├── Makefile ├── api.rst ├── benchmarks.rst ├── changelog.rst ├── conf.py ├── contributing.rst ├── index.rst ├── make.bat └── tutorial.rst ├── setup.py ├── src ├── agent.cpp ├── agent.pxd ├── base.cpp ├── base.pxd ├── iostream.cpp ├── iostream.pxd ├── key.cpp ├── key.pxd ├── keyset.cpp ├── keyset.pxd ├── marisa_trie.cpp ├── marisa_trie.pyx ├── query.cpp ├── query.pxd ├── std_iostream.cpp ├── std_iostream.pxd ├── trie.cpp └── trie.pxd ├── tests ├── __init__.py ├── test_binary_trie.py ├── test_bytes_trie.py ├── test_packaging.py ├── test_record_trie.py ├── test_trie.py └── utils.py └── update_cpp.sh /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # GitHub Actions 4 | - package-ecosystem: github-actions 5 | directory: / 6 | schedule: 7 | interval: daily 8 | labels: 9 | - dependencies 10 | - QA/CI 11 | -------------------------------------------------------------------------------- /.github/workflows/build-and-publish.yml: -------------------------------------------------------------------------------- 1 | name: Build & Publish 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - ".github/workflows/build-and-publish.yml" 7 | - "setup.*" 8 | 9 | workflow_dispatch: 10 | inputs: 11 | branch: 12 | description: "The branch, tag or SHA to release from" 13 | required: true 14 | default: "master" 15 | 16 | jobs: 17 | os-built-distributions: 18 | name: Build on ${{ matrix.os }} 19 | runs-on: ${{ matrix.os }} 20 | strategy: 21 | matrix: 22 | os: [ubuntu-latest, windows-latest, macos-latest] 23 | steps: 24 | - name: Checkout 25 | uses: actions/checkout@v4 26 | with: 27 | ref: ${{ github.event.inputs.branch }} 28 | submodules: true 29 | 30 | - name: Set up QEMU 31 | if: runner.os == 'Linux' 32 | uses: docker/setup-qemu-action@v3 33 | with: 34 | platforms: all 35 | 36 | - name: Install Python 37 | uses: actions/setup-python@v5 38 | with: 39 | python-version: "3.12" 40 | - name: Install build dependencies 41 | run: python -m pip install --upgrade cibuildwheel 42 | - name: Build wheels 43 | run: python -m cibuildwheel 44 | env: 45 | CIBW_SKIP: "pp*" # skip PyPy releases 46 | CIBW_ARCHS_MACOS: "x86_64 universal2 arm64" 47 | CIBW_ARCHS_LINUX: "auto aarch64" 48 | - uses: actions/upload-artifact@v4 49 | with: 50 | name: python-package-distributions-${{ matrix.os }} 51 | path: ./wheelhouse/*.whl 52 | 53 | source-distribution: 54 | name: Build source distribution 55 | runs-on: ubuntu-latest 56 | steps: 57 | - name: Checkout 58 | uses: actions/checkout@v4 59 | with: 60 | ref: ${{ github.event.inputs.branch }} 61 | submodules: true 62 | - name: Install Python 63 | uses: actions/setup-python@v5 64 | with: 65 | python-version: "3.12" 66 | - name: Build source distribution 67 | run: | 68 | # FIXME: setuptools was removed starting with Python 3.12 69 | pip install --upgrade --force setuptools 70 | python setup.py sdist 71 | - name: Store the source distribution 72 | uses: actions/upload-artifact@v4 73 | with: 74 | name: python-package-distributions-source 75 | path: dist 76 | retention-days: 4 77 | 78 | publish: 79 | needs: 80 | - os-built-distributions 81 | - source-distribution 82 | runs-on: ubuntu-latest 83 | steps: 84 | - name: 
Download all the dists 85 | uses: actions/download-artifact@v4 86 | with: 87 | pattern: python-package-distributions-* 88 | merge-multiple: true 89 | path: dist/ 90 | - name: What will we publish? 91 | run: ls -l dist 92 | - name: Publish 93 | if: github.event.inputs.branch != '' 94 | uses: pypa/gh-action-pypi-publish@release/v1 95 | with: 96 | user: __token__ 97 | password: ${{ secrets.PYPI_API_TOKEN }} 98 | skip_existing: true 99 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: pull_request 4 | 5 | jobs: 6 | tests: 7 | name: Run tests for ${{ matrix.os }} for ${{ matrix.python }} 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | os: [ubuntu-latest, windows-latest, macos-latest] 13 | python: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14-dev"] 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v4 17 | with: 18 | ref: ${{ github.event.inputs.branch }} 19 | submodules: true 20 | - name: Use Python ${{ matrix.python }} 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python }} 24 | - name: Install test dependencies 25 | run: python -m pip install ".[test]" 26 | - name: Test 27 | run: python -m pytest 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | src/*.html 3 | docs/_build 4 | venv* 5 | *.py[cod] 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Packages 11 | *.egg 12 | *.egg-info 13 | dist 14 | build 15 | eggs 16 | .eggs 17 | parts 18 | bin 19 | var 20 | sdist 21 | develop-eggs 22 | .installed.cfg 23 | lib64 24 | __pycache__ 25 | 26 | # Installer logs 27 | pip-log.txt 28 | 29 | # Unit test / coverage reports 30 | .coverage 31 | .hypothesis 32 | .cache 33 | .tox 34 | nosetests.xml 35 | 36 | # IDE 37 | .idea 38 | .vscode 39 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "marisa-trie"] 2 | path = marisa-trie 3 | url = https://github.com/s-yata/marisa-trie.git 4 | -------------------------------------------------------------------------------- /.well-known/funding-manifest-urls: -------------------------------------------------------------------------------- 1 | https://www.tiger-222.fr/funding.json 2 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Authors and contributors, in no particular order: 2 | 3 | * Mikhail Korobov 4 | * `Matt Hickford `_ 5 | * Sergei Lebedev 6 | * Tomasz Melcer 7 | * `Mickaël Schoentgen ` 8 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | 2 | CHANGES 3 | ======= 4 | 5 | 1.3.0 (2025-xx-xx) 6 | ------------------ 7 | 8 | * Updated ``libmarisa-trie`` to the latest version (0.2.7) (#116). 9 | * Dropped Python 3.7 support (#112). 10 | * Added Python 3.13 support (#112). 11 | * Rebuild Cython wrapper with Cython 3.1.1 (#117). 
12 | 13 | 1.2.1 (2024-10-12) 14 | ------------------ 15 | 16 | * Publish Python 3.13 wheels (only CPython ones, PyPy ones are skipped until https://github.com/pypa/distutils/issues/283 is fixed). 17 | * Rebuild Cython wrapper with Cython 3.0.11. 18 | 19 | 1.2.0 (2024-06-05) 20 | ------------------ 21 | 22 | * Added Python 3.13 support (#105). 23 | * Rebuild Cython wrapper with Cython 3.0.10 (#105). 24 | 25 | 1.1.1 (2024-05-06) 26 | ------------------ 27 | 28 | * Publish Linux aarch64 wheels (#101). 29 | 30 | 1.1.0 (2023-10-06) 31 | ------------------ 32 | 33 | * Added Python 3.12 support. 34 | 35 | 1.0.0 (2023-09-03) 36 | ------------------ 37 | 38 | * Dropped Python 2.7, 3.4, 3.5, 3.6 support. 39 | * Added ``Trie.map()`` (#90). 40 | * Rebuilt Cython wrapper with Cython 3.0.2. 41 | * Fixed benchmark documentation typos (#89). 42 | 43 | 0.8.0 (2023-03-25) 44 | ------------------ 45 | 46 | * Add ``Trie.iter_prefixes_with_ids()`` method to return ``(prefix, id)`` pairs (#83). 47 | * Rebuild Cython wrapper with Cython 0.29.33 (#88). 48 | 49 | 0.7.8 (2022-10-25) 50 | ------------------ 51 | 52 | * Added Python 3.11 support. 53 | * Rebuild Cython wrapper with Cython 0.29.32. 54 | 55 | 0.7.7 (2021-08-04) 56 | ------------------ 57 | 58 | * Restored Python 2.7 support. 59 | * Fixed README image references not working on Windows. 60 | 61 | 0.7.6 (2021-07-28) 62 | ------------------ 63 | 64 | * Wheels are now published for all platforms. 65 | * Fixed ``ResourceWarning: unclosed file`` in ``setup.py``. 66 | * Run ``black`` on the entire source code. 67 | * Moved the QA/CI to GitHub. 68 | * Rebuild Cython wrapper with Cython 0.29.24. 69 | * Updated ``libmarisa-trie`` to the latest version (0.2.6). 70 | * Fixed failing tests and usage of deprecated methods. 71 | * Expanded supported Python version (2.7, 3.4 - 3.10). 72 | 73 | 0.7.5 (2018-04-10) 74 | ------------------ 75 | 76 | * Removed redundant ``DeprecationWarning`` messages in ``Trie.save`` and 77 | ``Trie.load``. 78 | * Dropped support for Python 2.6. 79 | * Rebuild Cython wrapper with Cython 0.28.1. 80 | 81 | 0.7.4 (2017-03-27) 82 | ------------------ 83 | 84 | * Fixed packaging issue, ``MANIFEST.in`` was not updated after ``libmarisa-trie`` 85 | became a submodule. 86 | 87 | 0.7.3 (2017-02-14) 88 | ------------------ 89 | 90 | * Added ``BinaryTrie`` for storing arbitrary sequences of bytes, e.g. IP 91 | addresses (thanks Tomasz Melcer); 92 | * Deprecated ``Trie.has_keys_with_prefix`` which can be trivially implemented in 93 | terms of ``Trie.iterkeys``; 94 | * Deprecated ``Trie.read`` and ``Trie.write`` which onlywork for "real" files 95 | and duplicate the functionality of ``load`` and ``save``. See issue #31 on 96 | GitHub; 97 | * Updated ``libmarisa-trie`` to the latest version. Yay, 64-bit Windows support. 98 | * Rebuilt Cython wrapper with Cython 0.25.2. 99 | 100 | 0.7.2 (2015-04-21) 101 | ------------------ 102 | 103 | * packaging issue is fixed. 104 | 105 | 0.7.1 (2015-04-21) 106 | ------------------ 107 | 108 | * setup.py is switched to setuptools; 109 | * a tiny speedup; 110 | * wrapper is rebuilt with Cython 0.22. 111 | 112 | 0.7 (2014-12-15) 113 | ---------------- 114 | 115 | * ``trie1 == trie2`` and ``trie1 != trie2`` now work (thanks Sergei Lebedev); 116 | * ``for key in trie:`` is fixed (thanks Sergei Lebedev); 117 | * wrapper is rebuilt with Cython 0.21.1 (thanks Sergei Lebedev); 118 | * https://bitbucket.org/kmike/marisa-trie repo is no longer supported. 
119 | 120 | 0.6 (2014-02-22) 121 | ---------------- 122 | 123 | * New ``Trie`` methods: ``__getitem__``, ``get``, ``items``, ``iteritems``. 124 | ``trie[u'key']`` is now the same as ``trie.key_id(u'key')``. 125 | * small optimization for ``BytesTrie.get``. 126 | * wrapper is rebuilt with Cython 0.20.1. 127 | 128 | 0.5.3 (2014-02-08) 129 | ------------------ 130 | 131 | * small ``Trie.restore_key`` optimization (it should work 5-15% faster) 132 | 133 | 0.5.2 (2014-02-08) 134 | ------------------ 135 | 136 | * fix ``Trie.restore_key`` method - it was reading past declared string length; 137 | * rebuild wrapper with Cython 0.20. 138 | 139 | 0.5.1 (2013-10-03) 140 | ------------------ 141 | 142 | * ``has_keys_with_prefix(prefix)`` method (thanks 143 | `Matt Hickford `_) 144 | 145 | 0.5 (2013-05-07) 146 | ---------------- 147 | 148 | * ``BytesTrie.iterkeys``, ``BytesTrie.iteritems``, 149 | ``RecordTrie.iterkeys`` and ``RecordTrie.iteritems`` methods; 150 | * wrapper is rebuilt with Cython 0.19; 151 | * ``value_separator`` parameter for ``BytesTrie`` and ``RecordTrie``. 152 | 153 | 0.4 (2013-02-28) 154 | ---------------- 155 | 156 | * improved trie building: ``weights`` optional parameter; 157 | * improved trie building: unnecessary input sorting is removed; 158 | * wrapper is rebuilt with Cython 0.18; 159 | * bundled marisa-trie C++ library is updated to svn r133. 160 | 161 | 0.3.8 (2013-01-03) 162 | ------------------ 163 | 164 | * Rebuild wrapper with Cython pre-0.18; 165 | * update benchmarks. 166 | 167 | 0.3.7 (2012-09-21) 168 | ------------------ 169 | 170 | * Update bundled marisa-trie C++ library (this may fix more mingw issues); 171 | * Python 3.3 support is back. 172 | 173 | 0.3.6 (2012-09-05) 174 | ------------------ 175 | 176 | * much faster (3x-7x) ``.items()`` and ``.keys()`` methods for all tries; 177 | faster (up to 3x) ``.prefixes()`` method for ``Trie``. 178 | 179 | 0.3.5 (2012-08-30) 180 | ------------------ 181 | 182 | * Pickling of RecordTrie is fixed (thanks lazarou for the report); 183 | * error messages should become more useful. 184 | 185 | 0.3.4 (2012-08-29) 186 | ------------------ 187 | 188 | * Issues with mingw32 should be resolved (thanks Susumu Yata). 189 | 190 | 0.3.3 (2012-08-27) 191 | ------------------ 192 | 193 | * ``.get(key, default=None)`` method for ``BytesTrie`` and ``RecordTrie``; 194 | * small README improvements. 195 | 196 | 0.3.2 (2012-08-26) 197 | ------------------ 198 | 199 | * Small code cleanup; 200 | * ``load``, ``read`` and ``mmap`` methods returns 'self'; 201 | * I can't run tests (via tox) under Python 3.3 so it is 202 | removed from supported versions for now. 203 | 204 | 0.3.1 (2012-08-23) 205 | ------------------ 206 | 207 | * ``.prefixes()`` support for RecordTrie and BytesTrie. 208 | 209 | 0.3 (2012-08-23) 210 | ---------------- 211 | 212 | * RecordTrie and BytesTrie are introduced; 213 | * IntTrie class is removed (probably temporary?); 214 | * dumps/loads methods are renamed to tobytes/frombytes; 215 | * benchmark & tests improvements; 216 | * support for MARISA-trie config options is added. 217 | 218 | 0.2 (2012-08-19) 219 | ------------------ 220 | 221 | * Pickling/unpickling support; 222 | * dumps/loads methods; 223 | * python 3.3 workaround; 224 | * improved tests; 225 | * benchmarks. 226 | 227 | 0.1 (2012-08-17) 228 | ---------------- 229 | 230 | Initial release. 
231 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) marisa-trie authors and contributors, 2012-2025 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is furnished 8 | to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 14 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR 15 | A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 16 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 17 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 18 | OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include CHANGES.rst 3 | include LICENSE 4 | include update_cpp.sh 5 | 6 | recursive-include src *.cpp *.pxd *.pyx 7 | recursive-include marisa-trie/lib/marisa *.h *.cc 8 | recursive-include marisa-trie/include/marisa *.h 9 | recursive-include tests *.py 10 | recursive-include bench *.py 11 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | MARISA Trie 2 | =========== 3 | 4 | |PyPI Version| 5 | |PyPI Status| 6 | |PyPI Python Versions| 7 | |Github Build Status| 8 | 9 | .. tip:: 10 | 11 | Become **my boss** to help me work on this awesome software, and make the world better: 12 | 13 | |Patreon| 14 | 15 | Static memory-efficient Trie-like structures for Python (3.8+) 16 | based on `marisa-trie`_ C++ library. 17 | 18 | String data in a MARISA-trie may take up to 50x-100x less memory than 19 | in a standard Python dict; the raw lookup speed is comparable; trie also 20 | provides fast advanced methods like prefix search. 21 | 22 | .. note:: 23 | 24 | There are official SWIG-based Python bindings included 25 | in C++ library distribution; this package provides alternative 26 | Cython-based pip-installable Python bindings. 27 | 28 | .. _marisa-trie: https://github.com/s-yata/marisa-trie 29 | 30 | Installation 31 | ============ 32 | 33 | :: 34 | 35 | python -m pip install -U marisa-trie 36 | 37 | Usage 38 | ===== 39 | 40 | See `tutorial`_ and `API`_ for details. 41 | 42 | .. _tutorial: https://marisa-trie.readthedocs.io/en/latest/tutorial.html 43 | .. 
_API: https://marisa-trie.readthedocs.io/en/latest/api.html 44 | 45 | Current limitations 46 | =================== 47 | 48 | * The library is not tested with mingw32 compiler; 49 | * ``.prefixes()`` method of ``BytesTrie`` and ``RecordTrie`` is quite slow 50 | and doesn't have iterator counterpart; 51 | * ``read()`` and ``write()`` methods don't work with file-like objects 52 | (they work only with real files; pickling works fine for file-like objects); 53 | * there are ``keys()`` and ``items()`` methods but no ``values()`` method. 54 | 55 | License 56 | ======= 57 | 58 | Wrapper code is licensed under MIT License. 59 | 60 | Bundled `marisa-trie`_ C++ library is dual-licensed under 61 | LGPL and BSD 2-clause license. 62 | 63 | .. |PyPI Version| image:: https://img.shields.io/pypi/v/marisa-trie.svg 64 | :target: https://pypi.python.org/pypi/marisa-trie/ 65 | .. |PyPI Status| image:: https://img.shields.io/pypi/status/marisa-trie.svg 66 | :target: https://pypi.python.org/pypi/marisa-trie/ 67 | .. |PyPI Python Versions| image:: https://img.shields.io/pypi/pyversions/marisa-trie.svg 68 | :target: https://pypi.python.org/pypi/marisa-trie/ 69 | .. |Github Build Status| image:: https://github.com/pytries/marisa-trie/actions/workflows/tests.yml/badge.svg 70 | :target: https://github.com/pytries/marisa-trie/actions/workflows/tests.yml 71 | .. |Patreon| image:: https://img.shields.io/badge/Patreon-F96854?style=for-the-badge&logo=patreon&logoColor=white 72 | :target: https://www.patreon.com/mschoentgen 73 | -------------------------------------------------------------------------------- /bench/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/marisa-trie/97cfda688aee37565f6d4a414cc66dd5384cb4ad/bench/__init__.py -------------------------------------------------------------------------------- /bench/speed.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | import random 4 | import string 5 | import struct 6 | import timeit 7 | 8 | import marisa_trie 9 | 10 | 11 | def words100k(): 12 | zip_name = os.path.join( 13 | os.path.abspath(os.path.dirname(__file__)), "words100k.txt.gz" 14 | ) 15 | 16 | return list(map(str.rstrip, gzip.open(zip_name, "rt"))) 17 | 18 | 19 | def random_words(num): 20 | russian = "абвгдеёжзиклмнопрстуфхцчъыьэюя" 21 | alphabet = f"{russian}{string.ascii_letters}" 22 | return [ 23 | "".join(random.choice(alphabet) for _ in range(random.randint(1, 15))) 24 | for _ in range(num) 25 | ] 26 | 27 | 28 | def truncated_words(words): 29 | return [word[:3] for word in words] 30 | 31 | 32 | def prefixes1k(words, prefix_len): 33 | words = [w for w in words if len(w) >= prefix_len] 34 | every_nth = len(words) // 1000 35 | _words = [w[:prefix_len] for w in words[::every_nth]] 36 | return _words[:1000] 37 | 38 | 39 | WORDS100k = words100k() 40 | MIXED_WORDS100k = truncated_words(WORDS100k) 41 | NON_WORDS100k = random_words(100000) 42 | PREFIXES_3_1k = prefixes1k(WORDS100k, 3) 43 | PREFIXES_5_1k = prefixes1k(WORDS100k, 5) 44 | PREFIXES_8_1k = prefixes1k(WORDS100k, 8) 45 | PREFIXES_15_1k = prefixes1k(WORDS100k, 15) 46 | 47 | 48 | def format_result(key, value, text_width): 49 | key = key.ljust(text_width) 50 | print(f" {key} {value}") 51 | 52 | 53 | def bench( 54 | name, timer, descr="M ops/sec", op_count=0.1, repeats=3, runs=5, text_width=33 55 | ): 56 | try: 57 | times = [] 58 | for x in range(runs): 59 | times.append(timer.timeit(repeats)) 60 | 61 
| def op_time(time):
62 | return op_count * repeats / time
63 |
64 | val = f"{op_time(min(times)):0.3f}{descr}"
65 | format_result(name, val, text_width)
66 | except (AttributeError, TypeError):
67 | format_result(name, "not supported", text_width)
68 |
69 |
70 | def create_trie():
71 | words = WORDS100k
72 | return marisa_trie.Trie(words)
73 |
74 |
75 | def create_bytes_trie():
76 | words = WORDS100k
77 | values = (struct.pack("<H", len(word)) for word in words)
78 | return marisa_trie.BytesTrie(zip(words, values))
-------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = _build
9 |
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 |
15 | # Internal variables.
16 | PAPEROPT_a4 = -D latex_paper_size=a4
17 | PAPEROPT_letter = -D latex_paper_size=letter
18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
19 | # the i18n builder cannot share the environment and doctrees with the others
20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
21 |
22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
23 |
24 | help:
25 | @echo "Please use \`make <target>' where <target> is one of"
26 | @echo " html to make standalone HTML files"
27 | @echo " dirhtml to make HTML files named index.html in directories"
28 | @echo " singlehtml to make a single large HTML file"
29 | @echo " pickle to make pickle files"
30 | @echo " json to make JSON files"
31 | @echo " htmlhelp to make HTML files and a HTML help project"
32 | @echo " qthelp to make HTML files and a qthelp project"
33 | @echo " applehelp to make an Apple Help Book"
34 | @echo " devhelp to make HTML files and a Devhelp project"
35 | @echo " epub to make an epub"
36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
37 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
39 | @echo " text to make text files"
40 | @echo " man to make manual pages"
41 | @echo " texinfo to make Texinfo files"
42 | @echo " info to make Texinfo files and run them through makeinfo"
43 | @echo " gettext to make PO message catalogs"
44 | @echo " changes to make an overview of all changed/added/deprecated items"
45 | @echo " xml to make Docutils-native XML files"
46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes"
47 | @echo " linkcheck to check all external links for integrity"
48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
49 | @echo " coverage to run coverage check of the documentation (if enabled)"
50 |
51 | clean:
52 | rm -rf $(BUILDDIR)/*
53 |
54 | html:
55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
56 | @echo
57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
58 |
59 | dirhtml:
60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
61 | @echo
62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
63 |
64 | singlehtml:
65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
66 | @echo
67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
68 |
69 | pickle:
70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
71 | @echo
72 | @echo "Build finished; now you can process the pickle files."
73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/marisa-trie.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/marisa-trie.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/marisa-trie" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/marisa-trie" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
162 |
163 | changes:
164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
165 | @echo
166 | @echo "The overview file is in $(BUILDDIR)/changes."
167 |
168 | linkcheck:
169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
170 | @echo
171 | @echo "Link check complete; look for any errors in the above output " \
172 | "or in $(BUILDDIR)/linkcheck/output.txt."
173 |
174 | doctest:
175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
176 | @echo "Testing of doctests in the sources finished, look at the " \
177 | "results in $(BUILDDIR)/doctest/output.txt."
178 |
179 | coverage:
180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
181 | @echo "Testing of coverage in the sources finished, look at the " \
182 | "results in $(BUILDDIR)/coverage/python.txt."
183 |
184 | xml:
185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
186 | @echo
187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
188 |
189 | pseudoxml:
190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
191 | @echo
192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
193 | -------------------------------------------------------------------------------- /docs/api.rst: --------------------------------------------------------------------------------
1 | .. _api:
2 |
3 |
4 | API reference
5 | =============
6 |
7 | BinaryTrie
8 | ----------
9 |
10 | .. autoclass:: marisa_trie.BinaryTrie
11 | :members:
12 | :inherited-members:
13 |
14 | Trie
15 | ----
16 |
17 | .. autoclass:: marisa_trie.Trie
18 | :members:
19 | :inherited-members:
20 |
21 | BytesTrie
22 | ---------
23 |
24 | .. autoclass:: marisa_trie.BytesTrie
25 | :members:
26 |
27 |
28 | RecordTrie
29 | ----------
30 |
31 | .. autoclass:: marisa_trie.RecordTrie
32 | :members:
33 | -------------------------------------------------------------------------------- /docs/benchmarks.rst: --------------------------------------------------------------------------------
1 | Benchmarks
2 | ==========
3 |
4 | My quick tests show that memory usage is quite decent.
5 | For a list of 3000000 (3 million) Russian words, memory consumption
6 | with different data structures (under Python 2.7):
7 |
8 | * dict(unicode words -> word lengths): about 600M
9 | * list(unicode words) : about 300M
10 | * BaseTrie from datrie_ library: about 70M
11 | * ``marisa_trie.RecordTrie`` : 11M
12 | * ``marisa_trie.Trie``: 7M
13 |
14 |
15 | .. note::
16 |
17 | Lengths of words were stored as values in ``datrie.BaseTrie``
18 | and ``marisa_trie.RecordTrie``. ``RecordTrie`` compresses
19 | similar values and the key compression is better so it uses
20 | much less memory than ``datrie.BaseTrie``.
21 |
22 | ``marisa_trie.Trie`` provides auto-assigned IDs. It is not possible
23 | to store arbitrary values in ``marisa_trie.Trie`` so it uses less
24 | memory than ``RecordTrie``.
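
For illustration, here is a minimal sketch of the two setups compared above
(the word list and the ``"<H"`` struct format are just examples)::

    import marisa_trie

    words = ["foo", "bar", "foobar"]  # stands in for the 3M-word list

    # Trie stores keys only; payloads live in a plain list indexed by key ID.
    trie = marisa_trie.Trie(words)
    lengths = [0] * len(words)
    for word in words:
        lengths[trie[word]] = len(word)

    # RecordTrie stores the payload (here one unsigned short per key) inside the trie.
    record_trie = marisa_trie.RecordTrie("<H", ((w, (len(w),)) for w in words))
    assert record_trie["foobar"] == [(6,)]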
25 | 26 | Benchmark results (100k unicode words, integer values (lengths of the words), 27 | Python 3.2, Macbook Air i5 1.8 Ghz):: 28 | 29 | dict building 2.919M words/sec 30 | Trie building 0.394M words/sec 31 | BytesTrie building 0.355M words/sec 32 | RecordTrie building 0.354M words/sec 33 | 34 | dict __getitem__ (hits) 8.239M ops/sec 35 | Trie __getitem__ (hits) not supported 36 | BytesTrie __getitem__ (hits) 0.498M ops/sec 37 | RecordTrie __getitem__ (hits) 0.404M ops/sec 38 | 39 | dict get() (hits) 4.410M ops/sec 40 | Trie get() (hits) not supported 41 | BytesTrie get() (hits) 0.458M ops/sec 42 | RecordTrie get() (hits) 0.364M ops/sec 43 | dict get() (misses) 4.869M ops/sec 44 | Trie get() (misses) not supported 45 | BytesTrie get() (misses) 0.849M ops/sec 46 | RecordTrie get() (misses) 0.816M ops/sec 47 | 48 | dict __contains__ (hits) 8.053M ops/sec 49 | Trie __contains__ (hits) 1.018M ops/sec 50 | BytesTrie __contains__ (hits) 0.605M ops/sec 51 | RecordTrie __contains__ (hits) 0.618M ops/sec 52 | dict __contains__ (misses) 6.489M ops/sec 53 | Trie __contains__ (misses) 2.047M ops/sec 54 | BytesTrie __contains__ (misses) 1.079M ops/sec 55 | RecordTrie __contains__ (misses) 1.123M ops/sec 56 | 57 | dict items() 57.248 ops/sec 58 | Trie items() not supported 59 | BytesTrie items() 11.691 ops/sec 60 | RecordTrie items() 8.369 ops/sec 61 | 62 | dict keys() 217.920 ops/sec 63 | Trie keys() 19.589 ops/sec 64 | BytesTrie keys() 14.849 ops/sec 65 | RecordTrie keys() 15.369 ops/sec 66 | 67 | Trie.prefixes (hits) 0.594M ops/sec 68 | Trie.prefixes (mixed) 1.874M ops/sec 69 | Trie.prefixes (misses) 1.447M ops/sec 70 | RecordTrie.prefixes (hits) 0.103M ops/sec 71 | RecordTrie.prefixes (mixed) 0.458M ops/sec 72 | RecordTrie.prefixes (misses) 0.164M ops/sec 73 | Trie.iter_prefixes (hits) 0.588M ops/sec 74 | Trie.iter_prefixes (mixed) 1.470M ops/sec 75 | Trie.iter_prefixes (misses) 1.170M ops/sec 76 | 77 | Trie.keys(prefix="xxx"), avg_len(res)==415 5.044K ops/sec 78 | Trie.keys(prefix="xxxxx"), avg_len(res)==17 89.363K ops/sec 79 | Trie.keys(prefix="xxxxxxxx"), avg_len(res)==3 258.732K ops/sec 80 | Trie.keys(prefix="xxxxx..xx"), avg_len(res)==1.4 293.199K ops/sec 81 | Trie.keys(prefix="xxx"), NON_EXISTING 1169.524K ops/sec 82 | 83 | RecordTrie.keys(prefix="xxx"), avg_len(res)==415 3.836K ops/sec 84 | RecordTrie.keys(prefix="xxxxx"), avg_len(res)==17 73.591K ops/sec 85 | RecordTrie.keys(prefix="xxxxxxxx"), avg_len(res)==3 229.515K ops/sec 86 | RecordTrie.keys(prefix="xxxxx..xx"), avg_len(res)==1.4 269.228K ops/sec 87 | RecordTrie.keys(prefix="xxx"), NON_EXISTING 1071.433K ops/sec 88 | 89 | 90 | Tries from ``marisa_trie`` are static and uses less memory, tries from 91 | `datrie`_ are faster and can be updated. 92 | 93 | You may also give DAWG_ a try - it is usually faster than 94 | ``marisa-trie`` and sometimes can use less memory (depending on data). 95 | 96 | Please take this benchmark results with a grain of salt; this 97 | is a very simple benchmark on a single data set. 98 | 99 | .. _datrie: https://github.com/kmike/datrie 100 | .. _DAWG: https://github.com/kmike/DAWG 101 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../CHANGES.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # marisa-trie documentation build configuration file, created by 5 | # sphinx-quickstart on Thu Mar 24 00:18:01 2016. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | sys.path.insert(0, os.path.abspath("..")) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | "sphinx.ext.autodoc", 34 | "sphinx.ext.autosummary", 35 | "sphinx.ext.intersphinx", 36 | ] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ["_templates"] 40 | 41 | # The suffix(es) of source filenames. 42 | # You can specify multiple suffix as a list of string: 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = ".rst" 45 | 46 | # The encoding of source files. 47 | # source_encoding = 'utf-8-sig' 48 | 49 | # The master toctree document. 50 | master_doc = "index" 51 | 52 | # General information about the project. 53 | project = "marisa-trie" 54 | copyright = "2016-2025, Mikhail Korobov & contributors" 55 | author = "Mikhail Korobov & contributors" 56 | 57 | # The version info for the project you're documenting, acts as replacement for 58 | # |version| and |release|, also used in various other places throughout the 59 | # built documents. 60 | # 61 | # The short X.Y version. 62 | version = "1.2.1" 63 | # The full version, including alpha/beta/rc tags. 64 | release = version 65 | 66 | # The language for content autogenerated by Sphinx. Refer to documentation 67 | # for a list of supported languages. 68 | # 69 | # This is also used if you do content translation via gettext catalogs. 70 | # Usually you set "language" from the command line for these cases. 71 | language = None 72 | 73 | # List of patterns, relative to source directory, that match files and 74 | # directories to ignore when looking for source files. 75 | exclude_patterns = ["_build"] 76 | 77 | # The name of the Pygments (syntax highlighting) style to use. 78 | pygments_style = "sphinx" 79 | 80 | # If true, `todo` and `todoList` produce output, else they produce nothing. 81 | todo_include_todos = False 82 | 83 | # Example configuration for intersphinx: refer to the Python standard library. 
84 | intersphinx_mapping = {"https://docs.python.org/3/": None} 85 | 86 | # -- Options for HTML output ---------------------------------------------- 87 | 88 | ## Read the docs style: 89 | try: 90 | import sphinx_rtd_theme 91 | except ImportError: 92 | html_theme = "classic" 93 | else: 94 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 95 | html_theme = "sphinx_rtd_theme" 96 | 97 | # Add any paths that contain custom static files (such as style sheets) here, 98 | # relative to this directory. They are copied after the builtin static files, 99 | # so a file named "default.css" will overwrite the builtin "default.css". 100 | html_static_path = [] 101 | 102 | # Output file base name for HTML help builder. 103 | htmlhelp_basename = "marisa-triedoc" 104 | 105 | # -- Options for LaTeX output --------------------------------------------- 106 | 107 | latex_elements = { 108 | # The paper size ('letterpaper' or 'a4paper'). 109 | #'papersize': 'letterpaper', 110 | # The font size ('10pt', '11pt' or '12pt'). 111 | #'pointsize': '10pt', 112 | # Additional stuff for the LaTeX preamble. 113 | #'preamble': '', 114 | # Latex figure (float) alignment 115 | #'figure_align': 'htbp', 116 | } 117 | 118 | # Grouping the document tree into LaTeX files. List of tuples 119 | # (source start file, target name, title, 120 | # author, documentclass [howto, manual, or own class]). 121 | latex_documents = [ 122 | ( 123 | master_doc, 124 | "marisa-trie.tex", 125 | "marisa-trie Documentation", 126 | author, 127 | "manual", 128 | ), 129 | ] 130 | 131 | 132 | # -- Options for manual page output --------------------------------------- 133 | 134 | # One entry per manual page. List of tuples 135 | # (source start file, name, description, authors, manual section). 136 | man_pages = [(master_doc, "marisa-trie", "marisa-trie Documentation", [author], 1)] 137 | 138 | 139 | # -- Options for Texinfo output ------------------------------------------- 140 | 141 | # Grouping the document tree into Texinfo files. List of tuples 142 | # (source start file, target name, title, author, 143 | # dir menu entry, description, category) 144 | texinfo_documents = [ 145 | ( 146 | master_doc, 147 | "marisa-trie", 148 | "marisa-trie Documentation", 149 | author, 150 | "marisa-trie", 151 | "One line description of project.", 152 | "Miscellaneous", 153 | ), 154 | ] 155 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | Contributions are welcome! Development happens at 5 | `GitHub `_. Feel free to submit 6 | ideas, bug reports and pull requests. 7 | 8 | If you found a bug in a C++ part please report it to the original 9 | `bug tracker `_. 10 | 11 | Navigating the source code 12 | -------------------------- 13 | 14 | There are 4 folders in repository: 15 | 16 | * ``bench`` -- benchmarks & benchmark data; 17 | * ``lib`` -- original unmodified `marisa-trie`_ C++ library which is a git 18 | submodule; if something is have to be fixed in this library 19 | consider fixing it in the original repo; 20 | * ``src`` -- wrapper code; ``src/marisa_trie.pyx`` is a wrapper implementation; 21 | ``src/*.pxd`` files are Cython headers for corresponding C++ headers; 22 | ``src/*.cpp`` files are the pre-built extension code and shouldn't be 23 | modified directly (they should be updated via ``update_cpp.sh`` script). 24 | * ``tests`` -- the test suite. 25 | 26 | .. 
_marisa-trie: https://github.com/s-yata/marisa-trie 27 | 28 | Running tests and benchmarks 29 | ---------------------------- 30 | 31 | Make sure ``pytest`` is installed and run 32 | 33 | :: 34 | 35 | $ python -m pytest 36 | 37 | from the source checkout. Tests should pass under Python 3.8 and newer. 38 | 39 | In order to run benchmarks, type 40 | 41 | :: 42 | 43 | $ python bench/speed.py 44 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | 3 | .. toctree:: 4 | :hidden: 5 | 6 | tutorial 7 | benchmarks 8 | api 9 | contributing 10 | changelog 11 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 2> nul 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 
67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\marisa-trie.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\marisa-trie.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 
185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /docs/tutorial.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial: 2 | 3 | Tutorial 4 | ======== 5 | 6 | Tries 7 | ----- 8 | 9 | There are several trie classes in this package: 10 | 11 | .. autosummary:: 12 | :nosignatures: 13 | 14 | marisa_trie.BinaryTrie 15 | marisa_trie.Trie 16 | marisa_trie.RecordTrie 17 | marisa_trie.BytesTrie 18 | 19 | marisa_trie.Trie 20 | ~~~~~~~~~~~~~~~~ 21 | 22 | Create a new trie from a list of keys:: 23 | 24 | >>> import marisa_trie 25 | >>> trie = marisa_trie.Trie(["key1", "key2", "key12"]) 26 | 27 | Check if a key is present:: 28 | 29 | >>> "key1" in trie 30 | True 31 | >>> "key20" in trie 32 | False 33 | 34 | Each key is assigned an unique ID from 0 to (n - 1), where n is the 35 | number of keys in a trie:: 36 | 37 | >>> trie["key2"] 38 | 1 39 | 40 | Note that you can't assign a value to a ``marisa_trie.Trie`` key, 41 | but can use the returned ID to store values in a separate data structure 42 | (e.g. in a Python list or NumPy array). 
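
For example, a minimal sketch of that pattern (the payloads here are arbitrary)::

    >>> keys = ["key1", "key2", "key12"]
    >>> trie = marisa_trie.Trie(keys)
    >>> payloads = [None] * len(keys)
    >>> for key in keys:
    ...     payloads[trie[key]] = key.upper()  # any value you need to attach
    >>> payloads[trie["key2"]]
    'KEY2'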
43 |
44 | An ID can be mapped back to the corresponding key::
45 |
46 | >>> trie.restore_key(1)
47 | "key2"
48 |
49 | Query a trie
50 |
51 | * Find all trie keys which are prefixes of a given key::
52 |
53 | >>> trie.prefixes("key12")
54 | ["key1", "key12"]
55 |
56 | * Find all trie keys which start with a given prefix::
57 |
58 | >> trie.keys("key1")
59 | ["key1", "key12"]
60 |
61 | * The latter is complemented by :meth:`~marisa_trie.Trie.items` which
62 | returns all matching ``(key, ID)`` pairs.
63 |
64 | All query methods have generator-based versions prefixed with ``iter``.
65 |
66 | .. note::
67 |
68 | If you're looking for a trie with bytes keys, check out
69 | :class:`~marisa_trie.BinaryTrie`.
70 |
71 |
72 | marisa_trie.RecordTrie
73 | ~~~~~~~~~~~~~~~~~~~~~~
74 |
75 | Create a new trie from a list of ``(key, data)`` pairs::
76 |
77 | >>> keys = ["foo", "bar", "foobar", "foo"]
78 | >>> values = [(1, 2), (2, 1), (3, 3), (2, 1)]
79 | >>> fmt = "<HH"
80 | >>> trie = marisa_trie.RecordTrie(fmt, zip(keys, values))
81 |
82 | Each data tuple is converted to bytes using :func:`struct.pack`. Take a
83 | look at the available `format strings <https://docs.python.org/3/library/struct.html#format-strings>`_.
84 |
85 | Check if a key is present::
86 |
87 | >>> "foo" in trie
88 | True
89 | >>> "spam" in trie
90 | False
91 |
92 | ``marisa_trie.RecordTrie`` allows duplicate keys. Therefore ``__getitem__`` and
93 | ``get`` return a list of values.
94 |
95 | >>> trie["bar"]
96 | [(2, 1)]
97 | >>> trie["foo"]
98 | [(1, 2), (2, 1)]
99 | >>> trie.get("bar", 123)
100 | [(2, 1)]
101 | >>> trie.get("BAAR", 123) # default value.
102 | 123
103 |
104 | Similarly, :meth:`~marisa_trie.RecordTrie.keys` and
105 | :meth:`~marisa_trie.RecordTrie.items` take into account key multiplicities::
106 |
107 | >> trie.keys("fo")
108 | ["foo", "foo", "foobar"]
109 | >> trie.items("fo")
110 | [("foo", (1, 2)), ("foo", (2, 1)), ("foobar", (3, 3))]
111 |
112 |
113 | marisa_trie.BytesTrie
114 | ~~~~~~~~~~~~~~~~~~~~~
115 |
116 | ``BytesTrie`` is similar to ``RecordTrie``, but the values are raw bytes,
117 | not tuples::
118 |
119 | >>> keys = ["foo", "bar", "foobar", "foo"]
120 | >>> values = [b'foo-value', b'bar-value', b'foobar-value', b'foo-value2']
121 | >>> trie = marisa_trie.BytesTrie(zip(keys, values))
122 | >>> trie["bar"]
123 | [b'bar-value']
124 |
125 |
126 | Persistence
127 | -----------
128 |
129 | Trie objects support saving/loading, pickling/unpickling and memory mapped I/O.
130 |
131 | Save a trie to a file::
132 |
133 | >>> trie.save('my_trie.marisa')
134 |
135 | Load a trie from a file::
136 |
137 | >>> trie2 = marisa_trie.Trie()
138 | >>> trie2.load('my_trie.marisa')
139 |
140 | .. note:: You may also build a trie using the ``marisa-build`` command-line
141 | utility (provided by the underlying C++ library; it should be
142 | downloaded and compiled separately) and then load the trie
143 | from the resulting file using ``load``.
144 |
145 | Trie objects are picklable::
146 |
147 | >>> import pickle
148 | >>> data = pickle.dumps(trie)
149 | >>> trie3 = pickle.loads(data)
150 |
151 |
152 | Memory mapped I/O
153 | -----------------
154 |
155 | It is possible to use a memory mapped file as the data source::
156 |
157 | >>> trie = marisa_trie.RecordTrie(fmt).mmap('my_record_trie.marisa')
158 |
159 | This way the whole dictionary won't be fully loaded into memory; memory
160 | mapped I/O is an easy way to share dictionary data among processes.
161 |
162 | .. warning::
163 |
164 | A memory mapped trie might cause lots of random disk accesses which
165 | considerably increase the search time.
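
For instance, each worker process can map the same trie file (a minimal
sketch; the file name and pool setup are illustrative, not part of the API)::

    import marisa_trie
    from multiprocessing import Pool

    _trie = None

    def init_worker():
        global _trie
        # Each worker process maps the same file; the pages are shared by the OS.
        _trie = marisa_trie.Trie().mmap("my_trie.marisa")

    def lookup(key):
        return key in _trie

    if __name__ == "__main__":
        marisa_trie.Trie(["key1", "key2", "key12"]).save("my_trie.marisa")
        with Pool(processes=2, initializer=init_worker) as pool:
            print(pool.map(lookup, ["key1", "key3"]))  # [True, False]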
166 |
167 |
168 | Storage options
169 | ---------------
170 |
171 | The `marisa-trie <https://github.com/s-yata/marisa-trie>`_ C++ library provides
172 | some configuration options for trie storage; see the "Enumeration Constants"
173 | section in the library
174 | `docs `_.
175 |
176 | These options are exposed as ``order``, ``num_tries``, ``cache_size``
177 | and ``binary`` keyword arguments for trie constructors.
178 |
179 | For example, set ``order`` to ``marisa_trie.LABEL_ORDER`` in order to
180 | make trie functions return results in alphabetical order::
181 |
182 | >>> trie = marisa_trie.RecordTrie(fmt, data, order=marisa_trie.LABEL_ORDER)
183 |
184 | Note that two tries constructed from identical data but with different ``order``
185 | arguments will compare unequal::
186 |
187 | >>> t1 = marisa_trie.Trie(order=marisa_trie.LABEL_ORDER)
188 | >>> t2 = marisa_trie.Trie(order=marisa_trie.WEIGHT_ORDER)
189 | >>> t1 == t2
190 | False
191 | -------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | """Static memory-efficient and fast Trie-like structures for Python."""
2 |
3 | import glob
4 | import itertools
5 | import os.path
6 |
7 | from setuptools import setup, Extension
8 |
9 |
10 | # Note: keep requirements here to ease distributions packaging
11 | tests_require = [
12 | "hypothesis",
13 | "pytest",
14 | "readme_renderer",
15 | ]
16 | install_requires = [
17 | "setuptools",
18 | ]
19 |
20 | MARISA_ROOT_DIR = "marisa-trie"
21 | MARISA_SOURCE_DIR = os.path.join(MARISA_ROOT_DIR, "lib")
22 | MARISA_INCLUDE_DIR = os.path.join(MARISA_ROOT_DIR, "include")
23 | MARISA_FILES = [
24 | "marisa/*.cc",
25 | "marisa/grimoire.cc",
26 | "marisa/grimoire/io/*.cc",
27 | "marisa/grimoire/trie/*.cc",
28 | "marisa/grimoire/vector/*.cc",
29 | ]
30 |
31 | MARISA_FILES[:] = itertools.chain(
32 | *(glob.glob(os.path.join(MARISA_SOURCE_DIR, path)) for path in MARISA_FILES)
33 | )
34 |
35 | DESCRIPTION = __doc__
36 | with open("README.rst", encoding="utf-8") as f1, open(
37 | "CHANGES.rst", encoding="utf-8"
38 | ) as f2:
39 | LONG_DESCRIPTION = f1.read() + f2.read()
40 | LICENSE = "MIT"
41 |
42 | CLASSIFIERS = [
43 | "Development Status :: 4 - Beta",
44 | "Intended Audience :: Developers",
45 | "Intended Audience :: Science/Research",
46 | "License :: OSI Approved :: MIT License",
47 | "Programming Language :: Cython",
48 | "Programming Language :: Python",
49 | "Programming Language :: Python :: 3",
50 | "Programming Language :: Python :: 3.8",
51 | "Programming Language :: Python :: 3.9",
52 | "Programming Language :: Python :: 3.10",
53 | "Programming Language :: Python :: 3.11",
54 | "Programming Language :: Python :: 3.12",
55 | "Programming Language :: Python :: 3.13",
56 | "Programming Language :: Python :: 3.14",
57 | "Programming Language :: Python :: Implementation :: CPython",
58 | "Topic :: Software Development :: Libraries :: Python Modules",
59 | "Topic :: Scientific/Engineering :: Information Analysis",
60 | "Topic :: Text Processing :: Linguistic",
61 | ]
62 |
63 | setup(
64 | name="marisa-trie",
65 | version="1.2.1",
66 | description=DESCRIPTION,
67 | long_description=LONG_DESCRIPTION,
68 | long_description_content_type="text/x-rst",
69 | author="Mikhail Korobov",
70 | author_email="kmike84@gmail.com",
71 | license=LICENSE,
72 | url="https://github.com/pytries/marisa-trie",
73 | classifiers=CLASSIFIERS,
74 | libraries=[
75 | (
76 | "libmarisa-trie",
77 | {
78 | "sources": MARISA_FILES,
79 | "include_dirs":
[MARISA_SOURCE_DIR, MARISA_INCLUDE_DIR], 80 | }, 81 | ) 82 | ], 83 | ext_modules=[ 84 | Extension( 85 | "marisa_trie", 86 | [ 87 | "src/agent.cpp", 88 | "src/base.cpp", 89 | "src/iostream.cpp", 90 | "src/key.cpp", 91 | "src/keyset.cpp", 92 | "src/marisa_trie.cpp", 93 | "src/query.cpp", 94 | "src/std_iostream.cpp", 95 | "src/trie.cpp", 96 | ], 97 | include_dirs=[MARISA_INCLUDE_DIR], 98 | ) 99 | ], 100 | python_requires=">=3.8", 101 | install_requires=install_requires, 102 | extras_require={ 103 | "test": tests_require, 104 | }, 105 | ) 106 | -------------------------------------------------------------------------------- /src/agent.pxd: -------------------------------------------------------------------------------- 1 | cimport query, key 2 | 3 | cdef extern from "<marisa/agent.h>" namespace "marisa" nogil: 4 | cdef cppclass Agent: 5 | Agent() except + 6 | 7 | query.Query &query() 8 | key.Key &key() 9 | 10 | void set_query(char *str) 11 | void set_query(char *ptr, int length) 12 | void set_query(int key_id) 13 | 14 | void set_key(char *str) 15 | void set_key(char *ptr, int length) 16 | void set_key(int id) 17 | 18 | void clear() 19 | 20 | void init_state() 21 | 22 | void swap(Agent &rhs) 23 | -------------------------------------------------------------------------------- /src/base.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "<marisa/base.h>": 2 | 3 | # A dictionary consists of 3 tries by default. Usually more tries make a 4 | # dictionary more space-efficient but slower. 5 | ctypedef enum marisa_num_tries: 6 | MARISA_MIN_NUM_TRIES 7 | MARISA_MAX_NUM_TRIES 8 | MARISA_DEFAULT_NUM_TRIES 9 | 10 | 11 | # This library uses a cache technique to accelerate search functions. The 12 | # following enumerated type marisa_cache_level gives a list of available cache 13 | # size options. A larger cache enables faster search but takes more space. 14 | ctypedef enum marisa_cache_level: 15 | MARISA_HUGE_CACHE 16 | MARISA_LARGE_CACHE 17 | MARISA_NORMAL_CACHE 18 | MARISA_SMALL_CACHE 19 | MARISA_TINY_CACHE 20 | MARISA_DEFAULT_CACHE 21 | 22 | # This library provides 2 kinds of TAIL implementations. 23 | ctypedef enum marisa_tail_mode: 24 | # MARISA_TEXT_TAIL merges last labels as zero-terminated strings. So, it is 25 | # available if and only if the last labels do not contain a NULL character. 26 | # If MARISA_TEXT_TAIL is specified and a NULL character exists in the last 27 | # labels, the setting is automatically switched to MARISA_BINARY_TAIL. 28 | MARISA_TEXT_TAIL 29 | 30 | # MARISA_BINARY_TAIL also merges last labels but as byte sequences. It uses 31 | # a bit vector to detect the end of a sequence, instead of NULL characters. 32 | # So, MARISA_BINARY_TAIL requires a larger space if the average length of 33 | # labels is greater than 8. 34 | MARISA_BINARY_TAIL 35 | 36 | MARISA_DEFAULT_TAIL 37 | 38 | # The arrangement of nodes affects the time cost of matching and the order of 39 | # predictive search. 40 | ctypedef enum marisa_node_order: 41 | # MARISA_LABEL_ORDER arranges nodes in ascending label order. 42 | # MARISA_LABEL_ORDER is useful if an application needs to predict keys in 43 | # label order. 44 | MARISA_LABEL_ORDER 45 | 46 | # MARISA_WEIGHT_ORDER arranges nodes in descending weight order. 47 | # MARISA_WEIGHT_ORDER is generally a better choice because it enables faster 48 | # matching.
49 | MARISA_WEIGHT_ORDER 50 | MARISA_DEFAULT_ORDER 51 | 52 | ctypedef enum marisa_config_mask: 53 | MARISA_NUM_TRIES_MASK 54 | MARISA_CACHE_LEVEL_MASK 55 | MARISA_TAIL_MODE_MASK 56 | MARISA_NODE_ORDER_MASK 57 | MARISA_CONFIG_MASK 58 | 59 | 60 | cdef extern from "" namespace "marisa": 61 | ctypedef marisa_cache_level CacheLevel 62 | ctypedef marisa_tail_mode TailMode 63 | ctypedef marisa_node_order NodeOrder 64 | -------------------------------------------------------------------------------- /src/iostream.pxd: -------------------------------------------------------------------------------- 1 | from std_iostream cimport istream, ostream 2 | from trie cimport Trie 3 | 4 | cdef extern from "" namespace "marisa" nogil: 5 | 6 | istream &read(istream &stream, Trie *trie) 7 | ostream &write(ostream &stream, Trie &trie) 8 | -------------------------------------------------------------------------------- /src/key.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "" namespace "marisa" nogil: 2 | 3 | cdef cppclass Key: 4 | Key() 5 | Key(Key &query) 6 | 7 | #Key &operator=(Key &query) 8 | 9 | char operator[](int i) 10 | 11 | void set_str(char *str) 12 | void set_str(char *ptr, int length) 13 | void set_id(int id) 14 | void set_weight(float weight) 15 | 16 | char *ptr() 17 | int length() 18 | int id() 19 | float weight() 20 | 21 | void clear() 22 | void swap(Key &rhs) 23 | -------------------------------------------------------------------------------- /src/keyset.pxd: -------------------------------------------------------------------------------- 1 | cimport key 2 | 3 | cdef extern from "" namespace "marisa" nogil: 4 | cdef cppclass Keyset: 5 | 6 | # cdef enum constants: 7 | # BASE_BLOCK_SIZE = 4096 8 | # EXTRA_BLOCK_SIZE = 1024 9 | # KEY_BLOCK_SIZE = 256 10 | 11 | Keyset() 12 | 13 | void push_back(key.Key &key) 14 | void push_back(key.Key &key, char end_marker) 15 | 16 | void push_back(char *str) 17 | void push_back(char *ptr, int length) 18 | void push_back(char *ptr, int length, float weight) 19 | 20 | key.Key &operator[](int i) 21 | 22 | int num_keys() 23 | bint empty() 24 | 25 | int size() 26 | int total_length() 27 | 28 | void reset() 29 | void clear() 30 | void swap(Keyset &rhs) 31 | -------------------------------------------------------------------------------- /src/marisa_trie.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=False, embedsignature=True 2 | 3 | from std_iostream cimport stringstream, istream, ostream 4 | from libc.string cimport strncmp 5 | cimport keyset 6 | cimport key 7 | cimport agent 8 | cimport trie 9 | cimport iostream 10 | cimport base 11 | 12 | from cpython.buffer cimport PyBUF_SIMPLE, Py_buffer, PyObject_CheckBuffer, PyObject_GetBuffer, PyBuffer_Release 13 | 14 | import itertools 15 | import struct 16 | import warnings 17 | 18 | DEFAULT_CACHE = base.MARISA_DEFAULT_CACHE 19 | HUGE_CACHE = base.MARISA_HUGE_CACHE 20 | LARGE_CACHE = base.MARISA_LARGE_CACHE 21 | NORMAL_CACHE = base.MARISA_NORMAL_CACHE 22 | SMALL_CACHE = base.MARISA_SMALL_CACHE 23 | TINY_CACHE = base.MARISA_TINY_CACHE 24 | 25 | MIN_NUM_TRIES = base.MARISA_MIN_NUM_TRIES 26 | MAX_NUM_TRIES = base.MARISA_MAX_NUM_TRIES 27 | DEFAULT_NUM_TRIES = base.MARISA_DEFAULT_NUM_TRIES 28 | 29 | # MARISA_TEXT_TAIL merges last labels as zero-terminated strings. So, it is 30 | # available if and only if the last labels do not contain a NULL character. 
31 | # If MARISA_TEXT_TAIL is specified and a NULL character exists in the last 32 | # labels, the setting is automatically switched to MARISA_BINARY_TAIL. 33 | TEXT_TAIL = base.MARISA_TEXT_TAIL 34 | 35 | # MARISA_BINARY_TAIL also merges last labels but as byte sequences. It uses 36 | # a bit vector to detect the end of a sequence, instead of NULL characters. 37 | # So, MARISA_BINARY_TAIL requires a larger space if the average length of 38 | # labels is greater than 8. 39 | BINARY_TAIL = base.MARISA_BINARY_TAIL 40 | DEFAULT_TAIL = base.MARISA_DEFAULT_TAIL 41 | 42 | 43 | # MARISA_LABEL_ORDER arranges nodes in ascending label order. 44 | # MARISA_LABEL_ORDER is useful if an application needs to predict keys in 45 | # label order. 46 | LABEL_ORDER = base.MARISA_LABEL_ORDER 47 | 48 | # MARISA_WEIGHT_ORDER arranges nodes in descending weight order. 49 | # MARISA_WEIGHT_ORDER is generally a better choice because it enables faster 50 | # matching. 51 | WEIGHT_ORDER = base.MARISA_WEIGHT_ORDER 52 | DEFAULT_ORDER = base.MARISA_DEFAULT_ORDER 53 | 54 | 55 | cdef inline int getbufptr(object obj, char ** ptr, Py_ssize_t * size, Py_buffer * buf): 56 | """Get a pointer from bytes/buffer object ``obj``. 57 | 58 | On success, return 0, and set ``ptr``, ``size`` and ``buf``.""" 59 | cdef int result = -1 60 | ptr[0] = NULL 61 | size[0] = 0 62 | if PyObject_CheckBuffer(obj) == 1: # new-style Buffer interface 63 | result = PyObject_GetBuffer(obj, buf, PyBUF_SIMPLE) 64 | if result == 0: 65 | ptr[0] = buf.buf 66 | size[0] = buf.len 67 | return result 68 | 69 | 70 | cdef inline void releasebuf(Py_buffer *buf): 71 | """Release buffer if necessary.""" 72 | PyBuffer_Release(buf) 73 | 74 | 75 | cdef class _Trie: 76 | cdef trie.Trie* _trie 77 | 78 | cdef bytes _encode_key(self, key): 79 | return key 80 | 81 | cdef _get_key(self, agent.Agent& ag): 82 | return ag.key().ptr()[:ag.key().length()] 83 | 84 | def __init__(self, arg=None, num_tries=DEFAULT_NUM_TRIES, binary=False, 85 | cache_size=DEFAULT_CACHE, order=DEFAULT_ORDER, weights=None): 86 | """ 87 | ``arg`` can be one of the following: 88 | 89 | * an iterable with bytes keys; 90 | * None (if you're going to load a trie later). 91 | 92 | Pass a ``weights`` iterable with expected lookup frequencies 93 | to optimize lookup and prefix search speed. 
94 | """ 95 | 96 | if self._trie: 97 | return 98 | self._trie = new trie.Trie() 99 | 100 | byte_keys = (self._encode_key(key) for key in (arg or [])) 101 | 102 | self._build( 103 | byte_keys, 104 | weights, 105 | num_tries=num_tries, 106 | binary=binary, 107 | cache_size=cache_size, 108 | order=order 109 | ) 110 | 111 | def __dealloc__(self): 112 | if self._trie: 113 | del self._trie 114 | 115 | def _config_flags(self, num_tries=DEFAULT_NUM_TRIES, binary=False, 116 | cache_size=DEFAULT_CACHE, order=DEFAULT_ORDER): 117 | if not MIN_NUM_TRIES <= num_tries <= MAX_NUM_TRIES: 118 | raise ValueError( 119 | "num_tries (which is %d) must be between between %d and %d" % 120 | (num_tries, MIN_NUM_TRIES, MAX_NUM_TRIES)) 121 | 122 | binary_flag = BINARY_TAIL if binary else TEXT_TAIL 123 | return num_tries | binary_flag | cache_size | order 124 | 125 | def _build(self, byte_keys, weights=None, **options): 126 | if weights is None: 127 | weights = itertools.repeat(1.0) 128 | 129 | cdef char* data 130 | cdef float weight 131 | cdef keyset.Keyset *ks = new keyset.Keyset() 132 | 133 | try: 134 | for key, weight in zip(byte_keys, weights): 135 | ks.push_back(key, len(key), weight) 136 | self._trie.build(ks[0], self._config_flags(**options)) 137 | finally: 138 | del ks 139 | 140 | def __richcmp__(self, other, int op): 141 | if op == 2: # == 142 | if other is self: 143 | return True 144 | elif not isinstance(other, _Trie): 145 | return False 146 | 147 | return (<_Trie>self)._equals(other) 148 | elif op == 3: # != 149 | return not (self == other) 150 | 151 | raise TypeError("unorderable types: {0} and {1}".format( 152 | self.__class__, other.__class__)) 153 | 154 | cdef bint _equals(self, _Trie other) nogil: 155 | cdef int num_keys = self._trie.num_keys() 156 | cdef base.NodeOrder node_order = self._trie.node_order() 157 | if (other._trie.num_keys() != num_keys or 158 | other._trie.node_order() != node_order): 159 | return False 160 | 161 | cdef agent.Agent ag1, ag2 162 | ag1.set_query(b"") 163 | ag2.set_query(b"") 164 | cdef int i 165 | cdef key.Key key1, key2 166 | for i in range(num_keys): 167 | self._trie.predictive_search(ag1) 168 | other._trie.predictive_search(ag2) 169 | key1 = ag1.key() 170 | key2 = ag2.key() 171 | if (key1.length() != key2.length() or 172 | strncmp(key1.ptr(), key2.ptr(), key1.length()) != 0): 173 | return False 174 | return True 175 | 176 | def __iter__(self): 177 | return self.iterkeys() 178 | 179 | def __len__(self): 180 | return self._trie.num_keys() 181 | 182 | def __contains__(self, key): 183 | cdef bytes _key = self._encode_key(key) 184 | return self._contains(_key) 185 | 186 | cdef bint _contains(self, bytes key): 187 | cdef agent.Agent ag 188 | ag.set_query(key, len(key)) 189 | return self._trie.lookup(ag) 190 | 191 | def read(self, f): 192 | """Read a trie from an open file. 193 | 194 | :param file f: a "real" on-disk file object. Passing a *file-like* 195 | object would result in an error. 196 | 197 | .. deprecated:: 0.7.3 198 | 199 | The method will be removed in version 0.8.0. Please use 200 | :meth:`load` instead. 201 | """ 202 | warnings.warn("Trie.read is deprecated and will " 203 | "be removed in marisa_trie 0.8.0. Please use " 204 | "Trie.load instead.", DeprecationWarning) 205 | self._trie.read(f.fileno()) 206 | return self 207 | 208 | def write(self, f): 209 | """Write a trie to an open file. 210 | 211 | :param file f: a "real" on-disk file object. Passing a *file-like* 212 | object would result in an error. 213 | 214 | .. 
deprecated:: 0.7.3 215 | 216 | The method will be removed in version 0.8.0. Please use 217 | :meth:`save` instead. 218 | """ 219 | warnings.warn("Trie.write is deprecated and will " 220 | "be removed in marisa_trie 0.8.0. Please use " 221 | "Trie.save instead.", DeprecationWarning) 222 | self._trie.write(f.fileno()) 223 | 224 | def save(self, path): 225 | """Save a trie to a specified path.""" 226 | with open(path, 'w') as f: 227 | self._trie.write(f.fileno()) 228 | 229 | def load(self, path): 230 | """Load a trie from a specified path.""" 231 | with open(path, 'r') as f: 232 | self._trie.read(f.fileno()) 233 | return self 234 | 235 | cpdef bytes tobytes(self) except +: 236 | """Return raw trie content as bytes.""" 237 | cdef stringstream stream 238 | iostream.write(( &stream)[0], self._trie[0]) 239 | cdef bytes res = stream.str() 240 | return res 241 | 242 | cpdef frombytes(self, bytes data) except +: 243 | """Load a trie from raw bytes generated by :meth:`tobytes`.""" 244 | cdef stringstream* stream = new stringstream(data) 245 | try: 246 | iostream.read(( stream)[0], self._trie) 247 | finally: 248 | del stream 249 | return self 250 | 251 | def __reduce__(self): 252 | return self.__class__, (), self.tobytes() 253 | 254 | __setstate__ = frombytes 255 | 256 | def mmap(self, path): 257 | """Memory map the content of a trie stored in a file. 258 | 259 | This allows to query trie without loading it fully in memory. 260 | """ 261 | import sys 262 | str_path = path.encode(sys.getfilesystemencoding()) 263 | cdef char* c_path = str_path 264 | self._trie.mmap(c_path) 265 | return self 266 | 267 | def map(self, buffer): 268 | """Load the trie from an object exposing the buffer protocol.""" 269 | 270 | cdef char *ptr = NULL 271 | cdef Py_ssize_t size = 0 272 | cdef Py_buffer buf 273 | result = getbufptr(buffer, &ptr, &size, &buf) 274 | if result != 0: 275 | raise ValueError("Invalid buffer.") 276 | self._trie.map(ptr, size) 277 | releasebuf(&buf) 278 | return self 279 | 280 | def iterkeys(self, prefix=None): 281 | """ 282 | Return an iterator over trie keys starting with a given ``prefix``. 283 | """ 284 | cdef agent.Agent ag 285 | cdef bytes b_prefix = b'' 286 | if prefix is not None: 287 | b_prefix = self._encode_key(prefix) 288 | ag.set_query(b_prefix, len(b_prefix)) 289 | 290 | while self._trie.predictive_search(ag): 291 | yield self._get_key(ag) 292 | 293 | cpdef list keys(self, prefix=None): 294 | """Return a list of trie keys starting with a given ``prefix``.""" 295 | # non-generator inlined version of iterkeys() 296 | cdef list res = [] 297 | cdef bytes b_prefix = b'' 298 | if prefix is not None: 299 | b_prefix = self._encode_key(prefix) 300 | cdef agent.Agent ag 301 | ag.set_query(b_prefix, len(b_prefix)) 302 | 303 | while self._trie.predictive_search(ag): 304 | res.append(self._get_key(ag)) 305 | 306 | return res 307 | 308 | def has_keys_with_prefix(self, prefix=""): 309 | """ 310 | Return ``True`` if any key in the trie begins with ``prefix``. 311 | 312 | .. deprecated:: 0.7.3 313 | 314 | The method will be removed in version 0.8.0. Please use 315 | :meth:`iterkeys` instead. 316 | """ 317 | warnings.warn("Trie.has_keys_with_prefix is deprecated and will " 318 | "be removed in marisa_trie 0.8.0. 
Please use " 319 | "Trie.iterkeys instead.", DeprecationWarning) 320 | 321 | cdef agent.Agent ag 322 | cdef bytes b_prefix = self._encode_key(prefix) 323 | ag.set_query(b_prefix, len(b_prefix)) 324 | return self._trie.predictive_search(ag) 325 | 326 | 327 | cdef class BinaryTrie(_Trie): 328 | """A trie mapping bytes keys to auto-generated unique IDs.""" 329 | 330 | # key_id method is not in _Trie because it won't work for BytesTrie 331 | cpdef int key_id(self, bytes key) except -1: 332 | """Return an ID generated for a given ``key``. 333 | 334 | :raises KeyError: if key is not present in this trie. 335 | """ 336 | cdef int res = self._key_id(key, len(key)) 337 | if res == -1: 338 | raise KeyError(key) 339 | return res 340 | 341 | cdef int _key_id(self, char* key, int len): 342 | cdef bint res 343 | cdef agent.Agent ag 344 | ag.set_query(key, len) 345 | res = self._trie.lookup(ag) 346 | if not res: 347 | return -1 348 | return ag.key().id() 349 | 350 | cpdef restore_key(self, int index): 351 | """Return a key corresponding to a given ID.""" 352 | cdef agent.Agent ag 353 | ag.set_query(index) 354 | try: 355 | self._trie.reverse_lookup(ag) 356 | except KeyError: 357 | raise KeyError(index) 358 | return self._get_key(ag) 359 | 360 | def __getitem__(self, bytes key): 361 | return self.key_id(key) 362 | 363 | def get(self, bytes key, default=None): 364 | """ 365 | Return an ID for a given ``key`` or ``default`` if ``key`` is 366 | not present in this trie. 367 | """ 368 | cdef int res 369 | 370 | res = self._key_id(key, len(key)) 371 | if res == -1: 372 | return default 373 | return res 374 | 375 | def iter_prefixes(self, bytes key): 376 | """ 377 | Return an iterator of all prefixes of a given key. 378 | """ 379 | cdef agent.Agent ag 380 | ag.set_query(key, len(key)) 381 | 382 | while self._trie.common_prefix_search(ag): 383 | yield self._get_key(ag) 384 | 385 | def prefixes(self, bytes key): 386 | """ 387 | Return a list with all prefixes of a given key. 388 | """ 389 | # this an inlined version of ``list(self.iter_prefixes(key))`` 390 | 391 | cdef list res = [] 392 | cdef agent.Agent ag 393 | ag.set_query(key, len(key)) 394 | 395 | while self._trie.common_prefix_search(ag): 396 | res.append(self._get_key(ag)) 397 | return res 398 | 399 | def items(self, bytes prefix=b""): 400 | # inlined for speed 401 | cdef list res = [] 402 | cdef agent.Agent ag 403 | ag.set_query(prefix, len(prefix)) 404 | 405 | while self._trie.predictive_search(ag): 406 | res.append((self._get_key(ag), ag.key().id())) 407 | 408 | return res 409 | 410 | def iteritems(self, bytes prefix=b""): 411 | """ 412 | Return an iterator over items that have a prefix ``prefix``. 413 | """ 414 | cdef agent.Agent ag 415 | ag.set_query(prefix, len(prefix)) 416 | 417 | while self._trie.predictive_search(ag): 418 | yield self._get_key(ag), ag.key().id() 419 | 420 | 421 | cdef class _UnicodeKeyedTrie(_Trie): 422 | """ 423 | MARISA-trie wrapper for unicode keys. 424 | """ 425 | cdef bytes _encode_key(self, key): 426 | return key.encode('utf8') 427 | 428 | cdef _get_key(self, agent.Agent& ag): 429 | return _Trie._get_key(self, ag).decode('utf8') 430 | 431 | 432 | cdef class Trie(_UnicodeKeyedTrie): 433 | """A trie mapping unicode keys to auto-generated unique IDs.""" 434 | 435 | # key_id method is not in _Trie because it won't work for BytesTrie 436 | cpdef int key_id(self, unicode key) except -1: 437 | """Return an ID generated for a given ``key``. 438 | 439 | :raises KeyError: if key is not present in this trie. 
440 | """ 441 | cdef bytes _key = key.encode('utf8') 442 | cdef int res = self._key_id(_key) 443 | if res == -1: 444 | raise KeyError(key) 445 | return res 446 | 447 | def __getitem__(self, unicode key): 448 | return self.key_id(key) 449 | 450 | def get(self, key, default=None): 451 | """ 452 | Return an ID for a given ``key`` or ``default`` if ``key`` is 453 | not present in this trie. 454 | """ 455 | cdef bytes b_key 456 | cdef int res 457 | 458 | if isinstance(key, unicode): 459 | b_key = (key).encode('utf8') 460 | else: 461 | b_key = key 462 | 463 | res = self._key_id(b_key) 464 | if res == -1: 465 | return default 466 | return res 467 | 468 | cpdef restore_key(self, int index): 469 | """Return a key corresponding to a given ID.""" 470 | cdef agent.Agent ag 471 | ag.set_query(index) 472 | try: 473 | self._trie.reverse_lookup(ag) 474 | except KeyError: 475 | raise KeyError(index) 476 | return self._get_key(ag) 477 | 478 | cdef int _key_id(self, char* key): 479 | cdef bint res 480 | cdef agent.Agent ag 481 | ag.set_query(key) 482 | res = self._trie.lookup(ag) 483 | if not res: 484 | return -1 485 | return ag.key().id() 486 | 487 | def iter_prefixes(self, unicode key): 488 | """ 489 | Return an iterator of all prefixes of a given key. 490 | """ 491 | cdef bytes b_key = key.encode('utf8') 492 | cdef agent.Agent ag 493 | ag.set_query(b_key) 494 | 495 | while self._trie.common_prefix_search(ag): 496 | yield self._get_key(ag) 497 | 498 | def prefixes(self, unicode key): 499 | """ 500 | Return a list with all prefixes of a given key. 501 | """ 502 | # this an inlined version of ``list(self.iter_prefixes(key))`` 503 | 504 | cdef list res = [] 505 | cdef bytes b_key = key.encode('utf8') 506 | cdef agent.Agent ag 507 | ag.set_query(b_key) 508 | 509 | while self._trie.common_prefix_search(ag): 510 | res.append(self._get_key(ag)) 511 | return res 512 | 513 | def iter_prefixes_with_ids(self, unicode key): 514 | """ 515 | Return an iterator of (prefix, id) pairs of all prefixes of a given key. 516 | """ 517 | cdef bytes b_key = key.encode('utf8') 518 | cdef agent.Agent ag 519 | ag.set_query(b_key, len(b_key)) 520 | 521 | while self._trie.common_prefix_search(ag): 522 | yield (self._get_key(ag), ag.key().id()) 523 | 524 | def iteritems(self, unicode prefix=""): 525 | """ 526 | Return an iterator over items that have a prefix ``prefix``. 527 | """ 528 | cdef bytes b_prefix = prefix.encode('utf8') 529 | cdef agent.Agent ag 530 | ag.set_query(b_prefix) 531 | 532 | while self._trie.predictive_search(ag): 533 | yield self._get_key(ag), ag.key().id() 534 | 535 | def items(self, unicode prefix=""): 536 | # inlined for speed 537 | cdef list res = [] 538 | cdef bytes b_prefix = prefix.encode('utf8') 539 | cdef agent.Agent ag 540 | ag.set_query(b_prefix) 541 | 542 | while self._trie.predictive_search(ag): 543 | res.append((self._get_key(ag), ag.key().id())) 544 | 545 | return res 546 | 547 | 548 | # This symbol is not allowed in utf8 so it is safe to use 549 | # as a separator between utf8-encoded string and binary payload. 550 | # XXX: b'\xff' value changes sort order for BytesTrie and RecordTrie. 551 | # See https://github.com/kmike/DAWG docs for a description of a similar issue. 552 | cdef bytes _VALUE_SEPARATOR = b'\xff' 553 | 554 | 555 | cdef class BytesTrie(_UnicodeKeyedTrie): 556 | """A trie mapping unicode keys to lists of bytes objects. 557 | 558 | The mapping is implemented by appending binary values to UTF8-encoded 559 | and storing the result in MARISA-trie. 
560 | """ 561 | cdef bytes _b_value_separator 562 | cdef unsigned char _c_value_separator 563 | 564 | def __init__(self, arg=None, bytes value_separator=_VALUE_SEPARATOR, 565 | **options): 566 | """ 567 | ``arg`` must be an iterable of tuples (unicode_key, bytes_payload). 568 | """ 569 | super(BytesTrie, self).__init__() 570 | 571 | self._b_value_separator = value_separator 572 | self._c_value_separator = ord(value_separator) 573 | 574 | byte_keys = (self._raw_key(d[0], d[1]) for d in (arg or [])) 575 | self._build(byte_keys, **options) 576 | 577 | cpdef bytes _raw_key(self, unicode key, bytes payload): 578 | return key.encode('utf8') + self._b_value_separator + payload 579 | 580 | cdef bint _contains(self, bytes key): 581 | cdef agent.Agent ag 582 | cdef bytes _key = key + self._b_value_separator 583 | ag.set_query(_key) 584 | return self._trie.predictive_search(ag) 585 | 586 | cpdef list prefixes(self, unicode key): 587 | """ 588 | Return a list with all prefixes of a given key. 589 | """ 590 | 591 | # XXX: is there a char-walking API in libmarisa? 592 | # This implementation is suboptimal. 593 | 594 | cdef agent.Agent ag 595 | cdef list res = [] 596 | cdef int key_len = len(key) 597 | cdef unicode prefix 598 | cdef bytes b_prefix 599 | cdef int ind = 1 600 | 601 | while ind <= key_len: 602 | prefix = key[:ind] 603 | b_prefix = (prefix.encode('utf8') + self._b_value_separator) 604 | ag.set_query(b_prefix) 605 | if self._trie.predictive_search(ag): 606 | res.append(prefix) 607 | 608 | ind += 1 609 | 610 | return res 611 | 612 | def __getitem__(self, key): 613 | cdef list res = self.get(key) 614 | if res is None: 615 | raise KeyError(key) 616 | return res 617 | 618 | cpdef get(self, key, default=None): 619 | """ 620 | Return a list of payloads (as byte objects) for a given key 621 | or ``default`` if the key is not found. 622 | """ 623 | cdef list res 624 | 625 | if isinstance(key, unicode): 626 | res = self.get_value(key) 627 | else: 628 | res = self.b_get_value(key) 629 | 630 | if not res: 631 | return default 632 | return res 633 | 634 | cpdef list get_value(self, unicode key): 635 | """ 636 | Return a list of payloads (as byte objects) for a given unicode key. 637 | """ 638 | cdef bytes b_key = key.encode('utf8') 639 | return self.b_get_value(b_key) 640 | 641 | cpdef list b_get_value(self, bytes key): 642 | """ 643 | Return a list of payloads (as byte objects) for a given utf8-encoded key. 
644 | """ 645 | cdef list res = [] 646 | cdef bytes value 647 | cdef bytes b_prefix = key + self._b_value_separator 648 | cdef int prefix_len = len(b_prefix) 649 | 650 | cdef agent.Agent ag 651 | ag.set_query(b_prefix) 652 | 653 | while self._trie.predictive_search(ag): 654 | value = ag.key().ptr()[prefix_len:ag.key().length()] 655 | res.append(value) 656 | 657 | return res 658 | 659 | cpdef list items(self, unicode prefix=""): 660 | # copied from iteritems for speed 661 | cdef bytes b_prefix = prefix.encode('utf8') 662 | cdef bytes value 663 | cdef unicode key 664 | cdef unsigned char* raw_key 665 | cdef list res = [] 666 | cdef int i, value_len 667 | 668 | cdef agent.Agent ag 669 | ag.set_query(b_prefix) 670 | 671 | while self._trie.predictive_search(ag): 672 | raw_key = ag.key().ptr() 673 | 674 | for i in range(0, ag.key().length()): 675 | if raw_key[i] == self._c_value_separator: 676 | break 677 | 678 | key = raw_key[:i].decode('utf8') 679 | value = raw_key[i+1:ag.key().length()] 680 | 681 | res.append( 682 | (key, value) 683 | ) 684 | return res 685 | 686 | def iteritems(self, unicode prefix=""): 687 | cdef bytes b_prefix = prefix.encode('utf8') 688 | cdef bytes value 689 | cdef unicode key 690 | cdef unsigned char* raw_key 691 | cdef int i, value_len 692 | 693 | cdef agent.Agent ag 694 | ag.set_query(b_prefix) 695 | 696 | while self._trie.predictive_search(ag): 697 | raw_key = ag.key().ptr() 698 | 699 | for i in range(0, ag.key().length()): 700 | if raw_key[i] == self._c_value_separator: 701 | break 702 | 703 | key = raw_key[:i].decode('utf8') 704 | value = raw_key[i+1:ag.key().length()] 705 | 706 | yield key, value 707 | 708 | cpdef list keys(self, prefix=""): 709 | # copied from iterkeys for speed 710 | cdef bytes b_prefix = prefix.encode('utf8') 711 | cdef unicode key 712 | cdef unsigned char* raw_key 713 | cdef list res = [] 714 | cdef int i 715 | 716 | cdef agent.Agent ag 717 | ag.set_query(b_prefix) 718 | 719 | while self._trie.predictive_search(ag): 720 | raw_key = ag.key().ptr() 721 | 722 | for i in range(0, ag.key().length()): 723 | if raw_key[i] == self._c_value_separator: 724 | key = raw_key[:i].decode('utf8') 725 | res.append(key) 726 | break 727 | return res 728 | 729 | def iterkeys(self, unicode prefix=""): 730 | cdef bytes b_prefix = prefix.encode('utf8') 731 | cdef unicode key 732 | cdef unsigned char* raw_key 733 | cdef int i 734 | 735 | cdef agent.Agent ag 736 | ag.set_query(b_prefix) 737 | 738 | while self._trie.predictive_search(ag): 739 | raw_key = ag.key().ptr() 740 | 741 | for i in range(0, ag.key().length()): 742 | if raw_key[i] == self._c_value_separator: 743 | yield raw_key[:i].decode('utf8') 744 | break 745 | 746 | 747 | cdef class _UnpackTrie(BytesTrie): 748 | 749 | def __init__(self, arg=None, **options): 750 | keys = ((d[0], self._pack(d[1])) for d in (arg or [])) 751 | super(_UnpackTrie, self).__init__(keys, **options) 752 | 753 | cdef _unpack(self, bytes value): 754 | return value 755 | 756 | cdef bytes _pack(self, value): 757 | return value 758 | 759 | cpdef list b_get_value(self, bytes key): 760 | cdef list values = BytesTrie.b_get_value(self, key) 761 | return [self._unpack(val) for val in values] 762 | 763 | cpdef list items(self, unicode prefix=""): 764 | cdef list items = BytesTrie.items(self, prefix) 765 | return [(key, self._unpack(val)) for (key, val) in items] 766 | 767 | def iteritems(self, unicode prefix=""): 768 | return ((key, self._unpack(val)) for key, val in BytesTrie.iteritems(self, prefix)) 769 | 770 | 771 | cdef class 
RecordTrie(_UnpackTrie): 772 | """A trie mapping unicode keys to lists of data tuples. 773 | 774 | The data is packed using :mod:`struct` module, therefore all 775 | tuples must be of the same format. See :mod:`struct` documentation 776 | for available format strings. 777 | 778 | The mapping is implemented by appending binary values to UTF8-encoded 779 | and storing the result in MARISA-trie. 780 | """ 781 | cdef _struct 782 | cdef _fmt 783 | 784 | def __init__(self, fmt, arg=None, **options): 785 | """ 786 | ``arg`` must be an iterable of tuples (unicode_key, data_tuple). 787 | Data tuples will be converted to bytes with 788 | ``struct.pack(fmt, *data_tuple)``. 789 | """ 790 | self._fmt = fmt 791 | self._struct = struct.Struct(str(fmt)) 792 | super(RecordTrie, self).__init__(arg, **options) 793 | 794 | cdef _unpack(self, bytes value): 795 | return self._struct.unpack(value) 796 | 797 | cdef bytes _pack(self, value): 798 | return self._struct.pack(*value) 799 | 800 | def __reduce__(self): 801 | return self.__class__, (self._fmt, ), self.tobytes() 802 | -------------------------------------------------------------------------------- /src/query.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "" namespace "marisa" nogil: 2 | 3 | cdef cppclass Query: 4 | Query() 5 | Query(Query &query) 6 | 7 | #Query &operator=(Query &query) 8 | 9 | char operator[](int i) 10 | 11 | void set_str(char *str) 12 | void set_str(char *ptr, int length) 13 | void set_id(int id) 14 | 15 | char *ptr() 16 | int length() 17 | int id() 18 | 19 | void clear() 20 | void swap(Query &rhs) 21 | -------------------------------------------------------------------------------- /src/std_iostream.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.string cimport string 2 | 3 | cdef extern from "" namespace "std" nogil: 4 | cdef cppclass istream: 5 | istream() except + 6 | istream& read (char* s, int n) except + 7 | 8 | cdef cppclass ostream: 9 | ostream() except + 10 | ostream& write (char* s, int n) except + 11 | 12 | cdef extern from "" namespace "std" nogil: 13 | 14 | cdef cppclass stringstream: 15 | stringstream() 16 | stringstream(string s) 17 | string str () 18 | 19 | -------------------------------------------------------------------------------- /src/trie.pxd: -------------------------------------------------------------------------------- 1 | cimport agent 2 | cimport base 3 | cimport keyset 4 | 5 | 6 | cdef extern from "" namespace "marisa" nogil: 7 | 8 | cdef cppclass Trie: 9 | Trie() 10 | 11 | void build(keyset.Keyset &keyset, int config_flags) except + 12 | void build(keyset.Keyset &keyset) except + 13 | 14 | void mmap(char *filename) except + 15 | void map(void *ptr, int size) except + 16 | 17 | void load(char *filename) except + 18 | void read(int fd) except + 19 | 20 | void save(char *filename) except + 21 | void write(int fd) except + 22 | 23 | bint lookup(agent.Agent &agent) except + 24 | void reverse_lookup(agent.Agent &agent) except +KeyError 25 | bint common_prefix_search(agent.Agent &agent) except + 26 | bint predictive_search(agent.Agent &agent) except + 27 | 28 | int num_tries() except + 29 | int num_keys() except + 30 | int num_nodes() except + 31 | 32 | base.TailMode tail_mode() 33 | base.NodeOrder node_order() 34 | 35 | bint empty() except + 36 | int size() except + 37 | int total_size() except + 38 | int io_size() except + 39 | 40 | void clear() except + 41 | void swap(Trie &rhs) except + 42 | 
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/marisa-trie/97cfda688aee37565f6d4a414cc66dd5384cb4ad/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_binary_trie.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from uuid import uuid4 3 | 4 | import pytest 5 | import hypothesis.strategies as st 6 | from hypothesis import given, assume 7 | 8 | import marisa_trie 9 | 10 | from .utils import Mapping 11 | 12 | text = st.binary() 13 | 14 | 15 | @given(st.sets(text), text) 16 | def test_init(keys, missing_key): 17 | assume(missing_key not in keys) 18 | 19 | trie = marisa_trie.BinaryTrie(keys) 20 | for key in keys: 21 | assert key in trie 22 | 23 | assert missing_key not in trie 24 | 25 | 26 | @given(st.sets(text, min_size=1), text) 27 | def test_key_id(keys, missing_key): 28 | assume(missing_key not in keys) 29 | 30 | trie = marisa_trie.BinaryTrie(keys) 31 | for key in keys: 32 | key_id = trie.key_id(key) 33 | assert trie.restore_key(key_id) == key 34 | 35 | key_ids = [trie.key_id(key) for key in keys] 36 | non_existing_id = max(key_ids) + 1 37 | 38 | with pytest.raises(KeyError): 39 | trie.restore_key(non_existing_id) 40 | 41 | with pytest.raises(KeyError): 42 | trie.key_id(missing_key) 43 | 44 | 45 | @given(st.sets(text, min_size=1), text) 46 | def test_getitem(keys, missing_key): 47 | assume(missing_key not in keys) 48 | 49 | trie = marisa_trie.BinaryTrie(keys) 50 | for key in keys: 51 | key_id = trie[key] 52 | assert trie.restore_key(key_id) == key 53 | 54 | key_ids = [trie[key] for key in keys] 55 | non_existing_id = max(key_ids) + 1 56 | 57 | with pytest.raises(KeyError): 58 | trie.restore_key(non_existing_id) 59 | 60 | with pytest.raises(KeyError): 61 | trie[missing_key] 62 | 63 | 64 | @given(st.sets(text)) 65 | def test_get(keys): 66 | trie = marisa_trie.BinaryTrie(keys) 67 | for key in keys: 68 | key_id = trie.get(key) 69 | assert trie.restore_key(key_id) == key 70 | 71 | key_id = trie.get(key, "default value") 72 | assert trie.restore_key(key_id) == key 73 | 74 | assert trie.get(b"non_existing_bytes_key") is None 75 | assert trie.get(b"non_existing_bytes_key", "default value") == "default value" 76 | 77 | 78 | @given(st.sets(text)) 79 | def test_saveload(tmpdir_factory, keys): 80 | trie = marisa_trie.BinaryTrie(keys) 81 | 82 | dirname = f"{uuid4()}_" 83 | path = str(tmpdir_factory.mktemp(dirname).join("trie.bin")) 84 | trie.save(path) 85 | 86 | trie2 = marisa_trie.BinaryTrie() 87 | trie2.load(path) 88 | 89 | for key in keys: 90 | assert key in trie2 91 | 92 | 93 | @given(st.sets(text)) 94 | def test_mmap(tmpdir_factory, keys): 95 | trie = marisa_trie.BinaryTrie(keys) 96 | 97 | dirname = f"{uuid4()}_" 98 | path = str(tmpdir_factory.mktemp(dirname).join("trie.bin")) 99 | trie.save(path) 100 | 101 | trie2 = marisa_trie.BinaryTrie() 102 | trie2.mmap(path) 103 | 104 | for key in keys: 105 | assert key in trie2 106 | 107 | 108 | @given(st.sets(text)) 109 | def test_tobytes_frombytes(keys): 110 | trie = marisa_trie.BinaryTrie(keys) 111 | data = trie.tobytes() 112 | 113 | trie2 = marisa_trie.BinaryTrie().frombytes(data) 114 | 115 | for key in keys: 116 | assert key in trie2 117 | assert trie2.key_id(key) == trie.key_id(key) 118 | 119 | 120 | @given(st.sets(text)) 121 | def test_dumps_loads(keys): 122 | 
trie = marisa_trie.BinaryTrie(keys) 123 | data = pickle.dumps(trie) 124 | 125 | trie2 = pickle.loads(data) 126 | 127 | for key in keys: 128 | assert key in trie2 129 | assert trie2.key_id(key) == trie.key_id(key) 130 | 131 | 132 | def test_contains_empty(): 133 | assert b"foo" not in marisa_trie.BinaryTrie() 134 | 135 | 136 | def test_contains_singleton(): 137 | trie = marisa_trie.BinaryTrie([b"foo"]) 138 | assert b"foo" in trie 139 | assert b"f" not in trie 140 | 141 | 142 | def test_eq_self(): 143 | trie = marisa_trie.BinaryTrie() 144 | assert trie == trie 145 | assert trie == marisa_trie.BinaryTrie() 146 | 147 | 148 | def test_eq_neq(): 149 | trie = marisa_trie.BinaryTrie([b"foo", b"bar"]) 150 | assert trie == marisa_trie.BinaryTrie([b"foo", b"bar"]) 151 | assert trie != marisa_trie.BinaryTrie([b"foo", b"boo"]) 152 | 153 | 154 | def test_neq_different_type(): 155 | assert marisa_trie.BinaryTrie([b"foo", b"bar"]) != {} 156 | 157 | 158 | def test_eq_neq_different_order(): 159 | lo_trie = marisa_trie.BinaryTrie(order=marisa_trie.LABEL_ORDER) 160 | wo_trie = marisa_trie.BinaryTrie(order=marisa_trie.WEIGHT_ORDER) 161 | assert lo_trie == lo_trie and wo_trie == wo_trie 162 | assert lo_trie != wo_trie 163 | 164 | 165 | def test_gt_lt_exceptions(): 166 | with pytest.raises(TypeError): 167 | marisa_trie.BinaryTrie() < marisa_trie.BinaryTrie() 168 | 169 | with pytest.raises(TypeError): 170 | marisa_trie.BinaryTrie() > marisa_trie.BinaryTrie() 171 | 172 | 173 | def test_iter(): 174 | trie = marisa_trie.BinaryTrie([b"foo", b"bar"]) 175 | assert list(trie) == list(trie.iterkeys()) 176 | 177 | 178 | def test_len(): 179 | trie = marisa_trie.BinaryTrie() 180 | assert len(trie) == 0 181 | 182 | trie = marisa_trie.BinaryTrie([b"foo", b"f", b"bar"]) 183 | assert len(trie) == 3 184 | 185 | 186 | def test_prefixes(): 187 | trie = marisa_trie.BinaryTrie([b"foo", b"f", b"foobar", b"bar"]) 188 | assert trie.prefixes(b"foobar") == [b"f", b"foo", b"foobar"] 189 | assert trie.prefixes(b"foo") == [b"f", b"foo"] 190 | assert trie.prefixes(b"bar") == [b"bar"] 191 | assert trie.prefixes(b"b") == [] 192 | 193 | assert list(trie.iter_prefixes(b"foobar")) == [b"f", b"foo", b"foobar"] 194 | 195 | 196 | def test_keys(): 197 | keys = [b"foo", b"f", b"foobar", b"bar"] 198 | trie = marisa_trie.BinaryTrie(keys) 199 | assert set(trie.keys()) == set(keys) 200 | 201 | 202 | def test_keys_prefix(): 203 | keys = [b"foo", b"f", b"foobar", b"bar"] 204 | trie = marisa_trie.BinaryTrie(keys) 205 | assert set(trie.keys(b"fo")) == {b"foo", b"foobar"} 206 | assert trie.keys(b"foobarz") == [] 207 | 208 | 209 | @given(st.sets(text)) 210 | def test_iterkeys(keys): 211 | trie = marisa_trie.BinaryTrie(keys) 212 | assert trie.keys() == list(trie.iterkeys()) 213 | 214 | for key in keys: 215 | prefix = key[:5] 216 | assert trie.keys(prefix) == list(trie.iterkeys(prefix)) 217 | 218 | 219 | def test_items(): 220 | keys = [b"foo", b"f", b"foobar", b"bar"] 221 | trie = marisa_trie.BinaryTrie(keys) 222 | items = trie.items() 223 | assert set(items) == set(zip(keys, (trie[k] for k in keys))) 224 | 225 | 226 | def test_items_prefix(): 227 | keys = [b"foo", b"f", b"foobar", b"bar"] 228 | trie = marisa_trie.BinaryTrie(keys) 229 | assert set(trie.items(b"fo")) == { 230 | (b"foo", trie[b"foo"]), 231 | (b"foobar", trie[b"foobar"]), 232 | } 233 | 234 | 235 | @given(st.sets(text)) 236 | def test_iteritems(keys): 237 | trie = marisa_trie.BinaryTrie(keys) 238 | assert trie.items() == list(trie.iteritems()) 239 | 240 | for key in keys: 241 | prefix = key[:5] 242 | 
assert trie.items(prefix) == list(trie.iteritems(prefix)) 243 | 244 | 245 | @pytest.mark.filterwarnings("ignore:Trie.has_keys_with_prefix is deprecated") 246 | def test_has_keys_with_prefix_empty(): 247 | empty_trie = marisa_trie.BinaryTrie() 248 | assert not empty_trie.has_keys_with_prefix(b"") 249 | assert not empty_trie.has_keys_with_prefix(b"ab") 250 | 251 | 252 | def test_invalid_file(): 253 | try: 254 | marisa_trie.BinaryTrie().load(__file__) 255 | except RuntimeError as e: 256 | assert "MARISA_FORMAT_ERROR" in e.args[0] 257 | else: 258 | pytest.fail("Exception is not raised") 259 | 260 | 261 | def test_mutable_mapping(): 262 | for method in Mapping.__abstractmethods__: 263 | assert hasattr(marisa_trie.BinaryTrie, method) 264 | -------------------------------------------------------------------------------- /tests/test_bytes_trie.py: -------------------------------------------------------------------------------- 1 | import io 2 | import pickle 3 | 4 | import pytest 5 | import hypothesis.strategies as st 6 | from hypothesis import assume, given 7 | 8 | import marisa_trie 9 | 10 | from .utils import text 11 | 12 | 13 | @given(st.sets(text), st.lists(st.binary()), text) 14 | def test_contains(keys, values, missing_key): 15 | assume(missing_key not in keys) 16 | 17 | data = zip(keys, values) 18 | trie = marisa_trie.BytesTrie(data) 19 | 20 | for word, value in data: 21 | assert word in trie 22 | 23 | assert missing_key not in trie 24 | 25 | 26 | @given(st.sets(text), st.lists(st.binary()), text) 27 | def test_getitem(keys, values, missing_key): 28 | assume(missing_key not in keys) 29 | 30 | data = zip(keys, values) 31 | trie = marisa_trie.BytesTrie(data) 32 | 33 | for key, value in data: 34 | assert trie[key] == [value] 35 | 36 | with pytest.raises(KeyError): 37 | trie[missing_key] 38 | 39 | 40 | @pytest.mark.parametrize("data", [[], [("foo", b"bar")]]) 41 | def test_getitem_missing(data): 42 | trie = marisa_trie.BytesTrie(data) 43 | with pytest.raises(KeyError): 44 | trie["missing"] 45 | 46 | 47 | def test_getitem_multiple(): 48 | data = [ 49 | ("foo", b"x"), 50 | ("fo", b"y"), 51 | ("foo", b"a"), 52 | ] 53 | trie = marisa_trie.BytesTrie(data) 54 | assert trie["fo"] == [b"y"] 55 | assert trie["foo"] == [b"a", b"x"] 56 | 57 | 58 | def test_null_bytes_in_values(): 59 | data = [("foo", b"\x00\x00bar\x00")] 60 | trie = marisa_trie.BytesTrie(data) 61 | 62 | for key, value in data: 63 | assert trie[key] == [value] 64 | 65 | 66 | def test_items(): 67 | data = [ 68 | ("fo", b"y"), 69 | ("foo", b"x"), 70 | ("foo", b"a"), 71 | ] 72 | trie = marisa_trie.BytesTrie(data) 73 | assert set(trie.items()) == set(data) 74 | assert set(trie.items("f")) == set(data) 75 | assert set(trie.items("fo")) == set(data) 76 | assert set(trie.items("foo")) == set(data[1:]) 77 | assert trie.items("food") == [] 78 | assert trie.items("bar") == [] 79 | 80 | 81 | @given(st.sets(text), st.lists(st.binary())) 82 | def test_iteritems(keys, values): 83 | trie = marisa_trie.BytesTrie(zip(keys, values)) 84 | assert trie.items() == list(trie.iteritems()) 85 | 86 | for key in keys: 87 | prefix = key[:5] 88 | assert trie.items(prefix) == list(trie.iteritems(prefix)) 89 | 90 | 91 | def test_keys(): 92 | trie = marisa_trie.BytesTrie( 93 | [ 94 | ("foo", b"x"), 95 | ("fo", b"y"), 96 | ("foo", b"a"), 97 | ] 98 | ) 99 | 100 | # FIXME: ordering? 
101 | assert trie.keys() == ["foo", "foo", "fo"] 102 | assert trie.keys("f") == ["foo", "foo", "fo"] 103 | assert trie.keys("fo") == ["foo", "foo", "fo"] 104 | assert trie.keys("foo") == ["foo", "foo"] 105 | assert trie.keys("food") == [] 106 | assert trie.keys("bar") == [] 107 | 108 | 109 | @given(st.sets(text), st.lists(st.binary())) 110 | def test_iterkeys(keys, values): 111 | trie = marisa_trie.BytesTrie(zip(keys, values)) 112 | assert trie.keys() == list(trie.iterkeys()) 113 | 114 | for key in keys: 115 | prefix = key[:5] 116 | assert trie.keys(prefix) == list(trie.iterkeys(prefix)) 117 | 118 | 119 | @given(st.sets(st.tuples(text, st.binary()))) 120 | def test_dumps_loads(data): 121 | trie = marisa_trie.BytesTrie(data) 122 | 123 | buf = io.BytesIO() 124 | pickle.dump(trie, buf) 125 | buf.seek(0) 126 | 127 | assert trie == pickle.load(buf) 128 | -------------------------------------------------------------------------------- /tests/test_packaging.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shamelessly inspired from https://github.com/pypa/twine/blob/main/twine/commands/check.py 3 | """ 4 | import io 5 | import re 6 | import subprocess 7 | from email import message_from_string 8 | from pkg_resources import get_distribution 9 | 10 | from readme_renderer.rst import render 11 | 12 | # Regular expression used to capture and reformat docutils warnings into 13 | # something that a human can understand. This is loosely borrowed from 14 | # Sphinx: https://github.com/sphinx-doc/sphinx/blob 15 | # /c35eb6fade7a3b4a6de4183d1dd4196f04a5edaf/sphinx/util/docutils.py#L199 16 | _REPORT_RE = re.compile( 17 | r"^:(?P(?:\d+)?): " 18 | r"\((?PDEBUG|INFO|WARNING|ERROR|SEVERE)/(\d+)?\) " 19 | r"(?P.*)", 20 | re.DOTALL | re.MULTILINE, 21 | ) 22 | 23 | 24 | class _WarningStream: 25 | def __init__(self) -> None: 26 | self.output = io.StringIO() 27 | 28 | def write(self, text: str) -> None: 29 | matched = _REPORT_RE.search(text) 30 | 31 | if not matched: 32 | self.output.write(text) 33 | return 34 | 35 | self.output.write( 36 | "line {line}: {level_text}: {message}\n".format( 37 | level_text=matched.group("level").capitalize(), 38 | line=matched.group("line"), 39 | message=matched.group("message").rstrip("\r\n"), 40 | ) 41 | ) 42 | 43 | def __str__(self) -> str: 44 | return self.output.getvalue() 45 | 46 | 47 | def test_check_pypi_rendering(): 48 | subprocess.check_call(["python3", "setup.py", "sdist"]) 49 | 50 | package = get_distribution("marisa-trie") 51 | pkg_info = message_from_string(package.get_metadata("PKG-INFO")) 52 | metadata = dict(pkg_info.items()) 53 | lines = metadata["Summary"].splitlines() 54 | description = lines.pop(0) + "\n" 55 | description += "\n".join(l[8:] for l in lines) 56 | 57 | warnings = _WarningStream() 58 | rendering = render(description, stream=warnings) 59 | print(description) 60 | print(warnings) 61 | assert not str(warnings) 62 | assert rendering is not None 63 | -------------------------------------------------------------------------------- /tests/test_record_trie.py: -------------------------------------------------------------------------------- 1 | import io 2 | import pickle 3 | 4 | import hypothesis.strategies as st 5 | from hypothesis import given 6 | 7 | import marisa_trie 8 | 9 | from .utils import text 10 | 11 | records = st.tuples(st.integers(min_value=0, max_value=2 ** 16 - 1), st.booleans()) 12 | 13 | 14 | @given(st.sets(st.tuples(text, records))) 15 | def test_dumps_loads(data): 16 | trie = 
marisa_trie.RecordTrie(" marisa_trie.Trie() 204 | 205 | 206 | def test_iter(): 207 | trie = marisa_trie.Trie(["foo", "bar"]) 208 | assert list(trie) == list(trie.iterkeys()) 209 | 210 | 211 | def test_len(): 212 | trie = marisa_trie.Trie() 213 | assert len(trie) == 0 214 | 215 | trie = marisa_trie.Trie(["foo", "f", "bar"]) 216 | assert len(trie) == 3 217 | 218 | 219 | def test_prefixes(): 220 | trie = marisa_trie.Trie(["foo", "f", "foobar", "bar"]) 221 | assert trie.prefixes("foobar") == ["f", "foo", "foobar"] 222 | assert trie.prefixes("foo") == ["f", "foo"] 223 | assert trie.prefixes("bar") == ["bar"] 224 | assert trie.prefixes("b") == [] 225 | 226 | assert list(trie.iter_prefixes("foobar")) == ["f", "foo", "foobar"] 227 | 228 | def test_iter_prefixes_with_keys(): 229 | trie = marisa_trie.Trie(["foo", "f", "foobar", "bar"]) 230 | 231 | assert set(trie.iter_prefixes_with_ids("foobar")) == { 232 | ("f", trie["f"]), 233 | ("foo", trie["foo"]), 234 | ("foobar", trie["foobar"]), 235 | } 236 | assert set(trie.iter_prefixes_with_ids("foo")) == { 237 | ("f", trie["f"]), 238 | ("foo", trie["foo"]), 239 | } 240 | assert set(trie.iter_prefixes_with_ids("bar")) == {("bar", trie["bar"])} 241 | assert not set(trie.iter_prefixes_with_ids("b")) 242 | 243 | for test_key in ["foobar", "foo", "bar", "b"]: 244 | assert list(trie.iter_prefixes_with_ids(test_key)) == [ 245 | (prefix, trie[prefix]) for prefix in trie.prefixes(test_key) 246 | ] 247 | 248 | def test_keys(): 249 | keys = ["foo", "f", "foobar", "bar"] 250 | trie = marisa_trie.Trie(keys) 251 | assert set(trie.keys()) == set(keys) 252 | 253 | 254 | def test_keys_prefix(): 255 | keys = ["foo", "f", "foobar", "bar"] 256 | trie = marisa_trie.Trie(keys) 257 | assert set(trie.keys("fo")) == {"foo", "foobar"} 258 | assert trie.keys("foobarz") == [] 259 | 260 | 261 | @given(st.sets(text)) 262 | def test_iterkeys(keys): 263 | trie = marisa_trie.Trie(keys) 264 | assert trie.keys() == list(trie.iterkeys()) 265 | 266 | for key in keys: 267 | prefix = key[:5] 268 | assert trie.keys(prefix) == list(trie.iterkeys(prefix)) 269 | 270 | 271 | def test_items(): 272 | keys = ["foo", "f", "foobar", "bar"] 273 | trie = marisa_trie.Trie(keys) 274 | items = trie.items() 275 | assert set(items) == set(zip(keys, (trie[k] for k in keys))) 276 | 277 | 278 | def test_items_prefix(): 279 | keys = ["foo", "f", "foobar", "bar"] 280 | trie = marisa_trie.Trie(keys) 281 | assert set(trie.items("fo")) == { 282 | ("foo", trie["foo"]), 283 | ("foobar", trie["foobar"]), 284 | } 285 | 286 | 287 | @given(st.sets(text)) 288 | def test_iteritems(keys): 289 | trie = marisa_trie.Trie(keys) 290 | assert trie.items() == list(trie.iteritems()) 291 | 292 | for key in keys: 293 | prefix = key[:5] 294 | assert trie.items(prefix) == list(trie.iteritems(prefix)) 295 | 296 | 297 | @pytest.mark.filterwarnings("ignore:Trie.has_keys_with_prefix is deprecated") 298 | def test_has_keys_with_prefix_empty(): 299 | empty_trie = marisa_trie.Trie() 300 | assert not empty_trie.has_keys_with_prefix("") 301 | assert not empty_trie.has_keys_with_prefix("ab") 302 | 303 | 304 | @pytest.mark.filterwarnings("ignore:Trie.has_keys_with_prefix is deprecated") 305 | def test_has_keys_with_prefix(): 306 | fruit_trie = marisa_trie.BytesTrie( 307 | [ 308 | ("apple", b"foo"), 309 | ("pear", b"bar"), 310 | ("peach", b"baz"), 311 | ] 312 | ) 313 | assert fruit_trie.has_keys_with_prefix("") 314 | assert fruit_trie.has_keys_with_prefix("a") 315 | assert fruit_trie.has_keys_with_prefix("pe") 316 | assert 
fruit_trie.has_keys_with_prefix("pear") 317 | assert not fruit_trie.has_keys_with_prefix("x") 318 | 319 | 320 | def test_invalid_file(): 321 | try: 322 | marisa_trie.Trie().load(__file__) 323 | except RuntimeError as e: 324 | assert "MARISA_FORMAT_ERROR" in e.args[0] 325 | else: 326 | pytest.fail("Exception is not raised") 327 | 328 | 329 | def test_mutable_mapping(): 330 | for method in Mapping.__abstractmethods__: 331 | assert hasattr(marisa_trie.Trie, method) 332 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import string 2 | from collections.abc import Mapping 3 | 4 | import hypothesis.strategies as st 5 | 6 | text = st.text(f"абвгдеёжзиклмнопрстуфхцчъыьэюя{string.ascii_lowercase}") 7 | 8 | __all__ = ("Mapping", text) 9 | -------------------------------------------------------------------------------- /update_cpp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cython src/*.pyx src/*.pxd --cplus -a -3 3 | --------------------------------------------------------------------------------