├── .github ├── dependabot.yml └── workflows │ ├── build-and-publish.yml │ └── tests.yml ├── .gitignore ├── .gitmodules ├── .well-known └── funding-manifest-urls ├── AUTHORS.rst ├── CHANGES.rst ├── LICENSE ├── MANIFEST.in ├── README.rst ├── bench ├── __init__.py ├── speed.py └── words100k.txt.gz ├── docs ├── Makefile ├── api.rst ├── benchmarks.rst ├── changelog.rst ├── conf.py ├── contributing.rst ├── index.rst ├── make.bat └── tutorial.rst ├── setup.py ├── src ├── agent.cpp ├── agent.pxd ├── base.cpp ├── base.pxd ├── iostream.cpp ├── iostream.pxd ├── key.cpp ├── key.pxd ├── keyset.cpp ├── keyset.pxd ├── marisa_trie.cpp ├── marisa_trie.pyx ├── query.cpp ├── query.pxd ├── std_iostream.cpp ├── std_iostream.pxd ├── trie.cpp └── trie.pxd ├── tests ├── __init__.py ├── test_binary_trie.py ├── test_bytes_trie.py ├── test_packaging.py ├── test_record_trie.py ├── test_trie.py └── utils.py └── update_cpp.sh /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # GitHub Actions 4 | - package-ecosystem: github-actions 5 | directory: / 6 | schedule: 7 | interval: daily 8 | labels: 9 | - dependencies 10 | - QA/CI 11 | -------------------------------------------------------------------------------- /.github/workflows/build-and-publish.yml: -------------------------------------------------------------------------------- 1 | name: Build & Publish 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - ".github/workflows/build-and-publish.yml" 7 | - "setup.*" 8 | 9 | workflow_dispatch: 10 | inputs: 11 | branch: 12 | description: "The branch, tag or SHA to release from" 13 | required: true 14 | default: "master" 15 | 16 | jobs: 17 | os-built-distributions: 18 | name: Build on ${{ matrix.os }} 19 | runs-on: ${{ matrix.os }} 20 | strategy: 21 | matrix: 22 | os: [ubuntu-latest, windows-latest, macos-latest] 23 | steps: 24 | - name: Checkout 25 | uses: actions/checkout@v4 26 | with: 27 | ref: ${{ github.event.inputs.branch }} 28 | submodules: true 29 | 30 | - name: Set up QEMU 31 | if: runner.os == 'Linux' 32 | uses: docker/setup-qemu-action@v3 33 | with: 34 | platforms: all 35 | 36 | - name: Install Python 37 | uses: actions/setup-python@v5 38 | with: 39 | python-version: "3.12" 40 | - name: Install build dependencies 41 | run: python -m pip install --upgrade cibuildwheel 42 | - name: Build wheels 43 | run: python -m cibuildwheel 44 | env: 45 | CIBW_SKIP: "pp*" # skip PyPy releases 46 | CIBW_ARCHS_MACOS: "x86_64 universal2 arm64" 47 | CIBW_ARCHS_LINUX: "auto aarch64" 48 | - uses: actions/upload-artifact@v4 49 | with: 50 | name: python-package-distributions-${{ matrix.os }} 51 | path: ./wheelhouse/*.whl 52 | 53 | source-distribution: 54 | name: Build source distribution 55 | runs-on: ubuntu-latest 56 | steps: 57 | - name: Checkout 58 | uses: actions/checkout@v4 59 | with: 60 | ref: ${{ github.event.inputs.branch }} 61 | submodules: true 62 | - name: Install Python 63 | uses: actions/setup-python@v5 64 | with: 65 | python-version: "3.12" 66 | - name: Build source distribution 67 | run: | 68 | # FIXME: setuptools was removed starting with Python 3.12 69 | pip install --upgrade --force setuptools 70 | python setup.py sdist 71 | - name: Store the source distribution 72 | uses: actions/upload-artifact@v4 73 | with: 74 | name: python-package-distributions-source 75 | path: dist 76 | retention-days: 4 77 | 78 | publish: 79 | needs: 80 | - os-built-distributions 81 | - source-distribution 82 | runs-on: ubuntu-latest 83 | steps: 84 | - name: 
Download all the dists 85 | uses: actions/download-artifact@v4 86 | with: 87 | pattern: python-package-distributions-* 88 | merge-multiple: true 89 | path: dist/ 90 | - name: What will we publish? 91 | run: ls -l dist 92 | - name: Publish 93 | if: github.event.inputs.branch != '' 94 | uses: pypa/gh-action-pypi-publish@release/v1 95 | with: 96 | user: __token__ 97 | password: ${{ secrets.PYPI_API_TOKEN }} 98 | skip_existing: true 99 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: pull_request 4 | 5 | jobs: 6 | tests: 7 | name: Run tests for ${{ matrix.os }} for ${{ matrix.python }} 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | os: [ubuntu-latest, windows-latest, macos-latest] 13 | python: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14-dev"] 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v4 17 | with: 18 | ref: ${{ github.event.inputs.branch }} 19 | submodules: true 20 | - name: Use Python ${{ matrix.python }} 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python }} 24 | - name: Install test dependencies 25 | run: python -m pip install ".[test]" 26 | - name: Test 27 | run: python -m pytest 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | src/*.html 3 | docs/_build 4 | venv* 5 | *.py[cod] 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Packages 11 | *.egg 12 | *.egg-info 13 | dist 14 | build 15 | eggs 16 | .eggs 17 | parts 18 | bin 19 | var 20 | sdist 21 | develop-eggs 22 | .installed.cfg 23 | lib64 24 | __pycache__ 25 | 26 | # Installer logs 27 | pip-log.txt 28 | 29 | # Unit test / coverage reports 30 | .coverage 31 | .hypothesis 32 | .cache 33 | .tox 34 | nosetests.xml 35 | 36 | # IDE 37 | .idea 38 | .vscode 39 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "marisa-trie"] 2 | path = marisa-trie 3 | url = https://github.com/s-yata/marisa-trie.git 4 | -------------------------------------------------------------------------------- /.well-known/funding-manifest-urls: -------------------------------------------------------------------------------- 1 | https://www.tiger-222.fr/funding.json 2 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Authors and contributors, in no particular order: 2 | 3 | * Mikhail Korobov 4 | * `Matt Hickford `_ 5 | * Sergei Lebedev 6 | * Tomasz Melcer 7 | * `Mickaël Schoentgen ` 8 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | 2 | CHANGES 3 | ======= 4 | 5 | 1.3.0 (2025-xx-xx) 6 | ------------------ 7 | 8 | * Updated ``libmarisa-trie`` to the latest version (0.2.7) (#116). 9 | * Dropped Python 3.7 support (#112). 10 | * Added Python 3.13 support (#112). 11 | * Rebuild Cython wrapper with Cython 3.1.1 (#117). 
12 | 13 | 1.2.1 (2024-10-12) 14 | ------------------ 15 | 16 | * Publish Python 3.13 wheels (only CPython ones, PyPy ones are skipped until https://github.com/pypa/distutils/issues/283 is fixed). 17 | * Rebuild Cython wrapper with Cython 3.0.11. 18 | 19 | 1.2.0 (2024-06-05) 20 | ------------------ 21 | 22 | * Added Python 3.13 support (#105). 23 | * Rebuild Cython wrapper with Cython 3.0.10 (#105). 24 | 25 | 1.1.1 (2024-05-06) 26 | ------------------ 27 | 28 | * Publish Linux aarch64 wheels (#101). 29 | 30 | 1.1.0 (2023-10-06) 31 | ------------------ 32 | 33 | * Added Python 3.12 support. 34 | 35 | 1.0.0 (2023-09-03) 36 | ------------------ 37 | 38 | * Dropped Python 2.7, 3.4, 3.5, 3.6 support. 39 | * Added ``Trie.map()`` (#90). 40 | * Rebuilt Cython wrapper with Cython 3.0.2. 41 | * Fixed benchmark documentation typos (#89). 42 | 43 | 0.8.0 (2023-03-25) 44 | ------------------ 45 | 46 | * Add ``Trie.iter_prefixes_with_ids()`` method to return ``(prefix, id)`` pairs (#83). 47 | * Rebuild Cython wrapper with Cython 0.29.33 (#88). 48 | 49 | 0.7.8 (2022-10-25) 50 | ------------------ 51 | 52 | * Added Python 3.11 support. 53 | * Rebuild Cython wrapper with Cython 0.29.32. 54 | 55 | 0.7.7 (2021-08-04) 56 | ------------------ 57 | 58 | * Restored Python 2.7 support. 59 | * Fixed README image references not working on Windows. 60 | 61 | 0.7.6 (2021-07-28) 62 | ------------------ 63 | 64 | * Wheels are now published for all platforms. 65 | * Fixed ``ResourceWarning: unclosed file`` in ``setup.py``. 66 | * Run ``black`` on the entire source code. 67 | * Moved the QA/CI to GitHub. 68 | * Rebuild Cython wrapper with Cython 0.29.24. 69 | * Updated ``libmarisa-trie`` to the latest version (0.2.6). 70 | * Fixed failing tests and usage of deprecated methods. 71 | * Expanded supported Python version (2.7, 3.4 - 3.10). 72 | 73 | 0.7.5 (2018-04-10) 74 | ------------------ 75 | 76 | * Removed redundant ``DeprecationWarning`` messages in ``Trie.save`` and 77 | ``Trie.load``. 78 | * Dropped support for Python 2.6. 79 | * Rebuild Cython wrapper with Cython 0.28.1. 80 | 81 | 0.7.4 (2017-03-27) 82 | ------------------ 83 | 84 | * Fixed packaging issue, ``MANIFEST.in`` was not updated after ``libmarisa-trie`` 85 | became a submodule. 86 | 87 | 0.7.3 (2017-02-14) 88 | ------------------ 89 | 90 | * Added ``BinaryTrie`` for storing arbitrary sequences of bytes, e.g. IP 91 | addresses (thanks Tomasz Melcer); 92 | * Deprecated ``Trie.has_keys_with_prefix`` which can be trivially implemented in 93 | terms of ``Trie.iterkeys``; 94 | * Deprecated ``Trie.read`` and ``Trie.write`` which onlywork for "real" files 95 | and duplicate the functionality of ``load`` and ``save``. See issue #31 on 96 | GitHub; 97 | * Updated ``libmarisa-trie`` to the latest version. Yay, 64-bit Windows support. 98 | * Rebuilt Cython wrapper with Cython 0.25.2. 99 | 100 | 0.7.2 (2015-04-21) 101 | ------------------ 102 | 103 | * packaging issue is fixed. 104 | 105 | 0.7.1 (2015-04-21) 106 | ------------------ 107 | 108 | * setup.py is switched to setuptools; 109 | * a tiny speedup; 110 | * wrapper is rebuilt with Cython 0.22. 111 | 112 | 0.7 (2014-12-15) 113 | ---------------- 114 | 115 | * ``trie1 == trie2`` and ``trie1 != trie2`` now work (thanks Sergei Lebedev); 116 | * ``for key in trie:`` is fixed (thanks Sergei Lebedev); 117 | * wrapper is rebuilt with Cython 0.21.1 (thanks Sergei Lebedev); 118 | * https://bitbucket.org/kmike/marisa-trie repo is no longer supported. 
119 | 120 | 0.6 (2014-02-22) 121 | ---------------- 122 | 123 | * New ``Trie`` methods: ``__getitem__``, ``get``, ``items``, ``iteritems``. 124 | ``trie[u'key']`` is now the same as ``trie.key_id(u'key')``. 125 | * small optimization for ``BytesTrie.get``. 126 | * wrapper is rebuilt with Cython 0.20.1. 127 | 128 | 0.5.3 (2014-02-08) 129 | ------------------ 130 | 131 | * small ``Trie.restore_key`` optimization (it should work 5-15% faster) 132 | 133 | 0.5.2 (2014-02-08) 134 | ------------------ 135 | 136 | * fix ``Trie.restore_key`` method - it was reading past declared string length; 137 | * rebuild wrapper with Cython 0.20. 138 | 139 | 0.5.1 (2013-10-03) 140 | ------------------ 141 | 142 | * ``has_keys_with_prefix(prefix)`` method (thanks 143 | `Matt Hickford `_) 144 | 145 | 0.5 (2013-05-07) 146 | ---------------- 147 | 148 | * ``BytesTrie.iterkeys``, ``BytesTrie.iteritems``, 149 | ``RecordTrie.iterkeys`` and ``RecordTrie.iteritems`` methods; 150 | * wrapper is rebuilt with Cython 0.19; 151 | * ``value_separator`` parameter for ``BytesTrie`` and ``RecordTrie``. 152 | 153 | 0.4 (2013-02-28) 154 | ---------------- 155 | 156 | * improved trie building: ``weights`` optional parameter; 157 | * improved trie building: unnecessary input sorting is removed; 158 | * wrapper is rebuilt with Cython 0.18; 159 | * bundled marisa-trie C++ library is updated to svn r133. 160 | 161 | 0.3.8 (2013-01-03) 162 | ------------------ 163 | 164 | * Rebuild wrapper with Cython pre-0.18; 165 | * update benchmarks. 166 | 167 | 0.3.7 (2012-09-21) 168 | ------------------ 169 | 170 | * Update bundled marisa-trie C++ library (this may fix more mingw issues); 171 | * Python 3.3 support is back. 172 | 173 | 0.3.6 (2012-09-05) 174 | ------------------ 175 | 176 | * much faster (3x-7x) ``.items()`` and ``.keys()`` methods for all tries; 177 | faster (up to 3x) ``.prefixes()`` method for ``Trie``. 178 | 179 | 0.3.5 (2012-08-30) 180 | ------------------ 181 | 182 | * Pickling of RecordTrie is fixed (thanks lazarou for the report); 183 | * error messages should become more useful. 184 | 185 | 0.3.4 (2012-08-29) 186 | ------------------ 187 | 188 | * Issues with mingw32 should be resolved (thanks Susumu Yata). 189 | 190 | 0.3.3 (2012-08-27) 191 | ------------------ 192 | 193 | * ``.get(key, default=None)`` method for ``BytesTrie`` and ``RecordTrie``; 194 | * small README improvements. 195 | 196 | 0.3.2 (2012-08-26) 197 | ------------------ 198 | 199 | * Small code cleanup; 200 | * ``load``, ``read`` and ``mmap`` methods returns 'self'; 201 | * I can't run tests (via tox) under Python 3.3 so it is 202 | removed from supported versions for now. 203 | 204 | 0.3.1 (2012-08-23) 205 | ------------------ 206 | 207 | * ``.prefixes()`` support for RecordTrie and BytesTrie. 208 | 209 | 0.3 (2012-08-23) 210 | ---------------- 211 | 212 | * RecordTrie and BytesTrie are introduced; 213 | * IntTrie class is removed (probably temporary?); 214 | * dumps/loads methods are renamed to tobytes/frombytes; 215 | * benchmark & tests improvements; 216 | * support for MARISA-trie config options is added. 217 | 218 | 0.2 (2012-08-19) 219 | ------------------ 220 | 221 | * Pickling/unpickling support; 222 | * dumps/loads methods; 223 | * python 3.3 workaround; 224 | * improved tests; 225 | * benchmarks. 226 | 227 | 0.1 (2012-08-17) 228 | ---------------- 229 | 230 | Initial release. 
231 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) marisa-trie authors and contributors, 2012-2025 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is furnished 8 | to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 14 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR 15 | A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 16 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 17 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 18 | OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include CHANGES.rst 3 | include LICENSE 4 | include update_cpp.sh 5 | 6 | recursive-include src *.cpp *.pxd *.pyx 7 | recursive-include marisa-trie/lib/marisa *.h *.cc 8 | recursive-include marisa-trie/include/marisa *.h 9 | recursive-include tests *.py 10 | recursive-include bench *.py 11 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | MARISA Trie 2 | =========== 3 | 4 | |PyPI Version| 5 | |PyPI Status| 6 | |PyPI Python Versions| 7 | |Github Build Status| 8 | 9 | .. tip:: 10 | 11 | Become **my boss** to help me work on this awesome software, and make the world better: 12 | 13 | |Patreon| 14 | 15 | Static memory-efficient Trie-like structures for Python (3.8+) 16 | based on `marisa-trie`_ C++ library. 17 | 18 | String data in a MARISA-trie may take up to 50x-100x less memory than 19 | in a standard Python dict; the raw lookup speed is comparable; trie also 20 | provides fast advanced methods like prefix search. 21 | 22 | .. note:: 23 | 24 | There are official SWIG-based Python bindings included 25 | in C++ library distribution; this package provides alternative 26 | Cython-based pip-installable Python bindings. 27 | 28 | .. _marisa-trie: https://github.com/s-yata/marisa-trie 29 | 30 | Installation 31 | ============ 32 | 33 | :: 34 | 35 | python -m pip install -U marisa-trie 36 | 37 | Usage 38 | ===== 39 | 40 | See `tutorial`_ and `API`_ for details. 41 | 42 | .. _tutorial: https://marisa-trie.readthedocs.io/en/latest/tutorial.html 43 | .. 
_API: https://marisa-trie.readthedocs.io/en/latest/api.html 44 | 45 | Current limitations 46 | =================== 47 | 48 | * The library is not tested with mingw32 compiler; 49 | * ``.prefixes()`` method of ``BytesTrie`` and ``RecordTrie`` is quite slow 50 | and doesn't have iterator counterpart; 51 | * ``read()`` and ``write()`` methods don't work with file-like objects 52 | (they work only with real files; pickling works fine for file-like objects); 53 | * there are ``keys()`` and ``items()`` methods but no ``values()`` method. 54 | 55 | License 56 | ======= 57 | 58 | Wrapper code is licensed under MIT License. 59 | 60 | Bundled `marisa-trie`_ C++ library is dual-licensed under 61 | LGPL and BSD 2-clause license. 62 | 63 | .. |PyPI Version| image:: https://img.shields.io/pypi/v/marisa-trie.svg 64 | :target: https://pypi.python.org/pypi/marisa-trie/ 65 | .. |PyPI Status| image:: https://img.shields.io/pypi/status/marisa-trie.svg 66 | :target: https://pypi.python.org/pypi/marisa-trie/ 67 | .. |PyPI Python Versions| image:: https://img.shields.io/pypi/pyversions/marisa-trie.svg 68 | :target: https://pypi.python.org/pypi/marisa-trie/ 69 | .. |Github Build Status| image:: https://github.com/pytries/marisa-trie/actions/workflows/tests.yml/badge.svg 70 | :target: https://github.com/pytries/marisa-trie/actions/workflows/tests.yml 71 | .. |Patreon| image:: https://img.shields.io/badge/Patreon-F96854?style=for-the-badge&logo=patreon&logoColor=white 72 | :target: https://www.patreon.com/mschoentgen 73 | -------------------------------------------------------------------------------- /bench/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/marisa-trie/97cfda688aee37565f6d4a414cc66dd5384cb4ad/bench/__init__.py -------------------------------------------------------------------------------- /bench/speed.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | import random 4 | import string 5 | import struct 6 | import timeit 7 | 8 | import marisa_trie 9 | 10 | 11 | def words100k(): 12 | zip_name = os.path.join( 13 | os.path.abspath(os.path.dirname(__file__)), "words100k.txt.gz" 14 | ) 15 | 16 | return list(map(str.rstrip, gzip.open(zip_name, "rt"))) 17 | 18 | 19 | def random_words(num): 20 | russian = "абвгдеёжзиклмнопрстуфхцчъыьэюя" 21 | alphabet = f"{russian}{string.ascii_letters}" 22 | return [ 23 | "".join(random.choice(alphabet) for _ in range(random.randint(1, 15))) 24 | for _ in range(num) 25 | ] 26 | 27 | 28 | def truncated_words(words): 29 | return [word[:3] for word in words] 30 | 31 | 32 | def prefixes1k(words, prefix_len): 33 | words = [w for w in words if len(w) >= prefix_len] 34 | every_nth = len(words) // 1000 35 | _words = [w[:prefix_len] for w in words[::every_nth]] 36 | return _words[:1000] 37 | 38 | 39 | WORDS100k = words100k() 40 | MIXED_WORDS100k = truncated_words(WORDS100k) 41 | NON_WORDS100k = random_words(100000) 42 | PREFIXES_3_1k = prefixes1k(WORDS100k, 3) 43 | PREFIXES_5_1k = prefixes1k(WORDS100k, 5) 44 | PREFIXES_8_1k = prefixes1k(WORDS100k, 8) 45 | PREFIXES_15_1k = prefixes1k(WORDS100k, 15) 46 | 47 | 48 | def format_result(key, value, text_width): 49 | key = key.ljust(text_width) 50 | print(f" {key} {value}") 51 | 52 | 53 | def bench( 54 | name, timer, descr="M ops/sec", op_count=0.1, repeats=3, runs=5, text_width=33 55 | ): 56 | try: 57 | times = [] 58 | for x in range(runs): 59 | times.append(timer.timeit(repeats)) 60 | 61 
| def op_time(time):
62 | return op_count * repeats / time
63 |
64 | val = f"{op_time(min(times)):0.3f}{descr}"
65 | format_result(name, val, text_width)
66 | except (AttributeError, TypeError):
67 | format_result(name, "not supported", text_width)
68 |
69 |
70 | def create_trie():
71 | words = WORDS100k
72 | return marisa_trie.Trie(words)
73 |
74 |
75 | def create_bytes_trie():
76 | words = WORDS100k
77 | values = (struct.pack("<H", len(word)) for word in words)
78 | return marisa_trie.BytesTrie(zip(words, values))
-------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = _build
9 |
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 |
15 | # Internal variables.
16 | PAPEROPT_a4 = -D latex_paper_size=a4
17 | PAPEROPT_letter = -D latex_paper_size=letter
18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
19 | # the i18n builder cannot share the environment and doctrees with the others
20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
21 |
22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
23 |
24 | help:
25 | @echo "Please use \`make <target>' where <target> is one of"
26 | @echo " html to make standalone HTML files"
27 | @echo " dirhtml to make HTML files named index.html in directories"
28 | @echo " singlehtml to make a single large HTML file"
29 | @echo " pickle to make pickle files"
30 | @echo " json to make JSON files"
31 | @echo " htmlhelp to make HTML files and a HTML help project"
32 | @echo " qthelp to make HTML files and a qthelp project"
33 | @echo " applehelp to make an Apple Help Book"
34 | @echo " devhelp to make HTML files and a Devhelp project"
35 | @echo " epub to make an epub"
36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
37 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
39 | @echo " text to make text files"
40 | @echo " man to make manual pages"
41 | @echo " texinfo to make Texinfo files"
42 | @echo " info to make Texinfo files and run them through makeinfo"
43 | @echo " gettext to make PO message catalogs"
44 | @echo " changes to make an overview of all changed/added/deprecated items"
45 | @echo " xml to make Docutils-native XML files"
46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes"
47 | @echo " linkcheck to check all external links for integrity"
48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
49 | @echo " coverage to run coverage check of the documentation (if enabled)"
50 |
51 | clean:
52 | rm -rf $(BUILDDIR)/*
53 |
54 | html:
55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
56 | @echo
57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
58 |
59 | dirhtml:
60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
61 | @echo
62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
63 |
64 | singlehtml:
65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
66 | @echo
67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
68 |
69 | pickle:
70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
71 | @echo
72 | @echo "Build finished; now you can process the pickle files."
73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/marisa-trie.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/marisa-trie.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/marisa-trie" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/marisa-trie" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
162 |
163 | changes:
164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
165 | @echo
166 | @echo "The overview file is in $(BUILDDIR)/changes."
167 |
168 | linkcheck:
169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
170 | @echo
171 | @echo "Link check complete; look for any errors in the above output " \
172 | "or in $(BUILDDIR)/linkcheck/output.txt."
173 |
174 | doctest:
175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
176 | @echo "Testing of doctests in the sources finished, look at the " \
177 | "results in $(BUILDDIR)/doctest/output.txt."
178 |
179 | coverage:
180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
181 | @echo "Testing of coverage in the sources finished, look at the " \
182 | "results in $(BUILDDIR)/coverage/python.txt."
183 |
184 | xml:
185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
186 | @echo
187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
188 |
189 | pseudoxml:
190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
191 | @echo
192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
193 | -------------------------------------------------------------------------------- /docs/api.rst: --------------------------------------------------------------------------------
1 | .. _api:
2 |
3 |
4 | API reference
5 | =============
6 |
7 | BinaryTrie
8 | ----------
9 |
10 | .. autoclass:: marisa_trie.BinaryTrie
11 | :members:
12 | :inherited-members:
13 |
14 | Trie
15 | ----
16 |
17 | .. autoclass:: marisa_trie.Trie
18 | :members:
19 | :inherited-members:
20 |
21 | BytesTrie
22 | ---------
23 |
24 | .. autoclass:: marisa_trie.BytesTrie
25 | :members:
26 |
27 |
28 | RecordTrie
29 | ----------
30 |
31 | .. autoclass:: marisa_trie.RecordTrie
32 | :members:
33 | -------------------------------------------------------------------------------- /docs/benchmarks.rst: --------------------------------------------------------------------------------
1 | Benchmarks
2 | ==========
3 |
4 | My quick tests show that memory usage is quite decent.
5 | For a list of 3000000 (3 million) Russian words, memory consumption
6 | with different data structures (under Python 2.7):
7 |
8 | * dict(unicode words -> word lengths): about 600M
9 | * list(unicode words) : about 300M
10 | * BaseTrie from datrie_ library: about 70M
11 | * ``marisa_trie.RecordTrie`` : 11M
12 | * ``marisa_trie.Trie``: 7M
13 |
14 |
15 | .. note::
16 |
17 | Lengths of words were stored as values in ``datrie.BaseTrie``
18 | and ``marisa_trie.RecordTrie``. ``RecordTrie`` compresses
19 | similar values and the key compression is better so it uses
20 | much less memory than ``datrie.BaseTrie``.
21 |
22 | ``marisa_trie.Trie`` provides auto-assigned IDs. It is not possible
23 | to store arbitrary values in ``marisa_trie.Trie`` so it uses less
24 | memory than ``RecordTrie``.
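
For illustration, here is a minimal sketch of the two setups compared above
(the word list and the ``"<H"`` struct format are just examples)::

    import marisa_trie

    words = ["foo", "bar", "foobar"]  # stands in for the 3M-word list

    # Trie stores keys only; payloads live in a plain list indexed by key ID.
    trie = marisa_trie.Trie(words)
    lengths = [0] * len(words)
    for word in words:
        lengths[trie[word]] = len(word)

    # RecordTrie stores the payload (here one unsigned short per key) inside the trie.
    record_trie = marisa_trie.RecordTrie("<H", ((w, (len(w),)) for w in words))
    assert record_trie["foobar"] == [(6,)]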
25 | 26 | Benchmark results (100k unicode words, integer values (lengths of the words), 27 | Python 3.2, Macbook Air i5 1.8 Ghz):: 28 | 29 | dict building 2.919M words/sec 30 | Trie building 0.394M words/sec 31 | BytesTrie building 0.355M words/sec 32 | RecordTrie building 0.354M words/sec 33 | 34 | dict __getitem__ (hits) 8.239M ops/sec 35 | Trie __getitem__ (hits) not supported 36 | BytesTrie __getitem__ (hits) 0.498M ops/sec 37 | RecordTrie __getitem__ (hits) 0.404M ops/sec 38 | 39 | dict get() (hits) 4.410M ops/sec 40 | Trie get() (hits) not supported 41 | BytesTrie get() (hits) 0.458M ops/sec 42 | RecordTrie get() (hits) 0.364M ops/sec 43 | dict get() (misses) 4.869M ops/sec 44 | Trie get() (misses) not supported 45 | BytesTrie get() (misses) 0.849M ops/sec 46 | RecordTrie get() (misses) 0.816M ops/sec 47 | 48 | dict __contains__ (hits) 8.053M ops/sec 49 | Trie __contains__ (hits) 1.018M ops/sec 50 | BytesTrie __contains__ (hits) 0.605M ops/sec 51 | RecordTrie __contains__ (hits) 0.618M ops/sec 52 | dict __contains__ (misses) 6.489M ops/sec 53 | Trie __contains__ (misses) 2.047M ops/sec 54 | BytesTrie __contains__ (misses) 1.079M ops/sec 55 | RecordTrie __contains__ (misses) 1.123M ops/sec 56 | 57 | dict items() 57.248 ops/sec 58 | Trie items() not supported 59 | BytesTrie items() 11.691 ops/sec 60 | RecordTrie items() 8.369 ops/sec 61 | 62 | dict keys() 217.920 ops/sec 63 | Trie keys() 19.589 ops/sec 64 | BytesTrie keys() 14.849 ops/sec 65 | RecordTrie keys() 15.369 ops/sec 66 | 67 | Trie.prefixes (hits) 0.594M ops/sec 68 | Trie.prefixes (mixed) 1.874M ops/sec 69 | Trie.prefixes (misses) 1.447M ops/sec 70 | RecordTrie.prefixes (hits) 0.103M ops/sec 71 | RecordTrie.prefixes (mixed) 0.458M ops/sec 72 | RecordTrie.prefixes (misses) 0.164M ops/sec 73 | Trie.iter_prefixes (hits) 0.588M ops/sec 74 | Trie.iter_prefixes (mixed) 1.470M ops/sec 75 | Trie.iter_prefixes (misses) 1.170M ops/sec 76 | 77 | Trie.keys(prefix="xxx"), avg_len(res)==415 5.044K ops/sec 78 | Trie.keys(prefix="xxxxx"), avg_len(res)==17 89.363K ops/sec 79 | Trie.keys(prefix="xxxxxxxx"), avg_len(res)==3 258.732K ops/sec 80 | Trie.keys(prefix="xxxxx..xx"), avg_len(res)==1.4 293.199K ops/sec 81 | Trie.keys(prefix="xxx"), NON_EXISTING 1169.524K ops/sec 82 | 83 | RecordTrie.keys(prefix="xxx"), avg_len(res)==415 3.836K ops/sec 84 | RecordTrie.keys(prefix="xxxxx"), avg_len(res)==17 73.591K ops/sec 85 | RecordTrie.keys(prefix="xxxxxxxx"), avg_len(res)==3 229.515K ops/sec 86 | RecordTrie.keys(prefix="xxxxx..xx"), avg_len(res)==1.4 269.228K ops/sec 87 | RecordTrie.keys(prefix="xxx"), NON_EXISTING 1071.433K ops/sec 88 | 89 | 90 | Tries from ``marisa_trie`` are static and uses less memory, tries from 91 | `datrie`_ are faster and can be updated. 92 | 93 | You may also give DAWG_ a try - it is usually faster than 94 | ``marisa-trie`` and sometimes can use less memory (depending on data). 95 | 96 | Please take this benchmark results with a grain of salt; this 97 | is a very simple benchmark on a single data set. 98 | 99 | .. _datrie: https://github.com/kmike/datrie 100 | .. _DAWG: https://github.com/kmike/DAWG 101 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../CHANGES.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # marisa-trie documentation build configuration file, created by 5 | # sphinx-quickstart on Thu Mar 24 00:18:01 2016. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | sys.path.insert(0, os.path.abspath("..")) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | "sphinx.ext.autodoc", 34 | "sphinx.ext.autosummary", 35 | "sphinx.ext.intersphinx", 36 | ] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ["_templates"] 40 | 41 | # The suffix(es) of source filenames. 42 | # You can specify multiple suffix as a list of string: 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = ".rst" 45 | 46 | # The encoding of source files. 47 | # source_encoding = 'utf-8-sig' 48 | 49 | # The master toctree document. 50 | master_doc = "index" 51 | 52 | # General information about the project. 53 | project = "marisa-trie" 54 | copyright = "2016-2025, Mikhail Korobov & contributors" 55 | author = "Mikhail Korobov & contributors" 56 | 57 | # The version info for the project you're documenting, acts as replacement for 58 | # |version| and |release|, also used in various other places throughout the 59 | # built documents. 60 | # 61 | # The short X.Y version. 62 | version = "1.2.1" 63 | # The full version, including alpha/beta/rc tags. 64 | release = version 65 | 66 | # The language for content autogenerated by Sphinx. Refer to documentation 67 | # for a list of supported languages. 68 | # 69 | # This is also used if you do content translation via gettext catalogs. 70 | # Usually you set "language" from the command line for these cases. 71 | language = None 72 | 73 | # List of patterns, relative to source directory, that match files and 74 | # directories to ignore when looking for source files. 75 | exclude_patterns = ["_build"] 76 | 77 | # The name of the Pygments (syntax highlighting) style to use. 78 | pygments_style = "sphinx" 79 | 80 | # If true, `todo` and `todoList` produce output, else they produce nothing. 81 | todo_include_todos = False 82 | 83 | # Example configuration for intersphinx: refer to the Python standard library. 
84 | intersphinx_mapping = {"https://docs.python.org/3/": None} 85 | 86 | # -- Options for HTML output ---------------------------------------------- 87 | 88 | ## Read the docs style: 89 | try: 90 | import sphinx_rtd_theme 91 | except ImportError: 92 | html_theme = "classic" 93 | else: 94 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 95 | html_theme = "sphinx_rtd_theme" 96 | 97 | # Add any paths that contain custom static files (such as style sheets) here, 98 | # relative to this directory. They are copied after the builtin static files, 99 | # so a file named "default.css" will overwrite the builtin "default.css". 100 | html_static_path = [] 101 | 102 | # Output file base name for HTML help builder. 103 | htmlhelp_basename = "marisa-triedoc" 104 | 105 | # -- Options for LaTeX output --------------------------------------------- 106 | 107 | latex_elements = { 108 | # The paper size ('letterpaper' or 'a4paper'). 109 | #'papersize': 'letterpaper', 110 | # The font size ('10pt', '11pt' or '12pt'). 111 | #'pointsize': '10pt', 112 | # Additional stuff for the LaTeX preamble. 113 | #'preamble': '', 114 | # Latex figure (float) alignment 115 | #'figure_align': 'htbp', 116 | } 117 | 118 | # Grouping the document tree into LaTeX files. List of tuples 119 | # (source start file, target name, title, 120 | # author, documentclass [howto, manual, or own class]). 121 | latex_documents = [ 122 | ( 123 | master_doc, 124 | "marisa-trie.tex", 125 | "marisa-trie Documentation", 126 | author, 127 | "manual", 128 | ), 129 | ] 130 | 131 | 132 | # -- Options for manual page output --------------------------------------- 133 | 134 | # One entry per manual page. List of tuples 135 | # (source start file, name, description, authors, manual section). 136 | man_pages = [(master_doc, "marisa-trie", "marisa-trie Documentation", [author], 1)] 137 | 138 | 139 | # -- Options for Texinfo output ------------------------------------------- 140 | 141 | # Grouping the document tree into Texinfo files. List of tuples 142 | # (source start file, target name, title, author, 143 | # dir menu entry, description, category) 144 | texinfo_documents = [ 145 | ( 146 | master_doc, 147 | "marisa-trie", 148 | "marisa-trie Documentation", 149 | author, 150 | "marisa-trie", 151 | "One line description of project.", 152 | "Miscellaneous", 153 | ), 154 | ] 155 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | Contributions are welcome! Development happens at 5 | `GitHub `_. Feel free to submit 6 | ideas, bug reports and pull requests. 7 | 8 | If you found a bug in a C++ part please report it to the original 9 | `bug tracker `_. 10 | 11 | Navigating the source code 12 | -------------------------- 13 | 14 | There are 4 folders in repository: 15 | 16 | * ``bench`` -- benchmarks & benchmark data; 17 | * ``lib`` -- original unmodified `marisa-trie`_ C++ library which is a git 18 | submodule; if something is have to be fixed in this library 19 | consider fixing it in the original repo; 20 | * ``src`` -- wrapper code; ``src/marisa_trie.pyx`` is a wrapper implementation; 21 | ``src/*.pxd`` files are Cython headers for corresponding C++ headers; 22 | ``src/*.cpp`` files are the pre-built extension code and shouldn't be 23 | modified directly (they should be updated via ``update_cpp.sh`` script). 24 | * ``tests`` -- the test suite. 25 | 26 | .. 
_marisa-trie: https://github.com/s-yata/marisa-trie 27 | 28 | Running tests and benchmarks 29 | ---------------------------- 30 | 31 | Make sure ``pytest`` is installed and run 32 | 33 | :: 34 | 35 | $ python -m pytest 36 | 37 | from the source checkout. Tests should pass under Python 3.8 and newer. 38 | 39 | In order to run benchmarks, type 40 | 41 | :: 42 | 43 | $ python bench/speed.py 44 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | 3 | .. toctree:: 4 | :hidden: 5 | 6 | tutorial 7 | benchmarks 8 | api 9 | contributing 10 | changelog 11 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 2> nul 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 
67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\marisa-trie.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\marisa-trie.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 
185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /docs/tutorial.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial: 2 | 3 | Tutorial 4 | ======== 5 | 6 | Tries 7 | ----- 8 | 9 | There are several trie classes in this package: 10 | 11 | .. autosummary:: 12 | :nosignatures: 13 | 14 | marisa_trie.BinaryTrie 15 | marisa_trie.Trie 16 | marisa_trie.RecordTrie 17 | marisa_trie.BytesTrie 18 | 19 | marisa_trie.Trie 20 | ~~~~~~~~~~~~~~~~ 21 | 22 | Create a new trie from a list of keys:: 23 | 24 | >>> import marisa_trie 25 | >>> trie = marisa_trie.Trie(["key1", "key2", "key12"]) 26 | 27 | Check if a key is present:: 28 | 29 | >>> "key1" in trie 30 | True 31 | >>> "key20" in trie 32 | False 33 | 34 | Each key is assigned an unique ID from 0 to (n - 1), where n is the 35 | number of keys in a trie:: 36 | 37 | >>> trie["key2"] 38 | 1 39 | 40 | Note that you can't assign a value to a ``marisa_trie.Trie`` key, 41 | but can use the returned ID to store values in a separate data structure 42 | (e.g. in a Python list or NumPy array). 
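
For example, a minimal sketch of that pattern (the payloads here are arbitrary)::

    >>> keys = ["key1", "key2", "key12"]
    >>> trie = marisa_trie.Trie(keys)
    >>> payloads = [None] * len(keys)
    >>> for key in keys:
    ...     payloads[trie[key]] = key.upper()  # any value you need to attach
    >>> payloads[trie["key2"]]
    'KEY2'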
43 |
44 | An ID can be mapped back to the corresponding key::
45 |
46 | >>> trie.restore_key(1)
47 | "key2"
48 |
49 | Query a trie
50 |
51 | * Find all trie keys which are prefixes of a given key::
52 |
53 | >>> trie.prefixes("key12")
54 | ["key1", "key12"]
55 |
56 | * Find all trie keys which start with a given prefix::
57 |
58 | >> trie.keys("key1")
59 | ["key1", "key12"]
60 |
61 | * The latter is complemented by :meth:`~marisa_trie.Trie.items` which
62 | returns all matching ``(key, ID)`` pairs.
63 |
64 | All query methods have generator-based versions prefixed with ``iter``.
65 |
66 | .. note::
67 |
68 | If you're looking for a trie with bytes keys, check out
69 | :class:`~marisa_trie.BinaryTrie`.
70 |
71 |
72 | marisa_trie.RecordTrie
73 | ~~~~~~~~~~~~~~~~~~~~~~
74 |
75 | Create a new trie from a list of ``(key, data)`` pairs::
76 |
77 | >>> keys = ["foo", "bar", "foobar", "foo"]
78 | >>> values = [(1, 2), (2, 1), (3, 3), (2, 1)]
79 | >>> fmt = "<HH"
80 | >>> trie = marisa_trie.RecordTrie(fmt, zip(keys, values))
81 |
82 | Each data tuple is converted to bytes using :func:`struct.pack`. Take a
83 | look at the available `format strings <https://docs.python.org/3/library/struct.html#format-strings>`_.
84 |
85 | Check if a key is present::
86 |
87 | >>> "foo" in trie
88 | True
89 | >>> "spam" in trie
90 | False
91 |
92 | ``marisa_trie.RecordTrie`` allows duplicate keys. Therefore ``__getitem__`` and
93 | ``get`` return a list of values.
94 |
95 | >>> trie["bar"]
96 | [(2, 1)]
97 | >>> trie["foo"]
98 | [(1, 2), (2, 1)]
99 | >>> trie.get("bar", 123)
100 | [(2, 1)]
101 | >>> trie.get("BAAR", 123) # default value.
102 | 123
103 |
104 | Similarly, :meth:`~marisa_trie.RecordTrie.keys` and
105 | :meth:`~marisa_trie.RecordTrie.items` take into account key multiplicities::
106 |
107 | >> trie.keys("fo")
108 | ["foo", "foo", "foobar"]
109 | >> trie.items("fo")
110 | [("foo", (1, 2)), ("foo", (2, 1)), ("foobar", (3, 3))]
111 |
112 |
113 | marisa_trie.BytesTrie
114 | ~~~~~~~~~~~~~~~~~~~~~
115 |
116 | ``BytesTrie`` is similar to ``RecordTrie``, but the values are raw bytes,
117 | not tuples::
118 |
119 | >>> keys = ["foo", "bar", "foobar", "foo"]
120 | >>> values = [b'foo-value', b'bar-value', b'foobar-value', b'foo-value2']
121 | >>> trie = marisa_trie.BytesTrie(zip(keys, values))
122 | >>> trie["bar"]
123 | [b'bar-value']
124 |
125 |
126 | Persistence
127 | -----------
128 |
129 | Trie objects support saving/loading, pickling/unpickling and memory mapped I/O.
130 |
131 | Save a trie to a file::
132 |
133 | >>> trie.save('my_trie.marisa')
134 |
135 | Load a trie from a file::
136 |
137 | >>> trie2 = marisa_trie.Trie()
138 | >>> trie2.load('my_trie.marisa')
139 |
140 | .. note:: You may also build a trie using the ``marisa-build`` command-line
141 | utility (provided by the underlying C++ library; it should be
142 | downloaded and compiled separately) and then load the trie
143 | from the resulting file using ``load``.
144 |
145 | Trie objects are picklable::
146 |
147 | >>> import pickle
148 | >>> data = pickle.dumps(trie)
149 | >>> trie3 = pickle.loads(data)
150 |
151 |
152 | Memory mapped I/O
153 | -----------------
154 |
155 | It is possible to use a memory mapped file as the data source::
156 |
157 | >>> trie = marisa_trie.RecordTrie(fmt).mmap('my_record_trie.marisa')
158 |
159 | This way the whole dictionary won't be fully loaded into memory; memory
160 | mapped I/O is an easy way to share dictionary data among processes.
161 |
162 | .. warning::
163 |
164 | A memory mapped trie might cause lots of random disk accesses which
165 | considerably increase the search time.
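
For instance, each worker process can map the same trie file (a minimal
sketch; the file name and pool setup are illustrative, not part of the API)::

    import marisa_trie
    from multiprocessing import Pool

    _trie = None

    def init_worker():
        global _trie
        # Each worker process maps the same file; the pages are shared by the OS.
        _trie = marisa_trie.Trie().mmap("my_trie.marisa")

    def lookup(key):
        return key in _trie

    if __name__ == "__main__":
        marisa_trie.Trie(["key1", "key2", "key12"]).save("my_trie.marisa")
        with Pool(processes=2, initializer=init_worker) as pool:
            print(pool.map(lookup, ["key1", "key3"]))  # [True, False]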
166 |
167 |
168 | Storage options
169 | ---------------
170 |
171 | The `marisa-trie <https://github.com/s-yata/marisa-trie>`_ C++ library provides
172 | some configuration options for trie storage; see the "Enumeration Constants"
173 | section in the library
174 | `docs `_.
175 |
176 | These options are exposed as ``order``, ``num_tries``, ``cache_size``
177 | and ``binary`` keyword arguments for trie constructors.
178 |
179 | For example, set ``order`` to ``marisa_trie.LABEL_ORDER`` in order to
180 | make trie functions return results in alphabetical order::
181 |
182 | >>> trie = marisa_trie.RecordTrie(fmt, data, order=marisa_trie.LABEL_ORDER)
183 |
184 | Note that two tries constructed from identical data but with different ``order``
185 | arguments will compare unequal::
186 |
187 | >>> t1 = marisa_trie.Trie(order=marisa_trie.LABEL_ORDER)
188 | >>> t2 = marisa_trie.Trie(order=marisa_trie.WEIGHT_ORDER)
189 | >>> t1 == t2
190 | False
191 | -------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | """Static memory-efficient and fast Trie-like structures for Python."""
2 |
3 | import glob
4 | import itertools
5 | import os.path
6 |
7 | from setuptools import setup, Extension
8 |
9 |
10 | # Note: keep requirements here to ease distributions packaging
11 | tests_require = [
12 | "hypothesis",
13 | "pytest",
14 | "readme_renderer",
15 | ]
16 | install_requires = [
17 | "setuptools",
18 | ]
19 |
20 | MARISA_ROOT_DIR = "marisa-trie"
21 | MARISA_SOURCE_DIR = os.path.join(MARISA_ROOT_DIR, "lib")
22 | MARISA_INCLUDE_DIR = os.path.join(MARISA_ROOT_DIR, "include")
23 | MARISA_FILES = [
24 | "marisa/*.cc",
25 | "marisa/grimoire.cc",
26 | "marisa/grimoire/io/*.cc",
27 | "marisa/grimoire/trie/*.cc",
28 | "marisa/grimoire/vector/*.cc",
29 | ]
30 |
31 | MARISA_FILES[:] = itertools.chain(
32 | *(glob.glob(os.path.join(MARISA_SOURCE_DIR, path)) for path in MARISA_FILES)
33 | )
34 |
35 | DESCRIPTION = __doc__
36 | with open("README.rst", encoding="utf-8") as f1, open(
37 | "CHANGES.rst", encoding="utf-8"
38 | ) as f2:
39 | LONG_DESCRIPTION = f1.read() + f2.read()
40 | LICENSE = "MIT"
41 |
42 | CLASSIFIERS = [
43 | "Development Status :: 4 - Beta",
44 | "Intended Audience :: Developers",
45 | "Intended Audience :: Science/Research",
46 | "License :: OSI Approved :: MIT License",
47 | "Programming Language :: Cython",
48 | "Programming Language :: Python",
49 | "Programming Language :: Python :: 3",
50 | "Programming Language :: Python :: 3.8",
51 | "Programming Language :: Python :: 3.9",
52 | "Programming Language :: Python :: 3.10",
53 | "Programming Language :: Python :: 3.11",
54 | "Programming Language :: Python :: 3.12",
55 | "Programming Language :: Python :: 3.13",
56 | "Programming Language :: Python :: 3.14",
57 | "Programming Language :: Python :: Implementation :: CPython",
58 | "Topic :: Software Development :: Libraries :: Python Modules",
59 | "Topic :: Scientific/Engineering :: Information Analysis",
60 | "Topic :: Text Processing :: Linguistic",
61 | ]
62 |
63 | setup(
64 | name="marisa-trie",
65 | version="1.2.1",
66 | description=DESCRIPTION,
67 | long_description=LONG_DESCRIPTION,
68 | long_description_content_type="text/x-rst",
69 | author="Mikhail Korobov",
70 | author_email="kmike84@gmail.com",
71 | license=LICENSE,
72 | url="https://github.com/pytries/marisa-trie",
73 | classifiers=CLASSIFIERS,
74 | libraries=[
75 | (
76 | "libmarisa-trie",
77 | {
78 | "sources": MARISA_FILES,
79 | "include_dirs":
[MARISA_SOURCE_DIR, MARISA_INCLUDE_DIR], 80 | }, 81 | ) 82 | ], 83 | ext_modules=[ 84 | Extension( 85 | "marisa_trie", 86 | [ 87 | "src/agent.cpp", 88 | "src/base.cpp", 89 | "src/iostream.cpp", 90 | "src/key.cpp", 91 | "src/keyset.cpp", 92 | "src/marisa_trie.cpp", 93 | "src/query.cpp", 94 | "src/std_iostream.cpp", 95 | "src/trie.cpp", 96 | ], 97 | include_dirs=[MARISA_INCLUDE_DIR], 98 | ) 99 | ], 100 | python_requires=">=3.8", 101 | install_requires=install_requires, 102 | extras_require={ 103 | "test": tests_require, 104 | }, 105 | ) 106 | -------------------------------------------------------------------------------- /src/agent.pxd: -------------------------------------------------------------------------------- 1 | cimport query, key 2 | 3 | cdef extern from "<marisa/agent.h>" namespace "marisa" nogil: 4 | cdef cppclass Agent: 5 | Agent() except + 6 | 7 | query.Query &query() 8 | key.Key &key() 9 | 10 | void set_query(char *str) 11 | void set_query(char *ptr, int length) 12 | void set_query(int key_id) 13 | 14 | void set_key(char *str) 15 | void set_key(char *ptr, int length) 16 | void set_key(int id) 17 | 18 | void clear() 19 | 20 | void init_state() 21 | 22 | void swap(Agent &rhs) 23 | -------------------------------------------------------------------------------- /src/base.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "<marisa/base.h>": 2 | 3 | # A dictionary consists of 3 tries by default. Usually more tries make a 4 | # dictionary more space-efficient but slower. 5 | ctypedef enum marisa_num_tries: 6 | MARISA_MIN_NUM_TRIES 7 | MARISA_MAX_NUM_TRIES 8 | MARISA_DEFAULT_NUM_TRIES 9 | 10 | 11 | # This library uses a cache technique to accelerate search functions. The 12 | # following enumerated type marisa_cache_level gives a list of available cache 13 | # size options. A larger cache enables faster search but takes more space. 14 | ctypedef enum marisa_cache_level: 15 | MARISA_HUGE_CACHE 16 | MARISA_LARGE_CACHE 17 | MARISA_NORMAL_CACHE 18 | MARISA_SMALL_CACHE 19 | MARISA_TINY_CACHE 20 | MARISA_DEFAULT_CACHE 21 | 22 | # This library provides 2 kinds of TAIL implementations. 23 | ctypedef enum marisa_tail_mode: 24 | # MARISA_TEXT_TAIL merges last labels as zero-terminated strings. So, it is 25 | # available if and only if the last labels do not contain a NULL character. 26 | # If MARISA_TEXT_TAIL is specified and a NULL character exists in the last 27 | # labels, the setting is automatically switched to MARISA_BINARY_TAIL. 28 | MARISA_TEXT_TAIL 29 | 30 | # MARISA_BINARY_TAIL also merges last labels but as byte sequences. It uses 31 | # a bit vector to detect the end of a sequence, instead of NULL characters. 32 | # So, MARISA_BINARY_TAIL requires a larger space if the average length of 33 | # labels is greater than 8. 34 | MARISA_BINARY_TAIL 35 | 36 | MARISA_DEFAULT_TAIL 37 | 38 | # The arrangement of nodes affects the time cost of matching and the order of 39 | # predictive search. 40 | ctypedef enum marisa_node_order: 41 | # MARISA_LABEL_ORDER arranges nodes in ascending label order. 42 | # MARISA_LABEL_ORDER is useful if an application needs to predict keys in 43 | # label order. 44 | MARISA_LABEL_ORDER 45 | 46 | # MARISA_WEIGHT_ORDER arranges nodes in descending weight order. 47 | # MARISA_WEIGHT_ORDER is generally a better choice because it enables faster 48 | # matching.
49 | MARISA_WEIGHT_ORDER 50 | MARISA_DEFAULT_ORDER 51 | 52 | ctypedef enum marisa_config_mask: 53 | MARISA_NUM_TRIES_MASK 54 | MARISA_CACHE_LEVEL_MASK 55 | MARISA_TAIL_MODE_MASK 56 | MARISA_NODE_ORDER_MASK 57 | MARISA_CONFIG_MASK 58 | 59 | 60 | cdef extern from "" namespace "marisa": 61 | ctypedef marisa_cache_level CacheLevel 62 | ctypedef marisa_tail_mode TailMode 63 | ctypedef marisa_node_order NodeOrder 64 | -------------------------------------------------------------------------------- /src/iostream.pxd: -------------------------------------------------------------------------------- 1 | from std_iostream cimport istream, ostream 2 | from trie cimport Trie 3 | 4 | cdef extern from "" namespace "marisa" nogil: 5 | 6 | istream &read(istream &stream, Trie *trie) 7 | ostream &write(ostream &stream, Trie &trie) 8 | -------------------------------------------------------------------------------- /src/key.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "" namespace "marisa" nogil: 2 | 3 | cdef cppclass Key: 4 | Key() 5 | Key(Key &query) 6 | 7 | #Key &operator=(Key &query) 8 | 9 | char operator[](int i) 10 | 11 | void set_str(char *str) 12 | void set_str(char *ptr, int length) 13 | void set_id(int id) 14 | void set_weight(float weight) 15 | 16 | char *ptr() 17 | int length() 18 | int id() 19 | float weight() 20 | 21 | void clear() 22 | void swap(Key &rhs) 23 | -------------------------------------------------------------------------------- /src/keyset.pxd: -------------------------------------------------------------------------------- 1 | cimport key 2 | 3 | cdef extern from "" namespace "marisa" nogil: 4 | cdef cppclass Keyset: 5 | 6 | # cdef enum constants: 7 | # BASE_BLOCK_SIZE = 4096 8 | # EXTRA_BLOCK_SIZE = 1024 9 | # KEY_BLOCK_SIZE = 256 10 | 11 | Keyset() 12 | 13 | void push_back(key.Key &key) 14 | void push_back(key.Key &key, char end_marker) 15 | 16 | void push_back(char *str) 17 | void push_back(char *ptr, int length) 18 | void push_back(char *ptr, int length, float weight) 19 | 20 | key.Key &operator[](int i) 21 | 22 | int num_keys() 23 | bint empty() 24 | 25 | int size() 26 | int total_length() 27 | 28 | void reset() 29 | void clear() 30 | void swap(Keyset &rhs) 31 | -------------------------------------------------------------------------------- /src/marisa_trie.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=False, embedsignature=True 2 | 3 | from std_iostream cimport stringstream, istream, ostream 4 | from libc.string cimport strncmp 5 | cimport keyset 6 | cimport key 7 | cimport agent 8 | cimport trie 9 | cimport iostream 10 | cimport base 11 | 12 | from cpython.buffer cimport PyBUF_SIMPLE, Py_buffer, PyObject_CheckBuffer, PyObject_GetBuffer, PyBuffer_Release 13 | 14 | import itertools 15 | import struct 16 | import warnings 17 | 18 | DEFAULT_CACHE = base.MARISA_DEFAULT_CACHE 19 | HUGE_CACHE = base.MARISA_HUGE_CACHE 20 | LARGE_CACHE = base.MARISA_LARGE_CACHE 21 | NORMAL_CACHE = base.MARISA_NORMAL_CACHE 22 | SMALL_CACHE = base.MARISA_SMALL_CACHE 23 | TINY_CACHE = base.MARISA_TINY_CACHE 24 | 25 | MIN_NUM_TRIES = base.MARISA_MIN_NUM_TRIES 26 | MAX_NUM_TRIES = base.MARISA_MAX_NUM_TRIES 27 | DEFAULT_NUM_TRIES = base.MARISA_DEFAULT_NUM_TRIES 28 | 29 | # MARISA_TEXT_TAIL merges last labels as zero-terminated strings. So, it is 30 | # available if and only if the last labels do not contain a NULL character. 
31 | # If MARISA_TEXT_TAIL is specified and a NULL character exists in the last 32 | # labels, the setting is automatically switched to MARISA_BINARY_TAIL. 33 | TEXT_TAIL = base.MARISA_TEXT_TAIL 34 | 35 | # MARISA_BINARY_TAIL also merges last labels but as byte sequences. It uses 36 | # a bit vector to detect the end of a sequence, instead of NULL characters. 37 | # So, MARISA_BINARY_TAIL requires a larger space if the average length of 38 | # labels is greater than 8. 39 | BINARY_TAIL = base.MARISA_BINARY_TAIL 40 | DEFAULT_TAIL = base.MARISA_DEFAULT_TAIL 41 | 42 | 43 | # MARISA_LABEL_ORDER arranges nodes in ascending label order. 44 | # MARISA_LABEL_ORDER is useful if an application needs to predict keys in 45 | # label order. 46 | LABEL_ORDER = base.MARISA_LABEL_ORDER 47 | 48 | # MARISA_WEIGHT_ORDER arranges nodes in descending weight order. 49 | # MARISA_WEIGHT_ORDER is generally a better choice because it enables faster 50 | # matching. 51 | WEIGHT_ORDER = base.MARISA_WEIGHT_ORDER 52 | DEFAULT_ORDER = base.MARISA_DEFAULT_ORDER 53 | 54 | 55 | cdef inline int getbufptr(object obj, char ** ptr, Py_ssize_t * size, Py_buffer * buf): 56 | """Get a pointer from bytes/buffer object ``obj``. 57 | 58 | On success, return 0, and set ``ptr``, ``size`` and ``buf``.""" 59 | cdef int result = -1 60 | ptr[0] = NULL 61 | size[0] = 0 62 | if PyObject_CheckBuffer(obj) == 1: # new-style Buffer interface 63 | result = PyObject_GetBuffer(obj, buf, PyBUF_SIMPLE) 64 | if result == 0: 65 | ptr[0] = buf.buf 66 | size[0] = buf.len 67 | return result 68 | 69 | 70 | cdef inline void releasebuf(Py_buffer *buf): 71 | """Release buffer if necessary.""" 72 | PyBuffer_Release(buf) 73 | 74 | 75 | cdef class _Trie: 76 | cdef trie.Trie* _trie 77 | 78 | cdef bytes _encode_key(self, key): 79 | return key 80 | 81 | cdef _get_key(self, agent.Agent& ag): 82 | return ag.key().ptr()[:ag.key().length()] 83 | 84 | def __init__(self, arg=None, num_tries=DEFAULT_NUM_TRIES, binary=False, 85 | cache_size=DEFAULT_CACHE, order=DEFAULT_ORDER, weights=None): 86 | """ 87 | ``arg`` can be one of the following: 88 | 89 | * an iterable with bytes keys; 90 | * None (if you're going to load a trie later). 91 | 92 | Pass a ``weights`` iterable with expected lookup frequencies 93 | to optimize lookup and prefix search speed. 
94 | """ 95 | 96 | if self._trie: 97 | return 98 | self._trie = new trie.Trie() 99 | 100 | byte_keys = (self._encode_key(key) for key in (arg or [])) 101 | 102 | self._build( 103 | byte_keys, 104 | weights, 105 | num_tries=num_tries, 106 | binary=binary, 107 | cache_size=cache_size, 108 | order=order 109 | ) 110 | 111 | def __dealloc__(self): 112 | if self._trie: 113 | del self._trie 114 | 115 | def _config_flags(self, num_tries=DEFAULT_NUM_TRIES, binary=False, 116 | cache_size=DEFAULT_CACHE, order=DEFAULT_ORDER): 117 | if not MIN_NUM_TRIES <= num_tries <= MAX_NUM_TRIES: 118 | raise ValueError( 119 | "num_tries (which is %d) must be between between %d and %d" % 120 | (num_tries, MIN_NUM_TRIES, MAX_NUM_TRIES)) 121 | 122 | binary_flag = BINARY_TAIL if binary else TEXT_TAIL 123 | return num_tries | binary_flag | cache_size | order 124 | 125 | def _build(self, byte_keys, weights=None, **options): 126 | if weights is None: 127 | weights = itertools.repeat(1.0) 128 | 129 | cdef char* data 130 | cdef float weight 131 | cdef keyset.Keyset *ks = new keyset.Keyset() 132 | 133 | try: 134 | for key, weight in zip(byte_keys, weights): 135 | ks.push_back(key, len(key), weight) 136 | self._trie.build(ks[0], self._config_flags(**options)) 137 | finally: 138 | del ks 139 | 140 | def __richcmp__(self, other, int op): 141 | if op == 2: # == 142 | if other is self: 143 | return True 144 | elif not isinstance(other, _Trie): 145 | return False 146 | 147 | return (<_Trie>self)._equals(other) 148 | elif op == 3: # != 149 | return not (self == other) 150 | 151 | raise TypeError("unorderable types: {0} and {1}".format( 152 | self.__class__, other.__class__)) 153 | 154 | cdef bint _equals(self, _Trie other) nogil: 155 | cdef int num_keys = self._trie.num_keys() 156 | cdef base.NodeOrder node_order = self._trie.node_order() 157 | if (other._trie.num_keys() != num_keys or 158 | other._trie.node_order() != node_order): 159 | return False 160 | 161 | cdef agent.Agent ag1, ag2 162 | ag1.set_query(b"") 163 | ag2.set_query(b"") 164 | cdef int i 165 | cdef key.Key key1, key2 166 | for i in range(num_keys): 167 | self._trie.predictive_search(ag1) 168 | other._trie.predictive_search(ag2) 169 | key1 = ag1.key() 170 | key2 = ag2.key() 171 | if (key1.length() != key2.length() or 172 | strncmp(key1.ptr(), key2.ptr(), key1.length()) != 0): 173 | return False 174 | return True 175 | 176 | def __iter__(self): 177 | return self.iterkeys() 178 | 179 | def __len__(self): 180 | return self._trie.num_keys() 181 | 182 | def __contains__(self, key): 183 | cdef bytes _key = self._encode_key(key) 184 | return self._contains(_key) 185 | 186 | cdef bint _contains(self, bytes key): 187 | cdef agent.Agent ag 188 | ag.set_query(key, len(key)) 189 | return self._trie.lookup(ag) 190 | 191 | def read(self, f): 192 | """Read a trie from an open file. 193 | 194 | :param file f: a "real" on-disk file object. Passing a *file-like* 195 | object would result in an error. 196 | 197 | .. deprecated:: 0.7.3 198 | 199 | The method will be removed in version 0.8.0. Please use 200 | :meth:`load` instead. 201 | """ 202 | warnings.warn("Trie.read is deprecated and will " 203 | "be removed in marisa_trie 0.8.0. Please use " 204 | "Trie.load instead.", DeprecationWarning) 205 | self._trie.read(f.fileno()) 206 | return self 207 | 208 | def write(self, f): 209 | """Write a trie to an open file. 210 | 211 | :param file f: a "real" on-disk file object. Passing a *file-like* 212 | object would result in an error. 213 | 214 | .. 
deprecated:: 0.7.3 215 | 216 | The method will be removed in version 0.8.0. Please use 217 | :meth:`save` instead. 218 | """ 219 | warnings.warn("Trie.write is deprecated and will " 220 | "be removed in marisa_trie 0.8.0. Please use " 221 | "Trie.save instead.", DeprecationWarning) 222 | self._trie.write(f.fileno()) 223 | 224 | def save(self, path): 225 | """Save a trie to a specified path.""" 226 | with open(path, 'w') as f: 227 | self._trie.write(f.fileno()) 228 | 229 | def load(self, path): 230 | """Load a trie from a specified path.""" 231 | with open(path, 'r') as f: 232 | self._trie.read(f.fileno()) 233 | return self 234 | 235 | cpdef bytes tobytes(self) except +: 236 | """Return raw trie content as bytes.""" 237 | cdef stringstream stream 238 | iostream.write(( &stream)[0], self._trie[0]) 239 | cdef bytes res = stream.str() 240 | return res 241 | 242 | cpdef frombytes(self, bytes data) except +: 243 | """Load a trie from raw bytes generated by :meth:`tobytes`.""" 244 | cdef stringstream* stream = new stringstream(data) 245 | try: 246 | iostream.read(( stream)[0], self._trie) 247 | finally: 248 | del stream 249 | return self 250 | 251 | def __reduce__(self): 252 | return self.__class__, (), self.tobytes() 253 | 254 | __setstate__ = frombytes 255 | 256 | def mmap(self, path): 257 | """Memory map the content of a trie stored in a file. 258 | 259 | This allows to query trie without loading it fully in memory. 260 | """ 261 | import sys 262 | str_path = path.encode(sys.getfilesystemencoding()) 263 | cdef char* c_path = str_path 264 | self._trie.mmap(c_path) 265 | return self 266 | 267 | def map(self, buffer): 268 | """Load the trie from an object exposing the buffer protocol.""" 269 | 270 | cdef char *ptr = NULL 271 | cdef Py_ssize_t size = 0 272 | cdef Py_buffer buf 273 | result = getbufptr(buffer, &ptr, &size, &buf) 274 | if result != 0: 275 | raise ValueError("Invalid buffer.") 276 | self._trie.map(ptr, size) 277 | releasebuf(&buf) 278 | return self 279 | 280 | def iterkeys(self, prefix=None): 281 | """ 282 | Return an iterator over trie keys starting with a given ``prefix``. 283 | """ 284 | cdef agent.Agent ag 285 | cdef bytes b_prefix = b'' 286 | if prefix is not None: 287 | b_prefix = self._encode_key(prefix) 288 | ag.set_query(b_prefix, len(b_prefix)) 289 | 290 | while self._trie.predictive_search(ag): 291 | yield self._get_key(ag) 292 | 293 | cpdef list keys(self, prefix=None): 294 | """Return a list of trie keys starting with a given ``prefix``.""" 295 | # non-generator inlined version of iterkeys() 296 | cdef list res = [] 297 | cdef bytes b_prefix = b'' 298 | if prefix is not None: 299 | b_prefix = self._encode_key(prefix) 300 | cdef agent.Agent ag 301 | ag.set_query(b_prefix, len(b_prefix)) 302 | 303 | while self._trie.predictive_search(ag): 304 | res.append(self._get_key(ag)) 305 | 306 | return res 307 | 308 | def has_keys_with_prefix(self, prefix=""): 309 | """ 310 | Return ``True`` if any key in the trie begins with ``prefix``. 311 | 312 | .. deprecated:: 0.7.3 313 | 314 | The method will be removed in version 0.8.0. Please use 315 | :meth:`iterkeys` instead. 316 | """ 317 | warnings.warn("Trie.has_keys_with_prefix is deprecated and will " 318 | "be removed in marisa_trie 0.8.0. 
Please use " 319 | "Trie.iterkeys instead.", DeprecationWarning) 320 | 321 | cdef agent.Agent ag 322 | cdef bytes b_prefix = self._encode_key(prefix) 323 | ag.set_query(b_prefix, len(b_prefix)) 324 | return self._trie.predictive_search(ag) 325 | 326 | 327 | cdef class BinaryTrie(_Trie): 328 | """A trie mapping bytes keys to auto-generated unique IDs.""" 329 | 330 | # key_id method is not in _Trie because it won't work for BytesTrie 331 | cpdef int key_id(self, bytes key) except -1: 332 | """Return an ID generated for a given ``key``. 333 | 334 | :raises KeyError: if key is not present in this trie. 335 | """ 336 | cdef int res = self._key_id(key, len(key)) 337 | if res == -1: 338 | raise KeyError(key) 339 | return res 340 | 341 | cdef int _key_id(self, char* key, int len): 342 | cdef bint res 343 | cdef agent.Agent ag 344 | ag.set_query(key, len) 345 | res = self._trie.lookup(ag) 346 | if not res: 347 | return -1 348 | return ag.key().id() 349 | 350 | cpdef restore_key(self, int index): 351 | """Return a key corresponding to a given ID.""" 352 | cdef agent.Agent ag 353 | ag.set_query(index) 354 | try: 355 | self._trie.reverse_lookup(ag) 356 | except KeyError: 357 | raise KeyError(index) 358 | return self._get_key(ag) 359 | 360 | def __getitem__(self, bytes key): 361 | return self.key_id(key) 362 | 363 | def get(self, bytes key, default=None): 364 | """ 365 | Return an ID for a given ``key`` or ``default`` if ``key`` is 366 | not present in this trie. 367 | """ 368 | cdef int res 369 | 370 | res = self._key_id(key, len(key)) 371 | if res == -1: 372 | return default 373 | return res 374 | 375 | def iter_prefixes(self, bytes key): 376 | """ 377 | Return an iterator of all prefixes of a given key. 378 | """ 379 | cdef agent.Agent ag 380 | ag.set_query(key, len(key)) 381 | 382 | while self._trie.common_prefix_search(ag): 383 | yield self._get_key(ag) 384 | 385 | def prefixes(self, bytes key): 386 | """ 387 | Return a list with all prefixes of a given key. 388 | """ 389 | # this an inlined version of ``list(self.iter_prefixes(key))`` 390 | 391 | cdef list res = [] 392 | cdef agent.Agent ag 393 | ag.set_query(key, len(key)) 394 | 395 | while self._trie.common_prefix_search(ag): 396 | res.append(self._get_key(ag)) 397 | return res 398 | 399 | def items(self, bytes prefix=b""): 400 | # inlined for speed 401 | cdef list res = [] 402 | cdef agent.Agent ag 403 | ag.set_query(prefix, len(prefix)) 404 | 405 | while self._trie.predictive_search(ag): 406 | res.append((self._get_key(ag), ag.key().id())) 407 | 408 | return res 409 | 410 | def iteritems(self, bytes prefix=b""): 411 | """ 412 | Return an iterator over items that have a prefix ``prefix``. 413 | """ 414 | cdef agent.Agent ag 415 | ag.set_query(prefix, len(prefix)) 416 | 417 | while self._trie.predictive_search(ag): 418 | yield self._get_key(ag), ag.key().id() 419 | 420 | 421 | cdef class _UnicodeKeyedTrie(_Trie): 422 | """ 423 | MARISA-trie wrapper for unicode keys. 424 | """ 425 | cdef bytes _encode_key(self, key): 426 | return key.encode('utf8') 427 | 428 | cdef _get_key(self, agent.Agent& ag): 429 | return _Trie._get_key(self, ag).decode('utf8') 430 | 431 | 432 | cdef class Trie(_UnicodeKeyedTrie): 433 | """A trie mapping unicode keys to auto-generated unique IDs.""" 434 | 435 | # key_id method is not in _Trie because it won't work for BytesTrie 436 | cpdef int key_id(self, unicode key) except -1: 437 | """Return an ID generated for a given ``key``. 438 | 439 | :raises KeyError: if key is not present in this trie. 
440 | """ 441 | cdef bytes _key = key.encode('utf8') 442 | cdef int res = self._key_id(_key) 443 | if res == -1: 444 | raise KeyError(key) 445 | return res 446 | 447 | def __getitem__(self, unicode key): 448 | return self.key_id(key) 449 | 450 | def get(self, key, default=None): 451 | """ 452 | Return an ID for a given ``key`` or ``default`` if ``key`` is 453 | not present in this trie. 454 | """ 455 | cdef bytes b_key 456 | cdef int res 457 | 458 | if isinstance(key, unicode): 459 | b_key = (key).encode('utf8') 460 | else: 461 | b_key = key 462 | 463 | res = self._key_id(b_key) 464 | if res == -1: 465 | return default 466 | return res 467 | 468 | cpdef restore_key(self, int index): 469 | """Return a key corresponding to a given ID.""" 470 | cdef agent.Agent ag 471 | ag.set_query(index) 472 | try: 473 | self._trie.reverse_lookup(ag) 474 | except KeyError: 475 | raise KeyError(index) 476 | return self._get_key(ag) 477 | 478 | cdef int _key_id(self, char* key): 479 | cdef bint res 480 | cdef agent.Agent ag 481 | ag.set_query(key) 482 | res = self._trie.lookup(ag) 483 | if not res: 484 | return -1 485 | return ag.key().id() 486 | 487 | def iter_prefixes(self, unicode key): 488 | """ 489 | Return an iterator of all prefixes of a given key. 490 | """ 491 | cdef bytes b_key = key.encode('utf8') 492 | cdef agent.Agent ag 493 | ag.set_query(b_key) 494 | 495 | while self._trie.common_prefix_search(ag): 496 | yield self._get_key(ag) 497 | 498 | def prefixes(self, unicode key): 499 | """ 500 | Return a list with all prefixes of a given key. 501 | """ 502 | # this an inlined version of ``list(self.iter_prefixes(key))`` 503 | 504 | cdef list res = [] 505 | cdef bytes b_key = key.encode('utf8') 506 | cdef agent.Agent ag 507 | ag.set_query(b_key) 508 | 509 | while self._trie.common_prefix_search(ag): 510 | res.append(self._get_key(ag)) 511 | return res 512 | 513 | def iter_prefixes_with_ids(self, unicode key): 514 | """ 515 | Return an iterator of (prefix, id) pairs of all prefixes of a given key. 516 | """ 517 | cdef bytes b_key = key.encode('utf8') 518 | cdef agent.Agent ag 519 | ag.set_query(b_key, len(b_key)) 520 | 521 | while self._trie.common_prefix_search(ag): 522 | yield (self._get_key(ag), ag.key().id()) 523 | 524 | def iteritems(self, unicode prefix=""): 525 | """ 526 | Return an iterator over items that have a prefix ``prefix``. 527 | """ 528 | cdef bytes b_prefix = prefix.encode('utf8') 529 | cdef agent.Agent ag 530 | ag.set_query(b_prefix) 531 | 532 | while self._trie.predictive_search(ag): 533 | yield self._get_key(ag), ag.key().id() 534 | 535 | def items(self, unicode prefix=""): 536 | # inlined for speed 537 | cdef list res = [] 538 | cdef bytes b_prefix = prefix.encode('utf8') 539 | cdef agent.Agent ag 540 | ag.set_query(b_prefix) 541 | 542 | while self._trie.predictive_search(ag): 543 | res.append((self._get_key(ag), ag.key().id())) 544 | 545 | return res 546 | 547 | 548 | # This symbol is not allowed in utf8 so it is safe to use 549 | # as a separator between utf8-encoded string and binary payload. 550 | # XXX: b'\xff' value changes sort order for BytesTrie and RecordTrie. 551 | # See https://github.com/kmike/DAWG docs for a description of a similar issue. 552 | cdef bytes _VALUE_SEPARATOR = b'\xff' 553 | 554 | 555 | cdef class BytesTrie(_UnicodeKeyedTrie): 556 | """A trie mapping unicode keys to lists of bytes objects. 557 | 558 | The mapping is implemented by appending binary values to UTF8-encoded 559 | and storing the result in MARISA-trie. 
560 | """ 561 | cdef bytes _b_value_separator 562 | cdef unsigned char _c_value_separator 563 | 564 | def __init__(self, arg=None, bytes value_separator=_VALUE_SEPARATOR, 565 | **options): 566 | """ 567 | ``arg`` must be an iterable of tuples (unicode_key, bytes_payload). 568 | """ 569 | super(BytesTrie, self).__init__() 570 | 571 | self._b_value_separator = value_separator 572 | self._c_value_separator = ord(value_separator) 573 | 574 | byte_keys = (self._raw_key(d[0], d[1]) for d in (arg or [])) 575 | self._build(byte_keys, **options) 576 | 577 | cpdef bytes _raw_key(self, unicode key, bytes payload): 578 | return key.encode('utf8') + self._b_value_separator + payload 579 | 580 | cdef bint _contains(self, bytes key): 581 | cdef agent.Agent ag 582 | cdef bytes _key = key + self._b_value_separator 583 | ag.set_query(_key) 584 | return self._trie.predictive_search(ag) 585 | 586 | cpdef list prefixes(self, unicode key): 587 | """ 588 | Return a list with all prefixes of a given key. 589 | """ 590 | 591 | # XXX: is there a char-walking API in libmarisa? 592 | # This implementation is suboptimal. 593 | 594 | cdef agent.Agent ag 595 | cdef list res = [] 596 | cdef int key_len = len(key) 597 | cdef unicode prefix 598 | cdef bytes b_prefix 599 | cdef int ind = 1 600 | 601 | while ind <= key_len: 602 | prefix = key[:ind] 603 | b_prefix = (prefix.encode('utf8') + self._b_value_separator) 604 | ag.set_query(b_prefix) 605 | if self._trie.predictive_search(ag): 606 | res.append(prefix) 607 | 608 | ind += 1 609 | 610 | return res 611 | 612 | def __getitem__(self, key): 613 | cdef list res = self.get(key) 614 | if res is None: 615 | raise KeyError(key) 616 | return res 617 | 618 | cpdef get(self, key, default=None): 619 | """ 620 | Return a list of payloads (as byte objects) for a given key 621 | or ``default`` if the key is not found. 622 | """ 623 | cdef list res 624 | 625 | if isinstance(key, unicode): 626 | res = self.get_value(key) 627 | else: 628 | res = self.b_get_value(key) 629 | 630 | if not res: 631 | return default 632 | return res 633 | 634 | cpdef list get_value(self, unicode key): 635 | """ 636 | Return a list of payloads (as byte objects) for a given unicode key. 637 | """ 638 | cdef bytes b_key = key.encode('utf8') 639 | return self.b_get_value(b_key) 640 | 641 | cpdef list b_get_value(self, bytes key): 642 | """ 643 | Return a list of payloads (as byte objects) for a given utf8-encoded key. 
644 | """ 645 | cdef list res = [] 646 | cdef bytes value 647 | cdef bytes b_prefix = key + self._b_value_separator 648 | cdef int prefix_len = len(b_prefix) 649 | 650 | cdef agent.Agent ag 651 | ag.set_query(b_prefix) 652 | 653 | while self._trie.predictive_search(ag): 654 | value = ag.key().ptr()[prefix_len:ag.key().length()] 655 | res.append(value) 656 | 657 | return res 658 | 659 | cpdef list items(self, unicode prefix=""): 660 | # copied from iteritems for speed 661 | cdef bytes b_prefix = prefix.encode('utf8') 662 | cdef bytes value 663 | cdef unicode key 664 | cdef unsigned char* raw_key 665 | cdef list res = [] 666 | cdef int i, value_len 667 | 668 | cdef agent.Agent ag 669 | ag.set_query(b_prefix) 670 | 671 | while self._trie.predictive_search(ag): 672 | raw_key = ag.key().ptr() 673 | 674 | for i in range(0, ag.key().length()): 675 | if raw_key[i] == self._c_value_separator: 676 | break 677 | 678 | key = raw_key[:i].decode('utf8') 679 | value = raw_key[i+1:ag.key().length()] 680 | 681 | res.append( 682 | (key, value) 683 | ) 684 | return res 685 | 686 | def iteritems(self, unicode prefix=""): 687 | cdef bytes b_prefix = prefix.encode('utf8') 688 | cdef bytes value 689 | cdef unicode key 690 | cdef unsigned char* raw_key 691 | cdef int i, value_len 692 | 693 | cdef agent.Agent ag 694 | ag.set_query(b_prefix) 695 | 696 | while self._trie.predictive_search(ag): 697 | raw_key = ag.key().ptr() 698 | 699 | for i in range(0, ag.key().length()): 700 | if raw_key[i] == self._c_value_separator: 701 | break 702 | 703 | key = raw_key[:i].decode('utf8') 704 | value = raw_key[i+1:ag.key().length()] 705 | 706 | yield key, value 707 | 708 | cpdef list keys(self, prefix=""): 709 | # copied from iterkeys for speed 710 | cdef bytes b_prefix = prefix.encode('utf8') 711 | cdef unicode key 712 | cdef unsigned char* raw_key 713 | cdef list res = [] 714 | cdef int i 715 | 716 | cdef agent.Agent ag 717 | ag.set_query(b_prefix) 718 | 719 | while self._trie.predictive_search(ag): 720 | raw_key = ag.key().ptr() 721 | 722 | for i in range(0, ag.key().length()): 723 | if raw_key[i] == self._c_value_separator: 724 | key = raw_key[:i].decode('utf8') 725 | res.append(key) 726 | break 727 | return res 728 | 729 | def iterkeys(self, unicode prefix=""): 730 | cdef bytes b_prefix = prefix.encode('utf8') 731 | cdef unicode key 732 | cdef unsigned char* raw_key 733 | cdef int i 734 | 735 | cdef agent.Agent ag 736 | ag.set_query(b_prefix) 737 | 738 | while self._trie.predictive_search(ag): 739 | raw_key = ag.key().ptr() 740 | 741 | for i in range(0, ag.key().length()): 742 | if raw_key[i] == self._c_value_separator: 743 | yield raw_key[:i].decode('utf8') 744 | break 745 | 746 | 747 | cdef class _UnpackTrie(BytesTrie): 748 | 749 | def __init__(self, arg=None, **options): 750 | keys = ((d[0], self._pack(d[1])) for d in (arg or [])) 751 | super(_UnpackTrie, self).__init__(keys, **options) 752 | 753 | cdef _unpack(self, bytes value): 754 | return value 755 | 756 | cdef bytes _pack(self, value): 757 | return value 758 | 759 | cpdef list b_get_value(self, bytes key): 760 | cdef list values = BytesTrie.b_get_value(self, key) 761 | return [self._unpack(val) for val in values] 762 | 763 | cpdef list items(self, unicode prefix=""): 764 | cdef list items = BytesTrie.items(self, prefix) 765 | return [(key, self._unpack(val)) for (key, val) in items] 766 | 767 | def iteritems(self, unicode prefix=""): 768 | return ((key, self._unpack(val)) for key, val in BytesTrie.iteritems(self, prefix)) 769 | 770 | 771 | cdef class 
RecordTrie(_UnpackTrie): 772 | """A trie mapping unicode keys to lists of data tuples. 773 | 774 | The data is packed using :mod:`struct` module, therefore all 775 | tuples must be of the same format. See :mod:`struct` documentation 776 | for available format strings. 777 | 778 | The mapping is implemented by appending binary values to UTF8-encoded 779 | and storing the result in MARISA-trie. 780 | """ 781 | cdef _struct 782 | cdef _fmt 783 | 784 | def __init__(self, fmt, arg=None, **options): 785 | """ 786 | ``arg`` must be an iterable of tuples (unicode_key, data_tuple). 787 | Data tuples will be converted to bytes with 788 | ``struct.pack(fmt, *data_tuple)``. 789 | """ 790 | self._fmt = fmt 791 | self._struct = struct.Struct(str(fmt)) 792 | super(RecordTrie, self).__init__(arg, **options) 793 | 794 | cdef _unpack(self, bytes value): 795 | return self._struct.unpack(value) 796 | 797 | cdef bytes _pack(self, value): 798 | return self._struct.pack(*value) 799 | 800 | def __reduce__(self): 801 | return self.__class__, (self._fmt, ), self.tobytes() 802 | -------------------------------------------------------------------------------- /src/query.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "" namespace "marisa" nogil: 2 | 3 | cdef cppclass Query: 4 | Query() 5 | Query(Query &query) 6 | 7 | #Query &operator=(Query &query) 8 | 9 | char operator[](int i) 10 | 11 | void set_str(char *str) 12 | void set_str(char *ptr, int length) 13 | void set_id(int id) 14 | 15 | char *ptr() 16 | int length() 17 | int id() 18 | 19 | void clear() 20 | void swap(Query &rhs) 21 | -------------------------------------------------------------------------------- /src/std_iostream.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.string cimport string 2 | 3 | cdef extern from "" namespace "std" nogil: 4 | cdef cppclass istream: 5 | istream() except + 6 | istream& read (char* s, int n) except + 7 | 8 | cdef cppclass ostream: 9 | ostream() except + 10 | ostream& write (char* s, int n) except + 11 | 12 | cdef extern from "" namespace "std" nogil: 13 | 14 | cdef cppclass stringstream: 15 | stringstream() 16 | stringstream(string s) 17 | string str () 18 | 19 | -------------------------------------------------------------------------------- /src/trie.pxd: -------------------------------------------------------------------------------- 1 | cimport agent 2 | cimport base 3 | cimport keyset 4 | 5 | 6 | cdef extern from "" namespace "marisa" nogil: 7 | 8 | cdef cppclass Trie: 9 | Trie() 10 | 11 | void build(keyset.Keyset &keyset, int config_flags) except + 12 | void build(keyset.Keyset &keyset) except + 13 | 14 | void mmap(char *filename) except + 15 | void map(void *ptr, int size) except + 16 | 17 | void load(char *filename) except + 18 | void read(int fd) except + 19 | 20 | void save(char *filename) except + 21 | void write(int fd) except + 22 | 23 | bint lookup(agent.Agent &agent) except + 24 | void reverse_lookup(agent.Agent &agent) except +KeyError 25 | bint common_prefix_search(agent.Agent &agent) except + 26 | bint predictive_search(agent.Agent &agent) except + 27 | 28 | int num_tries() except + 29 | int num_keys() except + 30 | int num_nodes() except + 31 | 32 | base.TailMode tail_mode() 33 | base.NodeOrder node_order() 34 | 35 | bint empty() except + 36 | int size() except + 37 | int total_size() except + 38 | int io_size() except + 39 | 40 | void clear() except + 41 | void swap(Trie &rhs) except + 42 | 
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/marisa-trie/97cfda688aee37565f6d4a414cc66dd5384cb4ad/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_binary_trie.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from uuid import uuid4 3 | 4 | import pytest 5 | import hypothesis.strategies as st 6 | from hypothesis import given, assume 7 | 8 | import marisa_trie 9 | 10 | from .utils import Mapping 11 | 12 | text = st.binary() 13 | 14 | 15 | @given(st.sets(text), text) 16 | def test_init(keys, missing_key): 17 | assume(missing_key not in keys) 18 | 19 | trie = marisa_trie.BinaryTrie(keys) 20 | for key in keys: 21 | assert key in trie 22 | 23 | assert missing_key not in trie 24 | 25 | 26 | @given(st.sets(text, min_size=1), text) 27 | def test_key_id(keys, missing_key): 28 | assume(missing_key not in keys) 29 | 30 | trie = marisa_trie.BinaryTrie(keys) 31 | for key in keys: 32 | key_id = trie.key_id(key) 33 | assert trie.restore_key(key_id) == key 34 | 35 | key_ids = [trie.key_id(key) for key in keys] 36 | non_existing_id = max(key_ids) + 1 37 | 38 | with pytest.raises(KeyError): 39 | trie.restore_key(non_existing_id) 40 | 41 | with pytest.raises(KeyError): 42 | trie.key_id(missing_key) 43 | 44 | 45 | @given(st.sets(text, min_size=1), text) 46 | def test_getitem(keys, missing_key): 47 | assume(missing_key not in keys) 48 | 49 | trie = marisa_trie.BinaryTrie(keys) 50 | for key in keys: 51 | key_id = trie[key] 52 | assert trie.restore_key(key_id) == key 53 | 54 | key_ids = [trie[key] for key in keys] 55 | non_existing_id = max(key_ids) + 1 56 | 57 | with pytest.raises(KeyError): 58 | trie.restore_key(non_existing_id) 59 | 60 | with pytest.raises(KeyError): 61 | trie[missing_key] 62 | 63 | 64 | @given(st.sets(text)) 65 | def test_get(keys): 66 | trie = marisa_trie.BinaryTrie(keys) 67 | for key in keys: 68 | key_id = trie.get(key) 69 | assert trie.restore_key(key_id) == key 70 | 71 | key_id = trie.get(key, "default value") 72 | assert trie.restore_key(key_id) == key 73 | 74 | assert trie.get(b"non_existing_bytes_key") is None 75 | assert trie.get(b"non_existing_bytes_key", "default value") == "default value" 76 | 77 | 78 | @given(st.sets(text)) 79 | def test_saveload(tmpdir_factory, keys): 80 | trie = marisa_trie.BinaryTrie(keys) 81 | 82 | dirname = f"{uuid4()}_" 83 | path = str(tmpdir_factory.mktemp(dirname).join("trie.bin")) 84 | trie.save(path) 85 | 86 | trie2 = marisa_trie.BinaryTrie() 87 | trie2.load(path) 88 | 89 | for key in keys: 90 | assert key in trie2 91 | 92 | 93 | @given(st.sets(text)) 94 | def test_mmap(tmpdir_factory, keys): 95 | trie = marisa_trie.BinaryTrie(keys) 96 | 97 | dirname = f"{uuid4()}_" 98 | path = str(tmpdir_factory.mktemp(dirname).join("trie.bin")) 99 | trie.save(path) 100 | 101 | trie2 = marisa_trie.BinaryTrie() 102 | trie2.mmap(path) 103 | 104 | for key in keys: 105 | assert key in trie2 106 | 107 | 108 | @given(st.sets(text)) 109 | def test_tobytes_frombytes(keys): 110 | trie = marisa_trie.BinaryTrie(keys) 111 | data = trie.tobytes() 112 | 113 | trie2 = marisa_trie.BinaryTrie().frombytes(data) 114 | 115 | for key in keys: 116 | assert key in trie2 117 | assert trie2.key_id(key) == trie.key_id(key) 118 | 119 | 120 | @given(st.sets(text)) 121 | def test_dumps_loads(keys): 122 | 
trie = marisa_trie.BinaryTrie(keys) 123 | data = pickle.dumps(trie) 124 | 125 | trie2 = pickle.loads(data) 126 | 127 | for key in keys: 128 | assert key in trie2 129 | assert trie2.key_id(key) == trie.key_id(key) 130 | 131 | 132 | def test_contains_empty(): 133 | assert b"foo" not in marisa_trie.BinaryTrie() 134 | 135 | 136 | def test_contains_singleton(): 137 | trie = marisa_trie.BinaryTrie([b"foo"]) 138 | assert b"foo" in trie 139 | assert b"f" not in trie 140 | 141 | 142 | def test_eq_self(): 143 | trie = marisa_trie.BinaryTrie() 144 | assert trie == trie 145 | assert trie == marisa_trie.BinaryTrie() 146 | 147 | 148 | def test_eq_neq(): 149 | trie = marisa_trie.BinaryTrie([b"foo", b"bar"]) 150 | assert trie == marisa_trie.BinaryTrie([b"foo", b"bar"]) 151 | assert trie != marisa_trie.BinaryTrie([b"foo", b"boo"]) 152 | 153 | 154 | def test_neq_different_type(): 155 | assert marisa_trie.BinaryTrie([b"foo", b"bar"]) != {} 156 | 157 | 158 | def test_eq_neq_different_order(): 159 | lo_trie = marisa_trie.BinaryTrie(order=marisa_trie.LABEL_ORDER) 160 | wo_trie = marisa_trie.BinaryTrie(order=marisa_trie.WEIGHT_ORDER) 161 | assert lo_trie == lo_trie and wo_trie == wo_trie 162 | assert lo_trie != wo_trie 163 | 164 | 165 | def test_gt_lt_exceptions(): 166 | with pytest.raises(TypeError): 167 | marisa_trie.BinaryTrie() < marisa_trie.BinaryTrie() 168 | 169 | with pytest.raises(TypeError): 170 | marisa_trie.BinaryTrie() > marisa_trie.BinaryTrie() 171 | 172 | 173 | def test_iter(): 174 | trie = marisa_trie.BinaryTrie([b"foo", b"bar"]) 175 | assert list(trie) == list(trie.iterkeys()) 176 | 177 | 178 | def test_len(): 179 | trie = marisa_trie.BinaryTrie() 180 | assert len(trie) == 0 181 | 182 | trie = marisa_trie.BinaryTrie([b"foo", b"f", b"bar"]) 183 | assert len(trie) == 3 184 | 185 | 186 | def test_prefixes(): 187 | trie = marisa_trie.BinaryTrie([b"foo", b"f", b"foobar", b"bar"]) 188 | assert trie.prefixes(b"foobar") == [b"f", b"foo", b"foobar"] 189 | assert trie.prefixes(b"foo") == [b"f", b"foo"] 190 | assert trie.prefixes(b"bar") == [b"bar"] 191 | assert trie.prefixes(b"b") == [] 192 | 193 | assert list(trie.iter_prefixes(b"foobar")) == [b"f", b"foo", b"foobar"] 194 | 195 | 196 | def test_keys(): 197 | keys = [b"foo", b"f", b"foobar", b"bar"] 198 | trie = marisa_trie.BinaryTrie(keys) 199 | assert set(trie.keys()) == set(keys) 200 | 201 | 202 | def test_keys_prefix(): 203 | keys = [b"foo", b"f", b"foobar", b"bar"] 204 | trie = marisa_trie.BinaryTrie(keys) 205 | assert set(trie.keys(b"fo")) == {b"foo", b"foobar"} 206 | assert trie.keys(b"foobarz") == [] 207 | 208 | 209 | @given(st.sets(text)) 210 | def test_iterkeys(keys): 211 | trie = marisa_trie.BinaryTrie(keys) 212 | assert trie.keys() == list(trie.iterkeys()) 213 | 214 | for key in keys: 215 | prefix = key[:5] 216 | assert trie.keys(prefix) == list(trie.iterkeys(prefix)) 217 | 218 | 219 | def test_items(): 220 | keys = [b"foo", b"f", b"foobar", b"bar"] 221 | trie = marisa_trie.BinaryTrie(keys) 222 | items = trie.items() 223 | assert set(items) == set(zip(keys, (trie[k] for k in keys))) 224 | 225 | 226 | def test_items_prefix(): 227 | keys = [b"foo", b"f", b"foobar", b"bar"] 228 | trie = marisa_trie.BinaryTrie(keys) 229 | assert set(trie.items(b"fo")) == { 230 | (b"foo", trie[b"foo"]), 231 | (b"foobar", trie[b"foobar"]), 232 | } 233 | 234 | 235 | @given(st.sets(text)) 236 | def test_iteritems(keys): 237 | trie = marisa_trie.BinaryTrie(keys) 238 | assert trie.items() == list(trie.iteritems()) 239 | 240 | for key in keys: 241 | prefix = key[:5] 242 | 
assert trie.items(prefix) == list(trie.iteritems(prefix)) 243 | 244 | 245 | @pytest.mark.filterwarnings("ignore:Trie.has_keys_with_prefix is deprecated") 246 | def test_has_keys_with_prefix_empty(): 247 | empty_trie = marisa_trie.BinaryTrie() 248 | assert not empty_trie.has_keys_with_prefix(b"") 249 | assert not empty_trie.has_keys_with_prefix(b"ab") 250 | 251 | 252 | def test_invalid_file(): 253 | try: 254 | marisa_trie.BinaryTrie().load(__file__) 255 | except RuntimeError as e: 256 | assert "MARISA_FORMAT_ERROR" in e.args[0] 257 | else: 258 | pytest.fail("Exception is not raised") 259 | 260 | 261 | def test_mutable_mapping(): 262 | for method in Mapping.__abstractmethods__: 263 | assert hasattr(marisa_trie.BinaryTrie, method) 264 | -------------------------------------------------------------------------------- /tests/test_bytes_trie.py: -------------------------------------------------------------------------------- 1 | import io 2 | import pickle 3 | 4 | import pytest 5 | import hypothesis.strategies as st 6 | from hypothesis import assume, given 7 | 8 | import marisa_trie 9 | 10 | from .utils import text 11 | 12 | 13 | @given(st.sets(text), st.lists(st.binary()), text) 14 | def test_contains(keys, values, missing_key): 15 | assume(missing_key not in keys) 16 | 17 | data = zip(keys, values) 18 | trie = marisa_trie.BytesTrie(data) 19 | 20 | for word, value in data: 21 | assert word in trie 22 | 23 | assert missing_key not in trie 24 | 25 | 26 | @given(st.sets(text), st.lists(st.binary()), text) 27 | def test_getitem(keys, values, missing_key): 28 | assume(missing_key not in keys) 29 | 30 | data = zip(keys, values) 31 | trie = marisa_trie.BytesTrie(data) 32 | 33 | for key, value in data: 34 | assert trie[key] == [value] 35 | 36 | with pytest.raises(KeyError): 37 | trie[missing_key] 38 | 39 | 40 | @pytest.mark.parametrize("data", [[], [("foo", b"bar")]]) 41 | def test_getitem_missing(data): 42 | trie = marisa_trie.BytesTrie(data) 43 | with pytest.raises(KeyError): 44 | trie["missing"] 45 | 46 | 47 | def test_getitem_multiple(): 48 | data = [ 49 | ("foo", b"x"), 50 | ("fo", b"y"), 51 | ("foo", b"a"), 52 | ] 53 | trie = marisa_trie.BytesTrie(data) 54 | assert trie["fo"] == [b"y"] 55 | assert trie["foo"] == [b"a", b"x"] 56 | 57 | 58 | def test_null_bytes_in_values(): 59 | data = [("foo", b"\x00\x00bar\x00")] 60 | trie = marisa_trie.BytesTrie(data) 61 | 62 | for key, value in data: 63 | assert trie[key] == [value] 64 | 65 | 66 | def test_items(): 67 | data = [ 68 | ("fo", b"y"), 69 | ("foo", b"x"), 70 | ("foo", b"a"), 71 | ] 72 | trie = marisa_trie.BytesTrie(data) 73 | assert set(trie.items()) == set(data) 74 | assert set(trie.items("f")) == set(data) 75 | assert set(trie.items("fo")) == set(data) 76 | assert set(trie.items("foo")) == set(data[1:]) 77 | assert trie.items("food") == [] 78 | assert trie.items("bar") == [] 79 | 80 | 81 | @given(st.sets(text), st.lists(st.binary())) 82 | def test_iteritems(keys, values): 83 | trie = marisa_trie.BytesTrie(zip(keys, values)) 84 | assert trie.items() == list(trie.iteritems()) 85 | 86 | for key in keys: 87 | prefix = key[:5] 88 | assert trie.items(prefix) == list(trie.iteritems(prefix)) 89 | 90 | 91 | def test_keys(): 92 | trie = marisa_trie.BytesTrie( 93 | [ 94 | ("foo", b"x"), 95 | ("fo", b"y"), 96 | ("foo", b"a"), 97 | ] 98 | ) 99 | 100 | # FIXME: ordering? 
101 | assert trie.keys() == ["foo", "foo", "fo"] 102 | assert trie.keys("f") == ["foo", "foo", "fo"] 103 | assert trie.keys("fo") == ["foo", "foo", "fo"] 104 | assert trie.keys("foo") == ["foo", "foo"] 105 | assert trie.keys("food") == [] 106 | assert trie.keys("bar") == [] 107 | 108 | 109 | @given(st.sets(text), st.lists(st.binary())) 110 | def test_iterkeys(keys, values): 111 | trie = marisa_trie.BytesTrie(zip(keys, values)) 112 | assert trie.keys() == list(trie.iterkeys()) 113 | 114 | for key in keys: 115 | prefix = key[:5] 116 | assert trie.keys(prefix) == list(trie.iterkeys(prefix)) 117 | 118 | 119 | @given(st.sets(st.tuples(text, st.binary()))) 120 | def test_dumps_loads(data): 121 | trie = marisa_trie.BytesTrie(data) 122 | 123 | buf = io.BytesIO() 124 | pickle.dump(trie, buf) 125 | buf.seek(0) 126 | 127 | assert trie == pickle.load(buf) 128 | -------------------------------------------------------------------------------- /tests/test_packaging.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shamelessly inspired from https://github.com/pypa/twine/blob/main/twine/commands/check.py 3 | """ 4 | import io 5 | import re 6 | import subprocess 7 | from email import message_from_string 8 | from pkg_resources import get_distribution 9 | 10 | from readme_renderer.rst import render 11 | 12 | # Regular expression used to capture and reformat docutils warnings into 13 | # something that a human can understand. This is loosely borrowed from 14 | # Sphinx: https://github.com/sphinx-doc/sphinx/blob 15 | # /c35eb6fade7a3b4a6de4183d1dd4196f04a5edaf/sphinx/util/docutils.py#L199 16 | _REPORT_RE = re.compile( 17 | r"^:(?P(?:\d+)?): " 18 | r"\((?PDEBUG|INFO|WARNING|ERROR|SEVERE)/(\d+)?\) " 19 | r"(?P.*)", 20 | re.DOTALL | re.MULTILINE, 21 | ) 22 | 23 | 24 | class _WarningStream: 25 | def __init__(self) -> None: 26 | self.output = io.StringIO() 27 | 28 | def write(self, text: str) -> None: 29 | matched = _REPORT_RE.search(text) 30 | 31 | if not matched: 32 | self.output.write(text) 33 | return 34 | 35 | self.output.write( 36 | "line {line}: {level_text}: {message}\n".format( 37 | level_text=matched.group("level").capitalize(), 38 | line=matched.group("line"), 39 | message=matched.group("message").rstrip("\r\n"), 40 | ) 41 | ) 42 | 43 | def __str__(self) -> str: 44 | return self.output.getvalue() 45 | 46 | 47 | def test_check_pypi_rendering(): 48 | subprocess.check_call(["python3", "setup.py", "sdist"]) 49 | 50 | package = get_distribution("marisa-trie") 51 | pkg_info = message_from_string(package.get_metadata("PKG-INFO")) 52 | metadata = dict(pkg_info.items()) 53 | lines = metadata["Summary"].splitlines() 54 | description = lines.pop(0) + "\n" 55 | description += "\n".join(l[8:] for l in lines) 56 | 57 | warnings = _WarningStream() 58 | rendering = render(description, stream=warnings) 59 | print(description) 60 | print(warnings) 61 | assert not str(warnings) 62 | assert rendering is not None 63 | -------------------------------------------------------------------------------- /tests/test_record_trie.py: -------------------------------------------------------------------------------- 1 | import io 2 | import pickle 3 | 4 | import hypothesis.strategies as st 5 | from hypothesis import given 6 | 7 | import marisa_trie 8 | 9 | from .utils import text 10 | 11 | records = st.tuples(st.integers(min_value=0, max_value=2 ** 16 - 1), st.booleans()) 12 | 13 | 14 | @given(st.sets(st.tuples(text, records))) 15 | def test_dumps_loads(data): 16 | trie = 
marisa_trie.RecordTrie(" marisa_trie.Trie() 204 | 205 | 206 | def test_iter(): 207 | trie = marisa_trie.Trie(["foo", "bar"]) 208 | assert list(trie) == list(trie.iterkeys()) 209 | 210 | 211 | def test_len(): 212 | trie = marisa_trie.Trie() 213 | assert len(trie) == 0 214 | 215 | trie = marisa_trie.Trie(["foo", "f", "bar"]) 216 | assert len(trie) == 3 217 | 218 | 219 | def test_prefixes(): 220 | trie = marisa_trie.Trie(["foo", "f", "foobar", "bar"]) 221 | assert trie.prefixes("foobar") == ["f", "foo", "foobar"] 222 | assert trie.prefixes("foo") == ["f", "foo"] 223 | assert trie.prefixes("bar") == ["bar"] 224 | assert trie.prefixes("b") == [] 225 | 226 | assert list(trie.iter_prefixes("foobar")) == ["f", "foo", "foobar"] 227 | 228 | def test_iter_prefixes_with_keys(): 229 | trie = marisa_trie.Trie(["foo", "f", "foobar", "bar"]) 230 | 231 | assert set(trie.iter_prefixes_with_ids("foobar")) == { 232 | ("f", trie["f"]), 233 | ("foo", trie["foo"]), 234 | ("foobar", trie["foobar"]), 235 | } 236 | assert set(trie.iter_prefixes_with_ids("foo")) == { 237 | ("f", trie["f"]), 238 | ("foo", trie["foo"]), 239 | } 240 | assert set(trie.iter_prefixes_with_ids("bar")) == {("bar", trie["bar"])} 241 | assert not set(trie.iter_prefixes_with_ids("b")) 242 | 243 | for test_key in ["foobar", "foo", "bar", "b"]: 244 | assert list(trie.iter_prefixes_with_ids(test_key)) == [ 245 | (prefix, trie[prefix]) for prefix in trie.prefixes(test_key) 246 | ] 247 | 248 | def test_keys(): 249 | keys = ["foo", "f", "foobar", "bar"] 250 | trie = marisa_trie.Trie(keys) 251 | assert set(trie.keys()) == set(keys) 252 | 253 | 254 | def test_keys_prefix(): 255 | keys = ["foo", "f", "foobar", "bar"] 256 | trie = marisa_trie.Trie(keys) 257 | assert set(trie.keys("fo")) == {"foo", "foobar"} 258 | assert trie.keys("foobarz") == [] 259 | 260 | 261 | @given(st.sets(text)) 262 | def test_iterkeys(keys): 263 | trie = marisa_trie.Trie(keys) 264 | assert trie.keys() == list(trie.iterkeys()) 265 | 266 | for key in keys: 267 | prefix = key[:5] 268 | assert trie.keys(prefix) == list(trie.iterkeys(prefix)) 269 | 270 | 271 | def test_items(): 272 | keys = ["foo", "f", "foobar", "bar"] 273 | trie = marisa_trie.Trie(keys) 274 | items = trie.items() 275 | assert set(items) == set(zip(keys, (trie[k] for k in keys))) 276 | 277 | 278 | def test_items_prefix(): 279 | keys = ["foo", "f", "foobar", "bar"] 280 | trie = marisa_trie.Trie(keys) 281 | assert set(trie.items("fo")) == { 282 | ("foo", trie["foo"]), 283 | ("foobar", trie["foobar"]), 284 | } 285 | 286 | 287 | @given(st.sets(text)) 288 | def test_iteritems(keys): 289 | trie = marisa_trie.Trie(keys) 290 | assert trie.items() == list(trie.iteritems()) 291 | 292 | for key in keys: 293 | prefix = key[:5] 294 | assert trie.items(prefix) == list(trie.iteritems(prefix)) 295 | 296 | 297 | @pytest.mark.filterwarnings("ignore:Trie.has_keys_with_prefix is deprecated") 298 | def test_has_keys_with_prefix_empty(): 299 | empty_trie = marisa_trie.Trie() 300 | assert not empty_trie.has_keys_with_prefix("") 301 | assert not empty_trie.has_keys_with_prefix("ab") 302 | 303 | 304 | @pytest.mark.filterwarnings("ignore:Trie.has_keys_with_prefix is deprecated") 305 | def test_has_keys_with_prefix(): 306 | fruit_trie = marisa_trie.BytesTrie( 307 | [ 308 | ("apple", b"foo"), 309 | ("pear", b"bar"), 310 | ("peach", b"baz"), 311 | ] 312 | ) 313 | assert fruit_trie.has_keys_with_prefix("") 314 | assert fruit_trie.has_keys_with_prefix("a") 315 | assert fruit_trie.has_keys_with_prefix("pe") 316 | assert 
fruit_trie.has_keys_with_prefix("pear") 317 | assert not fruit_trie.has_keys_with_prefix("x") 318 | 319 | 320 | def test_invalid_file(): 321 | try: 322 | marisa_trie.Trie().load(__file__) 323 | except RuntimeError as e: 324 | assert "MARISA_FORMAT_ERROR" in e.args[0] 325 | else: 326 | pytest.fail("Exception is not raised") 327 | 328 | 329 | def test_mutable_mapping(): 330 | for method in Mapping.__abstractmethods__: 331 | assert hasattr(marisa_trie.Trie, method) 332 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import string 2 | from collections.abc import Mapping 3 | 4 | import hypothesis.strategies as st 5 | 6 | text = st.text(f"абвгдеёжзиклмнопрстуфхцчъыьэюя{string.ascii_lowercase}") 7 | 8 | __all__ = ("Mapping", text) 9 | -------------------------------------------------------------------------------- /update_cpp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cython src/*.pyx src/*.pxd --cplus -a -3 3 | --------------------------------------------------------------------------------