├── .github
│   └── workflows
│       ├── cibuildwheel.yml
│       ├── publish_pypi.yml
│       └── tests.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── bin
│   └── push-tag.sh
├── fabfile.py
├── include
│   └── msvc9
│       └── stdint.h
├── preshed
│   ├── __init__.pxd
│   ├── __init__.py
│   ├── about.py
│   ├── bloom.pxd
│   ├── bloom.pyx
│   ├── counter.pxd
│   ├── counter.pyx
│   ├── maps.pxd
│   ├── maps.pyx
│   └── tests
│       ├── __init__.py
│       ├── test_bloom.py
│       ├── test_counter.py
│       ├── test_hashing.py
│       └── test_pop.py
├── pyproject.toml
├── requirements.txt
└── setup.py
/.github/workflows/cibuildwheel.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 |
3 | on:
4 | push:
5 | tags:
6 | # ytf did they invent their own syntax that's almost regex?
7 | # ** matches 'zero or more of any character'
8 | - 'release-v[0-9]+.[0-9]+.[0-9]+**'
9 | - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'
10 | jobs:
11 | build_wheels:
12 | name: Build wheels on ${{ matrix.os }}
13 | runs-on: ${{ matrix.os }}
14 | strategy:
15 | matrix:
16 | # macos-13 is an intel runner, macos-14 is apple silicon
17 | os: [ubuntu-latest, windows-latest, macos-13, macos-14, ubuntu-24.04-arm]
18 |
19 | steps:
20 | - uses: actions/checkout@v4
21 | - name: Build wheels
22 | uses: pypa/cibuildwheel@v2.21.3
23 | env:
24 | CIBW_SOME_OPTION: value
25 | with:
26 | package-dir: .
27 | output-dir: wheelhouse
28 | config-file: "{package}/pyproject.toml"
29 | - uses: actions/upload-artifact@v4
30 | with:
31 | name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
32 | path: ./wheelhouse/*.whl
33 |
34 | build_sdist:
35 | name: Build source distribution
36 | runs-on: ubuntu-latest
37 | steps:
38 | - uses: actions/checkout@v4
39 |
40 | - name: Build sdist
41 | run: pipx run build --sdist
42 | - uses: actions/upload-artifact@v4
43 | with:
44 | name: cibw-sdist
45 | path: dist/*.tar.gz
46 | create_release:
47 | needs: [build_wheels, build_sdist]
48 | runs-on: ubuntu-latest
49 | permissions:
50 | contents: write
51 | checks: write
52 | actions: read
53 | issues: read
54 | packages: write
55 | pull-requests: read
56 | repository-projects: read
57 | statuses: read
58 | steps:
59 | - name: Get the tag name and determine if it's a prerelease
60 | id: get_tag_info
61 | run: |
62 | FULL_TAG=${GITHUB_REF#refs/tags/}
63 | if [[ $FULL_TAG == release-* ]]; then
64 | TAG_NAME=${FULL_TAG#release-}
65 | IS_PRERELEASE=false
66 | elif [[ $FULL_TAG == prerelease-* ]]; then
67 | TAG_NAME=${FULL_TAG#prerelease-}
68 | IS_PRERELEASE=true
69 | else
70 | echo "Tag does not match expected patterns" >&2
71 | exit 1
72 | fi
73 | echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV
74 | echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
75 | echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV
76 | - uses: actions/download-artifact@v4
77 | with:
78 | # unpacks all CIBW artifacts into dist/
79 | pattern: cibw-*
80 | path: dist
81 | merge-multiple: true
82 | - name: Create Draft Release
83 | id: create_release
84 | uses: softprops/action-gh-release@v2
85 | if: startsWith(github.ref, 'refs/tags/')
86 | env:
87 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
88 | with:
89 | name: ${{ env.TAG_NAME }}
90 | draft: true
91 | prerelease: ${{ env.IS_PRERELEASE }}
92 | files: "./dist/*"
93 |
--------------------------------------------------------------------------------
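
The `get_tag_info` step above strips the `release-`/`prerelease-` prefix from the pushed tag and decides whether the release is a prerelease. A minimal Python sketch of that same classification, handy for checking a tag name locally before pushing (the `classify_tag` helper is hypothetical, not part of this repo):

```python
# Hypothetical helper mirroring the shell logic in the
# "Get the tag name and determine if it's a prerelease" step above.
def classify_tag(full_tag: str):
    if full_tag.startswith("release-"):
        return full_tag[len("release-"):], False       # (tag name, is_prerelease)
    if full_tag.startswith("prerelease-"):
        return full_tag[len("prerelease-"):], True
    raise ValueError("Tag does not match expected patterns")

assert classify_tag("release-v3.0.10") == ("v3.0.10", False)
assert classify_tag("prerelease-v3.0.10") == ("v3.0.10", True)
```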
/.github/workflows/publish_pypi.yml:
--------------------------------------------------------------------------------
1 | # The cibuildwheel workflow triggers on pushing a release tag and creates a
2 | # draft release; this workflow triggers on publication of that release.
3 | # The expected workflow is to create a draft release and let the wheels
4 | # upload, and then hit 'publish', which uploads to PyPI.
5 |
6 | on:
7 | release:
8 | types:
9 | - published
10 |
11 | jobs:
12 | upload_pypi:
13 | runs-on: ubuntu-latest
14 | environment:
15 | name: pypi
16 | url: https://pypi.org/p/cymem
17 | permissions:
18 | id-token: write
19 | contents: read
20 | if: github.event_name == 'release' && github.event.action == 'published'
21 | # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
22 | # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
23 | steps:
24 | - uses: robinraju/release-downloader@v1
25 | with:
26 | tag: ${{ github.event.release.tag_name }}
27 | fileName: '*'
28 | out-file-path: 'dist'
29 | - uses: pypa/gh-action-pypi-publish@release/v1
30 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 | push:
5 | paths-ignore:
6 | - "*.md"
7 | pull_request:
8 | types: [opened, synchronize, reopened, edited]
9 | paths-ignore:
10 | - "*.md"
11 |
12 | env:
13 | MODULE_NAME: 'preshed'
14 | RUN_MYPY: 'false'
15 |
16 | jobs:
17 | tests:
18 | name: Test
19 | if: github.repository_owner == 'explosion'
20 | strategy:
21 | fail-fast: false
22 | matrix:
23 | os: [ubuntu-latest, windows-latest, macos-13]
24 | python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
25 | runs-on: ${{ matrix.os }}
26 |
27 | steps:
28 | - name: Check out repo
29 | uses: actions/checkout@v3
30 |
31 | - name: Configure Python version
32 | uses: actions/setup-python@v4
33 | with:
34 | python-version: ${{ matrix.python_version }}
35 | architecture: x64
36 |
37 | - name: Build sdist
38 | run: |
39 | python -m pip install -U build pip setuptools
40 | python -m pip install -U -r requirements.txt
41 | python -m build --sdist
42 |
43 | - name: Run mypy
44 | shell: bash
45 | if: ${{ env.RUN_MYPY == 'true' }}
46 | run: |
47 | python -m mypy $MODULE_NAME
48 |
49 | - name: Delete source directory
50 | shell: bash
51 | run: |
52 | rm -rf $MODULE_NAME
53 |
54 | - name: Uninstall all packages
55 | run: |
56 | python -m pip freeze > installed.txt
57 | python -m pip uninstall -y -r installed.txt
58 |
59 | - name: Install from sdist
60 | shell: bash
61 | run: |
62 | SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
63 | pip install dist/$SDIST
64 |
65 | - name: Test import
66 | shell: bash
67 | run: |
68 | python -Werror -c "import $MODULE_NAME"
69 |
70 | - name: Install test requirements
71 | run: |
72 | python -m pip install -U -r requirements.txt
73 |
74 | - name: Run tests
75 | shell: bash
76 | run: |
77 | python -m pytest --pyargs $MODULE_NAME -Werror
78 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg
2 | *.egg-info
3 | .eggs
4 | preshed/.maps.pxd.swm
5 | preshed/.maps.pyx.swl
6 | *.sw[a-z]
7 | *.so
8 | *.pyc
9 | *.swp
10 | *.swo
11 | *.html
12 | *.c
13 | *.cpp
14 | .env/
15 | .denv
16 | cythonize.json
17 | MANIFEST
18 | build/
19 | dist/
20 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 ExplosionAI GmbH, 2014 Matthew Honnibal
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include include *.h
2 | include LICENSE
3 | include README.md
4 | recursive-exclude preshed *.cpp
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # preshed: Cython Hash Table for Pre-Hashed Keys
4 |
5 | Simple but high-performance Cython hash table mapping pre-randomized keys to
6 | `void*` values. Inspired by
7 | [Jeff Preshing](http://preshing.com/20130107/this-hash-table-is-faster-than-a-judy-array/).
8 |
9 | [](https://github.com/explosion/preshed/actions/workflows/tests.yml)
10 | [](https://pypi.python.org/pypi/preshed)
11 | [](https://anaconda.org/conda-forge/preshed)
12 | [](https://github.com/explosion/wheelwright/releases)
13 |
--------------------------------------------------------------------------------
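
A minimal usage sketch for the map type, mirroring `preshed/tests/test_hashing.py`. The integer keys below stand in for values the caller has already hashed; `PreshMap` does no hashing of its own:

```python
from preshed.maps import PreshMap

table = PreshMap(initial_size=1024)   # rounded up to a power of two internally
table[7238549871] = 5                 # the key is assumed to be pre-hashed
assert table[7238549871] == 5
assert table[42] is None              # missing keys return None
assert len(table) == 1
```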
/bin/push-tag.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | # Insist repository is clean
6 | git diff-index --quiet HEAD
7 |
8 | git checkout $1
9 | git pull origin $1
10 | git push origin $1
11 |
12 | version=$(grep "__version__ = " preshed/about.py)
13 | version=${version/__version__ = }
14 | version=${version/\'/}
15 | version=${version/\'/}
16 | version=${version/\"/}
17 | version=${version/\"/}
18 | git tag "v$version"
19 | git push origin "v$version"
20 |
--------------------------------------------------------------------------------
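
The parameter-expansion block above just recovers the version string from `preshed/about.py`. A Python equivalent, using the same approach as `setup.py` below, would be:

```python
# Read __version__ from preshed/about.py by executing it, as setup.py does.
about = {}
with open("preshed/about.py") as f:
    exec(f.read(), about)
print(about["__version__"])   # e.g. 3.0.10
```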
/fabfile.py:
--------------------------------------------------------------------------------
1 | from fabric.api import local, run, lcd, cd, env
2 |
3 | import os
4 | from os import path
5 | from os.path import exists as file_exists
6 | from fabtools.python import virtualenv
7 |
8 |
9 | PWD = path.dirname(__file__)
10 | VENV_DIR = path.join(PWD, '.env')
11 | DEV_ENV_DIR = path.join(PWD, '.denv')
12 |
13 |
14 | def dev():
15 | # Allow this to persist, since we aren't as rigorous about keeping state clean
16 | if not file_exists('.denv'):
17 | local('virtualenv .denv')
18 |
19 | with virtualenv(DEV_ENV_DIR):
20 | local('pip install -r requirements.txt')
21 |
22 |
23 | def sdist():
24 | if file_exists('dist/'):
25 | local('rm -rf dist/')
26 | local('mkdir dist')
27 | with virtualenv(VENV_DIR):
28 | local('python setup.py sdist')
29 |
30 |
31 | def publish():
32 | with virtualenv(VENV_DIR):
33 | local('python setup.py register')
34 | local('twine upload dist/*.tar.gz')
35 |
36 |
37 | def setup():
38 | if file_exists('.env'):
39 | local('rm -rf .env')
40 | local('rm -rf *.egg')
41 | local('virtualenv .env')
42 |
43 |
44 | def install():
45 | with virtualenv(VENV_DIR):
46 | local('pip install --upgrade setuptools')
47 | local('pip install dist/*.tar.gz')
48 | local('pip install pytest')
49 |
50 |
51 | def make():
52 | with virtualenv(DEV_ENV_DIR):
53 | with lcd(path.dirname(__file__)):
54 | local('python setup.py build')
55 |
56 |
57 | def clean():
58 | with lcd(os.path.dirname(__file__)):
59 | local('python setup.py clean --all')
60 | with virtualenv(DEV_ENV_DIR):
61 | with lcd(os.path.dirname(__file__)):
62 | local('python setup.py clean --all')
63 |
64 | def test():
65 | with virtualenv(VENV_DIR):
66 | local('python -m pytest -x')
67 |
68 |
69 | def travis():
70 | local('open https://travis-ci.org/spacy-io/preshed')
71 |
--------------------------------------------------------------------------------
/include/msvc9/stdint.h:
--------------------------------------------------------------------------------
1 | // ISO C9x compliant stdint.h for Microsoft Visual Studio
2 | // Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
3 | //
4 | // Copyright (c) 2006-2013 Alexander Chemeris
5 | //
6 | // Redistribution and use in source and binary forms, with or without
7 | // modification, are permitted provided that the following conditions are met:
8 | //
9 | // 1. Redistributions of source code must retain the above copyright notice,
10 | // this list of conditions and the following disclaimer.
11 | //
12 | // 2. Redistributions in binary form must reproduce the above copyright
13 | // notice, this list of conditions and the following disclaimer in the
14 | // documentation and/or other materials provided with the distribution.
15 | //
16 | // 3. Neither the name of the product nor the names of its contributors may
17 | // be used to endorse or promote products derived from this software
18 | // without specific prior written permission.
19 | //
20 | // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
21 | // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
22 | // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
23 | // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 | // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 | // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 | // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 | // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | //
31 | ///////////////////////////////////////////////////////////////////////////////
32 |
33 | #ifndef _MSC_VER // [
34 | #error "Use this header only with Microsoft Visual C++ compilers!"
35 | #endif // _MSC_VER ]
36 |
37 | #ifndef _MSC_STDINT_H_ // [
38 | #define _MSC_STDINT_H_
39 |
40 | #if _MSC_VER > 1000
41 | #pragma once
42 | #endif
43 |
44 | #if _MSC_VER >= 1600 // [
45 | #include <stdint.h>
46 | #else // ] _MSC_VER >= 1600 [
47 |
48 | #include <limits.h>
49 |
50 | // For Visual Studio 6 in C++ mode and for many Visual Studio versions when
51 | // compiling for ARM we should wrap <wchar.h> include with 'extern "C++" {}'
52 | // or compiler give many errors like this:
53 | // error C2733: second C linkage of overloaded function 'wmemchr' not allowed
54 | #ifdef __cplusplus
55 | extern "C" {
56 | #endif
57 | # include <wchar.h>
58 | #ifdef __cplusplus
59 | }
60 | #endif
61 |
62 | // Define _W64 macros to mark types changing their size, like intptr_t.
63 | #ifndef _W64
64 | # if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
65 | # define _W64 __w64
66 | # else
67 | # define _W64
68 | # endif
69 | #endif
70 |
71 |
72 | // 7.18.1 Integer types
73 |
74 | // 7.18.1.1 Exact-width integer types
75 |
76 | // Visual Studio 6 and Embedded Visual C++ 4 doesn't
77 | // realize that, e.g. char has the same size as __int8
78 | // so we give up on __intX for them.
79 | #if (_MSC_VER < 1300)
80 | typedef signed char int8_t;
81 | typedef signed short int16_t;
82 | typedef signed int int32_t;
83 | typedef unsigned char uint8_t;
84 | typedef unsigned short uint16_t;
85 | typedef unsigned int uint32_t;
86 | #else
87 | typedef signed __int8 int8_t;
88 | typedef signed __int16 int16_t;
89 | typedef signed __int32 int32_t;
90 | typedef unsigned __int8 uint8_t;
91 | typedef unsigned __int16 uint16_t;
92 | typedef unsigned __int32 uint32_t;
93 | #endif
94 | typedef signed __int64 int64_t;
95 | typedef unsigned __int64 uint64_t;
96 |
97 |
98 | // 7.18.1.2 Minimum-width integer types
99 | typedef int8_t int_least8_t;
100 | typedef int16_t int_least16_t;
101 | typedef int32_t int_least32_t;
102 | typedef int64_t int_least64_t;
103 | typedef uint8_t uint_least8_t;
104 | typedef uint16_t uint_least16_t;
105 | typedef uint32_t uint_least32_t;
106 | typedef uint64_t uint_least64_t;
107 |
108 | // 7.18.1.3 Fastest minimum-width integer types
109 | typedef int8_t int_fast8_t;
110 | typedef int16_t int_fast16_t;
111 | typedef int32_t int_fast32_t;
112 | typedef int64_t int_fast64_t;
113 | typedef uint8_t uint_fast8_t;
114 | typedef uint16_t uint_fast16_t;
115 | typedef uint32_t uint_fast32_t;
116 | typedef uint64_t uint_fast64_t;
117 |
118 | // 7.18.1.4 Integer types capable of holding object pointers
119 | #ifdef _WIN64 // [
120 | typedef signed __int64 intptr_t;
121 | typedef unsigned __int64 uintptr_t;
122 | #else // _WIN64 ][
123 | typedef _W64 signed int intptr_t;
124 | typedef _W64 unsigned int uintptr_t;
125 | #endif // _WIN64 ]
126 |
127 | // 7.18.1.5 Greatest-width integer types
128 | typedef int64_t intmax_t;
129 | typedef uint64_t uintmax_t;
130 |
131 |
132 | // 7.18.2 Limits of specified-width integer types
133 |
134 | #if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259
135 |
136 | // 7.18.2.1 Limits of exact-width integer types
137 | #define INT8_MIN ((int8_t)_I8_MIN)
138 | #define INT8_MAX _I8_MAX
139 | #define INT16_MIN ((int16_t)_I16_MIN)
140 | #define INT16_MAX _I16_MAX
141 | #define INT32_MIN ((int32_t)_I32_MIN)
142 | #define INT32_MAX _I32_MAX
143 | #define INT64_MIN ((int64_t)_I64_MIN)
144 | #define INT64_MAX _I64_MAX
145 | #define UINT8_MAX _UI8_MAX
146 | #define UINT16_MAX _UI16_MAX
147 | #define UINT32_MAX _UI32_MAX
148 | #define UINT64_MAX _UI64_MAX
149 |
150 | // 7.18.2.2 Limits of minimum-width integer types
151 | #define INT_LEAST8_MIN INT8_MIN
152 | #define INT_LEAST8_MAX INT8_MAX
153 | #define INT_LEAST16_MIN INT16_MIN
154 | #define INT_LEAST16_MAX INT16_MAX
155 | #define INT_LEAST32_MIN INT32_MIN
156 | #define INT_LEAST32_MAX INT32_MAX
157 | #define INT_LEAST64_MIN INT64_MIN
158 | #define INT_LEAST64_MAX INT64_MAX
159 | #define UINT_LEAST8_MAX UINT8_MAX
160 | #define UINT_LEAST16_MAX UINT16_MAX
161 | #define UINT_LEAST32_MAX UINT32_MAX
162 | #define UINT_LEAST64_MAX UINT64_MAX
163 |
164 | // 7.18.2.3 Limits of fastest minimum-width integer types
165 | #define INT_FAST8_MIN INT8_MIN
166 | #define INT_FAST8_MAX INT8_MAX
167 | #define INT_FAST16_MIN INT16_MIN
168 | #define INT_FAST16_MAX INT16_MAX
169 | #define INT_FAST32_MIN INT32_MIN
170 | #define INT_FAST32_MAX INT32_MAX
171 | #define INT_FAST64_MIN INT64_MIN
172 | #define INT_FAST64_MAX INT64_MAX
173 | #define UINT_FAST8_MAX UINT8_MAX
174 | #define UINT_FAST16_MAX UINT16_MAX
175 | #define UINT_FAST32_MAX UINT32_MAX
176 | #define UINT_FAST64_MAX UINT64_MAX
177 |
178 | // 7.18.2.4 Limits of integer types capable of holding object pointers
179 | #ifdef _WIN64 // [
180 | # define INTPTR_MIN INT64_MIN
181 | # define INTPTR_MAX INT64_MAX
182 | # define UINTPTR_MAX UINT64_MAX
183 | #else // _WIN64 ][
184 | # define INTPTR_MIN INT32_MIN
185 | # define INTPTR_MAX INT32_MAX
186 | # define UINTPTR_MAX UINT32_MAX
187 | #endif // _WIN64 ]
188 |
189 | // 7.18.2.5 Limits of greatest-width integer types
190 | #define INTMAX_MIN INT64_MIN
191 | #define INTMAX_MAX INT64_MAX
192 | #define UINTMAX_MAX UINT64_MAX
193 |
194 | // 7.18.3 Limits of other integer types
195 |
196 | #ifdef _WIN64 // [
197 | # define PTRDIFF_MIN _I64_MIN
198 | # define PTRDIFF_MAX _I64_MAX
199 | #else // _WIN64 ][
200 | # define PTRDIFF_MIN _I32_MIN
201 | # define PTRDIFF_MAX _I32_MAX
202 | #endif // _WIN64 ]
203 |
204 | #define SIG_ATOMIC_MIN INT_MIN
205 | #define SIG_ATOMIC_MAX INT_MAX
206 |
207 | #ifndef SIZE_MAX // [
208 | # ifdef _WIN64 // [
209 | # define SIZE_MAX _UI64_MAX
210 | # else // _WIN64 ][
211 | # define SIZE_MAX _UI32_MAX
212 | # endif // _WIN64 ]
213 | #endif // SIZE_MAX ]
214 |
215 | // WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
216 | #ifndef WCHAR_MIN // [
217 | # define WCHAR_MIN 0
218 | #endif // WCHAR_MIN ]
219 | #ifndef WCHAR_MAX // [
220 | # define WCHAR_MAX _UI16_MAX
221 | #endif // WCHAR_MAX ]
222 |
223 | #define WINT_MIN 0
224 | #define WINT_MAX _UI16_MAX
225 |
226 | #endif // __STDC_LIMIT_MACROS ]
227 |
228 |
229 | // 7.18.4 Limits of other integer types
230 |
231 | #if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
232 |
233 | // 7.18.4.1 Macros for minimum-width integer constants
234 |
235 | #define INT8_C(val) val##i8
236 | #define INT16_C(val) val##i16
237 | #define INT32_C(val) val##i32
238 | #define INT64_C(val) val##i64
239 |
240 | #define UINT8_C(val) val##ui8
241 | #define UINT16_C(val) val##ui16
242 | #define UINT32_C(val) val##ui32
243 | #define UINT64_C(val) val##ui64
244 |
245 | // 7.18.4.2 Macros for greatest-width integer constants
246 | // These #ifndef's are needed to prevent collisions with <stdint.h>.
247 | // Check out Issue 9 for the details.
248 | #ifndef INTMAX_C // [
249 | # define INTMAX_C INT64_C
250 | #endif // INTMAX_C ]
251 | #ifndef UINTMAX_C // [
252 | # define UINTMAX_C UINT64_C
253 | #endif // UINTMAX_C ]
254 |
255 | #endif // __STDC_CONSTANT_MACROS ]
256 |
257 | #endif // _MSC_VER >= 1600 ]
258 |
259 | #endif // _MSC_STDINT_H_ ]
260 |
--------------------------------------------------------------------------------
/preshed/__init__.pxd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/preshed/7bd9d00a9b9460020ad9f9d7f06499efd6a08b58/preshed/__init__.pxd
--------------------------------------------------------------------------------
/preshed/__init__.py:
--------------------------------------------------------------------------------
1 | from .about import *
2 |
--------------------------------------------------------------------------------
/preshed/about.py:
--------------------------------------------------------------------------------
1 | __title__ = "preshed"
2 | __version__ = "3.0.10"
3 | __summary__ = "Cython hash table that trusts the keys are pre-hashed"
4 | __uri__ = "https://github.com/explosion/preshed"
5 | __author__ = "Explosion"
6 | __email__ = "contact@explosion.ai"
7 | __license__ = "MIT"
8 | __release__ = True
9 |
--------------------------------------------------------------------------------
/preshed/bloom.pxd:
--------------------------------------------------------------------------------
1 | from libc.stdint cimport uint64_t, uint32_t
2 | from cymem.cymem cimport Pool
3 |
4 | ctypedef uint64_t key_t
5 |
6 | cdef struct BloomStruct:
7 | key_t* bitfield
8 | key_t hcount # hash count, number of hash functions
9 | key_t length
10 | uint32_t seed
11 |
12 |
13 | cdef class BloomFilter:
14 | cdef Pool mem
15 | cdef BloomStruct* c_bloom
16 | cdef inline bint contains(self, key_t item) nogil
17 |
18 |
19 | cdef void bloom_init(Pool mem, BloomStruct* bloom, key_t hcount, key_t length, uint32_t seed) except *
20 |
21 | cdef void bloom_add(BloomStruct* bloom, key_t item) nogil
22 |
23 | cdef bint bloom_contains(const BloomStruct* bloom, key_t item) nogil
24 |
25 | cdef void bloom_add(BloomStruct* bloom, key_t item) nogil
26 |
--------------------------------------------------------------------------------
/preshed/bloom.pyx:
--------------------------------------------------------------------------------
1 | # cython: infer_types=True
2 | # cython: cdivision=True
3 | #
4 | from murmurhash.mrmr cimport hash128_x86
5 | import math
6 | from array import array
7 |
8 | try:
9 | import copy_reg
10 | except ImportError:
11 | import copyreg as copy_reg
12 |
13 |
14 | def calculate_size_and_hash_count(members, error_rate):
15 | """Calculate the optimal size in bits and number of hash functions for a
16 | given number of members and error rate.
17 | """
18 | base = math.log(1 / (2 ** math.log(2)))
19 | bit_count = math.ceil((members * math.log(error_rate)) / base)
20 | hash_count = math.floor((bit_count / members) * math.log(2))
21 | return (bit_count, hash_count)
22 |
23 |
24 | cdef class BloomFilter:
25 | """Bloom filter that allows for basic membership tests.
26 |
27 | Only integers are supported as keys.
28 | """
29 | def __init__(self, key_t size=(2 ** 10), key_t hash_funcs=23, uint32_t seed=0):
30 | self.mem = Pool()
31 | self.c_bloom = self.mem.alloc(1, sizeof(BloomStruct))
32 | bloom_init(self.mem, self.c_bloom, hash_funcs, size, seed)
33 |
34 | @classmethod
35 | def from_error_rate(cls, members, error_rate=1E-4):
36 | params = calculate_size_and_hash_count(members, error_rate)
37 | return cls(*params)
38 |
39 | def add(self, key_t item):
40 | bloom_add(self.c_bloom, item)
41 |
42 | def __contains__(self, item):
43 | return bloom_contains(self.c_bloom, item)
44 |
45 | cdef inline bint contains(self, key_t item) nogil:
46 | return bloom_contains(self.c_bloom, item)
47 |
48 | def to_bytes(self):
49 | return bloom_to_bytes(self.c_bloom)
50 |
51 | def from_bytes(self, bytes byte_string):
52 | bloom_from_bytes(self.mem, self.c_bloom, byte_string)
53 | return self
54 |
55 |
56 | cdef bytes bloom_to_bytes(const BloomStruct* bloom):
57 | py = array("L")
58 | py.append(bloom.hcount)
59 | py.append(bloom.length)
60 | py.append(bloom.seed)
61 | for i in range(bloom.length // sizeof(key_t)):
62 | py.append(bloom.bitfield[i])
63 | if hasattr(py, "tobytes"):
64 | return py.tobytes()
65 | else:
66 | # Python 2 :(
67 | return py.tostring()
68 |
69 |
70 | cdef void bloom_from_bytes(Pool mem, BloomStruct* bloom, bytes data):
71 | py = array("L")
72 | if hasattr(py, "frombytes"):
73 | py.frombytes(data)
74 | else:
75 | py.fromstring(data)
76 | bloom.hcount = py[0]
77 | bloom.length = py[1]
78 | bloom.seed = py[2]
79 | bloom.bitfield = mem.alloc(bloom.length // sizeof(key_t), sizeof(key_t))
80 | for i in range(bloom.length // sizeof(key_t)):
81 | bloom.bitfield[i] = py[3+i]
82 |
83 |
84 | cdef void bloom_init(Pool mem, BloomStruct* bloom, key_t hcount, key_t length, uint32_t seed) except *:
85 | # size should be a multiple of the container size - round up
86 | if length % sizeof(key_t):
87 | length = math.ceil(length / sizeof(key_t)) * sizeof(key_t)
88 | bloom.length = length
89 | bloom.hcount = hcount
90 | bloom.bitfield = mem.alloc(length // sizeof(key_t), sizeof(key_t))
91 | bloom.seed = seed
92 |
93 |
94 | # Instead of calling MurmurHash with a different seed for each hash function, this
95 | # generates two initial hash values and then combines them to create the correct
96 | # number of hashes. This technique is faster than just doing MurmurhHash
97 | # repeatedly and has been shown to work as well as full hashing.
98 |
99 | # For details see "Less Hashing, Same Performance: Building a Better Bloom
100 | # Filter", Kirsch & Mitzenmacher.
101 |
102 | # https://www.semanticscholar.org/paper/Less-hashing%2C-same-performance%3A-Building-a-better-Kirsch-Mitzenmacher/65c43afbfc064705bdc40d3473f32518e9306429
103 | # The choice of seeds is arbitrary.
104 |
105 |
106 | cdef void bloom_add(BloomStruct* bloom, key_t item) nogil:
107 | cdef key_t hv
108 | cdef key_t[2] keys
109 | cdef key_t one = 1 # We want this explicitly typed, because bits
110 | hash128_x86(&item, sizeof(key_t), 0, &keys)
111 | for hiter in range(bloom.hcount):
112 | hv = (keys[0] + (hiter * keys[1])) % bloom.length
113 | bloom.bitfield[hv // sizeof(key_t)] |= one << (hv % sizeof(key_t))
114 |
115 |
116 | cdef bint bloom_contains(const BloomStruct* bloom, key_t item) nogil:
117 | cdef key_t hv
118 | cdef key_t[2] keys
119 | cdef key_t one = 1 # We want this explicitly typed, because bits
120 | hash128_x86(&item, sizeof(key_t), 0, &keys)
121 | for hiter in range(bloom.hcount):
122 | hv = (keys[0] + (hiter * keys[1])) % bloom.length
123 | if not (bloom.bitfield[hv // sizeof(key_t)] & one << (hv % sizeof(key_t))):
124 | return False
125 | return True
126 |
127 |
128 | def pickle_bloom(BloomFilter bloom):
129 | return unpickle_bloom, (bloom.to_bytes(),)
130 |
131 |
132 | def unpickle_bloom(byte_string):
133 | return BloomFilter().from_bytes(byte_string)
134 |
135 |
136 | copy_reg.pickle(BloomFilter, pickle_bloom, unpickle_bloom)
137 |
--------------------------------------------------------------------------------
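
Two pieces of `bloom.pyx` benefit from a small worked sketch: the sizing formula in `calculate_size_and_hash_count`, and the Kirsch & Mitzenmacher trick of deriving all `hcount` hash values from the two 64-bit halves produced by a single `hash128_x86` call. The `derive_indices` helper below is an illustrative pure-Python stand-in for what `bloom_add`/`bloom_contains` do in C, not part of the package:

```python
from preshed.bloom import BloomFilter, calculate_size_and_hash_count

# Sizing: for 1000 members at a 1e-4 error rate the formula works out to
# roughly 19,000 bits and 13 hash functions.
bits, hashes = calculate_size_and_hash_count(1000, 1e-4)

bf = BloomFilter.from_error_rate(1000, error_rate=1e-4)
bf.add(23)
assert 23 in bf   # membership tests never produce false negatives

# Double hashing (Kirsch & Mitzenmacher): rather than hcount independent
# hash functions, combine the two base hashes as h1 + i * h2, modulo the
# bitfield length.
def derive_indices(h1, h2, hcount, length):
    return [(h1 + i * h2) % length for i in range(hcount)]
```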
/preshed/counter.pxd:
--------------------------------------------------------------------------------
1 | from libc.stdint cimport int64_t
2 |
3 | from cymem.cymem cimport Pool
4 |
5 | from .maps cimport MapStruct
6 | from .maps cimport map_init, map_get, map_set, map_iter
7 | from .maps cimport key_t
8 |
9 |
10 | ctypedef int64_t count_t
11 |
12 |
13 | cdef class PreshCounter:
14 | cdef Pool mem
15 | cdef MapStruct* c_map
16 | cdef public object smoother
17 | cdef readonly count_t total
18 |
19 | cpdef int inc(self, key_t key, count_t inc) except -1
20 |
--------------------------------------------------------------------------------
/preshed/counter.pyx:
--------------------------------------------------------------------------------
1 | """Count occurrences of uint64-valued keys."""
2 | from __future__ import division
3 | cimport cython
4 | from libc.math cimport log, exp, sqrt
5 |
6 |
7 | cdef class PreshCounter:
8 | def __init__(self, initial_size=8):
9 | assert initial_size != 0
10 | assert initial_size & (initial_size - 1) == 0
11 | self.mem = Pool()
12 | self.c_map = self.mem.alloc(1, sizeof(MapStruct))
13 | map_init(self.mem, self.c_map, initial_size)
14 | self.smoother = None
15 | self.total = 0
16 |
17 | property length:
18 | def __get__(self):
19 | return self.c_map.length
20 |
21 | def __len__(self):
22 | return self.c_map.length
23 |
24 | def __iter__(self):
25 | cdef int i = 0
26 | cdef key_t key
27 | cdef void* value
28 | while map_iter(self.c_map, &i, &key, &value):
29 | yield key, value
30 |
31 | def __getitem__(self, key_t key):
32 | return map_get(self.c_map, key)
33 |
34 | cpdef int inc(self, key_t key, count_t inc) except -1:
35 | cdef count_t c = map_get(self.c_map, key)
36 | c += inc
37 | map_set(self.mem, self.c_map, key, c)
38 | self.total += inc
39 | return c
40 |
41 | def prob(self, key_t key):
42 | cdef GaleSmoother smoother
43 | cdef void* value = map_get(self.c_map, key)
44 | if self.smoother is not None:
45 | smoother = self.smoother
46 | r_star = self.smoother(value)
47 | return r_star / self.smoother.total
48 | elif value == NULL:
49 | return 0
50 | else:
51 | return value / self.total
52 |
53 | def smooth(self):
54 | self.smoother = GaleSmoother(self)
55 |
56 |
57 | cdef class GaleSmoother:
58 | cdef Pool mem
59 | cdef count_t* Nr
60 | cdef double gradient
61 | cdef double intercept
62 | cdef readonly count_t cutoff
63 | cdef count_t Nr0
64 | cdef readonly double total
65 |
66 | def __init__(self, PreshCounter counts):
67 | count_counts = PreshCounter()
68 | cdef double total = 0
69 | for _, count in counts:
70 | count_counts.inc(count, 1)
71 | total += count
72 | # If we have no items seen 1 or 2 times, this doesn't work. But, this
73 | # won't be true in real data...
74 | assert count_counts[1] != 0 and count_counts[2] != 0, "Cannot smooth your weird data"
75 | # Extrapolate Nr0 from Nr1 and Nr2.
76 | self.Nr0 = count_counts[1] + (count_counts[1] - count_counts[2])
77 | self.mem = Pool()
78 |
79 | cdef double[2] mb
80 |
81 | cdef int n_counts = 0
82 | for _ in count_counts:
83 | n_counts += 1
84 | sorted_r = count_counts.mem.alloc(n_counts, sizeof(count_t))
85 | self.Nr = self.mem.alloc(n_counts, sizeof(count_t))
86 | for i, (count, count_count) in enumerate(sorted(count_counts)):
87 | sorted_r[i] = count
88 | self.Nr[i] = count_count
89 |
90 | _fit_loglinear_model(mb, sorted_r, self.Nr, n_counts)
91 |
92 | self.cutoff = _find_when_to_switch(sorted_r, self.Nr, mb[0], mb[1],
93 | n_counts)
94 | self.gradient = mb[0]
95 | self.intercept = mb[1]
96 | self.total = self(0) * self.Nr0
97 | for count, count_count in count_counts:
98 | self.total += self(count) * count_count
99 |
100 | def __call__(self, count_t r):
101 | if r == 0:
102 | return self.Nr[1] / self.Nr0
103 | elif r < self.cutoff:
104 | return turing_estimate_of_r(r, self.Nr[r-1], self.Nr[r])
105 | else:
106 | return gale_estimate_of_r(r, self.gradient, self.intercept)
107 |
108 | def count_count(self, count_t r):
109 | if r == 0:
110 | return self.Nr0
111 | else:
112 | return self.Nr[r-1]
113 |
114 |
115 | @cython.cdivision(True)
116 | cdef double turing_estimate_of_r(double r, double Nr, double Nr1) except -1:
117 | return ((r + 1) * Nr1) / Nr
118 |
119 |
120 | @cython.cdivision(True)
121 | cdef double gale_estimate_of_r(double r, double gradient, double intercept) except -1:
122 | cdef double e_nr = exp(gradient * log(r) + intercept)
123 | cdef double e_nr1 = exp(gradient * log(r+1) + intercept)
124 | return (r + 1) * (e_nr1 / e_nr)
125 |
126 |
127 | @cython.cdivision(True)
128 | cdef void _fit_loglinear_model(double* output, count_t* sorted_r, count_t* Nr,
129 | int length) except *:
130 | cdef double x_mean = 0.0
131 | cdef double y_mean = 0.0
132 |
133 | cdef Pool mem = Pool()
134 | x = mem.alloc(length, sizeof(double))
135 | y = mem.alloc(length, sizeof(double))
136 |
137 | cdef int i
138 | for i in range(length):
139 | r = sorted_r[i]
140 | x[i] = log(r)
141 | y[i] = log(_get_zr(i, sorted_r, Nr[i], length))
142 | x_mean += x[i]
143 | y_mean += y[i]
144 |
145 | x_mean /= length
146 | y_mean /= length
147 |
148 | cdef double ss_xy = 0.0
149 | cdef double ss_xx = 0.0
150 |
151 | for i in range(length):
152 | x_dist = x[i] - x_mean
153 | y_dist = y[i] - y_mean
154 | # SS_xy = sum the product of the distances from the mean
155 | ss_xy += x_dist * y_dist
156 | # SS_xx = sum the squares of the x distance
157 | ss_xx += x_dist * x_dist
158 | # Gradient
159 | output[0] = ss_xy / ss_xx
160 | # Intercept
161 | output[1] = y_mean - output[0] * x_mean
162 |
163 |
164 | @cython.cdivision(True)
165 | cdef double _get_zr(int j, count_t* sorted_r, count_t Nr_j, int n_counts) except -1:
166 | cdef double r_i = sorted_r[j-1] if j >= 1 else 0
167 | cdef double r_j = sorted_r[j]
168 | cdef double r_k = sorted_r[j+1] if (j+1) < n_counts else (2 * r_i - 1)
169 | return 2 * Nr_j / (r_k - r_i)
170 |
171 |
172 | @cython.cdivision(True)
173 | cdef double _variance(double r, double Nr, double Nr1) nogil:
174 | return 1.96 * sqrt((r+1)**2 * (Nr1 / Nr**2) * (1.0 + (Nr1 / Nr)))
175 |
176 |
177 | @cython.cdivision(True)
178 | cdef count_t _find_when_to_switch(count_t* sorted_r, count_t* Nr, double m, double b,
179 | int length) except -1:
180 | cdef int i
181 | cdef count_t r
182 | for i in range(length-1):
183 | r = sorted_r[i]
184 | if sorted_r[i+1] != r+1:
185 | return r
186 | g_r = gale_estimate_of_r(r, m, b)
187 | t_r = turing_estimate_of_r(r, Nr[i], Nr[i+1])
188 | if abs(t_r - g_r) <= _variance(r, Nr[i], Nr[i+1]):
189 | return r
190 | else:
191 | return length - 1
192 |
--------------------------------------------------------------------------------
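
A short usage sketch for `PreshCounter`, based on `preshed/tests/test_counter.py`. Unsmoothed probabilities are plain relative frequencies; calling `smooth()` switches `prob()` to the Gale/Good-Turing estimator, which requires the data to contain items seen exactly once and exactly twice (see `test_smooth_prob`):

```python
from preshed.counter import PreshCounter

counter = PreshCounter()
counter.inc(12, 1)                    # increment key 12 by 1
counter.inc(14, 10)
assert counter[12] == 1
assert counter.total == 11
assert counter.prob(14) == 10 / 11    # unsmoothed relative frequency
assert counter.prob(999) == 0.0       # unseen keys get zero probability
# counter.smooth() would give unseen keys a small non-zero probability,
# but it needs items of frequency 1 and 2, and this toy data has no
# frequency-2 item.
```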
/preshed/maps.pxd:
--------------------------------------------------------------------------------
1 | from libc.stdint cimport uint64_t
2 | from cymem.cymem cimport Pool
3 |
4 |
5 | ctypedef uint64_t key_t
6 |
7 |
8 | cdef struct Cell:
9 | key_t key
10 | void* value
11 |
12 |
13 | cdef struct Result:
14 | int found
15 | void* value
16 |
17 |
18 | cdef struct MapStruct:
19 | Cell* cells
20 | void* value_for_empty_key
21 | void* value_for_del_key
22 | key_t length
23 | key_t filled
24 | bint is_empty_key_set
25 | bint is_del_key_set
26 |
27 |
28 | cdef void* map_bulk_get(const MapStruct* map_, const key_t* keys, void** values,
29 | int n) nogil
30 |
31 |
32 | cdef Result map_get_unless_missing(const MapStruct* map_, const key_t key) nogil
33 |
34 | cdef void* map_get(const MapStruct* map_, const key_t key) nogil
35 |
36 | cdef void map_set(Pool mem, MapStruct* map_, key_t key, void* value) except *
37 |
38 | cdef void map_init(Pool mem, MapStruct* pmap, size_t length) except *
39 |
40 | cdef bint map_iter(const MapStruct* map_, int* i, key_t* key, void** value) nogil
41 |
42 | cdef void* map_clear(MapStruct* map_, const key_t key) nogil
43 |
44 |
45 | cdef class PreshMap:
46 | cdef MapStruct* c_map
47 | cdef Pool mem
48 |
49 | cdef inline void* get(self, key_t key) nogil
50 | cdef void set(self, key_t key, void* value) except *
51 |
52 |
53 | cdef class PreshMapArray:
54 | cdef Pool mem
55 | cdef MapStruct* maps
56 | cdef size_t length
57 |
58 | cdef inline void* get(self, size_t i, key_t key) nogil
59 | cdef void set(self, size_t i, key_t key, void* value) except *
60 |
--------------------------------------------------------------------------------
/preshed/maps.pyx:
--------------------------------------------------------------------------------
1 | # cython: infer_types=True
2 | # cython: cdivision=True
3 | #
4 | cimport cython
5 |
6 |
7 | DEF EMPTY_KEY = 0
8 | DEF DELETED_KEY = 1
9 |
10 |
11 | cdef class PreshMap:
12 | """Hash map that assumes keys come pre-hashed. Maps uint64_t --> uint64_t.
13 | Uses open addressing with linear probing.
14 |
15 | Usage
16 | map = PreshMap() # Create a table
17 | map = PreshMap(initial_size=1024) # Create with initial size (efficiency)
18 | map[key] = value # Set a value to a key
19 | value = map[key] # Get a value given a key
20 | for key, value in map.items(): # Iterate over items
21 | len(map) # Get number of inserted keys
22 | """
23 | def __init__(self, size_t initial_size=8):
24 | # Size must be power of two
25 | if initial_size == 0:
26 | initial_size = 8
27 | if initial_size & (initial_size - 1) != 0:
28 | power = 1
29 | while power < initial_size:
30 | power *= 2
31 | initial_size = power
32 | self.mem = Pool()
33 | self.c_map = self.mem.alloc(1, sizeof(MapStruct))
34 | map_init(self.mem, self.c_map, initial_size)
35 |
36 | property capacity:
37 | def __get__(self):
38 | return self.c_map.length
39 |
40 | def items(self):
41 | cdef key_t key
42 | cdef void* value
43 | cdef int i = 0
44 | while map_iter(self.c_map, &i, &key, &value):
45 | yield key, value
46 |
47 | def keys(self):
48 | for key, _ in self.items():
49 | yield key
50 |
51 | def values(self):
52 | for _, value in self.items():
53 | yield value
54 |
55 | def pop(self, key_t key, default=None):
56 | cdef Result result = map_get_unless_missing(self.c_map, key)
57 | map_clear(self.c_map, key)
58 | if result.found:
59 | return result.value
60 | else:
61 | return default
62 |
63 | def __getitem__(self, key_t key):
64 | cdef Result result = map_get_unless_missing(self.c_map, key)
65 | if result.found:
66 | return result.value
67 | else:
68 | return None
69 |
70 | def __setitem__(self, key_t key, size_t value):
71 | map_set(self.mem, self.c_map, key, value)
72 |
73 | def __delitem__(self, key_t key):
74 | map_clear(self.c_map, key)
75 |
76 | def __len__(self):
77 | return self.c_map.filled
78 |
79 | def __contains__(self, key_t key):
80 | cdef Result result = map_get_unless_missing(self.c_map, key)
81 | return True if result.found else False
82 |
83 | def __iter__(self):
84 | for key in self.keys():
85 | yield key
86 |
87 | cdef inline void* get(self, key_t key) nogil:
88 | return map_get(self.c_map, key)
89 |
90 | cdef void set(self, key_t key, void* value) except *:
91 | map_set(self.mem, self.c_map, key, value)
92 |
93 |
94 | cdef class PreshMapArray:
95 | """An array of hash tables that assume keys come pre-hashed. Each table
96 | uses open addressing with linear probing.
97 | """
98 | def __init__(self, size_t length, size_t initial_size=8):
99 | self.mem = Pool()
100 | self.length = length
101 | self.maps = self.mem.alloc(length, sizeof(MapStruct))
102 | for i in range(length):
103 | map_init(self.mem, &self.maps[i], initial_size)
104 |
105 | cdef inline void* get(self, size_t i, key_t key) nogil:
106 | return map_get(&self.maps[i], key)
107 |
108 | cdef void set(self, size_t i, key_t key, void* value) except *:
109 | map_set(self.mem, &self.maps[i], key, value)
110 |
111 |
112 | cdef void map_init(Pool mem, MapStruct* map_, size_t length) except *:
113 | map_.length = length
114 | map_.filled = 0
115 | map_.cells = mem.alloc(length, sizeof(Cell))
116 |
117 |
118 | cdef void map_set(Pool mem, MapStruct* map_, key_t key, void* value) except *:
119 | cdef Cell* cell
120 | if key == EMPTY_KEY:
121 | map_.value_for_empty_key = value
122 | map_.is_empty_key_set = True
123 | elif key == DELETED_KEY:
124 | map_.value_for_del_key = value
125 | map_.is_del_key_set = True
126 | else:
127 | cell = _find_cell_for_insertion(map_.cells, map_.length, key)
128 | if cell.key == EMPTY_KEY:
129 | map_.filled += 1
130 | cell.key = key
131 | cell.value = value
132 | if (map_.filled + 1) * 5 >= (map_.length * 3):
133 | _resize(mem, map_)
134 |
135 |
136 | cdef void* map_get(const MapStruct* map_, const key_t key) nogil:
137 | if key == EMPTY_KEY:
138 | return map_.value_for_empty_key
139 | elif key == DELETED_KEY:
140 | return map_.value_for_del_key
141 | cdef Cell* cell = _find_cell(map_.cells, map_.length, key)
142 | return cell.value
143 |
144 |
145 | cdef Result map_get_unless_missing(const MapStruct* map_, const key_t key) nogil:
146 | cdef Result result
147 | cdef Cell* cell
148 | result.found = 0
149 | result.value = NULL
150 | if key == EMPTY_KEY:
151 | if map_.is_empty_key_set:
152 | result.found = 1
153 | result.value = map_.value_for_empty_key
154 | elif key == DELETED_KEY:
155 | if map_.is_del_key_set:
156 | result.found = 1
157 | result.value = map_.value_for_del_key
158 | else:
159 | cell = _find_cell(map_.cells, map_.length, key)
160 | if cell.key == key:
161 | result.found = 1
162 | result.value = cell.value
163 | return result
164 |
165 |
166 | cdef void* map_clear(MapStruct* map_, const key_t key) nogil:
167 | if key == EMPTY_KEY:
168 | value = map_.value_for_empty_key if map_.is_empty_key_set else NULL
169 | map_.is_empty_key_set = False
170 | return value
171 | elif key == DELETED_KEY:
172 | value = map_.value_for_del_key if map_.is_del_key_set else NULL
173 | map_.is_del_key_set = False
174 | return value
175 | else:
176 | cell = _find_cell(map_.cells, map_.length, key)
177 | cell.key = DELETED_KEY
178 | # We shouldn't decrement the "filled" value here, as we're not actually
179 | # making "empty" values -- deleted values aren't quite the same.
180 | # Instead if we manage to insert into a deleted slot, we don't increment
181 | # the fill rate.
182 | return cell.value
183 |
184 |
185 | cdef void* map_bulk_get(const MapStruct* map_, const key_t* keys, void** values,
186 | int n) nogil:
187 | cdef int i
188 | for i in range(n):
189 | values[i] = map_get(map_, keys[i])
190 |
191 |
192 | cdef bint map_iter(const MapStruct* map_, int* i, key_t* key, void** value) nogil:
193 | '''Iterate over the filled items, setting the current place in i, and the
194 | key and value. Return False when iteration finishes.
195 | '''
196 | cdef const Cell* cell
197 | while i[0] < map_.length:
198 | cell = &map_.cells[i[0]]
199 | i[0] += 1
200 | if cell[0].key != EMPTY_KEY and cell[0].key != DELETED_KEY:
201 | key[0] = cell[0].key
202 | value[0] = cell[0].value
203 | return True
204 | # Remember to check for cells keyed by the special empty and deleted keys
205 | if i[0] == map_.length:
206 | i[0] += 1
207 | if map_.is_empty_key_set:
208 | key[0] = EMPTY_KEY
209 | value[0] = map_.value_for_empty_key
210 | return True
211 | if i[0] == map_.length + 1:
212 | i[0] += 1
213 | if map_.is_del_key_set:
214 | key[0] = DELETED_KEY
215 | value[0] = map_.value_for_del_key
216 | return True
217 | return False
218 |
219 |
220 | @cython.cdivision
221 | cdef inline Cell* _find_cell(Cell* cells, const key_t size, const key_t key) nogil:
222 | # Modulo for powers-of-two via bitwise &
223 | cdef key_t i = (key & (size - 1))
224 | while cells[i].key != EMPTY_KEY and cells[i].key != key:
225 | i = (i + 1) & (size - 1)
226 | return &cells[i]
227 |
228 |
229 | @cython.cdivision
230 | cdef inline Cell* _find_cell_for_insertion(Cell* cells, const key_t size, const key_t key) nogil:
231 | """Find the correct cell to insert a value, which could be a previously
232 | deleted cell. If we cross a deleted cell and the key is in the table, we
233 | mark the later cell as deleted, and return the earlier one."""
234 | cdef Cell* deleted = NULL
235 | # Modulo for powers-of-two via bitwise &
236 | cdef key_t i = (key & (size - 1))
237 | while cells[i].key != EMPTY_KEY and cells[i].key != key:
238 | if cells[i].key == DELETED_KEY:
239 | deleted = &cells[i]
240 | i = (i + 1) & (size - 1)
241 | if deleted is not NULL:
242 | if cells[i].key == key:
243 | # We need to ensure we don't end up with the key in the table twice.
244 | # If we're using a deleted cell and we also have the key, we mark
245 | # the later cell as deleted.
246 | cells[i].key = DELETED_KEY
247 | return deleted
248 | return &cells[i]
249 |
250 |
251 | cdef void _resize(Pool mem, MapStruct* map_) except *:
252 | cdef size_t new_size = map_.length * 2
253 | cdef Cell* old_cells = map_.cells
254 | cdef size_t old_size = map_.length
255 |
256 | map_.length = new_size
257 | map_.filled = 0
258 | map_.cells = mem.alloc(new_size, sizeof(Cell))
259 |
260 | cdef size_t i
261 | cdef size_t slot
262 | for i in range(old_size):
263 | if old_cells[i].key != EMPTY_KEY and old_cells[i].key != DELETED_KEY:
264 | map_set(mem, map_, old_cells[i].key, old_cells[i].value)
265 | mem.free(old_cells)
266 |
--------------------------------------------------------------------------------
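
The core of `maps.pyx` is the probing loop in `_find_cell`: the table length is kept at a power of two, so the modulo reduces to a bitwise AND, and collisions are resolved by stepping to the next slot (linear probing). An illustrative pure-Python version of that loop (`find_slot` is a sketch, not part of the package):

```python
EMPTY_KEY = 0

def find_slot(keys, size, key):
    """Return the slot holding `key`, or the first empty slot for it.
    `size` must be a power of two so `& (size - 1)` acts as modulo."""
    i = key & (size - 1)
    while keys[i] != EMPTY_KEY and keys[i] != key:
        i = (i + 1) & (size - 1)   # wrap around at the end of the table
    return i

keys = [0] * 8
slot = find_slot(keys, 8, 0x9E3779B97F4A7C15)
keys[slot] = 0x9E3779B97F4A7C15
assert find_slot(keys, 8, 0x9E3779B97F4A7C15) == slot
```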
/preshed/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/preshed/7bd9d00a9b9460020ad9f9d7f06499efd6a08b58/preshed/tests/__init__.py
--------------------------------------------------------------------------------
/preshed/tests/test_bloom.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import pytest
3 | import pickle
4 |
5 | from preshed.bloom import BloomFilter
6 |
7 | def test_contains():
8 | bf = BloomFilter()
9 | assert 23 not in bf
10 | bf.add(23)
11 | assert 23 in bf
12 |
13 | bf.add(5)
14 | bf.add(42)
15 | bf.add(1002)
16 | assert 5 in bf
17 | assert 42 in bf
18 | assert 1002 in bf
19 |
20 | def test_no_false_negatives():
21 | bf = BloomFilter(size=100, hash_funcs=2)
22 | for ii in range(0,1000,20):
23 | bf.add(ii)
24 |
25 | for ii in range(0,1000,20):
26 | assert ii in bf
27 |
28 | def test_from_error():
29 | bf = BloomFilter.from_error_rate(1000)
30 | for ii in range(0,1000,20):
31 | bf.add(ii)
32 |
33 | for ii in range(0,1000,20):
34 | assert ii in bf
35 |
36 | def test_to_from_bytes():
37 | bf = BloomFilter(size=100, hash_funcs=2)
38 | for ii in range(0,1000,20):
39 | bf.add(ii)
40 | data = bf.to_bytes()
41 | bf2 = BloomFilter()
42 | for ii in range(0,1000,20):
43 | assert ii not in bf2
44 | bf2.from_bytes(data)
45 | for ii in range(0,1000,20):
46 | assert ii in bf2
47 | assert bf2.to_bytes() == data
48 |
49 | def test_bloom_pickle():
50 | bf = BloomFilter(size=100, hash_funcs=2)
51 | for ii in range(0,1000,20):
52 | bf.add(ii)
53 | data = pickle.dumps(bf)
54 | bf2 = pickle.loads(data)
55 | for ii in range(0,1000,20):
56 | assert ii in bf2
57 |
--------------------------------------------------------------------------------
/preshed/tests/test_counter.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import pytest
3 |
4 | from preshed.counter import PreshCounter
5 |
6 |
7 | def test_count():
8 | counter = PreshCounter()
9 | assert counter[12] == 0
10 | counter.inc(12, 1)
11 | assert counter[12] == 1
12 | counter.inc(14, 10)
13 | counter.inc(9, 10)
14 | counter.inc(12, 4)
15 | assert counter[12] == 5
16 | assert counter[14] == 10
17 | assert counter[9] == 10
18 |
19 |
20 | def test_unsmooth_prob():
21 | counter = PreshCounter()
22 | assert counter.prob(12) == 0.0
23 | counter.inc(12, 1)
24 | assert counter.prob(12) == 1.0
25 | counter.inc(14, 10)
26 | assert counter.prob(14) == 10 / 11
27 | assert counter.prob(12) == 1.0 / 11
28 |
29 | def test_smooth_prob():
30 | p = PreshCounter()
31 | # 1 10
32 | # 2 6
33 | # 3 4
34 | # 5 2
35 | # 8 1
36 | for i in range(10):
37 | p.inc(100-i, 1) # 10 items of freq 1
38 | for i in range(6):
39 | p.inc(90 - i, 2) # 6 items of freq 2
40 | for i in range(4):
41 | p.inc(80 - i, 3) # 4 items of freq 3
42 | for i in range(2):
43 | p.inc(70 - i, 5) # 2 items of freq 5
44 | for i in range(1):
45 | p.inc(60 - i, 8) # 1 item of freq 8
46 |
47 | assert p.total == (10 * 1) + (6 * 2) + (4 * 3) + (2 * 5) + (1 * 8)
48 |
49 | assert p.prob(100) == 1.0 / p.total
50 | assert p.prob(200) == 0.0
51 | assert p.prob(60) == 8.0 / p.total
52 |
53 | p.smooth()
54 |
55 | assert p.smoother(1) < 1.0
56 | assert p.smoother(8) < 8.0
57 | assert p.prob(1000) < p.prob(100)
58 |
59 | for event, count in reversed(sorted(p, key=lambda it: it[1])):
60 | assert p.smoother(count) < count
61 |
62 |
63 | import os
64 | def test_large_freqs():
65 | if 'TEST_FILE_LOC' in os.environ:
66 | loc = os.environ['TEST_FILE_LOC']
67 | else:
68 | return None
69 | counts = PreshCounter()
70 | for i, line in enumerate(open(loc)):
71 | line = line.strip()
72 | if not line:
73 | continue
74 | freq = int(line.split()[0])
75 | counts.inc(i+1, freq)
76 | oov = i+2
77 | assert counts.prob(oov) == 0.0
78 | assert counts.prob(1) < 0.1
79 | counts.smooth()
80 | assert counts.prob(oov) > 0
81 | assert counts.prob(oov) < counts.prob(i)
82 |
--------------------------------------------------------------------------------
/preshed/tests/test_hashing.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from preshed.maps import PreshMap
4 | import random
5 |
6 |
7 | def test_insert():
8 | h = PreshMap()
9 | assert h[1] is None
10 | h[1] = 5
11 | assert h[1] == 5
12 | h[2] = 6
13 | assert h[1] == 5
14 | assert h[2] == 6
15 |
16 | def test_resize():
17 | h = PreshMap(4)
18 | h[4] = 12
19 | for i in range(10, 100):
20 | value = int(i * (random.random() + 1))
21 | h[i] = value
22 | assert h[4] == 12
23 |
24 |
25 | def test_zero_key():
26 | h = PreshMap()
27 | h[0] = 6
28 | h[5] = 12
29 | assert h[0] == 6
30 | assert h[5] == 12
31 |
32 | for i in range(500, 1000):
33 | h[i] = i * random.random()
34 | assert h[0] == 6
35 | assert h[5] == 12
36 |
37 |
38 | def test_iter():
39 | key_sum = 0
40 | val_sum = 0
41 | h = PreshMap()
42 | for i in range(56, 24, -3):
43 | h[i] = i * 2
44 | key_sum += i
45 | val_sum += i * 2
46 | for key, value in h.items():
47 | key_sum -= key
48 | val_sum -= value
49 | assert key_sum == 0
50 | assert val_sum == 0
51 |
52 |
53 | def test_one_and_empty():
54 | # See Issue #21
55 | table = PreshMap()
56 | for i in range(100, 110):
57 | table[i] = i
58 | del table[i]
59 | assert table[0] == None
60 |
61 |
62 | def test_many_and_empty():
63 | # See Issue #21
64 | table = PreshMap()
65 | for i in range(100, 110):
66 | table[i] = i
67 | for i in range(100, 110):
68 | del table[i]
69 | assert table[0] == None
70 |
71 |
72 | def test_zero_values():
73 | table = PreshMap()
74 | table[10] = 0
75 | assert table[10] == 0
76 | assert table[11] is None
77 |
--------------------------------------------------------------------------------
/preshed/tests/test_pop.py:
--------------------------------------------------------------------------------
1 | from ..maps import PreshMap
2 |
3 |
4 | def test_pop1():
5 | table = PreshMap()
6 | table[10] = 20
7 | table[30] = 25
8 | assert table[10] == 20
9 | assert table[30] == 25
10 | table.pop(30)
11 | assert table[10] == 20
12 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools",
4 | "cython>=0.28",
5 | "cymem>=2.0.2,<2.1.0",
6 | "murmurhash>=0.28.0,<1.1.0",
7 | ]
8 | build-backend = "setuptools.build_meta"
9 |
10 |
11 | [tool.cibuildwheel]
12 | build = "*"
13 | skip = "pp* cp36* cp37* cp38*"
14 | test-skip = ""
15 | free-threaded-support = false
16 |
17 | archs = ["native"]
18 |
19 | build-frontend = "default"
20 | config-settings = {}
21 | dependency-versions = "pinned"
22 | environment = {}
23 | environment-pass = []
24 | build-verbosity = 0
25 |
26 | before-all = ""
27 | before-build = ""
28 | repair-wheel-command = ""
29 |
30 | test-command = ""
31 | before-test = ""
32 | test-requires = []
33 | test-extras = []
34 |
35 | container-engine = "docker"
36 |
37 | manylinux-x86_64-image = "manylinux2014"
38 | manylinux-i686-image = "manylinux2014"
39 | manylinux-aarch64-image = "manylinux2014"
40 | manylinux-ppc64le-image = "manylinux2014"
41 | manylinux-s390x-image = "manylinux2014"
42 | manylinux-pypy_x86_64-image = "manylinux2014"
43 | manylinux-pypy_i686-image = "manylinux2014"
44 | manylinux-pypy_aarch64-image = "manylinux2014"
45 |
46 | musllinux-x86_64-image = "musllinux_1_2"
47 | musllinux-i686-image = "musllinux_1_2"
48 | musllinux-aarch64-image = "musllinux_1_2"
49 | musllinux-ppc64le-image = "musllinux_1_2"
50 | musllinux-s390x-image = "musllinux_1_2"
51 |
52 |
53 | [tool.cibuildwheel.linux]
54 | repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
55 |
56 | [tool.cibuildwheel.macos]
57 | repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
58 |
59 | [tool.cibuildwheel.windows]
60 |
61 | [tool.cibuildwheel.pyodide]
62 |
63 | [tool.isort]
64 | profile = "black"
65 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cymem>=2.0.2,<2.1.0
2 | cython>=0.28
3 | pytest
4 | murmurhash>=0.28.0,<1.1.0
5 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import print_function
3 | import os
4 | import sys
5 | import contextlib
6 | from setuptools import Extension, setup
7 | from setuptools.command.build_ext import build_ext
8 | from sysconfig import get_path
9 | from Cython.Build import cythonize
10 |
11 |
12 | PACKAGES = ["preshed", "preshed.tests"]
13 | MOD_NAMES = ["preshed.maps", "preshed.counter", "preshed.bloom"]
14 |
15 |
16 | # By overriding build_extensions we can access the actual compiler that will be used, which is only known after finalize_options
17 | # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
18 | compile_options = {
19 | "msvc": ["/Ox", "/EHsc"],
20 | "other": ["-O3", "-Wno-strict-prototypes", "-Wno-unused-function"],
21 | }
22 | link_options = {"msvc": [], "other": []}
23 |
24 |
25 | class build_ext_options:
26 | def build_options(self):
27 | for e in self.extensions:
28 | e.extra_compile_args = compile_options.get(
29 | self.compiler.compiler_type, compile_options["other"]
30 | )
31 | for e in self.extensions:
32 | e.extra_link_args = link_options.get(
33 | self.compiler.compiler_type, link_options["other"]
34 | )
35 |
36 |
37 | class build_ext_subclass(build_ext, build_ext_options):
38 | def build_extensions(self):
39 | build_ext_options.build_options(self)
40 | build_ext.build_extensions(self)
41 |
42 |
43 | def clean(path):
44 | for name in MOD_NAMES:
45 | name = name.replace(".", "/")
46 | for ext in [".so", ".html", ".cpp", ".c"]:
47 | file_path = os.path.join(path, name + ext)
48 | if os.path.exists(file_path):
49 | os.unlink(file_path)
50 |
51 |
52 | @contextlib.contextmanager
53 | def chdir(new_dir):
54 | old_dir = os.getcwd()
55 | try:
56 | os.chdir(new_dir)
57 | sys.path.insert(0, new_dir)
58 | yield
59 | finally:
60 | del sys.path[0]
61 | os.chdir(old_dir)
62 |
63 |
64 | def setup_package():
65 | root = os.path.abspath(os.path.dirname(__file__))
66 |
67 | if len(sys.argv) > 1 and sys.argv[1] == "clean":
68 | return clean(root)
69 |
70 | with chdir(root):
71 | with open(os.path.join(root, "preshed", "about.py")) as f:
72 | about = {}
73 | exec(f.read(), about)
74 |
75 | with open(os.path.join(root, "README.md")) as f:
76 | readme = f.read()
77 |
78 | include_dirs = [get_path("include")]
79 |
80 | ext_modules = []
81 | for mod_name in MOD_NAMES:
82 | mod_path = mod_name.replace(".", "/") + ".pyx"
83 | ext_modules.append(
84 | Extension(
85 | mod_name, [mod_path], language="c++", include_dirs=include_dirs
86 | )
87 | )
88 |
89 | setup(
90 | name="preshed",
91 | zip_safe=False,
92 | packages=PACKAGES,
93 | package_data={"": ["*.pyx", "*.pxd"]},
94 | description=about["__summary__"],
95 | long_description=readme,
96 | long_description_content_type="text/markdown",
97 | author=about["__author__"],
98 | author_email=about["__email__"],
99 | version=about["__version__"],
100 | url=about["__uri__"],
101 | license=about["__license__"],
102 | ext_modules=cythonize(ext_modules, language_level=2),
103 | python_requires=">=3.6,<3.14",
104 | install_requires=["cymem>=2.0.2,<2.1.0", "murmurhash>=0.28.0,<1.1.0"],
105 | classifiers=[
106 | "Environment :: Console",
107 | "Intended Audience :: Developers",
108 | "Intended Audience :: Science/Research",
109 | "License :: OSI Approved :: MIT License",
110 | "Operating System :: POSIX :: Linux",
111 | "Operating System :: MacOS :: MacOS X",
112 | "Operating System :: Microsoft :: Windows",
113 | "Programming Language :: Cython",
114 | "Programming Language :: Python :: 3.6",
115 | "Programming Language :: Python :: 3.7",
116 | "Programming Language :: Python :: 3.8",
117 | "Programming Language :: Python :: 3.9",
118 | "Programming Language :: Python :: 3.10",
119 | "Programming Language :: Python :: 3.11",
120 | "Programming Language :: Python :: 3.12",
121 | "Topic :: Scientific/Engineering",
122 | ],
123 | cmdclass={"build_ext": build_ext_subclass},
124 | )
125 |
126 |
127 | if __name__ == "__main__":
128 | setup_package()
129 |
--------------------------------------------------------------------------------