├── .github └── workflows │ ├── cibuildwheel.yml │ ├── publish_pypi.yml │ └── tests.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── bin └── push-tag.sh ├── fabfile.py ├── include └── msvc9 │ └── stdint.h ├── preshed ├── __init__.pxd ├── __init__.py ├── about.py ├── bloom.pxd ├── bloom.pyx ├── counter.pxd ├── counter.pyx ├── maps.pxd ├── maps.pyx └── tests │ ├── __init__.py │ ├── test_bloom.py │ ├── test_counter.py │ ├── test_hashing.py │ └── test_pop.py ├── pyproject.toml ├── requirements.txt └── setup.py /.github/workflows/cibuildwheel.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | tags: 6 | # ytf did they invent their own syntax that's almost regex? 7 | # ** matches 'zero or more of any character' 8 | - 'release-v[0-9]+.[0-9]+.[0-9]+**' 9 | - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**' 10 | jobs: 11 | build_wheels: 12 | name: Build wheels on ${{ matrix.os }} 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | # macos-13 is an intel runner, macos-14 is apple silicon 17 | os: [ubuntu-latest, windows-latest, macos-13, macos-14, ubuntu-24.04-arm] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Build wheels 22 | uses: pypa/cibuildwheel@v2.21.3 23 | env: 24 | CIBW_SOME_OPTION: value 25 | with: 26 | package-dir: . 
27 | output-dir: wheelhouse 28 | config-file: "{package}/pyproject.toml" 29 | - uses: actions/upload-artifact@v4 30 | with: 31 | name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} 32 | path: ./wheelhouse/*.whl 33 | 34 | build_sdist: 35 | name: Build source distribution 36 | runs-on: ubuntu-latest 37 | steps: 38 | - uses: actions/checkout@v4 39 | 40 | - name: Build sdist 41 | run: pipx run build --sdist 42 | - uses: actions/upload-artifact@v4 43 | with: 44 | name: cibw-sdist 45 | path: dist/*.tar.gz 46 | create_release: 47 | needs: [build_wheels, build_sdist] 48 | runs-on: ubuntu-latest 49 | permissions: 50 | contents: write 51 | checks: write 52 | actions: read 53 | issues: read 54 | packages: write 55 | pull-requests: read 56 | repository-projects: read 57 | statuses: read 58 | steps: 59 | - name: Get the tag name and determine if it's a prerelease 60 | id: get_tag_info 61 | run: | 62 | FULL_TAG=${GITHUB_REF#refs/tags/} 63 | if [[ $FULL_TAG == release-* ]]; then 64 | TAG_NAME=${FULL_TAG#release-} 65 | IS_PRERELEASE=false 66 | elif [[ $FULL_TAG == prerelease-* ]]; then 67 | TAG_NAME=${FULL_TAG#prerelease-} 68 | IS_PRERELEASE=true 69 | else 70 | echo "Tag does not match expected patterns" >&2 71 | exit 1 72 | fi 73 | echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV 74 | echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV 75 | echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV 76 | - uses: actions/download-artifact@v4 77 | with: 78 | # unpacks all CIBW artifacts into dist/ 79 | pattern: cibw-* 80 | path: dist 81 | merge-multiple: true 82 | - name: Create Draft Release 83 | id: create_release 84 | uses: softprops/action-gh-release@v2 85 | if: startsWith(github.ref, 'refs/tags/') 86 | env: 87 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 88 | with: 89 | name: ${{ env.TAG_NAME }} 90 | draft: true 91 | prerelease: ${{ env.IS_PRERELEASE }} 92 | files: "./dist/*" 93 | -------------------------------------------------------------------------------- /.github/workflows/publish_pypi.yml: 
-------------------------------------------------------------------------------- 1 | # The cibuildwheel action triggers on creation of a release, this 2 | # triggers on publication. 3 | # The expected workflow is to create a draft release and let the wheels 4 | # upload, and then hit 'publish', which uploads to PyPi. 5 | 6 | on: 7 | release: 8 | types: 9 | - published 10 | 11 | jobs: 12 | upload_pypi: 13 | runs-on: ubuntu-latest 14 | environment: 15 | name: pypi 16 | url: https://pypi.org/p/cymem 17 | permissions: 18 | id-token: write 19 | contents: read 20 | if: github.event_name == 'release' && github.event.action == 'published' 21 | # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this) 22 | # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 23 | steps: 24 | - uses: robinraju/release-downloader@v1 25 | with: 26 | tag: ${{ github.event.release.tag_name }} 27 | fileName: '*' 28 | out-file-path: 'dist' 29 | - uses: pypa/gh-action-pypi-publish@release/v1 30 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - "*.md" 7 | pull_request: 8 | types: [opened, synchronize, reopened, edited] 9 | paths-ignore: 10 | - "*.md" 11 | 12 | env: 13 | MODULE_NAME: 'preshed' 14 | RUN_MYPY: 'false' 15 | 16 | jobs: 17 | tests: 18 | name: Test 19 | if: github.repository_owner == 'explosion' 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | os: [ubuntu-latest, windows-latest, macos-13] 24 | python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 25 | runs-on: ${{ matrix.os }} 26 | 27 | steps: 28 | - name: Check out repo 29 | uses: actions/checkout@v3 30 | 31 | - name: Configure Python version 32 | uses: actions/setup-python@v4 33 | with: 34 | python-version: ${{ matrix.python_version }} 35 | architecture: x64 
36 | 37 | - name: Build sdist 38 | run: | 39 | python -m pip install -U build pip setuptools 40 | python -m pip install -U -r requirements.txt 41 | python -m build --sdist 42 | 43 | - name: Run mypy 44 | shell: bash 45 | if: ${{ env.RUN_MYPY == 'true' }} 46 | run: | 47 | python -m mypy $MODULE_NAME 48 | 49 | - name: Delete source directory 50 | shell: bash 51 | run: | 52 | rm -rf $MODULE_NAME 53 | 54 | - name: Uninstall all packages 55 | run: | 56 | python -m pip freeze > installed.txt 57 | python -m pip uninstall -y -r installed.txt 58 | 59 | - name: Install from sdist 60 | shell: bash 61 | run: | 62 | SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) 63 | pip install dist/$SDIST 64 | 65 | - name: Test import 66 | shell: bash 67 | run: | 68 | python -c "import $MODULE_NAME" -Werror 69 | 70 | - name: Install test requirements 71 | run: | 72 | python -m pip install -U -r requirements.txt 73 | 74 | - name: Run tests 75 | shell: bash 76 | run: | 77 | python -m pytest --pyargs $MODULE_NAME -Werror 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg 2 | *.egg-info 3 | .eggs 4 | preshed/.maps.pxd.swm 5 | preshed/.maps.pyx.swl 6 | *.sw[a-z] 7 | *.so 8 | *.pyc 9 | *.swp 10 | *.swo 11 | *.html 12 | *.c 13 | *.cpp 14 | .env/ 15 | .denv 16 | cythonize.json 17 | MANIFEST 18 | build/ 19 | dist/ 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 ExplosionAI GmbH, 2014 Matthew Honnibal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, 
modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include include *.h 2 | include LICENSE 3 | include README.md 4 | recursive-exclude preshed *.cpp 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # preshed: Cython Hash Table for Pre-Hashed Keys 4 | 5 | Simple but high performance Cython hash table mapping pre-randomized keys to 6 | `void*` values. Inspired by 7 | [Jeff Preshing](http://preshing.com/20130107/this-hash-table-is-faster-than-a-judy-array/). 
8 | 9 | [![tests](https://github.com/explosion/preshed/actions/workflows/tests.yml/badge.svg)](https://github.com/explosion/preshed/actions/workflows/tests.yml) 10 | [![pypi Version](https://img.shields.io/pypi/v/preshed.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/preshed) 11 | [![conda Version](https://img.shields.io/conda/vn/conda-forge/preshed.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/preshed) 12 | [![Python wheels](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&style=flat-square&logo=python&logoColor=white)](https://github.com/explosion/wheelwright/releases) 13 | -------------------------------------------------------------------------------- /bin/push-tag.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | # Insist repository is clean 6 | git diff-index --quiet HEAD 7 | 8 | git checkout $1 9 | git pull origin $1 10 | git push origin $1 11 | 12 | version=$(grep "__version__ = " preshed/about.py) 13 | version=${version/__version__ = } 14 | version=${version/\'/} 15 | version=${version/\'/} 16 | version=${version/\"/} 17 | version=${version/\"/} 18 | git tag "v$version" 19 | git push origin "v$version" 20 | -------------------------------------------------------------------------------- /fabfile.py: -------------------------------------------------------------------------------- 1 | from fabric.api import local, run, lcd, cd, env 2 | 3 | import os 4 | from os import path 5 | from os.path import exists as file_exists 6 | from fabtools.python import virtualenv 7 | 8 | 9 | PWD = path.dirname(__file__) 10 | VENV_DIR = path.join(PWD, '.env') 11 | DEV_ENV_DIR = path.join(PWD, '.denv') 12 | 13 | 14 | def dev(): 15 | # Allow this to persist, since we aren't as rigorous about keeping state clean 16 | if not file_exists('.denv'): 17 | local('virtualenv .denv') 18 | 19 | with 
virtualenv(DEV_ENV_DIR): 20 | local('pip install -r requirements.txt') 21 | 22 | 23 | def sdist(): 24 | if file_exists('dist/'): 25 | local('rm -rf dist/') 26 | local('mkdir dist') 27 | with virtualenv(VENV_DIR): 28 | local('python setup.py sdist') 29 | 30 | 31 | def publish(): 32 | with virtualenv(VENV_DIR): 33 | local('python setup.py register') 34 | local('twine upload dist/*.tar.gz') 35 | 36 | 37 | def setup(): 38 | if file_exists('.env'): 39 | local('rm -rf .env') 40 | local('rm -rf *.egg') 41 | local('virtualenv .env') 42 | 43 | 44 | def install(): 45 | with virtualenv(VENV_DIR): 46 | local('pip install --upgrade setuptools') 47 | local('pip install dist/*.tar.gz') 48 | local('pip install pytest') 49 | 50 | 51 | def make(): 52 | with virtualenv(DEV_ENV_DIR): 53 | with lcd(path.dirname(__file__)): 54 | local('python setup.py build') 55 | 56 | 57 | def clean(): 58 | with lcd(os.path.dirname(__file__)): 59 | local('python setup.py clean --all') 60 | with virtualenv(DEV_ENV_DIR): 61 | with lcd(os.path.dirname(__file__)): 62 | local('python setup.py clean --all') 63 | 64 | def test(): 65 | with virtualenv(VENV_DIR): 66 | local('python -m pytest -x') 67 | 68 | 69 | def travis(): 70 | local('open https://travis-ci.org/spacy-io/preshed') 71 | -------------------------------------------------------------------------------- /include/msvc9/stdint.h: -------------------------------------------------------------------------------- 1 | // ISO C9x compliant stdint.h for Microsoft Visual Studio 2 | // Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 3 | // 4 | // Copyright (c) 2006-2013 Alexander Chemeris 5 | // 6 | // Redistribution and use in source and binary forms, with or without 7 | // modification, are permitted provided that the following conditions are met: 8 | // 9 | // 1. Redistributions of source code must retain the above copyright notice, 10 | // this list of conditions and the following disclaimer. 11 | // 12 | // 2. 
Redistributions in binary form must reproduce the above copyright 13 | // notice, this list of conditions and the following disclaimer in the 14 | // documentation and/or other materials provided with the distribution. 15 | // 16 | // 3. Neither the name of the product nor the names of its contributors may 17 | // be used to endorse or promote products derived from this software 18 | // without specific prior written permission. 19 | // 20 | // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED 21 | // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 22 | // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 23 | // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 26 | // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 | // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 28 | // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 29 | // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | // 31 | /////////////////////////////////////////////////////////////////////////////// 32 | 33 | #ifndef _MSC_VER // [ 34 | #error "Use this header only with Microsoft Visual C++ compilers!" 
35 | #endif // _MSC_VER ] 36 | 37 | #ifndef _MSC_STDINT_H_ // [ 38 | #define _MSC_STDINT_H_ 39 | 40 | #if _MSC_VER > 1000 41 | #pragma once 42 | #endif 43 | 44 | #if _MSC_VER >= 1600 // [ 45 | #include 46 | #else // ] _MSC_VER >= 1600 [ 47 | 48 | #include 49 | 50 | // For Visual Studio 6 in C++ mode and for many Visual Studio versions when 51 | // compiling for ARM we should wrap include with 'extern "C++" {}' 52 | // or compiler give many errors like this: 53 | // error C2733: second C linkage of overloaded function 'wmemchr' not allowed 54 | #ifdef __cplusplus 55 | extern "C" { 56 | #endif 57 | # include 58 | #ifdef __cplusplus 59 | } 60 | #endif 61 | 62 | // Define _W64 macros to mark types changing their size, like intptr_t. 63 | #ifndef _W64 64 | # if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 65 | # define _W64 __w64 66 | # else 67 | # define _W64 68 | # endif 69 | #endif 70 | 71 | 72 | // 7.18.1 Integer types 73 | 74 | // 7.18.1.1 Exact-width integer types 75 | 76 | // Visual Studio 6 and Embedded Visual C++ 4 doesn't 77 | // realize that, e.g. char has the same size as __int8 78 | // so we give up on __intX for them. 
79 | #if (_MSC_VER < 1300) 80 | typedef signed char int8_t; 81 | typedef signed short int16_t; 82 | typedef signed int int32_t; 83 | typedef unsigned char uint8_t; 84 | typedef unsigned short uint16_t; 85 | typedef unsigned int uint32_t; 86 | #else 87 | typedef signed __int8 int8_t; 88 | typedef signed __int16 int16_t; 89 | typedef signed __int32 int32_t; 90 | typedef unsigned __int8 uint8_t; 91 | typedef unsigned __int16 uint16_t; 92 | typedef unsigned __int32 uint32_t; 93 | #endif 94 | typedef signed __int64 int64_t; 95 | typedef unsigned __int64 uint64_t; 96 | 97 | 98 | // 7.18.1.2 Minimum-width integer types 99 | typedef int8_t int_least8_t; 100 | typedef int16_t int_least16_t; 101 | typedef int32_t int_least32_t; 102 | typedef int64_t int_least64_t; 103 | typedef uint8_t uint_least8_t; 104 | typedef uint16_t uint_least16_t; 105 | typedef uint32_t uint_least32_t; 106 | typedef uint64_t uint_least64_t; 107 | 108 | // 7.18.1.3 Fastest minimum-width integer types 109 | typedef int8_t int_fast8_t; 110 | typedef int16_t int_fast16_t; 111 | typedef int32_t int_fast32_t; 112 | typedef int64_t int_fast64_t; 113 | typedef uint8_t uint_fast8_t; 114 | typedef uint16_t uint_fast16_t; 115 | typedef uint32_t uint_fast32_t; 116 | typedef uint64_t uint_fast64_t; 117 | 118 | // 7.18.1.4 Integer types capable of holding object pointers 119 | #ifdef _WIN64 // [ 120 | typedef signed __int64 intptr_t; 121 | typedef unsigned __int64 uintptr_t; 122 | #else // _WIN64 ][ 123 | typedef _W64 signed int intptr_t; 124 | typedef _W64 unsigned int uintptr_t; 125 | #endif // _WIN64 ] 126 | 127 | // 7.18.1.5 Greatest-width integer types 128 | typedef int64_t intmax_t; 129 | typedef uint64_t uintmax_t; 130 | 131 | 132 | // 7.18.2 Limits of specified-width integer types 133 | 134 | #if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 135 | 136 | // 7.18.2.1 Limits of exact-width integer types 137 | #define INT8_MIN 
((int8_t)_I8_MIN) 138 | #define INT8_MAX _I8_MAX 139 | #define INT16_MIN ((int16_t)_I16_MIN) 140 | #define INT16_MAX _I16_MAX 141 | #define INT32_MIN ((int32_t)_I32_MIN) 142 | #define INT32_MAX _I32_MAX 143 | #define INT64_MIN ((int64_t)_I64_MIN) 144 | #define INT64_MAX _I64_MAX 145 | #define UINT8_MAX _UI8_MAX 146 | #define UINT16_MAX _UI16_MAX 147 | #define UINT32_MAX _UI32_MAX 148 | #define UINT64_MAX _UI64_MAX 149 | 150 | // 7.18.2.2 Limits of minimum-width integer types 151 | #define INT_LEAST8_MIN INT8_MIN 152 | #define INT_LEAST8_MAX INT8_MAX 153 | #define INT_LEAST16_MIN INT16_MIN 154 | #define INT_LEAST16_MAX INT16_MAX 155 | #define INT_LEAST32_MIN INT32_MIN 156 | #define INT_LEAST32_MAX INT32_MAX 157 | #define INT_LEAST64_MIN INT64_MIN 158 | #define INT_LEAST64_MAX INT64_MAX 159 | #define UINT_LEAST8_MAX UINT8_MAX 160 | #define UINT_LEAST16_MAX UINT16_MAX 161 | #define UINT_LEAST32_MAX UINT32_MAX 162 | #define UINT_LEAST64_MAX UINT64_MAX 163 | 164 | // 7.18.2.3 Limits of fastest minimum-width integer types 165 | #define INT_FAST8_MIN INT8_MIN 166 | #define INT_FAST8_MAX INT8_MAX 167 | #define INT_FAST16_MIN INT16_MIN 168 | #define INT_FAST16_MAX INT16_MAX 169 | #define INT_FAST32_MIN INT32_MIN 170 | #define INT_FAST32_MAX INT32_MAX 171 | #define INT_FAST64_MIN INT64_MIN 172 | #define INT_FAST64_MAX INT64_MAX 173 | #define UINT_FAST8_MAX UINT8_MAX 174 | #define UINT_FAST16_MAX UINT16_MAX 175 | #define UINT_FAST32_MAX UINT32_MAX 176 | #define UINT_FAST64_MAX UINT64_MAX 177 | 178 | // 7.18.2.4 Limits of integer types capable of holding object pointers 179 | #ifdef _WIN64 // [ 180 | # define INTPTR_MIN INT64_MIN 181 | # define INTPTR_MAX INT64_MAX 182 | # define UINTPTR_MAX UINT64_MAX 183 | #else // _WIN64 ][ 184 | # define INTPTR_MIN INT32_MIN 185 | # define INTPTR_MAX INT32_MAX 186 | # define UINTPTR_MAX UINT32_MAX 187 | #endif // _WIN64 ] 188 | 189 | // 7.18.2.5 Limits of greatest-width integer types 190 | #define INTMAX_MIN INT64_MIN 191 | #define 
INTMAX_MAX INT64_MAX 192 | #define UINTMAX_MAX UINT64_MAX 193 | 194 | // 7.18.3 Limits of other integer types 195 | 196 | #ifdef _WIN64 // [ 197 | # define PTRDIFF_MIN _I64_MIN 198 | # define PTRDIFF_MAX _I64_MAX 199 | #else // _WIN64 ][ 200 | # define PTRDIFF_MIN _I32_MIN 201 | # define PTRDIFF_MAX _I32_MAX 202 | #endif // _WIN64 ] 203 | 204 | #define SIG_ATOMIC_MIN INT_MIN 205 | #define SIG_ATOMIC_MAX INT_MAX 206 | 207 | #ifndef SIZE_MAX // [ 208 | # ifdef _WIN64 // [ 209 | # define SIZE_MAX _UI64_MAX 210 | # else // _WIN64 ][ 211 | # define SIZE_MAX _UI32_MAX 212 | # endif // _WIN64 ] 213 | #endif // SIZE_MAX ] 214 | 215 | // WCHAR_MIN and WCHAR_MAX are also defined in 216 | #ifndef WCHAR_MIN // [ 217 | # define WCHAR_MIN 0 218 | #endif // WCHAR_MIN ] 219 | #ifndef WCHAR_MAX // [ 220 | # define WCHAR_MAX _UI16_MAX 221 | #endif // WCHAR_MAX ] 222 | 223 | #define WINT_MIN 0 224 | #define WINT_MAX _UI16_MAX 225 | 226 | #endif // __STDC_LIMIT_MACROS ] 227 | 228 | 229 | // 7.18.4 Limits of other integer types 230 | 231 | #if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 232 | 233 | // 7.18.4.1 Macros for minimum-width integer constants 234 | 235 | #define INT8_C(val) val##i8 236 | #define INT16_C(val) val##i16 237 | #define INT32_C(val) val##i32 238 | #define INT64_C(val) val##i64 239 | 240 | #define UINT8_C(val) val##ui8 241 | #define UINT16_C(val) val##ui16 242 | #define UINT32_C(val) val##ui32 243 | #define UINT64_C(val) val##ui64 244 | 245 | // 7.18.4.2 Macros for greatest-width integer constants 246 | // These #ifndef's are needed to prevent collisions with . 247 | // Check out Issue 9 for the details. 
248 | #ifndef INTMAX_C // [ 249 | # define INTMAX_C INT64_C 250 | #endif // INTMAX_C ] 251 | #ifndef UINTMAX_C // [ 252 | # define UINTMAX_C UINT64_C 253 | #endif // UINTMAX_C ] 254 | 255 | #endif // __STDC_CONSTANT_MACROS ] 256 | 257 | #endif // _MSC_VER >= 1600 ] 258 | 259 | #endif // _MSC_STDINT_H_ ] 260 | -------------------------------------------------------------------------------- /preshed/__init__.pxd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/preshed/7bd9d00a9b9460020ad9f9d7f06499efd6a08b58/preshed/__init__.pxd -------------------------------------------------------------------------------- /preshed/__init__.py: -------------------------------------------------------------------------------- 1 | from .about import * 2 | -------------------------------------------------------------------------------- /preshed/about.py: -------------------------------------------------------------------------------- 1 | __title__ = "preshed" 2 | __version__ = "3.0.10" 3 | __summary__ = "Cython hash table that trusts the keys are pre-hashed" 4 | __uri__ = "https://github.com/explosion/preshed" 5 | __author__ = "Explosion" 6 | __email__ = "contact@explosion.ai" 7 | __license__ = "MIT" 8 | __release__ = True 9 | -------------------------------------------------------------------------------- /preshed/bloom.pxd: -------------------------------------------------------------------------------- 1 | from libc.stdint cimport uint64_t, uint32_t 2 | from cymem.cymem cimport Pool 3 | 4 | ctypedef uint64_t key_t 5 | 6 | cdef struct BloomStruct: 7 | key_t* bitfield 8 | key_t hcount # hash count, number of hash functions 9 | key_t length 10 | uint32_t seed 11 | 12 | 13 | cdef class BloomFilter: 14 | cdef Pool mem 15 | cdef BloomStruct* c_bloom 16 | cdef inline bint contains(self, key_t item) nogil 17 | 18 | 19 | cdef void bloom_init(Pool mem, BloomStruct* bloom, key_t hcount, key_t length, uint32_t seed) 
except * 20 | 21 | cdef void bloom_add(BloomStruct* bloom, key_t item) nogil 22 | 23 | cdef bint bloom_contains(const BloomStruct* bloom, key_t item) nogil 24 | 25 | cdef void bloom_add(BloomStruct* bloom, key_t item) nogil 26 | -------------------------------------------------------------------------------- /preshed/bloom.pyx: -------------------------------------------------------------------------------- 1 | # cython: infer_types=True 2 | # cython: cdivision=True 3 | # 4 | from murmurhash.mrmr cimport hash128_x86 5 | import math 6 | from array import array 7 | 8 | try: 9 | import copy_reg 10 | except ImportError: 11 | import copyreg as copy_reg 12 | 13 | 14 | def calculate_size_and_hash_count(members, error_rate): 15 | """Calculate the optimal size in bits and number of hash functions for a 16 | given number of members and error rate. 17 | """ 18 | base = math.log(1 / (2 ** math.log(2))) 19 | bit_count = math.ceil((members * math.log(error_rate)) / base) 20 | hash_count = math.floor((bit_count / members) * math.log(2)) 21 | return (bit_count, hash_count) 22 | 23 | 24 | cdef class BloomFilter: 25 | """Bloom filter that allows for basic membership tests. 26 | 27 | Only integers are supported as keys. 
28 | """ 29 | def __init__(self, key_t size=(2 ** 10), key_t hash_funcs=23, uint32_t seed=0): 30 | self.mem = Pool() 31 | self.c_bloom = self.mem.alloc(1, sizeof(BloomStruct)) 32 | bloom_init(self.mem, self.c_bloom, hash_funcs, size, seed) 33 | 34 | @classmethod 35 | def from_error_rate(cls, members, error_rate=1E-4): 36 | params = calculate_size_and_hash_count(members, error_rate) 37 | return cls(*params) 38 | 39 | def add(self, key_t item): 40 | bloom_add(self.c_bloom, item) 41 | 42 | def __contains__(self, item): 43 | return bloom_contains(self.c_bloom, item) 44 | 45 | cdef inline bint contains(self, key_t item) nogil: 46 | return bloom_contains(self.c_bloom, item) 47 | 48 | def to_bytes(self): 49 | return bloom_to_bytes(self.c_bloom) 50 | 51 | def from_bytes(self, bytes byte_string): 52 | bloom_from_bytes(self.mem, self.c_bloom, byte_string) 53 | return self 54 | 55 | 56 | cdef bytes bloom_to_bytes(const BloomStruct* bloom): 57 | py = array("L") 58 | py.append(bloom.hcount) 59 | py.append(bloom.length) 60 | py.append(bloom.seed) 61 | for i in range(bloom.length // sizeof(key_t)): 62 | py.append(bloom.bitfield[i]) 63 | if hasattr(py, "tobytes"): 64 | return py.tobytes() 65 | else: 66 | # Python 2 :( 67 | return py.tostring() 68 | 69 | 70 | cdef void bloom_from_bytes(Pool mem, BloomStruct* bloom, bytes data): 71 | py = array("L") 72 | if hasattr(py, "frombytes"): 73 | py.frombytes(data) 74 | else: 75 | py.fromstring(data) 76 | bloom.hcount = py[0] 77 | bloom.length = py[1] 78 | bloom.seed = py[2] 79 | bloom.bitfield = mem.alloc(bloom.length // sizeof(key_t), sizeof(key_t)) 80 | for i in range(bloom.length // sizeof(key_t)): 81 | bloom.bitfield[i] = py[3+i] 82 | 83 | 84 | cdef void bloom_init(Pool mem, BloomStruct* bloom, key_t hcount, key_t length, uint32_t seed) except *: 85 | # size should be a multiple of the container size - round up 86 | if length % sizeof(key_t): 87 | length = math.ceil(length / sizeof(key_t)) * sizeof(key_t) 88 | bloom.length = length 89 | 
bloom.hcount = hcount 90 | bloom.bitfield = mem.alloc(length // sizeof(key_t), sizeof(key_t)) 91 | bloom.seed = seed 92 | 93 | 94 | # Instead of calling MurmurHash with a different seed for each hash function, this 95 | # generates two initial hash values and then combines them to create the correct 96 | # number of hashes. This technique is faster than just doing MurmurhHash 97 | # repeatedly and has been shown to work as well as full hashing. 98 | 99 | # For details see "Less Hashing, Same Performance: Building a Better Bloom 100 | # Filter", Kirsch & Mitzenmacher. 101 | 102 | # https://www.semanticscholar.org/paper/Less-hashing%2C-same-performance%3A-Building-a-better-Kirsch-Mitzenmacher/65c43afbfc064705bdc40d3473f32518e9306429 103 | # The choice of seeds is arbitrary. 104 | 105 | 106 | cdef void bloom_add(BloomStruct* bloom, key_t item) nogil: 107 | cdef key_t hv 108 | cdef key_t[2] keys 109 | cdef key_t one = 1 # We want this explicitly typed, because bits 110 | hash128_x86(&item, sizeof(key_t), 0, &keys) 111 | for hiter in range(bloom.hcount): 112 | hv = (keys[0] + (hiter * keys[1])) % bloom.length 113 | bloom.bitfield[hv // sizeof(key_t)] |= one << (hv % sizeof(key_t)) 114 | 115 | 116 | cdef bint bloom_contains(const BloomStruct* bloom, key_t item) nogil: 117 | cdef key_t hv 118 | cdef key_t[2] keys 119 | cdef key_t one = 1 # We want this explicitly typed, because bits 120 | hash128_x86(&item, sizeof(key_t), 0, &keys) 121 | for hiter in range(bloom.hcount): 122 | hv = (keys[0] + (hiter * keys[1])) % bloom.length 123 | if not (bloom.bitfield[hv // sizeof(key_t)] & one << (hv % sizeof(key_t))): 124 | return False 125 | return True 126 | 127 | 128 | def pickle_bloom(BloomFilter bloom): 129 | return unpickle_bloom, (bloom.to_bytes(),) 130 | 131 | 132 | def unpickle_bloom(byte_string): 133 | return BloomFilter().from_bytes(byte_string) 134 | 135 | 136 | copy_reg.pickle(BloomFilter, pickle_bloom, unpickle_bloom) 137 | 
-------------------------------------------------------------------------------- /preshed/counter.pxd: -------------------------------------------------------------------------------- 1 | from libc.stdint cimport int64_t 2 | 3 | from cymem.cymem cimport Pool 4 | 5 | from .maps cimport MapStruct 6 | from .maps cimport map_init, map_get, map_set, map_iter 7 | from .maps cimport key_t 8 | 9 | 10 | ctypedef int64_t count_t 11 | 12 | 13 | cdef class PreshCounter: 14 | cdef Pool mem 15 | cdef MapStruct* c_map 16 | cdef public object smoother 17 | cdef readonly count_t total 18 | 19 | cpdef int inc(self, key_t key, count_t inc) except -1 20 | -------------------------------------------------------------------------------- /preshed/counter.pyx: -------------------------------------------------------------------------------- 1 | """Count occurrences of uint64-valued keys.""" 2 | from __future__ import division 3 | cimport cython 4 | from libc.math cimport log, exp, sqrt 5 | 6 | 7 | cdef class PreshCounter: 8 | def __init__(self, initial_size=8): 9 | assert initial_size != 0 10 | assert initial_size & (initial_size - 1) == 0 11 | self.mem = Pool() 12 | self.c_map = self.mem.alloc(1, sizeof(MapStruct)) 13 | map_init(self.mem, self.c_map, initial_size) 14 | self.smoother = None 15 | self.total = 0 16 | 17 | property length: 18 | def __get__(self): 19 | return self.c_map.length 20 | 21 | def __len__(self): 22 | return self.c_map.length 23 | 24 | def __iter__(self): 25 | cdef int i = 0 26 | cdef key_t key 27 | cdef void* value 28 | while map_iter(self.c_map, &i, &key, &value): 29 | yield key, value 30 | 31 | def __getitem__(self, key_t key): 32 | return map_get(self.c_map, key) 33 | 34 | cpdef int inc(self, key_t key, count_t inc) except -1: 35 | cdef count_t c = map_get(self.c_map, key) 36 | c += inc 37 | map_set(self.mem, self.c_map, key, c) 38 | self.total += inc 39 | return c 40 | 41 | def prob(self, key_t key): 42 | cdef GaleSmoother smoother 43 | cdef void* value = 
map_get(self.c_map, key) 44 | if self.smoother is not None: 45 | smoother = self.smoother 46 | r_star = self.smoother(value) 47 | return r_star / self.smoother.total 48 | elif value == NULL: 49 | return 0 50 | else: 51 | return value / self.total 52 | 53 | def smooth(self): 54 | self.smoother = GaleSmoother(self) 55 | 56 | 57 | cdef class GaleSmoother: 58 | cdef Pool mem 59 | cdef count_t* Nr 60 | cdef double gradient 61 | cdef double intercept 62 | cdef readonly count_t cutoff 63 | cdef count_t Nr0 64 | cdef readonly double total 65 | 66 | def __init__(self, PreshCounter counts): 67 | count_counts = PreshCounter() 68 | cdef double total = 0 69 | for _, count in counts: 70 | count_counts.inc(count, 1) 71 | total += count 72 | # If we have no items seen 1 or 2 times, this doesn't work. But, this 73 | # won't be true in real data... 74 | assert count_counts[1] != 0 and count_counts[2] != 0, "Cannot smooth your weird data" 75 | # Extrapolate Nr0 from Nr1 and Nr2. 76 | self.Nr0 = count_counts[1] + (count_counts[1] - count_counts[2]) 77 | self.mem = Pool() 78 | 79 | cdef double[2] mb 80 | 81 | cdef int n_counts = 0 82 | for _ in count_counts: 83 | n_counts += 1 84 | sorted_r = count_counts.mem.alloc(n_counts, sizeof(count_t)) 85 | self.Nr = self.mem.alloc(n_counts, sizeof(count_t)) 86 | for i, (count, count_count) in enumerate(sorted(count_counts)): 87 | sorted_r[i] = count 88 | self.Nr[i] = count_count 89 | 90 | _fit_loglinear_model(mb, sorted_r, self.Nr, n_counts) 91 | 92 | self.cutoff = _find_when_to_switch(sorted_r, self.Nr, mb[0], mb[1], 93 | n_counts) 94 | self.gradient = mb[0] 95 | self.intercept = mb[1] 96 | self.total = self(0) * self.Nr0 97 | for count, count_count in count_counts: 98 | self.total += self(count) * count_count 99 | 100 | def __call__(self, count_t r): 101 | if r == 0: 102 | return self.Nr[1] / self.Nr0 103 | elif r < self.cutoff: 104 | return turing_estimate_of_r(r, self.Nr[r-1], self.Nr[r]) 105 | else: 106 | return gale_estimate_of_r(r, 
self.gradient, self.intercept) 107 | 108 | def count_count(self, count_t r): 109 | if r == 0: 110 | return self.Nr0 111 | else: 112 | return self.Nr[r-1] 113 | 114 | 115 | @cython.cdivision(True) 116 | cdef double turing_estimate_of_r(double r, double Nr, double Nr1) except -1: 117 | return ((r + 1) * Nr1) / Nr 118 | 119 | 120 | @cython.cdivision(True) 121 | cdef double gale_estimate_of_r(double r, double gradient, double intercept) except -1: 122 | cdef double e_nr = exp(gradient * log(r) + intercept) 123 | cdef double e_nr1 = exp(gradient * log(r+1) + intercept) 124 | return (r + 1) * (e_nr1 / e_nr) 125 | 126 | 127 | @cython.cdivision(True) 128 | cdef void _fit_loglinear_model(double* output, count_t* sorted_r, count_t* Nr, 129 | int length) except *: 130 | cdef double x_mean = 0.0 131 | cdef double y_mean = 0.0 132 | 133 | cdef Pool mem = Pool() 134 | x = mem.alloc(length, sizeof(double)) 135 | y = mem.alloc(length, sizeof(double)) 136 | 137 | cdef int i 138 | for i in range(length): 139 | r = sorted_r[i] 140 | x[i] = log(r) 141 | y[i] = log(_get_zr(i, sorted_r, Nr[i], length)) 142 | x_mean += x[i] 143 | y_mean += y[i] 144 | 145 | x_mean /= length 146 | y_mean /= length 147 | 148 | cdef double ss_xy = 0.0 149 | cdef double ss_xx = 0.0 150 | 151 | for i in range(length): 152 | x_dist = x[i] - x_mean 153 | y_dist = y[i] - y_mean 154 | # SS_xy = sum the product of the distances from the mean 155 | ss_xy += x_dist * y_dist 156 | # SS_xx = sum the squares of the x distance 157 | ss_xx += x_dist * x_dist 158 | # Gradient 159 | output[0] = ss_xy / ss_xx 160 | # Intercept 161 | output[1] = y_mean - output[0] * x_mean 162 | 163 | 164 | @cython.cdivision(True) 165 | cdef double _get_zr(int j, count_t* sorted_r, count_t Nr_j, int n_counts) except -1: 166 | cdef double r_i = sorted_r[j-1] if j >= 1 else 0 167 | cdef double r_j = sorted_r[j] 168 | cdef double r_k = sorted_r[j+1] if (j+1) < n_counts else (2 * r_i - 1) 169 | return 2 * Nr_j / (r_k - r_i) 170 | 171 | 172 | 
@cython.cdivision(True) 173 | cdef double _variance(double r, double Nr, double Nr1) nogil: 174 | return 1.96 * sqrt((r+1)**2 * (Nr1 / Nr**2) * (1.0 + (Nr1 / Nr))) 175 | 176 | 177 | @cython.cdivision(True) 178 | cdef count_t _find_when_to_switch(count_t* sorted_r, count_t* Nr, double m, double b, 179 | int length) except -1: 180 | cdef int i 181 | cdef count_t r 182 | for i in range(length-1): 183 | r = sorted_r[i] 184 | if sorted_r[i+1] != r+1: 185 | return r 186 | g_r = gale_estimate_of_r(r, m, b) 187 | t_r = turing_estimate_of_r(r, Nr[i], Nr[i+1]) 188 | if abs(t_r - g_r) <= _variance(r, Nr[i], Nr[i+1]): 189 | return r 190 | else: 191 | return length - 1 192 | -------------------------------------------------------------------------------- /preshed/maps.pxd: -------------------------------------------------------------------------------- 1 | from libc.stdint cimport uint64_t 2 | from cymem.cymem cimport Pool 3 | 4 | 5 | ctypedef uint64_t key_t 6 | 7 | 8 | cdef struct Cell: 9 | key_t key 10 | void* value 11 | 12 | 13 | cdef struct Result: 14 | int found 15 | void* value 16 | 17 | 18 | cdef struct MapStruct: 19 | Cell* cells 20 | void* value_for_empty_key 21 | void* value_for_del_key 22 | key_t length 23 | key_t filled 24 | bint is_empty_key_set 25 | bint is_del_key_set 26 | 27 | 28 | cdef void* map_bulk_get(const MapStruct* map_, const key_t* keys, void** values, 29 | int n) nogil 30 | 31 | 32 | cdef Result map_get_unless_missing(const MapStruct* map_, const key_t key) nogil 33 | 34 | cdef void* map_get(const MapStruct* map_, const key_t key) nogil 35 | 36 | cdef void map_set(Pool mem, MapStruct* map_, key_t key, void* value) except * 37 | 38 | cdef void map_init(Pool mem, MapStruct* pmap, size_t length) except * 39 | 40 | cdef bint map_iter(const MapStruct* map_, int* i, key_t* key, void** value) nogil 41 | 42 | cdef void* map_clear(MapStruct* map_, const key_t key) nogil 43 | 44 | 45 | cdef class PreshMap: 46 | cdef MapStruct* c_map 47 | cdef Pool mem 48 | 49 | 
cdef inline void* get(self, key_t key) nogil 50 | cdef void set(self, key_t key, void* value) except * 51 | 52 | 53 | cdef class PreshMapArray: 54 | cdef Pool mem 55 | cdef MapStruct* maps 56 | cdef size_t length 57 | 58 | cdef inline void* get(self, size_t i, key_t key) nogil 59 | cdef void set(self, size_t i, key_t key, void* value) except * 60 | -------------------------------------------------------------------------------- /preshed/maps.pyx: -------------------------------------------------------------------------------- 1 | # cython: infer_types=True 2 | # cython: cdivision=True 3 | # 4 | cimport cython 5 | 6 | 7 | DEF EMPTY_KEY = 0 8 | DEF DELETED_KEY = 1 9 | 10 | 11 | cdef class PreshMap: 12 | """Hash map that assumes keys come pre-hashed. Maps uint64_t --> uint64_t. 13 | Uses open addressing with linear probing. 14 | 15 | Usage 16 | map = PreshMap() # Create a table 17 | map = PreshMap(initial_size=1024) # Create with initial size (efficiency) 18 | map[key] = value # Set a value to a key 19 | value = map[key] # Get a value given a key 20 | for key, value in map.items(): # Iterate over items 21 | len(map) # Get number of inserted keys 22 | """ 23 | def __init__(self, size_t initial_size=8): 24 | # Size must be power of two 25 | if initial_size == 0: 26 | initial_size = 8 27 | if initial_size & (initial_size - 1) != 0: 28 | power = 1 29 | while power < initial_size: 30 | power *= 2 31 | initial_size = power 32 | self.mem = Pool() 33 | self.c_map = <MapStruct*>self.mem.alloc(1, sizeof(MapStruct))  # Pool.alloc returns void*; cast to typed pointer 34 | map_init(self.mem, self.c_map, initial_size) 35 | 36 | property capacity: 37 | def __get__(self): 38 | return self.c_map.length 39 | 40 | def items(self): 41 | cdef key_t key 42 | cdef void* value 43 | cdef int i = 0 44 | while map_iter(self.c_map, &i, &key, &value): 45 | yield key, <size_t>value  # payload is stored as void*; expose it as an int 46 | 47 | def keys(self): 48 | for key, _ in self.items(): 49 | yield key 50 | 51 | def values(self): 52 | for _, value in self.items(): 53 | yield value 54 | 55 | def pop(self, key_t key, 
default=None): 56 | cdef Result result = map_get_unless_missing(self.c_map, key) 57 | map_clear(self.c_map, key) 58 | if result.found: 59 | return <size_t>result.value  # payload is stored as void*; cast back to int 60 | else: 61 | return default 62 | 63 | def __getitem__(self, key_t key): 64 | cdef Result result = map_get_unless_missing(self.c_map, key) 65 | if result.found: 66 | return <size_t>result.value  # payload is stored as void*; cast back to int 67 | else: 68 | return None 69 | 70 | def __setitem__(self, key_t key, size_t value): 71 | map_set(self.mem, self.c_map, key, <void*>value)  # store the int payload in the void* slot 72 | 73 | def __delitem__(self, key_t key): 74 | map_clear(self.c_map, key) 75 | 76 | def __len__(self): 77 | return self.c_map.filled 78 | 79 | def __contains__(self, key_t key): 80 | cdef Result result = map_get_unless_missing(self.c_map, key) 81 | return True if result.found else False 82 | 83 | def __iter__(self): 84 | for key in self.keys(): 85 | yield key 86 | 87 | cdef inline void* get(self, key_t key) nogil: 88 | return map_get(self.c_map, key) 89 | 90 | cdef void set(self, key_t key, void* value) except *: 91 | map_set(self.mem, self.c_map, key, value) 92 | 93 | 94 | cdef class PreshMapArray: 95 | """An array of hash tables that assume keys come pre-hashed. Each table 96 | uses open addressing with linear probing. 
97 | """ 98 | def __init__(self, size_t length, size_t initial_size=8): 99 | self.mem = Pool() 100 | self.length = length 101 | self.maps = <MapStruct*>self.mem.alloc(length, sizeof(MapStruct))  # Pool.alloc returns void*; cast to typed pointer 102 | for i in range(length): 103 | map_init(self.mem, &self.maps[i], initial_size) 104 | 105 | cdef inline void* get(self, size_t i, key_t key) nogil: 106 | return map_get(&self.maps[i], key) 107 | 108 | cdef void set(self, size_t i, key_t key, void* value) except *: 109 | map_set(self.mem, &self.maps[i], key, value) 110 | 111 | 112 | cdef void map_init(Pool mem, MapStruct* map_, size_t length) except *: 113 | map_.length = length 114 | map_.filled = 0 115 | map_.cells = <Cell*>mem.alloc(length, sizeof(Cell))  # Pool.alloc returns void*; cast to typed pointer 116 | 117 | 118 | cdef void map_set(Pool mem, MapStruct* map_, key_t key, void* value) except *: 119 | cdef Cell* cell 120 | if key == EMPTY_KEY: 121 | map_.value_for_empty_key = value 122 | map_.is_empty_key_set = True 123 | elif key == DELETED_KEY: 124 | map_.value_for_del_key = value 125 | map_.is_del_key_set = True 126 | else: 127 | cell = _find_cell_for_insertion(map_.cells, map_.length, key) 128 | if cell.key == EMPTY_KEY: 129 | map_.filled += 1 130 | cell.key = key 131 | cell.value = value 132 | if (map_.filled + 1) * 5 >= (map_.length * 3): 133 | _resize(mem, map_) 134 | 135 | 136 | cdef void* map_get(const MapStruct* map_, const key_t key) nogil: 137 | if key == EMPTY_KEY: 138 | return map_.value_for_empty_key 139 | elif key == DELETED_KEY: 140 | return map_.value_for_del_key 141 | cdef Cell* cell = _find_cell(map_.cells, map_.length, key) 142 | return cell.value 143 | 144 | 145 | cdef Result map_get_unless_missing(const MapStruct* map_, const key_t key) nogil: 146 | cdef Result result 147 | cdef Cell* cell 148 | result.found = 0 149 | result.value = NULL 150 | if key == EMPTY_KEY: 151 | if map_.is_empty_key_set: 152 | result.found = 1 153 | result.value = map_.value_for_empty_key 154 | elif key == DELETED_KEY: 155 | if map_.is_del_key_set: 156 | result.found = 1 157 | 
result.value = map_.value_for_del_key 158 | else: 159 | cell = _find_cell(map_.cells, map_.length, key) 160 | if cell.key == key: 161 | result.found = 1 162 | result.value = cell.value 163 | return result 164 | 165 | 166 | cdef void* map_clear(MapStruct* map_, const key_t key) nogil: 167 | if key == EMPTY_KEY: 168 | value = map_.value_for_empty_key if map_.is_empty_key_set else NULL 169 | map_.is_empty_key_set = False 170 | return value 171 | elif key == DELETED_KEY: 172 | value = map_.value_for_del_key if map_.is_del_key_set else NULL 173 | map_.is_del_key_set = False 174 | return value 175 | else: 176 | cell = _find_cell(map_.cells, map_.length, key) 177 | cell.key = DELETED_KEY 178 | # We shouldn't decrement the "filled" value here, as we're not actually 179 | # making "empty" values -- deleted values aren't quite the same. 180 | # Instead if we manage to insert into a deleted slot, we don't increment 181 | # the fill rate. 182 | return cell.value 183 | 184 | 185 | cdef void* map_bulk_get(const MapStruct* map_, const key_t* keys, void** values, 186 | int n) nogil: 187 | cdef int i 188 | for i in range(n): 189 | values[i] = map_get(map_, keys[i]) 190 | 191 | 192 | cdef bint map_iter(const MapStruct* map_, int* i, key_t* key, void** value) nogil: 193 | '''Iterate over the filled items, setting the current place in i, and the 194 | key and value. Return False when iteration finishes. 
195 | ''' 196 | cdef const Cell* cell 197 | while i[0] < map_.length: 198 | cell = &map_.cells[i[0]] 199 | i[0] += 1 200 | if cell[0].key != EMPTY_KEY and cell[0].key != DELETED_KEY: 201 | key[0] = cell[0].key 202 | value[0] = cell[0].value 203 | return True 204 | # Remember to check for cells keyed by the special empty and deleted keys 205 | if i[0] == map_.length: 206 | i[0] += 1 207 | if map_.is_empty_key_set: 208 | key[0] = EMPTY_KEY 209 | value[0] = map_.value_for_empty_key 210 | return True 211 | if i[0] == map_.length + 1: 212 | i[0] += 1 213 | if map_.is_del_key_set: 214 | key[0] = DELETED_KEY 215 | value[0] = map_.value_for_del_key 216 | return True 217 | return False 218 | 219 | 220 | @cython.cdivision 221 | cdef inline Cell* _find_cell(Cell* cells, const key_t size, const key_t key) nogil: 222 | # Modulo for powers-of-two via bitwise & 223 | cdef key_t i = (key & (size - 1)) 224 | while cells[i].key != EMPTY_KEY and cells[i].key != key: 225 | i = (i + 1) & (size - 1) 226 | return &cells[i] 227 | 228 | 229 | @cython.cdivision 230 | cdef inline Cell* _find_cell_for_insertion(Cell* cells, const key_t size, const key_t key) nogil: 231 | """Find the correct cell to insert a value, which could be a previously 232 | deleted cell. If we cross a deleted cell and the key is in the table, we 233 | mark the later cell as deleted, and return the earlier one.""" 234 | cdef Cell* deleted = NULL 235 | # Modulo for powers-of-two via bitwise & 236 | cdef key_t i = (key & (size - 1)) 237 | while cells[i].key != EMPTY_KEY and cells[i].key != key: 238 | if cells[i].key == DELETED_KEY: 239 | deleted = &cells[i] 240 | i = (i + 1) & (size - 1) 241 | if deleted is not NULL: 242 | if cells[i].key == key: 243 | # We need to ensure we don't end up with the key in the table twice. 244 | # If we're using a deleted cell and we also have the key, we mark 245 | # the later cell as deleted. 
246 | cells[i].key = DELETED_KEY 247 | return deleted 248 | return &cells[i] 249 | 250 | 251 | cdef void _resize(Pool mem, MapStruct* map_) except *: 252 | cdef size_t new_size = map_.length * 2 253 | cdef Cell* old_cells = map_.cells 254 | cdef size_t old_size = map_.length 255 | 256 | map_.length = new_size 257 | map_.filled = 0 258 | map_.cells = <Cell*>mem.alloc(new_size, sizeof(Cell))  # Pool.alloc returns void*; cast to typed pointer 259 | 260 | cdef size_t i 261 | cdef size_t slot 262 | for i in range(old_size): 263 | if old_cells[i].key != EMPTY_KEY and old_cells[i].key != DELETED_KEY: 264 | map_set(mem, map_, old_cells[i].key, old_cells[i].value) 265 | mem.free(old_cells) 266 | -------------------------------------------------------------------------------- /preshed/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/preshed/7bd9d00a9b9460020ad9f9d7f06499efd6a08b58/preshed/tests/__init__.py -------------------------------------------------------------------------------- /preshed/tests/test_bloom.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pytest 3 | import pickle 4 | 5 | from preshed.bloom import BloomFilter 6 | 7 | def test_contains(): 8 | bf = BloomFilter() 9 | assert 23 not in bf 10 | bf.add(23) 11 | assert 23 in bf 12 | 13 | bf.add(5) 14 | bf.add(42) 15 | bf.add(1002) 16 | assert 5 in bf 17 | assert 42 in bf 18 | assert 1002 in bf 19 | 20 | def test_no_false_negatives(): 21 | bf = BloomFilter(size=100, hash_funcs=2) 22 | for ii in range(0,1000,20): 23 | bf.add(ii) 24 | 25 | for ii in range(0,1000,20): 26 | assert ii in bf 27 | 28 | def test_from_error(): 29 | bf = BloomFilter.from_error_rate(1000) 30 | for ii in range(0,1000,20): 31 | bf.add(ii) 32 | 33 | for ii in range(0,1000,20): 34 | assert ii in bf 35 | 36 | def test_to_from_bytes(): 37 | bf = BloomFilter(size=100, hash_funcs=2) 38 | for ii in range(0,1000,20): 39 | bf.add(ii) 
40 | data = bf.to_bytes() 41 | bf2 = BloomFilter() 42 | for ii in range(0,1000,20): 43 | assert ii not in bf2 44 | bf2.from_bytes(data) 45 | for ii in range(0,1000,20): 46 | assert ii in bf2 47 | assert bf2.to_bytes() == data 48 | 49 | def test_bloom_pickle(): 50 | bf = BloomFilter(size=100, hash_funcs=2) 51 | for ii in range(0,1000,20): 52 | bf.add(ii) 53 | data = pickle.dumps(bf) 54 | bf2 = pickle.loads(data) 55 | for ii in range(0,1000,20): 56 | assert ii in bf2 57 | -------------------------------------------------------------------------------- /preshed/tests/test_counter.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pytest 3 | 4 | from preshed.counter import PreshCounter 5 | 6 | 7 | def test_count(): 8 | counter = PreshCounter() 9 | assert counter[12] == 0 10 | counter.inc(12, 1) 11 | assert counter[12] == 1 12 | counter.inc(14, 10) 13 | counter.inc(9, 10) 14 | counter.inc(12, 4) 15 | assert counter[12] == 5 16 | assert counter[14] == 10 17 | assert counter[9] == 10 18 | 19 | 20 | def test_unsmooth_prob(): 21 | counter = PreshCounter() 22 | assert counter.prob(12) == 0.0 23 | counter.inc(12, 1) 24 | assert counter.prob(12) == 1.0 25 | counter.inc(14, 10) 26 | assert counter.prob(14) == 10 / 11 27 | assert counter.prob(12) == 1.0 / 11 28 | 29 | def test_smooth_prob(): 30 | p = PreshCounter() 31 | # 1 10 32 | # 2 6 33 | # 3 4 34 | # 5 2 35 | # 8 1 36 | for i in range(10): 37 | p.inc(100-i, 1) # 10 items of freq 1 38 | for i in range(6): 39 | p.inc(90 - i, 2) # 6 items of freq 2 40 | for i in range(4): 41 | p.inc(80 - i, 3) # 4 items of freq 3 42 | for i in range(2): 43 | p.inc(70 - i, 5) # 2 items of freq 5 44 | for i in range(1): 45 | p.inc(60 - i, 8) # 1 item of freq 8 46 | 47 | assert p.total == (10 * 1) + (6 * 2) + (4 * 3) + (2 * 5) + (1 * 8) 48 | 49 | assert p.prob(100) == 1.0 / p.total 50 | assert p.prob(200) == 0.0 51 | assert p.prob(60) == 8.0 / p.total 52 | 53 | 
p.smooth() 54 | 55 | assert p.smoother(1) < 1.0 56 | assert p.smoother(8) < 8.0 57 | assert p.prob(1000) < p.prob(100) 58 | 59 | for event, count in reversed(sorted(p, key=lambda it: it[1])): 60 | assert p.smoother(count) < count 61 | 62 | 63 | import os 64 | def test_large_freqs(): 65 | if 'TEST_FILE_LOC' in os.environ: 66 | loc = os.environ['TEST_FILE_LOC'] 67 | else: 68 | return None 69 | counts = PreshCounter() 70 | for i, line in enumerate(open(loc)): 71 | line = line.strip() 72 | if not line: 73 | continue 74 | freq = int(line.split()[0]) 75 | counts.inc(i+1, freq) 76 | oov = i+2 77 | assert counts.prob(oov) == 0.0 78 | assert counts.prob(1) < 0.1 79 | counts.smooth() 80 | assert counts.prob(oov) > 0 81 | assert counts.prob(oov) < counts.prob(i) 82 | -------------------------------------------------------------------------------- /preshed/tests/test_hashing.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from preshed.maps import PreshMap 4 | import random 5 | 6 | 7 | def test_insert(): 8 | h = PreshMap() 9 | assert h[1] is None 10 | h[1] = 5 11 | assert h[1] == 5 12 | h[2] = 6 13 | assert h[1] == 5 14 | assert h[2] == 6 15 | 16 | def test_resize(): 17 | h = PreshMap(4) 18 | h[4] = 12 19 | for i in range(10, 100): 20 | value = int(i * (random.random() + 1)) 21 | h[i] = value 22 | assert h[4] == 12 23 | 24 | 25 | def test_zero_key(): 26 | h = PreshMap() 27 | h[0] = 6 28 | h[5] = 12 29 | assert h[0] == 6 30 | assert h[5] == 12 31 | 32 | for i in range(500, 1000): 33 | h[i] = i * random.random() 34 | assert h[0] == 6 35 | assert h[5] == 12 36 | 37 | 38 | def test_iter(): 39 | key_sum = 0 40 | val_sum = 0 41 | h = PreshMap() 42 | for i in range(56, 24, -3): 43 | h[i] = i * 2 44 | key_sum += i 45 | val_sum += i * 2 46 | for key, value in h.items(): 47 | key_sum -= key 48 | val_sum -= value 49 | assert key_sum == 0 50 | assert val_sum == 0 51 | 52 | 53 | def test_one_and_empty(): 54 | # See Issue #21 55 
| table = PreshMap() 56 | for i in range(100, 110): 57 | table[i] = i 58 | del table[i] 59 | assert table[0] == None 60 | 61 | 62 | def test_many_and_empty(): 63 | # See Issue #21 64 | table = PreshMap() 65 | for i in range(100, 110): 66 | table[i] = i 67 | for i in range(100, 110): 68 | del table[i] 69 | assert table[0] == None 70 | 71 | 72 | def test_zero_values(): 73 | table = PreshMap() 74 | table[10] = 0 75 | assert table[10] == 0 76 | assert table[11] is None 77 | -------------------------------------------------------------------------------- /preshed/tests/test_pop.py: -------------------------------------------------------------------------------- 1 | from ..maps import PreshMap 2 | 3 | 4 | def test_pop1(): 5 | table = PreshMap() 6 | table[10] = 20 7 | table[30] = 25 8 | assert table[10] == 20 9 | assert table[30] == 25 10 | table.pop(30) 11 | assert table[10] == 20 12 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "cython>=0.28", 5 | "cymem>=2.0.2,<2.1.0", 6 | "murmurhash>=0.28.0,<1.1.0", 7 | ] 8 | build-backend = "setuptools.build_meta" 9 | 10 | 11 | [tool.cibuildwheel] 12 | build = "*" 13 | skip = "pp* cp36* cp37* cp38*" 14 | test-skip = "" 15 | free-threaded-support = false 16 | 17 | archs = ["native"] 18 | 19 | build-frontend = "default" 20 | config-settings = {} 21 | dependency-versions = "pinned" 22 | environment = {} 23 | environment-pass = [] 24 | build-verbosity = 0 25 | 26 | before-all = "" 27 | before-build = "" 28 | repair-wheel-command = "" 29 | 30 | test-command = "" 31 | before-test = "" 32 | test-requires = [] 33 | test-extras = [] 34 | 35 | container-engine = "docker" 36 | 37 | manylinux-x86_64-image = "manylinux2014" 38 | manylinux-i686-image = "manylinux2014" 39 | manylinux-aarch64-image = "manylinux2014" 40 | manylinux-ppc64le-image = 
"manylinux2014" 41 | manylinux-s390x-image = "manylinux2014" 42 | manylinux-pypy_x86_64-image = "manylinux2014" 43 | manylinux-pypy_i686-image = "manylinux2014" 44 | manylinux-pypy_aarch64-image = "manylinux2014" 45 | 46 | musllinux-x86_64-image = "musllinux_1_2" 47 | musllinux-i686-image = "musllinux_1_2" 48 | musllinux-aarch64-image = "musllinux_1_2" 49 | musllinux-ppc64le-image = "musllinux_1_2" 50 | musllinux-s390x-image = "musllinux_1_2" 51 | 52 | 53 | [tool.cibuildwheel.linux] 54 | repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}" 55 | 56 | [tool.cibuildwheel.macos] 57 | repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}" 58 | 59 | [tool.cibuildwheel.windows] 60 | 61 | [tool.cibuildwheel.pyodide] 62 | 63 | [tool.isort] 64 | profile = "black" 65 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cymem>=2.0.2,<2.1.0 2 | cython>=0.28 3 | pytest 4 | murmurhash>=0.28.0,<1.1.0 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import contextlib 6 | from setuptools import Extension, setup 7 | from setuptools.command.build_ext import build_ext 8 | from sysconfig import get_path 9 | from Cython.Build import cythonize 10 | 11 | 12 | PACKAGES = ["preshed", "preshed.tests"] 13 | MOD_NAMES = ["preshed.maps", "preshed.counter", "preshed.bloom"] 14 | 15 | 16 | # By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options 17 | # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used 18 | compile_options = { 19 | "msvc": ["/Ox", "/EHsc"], 20 | 
"other": ["-O3", "-Wno-strict-prototypes", "-Wno-unused-function"], 21 | } 22 | link_options = {"msvc": [], "other": []} 23 | 24 | 25 | class build_ext_options: 26 | def build_options(self): 27 | for e in self.extensions: 28 | e.extra_compile_args = compile_options.get( 29 | self.compiler.compiler_type, compile_options["other"] 30 | ) 31 | for e in self.extensions: 32 | e.extra_link_args = link_options.get( 33 | self.compiler.compiler_type, link_options["other"] 34 | ) 35 | 36 | 37 | class build_ext_subclass(build_ext, build_ext_options): 38 | def build_extensions(self): 39 | build_ext_options.build_options(self) 40 | build_ext.build_extensions(self) 41 | 42 | 43 | def clean(path): 44 | for name in MOD_NAMES: 45 | name = name.replace(".", "/") 46 | for ext in [".so", ".html", ".cpp", ".c"]: 47 | file_path = os.path.join(path, name + ext) 48 | if os.path.exists(file_path): 49 | os.unlink(file_path) 50 | 51 | 52 | @contextlib.contextmanager 53 | def chdir(new_dir): 54 | old_dir = os.getcwd() 55 | try: 56 | os.chdir(new_dir) 57 | sys.path.insert(0, new_dir) 58 | yield 59 | finally: 60 | del sys.path[0] 61 | os.chdir(old_dir) 62 | 63 | 64 | def setup_package(): 65 | root = os.path.abspath(os.path.dirname(__file__)) 66 | 67 | if len(sys.argv) > 1 and sys.argv[1] == "clean": 68 | return clean(root) 69 | 70 | with chdir(root): 71 | with open(os.path.join(root, "preshed", "about.py")) as f: 72 | about = {} 73 | exec(f.read(), about) 74 | 75 | with open(os.path.join(root, "README.md")) as f: 76 | readme = f.read() 77 | 78 | include_dirs = [get_path("include")] 79 | 80 | ext_modules = [] 81 | for mod_name in MOD_NAMES: 82 | mod_path = mod_name.replace(".", "/") + ".pyx" 83 | ext_modules.append( 84 | Extension( 85 | mod_name, [mod_path], language="c++", include_dirs=include_dirs 86 | ) 87 | ) 88 | 89 | setup( 90 | name="preshed", 91 | zip_safe=False, 92 | packages=PACKAGES, 93 | package_data={"": ["*.pyx", "*.pxd"]}, 94 | description=about["__summary__"], 95 | 
long_description=readme, 96 | long_description_content_type="text/markdown", 97 | author=about["__author__"], 98 | author_email=about["__email__"], 99 | version=about["__version__"], 100 | url=about["__uri__"], 101 | license=about["__license__"], 102 | ext_modules=cythonize(ext_modules, language_level=2), 103 | python_requires=">=3.6,<3.14", 104 | install_requires=["cymem>=2.0.2,<2.1.0", "murmurhash>=0.28.0,<1.1.0"], 105 | classifiers=[ 106 | "Environment :: Console", 107 | "Intended Audience :: Developers", 108 | "Intended Audience :: Science/Research", 109 | "License :: OSI Approved :: MIT License", 110 | "Operating System :: POSIX :: Linux", 111 | "Operating System :: MacOS :: MacOS X", 112 | "Operating System :: Microsoft :: Windows", 113 | "Programming Language :: Cython", 114 | "Programming Language :: Python :: 3.6", 115 | "Programming Language :: Python :: 3.7", 116 | "Programming Language :: Python :: 3.8", 117 | "Programming Language :: Python :: 3.9", 118 | "Programming Language :: Python :: 3.10", 119 | "Programming Language :: Python :: 3.11", 120 | "Programming Language :: Python :: 3.12", 121 | "Topic :: Scientific/Engineering", 122 | ], 123 | cmdclass={"build_ext": build_ext_subclass}, 124 | ) 125 | 126 | 127 | if __name__ == "__main__": 128 | setup_package() 129 | --------------------------------------------------------------------------------