├── .github └── workflows │ ├── build.yml │ └── publish.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── benchmarks ├── bench_city.sh └── bench_farm.sh ├── cpp.mk ├── pip-freeze.txt ├── pyproject.toml ├── python.mk ├── requirements.txt ├── setup.py ├── src ├── city.cc ├── city.h ├── citycrc.h ├── cityhash.cpp ├── cityhash.pyx ├── cityhashcrc.cpp ├── cityhashcrc.pyx ├── farm.cc ├── farm.h ├── farmhash.cpp └── farmhash.pyx └── tests ├── catch.hpp ├── cityhash64_main.cc ├── test_city.cc ├── test_cityhash.cc ├── test_cityhash.py ├── test_cityhashcrc.py └── test_farmhash.py /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | types: 9 | - opened 10 | - synchronize 11 | - reopened 12 | 13 | jobs: 14 | build: 15 | strategy: 16 | matrix: 17 | os: [windows-latest, macos-latest, ubuntu-latest] 18 | python-version: ["3.13"] 19 | 20 | runs-on: ${{ matrix.os }} 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v4 24 | 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | architecture: x64 30 | 31 | # block below based on: 32 | # https://medium.com/ai2-blog/python-caching-in-github-actions-e9452698e98d 33 | - name: Cache Python environment 34 | uses: actions/cache@v4 35 | with: 36 | path: ${{ env.pythonLocation }} 37 | key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('pip-freeze.txt') }} 38 | 39 | - name: Install dependencies 40 | run: | 41 | pip install --upgrade --upgrade-strategy eager setuptools wheel 42 | pip install --upgrade --upgrade-strategy eager -r requirements.txt 43 | pip freeze > pip-freeze.txt 44 | 45 | - name: Test with pytest 46 | run: | 47 | python setup.py build_ext --inplace 48 | pip install -e . 
49 | python -m pytest 50 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: publish 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | repository: 7 | description: 'The repository to upload the package to' 8 | required: true 9 | default: 'testpypi' 10 | 11 | jobs: 12 | build_wheels: 13 | name: Build wheels on ${{ matrix.os }} 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | os: [ubuntu-20.04, windows-2019, macOS-13] 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v4 21 | 22 | - name: Set up Python 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: '3.9' 26 | 27 | - name: Set up QEMU 28 | if: runner.os == 'Linux' 29 | # uses: docker/setup-qemu-action@v1.0.1 30 | uses: docker/setup-qemu-action@v3 31 | with: 32 | platforms: arm64 33 | 34 | - name: Build wheels 35 | # uses: joerick/cibuildwheel@v1.9.0 36 | uses: pypa/cibuildwheel@v2.23.0 37 | with: 38 | output-dir: wheelhouse 39 | env: 40 | CIBW_BUILD: '{cp36,cp37,cp38,cp39,cp310,cp311,cp312,cp313}-{manylinux_x86_64,manylinux_aarch64,win32,win_amd64,macosx_x86_64} {cp39,cp310,cp311,cp312,cp313}-macosx_arm64' 41 | CIBW_MANYLINUX_AARCH64_IMAGE: manylinux2014 42 | CIBW_ARCHS_LINUX: 'auto aarch64' 43 | CIBW_ARCHS_MACOS: 'auto arm64' 44 | CIBW_TEST_REQUIRES: pytest 45 | CIBW_TEST_COMMAND: 'pytest -s {project}/tests' 46 | CIBW_TEST_SKIP: '*-macosx_arm64' # Until the day Apple silicon instances are available on GitHub Actions 47 | 48 | - name: Upload artifact 49 | uses: actions/upload-artifact@v4 50 | with: 51 | name: cibw-wheels-${{ strategy.job-index }} 52 | path: ./wheelhouse/*.whl 53 | 54 | build_sdist: 55 | name: Build a source distribution 56 | runs-on: ubuntu-20.04 57 | steps: 58 | - name: Checkout 59 | uses: actions/checkout@v4 60 | 61 | - name: Set up Python 62 | uses: actions/setup-python@v5 63 | with: 64 | 
python-version: '3.9' 65 | 66 | - name: Build sdist 67 | run: | 68 | pip install py-cpuinfo 69 | python setup.py build sdist 70 | 71 | - name: Upload artifact 72 | uses: actions/upload-artifact@v4 73 | with: 74 | name: cibw-sdist-${{ strategy.job-index }} 75 | path: dist/*.tar.gz 76 | 77 | publish: 78 | needs: [build_wheels, build_sdist] 79 | environment: ${{ github.event.inputs.repository }} 80 | permissions: 81 | id-token: write 82 | runs-on: ubuntu-latest 83 | steps: 84 | - uses: actions/download-artifact@v4 85 | with: 86 | pattern: cibw-* 87 | path: dist 88 | merge-multiple: true 89 | 90 | - uses: pypa/gh-action-pypi-publish@release/v1 91 | with: 92 | password: ${{ secrets.PYPI_API_TOKEN }} 93 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | *.zip 8 | 9 | # Misc artifacts 10 | .DS_Store 11 | tags 12 | .cache/ 13 | *.sw[op] 14 | 15 | # Exclude these directories 16 | bin/ 17 | data/ 18 | wheelhouse/ 19 | 20 | # Distribution / packaging 21 | .Python 22 | env/ 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *,cover 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # IDE and editor stuff 69 | .idea/ 70 | .vscode/ 71 | .ropeproject/ 72 | .ipynb_checkpoints/ 73 | .pytest_cache/ 74 | .mypy_cache/ 75 | 76 | # Makefile artifacts 77 | .build_stamp 78 | 79 | # Python 3.11 80 | lib64 81 | pyvenv.cfg 82 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | python-cityhash - Python bindings for CityHash and FarmHash 2 | 3 | Copyright (c) 2021, Eugene Scherba , 4 | except where the file explicitly names other copyright holders and licenses. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of 7 | this software and associated documentation files (the "Software"), to deal in 8 | the Software without restriction, including without limitation the rights to 9 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 10 | the Software, and to permit persons to whom the Software is furnished to do so, 11 | subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 18 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 19 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 20 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | recursive-include src *.h 4 | recursive-include src *.cc 5 | recursive-include src *.cpp 6 | recursive-include src *.pyx 7 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /usr/bin/env bash -c 2 | 3 | MAKEFLAGS += --warn-undefined-variables 4 | MAKEFLAGS += --no-builtin-rules 5 | .DEFAULT_GOAL := help 6 | .SUFFIXES: 7 | 8 | include python.mk 9 | include cpp.mk 10 | 11 | .PHONY: help 12 | help: ## show this message and exit 13 | @grep -E '^[a-zA-Z_0-9%-]+:.*?## .*$$' $(MAKEFILE_LIST) \ 14 | | awk -F':' '{print $$(NF-1)":"$$NF}' | sort \ 15 | | awk 'BEGIN {FS = ":.*?## "}; {printf "$(BOLD)%-24s$(END) %s\n", $$1, $$2}' 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CityHash/FarmHash 2 | 3 | Python wrapper for [FarmHash](https://github.com/google/farmhash) and 4 | [CityHash](https://github.com/google/cityhash), a family of fast 5 | non-cryptographic hash functions. 
6 | 7 | [![Build Status](https://img.shields.io/github/actions/workflow/status/escherba/python-cityhash/build.yml?branch=master)](https://github.com/escherba/python-cityhash/actions/workflows/build.yml) 8 | [![PyPI Version](https://img.shields.io/pypi/v/cityhash.svg)](https://pypi.python.org/pypi/cityhash) 9 | [![Conda-Forge Version](https://anaconda.org/conda-forge/python-cityhash/badges/version.svg)](https://anaconda.org/conda-forge/python-cityhash) 10 | [![Downloads](https://img.shields.io/pypi/dm/cityhash.svg)](https://pypistats.org/packages/cityhash) 11 | [![License](https://img.shields.io/pypi/l/cityhash.svg)](https://opensource.org/licenses/mit-license) 12 | [![Supported Python Versions](https://img.shields.io/pypi/pyversions/cityhash.svg)](https://pypi.python.org/pypi/cityhash) 13 | 14 | ## Getting Started 15 | 16 | To install from PyPI: 17 | 18 | ``` bash 19 | pip install cityhash 20 | ``` 21 | 22 | To install in a Conda environment: 23 | 24 | ``` bash 25 | conda install -c conda-forge python-cityhash 26 | ``` 27 | 28 | The package exposes Python APIs for CityHash and FarmHash under `cityhash` and 29 | `farmhash` namespaces, respectively. Each provides 32-, 64- and 128-bit 30 | implementations. 31 | 32 | ## Usage Examples 33 | 34 | ### Stateless hashing 35 | 36 | Usage example for FarmHash: 37 | 38 | ``` python 39 | >>> from farmhash import FarmHash32, FarmHash64, FarmHash128 40 | >>> FarmHash32("abc") 41 | 1961358185 42 | >>> FarmHash64("abc") 43 | 2640714258260161385 44 | >>> FarmHash128("abc") 45 | 76434233956484675513733017140465933893 46 | 47 | ``` 48 | 49 | ### Hardware-independent fingerprints 50 | 51 | Fingerprints are seedless hashes that are guaranteed to be hardware- and 52 | platform-independent. This can be useful for networking applications which 53 | require persisting hashed values. 
54 | 55 | ``` python 56 | >>> from farmhash import Fingerprint128 57 | >>> Fingerprint128("abc") 58 | 76434233956484675513733017140465933893 59 | 60 | ``` 61 | 62 | ### Incremental hashing 63 | 64 | CityHash and FarmHash do not support incremental hashing and thus are not ideal 65 | for hashing of character streams. If you require incremental hashing, consider 66 | another hashing library, such as 67 | [MetroHash](https://github.com/escherba/python-metrohash) or 68 | [xxHash](https://github.com/ifduyue/python-xxhash). 69 | 70 | ### Fast hashing of NumPy arrays 71 | 72 | The [Buffer Protocol](https://docs.python.org/3/c-api/buffer.html) allows 73 | Python objects to expose their data as raw byte arrays for fast access without 74 | having to copy to a separate location in memory. NumPy is one well-known 75 | library that extensively uses this protocol. 76 | 77 | All hashing functions in this package will read byte arrays from objects that 78 | expose them via the buffer protocol. Here is an example showing hashing of a 79 | three-dimensional NumPy array: 80 | 81 | ``` python 82 | >>> import numpy as np 83 | >>> from farmhash import FarmHash64 84 | >>> arr = np.zeros((256, 256, 4)) 85 | >>> FarmHash64(arr) 86 | 1550282412043536862 87 | 88 | ``` 89 | 90 | The NumPy arrays need to be contiguous for this to work. To convert a 91 | non-contiguous array, use NumPy's `ascontiguousarray()` function. 92 | 93 | ## SSE4.2 support 94 | 95 | For x86-64 platforms, the PyPI repository for this package includes wheels 96 | compiled with SSE4.2 support. The 32- and 64-bit (but not the 128-bit) 97 | variants of FarmHash significantly benefit from SSE4.2 instructions. 98 | 99 | The vanilla CityHash functions (under `cityhash` module) do not take advantage 100 | of SSE4.2. Instead, one can use the `cityhashcrc` module provided with this 101 | package which exposes 128- and 256-bit CRC functions that do harness SSE4.2. 
102 | These functions are very fast, and beat `FarmHash128` on speed (FarmHash does 103 | not include a 256-bit function). Since FarmHash is the intended successor of 104 | CityHash, I would be careful before using the CityHash-CRC functions, however, 105 | and would verify whether they provide sufficient randomness for your intended 106 | application. 107 | 108 | ## Development 109 | 110 | ### Local workflow 111 | 112 | For those wanting to contribute, here is a quick start using Make commands: 113 | 114 | ``` bash 115 | git clone https://github.com/escherba/python-cityhash.git 116 | cd python-cityhash 117 | make env # create a virtual environment 118 | make test # run Python tests 119 | make cpp-test # run C++ tests 120 | make shell # enter IPython shell 121 | ``` 122 | 123 | To find out which Make targets are available, enter: 124 | 125 | ``` bash 126 | make help 127 | ``` 128 | 129 | ### Distribution 130 | 131 | The package wheels are built using 132 | [cibuildwheel](https://cibuildwheel.readthedocs.io/) and are distributed to 133 | PyPI using GitHub actions. The wheels contain compiled binaries and are 134 | available for the following platforms: windows-amd64, ubuntu-x86, 135 | linux-x86\_64, linux-aarch64, and macosx-x86\_64. 136 | 137 | ## See Also 138 | 139 | For other fast non-cryptographic hash functions available as Python extensions, 140 | see [MetroHash](https://github.com/escherba/python-metrohash), 141 | [MurmurHash](https://github.com/hajimes/mmh3), and 142 | [xxHash](https://github.com/ifduyue/python-xxhash). 143 | 144 | ## Authors 145 | 146 | The original CityHash Python bindings are due to Alexander \[Amper\] Marshalov. 147 | They were rewritten in Cython by Eugene Scherba, who also added the FarmHash 148 | bindings. The CityHash and FarmHash algorithms and their C++ implementation are 149 | by Google. 150 | 151 | ## License 152 | 153 | This software is licensed under the [MIT 154 | License](http://www.opensource.org/licenses/mit-license). 
See the included 155 | LICENSE file for details. 156 | -------------------------------------------------------------------------------- /benchmarks/bench_city.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: this benchmark script is based on a similar one for xxHash: 4 | # https://github.com/ifduyue/python-xxhash 5 | 6 | PYTHON=${PYTHON-`which python`} 7 | 8 | echo Benchmarking CityHash... 9 | 10 | echo -n " 64WithSeed 1000B: " 11 | $PYTHON -mtimeit -s 'from cityhash import CityHash64WithSeed as hasher' \ 12 | -s 'import os' \ 13 | -s 'import random' \ 14 | -s 'seed = random.randint(0, 0xffffffffffffffff)' \ 15 | -s 'data = os.urandom(1000)' \ 16 | 'hasher(data, seed=seed)' 17 | 18 | echo -n " 64WithSeed 10000B: " 19 | $PYTHON -mtimeit -s 'from cityhash import CityHash64WithSeed as hasher' \ 20 | -s 'import os' \ 21 | -s 'import random' \ 22 | -s 'seed = random.randint(0, 0xffffffffffffffff)' \ 23 | -s 'data = os.urandom(10000)' \ 24 | 'hasher(data, seed=seed)' 25 | 26 | echo -n " 128WithSeed 1000B: " 27 | $PYTHON -mtimeit -s 'from cityhash import CityHash128WithSeed as hasher' \ 28 | -s 'import os' \ 29 | -s 'import random' \ 30 | -s 'seed = random.randint(0, 0xffffffffffffffff)' \ 31 | -s 'data = os.urandom(1000)' \ 32 | 'hasher(data, seed=seed)' 33 | 34 | echo -n " 128WithSeed 10000B: " 35 | $PYTHON -mtimeit -s 'from cityhash import CityHash128WithSeed as hasher' \ 36 | -s 'import os' \ 37 | -s 'import random' \ 38 | -s 'seed = random.randint(0, 0xffffffffffffffff)' \ 39 | -s 'data = os.urandom(10000)' \ 40 | 'hasher(data, seed=seed)' 41 | 42 | echo -n " Crc128WithSeed 1000B: " 43 | $PYTHON -mtimeit -s 'from cityhashcrc import CityHashCrc128WithSeed as hasher' \ 44 | -s 'import os' \ 45 | -s 'import random' \ 46 | -s 'seed = random.randint(0, 0xffffffffffffffff)' \ 47 | -s 'data = os.urandom(1000)' \ 48 | 'hasher(data, seed=seed)' 49 | 50 | echo -n " Crc128WithSeed 10000B: " 51 | $PYTHON -mtimeit 
-s 'from cityhashcrc import CityHashCrc128WithSeed as hasher' \ 52 | -s 'import os' \ 53 | -s 'import random' \ 54 | -s 'seed = random.randint(0, 0xffffffffffffffff)' \ 55 | -s 'data = os.urandom(10000)' \ 56 | 'hasher(data, seed=seed)' 57 | -------------------------------------------------------------------------------- /benchmarks/bench_farm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: this benchmark script is based on a similar one for xxHash: 4 | # https://github.com/ifduyue/python-xxhash 5 | 6 | PYTHON=${PYTHON-`which python`} 7 | 8 | echo Benchmarking FarmHash... 9 | 10 | echo -n " 32WithSeed 1000B: " 11 | $PYTHON -mtimeit -s 'from farmhash import FarmHash32WithSeed as hasher' \ 12 | -s 'import os' \ 13 | -s 'import random' \ 14 | -s 'seed = random.randint(0, 0xffffffff)' \ 15 | -s 'data = os.urandom(1000)' \ 16 | 'hasher(data, seed=seed)' 17 | 18 | echo -n " 32WithSeed 10000B: " 19 | $PYTHON -mtimeit -s 'from farmhash import FarmHash32WithSeed as hasher' \ 20 | -s 'import os' \ 21 | -s 'import random' \ 22 | -s 'seed = random.randint(0, 0xffffffff)' \ 23 | -s 'data = os.urandom(10000)' \ 24 | 'hasher(data, seed=seed)' 25 | 26 | echo -n " 64WithSeed 1000B: " 27 | $PYTHON -mtimeit -s 'from farmhash import FarmHash64WithSeed as hasher' \ 28 | -s 'import os' \ 29 | -s 'import random' \ 30 | -s 'seed = random.randint(0, 0xffffffffffffffff)' \ 31 | -s 'data = os.urandom(1000)' \ 32 | 'hasher(data, seed=seed)' 33 | 34 | echo -n " 64WithSeed 10000B: " 35 | $PYTHON -mtimeit -s 'from farmhash import FarmHash64WithSeed as hasher' \ 36 | -s 'import os' \ 37 | -s 'import random' \ 38 | -s 'seed = random.randint(0, 0xffffffffffffffff)' \ 39 | -s 'data = os.urandom(10000)' \ 40 | 'hasher(data, seed=seed)' 41 | 42 | 43 | echo -n " 128WithSeed 1000B: " 44 | $PYTHON -mtimeit -s 'from farmhash import FarmHash128WithSeed as hasher' \ 45 | -s 'import os' \ 46 | -s 'import random' \ 47 | -s 'seed = 
random.randint(0, 0xffffffffffffffff)' \ 48 | -s 'data = os.urandom(1000)' \ 49 | 'hasher(data, seed=seed)' 50 | 51 | echo -n " 128WithSeed 10000B: " 52 | $PYTHON -mtimeit -s 'from farmhash import FarmHash128WithSeed as hasher' \ 53 | -s 'import os' \ 54 | -s 'import random' \ 55 | -s 'seed = random.randint(0, 0xffffffffffffffff)' \ 56 | -s 'data = os.urandom(10000)' \ 57 | 'hasher(data, seed=seed)' 58 | -------------------------------------------------------------------------------- /cpp.mk: -------------------------------------------------------------------------------- 1 | CXX := g++ 2 | CXXFLAGS := -std=c++11 -O3 -msse4.2 3 | LDFLAGS := 4 | SRCEXT := cc 5 | INC := -I src 6 | LIB := -L lib 7 | 8 | INPUT := ./data/sample_100k.txt 9 | 10 | BINDIR := bin 11 | SRCDIR := src 12 | TESTDIR := tests 13 | BUILDDIR := build 14 | ALL_SOURCES := $(wildcard $(SRCDIR)/*.$(SRCEXT) $(TESTDIR)/*.$(SRCEXT)) 15 | 16 | RUN_SOURCES := $(wildcard $(SRCDIR)/*_main.$(SRCEXT) $(TESTDIR)/*_main.$(SRCEXT)) 17 | RUN_OBJECTS := $(patsubst %, $(BUILDDIR)/%, $(RUN_SOURCES:.$(SRCEXT)=.o)) 18 | RUN_TARGETS := $(patsubst $(BUILDDIR)/%.o, $(BINDIR)/%, $(RUN_OBJECTS)) 19 | 20 | TEST_SOURCES := $(wildcard $(TESTDIR)/test_*.$(SRCEXT)) 21 | TEST_OBJECTS := $(patsubst %, $(BUILDDIR)/%, $(TEST_SOURCES:.$(SRCEXT)=.o)) 22 | TEST_TARGETS := $(patsubst $(BUILDDIR)/%.o, $(BINDIR)/%, $(TEST_OBJECTS)) 23 | 24 | SOURCES := $(filter-out $(RUN_SOURCES) $(TEST_SOURCES), $(ALL_SOURCES)) 25 | OBJECTS := $(patsubst %, $(BUILDDIR)/%, $(SOURCES:.$(SRCEXT)=.o)) 26 | 27 | .SECONDARY: $(RUN_OBJECTS) $(TEST_OBJECTS) $(OBJECTS) 28 | 29 | $(BUILDDIR)/%.o: %.$(SRCEXT) 30 | @mkdir -p $(dir $@) 31 | $(CC) $(INC) $(CXXFLAGS) -c $< -o $@ 32 | 33 | $(BINDIR)/%: $(BUILDDIR)/%.o $(OBJECTS) 34 | @mkdir -p $(dir $@) 35 | $(CXX) $(LIB) $(LDFLAGS) $^ -o $@ 36 | 37 | .PHONY: cpp-clean 38 | cpp-clean: ## clean up C++ project 39 | rm -rf ./$(BINDIR)/ ./$(BUILDDIR)/ 40 | 41 | .PHONY: cpp-run 42 | cpp-run: $(RUN_TARGETS) ## compile and run 
C++ program 43 | @for target in $(RUN_TARGETS); do \ 44 | echo $$target >&2; \ 45 | time ./$$target $(INPUT); \ 46 | done 47 | 48 | .PHONY: cpp-test 49 | cpp-test: $(TEST_TARGETS) ## run C++ tests 50 | @for target in $(TEST_TARGETS); do \ 51 | echo $$target >&2; \ 52 | ./$$target; \ 53 | done 54 | -------------------------------------------------------------------------------- /pip-freeze.txt: -------------------------------------------------------------------------------- 1 | asttokens==2.4.1 2 | -e git+ssh://git@github.com/escherba/python-cityhash@1f3ddd1e410697d7ccc9882ad67fe6bf558232b2#egg=cityhash 3 | Cython==3.0.10 4 | decorator==5.1.1 5 | executing==2.0.1 6 | iniconfig==2.0.0 7 | ipdb==0.13.13 8 | ipython==8.23.0 9 | jedi==0.19.1 10 | matplotlib-inline==0.1.6 11 | numpy==1.26.4 12 | packaging==24.0 13 | parso==0.8.4 14 | pexpect==4.9.0 15 | pluggy==1.4.0 16 | prompt-toolkit==3.0.43 17 | ptyprocess==0.7.0 18 | pure-eval==0.2.2 19 | py-cpuinfo==9.0.0 20 | Pygments==2.17.2 21 | pytest==8.1.1 22 | setuptools==69.2.0 23 | six==1.16.0 24 | stack-data==0.6.3 25 | traitlets==5.14.2 26 | wcwidth==0.2.13 27 | wheel==0.45.1 28 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "setuptools.build_meta" 3 | requires = [ 4 | "Cython", 5 | "py-cpuinfo", 6 | "setuptools", 7 | "wheel", 8 | ] 9 | 10 | [tool.pytest.ini_options] 11 | addopts = "-s --doctest-modules" 12 | testpaths = [ 13 | "src", 14 | "tests", 15 | ] 16 | 17 | [tool.cibuildwheel] 18 | test-requires = "pytest" 19 | -------------------------------------------------------------------------------- /python.mk: -------------------------------------------------------------------------------- 1 | PYMODULE := cityhash 2 | EXTENSION := $(PYMODULE).so 3 | SRC_DIR := src 4 | PYPI_URL := https://test.pypi.org/legacy/ 5 | EXTENSION_DEPS := $(shell find 
$(SRC_DIR) -type f -name "*.pyx") 6 | EXTENSION_INTERMEDIATE := $(patsubst %.pyx,%.cpp,$(EXTENSION_DEPS)) 7 | EXTENSION_OBJS := $(patsubst %.pyx,%.so,$(EXTENSION_DEPS)) 8 | 9 | BUILD_STAMP = .build_stamp 10 | ENV_STAMP = env/bin/activate 11 | 12 | DISTRIBUTE := sdist bdist_wheel 13 | 14 | PYENV := PYTHONPATH=. . env/bin/activate; 15 | INTERPRETER := python3 16 | PACKAGE_MGR := pip3 17 | PYVERSION := $(shell $(INTERPRETER) --version 2>&1) 18 | PYTHON := $(PYENV) $(INTERPRETER) 19 | PIP := $(PYENV) $(PACKAGE_MGR) 20 | 21 | VENV_OPTS := "" 22 | ifeq ($(PIP_SYSTEM_SITE_PACKAGES),1) 23 | VENV_OPTS += --system-site-packages 24 | endif 25 | 26 | BOLD := $(shell tput bold) 27 | END := $(shell tput sgr0) 28 | 29 | .PHONY: package 30 | package: $(DISTRIBUTE) ## package for distribution (deprecated) 31 | $(DISTRIBUTE): $(BUILD_STAMP) | $(ENV_STAMP) 32 | @echo "Packaging using $(PYVERSION)" 33 | $(PYTHON) setup.py $(DISTRIBUTE) 34 | 35 | .PHONY: release 36 | release: $(BUILD_STAMP) | $(ENV_STAMP) ## upload package to PyPI (deprecated) 37 | @echo "Releasing using $(PYVERSION)" 38 | $(PYTHON) setup.py $(DISTRIBUTE) upload -r $(PYPI_URL) 39 | 40 | .PHONY: shell 41 | shell: build ## open Python shell within the virtual environment 42 | @echo "Using $(PYVERSION)" 43 | $(PYENV) python 44 | 45 | .PHONY: build 46 | build: $(EXTENSION_OBJS) ## build C extension(s) 47 | @echo "completed $@ target" 48 | 49 | $(BUILD_STAMP): $(EXTENSION_DEPS) | $(ENV_STAMP) 50 | @echo "Building using $(PYVERSION)" 51 | $(PYTHON) setup.py build_ext --inplace 52 | @echo "$(shell date -u +'%Y-%m-%dT%H:%M:%SZ')" > $@ 53 | 54 | $(EXTENSION_OBJS): $(BUILD_STAMP) 55 | @echo "done building $@" 56 | 57 | .PHONY: test 58 | test: build ## run Python unit tests 59 | $(PYENV) pytest 60 | 61 | .PHONY: nuke 62 | nuke: clean ## clean and remove virtual environment 63 | rm -f $(BUILD_STAMP) $(EXTENSION_INTERMEDIATE) 64 | rm -rf *.egg *.egg-info env 65 | find $(SRC_DIR) -depth -type d -name *.egg-info -exec rm -rf {} \; 66 
| 67 | .PHONY: clean 68 | clean: ## remove temporary files 69 | $(PYTHON) setup.py clean 70 | rm -rf dist build __pycache__ 71 | rm -f *.so 72 | find $(SRC_DIR) -type f -name "*.pyc" -exec rm {} \; 73 | find $(SRC_DIR) -type f -name "*.cpp" -exec rm {} \; 74 | find $(SRC_DIR) -type f -name "*.so" -exec rm {} \; 75 | 76 | .PHONY: install 77 | install: $(BUILD_STAMP) ## install package 78 | $(PIP) install -e . 79 | 80 | .PRECIOUS: $(ENV_STAMP) 81 | .PHONY: env 82 | env: $(ENV_STAMP) ## set up a virtual environment 83 | $(ENV_STAMP): setup.py requirements.txt 84 | test -f $@ || $(INTERPRETER) -m venv $(VENV_OPTS) env 85 | $(PIP) install -U pip wheel 86 | export SETUPTOOLS_USE_DISTUTILS=stdlib; $(PIP) install -r requirements.txt 87 | $(PIP) freeze > pip-freeze.txt 88 | $(PIP) install -e . 89 | touch $@ 90 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython 2 | ipdb 3 | ipython 4 | numpy 5 | py-cpuinfo 6 | pytest 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import platform 5 | import struct 6 | 7 | from setuptools import setup 8 | from setuptools.dist import Distribution 9 | from setuptools.extension import Extension 10 | 11 | try: 12 | from cpuinfo import get_cpu_info 13 | 14 | CPU_FLAGS = get_cpu_info()["flags"] 15 | except Exception as exc: 16 | print("exception loading cpuinfo", exc) 17 | CPU_FLAGS = {} 18 | 19 | try: 20 | from Cython.Distutils import build_ext 21 | 22 | USE_CYTHON = True 23 | except ImportError: 24 | USE_CYTHON = False 25 | 26 | 27 | class BinaryDistribution(Distribution): 28 | """ 29 | Subclass the setuptools Distribution to flip the purity flag to false. 
30 | See https://lucumr.pocoo.org/2014/1/27/python-on-wheels/ 31 | """ 32 | 33 | def is_pure(self): 34 | """Returns purity flag""" 35 | return False 36 | 37 | 38 | def get_system_bits(): 39 | """Return 32 for 32-bit systems and 64 for 64-bit""" 40 | return struct.calcsize("P") * 8 41 | 42 | 43 | SYSTEM = os.name 44 | BITS = get_system_bits() 45 | HAVE_SSE42 = "sse4_2" in CPU_FLAGS 46 | HAVE_AES = "aes" in CPU_FLAGS 47 | 48 | CXXFLAGS = [] 49 | 50 | print("system: %s-%d" % (SYSTEM, BITS)) 51 | print("available CPU flags:", CPU_FLAGS) 52 | print("environment:", ", ".join(["%s=%s" % (k, v) for k, v in os.environ.items()])) 53 | 54 | if SYSTEM == "nt": 55 | CXXFLAGS.extend(["/O2"]) 56 | else: 57 | CXXFLAGS.extend( 58 | [ 59 | "-O3", 60 | "-Wno-unused-value", 61 | "-Wno-unused-function", 62 | ] 63 | ) 64 | 65 | # The "cibuildwheel" tool sets AUDITWHEEL_ARCH variable to architecture strings 66 | # such as 'x86_64', 'aarch64', 'i686', etc. If this variable is not set, we 67 | # assume that the build is not a CI build and target current machine 68 | # architecture. 
69 | TARGET_ARCH = os.environ.get("AUDITWHEEL_ARCH", platform.machine()) 70 | print("building for target architecture:", TARGET_ARCH) 71 | 72 | if HAVE_SSE42 and (TARGET_ARCH == "x86_64") and (BITS == 64): 73 | print("enabling SSE4.2 on compile") 74 | if SYSTEM == "nt": 75 | CXXFLAGS.append("/D__SSE4_2__") 76 | else: 77 | CXXFLAGS.append("-msse4.2") 78 | 79 | if HAVE_AES and (TARGET_ARCH == "x86_64") and (BITS == 64): 80 | print("enabling AES on compile") 81 | if SYSTEM == "nt": 82 | CXXFLAGS.append("/D__AES__") 83 | else: 84 | CXXFLAGS.append("-maes") 85 | 86 | if USE_CYTHON: 87 | print("building extension using Cython") 88 | CMDCLASS = {"build_ext": build_ext} 89 | SRC_EXT = ".pyx" 90 | else: 91 | print("building extension w/o Cython") 92 | CMDCLASS = {} 93 | SRC_EXT = ".cpp" 94 | 95 | EXT_MODULES = [ 96 | Extension( 97 | "cityhash", 98 | ["src/city.cc", "src/cityhash" + SRC_EXT], 99 | depends=["src/city.h"], 100 | language="c++", 101 | extra_compile_args=CXXFLAGS, 102 | include_dirs=["src"], 103 | ), 104 | Extension( 105 | "farmhash", 106 | ["src/farm.cc", "src/farmhash" + SRC_EXT], 107 | depends=["src/farm.h"], 108 | language="c++", 109 | extra_compile_args=CXXFLAGS, 110 | include_dirs=["src"], 111 | ), 112 | ] 113 | 114 | if HAVE_SSE42 and (TARGET_ARCH == "x86_64") and (BITS == 64): 115 | EXT_MODULES.append( 116 | Extension( 117 | "cityhashcrc", 118 | ["src/city.cc", "src/cityhashcrc" + SRC_EXT], 119 | depends=[ 120 | "src/city.h", 121 | "src/citycrc.h", 122 | ], 123 | language="c++", 124 | extra_compile_args=CXXFLAGS, 125 | include_dirs=["src"], 126 | ) 127 | ) 128 | 129 | 130 | VERSION = "0.4.8" 131 | URL = "https://github.com/escherba/python-cityhash" 132 | 133 | 134 | def get_long_description(relpath, encoding="utf-8"): 135 | _long_desc = """ 136 | 137 | """ 138 | fname = os.path.join(os.path.dirname(__file__), relpath) 139 | try: 140 | with open(fname, "rb") as fh: 141 | return fh.read().decode(encoding) 142 | except Exception: 143 | return _long_desc 144 
| 145 | 146 | setup( 147 | version=VERSION, 148 | description="Python bindings for CityHash and FarmHash", 149 | author="Eugene Scherba", 150 | author_email="escherba+cityhash@gmail.com", 151 | url=URL, 152 | download_url=URL + "/tarball/master/" + VERSION, 153 | name="cityhash", 154 | license="MIT", 155 | python_requires='>=3.6', 156 | zip_safe=False, 157 | cmdclass=CMDCLASS, 158 | ext_modules=EXT_MODULES, 159 | package_dir={"": "src"}, 160 | keywords=[ 161 | "google", 162 | "hash", 163 | "hashing", 164 | "cityhash", 165 | "farmhash", 166 | "murmurhash", 167 | "cython", 168 | ], 169 | classifiers=[ 170 | "Development Status :: 5 - Production/Stable", 171 | "Intended Audience :: Developers", 172 | "Intended Audience :: Science/Research", 173 | "License :: OSI Approved :: MIT License", 174 | "Operating System :: OS Independent", 175 | "Programming Language :: C++", 176 | "Programming Language :: Cython", 177 | "Programming Language :: Python :: 3.6", 178 | "Programming Language :: Python :: 3.7", 179 | "Programming Language :: Python :: 3.8", 180 | "Programming Language :: Python :: 3.9", 181 | "Programming Language :: Python :: 3.10", 182 | "Programming Language :: Python :: 3.11", 183 | "Programming Language :: Python :: 3.12", 184 | "Programming Language :: Python :: 3.13", 185 | "Topic :: Scientific/Engineering :: Information Analysis", 186 | "Topic :: Software Development :: Libraries", 187 | "Topic :: System :: Distributed Computing", 188 | ], 189 | long_description=get_long_description("README.md"), 190 | long_description_content_type="text/markdown", 191 | tests_require=["pytest"], 192 | distclass=BinaryDistribution, 193 | ) 194 | -------------------------------------------------------------------------------- /src/city.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 Google, Inc. 
2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | // 21 | // CityHash, by Geoff Pike and Jyrki Alakuijala 22 | // 23 | // This file provides CityHash64() and related functions. 24 | // 25 | // It's probably possible to create even faster hash functions by 26 | // writing a program that systematically explores some of the space of 27 | // possible hash functions, by using SIMD instructions, or by 28 | // compromising on hash quality. 
29 | 30 | #include 31 | 32 | #include 33 | #include // for memcpy and memset 34 | 35 | using namespace std; 36 | 37 | static uint64 UNALIGNED_LOAD64(const char *p) { 38 | uint64 result; 39 | memcpy(&result, p, sizeof(result)); 40 | return result; 41 | } 42 | 43 | static uint32 UNALIGNED_LOAD32(const char *p) { 44 | uint32 result; 45 | memcpy(&result, p, sizeof(result)); 46 | return result; 47 | } 48 | 49 | #ifdef _MSC_VER 50 | 51 | #include 52 | #define bswap_32(x) _byteswap_ulong(x) 53 | #define bswap_64(x) _byteswap_uint64(x) 54 | 55 | #elif defined(__APPLE__) 56 | 57 | // Mac OS X / Darwin features 58 | #include 59 | #define bswap_32(x) OSSwapInt32(x) 60 | #define bswap_64(x) OSSwapInt64(x) 61 | 62 | #elif defined(__sun) || defined(sun) 63 | 64 | #include 65 | #define bswap_32(x) BSWAP_32(x) 66 | #define bswap_64(x) BSWAP_64(x) 67 | 68 | #elif defined(__FreeBSD__) 69 | 70 | #include 71 | #define bswap_32(x) bswap32(x) 72 | #define bswap_64(x) bswap64(x) 73 | 74 | #elif defined(__OpenBSD__) 75 | 76 | #include 77 | #define bswap_32(x) swap32(x) 78 | #define bswap_64(x) swap64(x) 79 | 80 | #elif defined(__NetBSD__) 81 | 82 | #include 83 | #include 84 | #if defined(__BSWAP_RENAME) && !defined(__bswap_32) 85 | #define bswap_32(x) bswap32(x) 86 | #define bswap_64(x) bswap64(x) 87 | #endif 88 | 89 | #else 90 | 91 | #include 92 | 93 | #endif 94 | 95 | #ifdef WORDS_BIGENDIAN 96 | #define uint32_in_expected_order(x) (bswap_32(x)) 97 | #define uint64_in_expected_order(x) (bswap_64(x)) 98 | #else 99 | #define uint32_in_expected_order(x) (x) 100 | #define uint64_in_expected_order(x) (x) 101 | #endif 102 | 103 | #if !defined(LIKELY) 104 | //#if HAVE_BUILTIN_EXPECT 105 | #if defined(__GNUC__) || defined(__ICL) || defined(__clang__) 106 | #define LIKELY(x) (__builtin_expect(!!(x), 1)) 107 | #else 108 | #define LIKELY(x) (x) 109 | #endif 110 | #endif 111 | 112 | static uint64 Fetch64(const char *p) { 113 | return uint64_in_expected_order(UNALIGNED_LOAD64(p)); 114 | } 115 | 116 | 
// Reads a 32-bit word from p (unaligned access allowed), byte-swapped when
// WORDS_BIGENDIAN is defined.
static uint32 Fetch32(const char *p) {
  return uint32_in_expected_order(UNALIGNED_LOAD32(p));
}

// Some primes between 2^63 and 2^64 for various uses.
static const uint64 k0 = 0xc3a5c85c97cb3127ULL;
static const uint64 k1 = 0xb492b66fbe98f273ULL;
static const uint64 k2 = 0x9ae16a3b2f90404fULL;

// Magic numbers for 32-bit hashing.  Copied from Murmur3.
static const uint32 c1 = 0xcc9e2d51;
static const uint32 c2 = 0x1b873593;

// A 32-bit to 32-bit integer hash copied from Murmur3.
static uint32 fmix(uint32 h)
{
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;
  return h;
}

// Bitwise right rotate of a 32-bit value by `shift` bits.
static uint32 Rotate32(uint32 val, int shift) {
  // Avoid shifting by 32: doing so yields an undefined result.
  return shift == 0 ? val : ((val >> shift) | (val << (32 - shift)));
}

// Cyclically permutes three lvalues in place: afterwards a holds the old c,
// b holds the old a, and c holds the old b.
#undef PERMUTE3
#define PERMUTE3(a, b, c) do { std::swap(a, b); std::swap(a, c); } while (0)

static uint32 Mur(uint32 a, uint32 h) {
  // Helper from Murmur3 for combining two 32-bit values.
  a *= c1;
  a = Rotate32(a, 17);
  a *= c2;
  h ^= a;
  h = Rotate32(h, 19);
  return h * 5 + 0xe6546b64;
}

// 32-bit hash for inputs of 13 to 24 bytes (as dispatched by CityHash32).
// Six 4-byte words, some of them overlapping, are read from the start, the
// middle and the end of the input and folded together with Mur().
static uint32 Hash32Len13to24(const char *s, size_t len) {
  uint32 a = Fetch32(s - 4 + (len >> 1));
  uint32 b = Fetch32(s + 4);
  uint32 c = Fetch32(s + len - 8);
  uint32 d = Fetch32(s + (len >> 1));
  uint32 e = Fetch32(s);
  uint32 f = Fetch32(s + len - 4);
  uint32 h = len;

  return fmix(Mur(f, Mur(e, Mur(d, Mur(c, Mur(b, Mur(a, h)))))));
}

// 32-bit hash for inputs of 0 to 4 bytes, consumed one byte at a time.
static uint32 Hash32Len0to4(const char *s, size_t len) {
  uint32 b = 0;
  uint32 c = 9;
  for (size_t i = 0; i < len; i++) {
    // Each byte is interpreted as signed before being mixed in.
    signed char v = s[i];
    b = b * c1 + v;
    c ^= b;
  }
  return fmix(Mur(b, Mur(len, c)));
}

// 32-bit hash for inputs of 5 to 12 bytes: mixes the first word, the last
// word, and a word at offset 0 or 4 (selected by (len >> 1) & 4).
static uint32 Hash32Len5to12(const char *s, size_t len) {
  uint32 a = len, b = len * 5, c = 9, d = b;
  a += Fetch32(s);
  b += Fetch32(s + len - 4);
  c += Fetch32(s + ((len >> 1) & 4));
  return fmix(Mur(c, Mur(b, Mur(a, d))));
}

// Computes a 32-bit hash of the len bytes at s.
// Inputs of up to 24 bytes are delegated to the length-specific helpers
// above; longer inputs first mix the final 20 bytes into the state (h, g, f)
// and then consume the input from the front, 20 bytes per iteration.
uint32 CityHash32(const char *s, size_t len) {
  if (len <= 24) {
    return len <= 12 ?
        (len <= 4 ? Hash32Len0to4(s, len) : Hash32Len5to12(s, len)) :
        Hash32Len13to24(s, len);
  }

  // len > 24
  uint32 h = len, g = c1 * len, f = g;
  // Seed the state from the last 20 bytes of the input.
  uint32 a0 = Rotate32(Fetch32(s + len - 4) * c1, 17) * c2;
  uint32 a1 = Rotate32(Fetch32(s + len - 8) * c1, 17) * c2;
  uint32 a2 = Rotate32(Fetch32(s + len - 16) * c1, 17) * c2;
  uint32 a3 = Rotate32(Fetch32(s + len - 12) * c1, 17) * c2;
  uint32 a4 = Rotate32(Fetch32(s + len - 20) * c1, 17) * c2;
  h ^= a0;
  h = Rotate32(h, 19);
  h = h * 5 + 0xe6546b64;
  h ^= a2;
  h = Rotate32(h, 19);
  h = h * 5 + 0xe6546b64;
  g ^= a1;
  g = Rotate32(g, 19);
  g = g * 5 + 0xe6546b64;
  g ^= a3;
  g = Rotate32(g, 19);
  g = g * 5 + 0xe6546b64;
  f += a4;
  f = Rotate32(f, 19);
  f = f * 5 + 0xe6546b64;
  // Number of 20-byte blocks consumed from the front of the input.
  size_t iters = (len - 1) / 20;
  do {
    // These locals deliberately shadow the tail words declared above.
    uint32 a0 = Rotate32(Fetch32(s) * c1, 17) * c2;
    uint32 a1 = Fetch32(s + 4);
    uint32 a2 = Rotate32(Fetch32(s + 8) * c1, 17) * c2;
    uint32 a3 = Rotate32(Fetch32(s + 12) * c1, 17) * c2;
    uint32 a4 = Fetch32(s + 16);
    h ^= a0;
    h = Rotate32(h, 18);
    h = h * 5 + 0xe6546b64;
    f += a1;
    f = Rotate32(f, 19);
    f = f * c1;
    g += a2;
    g = Rotate32(g, 18);
    g = g * 5 + 0xe6546b64;
    h ^= a3 + a1;
    h = Rotate32(h, 19);
    h = h * 5 + 0xe6546b64;
    g ^= a4;
    g = bswap_32(g) * 5;
    h += a4 * 5;
    h = bswap_32(h);
    f += a0;
    PERMUTE3(f, h, g);
    s += 20;
  } while (--iters != 0);
  // Final avalanche: fold g and f into h.
  g = Rotate32(g, 11) * c1;
  g = Rotate32(g, 17) * c1;
  f = Rotate32(f, 11) * c1;
  f = Rotate32(f, 17) * c1;
  h = Rotate32(h + g, 19);
  h = h * 5 + 0xe6546b64;
  h = Rotate32(h, 17) * c1;
  h = Rotate32(h + f, 19);
  h = h * 5 + 0xe6546b64;
  h = Rotate32(h, 17) * c1;
  return h;
}

// Bitwise right rotate.
Normally this will compile to a single 259 | // instruction, especially if the shift is a manifest constant. 260 | static uint64 Rotate(uint64 val, int shift) { 261 | // Avoid shifting by 64: doing so yields an undefined result. 262 | return shift == 0 ? val : ((val >> shift) | (val << (64 - shift))); 263 | } 264 | 265 | static uint64 ShiftMix(uint64 val) { 266 | return val ^ (val >> 47); 267 | } 268 | 269 | static uint64 HashLen16(uint64 u, uint64 v) { 270 | return Hash128to64(uint128(u, v)); 271 | } 272 | 273 | static uint64 HashLen16(uint64 u, uint64 v, uint64 mul) { 274 | // Murmur-inspired hashing. 275 | uint64 a = (u ^ v) * mul; 276 | a ^= (a >> 47); 277 | uint64 b = (v ^ a) * mul; 278 | b ^= (b >> 47); 279 | b *= mul; 280 | return b; 281 | } 282 | 283 | static uint64 HashLen0to16(const char *s, size_t len) { 284 | if (len >= 8) { 285 | uint64 mul = k2 + len * 2; 286 | uint64 a = Fetch64(s) + k2; 287 | uint64 b = Fetch64(s + len - 8); 288 | uint64 c = Rotate(b, 37) * mul + a; 289 | uint64 d = (Rotate(a, 25) + b) * mul; 290 | return HashLen16(c, d, mul); 291 | } 292 | if (len >= 4) { 293 | uint64 mul = k2 + len * 2; 294 | uint64 a = Fetch32(s); 295 | return HashLen16(len + (a << 3), Fetch32(s + len - 4), mul); 296 | } 297 | if (len > 0) { 298 | uint8 a = s[0]; 299 | uint8 b = s[len >> 1]; 300 | uint8 c = s[len - 1]; 301 | uint32 y = static_cast(a) + (static_cast(b) << 8); 302 | uint32 z = len + (static_cast(c) << 2); 303 | return ShiftMix(y * k2 ^ z * k0) * k2; 304 | } 305 | return k2; 306 | } 307 | 308 | // This probably works well for 16-byte strings as well, but it may be overkill 309 | // in that case. 
310 | static uint64 HashLen17to32(const char *s, size_t len) { 311 | uint64 mul = k2 + len * 2; 312 | uint64 a = Fetch64(s) * k1; 313 | uint64 b = Fetch64(s + 8); 314 | uint64 c = Fetch64(s + len - 8) * mul; 315 | uint64 d = Fetch64(s + len - 16) * k2; 316 | return HashLen16(Rotate(a + b, 43) + Rotate(c, 30) + d, 317 | a + Rotate(b + k2, 18) + c, mul); 318 | } 319 | 320 | // Return a 16-byte hash for 48 bytes. Quick and dirty. 321 | // Callers do best to use "random-looking" values for a and b. 322 | static pair WeakHashLen32WithSeeds( 323 | uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) { 324 | a += w; 325 | b = Rotate(b + a + z, 21); 326 | uint64 c = a; 327 | a += x; 328 | a += y; 329 | b += Rotate(a, 44); 330 | return make_pair(a + z, b + c); 331 | } 332 | 333 | // Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. 334 | static pair WeakHashLen32WithSeeds( 335 | const char* s, uint64 a, uint64 b) { 336 | return WeakHashLen32WithSeeds(Fetch64(s), 337 | Fetch64(s + 8), 338 | Fetch64(s + 16), 339 | Fetch64(s + 24), 340 | a, 341 | b); 342 | } 343 | 344 | // Return an 8-byte hash for 33 to 64 bytes. 
345 | static uint64 HashLen33to64(const char *s, size_t len) { 346 | uint64 mul = k2 + len * 2; 347 | uint64 a = Fetch64(s) * k2; 348 | uint64 b = Fetch64(s + 8); 349 | uint64 c = Fetch64(s + len - 24); 350 | uint64 d = Fetch64(s + len - 32); 351 | uint64 e = Fetch64(s + 16) * k2; 352 | uint64 f = Fetch64(s + 24) * 9; 353 | uint64 g = Fetch64(s + len - 8); 354 | uint64 h = Fetch64(s + len - 16) * mul; 355 | uint64 u = Rotate(a + g, 43) + (Rotate(b, 30) + c) * 9; 356 | uint64 v = ((a + g) ^ d) + f + 1; 357 | uint64 w = bswap_64((u + v) * mul) + h; 358 | uint64 x = Rotate(e + f, 42) + c; 359 | uint64 y = (bswap_64((v + w) * mul) + g) * mul; 360 | uint64 z = e + f + c; 361 | a = bswap_64((x + z) * mul + y) + b; 362 | b = ShiftMix((z + a) * mul + d + h) * mul; 363 | return b + x; 364 | } 365 | 366 | uint64 CityHash64(const char *s, size_t len) { 367 | if (len <= 32) { 368 | if (len <= 16) { 369 | return HashLen0to16(s, len); 370 | } else { 371 | return HashLen17to32(s, len); 372 | } 373 | } else if (len <= 64) { 374 | return HashLen33to64(s, len); 375 | } 376 | 377 | // For strings over 64 bytes we hash the end first, and then as we 378 | // loop we keep 56 bytes of state: v, w, x, y, and z. 379 | uint64 x = Fetch64(s + len - 40); 380 | uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56); 381 | uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24)); 382 | pair v = WeakHashLen32WithSeeds(s + len - 64, len, z); 383 | pair w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x); 384 | x = x * k1 + Fetch64(s); 385 | 386 | // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. 
387 | len = (len - 1) & ~static_cast(63); 388 | do { 389 | x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; 390 | y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; 391 | x ^= w.second; 392 | y += v.first + Fetch64(s + 40); 393 | z = Rotate(z + w.first, 33) * k1; 394 | v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); 395 | w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); 396 | std::swap(z, x); 397 | s += 64; 398 | len -= 64; 399 | } while (len != 0); 400 | return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z, 401 | HashLen16(v.second, w.second) + x); 402 | } 403 | 404 | uint64 CityHash64WithSeed(const char *s, size_t len, uint64 seed) { 405 | return CityHash64WithSeeds(s, len, k2, seed); 406 | } 407 | 408 | uint64 CityHash64WithSeeds(const char *s, size_t len, 409 | uint64 seed0, uint64 seed1) { 410 | return HashLen16(CityHash64(s, len) - seed0, seed1); 411 | } 412 | 413 | // A subroutine for CityHash128(). Returns a decent 128-bit hash for strings 414 | // of any length representable in signed long. Based on City and Murmur. 415 | static uint128 CityMurmur(const char *s, size_t len, uint128 seed) { 416 | uint64 a = Uint128Low64(seed); 417 | uint64 b = Uint128High64(seed); 418 | uint64 c = 0; 419 | uint64 d = 0; 420 | signed long l = len - 16; 421 | if (l <= 0) { // len <= 16 422 | a = ShiftMix(a * k1) * k1; 423 | c = b * k1 + HashLen0to16(s, len); 424 | d = ShiftMix(a + (len >= 8 ? 
Fetch64(s) : c)); 425 | } else { // len > 16 426 | c = HashLen16(Fetch64(s + len - 8) + k1, a); 427 | d = HashLen16(b + len, c + Fetch64(s + len - 16)); 428 | a += d; 429 | do { 430 | a ^= ShiftMix(Fetch64(s) * k1) * k1; 431 | a *= k1; 432 | b ^= a; 433 | c ^= ShiftMix(Fetch64(s + 8) * k1) * k1; 434 | c *= k1; 435 | d ^= c; 436 | s += 16; 437 | l -= 16; 438 | } while (l > 0); 439 | } 440 | a = HashLen16(a, c); 441 | b = HashLen16(d, b); 442 | return uint128(a ^ b, HashLen16(b, a)); 443 | } 444 | 445 | uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed) { 446 | if (len < 128) { 447 | return CityMurmur(s, len, seed); 448 | } 449 | 450 | // We expect len >= 128 to be the common case. Keep 56 bytes of state: 451 | // v, w, x, y, and z. 452 | pair v, w; 453 | uint64 x = Uint128Low64(seed); 454 | uint64 y = Uint128High64(seed); 455 | uint64 z = len * k1; 456 | v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s); 457 | v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8); 458 | w.first = Rotate(y + z, 35) * k1 + x; 459 | w.second = Rotate(x + Fetch64(s + 88), 53) * k1; 460 | 461 | // This is the same inner loop as CityHash64(), manually unrolled. 
462 | do { 463 | x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; 464 | y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; 465 | x ^= w.second; 466 | y += v.first + Fetch64(s + 40); 467 | z = Rotate(z + w.first, 33) * k1; 468 | v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); 469 | w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); 470 | std::swap(z, x); 471 | s += 64; 472 | x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; 473 | y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; 474 | x ^= w.second; 475 | y += v.first + Fetch64(s + 40); 476 | z = Rotate(z + w.first, 33) * k1; 477 | v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); 478 | w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); 479 | std::swap(z, x); 480 | s += 64; 481 | len -= 128; 482 | } while (LIKELY(len >= 128)); 483 | x += Rotate(v.first + z, 49) * k0; 484 | y = y * k0 + Rotate(w.second, 37); 485 | z = z * k0 + Rotate(w.first, 27); 486 | w.first *= 9; 487 | v.first *= k0; 488 | // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. 489 | for (size_t tail_done = 0; tail_done < len; ) { 490 | tail_done += 32; 491 | y = Rotate(x + y, 42) * k0 + v.second; 492 | w.first += Fetch64(s + len - tail_done + 16); 493 | x = x * k0 + w.first; 494 | z += w.second + Fetch64(s + len - tail_done); 495 | w.second += v.first; 496 | v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second); 497 | v.first *= k0; 498 | } 499 | // At this point our 56 bytes of state should contain more than 500 | // enough information for a strong 128-bit hash. We use two 501 | // different 56-byte-to-8-byte hashes to get a 16-byte final result. 502 | x = HashLen16(x, v.first); 503 | y = HashLen16(y + z, w.first); 504 | return uint128(HashLen16(x + v.second, w.second) + y, 505 | HashLen16(x + w.second, y + v.second)); 506 | } 507 | 508 | uint128 CityHash128(const char *s, size_t len) { 509 | return len >= 16 ? 
510 | CityHash128WithSeed(s + 16, len - 16, 511 | uint128(Fetch64(s), Fetch64(s + 8) + k0)) : 512 | CityHash128WithSeed(s, len, uint128(k0, k1)); 513 | } 514 | 515 | #ifdef __SSE4_2__ 516 | #include 517 | #include 518 | 519 | // Requires len >= 240. 520 | static void CityHashCrc256Long(const char *s, size_t len, 521 | uint32 seed, uint64 *result) { 522 | uint64 a = Fetch64(s + 56) + k0; 523 | uint64 b = Fetch64(s + 96) + k0; 524 | uint64 c = result[0] = HashLen16(b, len); 525 | uint64 d = result[1] = Fetch64(s + 120) * k0 + len; 526 | uint64 e = Fetch64(s + 184) + seed; 527 | uint64 f = 0; 528 | uint64 g = 0; 529 | uint64 h = c + d; 530 | uint64 x = seed; 531 | uint64 y = 0; 532 | uint64 z = 0; 533 | 534 | // 240 bytes of input per iter. 535 | size_t iters = len / 240; 536 | len -= iters * 240; 537 | do { 538 | #undef CHUNK 539 | #define CHUNK(r) \ 540 | PERMUTE3(x, z, y); \ 541 | b += Fetch64(s); \ 542 | c += Fetch64(s + 8); \ 543 | d += Fetch64(s + 16); \ 544 | e += Fetch64(s + 24); \ 545 | f += Fetch64(s + 32); \ 546 | a += b; \ 547 | h += f; \ 548 | b += c; \ 549 | f += d; \ 550 | g += e; \ 551 | e += z; \ 552 | g += x; \ 553 | z = _mm_crc32_u64(z, b + g); \ 554 | y = _mm_crc32_u64(y, e + h); \ 555 | x = _mm_crc32_u64(x, f + a); \ 556 | e = Rotate(e, r); \ 557 | c += e; \ 558 | s += 40 559 | 560 | CHUNK(0); PERMUTE3(a, h, c); 561 | CHUNK(33); PERMUTE3(a, h, f); 562 | CHUNK(0); PERMUTE3(b, h, f); 563 | CHUNK(42); PERMUTE3(b, h, d); 564 | CHUNK(0); PERMUTE3(b, h, e); 565 | CHUNK(33); PERMUTE3(a, h, e); 566 | } while (--iters > 0); 567 | 568 | while (len >= 40) { 569 | CHUNK(29); 570 | e ^= Rotate(a, 20); 571 | h += Rotate(b, 30); 572 | g ^= Rotate(c, 40); 573 | f += Rotate(d, 34); 574 | PERMUTE3(c, h, g); 575 | len -= 40; 576 | } 577 | if (len > 0) { 578 | s = s + len - 40; 579 | CHUNK(33); 580 | e ^= Rotate(a, 43); 581 | h += Rotate(b, 42); 582 | g ^= Rotate(c, 41); 583 | f += Rotate(d, 40); 584 | } 585 | result[0] ^= h; 586 | result[1] ^= g; 587 | g += h; 588 | 
a = HashLen16(a, g + z); 589 | x += y << 32; 590 | b += x; 591 | c = HashLen16(c, z) + h; 592 | d = HashLen16(d, e + result[0]); 593 | g += e; 594 | h += HashLen16(x, f); 595 | e = HashLen16(a, d) + g; 596 | z = HashLen16(b, c) + a; 597 | y = HashLen16(g, h) + c; 598 | result[0] = e + z + y + x; 599 | a = ShiftMix((a + y) * k0) * k0 + b; 600 | result[1] += a + result[0]; 601 | a = ShiftMix(a * k0) * k0 + c; 602 | result[2] = a + result[1]; 603 | a = ShiftMix((a + e) * k0) * k0; 604 | result[3] = a + result[2]; 605 | } 606 | 607 | // Requires len < 240. 608 | static void CityHashCrc256Short(const char *s, size_t len, uint64 *result) { 609 | char buf[240]; 610 | memcpy(buf, s, len); 611 | memset(buf + len, 0, 240 - len); 612 | CityHashCrc256Long(buf, 240, ~static_cast(len), result); 613 | } 614 | 615 | void CityHashCrc256(const char *s, size_t len, uint64 *result) { 616 | if (LIKELY(len >= 240)) { 617 | CityHashCrc256Long(s, len, 0, result); 618 | } else { 619 | CityHashCrc256Short(s, len, result); 620 | } 621 | } 622 | 623 | uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed) { 624 | if (len <= 900) { 625 | return CityHash128WithSeed(s, len, seed); 626 | } else { 627 | uint64 result[4]; 628 | CityHashCrc256(s, len, result); 629 | uint64 u = Uint128High64(seed) + result[0]; 630 | uint64 v = Uint128Low64(seed) + result[1]; 631 | return uint128(HashLen16(u, v + result[2]), 632 | HashLen16(Rotate(v, 32), u * k0 + result[3])); 633 | } 634 | } 635 | 636 | uint128 CityHashCrc128(const char *s, size_t len) { 637 | if (len <= 900) { 638 | return CityHash128(s, len); 639 | } else { 640 | uint64 result[4]; 641 | CityHashCrc256(s, len, result); 642 | return uint128(result[2], result[3]); 643 | } 644 | } 645 | 646 | #endif 647 | -------------------------------------------------------------------------------- /src/city.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 Google, Inc. 
2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | // 21 | // CityHash, by Geoff Pike and Jyrki Alakuijala 22 | // 23 | // http://code.google.com/p/cityhash/ 24 | // 25 | // This file provides a few functions for hashing strings. All of them are 26 | // high-quality functions in the sense that they pass standard tests such 27 | // as Austin Appleby's SMHasher. They are also fast. 28 | // 29 | // For 64-bit x86 code, on short strings, we don't know of anything faster than 30 | // CityHash64 that is of comparable quality. We believe our nearest competitor 31 | // is Murmur3. For 64-bit x86 code, CityHash64 is an excellent choice for hash 32 | // tables and most other hashing (excluding cryptography). 33 | // 34 | // For 64-bit x86 code, on long strings, the picture is more complicated. 
35 | // On many recent Intel CPUs, such as Nehalem, Westmere, Sandy Bridge, etc., 36 | // CityHashCrc128 appears to be faster than all competitors of comparable 37 | // quality. CityHash128 is also good but not quite as fast. We believe our 38 | // nearest competitor is Bob Jenkins' Spooky. We don't have great data for 39 | // other 64-bit CPUs, but for long strings we know that Spooky is slightly 40 | // faster than CityHash on some relatively recent AMD x86-64 CPUs, for example. 41 | // Note that CityHashCrc128 is declared in citycrc.h. 42 | // 43 | // For 32-bit x86 code, we don't know of anything faster than CityHash32 that 44 | // is of comparable quality. We believe our nearest competitor is Murmur3A. 45 | // (On 64-bit CPUs, it is typically faster to use the other CityHash variants.) 46 | // 47 | // Functions in the CityHash family are not suitable for cryptography. 48 | // 49 | // Please see CityHash's README file for more details on our performance 50 | // measurements and so on. 51 | // 52 | // WARNING: This code has been only lightly tested on big-endian platforms! 53 | // It is known to work well on little-endian platforms that have a small penalty 54 | // for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs. 55 | // It should work on all 32-bit and 64-bit platforms that allow unaligned reads; 56 | // bug reports are welcome. 57 | // 58 | // By the way, for some hash functions, given strings a and b, the hash 59 | // of a+b is easily derived from the hashes of a and b. This property 60 | // doesn't hold for any hash functions in this file. 61 | 62 | #ifndef CITY_HASH_H_ 63 | #define CITY_HASH_H_ 64 | 65 | #include // for size_t. 
66 | #include 67 | #include 68 | 69 | typedef uint8_t uint8; 70 | typedef uint32_t uint32; 71 | typedef uint64_t uint64; 72 | typedef std::pair uint128; 73 | 74 | inline uint64 Uint128Low64(const uint128& x) { return x.first; } 75 | inline uint64 Uint128High64(const uint128& x) { return x.second; } 76 | 77 | // Hash function for a byte array. 78 | uint64 CityHash64(const char *buf, size_t len); 79 | 80 | // Hash function for a byte array. For convenience, a 64-bit seed is also 81 | // hashed into the result. 82 | uint64 CityHash64WithSeed(const char *buf, size_t len, uint64 seed); 83 | 84 | // Hash function for a byte array. For convenience, two seeds are also 85 | // hashed into the result. 86 | uint64 CityHash64WithSeeds(const char *buf, size_t len, 87 | uint64 seed0, uint64 seed1); 88 | 89 | // Hash function for a byte array. 90 | uint128 CityHash128(const char *s, size_t len); 91 | 92 | // Hash function for a byte array. For convenience, a 128-bit seed is also 93 | // hashed into the result. 94 | uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed); 95 | 96 | // Hash function for a byte array. Most useful in 32-bit binaries. 97 | uint32 CityHash32(const char *buf, size_t len); 98 | 99 | // Hash 128 input bits down to 64 bits of output. 100 | // This is intended to be a reasonably good hash function. 101 | inline uint64 Hash128to64(const uint128& x) { 102 | // Murmur-inspired hashing. 103 | const uint64 kMul = 0x9ddfea08eb382d69ULL; 104 | uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; 105 | a ^= (a >> 47); 106 | uint64 b = (Uint128High64(x) ^ a) * kMul; 107 | b ^= (b >> 47); 108 | b *= kMul; 109 | return b; 110 | } 111 | 112 | #endif // CITY_HASH_H_ 113 | -------------------------------------------------------------------------------- /src/citycrc.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 Google, Inc. 
2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | // 21 | // CityHash, by Geoff Pike and Jyrki Alakuijala 22 | // 23 | // This file declares the subset of the CityHash functions that require 24 | // _mm_crc32_u64(). See the CityHash README for details. 25 | // 26 | // Functions in the CityHash family are not suitable for cryptography. 27 | 28 | #ifndef CITY_HASH_CRC_H_ 29 | #define CITY_HASH_CRC_H_ 30 | 31 | #include 32 | 33 | // Hash function for a byte array. 34 | uint128 CityHashCrc128(const char *s, size_t len); 35 | 36 | // Hash function for a byte array. For convenience, a 128-bit seed is also 37 | // hashed into the result. 38 | uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed); 39 | 40 | // Hash function for a byte array. Sets result[0] ... result[3]. 
void CityHashCrc256(const char *s, size_t len, uint64 *result);

#endif  // CITY_HASH_CRC_H_

--------------------------------------------------------------------------------
/src/cityhash.pyx:
--------------------------------------------------------------------------------
#cython: infer_types=True
#cython: embedsignature=True
#cython: binding=False
#cython: language_level=3
#distutils: language=c++

"""
Python wrapper for CityHash
"""

__author__ = "Eugene Scherba"
__email__ = "escherba+cityhash@gmail.com"
__version__ = '0.4.8'
__all__ = [
    "CityHash32",
    "CityHash64",
    "CityHash64WithSeed",
    "CityHash64WithSeeds",
    "CityHash128",
    "CityHash128WithSeed",
]


cdef extern from * nogil:
    ctypedef unsigned long int uint32_t
    ctypedef unsigned long long int uint64_t


cdef extern from "<utility>" namespace "std" nogil:
    cdef cppclass pair[T, U]:
        T first
        U second
        pair()
        pair(pair&)
        pair(T&, U&)
        bint operator == (pair&, pair&)
        bint operator != (pair&, pair&)
        bint operator <  (pair&, pair&)
        bint operator >  (pair&, pair&)
        bint operator <= (pair&, pair&)
        bint operator >= (pair&, pair&)


cdef extern from "Python.h":
    # The function below can raise an exception, so it cannot be declared
    # 'nogil'. PyUnicode_AsUTF8AndSize() may also allocate memory in the
    # unlikely case where the unicode object is not stored as UTF-8 and its
    # UTF-8 form has not been requested before.
    const char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length) except NULL


cdef extern from "city.h" nogil:
    ctypedef uint32_t uint32
    ctypedef uint64_t uint64
    ctypedef pair[uint64, uint64] uint128
    cdef uint32 c_Hash32 "CityHash32" (const char *buff, size_t length)
    cdef uint64 c_Hash64 "CityHash64" (const char *buff, size_t length)
    cdef uint64 c_Hash64WithSeed "CityHash64WithSeed" (const char *buff, size_t length, uint64 seed)
    cdef uint64 c_Hash64WithSeeds "CityHash64WithSeeds" (const char *buff, size_t length, uint64 seed0, uint64 seed1)
    cdef uint128 c_Hash128 "CityHash128" (const char *s, size_t length)
    cdef uint128 c_Hash128WithSeed "CityHash128WithSeed" (const char *s, size_t length, uint128 seed)


from cpython cimport long

from cpython.buffer cimport PyObject_CheckBuffer
from cpython.buffer cimport PyObject_GetBuffer
from cpython.buffer cimport PyBuffer_Release
from cpython.buffer cimport PyBUF_SIMPLE

from cpython.unicode cimport PyUnicode_Check

from cpython.bytes cimport PyBytes_Check
from cpython.bytes cimport PyBytes_GET_SIZE
from cpython.bytes cimport PyBytes_AS_STRING


cdef object _type_error(argname: str, expected: object, value: object):
    """Build (but do not raise) the TypeError used for unsupported inputs."""
    return TypeError(
        "Argument '%s' has incorrect type: expected %s, got '%s' instead" %
        (argname, expected, type(value).__name__)
    )


def CityHash32(data) -> int:
    """Obtain a 32-bit hash from input data.

    Strings are hashed through their UTF-8 encoding; bytes and buffer
    objects are hashed as raw memory.

    :param data: input data (string, bytes, or buffer object)
    :return: an integer representing a 32-bit hash of the input
    :raises TypeError: if data is not of one of input types
    :raises ValueError: if input buffer is not C-contiguous
    """
    cdef Py_buffer buf
    cdef uint32 result
    cdef const char* encoding
    cdef Py_ssize_t encoding_size = 0

    if PyUnicode_Check(data):
        encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size)
        result = c_Hash32(encoding, encoding_size)
    elif PyBytes_Check(data):
        result = c_Hash32(
            <const char*>PyBytes_AS_STRING(data),
            PyBytes_GET_SIZE(data))
    elif PyObject_CheckBuffer(data):
        PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
        # buf.buf is a void*; the hash functions take const char*.
        result = c_Hash32(<const char*>buf.buf, buf.len)
        PyBuffer_Release(&buf)
    else:
        raise _type_error("data", ["basestring", "buffer"], data)
    return result


def CityHash64(data) -> int:
    """Obtain a 64-bit hash from input data.

    Strings are hashed through their UTF-8 encoding; bytes and buffer
    objects are hashed as raw memory.

    :param data: input data (string, bytes, or buffer object)
    :return: an integer representing a 64-bit hash of the input
    :raises TypeError: if data is not of one of input types
    :raises ValueError: if input buffer is not C-contiguous
    """
    cdef Py_buffer buf
    cdef uint64 result
    cdef const char* encoding
    cdef Py_ssize_t encoding_size = 0

    if PyUnicode_Check(data):
        encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size)
        result = c_Hash64(encoding, encoding_size)
    elif PyBytes_Check(data):
        result = c_Hash64(
            <const char*>PyBytes_AS_STRING(data),
            PyBytes_GET_SIZE(data))
    elif PyObject_CheckBuffer(data):
        PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
        # buf.buf is a void*; the hash functions take const char*.
        result = c_Hash64(<const char*>buf.buf, buf.len)
        PyBuffer_Release(&buf)
    else:
        raise _type_error("data", ["basestring", "buffer"], data)
    return result


def CityHash64WithSeed(data, uint64 seed=0ULL) -> int:
    """Obtain a 64-bit hash using a seed.

    :param data: input data (string, bytes, or buffer object)
    :param seed: seed value (a 64-bit integer, defaults to 0)
    :return: an integer representing a 64-bit hash of the input
    :raises TypeError: if data is not of one of input types
    :raises ValueError: if input buffer is not C-contiguous
    :raises OverflowError: if seed cannot be converted to unsigned int64
    """
    cdef Py_buffer buf
    cdef uint64 result
    cdef const char* encoding
    cdef Py_ssize_t encoding_size = 0

    if PyUnicode_Check(data):
        encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size)
        result = c_Hash64WithSeed(encoding, encoding_size, seed)
    elif PyBytes_Check(data):
        result = c_Hash64WithSeed(
            <const char*>PyBytes_AS_STRING(data),
            PyBytes_GET_SIZE(data), seed)
    elif PyObject_CheckBuffer(data):
        PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
        # buf.buf is a void*; the hash functions take const char*.
        result = c_Hash64WithSeed(<const char*>buf.buf, buf.len, seed)
        PyBuffer_Release(&buf)
    else:
        raise _type_error("data", ["basestring", "buffer"], data)
    return result


def CityHash64WithSeeds(data, uint64 seed0=0LL, uint64 seed1=0LL) -> int:
    """Obtain a 64-bit hash using two seeds.

    :param data: input data (string, bytes, or buffer object)
    :param seed0: first seed (a 64-bit integer, defaults to 0)
    :param seed1: second seed (a 64-bit integer, defaults to 0)
    :return: an integer representing a 64-bit hash of the input
    :raises TypeError: if data is not of one of input types
    :raises ValueError: if input buffer is not C-contiguous
    :raises OverflowError: if a seed cannot be converted to unsigned int64
    """
    cdef Py_buffer buf
    cdef uint64 result
    cdef const char* encoding
    cdef Py_ssize_t encoding_size = 0

    if PyUnicode_Check(data):
        encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size)
        result = c_Hash64WithSeeds(encoding, encoding_size, seed0, seed1)
    elif PyBytes_Check(data):
        result = c_Hash64WithSeeds(
            <const char*>PyBytes_AS_STRING(data),
            PyBytes_GET_SIZE(data), seed0, seed1)
    elif PyObject_CheckBuffer(data):
        PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
        # buf.buf is a void*; the hash functions take const char*.
        result = c_Hash64WithSeeds(<const char*>buf.buf, buf.len, seed0, seed1)
        PyBuffer_Release(&buf)
    else:
        raise _type_error("data", ["basestring", "buffer"], data)
    return result


def CityHash128(data) -> int:
    """Obtain a 128-bit hash from input data.

    :param data: input data (string, bytes, or buffer object)
    :return: an integer representing a 128-bit hash of the input
    :raises TypeError: if data is not of one of input types
    :raises ValueError: if input buffer is not C-contiguous
    """
    cdef Py_buffer buf
    cdef pair[uint64, uint64] result
    cdef const char* encoding
    cdef Py_ssize_t encoding_size = 0

    if PyUnicode_Check(data):
        encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size)
        result = c_Hash128(encoding, encoding_size)
    elif PyBytes_Check(data):
        result = c_Hash128(
            <const char*>PyBytes_AS_STRING(data),
            PyBytes_GET_SIZE(data))
    elif PyObject_CheckBuffer(data):
        PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
        # buf.buf is a void*; the hash functions take const char*.
        result = c_Hash128(<const char*>buf.buf, buf.len)
        PyBuffer_Release(&buf)
    else:
        raise _type_error("data", ["basestring", "buffer"], data)
    # `first` fills the upper 64 bits of the returned integer and `second`
    # the lower 64 bits.
    return (long(result.first) << 64ULL) + long(result.second)


def CityHash128WithSeed(data, seed: int = 0L) -> int:
    """Obtain a 128-bit hash using a seed.

    :param data: input data (string, bytes, or buffer object)
    :param seed: seed value (defaults to 0)
    :return: an integer representing a 128-bit hash of the input
    :raises TypeError: if data is not of one of input types
    :raises ValueError: if input buffer is not C-contiguous
    :raises OverflowError: if the upper part of seed (seed >> 64) cannot be
        converted to unsigned int64
    """
    cdef Py_buffer buf
    cdef pair[uint64, uint64] result
    cdef pair[uint64, uint64] tseed
    cdef const char* encoding
    cdef Py_ssize_t encoding_size = 0

    # Split the seed the same way the result is assembled below: the upper
    # 64 bits go to `first`, the lower 64 bits to `second`.
    tseed.first = seed >> 64ULL
    tseed.second = seed & ((1ULL << 64ULL) - 1ULL)

    if PyUnicode_Check(data):
        encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size)
        result = c_Hash128WithSeed(encoding, encoding_size, tseed)
    elif PyBytes_Check(data):
        result = c_Hash128WithSeed(
            <const char*>PyBytes_AS_STRING(data),
            PyBytes_GET_SIZE(data), tseed)
    elif PyObject_CheckBuffer(data):
        PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
        # buf.buf is a void*; the hash functions take const char*.
        result = c_Hash128WithSeed(<const char*>buf.buf, buf.len, tseed)
        PyBuffer_Release(&buf)
    else:
        raise _type_error("data", ["basestring", "buffer"], data)
    return (long(result.first) << 64ULL) + long(result.second)

--------------------------------------------------------------------------------
/src/cityhashcrc.pyx:
--------------------------------------------------------------------------------
#cython: infer_types=True
#cython: embedsignature=True
#cython: binding=False
#cython: language_level=3
#distutils: language=c++

"""
Python wrapper for CityHash-CRC
"""

__author__ = "Eugene Scherba"
__email__ = "escherba+cityhash@gmail.com"
__version__ = '0.4.7'
__all__ = [
    "CityHashCrc128",
    "CityHashCrc128WithSeed",
    "CityHashCrc256Bytes",
]


cdef extern from * nogil:
    ctypedef unsigned long int uint32_t
    ctypedef unsigned long long int uint64_t


cdef extern from "" namespace "std"
nogil: 27 | cdef cppclass pair[T, U]: 28 | T first 29 | U second 30 | pair() 31 | pair(pair&) 32 | pair(T&, U&) 33 | bint operator == (pair&, pair&) 34 | bint operator != (pair&, pair&) 35 | bint operator < (pair&, pair&) 36 | bint operator > (pair&, pair&) 37 | bint operator <= (pair&, pair&) 38 | bint operator >= (pair&, pair&) 39 | 40 | 41 | cdef extern from "Python.h": 42 | # Note that following functions can potentially raise an exception, 43 | # thus they cannot be declared 'nogil'. Also, PyUnicode_AsUTF8AndSize() can 44 | # potentially allocate memory inside in unlikely case of when underlying 45 | # unicode object was stored as non-utf8 and utf8 wasn't requested before. 46 | const char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length) except NULL 47 | 48 | 49 | cdef extern from "city.h" nogil: 50 | ctypedef uint32_t uint32 51 | ctypedef uint64_t uint64 52 | ctypedef pair[uint64, uint64] uint128 53 | 54 | 55 | cdef extern from "citycrc.h" nogil: 56 | cdef uint128 c_HashCrc128 "CityHashCrc128" (const char *s, size_t length) 57 | cdef uint128 c_HashCrc128WithSeed "CityHashCrc128WithSeed" (const char *s, size_t length, uint128 seed) 58 | cdef void c_HashCrc256 "CityHashCrc256" (const char *s, size_t length, uint64 *result) 59 | 60 | 61 | from cpython cimport long 62 | 63 | from cpython.buffer cimport PyObject_CheckBuffer 64 | from cpython.buffer cimport PyObject_GetBuffer 65 | from cpython.buffer cimport PyBuffer_Release 66 | from cpython.buffer cimport PyBUF_SIMPLE 67 | 68 | from cpython.unicode cimport PyUnicode_Check 69 | 70 | from cpython.bytes cimport PyBytes_Check 71 | from cpython.bytes cimport PyBytes_GET_SIZE 72 | from cpython.bytes cimport PyBytes_AS_STRING 73 | from cpython.bytes cimport PyBytes_FromStringAndSize 74 | 75 | 76 | cdef object _type_error(argname: str, expected: object, value: object): 77 | return TypeError( 78 | "Argument '%s' has incorrect type: expected %s, got '%s' instead" % 79 | (argname, expected, type(value).__name__) 80 | 
) 81 | 82 | 83 | def CityHashCrc128(data) -> int: 84 | """Obtain a 128-bit hash from input data. 85 | 86 | :param data: input data (string, bytes, or buffer object) 87 | :return: an integer representing a 128-bit hash of the input 88 | :raises TypeError: if data is not of one of input types 89 | :raises ValueError: if input buffer is not C-contiguous 90 | """ 91 | cdef Py_buffer buf 92 | cdef pair[uint64, uint64] result 93 | cdef const char* encoding 94 | cdef Py_ssize_t encoding_size = 0 95 | 96 | if PyUnicode_Check(data): 97 | encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size) 98 | result = c_HashCrc128(encoding, encoding_size) 99 | elif PyBytes_Check(data): 100 | result = c_HashCrc128( 101 | PyBytes_AS_STRING(data), 102 | PyBytes_GET_SIZE(data)) 103 | elif PyObject_CheckBuffer(data): 104 | PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE) 105 | result = c_HashCrc128(buf.buf, buf.len) 106 | PyBuffer_Release(&buf) 107 | else: 108 | raise _type_error("data", ["basestring", "buffer"], data) 109 | return (long(result.first) << 64ULL) + long(result.second) 110 | 111 | 112 | def CityHashCrc256Bytes(data) -> bytes: 113 | """Obtain a 256-bit hash from input data. 
114 | 115 | :param data: input data (string, bytes, or buffer object) 116 | :return: a bytes array representing a 256-bit hash of the input 117 | :raises TypeError: if data is not of one of input types 118 | :raises ValueError: if input buffer is not C-contiguous 119 | """ 120 | cdef Py_buffer buf 121 | cdef uint64 out[4] 122 | cdef const char* encoding 123 | cdef Py_ssize_t encoding_size = 0 124 | 125 | if PyUnicode_Check(data): 126 | encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size) 127 | c_HashCrc256(encoding, encoding_size, out) 128 | elif PyBytes_Check(data): 129 | c_HashCrc256( 130 | PyBytes_AS_STRING(data), 131 | PyBytes_GET_SIZE(data), out) 132 | elif PyObject_CheckBuffer(data): 133 | PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE) 134 | c_HashCrc256(buf.buf, buf.len, out) 135 | PyBuffer_Release(&buf) 136 | else: 137 | raise _type_error("data", ["basestring", "buffer"], data) 138 | return PyBytes_FromStringAndSize(out, 32) 139 | 140 | 141 | def CityHashCrc128WithSeed(data, seed: int = 0L) -> int: 142 | """Obtain a 128-bit hash using a seed. 
143 | 144 | :param data: input data (string, bytes, or buffer object) 145 | :param seed: seed value (defaults to 0) 146 | :return: an integer representing a 128-bit hash of the input 147 | :raises TypeError: if data is not of one of input types 148 | :raises ValueError: if input buffer is not C-contiguous 149 | """ 150 | cdef Py_buffer buf 151 | cdef pair[uint64, uint64] result 152 | cdef pair[uint64, uint64] tseed 153 | cdef const char* encoding 154 | cdef Py_ssize_t encoding_size = 0 155 | 156 | tseed.first = seed >> 64ULL 157 | tseed.second = seed & ((1ULL << 64ULL) - 1ULL) 158 | 159 | if PyUnicode_Check(data): 160 | encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size) 161 | result = c_HashCrc128WithSeed(encoding, encoding_size, tseed) 162 | elif PyBytes_Check(data): 163 | result = c_HashCrc128WithSeed( 164 | PyBytes_AS_STRING(data), 165 | PyBytes_GET_SIZE(data), tseed) 166 | elif PyObject_CheckBuffer(data): 167 | PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE) 168 | result = c_HashCrc128WithSeed(buf.buf, buf.len, tseed) 169 | PyBuffer_Release(&buf) 170 | else: 171 | raise _type_error("data", ["basestring", "buffer"], data) 172 | return (long(result.first) << 64ULL) + long(result.second) 173 | -------------------------------------------------------------------------------- /src/farm.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Google, Inc. 
2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | // 21 | // FarmHash, by Geoff Pike 22 | 23 | // 24 | // http://code.google.com/p/farmhash/ 25 | // 26 | // This file provides a few functions for hashing strings and other 27 | // data. All of them are high-quality functions in the sense that 28 | // they do well on standard tests such as Austin Appleby's SMHasher. 29 | // They're also fast. FarmHash is the successor to CityHash. 30 | // 31 | // Functions in the FarmHash family are not suitable for cryptography. 32 | // 33 | // WARNING: This code has been only lightly tested on big-endian platforms! 34 | // It is known to work well on little-endian platforms that have a small penalty 35 | // for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs. 
36 | // It should work on all 32-bit and 64-bit platforms that allow unaligned reads; 37 | // bug reports are welcome. 38 | // 39 | // By the way, for some hash functions, given strings a and b, the hash 40 | // of a+b is easily derived from the hashes of a and b. This property 41 | // doesn't hold for any hash functions in this file. 42 | 43 | #ifndef FARM_HASH_H_ 44 | #define FARM_HASH_H_ 45 | 46 | #include 47 | #include 48 | #include 49 | #include // for memcpy and memset 50 | #include 51 | 52 | #ifndef NAMESPACE_FOR_HASH_FUNCTIONS 53 | #define NAMESPACE_FOR_HASH_FUNCTIONS util 54 | #endif 55 | 56 | namespace NAMESPACE_FOR_HASH_FUNCTIONS { 57 | 58 | #if defined(FARMHASH_UINT128_T_DEFINED) 59 | inline uint64_t Uint128Low64(const uint128_t x) { 60 | return static_cast(x); 61 | } 62 | inline uint64_t Uint128High64(const uint128_t x) { 63 | return static_cast(x >> 64); 64 | } 65 | inline uint128_t Uint128(uint64_t lo, uint64_t hi) { 66 | return lo + (((uint128_t)hi) << 64); 67 | } 68 | #else 69 | typedef std::pair uint128_t; 70 | inline uint64_t Uint128Low64(const uint128_t x) { return x.first; } 71 | inline uint64_t Uint128High64(const uint128_t x) { return x.second; } 72 | inline uint128_t Uint128(uint64_t lo, uint64_t hi) { return uint128_t(lo, hi); } 73 | #endif 74 | 75 | 76 | // BASIC STRING HASHING 77 | 78 | // Hash function for a byte array. 79 | // May change from time to time, may differ on different platforms, may differ 80 | // depending on NDEBUG. 81 | size_t Hash(const char* s, size_t len); 82 | 83 | // Hash function for a byte array. Most useful in 32-bit binaries. 84 | // May change from time to time, may differ on different platforms, may differ 85 | // depending on NDEBUG. 86 | uint32_t Hash32(const char* s, size_t len); 87 | 88 | // Hash function for a byte array. For convenience, a 32-bit seed is also 89 | // hashed into the result. 90 | // May change from time to time, may differ on different platforms, may differ 91 | // depending on NDEBUG. 
92 | uint32_t Hash32WithSeed(const char* s, size_t len, uint32_t seed); 93 | 94 | // Hash 128 input bits down to 64 bits of output. 95 | // Hash function for a byte array. 96 | // May change from time to time, may differ on different platforms, may differ 97 | // depending on NDEBUG. 98 | uint64_t Hash64(const char* s, size_t len); 99 | 100 | // Hash function for a byte array. For convenience, a 64-bit seed is also 101 | // hashed into the result. 102 | // May change from time to time, may differ on different platforms, may differ 103 | // depending on NDEBUG. 104 | uint64_t Hash64WithSeed(const char* s, size_t len, uint64_t seed); 105 | 106 | // Hash function for a byte array. For convenience, two seeds are also 107 | // hashed into the result. 108 | // May change from time to time, may differ on different platforms, may differ 109 | // depending on NDEBUG. 110 | uint64_t Hash64WithSeeds(const char* s, size_t len, 111 | uint64_t seed0, uint64_t seed1); 112 | 113 | // Hash function for a byte array. 114 | // May change from time to time, may differ on different platforms, may differ 115 | // depending on NDEBUG. 116 | uint128_t Hash128(const char* s, size_t len); 117 | 118 | // Hash function for a byte array. For convenience, a 128-bit seed is also 119 | // hashed into the result. 120 | // May change from time to time, may differ on different platforms, may differ 121 | // depending on NDEBUG. 122 | uint128_t Hash128WithSeed(const char* s, size_t len, uint128_t seed); 123 | 124 | // BASIC NON-STRING HASHING 125 | 126 | // This is intended to be a reasonably good hash function. 127 | // May change from time to time, may differ on different platforms, may differ 128 | // depending on NDEBUG. 129 | inline uint64_t Hash128to64(uint128_t x) { 130 | // Murmur-inspired hashing. 
131 | const uint64_t kMul = 0x9ddfea08eb382d69ULL; 132 | uint64_t a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; 133 | a ^= (a >> 47); 134 | uint64_t b = (Uint128High64(x) ^ a) * kMul; 135 | b ^= (b >> 47); 136 | b *= kMul; 137 | return b; 138 | } 139 | 140 | // FINGERPRINTING (i.e., good, portable, forever-fixed hash functions) 141 | 142 | // Fingerprint function for a byte array. Most useful in 32-bit binaries. 143 | uint32_t Fingerprint32(const char* s, size_t len); 144 | 145 | // Fingerprint function for a byte array. 146 | uint64_t Fingerprint64(const char* s, size_t len); 147 | 148 | // Fingerprint function for a byte array. 149 | uint128_t Fingerprint128(const char* s, size_t len); 150 | 151 | // This is intended to be a good fingerprinting primitive. 152 | // See below for more overloads. 153 | inline uint64_t Fingerprint(uint128_t x) { 154 | // Murmur-inspired hashing. 155 | const uint64_t kMul = 0x9ddfea08eb382d69ULL; 156 | uint64_t a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; 157 | a ^= (a >> 47); 158 | uint64_t b = (Uint128High64(x) ^ a) * kMul; 159 | b ^= (b >> 44); 160 | b *= kMul; 161 | b ^= (b >> 41); 162 | b *= kMul; 163 | return b; 164 | } 165 | 166 | // This is intended to be a good fingerprinting primitive. 167 | inline uint64_t Fingerprint(uint64_t x) { 168 | // Murmur-inspired hashing. 169 | const uint64_t kMul = 0x9ddfea08eb382d69ULL; 170 | uint64_t b = x * kMul; 171 | b ^= (b >> 44); 172 | b *= kMul; 173 | b ^= (b >> 41); 174 | b *= kMul; 175 | return b; 176 | } 177 | 178 | #ifndef FARMHASH_NO_CXX_STRING 179 | 180 | // Convenience functions to hash or fingerprint C++ strings. 181 | // These require that Str::data() return a pointer to the first char 182 | // (as a const char*) and that Str::length() return the string's length; 183 | // they work with std::string, for example. 184 | 185 | // Hash function for a byte array. 186 | // May change from time to time, may differ on different platforms, may differ 187 | // depending on NDEBUG. 
188 | template 189 | inline size_t Hash(const Str& s) { 190 | assert(sizeof(s[0]) == 1); 191 | return Hash(s.data(), s.length()); 192 | } 193 | 194 | // Hash function for a byte array. Most useful in 32-bit binaries. 195 | // May change from time to time, may differ on different platforms, may differ 196 | // depending on NDEBUG. 197 | template 198 | inline uint32_t Hash32(const Str& s) { 199 | assert(sizeof(s[0]) == 1); 200 | return Hash32(s.data(), s.length()); 201 | } 202 | 203 | // Hash function for a byte array. For convenience, a 32-bit seed is also 204 | // hashed into the result. 205 | // May change from time to time, may differ on different platforms, may differ 206 | // depending on NDEBUG. 207 | template 208 | inline uint32_t Hash32WithSeed(const Str& s, uint32_t seed) { 209 | assert(sizeof(s[0]) == 1); 210 | return Hash32WithSeed(s.data(), s.length(), seed); 211 | } 212 | 213 | // Hash 128 input bits down to 64 bits of output. 214 | // Hash function for a byte array. 215 | // May change from time to time, may differ on different platforms, may differ 216 | // depending on NDEBUG. 217 | template 218 | inline uint64_t Hash64(const Str& s) { 219 | assert(sizeof(s[0]) == 1); 220 | return Hash64(s.data(), s.length()); 221 | } 222 | 223 | // Hash function for a byte array. For convenience, a 64-bit seed is also 224 | // hashed into the result. 225 | // May change from time to time, may differ on different platforms, may differ 226 | // depending on NDEBUG. 227 | template 228 | inline uint64_t Hash64WithSeed(const Str& s, uint64_t seed) { 229 | assert(sizeof(s[0]) == 1); 230 | return Hash64WithSeed(s.data(), s.length(), seed); 231 | } 232 | 233 | // Hash function for a byte array. For convenience, two seeds are also 234 | // hashed into the result. 235 | // May change from time to time, may differ on different platforms, may differ 236 | // depending on NDEBUG. 
237 | template 238 | inline uint64_t Hash64WithSeeds(const Str& s, uint64_t seed0, uint64_t seed1) { 239 | assert(sizeof(s[0]) == 1); 240 | return Hash64WithSeeds(s.data(), s.length(), seed0, seed1); 241 | } 242 | 243 | // Hash function for a byte array. 244 | // May change from time to time, may differ on different platforms, may differ 245 | // depending on NDEBUG. 246 | template 247 | inline uint128_t Hash128(const Str& s) { 248 | assert(sizeof(s[0]) == 1); 249 | return Hash128(s.data(), s.length()); 250 | } 251 | 252 | // Hash function for a byte array. For convenience, a 128-bit seed is also 253 | // hashed into the result. 254 | // May change from time to time, may differ on different platforms, may differ 255 | // depending on NDEBUG. 256 | template 257 | inline uint128_t Hash128WithSeed(const Str& s, uint128_t seed) { 258 | assert(sizeof(s[0]) == 1); 259 | return Hash128(s.data(), s.length(), seed); 260 | } 261 | 262 | // FINGERPRINTING (i.e., good, portable, forever-fixed hash functions) 263 | 264 | // Fingerprint function for a byte array. Most useful in 32-bit binaries. 265 | template 266 | inline uint32_t Fingerprint32(const Str& s) { 267 | assert(sizeof(s[0]) == 1); 268 | return Fingerprint32(s.data(), s.length()); 269 | } 270 | 271 | // Fingerprint 128 input bits down to 64 bits of output. 272 | // Fingerprint function for a byte array. 273 | template 274 | inline uint64_t Fingerprint64(const Str& s) { 275 | assert(sizeof(s[0]) == 1); 276 | return Fingerprint64(s.data(), s.length()); 277 | } 278 | 279 | // Fingerprint function for a byte array. 
280 | template 281 | inline uint128_t Fingerprint128(const Str& s) { 282 | assert(sizeof(s[0]) == 1); 283 | return Fingerprint128(s.data(), s.length()); 284 | } 285 | 286 | #endif 287 | 288 | } // namespace NAMESPACE_FOR_HASH_FUNCTIONS 289 | 290 | #endif // FARM_HASH_H_ 291 | -------------------------------------------------------------------------------- /src/farmhash.pyx: -------------------------------------------------------------------------------- 1 | #cython: infer_types=True 2 | #cython: embedsignature=True 3 | #cython: binding=False 4 | #cython: language_level=3 5 | #distutils: language=c++ 6 | 7 | """ 8 | Python wrapper for FarmHash 9 | """ 10 | 11 | __author__ = "Eugene Scherba" 12 | __email__ = "escherba+cityhash@gmail.com" 13 | __version__ = '0.4.7' 14 | __all__ = [ 15 | "FarmHash32", 16 | "FarmHash32WithSeed", 17 | "Fingerprint32", 18 | "FarmHash64", 19 | "FarmHash64WithSeed", 20 | "FarmHash64WithSeeds", 21 | "Fingerprint64", 22 | "FarmHash128", 23 | "FarmHash128WithSeed", 24 | "Fingerprint128", 25 | ] 26 | 27 | 28 | cdef extern from * nogil: 29 | ctypedef unsigned long int uint32_t 30 | ctypedef unsigned long long int uint64_t 31 | 32 | 33 | cdef extern from "" namespace "std" nogil: 34 | cdef cppclass pair[T, U]: 35 | T first 36 | U second 37 | pair() 38 | pair(pair&) 39 | pair(T&, U&) 40 | bint operator == (pair&, pair&) 41 | bint operator != (pair&, pair&) 42 | bint operator < (pair&, pair&) 43 | bint operator > (pair&, pair&) 44 | bint operator <= (pair&, pair&) 45 | bint operator >= (pair&, pair&) 46 | 47 | 48 | cdef extern from "Python.h": 49 | # Note that following functions can potentially raise an exception, 50 | # thus they cannot be declared 'nogil'. Also, PyUnicode_AsUTF8AndSize() can 51 | # potentially allocate memory inside in unlikely case of when underlying 52 | # unicode object was stored as non-utf8 and utf8 wasn't requested before. 
53 | const char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length) except NULL 54 | 55 | 56 | cdef extern from "farm.h" nogil: 57 | ctypedef pair[uint64_t, uint64_t] uint128_t 58 | cdef uint32_t c_Hash32 "util::Hash32" (const char *buff, size_t length) 59 | cdef uint32_t c_Fingerprint32 "util::Fingerprint32" (const char *buff, size_t length) 60 | cdef uint32_t c_Hash32WithSeed "util::Hash32WithSeed" (const char *buff, size_t length, uint32_t seed) 61 | cdef uint64_t c_Hash64 "util::Hash64" (const char *buff, size_t length) 62 | cdef uint64_t c_Fingerprint64 "util::Fingerprint64" (const char *buff, size_t length) 63 | cdef uint64_t c_Hash64WithSeed "util::Hash64WithSeed" (const char *buff, size_t length, uint64_t seed) 64 | cdef uint64_t c_Hash64WithSeeds "util::Hash64WithSeeds" (const char *buff, size_t length, uint64_t seed0, uint64_t seed1) 65 | cdef uint128_t c_Hash128 "util::Hash128" (const char *s, size_t length) 66 | cdef uint128_t c_Fingerprint128 "util::Fingerprint128" (const char *s, size_t length) 67 | cdef uint128_t c_Hash128WithSeed "util::Hash128WithSeed" (const char *s, size_t length, uint128_t seed) 68 | 69 | 70 | from cpython cimport long 71 | 72 | from cpython.buffer cimport PyObject_CheckBuffer 73 | from cpython.buffer cimport PyObject_GetBuffer 74 | from cpython.buffer cimport PyBuffer_Release 75 | from cpython.buffer cimport PyBUF_SIMPLE 76 | 77 | from cpython.unicode cimport PyUnicode_Check 78 | 79 | from cpython.bytes cimport PyBytes_Check 80 | from cpython.bytes cimport PyBytes_GET_SIZE 81 | from cpython.bytes cimport PyBytes_AS_STRING 82 | 83 | 84 | 85 | cdef object _type_error(argname: str, expected: object, value: object): 86 | return TypeError( 87 | "Argument '%s' has incorrect type: expected %s, got '%s' instead" % 88 | (argname, expected, type(value).__name__) 89 | ) 90 | 91 | 92 | def FarmHash32(data) -> int: 93 | """Obtain a 32-bit hash from input data. 
94 | 95 | :param data: input data (string, bytes, or buffer object) 96 | :return: an integer representing a 32-bit hash of the input 97 | :raises TypeError: if data is not of one of input types 98 | :raises ValueError: if input buffer is not C-contiguous 99 | """ 100 | cdef Py_buffer buf 101 | cdef uint32_t result 102 | cdef const char* encoding 103 | cdef Py_ssize_t encoding_size = 0 104 | 105 | if PyUnicode_Check(data): 106 | encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size) 107 | result = c_Hash32(encoding, encoding_size) 108 | elif PyBytes_Check(data): 109 | result = c_Hash32( 110 | PyBytes_AS_STRING(data), 111 | PyBytes_GET_SIZE(data)) 112 | elif PyObject_CheckBuffer(data): 113 | PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE) 114 | result = c_Hash32(buf.buf, buf.len) 115 | PyBuffer_Release(&buf) 116 | else: 117 | raise _type_error("data", ["basestring", "buffer"], data) 118 | return result 119 | 120 | 121 | def Fingerprint32(data) -> int: 122 | """Obtain a 32-bit hardware-independent fingerprint. 
123 | 124 | :param data: input data (string, bytes, or buffer object) 125 | :return: an integer representing a 32-bit hash of the input 126 | :raises TypeError: if data is not of one of input types 127 | :raises ValueError: if input buffer is not C-contiguous 128 | """ 129 | cdef Py_buffer buf 130 | cdef uint32_t result 131 | cdef const char* encoding 132 | cdef Py_ssize_t encoding_size = 0 133 | 134 | if PyUnicode_Check(data): 135 | encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size) 136 | result = c_Fingerprint32(encoding, encoding_size) 137 | elif PyBytes_Check(data): 138 | result = c_Fingerprint32( 139 | PyBytes_AS_STRING(data), 140 | PyBytes_GET_SIZE(data)) 141 | elif PyObject_CheckBuffer(data): 142 | PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE) 143 | result = c_Fingerprint32(buf.buf, buf.len) 144 | PyBuffer_Release(&buf) 145 | else: 146 | raise _type_error("data", ["basestring", "buffer"], data) 147 | return result 148 | 149 | 150 | def FarmHash32WithSeed(data, uint32_t seed=0U) -> int: 151 | """Obtain a 32-bit hash using a seed. 
152 | 153 | :param data: input data (string, bytes, or buffer object) 154 | :param seed: seed value (a 32-bit integer, defaults to 0) 155 | :return: an integer representing a 32-bit hash of the input 156 | :raises TypeError: if data is not of one of input types 157 | :raises ValueError: if input buffer is not C-contiguous 158 | :raises OverflowError: if seed cannot be converted to unsigned int32 159 | """ 160 | 161 | cdef Py_buffer buf 162 | cdef uint32_t result 163 | cdef const char* encoding 164 | cdef Py_ssize_t encoding_size = 0 165 | 166 | if PyUnicode_Check(data): 167 | encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size) 168 | result = c_Hash32WithSeed(encoding, encoding_size, seed) 169 | elif PyBytes_Check(data): 170 | result = c_Hash32WithSeed( 171 | PyBytes_AS_STRING(data), 172 | PyBytes_GET_SIZE(data), seed) 173 | elif PyObject_CheckBuffer(data): 174 | PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE) 175 | result = c_Hash32WithSeed(buf.buf, buf.len, seed) 176 | PyBuffer_Release(&buf) 177 | else: 178 | raise _type_error("data", ["basestring", "buffer"], data) 179 | return result 180 | 181 | 182 | def FarmHash64(data) -> int: 183 | """Obtain a 64-bit hash from input data. 
184 | 185 | :param data: input data (string, bytes, or buffer object) 186 | :return: an integer representing a 64-bit hash of the input 187 | :raises TypeError: if data is not of one of input types 188 | :raises ValueError: if input buffer is not C-contiguous 189 | """ 190 | cdef Py_buffer buf 191 | cdef uint64_t result 192 | cdef const char* encoding 193 | cdef Py_ssize_t encoding_size = 0 194 | 195 | if PyUnicode_Check(data): 196 | encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size) 197 | result = c_Hash64(encoding, encoding_size) 198 | elif PyBytes_Check(data): 199 | result = c_Hash64( 200 | PyBytes_AS_STRING(data), 201 | PyBytes_GET_SIZE(data)) 202 | elif PyObject_CheckBuffer(data): 203 | PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE) 204 | result = c_Hash64(buf.buf, buf.len) 205 | PyBuffer_Release(&buf) 206 | else: 207 | raise _type_error("data", ["basestring", "buffer"], data) 208 | return result 209 | 210 | 211 | def Fingerprint64(data) -> int: 212 | """Obtain a 64-bit hardware-independent fingerprint. 
213 | 214 | :param data: input data (string, bytes, or buffer object) 215 | :return: an integer representing a 64-bit hash of the input 216 | :raises TypeError: if data is not of one of input types 217 | :raises ValueError: if input buffer is not C-contiguous 218 | """ 219 | cdef Py_buffer buf 220 | cdef uint64_t result 221 | cdef const char* encoding 222 | cdef Py_ssize_t encoding_size = 0 223 | 224 | if PyUnicode_Check(data): 225 | encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size) 226 | result = c_Fingerprint64(encoding, encoding_size) 227 | elif PyBytes_Check(data): 228 | result = c_Fingerprint64(PyBytes_AS_STRING(data), 229 | PyBytes_GET_SIZE(data)) 230 | elif PyObject_CheckBuffer(data): 231 | PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE) 232 | result = c_Fingerprint64(buf.buf, buf.len) 233 | PyBuffer_Release(&buf) 234 | else: 235 | raise _type_error("data", ["basestring", "buffer"], data) 236 | return result 237 | 238 | 239 | def FarmHash64WithSeed(data, uint64_t seed=0ULL) -> int: 240 | """Obtain a 64-bit hash using a seed. 
241 | 242 | :param data: input data (string, bytes, or buffer object) 243 | :param seed: seed value (a 64-bit integer, defaults to 0) 244 | :return: an integer representing a 64-bit hash of the input 245 | :raises TypeError: if data is not of one of input types 246 | :raises ValueError: if input buffer is not C-contiguous 247 | :raises OverflowError: if seed cannot be converted to unsigned int64 248 | """ 249 | cdef Py_buffer buf 250 | cdef uint64_t result 251 | cdef const char* encoding 252 | cdef Py_ssize_t encoding_size = 0 253 | 254 | if PyUnicode_Check(data): 255 | encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size) 256 | result = c_Hash64WithSeed(encoding, encoding_size, seed) 257 | elif PyBytes_Check(data): 258 | result = c_Hash64WithSeed( 259 | PyBytes_AS_STRING(data), 260 | PyBytes_GET_SIZE(data), seed) 261 | elif PyObject_CheckBuffer(data): 262 | PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE) 263 | result = c_Hash64WithSeed(buf.buf, buf.len, seed) 264 | PyBuffer_Release(&buf) 265 | else: 266 | raise _type_error("data", ["basestring", "buffer"], data) 267 | return result 268 | 269 | 270 | def FarmHash64WithSeeds(data, uint64_t seed0=0LL, uint64_t seed1=0LL) -> int: 271 | """Obtain a 64-bit hash using two seeds. 
272 | 273 | :param data: input data (string, bytes, or buffer object) 274 | :param seed0: first seed (a 64-bit integer, defaults to 0) 275 | :param seed1: second seed (a 64-bit integer, defaults to 0) 276 | :return: an integer representing a 64-bit hash of the input 277 | :raises TypeError: if data is not of one of input types 278 | :raises ValueError: if input buffer is not C-contiguous 279 | :raises OverflowError: if seed cannot be converted to unsigned int64 280 | """ 281 | cdef Py_buffer buf 282 | cdef uint64_t result 283 | cdef const char* encoding 284 | cdef Py_ssize_t encoding_size = 0 285 | 286 | if PyUnicode_Check(data): 287 | encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size) 288 | result = c_Hash64WithSeeds(encoding, encoding_size, seed0, seed1) 289 | elif PyBytes_Check(data): 290 | result = c_Hash64WithSeeds( 291 | PyBytes_AS_STRING(data), 292 | PyBytes_GET_SIZE(data), seed0, seed1) 293 | elif PyObject_CheckBuffer(data): 294 | PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE) 295 | result = c_Hash64WithSeeds(buf.buf, buf.len, seed0, seed1) 296 | PyBuffer_Release(&buf) 297 | else: 298 | raise _type_error("data", ["basestring", "buffer"], data) 299 | return result 300 | 301 | 302 | def FarmHash128(data) -> int: 303 | """Obtain a 128-bit hash from input data. 
304 | 305 | :param data: input data (string, bytes, or buffer object) 306 | :return: an integer representing a 128-bit hash of the input 307 | :raises TypeError: if data is not of one of input types 308 | :raises ValueError: if input buffer is not C-contiguous 309 | """ 310 | cdef Py_buffer buf 311 | cdef pair[uint64_t, uint64_t] result 312 | cdef const char* encoding 313 | cdef Py_ssize_t encoding_size = 0 314 | 315 | if PyUnicode_Check(data): 316 | encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size) 317 | result = c_Hash128(encoding, encoding_size) 318 | elif PyBytes_Check(data): 319 | result = c_Hash128( 320 | PyBytes_AS_STRING(data), 321 | PyBytes_GET_SIZE(data)) 322 | elif PyObject_CheckBuffer(data): 323 | PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE) 324 | result = c_Hash128(buf.buf, buf.len) 325 | PyBuffer_Release(&buf) 326 | else: 327 | raise _type_error("data", ["basestring", "buffer"], data) 328 | return (long(result.first) << 64ULL) + long(result.second) 329 | 330 | 331 | def Fingerprint128(data) -> int: 332 | """Obtain a 128-bit hardware-independent fingerprint. 
333 | 334 | :param data: input data (string, bytes, or buffer object) 335 | :return: an integer representing a 128-bit hash of the input 336 | :raises TypeError: if data is not of one of input types 337 | :raises ValueError: if input buffer is not C-contiguous 338 | """ 339 | cdef Py_buffer buf 340 | cdef pair[uint64_t, uint64_t] result 341 | cdef const char* encoding 342 | cdef Py_ssize_t encoding_size = 0 343 | 344 | if PyUnicode_Check(data): 345 | encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size) 346 | result = c_Fingerprint128(encoding, encoding_size) 347 | elif PyBytes_Check(data): 348 | result = c_Fingerprint128( 349 | PyBytes_AS_STRING(data), 350 | PyBytes_GET_SIZE(data)) 351 | elif PyObject_CheckBuffer(data): 352 | PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE) 353 | result = c_Fingerprint128(buf.buf, buf.len) 354 | PyBuffer_Release(&buf) 355 | else: 356 | raise _type_error("data", ["basestring", "buffer"], data) 357 | return (long(result.first) << 64ULL) + long(result.second) 358 | 359 | 360 | def FarmHash128WithSeed(data, seed: int = 0L) -> int: 361 | """Obtain a 128-bit hash using a seed. 
362 | 363 | :param data: input data (string, bytes, or buffer object) 364 | :param seed: seed value (defaults to 0) 365 | :return: an integer representing a 128-bit hash of the input 366 | :raises TypeError: if data is not of one of input types 367 | :raises ValueError: if input buffer is not C-contiguous 368 | """ 369 | cdef Py_buffer buf 370 | cdef pair[uint64_t, uint64_t] result 371 | cdef pair[uint64_t, uint64_t] tseed 372 | cdef const char* encoding 373 | cdef Py_ssize_t encoding_size = 0 374 | 375 | tseed.first = seed >> 64ULL 376 | tseed.second = seed & ((1ULL << 64ULL) - 1ULL) 377 | 378 | if PyUnicode_Check(data): 379 | encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size) 380 | result = c_Hash128WithSeed(encoding, encoding_size, tseed) 381 | elif PyBytes_Check(data): 382 | result = c_Hash128WithSeed( 383 | PyBytes_AS_STRING(data), 384 | PyBytes_GET_SIZE(data), tseed) 385 | elif PyObject_CheckBuffer(data): 386 | PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE) 387 | result = c_Hash128WithSeed(buf.buf, buf.len, tseed) 388 | PyBuffer_Release(&buf) 389 | else: 390 | raise _type_error("data", ["basestring", "buffer"], data) 391 | return (long(result.first) << 64ULL) + long(result.second) 392 | -------------------------------------------------------------------------------- /tests/cityhash64_main.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: cityhash64_main.cc 5 | * 6 | * Description: Run a hashing function on a text file line by line 7 | * 8 | * Version: 1.0 9 | * Created: 09/07/2015 21:21:41 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: Eugene Scherba (es) 14 | * Organization: - 15 | * 16 | * ===================================================================================== 17 | */ 18 | 19 | #include 20 | #include 21 | #include 22 | #include "city.h" 23 | 24 | 25 | int main(int 
argc, char** argv) { 26 | std::string line; 27 | if (argc <= 1) { 28 | return EXIT_FAILURE; 29 | } 30 | std::ifstream infile(argv[1]); 31 | while (std::getline(infile, line)) 32 | { 33 | uint64 result = CityHash64WithSeed(line.c_str(), line.length(), 0); 34 | std::cout << result << "\t" << line << std::endl; 35 | } 36 | return EXIT_SUCCESS; 37 | } 38 | -------------------------------------------------------------------------------- /tests/test_cityhash.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: test_cityhash64.cc 5 | * 6 | * Description: C++-based tests for CityHash 7 | * 8 | * Version: 1.0 9 | * Created: 10/12/2015 16:30:58 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: Eugene Scherba (es) 14 | * Organization: - 15 | * 16 | * ===================================================================================== 17 | */ 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do this in one cpp file 26 | #include "catch.hpp" 27 | #include "city.h" 28 | 29 | #define STRLEN(s) (sizeof(s)/sizeof(s[0])) 30 | #define HASH64_SZ 8 31 | 32 | TEST_CASE( "CityHash32: basic test", "[basic]" ) 33 | { 34 | const char test_string[] = "abracadabra"; 35 | uint32 hash = CityHash32(test_string, STRLEN(test_string)); 36 | REQUIRE(hash != 0); 37 | } 38 | 39 | TEST_CASE( "CityHash32: test different inputs", "[diff_inputs]" ) 40 | { 41 | const char test_string1[] = "abracadabr"; 42 | const char test_string2[] = "abracaaabra"; 43 | uint32 hash1 = CityHash32(test_string1, STRLEN(test_string1)); 44 | uint32 hash2 = CityHash32(test_string2, STRLEN(test_string2)); 45 | REQUIRE(hash1 != hash2); 46 | } 47 | 48 | TEST_CASE( "CityHash64: basic test", "[basic]" ) 49 | { 50 | const char test_string[] = 
"abracadabra"; 51 | uint64 hash = CityHash64(test_string, STRLEN(test_string)); 52 | REQUIRE(hash != 0); 53 | } 54 | 55 | TEST_CASE( "CityHash64: test different inputs", "[diff_inputs]" ) 56 | { 57 | const char test_string1[] = "abracadabr"; 58 | const char test_string2[] = "abracaaabra"; 59 | uint64 hash1 = CityHash64(test_string1, STRLEN(test_string1)); 60 | uint64 hash2 = CityHash64(test_string2, STRLEN(test_string2)); 61 | REQUIRE(hash1 != hash2); 62 | } 63 | 64 | TEST_CASE( "CityHash128: basic test", "[basic]" ) 65 | { 66 | const char test_string[] = "abracadabra"; 67 | uint128 hash = CityHash128(test_string, STRLEN(test_string)); 68 | uint128 outcome_shouldnt_be = std::make_pair(0,0); 69 | REQUIRE(hash != outcome_shouldnt_be); 70 | } 71 | 72 | TEST_CASE( "CityHash128: test different inputs", "[diff_inputs]" ) 73 | { 74 | const char test_string1[] = "abracadabr"; 75 | const char test_string2[] = "abracaaabra"; 76 | uint128 hash1 = CityHash128(test_string1, STRLEN(test_string1)); 77 | uint128 hash2 = CityHash128(test_string2, STRLEN(test_string2)); 78 | REQUIRE(hash1 != hash2); 79 | } 80 | 81 | TEST_CASE( "CityHash64WithSeed: basic test", "[basic]" ) 82 | { 83 | const char test_string[] = "abracadabra"; 84 | uint64 hash = CityHash64WithSeed(test_string, STRLEN(test_string), 0); 85 | REQUIRE(hash != 0); 86 | } 87 | 88 | TEST_CASE( "CityHash64WithSeed: test different seeds", "[diff_seeds]" ) 89 | { 90 | const char test_string[] = "abracadabra"; 91 | uint64 hash1 = CityHash64WithSeed(test_string, STRLEN(test_string), 0); 92 | uint64 hash2 = CityHash64WithSeed(test_string, STRLEN(test_string), 1); 93 | REQUIRE(hash1 != hash2); 94 | } 95 | 96 | TEST_CASE( "CityHash64WithSeed: test different inputs", "[diff_inputs]" ) 97 | { 98 | const char test_string1[] = "abracadabr"; 99 | const char test_string2[] = "abracaaabra"; 100 | uint64 hash1 = CityHash64WithSeed(test_string1, STRLEN(test_string1), 0); 101 | uint64 hash2 = CityHash64WithSeed(test_string2, 
STRLEN(test_string2), 0); 102 | REQUIRE(hash1 != hash2); 103 | } 104 | 105 | TEST_CASE( "CityHash64WithSeed: different outcome than CityHash64", "[compare]" ) 106 | { 107 | const char test_string[] = "abracadabra"; 108 | uint64 hash1 = CityHash64(test_string, STRLEN(test_string)); 109 | uint64 hash2 = CityHash64WithSeed(test_string, STRLEN(test_string), 0); 110 | REQUIRE(hash1 != hash2); 111 | } 112 | 113 | TEST_CASE( "CityHash64WithSeeds: basic test", "[basic]" ) 114 | { 115 | const char test_string[] = "abracadabra"; 116 | uint64 hash = CityHash64WithSeeds(test_string, STRLEN(test_string), 0, 0); 117 | REQUIRE(hash != 0); 118 | } 119 | 120 | TEST_CASE( "CityHash64WithSeeds: test different seeds", "[diff_seeds]" ) 121 | { 122 | const char test_string[] = "abracadabra"; 123 | uint64 hash1 = CityHash64WithSeeds(test_string, STRLEN(test_string), 0, 0); 124 | uint64 hash2 = CityHash64WithSeeds(test_string, STRLEN(test_string), 0, 1); 125 | REQUIRE(hash1 != hash2); 126 | } 127 | 128 | TEST_CASE( "CityHash64WithSeeds: test different inputs", "[diff_inputs]" ) 129 | { 130 | const char test_string1[] = "abracadabr"; 131 | const char test_string2[] = "abracaaabra"; 132 | uint64 hash1 = CityHash64WithSeeds(test_string1, STRLEN(test_string1), 0, 0); 133 | uint64 hash2 = CityHash64WithSeeds(test_string2, STRLEN(test_string2), 0, 0); 134 | REQUIRE(hash1 != hash2); 135 | } 136 | 137 | TEST_CASE( "CityHash64WithSeeds: different outcome than CityHash64WithSeed", "[compare]" ) 138 | { 139 | const char test_string[] = "abracadabra"; 140 | uint64 hash1 = CityHash64WithSeed(test_string, STRLEN(test_string), 0); 141 | uint64 hash2 = CityHash64WithSeeds(test_string, STRLEN(test_string), 0, 0); 142 | REQUIRE(hash1 != hash2); 143 | } 144 | 145 | TEST_CASE( "CityHash128WithSeed: basic test", "[basic]" ) 146 | { 147 | const char test_string[] = "abracadabra"; 148 | uint128 seed = std::make_pair(0,0); 149 | uint128 hash = CityHash128WithSeed(test_string, STRLEN(test_string), seed); 150 | 
REQUIRE(hash.second != 0); 151 | } 152 | 153 | TEST_CASE( "CityHash128WithSeed: test different inputs", "[diff_inputs]" ) 154 | { 155 | const char test_string1[] = "abracadabr"; 156 | const char test_string2[] = "abracaaabra"; 157 | uint128 seed = std::make_pair(0,0); 158 | uint128 hash1 = CityHash128WithSeed(test_string1, STRLEN(test_string1), seed); 159 | uint128 hash2 = CityHash128WithSeed(test_string2, STRLEN(test_string2), seed); 160 | REQUIRE(hash1 != hash2); 161 | } 162 | 163 | TEST_CASE( "CityHash128WithSeed: test different seeds", "[diff_seeds]" ) 164 | { 165 | const char test_string[] = "abracadabra"; 166 | uint128 seed1 = std::make_pair(0,0); 167 | uint128 seed2 = std::make_pair(0,1); 168 | uint128 hash1 = CityHash128WithSeed(test_string, STRLEN(test_string), seed1); 169 | uint128 hash2 = CityHash128WithSeed(test_string, STRLEN(test_string), seed2); 170 | REQUIRE(hash1 != hash2); 171 | } 172 | 173 | TEST_CASE( "CityHash128WithSeed: different outcome than CityHash128", "[compare]" ) 174 | { 175 | const char test_string[] = "abracadabr"; 176 | uint128 seed = std::make_pair(0,0); 177 | uint128 hash1 = CityHash128WithSeed(test_string, STRLEN(test_string), seed); 178 | uint128 hash2 = CityHash128(test_string, STRLEN(test_string)); 179 | REQUIRE(hash1 != hash2); 180 | } 181 | -------------------------------------------------------------------------------- /tests/test_cityhash.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python-based tests for cityhash extension 3 | """ 4 | import random 5 | import string 6 | import sys 7 | import unittest 8 | 9 | from cityhash import ( 10 | CityHash32, 11 | CityHash64, 12 | CityHash64WithSeed, 13 | CityHash64WithSeeds, 14 | CityHash128, 15 | CityHash128WithSeed, 16 | ) 17 | 18 | 19 | EMPTY_STRING = "" 20 | EMPTY_UNICODE = u"" # pylint: disable=redundant-u-string-prefix 21 | 22 | 23 | if sys.version_info[0] >= 3: 24 | long = int 25 | 26 | 27 | def random_string(n, 
alphabet=string.ascii_lowercase): 28 | """generate a random string""" 29 | return "".join(random.choice(alphabet) for _ in range(n)) 30 | 31 | 32 | def random_splits(s, n, nsplits=2): 33 | """split string in random places""" 34 | splits = sorted([random.randint(0, n) for _ in range(nsplits - 1)]) 35 | splits = [0] + splits + [n] 36 | for begin, end in zip(splits, splits[1:]): 37 | yield s[begin:end] 38 | 39 | 40 | class TestUnicode(unittest.TestCase): 41 | 42 | """test unicode-related properties (deprecated in Python 3)""" 43 | 44 | def test_string_unicode_32(self): 45 | """Empty Python string has same hash value as empty Unicode string""" 46 | self.assertEqual(CityHash32(EMPTY_STRING), CityHash32(EMPTY_UNICODE)) 47 | 48 | def test_string_unicode_64(self): 49 | """Empty Python string has same hash value as empty Unicode string""" 50 | self.assertEqual( 51 | CityHash64WithSeed(EMPTY_STRING), CityHash64WithSeed(EMPTY_UNICODE) 52 | ) 53 | 54 | def test_string_unicode_128(self): 55 | """Empty Python string has same hash value as empty Unicode string""" 56 | self.assertEqual( 57 | CityHash128WithSeed(EMPTY_STRING), CityHash128WithSeed(EMPTY_UNICODE) 58 | ) 59 | 60 | def test_consistent_encoding_32(self): 61 | """ASCII-range Unicode strings have the same hash values as ASCII strings""" 62 | text = u"abracadabra" # pylint: disable=redundant-u-string-prefix 63 | self.assertEqual(CityHash32(text), CityHash32(text.encode("utf-8"))) 64 | 65 | def test_consistent_encoding_64(self): 66 | """ASCII-range Unicode strings have the same hash values as ASCII strings""" 67 | text = u"abracadabra" # pylint: disable=redundant-u-string-prefix 68 | self.assertEqual( 69 | CityHash64WithSeed(text), CityHash64WithSeed(text.encode("utf-8")) 70 | ) 71 | 72 | def test_consistent_encoding_128(self): 73 | """ASCII-range Unicode strings have the same hash values as ASCII strings""" 74 | text = u"abracadabra" # pylint: disable=redundant-u-string-prefix 75 | self.assertEqual( 76 | 
CityHash128WithSeed(text), CityHash128WithSeed(text.encode("utf-8")) 77 | ) 78 | 79 | def test_unicode_1_32(self): 80 | """Accepts Unicode input""" 81 | test_case = u"abc" # pylint: disable=redundant-u-string-prefix 82 | self.assertTrue(isinstance(CityHash32(test_case), int)) 83 | 84 | def test_unicode_1_64(self): 85 | """Accepts Unicode input""" 86 | test_case = u"abc" # pylint: disable=redundant-u-string-prefix 87 | self.assertTrue(isinstance(CityHash64WithSeed(test_case), long)) 88 | 89 | def test_unicode_1_128(self): 90 | """Accepts Unicode input""" 91 | test_case = u"abc" # pylint: disable=redundant-u-string-prefix 92 | self.assertTrue(isinstance(CityHash128WithSeed(test_case), long)) 93 | 94 | def test_unicode_2_32(self): 95 | """Accepts Unicode input outside of ASCII range""" 96 | test_case = u"\u2661" # pylint: disable=redundant-u-string-prefix 97 | self.assertTrue(isinstance(CityHash32(test_case), int)) 98 | 99 | def test_unicode_2_64(self): 100 | """Accepts Unicode input outside of ASCII range""" 101 | test_case = u"\u2661" # pylint: disable=redundant-u-string-prefix 102 | self.assertTrue(isinstance(CityHash64WithSeed(test_case), long)) 103 | 104 | def test_unicode_2_128(self): 105 | """Accepts Unicode input outside of ASCII range""" 106 | test_case = u"\u2661" # pylint: disable=redundant-u-string-prefix 107 | self.assertTrue(isinstance(CityHash128WithSeed(test_case), long)) 108 | 109 | def test_unicode_2_128_seed(self): 110 | """Accepts Unicode input outside of ASCII range""" 111 | test_case = u"\u2661" # pylint: disable=redundant-u-string-prefix 112 | result = CityHash128WithSeed(test_case, seed=CityHash128WithSeed(test_case)) 113 | self.assertTrue(isinstance(result, long)) 114 | 115 | 116 | class TestProperties(unittest.TestCase): 117 | 118 | """test various properties""" 119 | 120 | def test_argument_types(self): 121 | """Should accept byte arrays and buffers""" 122 | funcs = [ 123 | CityHash32, 124 | CityHash64, 125 | CityHash128, 126 | 
CityHash64WithSeed, 127 | CityHash64WithSeeds, 128 | CityHash128WithSeed, 129 | ] 130 | args = [b"ab\x00c", bytearray(b"ab\x00c"), memoryview(b"ab\x00c")] 131 | for func in funcs: 132 | values = set(func(arg) for arg in args) 133 | self.assertEqual(len(values), 1, values) 134 | 135 | def test_refcounts(self): 136 | """Argument reference count should not change""" 137 | funcs = [ 138 | CityHash32, 139 | CityHash64, 140 | CityHash128, 141 | CityHash64WithSeed, 142 | CityHash64WithSeeds, 143 | CityHash128WithSeed, 144 | ] 145 | args = ["abc", b"abc", bytearray(b"def"), memoryview(b"ghi")] 146 | for func in funcs: 147 | for arg in args: 148 | old_refcount = sys.getrefcount(arg) 149 | func(arg) 150 | self.assertEqual(sys.getrefcount(arg), old_refcount) 151 | 152 | def test_different_seeds(self): 153 | """Different seeds should produce different results""" 154 | 155 | test_string = "just a string" 156 | 157 | funcs = [ 158 | CityHash64WithSeed, 159 | CityHash64WithSeeds, 160 | CityHash128WithSeed, 161 | ] 162 | 163 | for func in funcs: 164 | self.assertNotEqual(func(test_string, 0), func(test_string, 1)) 165 | 166 | def test_func_raises_type_error(self): 167 | """Raises type error on bad argument type""" 168 | funcs = [ 169 | CityHash32, 170 | CityHash64, 171 | CityHash128, 172 | CityHash64WithSeed, 173 | CityHash64WithSeeds, 174 | CityHash128WithSeed, 175 | ] 176 | for func in funcs: 177 | with self.assertRaises(TypeError): 178 | func([]) 179 | -------------------------------------------------------------------------------- /tests/test_cityhashcrc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python-based tests for cityhash extension 3 | """ 4 | import random 5 | import string 6 | import sys 7 | import unittest 8 | 9 | try: 10 | from cityhashcrc import ( 11 | CityHashCrc128, 12 | CityHashCrc128WithSeed, 13 | CityHashCrc256Bytes, 14 | ) 15 | 16 | HAVE_CRC_MODULE = True 17 | except Exception: 18 | HAVE_CRC_MODULE = False 
def random_string(n, alphabet=string.ascii_lowercase):
    """Return a random string of ``n`` characters drawn from ``alphabet``."""
    return "".join(random.choice(alphabet) for _ in range(n))


def random_splits(s, n, nsplits=2):
    """Yield ``nsplits`` consecutive slices of ``s`` cut at random offsets.

    :param s: sequence to split
    :param n: upper bound for the cut offsets and end offset of the last
        slice (presumably ``len(s)`` -- confirm against callers)
    :param nsplits: number of slices to produce
    :return: generator over the slices, in order; slices may be empty
    """
    cuts = sorted(random.randint(0, n) for _ in range(nsplits - 1))
    bounds = [0] + cuts + [n]
    for begin, end in zip(bounds, bounds[1:]):
        yield s[begin:end]


class TestProperties(unittest.TestCase):

    """Test general properties of the optional CRC-based hash functions.

    The function lists are built inside each test (not at module level)
    because the names only exist when the optional import succeeded.
    """

    @classmethod
    def setUpClass(cls):
        """Skip the whole class when the optional CRC module is unavailable."""
        if not HAVE_CRC_MODULE:
            raise unittest.SkipTest("failed to import optional CRC module")

    def test_argument_types(self):
        """Should accept byte arrays and buffers"""
        funcs = [CityHashCrc128, CityHashCrc128WithSeed, CityHashCrc256Bytes]
        args = [b"ab\x00c", bytearray(b"ab\x00c"), memoryview(b"ab\x00c")]
        for func in funcs:
            # One subtest per function so a failure names the culprit and
            # the remaining functions are still exercised under unittest.
            with self.subTest(func=func.__name__):
                values = {func(arg) for arg in args}
                self.assertEqual(len(values), 1, values)

    def test_refcounts(self):
        """Argument reference count should not change"""
        funcs = [CityHashCrc128, CityHashCrc128WithSeed, CityHashCrc256Bytes]
        args = ["abc", b"abc", bytearray(b"def"), memoryview(b"ghi")]
        for func in funcs:
            for arg in args:
                # Label by type name only, so the subtest bookkeeping never
                # holds the object whose reference count is being measured.
                with self.subTest(
                    func=func.__name__, arg_type=type(arg).__name__
                ):
                    old_refcount = sys.getrefcount(arg)
                    func(arg)
                    self.assertEqual(
                        sys.getrefcount(arg), old_refcount, func.__name__
                    )

    def test_different_seeds(self):
        """Different seeds should produce different results"""

        test_string = "just a string"

        funcs = [
            CityHashCrc128WithSeed,
        ]

        for func in funcs:
            with self.subTest(func=func.__name__):
                self.assertNotEqual(
                    func(test_string, 0), func(test_string, 1), func.__name__
                )

    def test_func_raises_type_error(self):
        """Raises type error on bad argument type"""
        funcs = [CityHashCrc128, CityHashCrc128WithSeed, CityHashCrc256Bytes]
        for func in funcs:
            with self.subTest(func=func.__name__):
                with self.assertRaises(TypeError):
                    func([])
-------------------------------------------------------------------------------- /tests/test_farmhash.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python-based tests for farmhash extension 3 | """ 4 | import random 5 | import string 6 | import sys 7 | import unittest 8 | 9 | from farmhash import ( 10 | FarmHash32, 11 | FarmHash32WithSeed, 12 | FarmHash64, 13 | FarmHash64WithSeed, 14 | FarmHash64WithSeeds, 15 | FarmHash128, 16 | FarmHash128WithSeed, 17 | Fingerprint32, 18 | Fingerprint64, 19 | Fingerprint128, 20 | ) 21 | 22 | 23 | EMPTY_STRING = "" 24 | EMPTY_UNICODE = u"" # pylint: disable=redundant-u-string-prefix 25 | 26 | 27 | if sys.version_info[0] >= 3: 28 | long = int 29 | 30 | 31 | def random_string(n, alphabet=string.ascii_lowercase): 32 | """generate a random string""" 33 | return "".join(random.choice(alphabet) for _ in range(n)) 34 | 35 | 36 | def random_splits(s, n, nsplits=2): 37 | """split string in random places""" 38 | splits = sorted([random.randint(0, n) for _ in range(nsplits - 1)]) 39 | splits = [0] + splits + [n] 40 | for begin, end in zip(splits, splits[1:]): 41 | yield s[begin:end] 42 | 43 | 44 | class TestUnicode(unittest.TestCase): 45 | 46 | """test unicode-related properties (deprecated in Python 3)""" 47 | 48 | def test_string_unicode_32(self): 49 | """Empty Python string has same hash value as empty Unicode string""" 50 | self.assertEqual(FarmHash32(EMPTY_STRING), FarmHash32(EMPTY_UNICODE)) 51 | 52 | def test_string_unicode_64(self): 53 | """Empty Python string has same hash value as empty Unicode string""" 54 | self.assertEqual( 55 | FarmHash64WithSeed(EMPTY_STRING), FarmHash64WithSeed(EMPTY_UNICODE) 56 | ) 57 | 58 | def test_string_unicode_128(self): 59 | """Empty Python string has same hash value as empty Unicode string""" 60 | self.assertEqual( 61 | FarmHash128WithSeed(EMPTY_STRING), FarmHash128WithSeed(EMPTY_UNICODE) 62 | ) 63 | 64 | def test_consistent_encoding_32(self): 65 | 
"""ASCII-range Unicode strings have the same hash values as ASCII strings""" 66 | text = u"abracadabra" # pylint: disable=redundant-u-string-prefix 67 | self.assertEqual(FarmHash32(text), FarmHash32(text.encode("utf-8"))) 68 | 69 | def test_consistent_encoding_64(self): 70 | """ASCII-range Unicode strings have the same hash values as ASCII strings""" 71 | text = u"abracadabra" # pylint: disable=redundant-u-string-prefix 72 | self.assertEqual( 73 | FarmHash64WithSeed(text), FarmHash64WithSeed(text.encode("utf-8")) 74 | ) 75 | 76 | def test_consistent_encoding_128(self): 77 | """ASCII-range Unicode strings have the same hash values as ASCII strings""" 78 | text = u"abracadabra" # pylint: disable=redundant-u-string-prefix 79 | self.assertEqual( 80 | FarmHash128WithSeed(text), FarmHash128WithSeed(text.encode("utf-8")) 81 | ) 82 | 83 | def test_unicode_1_32(self): 84 | """Accepts Unicode input""" 85 | test_case = u"abc" # pylint: disable=redundant-u-string-prefix 86 | self.assertTrue(isinstance(FarmHash32(test_case), int)) 87 | 88 | def test_unicode_1_64(self): 89 | """Accepts Unicode input""" 90 | test_case = u"abc" # pylint: disable=redundant-u-string-prefix 91 | self.assertTrue(isinstance(FarmHash64WithSeed(test_case), long)) 92 | 93 | def test_unicode_1_128(self): 94 | """Accepts Unicode input""" 95 | test_case = u"abc" # pylint: disable=redundant-u-string-prefix 96 | self.assertTrue(isinstance(FarmHash128WithSeed(test_case), long)) 97 | 98 | def test_unicode_2_32(self): 99 | """Accepts Unicode input outside of ASCII range""" 100 | test_case = u"\u2661" # pylint: disable=redundant-u-string-prefix 101 | self.assertTrue(isinstance(FarmHash32(test_case), int)) 102 | 103 | def test_unicode_2_64(self): 104 | """Accepts Unicode input outside of ASCII range""" 105 | test_case = u"\u2661" # pylint: disable=redundant-u-string-prefix 106 | self.assertTrue(isinstance(FarmHash64WithSeed(test_case), long)) 107 | 108 | def test_unicode_2_128(self): 109 | """Accepts Unicode input 
outside of ASCII range""" 110 | test_case = u"\u2661" # pylint: disable=redundant-u-string-prefix 111 | self.assertTrue(isinstance(FarmHash128WithSeed(test_case), long)) 112 | 113 | def test_unicode_2_128_seed(self): 114 | """Accepts Unicode input outside of ASCII range""" 115 | test_case = u"\u2661" # pylint: disable=redundant-u-string-prefix 116 | result = FarmHash128WithSeed(test_case, seed=FarmHash128WithSeed(test_case)) 117 | self.assertTrue(isinstance(result, long)) 118 | 119 | 120 | class TestFingerprints(unittest.TestCase): 121 | 122 | """Fingerprints should be the same across platforms""" 123 | 124 | def test_fingerprint32(self): 125 | """test 32-bit fingerprint""" 126 | test_string = "abc" 127 | self.assertEqual(Fingerprint32(test_string), 795041479) 128 | 129 | def test_fingerprint64(self): 130 | """test 64-bit fingerprint""" 131 | test_string = "abc" 132 | self.assertEqual(Fingerprint64(test_string), 2640714258260161385) 133 | 134 | def test_fingerprint128(self): 135 | """test 128-bit fingerprint""" 136 | test_string = "abc" 137 | self.assertEqual( 138 | Fingerprint128(test_string), 76434233956484675513733017140465933893 139 | ) 140 | 141 | 142 | class TestProperties(unittest.TestCase): 143 | 144 | """test various properties""" 145 | 146 | def test_argument_types(self): 147 | """Should accept byte arrays and buffers""" 148 | funcs = [ 149 | FarmHash32, 150 | FarmHash64, 151 | FarmHash128, 152 | FarmHash32WithSeed, 153 | FarmHash64WithSeed, 154 | FarmHash64WithSeeds, 155 | FarmHash128WithSeed, 156 | Fingerprint32, 157 | Fingerprint64, 158 | Fingerprint128, 159 | ] 160 | args = [b"ab\x00c", bytearray(b"ab\x00c"), memoryview(b"ab\x00c")] 161 | for func in funcs: 162 | values = set(func(arg) for arg in args) 163 | self.assertEqual(len(values), 1, values) 164 | 165 | def test_refcounts(self): 166 | """Argument reference count should not change""" 167 | funcs = [ 168 | FarmHash32, 169 | FarmHash64, 170 | FarmHash128, 171 | FarmHash32WithSeed, 172 | 
FarmHash64WithSeed, 173 | FarmHash64WithSeeds, 174 | FarmHash128WithSeed, 175 | Fingerprint32, 176 | Fingerprint64, 177 | Fingerprint128, 178 | ] 179 | args = ["abc", b"abc", bytearray(b"def"), memoryview(b"ghi")] 180 | for func in funcs: 181 | for arg in args: 182 | old_refcount = sys.getrefcount(arg) 183 | func(arg) 184 | self.assertEqual(sys.getrefcount(arg), old_refcount) 185 | 186 | def test_different_seeds(self): 187 | """Different seeds should produce different results""" 188 | 189 | test_string = "just a string" 190 | 191 | funcs = [ 192 | FarmHash32WithSeed, 193 | FarmHash64WithSeed, 194 | FarmHash64WithSeeds, 195 | FarmHash128WithSeed, 196 | ] 197 | 198 | for func in funcs: 199 | self.assertNotEqual(func(test_string, 0), func(test_string, 1)) 200 | 201 | def test_func_raises_type_error(self): 202 | """Raises type error on bad argument type""" 203 | funcs = [ 204 | FarmHash32, 205 | FarmHash32WithSeed, 206 | FarmHash64, 207 | FarmHash128, 208 | FarmHash64WithSeed, 209 | FarmHash64WithSeeds, 210 | FarmHash128WithSeed, 211 | Fingerprint32, 212 | Fingerprint64, 213 | Fingerprint128, 214 | ] 215 | for func in funcs: 216 | with self.assertRaises(TypeError): 217 | func([]) 218 | --------------------------------------------------------------------------------