├── .github └── workflows │ ├── build.yml │ └── publish.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── cpp.mk ├── pip-freeze.txt ├── pyproject.toml ├── python.mk ├── requirements.txt ├── setup.py ├── src ├── metro.h ├── metrohash.cpp ├── metrohash.h ├── metrohash.pyx ├── metrohash128.cc ├── metrohash128.h ├── metrohash128crc.cc ├── metrohash128crc.h ├── metrohash64.cc ├── metrohash64.h └── platform.h └── tests ├── catch.hpp ├── metrohash64_main.cc ├── test_metrohash.cc └── test_metrohash.py /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | types: 9 | - opened 10 | - synchronize 11 | - reopened 12 | 13 | jobs: 14 | build: 15 | strategy: 16 | matrix: 17 | os: [windows-latest, macos-latest, ubuntu-latest] 18 | python-version: ["3.11"] 19 | 20 | runs-on: ${{ matrix.os }} 21 | steps: 22 | - uses: actions/checkout@v2 23 | 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | architecture: x64 29 | 30 | # block below based on: 31 | # https://medium.com/ai2-blog/python-caching-in-github-actions-e9452698e98d 32 | - uses: actions/cache@v3 33 | with: 34 | path: ${{ env.pythonLocation }} 35 | key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('pip-freeze.txt') }} 36 | 37 | - name: Install dependencies 38 | run: | 39 | pip install --upgrade --upgrade-strategy eager setuptools wheel 40 | pip install --upgrade --upgrade-strategy eager -r requirements.txt 41 | pip freeze > pip-freeze.txt 42 | 43 | - name: Test with pytest 44 | run: | 45 | python setup.py build_ext --inplace 46 | pip install -e . 
47 | python -m pytest 48 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: publish 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | repository: 7 | description: 'The repository to upload the package to' 8 | required: true 9 | default: 'testpypi' 10 | 11 | jobs: 12 | build_wheels: 13 | name: Build wheels on ${{ matrix.os }} 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | os: [ubuntu-20.04, windows-2019, macOS-10.15] 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v2 21 | - name: Set up Python 22 | uses: actions/setup-python@v4 23 | with: 24 | python-version: '3.9' 25 | - name: Set up QEMU 26 | if: runner.os == 'Linux' 27 | # uses: docker/setup-qemu-action@v1.0.1 28 | uses: docker/setup-qemu-action@v2 29 | with: 30 | platforms: arm64 31 | - name: Build wheels 32 | # uses: joerick/cibuildwheel@v1.9.0 33 | uses: pypa/cibuildwheel@v2.11.2 34 | with: 35 | output-dir: wheelhouse 36 | env: 37 | CIBW_BUILD: '{cp36,cp37,cp38,cp39,cp310,cp311}-{manylinux_x86_64,manylinux_aarch64,win32,win_amd64,macosx_x86_64} {cp39,cp310,cp311}-macosx_arm64' 38 | CIBW_MANYLINUX_AARCH64_IMAGE: manylinux2014 39 | CIBW_ARCHS_LINUX: 'auto aarch64' 40 | CIBW_ARCHS_MACOS: 'auto arm64' 41 | CIBW_TEST_REQUIRES: pytest 42 | CIBW_TEST_COMMAND: 'pytest -s {project}/tests' 43 | CIBW_TEST_SKIP: '*-macosx_arm64' # Until the day Apple silicon instances are available on GitHub Actions 44 | - uses: actions/upload-artifact@v3 45 | with: 46 | path: ./wheelhouse/*.whl 47 | build_sdist: 48 | name: Build a source distribution 49 | runs-on: ubuntu-20.04 50 | steps: 51 | - name: Checkout 52 | uses: actions/checkout@v2 53 | - name: Set up Python 54 | uses: actions/setup-python@v4 55 | with: 56 | python-version: '3.9' 57 | - name: Build sdist 58 | run: | 59 | pip install py-cpuinfo 60 | python setup.py build sdist 61 | - uses: 
actions/upload-artifact@v3 62 | with: 63 | path: dist/*.tar.gz 64 | publish: 65 | name: 'Upload to PyPI/TestPyPI' 66 | runs-on: ubuntu-20.04 67 | needs: [build_wheels, build_sdist] 68 | steps: 69 | - name: Set up Python 70 | uses: actions/setup-python@v4 71 | with: 72 | python-version: '3.9' 73 | - name: Set up built items 74 | uses: actions/download-artifact@v3 75 | with: 76 | name: artifact 77 | path: dist 78 | - name: Install dependencies 79 | run: | 80 | python -m pip install --upgrade pip 81 | pip install setuptools wheel twine 82 | - name: Publish 83 | env: 84 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 85 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 86 | run: | 87 | twine upload --verbose --repository ${{ github.event.inputs.repository }} dist/* 88 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | *.zip 8 | 9 | # Misc artifacts 10 | .DS_Store 11 | tags 12 | .cache/ 13 | *.sw[op] 14 | 15 | # Exclude these directories 16 | bin/ 17 | data/ 18 | wheelhouse/ 19 | 20 | # Distribution / packaging 21 | .Python 22 | env/ 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *,cover 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # IDE and editor stuff 69 | .idea/ 70 | .vscode/ 71 | .ropeproject/ 72 | .ipynb_checkpoints/ 73 | .pytest_cache/ 74 | .mypy_cache/ 75 | 76 | # Makefile artifacts 77 | .build_stamp 78 | 79 | # Python 3.11 80 | lib64 81 | pyvenv.cfg 82 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | recursive-include src *.h 4 | recursive-include src *.cc 5 | recursive-include src *.cpp 6 | recursive-include src *.pyx 7 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /usr/bin/env bash -c 2 | 3 | MAKEFLAGS += --warn-undefined-variables 4 | MAKEFLAGS += --no-builtin-rules 5 | .DEFAULT_GOAL := help 6 | .SUFFIXES: 7 | 8 | include python.mk 9 | include cpp.mk 10 | 11 | .PHONY: help 12 | help: ## show this message and exit 13 | @grep -E '^[a-zA-Z_0-9%-]+:.*?## .*$$' $(MAKEFILE_LIST) \ 14 | | awk -F':' '{print $$(NF-1)":"$$NF}' | sort \ 15 | | awk 'BEGIN {FS = ":.*?## "}; {printf "$(BOLD)%-24s$(END) %s\n", $$1, $$2}' 16 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MetroHash 2 | 3 | Python wrapper for [MetroHash](https://github.com/jandrewrogers/MetroHash), a 4 | fast non-cryptographic hash function. 5 | 6 | [![Build Status](https://img.shields.io/github/actions/workflow/status/escherba/python-metrohash/build.yml?branch=master)](https://github.com/escherba/python-metrohash/actions/workflows/build.yml) 7 | [![Latest 8 | Version](https://img.shields.io/pypi/v/metrohash.svg)](https://pypi.python.org/pypi/metrohash) 9 | [![Downloads](https://img.shields.io/pypi/dm/metrohash.svg)](https://pypistats.org/packages/metrohash) 10 | [![License](https://img.shields.io/pypi/l/metrohash.svg)](https://opensource.org/licenses/Apache-2.0) 11 | [![Supported Python 12 | versions](https://img.shields.io/pypi/pyversions/metrohash.svg)](https://pypi.python.org/pypi/metrohash) 13 | 14 | ## Getting Started 15 | 16 | To use this package in your program, simply type 17 | 18 | ``` bash 19 | pip install metrohash 20 | ``` 21 | 22 | After that, you should be able to import the module and do things with 23 | it (see usage example below). 24 | 25 | ## Usage Examples 26 | 27 | ### Stateless hashing 28 | 29 | This package provides Python interfaces to 64- and 128-bit 30 | implementations of MetroHash algorithm. For stateless hashing, it 31 | exports `metrohash64` and `metrohash128` functions. Both take a value to 32 | be hashed and an optional `seed` parameter: 33 | 34 | ``` python 35 | >>> import metrohash 36 | ... 37 | >>> metrohash.hash64_int("abc", seed=0) 38 | 17099979927131455419 39 | >>> metrohash.hash128_int("abc") 40 | 182995299641628952910564950850867298725 41 | 42 | ``` 43 | 44 | ### Incremental hashing 45 | 46 | Unlike its cousins CityHash and FarmHash, MetroHash allows incremental 47 | (stateful) hashing. 
For incremental hashing, use `MetroHash64` and 48 | `MetroHash128` classes. Incremental hashing is associative and 49 | guarantees that any combination of input slices will result in the same 50 | final hash value. This is useful for processing large inputs and stream 51 | data. Example with two slices: 52 | 53 | ``` python 54 | >>> mh = metrohash.MetroHash64() 55 | >>> mh.update("Nobody inspects") 56 | >>> mh.update(" the spammish repetition") 57 | >>> mh.intdigest() 58 | 7851180100622203313 59 | 60 | ``` 61 | 62 | The resulting hash value above should be the same as in: 63 | 64 | ``` python 65 | >>> mh = metrohash.MetroHash64() 66 | >>> mh.update("Nobody inspects the spammish repetition") 67 | >>> mh.intdigest() 68 | 7851180100622203313 69 | 70 | ``` 71 | 72 | ### Fast hashing of NumPy arrays 73 | 74 | The Python [Buffer 75 | Protocol](https://docs.python.org/3/c-api/buffer.html) allows Python 76 | objects to expose their data as raw byte arrays to other objects, for 77 | fast access without copying to a separate location in memory. Among 78 | others, NumPy is a major framework that supports this protocol. 79 | 80 | All hashing functions in this package will read byte arrays from objects 81 | that expose them via the buffer protocol. Here is an example showing 82 | hashing of a 3D NumPy array: 83 | 84 | ``` python 85 | >>> import numpy as np 86 | >>> arr = np.zeros((256, 256, 4)) 87 | >>> metrohash.hash64_int(arr) 88 | 12125832280816116063 89 | 90 | ``` 91 | 92 | The arrays need to be contiguous for this to work. To convert a 93 | non-contiguous array, use NumPy's `ascontiguousarray()` function. 
94 | 95 | ## Development 96 | 97 | ### Local workflow 98 | 99 | For those who want to contribute, here is a quick start using some 100 | makefile commands: 101 | 102 | ``` bash 103 | git clone https://github.com/escherba/python-metrohash.git 104 | cd python-metrohash 105 | make env # create a virtual environment 106 | make test # run Python tests 107 | make cpp-test # run C++ tests 108 | make shell # enter IPython shell 109 | ``` 110 | 111 | To find out which Make targets are available, type: 112 | 113 | ``` bash 114 | make help 115 | ``` 116 | 117 | ### Distribution 118 | 119 | The wheels are built using [cibuildwheel](https://cibuildwheel.readthedocs.io/) 120 | and are distributed to PyPI using GitHub Actions. The wheels contain compiled 121 | binaries and are available for the following platforms: windows-amd64, 122 | ubuntu-x86, linux-x86\_64, linux-aarch64, and macosx-x86\_64. 123 | 124 | ## See Also 125 | 126 | For other fast non-cryptographic hash functions available as Python 127 | extensions, see [FarmHash](https://github.com/escherba/python-cityhash) 128 | and [MurmurHash](https://github.com/hajimes/mmh3). 129 | 130 | ## Authors 131 | 132 | The MetroHash algorithm and C++ implementation are due to J. Andrew 133 | Rogers. The Python bindings for it were written by Eugene Scherba. 134 | 135 | ## License 136 | 137 | This software is licensed under the [Apache License, 138 | Version 2.0](https://opensource.org/licenses/Apache-2.0). See the 139 | included LICENSE file for details. 
140 | -------------------------------------------------------------------------------- /cpp.mk: -------------------------------------------------------------------------------- 1 | CXX := g++ 2 | CXXFLAGS := -std=c++11 -O3 -msse4.2 3 | LDFLAGS := 4 | SRCEXT := cc 5 | INC := -I src 6 | LIB := -L lib 7 | 8 | INPUT := ./data/sample_100k.txt 9 | 10 | BINDIR := bin 11 | SRCDIR := src 12 | TESTDIR := tests 13 | BUILDDIR := build 14 | ALL_SOURCES := $(wildcard $(SRCDIR)/*.$(SRCEXT) $(TESTDIR)/*.$(SRCEXT)) 15 | 16 | RUN_SOURCES := $(wildcard $(SRCDIR)/*_main.$(SRCEXT) $(TESTDIR)/*_main.$(SRCEXT)) 17 | RUN_OBJECTS := $(patsubst %, $(BUILDDIR)/%, $(RUN_SOURCES:.$(SRCEXT)=.o)) 18 | RUN_TARGETS := $(patsubst $(BUILDDIR)/%.o, $(BINDIR)/%, $(RUN_OBJECTS)) 19 | 20 | TEST_SOURCES := $(wildcard $(TESTDIR)/test_*.$(SRCEXT)) 21 | TEST_OBJECTS := $(patsubst %, $(BUILDDIR)/%, $(TEST_SOURCES:.$(SRCEXT)=.o)) 22 | TEST_TARGETS := $(patsubst $(BUILDDIR)/%.o, $(BINDIR)/%, $(TEST_OBJECTS)) 23 | 24 | SOURCES := $(filter-out $(RUN_SOURCES) $(TEST_SOURCES), $(ALL_SOURCES)) 25 | OBJECTS := $(patsubst %, $(BUILDDIR)/%, $(SOURCES:.$(SRCEXT)=.o)) 26 | 27 | .SECONDARY: $(RUN_OBJECTS) $(TEST_OBJECTS) $(OBJECTS) 28 | 29 | $(BUILDDIR)/%.o: %.$(SRCEXT) 30 | @mkdir -p $(dir $@) 31 | $(CC) $(INC) $(CXXFLAGS) -c $< -o $@ 32 | 33 | $(BINDIR)/%: $(BUILDDIR)/%.o $(OBJECTS) 34 | @mkdir -p $(dir $@) 35 | $(CXX) $(LIB) $(LDFLAGS) $^ -o $@ 36 | 37 | .PHONY: cpp-clean 38 | cpp-clean: ## clean up C++ project 39 | rm -rf ./$(BINDIR)/ ./$(BUILDDIR)/ 40 | 41 | .PHONY: cpp-run 42 | cpp-run: $(RUN_TARGETS) ## compile and run C++ program 43 | @for target in $(RUN_TARGETS); do \ 44 | echo $$target >&2; \ 45 | time ./$$target $(INPUT); \ 46 | done 47 | 48 | .PHONY: cpp-test 49 | cpp-test: $(TEST_TARGETS) ## run C++ tests 50 | @for target in $(TEST_TARGETS); do \ 51 | echo $$target >&2; \ 52 | ./$$target; \ 53 | done 54 | -------------------------------------------------------------------------------- /pip-freeze.txt: 
-------------------------------------------------------------------------------- 1 | appnope==0.1.3 2 | asttokens==2.2.1 3 | attrs==22.1.0 4 | backcall==0.2.0 5 | Cython==0.29.32 6 | decorator==5.1.1 7 | exceptiongroup==1.0.4 8 | executing==1.2.0 9 | iniconfig==1.1.1 10 | ipdb==0.13.11 11 | ipython==8.7.0 12 | jedi==0.18.2 13 | matplotlib-inline==0.1.6 14 | numpy==1.23.5 15 | packaging==22.0 16 | parso==0.8.3 17 | pexpect==4.8.0 18 | pickleshare==0.7.5 19 | pluggy==1.0.0 20 | prompt-toolkit==3.0.36 21 | ptyprocess==0.7.0 22 | pure-eval==0.2.2 23 | py-cpuinfo==9.0.0 24 | Pygments==2.13.0 25 | pytest==7.2.0 26 | six==1.16.0 27 | stack-data==0.6.2 28 | tomli==2.0.1 29 | traitlets==5.7.1 30 | wcwidth==0.2.5 31 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "setuptools.build_meta" 3 | requires = [ 4 | "Cython", 5 | "py-cpuinfo", 6 | "setuptools", 7 | "wheel", 8 | ] 9 | 10 | [tool.pytest.ini_options] 11 | addopts = "-s --doctest-modules" 12 | testpaths = [ 13 | "src", 14 | "tests", 15 | ] 16 | 17 | [tool.cibuildwheel] 18 | test-requires = "pytest" 19 | -------------------------------------------------------------------------------- /python.mk: -------------------------------------------------------------------------------- 1 | PYMODULE := metrohash 2 | EXTENSION := $(PYMODULE).so 3 | SRC_DIR := src 4 | PYPI_URL := https://test.pypi.org/legacy/ 5 | EXTENSION_DEPS := $(shell find $(SRC_DIR) -type f -name "*.pyx") 6 | EXTENSION_INTERMEDIATE := $(patsubst %.pyx,%.cpp,$(EXTENSION_DEPS)) 7 | EXTENSION_OBJS := $(patsubst %.pyx,%.so,$(EXTENSION_DEPS)) 8 | 9 | BUILD_STAMP = .build_stamp 10 | ENV_STAMP = env/bin/activate 11 | 12 | DISTRIBUTE := sdist bdist_wheel 13 | 14 | PYENV := PYTHONPATH=. . 
env/bin/activate; 15 | INTERPRETER := python3 16 | PACKAGE_MGR := pip3 17 | PYVERSION := $(shell $(INTERPRETER) --version 2>&1) 18 | PYTHON := $(PYENV) $(INTERPRETER) 19 | PIP := $(PYENV) $(PACKAGE_MGR) 20 | 21 | VENV_OPTS := "" 22 | ifeq ($(PIP_SYSTEM_SITE_PACKAGES),1) 23 | VENV_OPTS += --system-site-packages 24 | endif 25 | 26 | BOLD := $(shell tput bold) 27 | END := $(shell tput sgr0) 28 | 29 | .PHONY: package 30 | package: $(DISTRIBUTE) ## package for distribution (deprecated) 31 | $(DISTRIBUTE): $(BUILD_STAMP) | $(ENV_STAMP) 32 | @echo "Packaging using $(PYVERSION)" 33 | $(PYTHON) setup.py $(DISTRIBUTE) 34 | 35 | .PHONY: release 36 | release: $(BUILD_STAMP) | $(ENV_STAMP) ## upload package to PyPI (deprecated) 37 | @echo "Releasing using $(PYVERSION)" 38 | $(PYTHON) setup.py $(DISTRIBUTE) upload -r $(PYPI_URL) 39 | 40 | .PHONY: shell 41 | shell: build ## open Python shell within the virtual environment 42 | @echo "Using $(PYVERSION)" 43 | $(PYENV) python 44 | 45 | .PHONY: build 46 | build: $(EXTENSION_OBJS) ## build C extension(s) 47 | @echo "completed $@ target" 48 | 49 | $(BUILD_STAMP): $(EXTENSION_DEPS) | $(ENV_STAMP) 50 | @echo "Building using $(PYVERSION)" 51 | $(PYTHON) setup.py build_ext --inplace 52 | @echo "$(shell date -u +'%Y-%m-%dT%H:%M:%SZ')" > $@ 53 | 54 | $(EXTENSION_OBJS): $(BUILD_STAMP) 55 | @echo "done building $@" 56 | 57 | .PHONY: test 58 | test: build ## run Python unit tests 59 | $(PYENV) pytest 60 | 61 | .PHONY: nuke 62 | nuke: clean ## clean and remove virtual environment 63 | rm -f $(BUILD_STAMP) $(EXTENSION_INTERMEDIATE) 64 | rm -rf *.egg *.egg-info env 65 | find $(SRC_DIR) -depth -type d -name *.egg-info -exec rm -rf {} \; 66 | 67 | .PHONY: clean 68 | clean: ## remove temporary files 69 | $(PYTHON) setup.py clean 70 | rm -rf dist build __pycache__ 71 | rm -f *.so 72 | find $(SRC_DIR) -type f -name "*.pyc" -exec rm {} \; 73 | find $(SRC_DIR) -type f -name "*.cpp" -exec rm {} \; 74 | find $(SRC_DIR) -type f -name "*.so" -exec rm {} \; 
75 | 76 | .PHONY: install 77 | install: $(BUILD_STAMP) ## install package 78 | $(PIP) install -e . 79 | 80 | .PRECIOUS: $(ENV_STAMP) 81 | .PHONY: env 82 | env: $(ENV_STAMP) ## set up a virtual environment 83 | $(ENV_STAMP): setup.py requirements.txt 84 | test -f $@ || $(INTERPRETER) -m venv $(VENV_OPTS) env 85 | $(PIP) install -U pip wheel 86 | export SETUPTOOLS_USE_DISTUTILS=stdlib; $(PIP) install -r requirements.txt 87 | $(PIP) freeze > pip-freeze.txt 88 | $(PIP) install -e . 89 | touch $@ 90 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython 2 | ipdb 3 | ipython 4 | numpy 5 | py-cpuinfo 6 | pytest 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import platform 5 | import struct 6 | 7 | from setuptools import setup 8 | from setuptools.dist import Distribution 9 | from setuptools.extension import Extension 10 | 11 | try: 12 | from cpuinfo import get_cpu_info 13 | 14 | CPU_FLAGS = get_cpu_info()["flags"] 15 | except Exception as exc: 16 | print("exception loading cpuinfo", exc) 17 | CPU_FLAGS = {} 18 | 19 | try: 20 | from Cython.Distutils import build_ext 21 | 22 | USE_CYTHON = True 23 | except ImportError: 24 | USE_CYTHON = False 25 | 26 | 27 | class BinaryDistribution(Distribution): 28 | """ 29 | Subclass the setuptools Distribution to flip the purity flag to false. 
30 | See https://lucumr.pocoo.org/2014/1/27/python-on-wheels/ 31 | """ 32 | 33 | def is_pure(self): 34 | """Returns purity flag""" 35 | return False 36 | 37 | 38 | def get_system_bits(): 39 | """Return 32 for 32-bit systems and 64 for 64-bit""" 40 | return struct.calcsize("P") * 8 41 | 42 | 43 | SYSTEM = os.name 44 | BITS = get_system_bits() 45 | HAVE_SSE42 = "sse4_2" in CPU_FLAGS 46 | HAVE_AES = "aes" in CPU_FLAGS 47 | 48 | CXXFLAGS = [] 49 | 50 | print("system: %s-%d" % (SYSTEM, BITS)) 51 | print("available CPU flags:", CPU_FLAGS) 52 | print("environment:", ", ".join(["%s=%s" % (k, v) for k, v in os.environ.items()])) 53 | 54 | if SYSTEM == "nt": 55 | CXXFLAGS.extend(["/O2"]) 56 | else: 57 | CXXFLAGS.extend( 58 | [ 59 | "-O3", 60 | "-Wno-unused-value", 61 | "-Wno-unused-function", 62 | ] 63 | ) 64 | 65 | # The "cibuildwheel" tool sets AUDITWHEEL_ARCH variable to architecture strings 66 | # such as 'x86_64', 'aarch64', 'i686', etc. If this variable is not set, we 67 | # assume that the build is not a CI build and target current machine 68 | # architecture. 
69 | TARGET_ARCH = os.environ.get("AUDITWHEEL_ARCH", platform.machine()) 70 | print("building for target architecture:", TARGET_ARCH) 71 | 72 | if HAVE_SSE42 and (TARGET_ARCH == "x86_64") and (BITS == 64): 73 | print("enabling SSE4.2 on compile") 74 | if SYSTEM == "nt": 75 | CXXFLAGS.append("/D__SSE4_2__") 76 | else: 77 | CXXFLAGS.append("-msse4.2") 78 | 79 | if HAVE_AES and (TARGET_ARCH == "x86_64") and (BITS == 64): 80 | print("enabling AES on compile") 81 | if SYSTEM == "nt": 82 | CXXFLAGS.append("/D__AES__") 83 | else: 84 | CXXFLAGS.append("-maes") 85 | 86 | CXXHEADERS = [ 87 | "src/metro.h", 88 | "src/metrohash.h", 89 | "src/metrohash128.h", 90 | "src/metrohash128crc.h", 91 | "src/metrohash64.h", 92 | "src/platform.h", 93 | ] 94 | CXXSOURCES = [ 95 | "src/metrohash64.cc", 96 | "src/metrohash128.cc", 97 | ] 98 | 99 | 100 | if USE_CYTHON: 101 | print("building extension using Cython") 102 | CMDCLASS = {"build_ext": build_ext} 103 | SRC_EXT = ".pyx" 104 | else: 105 | print("building extension w/o Cython") 106 | CMDCLASS = {} 107 | SRC_EXT = ".cpp" 108 | 109 | EXT_MODULES = [ 110 | Extension( 111 | "metrohash", 112 | CXXSOURCES + ["src/metrohash" + SRC_EXT], 113 | depends=CXXHEADERS, 114 | language="c++", 115 | extra_compile_args=CXXFLAGS, 116 | include_dirs=["src"], 117 | ), 118 | ] 119 | 120 | VERSION = "0.3.3" 121 | URL = "https://github.com/escherba/python-metrohash" 122 | 123 | 124 | def get_long_description(relpath, encoding="utf-8"): 125 | _long_desc = """ 126 | 127 | """ 128 | fname = os.path.join(os.path.dirname(__file__), relpath) 129 | try: 130 | with open(fname, "rb") as fh: 131 | return fh.read().decode(encoding) 132 | except Exception: 133 | return _long_desc 134 | 135 | 136 | setup( 137 | version=VERSION, 138 | description="Python bindings for MetroHash, a fast non-cryptographic hash algorithm", 139 | author="Eugene Scherba", 140 | author_email="escherba+metrohash@gmail.com", 141 | url=URL, 142 | download_url=URL + "/tarball/master/" + VERSION, 143 
| name="metrohash", 144 | license="Apache License 2.0", 145 | python_requires='>=3.6', 146 | zip_safe=False, 147 | cmdclass=CMDCLASS, 148 | ext_modules=EXT_MODULES, 149 | package_dir={"": "src"}, 150 | keywords=["hash", "hashing", "metrohash", "cython"], 151 | classifiers=[ 152 | "Development Status :: 5 - Production/Stable", 153 | "Intended Audience :: Developers", 154 | "Intended Audience :: Science/Research", 155 | "License :: OSI Approved :: Apache Software License", 156 | "Operating System :: OS Independent", 157 | "Programming Language :: C++", 158 | "Programming Language :: Cython", 159 | "Programming Language :: Python :: 3.6", 160 | "Programming Language :: Python :: 3.7", 161 | "Programming Language :: Python :: 3.8", 162 | "Programming Language :: Python :: 3.9", 163 | "Programming Language :: Python :: 3.10", 164 | "Programming Language :: Python :: 3.11", 165 | "Topic :: Scientific/Engineering :: Information Analysis", 166 | "Topic :: Software Development :: Libraries", 167 | "Topic :: System :: Distributed Computing", 168 | ], 169 | long_description=get_long_description("README.md"), 170 | long_description_content_type="text/markdown", 171 | tests_require=["pytest"], 172 | distclass=BinaryDistribution, 173 | ) 174 | -------------------------------------------------------------------------------- /src/metro.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: metro.h 5 | * 6 | * Description: Wrapper around metrohash.h 7 | * 8 | * Version: 1.0 9 | * Created: 10/14/2015 18:24:45 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: Eugene Scherba (es), escherba+metrohash@gmail.com 14 | * Organization: - 15 | * 16 | * ===================================================================================== 17 | */ 18 | 19 | #include // for size_t. 
#include <stdint.h>
#include <utility>


#include "metrohash.h"

typedef uint8_t uint8;
typedef uint32_t uint32;
typedef uint64_t uint64;
typedef std::pair<uint64, uint64> uint128;


inline uint64 Uint128Low64(const uint128& x)
{
    return x.first;
}


inline uint64 Uint128High64(const uint128& x)
{
    return x.second;
}


// Assemble eight bytes into a 64-bit integer, array[0] being the least
// significant byte, independently of host endianness.
inline uint64 bytes2int64(uint8_t * const array)
{
    // uint64 is guaranteed to be 8 bytes long
    return (uint64)(
          static_cast<uint64>(array[0])
        | static_cast<uint64>(array[1]) << (8 * 1)
        | static_cast<uint64>(array[2]) << (8 * 2)
        | static_cast<uint64>(array[3]) << (8 * 3)
        | static_cast<uint64>(array[4]) << (8 * 4)
        | static_cast<uint64>(array[5]) << (8 * 5)
        | static_cast<uint64>(array[6]) << (8 * 6)
        | static_cast<uint64>(array[7]) << (8 * 7));
}


// Assemble sixteen bytes into a pair of 64-bit integers: `first` is built
// from bytes 0..7 and `second` from bytes 8..15, each exactly as in
// bytes2int64().
inline uint128 bytes2int128(uint8_t * const array)
{
    const uint64 a = bytes2int64(array);
    const uint64 b = bytes2int64(array + 8);

    return uint128(a, b);
}


// One-shot MetroHash-64 of `buffer`, returned as an integer.
inline uint64 metrohash64(const uint8_t *buffer, const uint64 length, const uint64 seed)
{
    uint8_t hash[8];
    MetroHash64::Hash(buffer, length, hash, seed);
    return bytes2int64(hash);
}


// One-shot MetroHash-128 of `buffer`, returned as a pair of 64-bit integers.
inline uint128 metrohash128(const uint8_t *buffer, const uint64 length, const uint64 seed)
{
    uint8_t hash[16];
    MetroHash128::Hash(buffer, length, hash, seed);
    return bytes2int128(hash);
}

--------------------------------------------------------------------------------
/src/metrohash.h:
--------------------------------------------------------------------------------
// metrohash.h
//
// Copyright 2015-2018 J. Andrew Rogers
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef METROHASH_METROHASH_H
#define METROHASH_METROHASH_H

#include "metrohash64.h"
#include "metrohash128.h"
#include "metrohash128crc.h"

#endif // #ifndef METROHASH_METROHASH_H

--------------------------------------------------------------------------------
/src/metrohash.pyx:
--------------------------------------------------------------------------------
#cython: infer_types=True
#cython: embedsignature=True
#cython: binding=False
#cython: language_level=3
#distutils: language=c++

"""
Python wrapper for MetroHash, a fast non-cryptographic hashing algorithm
"""

__author__ = "Eugene Scherba"
__email__ = "escherba+metrohash@gmail.com"
__version__ = "0.3.3"
__all__ = [
    "MetroHash64",
    "MetroHash128",
    "hash64",
    "hash128",
    "hash64_int",
    "hash128_int",
    "hash64_hex",
    "hash128_hex",
]


cdef extern from * nogil:
    ctypedef unsigned char uint8_t
    ctypedef unsigned long int uint32_t
    ctypedef unsigned long long int uint64_t


cdef extern from "<utility>" namespace "std" nogil:
    cdef cppclass pair[T, U]:
        T first
        U second
        pair()
        pair(pair&)
        pair(T&, U&)
        bint operator == (pair&, pair&)
        bint operator != (pair&, pair&)
        bint operator <  (pair&, pair&)
        bint operator >  (pair&, pair&)
        bint operator <= (pair&, pair&)
        bint operator >= (pair&, pair&)


cdef extern from "Python.h":
    # Note that following functions can potentially raise an exception,
    # thus they cannot be declared 'nogil'. Also, PyUnicode_AsUTF8AndSize() can
    # potentially allocate memory inside in unlikely case of when underlying
    # unicode object was stored as non-utf8 and utf8 wasn't requested before.
    const char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length) except NULL


cdef extern from "metro.h" nogil:
    ctypedef uint8_t uint8
    ctypedef uint32_t uint32
    ctypedef uint64_t uint64
    ctypedef pair[uint64, uint64] uint128
    cdef uint64 c_metrohash64 "metrohash64" (const uint8* key, uint64 length, uint64 seed)
    cdef uint64 c_bytes2int64 "bytes2int64" (uint8* const array)
    cdef uint128 c_bytes2int128 "bytes2int128" (uint8* const array)
    cdef uint128 c_metrohash128 "metrohash128" (const uint8* key, uint64 length, uint64 seed)
    cdef cppclass CCMetroHash64 "MetroHash64":
        CCMetroHash64(const uint64 seed)
        void Initialize(const uint64 seed)
        void Update(const uint8* key, const uint64 length)
        void Finalize(uint8* const result)
        @staticmethod
        void Hash(const uint8* key, const uint64 length, uint8* const out, const uint64 seed)
    cdef cppclass CCMetroHash128 "MetroHash128":
        CCMetroHash128(const uint64 seed)
        void Initialize(const uint64 seed)
        void Update(const uint8* key, const uint64 length)
        void Finalize(uint8* const result)
        @staticmethod
        void Hash(const uint8* key, const uint64 length, uint8* const out, const uint64 seed)


from cpython cimport long

from cpython.buffer cimport PyObject_CheckBuffer
from cpython.buffer cimport PyObject_GetBuffer
from cpython.buffer cimport PyBuffer_Release
from cpython.buffer cimport PyBUF_SIMPLE

from cpython.unicode cimport PyUnicode_Check

from cpython.bytes cimport PyBytes_Check
from cpython.bytes cimport PyBytes_GET_SIZE
from cpython.bytes cimport PyBytes_AS_STRING


cdef inline str bytes2hex(bytes bs):
    """Return the lowercase hexadecimal representation of ``bs``."""
    return bs.hex()


cdef object _type_error(argname: str, expected: object, value: object):
    """Build (without raising) the TypeError reported for an unsupported argument."""
    return TypeError(
        "Argument '%s' has incorrect type: expected %s, got '%s' instead" %
        (argname, expected, type(value).__name__)
    )


cpdef bytes hash64(data, uint64 seed=0ULL):
    """Obtain a 64-bit hash from data using MetroHash-64.

    Strings are hashed as their UTF-8 encoding; bytes and buffer objects are
    hashed as-is.

    :param data: input data (either string or buffer type)
    :param seed: seed for the hash function (integer)
    :return: hash value (bytes)
    :raises TypeError: if input data is not a string or a buffer
    :raises ValueError: if input buffer is not C-contiguous
    :raises OverflowError: if seed cannot be converted to unsigned int64
    """
    cdef Py_buffer buf
    cdef bytearray out = bytearray(8)
    cdef const char* encoding
    cdef Py_ssize_t encoding_size = 0

    if PyUnicode_Check(data):
        encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size)
        CCMetroHash64.Hash(<const uint8*>encoding, encoding_size, out, seed)
    elif PyBytes_Check(data):
        CCMetroHash64.Hash(
            <const uint8*>PyBytes_AS_STRING(data),
            PyBytes_GET_SIZE(data), out, seed)
    elif PyObject_CheckBuffer(data):
        PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
        CCMetroHash64.Hash(<const uint8*>buf.buf, buf.len, out, seed)
        PyBuffer_Release(&buf)
    else:
        raise _type_error("data", ["basestring", "buffer"], data)
    return bytes(out)


cpdef bytes hash128(data, uint64 seed=0ULL):
    """Obtain a 128-bit hash from data using MetroHash-128.

    Strings are hashed as their UTF-8 encoding; bytes and buffer objects are
    hashed as-is.

    :param data: input data (either string or buffer type)
    :param seed: seed for the hash function (integer)
    :return: hash value (bytes)
    :raises TypeError: if input data is not a string or a buffer
    :raises ValueError: if input buffer is not C-contiguous
    :raises OverflowError: if seed cannot be converted to unsigned int64
    """
    cdef Py_buffer buf
    cdef bytearray out = bytearray(16)
    cdef const char* encoding
    cdef Py_ssize_t encoding_size = 0

    if PyUnicode_Check(data):
        encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size)
        CCMetroHash128.Hash(<const uint8*>encoding, encoding_size, out, seed)
    elif PyBytes_Check(data):
        CCMetroHash128.Hash(
            <const uint8*>PyBytes_AS_STRING(data),
            PyBytes_GET_SIZE(data), out, seed)
    elif PyObject_CheckBuffer(data):
        PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
        CCMetroHash128.Hash(<const uint8*>buf.buf, buf.len, out, seed)
        PyBuffer_Release(&buf)
    else:
        raise _type_error("data", ["basestring", "buffer"], data)
    return bytes(out)



def hash64_hex(data, uint64 seed=0ULL) -> str:
    """Obtain a 64-bit hash from data using MetroHash-64.

    :param data: input data (either string or buffer type)
    :param seed: seed for the hash function (integer)
    :return: hash value (hexadecimal string)
    :raises TypeError: if input data is not a string or a buffer
    :raises ValueError: if input buffer is not C-contiguous
    :raises OverflowError: if seed cannot be converted to unsigned int64
    """
    return bytes2hex(hash64(data, seed=seed))


def hash128_hex(data, uint64 seed=0ULL) -> str:
    """Obtain a 128-bit hash from data using MetroHash-128.

    :param data: input data (either string or buffer type)
    :param seed: seed for the hash function (integer)
    :return: hash value (hexadecimal string)
    :raises TypeError: if input data is not a string or a buffer
    :raises ValueError: if input buffer is not C-contiguous
    :raises OverflowError: if seed cannot be converted to unsigned int64
    """
    return bytes2hex(hash128(data, seed=seed))


def hash64_int(data, uint64 seed=0ULL) -> int:
    """Obtain a 64-bit hash from data using MetroHash-64.

    :param data: input data (either string or buffer type)
    :param seed: seed for the hash function (integer)
    :return: hash value (integer)
    :raises TypeError: if input data is not a string or a buffer
    :raises ValueError: if input buffer is not C-contiguous
    :raises OverflowError: if seed cannot be converted to unsigned int64
    """
    cdef Py_buffer buf
    cdef uint64 result
    cdef const char* encoding
    cdef Py_ssize_t encoding_size = 0

    if PyUnicode_Check(data):
        encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size)
        result = c_metrohash64(<const uint8*>encoding, encoding_size, seed)
    elif PyBytes_Check(data):
        result = c_metrohash64(
            <const uint8*>PyBytes_AS_STRING(data),
            PyBytes_GET_SIZE(data), seed)
    elif PyObject_CheckBuffer(data):
        PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
        result = c_metrohash64(<const uint8*>buf.buf, buf.len, seed)
        PyBuffer_Release(&buf)
    else:
        raise _type_error("data", ["basestring", "buffer"], data)
    return result


def hash128_int(data, uint64 seed=0ULL) -> int:
    """Obtain a 128-bit hash from data using MetroHash-128.

    :param data: input data (either string or buffer type)
    :param seed: seed for the hash function (integer)
    :return: hash value (integer)
    :raises TypeError: if input data is not a string or a buffer
    :raises ValueError: if input buffer is not C-contiguous
    :raises OverflowError: if seed cannot be converted to unsigned int64
    """
    cdef Py_buffer buf
    cdef uint128 result
    cdef const char* encoding
    cdef Py_ssize_t encoding_size = 0

    if PyUnicode_Check(data):
        encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size)
        result = c_metrohash128(<const uint8*>encoding, encoding_size, seed)
    elif PyBytes_Check(data):
        result = c_metrohash128(
            <const uint8*>PyBytes_AS_STRING(data),
            PyBytes_GET_SIZE(data), seed)
    elif PyObject_CheckBuffer(data):
        PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
        result = c_metrohash128(<const uint8*>buf.buf, buf.len, seed)
        PyBuffer_Release(&buf)
    else:
        raise _type_error("data", ["basestring", "buffer"], data)
    # ``first`` supplies the upper 64 bits of the result, ``second`` the lower.
    return (long(result.first) << 64ULL) + long(result.second)


cdef class MetroHash64(object):
    """Incremental hasher interface for MetroHash-64.

    :param seed: seed for the hash function (integer)
    :raises TypeError: if seed is not an integer type
    :raises MemoryError: if the underlying hasher cannot be allocated
    :raises OverflowError: if seed is out of bounds
    """

    cdef CCMetroHash64* _m

    def __cinit__(self, uint64 seed=0ULL) -> None:
        self._m = new CCMetroHash64(seed)
        if self._m is NULL:
            raise MemoryError()

    def __dealloc__(self) -> None:
        if self._m is not NULL:
            del self._m
            self._m = NULL

    def reset(self, uint64 seed=0ULL) -> None:
        """Reset state with a new seed.

        :param seed: new seed to reset state to (integer)
        :raises TypeError: if seed is not an integer type
        :raises OverflowError: if seed is out of bounds
        """
        self._m.Initialize(seed)

    def update(self, data) -> None:
        """Update digest with new data.

        :param data: input data (either string or buffer type)
        :raises TypeError: if input data is not a string or a buffer
        :raises ValueError: if input buffer is not C-contiguous
        """
        cdef Py_buffer buf
        cdef const char* encoding
        cdef Py_ssize_t encoding_size = 0

        if PyUnicode_Check(data):
            encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size)
            self._m.Update(<const uint8*>encoding, encoding_size)
        elif PyBytes_Check(data):
            self._m.Update(
                <const uint8*>PyBytes_AS_STRING(data),
                PyBytes_GET_SIZE(data))
        elif PyObject_CheckBuffer(data):
            PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
            self._m.Update(<const uint8*>buf.buf, buf.len)
            PyBuffer_Release(&buf)
        else:
            raise _type_error("data", ["basestring", "buffer"], data)

    cpdef bytes digest(self):
        """Obtain bytes digest.

        :return: eight bytes representing the 64-bit hash
        """
        cdef bytearray out = bytearray(8)
        self._m.Finalize(out)
        return bytes(out)

    def hexdigest(self) -> str:
        """Obtain a string digest in hexadecimal form.

        :return: hash string
        """
        return bytes2hex(self.digest())

    def intdigest(self) -> int:
        """Obtain a long integer representing hash value.

        :return: an integer representing 64-bit hash value
        """
        cdef uint8 buf[8]
        self._m.Finalize(buf)
        return c_bytes2int64(buf)


cdef class MetroHash128(object):
    """Incremental hasher interface for MetroHash-128.

    NOTE(review): ``MetroHash128::Finalize`` (metrohash128.cc) updates the
    hasher state in place and zeroes its byte counter, so the digest methods
    do not appear to be repeatable without an intervening ``reset()`` --
    confirm before relying on calling them more than once.

    :param seed: seed for the hash function (integer)
    :raises TypeError: if seed is not an integer type
    :raises MemoryError: if the underlying hasher cannot be allocated
    :raises OverflowError: if seed is out of bounds
    """

    cdef CCMetroHash128* _m

    def __cinit__(self, uint64 seed=0ULL) -> None:
        self._m = new CCMetroHash128(seed)
        if self._m is NULL:
            raise MemoryError()

    def __dealloc__(self) -> None:
        if self._m is not NULL:
            del self._m
            self._m = NULL

    def reset(self, uint64 seed=0ULL) -> None:
        """Reset state with a new seed.

        :param seed: new seed to reset state to (integer)
        :raises TypeError: if seed is not an integer type
        :raises OverflowError: if seed is out of bounds
        """
        self._m.Initialize(seed)

    def update(self, data) -> None:
        """Update digest with new data.

        :param data: input data (either string or buffer type)
        :raises TypeError: if input data is not a string or a buffer
        :raises ValueError: if input buffer is not C-contiguous
        """
        cdef Py_buffer buf
        cdef const char* encoding
        cdef Py_ssize_t encoding_size = 0

        if PyUnicode_Check(data):
            encoding = PyUnicode_AsUTF8AndSize(data, &encoding_size)
            self._m.Update(<const uint8*>encoding, encoding_size)
        elif PyBytes_Check(data):
            self._m.Update(
                <const uint8*>PyBytes_AS_STRING(data),
                PyBytes_GET_SIZE(data))
        elif PyObject_CheckBuffer(data):
            PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
            self._m.Update(<const uint8*>buf.buf, buf.len)
            PyBuffer_Release(&buf)
        else:
            raise _type_error("data", ["basestring", "buffer"], data)

    cpdef bytes digest(self):
        """Obtain bytes digest.

        :return: sixteen bytes representing the 128-bit hash
        """
        cdef bytearray out = bytearray(16)
        self._m.Finalize(out)
        return bytes(out)

    def hexdigest(self) -> str:
        """Obtain a string digest in hexadecimal form.

        :return: hash string
        """
        return bytes2hex(self.digest())

    def intdigest(self) -> int:
        """Obtain integer digest.

        :return: a long integer representing 128-bit hash value
        """
        cdef uint8 buf[16]
        self._m.Finalize(buf)
        cdef uint128 result = c_bytes2int128(buf)
        # ``first`` supplies the upper 64 bits of the result, ``second`` the lower.
        return (long(result.first) << 64ULL) + long(result.second)

--------------------------------------------------------------------------------
/src/metrohash128.cc:
--------------------------------------------------------------------------------
// metrohash128.cpp
//
// Copyright 2015-2018 J. Andrew Rogers
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16 | 17 | #include 18 | #include "platform.h" 19 | #include "metrohash128.h" 20 | 21 | const char * MetroHash128::test_string = "012345678901234567890123456789012345678901234567890123456789012"; 22 | 23 | const uint8_t MetroHash128::test_seed_0[16] = { 24 | 0xC7, 0x7C, 0xE2, 0xBF, 0xA4, 0xED, 0x9F, 0x9B, 25 | 0x05, 0x48, 0xB2, 0xAC, 0x50, 0x74, 0xA2, 0x97 26 | }; 27 | 28 | const uint8_t MetroHash128::test_seed_1[16] = { 29 | 0x45, 0xA3, 0xCD, 0xB8, 0x38, 0x19, 0x9D, 0x7F, 30 | 0xBD, 0xD6, 0x8D, 0x86, 0x7A, 0x14, 0xEC, 0xEF 31 | }; 32 | 33 | 34 | 35 | MetroHash128::MetroHash128(const uint64_t seed) 36 | { 37 | Initialize(seed); 38 | } 39 | 40 | 41 | void MetroHash128::Initialize(const uint64_t seed) 42 | { 43 | // initialize internal hash registers 44 | state.v[0] = (static_cast(seed) - k0) * k3; 45 | state.v[1] = (static_cast(seed) + k1) * k2; 46 | state.v[2] = (static_cast(seed) + k0) * k2; 47 | state.v[3] = (static_cast(seed) - k1) * k3; 48 | 49 | // initialize total length of input 50 | bytes = 0; 51 | } 52 | 53 | 54 | void MetroHash128::Update(const uint8_t * const buffer, const uint64_t length) 55 | { 56 | const uint8_t * ptr = reinterpret_cast(buffer); 57 | const uint8_t * const end = ptr + length; 58 | 59 | // input buffer may be partially filled 60 | if (bytes % 32) 61 | { 62 | uint64_t fill = 32 - (bytes % 32); 63 | if (fill > length) 64 | fill = length; 65 | 66 | memcpy(input.b + (bytes % 32), ptr, static_cast(fill)); 67 | ptr += fill; 68 | bytes += fill; 69 | 70 | // input buffer is still partially filled 71 | if ((bytes % 32) != 0) return; 72 | 73 | // process full input buffer 74 | state.v[0] += read_u64(&input.b[ 0]) * k0; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; 75 | state.v[1] += read_u64(&input.b[ 8]) * k1; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; 76 | state.v[2] += read_u64(&input.b[16]) * k2; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; 77 | state.v[3] += read_u64(&input.b[24]) * k3; state.v[3] = 
rotate_right(state.v[3],29) + state.v[1]; 78 | } 79 | 80 | // bulk update 81 | bytes += (end - ptr); 82 | while (ptr <= (end - 32)) 83 | { 84 | // process directly from the source, bypassing the input buffer 85 | state.v[0] += read_u64(ptr) * k0; ptr += 8; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; 86 | state.v[1] += read_u64(ptr) * k1; ptr += 8; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; 87 | state.v[2] += read_u64(ptr) * k2; ptr += 8; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; 88 | state.v[3] += read_u64(ptr) * k3; ptr += 8; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; 89 | } 90 | 91 | // store remaining bytes in input buffer 92 | if (ptr < end) 93 | memcpy(input.b, ptr, end - ptr); 94 | } 95 | 96 | 97 | void MetroHash128::Finalize(uint8_t * const hash) 98 | { 99 | // finalize bulk loop, if used 100 | if (bytes >= 32) 101 | { 102 | state.v[2] ^= rotate_right(((state.v[0] + state.v[3]) * k0) + state.v[1], 21) * k1; 103 | state.v[3] ^= rotate_right(((state.v[1] + state.v[2]) * k1) + state.v[0], 21) * k0; 104 | state.v[0] ^= rotate_right(((state.v[0] + state.v[2]) * k0) + state.v[3], 21) * k1; 105 | state.v[1] ^= rotate_right(((state.v[1] + state.v[3]) * k1) + state.v[2], 21) * k0; 106 | } 107 | 108 | // process any bytes remaining in the input buffer 109 | const uint8_t * ptr = reinterpret_cast(input.b); 110 | const uint8_t * const end = ptr + (bytes % 32); 111 | 112 | if ((end - ptr) >= 16) 113 | { 114 | state.v[0] += read_u64(ptr) * k2; ptr += 8; state.v[0] = rotate_right(state.v[0],33) * k3; 115 | state.v[1] += read_u64(ptr) * k2; ptr += 8; state.v[1] = rotate_right(state.v[1],33) * k3; 116 | state.v[0] ^= rotate_right((state.v[0] * k2) + state.v[1], 45) * k1; 117 | state.v[1] ^= rotate_right((state.v[1] * k3) + state.v[0], 45) * k0; 118 | } 119 | 120 | if ((end - ptr) >= 8) 121 | { 122 | state.v[0] += read_u64(ptr) * k2; ptr += 8; state.v[0] = rotate_right(state.v[0],33) * k3; 123 | state.v[0] ^= 
rotate_right((state.v[0] * k2) + state.v[1], 27) * k1; 124 | } 125 | 126 | if ((end - ptr) >= 4) 127 | { 128 | state.v[1] += read_u32(ptr) * k2; ptr += 4; state.v[1] = rotate_right(state.v[1],33) * k3; 129 | state.v[1] ^= rotate_right((state.v[1] * k3) + state.v[0], 46) * k0; 130 | } 131 | 132 | if ((end - ptr) >= 2) 133 | { 134 | state.v[0] += read_u16(ptr) * k2; ptr += 2; state.v[0] = rotate_right(state.v[0],33) * k3; 135 | state.v[0] ^= rotate_right((state.v[0] * k2) + state.v[1], 22) * k1; 136 | } 137 | 138 | if ((end - ptr) >= 1) 139 | { 140 | state.v[1] += read_u8 (ptr) * k2; state.v[1] = rotate_right(state.v[1],33) * k3; 141 | state.v[1] ^= rotate_right((state.v[1] * k3) + state.v[0], 58) * k0; 142 | } 143 | 144 | state.v[0] += rotate_right((state.v[0] * k0) + state.v[1], 13); 145 | state.v[1] += rotate_right((state.v[1] * k1) + state.v[0], 37); 146 | state.v[0] += rotate_right((state.v[0] * k2) + state.v[1], 13); 147 | state.v[1] += rotate_right((state.v[1] * k3) + state.v[0], 37); 148 | 149 | bytes = 0; 150 | 151 | // do any endian conversion here 152 | 153 | memcpy(hash, state.v, 16); 154 | } 155 | 156 | 157 | void MetroHash128::Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed) 158 | { 159 | const uint8_t * ptr = reinterpret_cast(buffer); 160 | const uint8_t * const end = ptr + length; 161 | 162 | uint64_t v[4]; 163 | 164 | v[0] = (static_cast(seed) - k0) * k3; 165 | v[1] = (static_cast(seed) + k1) * k2; 166 | 167 | if (length >= 32) 168 | { 169 | v[2] = (static_cast(seed) + k0) * k2; 170 | v[3] = (static_cast(seed) - k1) * k3; 171 | 172 | do 173 | { 174 | v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; 175 | v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; 176 | v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; 177 | v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; 178 | } 179 | while (ptr <= (end - 32)); 
180 | 181 | v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 21) * k1; 182 | v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 21) * k0; 183 | v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 21) * k1; 184 | v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 21) * k0; 185 | } 186 | 187 | if ((end - ptr) >= 16) 188 | { 189 | v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; 190 | v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],33) * k3; 191 | v[0] ^= rotate_right((v[0] * k2) + v[1], 45) * k1; 192 | v[1] ^= rotate_right((v[1] * k3) + v[0], 45) * k0; 193 | } 194 | 195 | if ((end - ptr) >= 8) 196 | { 197 | v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; 198 | v[0] ^= rotate_right((v[0] * k2) + v[1], 27) * k1; 199 | } 200 | 201 | if ((end - ptr) >= 4) 202 | { 203 | v[1] += read_u32(ptr) * k2; ptr += 4; v[1] = rotate_right(v[1],33) * k3; 204 | v[1] ^= rotate_right((v[1] * k3) + v[0], 46) * k0; 205 | } 206 | 207 | if ((end - ptr) >= 2) 208 | { 209 | v[0] += read_u16(ptr) * k2; ptr += 2; v[0] = rotate_right(v[0],33) * k3; 210 | v[0] ^= rotate_right((v[0] * k2) + v[1], 22) * k1; 211 | } 212 | 213 | if ((end - ptr) >= 1) 214 | { 215 | v[1] += read_u8 (ptr) * k2; v[1] = rotate_right(v[1],33) * k3; 216 | v[1] ^= rotate_right((v[1] * k3) + v[0], 58) * k0; 217 | } 218 | 219 | v[0] += rotate_right((v[0] * k0) + v[1], 13); 220 | v[1] += rotate_right((v[1] * k1) + v[0], 37); 221 | v[0] += rotate_right((v[0] * k2) + v[1], 13); 222 | v[1] += rotate_right((v[1] * k3) + v[0], 37); 223 | 224 | // do any endian conversion here 225 | 226 | memcpy(hash, v, 16); 227 | } 228 | 229 | 230 | bool MetroHash128::ImplementationVerified() 231 | { 232 | uint8_t hash[16]; 233 | const uint8_t * key = reinterpret_cast(MetroHash128::test_string); 234 | 235 | // verify one-shot implementation 236 | MetroHash128::Hash(key, strlen(MetroHash128::test_string), hash, 0); 237 | if (memcmp(hash, MetroHash128::test_seed_0, 16) != 0) return false; 
238 | 239 | MetroHash128::Hash(key, strlen(MetroHash128::test_string), hash, 1); 240 | if (memcmp(hash, MetroHash128::test_seed_1, 16) != 0) return false; 241 | 242 | // verify incremental implementation 243 | MetroHash128 metro; 244 | 245 | metro.Initialize(0); 246 | metro.Update(reinterpret_cast(MetroHash128::test_string), strlen(MetroHash128::test_string)); 247 | metro.Finalize(hash); 248 | if (memcmp(hash, MetroHash128::test_seed_0, 16) != 0) return false; 249 | 250 | metro.Initialize(1); 251 | metro.Update(reinterpret_cast(MetroHash128::test_string), strlen(MetroHash128::test_string)); 252 | metro.Finalize(hash); 253 | if (memcmp(hash, MetroHash128::test_seed_1, 16) != 0) return false; 254 | 255 | return true; 256 | } 257 | 258 | 259 | void metrohash128_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) 260 | { 261 | static const uint64_t k0 = 0xC83A91E1; 262 | static const uint64_t k1 = 0x8648DBDB; 263 | static const uint64_t k2 = 0x7BDEC03B; 264 | static const uint64_t k3 = 0x2F5870A5; 265 | 266 | const uint8_t * ptr = reinterpret_cast(key); 267 | const uint8_t * const end = ptr + len; 268 | 269 | uint64_t v[4]; 270 | 271 | v[0] = ((static_cast(seed) - k0) * k3) + len; 272 | v[1] = ((static_cast(seed) + k1) * k2) + len; 273 | 274 | if (len >= 32) 275 | { 276 | v[2] = ((static_cast(seed) + k0) * k2) + len; 277 | v[3] = ((static_cast(seed) - k1) * k3) + len; 278 | 279 | do 280 | { 281 | v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; 282 | v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; 283 | v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; 284 | v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; 285 | } 286 | while (ptr <= (end - 32)); 287 | 288 | v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 26) * k1; 289 | v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 26) * k0; 290 | v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 26) * k1; 291 | 
v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 30) * k0; 292 | } 293 | 294 | if ((end - ptr) >= 16) 295 | { 296 | v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; 297 | v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],33) * k3; 298 | v[0] ^= rotate_right((v[0] * k2) + v[1], 17) * k1; 299 | v[1] ^= rotate_right((v[1] * k3) + v[0], 17) * k0; 300 | } 301 | 302 | if ((end - ptr) >= 8) 303 | { 304 | v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; 305 | v[0] ^= rotate_right((v[0] * k2) + v[1], 20) * k1; 306 | } 307 | 308 | if ((end - ptr) >= 4) 309 | { 310 | v[1] += read_u32(ptr) * k2; ptr += 4; v[1] = rotate_right(v[1],33) * k3; 311 | v[1] ^= rotate_right((v[1] * k3) + v[0], 18) * k0; 312 | } 313 | 314 | if ((end - ptr) >= 2) 315 | { 316 | v[0] += read_u16(ptr) * k2; ptr += 2; v[0] = rotate_right(v[0],33) * k3; 317 | v[0] ^= rotate_right((v[0] * k2) + v[1], 24) * k1; 318 | } 319 | 320 | if ((end - ptr) >= 1) 321 | { 322 | v[1] += read_u8 (ptr) * k2; v[1] = rotate_right(v[1],33) * k3; 323 | v[1] ^= rotate_right((v[1] * k3) + v[0], 24) * k0; 324 | } 325 | 326 | v[0] += rotate_right((v[0] * k0) + v[1], 13); 327 | v[1] += rotate_right((v[1] * k1) + v[0], 37); 328 | v[0] += rotate_right((v[0] * k2) + v[1], 13); 329 | v[1] += rotate_right((v[1] * k3) + v[0], 37); 330 | 331 | // do any endian conversion here 332 | 333 | memcpy(out, v, 16); 334 | } 335 | 336 | 337 | void metrohash128_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) 338 | { 339 | static const uint64_t k0 = 0xD6D018F5; 340 | static const uint64_t k1 = 0xA2AA033B; 341 | static const uint64_t k2 = 0x62992FC1; 342 | static const uint64_t k3 = 0x30BC5B29; 343 | 344 | const uint8_t * ptr = reinterpret_cast(key); 345 | const uint8_t * const end = ptr + len; 346 | 347 | uint64_t v[4]; 348 | 349 | v[0] = ((static_cast(seed) - k0) * k3) + len; 350 | v[1] = ((static_cast(seed) + k1) * k2) + len; 351 | 352 | if (len >= 32) 353 | { 354 | v[2] 
= ((static_cast(seed) + k0) * k2) + len; 355 | v[3] = ((static_cast(seed) - k1) * k3) + len; 356 | 357 | do 358 | { 359 | v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; 360 | v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; 361 | v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; 362 | v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; 363 | } 364 | while (ptr <= (end - 32)); 365 | 366 | v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1; 367 | v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0; 368 | v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1; 369 | v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0; 370 | } 371 | 372 | if ((end - ptr) >= 16) 373 | { 374 | v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],29) * k3; 375 | v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],29) * k3; 376 | v[0] ^= rotate_right((v[0] * k2) + v[1], 29) * k1; 377 | v[1] ^= rotate_right((v[1] * k3) + v[0], 29) * k0; 378 | } 379 | 380 | if ((end - ptr) >= 8) 381 | { 382 | v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],29) * k3; 383 | v[0] ^= rotate_right((v[0] * k2) + v[1], 29) * k1; 384 | } 385 | 386 | if ((end - ptr) >= 4) 387 | { 388 | v[1] += read_u32(ptr) * k2; ptr += 4; v[1] = rotate_right(v[1],29) * k3; 389 | v[1] ^= rotate_right((v[1] * k3) + v[0], 25) * k0; 390 | } 391 | 392 | if ((end - ptr) >= 2) 393 | { 394 | v[0] += read_u16(ptr) * k2; ptr += 2; v[0] = rotate_right(v[0],29) * k3; 395 | v[0] ^= rotate_right((v[0] * k2) + v[1], 30) * k1; 396 | } 397 | 398 | if ((end - ptr) >= 1) 399 | { 400 | v[1] += read_u8 (ptr) * k2; v[1] = rotate_right(v[1],29) * k3; 401 | v[1] ^= rotate_right((v[1] * k3) + v[0], 18) * k0; 402 | } 403 | 404 | v[0] += rotate_right((v[0] * k0) + v[1], 33); 405 | v[1] += rotate_right((v[1] * k1) + v[0], 33); 406 | v[0] += rotate_right((v[0] * k2) + v[1], 33); 407 | v[1] += 
rotate_right((v[1] * k3) + v[0], 33); 408 | 409 | // do any endian conversion here 410 | 411 | memcpy(out, v, 16); 412 | } 413 | 414 | -------------------------------------------------------------------------------- /src/metrohash128.h: -------------------------------------------------------------------------------- 1 | // metrohash128.h 2 | // 3 | // Copyright 2015-2018 J. Andrew Rogers 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #ifndef METROHASH_METROHASH_128_H 18 | #define METROHASH_METROHASH_128_H 19 | 20 | #include 21 | 22 | class MetroHash128 23 | { 24 | public: 25 | static const uint32_t bits = 128; 26 | 27 | // Constructor initializes the same as Initialize() 28 | MetroHash128(const uint64_t seed=0); 29 | 30 | // Initializes internal state for new hash with optional seed 31 | void Initialize(const uint64_t seed=0); 32 | 33 | // Update the hash state with a string of bytes. If the length 34 | // is sufficiently long, the implementation switches to a bulk 35 | // hashing algorithm directly on the argument buffer for speed. 36 | void Update(const uint8_t * buffer, const uint64_t length); 37 | 38 | // Constructs the final hash and writes it to the argument buffer. 39 | // After a hash is finalized, this instance must be Initialized()-ed 40 | // again or the behavior of Update() and Finalize() is undefined. 
41 | void Finalize(uint8_t * const hash); 42 | 43 | // A non-incremental function implementation. This can be significantly 44 | // faster than the incremental implementation for some usage patterns. 45 | static void Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed=0); 46 | 47 | // Does implementation correctly execute test vectors? 48 | static bool ImplementationVerified(); 49 | 50 | // test vectors -- Hash(test_string, seed=0) => test_seed_0 51 | static const char * test_string; 52 | static const uint8_t test_seed_0[16]; 53 | static const uint8_t test_seed_1[16]; 54 | 55 | private: 56 | static const uint64_t k0 = 0xC83A91E1; 57 | static const uint64_t k1 = 0x8648DBDB; 58 | static const uint64_t k2 = 0x7BDEC03B; 59 | static const uint64_t k3 = 0x2F5870A5; 60 | 61 | struct { uint64_t v[4]; } state; 62 | struct { uint8_t b[32]; } input; 63 | uint64_t bytes; 64 | }; 65 | 66 | 67 | // Legacy 128-bit hash functions -- do not use 68 | void metrohash128_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); 69 | void metrohash128_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); 70 | 71 | 72 | #endif // #ifndef METROHASH_METROHASH_128_H 73 | -------------------------------------------------------------------------------- /src/metrohash128crc.cc: -------------------------------------------------------------------------------- 1 | // metrohash128crc.cpp 2 | // 3 | // Copyright 2015-2018 J. Andrew Rogers 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | 18 | #include 19 | #include 20 | #include "metrohash.h" 21 | #include "platform.h" 22 | 23 | 24 | void metrohash128crc_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) 25 | { 26 | static const uint64_t k0 = 0xC83A91E1; 27 | static const uint64_t k1 = 0x8648DBDB; 28 | static const uint64_t k2 = 0x7BDEC03B; 29 | static const uint64_t k3 = 0x2F5870A5; 30 | 31 | const uint8_t * ptr = reinterpret_cast(key); 32 | const uint8_t * const end = ptr + len; 33 | 34 | uint64_t v[4]; 35 | 36 | v[0] = ((static_cast(seed) - k0) * k3) + len; 37 | v[1] = ((static_cast(seed) + k1) * k2) + len; 38 | 39 | if (len >= 32) 40 | { 41 | v[2] = ((static_cast(seed) + k0) * k2) + len; 42 | v[3] = ((static_cast(seed) - k1) * k3) + len; 43 | 44 | do 45 | { 46 | v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8; 47 | v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8; 48 | v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8; 49 | v[3] ^= _mm_crc32_u64(v[3], read_u64(ptr)); ptr += 8; 50 | } 51 | while (ptr <= (end - 32)); 52 | 53 | v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 34) * k1; 54 | v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 37) * k0; 55 | v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 34) * k1; 56 | v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 37) * k0; 57 | } 58 | 59 | if ((end - ptr) >= 16) 60 | { 61 | v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],34) * k3; 62 | v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],34) * k3; 63 | v[0] ^= rotate_right((v[0] * k2) + v[1], 30) * k1; 64 | v[1] ^= rotate_right((v[1] * k3) + v[0], 30) * k0; 65 | } 66 | 67 | if ((end - ptr) >= 8) 68 | { 69 | v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],36) * k3; 70 | v[0] ^= rotate_right((v[0] * k2) + v[1], 23) * k1; 71 | } 72 | 73 | if ((end - ptr) >= 4) 74 | { 75 | v[1] ^= _mm_crc32_u64(v[0], 
read_u32(ptr)); ptr += 4; 76 | v[1] ^= rotate_right((v[1] * k3) + v[0], 19) * k0; 77 | } 78 | 79 | if ((end - ptr) >= 2) 80 | { 81 | v[0] ^= _mm_crc32_u64(v[1], read_u16(ptr)); ptr += 2; 82 | v[0] ^= rotate_right((v[0] * k2) + v[1], 13) * k1; 83 | } 84 | 85 | if ((end - ptr) >= 1) 86 | { 87 | v[1] ^= _mm_crc32_u64(v[0], read_u8 (ptr)); 88 | v[1] ^= rotate_right((v[1] * k3) + v[0], 17) * k0; 89 | } 90 | 91 | v[0] += rotate_right((v[0] * k0) + v[1], 11); 92 | v[1] += rotate_right((v[1] * k1) + v[0], 26); 93 | v[0] += rotate_right((v[0] * k0) + v[1], 11); 94 | v[1] += rotate_right((v[1] * k1) + v[0], 26); 95 | 96 | memcpy(out, v, 16); 97 | } 98 | 99 | 100 | void metrohash128crc_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) 101 | { 102 | static const uint64_t k0 = 0xEE783E2F; 103 | static const uint64_t k1 = 0xAD07C493; 104 | static const uint64_t k2 = 0x797A90BB; 105 | static const uint64_t k3 = 0x2E4B2E1B; 106 | 107 | const uint8_t * ptr = reinterpret_cast(key); 108 | const uint8_t * const end = ptr + len; 109 | 110 | uint64_t v[4]; 111 | 112 | v[0] = ((static_cast(seed) - k0) * k3) + len; 113 | v[1] = ((static_cast(seed) + k1) * k2) + len; 114 | 115 | if (len >= 32) 116 | { 117 | v[2] = ((static_cast(seed) + k0) * k2) + len; 118 | v[3] = ((static_cast(seed) - k1) * k3) + len; 119 | 120 | do 121 | { 122 | v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8; 123 | v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8; 124 | v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8; 125 | v[3] ^= _mm_crc32_u64(v[3], read_u64(ptr)); ptr += 8; 126 | } 127 | while (ptr <= (end - 32)); 128 | 129 | v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 12) * k1; 130 | v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 19) * k0; 131 | v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 12) * k1; 132 | v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 19) * k0; 133 | } 134 | 135 | if ((end - ptr) >= 16) 136 | { 137 | v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = 
rotate_right(v[0],41) * k3; 138 | v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],41) * k3; 139 | v[0] ^= rotate_right((v[0] * k2) + v[1], 10) * k1; 140 | v[1] ^= rotate_right((v[1] * k3) + v[0], 10) * k0; 141 | } 142 | 143 | if ((end - ptr) >= 8) 144 | { 145 | v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],34) * k3; 146 | v[0] ^= rotate_right((v[0] * k2) + v[1], 22) * k1; 147 | } 148 | 149 | if ((end - ptr) >= 4) 150 | { 151 | v[1] ^= _mm_crc32_u64(v[0], read_u32(ptr)); ptr += 4; 152 | v[1] ^= rotate_right((v[1] * k3) + v[0], 14) * k0; 153 | } 154 | 155 | if ((end - ptr) >= 2) 156 | { 157 | v[0] ^= _mm_crc32_u64(v[1], read_u16(ptr)); ptr += 2; 158 | v[0] ^= rotate_right((v[0] * k2) + v[1], 15) * k1; 159 | } 160 | 161 | if ((end - ptr) >= 1) 162 | { 163 | v[1] ^= _mm_crc32_u64(v[0], read_u8 (ptr)); 164 | v[1] ^= rotate_right((v[1] * k3) + v[0], 18) * k0; 165 | } 166 | 167 | v[0] += rotate_right((v[0] * k0) + v[1], 15); 168 | v[1] += rotate_right((v[1] * k1) + v[0], 27); 169 | v[0] += rotate_right((v[0] * k0) + v[1], 15); 170 | v[1] += rotate_right((v[1] * k1) + v[0], 27); 171 | 172 | memcpy(out, v, 16); 173 | } 174 | -------------------------------------------------------------------------------- /src/metrohash128crc.h: -------------------------------------------------------------------------------- 1 | // metrohash128crc.h 2 | // 3 | // Copyright 2015-2018 J. Andrew Rogers 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #ifndef METROHASH_METROHASH_128_CRC_H 18 | #define METROHASH_METROHASH_128_CRC_H 19 | 20 | #include 21 | 22 | // Legacy 128-bit hash functions 23 | void metrohash128crc_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); 24 | void metrohash128crc_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); 25 | 26 | 27 | #endif // #ifndef METROHASH_METROHASH_128_CRC_H 28 | -------------------------------------------------------------------------------- /src/metrohash64.cc: -------------------------------------------------------------------------------- 1 | // metrohash64.cpp 2 | // 3 | // Copyright 2015-2018 J. Andrew Rogers 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #include "platform.h" 18 | #include "metrohash64.h" 19 | 20 | #include 21 | 22 | const char * MetroHash64::test_string = "012345678901234567890123456789012345678901234567890123456789012"; 23 | 24 | const uint8_t MetroHash64::test_seed_0[8] = { 0x6B, 0x75, 0x3D, 0xAE, 0x06, 0x70, 0x4B, 0xAD }; 25 | const uint8_t MetroHash64::test_seed_1[8] = { 0x3B, 0x0D, 0x48, 0x1C, 0xF4, 0xB9, 0xB8, 0xDF }; 26 | 27 | 28 | 29 | MetroHash64::MetroHash64(const uint64_t seed) 30 | { 31 | Initialize(seed); 32 | } 33 | 34 | 35 | void MetroHash64::Initialize(const uint64_t seed) 36 | { 37 | vseed = (static_cast(seed) + k2) * k0; 38 | 39 | // initialize internal hash registers 40 | state.v[0] = vseed; 41 | state.v[1] = vseed; 42 | state.v[2] = vseed; 43 | state.v[3] = vseed; 44 | 45 | // initialize total length of input 46 | bytes = 0; 47 | } 48 | 49 | 50 | void MetroHash64::Update(const uint8_t * const buffer, const uint64_t length) 51 | { 52 | const uint8_t * ptr = reinterpret_cast(buffer); 53 | const uint8_t * const end = ptr + length; 54 | 55 | // input buffer may be partially filled 56 | if (bytes % 32) 57 | { 58 | uint64_t fill = 32 - (bytes % 32); 59 | if (fill > length) 60 | fill = length; 61 | 62 | memcpy(input.b + (bytes % 32), ptr, static_cast(fill)); 63 | ptr += fill; 64 | bytes += fill; 65 | 66 | // input buffer is still partially filled 67 | if ((bytes % 32) != 0) return; 68 | 69 | // process full input buffer 70 | state.v[0] += read_u64(&input.b[ 0]) * k0; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; 71 | state.v[1] += read_u64(&input.b[ 8]) * k1; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; 72 | state.v[2] += read_u64(&input.b[16]) * k2; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; 73 | state.v[3] += read_u64(&input.b[24]) * k3; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; 74 | } 75 | 76 | // bulk update 77 | bytes += static_cast(end - ptr); 78 | while (ptr <= (end - 32)) 79 | { 80 | // process directly from the source, 
bypassing the input buffer 81 | state.v[0] += read_u64(ptr) * k0; ptr += 8; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; 82 | state.v[1] += read_u64(ptr) * k1; ptr += 8; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; 83 | state.v[2] += read_u64(ptr) * k2; ptr += 8; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; 84 | state.v[3] += read_u64(ptr) * k3; ptr += 8; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; 85 | } 86 | 87 | // store remaining bytes in input buffer 88 | if (ptr < end) 89 | memcpy(input.b, ptr, static_cast(end - ptr)); 90 | } 91 | 92 | 93 | void MetroHash64::Finalize(uint8_t * const hash) 94 | { 95 | // finalize bulk loop, if used 96 | if (bytes >= 32) 97 | { 98 | state.v[2] ^= rotate_right(((state.v[0] + state.v[3]) * k0) + state.v[1], 37) * k1; 99 | state.v[3] ^= rotate_right(((state.v[1] + state.v[2]) * k1) + state.v[0], 37) * k0; 100 | state.v[0] ^= rotate_right(((state.v[0] + state.v[2]) * k0) + state.v[3], 37) * k1; 101 | state.v[1] ^= rotate_right(((state.v[1] + state.v[3]) * k1) + state.v[2], 37) * k0; 102 | 103 | state.v[0] = vseed + (state.v[0] ^ state.v[1]); 104 | } 105 | 106 | // process any bytes remaining in the input buffer 107 | const uint8_t * ptr = reinterpret_cast(input.b); 108 | const uint8_t * const end = ptr + (bytes % 32); 109 | 110 | if ((end - ptr) >= 16) 111 | { 112 | state.v[1] = state.v[0] + (read_u64(ptr) * k2); ptr += 8; state.v[1] = rotate_right(state.v[1],29) * k3; 113 | state.v[2] = state.v[0] + (read_u64(ptr) * k2); ptr += 8; state.v[2] = rotate_right(state.v[2],29) * k3; 114 | state.v[1] ^= rotate_right(state.v[1] * k0, 21) + state.v[2]; 115 | state.v[2] ^= rotate_right(state.v[2] * k3, 21) + state.v[1]; 116 | state.v[0] += state.v[2]; 117 | } 118 | 119 | if ((end - ptr) >= 8) 120 | { 121 | state.v[0] += read_u64(ptr) * k3; ptr += 8; 122 | state.v[0] ^= rotate_right(state.v[0], 55) * k1; 123 | } 124 | 125 | if ((end - ptr) >= 4) 126 | { 127 | state.v[0] += read_u32(ptr) * k3; ptr += 
4; 128 | state.v[0] ^= rotate_right(state.v[0], 26) * k1; 129 | } 130 | 131 | if ((end - ptr) >= 2) 132 | { 133 | state.v[0] += read_u16(ptr) * k3; ptr += 2; 134 | state.v[0] ^= rotate_right(state.v[0], 48) * k1; 135 | } 136 | 137 | if ((end - ptr) >= 1) 138 | { 139 | state.v[0] += read_u8 (ptr) * k3; 140 | state.v[0] ^= rotate_right(state.v[0], 37) * k1; 141 | } 142 | 143 | state.v[0] ^= rotate_right(state.v[0], 28); 144 | state.v[0] *= k0; 145 | state.v[0] ^= rotate_right(state.v[0], 29); 146 | 147 | bytes = 0; 148 | 149 | // do any endian conversion here 150 | 151 | memcpy(hash, state.v, 8); 152 | } 153 | 154 | 155 | void MetroHash64::Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed) 156 | { 157 | const uint8_t * ptr = reinterpret_cast(buffer); 158 | const uint8_t * const end = ptr + length; 159 | 160 | uint64_t h = (static_cast(seed) + k2) * k0; 161 | 162 | if (length >= 32) 163 | { 164 | uint64_t v[4]; 165 | v[0] = h; 166 | v[1] = h; 167 | v[2] = h; 168 | v[3] = h; 169 | 170 | do 171 | { 172 | v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; 173 | v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; 174 | v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; 175 | v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; 176 | } 177 | while (ptr <= (end - 32)); 178 | 179 | v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 37) * k1; 180 | v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 37) * k0; 181 | v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 37) * k1; 182 | v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 37) * k0; 183 | h += v[0] ^ v[1]; 184 | } 185 | 186 | if ((end - ptr) >= 16) 187 | { 188 | uint64_t v0 = h + (read_u64(ptr) * k2); ptr += 8; v0 = rotate_right(v0,29) * k3; 189 | uint64_t v1 = h + (read_u64(ptr) * k2); ptr += 8; v1 = rotate_right(v1,29) * k3; 190 | v0 ^= rotate_right(v0 * k0, 21) + v1; 191 | v1 ^= 
rotate_right(v1 * k3, 21) + v0; 192 | h += v1; 193 | } 194 | 195 | if ((end - ptr) >= 8) 196 | { 197 | h += read_u64(ptr) * k3; ptr += 8; 198 | h ^= rotate_right(h, 55) * k1; 199 | } 200 | 201 | if ((end - ptr) >= 4) 202 | { 203 | h += read_u32(ptr) * k3; ptr += 4; 204 | h ^= rotate_right(h, 26) * k1; 205 | } 206 | 207 | if ((end - ptr) >= 2) 208 | { 209 | h += read_u16(ptr) * k3; ptr += 2; 210 | h ^= rotate_right(h, 48) * k1; 211 | } 212 | 213 | if ((end - ptr) >= 1) 214 | { 215 | h += read_u8 (ptr) * k3; 216 | h ^= rotate_right(h, 37) * k1; 217 | } 218 | 219 | h ^= rotate_right(h, 28); 220 | h *= k0; 221 | h ^= rotate_right(h, 29); 222 | 223 | memcpy(hash, &h, 8); 224 | } 225 | 226 | 227 | bool MetroHash64::ImplementationVerified() 228 | { 229 | uint8_t hash[8]; 230 | const uint8_t * key = reinterpret_cast(MetroHash64::test_string); 231 | 232 | // verify one-shot implementation 233 | MetroHash64::Hash(key, strlen(MetroHash64::test_string), hash, 0); 234 | if (memcmp(hash, MetroHash64::test_seed_0, 8) != 0) return false; 235 | 236 | MetroHash64::Hash(key, strlen(MetroHash64::test_string), hash, 1); 237 | if (memcmp(hash, MetroHash64::test_seed_1, 8) != 0) return false; 238 | 239 | // verify incremental implementation 240 | MetroHash64 metro; 241 | 242 | metro.Initialize(0); 243 | metro.Update(reinterpret_cast(MetroHash64::test_string), strlen(MetroHash64::test_string)); 244 | metro.Finalize(hash); 245 | if (memcmp(hash, MetroHash64::test_seed_0, 8) != 0) return false; 246 | 247 | metro.Initialize(1); 248 | metro.Update(reinterpret_cast(MetroHash64::test_string), strlen(MetroHash64::test_string)); 249 | metro.Finalize(hash); 250 | if (memcmp(hash, MetroHash64::test_seed_1, 8) != 0) return false; 251 | 252 | return true; 253 | } 254 | 255 | 256 | void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) 257 | { 258 | static const uint64_t k0 = 0xC83A91E1; 259 | static const uint64_t k1 = 0x8648DBDB; 260 | static const uint64_t k2 = 
0x7BDEC03B; 261 | static const uint64_t k3 = 0x2F5870A5; 262 | 263 | const uint8_t * ptr = reinterpret_cast(key); 264 | const uint8_t * const end = ptr + len; 265 | 266 | uint64_t hash = ((static_cast(seed) + k2) * k0) + len; 267 | 268 | if (len >= 32) 269 | { 270 | uint64_t v[4]; 271 | v[0] = hash; 272 | v[1] = hash; 273 | v[2] = hash; 274 | v[3] = hash; 275 | 276 | do 277 | { 278 | v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; 279 | v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; 280 | v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; 281 | v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; 282 | } 283 | while (ptr <= (end - 32)); 284 | 285 | v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1; 286 | v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0; 287 | v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1; 288 | v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0; 289 | hash += v[0] ^ v[1]; 290 | } 291 | 292 | if ((end - ptr) >= 16) 293 | { 294 | uint64_t v0 = hash + (read_u64(ptr) * k0); ptr += 8; v0 = rotate_right(v0,33) * k1; 295 | uint64_t v1 = hash + (read_u64(ptr) * k1); ptr += 8; v1 = rotate_right(v1,33) * k2; 296 | v0 ^= rotate_right(v0 * k0, 35) + v1; 297 | v1 ^= rotate_right(v1 * k3, 35) + v0; 298 | hash += v1; 299 | } 300 | 301 | if ((end - ptr) >= 8) 302 | { 303 | hash += read_u64(ptr) * k3; ptr += 8; 304 | hash ^= rotate_right(hash, 33) * k1; 305 | 306 | } 307 | 308 | if ((end - ptr) >= 4) 309 | { 310 | hash += read_u32(ptr) * k3; ptr += 4; 311 | hash ^= rotate_right(hash, 15) * k1; 312 | } 313 | 314 | if ((end - ptr) >= 2) 315 | { 316 | hash += read_u16(ptr) * k3; ptr += 2; 317 | hash ^= rotate_right(hash, 13) * k1; 318 | } 319 | 320 | if ((end - ptr) >= 1) 321 | { 322 | hash += read_u8 (ptr) * k3; 323 | hash ^= rotate_right(hash, 25) * k1; 324 | } 325 | 326 | hash ^= rotate_right(hash, 33); 327 | hash *= k0; 328 | 
hash ^= rotate_right(hash, 33); 329 | 330 | memcpy(out, &hash, 8); 331 | } 332 | 333 | 334 | void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) 335 | { 336 | static const uint64_t k0 = 0xD6D018F5; 337 | static const uint64_t k1 = 0xA2AA033B; 338 | static const uint64_t k2 = 0x62992FC1; 339 | static const uint64_t k3 = 0x30BC5B29; 340 | 341 | const uint8_t * ptr = reinterpret_cast(key); 342 | const uint8_t * const end = ptr + len; 343 | 344 | uint64_t hash = ((static_cast(seed) + k2) * k0) + len; 345 | 346 | if (len >= 32) 347 | { 348 | uint64_t v[4]; 349 | v[0] = hash; 350 | v[1] = hash; 351 | v[2] = hash; 352 | v[3] = hash; 353 | 354 | do 355 | { 356 | v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; 357 | v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; 358 | v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; 359 | v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; 360 | } 361 | while (ptr <= (end - 32)); 362 | 363 | v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 30) * k1; 364 | v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 30) * k0; 365 | v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 30) * k1; 366 | v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 30) * k0; 367 | hash += v[0] ^ v[1]; 368 | } 369 | 370 | if ((end - ptr) >= 16) 371 | { 372 | uint64_t v0 = hash + (read_u64(ptr) * k2); ptr += 8; v0 = rotate_right(v0,29) * k3; 373 | uint64_t v1 = hash + (read_u64(ptr) * k2); ptr += 8; v1 = rotate_right(v1,29) * k3; 374 | v0 ^= rotate_right(v0 * k0, 34) + v1; 375 | v1 ^= rotate_right(v1 * k3, 34) + v0; 376 | hash += v1; 377 | } 378 | 379 | if ((end - ptr) >= 8) 380 | { 381 | hash += read_u64(ptr) * k3; ptr += 8; 382 | hash ^= rotate_right(hash, 36) * k1; 383 | } 384 | 385 | if ((end - ptr) >= 4) 386 | { 387 | hash += read_u32(ptr) * k3; ptr += 4; 388 | hash ^= rotate_right(hash, 15) * k1; 389 | } 390 | 391 | if ((end - ptr) 
>= 2) 392 | { 393 | hash += read_u16(ptr) * k3; ptr += 2; 394 | hash ^= rotate_right(hash, 15) * k1; 395 | } 396 | 397 | if ((end - ptr) >= 1) 398 | { 399 | hash += read_u8 (ptr) * k3; 400 | hash ^= rotate_right(hash, 23) * k1; 401 | } 402 | 403 | hash ^= rotate_right(hash, 28); 404 | hash *= k0; 405 | hash ^= rotate_right(hash, 29); 406 | 407 | memcpy(out, &hash, 8); 408 | } 409 | 410 | 411 | -------------------------------------------------------------------------------- /src/metrohash64.h: -------------------------------------------------------------------------------- 1 | // metrohash64.h 2 | // 3 | // Copyright 2015-2018 J. Andrew Rogers 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #ifndef METROHASH_METROHASH_64_H 18 | #define METROHASH_METROHASH_64_H 19 | 20 | #include 21 | 22 | class MetroHash64 23 | { 24 | public: 25 | static const uint32_t bits = 64; 26 | 27 | // Constructor initializes the same as Initialize() 28 | MetroHash64(const uint64_t seed=0); 29 | 30 | // Initializes internal state for new hash with optional seed 31 | void Initialize(const uint64_t seed=0); 32 | 33 | // Update the hash state with a string of bytes. If the length 34 | // is sufficiently long, the implementation switches to a bulk 35 | // hashing algorithm directly on the argument buffer for speed. 
36 | void Update(const uint8_t * buffer, const uint64_t length); 37 | 38 | // Constructs the final hash and writes it to the argument buffer. 39 | // After a hash is finalized, this instance must be Initialized()-ed 40 | // again or the behavior of Update() and Finalize() is undefined. 41 | void Finalize(uint8_t * const hash); 42 | 43 | // A non-incremental function implementation. This can be significantly 44 | // faster than the incremental implementation for some usage patterns. 45 | static void Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed=0); 46 | 47 | // Does implementation correctly execute test vectors? 48 | static bool ImplementationVerified(); 49 | 50 | // test vectors -- Hash(test_string, seed=0) => test_seed_0 51 | static const char * test_string; 52 | static const uint8_t test_seed_0[8]; 53 | static const uint8_t test_seed_1[8]; 54 | 55 | private: 56 | static const uint64_t k0 = 0xD6D018F5; 57 | static const uint64_t k1 = 0xA2AA033B; 58 | static const uint64_t k2 = 0x62992FC1; 59 | static const uint64_t k3 = 0x30BC5B29; 60 | 61 | struct { uint64_t v[4]; } state; 62 | struct { uint8_t b[32]; } input; 63 | uint64_t bytes; 64 | uint64_t vseed; 65 | }; 66 | 67 | 68 | // Legacy 64-bit hash functions -- do not use 69 | void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); 70 | void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); 71 | 72 | 73 | #endif // #ifndef METROHASH_METROHASH_64_H 74 | -------------------------------------------------------------------------------- /src/platform.h: -------------------------------------------------------------------------------- 1 | // platform.h 2 | // 3 | // Copyright 2015-2018 J. Andrew Rogers 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 
7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #ifndef METROHASH_PLATFORM_H 18 | #define METROHASH_PLATFORM_H 19 | 20 | #include 21 | 22 | // rotate right idiom recognized by most compilers 23 | inline static uint64_t rotate_right(uint64_t v, unsigned k) 24 | { 25 | return (v >> k) | (v << (64 - k)); 26 | } 27 | 28 | // unaligned reads, fast and safe on Nehalem and later microarchitectures 29 | inline static uint64_t read_u64(const void * const ptr) 30 | { 31 | return static_cast(*reinterpret_cast(ptr)); 32 | } 33 | 34 | inline static uint64_t read_u32(const void * const ptr) 35 | { 36 | return static_cast(*reinterpret_cast(ptr)); 37 | } 38 | 39 | inline static uint64_t read_u16(const void * const ptr) 40 | { 41 | return static_cast(*reinterpret_cast(ptr)); 42 | } 43 | 44 | inline static uint64_t read_u8 (const void * const ptr) 45 | { 46 | return static_cast(*reinterpret_cast(ptr)); 47 | } 48 | 49 | 50 | #endif // #ifndef METROHASH_PLATFORM_H 51 | -------------------------------------------------------------------------------- /tests/metrohash64_main.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: run_levc.cpp 5 | * 6 | * Description: Compute levenshtein distance (main) 7 | * 8 | * Version: 1.0 9 | * Created: 09/07/2015 21:21:41 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: Eugene Scherba (es), escherba+metrohash@gmail.com 14 | * Organization: - 15 | * 16 | * 
===================================================================================== 17 | */ 18 | 19 | #include 20 | #include 21 | #include 22 | #include "metro.h" 23 | 24 | 25 | int main(int argc, char** argv) { 26 | std::string line; 27 | if (argc <= 1) { 28 | return EXIT_FAILURE; 29 | } 30 | std::ifstream infile(argv[1]); 31 | while (std::getline(infile, line)) 32 | { 33 | uint64_t result = metrohash64((uint8_t*)line.c_str(), line.length(), 0); 34 | std::cout << result << "\t" << line << std::endl; 35 | } 36 | return EXIT_SUCCESS; 37 | } 38 | -------------------------------------------------------------------------------- /tests/test_metrohash.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: test_metro.cc 5 | * 6 | * Description: C++-based tests for MetroHash 7 | * 8 | * Version: 1.0 9 | * Created: 10/12/2015 16:30:58 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: Eugene Scherba (es) 14 | * Organization: - 15 | * 16 | * ===================================================================================== 17 | */ 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do this in one cpp file 26 | #include "catch.hpp" 27 | #include "metro.h" 28 | 29 | #define STRLEN(s) (sizeof(s)/sizeof(s[0]) - 1) 30 | #define HASH64_SZ 8 31 | #define HASH128_SZ 16 32 | 33 | // --------------------------------------------- 34 | // 64-bit 35 | // --------------------------------------------- 36 | TEST_CASE( "basic test (64-bit)", "[basic64]" ) { 37 | uint8_t * const hash = (uint8_t * const)calloc(HASH64_SZ + 1, sizeof(uint8_t)); 38 | const uint8_t test_string[] = "abracadabra"; 39 | REQUIRE(hash[0] == (uint8_t)'\0'); 40 | REQUIRE(hash[HASH64_SZ] == (uint8_t)'\0'); 41 | MetroHash64::Hash((uint8_t * 
const)test_string, STRLEN(test_string), hash, 0); 42 | REQUIRE(hash[0] != (uint8_t)'\0'); 43 | REQUIRE(hash[HASH64_SZ] == (uint8_t)'\0'); 44 | free(hash); 45 | } 46 | 47 | TEST_CASE( "test different seeds (64-bit)", "[seeds64]" ) { 48 | uint8_t * const hash1 = (uint8_t * const)calloc(HASH64_SZ + 1, sizeof(uint8_t)); 49 | uint8_t * const hash2 = (uint8_t * const)calloc(HASH64_SZ + 1, sizeof(uint8_t)); 50 | const uint8_t test_string[] = "abracadabra"; 51 | MetroHash64::Hash(test_string, STRLEN(test_string), hash1, 0); 52 | MetroHash64::Hash(test_string, STRLEN(test_string), hash2, 1); 53 | REQUIRE(memcmp(hash1, hash2, HASH64_SZ) != 0); 54 | free(hash1); 55 | free(hash2); 56 | } 57 | 58 | TEST_CASE( "test different inputs (64-bit)", "[inputs64]" ) { 59 | uint8_t * const hash1 = (uint8_t * const)calloc(HASH64_SZ + 1, sizeof(uint8_t)); 60 | uint8_t * const hash2 = (uint8_t * const)calloc(HASH64_SZ + 1, sizeof(uint8_t)); 61 | const uint8_t test_string1[] = "abracadabr"; 62 | const uint8_t test_string2[] = "abracaaabra"; 63 | MetroHash64::Hash(test_string1, STRLEN(test_string1), hash1, 0); 64 | MetroHash64::Hash(test_string2, STRLEN(test_string2), hash2, 0); 65 | REQUIRE(memcmp(hash1, hash2, HASH64_SZ) != 0); 66 | free(hash1); 67 | free(hash2); 68 | } 69 | 70 | TEST_CASE( "implementation verified (64-bit)", "[verified64]" ) { 71 | REQUIRE(MetroHash64::ImplementationVerified()); 72 | } 73 | 74 | TEST_CASE( "test incremental updating (64-bit)", "[incremental64]" ) { 75 | uint8_t * const hash_incremental = (uint8_t * const)calloc(HASH64_SZ + 1, sizeof(uint8_t)); 76 | uint8_t * const hash_whole = (uint8_t * const)calloc(HASH64_SZ + 1, sizeof(uint8_t)); 77 | const uint8_t test_string[] = "abracadabra"; 78 | const uint8_t test_string1[] = "abra"; 79 | const uint8_t test_string2[] = "cadabra"; 80 | REQUIRE(hash_incremental[0] == (uint8_t)'\0'); 81 | REQUIRE(hash_incremental[HASH64_SZ] == (uint8_t)'\0'); 82 | MetroHash64 m1(0); 83 | m1.Update(test_string1, STRLEN(test_string1)); 
84 | m1.Update(test_string2, STRLEN(test_string2)); 85 | m1.Finalize(hash_incremental); 86 | MetroHash64 m2(0); 87 | m2.Update(test_string, STRLEN(test_string)); 88 | m2.Finalize(hash_whole); 89 | REQUIRE(hash_incremental[0] != (uint8_t)'\0'); 90 | REQUIRE(hash_incremental[HASH64_SZ] == (uint8_t)'\0'); 91 | REQUIRE(memcmp(hash_incremental, hash_whole, HASH64_SZ) == 0); 92 | REQUIRE(bytes2int64(hash_incremental) == bytes2int64(hash_whole)); 93 | free(hash_incremental); 94 | free(hash_whole); 95 | } 96 | 97 | // --------------------------------------------- 98 | // 128-bit 99 | // --------------------------------------------- 100 | TEST_CASE( "basic test (128-bit)", "[basic128]" ) { 101 | uint8_t * const hash = (uint8_t * const)calloc(HASH128_SZ + 1, sizeof(uint8_t)); 102 | const uint8_t test_string[] = "abracadabra"; 103 | REQUIRE(hash[0] == (uint8_t)'\0'); 104 | REQUIRE(hash[HASH128_SZ] == (uint8_t)'\0'); 105 | MetroHash128::Hash((uint8_t * const)test_string, STRLEN(test_string), hash, 0); 106 | REQUIRE(hash[0] != (uint8_t)'\0'); 107 | REQUIRE(hash[HASH128_SZ] == (uint8_t)'\0'); 108 | free(hash); 109 | } 110 | 111 | TEST_CASE( "test different seeds (128-bit)", "[seeds128]" ) { 112 | uint8_t * const hash1 = (uint8_t * const)calloc(HASH128_SZ + 1, sizeof(uint8_t)); 113 | uint8_t * const hash2 = (uint8_t * const)calloc(HASH128_SZ + 1, sizeof(uint8_t)); 114 | const uint8_t test_string[] = "abracadabra"; 115 | MetroHash128::Hash(test_string, STRLEN(test_string), hash1, 0); 116 | MetroHash128::Hash(test_string, STRLEN(test_string), hash2, 1); 117 | REQUIRE(memcmp(hash1, hash2, HASH128_SZ) != 0); 118 | free(hash1); 119 | free(hash2); 120 | } 121 | 122 | TEST_CASE( "test different inputs (128-bit)", "[inputs128]" ) { 123 | uint8_t * const hash1 = (uint8_t * const)calloc(HASH128_SZ + 1, sizeof(uint8_t)); 124 | uint8_t * const hash2 = (uint8_t * const)calloc(HASH128_SZ + 1, sizeof(uint8_t)); 125 | const uint8_t test_string1[] = "abracadabr"; 126 | const uint8_t 
test_string2[] = "abracaaabra"; 127 | MetroHash128::Hash(test_string1, STRLEN(test_string1), hash1, 0); 128 | MetroHash128::Hash(test_string2, STRLEN(test_string2), hash2, 0); 129 | REQUIRE(memcmp(hash1, hash2, HASH128_SZ) != 0); 130 | free(hash1); 131 | free(hash2); 132 | } 133 | 134 | TEST_CASE( "implementation verified (128-bit)", "[verified128]" ) { 135 | REQUIRE(MetroHash128::ImplementationVerified()); 136 | } 137 | 138 | TEST_CASE( "test incremental updating (128-bit)", "[incremental128]" ) { 139 | uint8_t * const hash_incremental = (uint8_t * const)calloc(HASH128_SZ + 1, sizeof(uint8_t)); 140 | uint8_t * const hash_whole = (uint8_t * const)calloc(HASH128_SZ + 1, sizeof(uint8_t)); 141 | const uint8_t test_string[] = "abracadabra"; 142 | const uint8_t test_string1[] = "abra"; 143 | const uint8_t test_string2[] = "cadabra"; 144 | REQUIRE(hash_incremental[0] == (uint8_t)'\0'); 145 | REQUIRE(hash_incremental[HASH128_SZ] == (uint8_t)'\0'); 146 | MetroHash128 m1(0); 147 | m1.Update(test_string1, STRLEN(test_string1)); 148 | m1.Update(test_string2, STRLEN(test_string2)); 149 | m1.Finalize(hash_incremental); 150 | MetroHash128 m2(0); 151 | m2.Update(test_string, STRLEN(test_string)); 152 | m2.Finalize(hash_whole); 153 | REQUIRE(hash_incremental[0] != (uint8_t)'\0'); 154 | REQUIRE(hash_incremental[HASH128_SZ] == (uint8_t)'\0'); 155 | REQUIRE(memcmp(hash_incremental, hash_whole, HASH128_SZ) == 0); 156 | REQUIRE(bytes2int128(hash_incremental) == bytes2int128(hash_whole)); 157 | free(hash_incremental); 158 | free(hash_whole); 159 | } 160 | -------------------------------------------------------------------------------- /tests/test_metrohash.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python-based tests for metrohash extension 3 | """ 4 | import unittest 5 | import random 6 | import string 7 | import sys 8 | 9 | from metrohash import ( 10 | MetroHash64, 11 | MetroHash128, 12 | hash64_int as metrohash64, 13 | hash128_int 
as metrohash128, 14 | ) 15 | 16 | 17 | EMPTY_STRING = "" 18 | EMPTY_UNICODE = u"" # pylint: disable=redundant-u-string-prefix 19 | 20 | 21 | if sys.version_info[0] >= 3: 22 | long = int 23 | 24 | 25 | def random_string(n, alphabet=string.ascii_lowercase): 26 | """generate a random string""" 27 | return "".join(random.choice(alphabet) for _ in range(n)) 28 | 29 | 30 | def random_splits(s, n, nsplits=2): 31 | """split string in random places""" 32 | splits = sorted([random.randint(0, n) for _ in range(nsplits - 1)]) 33 | splits = [0] + splits + [n] 34 | for begin, end in zip(splits, splits[1:]): 35 | yield s[begin:end] 36 | 37 | 38 | class TestStateless(unittest.TestCase): 39 | 40 | """test stateless methods""" 41 | 42 | def test_string_unicode_64(self): 43 | """Empty Python string has same hash value as empty Unicode string""" 44 | self.assertEqual(metrohash64(EMPTY_STRING), metrohash64(EMPTY_UNICODE)) 45 | 46 | def test_string_unicode_128(self): 47 | """Empty Python string has same hash value as empty Unicode string""" 48 | self.assertEqual(metrohash128(EMPTY_STRING), metrohash128(EMPTY_UNICODE)) 49 | 50 | def test_consistent_encoding_64(self): 51 | """ASCII-range Unicode strings have the same hash values as ASCII strings""" 52 | text = u"abracadabra" # pylint: disable=redundant-u-string-prefix 53 | self.assertEqual(metrohash64(text), metrohash64(text.encode("utf-8"))) 54 | 55 | def test_consistent_encoding_128(self): 56 | """ASCII-range Unicode strings have the same hash values as ASCII strings""" 57 | text = u"abracadabra" # pylint: disable=redundant-u-string-prefix 58 | self.assertEqual(metrohash128(text), metrohash128(text.encode("utf-8"))) 59 | 60 | def test_unicode_1_64(self): 61 | """Accepts Unicode input""" 62 | test_case = u"abc" # pylint: disable=redundant-u-string-prefix 63 | self.assertTrue(isinstance(metrohash64(test_case), long)) 64 | 65 | def test_unicode_1_128(self): 66 | """Accepts Unicode input""" 67 | test_case = u"abc" # pylint: 
disable=redundant-u-string-prefix 68 | self.assertTrue(isinstance(metrohash128(test_case), long)) 69 | 70 | def test_unicode_2_64(self): 71 | """Accepts Unicode input outside of ASCII range""" 72 | test_case = u"\u2661" # pylint: disable=redundant-u-string-prefix 73 | self.assertTrue(isinstance(metrohash64(test_case), long)) 74 | 75 | def test_unicode_2_128(self): 76 | """Accepts Unicode input outside of ASCII range""" 77 | test_case = u"\u2661" # pylint: disable=redundant-u-string-prefix 78 | self.assertTrue(isinstance(metrohash128(test_case), long)) 79 | 80 | def test_refcounts(self): 81 | """Doesn't leak references to its argument""" 82 | funcs = [metrohash64, metrohash128] 83 | args = ["abc", b"abc", bytearray(b"def"), memoryview(b"ghi")] 84 | for func in funcs: 85 | for arg in args: 86 | old_refcount = sys.getrefcount(arg) 87 | func(arg) 88 | self.assertEqual(sys.getrefcount(arg), old_refcount) 89 | 90 | def test_func_raises_type_error(self): 91 | """Check that functions raise type error""" 92 | funcs = [metrohash64, metrohash128] 93 | for func in funcs: 94 | with self.assertRaises(TypeError): 95 | func([]) 96 | 97 | 98 | class TestIncremental(unittest.TestCase): 99 | 100 | """test incremental hashers""" 101 | 102 | def test_compose(self): 103 | """Test various ways to split a string""" 104 | nchars = 1000 105 | split_range = (2, 10) 106 | num_tests = 100 107 | hashers = [MetroHash64, MetroHash128] 108 | alphabet = string.ascii_uppercase + string.ascii_lowercase + string.digits 109 | 110 | for hasher in hashers: 111 | for _ in range(num_tests): 112 | data = random_string(nchars, alphabet=alphabet) 113 | hasher1 = hasher() 114 | pieces = list(random_splits(data, nchars, random.randint(*split_range))) 115 | for piece in pieces: 116 | hasher1.update(piece) 117 | incremental = hasher1.intdigest() 118 | hasher2 = hasher() 119 | hasher2.update(data) 120 | whole = hasher2.intdigest() 121 | msg = "\ndata: %s\nwhole: %s\nincremental: %s\n" % ( 122 | pieces, 123 | 
whole, 124 | incremental, 125 | ) 126 | self.assertEqual(whole, incremental, msg) 127 | 128 | def test_obj_raises_type_error(self): 129 | """Check that hasher objects raise type error""" 130 | hasher_classes = [MetroHash64, MetroHash128] 131 | for hasher_class in hasher_classes: 132 | hasher = hasher_class() 133 | with self.assertRaises(TypeError): 134 | hasher.update([]) 135 | 136 | def test_reset_64(self): 137 | """test that 64-bit hasher can be reset""" 138 | 139 | seed1 = 42 140 | expected1 = metrohash64("ab", seed1) 141 | hasher = MetroHash64(seed1) 142 | hasher.update("a") 143 | hasher.update("b") 144 | self.assertEqual(hasher.intdigest(), expected1) 145 | 146 | seed2 = 0 147 | hasher.reset(seed2) 148 | expected2 = metrohash64("c", seed2) 149 | hasher.update("c") 150 | self.assertEqual(hasher.intdigest(), expected2) 151 | 152 | def test_reset_128(self): 153 | """test that 128-bit hasher can be reset""" 154 | 155 | seed1 = 42 156 | expected1 = metrohash128("ab", seed1) 157 | hasher = MetroHash128(seed1) 158 | hasher.update("a") 159 | hasher.update("b") 160 | self.assertEqual(hasher.intdigest(), expected1) 161 | 162 | seed2 = 0 163 | hasher.reset(seed2) 164 | expected2 = metrohash128("c", seed2) 165 | hasher.update("c") 166 | self.assertEqual(hasher.intdigest(), expected2) 167 | --------------------------------------------------------------------------------