├── .gitignore ├── LICENSE.txt ├── README.md ├── bagminhash ├── bitstream_random.hpp ├── exponential_distribution.hpp └── weighted_minwise_hashing.hpp ├── bagminhash_wrappers.hpp ├── catch.hpp ├── darthash.hpp ├── dartminhash.hpp ├── datagenerator.hpp ├── hashing.hpp ├── icws.hpp ├── main.cpp ├── makefile ├── output ├── performance.csv └── similarity.csv ├── similarity.hpp ├── tests-main.cpp ├── tests.cpp └── timer.hpp /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | main 3 | tests 4 | .vscode/* 5 | /bagminhash/xxhash 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Tobias Christiani 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DartMinHash: Fast Sketching for Weighted Sets
2 | 
3 | This repository contains experiments for comparing the estimation accuracy and running times of the following weighted minwise hashing algorithms:
4 | 
5 | * DartMinHash https://arxiv.org/abs/2005.11547
6 | * ICWS and FastICWS https://research.google/pubs/pub36928/
7 | * BagMinHash https://arxiv.org/abs/1802.03914
8 | 
9 | For BagMinHash and ICWS we use the implementation from here https://github.com/oertl/bagminhash with the relevant files included in the /bagminhash folder.
10 | 
11 | See the DartMinHash paper https://arxiv.org/abs/2005.11547 for a description of the algorithm and further results of experiments.
12 | 
13 | ## Requirements
14 | The BagMinHash algorithm uses XXHash64 which must be installed:
15 | 
16 | 1. Get xxhash from https://github.com/Cyan4973/xxHash, e.g. using `git clone https://github.com/Cyan4973/xxHash.git`
17 | 2. Build using `make lib`
18 | 3. Place xxhash.h and libxxhash.a into the directory /bagminhash/xxhash
19 | 
20 | The code compiles under GCC version 7.5.0 https://gcc.gnu.org/ with relevant commands in the makefile https://www.gnu.org/software/make/.
21 | 
22 | ## Commands
23 | 
24 | `make run` compiles and executes the main function in main.cpp.
25 | 
26 | `make test` compiles and runs unit tests.
27 | 
28 | ## Experiments
29 | The different experiments are all placed in the main.cpp and write their output to stdout in CSV format.
30 | 
31 | 1. `time_performance`: Times different algorithms on synthetic data for all combinations of sketch lengths, and L0 and L1 norms chosen.
32 | 2. `time_performance_specific`: Same as above, but only runs on specified tuples of parameters.
33 | 3. 
`measure_similarity`: Returns the estimated Jaccard similarity of different algorithms on synthetic pairs of weighted sets with a specific similarity. 34 | 35 | By default `make run` will run `time_performance_specific` on a subset of the settings used in Table 1 in the paper. 36 | 37 | In order to pipe the output to the file `data.csv` use command `make run > data.csv`. 38 | 39 | ## Example output 40 | 41 | Notation: 42 | 43 | * t denotes the sketch length (usually k in the paper). 44 | * ICWS is a simple and unoptimized version of ICWS using tabulation hashing. 45 | * ICWS_xxhash is the implementation from the BagMinHash repository which uses the ziggurat algorithm for fast sampling: https://en.wikipedia.org/wiki/Ziggurat_algorithm 46 | * FastICWS is our own highly optimized implementation of ICWS that tabulates expensive operations and only computes the logarithms of weights once. 47 | * BagMinHash1 and BagMinHash2: BagMinHash variants described in the BagMinHash paper. BagMinHash2 is essentially always faster and is what we compare against. 48 | * DartMinHash: Optimized implementation following the pseudocode in the paper. 
49 | 50 | ### Performance timings 51 | 52 | | id | L0 | log2_L1 | t | ICWS | FastICWS | ICWS_xxhash | BagMinHash1 | BagMinHash2 | DartMinHash | 53 | |----|------|---------|------|---------|----------|-------------|-------------|-------------|-------------| 54 | | 0 | 64 | 0.000 | 64 | 0.899 | 0.060 | 0.538 | 2.439 | 0.628 | 0.042 | 55 | | 1 | 1024 | 0.000 | 64 | 11.565 | 0.515 | 9.604 | 4.374 | 1.706 | 0.145 | 56 | | 2 | 64 | 0.000 | 1024 | 19.296 | 2.885 | 8.083 | 48.248 | 13.279 | 0.592 | 57 | | 3 | 1024 | 0.000 | 1024 | 187.661 | 12.643 | 120.135 | 79.775 | 16.586 | 0.824 | 58 | | 4 | 256 | 0.000 | 1 | 0.040 | 0.008 | 0.040 | 0.112 | 0.103 | 0.021 | 59 | | 5 | 256 | 0.000 | 256 | 14.645 | 0.939 | 7.716 | 13.687 | 3.270 | 0.187 | 60 | | 6 | 1024 | 0.000 | 256 | 45.239 | 2.703 | 30.127 | 18.175 | 4.296 | 0.274 | 61 | | 7 | 1024 | 64.000 | 256 | 46.717 | 2.720 | 30.122 | 18.241 | 4.250 | 2.632 | 62 | | 8 | 1024 | -64.000 | 256 | 47.677 | 2.719 | 30.117 | 18.096 | 4.192 | 2.333 | 63 | 64 | ### Jaccard similarity estimates 65 | 66 | | sim_j | t | ICWS_xxhash | FastICWS | BagMinHash2 | DartMinHash | 67 | |-------|----|-------------|----------|-------------|-------------| 68 | | 0.500 | 1 | 1.000 | 1.000 | 0.000 | 1.000 | 69 | | 0.500 | 2 | 0.500 | 0.500 | 0.000 | 0.500 | 70 | | 0.500 | 3 | 0.333 | 0.333 | 0.000 | 0.333 | 71 | | 0.500 | 4 | 0.500 | 0.250 | 0.750 | 0.750 | 72 | | 0.500 | 5 | 0.000 | 0.400 | 0.600 | 0.200 | 73 | | 0.500 | 6 | 0.667 | 0.500 | 0.500 | 0.000 | 74 | | 0.500 | 7 | 0.571 | 0.714 | 0.429 | 0.429 | 75 | | 0.500 | 8 | 0.250 | 0.375 | 0.625 | 0.500 | 76 | | 0.500 | 9 | 0.889 | 0.222 | 0.556 | 0.444 | 77 | | 0.500 | 10 | 0.600 | 0.400 | 0.700 | 0.400 | 78 | 79 | ## Tests 80 | We use Catch2 https://github.com/catchorg/Catch2 for unit testing. 
81 | 82 | To compile and run tests use the command: `make test` -------------------------------------------------------------------------------- /bagminhash/bitstream_random.hpp: -------------------------------------------------------------------------------- 1 | //################################## 2 | //# Copyright (C) 2018 Otmar Ertl. # 3 | //# All rights reserved. # 4 | //################################## 5 | 6 | #ifndef _BIT_STREAM_RANDOM_HPP_ 7 | #define _BIT_STREAM_RANDOM_HPP_ 8 | 9 | #include "exponential_distribution.hpp" 10 | 11 | #include "xxhash/xxhash.h" 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | static constexpr double maxReciprocal = 1. / (UINT64_C(1) << 52); 19 | 20 | // uniform distributed double value from (0, 1] 21 | template double getUniformDouble(T& bitstream) { 22 | return (bitstream(52) + 1) * maxReciprocal; 23 | } 24 | 25 | template double getExponential1(T& bitstream) { 26 | return ziggurat::getExponential(bitstream); 27 | } 28 | 29 | template double getGamma21(T& bitstream) { 30 | return getExponential1(bitstream) + getExponential1(bitstream); 31 | } 32 | 33 | template double getBeta21(T& bitstream) { 34 | return std::sqrt(getUniformDouble(bitstream)); 35 | } 36 | 37 | template bool getBernoulli(double successProbability, T& bitStream) { 38 | while(true) { 39 | if (successProbability == 0) return false; 40 | if (successProbability == 1) return true; 41 | bool b = successProbability > 0.5; 42 | if (bitStream()) return b; 43 | successProbability += successProbability; 44 | if (b) successProbability -= 1; 45 | } 46 | } 47 | 48 | // see Lumbroso, Jeremie. "Optimal discrete uniform generation from coin flips, and applications." arXiv preprint arXiv:1304.1916 (2013). 
49 | template uint64_t getUniform(uint64_t n, T& bitstream) { 50 | assert(n > 0); 51 | uint64_t v = 1; 52 | uint64_t c = 0; 53 | while(true) { 54 | v <<= 1; 55 | c <<= 1; 56 | c += bitstream(); 57 | if (v >= n) { 58 | if (c < n) { 59 | return c; 60 | } 61 | else { 62 | v -= n; 63 | c -= n; 64 | } 65 | } 66 | } 67 | } 68 | 69 | template uint64_t getUniformPow2(uint8_t numBits, T& bitstream) { 70 | return bitstream(numBits); 71 | } 72 | 73 | class XXHash64 { 74 | public: 75 | static uint64_t calculateHash(const char* data, size_t length, uint64_t seed) { 76 | return XXH64(data, length, seed); 77 | } 78 | }; 79 | 80 | struct BitMasks { 81 | constexpr BitMasks() : masks() { 82 | masks[0] = 0; 83 | for (uint8_t i = 1; i <= 63; ++i) masks[i] = (UINT64_C(1) << i) - UINT64_C(1); 84 | masks[64] = UINT64_C(0xFFFFFFFFFFFFFFFF); 85 | } 86 | 87 | uint64_t masks[65]; 88 | }; 89 | 90 | static constexpr BitMasks BIT_MASKS; 91 | 92 | template 93 | class BitStream { 94 | 95 | static const uint32_t FNV_OFFSET; 96 | static const uint32_t FNV_PRIME; 97 | 98 | size_t dataSize; 99 | std::unique_ptr data; 100 | uint64_t seed; 101 | uint64_t hashBits; 102 | uint8_t availableBits; 103 | 104 | void nextHash() { 105 | uint32_t tmp; 106 | memcpy(&tmp, data.get(), sizeof(uint32_t)); 107 | tmp *= FNV_PRIME; 108 | memcpy(data.get(), &tmp, sizeof(uint32_t)); 109 | hashBits = R::calculateHash(data.get(), dataSize, seed); 110 | } 111 | public: 112 | 113 | BitStream(const BitStream& p) = delete; 114 | BitStream& operator=(const BitStream&) = delete; 115 | BitStream(BitStream&& p) = default; 116 | BitStream& operator=(BitStream&&) = default; 117 | 118 | template 119 | BitStream(const I& valueProvider, uint64_t _seed) : dataSize(valueProvider.size() + sizeof(uint32_t)), data(new char[dataSize]), seed(_seed), hashBits(0), availableBits(0) { 120 | memcpy(data.get(), &FNV_OFFSET, sizeof(uint32_t)); 121 | valueProvider.init(&data[sizeof(uint32_t)]); 122 | } 123 | 124 | bool operator()() { 125 | if 
(availableBits == 0) { 126 | nextHash(); 127 | } 128 | bool result = (hashBits & UINT64_C(1)); 129 | hashBits >>= 1; 130 | availableBits -= 1; 131 | availableBits &= UINT8_C(0x3F); 132 | return result; 133 | } 134 | 135 | uint64_t operator()(uint8_t numBits) { 136 | assert(numBits <= 64); 137 | uint64_t result = 0; 138 | uint8_t requiredBits = numBits; 139 | if(numBits > availableBits) { 140 | result = (hashBits & BIT_MASKS.masks[availableBits]); 141 | result <<= (numBits - availableBits); 142 | nextHash(); 143 | requiredBits -= availableBits; 144 | } 145 | result |= (hashBits & BIT_MASKS.masks[requiredBits]); 146 | hashBits >>= requiredBits; 147 | availableBits -= numBits; 148 | availableBits &= UINT8_C(0x3F); 149 | return result; 150 | } 151 | }; 152 | 153 | // see https://en.wikipedia.org/wiki/Fowler-Noll-Vo_hash_function 154 | template const uint32_t BitStream:: FNV_OFFSET = 0x811c9dc5; 155 | template const uint32_t BitStream:: FNV_PRIME = (1 << 24) + (1 << 8) + 0x93; 156 | 157 | #endif // _BIT_STREAM_RANDOM_HPP_ 158 | -------------------------------------------------------------------------------- /bagminhash/exponential_distribution.hpp: -------------------------------------------------------------------------------- 1 | /* Boost Software License - Version 1.0 - August 17th, 2003 2 | * 3 | * Permission is hereby granted, free of charge, to any person or organization 4 | * obtaining a copy of the software and accompanying documentation covered by 5 | * this license (the "Software") to use, reproduce, display, distribute, 6 | * execute, and transmit the Software, and to prepare derivative works of the 7 | * Software, and to permit third-parties to whom the Software is furnished to 8 | * do so, all subject to the following: 9 | * 10 | * The copyright notices in the Software and this entire statement, including 11 | * the above license grant, this restriction and the following disclaimer, 12 | * must be included in all copies of the Software, in whole or in part, 
and 13 | * all derivative works of the Software, unless such copies or derivative 14 | * works are solely in the form of machine-executable object code generated by 15 | * a source language processor. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 20 | * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 21 | * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 22 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | * DEALINGS IN THE SOFTWARE. 24 | */ 25 | 26 | /* boost random/exponential_distribution.hpp header file 27 | * 28 | * Copyright Jens Maurer 2000-2001 29 | * Copyright Steven Watanabe 2011 30 | * Copyright Jason Rhinelander 2016 31 | * Distributed under the Boost Software License, Version 1.0. (See 32 | * accompanying file LICENSE_1_0.txt or copy at 33 | * http://www.boost.org/LICENSE_1_0.txt) 34 | * 35 | * See http://www.boost.org for most recent version including documentation. 
36 | * 37 | * $Id$ 38 | * 39 | * Revision history 40 | * 2001-02-18 moved to individual header files 41 | */ 42 | 43 | #ifndef _EXPONENTIAL_DISTRIBUTION_HPP_ 44 | #define _EXPONENTIAL_DISTRIBUTION_HPP_ 45 | 46 | #include 47 | 48 | namespace ziggurat { 49 | 50 | // tables for the ziggurat algorithm 51 | struct exponential_table { 52 | static const double table_x[257]; 53 | static const double table_y[257]; 54 | }; 55 | 56 | const double exponential_table::table_x[257] = { 57 | 8.6971174701310497140, 7.6971174701310497140, 6.9410336293772123602, 6.4783784938325698538, 58 | 6.1441646657724730491, 5.8821443157953997963, 5.6664101674540337371, 5.4828906275260628694, 59 | 5.3230905057543986131, 5.1814872813015010392, 5.0542884899813047117, 4.9387770859012514838, 60 | 4.8329397410251125881, 4.7352429966017412526, 4.6444918854200854873, 4.5597370617073515513, 61 | 4.4802117465284221949, 4.4052876934735729805, 4.3344436803172730116, 4.2672424802773661873, 62 | 4.2033137137351843802, 4.1423408656640511251, 4.0840513104082974638, 4.0282085446479365106, 63 | 3.9746060666737884793, 3.9230625001354895926, 3.8734176703995089983, 3.8255294185223367372, 64 | 3.7792709924116678992, 3.7345288940397975350, 3.6912010902374189454, 3.6491955157608538478, 65 | 3.6084288131289096339, 3.5688252656483374051, 3.5303158891293438633, 3.4928376547740601814, 66 | 3.4563328211327607625, 3.4207483572511205323, 3.3860354424603017887, 3.3521490309001100106, 67 | 3.3190474709707487166, 3.2866921715990692095, 3.2550473085704501813, 3.2240795652862645207, 68 | 3.1937579032122407483, 3.1640533580259734580, 3.1349388580844407393, 3.1063890623398246660, 69 | 3.0783802152540905188, 3.0508900166154554479, 3.0238975044556767713, 2.9973829495161306949, 70 | 2.9713277599210896472, 2.9457143948950456386, 2.9205262865127406647, 2.8957477686001416838, 71 | 2.8713640120155362592, 2.8473609656351888266, 2.8237253024500354905, 2.8004443702507381944, 72 | 2.7775061464397572041, 2.7548991965623453650, 
2.7326126361947007411, 2.7106360958679293686, 73 | 2.6889596887418041593, 2.6675739807732670816, 2.6464699631518093905, 2.6256390267977886123, 74 | 2.6050729387408355373, 2.5847638202141406911, 2.5647041263169053687, 2.5448866271118700928, 75 | 2.5253043900378279427, 2.5059507635285939648, 2.4868193617402096807, 2.4679040502973649846, 76 | 2.4491989329782498908, 2.4306983392644199088, 2.4123968126888708336, 2.3942890999214583288, 77 | 2.3763701405361408194, 2.3586350574093374601, 2.3410791477030346875, 2.3236978743901964559, 78 | 2.3064868582835798692, 2.2894418705322694265, 2.2725588255531546952, 2.2558337743672190441, 79 | 2.2392628983129087111, 2.2228425031110364013, 2.2065690132576635755, 2.1904389667232199235, 80 | 2.1744490099377744673, 2.1585958930438856781, 2.1428764653998416425, 2.1272876713173679737, 81 | 2.1118265460190418108, 2.0964902118017147637, 2.0812758743932248696, 2.0661808194905755036, 82 | 2.0512024094685848641, 2.0363380802487695916, 2.0215853383189260770, 2.0069417578945183144, 83 | 1.9924049782135764992, 1.9779727009573602295, 1.9636426877895480401, 1.9494127580071845659, 84 | 1.9352807862970511135, 1.9212447005915276767, 1.9073024800183871196, 1.8934521529393077332, 85 | 1.8796917950722108462, 1.8660195276928275962, 1.8524335159111751661, 1.8389319670188793980, 86 | 1.8255131289035192212, 1.8121752885263901413, 1.7989167704602903934, 1.7857359354841254047, 87 | 1.7726311792313049959, 1.7596009308890742369, 1.7466436519460739352, 1.7337578349855711926, 88 | 1.7209420025219350428, 1.7081947058780575683, 1.6955145241015377061, 1.6829000629175537544, 89 | 1.6703499537164519163, 1.6578628525741725325, 1.6454374393037234057, 1.6330724165359912048, 90 | 1.6207665088282577216, 1.6085184617988580769, 1.5963270412864831349, 1.5841910325326886695, 91 | 1.5721092393862294810, 1.5600804835278879161, 1.5481036037145133070, 1.5361774550410318943, 92 | 1.5243009082192260050, 1.5124728488721167573, 1.5006921768428164936, 1.4889578055167456003, 93 | 
1.4772686611561334579, 1.4656236822457450411, 1.4540218188487932264, 1.4424620319720121876, 94 | 1.4309432929388794104, 1.4194645827699828254, 1.4080248915695353509, 1.3966232179170417110, 95 | 1.3852585682631217189, 1.3739299563284902176, 1.3626364025050864742, 1.3513769332583349176, 96 | 1.3401505805295045843, 1.3289563811371163220, 1.3177933761763245480, 1.3066606104151739482, 97 | 1.2955571316866007210, 1.2844819902750125450, 1.2734342382962410994, 1.2624129290696153434, 98 | 1.2514171164808525098, 1.2404458543344064544, 1.2294981956938491599, 1.2185731922087903071, 99 | 1.2076698934267612830, 1.1967873460884031665, 1.1859245934042023557, 1.1750806743109117687, 100 | 1.1642546227056790397, 1.1534454666557748056, 1.1426522275816728928, 1.1318739194110786733, 101 | 1.1211095477013306083, 1.1103581087274114281, 1.0996185885325976575, 1.0888899619385472598, 102 | 1.0781711915113727024, 1.0674612264799681530, 1.0567590016025518414, 1.0460634359770445503, 103 | 1.0353734317905289496, 1.0246878730026178052, 1.0140056239570971074, 1.0033255279156973717, 104 | 0.99264640550727647009, 0.98196705308506317914, 0.97128624098390397896, 0.96060271166866709917, 105 | 0.94991517776407659940, 0.93922231995526297952, 0.92852278474721113999, 0.91781518207004493915, 106 | 0.90709808271569100600, 0.89637001558989069006, 0.88562946476175228052, 0.87487486629102585352, 107 | 0.86410460481100519511, 0.85331700984237406386, 0.84251035181036928333, 0.83168283773427388393, 108 | 0.82083260655441252290, 0.80995772405741906620, 0.79905617735548788109, 0.78812586886949324977, 109 | 0.77716460975913043936, 0.76617011273543541328, 0.75513998418198289808, 0.74407171550050873971, 110 | 0.73296267358436604916, 0.72181009030875689912, 0.71061105090965570413, 0.69936248110323266174, 111 | 0.68806113277374858613, 0.67670356802952337911, 0.66528614139267855405, 0.65380497984766565353, 112 | 0.64225596042453703448, 0.63063468493349100113, 0.61893645139487678178, 0.60715622162030085137, 113 | 
0.59528858429150359384, 0.58332771274877027785, 0.57126731653258903915, 0.55910058551154127652, 114 | 0.54682012516331112550, 0.53441788123716615385, 0.52188505159213564105, 0.50921198244365495319, 115 | 0.49638804551867159754, 0.48340149165346224782, 0.47023927508216945338, 0.45688684093142071279, 116 | 0.44332786607355296305, 0.42954394022541129589, 0.41551416960035700100, 0.40121467889627836229, 117 | 0.38661797794112021568, 0.37169214532991786118, 0.35639976025839443721, 0.34069648106484979674, 118 | 0.32452911701691008547, 0.30783295467493287307, 0.29052795549123115167, 0.27251318547846547924, 119 | 0.25365836338591284433, 0.23379048305967553619, 0.21267151063096745264, 0.18995868962243277774, 120 | 0.16512762256418831796, 0.13730498094001380420, 0.10483850756582017915, 0.063852163815003480173, 121 | 0 122 | }; 123 | 124 | const double exponential_table::table_y[257] = { 125 | 0, 0.00045413435384149675545, 0.00096726928232717452884, 0.0015362997803015723824, 126 | 0.0021459677437189061793, 0.0027887987935740759640, 0.0034602647778369039855, 0.0041572951208337952532, 127 | 0.0048776559835423925804, 0.0056196422072054831710, 0.0063819059373191794422, 0.0071633531836349841425, 128 | 0.0079630774380170392396, 0.0087803149858089752347, 0.0096144136425022094101, 0.010464810181029979488, 129 | 0.011331013597834597488, 0.012212592426255380661, 0.013109164931254991070, 0.014020391403181937334, 130 | 0.014945968011691148079, 0.015885621839973162490, 0.016839106826039946359, 0.017806200410911360563, 131 | 0.018786700744696029497, 0.019780424338009741737, 0.020787204072578117603, 0.021806887504283582125, 132 | 0.022839335406385238829, 0.023884420511558170348, 0.024942026419731782971, 0.026012046645134218076, 133 | 0.027094383780955798424, 0.028188948763978634421, 0.029295660224637394015, 0.030414443910466605492, 134 | 0.031545232172893605499, 0.032687963508959533317, 0.033842582150874329031, 0.035009037697397411067, 135 | 0.036187284781931419754, 0.037377282772959360128, 
0.038578995503074859626, 0.039792391023374122670, 136 | 0.041017441380414820816, 0.042254122413316231413, 0.043502413568888183301, 0.044762297732943280694, 137 | 0.046033761076175166762, 0.047316792913181548703, 0.048611385573379494401, 0.049917534282706374944, 138 | 0.051235237055126279830, 0.052564494593071689595, 0.053905310196046085104, 0.055257689676697038322, 139 | 0.056621641283742874438, 0.057997175631200659098, 0.059384305633420264487, 0.060783046445479636051, 140 | 0.062193415408540996150, 0.063615431999807331076, 0.065049117786753755036, 0.066494496385339779043, 141 | 0.067951593421936607770, 0.069420436498728751675, 0.070901055162371828426, 0.072393480875708743023, 142 | 0.073897746992364746308, 0.075413888734058408453, 0.076941943170480510100, 0.078481949201606426042, 143 | 0.080033947542319910023, 0.081597980709237420930, 0.083174093009632380354, 0.084762330532368125386, 144 | 0.086362741140756912277, 0.087975374467270219300, 0.089600281910032864534, 0.091237516631040162057, 145 | 0.092887133556043546523, 0.094549189376055853718, 0.096223742550432800103, 0.097910853311492199618, 146 | 0.099610583670637128826, 0.10132299742595363588, 0.10304816017125771553, 0.10478613930657016928, 147 | 0.10653700405000166218, 0.10830082545103379867, 0.11007767640518539026, 0.11186763167005629731, 148 | 0.11367076788274431301, 0.11548716357863353664, 0.11731689921155557057, 0.11916005717532768467, 149 | 0.12101672182667483729, 0.12288697950954513498, 0.12477091858083096578, 0.12666862943751066518, 150 | 0.12858020454522817870, 0.13050573846833078225, 0.13244532790138752023, 0.13439907170221363078, 151 | 0.13636707092642885841, 0.13834942886358021406, 0.14034625107486244210, 0.14235764543247220043, 152 | 0.14438372216063476473, 0.14642459387834493787, 0.14848037564386679222, 0.15055118500103990354, 153 | 0.15263714202744286154, 0.15473836938446807312, 0.15685499236936522013, 0.15898713896931420572, 154 | 0.16113493991759203183, 0.16329852875190180795, 
0.16547804187493600915, 0.16767361861725019322, 155 | 0.16988540130252766513, 0.17211353531532005700, 0.17435816917135348788, 0.17661945459049489581, 156 | 0.17889754657247831241, 0.18119260347549629488, 0.18350478709776746150, 0.18583426276219711495, 157 | 0.18818119940425430485, 0.19054576966319540013, 0.19292814997677133873, 0.19532852067956322315, 158 | 0.19774706610509886464, 0.20018397469191127727, 0.20263943909370901930, 0.20511365629383770880, 159 | 0.20760682772422204205, 0.21011915938898825914, 0.21265086199297827522, 0.21520215107537867786, 160 | 0.21777324714870053264, 0.22036437584335949720, 0.22297576805812018050, 0.22560766011668406495, 161 | 0.22826029393071670664, 0.23093391716962742173, 0.23362878343743333945, 0.23634515245705964715, 162 | 0.23908329026244917002, 0.24184346939887722761, 0.24462596913189210901, 0.24743107566532763894, 163 | 0.25025908236886230967, 0.25311029001562948171, 0.25598500703041538015, 0.25888354974901621678, 164 | 0.26180624268936295243, 0.26475341883506220209, 0.26772541993204481808, 0.27072259679906003167, 165 | 0.27374530965280298302, 0.27679392844851734458, 0.27986883323697289920, 0.28297041453878076010, 166 | 0.28609907373707684673, 0.28925522348967773308, 0.29243928816189258772, 0.29565170428126120948, 167 | 0.29889292101558177099, 0.30216340067569352897, 0.30546361924459023541, 0.30879406693456016794, 168 | 0.31215524877417956945, 0.31554768522712893632, 0.31897191284495723773, 0.32242848495608914289, 169 | 0.32591797239355619822, 0.32944096426413633091, 0.33299806876180896713, 0.33658991402867758144, 170 | 0.34021714906678004560, 0.34388044470450243010, 0.34758049462163698567, 0.35131801643748334681, 171 | 0.35509375286678745925, 0.35890847294874976196, 0.36276297335481777335, 0.36665807978151414890, 172 | 0.37059464843514599421, 0.37457356761590215193, 0.37859575940958081092, 0.38266218149600982112, 173 | 0.38677382908413768115, 0.39093173698479710717, 0.39513698183329015336, 0.39939068447523107877, 174 | 
0.40369401253053026739, 0.40804818315203238238, 0.41245446599716116772, 0.41691418643300289465, 175 | 0.42142872899761659635, 0.42599954114303435739, 0.43062813728845883923, 0.43531610321563659758, 176 | 0.44006510084235387501, 0.44487687341454851593, 0.44975325116275498919, 0.45469615747461548049, 177 | 0.45970761564213768669, 0.46478975625042618067, 0.46994482528395999841, 0.47517519303737738299, 178 | 0.48048336393045423016, 0.48587198734188493564, 0.49134386959403255500, 0.49690198724154955294, 179 | 0.50254950184134769289, 0.50828977641064283495, 0.51412639381474855788, 0.52006317736823356823, 180 | 0.52610421398361972602, 0.53225388026304326945, 0.53851687200286186590, 0.54489823767243963663, 181 | 0.55140341654064131685, 0.55803828226258748140, 0.56480919291240022434, 0.57172304866482579008, 182 | 0.57878735860284503057, 0.58601031847726802755, 0.59340090169173341521, 0.60096896636523224742, 183 | 0.60872538207962206507, 0.61668218091520762326, 0.62485273870366592605, 0.63325199421436607968, 184 | 0.64189671642726607018, 0.65080583341457104881, 0.66000084107899974178, 0.66950631673192477684, 185 | 0.67935057226476538741, 0.68956649611707798890, 0.70019265508278816709, 0.71127476080507597882, 186 | 0.72286765959357200702, 0.73503809243142351530, 0.74786862198519510742, 0.76146338884989624862, 187 | 0.77595685204011559675, 0.79152763697249565519, 0.80842165152300838005, 0.82699329664305033399, 188 | 0.84778550062398962096, 0.87170433238120363669, 0.90046992992574643800, 0.93814368086217467916, 189 | 1 190 | }; 191 | 192 | static double f(double x) { 193 | using std::exp; 194 | return exp(-x); 195 | } 196 | 197 | template double getExponential(T& bitstream) { 198 | const double * const table_x = exponential_table::table_x; 199 | const double * const table_y = exponential_table::table_y; 200 | double shift(0); 201 | for(;;) { 202 | double valsFirst = getUniformDouble(bitstream); 203 | int valsSecond = getUniformPow2(8, bitstream); 204 | int i = valsSecond; 205 | 
double x = valsFirst * double(table_x[i]); 206 | if(x < double(table_x[i + 1])) return shift + x; 207 | // For i=0 we need to generate from the tail, but because this is an exponential 208 | // distribution, the tail looks exactly like the body, so we can simply repeat with a 209 | // shift: 210 | if (i == 0) shift += double(table_x[1]); 211 | else { 212 | double y01 = getUniformDouble(bitstream); 213 | double y = double(table_y[i]) + y01 * double(table_y[i+1] - table_y[i]); 214 | 215 | // All we care about is whether these are < or > 0; these values are equal to 216 | // (lbound) or proportional to (ubound) `y` minus the lower/upper bound. 217 | double y_above_ubound = double(table_x[i] - table_x[i+1]) * y01 - (double(table_x[i]) - x), 218 | y_above_lbound = y - (double(table_y[i+1]) + (double(table_x[i+1]) - x) * double(table_y[i+1])); 219 | 220 | if (y_above_ubound < 0 // if above the upper bound reject immediately 221 | && 222 | ( 223 | y_above_lbound < 0 // If below the lower bound accept immediately 224 | || 225 | y < f(x) // Otherwise it's between the bounds and we need a full check 226 | ) 227 | ) { 228 | return x + shift; 229 | } 230 | } 231 | } 232 | } 233 | 234 | } // namespace ziggurat 235 | 236 | #endif // _EXPONENTIAL_DISTRIBUTION_HPP_ 237 | -------------------------------------------------------------------------------- /bagminhash/weighted_minwise_hashing.hpp: -------------------------------------------------------------------------------- 1 | //################################## 2 | //# Copyright (C) 2018 Otmar Ertl. # 3 | //# All rights reserved. 
# 4 | //################################## 5 | 6 | #ifndef _WEIGHTED_MINWISE_HASHING_HPP_ 7 | #define _WEIGHTED_MINWISE_HASHING_HPP_ 8 | 9 | #include "bitstream_random.hpp" 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | template 16 | class MaxValueTracker { 17 | const uint32_t m; 18 | std::vector values; 19 | 20 | public: 21 | MaxValueTracker(uint32_t _m, const T& infinity) : m(_m), values((_m << 1) - 1, infinity) {} 22 | 23 | void update(uint32_t idx, T value) { 24 | assert(idx < m); 25 | while(value < values[idx]) { 26 | values[idx] = value; 27 | idx = m + (idx >> 1); 28 | if (idx >= values.size()) break; 29 | uint32_t leftChildIdx = (idx - m) << 1; 30 | uint32_t rightChildIdx = leftChildIdx + 1; 31 | value = std::max(values[leftChildIdx], values[rightChildIdx]); 32 | } 33 | } 34 | 35 | const T& max() const { 36 | return values.back(); 37 | } 38 | 39 | const T& operator[](uint32_t idx) const { 40 | return values[idx]; 41 | } 42 | }; 43 | 44 | class BinaryWeightDiscretization { 45 | public: 46 | typedef uint8_t index_type; 47 | typedef uint8_t weight_type; 48 | 49 | static const index_type maxBoundIdx = 1; 50 | 51 | static weight_type getBound(index_type boundIdx) { 52 | return boundIdx; 53 | } 54 | }; 55 | 56 | template 57 | I calculateMaxBoundIdx() { 58 | static_assert(sizeof(I) == sizeof(W), "index type and weight type do not have same size"); 59 | I i = 0; 60 | const W w = std::numeric_limits::max(); 61 | memcpy(&i, &w, sizeof(I)); 62 | return i; 63 | } 64 | 65 | template 66 | class WeightDiscretization { 67 | public: 68 | typedef I index_type; 69 | typedef W weight_type; 70 | 71 | static const index_type maxBoundIdx; 72 | 73 | static weight_type getBound(index_type boundIdx) { 74 | W f; 75 | static_assert(std::numeric_limits::is_iec559, "weight_type is not iec559"); 76 | static_assert(sizeof(weight_type) == sizeof(index_type), "weight_type and index_type do not have same size"); 77 | memcpy(&f, &boundIdx, sizeof(index_type)); 78 | return f; 79 | } 
80 | }; 81 | 82 | template 83 | const typename WeightDiscretization::index_type WeightDiscretization::maxBoundIdx = calculateMaxBoundIdx::index_type, WeightDiscretization::weight_type>(); 84 | 85 | typedef WeightDiscretization FloatWeightDiscretization; 86 | 87 | typedef WeightDiscretization DoubleWeightDiscretization; 88 | 89 | struct WeightedHashResult { 90 | std::vector hashValues; 91 | uint64_t maxSpace; 92 | 93 | WeightedHashResult(uint32_t m) : hashValues(m), maxSpace(UINT64_C(0)) {} 94 | }; 95 | 96 | template class ValueProvider; 97 | 98 | template<> 99 | class ValueProvider<> { 100 | public: 101 | static size_t size() { 102 | return 0; 103 | } 104 | 105 | void init(char* data) const {} 106 | }; 107 | 108 | template class ValueProvider : ValueProvider { 109 | const W w; 110 | public: 111 | 112 | ValueProvider(const W& _w, const V& ... _v) : ValueProvider(_v...), w(_w) {} 113 | 114 | static size_t size() { 115 | return ValueProvider::size() + sizeof(W); 116 | } 117 | 118 | void init(char* data) const { 119 | ValueProvider::init(data); 120 | memcpy(&data[ValueProvider::size()], &w, sizeof(W)); 121 | } 122 | }; 123 | 124 | template 125 | ValueProvider collectHashData(const V&... 
v) { 126 | return ValueProvider(v...); 127 | } 128 | 129 | template 130 | class PoissonProcess { 131 | 132 | double point; 133 | double weight; 134 | typename D::index_type weightIdxMin; 135 | typename D::index_type weightIdxMax; 136 | typename D::weight_type boundMin; 137 | typename D::weight_type boundMax; 138 | uint32_t signatureIdx; 139 | BitStream randomBitStream; 140 | 141 | public: 142 | 143 | PoissonProcess( 144 | double _point, 145 | double _weight, 146 | typename D::index_type _weightIdxMin, 147 | typename D::index_type _weightIdxMax, 148 | typename D::weight_type _boundMin, 149 | typename D::weight_type _boundMax, 150 | BitStream&& _randomBitStream 151 | ) 152 | : point(_point), 153 | weight(_weight), 154 | weightIdxMin(_weightIdxMin), 155 | weightIdxMax(_weightIdxMax), 156 | boundMin(_boundMin), 157 | boundMax(_boundMax), 158 | signatureIdx(std::numeric_limits::max()), 159 | randomBitStream(std::move(_randomBitStream)) {} 160 | 161 | 162 | PoissonProcess(BitStream&& _randomBitStream, double _weight) : 163 | PoissonProcess(0., _weight, 0, D::maxBoundIdx, 0, D::getBound(D::maxBoundIdx), std::move(_randomBitStream)) {} 164 | 165 | bool splittable() const { 166 | return weightIdxMax > weightIdxMin + 1; 167 | } 168 | 169 | bool partiallyRelevant() const { 170 | return D::getBound(weightIdxMin + 1) <= weight; 171 | } 172 | 173 | bool fullyRelevant() const { 174 | return boundMax <= weight; 175 | } 176 | 177 | uint32_t getIndex() const { 178 | return signatureIdx; 179 | } 180 | 181 | double getPoint() const { 182 | return point; 183 | } 184 | 185 | void next(uint32_t m) { 186 | point += getExponential1(randomBitStream) / (static_cast(boundMax) - static_cast(boundMin)); 187 | signatureIdx = getUniform(m, randomBitStream); 188 | } 189 | 190 | std::unique_ptr split() { 191 | 192 | typename D::index_type weightIdxMid = (weightIdxMin + weightIdxMax) >> 1; 193 | 194 | double boundMid = D::getBound(weightIdxMid); 195 | 196 | bool inheritToLeft = 
getBernoulli((boundMid - static_cast(boundMin)) / (static_cast(boundMax) - static_cast(boundMin)), randomBitStream); 197 | 198 | std::unique_ptr pPrime; 199 | 200 | BitStream bitStream(collectHashData(weightIdxMid, point), UINT64_C(0x4b06d55ba29b0826)); // constant from random.org 201 | 202 | if (inheritToLeft) { 203 | pPrime = std::make_unique(point, weight, weightIdxMid, weightIdxMax, boundMid, boundMax, std::move(bitStream)); 204 | weightIdxMax = weightIdxMid; 205 | boundMax = boundMid; 206 | } 207 | else { 208 | pPrime = std::make_unique(point, weight, weightIdxMin, weightIdxMid, boundMin, boundMid, std::move(bitStream)); 209 | weightIdxMin = weightIdxMid; 210 | boundMin = boundMid; 211 | } 212 | return pPrime; 213 | } 214 | }; 215 | 216 | template 217 | struct CmpPoissonProcessPtrs 218 | { 219 | bool operator()(const std::unique_ptr>& lhs, const std::unique_ptr>& rhs) const 220 | { 221 | return rhs->getPoint() < lhs->getPoint(); 222 | } 223 | }; 224 | 225 | template 226 | struct CmpPoissonProcessPtrsInverse 227 | { 228 | bool operator()(const std::unique_ptr>& lhs, const std::unique_ptr>& rhs) const 229 | { 230 | return rhs->getPoint() > lhs->getPoint(); 231 | } 232 | }; 233 | 234 | template 235 | void pushHeap(std::unique_ptr>& p, std::vector>>& heap, uint64_t& maxHeapSize, size_t offset = 0) { 236 | heap.emplace_back(std::move(p)); 237 | std::push_heap(heap.begin() + offset, heap.end(), CmpPoissonProcessPtrs()); 238 | if (heap.size() > maxHeapSize) maxHeapSize = heap.size(); 239 | } 240 | 241 | template 242 | std::unique_ptr> popHeap(std::vector>>& heap, size_t offset = 0) { 243 | std::pop_heap(heap.begin() + offset, heap.end(), CmpPoissonProcessPtrs()); 244 | std::unique_ptr> p = std::move(heap.back()); 245 | heap.pop_back(); 246 | return p; 247 | } 248 | 249 | static const uint64_t bagMinHashSeedA = UINT64_C(0xf331e07615a87fd7); // constant from random.org 250 | static const uint64_t bagMinHashSeedB = UINT64_C(0xe224afad0d89c684); // constant from 
random.org 251 | 252 | template 253 | WeightedHashResult bag_min_hash_1(const std::vector>& data, const uint32_t m) { 254 | assert(D::getBound(0) == 0); 255 | 256 | const uint8_t b = 64; // constant for b-bit minwise hashing 257 | 258 | std::vector>> heap; 259 | 260 | MaxValueTracker h(m, std::numeric_limits::infinity()); 261 | WeightedHashResult result(m); 262 | 263 | for(const auto& item : data) { 264 | 265 | const double w = std::get<1>(item); 266 | if (w < D::getBound(1)) continue; 267 | 268 | const uint64_t d = std::get<0>(item); 269 | 270 | BitStream bitStream(collectHashData(d), bagMinHashSeedA); 271 | std::unique_ptr> p = std::make_unique>(std::move(bitStream), w); 272 | 273 | p->next(m); 274 | if (p->fullyRelevant()) h.update(p->getIndex(), p->getPoint()); 275 | 276 | while(p->getPoint() <= h.max()) { 277 | while(p->splittable() && p->partiallyRelevant()) { 278 | 279 | std::unique_ptr> pPrime = p->split(); 280 | 281 | if (p->fullyRelevant()) h.update(p->getIndex(), p->getPoint()); 282 | 283 | if (pPrime->partiallyRelevant()) { 284 | pPrime->next(m); 285 | if (pPrime->fullyRelevant()) h.update(pPrime->getIndex(), pPrime->getPoint()); 286 | if (pPrime->getPoint() <= h.max()) pushHeap(pPrime, heap, result.maxSpace); 287 | } 288 | } 289 | 290 | if (p->fullyRelevant()) { 291 | p->next(m); 292 | h.update(p->getIndex(), p->getPoint()); 293 | if (p->getPoint() <= h.max()) pushHeap(p, heap, result.maxSpace); 294 | } 295 | if (heap.empty()) break; 296 | p = popHeap(heap); 297 | } 298 | 299 | heap.clear(); 300 | } 301 | 302 | for (uint32_t k = 0; k < m; ++k) { 303 | BitStream bitstream(collectHashData(h[k]), bagMinHashSeedB); 304 | result.hashValues[k] = getUniformPow2(b, bitstream); 305 | } 306 | 307 | return result; 308 | } 309 | 310 | template 311 | WeightedHashResult bag_min_hash_2(const std::vector>& data, const uint32_t m) { 312 | assert(D::getBound(0) == 0); 313 | 314 | const uint8_t b = 64; // constant for b-bit minwise hashing 315 | 316 | std::vector>> temp; 
317 | WeightedHashResult result(m); 318 | 319 | MaxValueTracker h(m, std::numeric_limits::infinity()); 320 | 321 | for(const auto& item : data) { 322 | 323 | const double w = std::get<1>(item); 324 | if (w < D::getBound(1)) continue; 325 | 326 | const uint64_t d = std::get<0>(item); 327 | 328 | const size_t tempHeapOffset = temp.size(); 329 | 330 | BitStream bitStream(collectHashData(d), bagMinHashSeedA); 331 | std::unique_ptr> p = std::make_unique>(std::move(bitStream), w); 332 | 333 | p->next(m); 334 | if (p->fullyRelevant()) h.update(p->getIndex(), p->getPoint()); 335 | 336 | while(p->getPoint() <= h.max()) { 337 | while(p->splittable() && p->partiallyRelevant() && !p->fullyRelevant()) { 338 | 339 | std::unique_ptr> pPrime = p->split(); 340 | 341 | if (p->fullyRelevant()) h.update(p->getIndex(), p->getPoint()); 342 | 343 | if (pPrime->partiallyRelevant()) { 344 | pPrime->next(m); 345 | if (pPrime->fullyRelevant()) h.update(pPrime->getIndex(), pPrime->getPoint()); 346 | if (pPrime->getPoint() <= h.max()) pushHeap(pPrime, temp, result.maxSpace, tempHeapOffset); 347 | } 348 | } 349 | 350 | if (p->fullyRelevant()) { 351 | assert(p->getPoint() <= h.max()); 352 | pushHeap(p, temp, result.maxSpace, tempHeapOffset); 353 | break; 354 | } 355 | if (temp.size() == tempHeapOffset) break; 356 | p = popHeap(temp, tempHeapOffset); 357 | } 358 | 359 | auto bufferEndIt = temp.begin() + tempHeapOffset; 360 | while(bufferEndIt != temp.begin() && temp.front()->getPoint() > h.max()) { 361 | std::pop_heap(temp.begin(), bufferEndIt, CmpPoissonProcessPtrsInverse()); 362 | --bufferEndIt; 363 | } 364 | 365 | for(auto heapIt = temp.begin() + tempHeapOffset; heapIt != temp.end(); ++heapIt) { 366 | if ((*heapIt)->getPoint() <= h.max()) { 367 | *bufferEndIt = std::move(*heapIt); 368 | ++bufferEndIt; 369 | std::push_heap(temp.begin(), bufferEndIt, CmpPoissonProcessPtrsInverse()); 370 | } 371 | } 372 | temp.erase(bufferEndIt, temp.end()); 373 | } 374 | 375 | std::make_heap(temp.begin(), 
temp.end(), CmpPoissonProcessPtrs()); 376 | 377 | while(!temp.empty()) { 378 | 379 | std::unique_ptr> p = popHeap(temp); 380 | if (p->getPoint() > h.max()) break; 381 | 382 | while(p->splittable() && p->partiallyRelevant()) { 383 | 384 | std::unique_ptr> pPrime = p->split(); 385 | 386 | if (p->fullyRelevant()) h.update(p->getIndex(), p->getPoint()); 387 | 388 | if (pPrime->partiallyRelevant()) { 389 | pPrime->next(m); 390 | if (pPrime->fullyRelevant()) h.update(pPrime->getIndex(), pPrime->getPoint()); 391 | if (pPrime->getPoint() <= h.max()) pushHeap(pPrime, temp, result.maxSpace); 392 | } 393 | } 394 | 395 | if (p->fullyRelevant()) { 396 | p->next(m); 397 | h.update(p->getIndex(), p->getPoint()); 398 | if (p->getPoint() <= h.max()) pushHeap(p, temp, result.maxSpace); 399 | } 400 | } 401 | 402 | for (uint32_t k = 0; k < m; ++k) { 403 | BitStream bitstream(collectHashData(h[k]), bagMinHashSeedB); 404 | result.hashValues[k] = getUniformPow2(b, bitstream); 405 | } 406 | 407 | return result; 408 | } 409 | 410 | // see Ioffe, Sergey. "Improved consistent sampling, weighted minhash and l1 sketching." Data Mining (ICDM), 2010 IEEE 10th International Conference on. IEEE, 2010. 
411 | template 412 | WeightedHashResult improved_consistent_weighted_hashing(const std::vector>& data, const uint32_t m) { 413 | const uint8_t b = 64; // constant for b-bit minwise hashing 414 | 415 | std::vector aVec(m, std::numeric_limits::infinity()); 416 | std::vector dVec(m); 417 | std::vector yVec(m); 418 | 419 | WeightedHashResult result(m); 420 | 421 | for(const auto& item : data) { 422 | 423 | const uint64_t d = std::get<0>(item); 424 | const double s = std::get<1>(item); 425 | 426 | if (s == 0) continue; 427 | 428 | const double logS = std::log(s); 429 | 430 | BitStream bitstream(collectHashData(d), UINT64_C(0x87609608d2a48b5d)); // constant from random.org 431 | 432 | for (uint32_t k = 0; k < m; ++k) { 433 | 434 | double r = getGamma21(bitstream); 435 | double c = getGamma21(bitstream); 436 | double beta = getUniformDouble(bitstream); 437 | 438 | double t = std::floor(logS / r + beta); 439 | double y = std::exp(r * (t - beta)); 440 | double a = c / (y * std::exp(r)); 441 | 442 | if (a < aVec[k]) { 443 | aVec[k] = a; 444 | dVec[k] = d; 445 | yVec[k] = y; 446 | } 447 | } 448 | } 449 | 450 | for (uint32_t k = 0; k < m; ++k) { 451 | BitStream bitstream(collectHashData(dVec[k], yVec[k]), UINT64_C(0xbf235dea3db9c393)); // constant from random.org 452 | result.hashValues[k] = getUniformPow2(b, bitstream); 453 | } 454 | 455 | return result; 456 | } 457 | 458 | // see Wu, Wei, et al. "Canonical Consistent Weighted Sampling for Real-Value Weighted Min-Hash." Data Mining (ICDM), 2016 IEEE 16th International Conference on. IEEE, 2016. 
459 | template 460 | WeightedHashResult canonical_consistent_weighted_hashing(const std::vector>& data, const uint32_t m) { 461 | const uint8_t b = 64; // constant for b-bit minwise hashing 462 | 463 | std::vector aVec(m, std::numeric_limits::infinity()); 464 | std::vector dVec(m); 465 | std::vector yVec(m); 466 | 467 | WeightedHashResult result(m); 468 | 469 | for(const auto& item : data) { 470 | 471 | const uint64_t d = std::get<0>(item); 472 | const double s = std::get<1>(item); 473 | 474 | if (s == 0) continue; 475 | 476 | BitStream bitstream(collectHashData(d), UINT64_C(0xc9116756125c6267)); // constant from random.org 477 | 478 | for (uint32_t k = 0; k < m; ++k) { 479 | 480 | double beta = getUniformDouble(bitstream); 481 | double r = getBeta21(bitstream); 482 | double c = getGamma21(bitstream); 483 | 484 | double t = std::floor(s / r + beta); 485 | double y = r * (t - beta); 486 | double a = c / y - 2 * r * c; 487 | 488 | if (a < aVec[k]) { 489 | aVec[k] = a; 490 | dVec[k] = d; 491 | yVec[k] = y; 492 | } 493 | } 494 | } 495 | 496 | for (uint32_t k = 0; k < m; ++k) { 497 | BitStream bitstream(collectHashData(dVec[k], yVec[k]), UINT64_C(0xa5c48ff7b4004c41)); // constant from random.org 498 | result.hashValues[k] = getUniformPow2(b, bitstream); 499 | } 500 | 501 | return result; 502 | } 503 | 504 | // see Wu, Wei, et al. "Consistent Weighted Sampling Made More Practical." Proceedings of the 26th International Conference on World Wide Web. International World Wide Web Conferences Steering Committee, 2017. 
505 | template 506 | WeightedHashResult practical_consistent_weighted_hashing(const std::vector>& data, const uint32_t m) { 507 | const uint8_t b = 64; // constant for b-bit minwise hashing 508 | 509 | std::vector aVec(m, std::numeric_limits::infinity()); 510 | std::vector dVec(m); 511 | std::vector yVec(m); 512 | 513 | WeightedHashResult result(m); 514 | 515 | for(const auto& item : data) { 516 | 517 | const uint64_t d = std::get<0>(item); 518 | const double s = std::get<1>(item); 519 | 520 | if (s == 0) continue; 521 | 522 | const double logS = std::log(s); 523 | 524 | BitStream bitstream(collectHashData(d), UINT64_C(0xbe46368ee398beee)); // constant from random.org 525 | 526 | for (uint32_t k = 0; k < m; ++k) { 527 | 528 | double u1 = getUniformDouble(bitstream); 529 | double u2 = getUniformDouble(bitstream); 530 | double beta = getUniformDouble(bitstream); 531 | double x = getUniformDouble(bitstream); 532 | 533 | double gamma = -std::log(u1 * u2); 534 | double t = std::floor(logS / gamma + beta); 535 | double y = std::exp(gamma * (t - beta)); 536 | double a = -std::log(x) / (y / u1); 537 | 538 | if (a < aVec[k]) { 539 | aVec[k] = a; 540 | dVec[k] = d; 541 | yVec[k] = y; 542 | } 543 | } 544 | } 545 | 546 | for (uint32_t k = 0; k < m; ++k) { 547 | BitStream bitstream(collectHashData(dVec[k], yVec[k]), UINT64_C(0x50da48973b000da9)); // constant from random.org 548 | result.hashValues[k] = getUniformPow2(b, bitstream); 549 | } 550 | 551 | return result; 552 | } 553 | 554 | // see Wu, Wei, et al. "Improved Consistent Weighted Sampling Revisited." arXiv preprint arXiv:1706.01172 (2017). 
555 | template 556 | WeightedHashResult improved_squared_consistent_weighted_hashing(const std::vector>& data, const uint32_t m) { 557 | const uint8_t b = 64; // constant for b-bit minwise hashing 558 | 559 | std::vector aVec(m, std::numeric_limits::infinity()); 560 | std::vector dVec(m); 561 | std::vector yVec(m); 562 | 563 | WeightedHashResult result(m); 564 | 565 | for(const auto& item : data) { 566 | 567 | const uint64_t d = std::get<0>(item); 568 | const double s = std::get<1>(item); 569 | 570 | if (s == 0) continue; 571 | 572 | const double logS = std::log(s); 573 | 574 | BitStream bitstream(collectHashData(d), UINT64_C(0xb30eb19e5e572b46)); // constant from random.org 575 | 576 | for (uint32_t k = 0; k < m; ++k) { 577 | 578 | double r1 = getGamma21(bitstream); 579 | double r2 = getGamma21(bitstream); 580 | double beta1 = getUniformDouble(bitstream); 581 | double beta2 = getUniformDouble(bitstream); 582 | double c = getGamma21(bitstream); 583 | 584 | double t2 = std::floor(logS / r2 + beta2); 585 | double z = std::exp(r2 * (t2 - beta2 + 1)); 586 | double a = c / z; 587 | 588 | if (a < aVec[k]) { 589 | aVec[k] = a; 590 | double t1 = std::floor(logS / r1 + beta1); 591 | double y = std::exp(r1 * (t1 - beta1)); 592 | dVec[k] = d; 593 | yVec[k] = y; 594 | } 595 | } 596 | } 597 | 598 | for (uint32_t k = 0; k < m; ++k) { 599 | BitStream bitstream(collectHashData(dVec[k], yVec[k]), UINT64_C(0xdff981675040e7bc)); // constant from random.org 600 | result.hashValues[k] = getUniformPow2(b, bitstream); 601 | } 602 | 603 | return result; 604 | } 605 | 606 | // see Li, Ping. "0-bit consistent weighted sampling." Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. ACM, 2015. 
607 | template 608 | WeightedHashResult zero_bit_consistent_weighted_sampling(const std::vector>& data, const uint32_t m) { 609 | 610 | std::vector aVec(m, std::numeric_limits::infinity()); 611 | std::vector dVec(m); 612 | 613 | WeightedHashResult result(m); 614 | 615 | for(const auto& item : data) { 616 | 617 | const uint64_t d = std::get<0>(item); 618 | const double s = std::get<1>(item); 619 | 620 | if (s == 0) continue; 621 | 622 | const double logS = std::log(s); 623 | 624 | BitStream bitstream(collectHashData(d), UINT64_C(0x95f3ee483861a892)); // constant from random.org 625 | 626 | for (uint32_t k = 0; k < m; ++k) { 627 | 628 | double r = getGamma21(bitstream); 629 | double c = getGamma21(bitstream); 630 | double beta = getUniformDouble(bitstream); 631 | 632 | double t = std::floor(logS / r + beta); 633 | double y = std::exp(r * (t - beta)); 634 | double a = c / (y * std::exp(r)); 635 | 636 | if (a < aVec[k]) { 637 | aVec[k] = a; 638 | dVec[k] = d; 639 | } 640 | } 641 | } 642 | 643 | result.hashValues = std::move(dVec); 644 | 645 | return result; 646 | } 647 | 648 | #endif // _WEIGHTED_MINWISE_HASHING_HPP_ 649 | -------------------------------------------------------------------------------- /bagminhash_wrappers.hpp: -------------------------------------------------------------------------------- 1 | // Simple wrappers around the bagminhash functions 2 | // Note that we use the floatweightdiscretization as it results in improved performance 3 | #ifndef SKETCH_BAGMINHASH 4 | #define SKETCH_BAGMINHASH 5 | 6 | #include 7 | #include 8 | #include "bagminhash/weighted_minwise_hashing.hpp" 9 | 10 | using namespace std; 11 | 12 | vector> weightedhashresult_to_pairs(WeightedHashResult res) { 13 | vector> output; 14 | for(uint64_t h : res.hashValues) { 15 | output.push_back({h, 0.0}); 16 | } 17 | return output; 18 | } 19 | 20 | vector> pairs_to_tuples(const vector>& x) { 21 | vector> x_tuple; 22 | for(auto& element : x) { 23 | x_tuple.push_back(tuple(element.first, 
element.second)); 24 | } 25 | return x_tuple; 26 | } 27 | 28 | class BagMinHash1 { 29 | private: 30 | uint64_t t; 31 | public: 32 | BagMinHash1(uint64_t t) : t(t) {}; 33 | vector> operator()(const vector>& x) { 34 | auto x_tuple = pairs_to_tuples(x); 35 | WeightedHashResult res = bag_min_hash_1(x_tuple, t); 36 | return weightedhashresult_to_pairs(res); 37 | 38 | } 39 | }; 40 | 41 | class BagMinHash2 { 42 | private: 43 | uint64_t t; 44 | public: 45 | BagMinHash2(uint64_t t) : t(t) {}; 46 | vector> operator()(const vector>& x) { 47 | auto x_tuple = pairs_to_tuples(x); 48 | WeightedHashResult res = bag_min_hash_2(x_tuple, t); 49 | return weightedhashresult_to_pairs(res); 50 | 51 | } 52 | }; 53 | 54 | class ICWS_xxhash { 55 | private: 56 | uint64_t t; 57 | public: 58 | ICWS_xxhash(uint64_t t) : t(t) {}; 59 | vector> operator()(const vector>& x) { 60 | auto x_tuple = pairs_to_tuples(x); 61 | WeightedHashResult res = improved_consistent_weighted_hashing(x_tuple, t); 62 | return weightedhashresult_to_pairs(res); 63 | } 64 | }; 65 | 66 | #endif -------------------------------------------------------------------------------- /darthash.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SKETCH_DARTHASH 2 | #define SKETCH_DARTHASH 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "similarity.hpp" 11 | #include "hashing.hpp" 12 | 13 | using namespace std; 14 | 15 | class DartHash { 16 | 17 | private: 18 | uint64_t t; 19 | TabulationHashFunction32 T_nu, T_rho, T_w, T_r; 20 | TabulationHashFunction T_i, T_p, T_q, F, M; 21 | vector powers_of_two; 22 | vector negative_powers_of_two; 23 | vector poisson_cdf; 24 | 25 | public: 26 | DartHash(mt19937_64& rng, uint64_t t) : t(t), T_nu(rng), T_rho(rng), T_w(rng), T_r(rng), T_i(rng), T_p(rng), T_q(rng), F(rng), M(rng) { 27 | 28 | // Tabulate positive and negative powers of two 29 | double p = 1.0; 30 | double q = 1.0; 31 | // Standard double 
precision ranges from around +-2^1024. We will support slightly less. 32 | for(uint64_t i = 0; i < 1000; i++) { 33 | powers_of_two.push_back(p); 34 | p = 2.0*p; 35 | negative_powers_of_two.push_back(q); 36 | q = 0.5*q; 37 | } 38 | 39 | // Tabulate the Poisson CDF 40 | double pdf = exp(-1.0); 41 | double cdf = pdf; 42 | for(uint64_t i = 0; i < 100; i++) { 43 | poisson_cdf.push_back(cdf); 44 | pdf = pdf/(i + 1); 45 | cdf += pdf; 46 | } 47 | }; 48 | 49 | vector> operator()(const vector>& x, double theta = 1.0) { 50 | vector> darts; 51 | darts.reserve(2*t); 52 | double max_rank = theta/weight(x); 53 | double t_inv = 1.0/t; 54 | uint32_t RHO = (uint32_t)floor(log2(1.0 + max_rank)); 55 | 56 | for(const pair& element : x) { 57 | uint64_t i = element.first; 58 | double xi = element.second; 59 | uint64_t i_hash = T_i(i); 60 | uint32_t NU = (uint32_t)floor(log2(1.0 + t*xi)); 61 | for(uint32_t nu = 0; nu <= NU; nu++) { 62 | uint64_t nu_hash = T_nu(nu); 63 | for(uint32_t rho = 0; rho <= RHO; rho++) { 64 | uint64_t region_hash = nu_hash ^ T_rho(rho); 65 | double two_nu = powers_of_two[nu]; 66 | double two_rho = powers_of_two[rho]; 67 | double W = (two_nu - 1)*t_inv; 68 | double R = two_rho - 1; 69 | double delta_nu = two_nu*t_inv*negative_powers_of_two[rho]; 70 | double delta_rho = two_rho*negative_powers_of_two[nu]; 71 | double w0 = W; 72 | uint32_t w_max = rho < 32 ? 1ul << rho : 1ul << 31; 73 | for(uint32_t w = 0; w < w_max; w++) { 74 | if(xi < w0) break; 75 | uint64_t w_hash = T_w(w); 76 | double r0 = R; 77 | uint32_t r_max = nu < 32 ? 
1ul << nu : 1ul << 31; 78 | for(uint32_t r = 0; r < r_max; r++) { 79 | if(max_rank < r0) break; 80 | // Get area fingerprint to speed up subsequent hashing 81 | uint64_t area_hash = w_hash ^ T_r(r); 82 | uint64_t z = i_hash ^ region_hash ^ area_hash; 83 | 84 | // Draw from Poisson distribution 85 | double p_z = to_unit(T_p(z)); 86 | uint8_t p = 0; 87 | while(p_z > poisson_cdf[p]) { 88 | p++; 89 | } 90 | 91 | uint64_t q = 0; 92 | while(q < p) { 93 | // Layer the q-values over z to create a unique key with a strong hash value 94 | uint64_t z_q = z ^ (q << 56) ^ (q << 48) ^ (q << 40) ^ (q << 32) ^ (q << 24) ^ (q << 16) ^ (q << 8) ^ q; 95 | auto uniform_weight_rank = to_units(T_q(z_q)); 96 | double weight = w0 + delta_nu*uniform_weight_rank.first; 97 | double rank = r0 + delta_rho*uniform_weight_rank.second; 98 | if(weight < xi && rank < max_rank) { 99 | darts.push_back({F(z_q), rank}); 100 | } 101 | q++; 102 | } 103 | 104 | r0 += delta_rho; 105 | } 106 | w0 += delta_nu; 107 | } 108 | } 109 | } 110 | } 111 | return darts; 112 | } 113 | 114 | 115 | // Convert the t darts to k minhashes by hashing the darts to k buckets and keeping the minimum from each bucket 116 | vector> minhash(const vector>& x, uint64_t k) { 117 | auto darts = (*this)(x); 118 | vector> minhashes(k, {0, numeric_limits::max()}); 119 | for(auto& dart : darts) { 120 | uint64_t j = M(dart.first) % k; 121 | if(dart.second < minhashes[j].second) { 122 | minhashes[j] = dart; 123 | } 124 | } 125 | return minhashes; 126 | } 127 | 128 | vector onebit_minhash(const vector>& x, uint64_t k) { 129 | vector sketch(k, false); 130 | auto minhashes = minhash(x, k); 131 | for(uint64_t i = 0; i < k; i++) { 132 | sketch[i] = ((minhashes[i].first & 1ull) == 1ull); // use first bit of MinHash id 133 | } 134 | return sketch; 135 | } 136 | }; 137 | 138 | #endif -------------------------------------------------------------------------------- /dartminhash.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef SKETCH_DARTMINHASH 2 | #define SKETCH_DARTMINHASH 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "similarity.hpp" 11 | #include "hashing.hpp" 12 | #include "darthash.hpp" 13 | 14 | using namespace std; 15 | 16 | class DartMinHash { 17 | 18 | private: 19 | uint64_t k; 20 | TabulationHashFunction T; 21 | DartHash D; 22 | 23 | public: 24 | // Set t = k ln(k) + 2k so the probability of failing on the first run is at most exp(-2) 25 | DartMinHash(mt19937_64& rng, uint64_t k) : k(k), T(rng), D(rng, ceil(k*log(k) + 2*k)) {}; 26 | 27 | vector> operator()(const vector>& x) { 28 | bool all_minhashed = false; 29 | double theta = 1.0; 30 | vector> minhashes(k, {0, numeric_limits::max()}); 31 | while(!all_minhashed) { 32 | vector minhashed(k, false); 33 | auto darts = D(x, theta); 34 | // Place darts into buckets 35 | for(auto& dart : darts) { 36 | uint64_t j = T(dart.first) % k; 37 | minhashed[j] = true; 38 | if(dart.second < minhashes[j].second) { 39 | minhashes[j] = dart; 40 | } 41 | } 42 | // Verify whether all minhashes were computed 43 | all_minhashed = true; 44 | for(bool mh : minhashed) { 45 | if(!mh) { 46 | all_minhashed = false; 47 | } 48 | } 49 | 50 | theta = theta + 0.5; 51 | } 52 | return minhashes; 53 | } 54 | 55 | vector onebit_minhash(const vector>& x) { 56 | vector sketch(k, false); 57 | auto minhashes = (*this)(x); 58 | for(uint64_t i = 0; i < k; i++) { 59 | sketch[i] = ((minhashes[i].first & 1ull) == 1ull); // use first bit of MinHash id 60 | } 61 | return sketch; 62 | } 63 | }; 64 | 65 | #endif -------------------------------------------------------------------------------- /datagenerator.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SKETCH_DATAGENERATOR 2 | #define SKETCH_DATAGENERATOR 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 
using namespace std;

// Generate random histograms by first picking m entries randomly to have nonzero weights, and then sorting m - 1 uniformly distributed variables between zero and one.
// Assign weights as the gaps in sorted order.
//
// L0: number of nonzero entries. L1: total weight of the set.
// Returns (index, weight) pairs sorted by index, whose weights sum to exactly L1.
// Returns the empty set for L0 == 0 (the gap loop below would otherwise
// underflow the unsigned bound L0 - 1 and run away).
vector<pair<uint64_t, double>> generate_weighted_set(uint64_t L0, double L1, mt19937_64& rng) {
	if(L0 == 0) {
		return {};
	}

	// Draw L0 distinct random 64-bit indices
	unordered_set<uint64_t> elements;
	uniform_int_distribution<uint64_t> random_index;
	while(elements.size() < L0) {
		elements.insert(random_index(rng));
	}

	// L0 - 1 uniform split points plus the endpoint 1.0; the sorted gaps sum to one
	uniform_real_distribution<double> uniform_splitter(0, 1);
	vector<double> z;
	z.reserve(L0);
	for(uint64_t i = 0; i < L0 - 1; i++) {
		z.push_back(uniform_splitter(rng));
	}
	z.push_back(1.0);
	sort(z.begin(), z.end());

	// Assign the scaled gaps as weights; total weight is exactly L1
	double prev = 0.0;
	uint32_t j = 0;
	vector<pair<uint64_t, double>> x;
	x.reserve(L0);
	for(uint64_t index : elements) {
		double weight = L1*(z[j] - prev);
		x.push_back(pair<uint64_t, double>(index, weight));
		prev = z[j];
		j++;
	}

	// Sort the vector of pairs by indices
	sort(x.begin(), x.end());
	return x;
}

// Given x we can generate y s.t. the intersection between x and y is equal to some pre-specificed value
// We will do this by setting y to an appropriately scaled down copy of x and adding the remaining mass to an element that does not exist in x.
47 | vector> generate_similar_weighted_set(const vector>& x, double relative_overlap, mt19937_64& rng) { 48 | // Pick a random free element j 49 | uint64_t j; 50 | bool free = false; 51 | while(!free) { 52 | j = rng(); 53 | free = true; 54 | for(auto element : x) { 55 | if(j == element.first) { 56 | free = false; 57 | } 58 | } 59 | } 60 | 61 | double excess_weight = 0.0; 62 | vector> y; 63 | for(auto element : x) { 64 | double w = element.second; 65 | double w_scaled = w*relative_overlap; 66 | double excess = w - w_scaled; 67 | y.push_back({element.first, w_scaled}); 68 | excess_weight += excess; 69 | } 70 | y.push_back({j, excess_weight}); 71 | return y; 72 | } 73 | 74 | #endif -------------------------------------------------------------------------------- /hashing.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SKETCH_HASHING 2 | #define SKETCH_HASHING 3 | 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | 9 | double to_unit(uint64_t x) { 10 | return (double)x/0xFFFFFFFFFFFFFFFFull; 11 | } 12 | 13 | double to_unit32(uint32_t x) { 14 | return (double)x/0xFFFFFFFFul; 15 | } 16 | 17 | // Convert a 64-bit uint to two doubles by splitting it in two and normalizing each 32-bit part 18 | pair to_units(uint64_t x) { 19 | return { 20 | ((double)(x >> 32))/0xFFFFFFFFul, 21 | (double)(x & 0xFFFFFFFFull)/0xFFFFFFFFul 22 | }; 23 | } 24 | 25 | class TabulationHashFunction { 26 | 27 | private: 28 | const uint64_t mask = 0xFF; 29 | uint64_t T1[256]; 30 | uint64_t T2[256]; 31 | uint64_t T3[256]; 32 | uint64_t T4[256]; 33 | uint64_t T5[256]; 34 | uint64_t T6[256]; 35 | uint64_t T7[256]; 36 | uint64_t T8[256]; 37 | 38 | public: 39 | TabulationHashFunction(mt19937_64& rng) { 40 | for(int i = 0; i < 256; i++) { 41 | T1[i] = rng(); 42 | T2[i] = rng(); 43 | T3[i] = rng(); 44 | T4[i] = rng(); 45 | T5[i] = rng(); 46 | T6[i] = rng(); 47 | T7[i] = rng(); 48 | T8[i] = rng(); 49 | } 50 | } 51 | 52 | uint64_t operator()(uint64_t x) { 
53 | uint64_t hashvalue = T1[x & mask] ^ T2[(x >> 8) & mask] ^ T3[(x >> 16) & mask] ^ T4[(x >> 24) & mask] ^ 54 | T5[(x >> 32) & mask] ^ T6[(x >> 40) & mask] ^ T7[(x >> 48) & mask] ^ T8[(x >> 56) & mask]; 55 | return hashvalue; 56 | } 57 | }; 58 | 59 | class TabulationHashFunction8 { 60 | 61 | private: 62 | uint64_t T1[256]; 63 | 64 | public: 65 | TabulationHashFunction8(mt19937_64& rng) { 66 | for(int i = 0; i < 256; i++) { 67 | T1[i] = rng(); 68 | } 69 | } 70 | 71 | uint64_t operator()(uint8_t x) { 72 | return T1[x]; 73 | } 74 | }; 75 | 76 | class TabulationHashFunction32 { 77 | 78 | private: 79 | const uint32_t mask = 0xFF; 80 | uint64_t T1[256]; 81 | uint64_t T2[256]; 82 | uint64_t T3[256]; 83 | uint64_t T4[256]; 84 | 85 | public: 86 | TabulationHashFunction32(mt19937_64& rng) { 87 | for(int i = 0; i < 256; i++) { 88 | T1[i] = rng(); 89 | T2[i] = rng(); 90 | T3[i] = rng(); 91 | T4[i] = rng(); 92 | } 93 | } 94 | 95 | uint64_t operator()(uint32_t x) { 96 | uint64_t hashvalue = T1[x & mask] ^ T2[(x >> 8) & mask] ^ T3[(x >> 16) & mask] ^ T4[(x >> 24) & mask]; 97 | return hashvalue; 98 | } 99 | }; 100 | 101 | #endif -------------------------------------------------------------------------------- /icws.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SKETCH_ICWS 2 | #define SKETCH_ICWS 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "hashing.hpp" 11 | 12 | using namespace std; 13 | 14 | // see Ioffe, Sergey. "Improved consistent sampling, weighted minhash and l1 sketching." Data Mining (ICDM), 2010 IEEE 10th International Conference on. IEEE, 2010. 
15 | class ICWS { 16 | 17 | private: 18 | TabulationHashFunction T1, T2, T3, T4, T5; 19 | 20 | public: 21 | ICWS(mt19937_64& rng) : T1(rng), T2(rng), T3(rng), T4(rng), T5(rng) {} 22 | 23 | pair operator()(const vector>& x) { 24 | 25 | double a_star = numeric_limits::max(); 26 | uint64_t k_star = 0; 27 | double y_star = 0; 28 | 29 | for(const pair& element : x) { 30 | uint64_t k = element.first; 31 | double S_k = element.second; 32 | double r_u1 = to_unit(T1(k)); 33 | double r_u2 = to_unit(T2(k)); 34 | double c_u1 = to_unit(T3(k)); 35 | double c_u2 = to_unit(T4(k)); 36 | double beta_k = to_unit(T5(k)); 37 | double r_k = -log(r_u1) -log(r_u2); 38 | double c_k = -log(c_u1) -log(c_u2); 39 | 40 | double t_k = floor(log(S_k)/r_k + beta_k); 41 | double y_k = exp(r_k*(t_k - beta_k)); 42 | double a_k = c_k/(y_k*exp(r_k)); 43 | if(a_k < a_star) { 44 | a_star = a_k; 45 | k_star = k; 46 | y_star = y_k; 47 | } 48 | } 49 | 50 | return pair(k_star, y_star); 51 | } 52 | }; 53 | 54 | class ICWS_t { 55 | private: 56 | uint64_t t; 57 | vector M; 58 | 59 | public: 60 | ICWS_t(mt19937_64& rng, uint64_t t) : t(t) { 61 | for(uint64_t i = 0; i < t; i++) { 62 | M.push_back(ICWS(rng)); 63 | } 64 | } 65 | 66 | vector> operator()(const vector>& x) { 67 | vector> minhashes; 68 | for(uint64_t i = 0; i < t; i++) { 69 | minhashes.push_back((M[i])(x)); 70 | } 71 | return minhashes; 72 | } 73 | }; 74 | 75 | // ICWS with tabulated random draws and precomputed logarithms of weights 76 | // Also working with the logarithm og y_k and a_k as suggested in the ICWS paper 77 | // Completely avoids logarithms, exponentials, and divisions. Relies entirely on table lookups and multiplication. 
78 | class FastICWS { 79 | 80 | private: 81 | TabulationHashFunction T1, T2, T3; 82 | const vector& gamma; 83 | const vector& gamma_inv; 84 | const vector& log_gamma; 85 | 86 | public: 87 | FastICWS(mt19937_64& rng, const vector& gamma, const vector& gamma_inv, const vector& log_gamma) 88 | : T1(rng), T2(rng), T3(rng), gamma(gamma), gamma_inv(gamma_inv), log_gamma(log_gamma) {} 89 | 90 | pair operator()(const vector>& log_weight_x) { 91 | 92 | const uint64_t MASK16 = 0xFFFFull; 93 | double log_a_star = numeric_limits::max(); 94 | uint64_t minhash = 0; 95 | 96 | for(const pair& element : log_weight_x) { 97 | uint64_t k = element.first; 98 | double log_S_k = element.second; 99 | 100 | uint64_t z = T1(k); 101 | double r_k = gamma[z & MASK16]; 102 | double r_k_inv = gamma_inv[z & MASK16]; 103 | double log_c_k = log_gamma[(z >> 16) & MASK16]; 104 | double beta_k = to_unit32(z >> 32); 105 | 106 | double t_k = floor(log_S_k*r_k_inv + beta_k); 107 | double log_y_k = r_k*(t_k - beta_k); 108 | 109 | double log_a_k = log_c_k - log_y_k - r_k; 110 | if(log_a_k < log_a_star) { 111 | log_a_star = log_a_k; 112 | minhash = T2((uint64_t)t_k) ^ T3(k); // 64-bit minhash 113 | } 114 | } 115 | 116 | return pair(minhash, log_a_star); 117 | } 118 | }; 119 | 120 | class FastICWS_t { 121 | private: 122 | uint64_t t; 123 | vector M; 124 | vector gamma; 125 | vector gamma_inv; 126 | vector log_gamma; 127 | 128 | public: 129 | FastICWS_t(mt19937_64& rng, uint64_t t) : t(t) { 130 | 131 | // Create discretized version of the X ~ Gamma(2,1) distribution 132 | // pdf: z*exp(-z) 133 | // cdf: 1 - exp(-z)*(z + 1) 134 | // We want to create tables with 2^16 entries 135 | // The ith entry (starting from 0) will be an interpolation between a value z_{i} with Pr[X <= z_{i}] <= (i + 1)*epsilon 136 | // and a value z_{i+1} > z_{i} with Pr[X <= z_{i+1}] <= (i+2)*epsilon 137 | 138 | double z = 0.0; 139 | double z_prev = 0.0; 140 | double epsilon = 1.0/((1 << 16) + 1); 141 | double delta = epsilon; // How 
much to advance z in each step. We can set this to epsilon without skipping steps because pdf <= 1/e. 142 | 143 | for(int i = 0; i < (1 << 16); i++) { 144 | double target_mass = (i+1)*epsilon; 145 | while(1 - exp(-z)*(z+1) < target_mass) { 146 | z += delta; 147 | } 148 | // Fill tables 149 | double v = (z + z_prev)/2; 150 | z_prev = z; 151 | gamma.push_back(v); 152 | gamma_inv.push_back(1/v); 153 | log_gamma.push_back(log(v)); 154 | } 155 | 156 | for(uint64_t i = 0; i < t; i++) { 157 | M.push_back(FastICWS(rng, gamma, gamma_inv, log_gamma)); 158 | } 159 | } 160 | 161 | vector> operator()(const vector>& x) { 162 | vector> log_weight_x; 163 | for(auto element : x) { 164 | log_weight_x.push_back({element.first, log(element.second)}); 165 | } 166 | vector> minhashes; 167 | for(uint64_t i = 0; i < t; i++) { 168 | minhashes.push_back((M[i])(log_weight_x)); 169 | } 170 | return minhashes; 171 | } 172 | }; 173 | 174 | 175 | #endif 176 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "timer.hpp" 8 | #include "icws.hpp" 9 | #include "darthash.hpp" 10 | #include "dartminhash.hpp" 11 | #include "datagenerator.hpp" 12 | #include "bagminhash_wrappers.hpp" 13 | 14 | using namespace std; 15 | 16 | template 17 | double time_algorithm(const vector>>& data, T& hasher, uint64_t& id_xor, double& rank_sum) { 18 | Timer timer; 19 | for(auto& x : data) { 20 | timer.start(); 21 | auto minhashes = hasher(x); 22 | timer.stop(); 23 | 24 | // Do something with the minhashes to ensure that the compiler doesn't optimize them away 25 | for(auto mh : minhashes) { 26 | id_xor = id_xor ^ mh.first; 27 | rank_sum += mh.second; 28 | } 29 | } 30 | 31 | return timer.elapsed_ms()/data.size(); 32 | } 33 | 34 | void time_performance() { 35 | uint64_t seed = 1; 36 | mt19937_64 rng(seed); 37 | 38 | // 
Settings for L0 plot 39 | // vector L0_values; 40 | // for(uint64_t i = 0; i < 17; i++) { 41 | // L0_values.push_back(1ull << i); 42 | // } 43 | // vector L1_values = {1.0}; 44 | // vector t_values = {256}; 45 | // uint64_t m = 100; 46 | 47 | // Settings for L1 plot 48 | // vector L0_values = {256}; 49 | // vector t_values = {256}; 50 | // vector L1_values; 51 | // for(int i = -128; i <= 128; i = i + 16) { 52 | // L1_values.push_back(pow(2.0, (double)i)); 53 | // } 54 | // uint64_t m = 100; 55 | 56 | // Settings for k plot 57 | // vector t_values; 58 | // for(uint64_t i = 0; i < 17; i++) { 59 | // t_values.push_back(1ull << i); 60 | // } 61 | // vector L1_values = {1.0}; 62 | // vector L0_values = {256}; 63 | // uint64_t m = 100; 64 | 65 | // Exploration 66 | // vector L0_values = {1, 4, 16, 64, 256, 1024, 4096, 16384}; 67 | // vector L1_values = {pow(2.0, -32), pow(2.0, -16), pow(2.0, -8), pow(2.0, 0), pow(2.0, 8), pow(2.0, 16), pow(2.0, 32)}; 68 | // vector t_values = {1, 4, 16, 64, 256, 1024, 4096, 16384}; 69 | // uint64_t m = 10; 70 | 71 | // Settings for heatmap 72 | vector L0_values; 73 | vector t_values; 74 | for(uint64_t i = 0; i < 15; i++) { 75 | L0_values.push_back(1ull << i); 76 | t_values.push_back(1ull << i); 77 | } 78 | vector L1_values = {1.0}; 79 | uint64_t m = 100; 80 | 81 | 82 | uint64_t id_xor = 0; 83 | double rank_sum = 0; 84 | uint64_t experiment_counter = 0; 85 | 86 | cout << setprecision(5) << fixed; 87 | cout << "id, L0, log2_L1, t, FastICWS, BagMinHash2, DartMinHash" << endl; 88 | 89 | for(uint64_t L0 : L0_values) { 90 | for(double L1 : L1_values) { 91 | for(uint64_t t : t_values) { 92 | 93 | cout << experiment_counter << ", " << L0 << ", " << log2(L1) << ", " << t << ", "; 94 | 95 | // Generate data 96 | vector>> data; 97 | for(uint64_t i = 0; i < m; i++) { 98 | data.push_back(generate_weighted_set(L0, L1, rng)); 99 | } 100 | 101 | // Algorithms 102 | // ICWS_t I(rng, t); 103 | // cout << time_algorithm(data, I, id_xor, rank_sum) << ", "; 
104 | 105 | // ICWS_xxhash I_xx(t); 106 | // cout << time_algorithm(data, I_xx, id_xor, rank_sum) << ", "; 107 | 108 | FastICWS_t F(rng, t); 109 | cout << time_algorithm(data, F, id_xor, rank_sum) << ", "; 110 | 111 | // BagMinHash1 B1(t); 112 | // cout << time_algorithm(dat a, B1, id_xor, rank_sum) << ", "; 113 | 114 | BagMinHash2 B2(t); 115 | cout << time_algorithm(data, B2, id_xor, rank_sum) << ", "; 116 | 117 | DartMinHash M(rng, t); 118 | cout << time_algorithm(data, M, id_xor, rank_sum) << endl; 119 | 120 | experiment_counter++; 121 | } 122 | } 123 | } 124 | 125 | cout << "rank sum: " << rank_sum << ", id XOR: " << id_xor << endl; 126 | } 127 | 128 | struct experiment_settings { 129 | uint64_t L0; 130 | double L1; 131 | uint64_t t; 132 | }; 133 | 134 | void time_performance_specific() { 135 | uint64_t seed = 1; 136 | mt19937_64 rng(seed); 137 | 138 | vector settings { 139 | // L0, L1, t 140 | // Varying L0 141 | {64, pow(2.0, 0.0), 64}, 142 | {1024, pow(2.0, 0.0), 64}, 143 | // {16384, pow(2.0, 0.0), 64}, 144 | 145 | {64, pow(2.0, 0.0), 1024}, 146 | {1024, pow(2.0, 0.0), 1024}, 147 | // {16384, pow(2.0, 0.0), 1024}, 148 | 149 | // // Varying t 150 | {256, pow(2.0, 0.0), 1}, 151 | {256, pow(2.0, 0.0), 256}, 152 | // {256, pow(2.0, 0.0), 4096}, 153 | 154 | // {4096, pow(2.0, 0.0), 1}, 155 | // {4096, pow(2.0, 0.0), 256}, 156 | // {4096, pow(2.0, 0.0), 4096}, 157 | 158 | // // Varying L1 159 | {1024, pow(2.0, 0.0), 256}, 160 | {1024, pow(2.0, 64.0), 256}, 161 | {1024, pow(2.0, -64.0), 256}, 162 | // {1024, pow(2.0, 512.0), 256}, 163 | // {1024, pow(2.0, -512.0), 256}, 164 | }; 165 | uint64_t m = 100; 166 | 167 | uint64_t id_xor = 0; 168 | double rank_sum = 0; 169 | uint64_t experiment_counter = 0; 170 | 171 | cout << setprecision(3) << fixed; 172 | cout << "id, L0, log2_L1, t, ICWS, FastICWS, ICWS_xxhash, BagMinHash1, BagMinHash2, DartMinHash" << endl; 173 | 174 | for(experiment_settings s : settings) { 175 | 176 | cout << experiment_counter << ", " << s.L0 << 
", " << log2(s.L1) << ", " << s.t << ", "; 177 | 178 | // Generate data 179 | vector>> data; 180 | for(uint64_t i = 0; i < m; i++) { 181 | data.push_back(generate_weighted_set(s.L0, s.L1, rng)); 182 | } 183 | 184 | // Algorithms 185 | ICWS_t I(rng, s.t); 186 | cout << time_algorithm(data, I, id_xor, rank_sum) << ", "; 187 | 188 | FastICWS_t F(rng, s.t); 189 | cout << time_algorithm(data, F, id_xor, rank_sum) << ", "; 190 | 191 | ICWS_xxhash I_xx(s.t); 192 | cout << time_algorithm(data, I_xx, id_xor, rank_sum) << ", "; 193 | 194 | BagMinHash1 B1(s.t); 195 | cout << time_algorithm(data, B1, id_xor, rank_sum) << ", "; 196 | 197 | BagMinHash2 B2(s.t); 198 | cout << time_algorithm(data, B2, id_xor, rank_sum) << ", "; 199 | 200 | DartMinHash M(rng, s.t); 201 | cout << time_algorithm(data, M, id_xor, rank_sum) << endl; 202 | 203 | experiment_counter++; 204 | } 205 | 206 | cout << "rank sum: " << rank_sum << ", id XOR: " << id_xor << endl; 207 | } 208 | 209 | // Measure how the estimated jaccard similarity changes as the number of minhashes increases 210 | void measure_similarity() { 211 | uint64_t seed = 1; 212 | mt19937_64 rng(seed); 213 | 214 | // Experiment in paper 215 | // uint64_t L0 = 256; 216 | // double L1 = 1.0; 217 | // uint64_t t_min = 1; 218 | // uint64_t t_max = 100; 219 | // vector jaccard_similarity_values = {0.25, 0.5, 0.75}; 220 | 221 | uint64_t L0 = 256; 222 | double L1 = 1.0; 223 | uint64_t t_min = 1; 224 | uint64_t t_max = 10; 225 | vector jaccard_similarity_values = {0.5}; 226 | 227 | 228 | vector t_values; 229 | for(uint64_t i = t_min; i < t_max + 1; i++) { 230 | t_values.push_back(i); 231 | } 232 | 233 | cout << setprecision(3) << fixed; 234 | 235 | cout << "sim_j, t, ICWS_xxhash, FastICWS, BagMinHash2, DartMinHash" << endl; 236 | for(double jaccard_similarity : jaccard_similarity_values) { 237 | for(uint64_t t : t_values) { 238 | cout << jaccard_similarity << ", " << t << ", "; 239 | 240 | // Generate a pair of similar points 241 | double l1_sim = 
l1_similarity_from_jaccard_similarity(L1, L1, jaccard_similarity); 242 | auto x = generate_weighted_set(L0, L1, rng); 243 | auto y = generate_similar_weighted_set(x, l1_sim, rng); 244 | 245 | // // Algorithms 246 | ICWS_xxhash I_xx(t); 247 | cout << jaccard_estimate_from_minhashes(I_xx(x), I_xx(y)) << ", "; 248 | 249 | FastICWS_t F(rng, t); 250 | cout << jaccard_estimate_from_minhashes(F(x), F(y)) << ", "; 251 | 252 | BagMinHash2 B2(t); 253 | cout << jaccard_estimate_from_minhashes(B2(x), B2(y)) << ", "; 254 | 255 | DartMinHash D(rng, t); 256 | cout << jaccard_estimate_from_minhashes(D(x), D(y)) << endl; 257 | 258 | } 259 | } 260 | } 261 | 262 | int main() { 263 | // time_performance(); 264 | time_performance_specific(); 265 | // measure_similarity(); 266 | } -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | CFLAGS = -std=c++17 -march=native -Wall -O3 3 | INCLUDES = $(shell find -name '*.hpp') 4 | XXHASHPATH = bagminhash/xxhash/libxxhash.a 5 | 6 | run: main 7 | ./main 8 | 9 | main: main.o 10 | $(CC) -o main main.o $(XXHASHPATH) $(CFLAGS) 11 | 12 | main.o: main.cpp $(INCLUDES) 13 | $(CC) -o main.o -c main.cpp $(CFLAGS) 14 | 15 | test: tests 16 | ./tests 17 | 18 | tests: tests.o tests-main.o 19 | $(CC) -o tests tests.o tests-main.o $(XXHASHPATH) $(CFLAGS) 20 | 21 | tests.o: tests.cpp tests-main.o $(INCLUDES) 22 | $(CC) -c tests.cpp -o tests.o $(CFLAGS) 23 | 24 | tests-main.o: tests-main.cpp catch.hpp 25 | $(CC) -c tests-main.cpp -o tests-main.o $(CFLAGS) 26 | 27 | clean: 28 | rm -rf *.o 29 | -------------------------------------------------------------------------------- /output/performance.csv: -------------------------------------------------------------------------------- 1 | id, L0, log2_L1, t, ICWS, FastICWS, ICWS_xxhash, BagMinHash1, BagMinHash2, DartMinHash 2 | 0, 64, 0.000, 64, 0.899, 0.060, 0.538, 2.439, 0.628, 0.042 3 | 
/* (benchmark data continued)
1, 1024, 0.000, 64, 11.565, 0.515, 9.604, 4.374, 1.706, 0.145
2, 64, 0.000, 1024, 19.296, 2.885, 8.083, 48.248, 13.279, 0.592
3, 1024, 0.000, 1024, 187.661, 12.643, 120.135, 79.775, 16.586, 0.824
4, 256, 0.000, 1, 0.040, 0.008, 0.040, 0.112, 0.103, 0.021
5, 256, 0.000, 256, 14.645, 0.939, 7.716, 13.687, 3.270, 0.187
6, 1024, 0.000, 256, 45.239, 2.703, 30.127, 18.175, 4.296, 0.274
7, 1024, 64.000, 256, 46.717, 2.720, 30.122, 18.241, 4.250, 2.632
8, 1024, -64.000, 256, 47.677, 2.719, 30.117, 18.096, 4.192, 2.333
-------------------------------------------------------------------------------- /output/similarity.csv: --------------------------------------------------------------------------------
sim_j, t, ICWS_xxhash, FastICWS, BagMinHash2, DartMinHash
0.500, 1, 1.000, 1.000, 0.000, 1.000
0.500, 2, 0.500, 0.500, 0.000, 0.500
0.500, 3, 0.333, 0.333, 0.000, 0.333
0.500, 4, 0.500, 0.250, 0.750, 0.750
0.500, 5, 0.000, 0.400, 0.600, 0.200
0.500, 6, 0.667, 0.500, 0.500, 0.000
0.500, 7, 0.571, 0.714, 0.429, 0.429
0.500, 8, 0.250, 0.375, 0.625, 0.500
0.500, 9, 0.889, 0.222, 0.556, 0.444
0.500, 10, 0.600, 0.400, 0.700, 0.400
*/
// -------------------------------------------------------------------------------- /similarity.hpp: --------------------------------------------------------------------------------
#ifndef SKETCH_SIMILARITY
#define SKETCH_SIMILARITY

#include <vector>
#include <utility>
#include <algorithm>
#include <cstdint>

using namespace std;

// Total weight (L1 norm) of a weighted set given as (index, weight) pairs
double weight(const vector<pair<uint64_t, double>>& x) {
	double w = 0;
	for(const pair<uint64_t, double>& v : x) {
		w += v.second;
	}
	return w;
}

// Weighted intersection of x and y: sum over shared indices of min(weight).
// Both inputs must be sorted by index (generate_weighted_set guarantees this).
double intersection(const vector<pair<uint64_t, double>>& x, const vector<pair<uint64_t, double>>& y) {
	uint64_t i = 0;
	uint64_t j = 0;
	double s = 0;
	// Standard sorted-merge walk over the two index sequences
	while(i < x.size() && j < y.size()) {
		if(x[i].first == y[j].first) {
			s += min(x[i].second, y[j].second);
			i++;
			j++;
		} else if(x[i].first < y[j].first) {
			i++;
		} else {
			j++;
		}
	}
	return s;
}

// Weighted Jaccard similarity: |x cap y| / |x cup y|.
// Returns 0 when both sets are empty (the union has zero weight).
double jaccard_similarity(const vector<pair<uint64_t, double>>& x, const vector<pair<uint64_t, double>>& y) {
	double s = intersection(x, y);
	double w_x = weight(x);
	double w_y = weight(y);
	double u = w_x + w_y - s;
	return u > 0 ? s/u : 0.0;
}

// L1 similarity: intersection normalized by the lighter set.
// Returns 0 when either set has zero weight.
double l1_similarity(const vector<pair<uint64_t, double>>& x, const vector<pair<uint64_t, double>>& y) {
	double s = intersection(x, y);
	double w_x = weight(x);
	double w_y = weight(y);
	double m = min(w_x, w_y);
	return m > 0 ? s/m : 0.0;
}

// Number of positions where the two bit vectors differ.
// Only compares up to the shorter length to avoid reading out of bounds.
double hamming_distance(const vector<bool>& x, const vector<bool>& y) {
	double h = 0;
	uint32_t n = min(x.size(), y.size());
	for(uint32_t i = 0; i < n; i++) {
		if(x[i] != y[i]) {
			h = h + 1;
		}
	}
	return h;
}

// Unbiased Jaccard estimate from 1-bit minhash sketches, clamped at zero
double onebit_minhash_jaccard_estimate(const vector<bool>& x, const vector<bool>& y) {
	double h = hamming_distance(x, y);
	double t = x.size();
	return max(0.0, 2*(1 - h/t) - 1);
}

// Similarity conversions
// L1 similarity is the normalized intersection: |x \cap y| / min(|x|, |y|)
// Jaccard similarity is: |x \cap y| / |x \cup y|
double jaccard_similarity_from_l1_similarity(double x_weight, double y_weight, double l1_sim) {
	double i = min(x_weight, y_weight)*l1_sim;
	double u = x_weight + y_weight - i;
	return i/u;
}

double l1_similarity_from_jaccard_similarity(double x_weight, double y_weight, double jaccard_sim) {
	double i = jaccard_sim*(x_weight + y_weight)/(1 + jaccard_sim);
	return i/min(x_weight, y_weight);
}

// Count the number of collisions in two vectors of minhash sketches ((id, rank) pairs).
// Compares only up to the shorter length to avoid reading out of bounds.
uint64_t count_collisions(const vector<pair<uint64_t, double>>& x, const vector<pair<uint64_t, double>>& y) {
	uint64_t collisions = 0;
	uint64_t n = min(x.size(), y.size());
	for(uint64_t i = 0; i < n; i++) {
		if(x[i].first == y[i].first) {
			collisions++;
		}
	}
	return collisions;
}

// Fraction of colliding minhashes; 0 for empty sketches (avoids dividing by zero)
double jaccard_estimate_from_minhashes(const vector<pair<uint64_t, double>>& x, const vector<pair<uint64_t, double>>& y) {
	return x.empty() ? 0.0 : (double)count_collisions(x, y)/x.size();
}

#endif
-------------------------------------------------------------------------------- /tests-main.cpp: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do this in one cpp file 2 | #include "catch.hpp" 3 | -------------------------------------------------------------------------------- /tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "catch.hpp" 5 | #include "datagenerator.hpp" 6 | #include "similarity.hpp" 7 | #include "darthash.hpp" 8 | #include "icws.hpp" 9 | #include "dartminhash.hpp" 10 | #include "bagminhash_wrappers.hpp" 11 | 12 | using namespace std; 13 | 14 | TEST_CASE("Randomly generated weighted sets behave as expected", "[datagenerator]") { 15 | SECTION("Size and weight") { 16 | uint64_t seed = 1; 17 | mt19937_64 rng(seed); 18 | uint64_t L0 = 128; 19 | double L1 = 1.0; 20 | vector> x = generate_weighted_set(L0, L1, rng); 21 | REQUIRE(x.size() == L0); 22 | 23 | bool sorted = true; 24 | for(uint32_t i = 1; i < x.size(); i++) { 25 | if(x[i-1].first > x[i].first) { 26 | sorted = false; 27 | } 28 | } 29 | REQUIRE(sorted); 30 | REQUIRE(weight(x) == Approx(L1)); 31 | } 32 | SECTION("Similar sets") { 33 | uint64_t seed = 1; 34 | mt19937_64 rng(seed); 35 | uint64_t L0 = 128; 36 | double L1 = 1.0; 37 | vector> x = generate_weighted_set(L0, L1, rng); 38 | auto y = generate_similar_weighted_set(x, 0.5, rng); 39 | 40 | // Ensure that the weight is maintained and that we have added an additional element for the excess weight. 
41 | REQUIRE(weight(y) == Approx(L1)); 42 | REQUIRE(y.size() == L0 + 1); 43 | 44 | // Ensure that the generated set matches the desired similarity 45 | int k = 10; 46 | for(int i = 0; i <= k; i++) { 47 | double s = ((double)i/k); 48 | auto y = generate_similar_weighted_set(x, s, rng); 49 | REQUIRE(jaccard_similarity(x, y) == Approx(s/(2-s))); 50 | } 51 | } 52 | } 53 | 54 | TEST_CASE("Similarity measures", "[similarity]") { 55 | SECTION("weight") { 56 | vector> x = { 57 | {1, 1.0}, 58 | {2, 2.0} 59 | }; 60 | REQUIRE(weight(x) == Approx(3.0)); 61 | } 62 | SECTION("jaccard_similarity") { 63 | vector> x = { 64 | {1, 1.0}, 65 | {2, 2.0} 66 | }; 67 | vector> y = { 68 | {1, 1.0}, 69 | {2, 1.0}, 70 | {3, 1.0} 71 | }; 72 | REQUIRE(intersection(x, y) == Approx(2.0)); 73 | REQUIRE(jaccard_similarity(x, y) == Approx(0.5)); 74 | } 75 | SECTION("Hamming distance") { 76 | REQUIRE(hamming_distance({true, true, true}, {false, false, false}) == 3); 77 | REQUIRE(hamming_distance({true, true, true}, {false, true, false}) == 2); 78 | } 79 | 80 | SECTION("similarity conversions") { 81 | vector> x = { 82 | {1, 1.0}, 83 | {2, 3.0} 84 | }; 85 | vector> y = { 86 | {1, 1.0}, 87 | {2, 1.0}, 88 | {3, 1.0} 89 | }; 90 | // jaccard similarity: 2/5 91 | // l1 similarity: 2/3 92 | double jaccard_sim = jaccard_similarity(x, y); 93 | double l1_sim = l1_similarity(x, y); 94 | REQUIRE(jaccard_sim == Approx(2.0/5)); 95 | REQUIRE(l1_sim == Approx(2.0/3)); 96 | double x_weight = weight(x); 97 | double y_weight = weight(y); 98 | REQUIRE(l1_similarity_from_jaccard_similarity(x_weight, y_weight, jaccard_sim) == Approx(l1_sim)); 99 | REQUIRE(jaccard_similarity_from_l1_similarity(x_weight, y_weight, l1_sim) == Approx(jaccard_sim)); 100 | } 101 | } 102 | 103 | TEST_CASE("DartHash", "[darthash]") { 104 | SECTION("Basic dart properties") { 105 | 106 | uint64_t seed = 1; 107 | mt19937_64 rng(seed); 108 | uint64_t t = 256; 109 | DartHash D(rng, t); 110 | uint64_t L0 = 128; 111 | double L1 = 1.0; 112 | auto x = 
generate_weighted_set(L0, L1, rng); 113 | auto darts = D(x); 114 | 115 | // Number of darts 116 | // According to http://www.cs.columbia.edu/~ccanonne/files/misc/2017-poissonconcentration.pdf 117 | // The probability that the number of darts deviates by more than t/2 is at most 2*exp(-t/10) 118 | REQUIRE(darts.size() > 128); 119 | REQUIRE(darts.size() < 256 + 128); 120 | 121 | // Dart ranks should be smaller than 1/L1 and fingerprints should be unique 122 | unordered_set fingerprints; 123 | bool too_large = false; 124 | bool too_small = true; 125 | for(auto& element : darts) { 126 | fingerprints.insert(element.first); 127 | if(element.second > 1/L1) { 128 | too_large = true; 129 | } 130 | 131 | // Ranks should be uniformly distributed between zero and 1/L1. 132 | if(element.second > 1/(2*L1)) { 133 | too_small = false; 134 | } 135 | } 136 | REQUIRE(!too_large); 137 | REQUIRE(!too_small); 138 | REQUIRE(fingerprints.size() == darts.size()); 139 | } 140 | 141 | SECTION("Darts to MinHash") { 142 | // Converting t darts to k minhashes 143 | // The probability of an "empty" minhash is at most t*exp(-t/k) by a standard union bound over Poisson distributions 144 | // Verify that when t/k is large that we have no empty minhashes 145 | // We set the id of empty minhashes to 0 146 | uint64_t seed = 1; 147 | mt19937_64 rng(seed); 148 | uint64_t t = 4096; 149 | uint64_t k = 128; 150 | DartHash D(rng, t); 151 | uint64_t L0 = 128; 152 | double L1 = 1.0; 153 | auto x = generate_weighted_set(L0, L1, rng); 154 | auto minhashes = D.minhash(x, k); 155 | bool all_nonempty = true; 156 | for(auto mh : minhashes) { 157 | if(mh.first == 0) { 158 | all_nonempty = false; 159 | } 160 | } 161 | REQUIRE(all_nonempty); 162 | 163 | // When k = t then we expect empty minhashes 164 | k = t; 165 | minhashes = D.minhash(x, k); 166 | all_nonempty = true; 167 | for(auto mh : minhashes) { 168 | if(mh.first == 0) { 169 | all_nonempty = false; 170 | } 171 | } 172 | REQUIRE(!all_nonempty); 173 | } 174 | 
175 | SECTION("MAE of 1-bit minhash sketch stays within Hoeffding bounds") { 176 | uint64_t seed = 1; 177 | mt19937_64 rng(seed); 178 | uint64_t t = 512; 179 | uint64_t k = 64; 180 | DartHash D(rng, t); 181 | uint64_t L0 = 64; 182 | double L1 = 1.0; 183 | double l1_sim = 0.5; 184 | uint64_t n = 2000; 185 | 186 | // MAE when using minhash to estimate l1 similarity 187 | double target_l1_sim_mae = 0.1079063; 188 | double epsilon = 0.05; 189 | 190 | double total_absolute_error = 0.0; 191 | for(uint64_t i = 0; i < n; i++) { 192 | auto x = generate_weighted_set(L0, L1, rng); 193 | auto y = generate_similar_weighted_set(x, l1_sim, rng); 194 | auto sketch_x = D.onebit_minhash(x, k); 195 | auto sketch_y = D.onebit_minhash(y, k); 196 | double jaccard_estimate = onebit_minhash_jaccard_estimate(sketch_x, sketch_y); 197 | total_absolute_error += abs(l1_similarity_from_jaccard_similarity(weight(x), weight(y), jaccard_estimate) - l1_sim); 198 | } 199 | 200 | double empirical_mae = total_absolute_error/n; 201 | REQUIRE(abs(target_l1_sim_mae - empirical_mae) <= epsilon); 202 | } 203 | } 204 | 205 | TEST_CASE("ICWS", "[icws]") { 206 | SECTION("Weighted samples are valid") { 207 | uint64_t seed = 1; 208 | mt19937_64 rng(seed); 209 | ICWS H(rng); 210 | int m = 100; 211 | uint64_t L0 = 64; 212 | double L1 = 1.0; 213 | for(int i = 0; i < m; i++) { 214 | auto x = generate_weighted_set(L0, L1, rng); 215 | auto z = H(x); 216 | bool valid_cws = false; 217 | for(auto element : x) { 218 | if(element.first == z.first && z.second <= element.second) { 219 | valid_cws = true; 220 | } 221 | } 222 | REQUIRE(valid_cws); 223 | } 224 | } 225 | } 226 | 227 | TEST_CASE("DartMinHash", "[dartminhash]") { 228 | SECTION("1-bit dartminhash MAE stays within Hoeffding bounds") { 229 | 230 | uint64_t seed = 1; 231 | mt19937_64 rng(seed); 232 | uint64_t k = 64; 233 | DartMinHash M(rng, k); 234 | uint64_t L0 = 64; 235 | double L1 = 1.0; 236 | double l1_sim = 0.5; 237 | uint64_t n = 2000; 238 | 239 | // MAE when 
using minhash to estimate l1 similarity 240 | double target_l1_sim_mae = 0.1079063; 241 | double epsilon = 0.05; 242 | 243 | double total_absolute_error = 0.0; 244 | for(uint64_t i = 0; i < n; i++) { 245 | auto x = generate_weighted_set(L0, L1, rng); 246 | auto y = generate_similar_weighted_set(x, l1_sim, rng); 247 | auto sketch_x = M.onebit_minhash(x); 248 | auto sketch_y = M.onebit_minhash(y); 249 | double jaccard_estimate = onebit_minhash_jaccard_estimate(sketch_x, sketch_y); 250 | total_absolute_error += abs(l1_similarity_from_jaccard_similarity(weight(x), weight(y), jaccard_estimate) - l1_sim); 251 | } 252 | 253 | double empirical_mae = total_absolute_error/n; 254 | REQUIRE(abs(target_l1_sim_mae - empirical_mae) <= epsilon); 255 | 256 | } 257 | } 258 | 259 | // Test correct estimation of jaccard similarity within Hoeffding bounds 260 | TEST_CASE("Jaccard similarity estimation", "[bagminhash]") { 261 | uint64_t seed = 1; 262 | mt19937_64 rng(seed); 263 | uint64_t L0 = 64; 264 | double L1 = 1.0; 265 | double l1_sim = 0.5; 266 | double target_jaccard_similarity = jaccard_similarity_from_l1_similarity(L1, L1, l1_sim); 267 | double epsilon = 0.05; 268 | uint64_t t = 2000; 269 | auto x = generate_weighted_set(L0, L1, rng); 270 | auto y = generate_similar_weighted_set(x, l1_sim, rng); 271 | 272 | SECTION("BagMinHash1") { 273 | BagMinHash1 B1(t); 274 | auto mh_x = B1(x); 275 | auto mh_y = B1(y); 276 | double estimated_jaccard_similarity = (double)count_collisions(mh_x, mh_y)/t; 277 | REQUIRE(abs(target_jaccard_similarity - estimated_jaccard_similarity) <= epsilon); 278 | } 279 | 280 | SECTION("BagMinHash2") { 281 | BagMinHash2 B2(t); 282 | auto mh_x = B2(x); 283 | auto mh_y = B2(y); 284 | double estimated_jaccard_similarity = (double)count_collisions(mh_x, mh_y)/t; 285 | REQUIRE(abs(target_jaccard_similarity - estimated_jaccard_similarity) <= epsilon); 286 | } 287 | 288 | SECTION("ICWS_xxhash") { 289 | ICWS_xxhash I(t); 290 | auto mh_x = I(x); 291 | auto mh_y = I(y); 
292 | double estimated_jaccard_similarity = (double)count_collisions(mh_x, mh_y)/t; 293 | REQUIRE(abs(target_jaccard_similarity - estimated_jaccard_similarity) <= epsilon); 294 | } 295 | 296 | SECTION("FastICWS") { 297 | FastICWS_t F(rng, t); 298 | auto mh_x = F(x); 299 | auto mh_y = F(y); 300 | double estimated_jaccard_similarity = (double)count_collisions(mh_x, mh_y)/t; 301 | REQUIRE(abs(target_jaccard_similarity - estimated_jaccard_similarity) <= epsilon); 302 | } 303 | } 304 | 305 | 306 | -------------------------------------------------------------------------------- /timer.hpp: -------------------------------------------------------------------------------- 1 | 2 | #ifndef SKETCH_TIMER 3 | #define SKETCH_TIMER 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | 11 | struct Timer { 12 | string name; 13 | chrono::nanoseconds elapsed; 14 | chrono::high_resolution_clock::time_point start_time; 15 | 16 | Timer() : Timer("Timer") {} 17 | Timer(string name) : name(name) { 18 | elapsed = chrono::nanoseconds(0); 19 | } 20 | 21 | void start() { 22 | start_time = std::chrono::high_resolution_clock::now(); 23 | } 24 | 25 | void stop() { 26 | elapsed += std::chrono::high_resolution_clock::now() - start_time; 27 | } 28 | 29 | void reset() { 30 | elapsed = chrono::nanoseconds(0); 31 | } 32 | 33 | double elapsed_s() { 34 | return elapsed_ms()/1000; 35 | } 36 | 37 | double elapsed_ms() { 38 | return (double)elapsed.count()/1000000; 39 | } 40 | 41 | double elapsed_ns() { 42 | return elapsed.count(); 43 | } 44 | 45 | void print_ms() { 46 | cout << fixed << setprecision(2); 47 | cout << name << ": " << elapsed_ms() << " (ms)" << endl; 48 | } 49 | 50 | void print_s() { 51 | cout << fixed << setprecision(2); 52 | cout << name << ": " << elapsed_s() << " (s)" << endl; 53 | } 54 | }; 55 | 56 | #endif 57 | --------------------------------------------------------------------------------