├── .gitignore
├── LICENSE.txt
├── README.md
├── bagminhash
    ├── bitstream_random.hpp
    ├── exponential_distribution.hpp
    └── weighted_minwise_hashing.hpp
├── bagminhash_wrappers.hpp
├── catch.hpp
├── darthash.hpp
├── dartminhash.hpp
├── datagenerator.hpp
├── hashing.hpp
├── icws.hpp
├── main.cpp
├── makefile
├── output
    ├── performance.csv
    └── similarity.csv
├── similarity.hpp
├── tests-main.cpp
├── tests.cpp
└── timer.hpp


/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | main
3 | tests
4 | .vscode/*
5 | /bagminhash/xxhash
6 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Tobias Christiani
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # DartMinHash: Fast Sketching for Weighted Sets
 2 | 
 3 | This repository contains experiments for comparing the estimation accuracy and running times of the following weighted minwise hashing algorithms:
 4 | 
 5 | * DartMinHash https://arxiv.org/abs/2005.11547 
 6 | * ICWS and FastICWS https://research.google/pubs/pub36928/
 7 | * BagMinHash https://arxiv.org/abs/1802.03914
 8 | 
 9 | For BagMinHash and ICWS we use the implementation from here https://github.com/oertl/bagminhash with the relevant files included in the /bagminhash folder.
10 | 
11 | See the DartMinHash paper https://arxiv.org/abs/2005.11547 for a description of the algorithm and further results of experiments.
12 | 
13 | ## Requirements
14 | The BagMinHash algorithm uses XXHash64 which must be installed:
15 | 
16 | 1. Get xxhash from https://github.com/Cyan4973/xxHash, e.g. using `git clone https://github.com/Cyan4973/xxHash.git`
17 | 2. Build using `make lib`
18 | 3. Place xxhash.h and libxxhash.a into the directory /bagminhash/xxhash
19 | 
20 | The code compiles under GCC version 7.5.0 https://gcc.gnu.org/ with relevant commands in the makefile https://www.gnu.org/software/make/.
21 | 
22 | ## Commands
23 | 
24 | `make run` compiles and executes the main function in main.cpp.
25 | 
26 | `make test` compiles and run unit tests.
27 | 
28 | ## Experiments
29 | The different experiments are all placed in the main.cpp and write their output to stdout in CSV format. 
30 | 
31 | 1. `time_performance`: Times different algorithms on synthetic data for all combinations of sketch lengths, and L0 and L1 norms chosen. 
32 | 2. `time_performance_specific`: Same as above, but only runs on specified tuples of parameters.
33 | 3. `measure_similarity`: Returns the estimated Jaccard similarity of different algorithms on synthetic pairs of weighted sets with a specific similarity.
34 | 
35 | By default `make run` will run `time_performance_specific` on a subset of the settings used in Table 1 in the paper.
36 | 
37 | In order to pipe the output to the file `data.csv` use command `make run > data.csv`.
38 | 
39 | ## Example output
40 | 
41 | Notation:
42 | 
43 | * t denotes the sketch length (usually k in the paper).
44 | * ICWS is a simple and unoptimized version of ICWS using tabulation hashing.
45 | * ICWS_xxhash is the implementation from the BagMinHash repository which uses the ziggurat algorithm for fast sampling: https://en.wikipedia.org/wiki/Ziggurat_algorithm
46 | * FastICWS is our own highly optimized implementation of ICWS that tabulates expensive operations and only computes the logarithms of weights once.
47 | * BagMinHash1 and BagMinHash2: BagMinHash variants described in the BagMinHash paper. BagMinHash2 is essentially always faster and is what we compare against.
48 | * DartMinHash: Optimized implementation following the pseudocode in the paper.
49 | 
50 | ### Performance timings
51 | 
52 | | id | L0   | log2_L1 | t    | ICWS    | FastICWS | ICWS_xxhash | BagMinHash1 | BagMinHash2 | DartMinHash |
53 | |----|------|---------|------|---------|----------|-------------|-------------|-------------|-------------|
54 | | 0  | 64   | 0.000   | 64   | 0.899   | 0.060    | 0.538       | 2.439       | 0.628       | 0.042       |
55 | | 1  | 1024 | 0.000   | 64   | 11.565  | 0.515    | 9.604       | 4.374       | 1.706       | 0.145       |
56 | | 2  | 64   | 0.000   | 1024 | 19.296  | 2.885    | 8.083       | 48.248      | 13.279      | 0.592       |
57 | | 3  | 1024 | 0.000   | 1024 | 187.661 | 12.643   | 120.135     | 79.775      | 16.586      | 0.824       |
58 | | 4  | 256  | 0.000   | 1    | 0.040   | 0.008    | 0.040       | 0.112       | 0.103       | 0.021       |
59 | | 5  | 256  | 0.000   | 256  | 14.645  | 0.939    | 7.716       | 13.687      | 3.270       | 0.187       |
60 | | 6  | 1024 | 0.000   | 256  | 45.239  | 2.703    | 30.127      | 18.175      | 4.296       | 0.274       |
61 | | 7  | 1024 | 64.000  | 256  | 46.717  | 2.720    | 30.122      | 18.241      | 4.250       | 2.632       |
62 | | 8  | 1024 | -64.000 | 256  | 47.677  | 2.719    | 30.117      | 18.096      | 4.192       | 2.333       |
63 | 
64 | ### Jaccard similarity estimates
65 | 
66 | | sim_j | t  | ICWS_xxhash | FastICWS | BagMinHash2 | DartMinHash |
67 | |-------|----|-------------|----------|-------------|-------------|
68 | | 0.500 | 1  | 1.000       | 1.000    | 0.000       | 1.000       |
69 | | 0.500 | 2  | 0.500       | 0.500    | 0.000       | 0.500       |
70 | | 0.500 | 3  | 0.333       | 0.333    | 0.000       | 0.333       |
71 | | 0.500 | 4  | 0.500       | 0.250    | 0.750       | 0.750       |
72 | | 0.500 | 5  | 0.000       | 0.400    | 0.600       | 0.200       |
73 | | 0.500 | 6  | 0.667       | 0.500    | 0.500       | 0.000       |
74 | | 0.500 | 7  | 0.571       | 0.714    | 0.429       | 0.429       |
75 | | 0.500 | 8  | 0.250       | 0.375    | 0.625       | 0.500       |
76 | | 0.500 | 9  | 0.889       | 0.222    | 0.556       | 0.444       |
77 | | 0.500 | 10 | 0.600       | 0.400    | 0.700       | 0.400       |
78 | 
79 | ## Tests
80 | We use Catch2 https://github.com/catchorg/Catch2 for unit testing.
81 | 
82 | To compile and run tests use the command: `make test` 


--------------------------------------------------------------------------------
/bagminhash/bitstream_random.hpp:
--------------------------------------------------------------------------------
  1 | //##################################
  2 | //# Copyright (C) 2018 Otmar Ertl. #
  3 | //# All rights reserved.           #
  4 | //##################################
  5 | 
  6 | #ifndef _BIT_STREAM_RANDOM_HPP_
  7 | #define _BIT_STREAM_RANDOM_HPP_
  8 | 
  9 | #include "exponential_distribution.hpp"
 10 | 
 11 | #include "xxhash/xxhash.h"
 12 | 
 13 | #include <cmath>
 14 | #include <cassert>
 15 | #include <cstring>
 16 | #include <memory>
 17 | 
 18 | static constexpr double maxReciprocal = 1. / (UINT64_C(1) << 52);
 19 | 
 20 | // uniform distributed double value from (0, 1]
 21 | template<typename T> double getUniformDouble(T& bitstream) {
 22 |     return (bitstream(52) + 1) * maxReciprocal;
 23 | }
 24 | 
 25 | template<typename T> double getExponential1(T& bitstream) {
 26 |     return ziggurat::getExponential(bitstream);
 27 | }
 28 | 
 29 | template<typename T> double getGamma21(T& bitstream) {
 30 |     return getExponential1(bitstream) + getExponential1(bitstream);
 31 | }
 32 | 
 33 | template<typename T> double getBeta21(T& bitstream) {
 34 |     return std::sqrt(getUniformDouble(bitstream));
 35 | }
 36 | 
 37 | template<typename T> bool getBernoulli(double successProbability, T& bitStream) {
 38 |     while(true) {
 39 |         if (successProbability == 0) return false;
 40 |         if (successProbability == 1) return true;
 41 |         bool b = successProbability > 0.5;
 42 |         if (bitStream()) return b;
 43 |         successProbability += successProbability;
 44 |         if (b) successProbability -= 1;
 45 |     }
 46 | }
 47 | 
 48 | // see Lumbroso, Jeremie. "Optimal discrete uniform generation from coin flips, and applications." arXiv preprint arXiv:1304.1916 (2013).
 49 | template<typename T> uint64_t getUniform(uint64_t n, T& bitstream) {
 50 |     assert(n > 0);
 51 |     uint64_t v = 1;
 52 |     uint64_t c = 0;
 53 |     while(true) {
 54 |         v <<= 1;
 55 |         c <<= 1;
 56 |         c += bitstream();
 57 |         if (v >= n) {
 58 |             if (c < n) {
 59 |                 return c;
 60 |             }
 61 |             else {
 62 |                 v -= n;
 63 |                 c -= n;
 64 |             }
 65 |         }
 66 |     }
 67 | }
 68 | 
 69 | template<typename T> uint64_t getUniformPow2(uint8_t numBits, T& bitstream) {
 70 |     return bitstream(numBits);
 71 | }
 72 | 
 73 | class XXHash64 {
 74 | public:
 75 |     static uint64_t calculateHash(const char* data, size_t length, uint64_t seed) {
 76 |         return XXH64(data, length, seed);
 77 |     }
 78 | };
 79 | 
 80 | struct BitMasks {
 81 |     constexpr BitMasks() : masks() {
 82 |         masks[0] = 0;
 83 |         for (uint8_t i = 1; i <= 63; ++i) masks[i] = (UINT64_C(1) << i) - UINT64_C(1);
 84 |         masks[64] = UINT64_C(0xFFFFFFFFFFFFFFFF);
 85 |     }
 86 | 
 87 |     uint64_t masks[65];
 88 | };
 89 | 
 90 | static constexpr BitMasks BIT_MASKS;
 91 | 
 92 | template<typename R>
 93 | class BitStream {
 94 | 
 95 |     static const uint32_t FNV_OFFSET;
 96 |     static const uint32_t FNV_PRIME;
 97 | 
 98 |     size_t dataSize;
 99 |     std::unique_ptr<char[]> data;
100 |     uint64_t seed;
101 |     uint64_t hashBits;
102 |     uint8_t availableBits;
103 | 
104 |     void nextHash() {
105 |         uint32_t tmp;
106 |         memcpy(&tmp, data.get(), sizeof(uint32_t));
107 |         tmp *= FNV_PRIME;
108 |         memcpy(data.get(), &tmp, sizeof(uint32_t));
109 |         hashBits = R::calculateHash(data.get(), dataSize, seed);
110 |     }
111 | public:
112 | 
113 |     BitStream(const BitStream& p) = delete;
114 |     BitStream& operator=(const BitStream&) = delete;
115 |     BitStream(BitStream&& p) = default;
116 |     BitStream& operator=(BitStream&&) = default;
117 | 
118 |     template<typename I>
119 |     BitStream(const I& valueProvider, uint64_t _seed) : dataSize(valueProvider.size() +  sizeof(uint32_t)), data(new char[dataSize]), seed(_seed), hashBits(0), availableBits(0) {
120 |         memcpy(data.get(), &FNV_OFFSET, sizeof(uint32_t));
121 |         valueProvider.init(&data[sizeof(uint32_t)]);
122 |     }
123 | 
124 |     bool operator()() {
125 |         if (availableBits == 0) {
126 |             nextHash();
127 |         }
128 |         bool result = (hashBits & UINT64_C(1));
129 |         hashBits >>= 1;
130 |         availableBits -= 1;
131 |         availableBits &= UINT8_C(0x3F);
132 |         return result;
133 |     }
134 | 
135 |     uint64_t operator()(uint8_t numBits) {
136 |         assert(numBits <= 64);
137 |         uint64_t result = 0;
138 |         uint8_t requiredBits = numBits;
139 |         if(numBits > availableBits) {
140 |             result = (hashBits & BIT_MASKS.masks[availableBits]);
141 |             result <<= (numBits - availableBits);
142 |             nextHash();
143 |             requiredBits -= availableBits;
144 |         }
145 |         result |= (hashBits & BIT_MASKS.masks[requiredBits]);
146 |         hashBits >>= requiredBits;
147 |         availableBits -= numBits;
148 |         availableBits &= UINT8_C(0x3F);
149 |         return result;
150 |     }
151 | };
152 | 
153 | // see https://en.wikipedia.org/wiki/Fowler-Noll-Vo_hash_function
154 | template<typename R> const uint32_t BitStream<R>:: FNV_OFFSET = 0x811c9dc5;
155 | template<typename R> const uint32_t BitStream<R>:: FNV_PRIME = (1 << 24) + (1 << 8) + 0x93;
156 | 
157 | #endif // _BIT_STREAM_RANDOM_HPP_
158 | 


--------------------------------------------------------------------------------
/bagminhash/exponential_distribution.hpp:
--------------------------------------------------------------------------------
  1 | /* Boost Software License - Version 1.0 - August 17th, 2003
  2 |  *
  3 |  * Permission is hereby granted, free of charge, to any person or organization
  4 |  * obtaining a copy of the software and accompanying documentation covered by
  5 |  * this license (the "Software") to use, reproduce, display, distribute,
  6 |  * execute, and transmit the Software, and to prepare derivative works of the
  7 |  * Software, and to permit third-parties to whom the Software is furnished to
  8 |  * do so, all subject to the following:
  9 |  *
 10 |  * The copyright notices in the Software and this entire statement, including
 11 |  * the above license grant, this restriction and the following disclaimer,
 12 |  * must be included in all copies of the Software, in whole or in part, and
 13 |  * all derivative works of the Software, unless such copies or derivative
 14 |  * works are solely in the form of machine-executable object code generated by
 15 |  * a source language processor.
 16 |  *
 17 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 |  * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 20 |  * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 21 |  * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 22 |  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 23 |  * DEALINGS IN THE SOFTWARE.
 24 |  */
 25 | 
 26 | /* boost random/exponential_distribution.hpp header file
 27 |  *
 28 |  * Copyright Jens Maurer 2000-2001
 29 |  * Copyright Steven Watanabe 2011
 30 |  * Copyright Jason Rhinelander 2016
 31 |  * Distributed under the Boost Software License, Version 1.0. (See
 32 |  * accompanying file LICENSE_1_0.txt or copy at
 33 |  * http://www.boost.org/LICENSE_1_0.txt)
 34 |  *
 35 |  * See http://www.boost.org for most recent version including documentation.
 36 |  *
 37 |  * $Id$
 38 |  *
 39 |  * Revision history
 40 |  *  2001-02-18  moved to individual header files
 41 |  */
 42 | 
 43 | #ifndef _EXPONENTIAL_DISTRIBUTION_HPP_
 44 | #define _EXPONENTIAL_DISTRIBUTION_HPP_
 45 | 
 46 | #include <cmath>
 47 | 
 48 | namespace ziggurat {
 49 | 
 50 | // tables for the ziggurat algorithm
 51 | struct exponential_table {
 52 |     static const double table_x[257];
 53 |     static const double table_y[257];
 54 | };
 55 | 
 56 | const double exponential_table::table_x[257] = {
 57 |     8.6971174701310497140, 7.6971174701310497140, 6.9410336293772123602, 6.4783784938325698538,
 58 |     6.1441646657724730491, 5.8821443157953997963, 5.6664101674540337371, 5.4828906275260628694,
 59 |     5.3230905057543986131, 5.1814872813015010392, 5.0542884899813047117, 4.9387770859012514838,
 60 |     4.8329397410251125881, 4.7352429966017412526, 4.6444918854200854873, 4.5597370617073515513,
 61 |     4.4802117465284221949, 4.4052876934735729805, 4.3344436803172730116, 4.2672424802773661873,
 62 |     4.2033137137351843802, 4.1423408656640511251, 4.0840513104082974638, 4.0282085446479365106,
 63 |     3.9746060666737884793, 3.9230625001354895926, 3.8734176703995089983, 3.8255294185223367372,
 64 |     3.7792709924116678992, 3.7345288940397975350, 3.6912010902374189454, 3.6491955157608538478,
 65 |     3.6084288131289096339, 3.5688252656483374051, 3.5303158891293438633, 3.4928376547740601814,
 66 |     3.4563328211327607625, 3.4207483572511205323, 3.3860354424603017887, 3.3521490309001100106,
 67 |     3.3190474709707487166, 3.2866921715990692095, 3.2550473085704501813, 3.2240795652862645207,
 68 |     3.1937579032122407483, 3.1640533580259734580, 3.1349388580844407393, 3.1063890623398246660,
 69 |     3.0783802152540905188, 3.0508900166154554479, 3.0238975044556767713, 2.9973829495161306949,
 70 |     2.9713277599210896472, 2.9457143948950456386, 2.9205262865127406647, 2.8957477686001416838,
 71 |     2.8713640120155362592, 2.8473609656351888266, 2.8237253024500354905, 2.8004443702507381944,
 72 |     2.7775061464397572041, 2.7548991965623453650, 2.7326126361947007411, 2.7106360958679293686,
 73 |     2.6889596887418041593, 2.6675739807732670816, 2.6464699631518093905, 2.6256390267977886123,
 74 |     2.6050729387408355373, 2.5847638202141406911, 2.5647041263169053687, 2.5448866271118700928,
 75 |     2.5253043900378279427, 2.5059507635285939648, 2.4868193617402096807, 2.4679040502973649846,
 76 |     2.4491989329782498908, 2.4306983392644199088, 2.4123968126888708336, 2.3942890999214583288,
 77 |     2.3763701405361408194, 2.3586350574093374601, 2.3410791477030346875, 2.3236978743901964559,
 78 |     2.3064868582835798692, 2.2894418705322694265, 2.2725588255531546952, 2.2558337743672190441,
 79 |     2.2392628983129087111, 2.2228425031110364013, 2.2065690132576635755, 2.1904389667232199235,
 80 |     2.1744490099377744673, 2.1585958930438856781, 2.1428764653998416425, 2.1272876713173679737,
 81 |     2.1118265460190418108, 2.0964902118017147637, 2.0812758743932248696, 2.0661808194905755036,
 82 |     2.0512024094685848641, 2.0363380802487695916, 2.0215853383189260770, 2.0069417578945183144,
 83 |     1.9924049782135764992, 1.9779727009573602295, 1.9636426877895480401, 1.9494127580071845659,
 84 |     1.9352807862970511135, 1.9212447005915276767, 1.9073024800183871196, 1.8934521529393077332,
 85 |     1.8796917950722108462, 1.8660195276928275962, 1.8524335159111751661, 1.8389319670188793980,
 86 |     1.8255131289035192212, 1.8121752885263901413, 1.7989167704602903934, 1.7857359354841254047,
 87 |     1.7726311792313049959, 1.7596009308890742369, 1.7466436519460739352, 1.7337578349855711926,
 88 |     1.7209420025219350428, 1.7081947058780575683, 1.6955145241015377061, 1.6829000629175537544,
 89 |     1.6703499537164519163, 1.6578628525741725325, 1.6454374393037234057, 1.6330724165359912048,
 90 |     1.6207665088282577216, 1.6085184617988580769, 1.5963270412864831349, 1.5841910325326886695,
 91 |     1.5721092393862294810, 1.5600804835278879161, 1.5481036037145133070, 1.5361774550410318943,
 92 |     1.5243009082192260050, 1.5124728488721167573, 1.5006921768428164936, 1.4889578055167456003,
 93 |     1.4772686611561334579, 1.4656236822457450411, 1.4540218188487932264, 1.4424620319720121876,
 94 |     1.4309432929388794104, 1.4194645827699828254, 1.4080248915695353509, 1.3966232179170417110,
 95 |     1.3852585682631217189, 1.3739299563284902176, 1.3626364025050864742, 1.3513769332583349176,
 96 |     1.3401505805295045843, 1.3289563811371163220, 1.3177933761763245480, 1.3066606104151739482,
 97 |     1.2955571316866007210, 1.2844819902750125450, 1.2734342382962410994, 1.2624129290696153434,
 98 |     1.2514171164808525098, 1.2404458543344064544, 1.2294981956938491599, 1.2185731922087903071,
 99 |     1.2076698934267612830, 1.1967873460884031665, 1.1859245934042023557, 1.1750806743109117687,
100 |     1.1642546227056790397, 1.1534454666557748056, 1.1426522275816728928, 1.1318739194110786733,
101 |     1.1211095477013306083, 1.1103581087274114281, 1.0996185885325976575, 1.0888899619385472598,
102 |     1.0781711915113727024, 1.0674612264799681530, 1.0567590016025518414, 1.0460634359770445503,
103 |     1.0353734317905289496, 1.0246878730026178052, 1.0140056239570971074, 1.0033255279156973717,
104 |     0.99264640550727647009, 0.98196705308506317914, 0.97128624098390397896, 0.96060271166866709917,
105 |     0.94991517776407659940, 0.93922231995526297952, 0.92852278474721113999, 0.91781518207004493915,
106 |     0.90709808271569100600, 0.89637001558989069006, 0.88562946476175228052, 0.87487486629102585352,
107 |     0.86410460481100519511, 0.85331700984237406386, 0.84251035181036928333, 0.83168283773427388393,
108 |     0.82083260655441252290, 0.80995772405741906620, 0.79905617735548788109, 0.78812586886949324977,
109 |     0.77716460975913043936, 0.76617011273543541328, 0.75513998418198289808, 0.74407171550050873971,
110 |     0.73296267358436604916, 0.72181009030875689912, 0.71061105090965570413, 0.69936248110323266174,
111 |     0.68806113277374858613, 0.67670356802952337911, 0.66528614139267855405, 0.65380497984766565353,
112 |     0.64225596042453703448, 0.63063468493349100113, 0.61893645139487678178, 0.60715622162030085137,
113 |     0.59528858429150359384, 0.58332771274877027785, 0.57126731653258903915, 0.55910058551154127652,
114 |     0.54682012516331112550, 0.53441788123716615385, 0.52188505159213564105, 0.50921198244365495319,
115 |     0.49638804551867159754, 0.48340149165346224782, 0.47023927508216945338, 0.45688684093142071279,
116 |     0.44332786607355296305, 0.42954394022541129589, 0.41551416960035700100, 0.40121467889627836229,
117 |     0.38661797794112021568, 0.37169214532991786118, 0.35639976025839443721, 0.34069648106484979674,
118 |     0.32452911701691008547, 0.30783295467493287307, 0.29052795549123115167, 0.27251318547846547924,
119 |     0.25365836338591284433, 0.23379048305967553619, 0.21267151063096745264, 0.18995868962243277774,
120 |     0.16512762256418831796, 0.13730498094001380420, 0.10483850756582017915, 0.063852163815003480173,
121 |     0
122 | };
123 | 
124 | const double exponential_table::table_y[257] = {
125 |     0, 0.00045413435384149675545, 0.00096726928232717452884, 0.0015362997803015723824,
126 |     0.0021459677437189061793, 0.0027887987935740759640, 0.0034602647778369039855, 0.0041572951208337952532,
127 |     0.0048776559835423925804, 0.0056196422072054831710, 0.0063819059373191794422, 0.0071633531836349841425,
128 |     0.0079630774380170392396, 0.0087803149858089752347, 0.0096144136425022094101, 0.010464810181029979488,
129 |     0.011331013597834597488, 0.012212592426255380661, 0.013109164931254991070, 0.014020391403181937334,
130 |     0.014945968011691148079, 0.015885621839973162490, 0.016839106826039946359, 0.017806200410911360563,
131 |     0.018786700744696029497, 0.019780424338009741737, 0.020787204072578117603, 0.021806887504283582125,
132 |     0.022839335406385238829, 0.023884420511558170348, 0.024942026419731782971, 0.026012046645134218076,
133 |     0.027094383780955798424, 0.028188948763978634421, 0.029295660224637394015, 0.030414443910466605492,
134 |     0.031545232172893605499, 0.032687963508959533317, 0.033842582150874329031, 0.035009037697397411067,
135 |     0.036187284781931419754, 0.037377282772959360128, 0.038578995503074859626, 0.039792391023374122670,
136 |     0.041017441380414820816, 0.042254122413316231413, 0.043502413568888183301, 0.044762297732943280694,
137 |     0.046033761076175166762, 0.047316792913181548703, 0.048611385573379494401, 0.049917534282706374944,
138 |     0.051235237055126279830, 0.052564494593071689595, 0.053905310196046085104, 0.055257689676697038322,
139 |     0.056621641283742874438, 0.057997175631200659098, 0.059384305633420264487, 0.060783046445479636051,
140 |     0.062193415408540996150, 0.063615431999807331076, 0.065049117786753755036, 0.066494496385339779043,
141 |     0.067951593421936607770, 0.069420436498728751675, 0.070901055162371828426, 0.072393480875708743023,
142 |     0.073897746992364746308, 0.075413888734058408453, 0.076941943170480510100, 0.078481949201606426042,
143 |     0.080033947542319910023, 0.081597980709237420930, 0.083174093009632380354, 0.084762330532368125386,
144 |     0.086362741140756912277, 0.087975374467270219300, 0.089600281910032864534, 0.091237516631040162057,
145 |     0.092887133556043546523, 0.094549189376055853718, 0.096223742550432800103, 0.097910853311492199618,
146 |     0.099610583670637128826, 0.10132299742595363588, 0.10304816017125771553, 0.10478613930657016928,
147 |     0.10653700405000166218, 0.10830082545103379867, 0.11007767640518539026, 0.11186763167005629731,
148 |     0.11367076788274431301, 0.11548716357863353664, 0.11731689921155557057, 0.11916005717532768467,
149 |     0.12101672182667483729, 0.12288697950954513498, 0.12477091858083096578, 0.12666862943751066518,
150 |     0.12858020454522817870, 0.13050573846833078225, 0.13244532790138752023, 0.13439907170221363078,
151 |     0.13636707092642885841, 0.13834942886358021406, 0.14034625107486244210, 0.14235764543247220043,
152 |     0.14438372216063476473, 0.14642459387834493787, 0.14848037564386679222, 0.15055118500103990354,
153 |     0.15263714202744286154, 0.15473836938446807312, 0.15685499236936522013, 0.15898713896931420572,
154 |     0.16113493991759203183, 0.16329852875190180795, 0.16547804187493600915, 0.16767361861725019322,
155 |     0.16988540130252766513, 0.17211353531532005700, 0.17435816917135348788, 0.17661945459049489581,
156 |     0.17889754657247831241, 0.18119260347549629488, 0.18350478709776746150, 0.18583426276219711495,
157 |     0.18818119940425430485, 0.19054576966319540013, 0.19292814997677133873, 0.19532852067956322315,
158 |     0.19774706610509886464, 0.20018397469191127727, 0.20263943909370901930, 0.20511365629383770880,
159 |     0.20760682772422204205, 0.21011915938898825914, 0.21265086199297827522, 0.21520215107537867786,
160 |     0.21777324714870053264, 0.22036437584335949720, 0.22297576805812018050, 0.22560766011668406495,
161 |     0.22826029393071670664, 0.23093391716962742173, 0.23362878343743333945, 0.23634515245705964715,
162 |     0.23908329026244917002, 0.24184346939887722761, 0.24462596913189210901, 0.24743107566532763894,
163 |     0.25025908236886230967, 0.25311029001562948171, 0.25598500703041538015, 0.25888354974901621678,
164 |     0.26180624268936295243, 0.26475341883506220209, 0.26772541993204481808, 0.27072259679906003167,
165 |     0.27374530965280298302, 0.27679392844851734458, 0.27986883323697289920, 0.28297041453878076010,
166 |     0.28609907373707684673, 0.28925522348967773308, 0.29243928816189258772, 0.29565170428126120948,
167 |     0.29889292101558177099, 0.30216340067569352897, 0.30546361924459023541, 0.30879406693456016794,
168 |     0.31215524877417956945, 0.31554768522712893632, 0.31897191284495723773, 0.32242848495608914289,
169 |     0.32591797239355619822, 0.32944096426413633091, 0.33299806876180896713, 0.33658991402867758144,
170 |     0.34021714906678004560, 0.34388044470450243010, 0.34758049462163698567, 0.35131801643748334681,
171 |     0.35509375286678745925, 0.35890847294874976196, 0.36276297335481777335, 0.36665807978151414890,
172 |     0.37059464843514599421, 0.37457356761590215193, 0.37859575940958081092, 0.38266218149600982112,
173 |     0.38677382908413768115, 0.39093173698479710717, 0.39513698183329015336, 0.39939068447523107877,
174 |     0.40369401253053026739, 0.40804818315203238238, 0.41245446599716116772, 0.41691418643300289465,
175 |     0.42142872899761659635, 0.42599954114303435739, 0.43062813728845883923, 0.43531610321563659758,
176 |     0.44006510084235387501, 0.44487687341454851593, 0.44975325116275498919, 0.45469615747461548049,
177 |     0.45970761564213768669, 0.46478975625042618067, 0.46994482528395999841, 0.47517519303737738299,
178 |     0.48048336393045423016, 0.48587198734188493564, 0.49134386959403255500, 0.49690198724154955294,
179 |     0.50254950184134769289, 0.50828977641064283495, 0.51412639381474855788, 0.52006317736823356823,
180 |     0.52610421398361972602, 0.53225388026304326945, 0.53851687200286186590, 0.54489823767243963663,
181 |     0.55140341654064131685, 0.55803828226258748140, 0.56480919291240022434, 0.57172304866482579008,
182 |     0.57878735860284503057, 0.58601031847726802755, 0.59340090169173341521, 0.60096896636523224742,
183 |     0.60872538207962206507, 0.61668218091520762326, 0.62485273870366592605, 0.63325199421436607968,
184 |     0.64189671642726607018, 0.65080583341457104881, 0.66000084107899974178, 0.66950631673192477684,
185 |     0.67935057226476538741, 0.68956649611707798890, 0.70019265508278816709, 0.71127476080507597882,
186 |     0.72286765959357200702, 0.73503809243142351530, 0.74786862198519510742, 0.76146338884989624862,
187 |     0.77595685204011559675, 0.79152763697249565519, 0.80842165152300838005, 0.82699329664305033399,
188 |     0.84778550062398962096, 0.87170433238120363669, 0.90046992992574643800, 0.93814368086217467916,
189 |     1
190 | };
191 | 
192 | static double f(double x) {
193 |     using std::exp;
194 |     return exp(-x);
195 | }
196 | 
197 | template<typename T> double getExponential(T& bitstream) {
198 |     const double * const table_x = exponential_table::table_x;
199 |     const double * const table_y = exponential_table::table_y;
200 |     double shift(0);
201 |     for(;;) {
202 |         double valsFirst = getUniformDouble(bitstream);
203 |         int valsSecond = getUniformPow2(8, bitstream);
204 |         int i = valsSecond;
205 |         double x = valsFirst * double(table_x[i]);
206 |         if(x < double(table_x[i + 1])) return shift + x;
207 |         // For i=0 we need to generate from the tail, but because this is an exponential
208 |         // distribution, the tail looks exactly like the body, so we can simply repeat with a
209 |         // shift:
210 |         if (i == 0) shift += double(table_x[1]);
211 |         else {
212 |             double y01 = getUniformDouble(bitstream);
213 |             double y = double(table_y[i]) + y01 * double(table_y[i+1] - table_y[i]);
214 | 
215 |             // All we care about is whether these are < or > 0; these values are equal to
216 |             // (lbound) or proportional to (ubound) `y` minus the lower/upper bound.
217 |             double y_above_ubound = double(table_x[i] - table_x[i+1]) * y01 - (double(table_x[i]) - x),
218 |                      y_above_lbound = y - (double(table_y[i+1]) + (double(table_x[i+1]) - x) * double(table_y[i+1]));
219 | 
220 |             if (y_above_ubound < 0 // if above the upper bound reject immediately
221 |                     &&
222 |                     (
223 |                      y_above_lbound < 0 // If below the lower bound accept immediately
224 |                      ||
225 |                      y < f(x) // Otherwise it's between the bounds and we need a full check
226 |                     )
227 |                ) {
228 |                 return x + shift;
229 |             }
230 |         }
231 |     }
232 | }
233 | 
234 | } // namespace ziggurat
235 | 
236 | #endif // _EXPONENTIAL_DISTRIBUTION_HPP_
237 | 


--------------------------------------------------------------------------------
/bagminhash/weighted_minwise_hashing.hpp:
--------------------------------------------------------------------------------
  1 | //##################################
  2 | //# Copyright (C) 2018 Otmar Ertl. #
  3 | //# All rights reserved.           #
  4 | //##################################
  5 | 
  6 | #ifndef _WEIGHTED_MINWISE_HASHING_HPP_
  7 | #define _WEIGHTED_MINWISE_HASHING_HPP_
  8 | 
  9 | #include "bitstream_random.hpp"
 10 | 
 11 | #include <vector>
 12 | #include <algorithm>
 13 | #include <functional>
 14 | 
 15 | template <typename T>
 16 | class MaxValueTracker {
 17 |     const uint32_t m;
 18 |     std::vector<T> values;
 19 | 
 20 | public:
 21 |     MaxValueTracker(uint32_t _m, const T& infinity) : m(_m), values((_m << 1) - 1, infinity) {}
 22 | 
 23 |     void update(uint32_t idx, T value) {
 24 |         assert(idx < m);
 25 |         while(value < values[idx]) {
 26 |             values[idx] = value;
 27 |             idx = m + (idx >> 1);
 28 |             if (idx >= values.size()) break;
 29 |             uint32_t leftChildIdx = (idx - m) << 1;
 30 |             uint32_t rightChildIdx = leftChildIdx + 1;
 31 |             value = std::max(values[leftChildIdx], values[rightChildIdx]);
 32 |         }
 33 |     }
 34 | 
 35 |     const T& max() const {
 36 |         return values.back();
 37 |     }
 38 | 
 39 |     const T& operator[](uint32_t idx) const {
 40 |         return values[idx];
 41 |     }
 42 | };
 43 | 
 44 | class BinaryWeightDiscretization {
 45 | public:
 46 |     typedef uint8_t index_type;
 47 |     typedef uint8_t weight_type;
 48 | 
 49 |     static const index_type maxBoundIdx = 1;
 50 | 
 51 |     static weight_type getBound(index_type boundIdx) {
 52 |         return boundIdx;
 53 |     }
 54 | };
 55 | 
 56 | template<typename I, typename W>
 57 | I calculateMaxBoundIdx() {
 58 |     static_assert(sizeof(I) == sizeof(W), "index type and weight type do not have same size");
 59 |     I i = 0;
 60 |     const W w = std::numeric_limits<W>::max();
 61 |     memcpy(&i, &w, sizeof(I));
 62 |     return i;
 63 | }
 64 | 
 65 | template<typename W, typename I>
 66 | class WeightDiscretization {
 67 | public:
 68 |     typedef I index_type;
 69 |     typedef W weight_type;
 70 | 
 71 |     static const index_type maxBoundIdx;
 72 | 
 73 |     static weight_type getBound(index_type boundIdx) {
 74 |         W f;
 75 |         static_assert(std::numeric_limits<W>::is_iec559, "weight_type is not iec559");
 76 |         static_assert(sizeof(weight_type) == sizeof(index_type), "weight_type and index_type do not have same size");
 77 |         memcpy(&f, &boundIdx, sizeof(index_type));
 78 |         return f;
 79 |     }
 80 | };
 81 | 
 82 | template<typename W, typename I>
 83 | const typename WeightDiscretization<W, I>::index_type WeightDiscretization<W, I>::maxBoundIdx = calculateMaxBoundIdx<WeightDiscretization<W, I>::index_type, WeightDiscretization<W, I>::weight_type>();
 84 | 
 85 | typedef WeightDiscretization<float, uint32_t> FloatWeightDiscretization;
 86 | 
 87 | typedef WeightDiscretization<double, uint64_t> DoubleWeightDiscretization;
 88 | 
 89 | struct WeightedHashResult {
 90 |     std::vector<uint64_t> hashValues;
 91 |     uint64_t maxSpace;
 92 | 
 93 |     WeightedHashResult(uint32_t m) : hashValues(m), maxSpace(UINT64_C(0)) {}
 94 | };
 95 | 
 96 | template<typename... V> class ValueProvider;
 97 | 
 98 | template<>
 99 | class ValueProvider<> {
100 | public:
101 |     static size_t size() {
102 |         return 0;
103 |     }
104 | 
105 |     void init(char* data) const {}
106 | };
107 | 
108 | template<typename W, typename... V> class ValueProvider<W, V...> : ValueProvider<V...> {
109 |     const W w;
110 | public:
111 | 
112 |     ValueProvider(const W& _w, const V& ... _v) : ValueProvider<V...>(_v...), w(_w) {}
113 | 
114 |     static size_t size() {
115 |         return ValueProvider<V...>::size() + sizeof(W);
116 |     }
117 | 
118 |     void init(char* data) const {
119 |         ValueProvider<V...>::init(data);
120 |         memcpy(&data[ValueProvider<V...>::size()], &w, sizeof(W));
121 |     }
122 | };
123 | 
124 | template<typename... V>
125 | ValueProvider<V...> collectHashData(const V&... v) {
126 |     return ValueProvider<V...>(v...);
127 | }
128 | 
129 | template <typename D, typename H>
130 | class PoissonProcess {
131 | 
132 |     double point;
133 |     double weight;
134 |     typename D::index_type weightIdxMin;
135 |     typename D::index_type weightIdxMax;
136 |     typename D::weight_type boundMin;
137 |     typename D::weight_type boundMax;
138 |     uint32_t signatureIdx;
139 |     BitStream<H> randomBitStream;
140 | 
141 | public:
142 | 
143 |     PoissonProcess(
144 |         double _point,
145 |         double _weight,
146 |         typename D::index_type _weightIdxMin,
147 |         typename D::index_type _weightIdxMax,
148 |         typename D::weight_type _boundMin,
149 |         typename D::weight_type _boundMax,
150 |         BitStream<H>&& _randomBitStream
151 |         )
152 |       : point(_point),
153 |         weight(_weight),
154 |         weightIdxMin(_weightIdxMin),
155 |         weightIdxMax(_weightIdxMax),
156 |         boundMin(_boundMin),
157 |         boundMax(_boundMax),
158 |         signatureIdx(std::numeric_limits<uint32_t>::max()),
159 |         randomBitStream(std::move(_randomBitStream)) {}
160 | 
161 | 
162 |     PoissonProcess(BitStream<H>&& _randomBitStream, double _weight) :
163 |         PoissonProcess(0., _weight, 0, D::maxBoundIdx, 0, D::getBound(D::maxBoundIdx), std::move(_randomBitStream)) {}
164 | 
165 |     bool splittable() const {
166 |         return weightIdxMax > weightIdxMin + 1;
167 |     }
168 | 
169 |     bool partiallyRelevant() const {
170 |         return D::getBound(weightIdxMin + 1) <= weight;
171 |     }
172 | 
173 |     bool fullyRelevant() const {
174 |         return boundMax <= weight;
175 |     }
176 | 
177 |     uint32_t getIndex() const {
178 |         return signatureIdx;
179 |     }
180 | 
181 |     double getPoint() const {
182 |         return point;
183 |     }
184 | 
185 |     void next(uint32_t m) {
186 |         point += getExponential1(randomBitStream) / (static_cast<double>(boundMax) - static_cast<double>(boundMin));
187 |         signatureIdx = getUniform(m, randomBitStream);
188 |     }
189 | 
190 |     std::unique_ptr<PoissonProcess> split() {
191 | 
192 |         typename D::index_type weightIdxMid = (weightIdxMin + weightIdxMax) >> 1;
193 | 
194 |         double boundMid = D::getBound(weightIdxMid);
195 | 
196 |         bool inheritToLeft = getBernoulli((boundMid - static_cast<double>(boundMin)) / (static_cast<double>(boundMax) - static_cast<double>(boundMin)), randomBitStream);
197 | 
198 |         std::unique_ptr<PoissonProcess> pPrime;
199 | 
200 |         BitStream<H> bitStream(collectHashData(weightIdxMid, point), UINT64_C(0x4b06d55ba29b0826)); // constant from random.org
201 | 
202 |         if (inheritToLeft) {
203 |             pPrime = std::make_unique<PoissonProcess>(point, weight, weightIdxMid, weightIdxMax, boundMid, boundMax, std::move(bitStream));
204 |             weightIdxMax = weightIdxMid;
205 |             boundMax = boundMid;
206 |         }
207 |         else {
208 |             pPrime = std::make_unique<PoissonProcess>(point, weight, weightIdxMin, weightIdxMid, boundMin, boundMid, std::move(bitStream));
209 |             weightIdxMin = weightIdxMid;
210 |             boundMin = boundMid;
211 |         }
212 |         return pPrime;
213 |     }
214 | };
215 | 
216 | template<typename D, typename H>
217 | struct CmpPoissonProcessPtrs
218 | {
219 |     bool operator()(const std::unique_ptr<PoissonProcess<D,H>>& lhs, const std::unique_ptr<PoissonProcess<D,H>>& rhs) const
220 |     {
221 |         return rhs->getPoint() < lhs->getPoint();
222 |     }
223 | };
224 | 
225 | template<typename D, typename H>
226 | struct CmpPoissonProcessPtrsInverse
227 | {
228 |     bool operator()(const std::unique_ptr<PoissonProcess<D,H>>& lhs, const std::unique_ptr<PoissonProcess<D,H>>& rhs) const
229 |     {
230 |         return rhs->getPoint() > lhs->getPoint();
231 |     }
232 | };
233 | 
234 | template<typename D, typename H>
235 | void pushHeap(std::unique_ptr<PoissonProcess<D, H>>& p, std::vector<std::unique_ptr<PoissonProcess<D,H>>>& heap, uint64_t& maxHeapSize, size_t offset = 0) {
236 |     heap.emplace_back(std::move(p));
237 |     std::push_heap(heap.begin() + offset, heap.end(), CmpPoissonProcessPtrs<D,H>());
238 |     if (heap.size() > maxHeapSize) maxHeapSize = heap.size();
239 | }
240 | 
241 | template<typename D, typename H>
242 | std::unique_ptr<PoissonProcess<D, H>> popHeap(std::vector<std::unique_ptr<PoissonProcess<D,H>>>& heap, size_t offset = 0) {
243 |     std::pop_heap(heap.begin() + offset, heap.end(), CmpPoissonProcessPtrs<D,H>());
244 |     std::unique_ptr<PoissonProcess<D,H>> p = std::move(heap.back());
245 |     heap.pop_back();
246 |     return p;
247 | }
248 | 
249 | static const uint64_t bagMinHashSeedA = UINT64_C(0xf331e07615a87fd7); // constant from random.org
250 | static const uint64_t bagMinHashSeedB = UINT64_C(0xe224afad0d89c684); // constant from random.org
251 | 
252 | template<typename D, typename H>
253 | WeightedHashResult bag_min_hash_1(const std::vector<std::tuple<uint64_t,double>>& data, const uint32_t m) {
254 |     assert(D::getBound(0) == 0);
255 | 
256 |     const uint8_t b = 64; // constant for b-bit minwise hashing
257 | 
258 |     std::vector<std::unique_ptr<PoissonProcess<D, H>>> heap;
259 | 
260 |     MaxValueTracker<double> h(m, std::numeric_limits<double>::infinity());
261 |     WeightedHashResult result(m);
262 | 
263 |     for(const auto& item : data) {
264 | 
265 |         const double w = std::get<1>(item);
266 |         if (w < D::getBound(1)) continue;
267 | 
268 |         const uint64_t d = std::get<0>(item);
269 | 
270 |         BitStream<H> bitStream(collectHashData(d), bagMinHashSeedA);
271 |         std::unique_ptr<PoissonProcess<D,H>> p = std::make_unique<PoissonProcess<D,H>>(std::move(bitStream), w);
272 | 
273 |         p->next(m);
274 |         if (p->fullyRelevant()) h.update(p->getIndex(), p->getPoint());
275 | 
276 |         while(p->getPoint() <= h.max()) {
277 |             while(p->splittable() && p->partiallyRelevant()) {
278 | 
279 |                 std::unique_ptr<PoissonProcess<D,H>> pPrime = p->split();
280 | 
281 |                 if (p->fullyRelevant()) h.update(p->getIndex(), p->getPoint());
282 | 
283 |                 if (pPrime->partiallyRelevant()) {
284 |                     pPrime->next(m);
285 |                     if (pPrime->fullyRelevant()) h.update(pPrime->getIndex(), pPrime->getPoint());
286 |                     if (pPrime->getPoint() <= h.max()) pushHeap(pPrime, heap, result.maxSpace);
287 |                 }
288 |             }
289 | 
290 |             if (p->fullyRelevant()) {
291 |                 p->next(m);
292 |                 h.update(p->getIndex(), p->getPoint());
293 |                 if (p->getPoint() <= h.max()) pushHeap(p, heap, result.maxSpace);
294 |             }
295 |             if (heap.empty()) break;
296 |             p = popHeap(heap);
297 |         }
298 | 
299 |         heap.clear();
300 |     }
301 | 
302 |     for (uint32_t k = 0; k < m; ++k) {
303 |         BitStream<H> bitstream(collectHashData(h[k]), bagMinHashSeedB);
304 |         result.hashValues[k] = getUniformPow2(b, bitstream);
305 |     }
306 | 
307 |     return result;
308 | }
309 | 
310 | template<typename D, typename H>
311 | WeightedHashResult bag_min_hash_2(const std::vector<std::tuple<uint64_t,double>>& data, const uint32_t m) {
312 |     assert(D::getBound(0) == 0);
313 | 
314 |     const uint8_t b = 64; // constant for b-bit minwise hashing
315 | 
316 |     std::vector<std::unique_ptr<PoissonProcess<D,H>>> temp;
317 |     WeightedHashResult result(m);
318 | 
319 |     MaxValueTracker<double> h(m, std::numeric_limits<double>::infinity());
320 | 
321 |     for(const auto& item : data) {
322 | 
323 |         const double w = std::get<1>(item);
324 |         if (w < D::getBound(1)) continue;
325 | 
326 |         const uint64_t d = std::get<0>(item);
327 | 
328 |         const size_t tempHeapOffset = temp.size();
329 | 
330 |         BitStream<H> bitStream(collectHashData(d), bagMinHashSeedA);
331 |         std::unique_ptr<PoissonProcess<D,H>> p = std::make_unique<PoissonProcess<D,H>>(std::move(bitStream), w);
332 | 
333 |         p->next(m);
334 |         if (p->fullyRelevant()) h.update(p->getIndex(), p->getPoint());
335 | 
336 |         while(p->getPoint() <= h.max()) {
337 |             while(p->splittable() && p->partiallyRelevant() && !p->fullyRelevant()) {
338 | 
339 |                 std::unique_ptr<PoissonProcess<D,H>> pPrime = p->split();
340 | 
341 |                 if (p->fullyRelevant()) h.update(p->getIndex(), p->getPoint());
342 | 
343 |                 if (pPrime->partiallyRelevant()) {
344 |                     pPrime->next(m);
345 |                     if (pPrime->fullyRelevant()) h.update(pPrime->getIndex(), pPrime->getPoint());
346 |                     if (pPrime->getPoint() <= h.max()) pushHeap(pPrime, temp, result.maxSpace, tempHeapOffset);
347 |                 }
348 |             }
349 | 
350 |             if (p->fullyRelevant()) {
351 |                 assert(p->getPoint() <= h.max());
352 |                 pushHeap(p, temp, result.maxSpace, tempHeapOffset);
353 |                 break;
354 |             }
355 |             if (temp.size() == tempHeapOffset) break;
356 |             p = popHeap(temp, tempHeapOffset);
357 |         }
358 | 
359 |         auto bufferEndIt = temp.begin() + tempHeapOffset;
360 |         while(bufferEndIt != temp.begin() && temp.front()->getPoint() > h.max()) {
361 |             std::pop_heap(temp.begin(), bufferEndIt, CmpPoissonProcessPtrsInverse<D,H>());
362 |             --bufferEndIt;
363 |         }
364 | 
365 |         for(auto heapIt = temp.begin() + tempHeapOffset; heapIt != temp.end(); ++heapIt) {
366 |             if ((*heapIt)->getPoint() <= h.max()) {
367 |                 *bufferEndIt = std::move(*heapIt);
368 |                 ++bufferEndIt;
369 |                 std::push_heap(temp.begin(), bufferEndIt, CmpPoissonProcessPtrsInverse<D,H>());
370 |             }
371 |         }
372 |         temp.erase(bufferEndIt, temp.end());
373 |     }
374 | 
375 |     std::make_heap(temp.begin(), temp.end(), CmpPoissonProcessPtrs<D,H>());
376 | 
377 |     while(!temp.empty()) {
378 | 
379 |         std::unique_ptr<PoissonProcess<D,H>> p = popHeap(temp);
380 |         if (p->getPoint() > h.max()) break;
381 | 
382 |         while(p->splittable() && p->partiallyRelevant()) {
383 | 
384 |             std::unique_ptr<PoissonProcess<D,H>> pPrime = p->split();
385 | 
386 |             if (p->fullyRelevant()) h.update(p->getIndex(), p->getPoint());
387 | 
388 |             if (pPrime->partiallyRelevant()) {
389 |                 pPrime->next(m);
390 |                 if (pPrime->fullyRelevant()) h.update(pPrime->getIndex(), pPrime->getPoint());
391 |                 if (pPrime->getPoint() <= h.max()) pushHeap(pPrime, temp, result.maxSpace);
392 |             }
393 |         }
394 | 
395 |         if (p->fullyRelevant()) {
396 |             p->next(m);
397 |             h.update(p->getIndex(), p->getPoint());
398 |             if (p->getPoint() <= h.max()) pushHeap(p, temp, result.maxSpace);
399 |         }
400 |     }
401 | 
402 |     for (uint32_t k = 0; k < m; ++k) {
403 |         BitStream<H> bitstream(collectHashData(h[k]), bagMinHashSeedB);
404 |         result.hashValues[k] = getUniformPow2(b, bitstream);
405 |     }
406 | 
407 |     return result;
408 | }
409 | 
410 | // see Ioffe, Sergey. "Improved consistent sampling, weighted minhash and l1 sketching." Data Mining (ICDM), 2010 IEEE 10th International Conference on. IEEE, 2010.
411 | template <typename H>
412 | WeightedHashResult improved_consistent_weighted_hashing(const std::vector<std::tuple<uint64_t,double>>& data, const uint32_t m) {
413 |     const uint8_t b = 64; // constant for b-bit minwise hashing
414 | 
415 |     std::vector<double> aVec(m, std::numeric_limits<double>::infinity());
416 |     std::vector<uint64_t> dVec(m);
417 |     std::vector<uint64_t> yVec(m);
418 | 
419 |     WeightedHashResult result(m);
420 | 
421 |     for(const auto& item : data) {
422 | 
423 |         const uint64_t d = std::get<0>(item);
424 |         const double s = std::get<1>(item);
425 | 
426 |         if (s == 0) continue;
427 | 
428 |         const double logS = std::log(s);
429 | 
430 |         BitStream<H> bitstream(collectHashData(d), UINT64_C(0x87609608d2a48b5d)); // constant from random.org
431 | 
432 |         for (uint32_t k = 0; k < m; ++k) {
433 | 
434 |             double r = getGamma21(bitstream);
435 |             double c = getGamma21(bitstream);
436 |             double beta = getUniformDouble(bitstream);
437 | 
438 |             double t = std::floor(logS / r + beta);
439 |             double y = std::exp(r * (t - beta));
440 |             double a = c / (y * std::exp(r));
441 | 
442 |             if (a < aVec[k]) {
443 |                 aVec[k] = a;
444 |                 dVec[k] = d;
445 |                 yVec[k] = y;
446 |             }
447 |         }
448 |     }
449 | 
450 |     for (uint32_t k = 0; k < m; ++k) {
451 |         BitStream<H> bitstream(collectHashData(dVec[k], yVec[k]), UINT64_C(0xbf235dea3db9c393)); // constant from random.org
452 |         result.hashValues[k] = getUniformPow2(b, bitstream);
453 |     }
454 | 
455 |     return result;
456 | }
457 | 
458 | // see Wu, Wei, et al. "Canonical Consistent Weighted Sampling for Real-Value Weighted Min-Hash." Data Mining (ICDM), 2016 IEEE 16th International Conference on. IEEE, 2016.
459 | template <typename H>
460 | WeightedHashResult canonical_consistent_weighted_hashing(const std::vector<std::tuple<uint64_t,double>>& data, const uint32_t m) {
461 |     const uint8_t b = 64; // constant for b-bit minwise hashing
462 | 
463 |     std::vector<double> aVec(m, std::numeric_limits<double>::infinity());
464 |     std::vector<uint64_t> dVec(m);
465 |     std::vector<uint64_t> yVec(m);
466 | 
467 |     WeightedHashResult result(m);
468 | 
469 |     for(const auto& item : data) {
470 | 
471 |         const uint64_t d = std::get<0>(item);
472 |         const double s = std::get<1>(item);
473 | 
474 |         if (s == 0) continue;
475 | 
476 |         BitStream<H> bitstream(collectHashData(d), UINT64_C(0xc9116756125c6267)); // constant from random.org
477 | 
478 |         for (uint32_t k = 0; k < m; ++k) {
479 | 
480 |             double beta = getUniformDouble(bitstream);
481 |             double r = getBeta21(bitstream);
482 |             double c = getGamma21(bitstream);
483 | 
484 |             double t = std::floor(s / r + beta);
485 |             double y = r * (t - beta);
486 |             double a = c / y - 2 * r * c;
487 | 
488 |             if (a < aVec[k]) {
489 |                 aVec[k] = a;
490 |                 dVec[k] = d;
491 |                 yVec[k] = y;
492 |             }
493 |         }
494 |     }
495 | 
496 |     for (uint32_t k = 0; k < m; ++k) {
497 |         BitStream<H> bitstream(collectHashData(dVec[k], yVec[k]), UINT64_C(0xa5c48ff7b4004c41)); // constant from random.org
498 |         result.hashValues[k] = getUniformPow2(b, bitstream);
499 |     }
500 | 
501 |     return result;
502 | }
503 | 
504 | // see Wu, Wei, et al. "Consistent Weighted Sampling Made More Practical." Proceedings of the 26th International Conference on World Wide Web. International World Wide Web Conferences Steering Committee, 2017.
505 | template <typename H>
506 | WeightedHashResult practical_consistent_weighted_hashing(const std::vector<std::tuple<uint64_t,double>>& data, const uint32_t m) {
507 |     const uint8_t b = 64; // constant for b-bit minwise hashing
508 | 
509 |     std::vector<double> aVec(m, std::numeric_limits<double>::infinity());
510 |     std::vector<uint64_t> dVec(m);
511 |     std::vector<uint64_t> yVec(m);
512 | 
513 |     WeightedHashResult result(m);
514 | 
515 |     for(const auto& item : data) {
516 | 
517 |         const uint64_t d = std::get<0>(item);
518 |         const double s = std::get<1>(item);
519 | 
520 |         if (s == 0) continue;
521 | 
522 |         const double logS = std::log(s);
523 | 
524 |         BitStream<H> bitstream(collectHashData(d), UINT64_C(0xbe46368ee398beee)); // constant from random.org
525 | 
526 |         for (uint32_t k = 0; k < m; ++k) {
527 | 
528 |             double u1 = getUniformDouble(bitstream);
529 |             double u2 = getUniformDouble(bitstream);
530 |             double beta = getUniformDouble(bitstream);
531 |             double x = getUniformDouble(bitstream);
532 | 
533 |             double gamma = -std::log(u1 * u2);
534 |             double t = std::floor(logS / gamma + beta);
535 |             double y = std::exp(gamma * (t - beta));
536 |             double a = -std::log(x) / (y / u1);
537 | 
538 |             if (a < aVec[k]) {
539 |                 aVec[k] = a;
540 |                 dVec[k] = d;
541 |                 yVec[k] = y;
542 |             }
543 |         }
544 |     }
545 | 
546 |     for (uint32_t k = 0; k < m; ++k) {
547 |         BitStream<H> bitstream(collectHashData(dVec[k], yVec[k]), UINT64_C(0x50da48973b000da9)); // constant from random.org
548 |         result.hashValues[k] = getUniformPow2(b, bitstream);
549 |     }
550 | 
551 |     return result;
552 | }
553 | 
554 | // see Wu, Wei, et al. "Improved Consistent Weighted Sampling Revisited." arXiv preprint arXiv:1706.01172 (2017).
555 | template <typename H>
556 | WeightedHashResult improved_squared_consistent_weighted_hashing(const std::vector<std::tuple<uint64_t,double>>& data, const uint32_t m) {
557 |     const uint8_t b = 64; // constant for b-bit minwise hashing
558 | 
559 |     std::vector<double> aVec(m, std::numeric_limits<double>::infinity());
560 |     std::vector<uint64_t> dVec(m);
561 |     std::vector<uint64_t> yVec(m);
562 | 
563 |     WeightedHashResult result(m);
564 | 
565 |     for(const auto& item : data) {
566 | 
567 |         const uint64_t d = std::get<0>(item);
568 |         const double s = std::get<1>(item);
569 | 
570 |         if (s == 0) continue;
571 | 
572 |         const double logS = std::log(s);
573 | 
574 |         BitStream<H> bitstream(collectHashData(d), UINT64_C(0xb30eb19e5e572b46)); // constant from random.org
575 | 
576 |         for (uint32_t k = 0; k < m; ++k) {
577 | 
578 |             double r1 = getGamma21(bitstream);
579 |             double r2 = getGamma21(bitstream);
580 |             double beta1 = getUniformDouble(bitstream);
581 |             double beta2 = getUniformDouble(bitstream);
582 |             double c = getGamma21(bitstream);
583 | 
584 |             double t2 = std::floor(logS / r2 + beta2);
585 |             double z = std::exp(r2 * (t2 - beta2 + 1));
586 |             double a = c / z;
587 | 
588 |             if (a < aVec[k]) {
589 |                 aVec[k] = a;
590 |                 double t1 = std::floor(logS / r1 + beta1);
591 |                 double y = std::exp(r1 * (t1 - beta1));
592 |                 dVec[k] = d;
593 |                 yVec[k] = y;
594 |             }
595 |         }
596 |     }
597 | 
598 |     for (uint32_t k = 0; k < m; ++k) {
599 |         BitStream<H> bitstream(collectHashData(dVec[k], yVec[k]), UINT64_C(0xdff981675040e7bc)); // constant from random.org
600 |         result.hashValues[k] = getUniformPow2(b, bitstream);
601 |     }
602 | 
603 |     return result;
604 | }
605 | 
606 | // see Li, Ping. "0-bit consistent weighted sampling." Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. ACM, 2015.
607 | template <typename H>
608 | WeightedHashResult zero_bit_consistent_weighted_sampling(const std::vector<std::tuple<uint64_t,double>>& data, const uint32_t m) {
609 | 
610 |     std::vector<double> aVec(m, std::numeric_limits<double>::infinity());
611 |     std::vector<uint64_t> dVec(m);
612 | 
613 |     WeightedHashResult result(m);
614 | 
615 |     for(const auto& item : data) {
616 | 
617 |         const uint64_t d = std::get<0>(item);
618 |         const double s = std::get<1>(item);
619 | 
620 |         if (s == 0) continue;
621 | 
622 |         const double logS = std::log(s);
623 | 
624 |         BitStream<H> bitstream(collectHashData(d), UINT64_C(0x95f3ee483861a892)); // constant from random.org
625 | 
626 |         for (uint32_t k = 0; k < m; ++k) {
627 | 
628 |             double r = getGamma21(bitstream);
629 |             double c = getGamma21(bitstream);
630 |             double beta = getUniformDouble(bitstream);
631 | 
632 |             double t = std::floor(logS / r + beta);
633 |             double y = std::exp(r * (t - beta));
634 |             double a = c / (y * std::exp(r));
635 | 
636 |             if (a < aVec[k]) {
637 |                 aVec[k] = a;
638 |                 dVec[k] = d;
639 |             }
640 |         }
641 |     }
642 | 
643 |     result.hashValues = std::move(dVec);
644 | 
645 |     return result;
646 | }
647 | 
648 | #endif // _WEIGHTED_MINWISE_HASHING_HPP_
649 | 


--------------------------------------------------------------------------------
/bagminhash_wrappers.hpp:
--------------------------------------------------------------------------------
 1 | // Simple wrappers around the bagminhash functions
 2 | // Note that we use the floatweightdiscretization as it results in improved performance
 3 | #ifndef SKETCH_BAGMINHASH
 4 | #define SKETCH_BAGMINHASH
 5 | 
 6 | #include <cstdint>
 7 | #include <vector>
 8 | #include "bagminhash/weighted_minwise_hashing.hpp"
 9 | 
10 | using namespace std;
11 | 
12 |  vector<pair<uint64_t, double>> weightedhashresult_to_pairs(WeightedHashResult res) {
13 |     vector<pair<uint64_t, double>> output;
14 |     for(uint64_t h : res.hashValues) {
15 |         output.push_back({h, 0.0});
16 |     }
17 |     return output;
18 | }
19 | 
20 |  vector<tuple<uint64_t, double>> pairs_to_tuples(const vector<pair<uint64_t, double>>& x) {
21 |     vector<tuple<uint64_t, double>> x_tuple;
22 |     for(auto& element : x) {
23 |         x_tuple.push_back(tuple<uint64_t, double>(element.first, element.second));
24 |     }
25 |     return x_tuple;
26 | }
27 | 
28 | class BagMinHash1 {
29 |     private:
30 |         uint64_t t;
31 |     public:
32 |         BagMinHash1(uint64_t t) : t(t) {};
33 |         vector<pair<uint64_t, double>> operator()(const vector<pair<uint64_t, double>>& x) {
34 |             auto x_tuple = pairs_to_tuples(x);
35 |             WeightedHashResult res = bag_min_hash_1<FloatWeightDiscretization, XXHash64>(x_tuple, t);
36 |             return weightedhashresult_to_pairs(res);
37 | 
38 |         }
39 | };
40 | 
41 | class BagMinHash2 {
42 |     private:
43 |         uint64_t t;
44 |     public:
45 |         BagMinHash2(uint64_t t) : t(t) {};
46 |         vector<pair<uint64_t, double>> operator()(const vector<pair<uint64_t, double>>& x) {
47 |             auto x_tuple = pairs_to_tuples(x);
48 |             WeightedHashResult res = bag_min_hash_2<FloatWeightDiscretization, XXHash64>(x_tuple, t);
49 |             return weightedhashresult_to_pairs(res);
50 | 
51 |         }
52 | };
53 | 
54 | class ICWS_xxhash {
55 |     private:
56 |         uint64_t t;
57 |     public:
58 |         ICWS_xxhash(uint64_t t) : t(t) {};
59 |         vector<pair<uint64_t, double>> operator()(const vector<pair<uint64_t, double>>& x) {
60 |             auto x_tuple = pairs_to_tuples(x);
61 |             WeightedHashResult res = improved_consistent_weighted_hashing<XXHash64>(x_tuple, t);
62 |             return weightedhashresult_to_pairs(res);
63 |         }
64 | };
65 | 
66 | #endif


--------------------------------------------------------------------------------
/darthash.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef SKETCH_DARTHASH
  2 | #define SKETCH_DARTHASH
  3 | 
  4 | #include <cstdint>
  5 | #include <random>
  6 | #include <vector>
  7 | #include <unordered_set>
  8 | #include <algorithm>
  9 | #include <cmath>
 10 | #include "similarity.hpp"
 11 | #include "hashing.hpp"
 12 | 
 13 | using namespace std;
 14 | 
 15 | class DartHash {
 16 | 
 17 | 	private:
 18 | 		uint64_t t;
 19 |         TabulationHashFunction32 T_nu, T_rho, T_w, T_r;
 20 |         TabulationHashFunction T_i, T_p, T_q, F, M;
 21 |         vector<double> powers_of_two;
 22 |         vector<double> negative_powers_of_two;
 23 |         vector<double> poisson_cdf;
 24 |         
 25 | 	public:
 26 | 		DartHash(mt19937_64& rng, uint64_t t) : t(t), T_nu(rng), T_rho(rng), T_w(rng), T_r(rng), T_i(rng), T_p(rng), T_q(rng), F(rng), M(rng) {
 27 |             
 28 |             // Tabulate positive and negative powers of two
 29 |             double p = 1.0;
 30 |             double q = 1.0;
 31 |             // Standard double precision ranges from around +-2^1024. We will support slightly less. 
 32 |             for(uint64_t i = 0; i < 1000; i++) {
 33 |                 powers_of_two.push_back(p);
 34 |                 p = 2.0*p;
 35 |                 negative_powers_of_two.push_back(q);
 36 |                 q = 0.5*q;
 37 |             }
 38 | 
 39 |             // Tabulate the Poisson CDF
 40 |             double pdf = exp(-1.0);
 41 |             double cdf = pdf;
 42 |             for(uint64_t i = 0; i < 100; i++) {
 43 |                 poisson_cdf.push_back(cdf);
 44 |                 pdf = pdf/(i + 1);
 45 |                 cdf += pdf;
 46 |             }
 47 |         };
 48 | 
 49 |         vector<pair<uint64_t, double>> operator()(const vector<pair<uint64_t, double>>& x, double theta = 1.0) {
 50 |             vector<pair<uint64_t, double>> darts;
 51 |             darts.reserve(2*t);
 52 |             double max_rank = theta/weight(x);
 53 |             double t_inv = 1.0/t;
 54 |             uint32_t RHO = (uint32_t)floor(log2(1.0 + max_rank));
 55 | 
 56 |             for(const pair<uint64_t, double>& element : x) {
 57 |                 uint64_t i = element.first;
 58 |                 double xi = element.second;
 59 |                 uint64_t i_hash = T_i(i);
 60 |                 uint32_t NU = (uint32_t)floor(log2(1.0 + t*xi));
 61 |                 for(uint32_t nu = 0; nu <= NU; nu++) {
 62 |                     uint64_t nu_hash = T_nu(nu);
 63 |                     for(uint32_t rho = 0; rho <= RHO; rho++) {
 64 |                         uint64_t region_hash = nu_hash ^ T_rho(rho);
 65 |                         double two_nu = powers_of_two[nu];
 66 |                         double two_rho = powers_of_two[rho];
 67 |                         double W = (two_nu - 1)*t_inv;
 68 |                         double R = two_rho - 1;
 69 |                         double delta_nu = two_nu*t_inv*negative_powers_of_two[rho];
 70 |                         double delta_rho = two_rho*negative_powers_of_two[nu];
 71 |                         double w0 = W;
 72 |                         uint32_t w_max = rho < 32 ? 1ul << rho : 1ul << 31;
 73 |                         for(uint32_t w = 0; w < w_max; w++) {
 74 |                             if(xi < w0) break;
 75 |                             uint64_t w_hash = T_w(w);
 76 |                             double r0 = R;
 77 |                             uint32_t r_max = nu < 32 ? 1ul << nu : 1ul << 31;
 78 |                             for(uint32_t r = 0; r < r_max; r++) {
 79 |                                 if(max_rank < r0) break;
 80 |                                 // Get area fingerprint to speed up subsequent hashing 
 81 |                                 uint64_t area_hash = w_hash ^ T_r(r);
 82 |                                 uint64_t z = i_hash ^ region_hash ^ area_hash;
 83 |                                 
 84 |                                 // Draw from Poisson distribution
 85 |                                 double p_z = to_unit(T_p(z));
 86 |                                 uint8_t p = 0;
 87 |                                 while(p_z > poisson_cdf[p]) {
 88 |                                     p++;
 89 |                                 }
 90 |                                 
 91 |                                 uint64_t q = 0;
 92 |                                 while(q < p) {
 93 |                                     // Layer the q-values over z to create a unique key with a strong hash value
 94 |                                     uint64_t z_q = z ^ (q << 56) ^ (q << 48) ^ (q << 40) ^ (q << 32) ^ (q << 24) ^ (q << 16) ^ (q << 8) ^ q;
 95 |                                     auto uniform_weight_rank = to_units(T_q(z_q));
 96 |                                     double weight = w0 + delta_nu*uniform_weight_rank.first;
 97 |                                     double rank = r0 + delta_rho*uniform_weight_rank.second;
 98 |                                     if(weight < xi && rank < max_rank) {
 99 |                                         darts.push_back({F(z_q), rank});
100 |                                     }
101 |                                     q++;
102 |                                 }
103 | 
104 |                                 r0 += delta_rho;
105 |                             }
106 |                             w0 += delta_nu;
107 |                         }
108 |                     }
109 |                 }
110 |             }
111 |             return darts;
112 |         }
113 | 
114 | 
115 |         // Convert the t darts to k minhashes by hashing the darts to k buckets and keeping the minimum from each bucket
116 |         vector<pair<uint64_t, double>> minhash(const vector<pair<uint64_t, double>>& x, uint64_t k) { 
117 |             auto darts = (*this)(x);
118 |             vector<pair<uint64_t, double>> minhashes(k, {0, numeric_limits<double>::max()});
119 |             for(auto& dart : darts) {
120 |                 uint64_t j = M(dart.first) % k;
121 |                 if(dart.second < minhashes[j].second) {
122 |                     minhashes[j] = dart;
123 |                 }
124 |             }
125 |             return minhashes;
126 |         }
127 | 
128 |         vector<bool> onebit_minhash(const vector<pair<uint64_t, double>>& x, uint64_t k) { 
129 |             vector<bool> sketch(k, false);
130 |             auto minhashes = minhash(x, k);
131 |             for(uint64_t i = 0; i < k; i++) {
132 |                 sketch[i] = ((minhashes[i].first & 1ull) == 1ull); // use first bit of MinHash id  
133 |             }
134 |             return sketch;
135 |         }
136 | };
137 | 
138 | #endif


--------------------------------------------------------------------------------
/dartminhash.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SKETCH_DARTMINHASH
 2 | #define SKETCH_DARTMINHASH
 3 | 
 4 | #include <cstdint>
 5 | #include <random>
 6 | #include <vector>
 7 | #include <unordered_set>
 8 | #include <algorithm>
 9 | #include <cmath>
10 | #include "similarity.hpp"
11 | #include "hashing.hpp"
12 | #include "darthash.hpp"
13 | 
14 | using namespace std;
15 | 
16 | class DartMinHash {
17 | 
18 | 	private:
19 | 		uint64_t k;
20 |         TabulationHashFunction T;
21 |         DartHash D;
22 |         
23 | 	public:
24 |         // Set t = k ln(k) + 2k so the probability of failing on the first run is at most exp(-2)
25 | 		DartMinHash(mt19937_64& rng, uint64_t k) : k(k), T(rng), D(rng, ceil(k*log(k) + 2*k)) {};
26 | 
27 |         vector<pair<uint64_t, double>> operator()(const vector<pair<uint64_t, double>>& x) {
28 |             bool all_minhashed = false;
29 |             double theta = 1.0;
30 |             vector<pair<uint64_t, double>> minhashes(k, {0, numeric_limits<double>::max()});
31 |             while(!all_minhashed) {
32 |                 vector<bool> minhashed(k, false);
33 |                 auto darts = D(x, theta);
34 |                 // Place darts into buckets
35 |                 for(auto& dart : darts) {
36 |                     uint64_t j = T(dart.first) % k;
37 |                     minhashed[j] = true;
38 |                     if(dart.second < minhashes[j].second) {
39 |                         minhashes[j] = dart;
40 |                     }
41 |                 }
42 |                 // Verify whether all minhashes were computed
43 |                 all_minhashed = true;
44 |                 for(bool mh : minhashed) {
45 |                     if(!mh) {
46 |                         all_minhashed = false;
47 |                     }
48 |                 }
49 | 
50 |                 theta = theta + 0.5;
51 |             }
52 |             return minhashes;
53 |         }
54 | 
55 |         vector<bool> onebit_minhash(const vector<pair<uint64_t, double>>& x) { 
56 |             vector<bool> sketch(k, false);
57 |             auto minhashes = (*this)(x);
58 |             for(uint64_t i = 0; i < k; i++) {
59 |                 sketch[i] = ((minhashes[i].first & 1ull) == 1ull); // use first bit of MinHash id  
60 |             }
61 |             return sketch;
62 |         }
63 | };
64 | 
65 | #endif


--------------------------------------------------------------------------------
/datagenerator.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SKETCH_DATAGENERATOR
 2 | #define SKETCH_DATAGENERATOR
 3 | 
 4 | #include <cstdint>
 5 | #include <random>
 6 | #include <vector>
 7 | #include <unordered_set>
 8 | #include <algorithm>
 9 | 
10 | using namespace std;
11 | 
12 | // Generate random histograms by first picking m entries randomly to have nonzero weights, and then sorting m - 1 uniformly distributed variables between zero and one. 
13 | // Assign weights as the gaps in sorted order.
14 | 
15 | vector<pair<uint64_t, double>> generate_weighted_set(uint64_t L0, double L1, mt19937_64& rng) {
16 |     unordered_set<uint64_t> elements;
17 |     uniform_int_distribution<uint64_t> random_index;
18 |     while(elements.size() < L0) {
19 |         elements.insert(random_index(rng));
20 |     }
21 | 
22 |     uniform_real_distribution<double> uniform_splitter(0, 1);
23 |     vector<double> z;
24 |     for(uint64_t i = 0; i < L0 - 1; i++) {
25 |         z.push_back(uniform_splitter(rng));
26 |     }
27 |     z.push_back(1.0);
28 |     sort(z.begin(), z.end());
29 | 
30 |     double prev = 0.0;
31 |     uint32_t j = 0;
32 |     vector<pair<uint64_t, double>> x;
33 |     for(uint64_t index : elements) {
34 |         double weight = L1*(z[j] - prev);
35 |         x.push_back(pair<uint64_t, double>(index, weight));
36 |         prev = z[j];
37 |         j++;
38 |     }
39 | 
40 |     // Sort the vector of pairs by indices
41 |     sort(x.begin(), x.end());
42 |     return x;
43 | }
44 | 
45 | // Given x we can generate y s.t. the intersection between x and y is equal to some pre-specificed value
46 | // We will do this by setting y to an appropriately scaled down copy of x and adding the remaining mass to an element that does not exist in x.
47 |  vector<pair<uint64_t, double>> generate_similar_weighted_set(const vector<pair<uint64_t, double>>& x, double relative_overlap, mt19937_64& rng) {
48 |     // Pick a random free element j
49 |     uint64_t j;
50 |     bool free = false;
51 |     while(!free) {
52 |         j = rng();
53 |         free = true;
54 |         for(auto element : x) {
55 |             if(j == element.first) {
56 |                 free = false;
57 |             }
58 |         }
59 |     }
60 | 
61 |     double excess_weight = 0.0;
62 |     vector<pair<uint64_t, double>> y;
63 |     for(auto element : x) {
64 |         double w = element.second;
65 |         double w_scaled = w*relative_overlap;
66 |         double excess = w - w_scaled;
67 |         y.push_back({element.first, w_scaled});
68 |         excess_weight += excess;
69 |     }
70 |     y.push_back({j, excess_weight});
71 |     return y;
72 | }
73 | 
74 | #endif


--------------------------------------------------------------------------------
/hashing.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef SKETCH_HASHING
  2 | #define SKETCH_HASHING
  3 | 
  4 | #include <cstdint>
  5 | #include <random>
  6 | 
  7 | using namespace std;
  8 | 
  9 |  double to_unit(uint64_t x) {
 10 |     return (double)x/0xFFFFFFFFFFFFFFFFull;
 11 | }
 12 | 
 13 |  double to_unit32(uint32_t x) {
 14 | 	return (double)x/0xFFFFFFFFul;
 15 | }
 16 | 
 17 | // Convert a 64-bit uint to two doubles by splitting it in two and normalizing each 32-bit part
 18 |  pair<double, double> to_units(uint64_t x) {
 19 |     return { 
 20 |         ((double)(x >> 32))/0xFFFFFFFFul,
 21 |         (double)(x & 0xFFFFFFFFull)/0xFFFFFFFFul
 22 |     };
 23 | }
 24 | 
 25 | class TabulationHashFunction {
 26 | 
 27 | 	private:
 28 | 		const uint64_t mask = 0xFF; 
 29 | 		uint64_t T1[256];
 30 | 		uint64_t T2[256];
 31 | 		uint64_t T3[256];
 32 | 		uint64_t T4[256];
 33 |         uint64_t T5[256];
 34 | 		uint64_t T6[256];
 35 | 		uint64_t T7[256];
 36 | 		uint64_t T8[256];
 37 | 
 38 | 	public:
 39 | 		TabulationHashFunction(mt19937_64& rng) {
 40 | 			for(int i = 0; i < 256; i++) {
 41 | 				T1[i] = rng();
 42 | 				T2[i] = rng();
 43 | 				T3[i] = rng();
 44 | 				T4[i] = rng();
 45 |                 T5[i] = rng();
 46 | 				T6[i] = rng();
 47 | 				T7[i] = rng();
 48 | 				T8[i] = rng();
 49 | 			}
 50 | 		}
 51 | 
 52 | 		uint64_t operator()(uint64_t x) {
 53 | 			uint64_t hashvalue = T1[x & mask] ^ T2[(x >> 8) & mask] ^ T3[(x >> 16) & mask] ^ T4[(x >> 24) & mask] ^ 
 54 |                                  T5[(x >> 32) & mask] ^ T6[(x >> 40) & mask] ^ T7[(x >> 48) & mask] ^ T8[(x >> 56) & mask];
 55 | 			return hashvalue;
 56 | 		}
 57 | };
 58 | 
 59 | class TabulationHashFunction8 {
 60 | 
 61 | 	private:
 62 | 		uint64_t T1[256];
 63 | 
 64 | 	public:
 65 | 		TabulationHashFunction8(mt19937_64& rng) {
 66 | 			for(int i = 0; i < 256; i++) {
 67 | 				T1[i] = rng();
 68 | 			}
 69 | 		}
 70 | 
 71 | 		uint64_t operator()(uint8_t x) {
 72 | 			return T1[x];
 73 | 		}
 74 | };
 75 | 
 76 | class TabulationHashFunction32 {
 77 | 
 78 | 	private:
 79 | 		const uint32_t mask = 0xFF; 
 80 | 		uint64_t T1[256];
 81 | 		uint64_t T2[256];
 82 | 		uint64_t T3[256];
 83 | 		uint64_t T4[256];
 84 | 
 85 | 	public:
 86 | 		TabulationHashFunction32(mt19937_64& rng) {
 87 | 			for(int i = 0; i < 256; i++) {
 88 | 				T1[i] = rng();
 89 | 				T2[i] = rng();
 90 | 				T3[i] = rng();
 91 | 				T4[i] = rng();
 92 | 			}
 93 | 		}
 94 | 
 95 | 		uint64_t operator()(uint32_t x) {
 96 | 			uint64_t hashvalue = T1[x & mask] ^ T2[(x >> 8) & mask] ^ T3[(x >> 16) & mask] ^ T4[(x >> 24) & mask];
 97 | 			return hashvalue;
 98 | 		}
 99 | };
100 | 
101 | #endif


--------------------------------------------------------------------------------
/icws.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef SKETCH_ICWS
  2 | #define SKETCH_ICWS
  3 | 
  4 | #include <cmath>
  5 | #include <vector>
  6 | #include <cstdint>
  7 | #include <random>
  8 | #include <limits>
  9 | 
 10 | #include "hashing.hpp"
 11 | 
 12 | using namespace std;
 13 | 
 14 | // see Ioffe, Sergey. "Improved consistent sampling, weighted minhash and l1 sketching." Data Mining (ICDM), 2010 IEEE 10th International Conference on. IEEE, 2010.
 15 | class ICWS {
 16 | 
 17 | 	private:
 18 | 		TabulationHashFunction T1, T2, T3, T4, T5;
 19 | 
 20 | 	public:
 21 | 		ICWS(mt19937_64& rng) : T1(rng), T2(rng), T3(rng), T4(rng), T5(rng) {}
 22 | 
 23 | 		pair<uint64_t, double> operator()(const vector<pair<uint64_t, double>>& x) {
 24 | 		
 25 | 			double a_star = numeric_limits<double>::max();
 26 | 			uint64_t k_star = 0;
 27 | 			double y_star = 0;
 28 | 
 29 | 			for(const pair<uint64_t, double>& element : x) {
 30 | 				uint64_t k = element.first;
 31 | 				double S_k = element.second;
 32 | 				double r_u1 = to_unit(T1(k));
 33 | 				double r_u2 = to_unit(T2(k));
 34 | 				double c_u1 = to_unit(T3(k));
 35 | 				double c_u2 = to_unit(T4(k));
 36 | 				double beta_k = to_unit(T5(k));
 37 | 				double r_k = -log(r_u1) -log(r_u2); 
 38 | 				double c_k = -log(c_u1) -log(c_u2); 
 39 | 
 40 | 				double t_k = floor(log(S_k)/r_k + beta_k);
 41 | 				double y_k = exp(r_k*(t_k - beta_k));
 42 | 				double a_k = c_k/(y_k*exp(r_k));
 43 | 				if(a_k < a_star) {
 44 | 					a_star = a_k;
 45 | 					k_star = k;
 46 | 					y_star = y_k;
 47 | 				}
 48 | 			}
 49 | 			
 50 | 			return pair<uint64_t, double>(k_star, y_star);
 51 | 		}
 52 | };
 53 | 
 54 | class ICWS_t {
 55 | 	private:
 56 | 		uint64_t t;
 57 | 		vector<ICWS> M;
 58 | 		
 59 | 	public:
 60 | 		ICWS_t(mt19937_64& rng, uint64_t t) : t(t) {
 61 | 			for(uint64_t i = 0; i < t; i++) {
 62 | 				M.push_back(ICWS(rng));
 63 | 			}
 64 | 		}
 65 | 		
 66 | 		vector<pair<uint64_t, double>> operator()(const vector<pair<uint64_t, double>>& x) {
 67 | 			vector<pair<uint64_t, double>> minhashes;
 68 | 			for(uint64_t i = 0; i < t; i++) {
 69 | 				minhashes.push_back((M[i])(x));
 70 | 			}
 71 | 			return minhashes;
 72 | 		}
 73 | };
 74 | 
 75 | // ICWS with tabulated random draws and precomputed logarithms of weights
 76 | // Also working with the logarithm og y_k and a_k as suggested in the ICWS paper
 77 | // Completely avoids logarithms, exponentials, and divisions. Relies entirely on table lookups and multiplication.
 78 | class FastICWS {
 79 | 
 80 | 	private:
 81 | 		TabulationHashFunction T1, T2, T3;
 82 | 		const vector<double>& gamma;
 83 | 		const vector<double>& gamma_inv;
 84 | 		const vector<double>& log_gamma;
 85 | 
 86 | 	public:
 87 | 		FastICWS(mt19937_64& rng, const vector<double>& gamma, const vector<double>& gamma_inv, const vector<double>& log_gamma) 
 88 | 			: T1(rng), T2(rng), T3(rng), gamma(gamma), gamma_inv(gamma_inv), log_gamma(log_gamma) {}
 89 | 
 90 | 		pair<uint64_t, double> operator()(const vector<pair<uint64_t, double>>& log_weight_x) {
 91 | 		
 92 | 			const uint64_t MASK16 = 0xFFFFull;
 93 | 			double log_a_star = numeric_limits<double>::max();
 94 | 			uint64_t minhash = 0;
 95 | 
 96 | 			for(const pair<uint64_t, double>& element : log_weight_x) {
 97 | 				uint64_t k = element.first;
 98 | 				double log_S_k = element.second;
 99 | 
100 | 				uint64_t z = T1(k);
101 | 				double r_k = gamma[z & MASK16]; 
102 | 				double r_k_inv = gamma_inv[z & MASK16]; 
103 | 				double log_c_k = log_gamma[(z >> 16) & MASK16];
104 | 				double beta_k = to_unit32(z >> 32);
105 | 
106 | 				double t_k = floor(log_S_k*r_k_inv + beta_k);
107 | 				double log_y_k = r_k*(t_k - beta_k);
108 | 
109 | 				double log_a_k = log_c_k - log_y_k - r_k;
110 | 				if(log_a_k < log_a_star) {
111 | 					log_a_star = log_a_k;
112 | 					minhash = T2((uint64_t)t_k) ^ T3(k); // 64-bit minhash
113 | 				}
114 | 			}
115 | 			
116 | 			return pair<uint64_t, double>(minhash, log_a_star);
117 | 		}
118 | };
119 | 
120 | class FastICWS_t {
121 | 	private:
122 | 		uint64_t t;
123 | 		vector<FastICWS> M;
124 | 		vector<double> gamma;
125 | 		vector<double> gamma_inv;
126 | 		vector<double> log_gamma;
127 | 		
128 | 	public:
129 | 		FastICWS_t(mt19937_64& rng, uint64_t t) : t(t) {
130 | 
131 | 			// Create discretized version of the X ~ Gamma(2,1) distribution
132 | 			// pdf: z*exp(-z)
133 | 			// cdf: 1 - exp(-z)*(z + 1)
134 | 			// We want to create tables with 2^16 entries
135 | 			// The ith entry (starting from 0) will be an interpolation between a value z_{i} with Pr[X <= z_{i}] <= (i + 1)*epsilon
136 | 			// and a value z_{i+1} > z_{i} with Pr[X <= z_{i+1}] <= (i+2)*epsilon
137 | 
138 | 			double z = 0.0;
139 | 			double z_prev = 0.0;
140 | 			double epsilon = 1.0/((1 << 16) + 1);
141 | 			double delta = epsilon; // How much to advance z in each step. We can set this to epsilon without skipping steps because pdf <= 1/e.
142 | 
143 | 			for(int i = 0; i < (1 << 16); i++) {
144 | 				double target_mass = (i+1)*epsilon;
145 | 				while(1 - exp(-z)*(z+1) < target_mass) {
146 | 					z += delta;
147 | 				}
148 | 				// Fill tables
149 | 				double v = (z + z_prev)/2;
150 | 				z_prev = z;
151 | 				gamma.push_back(v);
152 | 				gamma_inv.push_back(1/v);
153 | 				log_gamma.push_back(log(v));
154 | 			}
155 | 
156 | 			for(uint64_t i = 0; i < t; i++) {
157 | 				M.push_back(FastICWS(rng, gamma, gamma_inv, log_gamma));
158 | 			}
159 | 		}
160 | 		
161 | 		vector<pair<uint64_t, double>> operator()(const vector<pair<uint64_t, double>>& x) {
162 | 			vector<pair<uint64_t, double>> log_weight_x;
163 | 			for(auto element : x) {
164 | 				log_weight_x.push_back({element.first, log(element.second)});
165 | 			}
166 | 			vector<pair<uint64_t, double>> minhashes;
167 | 			for(uint64_t i = 0; i < t; i++) {
168 | 				minhashes.push_back((M[i])(log_weight_x));
169 | 			}
170 | 			return minhashes;
171 | 		}
172 | };
173 | 
174 | 
175 | #endif
176 | 


--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <cstdint>
  2 | #include <iostream>
  3 | #include <iomanip>
  4 | #include <random>
  5 | #include <cmath>
  6 | 
  7 | #include "timer.hpp"
  8 | #include "icws.hpp"
  9 | #include "darthash.hpp"
 10 | #include "dartminhash.hpp"
 11 | #include "datagenerator.hpp"
 12 | #include "bagminhash_wrappers.hpp"
 13 | 
 14 | using namespace std;
 15 | 
 16 | template <typename T>
 17 | double time_algorithm(const vector<vector<pair<uint64_t, double>>>& data, T& hasher, uint64_t& id_xor, double& rank_sum) {
 18 | 	Timer timer;
 19 | 	for(auto& x : data) {
 20 | 		timer.start();
 21 | 		auto minhashes = hasher(x);
 22 | 		timer.stop();
 23 | 
 24 | 		// Do something with the minhashes to ensure that the compiler doesn't optimize them away
 25 | 		for(auto mh : minhashes) {
 26 | 			id_xor = id_xor ^ mh.first;
 27 | 			rank_sum += mh.second;
 28 | 		}
 29 | 	}
 30 | 
 31 | 	return timer.elapsed_ms()/data.size();
 32 | }
 33 | 
 34 | void time_performance() {
 35 | 	uint64_t seed = 1;
 36 | 	mt19937_64 rng(seed);
 37 | 
 38 | 	// Settings for L0 plot
 39 | 	// vector<uint64_t> L0_values;
 40 | 	// for(uint64_t i = 0; i < 17; i++) {
 41 | 	// 	L0_values.push_back(1ull << i);
 42 | 	// }
 43 | 	// vector<double> L1_values = {1.0};
 44 | 	// vector<uint64_t> t_values = {256};
 45 | 	// uint64_t m = 100;
 46 | 
 47 | 	// Settings for L1 plot
 48 | 	// vector<uint64_t> L0_values = {256};
 49 | 	// vector<uint64_t> t_values = {256};
 50 | 	// vector<double> L1_values;
 51 | 	// for(int i = -128; i <= 128; i = i + 16) {
 52 | 	// 	L1_values.push_back(pow(2.0, (double)i));
 53 | 	// }
 54 | 	// uint64_t m = 100;
 55 | 
 56 | 	// Settings for k plot
 57 | 	// vector<uint64_t> t_values;
 58 | 	// for(uint64_t i = 0; i < 17; i++) {
 59 | 	// 	t_values.push_back(1ull << i);
 60 | 	// }
 61 | 	// vector<double> L1_values = {1.0};
 62 | 	// vector<uint64_t> L0_values = {256};
 63 | 	// uint64_t m = 100;
 64 | 
 65 | 	// Exploration
 66 | 	// vector<uint64_t> L0_values = {1, 4, 16, 64, 256, 1024, 4096, 16384};
 67 | 	// vector<double> L1_values = {pow(2.0, -32), pow(2.0, -16), pow(2.0, -8), pow(2.0, 0), pow(2.0, 8), pow(2.0, 16), pow(2.0, 32)};
 68 | 	// vector<uint64_t> t_values = {1, 4, 16, 64, 256, 1024, 4096, 16384};
 69 | 	// uint64_t m = 10;
 70 | 
 71 | 	// Settings for heatmap 
 72 | 	vector<uint64_t> L0_values;
 73 | 	vector<uint64_t> t_values;
 74 | 	for(uint64_t i = 0; i < 15; i++) {
 75 | 		L0_values.push_back(1ull << i);
 76 | 		t_values.push_back(1ull << i);
 77 | 	}
 78 | 	vector<double> L1_values = {1.0};
 79 | 	uint64_t m = 100;
 80 | 
 81 | 
 82 | 	uint64_t id_xor = 0;
 83 | 	double rank_sum = 0;
 84 | 	uint64_t experiment_counter = 0;
 85 | 
 86 | 	cout << setprecision(5) << fixed;
 87 | 	cout << "id, L0, log2_L1, t, FastICWS, BagMinHash2, DartMinHash" << endl;
 88 | 
 89 | 	for(uint64_t L0 : L0_values) {
 90 | 		for(double L1 : L1_values) {
 91 | 			for(uint64_t t : t_values) {
 92 | 
 93 | 				cout << experiment_counter << ", " << L0 <<  ", " << log2(L1) <<  ", "  << t << ", ";
 94 | 
 95 | 				// Generate data
 96 | 				vector<vector<pair<uint64_t, double>>> data;
 97 | 				for(uint64_t i = 0; i < m; i++) {
 98 | 					data.push_back(generate_weighted_set(L0, L1, rng));
 99 | 				}
100 | 
101 | 				// Algorithms
102 | 				// ICWS_t I(rng, t);
103 | 				// cout << time_algorithm<ICWS_t>(data, I, id_xor, rank_sum) << ", ";
104 | 
105 | 				// ICWS_xxhash I_xx(t);
106 | 				// cout << time_algorithm<ICWS_xxhash>(data, I_xx, id_xor, rank_sum) << ", ";
107 | 
108 | 				FastICWS_t F(rng, t);
109 | 				cout << time_algorithm<FastICWS_t>(data, F, id_xor, rank_sum) << ", ";
110 | 
111 | 				// BagMinHash1 B1(t);
112 | 				// cout << time_algorithm<BagMinHash1>(dat a, B1, id_xor, rank_sum) << ", ";
113 | 				
114 | 				BagMinHash2 B2(t);
115 | 				cout << time_algorithm<BagMinHash2>(data, B2, id_xor, rank_sum) << ", ";
116 | 
117 | 				DartMinHash M(rng, t);
118 | 				cout << time_algorithm<DartMinHash>(data, M, id_xor, rank_sum) << endl;
119 | 
120 | 				experiment_counter++;
121 | 			}
122 | 		}
123 | 	}
124 | 
125 | 	cout << "rank sum: " << rank_sum << ", id XOR: " << id_xor << endl;
126 | }
127 | 
128 | struct experiment_settings {
129 | 	uint64_t L0;
130 | 	double L1;
131 | 	uint64_t t;
132 | };
133 | 
134 | void time_performance_specific() {
135 | 	uint64_t seed = 1;
136 | 	mt19937_64 rng(seed);
137 | 
138 | 	vector<experiment_settings> settings {
139 | 		// L0, L1, t
140 | 		// Varying L0
141 | 		{64, pow(2.0, 0.0), 64},  
142 | 		{1024, pow(2.0, 0.0), 64}, 
143 | 		// {16384, pow(2.0, 0.0), 64}, 
144 | 
145 | 		{64, pow(2.0, 0.0), 1024},  
146 | 		{1024, pow(2.0, 0.0), 1024}, 
147 | 		// {16384, pow(2.0, 0.0), 1024}, 
148 | 
149 | 		// // Varying t
150 | 		{256, pow(2.0, 0.0), 1}, 
151 | 		{256, pow(2.0, 0.0), 256}, 
152 | 		// {256, pow(2.0, 0.0), 4096},
153 | 
154 | 		// {4096, pow(2.0, 0.0), 1}, 
155 | 		// {4096, pow(2.0, 0.0), 256}, 
156 | 		// {4096, pow(2.0, 0.0), 4096},
157 | 		
158 | 		// // Varying L1
159 | 		{1024, pow(2.0, 0.0), 256}, 
160 | 		{1024, pow(2.0, 64.0), 256}, 
161 | 		{1024, pow(2.0, -64.0), 256}, 
162 | 		// {1024, pow(2.0, 512.0), 256}, 
163 | 		// {1024, pow(2.0, -512.0), 256}, 
164 | 	};
165 | 	uint64_t m = 100;
166 | 
167 | 	uint64_t id_xor = 0;
168 | 	double rank_sum = 0;
169 | 	uint64_t experiment_counter = 0;
170 | 
171 | 	cout << setprecision(3) << fixed;
172 | 	cout << "id, L0, log2_L1, t, ICWS, FastICWS, ICWS_xxhash, BagMinHash1, BagMinHash2, DartMinHash" << endl;
173 | 
174 | 	for(experiment_settings s : settings) {
175 | 
176 | 		cout << experiment_counter << ", " << s.L0 <<  ", " << log2(s.L1) <<  ", "  << s.t << ", ";
177 | 
178 | 		// Generate data
179 | 		vector<vector<pair<uint64_t, double>>> data;
180 | 		for(uint64_t i = 0; i < m; i++) {
181 | 			data.push_back(generate_weighted_set(s.L0, s.L1, rng));
182 | 		}
183 | 		
184 | 		// Algorithms
185 | 		ICWS_t I(rng, s.t);
186 | 		cout << time_algorithm<ICWS_t>(data, I, id_xor, rank_sum) << ", ";
187 | 
188 | 		FastICWS_t F(rng, s.t);
189 | 		cout << time_algorithm<FastICWS_t>(data, F, id_xor, rank_sum) << ", ";
190 | 
191 | 		ICWS_xxhash I_xx(s.t);
192 | 		cout << time_algorithm<ICWS_xxhash>(data, I_xx, id_xor, rank_sum) << ", ";
193 | 
194 | 		BagMinHash1 B1(s.t);
195 | 		cout << time_algorithm<BagMinHash1>(data, B1, id_xor, rank_sum) << ", ";
196 | 		
197 | 		BagMinHash2 B2(s.t);
198 | 		cout << time_algorithm<BagMinHash2>(data, B2, id_xor, rank_sum) << ", ";
199 | 
200 | 		DartMinHash M(rng, s.t);
201 | 		cout << time_algorithm<DartMinHash>(data, M, id_xor, rank_sum) << endl;
202 | 
203 | 		experiment_counter++;
204 | 	}
205 | 
206 | 	cout << "rank sum: " << rank_sum << ", id XOR: " << id_xor << endl;
207 | }
208 | 
209 | // Measure how the estimated jaccard similarity changes as the number of minhashes increases
210 | void measure_similarity() {
211 | 	uint64_t seed = 1;
212 | 	mt19937_64 rng(seed);
213 | 
214 | 	// Experiment in paper
215 | 	// uint64_t L0 = 256;
216 | 	// double L1 = 1.0;
217 | 	// uint64_t t_min = 1;
218 | 	// uint64_t t_max = 100;
219 | 	// vector<double> jaccard_similarity_values = {0.25, 0.5, 0.75};
220 | 
221 | 	uint64_t L0 = 256;
222 | 	double L1 = 1.0;
223 | 	uint64_t t_min = 1;
224 | 	uint64_t t_max = 10;
225 | 	vector<double> jaccard_similarity_values = {0.5};
226 | 
227 | 
228 | 	vector<uint64_t> t_values;
229 | 	for(uint64_t i = t_min; i < t_max + 1; i++) {
230 | 		t_values.push_back(i);
231 | 	}
232 | 
233 | 	cout << setprecision(3) << fixed;
234 | 
235 | 	cout << "sim_j, t, ICWS_xxhash, FastICWS, BagMinHash2, DartMinHash" << endl;
236 | 	for(double jaccard_similarity : jaccard_similarity_values) {
237 | 		for(uint64_t t : t_values) {
238 | 			cout << jaccard_similarity << ", " << t << ", ";
239 | 
240 | 			// Generate a pair of similar points	
241 | 			double l1_sim = l1_similarity_from_jaccard_similarity(L1, L1, jaccard_similarity);
242 | 			auto x = generate_weighted_set(L0, L1, rng);
243 | 			auto y = generate_similar_weighted_set(x, l1_sim, rng);
244 | 
245 | 			// // Algorithms
246 | 			ICWS_xxhash I_xx(t);
247 | 			cout << jaccard_estimate_from_minhashes(I_xx(x), I_xx(y)) << ", ";
248 | 
249 | 			FastICWS_t F(rng, t);
250 | 			cout << jaccard_estimate_from_minhashes(F(x), F(y)) << ", ";
251 | 
252 | 			BagMinHash2 B2(t);
253 | 			cout << jaccard_estimate_from_minhashes(B2(x), B2(y)) << ", ";
254 | 
255 | 			DartMinHash D(rng, t);
256 | 			cout << jaccard_estimate_from_minhashes(D(x), D(y)) << endl;
257 | 
258 | 		}
259 | 	}
260 | }
261 | 
262 | int main() {
263 | 	// time_performance();
264 | 	time_performance_specific();
265 | 	// measure_similarity();
266 | }


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
 1 | CC = g++
 2 | CFLAGS = -std=c++17 -march=native -Wall -O3
 3 | INCLUDES = $(shell find -name '*.hpp')
 4 | XXHASHPATH = bagminhash/xxhash/libxxhash.a
 5 | 
 6 | run: main
 7 | 	./main
 8 | 
 9 | main: main.o    
10 | 	$(CC) -o main main.o $(XXHASHPATH) $(CFLAGS)
11 | 
12 | main.o: main.cpp $(INCLUDES)
13 | 	$(CC) -o main.o -c main.cpp $(CFLAGS)
14 | 
15 | test: tests
16 | 	./tests
17 | 
18 | tests: tests.o tests-main.o
19 | 	$(CC) -o tests tests.o tests-main.o $(XXHASHPATH) $(CFLAGS)
20 | 
21 | tests.o: tests.cpp tests-main.o $(INCLUDES)
22 | 	$(CC) -c tests.cpp -o tests.o $(CFLAGS)
23 | 
24 | tests-main.o: tests-main.cpp catch.hpp
25 | 	$(CC) -c tests-main.cpp -o tests-main.o $(CFLAGS)
26 | 
27 | clean:
28 | 	rm -rf *.o
29 | 


--------------------------------------------------------------------------------
/output/performance.csv:
--------------------------------------------------------------------------------
 1 | id, L0, log2_L1, t, ICWS, FastICWS, ICWS_xxhash, BagMinHash1, BagMinHash2, DartMinHash
 2 | 0, 64, 0.000, 64, 0.899, 0.060, 0.538, 2.439, 0.628, 0.042
 3 | 1, 1024, 0.000, 64, 11.565, 0.515, 9.604, 4.374, 1.706, 0.145
 4 | 2, 64, 0.000, 1024, 19.296, 2.885, 8.083, 48.248, 13.279, 0.592
 5 | 3, 1024, 0.000, 1024, 187.661, 12.643, 120.135, 79.775, 16.586, 0.824
 6 | 4, 256, 0.000, 1, 0.040, 0.008, 0.040, 0.112, 0.103, 0.021
 7 | 5, 256, 0.000, 256, 14.645, 0.939, 7.716, 13.687, 3.270, 0.187
 8 | 6, 1024, 0.000, 256, 45.239, 2.703, 30.127, 18.175, 4.296, 0.274
 9 | 7, 1024, 64.000, 256, 46.717, 2.720, 30.122, 18.241, 4.250, 2.632
10 | 8, 1024, -64.000, 256, 47.677, 2.719, 30.117, 18.096, 4.192, 2.333


--------------------------------------------------------------------------------
/output/similarity.csv:
--------------------------------------------------------------------------------
 1 | sim_j, t, ICWS_xxhash, FastICWS, BagMinHash2, DartMinHash
 2 | 0.500, 1, 1.000, 1.000, 0.000, 1.000
 3 | 0.500, 2, 0.500, 0.500, 0.000, 0.500
 4 | 0.500, 3, 0.333, 0.333, 0.000, 0.333
 5 | 0.500, 4, 0.500, 0.250, 0.750, 0.750
 6 | 0.500, 5, 0.000, 0.400, 0.600, 0.200
 7 | 0.500, 6, 0.667, 0.500, 0.500, 0.000
 8 | 0.500, 7, 0.571, 0.714, 0.429, 0.429
 9 | 0.500, 8, 0.250, 0.375, 0.625, 0.500
10 | 0.500, 9, 0.889, 0.222, 0.556, 0.444
11 | 0.500, 10, 0.600, 0.400, 0.700, 0.400


--------------------------------------------------------------------------------
/similarity.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SKETCH_SIMILARITY
 2 | #define SKETCH_SIMILARITY
 3 | 
 4 | #include <vector>
 5 | #include <cstdint>
 6 | #include <cmath>
 7 | 
 8 | using namespace std;
 9 | 
10 | double weight(const vector<pair<uint64_t, double>>&  x) {
11 |     double w = 0;
12 |     for(const pair<uint64_t, double>& v : x) {
13 |         w += v.second;
14 |     }
15 |     return w;
16 | }
17 | 
18 | double intersection(const vector<pair<uint64_t, double>>& x, const vector<pair<uint64_t, double>>& y) {
19 | 	uint64_t i = 0;
20 | 	uint64_t j = 0;
21 | 	double s = 0;
22 | 	while(i < x.size() && j < y.size()) {
23 | 		if(x[i].first == y[j].first) {
24 | 			s += min(x[i].second, y[j].second);
25 | 			i++;
26 | 			j++;
27 | 		} else if(x[i].first < y[j].first) {
28 | 			i++;
29 | 		} else {
30 | 			j++;
31 | 		}
32 | 	}
33 | 	return s;
34 | }
35 | 
36 | double jaccard_similarity(const vector<pair<uint64_t, double>>& x, const vector<pair<uint64_t, double>>& y) {
37 | 	double s = intersection(x, y);
38 | 	double w_x = weight(x);
39 | 	double w_y = weight(y);
40 | 	return s/(w_x + w_y - s);
41 | }
42 | 
43 | double l1_similarity(const vector<pair<uint64_t, double>>& x, const vector<pair<uint64_t, double>>& y) {
44 | 	double s = intersection(x, y);
45 | 	double w_x = weight(x);
46 | 	double w_y = weight(y);
47 | 	return s/min(w_x, w_y);
48 | }
49 | 
50 | double hamming_distance(const vector<bool>& x, const vector<bool>& y) {
51 | 	double h = 0;
52 | 	for(uint32_t i = 0; i < x.size(); i++) {
53 | 		if(x[i] != y[i]) {
54 | 			h = h + 1;
55 | 		}
56 | 	}
57 | 	return h;
58 | }
59 | 
60 | double onebit_minhash_jaccard_estimate(const vector<bool>& x, const vector<bool>& y) {
61 | 	double h = hamming_distance(x, y);
62 | 	double t = x.size();
63 | 	return max(0.0, 2*(1 - h/t) - 1);
64 | }
65 | 
66 | // Similarity conversions
67 | // L1 similarity is the normalized intersection: |x \cap y| / min(|x|, |y|)
68 | // Jaccard similarity is: |x \cap y| / |x \cup y|
69 | double jaccard_similarity_from_l1_similarity(double x_weight, double y_weight, double l1_sim) {
70 |   double i = min(x_weight, y_weight)*l1_sim;
71 |   double u = x_weight + y_weight - i;
72 |   return i/u;
73 | }
74 | 
75 | double l1_similarity_from_jaccard_similarity(double x_weight, double y_weight, double jaccard_sim) {
76 |   double i = jaccard_sim*(x_weight + y_weight)/(1 + jaccard_sim);
77 |   return i/min(x_weight, y_weight);
78 | }
79 | 
80 | // Count the number of collisions in two vectors of minhash sketches ((id, rank) pairs)
81 | uint64_t count_collisions(const vector<pair<uint64_t, double>>& x, const vector<pair<uint64_t, double>>& y) {
82 | 	uint64_t collisions = 0;
83 | 	for(uint64_t i = 0; i < x.size(); i++) {
84 | 		if(x[i].first == y[i].first) {
85 | 			collisions++;
86 | 		}
87 | 	}
88 | 	return collisions;
89 | }
90 | 
91 | double jaccard_estimate_from_minhashes(const vector<pair<uint64_t, double>>& x, const vector<pair<uint64_t, double>>& y) {
92 | 	return (double)count_collisions(x, y)/x.size();
93 | }
94 | 
95 | #endif


--------------------------------------------------------------------------------
/tests-main.cpp:
--------------------------------------------------------------------------------
1 | #define CATCH_CONFIG_MAIN  // This tells Catch to provide a main() - only do this in one cpp file
2 | #include "catch.hpp"
3 | 


--------------------------------------------------------------------------------
/tests.cpp:
--------------------------------------------------------------------------------
  1 | #include<vector>
  2 | #include<random>
  3 | #include <iostream>
  4 | #include "catch.hpp"
  5 | #include "datagenerator.hpp"
  6 | #include "similarity.hpp"
  7 | #include "darthash.hpp"
  8 | #include "icws.hpp"
  9 | #include "dartminhash.hpp"
 10 | #include "bagminhash_wrappers.hpp"
 11 | 
 12 | using namespace std;
 13 | 
 14 | TEST_CASE("Randomly generated weighted sets behave as expected", "[datagenerator]") {
 15 |     SECTION("Size and weight") {
 16 |         uint64_t seed = 1;
 17 |         mt19937_64 rng(seed);
 18 |         uint64_t L0 = 128;
 19 |         double L1 = 1.0;
 20 |         vector<pair<uint64_t, double>> x = generate_weighted_set(L0, L1, rng);
 21 |         REQUIRE(x.size() == L0);
 22 | 
 23 |         bool sorted = true;
 24 |         for(uint32_t i = 1; i < x.size(); i++) {
 25 |             if(x[i-1].first > x[i].first) {
 26 |                 sorted = false;
 27 |             }
 28 |         }
 29 |         REQUIRE(sorted);
 30 |         REQUIRE(weight(x) == Approx(L1));
 31 |     }
 32 |     SECTION("Similar sets") {
 33 |         uint64_t seed = 1;
 34 |         mt19937_64 rng(seed);
 35 |         uint64_t L0 = 128;
 36 |         double L1 = 1.0;
 37 |         vector<pair<uint64_t, double>> x = generate_weighted_set(L0, L1, rng);
 38 |         auto y = generate_similar_weighted_set(x, 0.5, rng);
 39 | 
 40 |         // Ensure that the weight is maintained and that we have added an additional element for the excess weight.
 41 |         REQUIRE(weight(y) == Approx(L1));
 42 |         REQUIRE(y.size() == L0 + 1);
 43 | 
 44 |         // Ensure that the generated set matches the desired similarity
 45 |         int k = 10;
 46 |         for(int i = 0; i <= k; i++) {
 47 |             double s = ((double)i/k);
 48 |             auto y = generate_similar_weighted_set(x, s, rng);
 49 |             REQUIRE(jaccard_similarity(x, y) == Approx(s/(2-s)));
 50 |         }
 51 |     }
 52 | }
 53 | 
 54 | TEST_CASE("Similarity measures", "[similarity]") {
 55 |     SECTION("weight") {
 56 |         vector<pair<uint64_t, double>> x = {
 57 |             {1, 1.0},
 58 |             {2, 2.0}
 59 |         };
 60 |         REQUIRE(weight(x) == Approx(3.0));
 61 |     }
 62 |     SECTION("jaccard_similarity") {
 63 |         vector<pair<uint64_t, double>> x = {
 64 |             {1, 1.0},
 65 |             {2, 2.0}
 66 |         };
 67 |         vector<pair<uint64_t, double>> y = {
 68 |             {1, 1.0},
 69 |             {2, 1.0},
 70 |             {3, 1.0}
 71 |         };
 72 |         REQUIRE(intersection(x, y) == Approx(2.0));
 73 |         REQUIRE(jaccard_similarity(x, y) == Approx(0.5));
 74 |     }
 75 |     SECTION("Hamming distance") {
 76 |         REQUIRE(hamming_distance({true, true, true}, {false, false, false}) == 3);
 77 |         REQUIRE(hamming_distance({true, true, true}, {false, true, false}) == 2);
 78 |     }
 79 | 
 80 |     SECTION("similarity conversions") {
 81 |         vector<pair<uint64_t, double>> x = {
 82 |             {1, 1.0},
 83 |             {2, 3.0}
 84 |         };
 85 |         vector<pair<uint64_t, double>> y = {
 86 |             {1, 1.0},
 87 |             {2, 1.0},
 88 |             {3, 1.0}
 89 |         };
 90 |         // jaccard similarity: 2/5
 91 |         // l1 similarity: 2/3
 92 |         double jaccard_sim = jaccard_similarity(x, y);
 93 |         double l1_sim = l1_similarity(x, y);
 94 |         REQUIRE(jaccard_sim == Approx(2.0/5));
 95 |         REQUIRE(l1_sim == Approx(2.0/3));
 96 |         double x_weight = weight(x);
 97 |         double y_weight = weight(y);
 98 |         REQUIRE(l1_similarity_from_jaccard_similarity(x_weight, y_weight, jaccard_sim) == Approx(l1_sim));
 99 |         REQUIRE(jaccard_similarity_from_l1_similarity(x_weight, y_weight, l1_sim) == Approx(jaccard_sim));
100 |     }
101 | }
102 | 
103 | TEST_CASE("DartHash", "[darthash]") {
104 |     SECTION("Basic dart properties") {
105 | 
106 |         uint64_t seed = 1;
107 |         mt19937_64 rng(seed);
108 |         uint64_t t = 256;
109 | 	    DartHash D(rng, t);
110 |         uint64_t L0 = 128;
111 |         double L1 = 1.0;
112 |         auto x = generate_weighted_set(L0, L1, rng);
113 |         auto darts = D(x);
114 | 
115 |         // Number of darts
116 |         // According to http://www.cs.columbia.edu/~ccanonne/files/misc/2017-poissonconcentration.pdf
117 |         // The probability that the number of darts deviates by more than t/2 is at most 2*exp(-t/10)
118 |         REQUIRE(darts.size() > 128);
119 |         REQUIRE(darts.size() < 256 + 128);
120 | 
121 |         // Dart ranks should be smaller than 1/L1 and fingerprints should be unique
122 |         unordered_set<uint64_t> fingerprints;
123 |         bool too_large = false;
124 |         bool too_small = true;
125 |         for(auto& element : darts) {
126 |             fingerprints.insert(element.first);
127 |             if(element.second > 1/L1) {
128 |                 too_large = true;
129 |             }
130 |             
131 |             // Ranks should be uniformly distributed between zero and 1/L1.
132 |             if(element.second > 1/(2*L1)) {
133 |                 too_small = false;
134 |             }
135 |         }
136 |         REQUIRE(!too_large);
137 |         REQUIRE(!too_small);
138 |         REQUIRE(fingerprints.size() == darts.size());
139 |     }
140 | 
141 |     SECTION("Darts to MinHash") {
142 |         // Converting t darts to k minhashes
143 |         // The probability of an "empty" minhash is at most t*exp(-t/k) by a standard union bound over Poisson distributions
144 |         // Verify that when t/k is large that we have no empty minhashes
145 |         // We set the id of empty minhashes to 0
146 |         uint64_t seed = 1;
147 |         mt19937_64 rng(seed);
148 |         uint64_t t = 4096;
149 |         uint64_t k = 128;
150 | 	    DartHash D(rng, t);
151 |         uint64_t L0 = 128;
152 |         double L1 = 1.0;
153 |         auto x = generate_weighted_set(L0, L1, rng);
154 |         auto minhashes = D.minhash(x, k);
155 |         bool all_nonempty = true;
156 |         for(auto mh : minhashes) {
157 |             if(mh.first == 0) {
158 |                 all_nonempty = false;
159 |             }
160 |         }
161 |         REQUIRE(all_nonempty);
162 | 
163 |         // When k = t then we expect empty minhashes
164 |         k = t;
165 |         minhashes = D.minhash(x, k);
166 |         all_nonempty = true;
167 |         for(auto mh : minhashes) {
168 |             if(mh.first == 0) {
169 |                 all_nonempty = false;
170 |             }
171 |         }
172 |         REQUIRE(!all_nonempty);
173 |     }
174 | 
175 |     SECTION("MAE of 1-bit minhash sketch stays within Hoeffding bounds") {
176 |         uint64_t seed = 1;
177 |         mt19937_64 rng(seed);
178 |         uint64_t t = 512;
179 |         uint64_t k = 64;
180 | 	    DartHash D(rng, t);
181 |         uint64_t L0 = 64;
182 |         double L1 = 1.0;
183 |         double l1_sim = 0.5;
184 |         uint64_t n = 2000;
185 | 
186 |         // MAE when using minhash to estimate l1 similarity
187 |         double target_l1_sim_mae = 0.1079063;
188 |         double epsilon = 0.05;
189 | 
190 |         double total_absolute_error = 0.0;
191 |         for(uint64_t i = 0; i < n; i++) {
192 |             auto x = generate_weighted_set(L0, L1, rng);
193 |             auto y = generate_similar_weighted_set(x, l1_sim, rng);
194 |             auto sketch_x = D.onebit_minhash(x, k);
195 |             auto sketch_y = D.onebit_minhash(y, k);
196 |             double jaccard_estimate = onebit_minhash_jaccard_estimate(sketch_x, sketch_y);
197 |             total_absolute_error += abs(l1_similarity_from_jaccard_similarity(weight(x), weight(y), jaccard_estimate) - l1_sim);
198 |         }
199 | 
200 |         double empirical_mae = total_absolute_error/n;
201 |         REQUIRE(abs(target_l1_sim_mae - empirical_mae) <= epsilon);
202 |     }
203 | }
204 | 
205 | TEST_CASE("ICWS", "[icws]") {
206 |     SECTION("Weighted samples are valid") {
207 |         uint64_t seed = 1;
208 |         mt19937_64 rng(seed);
209 |         ICWS H(rng);
210 |         int m = 100;
211 |         uint64_t L0 = 64;
212 |         double L1 = 1.0;
213 |         for(int i = 0; i < m; i++) {
214 |             auto x = generate_weighted_set(L0, L1, rng);
215 |             auto z = H(x);
216 |             bool valid_cws = false;
217 |             for(auto element : x) {
218 |                 if(element.first == z.first && z.second <= element.second) {
219 |                     valid_cws = true;
220 |                 }
221 |             }
222 |             REQUIRE(valid_cws);
223 |         }
224 |     }
225 | }
226 | 
227 | TEST_CASE("DartMinHash", "[dartminhash]") {
228 |     SECTION("1-bit dartminhash MAE stays within Hoeffding bounds") {
229 | 
230 |         uint64_t seed = 1;
231 |         mt19937_64 rng(seed);
232 |         uint64_t k = 64;
233 | 	    DartMinHash M(rng, k);
234 |         uint64_t L0 = 64;
235 |         double L1 = 1.0;
236 |         double l1_sim = 0.5;
237 |         uint64_t n = 2000;
238 | 
239 |         // MAE when using minhash to estimate l1 similarity
240 |         double target_l1_sim_mae = 0.1079063;
241 |         double epsilon = 0.05;
242 | 
243 |         double total_absolute_error = 0.0;
244 |         for(uint64_t i = 0; i < n; i++) {
245 |             auto x = generate_weighted_set(L0, L1, rng);
246 |             auto y = generate_similar_weighted_set(x, l1_sim, rng);
247 |             auto sketch_x = M.onebit_minhash(x);
248 |             auto sketch_y = M.onebit_minhash(y);
249 |             double jaccard_estimate = onebit_minhash_jaccard_estimate(sketch_x, sketch_y);
250 |             total_absolute_error += abs(l1_similarity_from_jaccard_similarity(weight(x), weight(y), jaccard_estimate) - l1_sim);
251 |         }
252 | 
253 |         double empirical_mae = total_absolute_error/n;
254 |         REQUIRE(abs(target_l1_sim_mae - empirical_mae) <= epsilon);
255 | 
256 |     }
257 | }
258 | 
259 | // Test correct estimation of jaccard similarity within Hoeffding bounds
260 | TEST_CASE("Jaccard similarity estimation", "[bagminhash]") {
261 |     uint64_t seed = 1;
262 |     mt19937_64 rng(seed);
263 |     uint64_t L0 = 64;
264 |     double L1 = 1.0;
265 |     double l1_sim = 0.5;
266 |     double target_jaccard_similarity = jaccard_similarity_from_l1_similarity(L1, L1, l1_sim);
267 |     double epsilon = 0.05;
268 |     uint64_t t = 2000;
269 |     auto x = generate_weighted_set(L0, L1, rng);
270 |     auto y = generate_similar_weighted_set(x, l1_sim, rng);
271 |     
272 |     SECTION("BagMinHash1") {
273 |         BagMinHash1 B1(t);
274 |         auto mh_x = B1(x);
275 |         auto mh_y = B1(y);
276 |         double estimated_jaccard_similarity = (double)count_collisions(mh_x, mh_y)/t;
277 |         REQUIRE(abs(target_jaccard_similarity - estimated_jaccard_similarity) <= epsilon);
278 |     }
279 | 
280 |     SECTION("BagMinHash2") {
281 |         BagMinHash2 B2(t);
282 |         auto mh_x = B2(x);
283 |         auto mh_y = B2(y);
284 |         double estimated_jaccard_similarity = (double)count_collisions(mh_x, mh_y)/t;
285 |         REQUIRE(abs(target_jaccard_similarity - estimated_jaccard_similarity) <= epsilon);
286 |     }
287 | 
288 |     SECTION("ICWS_xxhash") {
289 |         ICWS_xxhash I(t);
290 |         auto mh_x = I(x);
291 |         auto mh_y = I(y);
292 |         double estimated_jaccard_similarity = (double)count_collisions(mh_x, mh_y)/t;
293 |         REQUIRE(abs(target_jaccard_similarity - estimated_jaccard_similarity) <= epsilon);
294 |     }
295 | 
296 |     SECTION("FastICWS") {
297 |         FastICWS_t F(rng, t);
298 |         auto mh_x = F(x);
299 |         auto mh_y = F(y);
300 |         double estimated_jaccard_similarity = (double)count_collisions(mh_x, mh_y)/t;
301 |         REQUIRE(abs(target_jaccard_similarity - estimated_jaccard_similarity) <= epsilon);
302 |     }
303 | }
304 | 
305 | 
306 | 


--------------------------------------------------------------------------------
/timer.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef SKETCH_TIMER
 3 | #define SKETCH_TIMER
 4 | 
 5 | #include <iostream>
 6 | #include <chrono>
 7 | #include <iomanip>
 8 | 
 9 | using namespace std;
10 | 
11 | struct Timer {
12 | 	string name;
13 | 	chrono::nanoseconds elapsed;
14 | 	chrono::high_resolution_clock::time_point start_time; 
15 | 
16 | 	Timer() : Timer("Timer") {}
17 | 	Timer(string name) : name(name) {
18 | 		elapsed = chrono::nanoseconds(0);
19 | 	}
20 | 
21 | 	void start() {
22 | 		start_time = std::chrono::high_resolution_clock::now();
23 | 	}
24 | 
25 | 	void stop() {
26 | 		elapsed += std::chrono::high_resolution_clock::now() - start_time;
27 | 	}
28 | 
29 | 	void reset() {
30 | 		elapsed = chrono::nanoseconds(0);
31 | 	}
32 | 
33 | 	double elapsed_s() {
34 | 		return elapsed_ms()/1000;
35 | 	}
36 | 
37 | 	double elapsed_ms() {
38 | 		return (double)elapsed.count()/1000000;
39 | 	}
40 | 
41 | 	double elapsed_ns() {
42 | 		return elapsed.count();
43 | 	}
44 | 
45 | 	void print_ms() {
46 | 		cout << fixed << setprecision(2);
47 | 		cout << name << ": " << elapsed_ms() << " (ms)" << endl;
48 | 	}
49 | 
50 | 	void print_s() {
51 | 		cout << fixed << setprecision(2);
52 | 		cout << name << ": " << elapsed_s() << " (s)" << endl;
53 | 	}
54 | };
55 | 
56 | #endif
57 | 


--------------------------------------------------------------------------------