├── BloomFilter └── c++ │ ├── Makefile │ ├── bloom.hpp │ ├── bloom_.hpp │ ├── bloom_array.hpp │ ├── bloom_example.cpp │ ├── counting_bloom.hpp │ └── spectral_bloom.hpp ├── ChangeLog ├── CountMin ├── count_min_sketch.hpp └── count_min_sketch_example.cpp ├── KPS └── kps.hpp ├── LICENSE ├── MisraGries ├── misra_gries.hpp └── test │ ├── makefile │ └── misra_gries_test.cpp ├── README.md ├── StreamSummary ├── c++ │ ├── stream_summary.hpp │ └── test │ │ ├── Makefile │ │ ├── ss_test.cc │ │ ├── test.dat │ │ └── test2.dat ├── go │ ├── example.go │ └── streamsummary │ │ └── streamsummary.go └── python │ ├── __init__.py │ ├── example.py │ ├── stream_summary.py │ ├── test.dat │ └── test │ ├── __init__.py │ └── test_stream_summary.py └── hash ├── MurmurHash3.cpp ├── MurmurHash3.hpp ├── checksum.cpp └── checksum.hpp /BloomFilter/c++/Makefile: -------------------------------------------------------------------------------- 1 | all: demo 2 | 3 | demo: bloom_example.cpp bloom.hpp counting_bloom.hpp spectral_bloom.hpp ../../hash/MurmurHash3.cpp 4 | g++ -o demo ../../hash/MurmurHash3.cpp bloom_example.cpp -I../../hash 5 | 6 | clean: 7 | rm -f demo -------------------------------------------------------------------------------- /BloomFilter/c++/bloom.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * bloom.hpp 3 | * 4 | * 5 | * Bloom Filter Implementation 6 | * 7 | * 8 | * Copyright (C) 2012-2017 Bryant Moscon - bmoscon@gmail.com 9 | * 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy 11 | * of this software and associated documentation files (the "Software"), to 12 | * deal in the Software without restriction, including without limitation the 13 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 14 | * sell copies of the Software, and to permit persons to whom the Software is 15 | * furnished to do so, subject to the following conditions: 16 | * 17 | * 1. 
Redistributions of source code must retain the above copyright notice, 18 | * this list of conditions, and the following disclaimer. 19 | * 20 | * 2. Redistributions in binary form must reproduce the above copyright notice, 21 | * this list of conditions and the following disclaimer in the documentation 22 | * and/or other materials provided with the distribution, and in the same 23 | * place and form as other copyright, 24 | * license and disclaimer information. 25 | * 26 | * 3. The end-user documentation included with the redistribution, if any, must 27 | * include the following acknowledgment: "This product includes software 28 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 29 | * place and form as other third-party acknowledgments. Alternately, this 30 | * acknowledgment may appear in the software itself, in the same form and 31 | * location as other such third-party acknowledgments. 32 | * 33 | * 4. Except as contained in this notice, the name of the author, Bryant Moscon, 34 | * shall not be used in advertising or otherwise to promote the sale, use or 35 | * other dealings in this Software without prior written authorization from 36 | * the author. 37 | * 38 | * 39 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 40 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 41 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 42 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 43 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 44 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 45 | * THE SOFTWARE. 
46 | * 47 | */ 48 | 49 | #ifndef __BLOOM_FILTER__ 50 | #define __BLOOM_FILTER__ 51 | 52 | #include 53 | #include 54 | #include 55 | #include 56 | 57 | 58 | #include "bloom_.hpp" 59 | 60 | template 61 | class BloomFilter : Bloom { 62 | public: 63 | typedef S (*hash_function)(const T &s); 64 | 65 | 66 | BloomFilter(const std::vector &hash_list) : 67 | bloom_array_(std::numeric_limits::max(), false), 68 | hash_list_(hash_list) {} 69 | 70 | BloomFilter() : bloom_array_(std::numeric_limits::max(), false), 71 | hash_list_(std::vector(0)) {} 72 | 73 | void setHash(const std::vector &hash_list) { 74 | hash_list_(hash_list); 75 | } 76 | 77 | void addHash(const hash_function &hash) { 78 | hash_list_.push_back(hash); 79 | } 80 | 81 | virtual void add(const T &s) { 82 | assert(hash_list_.size()); 83 | for (uint32_t i = 0; i < hash_list_.size(); ++i) { 84 | bloom_array_[(*hash_list_[i])(s)] = true; 85 | } 86 | } 87 | 88 | virtual bool exists(const T &s) const { 89 | assert(hash_list_.size()); 90 | for (uint32_t i = 0; i < hash_list_.size(); ++i) { 91 | if (!bloom_array_[(*hash_list_[i])(s)]) { 92 | return (false); 93 | } 94 | } 95 | 96 | return (true); 97 | } 98 | 99 | private: 100 | 101 | std::vector bloom_array_; 102 | std::vector hash_list_; 103 | }; 104 | 105 | 106 | #endif 107 | -------------------------------------------------------------------------------- /BloomFilter/c++/bloom_.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * bloom_.hpp 3 | * 4 | * 5 | * Bloom Filter Interface 6 | * 7 | * 8 | * Copyright (C) 2012 Bryant Moscon - bmoscon@gmail.com 9 | * 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy 11 | * of this software and associated documentation files (the "Software"), to 12 | * deal in the Software without restriction, including without limitation the 13 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 14 | * sell copies of the Software, and to 
permit persons to whom the Software is 15 | * furnished to do so, subject to the following conditions: 16 | * 17 | * 1. Redistributions of source code must retain the above copyright notice, 18 | * this list of conditions, and the following disclaimer. 19 | * 20 | * 2. Redistributions in binary form must reproduce the above copyright notice, 21 | * this list of conditions and the following disclaimer in the documentation 22 | * and/or other materials provided with the distribution, and in the same 23 | * place and form as other copyright, 24 | * license and disclaimer information. 25 | * 26 | * 3. The end-user documentation included with the redistribution, if any, must 27 | * include the following acknowledgment: "This product includes software 28 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 29 | * place and form as other third-party acknowledgments. Alternately, this 30 | * acknowledgment may appear in the software itself, in the same form and 31 | * location as other such third-party acknowledgments. 32 | * 33 | * 4. Except as contained in this notice, the name of the author, Bryant Moscon, 34 | * shall not be used in advertising or otherwise to promote the sale, use or 35 | * other dealings in this Software without prior written authorization from 36 | * the author. 37 | * 38 | * 39 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 40 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 41 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 42 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 43 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 44 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 45 | * THE SOFTWARE. 
46 | * 47 | */ 48 | 49 | 50 | #ifndef __BLOOM_FILTER_INTERFACE__ 51 | #define __BLOOM_FILTER_INTERFACE__ 52 | 53 | template 54 | class Bloom { 55 | public: 56 | 57 | virtual void add(const T &s) = 0; 58 | virtual bool exists(const T &s) const = 0; 59 | 60 | }; 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /BloomFilter/c++/bloom_array.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * bloom_array.hpp 3 | * 4 | * 5 | * Bloom Filter Array Implementation 6 | * 7 | * 8 | * Copyright (C) 2012-2013 Bryant Moscon - bmoscon@gmail.com 9 | * 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy 11 | * of this software and associated documentation files (the "Software"), to 12 | * deal in the Software without restriction, including without limitation the 13 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 14 | * sell copies of the Software, and to permit persons to whom the Software is 15 | * furnished to do so, subject to the following conditions: 16 | * 17 | * 1. Redistributions of source code must retain the above copyright notice, 18 | * this list of conditions, and the following disclaimer. 19 | * 20 | * 2. Redistributions in binary form must reproduce the above copyright notice, 21 | * this list of conditions and the following disclaimer in the documentation 22 | * and/or other materials provided with the distribution, and in the same 23 | * place and form as other copyright, license and disclaimer information. 24 | * 25 | * 3. The end-user documentation included with the redistribution, if any, must 26 | * include the following acknowledgment: "This product includes software 27 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 28 | * place and form as other third-party acknowledgments. 
Alternately, this 29 | * acknowledgment may appear in the software itself, in the same form and 30 | * location as other such third-party acknowledgments. 31 | * 32 | * 4. Except as contained in this notice, the name of the author, Bryant Moscon, 33 | * shall not be used in advertising or otherwise to promote the sale, use or 34 | * other dealings in this Software without prior written authorization from 35 | * the author. 36 | * 37 | * 38 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 39 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 40 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 41 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 42 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 43 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 44 | * THE SOFTWARE. 45 | * 46 | */ 47 | 48 | #ifndef __BLOOM_FILTER_ARRAY__ 49 | #define __BLOOM_FILTER_ARRAY__ 50 | 51 | #include 52 | #include 53 | #include 54 | #include 55 | 56 | template 57 | class BloomArray { 58 | 59 | public: 60 | 61 | BloomArray(const T &elements, const T &bits_per_element = 1) : 62 | array_(ceil(elements * bits_per_element / (sizeof(T) * 8.0)), 0), 63 | elem_bits_(bits_per_element), 64 | elem_mask_(pow(2, bits_per_element) - 1), 65 | bitcount_(sizeof(T) << 3), 66 | len_(elements) 67 | { 68 | assert(bits_per_element); 69 | assert(sizeof(T) * 8 % bits_per_element == 0); 70 | index_shift_ = log2(bitcount_); 71 | } 72 | 73 | T at(const T &index) const { 74 | T idx = index_translate(index); 75 | T mask = elem_mask_ << (bitcount_ - idx); 76 | 77 | return (lookup(idx, mask)); 78 | } 79 | 80 | void inc(const T &index) { 81 | T idx = index_translate(index); 82 | T mask = elem_mask_; 83 | 84 | mask <<= (bitcount_ - idx); 85 | 86 | T value = lookup(idx, mask); 87 | 88 | if (value == elem_mask_) { 89 | return; 90 | } else { 91 | 
array_[idx >> index_shift_] &= ~mask; 92 | 93 | ++value; 94 | value <<= (bitcount_ - idx); 95 | 96 | array_[idx >> index_shift_] |= value; 97 | } 98 | } 99 | 100 | void dec(const T &index) { 101 | T idx = index_translate(index); 102 | T mask = elem_mask_; 103 | 104 | mask <<= (bitcount_ - idx); 105 | 106 | T value = lookup(idx, mask); 107 | 108 | if (value == 0) { 109 | return; 110 | } else { 111 | array_[idx >> index_shift_] &= ~mask; 112 | 113 | --value; 114 | value <<= (bitcount_ - idx); 115 | 116 | array_[idx >> index_shift_] |= value; 117 | } 118 | } 119 | 120 | void set(const T &index, T value) { 121 | if (value > elem_mask_) { 122 | return; 123 | } 124 | 125 | T idx = index_translate(index); 126 | T mask = elem_mask_; 127 | 128 | mask <<= (bitcount_ - idx); 129 | array_[idx >> index_shift_] &= ~mask; 130 | 131 | value <<= (bitcount_ - idx); 132 | array_[idx >> index_shift_] |= value; 133 | } 134 | 135 | T size() const { 136 | return (len_); 137 | } 138 | 139 | void dump() { 140 | for (T i = 0; i < array_.size(); ++i) { 141 | std::cout << array_[i] << std::endl; 142 | } 143 | } 144 | 145 | 146 | protected: 147 | 148 | inline T index_translate(const T &index) const { 149 | return (index << log2(elem_bits_)); 150 | } 151 | 152 | inline T lookup(const T &index, T &mask) const { 153 | return ((array_[index >> index_shift_] & mask) >> (bitcount_ - index)); 154 | } 155 | 156 | inline uint32_t log2(const uint32_t x) const { 157 | uint32_t ret; 158 | 159 | asm ( "\tbsr %1, %0\n" 160 | : "=r" (ret) 161 | : "r" (x) 162 | ); 163 | 164 | return (ret); 165 | } 166 | 167 | // Array that holds the underlying datatype elements 168 | std::vector array_; 169 | // number of bits per element 170 | T elem_bits_; 171 | // mask to select each element 172 | T elem_mask_; 173 | // # of bits in the underlying datatype 174 | T bitcount_; 175 | // log2 of bitcount, used in shifting the index 176 | uint32_t index_shift_; 177 | // user defined len 178 | T len_; 179 | }; 180 | 181 | 182 | 
#endif 183 | -------------------------------------------------------------------------------- /BloomFilter/c++/bloom_example.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * bloom_example.cpp 3 | * 4 | * 5 | * Bloom Filter - Example Usage 6 | * 7 | * 8 | * Copyright (C) 2012-2017 Bryant Moscon - bmoscon@gmail.com 9 | * 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy 11 | * of this software and associated documentation files (the "Software"), to 12 | * deal in the Software without restriction, including without limitation the 13 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 14 | * sell copies of the Software, and to permit persons to whom the Software is 15 | * furnished to do so, subject to the following conditions: 16 | * 17 | * 1. Redistributions of source code must retain the above copyright notice, 18 | * this list of conditions, and the following disclaimer. 19 | * 20 | * 2. Redistributions in binary form must reproduce the above copyright notice, 21 | * this list of conditions and the following disclaimer in the documentation 22 | * and/or other materials provided with the distribution, and in the same 23 | * place and form as other copyright, 24 | * license and disclaimer information. 25 | * 26 | * 3. The end-user documentation included with the redistribution, if any, must 27 | * include the following acknowledgment: "This product includes software 28 | * developed by Bryant Moscon (http://www.bryantmoscon.com/)", in the same 29 | * place and form as other third-party acknowledgments. Alternately, this 30 | * acknowledgment may appear in the software itself, in the same form and 31 | * location as other such third-party acknowledgments. 32 | * 33 | * 4. 
Except as contained in this notice, the name of the author, Bryant Moscon, 34 | * shall not be used in advertising or otherwise to promote the sale, use or 35 | * other dealings in this Software without prior written authorization from 36 | * the author. 37 | * 38 | * 39 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 40 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 41 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 42 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 43 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 44 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 45 | * THE SOFTWARE. 46 | * 47 | */ 48 | 49 | 50 | #include 51 | #include 52 | 53 | #include "bloom.hpp" 54 | #include "counting_bloom.hpp" 55 | #include "spectral_bloom.hpp" 56 | #include "MurmurHash3.hpp" 57 | 58 | 59 | uint16_t hash(const std::string &s) { 60 | uint32_t ret; 61 | uint16_t rc; 62 | MurmurHash3_x86_32(s.c_str(), s.length(), 5, &ret); 63 | 64 | memcpy(&rc, &ret, sizeof(uint16_t)); 65 | 66 | return rc; 67 | } 68 | 69 | int main () { 70 | std::vector::hash_function> list(1); 71 | std::vector::hash_function> list2(1); 72 | 73 | list[0] = hash; 74 | list2[0] = hash; 75 | 76 | BloomFilter a(list); 77 | std::string b = "abc"; 78 | 79 | std::cout << "Testing Empty Bloom Filter for string \"" << b << "\"\n"; 80 | std::cout << a.exists(b) << std::endl << std::endl; 81 | 82 | 83 | std::cout << "Adding string \"" << b << "\" to Bloom Filter and testing for existence\n"; 84 | a.add(b); 85 | 86 | std::cout << a.exists(b) << std::endl << std::endl; 87 | 88 | 89 | 90 | CountingBloomFilter count_b(list2, 4); 91 | 92 | count_b.add(b); 93 | count_b.add(b); 94 | count_b.remove(b); 95 | std::cout << "Adding string \"" << b << "\" to Counting Bloom Filter twice, and removing " 96 | << "once, then testing for existence\n"; 97 
| std::cout << count_b.exists(b) << std::endl << std::endl; 98 | 99 | 100 | std::cout << "Adding string \"" << b << "\" to Spectral Bloom Filter three times and " 101 | << "removing once, then getting occurrences\n"; 102 | SpectralBloomFilter spectral_b(list2, 4); 103 | 104 | spectral_b.add(b); 105 | spectral_b.add(b); 106 | spectral_b.add(b); 107 | spectral_b.remove(b); 108 | 109 | std::cout << spectral_b.occurrences(b) << std::endl << std::endl; 110 | 111 | SpectralBloomFilter spectral_b2(count_b); 112 | std::cout << "Copying previous Counting Bloom Filter into new Spectral Bloom Filter and " 113 | << "testing existence\n"; 114 | 115 | std::cout << spectral_b2.exists(b) << std::endl << std::endl; 116 | 117 | return 0; 118 | } 119 | -------------------------------------------------------------------------------- /BloomFilter/c++/counting_bloom.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * counting_bloom.hpp 3 | * 4 | * 5 | * Counting Bloom Filter Implementation 6 | * 7 | * 8 | * Copyright (C) 2012 Bryant Moscon - bmoscon@gmail.com 9 | * 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy 11 | * of this software and associated documentation files (the "Software"), to 12 | * deal in the Software without restriction, including without limitation the 13 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 14 | * sell copies of the Software, and to permit persons to whom the Software is 15 | * furnished to do so, subject to the following conditions: 16 | * 17 | * 1. Redistributions of source code must retain the above copyright notice, 18 | * this list of conditions, and the following disclaimer. 19 | * 20 | * 2. 
Redistributions in binary form must reproduce the above copyright notice, 21 | * this list of conditions and the following disclaimer in the documentation 22 | * and/or other materials provided with the distribution, and in the same 23 | * place and form as other copyright, 24 | * license and disclaimer information. 25 | * 26 | * 3. The end-user documentation included with the redistribution, if any, must 27 | * include the following acknowledgment: "This product includes software 28 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 29 | * place and form as other third-party acknowledgments. Alternately, this 30 | * acknowledgment may appear in the software itself, in the same form and 31 | * location as other such third-party acknowledgments. 32 | * 33 | * 4. Except as contained in this notice, the name of the author, Bryant Moscon, 34 | * shall not be used in advertising or otherwise to promote the sale, use or 35 | * other dealings in this Software without prior written authorization from 36 | * the author. 37 | * 38 | * 39 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 40 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 41 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 42 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 43 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 44 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 45 | * THE SOFTWARE. 
46 | * 47 | */ 48 | 49 | #ifndef __COUNTING_BLOOM_FILTER__ 50 | #define __COUNTING_BLOOM_FILTER__ 51 | 52 | #include 53 | #include 54 | 55 | #include "bloom_.hpp" 56 | #include "bloom_array.hpp" 57 | 58 | 59 | template 60 | class CountingBloomFilter : Bloom { 61 | public: 62 | typedef S (*hash_function)(const T &s); 63 | 64 | 65 | CountingBloomFilter(const std::vector &hash_list, uint32_t bits) : 66 | bloom_array_(std::numeric_limits::max(), bits), 67 | hash_list_(hash_list) {} 68 | 69 | CountingBloomFilter(const CountingBloomFilter &s) : 70 | bloom_array_(s.bloom_array_), 71 | hash_list_(s.hash_list_) {} 72 | 73 | virtual void add(const T &s) { 74 | for (uint32_t i = 0; i < hash_list_.size(); ++i) { 75 | bloom_array_.inc((*hash_list_[i])(s)); 76 | } 77 | } 78 | 79 | virtual void remove(const T &s) { 80 | for (uint32_t i = 0; i < hash_list_.size(); ++i) { 81 | bloom_array_.dec((*hash_list_[i])(s)); 82 | } 83 | } 84 | 85 | 86 | virtual bool exists(const T &s) const { 87 | for (uint32_t i = 0; i < hash_list_.size(); ++i) { 88 | if (!bloom_array_.at((*hash_list_[i])(s))) { 89 | return (false); 90 | } 91 | } 92 | 93 | return (true); 94 | } 95 | 96 | protected: 97 | 98 | BloomArray<> bloom_array_; 99 | std::vector hash_list_; 100 | }; 101 | 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /BloomFilter/c++/spectral_bloom.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * spectral_bloom.hpp 3 | * 4 | * 5 | * Spectral Bloom Filter Implementation 6 | * 7 | * 8 | * Copyright (C) 2012 Bryant Moscon - bmoscon@gmail.com 9 | * 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy 11 | * of this software and associated documentation files (the "Software"), to 12 | * deal in the Software without restriction, including without limitation the 13 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 14 | * sell copies of the 
Software, and to permit persons to whom the Software is 15 | * furnished to do so, subject to the following conditions: 16 | * 17 | * 1. Redistributions of source code must retain the above copyright notice, 18 | * this list of conditions, and the following disclaimer. 19 | * 20 | * 2. Redistributions in binary form must reproduce the above copyright notice, 21 | * this list of conditions and the following disclaimer in the documentation 22 | * and/or other materials provided with the distribution, and in the same 23 | * place and form as other copyright, 24 | * license and disclaimer information. 25 | * 26 | * 3. The end-user documentation included with the redistribution, if any, must 27 | * include the following acknowledgment: "This product includes software 28 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 29 | * place and form as other third-party acknowledgments. Alternately, this 30 | * acknowledgment may appear in the software itself, in the same form and 31 | * location as other such third-party acknowledgments. 32 | * 33 | * 4. Except as contained in this notice, the name of the author, Bryant Moscon, 34 | * shall not be used in advertising or otherwise to promote the sale, use or 35 | * other dealings in this Software without prior written authorization from 36 | * the author. 37 | * 38 | * 39 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 40 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 41 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 42 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 43 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 44 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 45 | * THE SOFTWARE. 
46 | * 47 | */ 48 | 49 | #ifndef __SPECTRAL_BLOOM_FILTER__ 50 | #define __SPECTRAL_BLOOM_FILTER__ 51 | 52 | #include 53 | #include 54 | 55 | #include "counting_bloom.hpp" 56 | 57 | 58 | template 59 | class SpectralBloomFilter : public CountingBloomFilter { 60 | public: 61 | typedef S (*hash_function)(const T &s); 62 | 63 | 64 | SpectralBloomFilter(const std::vector &hash_list, uint32_t bits) : 65 | CountingBloomFilter(hash_list, bits) {} 66 | 67 | SpectralBloomFilter(const CountingBloomFilter &s) : 68 | CountingBloomFilter(s) {} 69 | 70 | 71 | uint64_t occurrences(const T &s) const { 72 | 73 | uint64_t return_val = std::numeric_limits::max(); 74 | uint64_t compare_val; 75 | 76 | for (uint32_t i = 0; i < parent::hash_list_.size(); ++i) { 77 | compare_val = parent::bloom_array_.at((*parent::hash_list_[i])(s)); 78 | if (compare_val < return_val) { 79 | return_val = compare_val; 80 | } 81 | } 82 | 83 | return (return_val); 84 | } 85 | 86 | private: 87 | typedef CountingBloomFilter parent; 88 | 89 | }; 90 | 91 | 92 | #endif 93 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | November 15, 2016: 2 | * Save and Load methods added to StreamSummary in Python. These methods allow the user to save the state of the StreamSummary 3 | object to JSON, and to load the state from JSON 4 | 5 | November 17, 2015: 6 | * New python implementation of StreamSummary. Significantly more efficient and a lot cleaner (analogous to C++ version). 7 | * New unit pytests for StreamSummary 8 | 9 | September 14, 2015: 10 | * Moved code from GoStream repo to this repo. 
No reason to have a separate 11 | repo for a single language, when this already includes multiple languages 12 | 13 | September 28, 2013: 14 | * fixed various bugs in StreamSummary, added destructor, etc 15 | * ported stream summary to python 16 | 17 | September 2, 2013: 18 | * Fixed bug in bucket logic that was causing an assert 19 | * Checking return value of all insertions now 20 | 21 | March 30, 2013: 22 | * StreamSummary 23 | - Added unordered_map to Bucket class to avoid O(n) search for object to remove from bucket. 24 | This uses more memory, but it's probably an ok tradeoff for speed. 25 | 26 | * BloomFilter 27 | - Added two new methods and a new constructor. No longer necessary to pass a vector 28 | of function pointers to constructor; use setHash to pass a fp vector, or use 29 | addHash to add individual fps to the list 30 | 31 | ChangeLog started March 30, 2013 32 | -------------------------------------------------------------------------------- /CountMin/count_min_sketch.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * count_min_sketch.hpp 3 | * 4 | * 5 | * Count-Min Sketch Implementation 6 | * 7 | * 8 | * Copyright (C) 2012 Bryant Moscon - bmoscon@gmail.com 9 | * 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy 11 | * of this software and associated documentation files (the "Software"), to 12 | * deal in the Software without restriction, including without limitation the 13 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 14 | * sell copies of the Software, and to permit persons to whom the Software is 15 | * furnished to do so, subject to the following conditions: 16 | * 17 | * 1. Redistributions of source code must retain the above copyright notice, 18 | * this list of conditions, and the following disclaimer. 19 | * 20 | * 2. 
Redistributions in binary form must reproduce the above copyright notice, 21 | * this list of conditions and the following disclaimer in the documentation 22 | * and/or other materials provided with the distribution, and in the same 23 | * place and form as other copyright, 24 | * license and disclaimer information. 25 | * 26 | * 3. The end-user documentation included with the redistribution, if any, must 27 | * include the following acknowledgment: "This product includes software 28 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 29 | * place and form as other third-party acknowledgments. Alternately, this 30 | * acknowledgment may appear in the software itself, in the same form and 31 | * location as other such third-party acknowledgments. 32 | * 33 | * 4. Except as contained in this notice, the name of the author, Bryant Moscon, 34 | * shall not be used in advertising or otherwise to promote the sale, use or 35 | * other dealings in this Software without prior written authorization from 36 | * the author. 37 | * 38 | * 39 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 40 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 41 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 42 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 43 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 44 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 45 | * THE SOFTWARE. 
46 | * 47 | */ 48 | 49 | #ifndef __COUNT_MIN_SKETCH__ 50 | #define __COUNT_MIN_SKETCH__ 51 | 52 | #include 53 | #include 54 | #include 55 | 56 | template 57 | class CountMinSketch { 58 | public: 59 | typedef S (*hash_function)(const T &s); 60 | 61 | 62 | CountMinSketch(const std::vector &hash_list) : 63 | matrix_(hash_list.size(), std::vector(std::numeric_limits::max(), 0)), 64 | hash_list_(hash_list) {} 65 | 66 | void add(const T &s) { 67 | for (uint32_t i = 0; i < hash_list_.size(); ++i) { 68 | ++matrix_[i][(*hash_list_[i])(s)]; 69 | } 70 | } 71 | 72 | bool exists(const T &s) const { 73 | for (uint32_t i = 0; i < hash_list_.size(); ++i) { 74 | if (!matrix_[i][(*hash_list_[i])(s)]) { 75 | return (false); 76 | } 77 | } 78 | 79 | return (true); 80 | } 81 | 82 | private: 83 | 84 | std::vector > matrix_; 85 | std::vector hash_list_; 86 | }; 87 | 88 | 89 | 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /CountMin/count_min_sketch_example.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * count_min_sketch_example.cpp 3 | * 4 | * 5 | * Count Min Sketch - Example Usage 6 | * 7 | * 8 | * Copyright (C) 2012 Bryant Moscon - bmoscon@gmail.com 9 | * 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy 11 | * of this software and associated documentation files (the "Software"), to 12 | * deal in the Software without restriction, including without limitation the 13 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 14 | * sell copies of the Software, and to permit persons to whom the Software is 15 | * furnished to do so, subject to the following conditions: 16 | * 17 | * 1. Redistributions of source code must retain the above copyright notice, 18 | * this list of conditions, and the following disclaimer. 19 | * 20 | * 2. 
Redistributions in binary form must reproduce the above copyright notice, 21 | * this list of conditions and the following disclaimer in the documentation 22 | * and/or other materials provided with the distribution, and in the same 23 | * place and form as other copyright, 24 | * license and disclaimer information. 25 | * 26 | * 3. The end-user documentation included with the redistribution, if any, must 27 | * include the following acknowledgment: "This product includes software 28 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 29 | * place and form as other third-party acknowledgments. Alternately, this 30 | * acknowledgment may appear in the software itself, in the same form and 31 | * location as other such third-party acknowledgments. 32 | * 33 | * 4. Except as contained in this notice, the name of the author, Bryant Moscon, 34 | * shall not be used in advertising or otherwise to promote the sale, use or 35 | * other dealings in this Software without prior written authorization from 36 | * the author. 37 | * 38 | * 39 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 40 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 41 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 42 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 43 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 44 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 45 | * THE SOFTWARE. 
46 | * 47 | */ 48 | 49 | 50 | #include 51 | #include 52 | 53 | #include "count_min_sketch.hpp" 54 | #include "../hash/MurmurHash3.hpp" 55 | #include "../hash/checksum.hpp" 56 | 57 | 58 | uint16_t hash(const std::string &s) { 59 | uint32_t ret; 60 | uint16_t rc; 61 | MurmurHash3_x86_32(s.c_str(), s.length(), 5, &ret); 62 | 63 | memcpy(&rc, &ret, sizeof(uint16_t)); 64 | 65 | return rc; 66 | } 67 | 68 | int main () { 69 | 70 | std::vector::hash_function> list(2); 71 | 72 | list[0] = hash; 73 | list[1] = checksum_16; 74 | 75 | CountMinSketch a(list); 76 | std::string b = "abc"; 77 | 78 | std::cout << a.exists(b) << std::endl; 79 | 80 | a.add(b); 81 | 82 | std::cout << a.exists(b) << std::endl; 83 | 84 | 85 | return 0; 86 | } 87 | -------------------------------------------------------------------------------- /KPS/kps.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * kps.hpp 3 | * 4 | * 5 | * Karp-Papadimitriou-Shenker Algorithm Implementation 6 | * 7 | * as introduced in "A Simple Algorithm for Finding Frequent Elements in Streams and Bags" 8 | * by Karp, Papadimitriou, and Shenker 9 | * 10 | * 11 | * Copyright (C) 2013 Bryant Moscon - bmoscon@gmail.com 12 | * 13 | * Permission is hereby granted, free of charge, to any person obtaining a copy 14 | * of this software and associated documentation files (the "Software"), to 15 | * deal in the Software without restriction, including without limitation the 16 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 17 | * sell copies of the Software, and to permit persons to whom the Software is 18 | * furnished to do so, subject to the following conditions: 19 | * 20 | * 1. Redistributions of source code must retain the above copyright notice, 21 | * this list of conditions, and the following disclaimer. 22 | * 23 | * 2. 
Redistributions in binary form must reproduce the above copyright notice, 24 | * this list of conditions and the following disclaimer in the documentation 25 | * and/or other materials provided with the distribution, and in the same 26 | * place and form as other copyright, license and disclaimer information. 27 | * 28 | * 3. The end-user documentation included with the redistribution, if any, must 29 | * include the following acknowledgment: "This product includes software 30 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 31 | * place and form as other third-party acknowledgments. Alternately, this 32 | * acknowledgment may appear in the software itself, in the same form and 33 | * location as other such third-party acknowledgments. 34 | * 35 | * 4. Except as contained in this notice, the name of the author, Bryant Moscon, 36 | * shall not be used in advertising or otherwise to promote the sale, use or 37 | * other dealings in this Software without prior written authorization from 38 | * the author. 39 | * 40 | * 41 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 42 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 43 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 44 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 45 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 46 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 47 | * THE SOFTWARE. 
/*
 * kps.hpp
 *
 * Karp-Papadimitriou-Shenker Algorithm Implementation
 */

#ifndef __KPS__
#define __KPS__

#include <stdint.h>

#include <cstddef>
#include <unordered_map>
#include <vector>

/*
 * Frequent-element finder over a stream of items of type S.
 *
 * The table is allowed to hold at most 1/theta distinct items; whenever an
 * insertion pushes it past that, every counter is decremented and items whose
 * counter reaches zero are dropped.
 */
template <class S>
class KPS {
public:

    /*
     * theta - frequency threshold; the table capacity is 1/theta.
     * NOTE: theta is not validated. theta == 0 gives an unbounded table and a
     * negative theta makes every insertion trigger the delete phase.
     */
    KPS(const double theta) : theta_(1.0 / theta) {}

    /* Feed one item into the summary. */
    void add(const S &x)
    {
        // insert or increment phase: a missing key is value-initialized to 0
        // by operator[], so this yields 1 for a new item
        ++k_[x];

        // delete phase
        if (static_cast<double>(k_.size()) > theta_) {
            typename std::unordered_map<S, uint64_t>::iterator entry = k_.begin();

            while (entry != k_.end()) {
                if (--(entry->second) == 0) {
                    // erase returns the iterator following the removed entry
                    entry = k_.erase(entry);
                } else {
                    ++entry;
                }
            }
        }
    }

    /* Feed every element of x, in order. */
    void add(const std::vector<S> &x)
    {
        for (typename std::vector<S>::const_iterator item = x.begin(); item != x.end(); ++item) {
            add(*item);
        }
    }

    /* True if x currently has a counter in the table. */
    bool exists(const S &x) const
    {
        return (k_.find(x) != k_.end());
    }

    /*
     * Items currently held in the table, in unspecified order.
     * Returned by value; the copy is elided / moved by the compiler.
     */
    std::vector<S> report() const
    {
        std::vector<S> ret;
        ret.reserve(k_.size());

        typename std::unordered_map<S, uint64_t>::const_iterator it;
        for (it = k_.begin(); it != k_.end(); ++it) {
            ret.push_back(it->first);
        }

        return (ret);
    }

private:
    std::unordered_map<S, uint64_t> k_;
    double theta_;   // stores 1/theta, i.e. the table capacity
};


#endif
the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | 1. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions, and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution, and in the same 16 | place and form as other copyright, 17 | license and disclaimer information. 18 | 19 | 3. The end-user documentation included with the redistribution, if any, must 20 | include the following acknowledgment: "This product includes software 21 | developed by Bryant Moscon (http://www.bryantmoscon.com/)", in the same 22 | place and form as other third-party acknowledgments. Alternately, this 23 | acknowledgment may appear in the software itself, in the same form and 24 | location as other such third-party acknowledgments. 25 | 26 | 4. Except as contained in this notice, the name of the author, Bryant Moscon, 27 | shall not be used in advertising or otherwise to promote the sale, use or 28 | other dealings in this Software without prior written authorization from 29 | the author. 30 | 31 | 32 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 33 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 34 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 35 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 36 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 37 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 38 | THE SOFTWARE. 
39 | 40 | -------------------------------------------------------------------------------- /MisraGries/misra_gries.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * misra_gries.hpp 3 | * 4 | * 5 | * Misra-Gries Algorithm Implementation 6 | * 7 | * as introduced in "Finding Repeated Elements" 8 | * by J. Misra and D. Gries 9 | * 10 | * 11 | * 12 | * Copyright (C) 2013 Bryant Moscon - bmoscon@gmail.com 13 | * 14 | * Permission is hereby granted, free of charge, to any person obtaining a copy 15 | * of this software and associated documentation files (the "Software"), to 16 | * deal in the Software without restriction, including without limitation the 17 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 18 | * sell copies of the Software, and to permit persons to whom the Software is 19 | * furnished to do so, subject to the following conditions: 20 | * 21 | * 1. Redistributions of source code must retain the above copyright notice, 22 | * this list of conditions, and the following disclaimer. 23 | * 24 | * 2. Redistributions in binary form must reproduce the above copyright notice, 25 | * this list of conditions and the following disclaimer in the documentation 26 | * and/or other materials provided with the distribution, and in the same 27 | * place and form as other copyright, license and disclaimer information. 28 | * 29 | * 3. The end-user documentation included with the redistribution, if any, must 30 | * include the following acknowledgment: "This product includes software 31 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 32 | * place and form as other third-party acknowledgments. Alternately, this 33 | * acknowledgment may appear in the software itself, in the same form and 34 | * location as other such third-party acknowledgments. 35 | * 36 | * 4. 
/*
 * misra_gries.hpp
 *
 * Misra-Gries Algorithm Implementation
 */

#ifndef __MISRA_GRIES__
#define __MISRA_GRIES__

#include <stdint.h>

#include <cstddef>
#include <iostream>
#include <unordered_map>
#include <utility>
#include <vector>


/*
 * Misra-Gries summary over a stream of items of type T, tracking at most
 * `size` distinct items at a time.
 */
template <class T>
class MG {
public:

    /* size - maximum number of counters kept in the table. */
    MG(uint64_t size) : k_(size) {}

    /*
     * Feed one item:
     *  - already tracked    -> its counter is incremented
     *  - untracked, room    -> it is added with counter 1
     *  - untracked, no room -> every counter is decremented and entries that
     *                          reach zero are removed (obj itself is dropped)
     */
    void add(const T &obj)
    {
        typename std::unordered_map<T, uint64_t>::iterator it = table_.find(obj);

        if (it != table_.end()) {
            ++(it->second);
            return;
        }

        if (table_.size() < k_) {
            // find() above established that obj is not present
            table_.insert(std::make_pair(obj, static_cast<uint64_t>(1)));
            return;
        }

        it = table_.begin();
        while (it != table_.end()) {
            if (--(it->second) == 0) {
                // erase returns the iterator following the removed entry
                it = table_.erase(it);
            } else {
                ++it;
            }
        }
    }

    /* Feed every element of vec, in order. */
    void add(const std::vector<T> &vec)
    {
        for (std::size_t i = 0; i < vec.size(); ++i) {
            add(vec[i]);
        }
    }


    /*
     * Returns the tracked item with the largest counter, paired with that
     * counter. An empty table yields (T(), 0). Ties are resolved by the
     * table's (unspecified) iteration order.
     */
    std::pair<T, uint64_t> getMajorityItem() const
    {
        std::pair<T, uint64_t> best(T(), 0);

        typename std::unordered_map<T, uint64_t>::const_iterator it;
        for (it = table_.begin(); it != table_.end(); ++it) {
            if (it->second > best.second) {
                best.first = it->first;
                best.second = it->second;
            }
        }

        return (best);
    }


    /* Writes every "item:count, " pair followed by a newline to std::cout. */
    void print() const
    {
        typename std::unordered_map<T, uint64_t>::const_iterator it;

        for (it = table_.begin(); it != table_.end(); ++it) {
            std::cout << it->first << ":" << it->second << ", ";
        }
        std::cout << std::endl;
    }


private:
    std::unordered_map<T, uint64_t> table_;
    uint64_t k_;

};


#endif
to the following conditions: 16 | * 17 | * 1. Redistributions of source code must retain the above copyright notice, 18 | * this list of conditions, and the following disclaimer. 19 | * 20 | * 2. Redistributions in binary form must reproduce the above copyright notice, 21 | * this list of conditions and the following disclaimer in the documentation 22 | * and/or other materials provided with the distribution, and in the same 23 | * place and form as other copyright, license and disclaimer information. 24 | * 25 | * 3. The end-user documentation included with the redistribution, if any, must 26 | * include the following acknowledgment: "This product includes software 27 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 28 | * place and form as other third-party acknowledgments. Alternately, this 29 | * acknowledgment may appear in the software itself, in the same form and 30 | * location as other such third-party acknowledgments. 31 | * 32 | * 4. Except as contained in this notice, the name of the author, Bryant Moscon, 33 | * shall not be used in advertising or otherwise to promote the sale, use or 34 | * other dealings in this Software without prior written authorization from 35 | * the author. 36 | * 37 | * 38 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 39 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 40 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 41 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 42 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 43 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 44 | * THE SOFTWARE. 45 | * 46 | */ 47 | 48 | 49 | 50 | //TODO: This is just a "placeholder" for the time being. Eventual plan: 51 | // Load a test file that contains an input stream, variables passed to the MG class 52 | // and correct output. 
Program will test output and bail if it fails. 53 | // First the MG implementation must be completed, then this will be. 54 | 55 | #include 56 | 57 | #include "misra_gries.hpp" 58 | 59 | 60 | int main() 61 | { 62 | MG mg(2); 63 | 64 | mg.add(2); 65 | mg.add(1); 66 | mg.add(2); 67 | mg.add(2); 68 | mg.add(1); 69 | mg.add(5); 70 | mg.add(7); 71 | mg.add(9); 72 | mg.add(10); 73 | mg.add(11); 74 | mg.add(13); 75 | 76 | 77 | mg.print(); 78 | 79 | 80 | return (0); 81 | } 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [Streaming Algorithms](https://en.wikipedia.org/wiki/Streaming_algorithm) 2 | 3 | [![License](https://img.shields.io/badge/license-XFree86-blue.svg)](LICENSE) 4 | 5 | 6 | A Set of Streaming Algorithms. Types include: 7 | 8 | * Bloom Filters 9 | * Basic 10 | * Counting 11 | * Spectral 12 | 13 | * Count-Min Sketch 14 | 15 | * Karp-Papadimitriou-Shenker 16 | 17 | * Misra-Gries 18 | 19 | * Space Saving/Stream Summary 20 | 21 | 22 | 23 | Majority are in C++ (one is in python and Go) and plans are in place to port all to Python, Ruby, Java, Scala and Go. 24 | 25 | 26 | The C++ implementations use templated classes, and are single header files. To use, simply include the header file - no make files or anything similar. 27 | -------------------------------------------------------------------------------- /StreamSummary/c++/stream_summary.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * stream_summary.hpp 3 | * 4 | * 5 | * Stream Summary Algorithm Implementation 6 | * 7 | * as introduced in "Efficient Computation of Frequent and Top-k Elements in Data Streams" 8 | * by A. Metwally, D. Agrawal, and E. 
Abbadi 9 | * 10 | * 11 | * Copyright (C) 2013-2015 Bryant Moscon - bmoscon@gmail.com 12 | * 13 | * Permission is hereby granted, free of charge, to any person obtaining a copy 14 | * of this software and associated documentation files (the "Software"), to 15 | * deal in the Software without restriction, including without limitation the 16 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 17 | * sell copies of the Software, and to permit persons to whom the Software is 18 | * furnished to do so, subject to the following conditions: 19 | * 20 | * 1. Redistributions of source code must retain the above copyright notice, 21 | * this list of conditions, and the following disclaimer. 22 | * 23 | * 2. Redistributions in binary form must reproduce the above copyright notice, 24 | * this list of conditions and the following disclaimer in the documentation 25 | * and/or other materials provided with the distribution, and in the same 26 | * place and form as other copyright, license and disclaimer information. 27 | * 28 | * 3. The end-user documentation included with the redistribution, if any, must 29 | * include the following acknowledgment: "This product includes software 30 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 31 | * place and form as other third-party acknowledgments. Alternately, this 32 | * acknowledgment may appear in the software itself, in the same form and 33 | * location as other such third-party acknowledgments. 34 | * 35 | * 4. Except as contained in this notice, the name of the author, Bryant Moscon, 36 | * shall not be used in advertising or otherwise to promote the sale, use or 37 | * other dealings in this Software without prior written authorization from 38 | * the author. 
/*
 * stream_summary.hpp
 *
 * Stream Summary Algorithm Implementation
 */

#ifndef __STREAM_SUMMARY__
#define __STREAM_SUMMARY__


#include <stdint.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <list>
#include <map>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>


/*
 * All items that currently share one counter value, in insertion order.
 */
template <class T>
class Bucket {
public:
    Bucket(const uint64_t &v = 1) : value_(v) {}

    /* Appends obj to the bucket. */
    void insert(const T &obj)
    {
        list_.push_back(obj);
    }

    /*
     * Removes the first element equal to obj. obj is expected to be present;
     * a miss trips the assert in debug builds and is ignored otherwise.
     */
    void remove(const T &obj)
    {
        typename std::list<T>::iterator it = std::find(list_.begin(), list_.end(), obj);

        assert(it != list_.end());
        if (it != list_.end()) {
            list_.erase(it);
        }
    }

    /* Counter value shared by every item in this bucket. */
    uint64_t value() const
    {
        return (value_);
    }

    size_t size() const
    {
        return (list_.size());
    }

    /* Oldest item in the bucket (returned by value). Bucket must not be empty. */
    T min() const
    {
        assert(list_.size() > 0);
        return (list_.front());
    }

    template <class U>
    friend std::ostream& operator<<(std::ostream& os, const Bucket<U>& b);

private:
    uint64_t value_;
    std::list<T> list_;
};


/*
 * Space-Saving / Stream-Summary structure tracking at most `size` items.
 *
 * value_map owns the buckets, keyed and ordered by counter value, so its
 * first entry is always the bucket with the smallest count. bucket_map is a
 * non-owning index from each tracked item to the bucket that holds it.
 *
 * The class is move-only: the owning unique_ptr members make it non-copyable,
 * which prevents two summaries from freeing the same buckets.
 */
template <class T>
class StreamSummary
{
public:
    StreamSummary(const uint64_t &size) : max_size(size) {}

    /*
     * Feed one item:
     *  - tracked            -> moved to the bucket with count + 1
     *  - untracked, room    -> added with count 1
     *  - untracked, no room -> replaces the oldest item of the lowest-count
     *                          bucket and takes that count + 1
     * A summary created with size 0 tracks nothing and ignores every add.
     */
    void add(const T &obj)
    {
        if (max_size == 0) {
            // nothing can be tracked and there is no bucket to evict from
            return;
        }

        const bm_it it = bucket_map.find(obj);

        if (it != bucket_map.end()) {
            increment(it);
        } else if (bucket_map.size() < max_size) {
            insert(obj);
        } else {
            replace_and_insert(obj);
        }
    }

    /* True if obj is currently tracked. */
    bool exists(const T &obj) const
    {
        return (bucket_map.find(obj) != bucket_map.end());
    }

    /*
     * Tracked items, in unspecified order. Returned by value; the copy is
     * elided / moved by the compiler.
     */
    std::vector<T> to_vector() const
    {
        std::vector<T> ret;
        ret.reserve(bucket_map.size());

        typename bucket_index::const_iterator it;
        for (it = bucket_map.begin(); it != bucket_map.end(); ++it) {
            ret.push_back(it->first);
        }

        return (ret);
    }

    /* Forgets every tracked item and releases all buckets. */
    void clear ()
    {
        // drop the non-owning index first so it never refers to freed buckets
        bucket_map.clear();
        value_map.clear();
    }

    template <class U>
    friend std::ostream& operator<<(std::ostream& os, const StreamSummary<U>& s);

private:
    typedef std::unordered_map<T, Bucket<T> *> bucket_index;
    typedef std::map<uint64_t, std::unique_ptr<Bucket<T> > > value_index;
    typedef typename bucket_index::iterator bm_it;
    typedef typename value_index::iterator v_it;


    // Adds obj to the bucket for `count`, creating the bucket if needed, and
    // returns that bucket.
    Bucket<T> *attach(const T &obj, const uint64_t count)
    {
        v_it slot = value_map.find(count);

        if (slot == value_map.end()) {
            std::unique_ptr<Bucket<T> > fresh(new Bucket<T>(count));
            slot = value_map.insert(std::make_pair(count, std::move(fresh))).first;
        }

        slot->second->insert(obj);
        return (slot->second.get());
    }

    // Removes obj from bucket and destroys the bucket if that empties it.
    // `bucket` must not be used by the caller afterwards.
    void detach(const T &obj, Bucket<T> *bucket)
    {
        bucket->remove(obj);

        if (bucket->size() == 0) {
            value_map.erase(bucket->value());
        }
    }

    // Moves an already tracked item from its bucket to the bucket for count + 1.
    void increment(const bm_it it)
    {
        // read the count before detach(), which may destroy the old bucket
        const uint64_t next = it->second->value() + 1;

        detach(it->first, it->second);
        it->second = attach(it->first, next);
    }

    // Starts tracking obj with count 1. Caller guarantees obj is untracked.
    void insert(const T &obj)
    {
        Bucket<T> *home = attach(obj, 1);
        bucket_map.insert(std::make_pair(obj, home));
    }

    // Evicts the oldest item of the lowest-count bucket and tracks obj with
    // that count + 1. Caller guarantees at least one item is tracked.
    void replace_and_insert(const T &obj)
    {
        Bucket<T> *lowest = value_map.begin()->second.get();
        const uint64_t next = lowest->value() + 1;
        // copy: the list node holding the item is destroyed by detach()
        const T evicted = lowest->min();

        detach(evicted, lowest);
        bucket_map.erase(evicted);

        Bucket<T> *home = attach(obj, next);
        bucket_map.insert(std::make_pair(obj, home));
    }


    uint64_t max_size;
    bucket_index bucket_map;
    value_index value_map;
};


/* Writes "<count>-> item item ... " (each item followed by a space). */
template <class T>
std::ostream& operator<<(std::ostream& os, const Bucket<T>& b)
{
    os << b.value() << "-> ";

    typename std::list<T>::const_iterator it;
    for (it = b.list_.begin(); it != b.list_.end(); ++it) {
        os << *it << " ";
    }

    return (os);
}


/* Writes every bucket in ascending count order, each followed by a blank line. */
template <class T>
std::ostream& operator<<(std::ostream& os, const StreamSummary<T>& s)
{
    typename std::map<uint64_t, std::unique_ptr<Bucket<T> > >::const_iterator it;
    for (it = s.value_map.begin(); it != s.value_map.end(); ++it) {
        os << *(it->second);
        os << std::endl << std::endl;
    }

    return (os);
}


#endif
obtaining a copy 11 | * of this software and associated documentation files (the "Software"), to 12 | * deal in the Software without restriction, including without limitation the 13 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 14 | * sell copies of the Software, and to permit persons to whom the Software is 15 | * furnished to do so, subject to the following conditions: 16 | * 17 | * 1. Redistributions of source code must retain the above copyright notice, 18 | * this list of conditions, and the following disclaimer. 19 | * 20 | * 2. Redistributions in binary form must reproduce the above copyright notice, 21 | * this list of conditions and the following disclaimer in the documentation 22 | * and/or other materials provided with the distribution, and in the same 23 | * place and form as other copyright, license and disclaimer information. 24 | * 25 | * 3. The end-user documentation included with the redistribution, if any, must 26 | * include the following acknowledgment: "This product includes software 27 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 28 | * place and form as other third-party acknowledgments. Alternately, this 29 | * acknowledgment may appear in the software itself, in the same form and 30 | * location as other such third-party acknowledgments. 31 | * 32 | * 4. Except as contained in this notice, the name of the author, Bryant Moscon, 33 | * shall not be used in advertising or otherwise to promote the sale, use or 34 | * other dealings in this Software without prior written authorization from 35 | * the author. 36 | * 37 | * 38 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 39 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 40 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 41 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 42 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 43 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 44 | * THE SOFTWARE. 45 | */ 46 | 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | 55 | #include "../stream_summary.hpp" 56 | 57 | 58 | std::string read_file(const char* file) 59 | { 60 | std::ifstream file_h(file); 61 | std::string data; 62 | 63 | if (file_h.is_open()) { 64 | file_h.seekg(0, std::ios::end); 65 | data.resize(file_h.tellg()); 66 | file_h.seekg(0, std::ios::beg); 67 | file_h.read(&data[0], data.size()); 68 | file_h.close(); 69 | } else { 70 | perror("error"); 71 | exit(1); 72 | } 73 | 74 | return(data); 75 | } 76 | 77 | 78 | int main(int argc, char* argv[]) 79 | { 80 | std::string line; 81 | std::string data; 82 | 83 | if (argc != 3) { 84 | std::cout << "usage: test_ss " << std::endl; 85 | exit(1); 86 | } 87 | 88 | StreamSummary a(atoi(argv[1])); 89 | 90 | data = read_file(argv[2]); 91 | std::stringstream stream(data); 92 | 93 | 94 | while (std::getline(stream, line)) { 95 | if (!line.size()) { 96 | // blank lines (i.e. 
lines with only a carriage return) will have line with len 0 97 | continue; 98 | } 99 | if (line[0] != '#') { 100 | a.add(line); 101 | } 102 | } 103 | std::cout << std::endl; 104 | 105 | std::cout << "printing object:\n"; 106 | std::cout << a; 107 | 108 | std::cout << "top object vector:\n"; 109 | std::vector v = a.to_vector(); 110 | 111 | for (unsigned int i = 0; i < v.size(); ++i) { 112 | std::cout<< v[i] << " "; 113 | } 114 | std::cout << std::endl; 115 | 116 | return (0); 117 | } 118 | -------------------------------------------------------------------------------- /StreamSummary/c++/test/test.dat: -------------------------------------------------------------------------------- 1 | # Simple test data 2 | # 3 | # run like so: 4 | # ss_test 5 | # 6 | # output is in the format: 7 | # 8 | # ... 9 | # 10 | # 11 | # ... 12 | # ... 13 | # etc 14 | 15 | abc 16 | def 17 | a 18 | b 19 | b 20 | b 21 | a 22 | a 23 | a 24 | b 25 | b 26 | b 27 | b 28 | b 29 | a 30 | d 31 | -------------------------------------------------------------------------------- /StreamSummary/c++/test/test2.dat: -------------------------------------------------------------------------------- 1 | # more complicated test 2 | b 3 | b 4 | b 5 | b 6 | b 7 | b 8 | b 9 | b 10 | b 11 | b 12 | b 13 | b 14 | b 15 | b 16 | abc 17 | abc 18 | def 19 | def 20 | a 21 | a 22 | b 23 | c 24 | d 25 | f 26 | g 27 | b 28 | b 29 | b 30 | h 31 | j 32 | l 33 | j 34 | b 35 | n 36 | g 37 | h 38 | j 39 | k 40 | l 41 | b 42 | 1 43 | 2 44 | 3 45 | 4 46 | b 47 | 5 48 | 65 49 | 67 50 | 7 51 | a 52 | s 53 | d 54 | f 55 | b 56 | g 57 | h 58 | z 59 | x 60 | c 61 | v 62 | g 63 | b 64 | g 65 | h 66 | h 67 | j 68 | j 69 | k 70 | hh 71 | y 72 | y 73 | b 74 | y 75 | h 76 | h 77 | g 78 | b 79 | g 80 | f 81 | f 82 | m 83 | jk 84 | l 85 | b 86 | i 87 | j 88 | j 89 | j 90 | 12 91 | 12 92 | 12 93 | b 94 | 34 95 | 54 96 | 56 97 | 6 98 | g 99 | g 100 | g 101 | b 102 | bg 103 | f 104 | f 105 | d 106 | fsdf 107 | sf 108 | b 109 | sdf 110 | dsf 111 
| sd 112 | sdf 113 | dsg 114 | h 115 | jj 116 | gh 117 | a 118 | a 119 | a 120 | a 121 | a 122 | a 123 | a 124 | a 125 | a 126 | a 127 | a 128 | a 129 | a 130 | a 131 | a 132 | a 133 | -------------------------------------------------------------------------------- /StreamSummary/go/example.go: -------------------------------------------------------------------------------- 1 | /******************************************************************************************* 2 | Copyright (C) 2013-2017 Bryant Moscon - bmoscon@gmail.com 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to 6 | deal in the Software without restriction, including without limitation the 7 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 | sell copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, 12 | this list of conditions, and the following disclaimer. 13 | 14 | 2. Redistributions in binary form must reproduce the above copyright notice, 15 | this list of conditions and the following disclaimer in the documentation 16 | and/or other materials provided with the distribution, and in the same 17 | place and form as other copyright, license and disclaimer information. 18 | 19 | 3. The end-user documentation included with the redistribution, if any, must 20 | include the following acknowledgment: "This product includes software 21 | developed by Bryant Moscon (http://www.bryantmoscon.com/)", in the same 22 | place and form as other third-party acknowledgments. Alternately, this 23 | acknowledgment may appear in the software itself, in the same form and 24 | location as other such third-party acknowledgments. 25 | 26 | 4. 
Except as contained in this notice, the name of the author, Bryant Moscon, 27 | shall not be used in advertising or otherwise to promote the sale, use or 28 | other dealings in this Software without prior written authorization from 29 | the author. 30 | 31 | 32 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 33 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 34 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 35 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 36 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 37 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 38 | THE SOFTWARE. 39 | ************************************************************************************/ 40 | 41 | package main 42 | 43 | 44 | import ( 45 | "fmt" 46 | "github.com/bmoscon/StreamingAlgorithms/StreamSummary/go/streamsummary" 47 | ) 48 | 49 | 50 | 51 | func main() { 52 | a := streamsummary.NewStreamSummary(2) 53 | a.Add("a") 54 | a.Add("b") 55 | a.Add("a") 56 | a.Add("c") 57 | 58 | a.Print() 59 | 60 | fmt.Println("a exists? ", a.Exists("a")) 61 | fmt.Println("b exists? 
", a.Exists("b")) 62 | fmt.Println(a.GetStream()) 63 | } 64 | -------------------------------------------------------------------------------- /StreamSummary/go/streamsummary/streamsummary.go: -------------------------------------------------------------------------------- 1 | /******************************************************************************************* 2 | Copyright (C) 2013-2017 Bryant Moscon - bmoscon@gmail.com 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to 6 | deal in the Software without restriction, including without limitation the 7 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 | sell copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, 12 | this list of conditions, and the following disclaimer. 13 | 14 | 2. Redistributions in binary form must reproduce the above copyright notice, 15 | this list of conditions and the following disclaimer in the documentation 16 | and/or other materials provided with the distribution, and in the same 17 | place and form as other copyright, license and disclaimer information. 18 | 19 | 3. The end-user documentation included with the redistribution, if any, must 20 | include the following acknowledgment: "This product includes software 21 | developed by Bryant Moscon (http://www.bryantmoscon.com/)", in the same 22 | place and form as other third-party acknowledgments. Alternately, this 23 | acknowledgment may appear in the software itself, in the same form and 24 | location as other such third-party acknowledgments. 25 | 26 | 4. 
Except as contained in this notice, the name of the author, Bryant Moscon, 27 | shall not be used in advertising or otherwise to promote the sale, use or 28 | other dealings in this Software without prior written authorization from 29 | the author. 30 | 31 | 32 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 33 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 34 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 35 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 36 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 37 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 38 | THE SOFTWARE. 39 | ************************************************************************************/ 40 | 41 | package streamsummary 42 | 43 | import ( 44 | "container/list" 45 | "fmt" 46 | ) 47 | 48 | type bucket struct { 49 | entries list.List 50 | value uint64 51 | } 52 | 53 | func newBucket(value uint64) *bucket { 54 | b := new(bucket) 55 | b.value = value 56 | 57 | return b 58 | } 59 | 60 | func (b *bucket) print() { 61 | fmt.Print(b.value, ": ") 62 | i := b.entries.Front() 63 | for { 64 | if i == nil { 65 | fmt.Println(" ") 66 | return 67 | } 68 | 69 | fmt.Print(i.Value.(string), " ") 70 | 71 | i = i.Next() 72 | } 73 | } 74 | 75 | func (b *bucket) insert(s string) { 76 | b.entries.PushBack(s) 77 | } 78 | 79 | func (b *bucket) getValue() uint64 { 80 | return b.value 81 | } 82 | 83 | func (b *bucket) getSize() int { 84 | return b.entries.Len() 85 | } 86 | 87 | func (b *bucket) getMin() string { 88 | return (b.entries.Front().Value.(string)) 89 | } 90 | 91 | func (b *bucket) remove(s string) { 92 | i := b.entries.Front() 93 | for { 94 | if i == nil { 95 | panic("tried to remove a non-existent entry") 96 | } 97 | 98 | if i.Value.(string) == s { 99 | b.entries.Remove(i) 100 | return 101 | } 102 | 103 | i = i.Next() 104 | } 105 | } 
106 | 107 | type StreamSummary struct { 108 | size int 109 | bucketMap map[string]*bucket 110 | valueMap map[uint64]*bucket 111 | smallestValue uint64 112 | } 113 | 114 | func NewStreamSummary(capacity int) *StreamSummary { 115 | s := new(StreamSummary) 116 | s.size = capacity 117 | s.bucketMap = make(map[string]*bucket) 118 | s.valueMap = make(map[uint64]*bucket) 119 | 120 | return s 121 | } 122 | 123 | func (s *StreamSummary) Print() { 124 | for _, bucket := range s.valueMap { 125 | bucket.print() 126 | } 127 | } 128 | 129 | func (s *StreamSummary) Exists(item string) bool { 130 | _, found := s.bucketMap[item] 131 | return found 132 | } 133 | 134 | func (s *StreamSummary) GetStream() []string { 135 | ret := make([]string, len(s.bucketMap)) 136 | 137 | i := 0 138 | for key := range s.bucketMap { 139 | ret[i] = key 140 | i++ 141 | } 142 | 143 | return ret 144 | } 145 | 146 | func (s *StreamSummary) Add(item string) { 147 | // does item already exist? 148 | b, found := s.bucketMap[item] 149 | 150 | if found { 151 | // if so, find old bucket, and remove item 152 | oldValue := b.getValue() 153 | b.remove(item) 154 | if b.getSize() == 0 { 155 | // if old bucket is now empty, remove it 156 | delete(s.valueMap, oldValue) 157 | 158 | // update smallest value if necessary 159 | if oldValue == s.smallestValue { 160 | s.smallestValue++ 161 | } 162 | } 163 | 164 | // see if a bucket exists for item's new value 165 | newb, found := s.valueMap[oldValue+1] 166 | if found { 167 | // it does, so insert it 168 | newb.insert(item) 169 | s.bucketMap[item] = newb 170 | } else { 171 | // it doesnt, so we'll have to add a new bucket 172 | newbucket := newBucket(oldValue + 1) 173 | newbucket.insert(item) 174 | s.bucketMap[item] = newbucket 175 | s.valueMap[oldValue+1] = newbucket 176 | } 177 | } else if len(s.bucketMap) < s.size { 178 | // are we within allowable capacity? 179 | 180 | s.smallestValue = 1 181 | 182 | // does bucket 1 exist? 
183 | b1, found := s.valueMap[1] 184 | if found { 185 | b1.insert(item) 186 | s.bucketMap[item] = b1 187 | } else { 188 | newbucket := newBucket(1) 189 | newbucket.insert(item) 190 | s.bucketMap[item] = newbucket 191 | s.valueMap[1] = newbucket 192 | } 193 | } else { 194 | // item didnt already exist, and we are over the size 195 | // so we need to evict smallest item, 196 | // and use its value + 1 as item to insert's value 197 | evictBucket := s.valueMap[s.smallestValue] 198 | evictValue := evictBucket.getValue() 199 | evictItem := evictBucket.getMin() 200 | evictBucket.remove(evictItem) 201 | 202 | // remove entry in bucket_map 203 | delete(s.bucketMap, evictItem) 204 | 205 | // if bucket empty, cleanup 206 | if evictBucket.getSize() == 0 { 207 | delete(s.valueMap, evictValue) 208 | 209 | // update smallest value if necessary 210 | if evictValue == s.smallestValue { 211 | s.smallestValue++ 212 | } 213 | } 214 | 215 | newbucket, found := s.valueMap[evictValue+1] 216 | if found { 217 | newbucket.insert(item) 218 | s.bucketMap[item] = newbucket 219 | } else { 220 | b := newBucket(evictValue + 1) 221 | b.insert(item) 222 | s.valueMap[evictValue+1] = b 223 | s.bucketMap[item] = b 224 | } 225 | } 226 | } 227 | -------------------------------------------------------------------------------- /StreamSummary/python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bmoscon/StreamingAlgorithms/f71ee839e8f158e74838067d2d72934cf8c52576/StreamSummary/python/__init__.py -------------------------------------------------------------------------------- /StreamSummary/python/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ 3 | Copyright (C) 2012-2015 Bryant Moscon - bmoscon@gmail.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the 
"Software"), to 7 | deal in the Software without restriction, including without limitation the 8 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9 | sell copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | 1. Redistributions of source code must retain the above copyright notice, 13 | this list of conditions, and the following disclaimer. 14 | 15 | 2. Redistributions in binary form must reproduce the above copyright notice, 16 | this list of conditions and the following disclaimer in the documentation 17 | and/or other materials provided with the distribution, and in the same 18 | place and form as other copyright, 19 | license and disclaimer information. 20 | 21 | 3. The end-user documentation included with the redistribution, if any, must 22 | include the following acknowledgment: "This product includes software 23 | developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 24 | place and form as other third-party acknowledgments. Alternately, this 25 | acknowledgment may appear in the software itself, in the same form and 26 | location as other such third-party acknowledgments. 27 | 28 | 4. Except as contained in this notice, the name of the author, Bryant Moscon, 29 | shall not be used in advertising or otherwise to promote the sale, use or 30 | other dealings in this Software without prior written authorization from 31 | the author. 32 | 33 | 34 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 35 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 36 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 37 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 38 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 39 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 40 | THE SOFTWARE. 
import sys
from stream_summary import StreamSummary


def main():
    """
    Command-line demo: build a StreamSummary of the given size from the lines
    of a file, print it, then show that clear() forgets every item.

    Lines that are empty or start with '#' are ignored.
    """
    if len(sys.argv) != 3:
        print("Usage: example.py <size> <file>")
        return

    size = sys.argv[1]

    with open(sys.argv[2], "r") as f:
        data = f.read().splitlines()

    data = [x for x in data if x and not x.startswith('#')]
    ss = StreamSummary(int(size))

    for e in data:
        ss.add(e)

    print(ss)
    elements = ss.to_list()
    print(elements)

    # Nothing was tracked (no data lines, or size 0): there is no element to
    # look up, so skip the exists()/clear() demonstration.
    if not elements:
        return

    print(ss.exists(elements[0]))
    ss.clear()
    print(ss.exists(elements[0]))


if __name__ == "__main__":
    main()
class Bucket(object):
    '''
    A bucket is a list of objects with the same value
    '''

    def __init__(self, v):
        self._val = v
        # items in insertion order; index 0 is the oldest
        self._items = []

    def __str__(self):
        s = "Bucket " + str(self._val) + "\n"
        s += str(self._items)
        return s

    def insert(self, e):
        '''append e as the newest item'''
        self._items.append(e)

    def oldest(self):
        '''return the item that has been in the bucket longest'''
        return self._items[0]

    def size(self):
        return len(self._items)

    def remove(self, e):
        '''remove the first occurrence of e (ValueError if absent)'''
        self._items.remove(e)

    def value(self):
        return self._val

    def items(self):
        '''return the internal item list (not a copy)'''
        return self._items


class StreamSummary(object):
    '''
    Summarizes a Stream into the top N-1 number of objects (where N == SIZE).
    The last object in the StreamSummary (i.e. the Nth object) should be
    ignored.
    '''
    def __init__(self, size):
        self.size = size
        # maps bucket value -> bucket
        self.bucket_map = {}
        # maps item -> bucket
        self.item_map = {}
        # keeps track of the minimum valued bucket
        self.min_val = 0

    def __str__(self):
        s = ''
        for key in self.bucket_map.keys():
            s += str(self.bucket_map[key])
            s += '\n'
        return s

    def __place(self, item, val):
        '''
        insert item into Bucket(val), creating the bucket if needed
        '''
        if val in self.bucket_map:
            b = self.bucket_map[val]
        else:
            b = Bucket(val)
            self.bucket_map[val] = b

        b.insert(item)
        self.item_map[item] = b

    def __increment(self, item):
        '''
        item exists, so remove it from its current bucket, and
        insert it into bucket+1
        '''
        b = self.item_map[item]
        val = b.value()
        b.remove(item)

        # drop the bucket once it is empty; the item is about to land in
        # val+1, so that becomes the minimum if val was
        if b.size() == 0:
            del self.bucket_map[val]
            if self.min_val == val:
                self.min_val += 1

        self.__place(item, val + 1)

    def __insert(self, item):
        '''
        new item, insert into Bucket(1)
        '''
        if 1 not in self.bucket_map:
            self.min_val = 1
        self.__place(item, 1)

    def __eject_and_insert(self, item):
        '''
        eject lowest ranked item, insert new item with
        ejected_value+1
        '''
        b = self.bucket_map[self.min_val]
        old = b.oldest()
        new_val = self.min_val + 1
        b.remove(old)
        del self.item_map[old]
        if b.size() == 0:
            del self.bucket_map[self.min_val]
            self.min_val += 1

        self.__place(item, new_val)

    def add(self, item):
        '''
        adds an item to the summarized stream

        A summary whose size is <= 0 can never hold an item, so the call is
        a no-op in that case (there is no bucket to eject from).
        '''
        if item in self.item_map:
            self.__increment(item)
        elif self.size <= 0:
            return
        elif len(self.item_map) < self.size:
            self.__insert(item)
        else:
            self.__eject_and_insert(item)

    def exists(self, item):
        '''return True if item is currently tracked'''
        return item in self.item_map

    def clear(self):
        '''forget every tracked item; size is kept'''
        self.bucket_map = {}
        self.item_map = {}
        self.min_val = 0

    def to_list(self):
        '''return the tracked items as a list'''
        return [item for item in self.item_map]

    def save(self):
        '''
        return a snapshot {'size': size, 'data': {value: [items...]}}

        The item lists are copies, so later add() calls do not change a
        snapshot that was already taken.
        '''
        data = {b.value(): list(b.items()) for b in self.bucket_map.values()}
        return {'size': self.size, 'data': data}

    def load(self, data):
        '''
        replace the current state with a snapshot produced by save()

        The snapshot's lists are copied, so the caller's data is never
        mutated by later add() calls.
        '''
        self.clear()
        self.size = data['size']

        buckets = data['data']

        for d in buckets:
            b = Bucket(d)
            b._items = list(buckets[d])
            self.bucket_map[d] = b
            for item in b._items:
                self.item_map[item] = b

        # smallest bucket value present, or 0 for an empty snapshot
        self.min_val = min(buckets) if buckets else 0
from ..stream_summary import Bucket, StreamSummary
import random


'''
Simple trivial tests for Bucket simply to ensure that
if the Bucket functionality/design changes that
the unit tests catch it
'''

def test_bucket_insertion():
    b = Bucket(1)
    assert b.size() == 0

    b.insert('a')
    assert b.size() == 1

    b.remove('a')
    assert b.size() == 0


def test_bucket_oldest():
    b = Bucket(5)
    assert b.value() == 5

    for i in range(0, 2048):
        b.insert(i)
        assert b.size() == i + 1

    assert b.oldest() == 0

    # removing the oldest item each time must promote the next one
    for i in range(0, 2048):
        b.remove(i)
        assert b.size() == 2048 - (i + 1)
        if b.size():
            assert b.oldest() == i + 1
    assert b.size() == 0


def test_stream_summary_add():
    '''
    Test that adding up to size N results in the object being
    added to bucket 1, and that no other buckets are created, and that the
    'age' order in the bucket is correct
    '''
    ss = StreamSummary(2)
    ss.add(1)
    ss.add(2)
    assert ss.bucket_map[1].oldest() == 1
    ss.add(3)
    assert ss.exists(1) == False
    assert ss.exists(2) == ss.exists(3) == True

    assert 1 in ss.bucket_map
    assert 2 in ss.bucket_map
    assert ss.bucket_map[1].size() == 1
    assert ss.bucket_map[2].size() == 1


def test_stream_summary_add_complex():
    ss = StreamSummary(5)
    for i in range(5):
        ss.add(i + 1)

    ss.add(1)
    ss.add(1)
    ss.add(10)

    # Bucket.items is a method: it has to be called before indexing
    assert ss.bucket_map[3].items()[0] == 1
    assert ss.bucket_map[2].items()[0] == 10
    assert ss.bucket_map[1].size() == 3
    assert ss.bucket_map[1].oldest() == 3


def test_stress_test():
    ss = StreamSummary(1000)

    for i in range(5000000):
        ss.add(random.randint(0, 2500))

    # the items reachable through the buckets must be exactly the items
    # recorded in the item map
    list_from_buckets = []
    for b in ss.bucket_map.keys():
        list_from_buckets.extend(ss.bucket_map[b].items())

    list_from_map = ss.to_list()
    assert len(list_from_map) == len(list_from_buckets)
    assert set(list_from_map) == set(list_from_buckets)
9 | 10 | #include "MurmurHash3.hpp" 11 | 12 | //----------------------------------------------------------------------------- 13 | 14 | inline uint32_t rotl32 ( uint32_t x, int8_t r ) 15 | { 16 | return (x << r) | (x >> (32 - r)); 17 | } 18 | 19 | inline uint64_t rotl64 ( uint64_t x, int8_t r ) 20 | { 21 | return (x << r) | (x >> (64 - r)); 22 | } 23 | 24 | #define ROTL32(x,y) rotl32(x,y) 25 | #define ROTL64(x,y) rotl64(x,y) 26 | 27 | #define BIG_CONSTANT(x) (x##LLU) 28 | 29 | 30 | //----------------------------------------------------------------------------- 31 | // Block read - if your platform needs to do endian-swapping or can only 32 | // handle aligned reads, do the conversion here 33 | 34 | inline uint32_t getblock ( const uint32_t * p, int i ) 35 | { 36 | return p[i]; 37 | } 38 | 39 | inline uint64_t getblock ( const uint64_t * p, int i ) 40 | { 41 | return p[i]; 42 | } 43 | 44 | //----------------------------------------------------------------------------- 45 | // Finalization mix - force all bits of a hash block to avalanche 46 | 47 | inline uint32_t fmix ( uint32_t h ) 48 | { 49 | h ^= h >> 16; 50 | h *= 0x85ebca6b; 51 | h ^= h >> 13; 52 | h *= 0xc2b2ae35; 53 | h ^= h >> 16; 54 | 55 | return h; 56 | } 57 | 58 | //---------- 59 | 60 | inline uint64_t fmix ( uint64_t k ) 61 | { 62 | k ^= k >> 33; 63 | k *= BIG_CONSTANT(0xff51afd7ed558ccd); 64 | k ^= k >> 33; 65 | k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); 66 | k ^= k >> 33; 67 | 68 | return k; 69 | } 70 | 71 | //----------------------------------------------------------------------------- 72 | 73 | void MurmurHash3_x86_32 ( const void * key, int len, 74 | uint32_t seed, void * out ) 75 | { 76 | const uint8_t * data = (const uint8_t*)key; 77 | const int nblocks = len / 4; 78 | 79 | uint32_t h1 = seed; 80 | 81 | const uint32_t c1 = 0xcc9e2d51; 82 | const uint32_t c2 = 0x1b873593; 83 | 84 | //---------- 85 | // body 86 | 87 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); 88 | 89 | for(int i = 
-nblocks; i; i++) 90 | { 91 | uint32_t k1 = getblock(blocks,i); 92 | 93 | k1 *= c1; 94 | k1 = ROTL32(k1,15); 95 | k1 *= c2; 96 | 97 | h1 ^= k1; 98 | h1 = ROTL32(h1,13); 99 | h1 = h1*5+0xe6546b64; 100 | } 101 | 102 | //---------- 103 | // tail 104 | 105 | const uint8_t * tail = (const uint8_t*)(data + nblocks*4); 106 | 107 | uint32_t k1 = 0; 108 | 109 | switch(len & 3) 110 | { 111 | case 3: k1 ^= tail[2] << 16; 112 | case 2: k1 ^= tail[1] << 8; 113 | case 1: k1 ^= tail[0]; 114 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 115 | }; 116 | 117 | //---------- 118 | // finalization 119 | 120 | h1 ^= len; 121 | 122 | h1 = fmix(h1); 123 | 124 | *(uint32_t*)out = h1; 125 | } 126 | 127 | //----------------------------------------------------------------------------- 128 | 129 | void MurmurHash3_x86_128 ( const void * key, const int len, 130 | uint32_t seed, void * out ) 131 | { 132 | const uint8_t * data = (const uint8_t*)key; 133 | const int nblocks = len / 16; 134 | 135 | uint32_t h1 = seed; 136 | uint32_t h2 = seed; 137 | uint32_t h3 = seed; 138 | uint32_t h4 = seed; 139 | 140 | const uint32_t c1 = 0x239b961b; 141 | const uint32_t c2 = 0xab0e9789; 142 | const uint32_t c3 = 0x38b34ae5; 143 | const uint32_t c4 = 0xa1e38b93; 144 | 145 | //---------- 146 | // body 147 | 148 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); 149 | 150 | for(int i = -nblocks; i; i++) 151 | { 152 | uint32_t k1 = getblock(blocks,i*4+0); 153 | uint32_t k2 = getblock(blocks,i*4+1); 154 | uint32_t k3 = getblock(blocks,i*4+2); 155 | uint32_t k4 = getblock(blocks,i*4+3); 156 | 157 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 158 | 159 | h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; 160 | 161 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 162 | 163 | h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; 164 | 165 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 166 | 167 | h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; 168 | 169 | k4 *= c4; k4 = 
ROTL32(k4,18); k4 *= c1; h4 ^= k4; 170 | 171 | h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; 172 | } 173 | 174 | //---------- 175 | // tail 176 | 177 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 178 | 179 | uint32_t k1 = 0; 180 | uint32_t k2 = 0; 181 | uint32_t k3 = 0; 182 | uint32_t k4 = 0; 183 | 184 | switch(len & 15) 185 | { 186 | case 15: k4 ^= tail[14] << 16; 187 | case 14: k4 ^= tail[13] << 8; 188 | case 13: k4 ^= tail[12] << 0; 189 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 190 | 191 | case 12: k3 ^= tail[11] << 24; 192 | case 11: k3 ^= tail[10] << 16; 193 | case 10: k3 ^= tail[ 9] << 8; 194 | case 9: k3 ^= tail[ 8] << 0; 195 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 196 | 197 | case 8: k2 ^= tail[ 7] << 24; 198 | case 7: k2 ^= tail[ 6] << 16; 199 | case 6: k2 ^= tail[ 5] << 8; 200 | case 5: k2 ^= tail[ 4] << 0; 201 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 202 | 203 | case 4: k1 ^= tail[ 3] << 24; 204 | case 3: k1 ^= tail[ 2] << 16; 205 | case 2: k1 ^= tail[ 1] << 8; 206 | case 1: k1 ^= tail[ 0] << 0; 207 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 208 | }; 209 | 210 | //---------- 211 | // finalization 212 | 213 | h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; 214 | 215 | h1 += h2; h1 += h3; h1 += h4; 216 | h2 += h1; h3 += h1; h4 += h1; 217 | 218 | h1 = fmix(h1); 219 | h2 = fmix(h2); 220 | h3 = fmix(h3); 221 | h4 = fmix(h4); 222 | 223 | h1 += h2; h1 += h3; h1 += h4; 224 | h2 += h1; h3 += h1; h4 += h1; 225 | 226 | ((uint32_t*)out)[0] = h1; 227 | ((uint32_t*)out)[1] = h2; 228 | ((uint32_t*)out)[2] = h3; 229 | ((uint32_t*)out)[3] = h4; 230 | } 231 | 232 | //----------------------------------------------------------------------------- 233 | 234 | void MurmurHash3_x64_128 ( const void * key, const int len, 235 | const uint32_t seed, void * out ) 236 | { 237 | const uint8_t * data = (const uint8_t*)key; 238 | const int nblocks = len / 16; 239 | 240 | uint64_t h1 = seed; 241 | uint64_t h2 = seed; 242 | 243 
| const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); 244 | const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); 245 | 246 | //---------- 247 | // body 248 | 249 | const uint64_t * blocks = (const uint64_t *)(data); 250 | 251 | for(int i = 0; i < nblocks; i++) 252 | { 253 | uint64_t k1 = getblock(blocks,i*2+0); 254 | uint64_t k2 = getblock(blocks,i*2+1); 255 | 256 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 257 | 258 | h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; 259 | 260 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 261 | 262 | h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; 263 | } 264 | 265 | //---------- 266 | // tail 267 | 268 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 269 | 270 | uint64_t k1 = 0; 271 | uint64_t k2 = 0; 272 | 273 | switch(len & 15) 274 | { 275 | case 15: k2 ^= uint64_t(tail[14]) << 48; 276 | case 14: k2 ^= uint64_t(tail[13]) << 40; 277 | case 13: k2 ^= uint64_t(tail[12]) << 32; 278 | case 12: k2 ^= uint64_t(tail[11]) << 24; 279 | case 11: k2 ^= uint64_t(tail[10]) << 16; 280 | case 10: k2 ^= uint64_t(tail[ 9]) << 8; 281 | case 9: k2 ^= uint64_t(tail[ 8]) << 0; 282 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 283 | 284 | case 8: k1 ^= uint64_t(tail[ 7]) << 56; 285 | case 7: k1 ^= uint64_t(tail[ 6]) << 48; 286 | case 6: k1 ^= uint64_t(tail[ 5]) << 40; 287 | case 5: k1 ^= uint64_t(tail[ 4]) << 32; 288 | case 4: k1 ^= uint64_t(tail[ 3]) << 24; 289 | case 3: k1 ^= uint64_t(tail[ 2]) << 16; 290 | case 2: k1 ^= uint64_t(tail[ 1]) << 8; 291 | case 1: k1 ^= uint64_t(tail[ 0]) << 0; 292 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 293 | }; 294 | 295 | //---------- 296 | // finalization 297 | 298 | h1 ^= len; h2 ^= len; 299 | 300 | h1 += h2; 301 | h2 += h1; 302 | 303 | h1 = fmix(h1); 304 | h2 = fmix(h2); 305 | 306 | h1 += h2; 307 | h2 += h1; 308 | 309 | ((uint64_t*)out)[0] = h1; 310 | ((uint64_t*)out)[1] = h2; 311 | } 312 | 313 | 
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.

#ifndef _MURMURHASH3_H_
#define _MURMURHASH3_H_

//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio

#if defined(_MSC_VER)

typedef unsigned char uint8_t;
typedef unsigned long uint32_t;
typedef unsigned __int64 uint64_t;

// Other compilers

#else	// defined(_MSC_VER)

// Supplies the fixed-width uint8_t / uint32_t / uint64_t types used below.
#include <stdint.h>

#endif // !defined(_MSC_VER)

//-----------------------------------------------------------------------------
// Hash entry points.
//
// All three share one calling convention:
//   key  - start of the bytes to hash
//   len  - number of bytes at `key`
//   seed - initial value of the hash state
//   out  - caller-provided buffer that receives the result

// NOTE(review): presumably writes a single 32-bit value to `out`; the
// definition is not visible from this header - confirm before relying on it.
void MurmurHash3_x86_32  ( const void * key, int len, uint32_t seed, void * out );

// Writes four uint32_t values (16 bytes) to `out`.
void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );

// Writes two uint64_t values (16 bytes) to `out`.
void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );

//-----------------------------------------------------------------------------

#endif // _MURMURHASH3_H_
13 | * deal in the Software without restriction, including without limitation the 14 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 15 | * sell copies of the Software, and to permit persons to whom the Software is 16 | * furnished to do so, subject to the following conditions: 17 | * 18 | * 1. Redistributions of source code must retain the above copyright notice, 19 | * this list of conditions, and the following disclaimer. 20 | * 21 | * 2. Redistributions in binary form must reproduce the above copyright notice, 22 | * this list of conditions and the following disclaimer in the documentation 23 | * and/or other materials provided with the distribution, and in the same 24 | * place and form as other copyright, 25 | * license and disclaimer information. 26 | * 27 | * 3. The end-user documentation included with the redistribution, if any, must 28 | * include the following acknowledgment: "This product includes software 29 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 30 | * place and form as other third-party acknowledgments. Alternately, this 31 | * acknowledgment may appear in the software itself, in the same form and 32 | * location as other such third-party acknowledgments. 33 | * 34 | * 4. Except as contained in this notice, the name of the author, Bryant Moscon, 35 | * shall not be used in advertising or otherwise to promote the sale, use or 36 | * other dealings in this Software without prior written authorization from 37 | * the author. 38 | * 39 | * 40 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 41 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 42 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
#include <stdint.h>
#include <string>

namespace {

// Rotate-and-add checksum over the characters of `s`, computed in the unsigned
// integer type T.
//
// For every character the running sum is rotated right by one bit within the
// width of T (the bit shifted out at the bottom re-enters at the top) and the
// character is then added, wrapping modulo 2^(bits in T).
//
// Each character is added as an unsigned byte (0..255). Adding the plain
// `char` would sign-extend bytes >= 0x80 wherever `char` is signed, making the
// 16/32/64-bit results depend on the platform.
//
// The index uses the string's own size_type so that the loop terminates for
// any string length.
template <typename T>
T rotating_checksum(const std::string &s) {
  const unsigned top_bit = sizeof(T) * 8 - 1;
  T sum = 0;

  for (std::string::size_type i = 0; i < s.size(); ++i) {
    sum = static_cast<T>((sum >> 1) + ((sum & 1) << top_bit));
    sum = static_cast<T>(sum + static_cast<unsigned char>(s[i]));
  }

  return sum;
}

}  // namespace


// 8-bit rotate-and-add checksum of `s`; returns 0 for an empty string.
uint8_t checksum_8(const std::string &s) {
  return rotating_checksum<uint8_t>(s);
}

// 16-bit rotate-and-add checksum of `s`; returns 0 for an empty string.
uint16_t checksum_16(const std::string &s) {
  return rotating_checksum<uint16_t>(s);
}

// 32-bit rotate-and-add checksum of `s`; returns 0 for an empty string.
uint32_t checksum_32(const std::string &s) {
  return rotating_checksum<uint32_t>(s);
}

// 64-bit rotate-and-add checksum of `s`; returns 0 for an empty string.
uint64_t checksum_64(const std::string &s) {
  return rotating_checksum<uint64_t>(s);
}
Redistributions of source code must retain the above copyright notice, 19 | * this list of conditions, and the following disclaimer. 20 | * 21 | * 2. Redistributions in binary form must reproduce the above copyright notice, 22 | * this list of conditions and the following disclaimer in the documentation 23 | * and/or other materials provided with the distribution, and in the same 24 | * place and form as other copyright, 25 | * license and disclaimer information. 26 | * 27 | * 3. The end-user documentation included with the redistribution, if any, must 28 | * include the following acknowledgment: "This product includes software 29 | * developed by Bryant Moscon (http://www.bryantmoscon.org/)", in the same 30 | * place and form as other third-party acknowledgments. Alternately, this 31 | * acknowledgment may appear in the software itself, in the same form and 32 | * location as other such third-party acknowledgments. 33 | * 34 | * 4. Except as contained in this notice, the name of the author, Bryant Moscon, 35 | * shall not be used in advertising or otherwise to promote the sale, use or 36 | * other dealings in this Software without prior written authorization from 37 | * the author. 38 | * 39 | * 40 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 41 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 42 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 43 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 44 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 45 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 46 | * THE SOFTWARE. 
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation; the guard name is kept unchanged here for compatibility.
#ifndef __CHECKSUM__
#define __CHECKSUM__

// Included here so the header compiles on its own: the declarations below use
// the fixed-width integer types and std::string.
#include <stdint.h>
#include <string>


// Simple rotate-and-add checksums over the characters of `s`.
//
// Each function keeps a running sum of the stated width; for every character
// the sum is rotated right by one bit and the character is added to it. The
// final sum is returned.

// 8-bit checksum of `s`.
uint8_t checksum_8(const std::string &s);

// 16-bit checksum of `s`.
uint16_t checksum_16(const std::string &s);

// 32-bit checksum of `s`.
uint32_t checksum_32(const std::string &s);

// 64-bit checksum of `s`.
uint64_t checksum_64(const std::string &s);

#endif