├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── CONTRIBUTING.md ├── Default.testsettings ├── LICENSE.txt ├── PULL_REQUEST_TEMPLATE.md ├── ProbabilisticDataStructures.sln ├── ProbabilisticDataStructures.vsmdi ├── ProbabilisticDataStructures ├── BloomFilter.cs ├── BloomFilter64.cs ├── Buckets.cs ├── Buckets64.cs ├── CountMinSketch.cs ├── CountingBloomFilter.cs ├── CuckooBloomFilter.cs ├── Defaults.cs ├── DeletableBloomFilter.cs ├── Element.cs ├── ElementHeap.cs ├── HyperLogLog.cs ├── IFilter.cs ├── InverseBloomFilter.cs ├── MinHash.cs ├── PartitionedBloomFilter.cs ├── ProbabilisticDataStructures.csproj ├── ScalableBloomFilter.cs ├── StableBloomFilter.cs ├── TopK.cs └── Utils.cs ├── README.md ├── TestProbabilisticDataStructures ├── Properties │ └── AssemblyInfo.cs ├── TestBloomFilter.cs ├── TestBloomFilter64.cs ├── TestBuckets.cs ├── TestBuckets64.cs ├── TestCountMinSketch.cs ├── TestCountingBloomFilter.cs ├── TestCuckooBloomFilter.cs ├── TestDeletableBloomFilter.cs ├── TestHyperLogLog.cs ├── TestInverseBloomFilter.cs ├── TestMinHash.cs ├── TestPartitionedBloomFilter.cs ├── TestProbabilisticDataStructures.cs ├── TestProbabilisticDataStructures.csproj ├── TestScalableBloomFilter.cs ├── TestStableBloomFilter.cs ├── TestTopK.cs └── Words.cs └── appveyor.yml /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | **Describe the bug** 8 | A clear and concise description of what the bug is. 9 | 10 | **To Reproduce** 11 | Steps to reproduce the behavior: 12 | 1. Go to '...' 13 | 2. Click on '....' 14 | 3. Scroll down to '....' 15 | 4. See error 16 | 17 | **Expected behavior** 18 | A clear and concise description of what you expected to happen. 19 | 20 | **Screenshots** 21 | If applicable, add screenshots to help explain your problem. 22 | 23 | **Desktop (please complete the following information):** 24 | - OS: [e.g. iOS] 25 | - Browser [e.g. chrome, safari] 26 | - Version [e.g. 22] 27 | 28 | **Smartphone (please complete the following information):** 29 | - Device: [e.g. iPhone6] 30 | - OS: [e.g. iOS8.1] 31 | - Browser [e.g. stock browser, safari] 32 | - Version [e.g. 22] 33 | 34 | **Additional context** 35 | Add any other context about the problem here. 36 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | **Is your feature request related to a problem? Please describe.** 8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 9 | 10 | **Describe the solution you'd like** 11 | A clear and concise description of what you want to happen. 12 | 13 | **Describe alternatives you've considered** 14 | A clear and concise description of any alternative solutions or features you've considered. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 
18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ProbabilisticDataStructures.v11.suo 2 | ProbabilisticDataStructures/bin/ 3 | ProbabilisticDataStructures/obj/ 4 | TestProbabilisticDataStructures/bin/ 5 | TestProbabilisticDataStructures/obj/ 6 | TestResults/ProbabilisticDataStructures.TE.Tests.mdf 7 | TestResults/ProbabilisticDataStructures.TE.Tests_log.ldf 8 | TestResults/ 9 | .vs/ 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | If you think a change would be useful, make a PR! I would only ask that you inquire about a change before writing it if it seems like an oh-man-this-is-changing-everything type of change. 2 | -------------------------------------------------------------------------------- /Default.testsettings: -------------------------------------------------------------------------------- 1 |  2 | 3 | These are default test settings for a local test run. 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### What is this PR? 2 | THIS_IS_A_PR_THAT_DOES_X_Y_Z 3 | 4 | ### Things to consider: 5 | - [ ] I added tests for my changes 6 | - [ ] I ran the tests locally and they all passed 7 | - [x] I am awesome for making a contribution 8 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.27428.2043 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ProbabilisticDataStructures", "ProbabilisticDataStructures\ProbabilisticDataStructures.csproj", "{4775E89C-C139-43B0-8436-B456C035C9D9}" 7 | EndProject 8 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TestProbabilisticDataStructures", "TestProbabilisticDataStructures\TestProbabilisticDataStructures.csproj", "{8212EFDE-5134-4914-96D3-C550FD9432F1}" 9 | EndProject 10 | Global 11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 12 | Debug|Any CPU = Debug|Any CPU 13 | Release|Any CPU = Release|Any CPU 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {4775E89C-C139-43B0-8436-B456C035C9D9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 17 | {4775E89C-C139-43B0-8436-B456C035C9D9}.Debug|Any CPU.Build.0 = Debug|Any CPU 18 | {4775E89C-C139-43B0-8436-B456C035C9D9}.Release|Any CPU.ActiveCfg = Release|Any CPU 19 | {4775E89C-C139-43B0-8436-B456C035C9D9}.Release|Any CPU.Build.0 = Release|Any CPU 20 | {8212EFDE-5134-4914-96D3-C550FD9432F1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 21 | {8212EFDE-5134-4914-96D3-C550FD9432F1}.Debug|Any CPU.Build.0 = Debug|Any CPU 22 | {8212EFDE-5134-4914-96D3-C550FD9432F1}.Release|Any CPU.ActiveCfg = Release|Any CPU 23 | {8212EFDE-5134-4914-96D3-C550FD9432F1}.Release|Any CPU.Build.0 = Release|Any CPU 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 |
SolutionGuid = {DD9C9C10-6340-471D-BF9D-A6823302D332} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures.vsmdi: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/BloomFilter.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Security.Cryptography; 3 | 4 | namespace ProbabilisticDataStructures 5 | { 6 | /// 7 | /// BloomFilter implements a classic Bloom filter. A bloom filter has a non-zero 8 | /// probability of false positives and a zero probability of false negatives. 9 | /// 10 | public class BloomFilter : IFilter 11 | { 12 | /// 13 | /// Filter data 14 | /// 15 | internal Buckets Buckets { get; set; } 16 | /// 17 | /// Hash algorithm 18 | /// 19 | private HashAlgorithm Hash { get; set; } 20 | /// 21 | /// Filter size 22 | /// 23 | private uint m { get; set; } 24 | /// 25 | /// Number of hash functions 26 | /// 27 | private uint k { get; set; } 28 | /// 29 | /// Number of items added 30 | /// 31 | private uint count { get; set; } 32 | 33 | /// 34 | /// Creates a new Bloom filter optimized to store n items with a specified target 35 | /// false-positive rate. 36 | /// 37 | /// Number of items to store. 38 | /// Desired false positive rate. 39 | public BloomFilter(uint n, double fpRate) 40 | { 41 | var m = Utils.OptimalM(n, fpRate); 42 | var k = Utils.OptimalK(fpRate); 43 | Buckets = new Buckets(m, 1); 44 | Hash = Defaults.GetDefaultHashAlgorithm(); 45 | this.m = m; 46 | this.k = k; 47 | } 48 | 49 | /// 50 | /// Returns the Bloom filter capacity, m. 51 | /// 52 | /// The Bloom filter capacity, m. 53 | public uint Capacity() 54 | { 55 | return this.m; 56 | } 57 | 58 | /// 59 | /// Returns the number of hash functions. 60 | /// 61 | /// The number of hash functions. 62 | public uint K() 63 | { 64 | return this.k; 65 | } 66 | 67 | /// 68 | /// Returns the number of items in the filter. 69 | /// 70 | /// 71 | public uint Count() 72 | { 73 | return this.count; 74 | } 75 | 76 | /// 77 | /// Returns the current estimated ratio of set bits. 78 | /// 79 | /// The current estimated ratio of set bits. 80 | public double EstimatedFillRatio() 81 | { 82 | return 1 - Math.Exp((-(double)this.count * (double)this.k) / (double)this.m); 83 | } 84 | 85 | /// 86 | /// Returns the ratio of set bits. 87 | /// 88 | /// The ratio of set bits. 89 | public double FillRatio() 90 | { 91 | uint sum = 0; 92 | for (uint i = 0; i < this.Buckets.count; i++) 93 | { 94 | sum += this.Buckets.Get(i); 95 | } 96 | return (double)sum / (double)this.m; 97 | } 98 | 99 | /// 100 | /// Will test for membership of the data and returns true if it is a member, 101 | /// false if not. This is a probabilistic test, meaning there is a non-zero 102 | /// probability of false positives but a zero probability of false negatives. 103 | /// 104 | /// The data to search for. 105 | /// Whether or not the data is maybe contained in the filter. 106 | public bool Test(byte[] data) 107 | { 108 | var hashKernel = Utils.HashKernel(data, this.Hash); 109 | var lower = hashKernel.LowerBaseHash; 110 | var upper = hashKernel.UpperBaseHash; 111 | 112 | // If any of the K bits are not set, then it's not a member. 
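// Double hashing: all k probe indices are derived from the two 32-bit halves
// of a single hash as index_i = (lower + upper * i) mod m, so the input is
// hashed once rather than k times (the Kirsch-Mitzenmacher construction).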
113 | for (uint i = 0; i < this.k; i++) 114 | { 115 | if (this.Buckets.Get((lower + upper * i) % this.m) == 0) 116 | { 117 | return false; 118 | } 119 | } 120 | return true; 121 | } 122 | 123 | /// 124 | /// Will add the data to the Bloom filter. It returns the filter to allow 125 | /// for chaining. 126 | /// 127 | /// The data to add. 128 | /// The filter. 129 | public IFilter Add(byte[] data) 130 | { 131 | var hashKernel = Utils.HashKernel(data, this.Hash); 132 | var lower = hashKernel.LowerBaseHash; 133 | var upper = hashKernel.UpperBaseHash; 134 | 135 | // Set the K bits. 136 | for (uint i = 0; i < this.k; i++) 137 | { 138 | this.Buckets.Set((lower + upper * i) % this.m, 1); 139 | } 140 | 141 | this.count++; 142 | return this; 143 | } 144 | 145 | /// 146 | /// Is equivalent to calling Test followed by Add. It returns true if the data is 147 | /// a member, false if not. 148 | /// 149 | /// The data to test for and add if it doesn't exist. 150 | /// Whether or not the data was probably contained in the filter. 151 | public bool TestAndAdd(byte[] data) 152 | { 153 | var hashKernel = Utils.HashKernel(data, this.Hash); 154 | var lower = hashKernel.LowerBaseHash; 155 | var upper = hashKernel.UpperBaseHash; 156 | var member = true; 157 | 158 | // If any of the K bits are not set, then it's not a member. 159 | for (uint i = 0; i < this.k; i++) 160 | { 161 | var idx = (lower + upper * i) % this.m; 162 | if (this.Buckets.Get(idx) == 0) 163 | { 164 | member = false; 165 | } 166 | this.Buckets.Set(idx, 1); 167 | } 168 | 169 | this.count++; 170 | return member; 171 | } 172 | 173 | /// 174 | /// Restores the Bloom filter to its original state. It returns the filter to 175 | /// allow for chaining. 176 | /// 177 | /// The reset bloom filter. 178 | public BloomFilter Reset() 179 | { 180 | this.Buckets.Reset(); 181 | return this; 182 | } 183 | 184 | /// 185 | /// Sets the hashing function used in the filter. 186 | /// 187 | /// The HashAlgorithm to use. 188 | // TODO: Add SetHash to the IFilter interface? 189 | public void SetHash(HashAlgorithm h) 190 | { 191 | this.Hash = h; 192 | } 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/BloomFilter64.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | using ProbabilisticDataStructures; 7 | using System.Security.Cryptography; 8 | 9 | namespace ProbabilisticDataStructures 10 | { 11 | /// 12 | /// BloomFilter64 implements a classic Bloom filter. A bloom filter has a non-zero 13 | /// probability of false positives and a zero probability of false negatives. 14 | /// 15 | public class BloomFilter64 : IFilter 16 | { 17 | /// 18 | /// Filter data 19 | /// 20 | internal Buckets64 Buckets { get; set; } 21 | /// 22 | /// Hash algorithm 23 | /// 24 | private HashAlgorithm Hash { get; set; } 25 | /// 26 | /// Filter size 27 | /// 28 | private ulong m { get; set; } 29 | /// 30 | /// Number of hash functions 31 | /// 32 | private uint k { get; set; } 33 | /// 34 | /// Number of items added 35 | /// 36 | private ulong count { get; set; } 37 | 38 | /// 39 | /// Creates a new Bloom filter optimized to store n items with a specified target 40 | /// false-positive rate. 41 | /// 42 | /// Number of items to store. 43 | /// Desired false positive rate. 
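// A minimal usage sketch (hypothetical caller; Encoding comes from System.Text):
//   var filter = new BloomFilter64(1000000UL, 0.01);
//   filter.Add(Encoding.UTF8.GetBytes("alice"));
//   filter.Test(Encoding.UTF8.GetBytes("alice")); // always true once added
//   filter.Test(Encoding.UTF8.GetBytes("bob"));   // false, or a ~1% false positive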
44 | public BloomFilter64(ulong n, double fpRate) 45 | { 46 | var m = Utils.OptimalM64(n, fpRate); 47 | var k = Utils.OptimalK(fpRate); 48 | Buckets = new Buckets64(m, 1); 49 | Hash = Defaults.GetDefaultHashAlgorithm(); 50 | this.m = m; 51 | this.k = k; 52 | } 53 | 54 | /// 55 | /// Returns the Bloom filter capacity, m. 56 | /// 57 | /// The Bloom filter capacity, m. 58 | public ulong Capacity() 59 | { 60 | return this.m; 61 | } 62 | 63 | /// 64 | /// Returns the number of hash functions. 65 | /// 66 | /// The number of hash functions. 67 | public uint K() 68 | { 69 | return this.k; 70 | } 71 | 72 | /// 73 | /// Returns the number of items in the filter. 74 | /// 75 | /// 76 | public ulong Count() 77 | { 78 | return this.count; 79 | } 80 | 81 | /// 82 | /// Returns the current estimated ratio of set bits. 83 | /// 84 | /// The current estimated ratio of set bits. 85 | public double EstimatedFillRatio() 86 | { 87 | return 1 - Math.Exp((-(double)this.count * (double)this.k) / (double)this.m); 88 | } 89 | 90 | /// 91 | /// Returns the ratio of set bits. 92 | /// 93 | /// The ratio of set bits. 94 | public double FillRatio() 95 | { 96 | ulong sum = 0; 97 | for (ulong i = 0; i < this.Buckets.count; i++) 98 | { 99 | sum += this.Buckets.Get(i); 100 | } 101 | return (double)sum / (double)this.m; 102 | } 103 | 104 | /// 105 | /// Will test for membership of the data and returns true if it is a member, 106 | /// false if not. This is a probabilistic test, meaning there is a non-zero 107 | /// probability of false positives but a zero probability of false negatives. 108 | /// 109 | /// The data to search for. 110 | /// Whether or not the data is maybe contained in the filter. 111 | public bool Test(byte[] data) 112 | { 113 | var hashKernel = Utils.HashKernel128(data, this.Hash); 114 | var lower = hashKernel.LowerBaseHash; 115 | var upper = hashKernel.UpperBaseHash; 116 | 117 | // If any of the K bits are not set, then it's not a member. 118 | for (uint i = 0; i < this.k; i++) 119 | { 120 | if (this.Buckets.Get((lower + upper * i) % this.m) == 0) 121 | { 122 | return false; 123 | } 124 | } 125 | return true; 126 | } 127 | 128 | /// 129 | /// Will add the data to the Bloom filter. It returns the filter to allow 130 | /// for chaining. 131 | /// 132 | /// The data to add. 133 | /// The filter. 134 | public IFilter Add(byte[] data) 135 | { 136 | var hashKernel = Utils.HashKernel128(data, this.Hash); 137 | var lower = hashKernel.LowerBaseHash; 138 | var upper = hashKernel.UpperBaseHash; 139 | 140 | // Set the K bits. 141 | for (uint i = 0; i < this.k; i++) 142 | { 143 | this.Buckets.Set((lower + upper * i) % this.m, 1); 144 | } 145 | 146 | this.count++; 147 | return this; 148 | } 149 | 150 | /// 151 | /// Is equivalent to calling Test followed by Add. It returns true if the data is 152 | /// a member, false if not. 153 | /// 154 | /// The data to test for and add if it doesn't exist. 155 | /// Whether or not the data was probably contained in the filter. 156 | public bool TestAndAdd(byte[] data) 157 | { 158 | var hashKernel = Utils.HashKernel128(data, this.Hash); 159 | var lower = hashKernel.LowerBaseHash; 160 | var upper = hashKernel.UpperBaseHash; 161 | var member = true; 162 | 163 | // If any of the K bits are not set, then it's not a member. 
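// Test and Add are fused into a single pass: each index is computed once, and
// the membership check and the bit-set happen together, so the input is hashed
// only one time.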
164 | for (uint i = 0; i < this.k; i++) 165 | { 166 | var idx = (lower + upper * i) % this.m; 167 | if (this.Buckets.Get(idx) == 0) 168 | { 169 | member = false; 170 | } 171 | this.Buckets.Set(idx, 1); 172 | } 173 | 174 | this.count++; 175 | return member; 176 | } 177 | 178 | /// 179 | /// Restores the Bloom filter to its original state. It returns the filter to 180 | /// allow for chaining. 181 | /// 182 | /// The reset bloom filter. 183 | public BloomFilter64 Reset() 184 | { 185 | this.Buckets.Reset(); 186 | return this; 187 | } 188 | 189 | /// 190 | /// Sets the hashing function used in the filter. 191 | /// 192 | /// The HashAlgorithm to use. 193 | // TODO: Add SetHash to the IFilter interface? 194 | public void SetHash(HashAlgorithm h) 195 | { 196 | this.Hash = h; 197 | } 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/Buckets.cs: -------------------------------------------------------------------------------- 1 | namespace ProbabilisticDataStructures 2 | { 3 | /// 4 | /// Buckets is a fast, space-efficient array of buckets where each bucket can store 5 | /// up to a configured maximum value. 6 | /// 7 | public class Buckets 8 | { 9 | private byte[] Data { get; set; } 10 | private byte bucketSize { get; set; } 11 | private byte _max; 12 | private int Max 13 | { 14 | get 15 | { 16 | return _max; 17 | } 18 | set 19 | { 20 | // TODO: Figure out this truncation thing. 21 | // I'm not sure if MaxValue is always supposed to be capped at 255 via 22 | // a byte conversion or not... 23 | if (value > byte.MaxValue) 24 | _max = byte.MaxValue; 25 | else 26 | _max = (byte)value; 27 | } 28 | } 29 | internal uint count { get; set; } 30 | 31 | /// 32 | /// Creates a new Buckets with the provided number of buckets where each bucket 33 | /// is the specified number of bits. 34 | /// 35 | /// Number of buckets. 36 | /// Number of bits per bucket. 37 | internal Buckets(uint count, byte bucketSize) 38 | { 39 | this.count = count; 40 | this.Data = new byte[(count * bucketSize + 7) / 8]; 41 | this.bucketSize = bucketSize; 42 | this.Max = (1 << bucketSize) - 1; 43 | } 44 | 45 | /// 46 | /// Returns the maximum value that can be stored in a bucket. 47 | /// 48 | /// The bucket max value. 49 | internal byte MaxBucketValue() 50 | { 51 | return this._max; 52 | } 53 | 54 | /// 55 | /// Increment the value in the specified bucket by the provided delta. A bucket 56 | /// can be decremented by providing a negative delta. 57 | /// 58 | /// The value is clamped to zero and the maximum bucket value. Returns itself 59 | /// to allow for chaining. 60 | /// 61 | /// 62 | /// The bucket to increment. 63 | /// The amount to increment the bucket by. 64 | /// The modified bucket. 65 | internal Buckets Increment(uint bucket, int delta) 66 | { 67 | int val = (int)(GetBits(bucket * this.bucketSize, this.bucketSize) + delta); 68 | 69 | if (val > this.Max) 70 | val = this.Max; 71 | else if (val < 0) 72 | val = 0; 73 | 74 | SetBits((uint)bucket * (uint)this.bucketSize, this.bucketSize, (uint)val); 75 | return this; 76 | } 77 | 78 | /// 79 | /// Set the bucket value. The value is clamped to zero and the maximum bucket 80 | /// value. Returns itself to allow for chaining. 81 | /// 82 | /// The bucket to change the value of. 83 | /// The value to set. 84 | /// The modified bucket. 
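// Bucket i occupies bits [i * bucketSize, (i + 1) * bucketSize) of Data, so,
// for example, 100 buckets of 4 bits pack into (100 * 4 + 7) / 8 = 50 bytes.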
85 | internal Buckets Set(uint bucket, byte value) 86 | { 87 | if (value > this._max) 88 | value = this._max; 89 | 90 | SetBits(bucket * this.bucketSize, this.bucketSize, value); 91 | return this; 92 | } 93 | 94 | /// 95 | /// Returns the value in the specified bucket. 96 | /// 97 | /// The bucket to get. 98 | /// The specified bucket. 99 | internal uint Get(uint bucket) 100 | { 101 | return GetBits(bucket * this.bucketSize, this.bucketSize); 102 | } 103 | 104 | /// 105 | /// Restores the Buckets to the original state. Returns itself to allow for 106 | /// chaining. 107 | /// 108 | /// The Buckets object the reset operation was performed on. 109 | internal Buckets Reset() 110 | { 111 | this.Data = new byte[(this.count * this.bucketSize + 7) / 8]; 112 | return this; 113 | } 114 | 115 | /// 116 | /// Returns the bits at the specified offset and length. 117 | /// 118 | /// The position to start reading at. 119 | /// The distance to read from the offset. 120 | /// The bits at the specified offset and length. 121 | internal uint GetBits(uint offset, int length) 122 | { 123 | uint byteIndex = offset / 8; 124 | int byteOffset = (int)(offset % 8); 125 | 126 | if ((byteOffset + length) > 8) 127 | { 128 | int rem = 8 - byteOffset; 129 | return GetBits(offset, rem) 130 | | (GetBits((uint)(offset + rem), length - rem) << rem); 131 | } 132 | 133 | int bitMask = (1 << length) - 1; 134 | return (uint)((this.Data[byteIndex] & (bitMask << byteOffset)) >> byteOffset); 135 | } 136 | 137 | /// 138 | /// Sets bits at the specified offset and length. 139 | /// 140 | /// The position to start writing at. 141 | /// The distance to write from the offset. 142 | /// The bits to write. 143 | internal void SetBits(uint offset, int length, uint bits) 144 | { 145 | uint byteIndex = offset / 8; 146 | int byteOffset = (int)(offset % 8); 147 | 148 | if ((byteOffset + length) > 8) 149 | { 150 | int rem = 8 - byteOffset; 151 | SetBits(offset, (byte)rem, bits); 152 | SetBits((uint)(offset + rem), length - rem, bits >> rem); 153 | return; 154 | } 155 | 156 | int bitMask = (1 << length) - 1; 157 | this.Data[byteIndex] = 158 | (byte)((this.Data[byteIndex]) & ~(bitMask << byteOffset)); 159 | this.Data[byteIndex] = 160 | (byte)((this.Data[byteIndex]) | ((bits & bitMask) << byteOffset)); 161 | } 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/Buckets64.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | 7 | namespace ProbabilisticDataStructures 8 | { 9 | /// 10 | /// Buckets64 is a fast, space-efficient array of buckets where each bucket can store 11 | /// up to a configured maximum value. 12 | /// 13 | public class Buckets64 14 | { 15 | // The largest C# array to create; the largest power of 2 that C# can support. 16 | private const uint maxArraySize = 1U << 30; 17 | private byte[][] Data { get; set; } 18 | private int arrayCount { get; set; } 19 | private byte bucketSize { get; set; } 20 | private byte _max; 21 | private int Max 22 | { 23 | get 24 | { 25 | return _max; 26 | } 27 | set 28 | { 29 | // TODO: Figure out this truncation thing. 30 | // I'm not sure if MaxValue is always supposed to be capped at 255 via 31 | // a byte conversion or not... 
32 | if (value > byte.MaxValue) 33 | _max = byte.MaxValue; 34 | else 35 | _max = (byte)value; 36 | } 37 | } 38 | internal ulong count { get; set; } 39 | 40 | /// 41 | /// Creates a new Buckets64 with the provided number of buckets where each bucket 42 | /// is the specified number of bits. 43 | /// 44 | /// Number of buckets. 45 | /// Number of bits per bucket. 46 | internal Buckets64(ulong count, byte bucketSize) 47 | { 48 | this.count = count; 49 | this.bucketSize = bucketSize; 50 | AllocateArray(count, bucketSize); 51 | this.Max = (1 << bucketSize) - 1; 52 | } 53 | 54 | private void AllocateArray(ulong count, byte bucketSize) 55 | { 56 | this.arrayCount = (int)(count / maxArraySize + 1); 57 | this.Data = new byte[this.arrayCount][]; 58 | var bytesToAllocate = (count * bucketSize + 7) / 8; 59 | for (int i = 0; i < this.arrayCount; i++) 60 | { 61 | var arraySize = Math.Min(bytesToAllocate, maxArraySize); 62 | this.Data[i] = new byte[arraySize]; 63 | bytesToAllocate -= arraySize; 64 | } 65 | } 66 | 67 | /// 68 | /// Returns the maximum value that can be stored in a bucket. 69 | /// 70 | /// The bucket max value. 71 | internal byte MaxBucketValue() 72 | { 73 | return this._max; 74 | } 75 | 76 | /// 77 | /// Increment the value in the specified bucket by the provided delta. A bucket 78 | /// can be decremented by providing a negative delta. 79 | /// 80 | /// The value is clamped to zero and the maximum bucket value. Returns itself 81 | /// to allow for chaining. 82 | /// 83 | /// 84 | /// The bucket to increment. 85 | /// The amount to increment the bucket by. 86 | /// The modified bucket. 87 | internal Buckets64 Increment(uint bucket, int delta) 88 | { 89 | int val = (int)(GetBits(bucket * this.bucketSize, this.bucketSize) + delta); 90 | 91 | if (val > this.Max) 92 | val = this.Max; 93 | else if (val < 0) 94 | val = 0; 95 | 96 | SetBits((uint)bucket * (uint)this.bucketSize, this.bucketSize, (uint)val); 97 | return this; 98 | } 99 | 100 | /// 101 | /// Set the bucket value. The value is clamped to zero and the maximum bucket 102 | /// value. Returns itself to allow for chaining. 103 | /// 104 | /// The bucket to change the value of. 105 | /// The value to set. 106 | /// The modified bucket. 107 | internal Buckets64 Set(ulong bucket, byte value) 108 | { 109 | if (value > this._max) 110 | value = this._max; 111 | 112 | SetBits(bucket * this.bucketSize, this.bucketSize, value); 113 | return this; 114 | } 115 | 116 | /// 117 | /// Returns the value in the specified bucket. 118 | /// 119 | /// The bucket to get. 120 | /// The specified bucket. 121 | internal uint Get(ulong bucket) 122 | { 123 | return GetBits(bucket * this.bucketSize, this.bucketSize); 124 | } 125 | 126 | /// 127 | /// Restores the Buckets64 to the original state. Returns itself to allow for 128 | /// chaining. 129 | /// 130 | /// The Buckets64 object the reset operation was performed on. 131 | internal Buckets64 Reset() 132 | { 133 | AllocateArray(this.count, this.bucketSize); 134 | return this; 135 | } 136 | 137 | /// 138 | /// Returns the bits at the specified offset and length. 139 | /// 140 | /// The position to start reading at. 141 | /// The distance to read from the offset. 142 | /// The bits at the specified offset and length. 
143 | internal uint GetBits(ulong offset, int length) 144 | { 145 | ulong byteIndex = offset / 8; 146 | int byteOffset = (int)(offset % 8); 147 | 148 | if ((byteOffset + length) > 8) 149 | { 150 | int rem = 8 - byteOffset; 151 | return GetBits(offset, rem) 152 | | (GetBits(offset + (ulong)rem, length - rem) << rem); 153 | } 154 | 155 | var dataArray = this.Data[byteIndex / maxArraySize]; 156 | var dataArrayByteIndex = byteIndex % maxArraySize; 157 | int bitMask = (1 << length) - 1; 158 | return (uint)((dataArray[dataArrayByteIndex] & (bitMask << byteOffset)) >> byteOffset); 159 | } 160 | 161 | /// 162 | /// Sets bits at the specified offset and length. 163 | /// 164 | /// The position to start writing at. 165 | /// The distance to write from the offset. 166 | /// The bits to write. 167 | internal void SetBits(ulong offset, int length, uint bits) 168 | { 169 | ulong byteIndex = offset / 8; 170 | int byteOffset = (int)(offset % 8); 171 | 172 | if ((byteOffset + length) > 8) 173 | { 174 | int rem = 8 - byteOffset; 175 | SetBits(offset, (byte)rem, bits); 176 | SetBits(offset + (ulong)rem, length - rem, bits >> rem); 177 | return; 178 | } 179 | 180 | var dataArray = this.Data[(uint)(byteIndex / maxArraySize)]; 181 | var dataArrayByteIndex = (uint)(byteIndex % maxArraySize); 182 | int bitMask = (1 << length) - 1; 183 | dataArray[dataArrayByteIndex] = 184 | (byte)((dataArray[dataArrayByteIndex]) & ~(bitMask << byteOffset)); 185 | dataArray[dataArrayByteIndex] = 186 | (byte)((dataArray[dataArrayByteIndex]) | ((bits & bitMask) << byteOffset)); 187 | } 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/CountMinSketch.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Security.Cryptography; 3 | 4 | namespace ProbabilisticDataStructures 5 | { 6 | /// 7 | /// CountMinSketch implements a Count-Min Sketch as described by Cormode and 8 | /// Muthukrishnan in An Improved Data Stream Summary: The Count-Min Sketch and its 9 | /// Applications: 10 | /// 11 | /// http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf 12 | /// 13 | /// A Count-Min Sketch (CMS) is a probabilistic data structure which approximates 14 | /// the frequency of events in a data stream. Unlike a hash map, a CMS uses 15 | /// sub-linear space at the expense of a configurable error factor. Similar to 16 | /// Counting Bloom filters, items are hashed to a series of buckets, which increment 17 | /// a counter. The frequency of an item is estimated by taking the minimum of each of 18 | /// the item's respective counter values. 19 | /// 20 | /// Count-Min Sketches are useful for counting the frequency of events in massive 21 | /// data sets or unbounded streams online. In these situations, storing the entire 22 | /// data set or allocating counters for every event in memory is impractical. It may 23 | /// be possible for offline processing, but real-time processing requires fast, 24 | /// space-efficient solutions like the CMS. For approximating set cardinality, refer 25 | /// to the HyperLogLog. 
26 | /// 27 | public class CountMinSketch 28 | { 29 | /// 30 | /// Count matrix 31 | /// 32 | internal UInt64[][] Matrix { get; set; } 33 | /// 34 | /// Matrix width 35 | /// 36 | internal uint Width { get; set; } 37 | /// 38 | /// Matrix depth 39 | /// 40 | internal uint Depth { get; set; } 41 | /// 42 | /// Number of items added 43 | /// 44 | private UInt64 count { get; set; } 45 | /// 46 | /// Relative-accuracy factor 47 | /// 48 | private double epsilon { get; set; } 49 | /// 50 | /// Relative-accuracy probability 51 | /// 52 | private double delta { get; set; } 53 | /// 54 | /// Hash function 55 | /// 56 | private HashAlgorithm Hash { get; set; } 57 | 58 | /// 59 | /// Creates a new Count-Min Sketch whose relative accuracy is within a factor of 60 | /// epsilon with probability delta. Both of these parameters affect the space and 61 | /// time complexity. 62 | /// 63 | /// Relative-accuracy factor 64 | /// Relative-accuracy probability 65 | public CountMinSketch(double epsilon, double delta) 66 | { 67 | var width = (uint)(Math.Ceiling(Math.E / epsilon)); 68 | var depth = (uint)(Math.Ceiling(Math.Log(1 / delta))); 69 | this.Matrix = new UInt64[depth][]; 70 | 71 | for (int i = 0; i < depth; i++) 72 | { 73 | this.Matrix[i] = new UInt64[width]; 74 | } 75 | 76 | this.Width = width; 77 | this.Depth = depth; 78 | this.epsilon = epsilon; 79 | this.delta = delta; 80 | this.Hash = Defaults.GetDefaultHashAlgorithm(); 81 | } 82 | 83 | /// 84 | /// Returns the relative-accuracy factor, epsilon. 85 | /// 86 | /// The relative-accuracy factor, epsilon 87 | public double Epsilon() 88 | { 89 | return this.epsilon; 90 | } 91 | 92 | /// 93 | /// Returns the relative-accuracy probability, delta. 94 | /// 95 | /// The relative-accuracy probability, delta 96 | public double Delta() 97 | { 98 | return this.delta; 99 | } 100 | 101 | /// 102 | /// Returns the number of items added to the sketch. 103 | /// 104 | /// The number of items added to the sketch. 105 | public UInt64 TotalCount() 106 | { 107 | return this.count; 108 | } 109 | 110 | /// 111 | /// Add the data to the set. Returns the CountMinSketch to allow for chaining. 112 | /// 113 | /// The data to add. 114 | /// The CountMinSketch 115 | public CountMinSketch Add(byte[] data) 116 | { 117 | var hashKernel = Utils.HashKernel(data, this.Hash); 118 | var lower = hashKernel.LowerBaseHash; 119 | var upper = hashKernel.UpperBaseHash; 120 | 121 | // Increment count in each row. 122 | for (uint i = 0; i < this.Depth; i++) 123 | { 124 | this.Matrix[i][(lower + upper * i) % this.Width]++; 125 | } 126 | 127 | this.count++; 128 | return this; 129 | } 130 | 131 | /// 132 | /// Returns the approximate count for the specified item, correct within 133 | /// epsilon * total count with a probability of delta. 134 | /// 135 | /// 136 | /// The data to count. 137 | public UInt64 Count(byte[] data) 138 | { 139 | var hashKernel = Utils.HashKernel(data, this.Hash); 140 | var lower = hashKernel.LowerBaseHash; 141 | var upper = hashKernel.UpperBaseHash; 142 | var count = UInt64.MaxValue; 143 | 144 | for (uint i = 0; i < this.Depth; i++) 145 | { 146 | count = Math.Min(count, this.Matrix[i][(lower + upper * i) % this.Width]); 147 | } 148 | 149 | return count; 150 | } 151 | 152 | /// 153 | /// Combines this CountMinSketch with another. Returns a bool if the merge was 154 | /// successful. Throws an exception if the matrix width and depth are not equal. 155 | /// 156 | /// The CountMinSketch to merge with the current 157 | /// instance. 158 | /// True if successful. 
159 | public bool Merge(CountMinSketch other) 160 | { 161 | if (this.Depth != other.Depth) 162 | { 163 | throw new Exception("Matrix depth must match."); 164 | } 165 | 166 | if (this.Width != other.Width) 167 | { 168 | throw new Exception("Matrix width must match."); 169 | } 170 | 171 | for (uint i = 0; i < this.Depth; i++) 172 | { 173 | for (int j = 0; j < this.Width; j++) 174 | { 175 | this.Matrix[i][j] += other.Matrix[i][j]; 176 | } 177 | } 178 | 179 | this.count += other.count; 180 | return true; 181 | } 182 | 183 | /// 184 | /// Restores the CountMinSketch to its original state. It returns itself to allow 185 | /// for chaining. 186 | /// 187 | /// The CountMinSketch 188 | public CountMinSketch Reset() 189 | { 190 | this.Matrix = new UInt64[this.Depth][]; 191 | for (uint i = 0; i < this.Depth; i++) 192 | { 193 | this.Matrix[i] = new UInt64[this.Width]; 194 | } 195 | 196 | this.count = 0; 197 | return this; 198 | } 199 | 200 | /// 201 | /// Sets the hashing function used in the filter. 202 | /// 203 | /// The HashAlgorithm to use. 204 | public void SetHash(HashAlgorithm h) 205 | { 206 | this.Hash = h; 207 | } 208 | 209 | // TODO: Implement these later. 210 | // WriteDataTo() 211 | // ReadDataFrom() 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/CountingBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using System.Security.Cryptography; 2 | 3 | namespace ProbabilisticDataStructures 4 | { 5 | /// 6 | /// CountingBloomFilter implements a Counting Bloom Filter as described by Fan, 7 | /// Cao, Almeida, and Broder in Summary Cache: A Scalable Wide-Area Web Cache 8 | /// Sharing Protocol: 9 | /// 10 | /// http://pages.cs.wisc.edu/~jussara/papers/00ton.pdf 11 | /// 12 | /// A Counting Bloom Filter (CBF) provides a way to remove elements by using an 13 | /// array of n-bit buckets. When an element is added, the respective buckets are 14 | /// incremented. To remove an element, the respective buckets are decremented. A 15 | /// query checks that each of the respective buckets are non-zero. Because CBFs 16 | /// allow elements to be removed, they introduce a non-zero probability of false 17 | /// negatives in addition to the possibility of false positives. 18 | /// 19 | /// Counting Bloom Filters are useful for cases where elements are both added 20 | /// and removed from the data set. Since they use n-bit buckets, CBFs use 21 | /// roughly n-times more memory than traditional Bloom filters. 22 | /// 23 | public class CountingBloomFilter : IFilter 24 | { 25 | /// 26 | /// Filter data 27 | /// 28 | internal Buckets Buckets { get; set; } 29 | /// 30 | /// Hash algorithm 31 | /// 32 | private HashAlgorithm Hash { get; set; } 33 | /// 34 | /// Filter size 35 | /// 36 | private uint m { get; set; } 37 | /// 38 | /// Number of hash functions 39 | /// 40 | private uint k { get; set; } 41 | /// 42 | /// Number of items added 43 | /// 44 | private uint count { get; set; } 45 | /// 46 | /// Buffer used to cache indices 47 | /// 48 | private uint[] indexBuffer { get; set; } 49 | 50 | /// 51 | /// Creates a new Counting Bloom Filter optimized to store n-items with a 52 | /// specified target false-positive rate and bucket size. If you don't know how 53 | /// many bits to use for buckets, use NewDefaultCountingBloomFilter for a 54 | /// sensible default. 55 | /// 56 | /// Number of items to store. 57 | /// Bucket size. 58 | /// Desired false positive rate. 
59 | public CountingBloomFilter(uint n, byte b, double fpRate) 60 | { 61 | var m = Utils.OptimalM(n, fpRate); 62 | var k = Utils.OptimalK(fpRate); 63 | this.Buckets = new Buckets(m, b); 64 | this.Hash = Defaults.GetDefaultHashAlgorithm(); 65 | this.m = m; 66 | this.k = k; 67 | this.indexBuffer = new uint[k]; 68 | } 69 | 70 | /// 71 | /// Creates a new Counting Bloom Filter optimized to store n items with a 72 | /// specified target false-positive rate. Buckets are allocated four bits. 73 | /// 74 | /// Number of items to store. 75 | /// Desired false positive rate. 76 | /// Default CountingBloomFilter 77 | public static CountingBloomFilter NewDefaultCountingBloomFilter( 78 | uint n, 79 | double fpRate) 80 | { 81 | return new CountingBloomFilter(n, 4, fpRate); 82 | } 83 | 84 | /// 85 | /// Returns the Bloom filter capacity, m. 86 | /// 87 | /// The Bloom filter capacity, m. 88 | public uint Capacity() 89 | { 90 | return this.m; 91 | } 92 | 93 | /// 94 | /// Returns the number of hash functions. 95 | /// 96 | /// The number of hash functions. 97 | public uint K() 98 | { 99 | return this.k; 100 | } 101 | 102 | /// 103 | /// Returns the number of items in the filter. 104 | /// 105 | /// 106 | public uint Count() 107 | { 108 | return this.count; 109 | } 110 | 111 | /// 112 | /// Will test for membership of the data and returns true if it is a member, 113 | /// false if not. This is a probabilistic test, meaning there is a non-zero 114 | /// probability of false positives but a zero probability of false negatives. 115 | /// 116 | /// The data to search for. 117 | /// Whether or not the data is maybe contained in the filter. 118 | public bool Test(byte[] data) 119 | { 120 | var hashKernel = Utils.HashKernel(data, this.Hash); 121 | var lower = hashKernel.LowerBaseHash; 122 | var upper = hashKernel.UpperBaseHash; 123 | 124 | // If any of the K bits are not set, then it's not a member. 125 | for (uint i = 0; i < this.k; i++) 126 | { 127 | if (this.Buckets.Get((lower + upper * i) % this.m) == 0) 128 | { 129 | return false; 130 | } 131 | } 132 | return true; 133 | } 134 | 135 | /// 136 | /// Will add the data to the Bloom filter. It returns the filter to allow 137 | /// for chaining. 138 | /// 139 | /// The data to add. 140 | /// The filter. 141 | public IFilter Add(byte[] data) 142 | { 143 | var hashKernel = Utils.HashKernel(data, this.Hash); 144 | var lower = hashKernel.LowerBaseHash; 145 | var upper = hashKernel.UpperBaseHash; 146 | 147 | // Set the K bits. 148 | for (uint i = 0; i < this.k; i++) 149 | { 150 | this.Buckets.Increment((lower + upper * i) % this.m, 1); 151 | } 152 | 153 | this.count++; 154 | return this; 155 | } 156 | 157 | /// 158 | /// Is equivalent to calling Test followed by Add. It returns true if the data is 159 | /// a member, false if not. 160 | /// 161 | /// The data to test for and add if it doesn't exist. 162 | /// Whether or not the data was probably contained in the filter. 163 | public bool TestAndAdd(byte[] data) 164 | { 165 | var hashKernel = Utils.HashKernel(data, this.Hash); 166 | var lower = hashKernel.LowerBaseHash; 167 | var upper = hashKernel.UpperBaseHash; 168 | var member = true; 169 | 170 | // If any of the K bits are not set, then it's not a member. 
171 | for (uint i = 0; i < this.k; i++) 172 | { 173 | var idx = (lower + upper * i) % this.m; 174 | if (this.Buckets.Get(idx) == 0) 175 | { 176 | member = false; 177 | } 178 | this.Buckets.Increment(idx, 1); 179 | } 180 | 181 | this.count++; 182 | return member; 183 | } 184 | 185 | /// 186 | /// Will test for membership of the data and remove it from the filter if it 187 | /// exists. Returns true if the data was a member, false if not. 188 | /// 189 | /// The data to check for and remove. 190 | /// Whether or not the data was in the filter before removal. 191 | public bool TestAndRemove(byte[] data) 192 | { 193 | var hashKernel = Utils.HashKernel(data, this.Hash); 194 | var lower = hashKernel.LowerBaseHash; 195 | var upper = hashKernel.UpperBaseHash; 196 | var member = true; 197 | 198 | // Set the K bits. 199 | for (uint i = 0; i < this.k; i++) 200 | { 201 | this.indexBuffer[i] = (lower + upper * i) % this.m; 202 | if (this.Buckets.Get(this.indexBuffer[i]) == 0) 203 | { 204 | member = false; 205 | } 206 | } 207 | 208 | if (member) 209 | { 210 | foreach (var idx in this.indexBuffer) 211 | { 212 | this.Buckets.Increment(idx, -1); 213 | } 214 | this.count--; 215 | } 216 | 217 | return member; 218 | } 219 | 220 | /// 221 | /// Restores the Bloom filter to its original state. It returns the filter to 222 | /// allow for chaining. 223 | /// 224 | /// The reset bloom filter. 225 | public CountingBloomFilter Reset() 226 | { 227 | this.Buckets.Reset(); 228 | this.count = 0; 229 | return this; 230 | } 231 | 232 | /// 233 | /// Sets the hashing function used in the filter. 234 | /// 235 | /// The HashAlgorithm to use. 236 | // TODO: Add SetHash to the IFilter interface? 237 | public void SetHash(HashAlgorithm h) 238 | { 239 | this.Hash = h; 240 | } 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/Defaults.cs: -------------------------------------------------------------------------------- 1 | using System.Security.Cryptography; 2 | using System.Runtime.CompilerServices; 3 | [assembly: InternalsVisibleTo("TestProbabilisticDataStructures")] 4 | 5 | namespace ProbabilisticDataStructures 6 | { 7 | public static class Defaults 8 | { 9 | public const double FILL_RATIO = 0.5; 10 | 11 | /// 12 | /// Returns the default hashing algorithm for the library. 13 | /// 14 | /// The default hashing algorithm for the library 15 | internal static HashAlgorithm GetDefaultHashAlgorithm() 16 | { 17 | return HashAlgorithm.Create("MD5"); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/DeletableBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using System.Security.Cryptography; 2 | 3 | namespace ProbabilisticDataStructures 4 | { 5 | /// 6 | /// DeletableBloomFilter implements a Deletable Bloom Filter as described by 7 | /// Rothenberg, Macapuna, Verdi, Magalhaes in The Deletable Bloom filter - A new 8 | /// member of the Bloom family: 9 | /// 10 | /// http://arxiv.org/pdf/1005.0352.pdf 11 | /// 12 | /// A Deletable Bloom Filter compactly stores information on collisions when 13 | /// inserting elements. This information is used to determine if elements are 14 | /// deletable. This design enables false-negative-free deletions at a fraction 15 | /// of the cost in memory consumption. 
16 | /// 17 | /// Deletable Bloom Filters are useful for cases which require removing elements 18 | /// but cannot allow false negatives. This means they can be safely swapped in 19 | /// place of traditional Bloom filters. 20 | /// 21 | public class DeletableBloomFilter : IFilter 22 | { 23 | /// 24 | /// Filter data 25 | /// 26 | internal Buckets Buckets { get; set; } 27 | /// 28 | /// Filter collision data 29 | /// 30 | internal Buckets Collisions { get; set; } 31 | /// 32 | /// Hash algorithm 33 | /// 34 | private HashAlgorithm Hash { get; set; } 35 | /// 36 | /// Filter size 37 | /// 38 | private uint M { get; set; } 39 | /// 40 | /// Number of bits in a region 41 | /// 42 | private uint RegionSize { get; set; } 43 | /// 44 | /// Number of hash functions 45 | /// 46 | private uint k { get; set; } 47 | /// 48 | /// Number of items in the filter 49 | /// 50 | private uint count { get; set; } 51 | /// 52 | /// Buffer used to cache indices 53 | /// 54 | private uint[] IndexBuffer { get; set; } 55 | 56 | /// 57 | /// NewDeletableBloomFilter creates a new DeletableBloomFilter optimized to store 58 | /// n items with a specified target false-positive rate. The r value determines 59 | /// the number of bits to use to store collision information. This controls the 60 | /// deletability of an element. Refer to the paper for selecting an optimal value. 61 | /// 62 | /// Number of items 63 | /// Number of bits to use to store collision information 64 | /// Desired false positive rate 65 | public DeletableBloomFilter(uint n, uint r, double fpRate) 66 | { 67 | var m = Utils.OptimalM(n, fpRate); 68 | var k = Utils.OptimalK(fpRate); 69 | 70 | this.Buckets = new Buckets(m - r, 1); 71 | this.Collisions = new Buckets(r, 1); 72 | this.Hash = Defaults.GetDefaultHashAlgorithm(); 73 | this.M = m - r; 74 | this.RegionSize = (m - r) / r; 75 | this.k = k; 76 | this.IndexBuffer = new uint[k]; 77 | } 78 | 79 | /// 80 | /// Returns the Bloom filter capacity, m. 81 | /// 82 | /// The Bloom filter capacity, m 83 | public uint Capacity() 84 | { 85 | return this.M; 86 | } 87 | 88 | /// 89 | /// Returns the number of hash functions. 90 | /// 91 | /// The number of hash functions 92 | public uint K() 93 | { 94 | return this.k; 95 | } 96 | 97 | /// 98 | /// Returns the number of items added to the filter. 99 | /// 100 | /// The number of items added to the filter 101 | public uint Count() 102 | { 103 | return this.count; 104 | } 105 | 106 | /// 107 | /// Will test for membership of the data and returns true if it is a member, 108 | /// false if not. This is a probabilistic test, meaning there is a non-zero 109 | /// probability of false positives but a zero probability of false negatives. 110 | /// 111 | /// The data to search for. 112 | /// Whether or not the data is maybe contained in the filter. 113 | public bool Test(byte[] data) 114 | { 115 | var hashKernel = Utils.HashKernel(data, this.Hash); 116 | var lower = hashKernel.LowerBaseHash; 117 | var upper = hashKernel.UpperBaseHash; 118 | 119 | // If any of the K bits are not set, then it's not a member. 120 | for (uint i = 0; i < this.k; i++) 121 | { 122 | if (this.Buckets.Get((lower + upper * i) % this.M) == 0) 123 | { 124 | return false; 125 | } 126 | } 127 | return true; 128 | } 129 | 130 | /// 131 | /// Will add the data to the Bloom filter. It returns the filter to allow 132 | /// for chaining. 133 | /// 134 | /// The data to add. 135 | /// The filter. 
136 | public IFilter Add(byte[] data) 137 | { 138 | var hashKernel = Utils.HashKernel(data, this.Hash); 139 | var lower = hashKernel.LowerBaseHash; 140 | var upper = hashKernel.UpperBaseHash; 141 | 142 | // Set the K bits. 143 | for (uint i = 0; i < this.k; i++) 144 | { 145 | var idx = (lower + upper * i) % this.M; 146 | if (this.Buckets.Get(idx) != 0) 147 | { 148 | // Collision, set corresponding region bit. 149 | this.Collisions.Set(idx / this.RegionSize, 1); 150 | } 151 | else 152 | { 153 | this.Buckets.Set(idx, 1); 154 | } 155 | } 156 | 157 | this.count++; 158 | return this; 159 | } 160 | 161 | /// 162 | /// Is equivalent to calling Test followed by Add. It returns true if the data is 163 | /// a member, false if not. 164 | /// 165 | /// The data to test for and add if it doesn't exist. 166 | /// Whether or not the data was probably contained in the filter. 167 | public bool TestAndAdd(byte[] data) 168 | { 169 | var hashKernel = Utils.HashKernel(data, this.Hash); 170 | var lower = hashKernel.LowerBaseHash; 171 | var upper = hashKernel.UpperBaseHash; 172 | var member = true; 173 | 174 | // If any of the K bits are not set, then it's not a member. 175 | for (uint i = 0; i < this.k; i++) 176 | { 177 | var idx = (lower + upper * i) % this.M; 178 | if (this.Buckets.Get(idx) == 0) 179 | { 180 | member = false; 181 | } 182 | else 183 | { 184 | // Collision, set corresponding region bit. 185 | this.Collisions.Set(idx / this.RegionSize, 1); 186 | } 187 | this.Buckets.Set(idx, 1); 188 | } 189 | 190 | this.count++; 191 | return member; 192 | } 193 | 194 | /// 195 | /// Will test for membership of the data and remove it from the filter if it 196 | /// exists. Returns true if the data was a member, false if not. 197 | /// 198 | /// The data to test for and remove 199 | /// Whether or not the data was a member before this call 200 | public bool TestAndRemove(byte[] data) 201 | { 202 | var hashKernel = Utils.HashKernel(data, this.Hash); 203 | var lower = hashKernel.LowerBaseHash; 204 | var upper = hashKernel.UpperBaseHash; 205 | var member = true; 206 | 207 | // Set the K bits. 208 | for (uint i = 0; i < this.k; i++) 209 | { 210 | var idx = (lower + upper * i) % this.M; 211 | this.IndexBuffer[i] = idx; 212 | if (this.Buckets.Get(idx) == 0) 213 | { 214 | member = false; 215 | } 216 | } 217 | 218 | if (member) 219 | { 220 | foreach (var idx in this.IndexBuffer) 221 | { 222 | if (this.Collisions.Get(idx / this.RegionSize) == 0) 223 | { 224 | // Clear only bits located in collision-free zones. 225 | this.Buckets.Set(idx, 0); 226 | } 227 | } 228 | this.count--; 229 | } 230 | 231 | return member; 232 | } 233 | 234 | /// 235 | /// Restores the Bloom filter to its original state. It returns the filter to 236 | /// allow for chaining. 237 | /// 238 | /// The reset bloom filter. 239 | public DeletableBloomFilter Reset() 240 | { 241 | this.Buckets.Reset(); 242 | this.Collisions.Reset(); 243 | this.count = 0; 244 | return this; 245 | } 246 | 247 | /// 248 | /// Sets the hashing function used in the filter. 249 | /// 250 | /// The HashAlgorithm to use. 251 | // TODO: Add SetHash to the IFilter interface? 
252 | public void SetHash(HashAlgorithm h) 253 | { 254 | this.Hash = h; 255 | } 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/Element.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace ProbabilisticDataStructures 4 | { 5 | public class Element 6 | { 7 | public byte[] Data { get; set; } 8 | public UInt64 Freq { get; set; } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/ElementHeap.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | 5 | namespace ProbabilisticDataStructures 6 | { 7 | internal class ElementHeap 8 | { 9 | internal List Heap { get; set; } 10 | 11 | /// 12 | /// Create a new ElementHeap that can store the top-k elements. 13 | /// 14 | /// The number of top elements to track 15 | internal ElementHeap(int k) 16 | { 17 | this.Heap = new List(k); 18 | } 19 | 20 | /// 21 | /// Get the count of the number of items on the heap. 22 | /// 23 | /// The number of items on the heap 24 | internal int Len() 25 | { 26 | return this.Heap.Count; 27 | } 28 | 29 | /// 30 | /// Return whether or not the item at i-position on the heap is less than the 31 | /// item at j-position. 32 | /// 33 | /// Item 1 34 | /// Item 2 35 | /// 36 | /// Whether or not the item at i-position on the heap is less than the item at 37 | /// j-position. 38 | /// 39 | internal bool Less(int i, int j) 40 | { 41 | return this.Heap[i].Freq < this.Heap[j].Freq; 42 | } 43 | 44 | /// 45 | /// Swap the items at i-position and j-position on the heap. 46 | /// 47 | /// Item 1 48 | /// Item 2 49 | internal void Swap(int i, int j) 50 | { 51 | var temp = this.Heap[i]; 52 | Heap[i] = Heap[j]; 53 | Heap[j] = temp; 54 | } 55 | 56 | /// 57 | /// Push an Element onto the heap. 58 | /// 59 | /// The Element to push onto the heap 60 | internal void Push(Element e) 61 | { 62 | this.Heap.Add(e); 63 | this.Up(this.Len() - 1); 64 | } 65 | 66 | /// 67 | /// Remove the Element at the top of the heap. 68 | /// 69 | /// The Element that was removed 70 | internal Element Pop() 71 | { 72 | var elementToRemove = this.Heap[0]; 73 | this.Heap.Remove(elementToRemove); 74 | return elementToRemove; 75 | } 76 | 77 | internal void Up(int j) 78 | { 79 | while (true) 80 | { 81 | var i = (j - 1) / 2; // parent 82 | if (i == j || !this.Less(j, i)) 83 | { 84 | break; 85 | } 86 | this.Swap(i, j); 87 | j = i; 88 | } 89 | } 90 | 91 | internal void Down(int i, int n) 92 | { 93 | while (true) 94 | { 95 | var j1 = 2 * i + 1; 96 | if (j1 >= n || j1 < 0) 97 | { 98 | // j1 < - after int overflow 99 | break; 100 | } 101 | var j = j1; // left child 102 | var j2 = j1 + 1; 103 | if (j2 < n && !this.Less(j1, j2)) 104 | { 105 | j = j2; // 2*i + 2 // right child 106 | } 107 | if (!this.Less(j, i)) 108 | { 109 | break; 110 | } 111 | this.Swap(i, j); 112 | i = j; 113 | } 114 | } 115 | 116 | /// 117 | /// Returns the top-k elements from lowest to highest frequency. 118 | /// 119 | /// The top-k elements from lowest to highest frequency 120 | internal Element[] Elements() 121 | { 122 | if (this.Len() == 0) 123 | { 124 | return new Element[0]; 125 | } 126 | 127 | return this.Heap 128 | .OrderBy(x => x.Freq) 129 | .ToArray(); 130 | } 131 | 132 | /// 133 | /// Adds the data to the top-k heap. 
If the data is already an element, the 134 | /// frequency is updated. If the heap already has k elements, the element with 135 | /// the minimum frequency is removed. 136 | /// 137 | /// The data to insert 138 | /// The frequency to associate with the data 139 | internal void insert(byte[] data, UInt64 freq, uint k) 140 | { 141 | for (int i = 0; i < this.Len(); i++) 142 | { 143 | var element = this.Heap[i]; 144 | if (Enumerable.SequenceEqual(data, element.Data)) 145 | { 146 | // Element already in top-k. 147 | element.Freq = freq; 148 | return; 149 | } 150 | } 151 | 152 | if (this.Len() == k) 153 | { 154 | // Remove minimum-frequency element. 155 | this.Pop(); 156 | } 157 | 158 | // Add element to top-k. 159 | this.Push(new Element 160 | { 161 | Data = data, 162 | Freq = freq, 163 | }); 164 | } 165 | 166 | /// 167 | /// Indicates if the given frequency falls within the top-k heap. 168 | /// 169 | /// The frequency to check 170 | /// Whether or not the frequency falls within the top-k heap 171 | internal bool isTop(UInt64 freq, uint k) 172 | { 173 | if (this.Len() < k) 174 | { 175 | return true; 176 | } 177 | 178 | return freq >= this.Heap[0].Freq; 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/HyperLogLog.cs: -------------------------------------------------------------------------------- 1 | /* 2 | Original work Copyright 2013 Eric Lesh 3 | Modified work Copyright 2015 Tyler Treat 4 | Modified work Copyright 2015 Matthew Lorimor 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | */ 17 | 18 | using System; 19 | using System.Linq; 20 | using System.Security.Cryptography; 21 | 22 | namespace ProbabilisticDataStructures 23 | { 24 | /// 25 | /// implements the HyperLogLog cardinality estimation algorithm as 26 | /// described by Flajolet, Fusy, Gandouet, and Meunier in HyperLogLog: the 27 | /// analysis of a near-optimal cardinality estimation algorithm: 28 | /// 29 | /// http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf 30 | /// 31 | /// HyperLogLog is a probabilistic algorithm which approximates the number of 32 | /// distinct elements in a multiset. It works by hashing values and calculating 33 | /// the maximum number of leading zeros in the binary representation of each 34 | /// hash. If the maximum number of leading zeros is n, the estimated number of 35 | /// distinct elements in the set is 2^n. To minimize variance, the multiset is 36 | /// split into a configurable number of registers, the maximum number of leading 37 | /// zeros is calculated in the numbers in each register, and a harmonic mean is 38 | /// used to combine the estimates. 39 | /// 40 | /// For large or unbounded data sets, calculating the exact cardinality is 41 | /// impractical. HyperLogLog uses a fraction of the memory while providing an 42 | /// accurate approximation. For counting element frequency, refer to the 43 | /// Count-Min Sketch. 
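/// The standard error is roughly 1.04 / sqrt(m); for example, m = 1024
/// registers give about 3.25% error while storing only one byte per register.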
44 | /// 45 | public class HyperLogLog 46 | { 47 | private static double Exp32 = Math.Pow(2, 32); 48 | 49 | /// 50 | /// Counter registers 51 | /// 52 | private byte[] Registers { get; set; } 53 | /// 54 | /// Number of registers 55 | /// 56 | internal uint M { get; set; } 57 | /// 58 | /// Number of bits to calculate register 59 | /// 60 | private uint B { get; set; } 61 | /// 62 | /// Bias-correction constant 63 | /// 64 | private double Alpha { get; set; } 65 | /// 66 | /// Hash algorithm 67 | /// 68 | private HashAlgorithm Hash { get; set; } 69 | 70 | /// 71 | /// Creates a new HyperLogLog with m registers. Returns an error if m isn't a 72 | /// power of two. 73 | /// 74 | /// Number of registers (must be a power of two) 75 | public HyperLogLog(uint m) 76 | { 77 | if ((m & (m - 1)) != 0) 78 | { 79 | throw new ArgumentException(String.Format("{0} is not a power of two", m)); 80 | } 81 | 82 | this.Registers = new byte[m]; 83 | this.M = m; 84 | this.B = (uint)Math.Ceiling(Math.Log(m, 2)); 85 | this.Alpha = CalculateAlpha(m); 86 | this.Hash = Defaults.GetDefaultHashAlgorithm(); 87 | } 88 | 89 | /// 90 | /// Creates a new HyperLogLog optimized for the specified standard error. 91 | /// Throws an ArgumentException if the number of registers can't be calculated 92 | /// for the provided accuracy. 93 | /// 94 | /// Desired standard error 95 | /// The HyperLogLog optimized for the standard error 96 | public static HyperLogLog NewDefaultHyperLogLog(double e) 97 | { 98 | var m = Math.Pow(1.04 / e, 2); 99 | return new HyperLogLog((uint)Math.Pow(2, Math.Ceiling(Math.Log(m, 2)))); 100 | } 101 | 102 | /// 103 | /// Will add the data to the set. Returns the HyperLogLog to allow for chaining. 104 | /// 105 | /// The data to add 106 | /// The HyperLogLog 107 | public HyperLogLog Add(byte[] data) 108 | { 109 | var hash = CalculateHash(data); 110 | var k = 32 - this.B; 111 | var r = CalculateRho(hash << (int)this.B, k); 112 | var j = hash >> (int)k; 113 | 114 | if (r > this.Registers[j]) 115 | { 116 | this.Registers[j] = r; 117 | } 118 | 119 | return this; 120 | } 121 | 122 | /// 123 | /// Returns the approximated cardinality of the set. 124 | /// 125 | /// The approximated cardinality of the set 126 | public UInt64 Count() 127 | { 128 | var sum = 0.0; 129 | var m = (double)this.M; 130 | foreach (var val in this.Registers) 131 | { 132 | sum += 1.0 / Math.Pow(2.0, val); 133 | } 134 | var estimate = this.Alpha * m * m / sum; 135 | if (estimate <= 5.0 / 2.0 * m) 136 | { 137 | // Small range correction 138 | var v = 0; 139 | foreach (var r in this.Registers) 140 | { 141 | if (r == 0) 142 | { 143 | v++; 144 | } 145 | } 146 | if (v > 0) 147 | { 148 | estimate = m * Math.Log(m / v); 149 | } 150 | } 151 | else if (estimate > 1.0 / 30.0 * Exp32) 152 | { 153 | // Large range correction 154 | estimate = -Exp32 * Math.Log(1 - estimate / Exp32); 155 | } 156 | return (UInt64)estimate; 157 | } 158 | 159 | /// 160 | /// Combines this HyperLogLog with another. Returns an error if the number of 161 | /// registers in the two HyperLogLogs are not equal. 
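A minimal usage sketch of the class above (the error target and key format are arbitrary choices for illustration):

```csharp
using System;
using System.Text;
using ProbabilisticDataStructures;

class HllUsage
{
    static void Main()
    {
        // Ask for roughly 2% standard error; the factory picks m.
        var hll = HyperLogLog.NewDefaultHyperLogLog(0.02);

        for (int i = 0; i < 100000; i++)
        {
            // Only 25,000 distinct values are ever added.
            hll.Add(Encoding.ASCII.GetBytes("user-" + (i % 25000)));
        }

        // Expect a value near 25,000, within a few percent.
        Console.WriteLine(hll.Count());
    }
}
```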
162 | /// 163 | /// The HyperLogLog to merge 164 | /// Whether or not the merge was successful 165 | public bool Merge(HyperLogLog other) 166 | { 167 | if (this.M != other.M) 168 | { 169 | throw new ArgumentException("Number of registers must match"); 170 | } 171 | 172 | for (int i = 0; i < other.Registers.Count(); i++) 173 | { 174 | var r = other.Registers[i]; 175 | if (r > this.Registers[i]) 176 | { 177 | this.Registers[i] = r; 178 | } 179 | } 180 | 181 | return true; 182 | } 183 | 184 | /// 185 | /// Restores the HyperLogLog to its original state. It returns itself to allow 186 | /// for chaining. 187 | /// 188 | /// The HyperLogLog 189 | public HyperLogLog Reset() 190 | { 191 | this.Registers = new byte[this.M]; 192 | return this; 193 | } 194 | 195 | /// 196 | /// Sets the hashing function used in the filter. 197 | /// 198 | /// The HashAlgorithm to use. 199 | public void SetHash(HashAlgorithm h) 200 | { 201 | this.Hash = h; 202 | } 203 | 204 | /// 205 | /// Returns a 32-bit hash value for the given data. 206 | /// 207 | /// Data 208 | /// 32-bit hash value 209 | private uint CalculateHash(byte[] data) 210 | { 211 | var sum = Hash.ComputeHash(data); 212 | return Utils.HashBytesToUInt32(sum); 213 | } 214 | 215 | /// 216 | /// Calculates the bias-correction constant alpha based on the number of 217 | /// registers, m. 218 | /// 219 | /// Number of registers 220 | /// Calculated bias-correction constant, alpha 221 | private static double CalculateAlpha(uint m) 222 | { 223 | switch (m) 224 | { 225 | case 16: 226 | return 0.673; 227 | case 32: 228 | return 0.697; 229 | case 64: 230 | return 0.709; 231 | default: 232 | return 0.7213 / (1.0 + 1.079 / m); 233 | } 234 | } 235 | 236 | /// 237 | /// Calculates the position of the leftmost 1-bit. 238 | /// 239 | /// The value to check 240 | /// 241 | /// The position of the leftmost 1-bit 242 | private static byte CalculateRho(uint val, uint max) 243 | { 244 | var r = 1; 245 | while ((val & 0x80000000) == 0 && r <= max) 246 | { 247 | r++; 248 | val <<= 1; 249 | } 250 | return (byte)r; 251 | } 252 | 253 | // TODO: Implement these later. 254 | // WriteDataTo 255 | // ReadDataFrom 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/IFilter.cs: -------------------------------------------------------------------------------- 1 | namespace ProbabilisticDataStructures 2 | { 3 | public interface IFilter 4 | { 5 | /// 6 | /// Will test for membership of the data and returns true if it is a member, 7 | /// false if not. 8 | /// 9 | /// The data to test for. 10 | /// Whether or not the data is probably contained in the filter. 11 | bool Test(byte[] data); 12 | /// 13 | /// Add will add the data to the Bloom filter. It returns the filter to allow 14 | /// for chaining. 15 | /// 16 | /// The data to add. 17 | /// The filter. 18 | IFilter Add(byte[] data); 19 | /// 20 | /// Is equivalent to calling Test followed by Add. It returns true if the data is 21 | /// a member, false if not. 22 | /// 23 | /// The data to test for and add if it doesn't exist. 24 | /// Whether or not the data was probably contained in the filter. 25 | bool TestAndAdd(byte[] data); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/InverseBloomFilter.cs: -------------------------------------------------------------------------------- 1 | /* 2 | Original work Copyright (c) 2012 Jeff Hodges. All rights reserved. 
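Since every filter in this library implements IFilter, calling code can stay agnostic about which space/accuracy trade-off it is using. A small sketch (the helper name is illustrative):

```csharp
using System.Text;
using ProbabilisticDataStructures;

static class Dedup
{
    // Works against any IFilter: BloomFilter, PartitionedBloomFilter,
    // ScalableBloomFilter, InverseBloomFilter, and so on.
    public static bool SeenBefore(IFilter filter, string key)
    {
        return filter.TestAndAdd(Encoding.ASCII.GetBytes(key));
    }
}
```

Note that the interpretation of the result depends on the filter: the classic Bloom variants may return false positives, while the InverseBloomFilter below may return false negatives.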
3 | Modified work Copyright (c) 2015 Tyler Treat. All rights reserved. 4 | Modified work Copyright (c) 2015 Matthew Lorimor. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following disclaimer 14 | in the documentation and/or other materials provided with the 15 | distribution. 16 | * Neither the name of Jeff Hodges nor the names of this project's 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | using System.Linq; 34 | using System.Security.Cryptography; 35 | 36 | namespace ProbabilisticDataStructures 37 | { 38 | /// 39 | /// InverseBloomFilter is a concurrent "inverse" Bloom filter, which is 40 | /// effectively the opposite of a classic Bloom filter. This was originally 41 | /// described and written by Jeff Hodges: 42 | /// 43 | /// http://www.somethingsimilar.com/2012/05/21/the-opposite-of-a-bloom-filter/ 44 | /// 45 | /// The InverseBloomFilter may report a false negative but can never report a 46 | /// false positive. That is, it may report that an item has not been seen when 47 | /// it actually has, but it will never report an item as seen which it hasn't 48 | /// come across. This behaves in a similar manner to a fixed-size hashmap which 49 | /// does not handle conflicts. 50 | /// 51 | /// An example use case is deduplicating events while processing a stream of 52 | /// data. Ideally, duplicate events are relatively close together. 53 | /// 54 | public class InverseBloomFilter : IFilter 55 | { 56 | private byte[][] Array { get; set; } 57 | internal HashAlgorithm Hash { get; set; } 58 | private uint capacity { get; set; } 59 | 60 | /// 61 | /// Instantiates an InverseBloomFilter with the specified capacity. 62 | /// 63 | /// The capacity of the filter 64 | public InverseBloomFilter(uint capacity) 65 | { 66 | this.Array = new byte[capacity][]; 67 | this.Hash = Defaults.GetDefaultHashAlgorithm(); 68 | this.capacity = capacity; 69 | } 70 | 71 | 72 | /// 73 | /// Will test for membership of the data and returns true if it is a 74 | /// member, false if not. This is a probabilistic test, meaning there is a 75 | /// non-zero probability of false negatives but a zero probability of false 76 | /// positives. 
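A usage sketch for the stream-deduplication case described above (the capacity and event names are arbitrary):

```csharp
using System;
using System.Text;
using ProbabilisticDataStructures;

class StreamDedup
{
    static void Main()
    {
        // Capacity trades memory against how far apart duplicates can
        // be and still be caught; 10,000 slots is an arbitrary choice.
        var filter = new InverseBloomFilter(10000);

        string[] events = { "e1", "e2", "e1", "e3", "e2" };
        foreach (var e in events)
        {
            if (filter.TestAndAdd(Encoding.ASCII.GetBytes(e)))
            {
                Console.WriteLine($"duplicate (definitely seen): {e}");
            }
            // A false result only means "probably not seen": a colliding
            // key may have overwritten the slot in the meantime.
        }
    }
}
```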
That is, it may return false even though the data was added, but 77 | /// it will never return true for data that hasn't been added. 78 | /// 79 | /// The data to test for 80 | /// Whether or not the data is present 81 | public bool Test(byte[] data) 82 | { 83 | var index = this.Index(data); 84 | var val = this.Array[index]; 85 | if (val == null) 86 | { 87 | return false; 88 | } 89 | return Enumerable.SequenceEqual(val, data); 90 | } 91 | 92 | /// 93 | /// Will add the data to the filter. It returns the filter to allow for chaining. 94 | /// 95 | /// 96 | /// 97 | public IFilter Add(byte[] data) 98 | { 99 | var index = this.Index(data); 100 | this.GetAndSet(index, data); 101 | return this; 102 | } 103 | 104 | /// 105 | /// Equivalent to calling Test followed by Add atomically. It returns true if 106 | /// the data is a member, false if not. 107 | /// 108 | /// The data to test and add 109 | /// Whether the data was already a member 110 | public bool TestAndAdd(byte[] data) 111 | { 112 | var index = this.Index(data); 113 | var oldId = this.GetAndSet(index, data); 114 | if (oldId == null) 115 | { 116 | return false; 117 | } 118 | return Enumerable.SequenceEqual(oldId, data); 119 | } 120 | 121 | /// 122 | /// Returns the filter capactiy. 123 | /// 124 | /// The filter capactiy 125 | public uint Capacity() 126 | { 127 | return this.capacity; 128 | } 129 | 130 | /// 131 | /// Returns the data that was in the array at the given index after putting the 132 | /// new data in the array at that index, atomically. 133 | /// 134 | /// The index to get and set 135 | /// The data to set 136 | /// 137 | /// The data that was in the array at the index before setting it 138 | /// 139 | private byte[] GetAndSet(uint index, byte[] data) 140 | { 141 | var oldData = this.Array[index]; 142 | this.Array[index] = data; 143 | return oldData; 144 | } 145 | 146 | /// 147 | /// Returns the array index for the given data. 148 | /// 149 | /// The data to find the index for 150 | /// The array index for the given data 151 | private uint Index(byte[] data) 152 | { 153 | var index = this.ComputeHashSum32(data) % this.capacity; 154 | return index; 155 | } 156 | 157 | /// 158 | /// Returns a 32-bit hash value for the given data. 159 | /// 160 | /// Data 161 | /// 32-bit hash value 162 | private uint ComputeHashSum32(byte[] data) 163 | { 164 | var sum = Hash.ComputeHash(data); 165 | return Utils.HashBytesToUInt32(sum); 166 | } 167 | 168 | /// 169 | /// Sets the hashing function used in the filter. 170 | /// 171 | /// The HashAlgorithm to use. 172 | // TODO: Add SetHash to the IFilter interface? 173 | public void SetHash(HashAlgorithm h) 174 | { 175 | this.Hash = h; 176 | } 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/MinHash.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Threading.Tasks; 5 | 6 | namespace ProbabilisticDataStructures 7 | { 8 | /// 9 | /// MinHash is a variation of the technique for estimating similarity between 10 | /// two sets as presented by Broder in On the resemblance and containment of 11 | /// documents: 12 | /// 13 | /// http://gatekeeper.dec.com/ftp/pub/dec/SRC/publications/broder/positano-final-wpnums.pdf 14 | /// 15 | /// This can be used to cluster or compare documents by splitting the corpus 16 | /// into a bag of words. 
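One caveat on the "atomically" wording in GetAndSet above: as written it is a plain read followed by a plain write, so two threads can interleave between the two steps. If true atomicity is needed, .NET's Interlocked.Exchange performs the swap in one step for reference types such as byte[]. A possible variant (a suggested change to the method body, not what the class currently does):

```csharp
using System.Threading;

// Hypothetical atomic replacement for InverseBloomFilter.GetAndSet:
// Interlocked.Exchange swaps the slot and returns the previous
// reference in a single atomic operation.
private byte[] GetAndSet(uint index, byte[] data)
{
    return Interlocked.Exchange(ref this.Array[index], data);
}
```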
MinHash returns the approximated similarity ratio of 17 | /// the two bags. The similarity is less accurate for very small bags of words. 18 | /// 19 | public static class MinHash 20 | { 21 | private static Random random = new Random(); 22 | 23 | /// 24 | /// Returns the similarity between two bags. 25 | /// 26 | /// The first bag 27 | /// The second bag 28 | /// The similarity between the bags 29 | public static float Similarity(string[] bag1, string[] bag2) 30 | { 31 | var k = bag1.Length + bag2.Length; 32 | var hashes = new int[k]; 33 | for (int i = 0; i < k; i++) 34 | { 35 | var a = random.Next(); 36 | var b = random.Next(); 37 | var c = random.Next(); 38 | var x = computeHash((uint)(a * b * c), (uint)a, (uint)b, c); 39 | hashes[i] = (int)x; 40 | } 41 | 42 | var bMap = bitMap(bag1, bag2); 43 | var minHashValues = hashBuckets(2, k); 44 | minHash(bag1, 0, minHashValues, bMap, k, hashes); 45 | minHash(bag2, 1, minHashValues, bMap, k, hashes); 46 | return similarity(minHashValues, k); 47 | } 48 | 49 | private static void minHash( 50 | string[] bag, 51 | int bagIndex, 52 | int[][] minHashValues, 53 | Dictionary bitArray, 54 | int k, 55 | int[] hashes) 56 | { 57 | var options = new ParallelOptions(); 58 | options.MaxDegreeOfParallelism = 4; 59 | var index = 0; 60 | 61 | foreach (var element in bitArray) 62 | { 63 | Parallel.For(0, k, options, (i, loopState) => 64 | { 65 | if (bag.Contains(element.Key)) 66 | { 67 | var hindex = hashes[index]; 68 | if (hindex < minHashValues[bagIndex][index]) 69 | { 70 | minHashValues[bagIndex][index] = hindex; 71 | } 72 | } 73 | }); 74 | index++; 75 | } 76 | } 77 | 78 | private static Dictionary bitMap(string[] bag1, string[] bag2) 79 | { 80 | var bitArray = new Dictionary(); 81 | foreach (var element in bag1) 82 | { 83 | bitArray[element] = new bool[] { true, false }; 84 | } 85 | 86 | foreach (var element in bag2) 87 | { 88 | if (bitArray.ContainsKey(element)) 89 | { 90 | bitArray[element] = new bool[] { true, true }; 91 | } 92 | else 93 | { 94 | bitArray[element] = new bool[] { false, true }; 95 | } 96 | } 97 | 98 | return bitArray; 99 | } 100 | 101 | private static int[][] hashBuckets(int numSets, int k) 102 | { 103 | var minHashValues = new int[numSets][]; 104 | for (int i = 0; i < numSets; i++) 105 | { 106 | minHashValues[i] = new int[k]; 107 | } 108 | 109 | for (int i = 0; i < numSets; i++) 110 | { 111 | for (int j = 0; j < k; j++) 112 | { 113 | minHashValues[i][j] = int.MaxValue; 114 | } 115 | } 116 | return minHashValues; 117 | } 118 | 119 | private static uint computeHash(uint x, uint a, uint b, int u) 120 | { 121 | return (a * x + b) >> (32 - u); 122 | } 123 | 124 | private static float similarity(int[][] minHashValues, int k) 125 | { 126 | var identicalMinHashes = 0; 127 | for (int i = 0; i < k; i++) 128 | { 129 | if (minHashValues[0][i] == minHashValues[1][i]) 130 | { 131 | identicalMinHashes++; 132 | } 133 | } 134 | 135 | return (float)(1.0 * (float)identicalMinHashes) / (float)k; 136 | } 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/PartitionedBloomFilter.cs: -------------------------------------------------------------------------------- 1 | /* 2 | Original work Copyright (c) 2013 zhenjl 3 | Modified work Copyright (c) 2015 Tyler Treat 4 | Modified work Copyright (c) 2015 Matthew Lorimor 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of 7 | this software and associated documentation files (the "Software"), to deal in 8 | 
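A usage sketch for the MinHash class above (the documents are arbitrary; the hash coefficients are drawn randomly, so treat the result as an approximation rather than an exact Jaccard similarity):

```csharp
using System;
using ProbabilisticDataStructures;

class MinHashDemo
{
    static void Main()
    {
        var doc1 = "the quick brown fox jumps over the lazy dog".Split(' ');
        var doc2 = "the quick brown cat sleeps near the lazy dog".Split(' ');

        // Approximate resemblance of the two bags of words; accuracy
        // degrades for very small bags like these.
        float similarity = MinHash.Similarity(doc1, doc2);
        Console.WriteLine(similarity);
    }
}
```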
the Software without restriction, including without limitation the rights to 9 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 | of the Software, and to permit persons to whom the Software is furnished to do 11 | so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | */ 16 | 17 | using System; 18 | using System.Security.Cryptography; 19 | 20 | namespace ProbabilisticDataStructures 21 | { 22 | /// 23 | /// PartitionedBloomFilter implements a variation of a classic Bloom filter as 24 | /// described by Almeida, Baquero, Preguica, and Hutchison in Scalable Bloom 25 | /// Filters: 26 | /// 27 | /// http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf 28 | /// 29 | /// This filter works by partitioning the M-sized bit array into k slices of 30 | /// size m = M/k bits. Each hash function produces an index over m for its 31 | /// respective slice. Thus, each element is described by exactly k bits, meaning 32 | /// the distribution of false positives is uniform across all elements. 33 | /// 34 | public class PartitionedBloomFilter : IFilter 35 | { 36 | /// 37 | /// Partitioned filter data 38 | /// 39 | internal Buckets[] Partitions { get; set; } 40 | /// 41 | /// Hash algorithm 42 | /// 43 | internal HashAlgorithm Hash { get; set; } 44 | /// 45 | /// Filter size (divided into k partitions) 46 | /// 47 | private uint M { get; set; } 48 | /// 49 | /// Number of hash functions (and partitions) 50 | /// 51 | private uint k { get; set; } 52 | /// 53 | /// Partition size (m / k) 54 | /// 55 | private uint S { get; set; } 56 | /// 57 | /// Number of items added 58 | /// 59 | private uint count { get; set; } 60 | 61 | /// 62 | /// Creates a new partitioned Bloom filter optimized to store n items with a 63 | /// specified target false-positive rate. 64 | /// 65 | /// Number of items 66 | /// Desired false-positive rate 67 | public PartitionedBloomFilter(uint n, double fpRate) 68 | { 69 | var m = Utils.OptimalM(n, fpRate); 70 | var k = Utils.OptimalK(fpRate); 71 | var partitions = new Buckets[k]; 72 | var s = (uint)Math.Ceiling((double)m / (double)k); 73 | 74 | for (uint i = 0; i < k; i++) 75 | { 76 | partitions[i] = new Buckets(s, 1); 77 | } 78 | 79 | this.Partitions = partitions; 80 | this.Hash = Defaults.GetDefaultHashAlgorithm(); 81 | this.M = m; 82 | this.k = k; 83 | this.S = s; 84 | } 85 | 86 | /// 87 | /// Returns the Bloom filter capacity, m. 88 | /// 89 | /// The Bloom filter capacity, m 90 | public uint Capacity() 91 | { 92 | return this.M; 93 | } 94 | 95 | /// 96 | /// Returns the number of hash functions. 97 | /// 98 | /// The number of hash functions 99 | public uint K() 100 | { 101 | return this.k; 102 | } 103 | 104 | /// 105 | /// Returns the number of items in the filter. 106 | /// 107 | /// The number of items in the filter 108 | public uint Count() 109 | { 110 | return this.count; 111 | } 112 | 113 | /// 114 | /// Returns the current estimated ratio of set bits. 115 | /// 116 | /// The current estimated ratio of set bits 117 | public double EstimatedFillRatio() 118 | { 119 | return 1 - Math.Exp(-(double)this.count / (double)this.S); 120 | } 121 | 122 | /// 123 | /// Returns the average ratio of set bits across all partitions. 
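A note on the EstimatedFillRatio formula above: every Add sets exactly one bit in each partition of s bits, so after n insertions the probability that any particular bit is still clear is (1 - 1/s)^n, which is approximately e^(-n/s). The expected fill is therefore 1 - e^(-n/s), which is exactly what the method computes from the insertion count.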
124 | /// 125 | /// The average ratio of set bits across all partitions 126 | public double FillRatio() 127 | { 128 | var t = (double)0; 129 | for (uint i = 0; i < this.k; i++) 130 | { 131 | uint sum = 0; 132 | for (uint j = 0; j < this.Partitions[i].count; j++) 133 | { 134 | sum += this.Partitions[i].Get(j); 135 | } 136 | t += ((double)sum / (double)this.S); 137 | } 138 | return (double)t / (double)this.k; 139 | } 140 | 141 | /// 142 | /// Will test for membership of the data and returns true if it is a 143 | /// member, false if not. This is a probabilistic test, meaning there is a 144 | /// non-zero probability of false positives but a zero probability of false 145 | /// negatives. Due to the way the filter is partitioned, the probability of 146 | /// false positives is uniformly distributed across all elements. 147 | /// 148 | /// The data to test for 149 | /// Whether or not the data was found 150 | public bool Test(byte[] data) 151 | { 152 | var hashKernel = Utils.HashKernel(data, this.Hash); 153 | var lower = hashKernel.LowerBaseHash; 154 | var upper = hashKernel.UpperBaseHash; 155 | 156 | // If any of the K partition bits are not set, then it's not a member. 157 | for (uint i = 0; i < this.k; i++) 158 | { 159 | if (this.Partitions[i].Get((lower + upper * i) % this.S) == 0) 160 | { 161 | return false; 162 | } 163 | } 164 | 165 | return true; 166 | } 167 | 168 | /// 169 | /// Will add the data to the Bloom filter. It returns the filter to allow for 170 | /// chaining. 171 | /// 172 | /// The data to add 173 | /// The PartitionedBloomFilter 174 | public IFilter Add(byte[] data) 175 | { 176 | var hashKernel = Utils.HashKernel(data, this.Hash); 177 | var lower = hashKernel.LowerBaseHash; 178 | var upper = hashKernel.UpperBaseHash; 179 | 180 | // Set the K partition bits. 181 | for (uint i = 0; i < this.k; i++) 182 | { 183 | this.Partitions[i].Set((lower + upper * i) % this.S, 1); 184 | } 185 | 186 | this.count++; 187 | return this; 188 | } 189 | 190 | /// 191 | /// Equivalent to calling Test followed by Add. It returns true if the data is a 192 | /// member, false if not. 193 | /// 194 | /// The data to test for and add 195 | /// 196 | /// Whether the data was present in the filter prior to adding it 197 | /// 198 | public bool TestAndAdd(byte[] data) 199 | { 200 | var hashKernel = Utils.HashKernel(data, this.Hash); 201 | var lower = hashKernel.LowerBaseHash; 202 | var upper = hashKernel.UpperBaseHash; 203 | var member = true; 204 | 205 | // If any K partition bits are not set, then it's not a member. 206 | for (uint i = 0; i < this.k; i++) 207 | { 208 | var idx = (lower + upper * i) % this.S; 209 | if (this.Partitions[i].Get(idx) == 0) 210 | { 211 | member = false; 212 | } 213 | this.Partitions[i].Set(idx, 1); 214 | } 215 | 216 | this.count++; 217 | return member; 218 | } 219 | 220 | /// 221 | /// Restores the Bloom filter to its original state. It returns the filter 222 | /// to allow for chaining. 223 | /// 224 | /// The PartitionedBloomFilter 225 | public PartitionedBloomFilter Reset() 226 | { 227 | foreach (var partition in this.Partitions) 228 | { 229 | partition.Reset(); 230 | } 231 | return this; 232 | } 233 | 234 | /// 235 | /// Sets the hashing function used in the filter. 236 | /// 237 | /// The HashAlgorithm to use. 238 | // TODO: Add SetHash to the IFilter interface?
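Worth noting in the methods above: each of the k probe positions is derived from just two 32-bit base hashes as (lower + upper * i) mod s, the standard two-hash (Kirsch-Mitzenmacher) construction, so only one digest is computed per operation. A usage sketch (the sizes are arbitrary):

```csharp
using System;
using System.Text;
using ProbabilisticDataStructures;

class PartitionedDemo
{
    static void Main()
    {
        // Sized for ~1,000 items at a 1% false-positive target.
        var f = new PartitionedBloomFilter(1000, 0.01);

        f.Add(Encoding.ASCII.GetBytes("alpha"));

        Console.WriteLine(f.Test(Encoding.ASCII.GetBytes("alpha"))); // True
        Console.WriteLine(f.Test(Encoding.ASCII.GetBytes("beta")));  // False (with probability ~0.99)
    }
}
```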
239 | public void SetHash(HashAlgorithm h) 240 | { 241 | this.Hash = h; 242 | } 243 | } 244 | } 245 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/ProbabilisticDataStructures.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | netstandard2.0;net45 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/ScalableBloomFilter.cs: -------------------------------------------------------------------------------- 1 | /* 2 | Original work Copyright (c) 2013 zhenjl 3 | Modified work Copyright (c) 2015 Tyler Treat 4 | Modified work Copyright (c) 2015 Matthew Lorimor 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of 7 | this software and associated documentation files (the "Software"), to deal in 8 | the Software without restriction, including without limitation the rights to 9 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 | of the Software, and to permit persons to whom the Software is furnished to do 11 | so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | */ 16 | 17 | using System; 18 | using System.Collections.Generic; 19 | using System.Linq; 20 | using System.Security.Cryptography; 21 | 22 | namespace ProbabilisticDataStructures 23 | { 24 | /// 25 | /// ScalableBloomFilter implements a Scalable Bloom Filter as described by 26 | /// Almeida, Baquero, Preguica, and Hutchison in Scalable Bloom Filters: 27 | /// 28 | /// http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf 29 | /// 30 | /// A Scalable Bloom Filter dynamically adapts to the number of elements in the 31 | /// data set while enforcing a tight upper bound on the false-positive rate. 32 | /// This works by adding Bloom filters with geometrically decreasing 33 | /// false-positive rates as filters become full. The tightening ratio, r, 34 | /// controls the filter growth. The compounded probability over the whole series 35 | /// converges to a target value, even accounting for an infinite series. 36 | /// 37 | /// Scalable Bloom Filters are useful for cases where the size of the data set 38 | /// isn't known a priori and memory constraints aren't of particular concern. 39 | /// For situations where memory is bounded, consider using Inverse or Stable 40 | /// Bloom Filters. 41 | /// 42 | public class ScalableBloomFilter : IFilter 43 | { 44 | /// 45 | /// Filters with geometrically decreasing error rates 46 | /// 47 | internal List Filters { get; set; } 48 | /// 49 | /// Tightening ratio 50 | /// 51 | internal double R { get; set; } 52 | /// 53 | /// Target false-positive rate 54 | /// 55 | internal double FP { get; set; } 56 | /// 57 | /// Partition fill ratio 58 | /// 59 | private double P { get; set; } 60 | /// 61 | /// Filter size hint 62 | /// 63 | internal uint Hint { get; set; } 64 | 65 | /// 66 | /// Creates a new Scalable Bloom Filter with the specified target false-positive 67 | /// rate and tightening ratio. Use NewDefaultScalableBloomFilter if you don't 68 | /// want to calculate all these parameters. 
69 | /// 70 | /// 71 | /// 72 | /// 73 | public ScalableBloomFilter(uint hint, double fpRate, double r) 74 | { 75 | this.Filters = new List(); 76 | this.R = r; 77 | this.FP = fpRate; 78 | this.P = Defaults.FILL_RATIO; 79 | this.Hint = hint; 80 | 81 | this.AddFilter(); 82 | } 83 | 84 | /// 85 | /// Creates a new Scalable Bloom Filter with the specified target false-positive 86 | /// rate and an optimal tightening ratio. 87 | /// 88 | /// 89 | public static ScalableBloomFilter NewDefaultScalableBloomFilter(double fpRate) 90 | { 91 | return new ScalableBloomFilter(10000, fpRate, 0.8); 92 | } 93 | 94 | /// 95 | /// Returns the current Scalable Bloom Filter capacity, which is the sum of the 96 | /// capacities for the contained series of Bloom filters. 97 | /// 98 | /// The current Scalable Bloom Filter capacity 99 | public uint Capacity() 100 | { 101 | var capacity = 0u; 102 | foreach (var filter in this.Filters) 103 | { 104 | capacity += filter.Capacity(); 105 | } 106 | return capacity; 107 | } 108 | 109 | /// 110 | /// Returns the number of hash functions used in each Bloom filter. 111 | /// 112 | /// The number of hash functions used in each Bloom filter 113 | public uint K() 114 | { 115 | return this.Filters[0].K(); 116 | } 117 | 118 | /// 119 | /// Returns the average ratio of set bits across every filter. 120 | /// 121 | /// The average ratio of set bits across every filter 122 | public double FillRatio() 123 | { 124 | var sum = 0.0; 125 | foreach (var filter in this.Filters) 126 | { 127 | sum += filter.FillRatio(); 128 | } 129 | return (double)sum / this.Filters.Count(); 130 | } 131 | 132 | /// 133 | /// Will test for membership of the data and returns true if it is a member, 134 | /// false if not. This is a probabilistic test, meaning there is a non-zero 135 | /// probability of false positives but a zero probability of false negatives. 136 | /// 137 | /// The data to search for. 138 | /// Whether or not the data is maybe contained in the filter. 139 | public bool Test(byte[] data) 140 | { 141 | // Querying is made by testing for the presence in each filter. 142 | foreach (var filter in this.Filters) 143 | { 144 | if (filter.Test(data)) 145 | { 146 | return true; 147 | } 148 | } 149 | 150 | return false; 151 | } 152 | 153 | /// 154 | /// Add will add the data to the Bloom filter. It returns the filter to allow 155 | /// for chaining. 156 | /// 157 | /// The data to add 158 | /// The ScalableBloomFilter 159 | public IFilter Add(byte[] data) 160 | { 161 | var idx = this.Filters.Count() - 1; 162 | 163 | // If the last filter has reached its fill ratio, add a new one. 164 | if (this.Filters[idx].EstimatedFillRatio() >= this.P) 165 | { 166 | this.AddFilter(); 167 | idx++; 168 | } 169 | 170 | this.Filters[idx].Add(data); 171 | return this; 172 | } 173 | 174 | /// 175 | /// Is equivalent to calling Test followed by Add. It returns true if the data 176 | /// is a member, false if not. 177 | /// 178 | /// The data to test for and add 179 | /// Whether or not the data was present before adding it 180 | public bool TestAndAdd(byte[] data) 181 | { 182 | var member = this.Test(data); 183 | this.Add(data); 184 | return member; 185 | } 186 | 187 | /// 188 | /// Sets the hashing function used in the filter. 189 | /// 190 | /// The HashAlgorithm to use. 191 | // TODO: Add SetHash to the IFilter interface? 
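The growth behavior described in the class comment is easy to see numerically: each new filter receives an error budget of FP * R^i (as computed by AddFilter further below), and the compounded false-positive probability over the whole series is bounded by the geometric sum FP / (1 - R). A small sketch with illustrative numbers:

```csharp
using System;

class TighteningDemo
{
    static void Main()
    {
        double fp = 0.01, r = 0.8;

        // Error budget of each successive filter in the series.
        for (int i = 0; i < 5; i++)
        {
            Console.WriteLine($"filter {i}: fpRate = {fp * Math.Pow(r, i):F5}");
        }

        // The whole series is bounded by the geometric sum fp / (1 - r).
        Console.WriteLine($"compounded bound: {fp / (1 - r):F3}"); // 0.050
    }
}
```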
192 | public void SetHash(HashAlgorithm h) 193 | { 194 | foreach (var filter in this.Filters) 195 | { 196 | filter.SetHash(h); 197 | } 198 | } 199 | 200 | /// 201 | /// Restores the Bloom filter to its original state. It returns the filter to 202 | /// allow for chaining. 203 | /// 204 | /// The reset bloom filter. 205 | public ScalableBloomFilter Reset() 206 | { 207 | this.Filters = new List(); 208 | this.AddFilter(); 209 | return this; 210 | } 211 | 212 | /// 213 | /// Adds a new Bloom filter with a restricted false-positive rate to the 214 | /// Scalable Bloom Filter 215 | /// 216 | internal void AddFilter() 217 | { 218 | var fpRate = this.FP * Math.Pow(this.R, this.Filters.Count()); 219 | var p = new PartitionedBloomFilter(this.Hint, fpRate); 220 | if (this.Filters.Count() > 0) 221 | { 222 | p.SetHash(this.Filters[0].Hash); 223 | } 224 | this.Filters.Add(p); 225 | } 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/TopK.cs: -------------------------------------------------------------------------------- 1 | namespace ProbabilisticDataStructures 2 | { 3 | /// 4 | /// TopK uses a Count-Min Sketch to calculate the top-K frequent elements in a 5 | /// stream. 6 | /// 7 | public class TopK 8 | { 9 | private CountMinSketch Cms { get; set; } 10 | private uint K { get; set; } 11 | internal uint N { get; set; } 12 | private ElementHeap elements { get; set; } 13 | 14 | /// 15 | /// Creates a new TopK backed by a Count-Min sketch whose relative accuracy is 16 | /// within a factor of epsilon with probability delta. It tracks the k-most 17 | /// frequent elements. 18 | /// 19 | /// Relative-accuracy factor 20 | /// Relative-accuracy probability 21 | /// Number of top elements to track 22 | /// 23 | public TopK(double epsilon, double delta, uint k) 24 | { 25 | this.Cms = new CountMinSketch(epsilon, delta); 26 | this.K = k; 27 | this.elements = new ElementHeap((int)k); 28 | } 29 | 30 | /// 31 | /// Will add the data to the Count-Min Sketch and update the top-k heap if 32 | /// applicable. Returns the TopK to allow for chaining. 33 | /// 34 | /// The data to add 35 | /// The TopK 36 | public TopK Add(byte[] data) 37 | { 38 | this.Cms.Add(data); 39 | this.N++; 40 | 41 | var freq = this.Cms.Count(data); 42 | if (this.elements.isTop(freq, this.K)) 43 | { 44 | elements.insert(data, freq, this.K); 45 | } 46 | 47 | return this; 48 | } 49 | 50 | /// 51 | /// Returns the top-k elements from lowest to highest frequency. 52 | /// 53 | /// The top-k elements from lowest to highest frequency 54 | public Element[] Elements() 55 | { 56 | return elements.Elements(); 57 | } 58 | 59 | /// 60 | /// Restores the TopK to its original state. It returns itself to allow for 61 | /// chaining. 62 | /// 63 | /// The TopK 64 | public TopK Reset() 65 | { 66 | this.Cms.Reset(); 67 | this.elements = new ElementHeap((int)K); 68 | this.N = 0; 69 | return this; 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/Utils.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Linq; 3 | using System.Security.Cryptography; 4 | using System.Text; 5 | 6 | namespace ProbabilisticDataStructures 7 | { 8 | public static class Utils 9 | { 10 | /// 11 | /// Calculates the optimal Bloom filter size, m, based on the number of items and 12 | /// the desired rate of false positives. 13 | /// 14 | /// Number of items. 
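A usage sketch for the TopK class above (the accuracy parameters are arbitrary; on input this small the sketch counts are exact with overwhelming probability):

```csharp
using System;
using System.Text;
using ProbabilisticDataStructures;

class TopKDemo
{
    static void Main()
    {
        // Track the 2 most frequent elements, backed by a Count-Min
        // Sketch with epsilon = 0.001 and delta = 0.99.
        var topK = new TopK(0.001, 0.99, 2);

        foreach (var w in new[] { "a", "b", "a", "c", "b", "a" })
        {
            topK.Add(Encoding.ASCII.GetBytes(w));
        }

        // Elements() returns the top-k from lowest to highest frequency:
        // here "b" (2) followed by "a" (3); "c" never makes the cut.
        foreach (var e in topK.Elements())
        {
            Console.WriteLine($"{Encoding.ASCII.GetString(e.Data)}: {e.Freq}");
        }
    }
}
```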
15 | /// Desired false positive rate. 16 | /// The optimal BloomFilter size, m. 17 | public static uint OptimalM(uint n, double fpRate) 18 | { 19 | var optimalM = Math.Ceiling((double)n / ((Math.Log(Defaults.FILL_RATIO) * 20 | Math.Log(1 - Defaults.FILL_RATIO)) / Math.Abs(Math.Log(fpRate)))); 21 | return Convert.ToUInt32(optimalM); 22 | } 23 | 24 | /// 25 | /// Calculates the optimal Bloom filter size, m, based on the number of items and 26 | /// the desired rate of false positives. 27 | /// 28 | /// Number of items. 29 | /// Desired false positive rate. 30 | /// The optimal BloomFilter size, m. 31 | public static ulong OptimalM64(ulong n, double fpRate) 32 | { 33 | var optimalM = Math.Ceiling((double)n / ((Math.Log(Defaults.FILL_RATIO) * 34 | Math.Log(1 - Defaults.FILL_RATIO)) / Math.Abs(Math.Log(fpRate)))); 35 | return Convert.ToUInt64(optimalM); 36 | } 37 | 38 | /// 39 | /// Calculates the optimal number of hash functions to use for a Bloom filter 40 | /// based on the desired rate of false positives. 41 | /// 42 | /// Desired false positive rate. 43 | /// The optimal number of hash functions, k. 44 | public static uint OptimalK(double fpRate) 45 | { 46 | var optimalK = Math.Ceiling(Math.Log(1 / fpRate, 2)); 47 | return Convert.ToUInt32(optimalK); 48 | } 49 | 50 | /// 51 | /// Returns the upper and lower base hash values from which the k hashes are 52 | /// derived. The result will be the same regardless of the endianness of the 53 | /// architecture. 54 | /// 55 | /// The data bytes to hash. 56 | /// The hashing algorithm to use. 57 | /// A HashKernel 58 | public static HashKernelReturnValue HashKernel(byte[] data, HashAlgorithm algorithm) 59 | { 60 | var sum = algorithm.ComputeHash(data); 61 | return HashKernelFromHashBytes(sum); 62 | } 63 | 64 | /// 65 | /// Returns the upper and lower base hash values from which the k hashes are 66 | /// derived using the given hash bytes directly. The result will be the 67 | /// same regardless of the endianness of the architecture. Used by a unit 68 | /// test to confirm the calculation is compatible with the HashKernel from 69 | /// https://github.com/tylertreat/BoomFilters running in Go. 70 | /// 71 | /// The hash bytes. 72 | /// A HashKernel 73 | public static HashKernelReturnValue HashKernelFromHashBytes(byte[] hashBytes) 74 | { 75 | return HashKernelReturnValue.Create( 76 | HashBytesToUInt32(hashBytes, 0), 77 | HashBytesToUInt32(hashBytes, 4) 78 | ); 79 | } 80 | 81 | /// 82 | /// Returns the upper and lower base hash values from which the k hashes are 83 | /// derived. 84 | /// 85 | /// The data bytes to hash. 86 | /// The hashing algorithm to use. 87 | /// A HashKernel 88 | public static HashKernel128ReturnValue HashKernel128(byte[] data, HashAlgorithm algorithm) 89 | { 90 | var sum = algorithm.ComputeHash(data); 91 | return HashKernel128ReturnValue.Create( 92 | HashBytesToUInt64(sum, 0), 93 | HashBytesToUInt64(sum, 8) 94 | ); 95 | } 96 | 97 | /// 98 | /// Returns the uint represented by the given hash bytes, starting at 99 | /// byte . The result will be the same 100 | /// regardless of the endianness of the architecture. 101 | /// 102 | /// 103 | /// 104 | /// 105 | public static uint HashBytesToUInt32(byte[] hashBytes, int offset = 0) 106 | { 107 | return 108 | ((uint)hashBytes[offset]) | 109 | ((uint)hashBytes[offset + 1]) << 8 | 110 | ((uint)hashBytes[offset + 2]) << 16 | 111 | ((uint)hashBytes[offset + 3]) << 24; 112 | } 113 | 114 | /// 115 | /// Returns the ulong represented by the given hash bytes, starting at 116 | /// byte . 
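Assuming Defaults.FILL_RATIO is 0.5 (the value the name suggests, and the one consistent with the unit tests later in this repository), ln(FILL_RATIO) * ln(1 - FILL_RATIO) equals (ln 2)^2, so OptimalM reduces to the textbook m = -n * ln(p) / (ln 2)^2 and OptimalK to ceil(log2(1/p)). A quick check with n = 100 and p = 0.1, which reproduces the 480 and 4 asserted by TestBloomCapacity and TestBloomK further down:

```csharp
using System;

class OptimalParamsDemo
{
    static void Main()
    {
        uint n = 100;
        double p = 0.1;
        const double fillRatio = 0.5; // assumed value of Defaults.FILL_RATIO

        // ln(0.5) * ln(1 - 0.5) = (ln 2)^2 ~= 0.4805, so this is the
        // textbook m = -n * ln(p) / (ln 2)^2.
        var m = Math.Ceiling(n / ((Math.Log(fillRatio) * Math.Log(1 - fillRatio))
                                  / Math.Abs(Math.Log(p))));
        var k = Math.Ceiling(Math.Log(1 / p, 2));

        Console.WriteLine($"m = {m}, k = {k}"); // m = 480, k = 4
    }
}
```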
The result will be the same 117 | /// regardless of the endianness of the architecture. 118 | /// 119 | /// 120 | /// 121 | /// 122 | public static ulong HashBytesToUInt64(byte[] hashBytes, int offset = 0) 123 | { 124 | return 125 | ((ulong)hashBytes[offset]) | 126 | ((ulong)hashBytes[offset + 1]) << 8 | 127 | ((ulong)hashBytes[offset + 2]) << 16 | 128 | ((ulong)hashBytes[offset + 3]) << 24 | 129 | ((ulong)hashBytes[offset + 4]) << 32 | 130 | ((ulong)hashBytes[offset + 5]) << 40 | 131 | ((ulong)hashBytes[offset + 6]) << 48 | 132 | ((ulong)hashBytes[offset + 7]) << 56; 133 | } 134 | 135 | /// 136 | /// Compute the hash for the provided bytes. 137 | /// 138 | /// The bytes to hash. 139 | /// The hash string of the bytes. 140 | public static string ComputeHashAsString(byte[] inputBytes, HashAlgorithm hashAlgorithm) 141 | { 142 | // Compute the hash of the input byte array. 143 | byte[] data = hashAlgorithm.ComputeHash(inputBytes); 144 | 145 | // Create a new StringBuilder to collect the bytes and create a string. 146 | StringBuilder sb = new StringBuilder(); 147 | 148 | // Loop through each byte of the hashed data and format each one as a 149 | // hexadecimal string. 150 | for (int i = 0; i < data.Length; i++) 151 | { 152 | sb.Append(data[i].ToString("X2")); 153 | } 154 | 155 | // Return the hexadecimal string. 156 | return sb.ToString(); 157 | } 158 | } 159 | 160 | public struct HashKernelReturnValue 161 | { 162 | public uint UpperBaseHash { get; private set; } 163 | public uint LowerBaseHash { get; private set; } 164 | 165 | public static HashKernelReturnValue Create(uint lowerBaseHash, uint upperBaseHash) 166 | { 167 | return new HashKernelReturnValue 168 | { 169 | UpperBaseHash = upperBaseHash, 170 | LowerBaseHash = lowerBaseHash 171 | }; 172 | } 173 | } 174 | 175 | public struct HashKernel128ReturnValue 176 | { 177 | public ulong UpperBaseHash { get; private set; } 178 | public ulong LowerBaseHash { get; private set; } 179 | public static HashKernel128ReturnValue Create(ulong lowerBaseHash, ulong upperBaseHash) 180 | { 181 | return new HashKernel128ReturnValue 182 | { 183 | UpperBaseHash = upperBaseHash, 184 | LowerBaseHash = lowerBaseHash, 185 | }; 186 | } 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 8 | [assembly: AssemblyTitle("TestProbabilisticDataStructures")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("")] 12 | [assembly: AssemblyProduct("TestProbabilisticDataStructures")] 13 | [assembly: AssemblyCopyright("Copyright © 2015")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 
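The lower/upper pair returned by HashKernel above is what the filters in this repository expand into k probe positions. A sketch of that derivation (MD5 is used here only for illustration; the library actually obtains its algorithm from Defaults.GetDefaultHashAlgorithm()):

```csharp
using System.Security.Cryptography;
using ProbabilisticDataStructures;

class KernelDemo
{
    // index_i = (lower + upper * i) mod m: the standard two-hash
    // (Kirsch-Mitzenmacher) construction used throughout this library,
    // so only one digest is computed per filter operation.
    static uint[] Indexes(byte[] data, uint k, uint m)
    {
        using (var md5 = MD5.Create())
        {
            var kernel = Utils.HashKernel(data, md5);
            var idx = new uint[k];
            for (uint i = 0; i < k; i++)
            {
                idx[i] = (kernel.LowerBaseHash + kernel.UpperBaseHash * i) % m;
            }
            return idx;
        }
    }
}
```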
20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("df071d43-8650-491c-a572-4329e4cf8e5f")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 14 | 15 | /// 16 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter. 17 | /// 18 | [TestMethod] 19 | public void TestBloomCapacity() 20 | { 21 | var f = new BloomFilter(100, 0.1); 22 | var capacity = f.Capacity(); 23 | 24 | Assert.AreEqual(480u, capacity); 25 | } 26 | 27 | /// 28 | /// Ensures that K() returns the number of hash functions in the Bloom Filter. 29 | /// 30 | [TestMethod] 31 | public void TestBloomK() 32 | { 33 | var f = new BloomFilter(100, 0.1); 34 | var k = f.K(); 35 | 36 | Assert.AreEqual(4u, k); 37 | } 38 | 39 | /// 40 | /// Ensures that Count returns the number of items added to the filter. 41 | /// 42 | [TestMethod] 43 | public void TestBloomCount() 44 | { 45 | var f = new BloomFilter(100, 0.1); 46 | for (uint i = 0; i < 10; i++) 47 | { 48 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 49 | } 50 | 51 | var count = f.Count(); 52 | Assert.AreEqual(10u, count); 53 | } 54 | 55 | /// 56 | /// Ensures that EstimatedFillRatio returns the correct approximation. 57 | /// 58 | [TestMethod] 59 | public void TestBloomEstimatedFillRatio() 60 | { 61 | var f = new BloomFilter(100, 0.5); 62 | for (uint i = 0; i < 100; i++) 63 | { 64 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 65 | } 66 | 67 | var ratio = f.EstimatedFillRatio(); 68 | if (ratio > 0.5) 69 | { 70 | Assert.Fail("Expected less than or equal to 0.5, got {0}", ratio); 71 | } 72 | } 73 | 74 | /// 75 | /// Ensures that FillRatio returns the ratio of set bits. 76 | /// 77 | [TestMethod] 78 | public void TestBloomFillRatio() 79 | { 80 | var f = new BloomFilter(100, 0.1); 81 | f.Add(A_BYTES); 82 | f.Add(B_BYTES); 83 | f.Add(C_BYTES); 84 | 85 | var ratio = f.FillRatio(); 86 | Assert.AreEqual(0.025, ratio); 87 | } 88 | 89 | /// 90 | /// Ensures that Test, Add, and TestAndAdd behave correctly. 91 | /// 92 | [TestMethod] 93 | public void TestBloomTestAndAdd() 94 | { 95 | var f = new BloomFilter(100, 0.01); 96 | 97 | // 'a' is not in the filter. 
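The 0.025 expected by TestBloomFillRatio above follows from the earlier sizing tests: with n = 100 and fpRate = 0.1 the filter has m = 480 bits and k = 4 hash functions, so adding three distinct items sets 3 * 4 = 12 bits (assuming, as the test does, that none of the twelve probe positions collide), and 12 / 480 = 0.025.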
98 | if (f.Test(A_BYTES)) 99 | { 100 | Assert.Fail("'a' should not be a member"); 101 | } 102 | 103 | var addedF = f.Add(A_BYTES); 104 | Assert.AreSame(f, addedF, "Returned BloomFilter should be the same instance"); 105 | 106 | // 'a' is now in the filter. 107 | if (!f.Test(A_BYTES)) 108 | { 109 | Assert.Fail("'a' should be a member"); 110 | } 111 | 112 | // 'a' is still in the filter. 113 | if (!f.TestAndAdd(A_BYTES)) 114 | { 115 | Assert.Fail("'a' should be a member"); 116 | } 117 | 118 | // 'b' is not in the filter. 119 | if (f.TestAndAdd(B_BYTES)) 120 | { 121 | Assert.Fail("'b' should not be a member"); 122 | } 123 | 124 | // 'a' is still in the filter. 125 | if (!f.Test(A_BYTES)) 126 | { 127 | Assert.Fail("'a' should be a member"); 128 | } 129 | 130 | // 'b' is now in the filter. 131 | if (!f.Test(B_BYTES)) 132 | { 133 | Assert.Fail("'b' should be a member"); 134 | } 135 | 136 | // 'c' is not in the filter. 137 | if (f.Test(C_BYTES)) 138 | { 139 | Assert.Fail("'c' should not be a member"); 140 | } 141 | 142 | for (int i = 0; i < 1000000; i++) 143 | { 144 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString())); 145 | } 146 | 147 | // 'x' should be a false positive. 148 | if (!f.Test(X_BYTES)) 149 | { 150 | Assert.Fail("'x' should be a member"); 151 | } 152 | } 153 | 154 | /// 155 | /// Ensures that Reset sets every bit to zero. 156 | /// 157 | [TestMethod] 158 | public void TestBloomReset() 159 | { 160 | var f = new BloomFilter(100, 0.1); 161 | for (int i = 0; i < 1000; i++) 162 | { 163 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 164 | } 165 | 166 | var resetF = f.Reset(); 167 | Assert.AreSame(f, resetF, "Returned BloomFilter should be the same instance"); 168 | 169 | for (uint i = 0; i < f.Buckets.count; i++) 170 | { 171 | if (f.Buckets.Get(i) != 0) 172 | { 173 | Assert.Fail("Expected all bits to be unset"); 174 | } 175 | } 176 | } 177 | } 178 | 179 | [TestClass] 180 | public class BenchmarkBloomFilter 181 | { 182 | private BloomFilter f; 183 | private int n; 184 | private byte[][] data; 185 | 186 | [TestInitialize()] 187 | public void Testinitialize() 188 | { 189 | n = 100000; 190 | f = new BloomFilter(100000, 0.1); 191 | data = new byte[n][]; 192 | for (int i = 0; i < n; i++) 193 | { 194 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 195 | } 196 | } 197 | 198 | [TestCleanup()] 199 | public void TestCleanup() 200 | { 201 | f = null; 202 | n = 0; 203 | data = null; 204 | } 205 | 206 | [TestMethod] 207 | public void BenchmarkBloomAdd() 208 | { 209 | for (int i = 0; i < n; i++) 210 | { 211 | f.Add(data[i]); 212 | } 213 | } 214 | 215 | [TestMethod] 216 | public void BenchmarkBloomTest() 217 | { 218 | for (int i = 0; i < n; i++) 219 | { 220 | f.Test(data[i]); 221 | } 222 | } 223 | 224 | [TestMethod] 225 | public void BenchmarkBloomTestAndAdd() 226 | { 227 | for (int i = 0; i < n; i++) 228 | { 229 | f.TestAndAdd(data[i]); 230 | } 231 | } 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestBloomFilter64.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Microsoft.VisualStudio.TestTools.UnitTesting; 3 | using ProbabilisticDataStructures; 4 | using System.Text; 5 | using System.Collections.Generic; 6 | 7 | namespace TestProbabilisticDataStructures 8 | { 9 | [TestClass] 10 | public class TestBloomFilter64 11 | { 12 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 13 | private static byte[] B_BYTES = 
Encoding.ASCII.GetBytes("b"); 14 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 15 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 16 | 17 | /// 18 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter. 19 | /// 20 | [TestMethod] 21 | public void TestBloomCapacity() 22 | { 23 | var f = new BloomFilter64(100, 0.1); 24 | var capacity = f.Capacity(); 25 | 26 | Assert.AreEqual(480u, capacity); 27 | } 28 | 29 | /// 30 | /// Ensures that K() returns the number of hash functions in the Bloom Filter. 31 | /// 32 | [TestMethod] 33 | public void TestBloom64K() 34 | { 35 | var f = new BloomFilter64(100, 0.1); 36 | var k = f.K(); 37 | 38 | Assert.AreEqual(4u, k); 39 | } 40 | 41 | /// 42 | /// Ensures that Count returns the number of items added to the filter. 43 | /// 44 | [TestMethod] 45 | public void TestBloom64Count() 46 | { 47 | var f = new BloomFilter64(100, 0.1); 48 | for (uint i = 0; i < 10; i++) 49 | { 50 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 51 | } 52 | 53 | var count = f.Count(); 54 | Assert.AreEqual(10u, count); 55 | } 56 | 57 | /// 58 | /// Ensures that EstimatedFillRatio returns the correct approximation. 59 | /// 60 | [TestMethod] 61 | public void TestBloom64EstimatedFillRatio() 62 | { 63 | var f = new BloomFilter64(100, 0.5); 64 | for (uint i = 0; i < 100; i++) 65 | { 66 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 67 | } 68 | 69 | var ratio = f.EstimatedFillRatio(); 70 | if (ratio > 0.5) 71 | { 72 | Assert.Fail("Expected less than or equal to 0.5, got {0}", ratio); 73 | } 74 | } 75 | 76 | /// 77 | /// Ensures that FillRatio returns the ratio of set bits. 78 | /// 79 | [TestMethod] 80 | public void TestBloom64FillRatio() 81 | { 82 | var f = new BloomFilter64(100, 0.1); 83 | f.Add(A_BYTES); 84 | f.Add(B_BYTES); 85 | f.Add(C_BYTES); 86 | 87 | var ratio = f.FillRatio(); 88 | Assert.AreEqual(0.025, ratio); 89 | } 90 | 91 | /// 92 | /// Ensures that Test, Add, and TestAndAdd behave correctly. 93 | /// 94 | [TestMethod] 95 | public void TestBloom64TestAndAdd() 96 | { 97 | var f = new BloomFilter64(100, 0.01); 98 | 99 | // 'a' is not in the filter. 100 | if (f.Test(A_BYTES)) 101 | { 102 | Assert.Fail("'a' should not be a member"); 103 | } 104 | 105 | var addedF = f.Add(A_BYTES); 106 | Assert.AreSame(f, addedF, "Returned BloomFilter64 should be the same instance"); 107 | 108 | // 'a' is now in the filter. 109 | if (!f.Test(A_BYTES)) 110 | { 111 | Assert.Fail("'a' should be a member"); 112 | } 113 | 114 | // 'a' is still in the filter. 115 | if (!f.TestAndAdd(A_BYTES)) 116 | { 117 | Assert.Fail("'a' should be a member"); 118 | } 119 | 120 | // 'b' is not in the filter. 121 | if (f.TestAndAdd(B_BYTES)) 122 | { 123 | Assert.Fail("'b' should not be a member"); 124 | } 125 | 126 | // 'a' is still in the filter. 127 | if (!f.Test(A_BYTES)) 128 | { 129 | Assert.Fail("'a' should be a member"); 130 | } 131 | 132 | // 'b' is now in the filter. 133 | if (!f.Test(B_BYTES)) 134 | { 135 | Assert.Fail("'b' should be a member"); 136 | } 137 | 138 | // 'c' is not in the filter. 139 | if (f.Test(C_BYTES)) 140 | { 141 | Assert.Fail("'c' should not be a member"); 142 | } 143 | 144 | for (int i = 0; i < 1000000; i++) 145 | { 146 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString())); 147 | } 148 | 149 | // 'x' should be a false positive. 150 | if (!f.Test(X_BYTES)) 151 | { 152 | Assert.Fail("'x' should be a member"); 153 | } 154 | } 155 | 156 | /// 157 | /// Ensures that Reset sets every bit to zero. 
158 | /// 159 | [TestMethod] 160 | public void TestBloom64Reset() 161 | { 162 | var f = new BloomFilter64(100, 0.1); 163 | for (int i = 0; i < 1000; i++) 164 | { 165 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 166 | } 167 | 168 | var resetF = f.Reset(); 169 | Assert.AreSame(f, resetF, "Returned BloomFilter64 should be the same instance"); 170 | 171 | for (uint i = 0; i < f.Buckets.count; i++) 172 | { 173 | if (f.Buckets.Get(i) != 0) 174 | { 175 | Assert.Fail("Expected all bits to be unset"); 176 | } 177 | } 178 | } 179 | } 180 | 181 | [TestClass] 182 | public class BenchmarkBloomFilter64 183 | { 184 | private BloomFilter64 f; 185 | private int n; 186 | private byte[][] data; 187 | 188 | [TestInitialize()] 189 | public void Testinitialize() 190 | { 191 | n = 100000; 192 | f = new BloomFilter64(100000, 0.1); 193 | data = new byte[n][]; 194 | for (int i = 0; i < n; i++) 195 | { 196 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 197 | } 198 | } 199 | 200 | [TestCleanup()] 201 | public void TestCleanup() 202 | { 203 | f = null; 204 | n = 0; 205 | data = null; 206 | } 207 | 208 | [TestMethod] 209 | public void BenchmarkBloom64Add() 210 | { 211 | for (int i = 0; i < n; i++) 212 | { 213 | f.Add(data[i]); 214 | } 215 | } 216 | 217 | [TestMethod] 218 | public void BenchmarkBloom64Test() 219 | { 220 | for (int i = 0; i < n; i++) 221 | { 222 | f.Test(data[i]); 223 | } 224 | } 225 | 226 | [TestMethod] 227 | public void BenchmarkBloom64TestAndAdd() 228 | { 229 | for (int i = 0; i < n; i++) 230 | { 231 | f.TestAndAdd(data[i]); 232 | } 233 | } 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestBuckets.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | 4 | namespace TestProbabilisticDataStructures 5 | { 6 | [TestClass] 7 | public class TestBuckets 8 | { 9 | /// 10 | /// Ensures that Max returns the correct maximum based on the bucket 11 | /// size. 12 | /// 13 | [TestMethod] 14 | public void TestMaxBucketValue() 15 | { 16 | var b = new Buckets(10, 2); 17 | 18 | var max = b.MaxBucketValue(); 19 | Assert.AreEqual(3, max); 20 | } 21 | 22 | /// 23 | /// Ensures that Count returns the number of buckets. 24 | /// 25 | [TestMethod] 26 | public void TestBucketsCount() 27 | { 28 | var b = new Buckets(10, 2); 29 | 30 | var count = b.count; 31 | Assert.AreEqual(10u, count); 32 | } 33 | 34 | /// 35 | /// Ensures that Increment increments the bucket value by the correct delta and 36 | /// clamps to zero and the maximum, Get returns the correct bucket value, and Set 37 | /// sets the bucket value correctly. 38 | /// 39 | [TestMethod] 40 | public void TestBucketsIncrementAndGetAndSet() 41 | { 42 | var b = new Buckets(5, 2); 43 | 44 | var incrementedB = b.Increment(0, 1); 45 | Assert.AreSame(b, incrementedB, "Returned Buckets should be the same instance"); 46 | 47 | var v = b.Get(0); 48 | Assert.AreEqual(1u, v); 49 | 50 | b.Increment(1u, -1); 51 | 52 | v = b.Get(1); 53 | Assert.AreEqual(0u, v); 54 | 55 | var setB = b.Set(2u, 100); 56 | Assert.AreSame(b, setB, "Returned Buckets should be the same instance"); 57 | 58 | v = b.Get(2); 59 | Assert.AreEqual(3u, v); 60 | 61 | b.Increment(3, 2); 62 | 63 | v = b.Get(3); 64 | Assert.AreEqual(2u, v); 65 | } 66 | 67 | /// 68 | /// Ensures that Reset restores the Buckets to the original state. 
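The magic numbers in TestBucketsIncrementAndGetAndSet above all follow from the 2-bit bucket size: a b-bit bucket saturates at 2^b - 1, so MaxBucketValue() is 3, Set(2, 100) clamps to 3, and Increment(1, -1) clamps at 0 rather than wrapping.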
69 | /// 70 | [TestMethod] 71 | public void TestBucketsReset() 72 | { 73 | var b = new Buckets(5, 2); 74 | 75 | for (uint i = 0; i < 5; i++) 76 | { 77 | b.Increment(i, 1); 78 | } 79 | 80 | var resetB = b.Reset(); 81 | Assert.AreSame(b, resetB, "Returned Buckets should be the same instance"); 82 | 83 | for (uint i = 0; i < 5; i++) 84 | { 85 | var c = b.Get(i); 86 | Assert.AreEqual(0u, c); 87 | } 88 | } 89 | 90 | [TestMethod] 91 | public void BenchmarkBucketsIncrement() 92 | { 93 | var buckets = new Buckets(10000, 10); 94 | for (uint i = 0; i < buckets.count; i++) 95 | { 96 | buckets.Increment(i % 10000, 1); 97 | } 98 | } 99 | 100 | [TestMethod] 101 | public void BenchmarkBucketsSet() 102 | { 103 | var buckets = new Buckets(10000, 10); 104 | for (uint i = 0; i < buckets.count; i++) 105 | { 106 | buckets.Set(i % 10000, 1); 107 | } 108 | } 109 | 110 | [TestMethod] 111 | public void BenchmarkBucketsGet() 112 | { 113 | var buckets = new Buckets(10000, 10); 114 | for (uint i = 0; i < buckets.count; i++) 115 | { 116 | buckets.Get(i % 10000); 117 | } 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestBuckets64.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | 4 | namespace TestProbabilisticDataStructures 5 | { 6 | [TestClass] 7 | public class TestBuckets64 8 | { 9 | /// 10 | /// Ensures that Max returns the correct maximum based on the bucket 11 | /// size. 12 | /// 13 | [TestMethod] 14 | public void TestMaxBucketValue() 15 | { 16 | var b = new Buckets64(10, 2); 17 | 18 | var max = b.MaxBucketValue(); 19 | Assert.AreEqual(3, max); 20 | } 21 | 22 | /// 23 | /// Ensures that Count returns the number of buckets. 24 | /// 25 | [TestMethod] 26 | public void TestBuckets64Count() 27 | { 28 | var b = new Buckets64(10, 2); 29 | 30 | var count = b.count; 31 | Assert.AreEqual(10u, count); 32 | } 33 | 34 | /// 35 | /// Ensures that Increment increments the bucket value by the correct delta and 36 | /// clamps to zero and the maximum, Get returns the correct bucket value, and Set 37 | /// sets the bucket value correctly. 38 | /// 39 | [TestMethod] 40 | public void TestBuckets64IncrementAndGetAndSet() 41 | { 42 | var b = new Buckets64(5, 2); 43 | 44 | var incrementedB = b.Increment(0, 1); 45 | Assert.AreSame(b, incrementedB, "Returned Buckets64 should be the same instance"); 46 | 47 | var v = b.Get(0); 48 | Assert.AreEqual(1u, v); 49 | 50 | b.Increment(1u, -1); 51 | 52 | v = b.Get(1); 53 | Assert.AreEqual(0u, v); 54 | 55 | var setB = b.Set(2u, 100); 56 | Assert.AreSame(b, setB, "Returned Buckets64 should be the same instance"); 57 | 58 | v = b.Get(2); 59 | Assert.AreEqual(3u, v); 60 | 61 | b.Increment(3, 2); 62 | 63 | v = b.Get(3); 64 | Assert.AreEqual(2u, v); 65 | } 66 | 67 | /// 68 | /// Ensures that Reset restores the Buckets64 to the original state. 
69 | /// 70 | [TestMethod] 71 | public void TestBuckets64Reset() 72 | { 73 | var b = new Buckets64(5, 2); 74 | 75 | for (uint i = 0; i < 5; i++) 76 | { 77 | b.Increment(i, 1); 78 | } 79 | 80 | var resetB = b.Reset(); 81 | Assert.AreSame(b, resetB, "Returned Buckets64 should be the same instance"); 82 | 83 | for (uint i = 0; i < 5; i++) 84 | { 85 | var c = b.Get(i); 86 | Assert.AreEqual(0u, c); 87 | } 88 | } 89 | 90 | [TestMethod] 91 | public void BenchmarkBuckets64Increment() 92 | { 93 | var buckets = new Buckets64(10000, 10); 94 | for (uint i = 0; i < buckets.count; i++) 95 | { 96 | buckets.Increment(i % 10000, 1); 97 | } 98 | } 99 | 100 | [TestMethod] 101 | public void BenchmarkBuckets64Set() 102 | { 103 | var buckets = new Buckets64(10000, 10); 104 | for (uint i = 0; i < buckets.count; i++) 105 | { 106 | buckets.Set(i % 10000, 1); 107 | } 108 | } 109 | 110 | [TestMethod] 111 | public void BenchmarkBuckets64Get() 112 | { 113 | var buckets = new Buckets64(10000, 10); 114 | for (uint i = 0; i < buckets.count; i++) 115 | { 116 | buckets.Get(i % 10000); 117 | } 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestCountMinSketch.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | using Microsoft.VisualStudio.TestTools.UnitTesting; 3 | using ProbabilisticDataStructures; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestCountMinSketch 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] D_BYTES = Encoding.ASCII.GetBytes("d"); 14 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 15 | 16 | /// 17 | /// Ensures that TotalCount returns the number of items added to the sketch. 18 | /// 19 | [TestMethod] 20 | public void TestCMSTotalCount() 21 | { 22 | var cms = new CountMinSketch(0.001, 0.99); 23 | 24 | for (int i = 0; i < 100; i++) 25 | { 26 | cms.Add(Encoding.ASCII.GetBytes(i.ToString())); 27 | } 28 | 29 | var count = cms.TotalCount(); 30 | Assert.AreEqual(100u, count); 31 | } 32 | 33 | /// 34 | /// Ensures that Add adds to the set and Count returns the correct approximation. 35 | /// 36 | [TestMethod] 37 | public void TestCMSAddAndCount() 38 | { 39 | var cms = new CountMinSketch(0.001, 0.99); 40 | 41 | var addedCms = cms.Add(A_BYTES); 42 | Assert.AreSame(cms, addedCms); 43 | 44 | cms.Add(B_BYTES); 45 | cms.Add(C_BYTES); 46 | cms.Add(B_BYTES); 47 | cms.Add(D_BYTES); 48 | cms.Add(A_BYTES).Add(A_BYTES); 49 | 50 | var count = cms.Count(A_BYTES); 51 | Assert.AreEqual(3u, count); 52 | 53 | count = cms.Count(B_BYTES); 54 | Assert.AreEqual(2u, count); 55 | 56 | count = cms.Count(C_BYTES); 57 | Assert.AreEqual(1u, count); 58 | 59 | count = cms.Count(D_BYTES); 60 | Assert.AreEqual(1u, count); 61 | 62 | count = cms.Count(X_BYTES); 63 | Assert.AreEqual(0u, count); 64 | } 65 | 66 | /// 67 | /// Ensures that Merge combines the two sketches. 
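/// A minimal sketch of the merge semantics verified below (hypothetical driver code; both sketches are assumed to share the same epsilon and delta): /// <code> /// var key = Encoding.ASCII.GetBytes("b"); /// var left = new CountMinSketch(0.001, 0.99); /// var right = new CountMinSketch(0.001, 0.99); /// left.Add(key); /// right.Add(key); /// left.Merge(right); // left.Count(key) now reports about 2 /// </code>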
68 | /// 69 | [TestMethod] 70 | public void TestCMSMerge() 71 | { 72 | var cms = new CountMinSketch(0.001, 0.99); 73 | cms.Add(B_BYTES); 74 | cms.Add(C_BYTES); 75 | cms.Add(B_BYTES); 76 | cms.Add(D_BYTES); 77 | cms.Add(A_BYTES).Add(A_BYTES); 78 | 79 | var other = new CountMinSketch(0.001, 0.99); 80 | other.Add(B_BYTES); 81 | other.Add(C_BYTES); 82 | other.Add(B_BYTES); 83 | 84 | var wasMerged = cms.Merge(other); 85 | Assert.IsTrue(wasMerged); 86 | 87 | var count = cms.Count(A_BYTES); 88 | Assert.AreEqual(2u, count); 89 | 90 | count = cms.Count(B_BYTES); 91 | Assert.AreEqual(4u, count); 92 | 93 | count = cms.Count(C_BYTES); 94 | Assert.AreEqual(2u, count); 95 | 96 | count = cms.Count(D_BYTES); 97 | Assert.AreEqual(1u, count); 98 | 99 | count = cms.Count(X_BYTES); 100 | Assert.AreEqual(0u, count); 101 | } 102 | 103 | /// 104 | /// Ensures that Reset restores the sketch to its original state. 105 | /// 106 | [TestMethod] 107 | public void TestCMSReset() 108 | { 109 | var cms = new CountMinSketch(0.001, 0.99); 110 | cms.Add(B_BYTES); 111 | cms.Add(C_BYTES); 112 | cms.Add(B_BYTES); 113 | cms.Add(D_BYTES); 114 | cms.Add(A_BYTES).Add(A_BYTES); 115 | 116 | var resetCms = cms.Reset(); 117 | Assert.AreSame(cms, resetCms); 118 | 119 | for (uint i = 0; i < cms.Depth; i++) 120 | { 121 | for (int j = 0; j < cms.Width; j++) 122 | { 123 | if (cms.Matrix[i][j] != 0) 124 | { 125 | Assert.Fail("Expected matrix to be completely empty."); 126 | } 127 | } 128 | } 129 | } 130 | 131 | [TestMethod] 132 | public void BenchmarkCMSAdd() 133 | { 134 | var n = 100000; 135 | var cms = new CountMinSketch(0.001, 0.99); 136 | var data = new byte[n][]; 137 | for (int i = 0; i < n; i++) 138 | { 139 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 140 | } 141 | 142 | for (int i = 0; i < n; i++) 143 | { 144 | cms.Add(data[i]); 145 | } 146 | } 147 | 148 | [TestMethod] 149 | public void BenchmarkCMSCount() 150 | { 151 | var n = 100000; 152 | var cms = new CountMinSketch(0.001, 0.99); 153 | var data = new byte[n][]; 154 | for (int i = 0; i < n; i++) 155 | { 156 | var byteArray = Encoding.ASCII.GetBytes(i.ToString()); 157 | data[i] = byteArray; 158 | cms.Add(byteArray); 159 | } 160 | 161 | for (int i = 0; i < n; i++) 162 | { 163 | cms.Count(data[i]); 164 | } 165 | } 166 | 167 | // TODO: Implement these later. 168 | // TestCMSSerialization 169 | // BenchmarkCMSWriteDataTo 170 | // BenchmarkCMSReadDataFrom 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestCountingBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestCountingBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 14 | 15 | /// 16 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter.
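/// The 480 asserted below is a worked instance of the usual optimal-size formula (my arithmetic, not code from this library): m = ceil(-n * ln(p) / (ln 2)^2) = ceil(-100 * ln(0.1) / 0.4805) = ceil(479.3) = 480.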
17 | /// 18 | [TestMethod] 19 | public void TestCountingCapacity() 20 | { 21 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1); 22 | var capacity = f.Capacity(); 23 | 24 | Assert.AreEqual(480u, capacity); 25 | } 26 | 27 | /// 28 | /// Ensures that K() returns the number of hash functions in the Bloom Filter. 29 | /// 30 | [TestMethod] 31 | public void TestCountingK() 32 | { 33 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1); 34 | var k = f.K(); 35 | 36 | Assert.AreEqual(4u, k); 37 | } 38 | 39 | /// 40 | /// Ensures that Count returns the number of items added to the filter. 41 | /// 42 | [TestMethod] 43 | public void TestCountingCount() 44 | { 45 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1); 46 | for (uint i = 0; i < 10; i++) 47 | { 48 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 49 | } 50 | 51 | for (int i = 0; i < 5; i++) 52 | { 53 | f.TestAndRemove(Encoding.ASCII.GetBytes(i.ToString())); 54 | } 55 | 56 | var count = f.Count(); 57 | Assert.AreEqual(5u, count); 58 | } 59 | 60 | /// 61 | /// Ensures that Test, Add, and TestAndAdd behave correctly. 62 | /// 63 | [TestMethod] 64 | public void TestCountingTestAndAdd() 65 | { 66 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.01); 67 | 68 | // 'a' is not in the filter. 69 | if (f.Test(A_BYTES)) 70 | { 71 | Assert.Fail("'a' should not be a member"); 72 | } 73 | 74 | var addedF = f.Add(A_BYTES); 75 | Assert.AreSame(f, addedF, "Returned BloomFilter should be the same instance"); 76 | 77 | // 'a' is now in the filter. 78 | if (!f.Test(A_BYTES)) 79 | { 80 | Assert.Fail("'a' should be a member"); 81 | } 82 | 83 | // 'a' is still in the filter. 84 | if (!f.TestAndAdd(A_BYTES)) 85 | { 86 | Assert.Fail("'a' should be a member"); 87 | } 88 | 89 | // 'b' is not in the filter. 90 | if (f.TestAndAdd(B_BYTES)) 91 | { 92 | Assert.Fail("'b' should not be a member"); 93 | } 94 | 95 | // 'a' is still in the filter. 96 | if (!f.Test(A_BYTES)) 97 | { 98 | Assert.Fail("'a' should be a member"); 99 | } 100 | 101 | // 'b' is now in the filter. 102 | if (!f.Test(B_BYTES)) 103 | { 104 | Assert.Fail("'b' should be a member"); 105 | } 106 | 107 | // 'c' is not in the filter. 108 | if (f.Test(C_BYTES)) 109 | { 110 | Assert.Fail("'c' should not be a member"); 111 | } 112 | 113 | for (int i = 0; i < 1000000; i++) 114 | { 115 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString())); 116 | } 117 | 118 | // 'x' should be a false positive. 119 | if (!f.Test(X_BYTES)) 120 | { 121 | Assert.Fail("'x' should be a member"); 122 | } 123 | } 124 | 125 | /// 126 | /// Ensures that TestAndRemove behaves correctly. 127 | /// 128 | [TestMethod] 129 | public void TestCountingTestAndRemove() 130 | { 131 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.01); 132 | 133 | // 'a' is not in the filter. 134 | if (f.TestAndRemove(A_BYTES)) 135 | { 136 | Assert.Fail("'a' should not be a member"); 137 | } 138 | 139 | f.Add(Encoding.ASCII.GetBytes("a")); 140 | 141 | // 'a' is now in the filter. 142 | if (!f.TestAndRemove(A_BYTES)) 143 | { 144 | Assert.Fail("'a' should be a member"); 145 | } 146 | 147 | // 'a' is no longer in the filter. 148 | if (f.TestAndRemove(A_BYTES)) 149 | { 150 | Assert.Fail("'a' should not be a member"); 151 | } 152 | } 153 | 154 | /// 155 | /// Ensures that Reset sets every bit to zero and the count is zero. 
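/// A minimal sketch (hypothetical driver code, limited to members these tests already exercise): /// <code> /// var cbf = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1); /// cbf.Add(Encoding.ASCII.GetBytes("a")); /// cbf.Reset(); // now cbf.Count() == 0 and every bucket reads 0 /// </code>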
156 | /// 157 | [TestMethod] 158 | public void TestCountingReset() 159 | { 160 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1); 161 | for (int i = 0; i < 1000; i++) 162 | { 163 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 164 | } 165 | 166 | var resetF = f.Reset(); 167 | Assert.AreSame(f, resetF, "Returned CountingBloomFilter should be the same instance"); 168 | 169 | for (uint i = 0; i < f.Buckets.count; i++) 170 | { 171 | if (f.Buckets.Get(i) != 0) 172 | { 173 | Assert.Fail("Expected all bits to be unset"); 174 | } 175 | } 176 | 177 | Assert.AreEqual(0u, f.Count()); 178 | } 179 | 180 | [TestMethod] 181 | public void BenchmarkCountingAdd() 182 | { 183 | var n = 100000; 184 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100000, 0.1); 185 | var data = new byte[n][]; 186 | for (int i = 0; i < n; i++) 187 | { 188 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 189 | } 190 | 191 | for (int i = 0; i < n; i++) 192 | { 193 | f.Add(data[i]); 194 | } 195 | } 196 | 197 | [TestMethod] 198 | public void BenchmarkCountingTest() 199 | { 200 | var n = 100000; 201 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100000, 0.1); 202 | var data = new byte[n][]; 203 | for (int i = 0; i < n; i++) 204 | { 205 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 206 | } 207 | 208 | for (int i = 0; i < n; i++) 209 | { 210 | f.Test(data[i]); 211 | } 212 | } 213 | 214 | [TestMethod] 215 | public void BenchmarkCountingTestAndAdd() 216 | { 217 | var n = 100000; 218 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100000, 0.1); 219 | var data = new byte[n][]; 220 | for (int i = 0; i < n; i++) 221 | { 222 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 223 | } 224 | 225 | for (int i = 0; i < n; i++) 226 | { 227 | f.TestAndAdd(data[i]); 228 | } 229 | } 230 | 231 | [TestMethod] 232 | public void BenchmarkCountingTestAndRemove() 233 | { 234 | var n = 100000; 235 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100000, 0.1); 236 | var data = new byte[n][]; 237 | for (int i = 0; i < n; i++) 238 | { 239 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 240 | } 241 | 242 | for (int i = 0; i < n; i++) 243 | { 244 | f.TestAndRemove(data[i]); 245 | } 246 | } 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestCuckooBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestCuckooBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 14 | 15 | /// 16 | /// Ensures that Buckets returns the number of buckets, m, in the Cuckoo Filter. 17 | /// 18 | [TestMethod] 19 | public void TestCuckooBuckets() 20 | { 21 | var f = new CuckooBloomFilter(100, 0.1); 22 | var buckets = f.BucketCount(); 23 | 24 | Assert.AreEqual(1024u, buckets); 25 | } 26 | 27 | /// 28 | /// Ensures that Capacity returns the expected filter capacity. 
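/// Note the distinction these two tests probe: Capacity() echoes the requested item capacity (100), while BucketCount() reports the larger table backing it (1024 buckets here).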
29 | /// 30 | [TestMethod] 31 | public void TestCuckooCapacity() 32 | { 33 | var f = new CuckooBloomFilter(100, 0.1); 34 | var capacity = f.Capacity(); 35 | 36 | Assert.AreEqual(100u, capacity); 37 | } 38 | 39 | /// 40 | /// Ensures that Count returns the number of items added to the filter. 41 | /// 42 | [TestMethod] 43 | public void TestCuckooCount() 44 | { 45 | var f = new CuckooBloomFilter(100, 0.1); 46 | for (int i = 0; i < 10; i++) 47 | { 48 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 49 | } 50 | 51 | for (int i = 0; i < 5; i++) 52 | { 53 | f.TestAndRemove(Encoding.ASCII.GetBytes(i.ToString())); 54 | } 55 | 56 | var count = f.Count(); 57 | Assert.AreEqual(5u, count); 58 | } 59 | 60 | /// 61 | /// Ensures that Test, Add, and TestAndAdd behave correctly. 62 | /// 63 | [TestMethod] 64 | public void TestCuckooTestAndAdd() 65 | { 66 | var f = new CuckooBloomFilter(100, 0.1); 67 | 68 | // 'a' is not in the filter. 69 | if (f.Test(A_BYTES)) 70 | { 71 | Assert.Fail("'a' should not be a member"); 72 | } 73 | 74 | if (!f.Add(A_BYTES)) 75 | { 76 | Assert.Fail("Should return true"); 77 | } 78 | 79 | // 'a' is now in the filter. 80 | if (!f.Test(A_BYTES)) 81 | { 82 | Assert.Fail("'a' should be a member"); 83 | } 84 | 85 | // 'a' is still in the filter. 86 | var testAndAdd = f.TestAndAdd(A_BYTES); 87 | if (!testAndAdd.WasAlreadyAMember) 88 | { 89 | Assert.Fail("'a' should be a member"); 90 | } 91 | // Should not have added 92 | Assert.IsFalse(testAndAdd.Added); 93 | 94 | // 'b' is not in the filter. 95 | testAndAdd = f.TestAndAdd(B_BYTES); 96 | if (testAndAdd.WasAlreadyAMember) 97 | { 98 | Assert.Fail("'b' should not be a member"); 99 | } 100 | // Should add 101 | Assert.IsTrue(testAndAdd.Added); 102 | 103 | // 'a' is still in the filter. 104 | if (!f.Test(A_BYTES)) 105 | { 106 | Assert.Fail("'a' should be a member"); 107 | } 108 | 109 | // 'b' is now in the filter. 110 | if (!f.Test(B_BYTES)) 111 | { 112 | Assert.Fail("'b' should be a member"); 113 | } 114 | 115 | // 'c' is not in the filter. 116 | if (f.Test(C_BYTES)) 117 | { 118 | Assert.Fail("'c' should not be a member"); 119 | } 120 | 121 | for (int i = 0; i < 10000; i++) 122 | { 123 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 124 | } 125 | 126 | // Filter should be full. 127 | testAndAdd = f.TestAndAdd(X_BYTES); 128 | // Make sure not there 129 | Assert.IsFalse(testAndAdd.WasAlreadyAMember); 130 | // Make sure didn't add 131 | Assert.IsFalse(testAndAdd.Added); 132 | } 133 | 134 | /// 135 | /// Ensures that TestAndRemove behaves correctly. 136 | /// 137 | [TestMethod] 138 | public void TestCuckooTestAndRemove() 139 | { 140 | var f = new CuckooBloomFilter(100, 0.1); 141 | 142 | // 'a' is not in the filter. 143 | if (f.Test(A_BYTES)) 144 | { 145 | Assert.Fail("'a' should not be a member"); 146 | } 147 | 148 | f.Add(A_BYTES); 149 | 150 | // 'a' is now in the filter. 151 | if (!f.TestAndRemove(A_BYTES)) 152 | { 153 | Assert.Fail("'a' should be a member"); 154 | } 155 | 156 | // 'a' is no longer in the filter. 157 | if (f.Test(A_BYTES)) 158 | { 159 | Assert.Fail("'a' should not be a member"); 160 | } 161 | } 162 | 163 | /// 164 | /// Ensures that Reset clears all buckets and the count is zero. 
165 | /// 166 | [TestMethod] 167 | public void TestCuckooReset() 168 | { 169 | var f = new CuckooBloomFilter(100, 0.1); 170 | for (int i = 0; i < 1000; i++) 171 | { 172 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 173 | } 174 | 175 | var resetFilter = f.Reset(); 176 | Assert.AreSame(f, resetFilter); 177 | 178 | for (int i = 0; i < f.BucketCount(); i++) 179 | { 180 | for (uint j = 0; j < f.B; j++) 181 | { 182 | if (f.Buckets[i][j] != null) 183 | { 184 | Assert.Fail("Expected all buckets to be cleared"); 185 | } 186 | } 187 | } 188 | 189 | Assert.AreEqual(0u, f.Count()); 190 | } 191 | 192 | [TestMethod] 193 | public void BenchmarkCuckooAdd() 194 | { 195 | var n = 100000u; 196 | var f = new CuckooBloomFilter(n, 0.1); 197 | var data = new byte[n][]; 198 | for (int i = 0; i < n; i++) 199 | { 200 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 201 | } 202 | 203 | for (int i = 0; i < n; i++) 204 | { 205 | f.Add(data[i]); 206 | } 207 | } 208 | 209 | [TestMethod] 210 | public void BenchmarkCuckooTest() 211 | { 212 | var n = 100000u; 213 | var f = new CuckooBloomFilter(n, 0.1); 214 | var data = new byte[n][]; 215 | for (int i = 0; i < n; i++) 216 | { 217 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 218 | } 219 | 220 | for (int i = 0; i < n; i++) 221 | { 222 | f.Test(data[i]); 223 | } 224 | } 225 | 226 | [TestMethod] 227 | public void BenchmarkCuckooTestAndAdd() 228 | { 229 | var n = 100000u; 230 | var f = new CuckooBloomFilter(n, 0.1); 231 | var data = new byte[n][]; 232 | for (int i = 0; i < n; i++) 233 | { 234 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 235 | } 236 | 237 | for (int i = 0; i < n; i++) 238 | { 239 | f.TestAndAdd(data[i]); 240 | } 241 | } 242 | 243 | [TestMethod] 244 | public void BenchmarkCuckooTestAndRemove() 245 | { 246 | var n = 100000u; 247 | var f = new CuckooBloomFilter(n, 0.1); 248 | var data = new byte[n][]; 249 | for (int i = 0; i < n; i++) 250 | { 251 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 252 | } 253 | 254 | for (int i = 0; i < n; i++) 255 | { 256 | f.TestAndRemove(data[i]); 257 | } 258 | } 259 | } 260 | } 261 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestDeletableBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestDeletableBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 14 | 15 | /// 16 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter. 17 | /// 18 | [TestMethod] 19 | public void TestDeletableCapacity() 20 | { 21 | var d = new DeletableBloomFilter(100, 10, 0.1); 22 | var capacity = d.Capacity(); 23 | 24 | Assert.AreEqual(470u, capacity); 25 | } 26 | 27 | /// 28 | /// Ensures that K() returns the number of hash functions in the Bloom Filter. 29 | /// 30 | [TestMethod] 31 | public void TestDeletableK() 32 | { 33 | var d = new DeletableBloomFilter(100, 10, 0.1); 34 | var k = d.K(); 35 | 36 | Assert.AreEqual(4u, k); 37 | } 38 | 39 | /// 40 | /// Ensures that Count returns the number of items added to the filter.
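/// Concretely, Count() tracks net membership: the test below performs 10 adds followed by 5 removes, so the asserted count is 10 - 5 == 5.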
41 | /// 42 | [TestMethod] 43 | public void TestDeletableCount() 44 | { 45 | var d = new DeletableBloomFilter(100, 10, 0.1); 46 | for (uint i = 0; i < 10; i++) 47 | { 48 | d.Add(Encoding.ASCII.GetBytes(i.ToString())); 49 | } 50 | 51 | for (int i = 0; i < 5; i++) 52 | { 53 | d.TestAndRemove(Encoding.ASCII.GetBytes(i.ToString())); 54 | } 55 | 56 | var count = d.Count(); 57 | Assert.AreEqual(5u, count); 58 | } 59 | 60 | /// 61 | /// Ensures that Test, Add, and TestAndAdd behave correctly. 62 | /// 63 | [TestMethod] 64 | public void TestDeletableTestAndAdd() 65 | { 66 | var d = new DeletableBloomFilter(100, 10, 0.1); 67 | 68 | // 'a' is not in the filter. 69 | if (d.Test(A_BYTES)) 70 | { 71 | Assert.Fail("'a' should not be a member"); 72 | } 73 | 74 | var addedF = d.Add(A_BYTES); 75 | Assert.AreSame(d, addedF, "Returned DeletableBloomFilter should be the same instance"); 76 | 77 | // 'a' is now in the filter. 78 | if (!d.Test(A_BYTES)) 79 | { 80 | Assert.Fail("'a' should be a member"); 81 | } 82 | 83 | // 'a' is still in the filter. 84 | if (!d.TestAndAdd(A_BYTES)) 85 | { 86 | Assert.Fail("'a' should be a member"); 87 | } 88 | 89 | // 'b' is not in the filter. 90 | if (d.TestAndAdd(B_BYTES)) 91 | { 92 | Assert.Fail("'b' should not be a member"); 93 | } 94 | 95 | // 'a' is still in the filter. 96 | if (!d.Test(A_BYTES)) 97 | { 98 | Assert.Fail("'a' should be a member"); 99 | } 100 | 101 | // 'b' is now in the filter. 102 | if (!d.Test(B_BYTES)) 103 | { 104 | Assert.Fail("'b' should be a member"); 105 | } 106 | 107 | // 'c' is not in the filter. 108 | if (d.Test(C_BYTES)) 109 | { 110 | Assert.Fail("'c' should not be a member"); 111 | } 112 | 113 | for (int i = 0; i < 1000000; i++) 114 | { 115 | d.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString())); 116 | } 117 | 118 | // 'x' should be a false positive. 119 | if (!d.Test(X_BYTES)) 120 | { 121 | Assert.Fail("'x' should be a member"); 122 | } 123 | } 124 | 125 | /// 126 | /// Ensures that TestAndRemove behaves correctly. 127 | /// 128 | [TestMethod] 129 | public void TestDeletableTestAndRemove() 130 | { 131 | var d = new DeletableBloomFilter(100, 10, 0.1); 132 | 133 | // 'a' is not in the filter. 134 | if (d.TestAndRemove(A_BYTES)) 135 | { 136 | Assert.Fail("'a' should not be a member"); 137 | } 138 | 139 | d.Add(A_BYTES); 140 | 141 | // 'a' is now in the filter. 142 | if (!d.TestAndRemove(A_BYTES)) 143 | { 144 | Assert.Fail("'a' should be a member"); 145 | } 146 | 147 | // 'a' is no longer in the filter. 148 | if (d.TestAndRemove(A_BYTES)) 149 | { 150 | Assert.Fail("'a' should not be a member"); 151 | } 152 | } 153 | 154 | /// 155 | /// Ensures that Reset sets every bit to zero.
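/// The reset test that follows also walks d.Collisions, the companion bitmap a deletable Bloom filter keeps for bit regions where collisions have made safe removal impossible (my reading of the structure); both arrays must come back zeroed.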
156 | /// 157 | [TestMethod] 158 | public void TestDeletableReset() 159 | { 160 | var d = new DeletableBloomFilter(100, 10, 0.1); 161 | for (int i = 0; i < 1000; i++) 162 | { 163 | d.Add(Encoding.ASCII.GetBytes(i.ToString())); 164 | } 165 | 166 | var resetF = d.Reset(); 167 | Assert.AreSame(d, resetF, "Returned DeletableBloomFilter should be the same instance"); 168 | 169 | for (uint i = 0; i < d.Buckets.count; i++) 170 | { 171 | if (d.Buckets.Get(i) != 0) 172 | { 173 | Assert.Fail("Expected all bits to be unset"); 174 | } 175 | } 176 | 177 | for (uint i = 0; i < d.Collisions.count; i++) 178 | { 179 | if (d.Collisions.Get(i) != 0) 180 | { 181 | Assert.Fail("Expected all bits to be unset"); 182 | } 183 | } 184 | 185 | var count = d.Count(); 186 | Assert.AreEqual(0u, count); 187 | } 188 | 189 | [TestMethod] 190 | public void BenchmarkDeletableAdd() 191 | { 192 | var n = 100000; 193 | var d = new DeletableBloomFilter(100, 10, 0.1); 194 | var data = new byte[n][]; 195 | for (int i = 0; i < n; i++) 196 | { 197 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 198 | } 199 | 200 | for (int i = 0; i < n; i++) 201 | { 202 | d.Add(data[i]); 203 | } 204 | } 205 | 206 | [TestMethod] 207 | public void BenchmarkDeletableTest() 208 | { 209 | var n = 100000; 210 | var d = new DeletableBloomFilter(100, 10, 0.1); 211 | var data = new byte[n][]; 212 | for (int i = 0; i < n; i++) 213 | { 214 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 215 | } 216 | 217 | for (int i = 0; i < n; i++) 218 | { 219 | d.Test(data[i]); 220 | } 221 | } 222 | 223 | [TestMethod] 224 | public void BenchmarkDeletableTestAndAdd() 225 | { 226 | var n = 100000; 227 | var d = new DeletableBloomFilter(100, 10, 0.1); 228 | var data = new byte[n][]; 229 | for (int i = 0; i < n; i++) 230 | { 231 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 232 | } 233 | 234 | for (int i = 0; i < n; i++) 235 | { 236 | d.TestAndAdd(data[i]); 237 | } 238 | } 239 | 240 | [TestMethod] 241 | public void BenchmarkDeletableTestAndRemove() 242 | { 243 | var n = 100000; 244 | var d = new DeletableBloomFilter(100, 10, 0.1); 245 | var data = new byte[n][]; 246 | for (int i = 0; i < n; i++) 247 | { 248 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 249 | } 250 | 251 | for (int i = 0; i < n; i++) 252 | { 253 | d.TestAndRemove(data[i]); 254 | } 255 | } 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestHyperLogLog.cs: -------------------------------------------------------------------------------- 1 | /* 2 | Original work Copyright 2013 Eric Lesh 3 | Modified work Copyright 2015 Tyler Treat 4 | Modified work Copyright 2015 Matthew Lorimor 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 
16 | */ 17 | 18 | using System; 19 | using Microsoft.VisualStudio.TestTools.UnitTesting; 20 | using ProbabilisticDataStructures; 21 | using System.Text; 22 | using System.Threading.Tasks; 23 | 24 | namespace TestProbabilisticDataStructures 25 | { 26 | [TestClass] 27 | public class TestHyperLogLog 28 | { 29 | private double GetError(UInt64 actual, UInt64 estimate) 30 | { 31 | return ((double)estimate - (double)actual) / (double)actual; 32 | } 33 | 34 | private void testHyperLogLog(int n, int lowB, int highB) 35 | { 36 | var words = Words.Dictionary(n); 37 | var bad = 0; 38 | var nWords = (UInt64)words.LongLength; 39 | 40 | var options = new ParallelOptions(); 41 | options.MaxDegreeOfParallelism = 4; 42 | Parallel.For(lowB, highB, options, i => 43 | { 44 | var m = (uint)Math.Pow(2, i); 45 | 46 | HyperLogLog h = null; 47 | try 48 | { 49 | h = new HyperLogLog(m); 50 | } 51 | catch (Exception) 52 | { 53 | Assert.Fail(string.Format("Can't make HyperLogLog({0})", m)); 54 | } 55 | 56 | foreach (var word in words) 57 | { 58 | h.Add(Encoding.ASCII.GetBytes(word)); 59 | } 60 | 61 | var expectedError = 1.04 / Math.Sqrt(m); 62 | var actualError = Math.Abs(this.GetError(nWords, h.Count())); 63 | 64 | if (actualError > expectedError) 65 | { 66 | bad++; 67 | //Assert.Fail(string.Format("Expected: {0}, Actual: {1}", expectedError, actualError)); 68 | } 69 | }); 70 | } 71 | 72 | private void benchmarkCount(int registers) 73 | { 74 | var n = 100000; 75 | var words = Words.Dictionary(0); 76 | var m = (uint)Math.Pow(2, registers); 77 | 78 | var h = new HyperLogLog(m); 79 | 80 | foreach (var word in words) 81 | { 82 | h.Add(Encoding.ASCII.GetBytes(word)); 83 | } 84 | 85 | for (int i = 0; i < n; i++) 86 | { 87 | h.Count(); 88 | } 89 | } 90 | 91 | [TestMethod] 92 | public void TestHyperLogLogSmall() 93 | { 94 | this.testHyperLogLog(5, 4, 17); 95 | } 96 | 97 | [TestMethod] 98 | public void TestHyperLogLogBig() 99 | { 100 | this.testHyperLogLog(0, 4, 17); 101 | } 102 | 103 | [TestMethod] 104 | public void TestNewDefaultHyperLogLog() 105 | { 106 | var hll = HyperLogLog.NewDefaultHyperLogLog(0.1); 107 | 108 | Assert.AreEqual(128u, hll.M); 109 | } 110 | 111 | [TestMethod] 112 | public void BenchmarkHLLCount4() 113 | { 114 | this.benchmarkCount(4); 115 | } 116 | 117 | [TestMethod] 118 | public void BenchmarkHLLCount5() 119 | { 120 | this.benchmarkCount(5); 121 | } 122 | 123 | [TestMethod] 124 | public void BenchmarkHLLCount6() 125 | { 126 | this.benchmarkCount(6); 127 | } 128 | 129 | [TestMethod] 130 | public void BenchmarkHLLCount7() 131 | { 132 | this.benchmarkCount(7); 133 | } 134 | 135 | [TestMethod] 136 | public void BenchmarkHLLCount8() 137 | { 138 | this.benchmarkCount(8); 139 | } 140 | 141 | [TestMethod] 142 | public void BenchmarkHLLCount9() 143 | { 144 | this.benchmarkCount(9); 145 | } 146 | 147 | [TestMethod] 148 | public void BenchmarkHLLCount10() 149 | { 150 | this.benchmarkCount(10); 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestInverseBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using System.Text; 3 | using ProbabilisticDataStructures; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestInverseBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private
static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] D_BYTES = Encoding.ASCII.GetBytes("d"); 14 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 15 | 16 | /// 17 | /// Ensures that Capacity returns the correct filter size. 18 | /// 19 | [TestMethod] 20 | public void TestInverseCapacity() 21 | { 22 | var f = new InverseBloomFilter(100); 23 | 24 | var capacity = f.Capacity(); 25 | Assert.AreEqual(100u, capacity); 26 | } 27 | 28 | /// 29 | /// Ensures that TestAndAdd behaves correctly. 30 | /// 31 | [TestMethod] 32 | public void TestInverseTestAndAdd() 33 | { 34 | var f = new InverseBloomFilter(3); 35 | 36 | if (f.TestAndAdd(A_BYTES)) 37 | { 38 | Assert.Fail("'a' should not be a member"); 39 | } 40 | 41 | if (!f.Test(A_BYTES)) 42 | { 43 | Assert.Fail("'a' should be a member"); 44 | } 45 | 46 | // 'd' hashes to the same index as 'a' 47 | if (f.TestAndAdd(D_BYTES)) 48 | { 49 | Assert.Fail("'d' should not be a member"); 50 | } 51 | 52 | // 'a' was swapped out. 53 | if (f.TestAndAdd(A_BYTES)) 54 | { 55 | Assert.Fail("'a' should not be a member"); 56 | } 57 | 58 | if (!f.Test(A_BYTES)) 59 | { 60 | Assert.Fail("'a' should be a member"); 61 | } 62 | 63 | // 'b' hashes to another index 64 | if (f.TestAndAdd(B_BYTES)) 65 | { 66 | Assert.Fail("'b' should not be a member"); 67 | } 68 | 69 | if (!f.Test(B_BYTES)) 70 | { 71 | Assert.Fail("'b' should be a member"); 72 | } 73 | 74 | // 'a' should still be a member. 75 | if (!f.Test(A_BYTES)) 76 | { 77 | Assert.Fail("'a' should be a member"); 78 | } 79 | 80 | if (f.Test(C_BYTES)) 81 | { 82 | Assert.Fail("'c' should not be a member"); 83 | } 84 | 85 | var addedC = f.Add(C_BYTES); 86 | Assert.AreSame(f, addedC, "Returned InverseBloomFilter should be the same instance"); 87 | 88 | if (!f.Test(C_BYTES)) 89 | { 90 | Assert.Fail("'c' should be a member"); 91 | } 92 | } 93 | } 94 | 95 | [TestClass] 96 | public class BenchmarkInverseBloomFilter 97 | { 98 | private InverseBloomFilter f; 99 | private int n; 100 | private byte[][] data; 101 | 102 | [TestInitialize()] 103 | public void TestInitialize() 104 | { 105 | n = 100000; 106 | f = new InverseBloomFilter((uint)n); 107 | data = new byte[n][]; 108 | for (int i = 0; i < n; i++) 109 | { 110 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 111 | } 112 | } 113 | 114 | [TestCleanup()] 115 | public void TestCleanup() 116 | { 117 | f = null; 118 | n = 0; 119 | data = null; 120 | } 121 | 122 | [TestMethod] 123 | public void BenchmarkInverseAdd() 124 | { 125 | for (int i = 0; i < n; i++) 126 | { 127 | f.Add(data[i]); 128 | } 129 | } 130 | 131 | [TestMethod] 132 | public void BenchmarkInverseTest() 133 | { 134 | for (int i = 0; i < n; i++) 135 | { 136 | f.Test(data[i]); 137 | } 138 | } 139 | 140 | [TestMethod] 141 | public void BenchmarkInverseTestAndAdd() 142 | { 143 | for (int i = 0; i < n; i++) 144 | { 145 | f.TestAndAdd(data[i]); 146 | } 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestMinHash.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using Microsoft.VisualStudio.TestTools.UnitTesting; 3 | using ProbabilisticDataStructures; 4 | 5 | 6 | namespace TestProbabilisticDataStructures 7 | { 8 | [TestClass] 9 | public class TestMinHash 10 | { 11 | /// 12 | /// Ensures that MinHash returns the correct similarity ratio.
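/// The ratio behaves like Jaccard similarity, |A ∩ B| / |A ∪ B| (an inference from the assertions below): identical bags give 1.0, disjoint bags give 0.0, and a 500-word subset of a 1000-word dictionary lands near 0.5.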
13 | /// 14 | [TestMethod] 15 | public void TestMinHashSimilarity() 16 | { 17 | var bag = new List<string>{ 18 | "bob", 19 | "alice", 20 | "frank", 21 | "tyler", 22 | "sara" 23 | }; 24 | 25 | var simRatio = MinHash.Similarity(bag.ToArray(), bag.ToArray()); 26 | Assert.AreEqual(1.0, simRatio); 27 | 28 | var dict = Words.Dictionary(1000); 29 | var bag2 = new List<string>(); 30 | for (int i = 0; i < 1000; i++) 31 | { 32 | bag2.Add(i.ToString()); 33 | } 34 | 35 | simRatio = MinHash.Similarity(dict, bag2.ToArray()); 36 | Assert.AreEqual(0.0, simRatio); 37 | 38 | var bag3 = Words.Dictionary(500); 39 | simRatio = MinHash.Similarity(dict, bag3); 40 | if (simRatio > 0.7 || simRatio < 0.5) 41 | { 42 | Assert.Fail(string.Format("Expected between 0.5 and 0.7, got {0}", simRatio)); 43 | } 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestPartitionedBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestPartitionedBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 14 | 15 | /// 16 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter. 17 | /// 18 | [TestMethod] 19 | public void TestPartitionedCapacity() 20 | { 21 | var f = new PartitionedBloomFilter(100, 0.1); 22 | var capacity = f.Capacity(); 23 | 24 | Assert.AreEqual(480u, capacity); 25 | } 26 | 27 | /// 28 | /// Ensures that K() returns the number of hash functions in the Bloom Filter. 29 | /// 30 | [TestMethod] 31 | public void TestPartitionedK() 32 | { 33 | var f = new PartitionedBloomFilter(100, 0.1); 34 | var k = f.K(); 35 | 36 | Assert.AreEqual(4u, k); 37 | } 38 | 39 | /// 40 | /// Ensures that Count returns the number of items added to the filter. 41 | /// 42 | [TestMethod] 43 | public void TestPartitionedCount() 44 | { 45 | var f = new PartitionedBloomFilter(100, 0.1); 46 | for (uint i = 0; i < 10; i++) 47 | { 48 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 49 | } 50 | 51 | var count = f.Count(); 52 | Assert.AreEqual(10u, count); 53 | } 54 | 55 | /// 56 | /// Ensures that EstimatedFillRatio returns the correct approximation. 57 | /// 58 | [TestMethod] 59 | public void TestPartitionedEstimatedFillRatio() 60 | { 61 | var f = new PartitionedBloomFilter(100, 0.5); 62 | for (uint i = 0; i < 100; i++) 63 | { 64 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 65 | } 66 | 67 | var ratio = f.EstimatedFillRatio(); 68 | if (ratio > 0.5) 69 | { 70 | Assert.Fail("Expected less than or equal to 0.5, got {0}", ratio); 71 | } 72 | } 73 | 74 | /// 75 | /// Ensures that FillRatio returns the ratio of set bits. 76 | /// 77 | [TestMethod] 78 | public void TestPartitionedFillRatio() 79 | { 80 | var f = new PartitionedBloomFilter(100, 0.1); 81 | f.Add(A_BYTES); 82 | f.Add(B_BYTES); 83 | f.Add(C_BYTES); 84 | f.Add(X_BYTES); 85 | 86 | var ratio = f.FillRatio(); 87 | Assert.AreEqual(0.03125, ratio); 88 | } 89 | 90 | /// 91 | /// Ensures that Test, Add, and TestAndAdd behave correctly.
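/// A minimal sketch (hypothetical driver code): /// <code> /// var pbf = new PartitionedBloomFilter(100, 0.01); /// var first = pbf.TestAndAdd(Encoding.ASCII.GetBytes("a")); // false: not yet a member /// var second = pbf.TestAndAdd(Encoding.ASCII.GetBytes("a")); // true: added by the call above /// </code>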
92 | /// 93 | [TestMethod] 94 | public void TestPartitionedBloomTestAndAdd() 95 | { 96 | var f = new PartitionedBloomFilter(100, 0.01); 97 | 98 | // 'a' is not in the filter. 99 | if (f.Test(A_BYTES)) 100 | { 101 | Assert.Fail("'a' should not be a member"); 102 | } 103 | 104 | var addedF = f.Add(A_BYTES); 105 | Assert.AreSame(f, addedF, "Returned PartitionedBloomFilter should be the same instance"); 106 | 107 | // 'a' is now in the filter. 108 | if (!f.Test(A_BYTES)) 109 | { 110 | Assert.Fail("'a' should be a member"); 111 | } 112 | 113 | // 'a' is still in the filter. 114 | if (!f.TestAndAdd(A_BYTES)) 115 | { 116 | Assert.Fail("'a' should be a member"); 117 | } 118 | 119 | // 'b' is not in the filter. 120 | if (f.TestAndAdd(B_BYTES)) 121 | { 122 | Assert.Fail("'b' should not be a member"); 123 | } 124 | 125 | // 'a' is still in the filter. 126 | if (!f.Test(A_BYTES)) 127 | { 128 | Assert.Fail("'a' should be a member"); 129 | } 130 | 131 | // 'b' is now in the filter. 132 | if (!f.Test(B_BYTES)) 133 | { 134 | Assert.Fail("'b' should be a member"); 135 | } 136 | 137 | // 'c' is not in the filter. 138 | if (f.Test(C_BYTES)) 139 | { 140 | Assert.Fail("'c' should not be a member"); 141 | } 142 | 143 | for (int i = 0; i < 1000000; i++) 144 | { 145 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString())); 146 | } 147 | 148 | // 'x' should be a false positive. 149 | if (!f.Test(X_BYTES)) 150 | { 151 | Assert.Fail("'x' should be a member"); 152 | } 153 | } 154 | 155 | /// 156 | /// Ensures that Reset sets every bit to zero. 157 | /// 158 | [TestMethod] 159 | public void TestPartitionedBloomReset() 160 | { 161 | var f = new PartitionedBloomFilter(100, 0.1); 162 | for (int i = 0; i < 1000; i++) 163 | { 164 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 165 | } 166 | 167 | var resetF = f.Reset(); 168 | Assert.AreSame(f, resetF, "Returned PartitionedBloomFilter should be the same instance"); 169 | 170 | foreach (var partition in f.Partitions) 171 | { 172 | for (uint i = 0; i < partition.count; i++) 173 | { 174 | if (partition.Get(i) != 0) 175 | { 176 | Assert.Fail("Expected all bits to be unset"); 177 | } 178 | } 179 | } 180 | } 181 | 182 | [TestMethod] 183 | public void BenchmarkPartitionedBloomAdd() 184 | { 185 | var n = 100000; 186 | var f = new PartitionedBloomFilter(100000, 0.1); 187 | var data = new byte[n][]; 188 | for (int i = 0; i < n; i++) 189 | { 190 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 191 | } 192 | 193 | for (int i = 0; i < n; i++) 194 | { 195 | f.Add(data[i]); 196 | } 197 | } 198 | 199 | [TestMethod] 200 | public void BenchmarkPartitionedBloomTest() 201 | { 202 | var n = 100000; 203 | var f = new PartitionedBloomFilter(100000, 0.1); 204 | var data = new byte[n][]; 205 | for (int i = 0; i < n; i++) 206 | { 207 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 208 | } 209 | 210 | for (int i = 0; i < n; i++) 211 | { 212 | f.Test(data[i]); 213 | } 214 | } 215 | 216 | [TestMethod] 217 | public void BenchmarkPartitionedBloomTestAndAdd() 218 | { 219 | var n = 100000; 220 | var f = new PartitionedBloomFilter(100000, 0.1); 221 | var data = new byte[n][]; 222 | for (int i = 0; i < n; i++) 223 | { 224 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 225 | } 226 | 227 | for (int i = 0; i < n; i++) 228 | { 229 | f.TestAndAdd(data[i]); 230 | } 231 | } 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestProbabilisticDataStructures.cs:
-------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Security.Cryptography; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestProbabilisticDataStructures 9 | { 10 | /// 11 | /// Ensures that correct math is performed for OptimalM(). 12 | /// 13 | [TestMethod] 14 | public void TestOptimalM() 15 | { 16 | var optimalM = OptimalM(100, 0.01); 17 | Assert.AreEqual(959u, optimalM); 18 | 19 | optimalM = OptimalM(100, 0.5); 20 | Assert.AreEqual(145u, optimalM); 21 | } 22 | 23 | /// 24 | /// Ensures that correct math is performed for OptimalM64(). 25 | /// 26 | [TestMethod] 27 | public void TestOptimalM64() 28 | { 29 | var optimalM = OptimalM64(100, 0.01); 30 | Assert.AreEqual(959ul, optimalM); 31 | 32 | optimalM = OptimalM64(100, 0.5); 33 | Assert.AreEqual(145ul, optimalM); 34 | 35 | optimalM = OptimalM64(8589934592ul, 0.0001); 36 | Assert.AreEqual(164670049045ul, optimalM); 37 | } 38 | 39 | /// 40 | /// Ensures that correct math is performed for OptimalK(). 41 | /// 42 | [TestMethod] 43 | public void TestOptimalK() 44 | { 45 | var optimalK = OptimalK(0.01); 46 | Assert.AreEqual(7u, optimalK); 47 | 48 | optimalK = OptimalK(0.0001); 49 | Assert.AreEqual(14u, optimalK); 50 | } 51 | 52 | /// 53 | /// Ensures that HashKernel() returns the same upper and lower base 54 | /// as https://github.com/tylertreat/BoomFilters does when using the 55 | /// FNV1 hash. 56 | /// 57 | [TestMethod] 58 | public void TestHashKernelFNV1() 59 | { 60 | // FNV1 hash bytes for new byte[] { 0, 1, 2, 3 } 61 | var hashBytes = 62 | new byte[] 63 | { 64 | 0x15, 65 | 0x54, 66 | 0xe0, 67 | 0x98, 68 | 0x7f, 69 | 0x32, 70 | 0x75, 71 | 0x44 72 | }; 73 | var hashKernel = ProbabilisticDataStructures 74 | .Utils.HashKernelFromHashBytes(hashBytes); 75 | // Compare against upper and lower base values gotten by 76 | // calling the HashKernel function from 77 | // https://github.com/tylertreat/BoomFilters using that library's 78 | // default FNV1 hash algorithm. 79 | Assert.AreEqual(2564838421u, hashKernel.LowerBaseHash); 80 | Assert.AreEqual(1148531327u, hashKernel.UpperBaseHash); 81 | } 82 | 83 | /// 84 | /// Ensures that HashKernel() returns the proper upper and lower base when using 85 | /// MD5. 86 | /// 87 | [TestMethod] 88 | public void TestHashKernelMD5() 89 | { 90 | var data = new byte[] { 0, 1, 2, 3 }; 91 | var hashAlgorithm = HashAlgorithm.Create("MD5"); 92 | var hashKernel = ProbabilisticDataStructures 93 | .Utils.HashKernel(data, hashAlgorithm); 94 | 95 | Assert.AreEqual(4254774583u, hashKernel.LowerBaseHash); 96 | Assert.AreEqual(4179961689u, hashKernel.UpperBaseHash); 97 | } 98 | 99 | /// 100 | /// Ensures that HashKernel() returns the proper upper and lower base when using 101 | /// SHA256. 102 | /// 103 | [TestMethod] 104 | public void TestHashKernelSHA256() 105 | { 106 | var data = new byte[] { 0, 1, 2, 3 }; 107 | var hashAlgorithm = HashAlgorithm.Create("SHA256"); 108 | var hashKernel = ProbabilisticDataStructures 109 | .Utils.HashKernel(data, hashAlgorithm); 110 | 111 | Assert.AreEqual(3252571653u, hashKernel.LowerBaseHash); 112 | Assert.AreEqual(1646207440u, hashKernel.UpperBaseHash); 113 | } 114 | 115 | /// 116 | /// Ensures that HashKernel() returns the proper upper and lower base when using 117 | /// MD5. 
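/// (As the HashBytesToUInt32/HashBytesToUInt64 tests further down illustrate, the kernel reads the digest little-endian: the lower base hash comes from the first word of the digest and the upper base hash from the word that follows it.)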
118 | /// 119 | [TestMethod] 120 | public void TestHashKernel128MD5() 121 | { 122 | var data = new byte[] { 0, 1, 2, 3 }; 123 | var hashAlgorithm = HashAlgorithm.Create("MD5"); 124 | var hashKernel = ProbabilisticDataStructures 125 | .Utils.HashKernel128(data, hashAlgorithm); 126 | 127 | Assert.AreEqual(17952798757042697527ul, hashKernel.LowerBaseHash); 128 | Assert.AreEqual(7516929291713011248ul, hashKernel.UpperBaseHash); 129 | } 130 | 131 | /// 132 | /// Ensures that HashKernel128() returns the proper upper and lower base when using 133 | /// SHA256. 134 | /// 135 | [TestMethod] 136 | public void TestHashKernel128SHA256() 137 | { 138 | var data = new byte[] { 0, 1, 2, 3 }; 139 | var hashAlgorithm = HashAlgorithm.Create("SHA256"); 140 | var hashKernel = ProbabilisticDataStructures 141 | .Utils.HashKernel128(data, hashAlgorithm); 142 | 143 | Assert.AreEqual(7070407120484453893ul, hashKernel.LowerBaseHash); 144 | Assert.AreEqual(4682007113097866575ul, hashKernel.UpperBaseHash); 145 | } 146 | 147 | /// 148 | /// Helper method to get OptimalM(). 149 | /// 150 | /// 151 | /// 152 | /// 153 | private uint OptimalM(uint n, double fpRate) 154 | { 155 | return ProbabilisticDataStructures 156 | .Utils.OptimalM(n, fpRate); 157 | } 158 | 159 | /// 160 | /// Helper method to get OptimalM64(). 161 | /// 162 | /// 163 | /// 164 | /// 165 | private ulong OptimalM64(ulong n, double fpRate) 166 | { 167 | return ProbabilisticDataStructures 168 | .Utils.OptimalM64(n, fpRate); 169 | } 170 | 171 | /// 172 | /// Helper method to get OptimalK(). 173 | /// 174 | /// 175 | /// 176 | private uint OptimalK(double fpRate) 177 | { 178 | return ProbabilisticDataStructures 179 | .Utils.OptimalK(fpRate); 180 | } 181 | 182 | [TestMethod] 183 | public void TestHashBytesToUInt32() 184 | { 185 | var hashBytes = 186 | new byte[] 187 | { 188 | 0x40, 189 | 0x51, 190 | 0x62, 191 | 0x73, 192 | 0x84, 193 | 0x95, 194 | 0xa6, 195 | 0xb7, 196 | 0xc8, 197 | 0xd9, 198 | 0xea, 199 | 0xfb 200 | }; 201 | Assert.AreEqual(0x73625140u, Utils.HashBytesToUInt32(hashBytes, 0)); 202 | Assert.AreEqual(0xb7a69584u, Utils.HashBytesToUInt32(hashBytes, 4)); 203 | Assert.AreEqual(0xfbead9c8u, Utils.HashBytesToUInt32(hashBytes, 8)); 204 | } 205 | 206 | [TestMethod] 207 | public void TestHashBytesToUInt64() 208 | { 209 | var hashBytes = 210 | new byte[] 211 | { 212 | 0x40, 213 | 0x51, 214 | 0x62, 215 | 0x73, 216 | 0x84, 217 | 0x95, 218 | 0xa6, 219 | 0xb7, 220 | 0xc8, 221 | 0xd9, 222 | 0xea, 223 | 0xfb 224 | }; 225 | Assert.AreEqual(0xb7a6958473625140ul, Utils.HashBytesToUInt64(hashBytes, 0)); 226 | Assert.AreEqual(0xfbead9c8b7a69584ul, Utils.HashBytesToUInt64(hashBytes, 4)); 227 | } 228 | 229 | [TestMethod] 230 | public void TestComputeHashAsStringMD5() 231 | { 232 | var data = new byte[] { 0, 1, 2, 3 }; 233 | var hashingAlgorithm = HashAlgorithm.Create("MD5"); 234 | var hashString = Utils.ComputeHashAsString(data, hashingAlgorithm); 235 | Assert.AreEqual("37B59AFD592725F9305E484A5D7F5168", hashString); 236 | } 237 | 238 | [TestMethod] 239 | public void TestComputeHashAsStringSHA256() 240 | { 241 | var data = new byte[] { 0, 1, 2, 3 }; 242 | var hashingAlgorithm = HashAlgorithm.Create("SHA256"); 243 | var hashString = Utils.ComputeHashAsString(data, hashingAlgorithm); 244 | Assert.AreEqual("054EDEC1D0211F624FED0CBCA9D4F9400B0E491C43742AF2C5B0ABEBF0C990D8", hashString); 245 | } 246 | } 247 | } 248 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestProbabilisticDataStructures.csproj:
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Debug 5 | AnyCPU 6 | {8212EFDE-5134-4914-96D3-C550FD9432F1} 7 | Library 8 | Properties 9 | TestProbabilisticDataStructures 10 | TestProbabilisticDataStructures 11 | v4.7 12 | 512 13 | {3AC096D0-A1C2-E12C-1390-A8335801FDAB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} 14 | 10.0 15 | $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion) 16 | $(ProgramFiles)\Common Files\microsoft shared\VSTT\$(VisualStudioVersion)\UITestExtensionPackages 17 | False 18 | UnitTest 19 | 20 | 21 | 22 | true 23 | full 24 | false 25 | bin\Debug\ 26 | DEBUG;TRACE 27 | prompt 28 | 4 29 | 30 | 31 | pdbonly 32 | true 33 | bin\Release\ 34 | TRACE 35 | prompt 36 | 4 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | {bf43f4a8-a892-413c-8e11-9a53d2249bf4} 76 | ProbabilisticDataStructures 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | False 85 | 86 | 87 | False 88 | 89 | 90 | False 91 | 92 | 93 | False 94 | 95 | 96 | 97 | 98 | 99 | 100 | 107 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestScalableBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestScalableBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 14 | 15 | [TestMethod] 16 | public void TestNewDefaultScalableBloomFilter() 17 | { 18 | var f = ScalableBloomFilter.NewDefaultScalableBloomFilter(0.1); 19 | 20 | Assert.AreEqual(0.1, f.FP); 21 | Assert.AreEqual(10000u, f.Hint); 22 | Assert.AreEqual(0.8, f.R); 23 | } 24 | 25 | [TestMethod] 26 | public void TestScalableBloomCapacity() 27 | { 28 | var f = new ScalableBloomFilter(1, 0.1, 1); 29 | f.AddFilter(); 30 | f.AddFilter(); 31 | 32 | var capacity = f.Capacity(); 33 | Assert.AreEqual(15u, capacity); 34 | } 35 | 36 | // Ensures that K returns the number of hash functions used in each Bloom filter. 37 | [TestMethod] 38 | public void TestScalableBloomK() 39 | { 40 | var f = new ScalableBloomFilter(10, 0.1, 0.8); 41 | 42 | var k = f.K(); 43 | Assert.AreEqual(4u, k); 44 | } 45 | 46 | /// 47 | /// Ensures that FillRatio returns the average fill ratio of the contained 48 | /// filters. 49 | /// 50 | [TestMethod] 51 | public void TestScalableFillRatio() 52 | { 53 | var f = new ScalableBloomFilter(100, 0.1, 0.8); 54 | f.SetHash(ProbabilisticDataStructures.Defaults.GetDefaultHashAlgorithm()); 55 | for (int i = 0; i < 200; i++) 56 | { 57 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 58 | } 59 | 60 | var fillRatio = f.FillRatio(); 61 | if (fillRatio > 0.5) 62 | { 63 | Assert.Fail(string.Format("Expected less than or equal to 0.5, got {0}", fillRatio)); 64 | } 65 | } 66 | 67 | /// 68 | /// Ensures that Test, Add, and TestAndAdd behave correctly. 69 | /// 70 | [TestMethod] 71 | public void TestScalableBloomTestAndAdd() 72 | { 73 | var f = new ScalableBloomFilter(1000, 0.01, 0.8); 74 | 75 | // 'a' is not in the filter. 
76 | if (f.Test(A_BYTES)) 77 | { 78 | Assert.Fail("'a' should not be a member"); 79 | } 80 | 81 | var addedF = f.Add(A_BYTES); 82 | Assert.AreSame(f, addedF, "Returned ScalableBloomFilter should be the same instance"); 83 | 84 | // 'a' is now in the filter. 85 | if (!f.Test(A_BYTES)) 86 | { 87 | Assert.Fail("'a' should be a member"); 88 | } 89 | 90 | // 'a' is still in the filter. 91 | if (!f.TestAndAdd(A_BYTES)) 92 | { 93 | Assert.Fail("'a' should be a member"); 94 | } 95 | 96 | // 'b' is not in the filter. 97 | if (f.TestAndAdd(B_BYTES)) 98 | { 99 | Assert.Fail("'b' should not be a member"); 100 | } 101 | 102 | // 'a' is still in the filter. 103 | if (!f.Test(A_BYTES)) 104 | { 105 | Assert.Fail("'a' should be a member"); 106 | } 107 | 108 | // 'b' is now in the filter. 109 | if (!f.Test(B_BYTES)) 110 | { 111 | Assert.Fail("'b' should be a member"); 112 | } 113 | 114 | // 'c' is not in the filter. 115 | if (f.Test(C_BYTES)) 116 | { 117 | Assert.Fail("'c' should not be a member"); 118 | } 119 | 120 | for (int i = 0; i < 10000; i++) 121 | { 122 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString())); 123 | } 124 | 125 | // 'x' should not be a false positive. 126 | if (f.Test(X_BYTES)) 127 | { 128 | Assert.Fail("'x' should not be a member"); 129 | } 130 | } 131 | 132 | /// 133 | /// Ensures that Reset sets every bit to zero. 134 | /// 135 | [TestMethod] 136 | public void TestScalableBloomReset() 137 | { 138 | var f = new ScalableBloomFilter(10, 0.1, 0.8); 139 | for (int i = 0; i < 1000; i++) 140 | { 141 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 142 | } 143 | 144 | var count = f.Filters.Count; 145 | Assert.IsTrue(count > 1, string.Format("Expected more than 1 filter, got {0}", count)); 146 | 147 | var resetF = f.Reset(); 148 | Assert.AreSame(f, resetF, "Returned ScalableBloomFilter should be the same instance"); 149 | 150 | count = f.Filters.Count; 151 | Assert.IsTrue(count == 1, string.Format("Expected 1 filter, got {0}", count)); 152 | 153 | foreach(var partition in f.Filters[0].Partitions) 154 | { 155 | for (uint i = 0; i < partition.count; i++) 156 | { 157 | if (partition.Get(i) != 0) 158 | { 159 | Assert.Fail("Expected all bits to be unset"); 160 | } 161 | } 162 | } 163 | } 164 | 165 | [TestMethod] 166 | public void BenchmarkScalableBloomAdd() 167 | { 168 | var n = 100000; 169 | var f = new ScalableBloomFilter(100000, 0.1, 0.8); 170 | var data = new byte[n][]; 171 | for (int i = 0; i < n; i++) 172 | { 173 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 174 | } 175 | 176 | for (int i = 0; i < n; i++) 177 | { 178 | f.Add(data[i]); 179 | } 180 | } 181 | 182 | [TestMethod] 183 | public void BenchmarkScalableBloomTest() 184 | { 185 | var n = 100000; 186 | var f = new ScalableBloomFilter(100000, 0.1, 0.8); 187 | var data = new byte[n][]; 188 | for (int i = 0; i < n; i++) 189 | { 190 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 191 | } 192 | 193 | for (int i = 0; i < n; i++) 194 | { 195 | f.Test(data[i]); 196 | } 197 | } 198 | 199 | [TestMethod] 200 | public void BenchmarkScalableBloomTestAndAdd() 201 | { 202 | var n = 100000; 203 | var f = new ScalableBloomFilter(100000, 0.1, 0.8); 204 | var data = new byte[n][]; 205 | for (int i = 0; i < n; i++) 206 | { 207 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 208 | } 209 | 210 | for (int i = 0; i < n; i++) 211 | { 212 | f.TestAndAdd(data[i]); 213 | } 214 | } 215 | } 216 | } 217 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestTopK.cs:
-------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | using System.Linq; 5 | 6 | namespace TestProbabilisticDataStructures 7 | { 8 | [TestClass] 9 | public class TestTopK 10 | { 11 | private static byte[] BOB_BYTES = Encoding.ASCII.GetBytes("bob"); 12 | private static byte[] TYLER_BYTES = Encoding.ASCII.GetBytes("tyler"); 13 | private static byte[] FRED_BYTES = Encoding.ASCII.GetBytes("fred"); 14 | private static byte[] ALICE_BYTES = Encoding.ASCII.GetBytes("alice"); 15 | private static byte[] JAMES_BYTES = Encoding.ASCII.GetBytes("james"); 16 | private static byte[] SARA_BYTES = Encoding.ASCII.GetBytes("sara"); 17 | private static byte[] BILL_BYTES = Encoding.ASCII.GetBytes("bill"); 18 | 19 | /// 20 | /// Ensures that TopK returns the top-k most frequent elements. 21 | /// 22 | [TestMethod] 23 | public void TestTopk() 24 | { 25 | var topK = new TopK(0.001, 0.99, 5); 26 | 27 | topK.Add(BOB_BYTES).Add(BOB_BYTES).Add(BOB_BYTES); 28 | topK.Add(TYLER_BYTES).Add(TYLER_BYTES).Add(TYLER_BYTES).Add(TYLER_BYTES).Add(TYLER_BYTES); 29 | topK.Add(FRED_BYTES); 30 | topK.Add(ALICE_BYTES).Add(ALICE_BYTES).Add(ALICE_BYTES).Add(ALICE_BYTES); 31 | topK.Add(JAMES_BYTES); 32 | topK.Add(FRED_BYTES); 33 | topK.Add(SARA_BYTES).Add(SARA_BYTES); 34 | 35 | var addedK = topK.Add(BILL_BYTES); 36 | Assert.AreSame(topK, addedK); 37 | // The most recently added element, 'bill', is expected to appear as well. 38 | var expected = new ProbabilisticDataStructures.Element[]{ 39 | new ProbabilisticDataStructures.Element{Data=BILL_BYTES, Freq=1}, 40 | new ProbabilisticDataStructures.Element{Data=SARA_BYTES, Freq=2}, 41 | new ProbabilisticDataStructures.Element{Data=BOB_BYTES, Freq=3}, 42 | new ProbabilisticDataStructures.Element{Data=ALICE_BYTES, Freq=4}, 43 | new ProbabilisticDataStructures.Element{Data=TYLER_BYTES, Freq=5}, 44 | }; 45 | 46 | var actual = topK.Elements(); 47 | 48 | Assert.AreEqual(5, actual.Length); 49 | 50 | for (int i = 0; i < actual.Length; i++) 51 | { 52 | var element = actual[i]; 53 | Assert.IsTrue(Enumerable.SequenceEqual(element.Data, expected[i].Data)); 54 | // Frequencies should match as well. 55 | Assert.AreEqual(expected[i].Freq, element.Freq); 56 | } 57 | 58 | var resetK = topK.Reset(); 59 | Assert.AreSame(topK, resetK); 60 | 61 | Assert.AreEqual(0, topK.Elements().Length); 62 | Assert.AreEqual(0u, topK.N); 63 | } 64 | 65 | [TestMethod] 66 | public void BenchmarkTopKAdd() 67 | { 68 | var n = 100000; 69 | var topK = new TopK(0.001, 0.99, 5); 70 | var data = new byte[n][]; 71 | for (int i = 0; i < n; i++) 72 | { 73 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 74 | } 75 | 76 | for (int i = 0; i < n; i++) 77 | { 78 | topK.Add(data[i]); 79 | } 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | version: 1.0.{build} 2 | configuration: Release 3 | test: on 4 | skip_tags: true 5 | pull_requests: 6 | do_not_increment_build_number: true 7 | build: 8 | verbosity: minimal 9 | assembly_info: 10 | patch: true 11 | file: '**\AssemblyInfo.*' 12 | assembly_version: '{version}' 13 | assembly_file_version: '{version}' 14 | assembly_informational_version: '{version}' 15 | artifacts: 16 | - path: ProbabilisticDataStructures\bin\Release 17 | name: ProbabilisticDataStructures-v$(appveyor_build_version) 18 | --------------------------------------------------------------------------------