├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── CONTRIBUTING.md ├── Default.testsettings ├── LICENSE.txt ├── PULL_REQUEST_TEMPLATE.md ├── ProbabilisticDataStructures.sln ├── ProbabilisticDataStructures.vsmdi ├── ProbabilisticDataStructures ├── BloomFilter.cs ├── BloomFilter64.cs ├── Buckets.cs ├── Buckets64.cs ├── CountMinSketch.cs ├── CountingBloomFilter.cs ├── CuckooBloomFilter.cs ├── Defaults.cs ├── DeletableBloomFilter.cs ├── Element.cs ├── ElementHeap.cs ├── HyperLogLog.cs ├── IFilter.cs ├── InverseBloomFilter.cs ├── MinHash.cs ├── PartitionedBloomFilter.cs ├── ProbabilisticDataStructures.csproj ├── ScalableBloomFilter.cs ├── StableBloomFilter.cs ├── TopK.cs └── Utils.cs ├── README.md ├── TestProbabilisticDataStructures ├── Properties │ └── AssemblyInfo.cs ├── TestBloomFilter.cs ├── TestBloomFilter64.cs ├── TestBuckets.cs ├── TestBuckets64.cs ├── TestCountMinSketch.cs ├── TestCountingBloomFilter.cs ├── TestCuckooBloomFilter.cs ├── TestDeletableBloomFilter.cs ├── TestHyperLogLog.cs ├── TestInverseBloomFilter.cs ├── TestMinHash.cs ├── TestPartitionedBloomFilter.cs ├── TestProbabilisticDataStructures.cs ├── TestProbabilisticDataStructures.csproj ├── TestScalableBloomFilter.cs ├── TestStableBloomFilter.cs ├── TestTopK.cs └── Words.cs └── appveyor.yml /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | **Describe the bug** 8 | A clear and concise description of what the bug is. 9 | 10 | **To Reproduce** 11 | Steps to reproduce the behavior: 12 | 1. Go to '...' 13 | 2. Click on '....' 14 | 3. Scroll down to '....' 15 | 4. See error 16 | 17 | **Expected behavior** 18 | A clear and concise description of what you expected to happen. 19 | 20 | **Screenshots** 21 | If applicable, add screenshots to help explain your problem. 22 | 23 | **Desktop (please complete the following information):** 24 | - OS: [e.g. iOS] 25 | - Browser [e.g. chrome, safari] 26 | - Version [e.g. 22] 27 | 28 | **Smartphone (please complete the following information):** 29 | - Device: [e.g. iPhone6] 30 | - OS: [e.g. iOS8.1] 31 | - Browser [e.g. stock browser, safari] 32 | - Version [e.g. 22] 33 | 34 | **Additional context** 35 | Add any other context about the problem here. 36 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | **Is your feature request related to a problem? Please describe.** 8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 9 | 10 | **Describe the solution you'd like** 11 | A clear and concise description of what you want to happen. 12 | 13 | **Describe alternatives you've considered** 14 | A clear and concise description of any alternative solutions or features you've considered. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 
18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ProbabilisticDataStructures.v11.suo 2 | ProbabilisticDataStructures/bin/ 3 | ProbabilisticDataStructures/obj/ 4 | TestProbabilisticDataStructures/bin/ 5 | TestProbabilisticDataStructures/obj/ 6 | TestResults/ProbabilisticDataStructures.TE.Tests.mdf 7 | TestResults/ProbabilisticDataStructures.TE.Tests_log.ldf 8 | TestResults/ 9 | .vs/ 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | If you think a change would be useful, make a PR! I would only ask that you inquire about a change before writing it if it seems like an oh-man-this-is-changing-everything type of change. 2 | -------------------------------------------------------------------------------- /Default.testsettings: -------------------------------------------------------------------------------- 1 |  2 | 3 | These are default test settings for a local test run. 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### What is this PR? 2 | THIS_IS_A_PR_THAT_DOES_X_Y_Z 3 | 4 | ### Things to consider: 5 | - [ ] I added tests for my changes 6 | - [ ] I ran the tests locally and they all passed 7 | - [x] I am awesome for making a contribution 8 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.27428.2043 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ProbabilisticDataStructures", "ProbabilisticDataStructures\ProbabilisticDataStructures.csproj", "{4775E89C-C139-43B0-8436-B456C035C9D9}" 7 | EndProject 8 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TestProbabilisticDataStructures", "TestProbabilisticDataStructures\TestProbabilisticDataStructures.csproj", "{8212EFDE-5134-4914-96D3-C550FD9432F1}" 9 | EndProject 10 | Global 11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 12 | Debug|Any CPU = Debug|Any CPU 13 | Release|Any CPU = Release|Any CPU 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {4775E89C-C139-43B0-8436-B456C035C9D9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 17 | {4775E89C-C139-43B0-8436-B456C035C9D9}.Debug|Any CPU.Build.0 = Debug|Any CPU 18 | {4775E89C-C139-43B0-8436-B456C035C9D9}.Release|Any CPU.ActiveCfg = Release|Any CPU 19 | {4775E89C-C139-43B0-8436-B456C035C9D9}.Release|Any CPU.Build.0 = Release|Any CPU 20 | {8212EFDE-5134-4914-96D3-C550FD9432F1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 21 | {8212EFDE-5134-4914-96D3-C550FD9432F1}.Debug|Any CPU.Build.0 = Debug|Any CPU 22 | {8212EFDE-5134-4914-96D3-C550FD9432F1}.Release|Any CPU.ActiveCfg = Release|Any CPU 23 | {8212EFDE-5134-4914-96D3-C550FD9432F1}.Release|Any CPU.Build.0 = Release|Any CPU 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 |
SolutionGuid = {DD9C9C10-6340-471D-BF9D-A6823302D332} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures.vsmdi: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/BloomFilter.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Security.Cryptography; 3 | 4 | namespace ProbabilisticDataStructures 5 | { 6 | /// 7 | /// BloomFilter implements a classic Bloom filter. A bloom filter has a non-zero 8 | /// probability of false positives and a zero probability of false negatives. 9 | /// 10 | public class BloomFilter : IFilter 11 | { 12 | /// 13 | /// Filter data 14 | /// 15 | internal Buckets Buckets { get; set; } 16 | /// 17 | /// Hash algorithm 18 | /// 19 | private HashAlgorithm Hash { get; set; } 20 | /// 21 | /// Filter size 22 | /// 23 | private uint m { get; set; } 24 | /// 25 | /// Number of hash functions 26 | /// 27 | private uint k { get; set; } 28 | /// 29 | /// Number of items added 30 | /// 31 | private uint count { get; set; } 32 | 33 | /// 34 | /// Creates a new Bloom filter optimized to store n items with a specified target 35 | /// false-positive rate. 36 | /// 37 | /// Number of items to store. 38 | /// Desired false positive rate. 39 | public BloomFilter(uint n, double fpRate) 40 | { 41 | var m = Utils.OptimalM(n, fpRate); 42 | var k = Utils.OptimalK(fpRate); 43 | Buckets = new Buckets(m, 1); 44 | Hash = Defaults.GetDefaultHashAlgorithm(); 45 | this.m = m; 46 | this.k = k; 47 | } 48 | 49 | /// 50 | /// Returns the Bloom filter capacity, m. 51 | /// 52 | /// The Bloom filter capacity, m. 53 | public uint Capacity() 54 | { 55 | return this.m; 56 | } 57 | 58 | /// 59 | /// Returns the number of hash functions. 60 | /// 61 | /// The number of hash functions. 62 | public uint K() 63 | { 64 | return this.k; 65 | } 66 | 67 | /// 68 | /// Returns the number of items in the filter. 69 | /// 70 | /// 71 | public uint Count() 72 | { 73 | return this.count; 74 | } 75 | 76 | /// 77 | /// Returns the current estimated ratio of set bits. 78 | /// 79 | /// The current estimated ratio of set bits. 80 | public double EstimatedFillRatio() 81 | { 82 | return 1 - Math.Exp((-(double)this.count * (double)this.k) / (double)this.m); 83 | } 84 | 85 | /// 86 | /// Returns the ratio of set bits. 87 | /// 88 | /// The ratio of set bits. 89 | public double FillRatio() 90 | { 91 | uint sum = 0; 92 | for (uint i = 0; i < this.Buckets.count; i++) 93 | { 94 | sum += this.Buckets.Get(i); 95 | } 96 | return (double)sum / (double)this.m; 97 | } 98 | 99 | /// 100 | /// Will test for membership of the data and returns true if it is a member, 101 | /// false if not. This is a probabilistic test, meaning there is a non-zero 102 | /// probability of false positives but a zero probability of false negatives. 103 | /// 104 | /// The data to search for. 105 | /// Whether or not the data is maybe contained in the filter. 106 | public bool Test(byte[] data) 107 | { 108 | var hashKernel = Utils.HashKernel(data, this.Hash); 109 | var lower = hashKernel.LowerBaseHash; 110 | var upper = hashKernel.UpperBaseHash; 111 | 112 | // If any of the K bits are not set, then it's not a member. 
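// Double hashing: all k probe indices are derived from the two 32-bit halves
// of a single hash as index_i = (lower + upper * i) mod m, so the input is
// hashed once rather than k times (the Kirsch-Mitzenmacher construction).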
113 | for (uint i = 0; i < this.k; i++) 114 | { 115 | if (this.Buckets.Get((lower + upper * i) % this.m) == 0) 116 | { 117 | return false; 118 | } 119 | } 120 | return true; 121 | } 122 | 123 | /// 124 | /// Will add the data to the Bloom filter. It returns the filter to allow 125 | /// for chaining. 126 | /// 127 | /// The data to add. 128 | /// The filter. 129 | public IFilter Add(byte[] data) 130 | { 131 | var hashKernel = Utils.HashKernel(data, this.Hash); 132 | var lower = hashKernel.LowerBaseHash; 133 | var upper = hashKernel.UpperBaseHash; 134 | 135 | // Set the K bits. 136 | for (uint i = 0; i < this.k; i++) 137 | { 138 | this.Buckets.Set((lower + upper * i) % this.m, 1); 139 | } 140 | 141 | this.count++; 142 | return this; 143 | } 144 | 145 | /// 146 | /// Is equivalent to calling Test followed by Add. It returns true if the data is 147 | /// a member, false if not. 148 | /// 149 | /// The data to test for and add if it doesn't exist. 150 | /// Whether or not the data was probably contained in the filter. 151 | public bool TestAndAdd(byte[] data) 152 | { 153 | var hashKernel = Utils.HashKernel(data, this.Hash); 154 | var lower = hashKernel.LowerBaseHash; 155 | var upper = hashKernel.UpperBaseHash; 156 | var member = true; 157 | 158 | // If any of the K bits are not set, then it's not a member. 159 | for (uint i = 0; i < this.k; i++) 160 | { 161 | var idx = (lower + upper * i) % this.m; 162 | if (this.Buckets.Get(idx) == 0) 163 | { 164 | member = false; 165 | } 166 | this.Buckets.Set(idx, 1); 167 | } 168 | 169 | this.count++; 170 | return member; 171 | } 172 | 173 | /// 174 | /// Restores the Bloom filter to its original state. It returns the filter to 175 | /// allow for chaining. 176 | /// 177 | /// The reset bloom filter. 178 | public BloomFilter Reset() 179 | { 180 | this.Buckets.Reset(); 181 | return this; 182 | } 183 | 184 | /// 185 | /// Sets the hashing function used in the filter. 186 | /// 187 | /// The HashAlgorithm to use. 188 | // TODO: Add SetHash to the IFilter interface? 189 | public void SetHash(HashAlgorithm h) 190 | { 191 | this.Hash = h; 192 | } 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/BloomFilter64.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | using ProbabilisticDataStructures; 7 | using System.Security.Cryptography; 8 | 9 | namespace ProbabilisticDataStructures 10 | { 11 | /// 12 | /// BloomFilter64 implements a classic Bloom filter. A bloom filter has a non-zero 13 | /// probability of false positives and a zero probability of false negatives. 14 | /// 15 | public class BloomFilter64 : IFilter 16 | { 17 | /// 18 | /// Filter data 19 | /// 20 | internal Buckets64 Buckets { get; set; } 21 | /// 22 | /// Hash algorithm 23 | /// 24 | private HashAlgorithm Hash { get; set; } 25 | /// 26 | /// Filter size 27 | /// 28 | private ulong m { get; set; } 29 | /// 30 | /// Number of hash functions 31 | /// 32 | private uint k { get; set; } 33 | /// 34 | /// Number of items added 35 | /// 36 | private ulong count { get; set; } 37 | 38 | /// 39 | /// Creates a new Bloom filter optimized to store n items with a specified target 40 | /// false-positive rate. 41 | /// 42 | /// Number of items to store. 43 | /// Desired false positive rate. 
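// A minimal usage sketch (hypothetical caller; Encoding comes from System.Text):
//   var filter = new BloomFilter64(1000000UL, 0.01);
//   filter.Add(Encoding.UTF8.GetBytes("alice"));
//   filter.Test(Encoding.UTF8.GetBytes("alice")); // always true once added
//   filter.Test(Encoding.UTF8.GetBytes("bob"));   // false, or a ~1% false positive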
44 | public BloomFilter64(ulong n, double fpRate) 45 | { 46 | var m = Utils.OptimalM64(n, fpRate); 47 | var k = Utils.OptimalK(fpRate); 48 | Buckets = new Buckets64(m, 1); 49 | Hash = Defaults.GetDefaultHashAlgorithm(); 50 | this.m = m; 51 | this.k = k; 52 | } 53 | 54 | /// 55 | /// Returns the Bloom filter capacity, m. 56 | /// 57 | /// The Bloom filter capacity, m. 58 | public ulong Capacity() 59 | { 60 | return this.m; 61 | } 62 | 63 | /// 64 | /// Returns the number of hash functions. 65 | /// 66 | /// The number of hash functions. 67 | public uint K() 68 | { 69 | return this.k; 70 | } 71 | 72 | /// 73 | /// Returns the number of items in the filter. 74 | /// 75 | /// 76 | public ulong Count() 77 | { 78 | return this.count; 79 | } 80 | 81 | /// 82 | /// Returns the current estimated ratio of set bits. 83 | /// 84 | /// The current estimated ratio of set bits. 85 | public double EstimatedFillRatio() 86 | { 87 | return 1 - Math.Exp((-(double)this.count * (double)this.k) / (double)this.m); 88 | } 89 | 90 | /// 91 | /// Returns the ratio of set bits. 92 | /// 93 | /// The ratio of set bits. 94 | public double FillRatio() 95 | { 96 | ulong sum = 0; 97 | for (ulong i = 0; i < this.Buckets.count; i++) 98 | { 99 | sum += this.Buckets.Get(i); 100 | } 101 | return (double)sum / (double)this.m; 102 | } 103 | 104 | /// 105 | /// Will test for membership of the data and returns true if it is a member, 106 | /// false if not. This is a probabilistic test, meaning there is a non-zero 107 | /// probability of false positives but a zero probability of false negatives. 108 | /// 109 | /// The data to search for. 110 | /// Whether or not the data is maybe contained in the filter. 111 | public bool Test(byte[] data) 112 | { 113 | var hashKernel = Utils.HashKernel128(data, this.Hash); 114 | var lower = hashKernel.LowerBaseHash; 115 | var upper = hashKernel.UpperBaseHash; 116 | 117 | // If any of the K bits are not set, then it's not a member. 118 | for (uint i = 0; i < this.k; i++) 119 | { 120 | if (this.Buckets.Get((lower + upper * i) % this.m) == 0) 121 | { 122 | return false; 123 | } 124 | } 125 | return true; 126 | } 127 | 128 | /// 129 | /// Will add the data to the Bloom filter. It returns the filter to allow 130 | /// for chaining. 131 | /// 132 | /// The data to add. 133 | /// The filter. 134 | public IFilter Add(byte[] data) 135 | { 136 | var hashKernel = Utils.HashKernel128(data, this.Hash); 137 | var lower = hashKernel.LowerBaseHash; 138 | var upper = hashKernel.UpperBaseHash; 139 | 140 | // Set the K bits. 141 | for (uint i = 0; i < this.k; i++) 142 | { 143 | this.Buckets.Set((lower + upper * i) % this.m, 1); 144 | } 145 | 146 | this.count++; 147 | return this; 148 | } 149 | 150 | /// 151 | /// Is equivalent to calling Test followed by Add. It returns true if the data is 152 | /// a member, false if not. 153 | /// 154 | /// The data to test for and add if it doesn't exist. 155 | /// Whether or not the data was probably contained in the filter. 156 | public bool TestAndAdd(byte[] data) 157 | { 158 | var hashKernel = Utils.HashKernel128(data, this.Hash); 159 | var lower = hashKernel.LowerBaseHash; 160 | var upper = hashKernel.UpperBaseHash; 161 | var member = true; 162 | 163 | // If any of the K bits are not set, then it's not a member. 
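// Test and Add are fused into a single pass: each index is computed once, and
// the membership check and the bit-set happen together, so the input is hashed
// only one time.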
164 | for (uint i = 0; i < this.k; i++) 165 | { 166 | var idx = (lower + upper * i) % this.m; 167 | if (this.Buckets.Get(idx) == 0) 168 | { 169 | member = false; 170 | } 171 | this.Buckets.Set(idx, 1); 172 | } 173 | 174 | this.count++; 175 | return member; 176 | } 177 | 178 | /// 179 | /// Restores the Bloom filter to its original state. It returns the filter to 180 | /// allow for chaining. 181 | /// 182 | /// The reset bloom filter. 183 | public BloomFilter64 Reset() 184 | { 185 | this.Buckets.Reset(); 186 | return this; 187 | } 188 | 189 | /// 190 | /// Sets the hashing function used in the filter. 191 | /// 192 | /// The HashAlgorithm to use. 193 | // TODO: Add SetHash to the IFilter interface? 194 | public void SetHash(HashAlgorithm h) 195 | { 196 | this.Hash = h; 197 | } 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/Buckets.cs: -------------------------------------------------------------------------------- 1 | namespace ProbabilisticDataStructures 2 | { 3 | /// 4 | /// Buckets is a fast, space-efficient array of buckets where each bucket can store 5 | /// up to a configured maximum value. 6 | /// 7 | public class Buckets 8 | { 9 | private byte[] Data { get; set; } 10 | private byte bucketSize { get; set; } 11 | private byte _max; 12 | private int Max 13 | { 14 | get 15 | { 16 | return _max; 17 | } 18 | set 19 | { 20 | // TODO: Figure out this truncation thing. 21 | // I'm not sure if MaxValue is always supposed to be capped at 255 via 22 | // a byte conversion or not... 23 | if (value > byte.MaxValue) 24 | _max = byte.MaxValue; 25 | else 26 | _max = (byte)value; 27 | } 28 | } 29 | internal uint count { get; set; } 30 | 31 | /// 32 | /// Creates a new Buckets with the provided number of buckets where each bucket 33 | /// is the specified number of bits. 34 | /// 35 | /// Number of buckets. 36 | /// Number of bits per bucket. 37 | internal Buckets(uint count, byte bucketSize) 38 | { 39 | this.count = count; 40 | this.Data = new byte[(count * bucketSize + 7) / 8]; 41 | this.bucketSize = bucketSize; 42 | this.Max = (1 << bucketSize) - 1; 43 | } 44 | 45 | /// 46 | /// Returns the maximum value that can be stored in a bucket. 47 | /// 48 | /// The bucket max value. 49 | internal byte MaxBucketValue() 50 | { 51 | return this._max; 52 | } 53 | 54 | /// 55 | /// Increment the value in the specified bucket by the provided delta. A bucket 56 | /// can be decremented by providing a negative delta. 57 | /// 58 | /// The value is clamped to zero and the maximum bucket value. Returns itself 59 | /// to allow for chaining. 60 | /// 61 | /// 62 | /// The bucket to increment. 63 | /// The amount to increment the bucket by. 64 | /// The modified bucket. 65 | internal Buckets Increment(uint bucket, int delta) 66 | { 67 | int val = (int)(GetBits(bucket * this.bucketSize, this.bucketSize) + delta); 68 | 69 | if (val > this.Max) 70 | val = this.Max; 71 | else if (val < 0) 72 | val = 0; 73 | 74 | SetBits((uint)bucket * (uint)this.bucketSize, this.bucketSize, (uint)val); 75 | return this; 76 | } 77 | 78 | /// 79 | /// Set the bucket value. The value is clamped to zero and the maximum bucket 80 | /// value. Returns itself to allow for chaining. 81 | /// 82 | /// The bucket to change the value of. 83 | /// The value to set. 84 | /// The modified bucket. 
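// Bucket i occupies bits [i * bucketSize, (i + 1) * bucketSize) of Data, so,
// for example, 100 buckets of 4 bits pack into (100 * 4 + 7) / 8 = 50 bytes.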
85 | internal Buckets Set(uint bucket, byte value) 86 | { 87 | if (value > this._max) 88 | value = this._max; 89 | 90 | SetBits(bucket * this.bucketSize, this.bucketSize, value); 91 | return this; 92 | } 93 | 94 | /// 95 | /// Returns the value in the specified bucket. 96 | /// 97 | /// The bucket to get. 98 | /// The specified bucket. 99 | internal uint Get(uint bucket) 100 | { 101 | return GetBits(bucket * this.bucketSize, this.bucketSize); 102 | } 103 | 104 | /// 105 | /// Restores the Buckets to the original state. Returns itself to allow for 106 | /// chaining. 107 | /// 108 | /// The Buckets object the reset operation was performed on. 109 | internal Buckets Reset() 110 | { 111 | this.Data = new byte[(this.count * this.bucketSize + 7) / 8]; 112 | return this; 113 | } 114 | 115 | /// 116 | /// Returns the bits at the specified offset and length. 117 | /// 118 | /// The position to start reading at. 119 | /// The distance to read from the offset. 120 | /// The bits at the specified offset and length. 121 | internal uint GetBits(uint offset, int length) 122 | { 123 | uint byteIndex = offset / 8; 124 | int byteOffset = (int)(offset % 8); 125 | 126 | if ((byteOffset + length) > 8) 127 | { 128 | int rem = 8 - byteOffset; 129 | return GetBits(offset, rem) 130 | | (GetBits((uint)(offset + rem), length - rem) << rem); 131 | } 132 | 133 | int bitMask = (1 << length) - 1; 134 | return (uint)((this.Data[byteIndex] & (bitMask << byteOffset)) >> byteOffset); 135 | } 136 | 137 | /// 138 | /// Sets bits at the specified offset and length. 139 | /// 140 | /// The position to start writing at. 141 | /// The distance to write from the offset. 142 | /// The bits to write. 143 | internal void SetBits(uint offset, int length, uint bits) 144 | { 145 | uint byteIndex = offset / 8; 146 | int byteOffset = (int)(offset % 8); 147 | 148 | if ((byteOffset + length) > 8) 149 | { 150 | int rem = 8 - byteOffset; 151 | SetBits(offset, (byte)rem, bits); 152 | SetBits((uint)(offset + rem), length - rem, bits >> rem); 153 | return; 154 | } 155 | 156 | int bitMask = (1 << length) - 1; 157 | this.Data[byteIndex] = 158 | (byte)((this.Data[byteIndex]) & ~(bitMask << byteOffset)); 159 | this.Data[byteIndex] = 160 | (byte)((this.Data[byteIndex]) | ((bits & bitMask) << byteOffset)); 161 | } 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/Buckets64.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | 7 | namespace ProbabilisticDataStructures 8 | { 9 | /// 10 | /// Buckets64 is a fast, space-efficient array of buckets where each bucket can store 11 | /// up to a configured maximum value. 12 | /// 13 | public class Buckets64 14 | { 15 | // The largest C# array to create; the largest power of 2 that C# can support. 16 | private const uint maxArraySize = 1U << 30; 17 | private byte[][] Data { get; set; } 18 | private int arrayCount { get; set; } 19 | private byte bucketSize { get; set; } 20 | private byte _max; 21 | private int Max 22 | { 23 | get 24 | { 25 | return _max; 26 | } 27 | set 28 | { 29 | // TODO: Figure out this truncation thing. 30 | // I'm not sure if MaxValue is always supposed to be capped at 255 via 31 | // a byte conversion or not... 
32 | if (value > byte.MaxValue) 33 | _max = byte.MaxValue; 34 | else 35 | _max = (byte)value; 36 | } 37 | } 38 | internal ulong count { get; set; } 39 | 40 | /// 41 | /// Creates a new Buckets64 with the provided number of buckets where each bucket 42 | /// is the specified number of bits. 43 | /// 44 | /// Number of buckets. 45 | /// Number of bits per bucket. 46 | internal Buckets64(ulong count, byte bucketSize) 47 | { 48 | this.count = count; 49 | this.bucketSize = bucketSize; 50 | AllocateArray(count, bucketSize); 51 | this.Max = (1 << bucketSize) - 1; 52 | } 53 | 54 | private void AllocateArray(ulong count, byte bucketSize) 55 | { 56 | this.arrayCount = (int)(count / maxArraySize + 1); 57 | this.Data = new byte[this.arrayCount][]; 58 | var bytesToAllocate = (count * bucketSize + 7) / 8; 59 | for (int i = 0; i < this.arrayCount; i++) 60 | { 61 | var arraySize = Math.Min(bytesToAllocate, maxArraySize); 62 | this.Data[i] = new byte[arraySize]; 63 | bytesToAllocate -= arraySize; 64 | } 65 | } 66 | 67 | /// 68 | /// Returns the maximum value that can be stored in a bucket. 69 | /// 70 | /// The bucket max value. 71 | internal byte MaxBucketValue() 72 | { 73 | return this._max; 74 | } 75 | 76 | /// 77 | /// Increment the value in the specified bucket by the provided delta. A bucket 78 | /// can be decremented by providing a negative delta. 79 | /// 80 | /// The value is clamped to zero and the maximum bucket value. Returns itself 81 | /// to allow for chaining. 82 | /// 83 | /// 84 | /// The bucket to increment. 85 | /// The amount to increment the bucket by. 86 | /// The modified bucket. 87 | internal Buckets64 Increment(uint bucket, int delta) 88 | { 89 | int val = (int)(GetBits(bucket * this.bucketSize, this.bucketSize) + delta); 90 | 91 | if (val > this.Max) 92 | val = this.Max; 93 | else if (val < 0) 94 | val = 0; 95 | 96 | SetBits((uint)bucket * (uint)this.bucketSize, this.bucketSize, (uint)val); 97 | return this; 98 | } 99 | 100 | /// 101 | /// Set the bucket value. The value is clamped to zero and the maximum bucket 102 | /// value. Returns itself to allow for chaining. 103 | /// 104 | /// The bucket to change the value of. 105 | /// The value to set. 106 | /// The modified bucket. 107 | internal Buckets64 Set(ulong bucket, byte value) 108 | { 109 | if (value > this._max) 110 | value = this._max; 111 | 112 | SetBits(bucket * this.bucketSize, this.bucketSize, value); 113 | return this; 114 | } 115 | 116 | /// 117 | /// Returns the value in the specified bucket. 118 | /// 119 | /// The bucket to get. 120 | /// The specified bucket. 121 | internal uint Get(ulong bucket) 122 | { 123 | return GetBits(bucket * this.bucketSize, this.bucketSize); 124 | } 125 | 126 | /// 127 | /// Restores the Buckets64 to the original state. Returns itself to allow for 128 | /// chaining. 129 | /// 130 | /// The Buckets64 object the reset operation was performed on. 131 | internal Buckets64 Reset() 132 | { 133 | AllocateArray(this.count, this.bucketSize); 134 | return this; 135 | } 136 | 137 | /// 138 | /// Returns the bits at the specified offset and length. 139 | /// 140 | /// The position to start reading at. 141 | /// The distance to read from the offset. 142 | /// The bits at the specified offset and length. 
143 | internal uint GetBits(ulong offset, int length) 144 | { 145 | ulong byteIndex = offset / 8; 146 | int byteOffset = (int)(offset % 8); 147 | 148 | if ((byteOffset + length) > 8) 149 | { 150 | int rem = 8 - byteOffset; 151 | return GetBits(offset, rem) 152 | | (GetBits(offset + (ulong)rem, length - rem) << rem); 153 | } 154 | 155 | var dataArray = this.Data[byteIndex / maxArraySize]; 156 | var dataArrayByteIndex = byteIndex % maxArraySize; 157 | int bitMask = (1 << length) - 1; 158 | return (uint)((dataArray[dataArrayByteIndex] & (bitMask << byteOffset)) >> byteOffset); 159 | } 160 | 161 | /// 162 | /// Sets bits at the specified offset and length. 163 | /// 164 | /// The position to start writing at. 165 | /// The distance to write from the offset. 166 | /// The bits to write. 167 | internal void SetBits(ulong offset, int length, uint bits) 168 | { 169 | ulong byteIndex = offset / 8; 170 | int byteOffset = (int)(offset % 8); 171 | 172 | if ((byteOffset + length) > 8) 173 | { 174 | int rem = 8 - byteOffset; 175 | SetBits(offset, (byte)rem, bits); 176 | SetBits(offset + (ulong)rem, length - rem, bits >> rem); 177 | return; 178 | } 179 | 180 | var dataArray = this.Data[(uint)(byteIndex / maxArraySize)]; 181 | var dataArrayByteIndex = (uint)(byteIndex % maxArraySize); 182 | int bitMask = (1 << length) - 1; 183 | dataArray[dataArrayByteIndex] = 184 | (byte)((dataArray[dataArrayByteIndex]) & ~(bitMask << byteOffset)); 185 | dataArray[dataArrayByteIndex] = 186 | (byte)((dataArray[dataArrayByteIndex]) | ((bits & bitMask) << byteOffset)); 187 | } 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/CountMinSketch.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Security.Cryptography; 3 | 4 | namespace ProbabilisticDataStructures 5 | { 6 | /// 7 | /// CountMinSketch implements a Count-Min Sketch as described by Cormode and 8 | /// Muthukrishnan in An Improved Data Stream Summary: The Count-Min Sketch and its 9 | /// Applications: 10 | /// 11 | /// http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf 12 | /// 13 | /// A Count-Min Sketch (CMS) is a probabilistic data structure which approximates 14 | /// the frequency of events in a data stream. Unlike a hash map, a CMS uses 15 | /// sub-linear space at the expense of a configurable error factor. Similar to 16 | /// Counting Bloom filters, items are hashed to a series of buckets, which increment 17 | /// a counter. The frequency of an item is estimated by taking the minimum of each of 18 | /// the item's respective counter values. 19 | /// 20 | /// Count-Min Sketches are useful for counting the frequency of events in massive 21 | /// data sets or unbounded streams online. In these situations, storing the entire 22 | /// data set or allocating counters for every event in memory is impractical. It may 23 | /// be possible for offline processing, but real-time processing requires fast, 24 | /// space-efficient solutions like the CMS. For approximating set cardinality, refer 25 | /// to the HyperLogLog. 
26 | /// 27 | public class CountMinSketch 28 | { 29 | /// 30 | /// Count matrix 31 | /// 32 | internal UInt64[][] Matrix { get; set; } 33 | /// 34 | /// Matrix width 35 | /// 36 | internal uint Width { get; set; } 37 | /// 38 | /// Matrix depth 39 | /// 40 | internal uint Depth { get; set; } 41 | /// 42 | /// Number of items added 43 | /// 44 | private UInt64 count { get; set; } 45 | /// 46 | /// Relative-accuracy factor 47 | /// 48 | private double epsilon { get; set; } 49 | /// 50 | /// Relative-accuracy probability 51 | /// 52 | private double delta { get; set; } 53 | /// 54 | /// Hash function 55 | /// 56 | private HashAlgorithm Hash { get; set; } 57 | 58 | /// 59 | /// Creates a new Count-Min Sketch whose relative accuracy is within a factor of 60 | /// epsilon with probability delta. Both of these parameters affect the space and 61 | /// time complexity. 62 | /// 63 | /// Relative-accuracy factor 64 | /// Relative-accuracy probability 65 | public CountMinSketch(double epsilon, double delta) 66 | { 67 | var width = (uint)(Math.Ceiling(Math.E / epsilon)); 68 | var depth = (uint)(Math.Ceiling(Math.Log(1 / delta))); 69 | this.Matrix = new UInt64[depth][]; 70 | 71 | for (int i = 0; i < depth; i++) 72 | { 73 | this.Matrix[i] = new UInt64[width]; 74 | } 75 | 76 | this.Width = width; 77 | this.Depth = depth; 78 | this.epsilon = epsilon; 79 | this.delta = delta; 80 | this.Hash = Defaults.GetDefaultHashAlgorithm(); 81 | } 82 | 83 | /// 84 | /// Returns the relative-accuracy factor, epsilon. 85 | /// 86 | /// The relative-accuracy factor, epsilon 87 | public double Epsilon() 88 | { 89 | return this.epsilon; 90 | } 91 | 92 | /// 93 | /// Returns the relative-accuracy probability, delta. 94 | /// 95 | /// The relative-accuracy probability, delta 96 | public double Delta() 97 | { 98 | return this.delta; 99 | } 100 | 101 | /// 102 | /// Returns the number of items added to the sketch. 103 | /// 104 | /// The number of items added to the sketch. 105 | public UInt64 TotalCount() 106 | { 107 | return this.count; 108 | } 109 | 110 | /// 111 | /// Add the data to the set. Returns the CountMinSketch to allow for chaining. 112 | /// 113 | /// The data to add. 114 | /// The CountMinSketch 115 | public CountMinSketch Add(byte[] data) 116 | { 117 | var hashKernel = Utils.HashKernel(data, this.Hash); 118 | var lower = hashKernel.LowerBaseHash; 119 | var upper = hashKernel.UpperBaseHash; 120 | 121 | // Increment count in each row. 122 | for (uint i = 0; i < this.Depth; i++) 123 | { 124 | this.Matrix[i][(lower + upper * i) % this.Width]++; 125 | } 126 | 127 | this.count++; 128 | return this; 129 | } 130 | 131 | /// 132 | /// Returns the approximate count for the specified item, correct within 133 | /// epsilon * total count with a probability of delta. 134 | /// 135 | /// 136 | /// The data to count. 137 | public UInt64 Count(byte[] data) 138 | { 139 | var hashKernel = Utils.HashKernel(data, this.Hash); 140 | var lower = hashKernel.LowerBaseHash; 141 | var upper = hashKernel.UpperBaseHash; 142 | var count = UInt64.MaxValue; 143 | 144 | for (uint i = 0; i < this.Depth; i++) 145 | { 146 | count = Math.Min(count, this.Matrix[i][(lower + upper * i) % this.Width]); 147 | } 148 | 149 | return count; 150 | } 151 | 152 | /// 153 | /// Combines this CountMinSketch with another. Returns a bool if the merge was 154 | /// successful. Throws an exception if the matrix width and depth are not equal. 155 | /// 156 | /// The CountMinSketch to merge with the current 157 | /// instance. 158 | /// True if successful. 
159 | public bool Merge(CountMinSketch other) 160 | { 161 | if (this.Depth != other.Depth) 162 | { 163 | throw new Exception("Matrix depth must match."); 164 | } 165 | 166 | if (this.Width != other.Width) 167 | { 168 | throw new Exception("Matrix width must match."); 169 | } 170 | 171 | for (uint i = 0; i < this.Depth; i++) 172 | { 173 | for (int j = 0; j < this.Width; j++) 174 | { 175 | this.Matrix[i][j] += other.Matrix[i][j]; 176 | } 177 | } 178 | 179 | this.count += other.count; 180 | return true; 181 | } 182 | 183 | /// 184 | /// Restores the CountMinSketch to its original state. It returns itself to allow 185 | /// for chaining. 186 | /// 187 | /// The CountMinSketch 188 | public CountMinSketch Reset() 189 | { 190 | this.Matrix = new UInt64[this.Depth][]; 191 | for (uint i = 0; i < this.Depth; i++) 192 | { 193 | this.Matrix[i] = new UInt64[this.Width]; 194 | } 195 | 196 | this.count = 0; 197 | return this; 198 | } 199 | 200 | /// 201 | /// Sets the hashing function used in the filter. 202 | /// 203 | /// The HashAlgorithm to use. 204 | public void SetHash(HashAlgorithm h) 205 | { 206 | this.Hash = h; 207 | } 208 | 209 | // TODO: Implement these later. 210 | // WriteDataTo() 211 | // ReadDataFrom() 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/CountingBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using System.Security.Cryptography; 2 | 3 | namespace ProbabilisticDataStructures 4 | { 5 | /// 6 | /// CountingBloomFilter implements a Counting Bloom Filter as described by Fan, 7 | /// Cao, Almeida, and Broder in Summary Cache: A Scalable Wide-Area Web Cache 8 | /// Sharing Protocol: 9 | /// 10 | /// http://pages.cs.wisc.edu/~jussara/papers/00ton.pdf 11 | /// 12 | /// A Counting Bloom Filter (CBF) provides a way to remove elements by using an 13 | /// array of n-bit buckets. When an element is added, the respective buckets are 14 | /// incremented. To remove an element, the respective buckets are decremented. A 15 | /// query checks that each of the respective buckets are non-zero. Because CBFs 16 | /// allow elements to be removed, they introduce a non-zero probability of false 17 | /// negatives in addition to the possibility of false positives. 18 | /// 19 | /// Counting Bloom Filters are useful for cases where elements are both added 20 | /// and removed from the data set. Since they use n-bit buckets, CBFs use 21 | /// roughly n-times more memory than traditional Bloom filters. 22 | /// 23 | public class CountingBloomFilter : IFilter 24 | { 25 | /// 26 | /// Filter data 27 | /// 28 | internal Buckets Buckets { get; set; } 29 | /// 30 | /// Hash algorithm 31 | /// 32 | private HashAlgorithm Hash { get; set; } 33 | /// 34 | /// Filter size 35 | /// 36 | private uint m { get; set; } 37 | /// 38 | /// Number of hash functions 39 | /// 40 | private uint k { get; set; } 41 | /// 42 | /// Number of items added 43 | /// 44 | private uint count { get; set; } 45 | /// 46 | /// Buffer used to cache indices 47 | /// 48 | private uint[] indexBuffer { get; set; } 49 | 50 | /// 51 | /// Creates a new Counting Bloom Filter optimized to store n-items with a 52 | /// specified target false-positive rate and bucket size. If you don't know how 53 | /// many bits to use for buckets, use NewDefaultCountingBloomFilter for a 54 | /// sensible default. 55 | /// 56 | /// Number of items to store. 57 | /// Bucket size. 58 | /// Desired false positive rate. 
59 | public CountingBloomFilter(uint n, byte b, double fpRate) 60 | { 61 | var m = Utils.OptimalM(n, fpRate); 62 | var k = Utils.OptimalK(fpRate); 63 | this.Buckets = new Buckets(m, b); 64 | this.Hash = Defaults.GetDefaultHashAlgorithm(); 65 | this.m = m; 66 | this.k = k; 67 | this.indexBuffer = new uint[k]; 68 | } 69 | 70 | /// 71 | /// Creates a new Counting Bloom Filter optimized to store n items with a 72 | /// specified target false-positive rate. Buckets are allocated four bits. 73 | /// 74 | /// Number of items to store. 75 | /// Desired false positive rate. 76 | /// Default CountingBloomFilter 77 | public static CountingBloomFilter NewDefaultCountingBloomFilter( 78 | uint n, 79 | double fpRate) 80 | { 81 | return new CountingBloomFilter(n, 4, fpRate); 82 | } 83 | 84 | /// 85 | /// Returns the Bloom filter capacity, m. 86 | /// 87 | /// The Bloom filter capacity, m. 88 | public uint Capacity() 89 | { 90 | return this.m; 91 | } 92 | 93 | /// 94 | /// Returns the number of hash functions. 95 | /// 96 | /// The number of hash functions. 97 | public uint K() 98 | { 99 | return this.k; 100 | } 101 | 102 | /// 103 | /// Returns the number of items in the filter. 104 | /// 105 | /// 106 | public uint Count() 107 | { 108 | return this.count; 109 | } 110 | 111 | /// 112 | /// Will test for membership of the data and returns true if it is a member, 113 | /// false if not. This is a probabilistic test, meaning there is a non-zero 114 | /// probability of false positives but a zero probability of false negatives. 115 | /// 116 | /// The data to search for. 117 | /// Whether or not the data is maybe contained in the filter. 118 | public bool Test(byte[] data) 119 | { 120 | var hashKernel = Utils.HashKernel(data, this.Hash); 121 | var lower = hashKernel.LowerBaseHash; 122 | var upper = hashKernel.UpperBaseHash; 123 | 124 | // If any of the K bits are not set, then it's not a member. 125 | for (uint i = 0; i < this.k; i++) 126 | { 127 | if (this.Buckets.Get((lower + upper * i) % this.m) == 0) 128 | { 129 | return false; 130 | } 131 | } 132 | return true; 133 | } 134 | 135 | /// 136 | /// Will add the data to the Bloom filter. It returns the filter to allow 137 | /// for chaining. 138 | /// 139 | /// The data to add. 140 | /// The filter. 141 | public IFilter Add(byte[] data) 142 | { 143 | var hashKernel = Utils.HashKernel(data, this.Hash); 144 | var lower = hashKernel.LowerBaseHash; 145 | var upper = hashKernel.UpperBaseHash; 146 | 147 | // Set the K bits. 148 | for (uint i = 0; i < this.k; i++) 149 | { 150 | this.Buckets.Increment((lower + upper * i) % this.m, 1); 151 | } 152 | 153 | this.count++; 154 | return this; 155 | } 156 | 157 | /// 158 | /// Is equivalent to calling Test followed by Add. It returns true if the data is 159 | /// a member, false if not. 160 | /// 161 | /// The data to test for and add if it doesn't exist. 162 | /// Whether or not the data was probably contained in the filter. 163 | public bool TestAndAdd(byte[] data) 164 | { 165 | var hashKernel = Utils.HashKernel(data, this.Hash); 166 | var lower = hashKernel.LowerBaseHash; 167 | var upper = hashKernel.UpperBaseHash; 168 | var member = true; 169 | 170 | // If any of the K bits are not set, then it's not a member. 
171 | for (uint i = 0; i < this.k; i++) 172 | { 173 | var idx = (lower + upper * i) % this.m; 174 | if (this.Buckets.Get(idx) == 0) 175 | { 176 | member = false; 177 | } 178 | this.Buckets.Increment(idx, 1); 179 | } 180 | 181 | this.count++; 182 | return member; 183 | } 184 | 185 | /// 186 | /// Will test for membership of the data and remove it from the filter if it 187 | /// exists. Returns true if the data was a member, false if not. 188 | /// 189 | /// The data to check for and remove. 190 | /// Whether or not the data was in the filter before removal. 191 | public bool TestAndRemove(byte[] data) 192 | { 193 | var hashKernel = Utils.HashKernel(data, this.Hash); 194 | var lower = hashKernel.LowerBaseHash; 195 | var upper = hashKernel.UpperBaseHash; 196 | var member = true; 197 | 198 | // Set the K bits. 199 | for (uint i = 0; i < this.k; i++) 200 | { 201 | this.indexBuffer[i] = (lower + upper * i) % this.m; 202 | if (this.Buckets.Get(this.indexBuffer[i]) == 0) 203 | { 204 | member = false; 205 | } 206 | } 207 | 208 | if (member) 209 | { 210 | foreach (var idx in this.indexBuffer) 211 | { 212 | this.Buckets.Increment(idx, -1); 213 | } 214 | this.count--; 215 | } 216 | 217 | return member; 218 | } 219 | 220 | /// 221 | /// Restores the Bloom filter to its original state. It returns the filter to 222 | /// allow for chaining. 223 | /// 224 | /// The reset bloom filter. 225 | public CountingBloomFilter Reset() 226 | { 227 | this.Buckets.Reset(); 228 | this.count = 0; 229 | return this; 230 | } 231 | 232 | /// 233 | /// Sets the hashing function used in the filter. 234 | /// 235 | /// The HashAlgorithm to use. 236 | // TODO: Add SetHash to the IFilter interface? 237 | public void SetHash(HashAlgorithm h) 238 | { 239 | this.Hash = h; 240 | } 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/Defaults.cs: -------------------------------------------------------------------------------- 1 | using System.Security.Cryptography; 2 | using System.Runtime.CompilerServices; 3 | [assembly: InternalsVisibleTo("TestProbabilisticDataStructures")] 4 | 5 | namespace ProbabilisticDataStructures 6 | { 7 | public static class Defaults 8 | { 9 | public const double FILL_RATIO = 0.5; 10 | 11 | /// 12 | /// Returns the default hashing algorithm for the library. 13 | /// 14 | /// The default hashing algorithm for the library 15 | internal static HashAlgorithm GetDefaultHashAlgorithm() 16 | { 17 | return HashAlgorithm.Create("MD5"); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/DeletableBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using System.Security.Cryptography; 2 | 3 | namespace ProbabilisticDataStructures 4 | { 5 | /// 6 | /// DeletableBloomFilter implements a Deletable Bloom Filter as described by 7 | /// Rothenberg, Macapuna, Verdi, Magalhaes in The Deletable Bloom filter - A new 8 | /// member of the Bloom family: 9 | /// 10 | /// http://arxiv.org/pdf/1005.0352.pdf 11 | /// 12 | /// A Deletable Bloom Filter compactly stores information on collisions when 13 | /// inserting elements. This information is used to determine if elements are 14 | /// deletable. This design enables false-negative-free deletions at a fraction 15 | /// of the cost in memory consumption. 
16 | /// 17 | /// Deletable Bloom Filters are useful for cases which require removing elements 18 | /// but cannot allow false negatives. This means they can be safely swapped in 19 | /// place of traditional Bloom filters. 20 | /// 21 | public class DeletableBloomFilter : IFilter 22 | { 23 | /// 24 | /// Filter data 25 | /// 26 | internal Buckets Buckets { get; set; } 27 | /// 28 | /// Filter collision data 29 | /// 30 | internal Buckets Collisions { get; set; } 31 | /// 32 | /// Hash algorithm 33 | /// 34 | private HashAlgorithm Hash { get; set; } 35 | /// 36 | /// Filter size 37 | /// 38 | private uint M { get; set; } 39 | /// 40 | /// Number of bits in a region 41 | /// 42 | private uint RegionSize { get; set; } 43 | /// 44 | /// Number of hash functions 45 | /// 46 | private uint k { get; set; } 47 | /// 48 | /// Number of items in the filter 49 | /// 50 | private uint count { get; set; } 51 | /// 52 | /// Buffer used to cache indices 53 | /// 54 | private uint[] IndexBuffer { get; set; } 55 | 56 | /// 57 | /// NewDeletableBloomFilter creates a new DeletableBloomFilter optimized to store 58 | /// n items with a specified target false-positive rate. The r value determines 59 | /// the number of bits to use to store collision information. This controls the 60 | /// deletability of an element. Refer to the paper for selecting an optimal value. 61 | /// 62 | /// Number of items 63 | /// Number of bits to use to store collision information 64 | /// Desired false positive rate 65 | public DeletableBloomFilter(uint n, uint r, double fpRate) 66 | { 67 | var m = Utils.OptimalM(n, fpRate); 68 | var k = Utils.OptimalK(fpRate); 69 | 70 | this.Buckets = new Buckets(m - r, 1); 71 | this.Collisions = new Buckets(r, 1); 72 | this.Hash = Defaults.GetDefaultHashAlgorithm(); 73 | this.M = m - r; 74 | this.RegionSize = (m - r) / r; 75 | this.k = k; 76 | this.IndexBuffer = new uint[k]; 77 | } 78 | 79 | /// 80 | /// Returns the Bloom filter capacity, m. 81 | /// 82 | /// The Bloom filter capacity, m 83 | public uint Capacity() 84 | { 85 | return this.M; 86 | } 87 | 88 | /// 89 | /// Returns the number of hash functions. 90 | /// 91 | /// The number of hash functions 92 | public uint K() 93 | { 94 | return this.k; 95 | } 96 | 97 | /// 98 | /// Returns the number of items added to the filter. 99 | /// 100 | /// The number of items added to the filter 101 | public uint Count() 102 | { 103 | return this.count; 104 | } 105 | 106 | /// 107 | /// Will test for membership of the data and returns true if it is a member, 108 | /// false if not. This is a probabilistic test, meaning there is a non-zero 109 | /// probability of false positives but a zero probability of false negatives. 110 | /// 111 | /// The data to search for. 112 | /// Whether or not the data is maybe contained in the filter. 113 | public bool Test(byte[] data) 114 | { 115 | var hashKernel = Utils.HashKernel(data, this.Hash); 116 | var lower = hashKernel.LowerBaseHash; 117 | var upper = hashKernel.UpperBaseHash; 118 | 119 | // If any of the K bits are not set, then it's not a member. 120 | for (uint i = 0; i < this.k; i++) 121 | { 122 | if (this.Buckets.Get((lower + upper * i) % this.M) == 0) 123 | { 124 | return false; 125 | } 126 | } 127 | return true; 128 | } 129 | 130 | /// 131 | /// Will add the data to the Bloom filter. It returns the filter to allow 132 | /// for chaining. 133 | /// 134 | /// The data to add. 135 | /// The filter. 
136 | public IFilter Add(byte[] data) 137 | { 138 | var hashKernel = Utils.HashKernel(data, this.Hash); 139 | var lower = hashKernel.LowerBaseHash; 140 | var upper = hashKernel.UpperBaseHash; 141 | 142 | // Set the K bits. 143 | for (uint i = 0; i < this.k; i++) 144 | { 145 | var idx = (lower + upper * i) % this.M; 146 | if (this.Buckets.Get(idx) != 0) 147 | { 148 | // Collision, set corresponding region bit. 149 | this.Collisions.Set(idx / this.RegionSize, 1); 150 | } 151 | else 152 | { 153 | this.Buckets.Set(idx, 1); 154 | } 155 | } 156 | 157 | this.count++; 158 | return this; 159 | } 160 | 161 | /// 162 | /// Is equivalent to calling Test followed by Add. It returns true if the data is 163 | /// a member, false if not. 164 | /// 165 | /// The data to test for and add if it doesn't exist. 166 | /// Whether or not the data was probably contained in the filter. 167 | public bool TestAndAdd(byte[] data) 168 | { 169 | var hashKernel = Utils.HashKernel(data, this.Hash); 170 | var lower = hashKernel.LowerBaseHash; 171 | var upper = hashKernel.UpperBaseHash; 172 | var member = true; 173 | 174 | // If any of the K bits are not set, then it's not a member. 175 | for (uint i = 0; i < this.k; i++) 176 | { 177 | var idx = (lower + upper * i) % this.M; 178 | if (this.Buckets.Get(idx) == 0) 179 | { 180 | member = false; 181 | } 182 | else 183 | { 184 | // Collision, set corresponding region bit. 185 | this.Collisions.Set(idx / this.RegionSize, 1); 186 | } 187 | this.Buckets.Set(idx, 1); 188 | } 189 | 190 | this.count++; 191 | return member; 192 | } 193 | 194 | /// 195 | /// Will test for membership of the data and remove it from the filter if it 196 | /// exists. Returns true if the data was a member, false if not. 197 | /// 198 | /// The data to test for and remove 199 | /// Whether or not the data was a member before this call 200 | public bool TestAndRemove(byte[] data) 201 | { 202 | var hashKernel = Utils.HashKernel(data, this.Hash); 203 | var lower = hashKernel.LowerBaseHash; 204 | var upper = hashKernel.UpperBaseHash; 205 | var member = true; 206 | 207 | // Set the K bits. 208 | for (uint i = 0; i < this.k; i++) 209 | { 210 | var idx = (lower + upper * i) % this.M; 211 | this.IndexBuffer[i] = idx; 212 | if (this.Buckets.Get(idx) == 0) 213 | { 214 | member = false; 215 | } 216 | } 217 | 218 | if (member) 219 | { 220 | foreach (var idx in this.IndexBuffer) 221 | { 222 | if (this.Collisions.Get(idx / this.RegionSize) == 0) 223 | { 224 | // Clear only bits located in collision-free zones. 225 | this.Buckets.Set(idx, 0); 226 | } 227 | } 228 | this.count--; 229 | } 230 | 231 | return member; 232 | } 233 | 234 | /// 235 | /// Restores the Bloom filter to its original state. It returns the filter to 236 | /// allow for chaining. 237 | /// 238 | /// The reset bloom filter. 239 | public DeletableBloomFilter Reset() 240 | { 241 | this.Buckets.Reset(); 242 | this.Collisions.Reset(); 243 | this.count = 0; 244 | return this; 245 | } 246 | 247 | /// 248 | /// Sets the hashing function used in the filter. 249 | /// 250 | /// The HashAlgorithm to use. 251 | // TODO: Add SetHash to the IFilter interface? 
252 | public void SetHash(HashAlgorithm h) 253 | { 254 | this.Hash = h; 255 | } 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/Element.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace ProbabilisticDataStructures 4 | { 5 | public class Element 6 | { 7 | public byte[] Data { get; set; } 8 | public UInt64 Freq { get; set; } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/ElementHeap.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | 5 | namespace ProbabilisticDataStructures 6 | { 7 | internal class ElementHeap 8 | { 9 | internal List Heap { get; set; } 10 | 11 | /// 12 | /// Create a new ElementHeap that can store the top-k elements. 13 | /// 14 | /// The number of top elements to track 15 | internal ElementHeap(int k) 16 | { 17 | this.Heap = new List(k); 18 | } 19 | 20 | /// 21 | /// Get the count of the number of items on the heap. 22 | /// 23 | /// The number of items on the heap 24 | internal int Len() 25 | { 26 | return this.Heap.Count; 27 | } 28 | 29 | /// 30 | /// Return whether or not the item at i-position on the heap is less than the 31 | /// item at j-position. 32 | /// 33 | /// Item 1 34 | /// Item 2 35 | /// 36 | /// Whether or not the item at i-position on the heap is less than the item at 37 | /// j-position. 38 | /// 39 | internal bool Less(int i, int j) 40 | { 41 | return this.Heap[i].Freq < this.Heap[j].Freq; 42 | } 43 | 44 | /// 45 | /// Swap the items at i-position and j-position on the heap. 46 | /// 47 | /// Item 1 48 | /// Item 2 49 | internal void Swap(int i, int j) 50 | { 51 | var temp = this.Heap[i]; 52 | Heap[i] = Heap[j]; 53 | Heap[j] = temp; 54 | } 55 | 56 | /// 57 | /// Push an Element onto the heap. 58 | /// 59 | /// The Element to push onto the heap 60 | internal void Push(Element e) 61 | { 62 | this.Heap.Add(e); 63 | this.Up(this.Len() - 1); 64 | } 65 | 66 | /// 67 | /// Remove the Element at the top of the heap. 68 | /// 69 | /// The Element that was removed 70 | internal Element Pop() 71 | { 72 | var elementToRemove = this.Heap[0]; 73 | this.Heap.Remove(elementToRemove); 74 | return elementToRemove; 75 | } 76 | 77 | internal void Up(int j) 78 | { 79 | while (true) 80 | { 81 | var i = (j - 1) / 2; // parent 82 | if (i == j || !this.Less(j, i)) 83 | { 84 | break; 85 | } 86 | this.Swap(i, j); 87 | j = i; 88 | } 89 | } 90 | 91 | internal void Down(int i, int n) 92 | { 93 | while (true) 94 | { 95 | var j1 = 2 * i + 1; 96 | if (j1 >= n || j1 < 0) 97 | { 98 | // j1 < - after int overflow 99 | break; 100 | } 101 | var j = j1; // left child 102 | var j2 = j1 + 1; 103 | if (j2 < n && !this.Less(j1, j2)) 104 | { 105 | j = j2; // 2*i + 2 // right child 106 | } 107 | if (!this.Less(j, i)) 108 | { 109 | break; 110 | } 111 | this.Swap(i, j); 112 | i = j; 113 | } 114 | } 115 | 116 | /// 117 | /// Returns the top-k elements from lowest to highest frequency. 118 | /// 119 | /// The top-k elements from lowest to highest frequency 120 | internal Element[] Elements() 121 | { 122 | if (this.Len() == 0) 123 | { 124 | return new Element[0]; 125 | } 126 | 127 | return this.Heap 128 | .OrderBy(x => x.Freq) 129 | .ToArray(); 130 | } 131 | 132 | /// 133 | /// Adds the data to the top-k heap. 
If the data is already an element, the 134 | /// frequency is updated. If the heap already has k elements, the element with 135 | /// the minimum frequency is removed. 136 | /// 137 | /// The data to insert 138 | /// The frequency to associate with the data 139 | internal void insert(byte[] data, UInt64 freq, uint k) 140 | { 141 | for (int i = 0; i < this.Len(); i++) 142 | { 143 | var element = this.Heap[i]; 144 | if (Enumerable.SequenceEqual(data, element.Data)) 145 | { 146 | // Element already in top-k. 147 | element.Freq = freq; 148 | return; 149 | } 150 | } 151 | 152 | if (this.Len() == k) 153 | { 154 | // Remove minimum-frequency element. 155 | this.Pop(); 156 | } 157 | 158 | // Add element to top-k. 159 | this.Push(new Element 160 | { 161 | Data = data, 162 | Freq = freq, 163 | }); 164 | } 165 | 166 | /// 167 | /// Indicates if the given frequency falls within the top-k heap. 168 | /// 169 | /// The frequency to check 170 | /// Whether or not the frequency falls within the top-k heap 171 | internal bool isTop(UInt64 freq, uint k) 172 | { 173 | if (this.Len() < k) 174 | { 175 | return true; 176 | } 177 | 178 | return freq >= this.Heap[0].Freq; 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/HyperLogLog.cs: -------------------------------------------------------------------------------- 1 | /* 2 | Original work Copyright 2013 Eric Lesh 3 | Modified work Copyright 2015 Tyler Treat 4 | Modified work Copyright 2015 Matthew Lorimor 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | */ 17 | 18 | using System; 19 | using System.Linq; 20 | using System.Security.Cryptography; 21 | 22 | namespace ProbabilisticDataStructures 23 | { 24 | /// 25 | /// implements the HyperLogLog cardinality estimation algorithm as 26 | /// described by Flajolet, Fusy, Gandouet, and Meunier in HyperLogLog: the 27 | /// analysis of a near-optimal cardinality estimation algorithm: 28 | /// 29 | /// http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf 30 | /// 31 | /// HyperLogLog is a probabilistic algorithm which approximates the number of 32 | /// distinct elements in a multiset. It works by hashing values and calculating 33 | /// the maximum number of leading zeros in the binary representation of each 34 | /// hash. If the maximum number of leading zeros is n, the estimated number of 35 | /// distinct elements in the set is 2^n. To minimize variance, the multiset is 36 | /// split into a configurable number of registers, the maximum number of leading 37 | /// zeros is calculated in the numbers in each register, and a harmonic mean is 38 | /// used to combine the estimates. 39 | /// 40 | /// For large or unbounded data sets, calculating the exact cardinality is 41 | /// impractical. HyperLogLog uses a fraction of the memory while providing an 42 | /// accurate approximation. For counting element frequency, refer to the 43 | /// Count-Min Sketch. 
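/// The standard error is roughly 1.04 / sqrt(m); for example, m = 1024
/// registers give about 3.25% error while storing only one byte per register.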
44 | /// 45 | public class HyperLogLog 46 | { 47 | private static double Exp32 = Math.Pow(2, 32); 48 | 49 | /// 50 | /// Counter registers 51 | /// 52 | private byte[] Registers { get; set; } 53 | /// 54 | /// Number of registers 55 | /// 56 | internal uint M { get; set; } 57 | /// 58 | /// Number of bits to calculate register 59 | /// 60 | private uint B { get; set; } 61 | /// 62 | /// Bias-correction constant 63 | /// 64 | private double Alpha { get; set; } 65 | /// 66 | /// Hash algorithm 67 | /// 68 | private HashAlgorithm Hash { get; set; } 69 | 70 | /// 71 | /// Creates a new HyperLogLog with m registers. Returns an error if m isn't a 72 | /// power of two. 73 | /// 74 | /// Number of registers (must be a power of two) 75 | public HyperLogLog(uint m) 76 | { 77 | if ((m & (m - 1)) != 0) 78 | { 79 | throw new ArgumentException(String.Format("{0} is not a power of two", m)); 80 | } 81 | 82 | this.Registers = new byte[m]; 83 | this.M = m; 84 | this.B = (uint)Math.Ceiling(Math.Log(m, 2)); 85 | this.Alpha = CalculateAlpha(m); 86 | this.Hash = Defaults.GetDefaultHashAlgorithm(); 87 | } 88 | 89 | /// 90 | /// Creates a new HyperLogLog optimized for the specified standard error. 91 | /// Throws an ArgumentException if the number of registers can't be calculated 92 | /// for the provided accuracy. 93 | /// 94 | /// Desired standard error 95 | /// The HyperLogLog optimized for the standard error 96 | public static HyperLogLog NewDefaultHyperLogLog(double e) 97 | { 98 | var m = Math.Pow(1.04 / e, 2); 99 | return new HyperLogLog((uint)Math.Pow(2, Math.Ceiling(Math.Log(m, 2)))); 100 | } 101 | 102 | /// 103 | /// Will add the data to the set. Returns the HyperLogLog to allow for chaining. 104 | /// 105 | /// The data to add 106 | /// The HyperLogLog 107 | public HyperLogLog Add(byte[] data) 108 | { 109 | var hash = CalculateHash(data); 110 | var k = 32 - this.B; 111 | var r = CalculateRho(hash << (int)this.B, k); 112 | var j = hash >> (int)k; 113 | 114 | if (r > this.Registers[j]) 115 | { 116 | this.Registers[j] = r; 117 | } 118 | 119 | return this; 120 | } 121 | 122 | /// 123 | /// Returns the approximated cardinality of the set. 124 | /// 125 | /// The approximated cardinality of the set 126 | public UInt64 Count() 127 | { 128 | var sum = 0.0; 129 | var m = (double)this.M; 130 | foreach (var val in this.Registers) 131 | { 132 | sum += 1.0 / Math.Pow(2.0, val); 133 | } 134 | var estimate = this.Alpha * m * m / sum; 135 | if (estimate <= 5.0 / 2.0 * m) 136 | { 137 | // Small range correction 138 | var v = 0; 139 | foreach (var r in this.Registers) 140 | { 141 | if (r == 0) 142 | { 143 | v++; 144 | } 145 | } 146 | if (v > 0) 147 | { 148 | estimate = m * Math.Log(m / v); 149 | } 150 | } 151 | else if (estimate > 1.0 / 30.0 * Exp32) 152 | { 153 | // Large range correction 154 | estimate = -Exp32 * Math.Log(1 - estimate / Exp32); 155 | } 156 | return (UInt64)estimate; 157 | } 158 | 159 | /// 160 | /// Combines this HyperLogLog with another. Returns an error if the number of 161 | /// registers in the two HyperLogLogs are not equal. 
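A minimal usage sketch of the class above (the error target and key format are arbitrary choices for illustration):

```csharp
using System;
using System.Text;
using ProbabilisticDataStructures;

class HllUsage
{
    static void Main()
    {
        // Ask for roughly 2% standard error; the factory picks m.
        var hll = HyperLogLog.NewDefaultHyperLogLog(0.02);

        for (int i = 0; i < 100000; i++)
        {
            // Only 25,000 distinct values are ever added.
            hll.Add(Encoding.ASCII.GetBytes("user-" + (i % 25000)));
        }

        // Expect a value near 25,000, within a few percent.
        Console.WriteLine(hll.Count());
    }
}
```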
162 | /// 163 | /// The HyperLogLog to merge 164 | /// Whether or not the merge was successful 165 | public bool Merge(HyperLogLog other) 166 | { 167 | if (this.M != other.M) 168 | { 169 | throw new ArgumentException("Number of registers must match"); 170 | } 171 | 172 | for (int i = 0; i < other.Registers.Count(); i++) 173 | { 174 | var r = other.Registers[i]; 175 | if (r > this.Registers[i]) 176 | { 177 | this.Registers[i] = r; 178 | } 179 | } 180 | 181 | return true; 182 | } 183 | 184 | /// 185 | /// Restores the HyperLogLog to its original state. It returns itself to allow 186 | /// for chaining. 187 | /// 188 | /// The HyperLogLog 189 | public HyperLogLog Reset() 190 | { 191 | this.Registers = new byte[this.M]; 192 | return this; 193 | } 194 | 195 | /// 196 | /// Sets the hashing function used in the filter. 197 | /// 198 | /// The HashAlgorithm to use. 199 | public void SetHash(HashAlgorithm h) 200 | { 201 | this.Hash = h; 202 | } 203 | 204 | /// 205 | /// Returns a 32-bit hash value for the given data. 206 | /// 207 | /// Data 208 | /// 32-bit hash value 209 | private uint CalculateHash(byte[] data) 210 | { 211 | var sum = Hash.ComputeHash(data); 212 | return Utils.HashBytesToUInt32(sum); 213 | } 214 | 215 | /// 216 | /// Calculates the bias-correction constant alpha based on the number of 217 | /// registers, m. 218 | /// 219 | /// Number of registers 220 | /// Calculated bias-correction constant, alpha 221 | private static double CalculateAlpha(uint m) 222 | { 223 | switch (m) 224 | { 225 | case 16: 226 | return 0.673; 227 | case 32: 228 | return 0.697; 229 | case 64: 230 | return 0.709; 231 | default: 232 | return 0.7213 / (1.0 + 1.079 / m); 233 | } 234 | } 235 | 236 | /// 237 | /// Calculates the position of the leftmost 1-bit. 238 | /// 239 | /// The value to check 240 | /// 241 | /// The position of the leftmost 1-bit 242 | private static byte CalculateRho(uint val, uint max) 243 | { 244 | var r = 1; 245 | while ((val & 0x80000000) == 0 && r <= max) 246 | { 247 | r++; 248 | val <<= 1; 249 | } 250 | return (byte)r; 251 | } 252 | 253 | // TODO: Implement these later. 254 | // WriteDataTo 255 | // ReadDataFrom 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/IFilter.cs: -------------------------------------------------------------------------------- 1 | namespace ProbabilisticDataStructures 2 | { 3 | public interface IFilter 4 | { 5 | /// 6 | /// Will test for membership of the data and returns true if it is a member, 7 | /// false if not. 8 | /// 9 | /// The data to test for. 10 | /// Whether or not the data is probably contained in the filter. 11 | bool Test(byte[] data); 12 | /// 13 | /// Add will add the data to the Bloom filter. It returns the filter to allow 14 | /// for chaining. 15 | /// 16 | /// The data to add. 17 | /// The filter. 18 | IFilter Add(byte[] data); 19 | /// 20 | /// Is equivalent to calling Test followed by Add. It returns true if the data is 21 | /// a member, false if not. 22 | /// 23 | /// The data to test for and add if it doesn't exist. 24 | /// Whether or not the data was probably contained in the filter. 25 | bool TestAndAdd(byte[] data); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/InverseBloomFilter.cs: -------------------------------------------------------------------------------- 1 | /* 2 | Original work Copyright (c) 2012 Jeff Hodges. All rights reserved. 
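Since every filter in this library implements IFilter, calling code can stay agnostic about which space/accuracy trade-off it is using. A small sketch (the helper name is illustrative):

```csharp
using System.Text;
using ProbabilisticDataStructures;

static class Dedup
{
    // Works against any IFilter: BloomFilter, PartitionedBloomFilter,
    // ScalableBloomFilter, InverseBloomFilter, and so on.
    public static bool SeenBefore(IFilter filter, string key)
    {
        return filter.TestAndAdd(Encoding.ASCII.GetBytes(key));
    }
}
```

Note that the interpretation of the result depends on the filter: the classic Bloom variants may return false positives, while the InverseBloomFilter below may return false negatives.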
3 | Modified work Copyright (c) 2015 Tyler Treat. All rights reserved. 4 | Modified work Copyright (c) 2015 Matthew Lorimor. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following disclaimer 14 | in the documentation and/or other materials provided with the 15 | distribution. 16 | * Neither the name of Jeff Hodges nor the names of this project's 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | using System.Linq; 34 | using System.Security.Cryptography; 35 | 36 | namespace ProbabilisticDataStructures 37 | { 38 | /// 39 | /// InverseBloomFilter is a concurrent "inverse" Bloom filter, which is 40 | /// effectively the opposite of a classic Bloom filter. This was originally 41 | /// described and written by Jeff Hodges: 42 | /// 43 | /// http://www.somethingsimilar.com/2012/05/21/the-opposite-of-a-bloom-filter/ 44 | /// 45 | /// The InverseBloomFilter may report a false negative but can never report a 46 | /// false positive. That is, it may report that an item has not been seen when 47 | /// it actually has, but it will never report an item as seen which it hasn't 48 | /// come across. This behaves in a similar manner to a fixed-size hashmap which 49 | /// does not handle conflicts. 50 | /// 51 | /// An example use case is deduplicating events while processing a stream of 52 | /// data. Ideally, duplicate events are relatively close together. 53 | /// 54 | public class InverseBloomFilter : IFilter 55 | { 56 | private byte[][] Array { get; set; } 57 | internal HashAlgorithm Hash { get; set; } 58 | private uint capacity { get; set; } 59 | 60 | /// 61 | /// Instantiates an InverseBloomFilter with the specified capacity. 62 | /// 63 | /// The capacity of the filter 64 | public InverseBloomFilter(uint capacity) 65 | { 66 | this.Array = new byte[capacity][]; 67 | this.Hash = Defaults.GetDefaultHashAlgorithm(); 68 | this.capacity = capacity; 69 | } 70 | 71 | 72 | /// 73 | /// Will test for membership of the data and returns true if it is a 74 | /// member, false if not. This is a probabilistic test, meaning there is a 75 | /// non-zero probability of false negatives but a zero probability of false 76 | /// positives. 
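A usage sketch for the stream-deduplication case described above (the capacity and event names are arbitrary):

```csharp
using System;
using System.Text;
using ProbabilisticDataStructures;

class StreamDedup
{
    static void Main()
    {
        // Capacity trades memory against how far apart duplicates can
        // be and still be caught; 10,000 slots is an arbitrary choice.
        var filter = new InverseBloomFilter(10000);

        string[] events = { "e1", "e2", "e1", "e3", "e2" };
        foreach (var e in events)
        {
            if (filter.TestAndAdd(Encoding.ASCII.GetBytes(e)))
            {
                Console.WriteLine($"duplicate (definitely seen): {e}");
            }
            // A false result only means "probably not seen": a colliding
            // key may have overwritten the slot in the meantime.
        }
    }
}
```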
That is, it may return false even though the data was added, but 77 | /// it will never return true for data that hasn't been added. 78 | /// 79 | /// The data to test for 80 | /// Whether or not the data is present 81 | public bool Test(byte[] data) 82 | { 83 | var index = this.Index(data); 84 | var val = this.Array[index]; 85 | if (val == null) 86 | { 87 | return false; 88 | } 89 | return Enumerable.SequenceEqual(val, data); 90 | } 91 | 92 | /// 93 | /// Will add the data to the filter. It returns the filter to allow for chaining. 94 | /// 95 | /// 96 | /// 97 | public IFilter Add(byte[] data) 98 | { 99 | var index = this.Index(data); 100 | this.GetAndSet(index, data); 101 | return this; 102 | } 103 | 104 | /// 105 | /// Equivalent to calling Test followed by Add atomically. It returns true if 106 | /// the data is a member, false if not. 107 | /// 108 | /// The data to test and add 109 | /// Whether the data was already a member 110 | public bool TestAndAdd(byte[] data) 111 | { 112 | var index = this.Index(data); 113 | var oldId = this.GetAndSet(index, data); 114 | if (oldId == null) 115 | { 116 | return false; 117 | } 118 | return Enumerable.SequenceEqual(oldId, data); 119 | } 120 | 121 | /// 122 | /// Returns the filter capactiy. 123 | /// 124 | /// The filter capactiy 125 | public uint Capacity() 126 | { 127 | return this.capacity; 128 | } 129 | 130 | /// 131 | /// Returns the data that was in the array at the given index after putting the 132 | /// new data in the array at that index, atomically. 133 | /// 134 | /// The index to get and set 135 | /// The data to set 136 | /// 137 | /// The data that was in the array at the index before setting it 138 | /// 139 | private byte[] GetAndSet(uint index, byte[] data) 140 | { 141 | var oldData = this.Array[index]; 142 | this.Array[index] = data; 143 | return oldData; 144 | } 145 | 146 | /// 147 | /// Returns the array index for the given data. 148 | /// 149 | /// The data to find the index for 150 | /// The array index for the given data 151 | private uint Index(byte[] data) 152 | { 153 | var index = this.ComputeHashSum32(data) % this.capacity; 154 | return index; 155 | } 156 | 157 | /// 158 | /// Returns a 32-bit hash value for the given data. 159 | /// 160 | /// Data 161 | /// 32-bit hash value 162 | private uint ComputeHashSum32(byte[] data) 163 | { 164 | var sum = Hash.ComputeHash(data); 165 | return Utils.HashBytesToUInt32(sum); 166 | } 167 | 168 | /// 169 | /// Sets the hashing function used in the filter. 170 | /// 171 | /// The HashAlgorithm to use. 172 | // TODO: Add SetHash to the IFilter interface? 173 | public void SetHash(HashAlgorithm h) 174 | { 175 | this.Hash = h; 176 | } 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/MinHash.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Threading.Tasks; 5 | 6 | namespace ProbabilisticDataStructures 7 | { 8 | /// 9 | /// MinHash is a variation of the technique for estimating similarity between 10 | /// two sets as presented by Broder in On the resemblance and containment of 11 | /// documents: 12 | /// 13 | /// http://gatekeeper.dec.com/ftp/pub/dec/SRC/publications/broder/positano-final-wpnums.pdf 14 | /// 15 | /// This can be used to cluster or compare documents by splitting the corpus 16 | /// into a bag of words. 
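One caveat on the "atomically" wording in GetAndSet above: as written it is a plain read followed by a plain write, so two threads can interleave between the two steps. If true atomicity is needed, .NET's Interlocked.Exchange performs the swap in one step for reference types such as byte[]. A possible variant (a suggested change to the method body, not what the class currently does):

```csharp
using System.Threading;

// Hypothetical atomic replacement for InverseBloomFilter.GetAndSet:
// Interlocked.Exchange swaps the slot and returns the previous
// reference in a single atomic operation.
private byte[] GetAndSet(uint index, byte[] data)
{
    return Interlocked.Exchange(ref this.Array[index], data);
}
```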
MinHash returns the approximated similarity ratio of 17 | /// the two bags. The similarity is less accurate for very small bags of words. 18 | /// 19 | public static class MinHash 20 | { 21 | private static Random random = new Random(); 22 | 23 | /// 24 | /// Returns the similarity between two bags. 25 | /// 26 | /// The first bag 27 | /// The second bag 28 | /// The similarity between the bags 29 | public static float Similarity(string[] bag1, string[] bag2) 30 | { 31 | var k = bag1.Length + bag2.Length; 32 | var hashes = new int[k]; 33 | for (int i = 0; i < k; i++) 34 | { 35 | var a = random.Next(); 36 | var b = random.Next(); 37 | var c = random.Next(); 38 | var x = computeHash((uint)(a * b * c), (uint)a, (uint)b, c); 39 | hashes[i] = (int)x; 40 | } 41 | 42 | var bMap = bitMap(bag1, bag2); 43 | var minHashValues = hashBuckets(2, k); 44 | minHash(bag1, 0, minHashValues, bMap, k, hashes); 45 | minHash(bag2, 1, minHashValues, bMap, k, hashes); 46 | return similarity(minHashValues, k); 47 | } 48 | 49 | private static void minHash( 50 | string[] bag, 51 | int bagIndex, 52 | int[][] minHashValues, 53 | Dictionary bitArray, 54 | int k, 55 | int[] hashes) 56 | { 57 | var options = new ParallelOptions(); 58 | options.MaxDegreeOfParallelism = 4; 59 | var index = 0; 60 | 61 | foreach (var element in bitArray) 62 | { 63 | Parallel.For(0, k, options, (i, loopState) => 64 | { 65 | if (bag.Contains(element.Key)) 66 | { 67 | var hindex = hashes[index]; 68 | if (hindex < minHashValues[bagIndex][index]) 69 | { 70 | minHashValues[bagIndex][index] = hindex; 71 | } 72 | } 73 | }); 74 | index++; 75 | } 76 | } 77 | 78 | private static Dictionary bitMap(string[] bag1, string[] bag2) 79 | { 80 | var bitArray = new Dictionary(); 81 | foreach (var element in bag1) 82 | { 83 | bitArray[element] = new bool[] { true, false }; 84 | } 85 | 86 | foreach (var element in bag2) 87 | { 88 | if (bitArray.ContainsKey(element)) 89 | { 90 | bitArray[element] = new bool[] { true, true }; 91 | } 92 | else 93 | { 94 | bitArray[element] = new bool[] { false, true }; 95 | } 96 | } 97 | 98 | return bitArray; 99 | } 100 | 101 | private static int[][] hashBuckets(int numSets, int k) 102 | { 103 | var minHashValues = new int[numSets][]; 104 | for (int i = 0; i < numSets; i++) 105 | { 106 | minHashValues[i] = new int[k]; 107 | } 108 | 109 | for (int i = 0; i < numSets; i++) 110 | { 111 | for (int j = 0; j < k; j++) 112 | { 113 | minHashValues[i][j] = int.MaxValue; 114 | } 115 | } 116 | return minHashValues; 117 | } 118 | 119 | private static uint computeHash(uint x, uint a, uint b, int u) 120 | { 121 | return (a * x + b) >> (32 - u); 122 | } 123 | 124 | private static float similarity(int[][] minHashValues, int k) 125 | { 126 | var identicalMinHashes = 0; 127 | for (int i = 0; i < k; i++) 128 | { 129 | if (minHashValues[0][i] == minHashValues[1][i]) 130 | { 131 | identicalMinHashes++; 132 | } 133 | } 134 | 135 | return (float)(1.0 * (float)identicalMinHashes) / (float)k; 136 | } 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/PartitionedBloomFilter.cs: -------------------------------------------------------------------------------- 1 | /* 2 | Original work Copyright (c) 2013 zhenjl 3 | Modified work Copyright (c) 2015 Tyler Treat 4 | Modified work Copyright (c) 2015 Matthew Lorimor 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of 7 | this software and associated documentation files (the "Software"), to deal in 8 | 
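A usage sketch for the MinHash class above (the documents are arbitrary; the hash coefficients are drawn randomly, so treat the result as an approximation rather than an exact Jaccard similarity):

```csharp
using System;
using ProbabilisticDataStructures;

class MinHashDemo
{
    static void Main()
    {
        var doc1 = "the quick brown fox jumps over the lazy dog".Split(' ');
        var doc2 = "the quick brown cat sleeps near the lazy dog".Split(' ');

        // Approximate resemblance of the two bags of words; accuracy
        // degrades for very small bags like these.
        float similarity = MinHash.Similarity(doc1, doc2);
        Console.WriteLine(similarity);
    }
}
```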
the Software without restriction, including without limitation the rights to 9 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 | of the Software, and to permit persons to whom the Software is furnished to do 11 | so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | */ 16 | 17 | using System; 18 | using System.Security.Cryptography; 19 | 20 | namespace ProbabilisticDataStructures 21 | { 22 | /// 23 | /// PartitionedBloomFilter implements a variation of a classic Bloom filter as 24 | /// described by Almeida, Baquero, Preguica, and Hutchison in Scalable Bloom 25 | /// Filters: 26 | /// 27 | /// http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf 28 | /// 29 | /// This filter works by partitioning the M-sized bit array into k slices of 30 | /// size m = M/k bits. Each hash function produces an index over m for its 31 | /// respective slice. Thus, each element is described by exactly k bits, meaning 32 | /// the distribution of false positives is uniform across all elements. 33 | /// 34 | public class PartitionedBloomFilter : IFilter 35 | { 36 | /// 37 | /// Partitioned filter data 38 | /// 39 | internal Buckets[] Partitions { get; set; } 40 | /// 41 | /// Hash algorithm 42 | /// 43 | internal HashAlgorithm Hash { get; set; } 44 | /// 45 | /// Filter size (divided into k partitions) 46 | /// 47 | private uint M { get; set; } 48 | /// 49 | /// Number of hash functions (and partitions) 50 | /// 51 | private uint k { get; set; } 52 | /// 53 | /// Partition size (m / k) 54 | /// 55 | private uint S { get; set; } 56 | /// 57 | /// Number of items added 58 | /// 59 | private uint count { get; set; } 60 | 61 | /// 62 | /// Creates a new partitioned Bloom filter optimized to store n items with a 63 | /// specified target false-positive rate. 64 | /// 65 | /// Number of items 66 | /// Desired false-positive rate 67 | public PartitionedBloomFilter(uint n, double fpRate) 68 | { 69 | var m = Utils.OptimalM(n, fpRate); 70 | var k = Utils.OptimalK(fpRate); 71 | var partitions = new Buckets[k]; 72 | var s = (uint)Math.Ceiling((double)m / (double)k); 73 | 74 | for (uint i = 0; i < k; i++) 75 | { 76 | partitions[i] = new Buckets(s, 1); 77 | } 78 | 79 | this.Partitions = partitions; 80 | this.Hash = Defaults.GetDefaultHashAlgorithm(); 81 | this.M = m; 82 | this.k = k; 83 | this.S = s; 84 | } 85 | 86 | /// 87 | /// Returns the Bloom filter capacity, m. 88 | /// 89 | /// The Bloom filter capacity, m 90 | public uint Capacity() 91 | { 92 | return this.M; 93 | } 94 | 95 | /// 96 | /// Returns the number of hash functions. 97 | /// 98 | /// The number of hash functions 99 | public uint K() 100 | { 101 | return this.k; 102 | } 103 | 104 | /// 105 | /// Returns the number of items in the filter. 106 | /// 107 | /// The number of items in the filter 108 | public uint Count() 109 | { 110 | return this.count; 111 | } 112 | 113 | /// 114 | /// Returns the current estimated ratio of set bits. 115 | /// 116 | /// The current estimated ratio of set bits 117 | public double EstimatedFillRatio() 118 | { 119 | return 1 - Math.Exp(-(double)this.count / (double)this.S); 120 | } 121 | 122 | /// 123 | /// Returns the average ratio of set bits across all partitions. 
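A note on the EstimatedFillRatio formula above: every Add sets exactly one bit in each partition of s bits, so after n insertions the probability that any particular bit is still clear is (1 - 1/s)^n, which is approximately e^(-n/s). The expected fill is therefore 1 - e^(-n/s), which is exactly what the method computes from the insertion count.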
124 | /// 125 | /// The average ratio of set bits across all partitions 126 | public double FillRatio() 127 | { 128 | var t = (double)0; 129 | for (uint i = 0; i < this.k; i++) 130 | { 131 | uint sum = 0; 132 | for (uint j = 0; j < this.Partitions[i].count; j++) 133 | { 134 | sum += this.Partitions[i].Get(j); 135 | } 136 | t += ((double)sum / (double)this.S); 137 | } 138 | return (double)t / (double)this.k; 139 | } 140 | 141 | /// 142 | /// Will test for membership of the data and returns true if it is a 143 | /// member, false if not. This is a probabilistic test, meaning there is a 144 | /// non-zero probability of false positives but a zero probability of false 145 | /// negatives. Due to the way the filter is partitioned, the probability of 146 | /// false positives is uniformly distributed across all elements. 147 | /// 148 | /// The data to test for 149 | /// Whether or not the data was found 150 | public bool Test(byte[] data) 151 | { 152 | var hashKernel = Utils.HashKernel(data, this.Hash); 153 | var lower = hashKernel.LowerBaseHash; 154 | var upper = hashKernel.UpperBaseHash; 155 | 156 | // If any of the K partition bits are not set, then it's not a member. 157 | for (uint i = 0; i < this.k; i++) 158 | { 159 | if (this.Partitions[i].Get((lower + upper * i) % this.S) == 0) 160 | { 161 | return false; 162 | } 163 | } 164 | 165 | return true; 166 | } 167 | 168 | /// 169 | /// Will add the data to the Bloom filter. It returns the filter to allow for 170 | /// chaining. 171 | /// 172 | /// The data to add 173 | /// The PartitionedBloomFilter 174 | public IFilter Add(byte[] data) 175 | { 176 | var hashKernel = Utils.HashKernel(data, this.Hash); 177 | var lower = hashKernel.LowerBaseHash; 178 | var upper = hashKernel.UpperBaseHash; 179 | 180 | // Set the K partition bits. 181 | for (uint i = 0; i < this.k; i++) 182 | { 183 | this.Partitions[i].Set((lower + upper * i) % this.S, 1); 184 | } 185 | 186 | this.count++; 187 | return this; 188 | } 189 | 190 | /// 191 | /// Equivalent to calling Test followed by Add. It returns true if the data is a 192 | /// member, false if not. 193 | /// 194 | /// The data to test for and add 195 | /// 196 | /// Whether the data was present in the filter prior to adding it 197 | /// 198 | public bool TestAndAdd(byte[] data) 199 | { 200 | var hashKernel = Utils.HashKernel(data, this.Hash); 201 | var lower = hashKernel.LowerBaseHash; 202 | var upper = hashKernel.UpperBaseHash; 203 | var member = true; 204 | 205 | // If any K partition bits are not set, then it's not a member. 206 | for (uint i = 0; i < this.k; i++) 207 | { 208 | var idx = (lower + upper * i) % this.S; 209 | if (this.Partitions[i].Get(idx) == 0) 210 | { 211 | member = false; 212 | } 213 | this.Partitions[i].Set(idx, 1); 214 | } 215 | 216 | this.count++; 217 | return member; 218 | } 219 | 220 | /// 221 | /// Restores the Bloom filter to its original state. It returns the filter 222 | /// to allow for chaining. 223 | /// 224 | /// The PartitionedBloomFilter 225 | public PartitionedBloomFilter Reset() 226 | { 227 | foreach (var partition in this.Partitions) 228 | { 229 | partition.Reset(); 230 | } 231 | return this; 232 | } 233 | 234 | /// 235 | /// Sets the hashing function used in the filter. 236 | /// 237 | /// The HashAlgorithm to use. 238 | // TODO: Add SetHash to the IFilter interface?
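Worth noting in the methods above: each of the k probe positions is derived from just two 32-bit base hashes as (lower + upper * i) mod s, the standard two-hash (Kirsch-Mitzenmacher) construction, so only one digest is computed per operation. A usage sketch (the sizes are arbitrary):

```csharp
using System;
using System.Text;
using ProbabilisticDataStructures;

class PartitionedDemo
{
    static void Main()
    {
        // Sized for ~1,000 items at a 1% false-positive target.
        var f = new PartitionedBloomFilter(1000, 0.01);

        f.Add(Encoding.ASCII.GetBytes("alpha"));

        Console.WriteLine(f.Test(Encoding.ASCII.GetBytes("alpha"))); // True
        Console.WriteLine(f.Test(Encoding.ASCII.GetBytes("beta")));  // False (with probability ~0.99)
    }
}
```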
239 | public void SetHash(HashAlgorithm h) 240 | { 241 | this.Hash = h; 242 | } 243 | } 244 | } 245 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/ProbabilisticDataStructures.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | netstandard2.0;net45 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/ScalableBloomFilter.cs: -------------------------------------------------------------------------------- 1 | /* 2 | Original work Copyright (c) 2013 zhenjl 3 | Modified work Copyright (c) 2015 Tyler Treat 4 | Modified work Copyright (c) 2015 Matthew Lorimor 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of 7 | this software and associated documentation files (the "Software"), to deal in 8 | the Software without restriction, including without limitation the rights to 9 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 | of the Software, and to permit persons to whom the Software is furnished to do 11 | so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | */ 16 | 17 | using System; 18 | using System.Collections.Generic; 19 | using System.Linq; 20 | using System.Security.Cryptography; 21 | 22 | namespace ProbabilisticDataStructures 23 | { 24 | /// 25 | /// ScalableBloomFilter implements a Scalable Bloom Filter as described by 26 | /// Almeida, Baquero, Preguica, and Hutchison in Scalable Bloom Filters: 27 | /// 28 | /// http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf 29 | /// 30 | /// A Scalable Bloom Filter dynamically adapts to the number of elements in the 31 | /// data set while enforcing a tight upper bound on the false-positive rate. 32 | /// This works by adding Bloom filters with geometrically decreasing 33 | /// false-positive rates as filters become full. The tightening ratio, r, 34 | /// controls the filter growth. The compounded probability over the whole series 35 | /// converges to a target value, even accounting for an infinite series. 36 | /// 37 | /// Scalable Bloom Filters are useful for cases where the size of the data set 38 | /// isn't known a priori and memory constraints aren't of particular concern. 39 | /// For situations where memory is bounded, consider using Inverse or Stable 40 | /// Bloom Filters. 41 | /// 42 | public class ScalableBloomFilter : IFilter 43 | { 44 | /// 45 | /// Filters with geometrically decreasing error rates 46 | /// 47 | internal List Filters { get; set; } 48 | /// 49 | /// Tightening ratio 50 | /// 51 | internal double R { get; set; } 52 | /// 53 | /// Target false-positive rate 54 | /// 55 | internal double FP { get; set; } 56 | /// 57 | /// Partition fill ratio 58 | /// 59 | private double P { get; set; } 60 | /// 61 | /// Filter size hint 62 | /// 63 | internal uint Hint { get; set; } 64 | 65 | /// 66 | /// Creates a new Scalable Bloom Filter with the specified target false-positive 67 | /// rate and tightening ratio. Use NewDefaultScalableBloomFilter if you don't 68 | /// want to calculate all these parameters. 
69 | /// 70 | /// 71 | /// 72 | /// 73 | public ScalableBloomFilter(uint hint, double fpRate, double r) 74 | { 75 | this.Filters = new List(); 76 | this.R = r; 77 | this.FP = fpRate; 78 | this.P = Defaults.FILL_RATIO; 79 | this.Hint = hint; 80 | 81 | this.AddFilter(); 82 | } 83 | 84 | /// 85 | /// Creates a new Scalable Bloom Filter with the specified target false-positive 86 | /// rate and an optimal tightening ratio. 87 | /// 88 | /// 89 | public static ScalableBloomFilter NewDefaultScalableBloomFilter(double fpRate) 90 | { 91 | return new ScalableBloomFilter(10000, fpRate, 0.8); 92 | } 93 | 94 | /// 95 | /// Returns the current Scalable Bloom Filter capacity, which is the sum of the 96 | /// capacities for the contained series of Bloom filters. 97 | /// 98 | /// The current Scalable Bloom Filter capacity 99 | public uint Capacity() 100 | { 101 | var capacity = 0u; 102 | foreach (var filter in this.Filters) 103 | { 104 | capacity += filter.Capacity(); 105 | } 106 | return capacity; 107 | } 108 | 109 | /// 110 | /// Returns the number of hash functions used in each Bloom filter. 111 | /// 112 | /// The number of hash functions used in each Bloom filter 113 | public uint K() 114 | { 115 | return this.Filters[0].K(); 116 | } 117 | 118 | /// 119 | /// Returns the average ratio of set bits across every filter. 120 | /// 121 | /// The average ratio of set bits across every filter 122 | public double FillRatio() 123 | { 124 | var sum = 0.0; 125 | foreach (var filter in this.Filters) 126 | { 127 | sum += filter.FillRatio(); 128 | } 129 | return (double)sum / this.Filters.Count(); 130 | } 131 | 132 | /// 133 | /// Will test for membership of the data and returns true if it is a member, 134 | /// false if not. This is a probabilistic test, meaning there is a non-zero 135 | /// probability of false positives but a zero probability of false negatives. 136 | /// 137 | /// The data to search for. 138 | /// Whether or not the data is maybe contained in the filter. 139 | public bool Test(byte[] data) 140 | { 141 | // Querying is made by testing for the presence in each filter. 142 | foreach (var filter in this.Filters) 143 | { 144 | if (filter.Test(data)) 145 | { 146 | return true; 147 | } 148 | } 149 | 150 | return false; 151 | } 152 | 153 | /// 154 | /// Add will add the data to the Bloom filter. It returns the filter to allow 155 | /// for chaining. 156 | /// 157 | /// The data to add 158 | /// The ScalableBloomFilter 159 | public IFilter Add(byte[] data) 160 | { 161 | var idx = this.Filters.Count() - 1; 162 | 163 | // If the last filter has reached its fill ratio, add a new one. 164 | if (this.Filters[idx].EstimatedFillRatio() >= this.P) 165 | { 166 | this.AddFilter(); 167 | idx++; 168 | } 169 | 170 | this.Filters[idx].Add(data); 171 | return this; 172 | } 173 | 174 | /// 175 | /// Is equivalent to calling Test followed by Add. It returns true if the data 176 | /// is a member, false if not. 177 | /// 178 | /// The data to test for and add 179 | /// Whether or not the data was present before adding it 180 | public bool TestAndAdd(byte[] data) 181 | { 182 | var member = this.Test(data); 183 | this.Add(data); 184 | return member; 185 | } 186 | 187 | /// 188 | /// Sets the hashing function used in the filter. 189 | /// 190 | /// The HashAlgorithm to use. 191 | // TODO: Add SetHash to the IFilter interface? 
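The growth behavior described in the class comment is easy to see numerically: each new filter receives an error budget of FP * R^i (as computed by AddFilter further below), and the compounded false-positive probability over the whole series is bounded by the geometric sum FP / (1 - R). A small sketch with illustrative numbers:

```csharp
using System;

class TighteningDemo
{
    static void Main()
    {
        double fp = 0.01, r = 0.8;

        // Error budget of each successive filter in the series.
        for (int i = 0; i < 5; i++)
        {
            Console.WriteLine($"filter {i}: fpRate = {fp * Math.Pow(r, i):F5}");
        }

        // The whole series is bounded by the geometric sum fp / (1 - r).
        Console.WriteLine($"compounded bound: {fp / (1 - r):F3}"); // 0.050
    }
}
```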
192 | public void SetHash(HashAlgorithm h) 193 | { 194 | foreach (var filter in this.Filters) 195 | { 196 | filter.SetHash(h); 197 | } 198 | } 199 | 200 | /// 201 | /// Restores the Bloom filter to its original state. It returns the filter to 202 | /// allow for chaining. 203 | /// 204 | /// The reset bloom filter. 205 | public ScalableBloomFilter Reset() 206 | { 207 | this.Filters = new List(); 208 | this.AddFilter(); 209 | return this; 210 | } 211 | 212 | /// 213 | /// Adds a new Bloom filter with a restricted false-positive rate to the 214 | /// Scalable Bloom Filter 215 | /// 216 | internal void AddFilter() 217 | { 218 | var fpRate = this.FP * Math.Pow(this.R, this.Filters.Count()); 219 | var p = new PartitionedBloomFilter(this.Hint, fpRate); 220 | if (this.Filters.Count() > 0) 221 | { 222 | p.SetHash(this.Filters[0].Hash); 223 | } 224 | this.Filters.Add(p); 225 | } 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/TopK.cs: -------------------------------------------------------------------------------- 1 | namespace ProbabilisticDataStructures 2 | { 3 | /// 4 | /// TopK uses a Count-Min Sketch to calculate the top-K frequent elements in a 5 | /// stream. 6 | /// 7 | public class TopK 8 | { 9 | private CountMinSketch Cms { get; set; } 10 | private uint K { get; set; } 11 | internal uint N { get; set; } 12 | private ElementHeap elements { get; set; } 13 | 14 | /// 15 | /// Creates a new TopK backed by a Count-Min sketch whose relative accuracy is 16 | /// within a factor of epsilon with probability delta. It tracks the k-most 17 | /// frequent elements. 18 | /// 19 | /// Relative-accuracy factor 20 | /// Relative-accuracy probability 21 | /// Number of top elements to track 22 | /// 23 | public TopK(double epsilon, double delta, uint k) 24 | { 25 | this.Cms = new CountMinSketch(epsilon, delta); 26 | this.K = k; 27 | this.elements = new ElementHeap((int)k); 28 | } 29 | 30 | /// 31 | /// Will add the data to the Count-Min Sketch and update the top-k heap if 32 | /// applicable. Returns the TopK to allow for chaining. 33 | /// 34 | /// The data to add 35 | /// The TopK 36 | public TopK Add(byte[] data) 37 | { 38 | this.Cms.Add(data); 39 | this.N++; 40 | 41 | var freq = this.Cms.Count(data); 42 | if (this.elements.isTop(freq, this.K)) 43 | { 44 | elements.insert(data, freq, this.K); 45 | } 46 | 47 | return this; 48 | } 49 | 50 | /// 51 | /// Returns the top-k elements from lowest to highest frequency. 52 | /// 53 | /// The top-k elements from lowest to highest frequency 54 | public Element[] Elements() 55 | { 56 | return elements.Elements(); 57 | } 58 | 59 | /// 60 | /// Restores the TopK to its original state. It returns itself to allow for 61 | /// chaining. 62 | /// 63 | /// The TopK 64 | public TopK Reset() 65 | { 66 | this.Cms.Reset(); 67 | this.elements = new ElementHeap((int)K); 68 | this.N = 0; 69 | return this; 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /ProbabilisticDataStructures/Utils.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Linq; 3 | using System.Security.Cryptography; 4 | using System.Text; 5 | 6 | namespace ProbabilisticDataStructures 7 | { 8 | public static class Utils 9 | { 10 | /// 11 | /// Calculates the optimal Bloom filter size, m, based on the number of items and 12 | /// the desired rate of false positives. 13 | /// 14 | /// Number of items. 
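A usage sketch for the TopK class above (the accuracy parameters are arbitrary; on input this small the sketch counts are exact with overwhelming probability):

```csharp
using System;
using System.Text;
using ProbabilisticDataStructures;

class TopKDemo
{
    static void Main()
    {
        // Track the 2 most frequent elements, backed by a Count-Min
        // Sketch with epsilon = 0.001 and delta = 0.99.
        var topK = new TopK(0.001, 0.99, 2);

        foreach (var w in new[] { "a", "b", "a", "c", "b", "a" })
        {
            topK.Add(Encoding.ASCII.GetBytes(w));
        }

        // Elements() returns the top-k from lowest to highest frequency:
        // here "b" (2) followed by "a" (3); "c" never makes the cut.
        foreach (var e in topK.Elements())
        {
            Console.WriteLine($"{Encoding.ASCII.GetString(e.Data)}: {e.Freq}");
        }
    }
}
```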
15 | /// Desired false positive rate. 16 | /// The optimal BloomFilter size, m. 17 | public static uint OptimalM(uint n, double fpRate) 18 | { 19 | var optimalM = Math.Ceiling((double)n / ((Math.Log(Defaults.FILL_RATIO) * 20 | Math.Log(1 - Defaults.FILL_RATIO)) / Math.Abs(Math.Log(fpRate)))); 21 | return Convert.ToUInt32(optimalM); 22 | } 23 | 24 | /// 25 | /// Calculates the optimal Bloom filter size, m, based on the number of items and 26 | /// the desired rate of false positives. 27 | /// 28 | /// Number of items. 29 | /// Desired false positive rate. 30 | /// The optimal BloomFilter size, m. 31 | public static ulong OptimalM64(ulong n, double fpRate) 32 | { 33 | var optimalM = Math.Ceiling((double)n / ((Math.Log(Defaults.FILL_RATIO) * 34 | Math.Log(1 - Defaults.FILL_RATIO)) / Math.Abs(Math.Log(fpRate)))); 35 | return Convert.ToUInt64(optimalM); 36 | } 37 | 38 | /// 39 | /// Calculates the optimal number of hash functions to use for a Bloom filter 40 | /// based on the desired rate of false positives. 41 | /// 42 | /// Desired false positive rate. 43 | /// The optimal number of hash functions, k. 44 | public static uint OptimalK(double fpRate) 45 | { 46 | var optimalK = Math.Ceiling(Math.Log(1 / fpRate, 2)); 47 | return Convert.ToUInt32(optimalK); 48 | } 49 | 50 | /// 51 | /// Returns the upper and lower base hash values from which the k hashes are 52 | /// derived. The result will be the same regardless of the endianness of the 53 | /// architecture. 54 | /// 55 | /// The data bytes to hash. 56 | /// The hashing algorithm to use. 57 | /// A HashKernel 58 | public static HashKernelReturnValue HashKernel(byte[] data, HashAlgorithm algorithm) 59 | { 60 | var sum = algorithm.ComputeHash(data); 61 | return HashKernelFromHashBytes(sum); 62 | } 63 | 64 | /// 65 | /// Returns the upper and lower base hash values from which the k hashes are 66 | /// derived using the given hash bytes directly. The result will be the 67 | /// same regardless of the endianness of the architecture. Used by a unit 68 | /// test to confirm the calculation is compatible with the HashKernel from 69 | /// https://github.com/tylertreat/BoomFilters running in Go. 70 | /// 71 | /// The hash bytes. 72 | /// A HashKernel 73 | public static HashKernelReturnValue HashKernelFromHashBytes(byte[] hashBytes) 74 | { 75 | return HashKernelReturnValue.Create( 76 | HashBytesToUInt32(hashBytes, 0), 77 | HashBytesToUInt32(hashBytes, 4) 78 | ); 79 | } 80 | 81 | /// 82 | /// Returns the upper and lower base hash values from which the k hashes are 83 | /// derived. 84 | /// 85 | /// The data bytes to hash. 86 | /// The hashing algorithm to use. 87 | /// A HashKernel 88 | public static HashKernel128ReturnValue HashKernel128(byte[] data, HashAlgorithm algorithm) 89 | { 90 | var sum = algorithm.ComputeHash(data); 91 | return HashKernel128ReturnValue.Create( 92 | HashBytesToUInt64(sum, 0), 93 | HashBytesToUInt64(sum, 8) 94 | ); 95 | } 96 | 97 | /// 98 | /// Returns the uint represented by the given hash bytes, starting at 99 | /// byte . The result will be the same 100 | /// regardless of the endianness of the architecture. 101 | /// 102 | /// 103 | /// 104 | /// 105 | public static uint HashBytesToUInt32(byte[] hashBytes, int offset = 0) 106 | { 107 | return 108 | ((uint)hashBytes[offset]) | 109 | ((uint)hashBytes[offset + 1]) << 8 | 110 | ((uint)hashBytes[offset + 2]) << 16 | 111 | ((uint)hashBytes[offset + 3]) << 24; 112 | } 113 | 114 | /// 115 | /// Returns the ulong represented by the given hash bytes, starting at 116 | /// byte . 
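Assuming Defaults.FILL_RATIO is 0.5 (the value the name suggests, and the one consistent with the unit tests later in this repository), ln(FILL_RATIO) * ln(1 - FILL_RATIO) equals (ln 2)^2, so OptimalM reduces to the textbook m = -n * ln(p) / (ln 2)^2 and OptimalK to ceil(log2(1/p)). A quick check with n = 100 and p = 0.1, which reproduces the 480 and 4 asserted by TestBloomCapacity and TestBloomK further down:

```csharp
using System;

class OptimalParamsDemo
{
    static void Main()
    {
        uint n = 100;
        double p = 0.1;
        const double fillRatio = 0.5; // assumed value of Defaults.FILL_RATIO

        // ln(0.5) * ln(1 - 0.5) = (ln 2)^2 ~= 0.4805, so this is the
        // textbook m = -n * ln(p) / (ln 2)^2.
        var m = Math.Ceiling(n / ((Math.Log(fillRatio) * Math.Log(1 - fillRatio))
                                  / Math.Abs(Math.Log(p))));
        var k = Math.Ceiling(Math.Log(1 / p, 2));

        Console.WriteLine($"m = {m}, k = {k}"); // m = 480, k = 4
    }
}
```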
The result will be the same 117 | /// regardless of the endianness of the architecture. 118 | /// 119 | /// 120 | /// 121 | /// 122 | public static ulong HashBytesToUInt64(byte[] hashBytes, int offset = 0) 123 | { 124 | return 125 | ((ulong)hashBytes[offset]) | 126 | ((ulong)hashBytes[offset + 1]) << 8 | 127 | ((ulong)hashBytes[offset + 2]) << 16 | 128 | ((ulong)hashBytes[offset + 3]) << 24 | 129 | ((ulong)hashBytes[offset + 4]) << 32 | 130 | ((ulong)hashBytes[offset + 5]) << 40 | 131 | ((ulong)hashBytes[offset + 6]) << 48 | 132 | ((ulong)hashBytes[offset + 7]) << 56; 133 | } 134 | 135 | /// 136 | /// Compute the hash for the provided bytes. 137 | /// 138 | /// The bytes to hash. 139 | /// The hash string of the bytes. 140 | public static string ComputeHashAsString(byte[] inputBytes, HashAlgorithm hashAlgorithm) 141 | { 142 | // Compute the hash of the input byte array. 143 | byte[] data = hashAlgorithm.ComputeHash(inputBytes); 144 | 145 | // Create a new StringBuilder to collect the bytes and create a string. 146 | StringBuilder sb = new StringBuilder(); 147 | 148 | // Loop through each byte of the hashed data and format each one as a 149 | // hexadecimal string. 150 | for (int i = 0; i < data.Length; i++) 151 | { 152 | sb.Append(data[i].ToString("X2")); 153 | } 154 | 155 | // Return the hexadecimal string. 156 | return sb.ToString(); 157 | } 158 | } 159 | 160 | public struct HashKernelReturnValue 161 | { 162 | public uint UpperBaseHash { get; private set; } 163 | public uint LowerBaseHash { get; private set; } 164 | 165 | public static HashKernelReturnValue Create(uint lowerBaseHash, uint upperBaseHash) 166 | { 167 | return new HashKernelReturnValue 168 | { 169 | UpperBaseHash = upperBaseHash, 170 | LowerBaseHash = lowerBaseHash 171 | }; 172 | } 173 | } 174 | 175 | public struct HashKernel128ReturnValue 176 | { 177 | public ulong UpperBaseHash { get; private set; } 178 | public ulong LowerBaseHash { get; private set; } 179 | public static HashKernel128ReturnValue Create(ulong lowerBaseHash, ulong upperBaseHash) 180 | { 181 | return new HashKernel128ReturnValue 182 | { 183 | UpperBaseHash = upperBaseHash, 184 | LowerBaseHash = lowerBaseHash, 185 | }; 186 | } 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 8 | [assembly: AssemblyTitle("TestProbabilisticDataStructures")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("")] 12 | [assembly: AssemblyProduct("TestProbabilisticDataStructures")] 13 | [assembly: AssemblyCopyright("Copyright © 2015")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 
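The lower/upper pair returned by HashKernel above is what the filters in this repository expand into k probe positions. A sketch of that derivation (MD5 is used here only for illustration; the library actually obtains its algorithm from Defaults.GetDefaultHashAlgorithm()):

```csharp
using System.Security.Cryptography;
using ProbabilisticDataStructures;

class KernelDemo
{
    // index_i = (lower + upper * i) mod m: the standard two-hash
    // (Kirsch-Mitzenmacher) construction used throughout this library,
    // so only one digest is computed per filter operation.
    static uint[] Indexes(byte[] data, uint k, uint m)
    {
        using (var md5 = MD5.Create())
        {
            var kernel = Utils.HashKernel(data, md5);
            var idx = new uint[k];
            for (uint i = 0; i < k; i++)
            {
                idx[i] = (kernel.LowerBaseHash + kernel.UpperBaseHash * i) % m;
            }
            return idx;
        }
    }
}
```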
20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("df071d43-8650-491c-a572-4329e4cf8e5f")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 14 | 15 | /// 16 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter. 17 | /// 18 | [TestMethod] 19 | public void TestBloomCapacity() 20 | { 21 | var f = new BloomFilter(100, 0.1); 22 | var capacity = f.Capacity(); 23 | 24 | Assert.AreEqual(480u, capacity); 25 | } 26 | 27 | /// 28 | /// Ensures that K() returns the number of hash functions in the Bloom Filter. 29 | /// 30 | [TestMethod] 31 | public void TestBloomK() 32 | { 33 | var f = new BloomFilter(100, 0.1); 34 | var k = f.K(); 35 | 36 | Assert.AreEqual(4u, k); 37 | } 38 | 39 | /// 40 | /// Ensures that Count returns the number of items added to the filter. 41 | /// 42 | [TestMethod] 43 | public void TestBloomCount() 44 | { 45 | var f = new BloomFilter(100, 0.1); 46 | for (uint i = 0; i < 10; i++) 47 | { 48 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 49 | } 50 | 51 | var count = f.Count(); 52 | Assert.AreEqual(10u, count); 53 | } 54 | 55 | /// 56 | /// Ensures that EstimatedFillRatio returns the correct approximation. 57 | /// 58 | [TestMethod] 59 | public void TestBloomEstimatedFillRatio() 60 | { 61 | var f = new BloomFilter(100, 0.5); 62 | for (uint i = 0; i < 100; i++) 63 | { 64 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 65 | } 66 | 67 | var ratio = f.EstimatedFillRatio(); 68 | if (ratio > 0.5) 69 | { 70 | Assert.Fail("Expected less than or equal to 0.5, got {0}", ratio); 71 | } 72 | } 73 | 74 | /// 75 | /// Ensures that FillRatio returns the ratio of set bits. 76 | /// 77 | [TestMethod] 78 | public void TestBloomFillRatio() 79 | { 80 | var f = new BloomFilter(100, 0.1); 81 | f.Add(A_BYTES); 82 | f.Add(B_BYTES); 83 | f.Add(C_BYTES); 84 | 85 | var ratio = f.FillRatio(); 86 | Assert.AreEqual(0.025, ratio); 87 | } 88 | 89 | /// 90 | /// Ensures that Test, Add, and TestAndAdd behave correctly. 91 | /// 92 | [TestMethod] 93 | public void TestBloomTestAndAdd() 94 | { 95 | var f = new BloomFilter(100, 0.01); 96 | 97 | // 'a' is not in the filter. 
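The 0.025 expected by TestBloomFillRatio above follows from the earlier sizing tests: with n = 100 and fpRate = 0.1 the filter has m = 480 bits and k = 4 hash functions, so adding three distinct items sets 3 * 4 = 12 bits (assuming, as the test does, that none of the twelve probe positions collide), and 12 / 480 = 0.025.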
98 | if (f.Test(A_BYTES)) 99 | { 100 | Assert.Fail("'a' should not be a member"); 101 | } 102 | 103 | var addedF = f.Add(A_BYTES); 104 | Assert.AreSame(f, addedF, "Returned BloomFilter should be the same instance"); 105 | 106 | // 'a' is now in the filter. 107 | if (!f.Test(A_BYTES)) 108 | { 109 | Assert.Fail("'a' should be a member"); 110 | } 111 | 112 | // 'a' is still in the filter. 113 | if (!f.TestAndAdd(A_BYTES)) 114 | { 115 | Assert.Fail("'a' should be a member"); 116 | } 117 | 118 | // 'b' is not in the filter. 119 | if (f.TestAndAdd(B_BYTES)) 120 | { 121 | Assert.Fail("'b' should not be a member"); 122 | } 123 | 124 | // 'a' is still in the filter. 125 | if (!f.Test(A_BYTES)) 126 | { 127 | Assert.Fail("'a' should be a member"); 128 | } 129 | 130 | // 'b' is now in the filter. 131 | if (!f.Test(B_BYTES)) 132 | { 133 | Assert.Fail("'b' should be a member"); 134 | } 135 | 136 | // 'c' is not in the filter. 137 | if (f.Test(C_BYTES)) 138 | { 139 | Assert.Fail("'c' should not be a member"); 140 | } 141 | 142 | for (int i = 0; i < 1000000; i++) 143 | { 144 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString())); 145 | } 146 | 147 | // 'x' should be a false positive. 148 | if (!f.Test(X_BYTES)) 149 | { 150 | Assert.Fail("'x' should be a member"); 151 | } 152 | } 153 | 154 | /// 155 | /// Ensures that Reset sets every bit to zero. 156 | /// 157 | [TestMethod] 158 | public void TestBloomReset() 159 | { 160 | var f = new BloomFilter(100, 0.1); 161 | for (int i = 0; i < 1000; i++) 162 | { 163 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 164 | } 165 | 166 | var resetF = f.Reset(); 167 | Assert.AreSame(f, resetF, "Returned BloomFilter should be the same instance"); 168 | 169 | for (uint i = 0; i < f.Buckets.count; i++) 170 | { 171 | if (f.Buckets.Get(i) != 0) 172 | { 173 | Assert.Fail("Expected all bits to be unset"); 174 | } 175 | } 176 | } 177 | } 178 | 179 | [TestClass] 180 | public class BenchmarkBloomFilter 181 | { 182 | private BloomFilter f; 183 | private int n; 184 | private byte[][] data; 185 | 186 | [TestInitialize()] 187 | public void Testinitialize() 188 | { 189 | n = 100000; 190 | f = new BloomFilter(100000, 0.1); 191 | data = new byte[n][]; 192 | for (int i = 0; i < n; i++) 193 | { 194 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 195 | } 196 | } 197 | 198 | [TestCleanup()] 199 | public void TestCleanup() 200 | { 201 | f = null; 202 | n = 0; 203 | data = null; 204 | } 205 | 206 | [TestMethod] 207 | public void BenchmarkBloomAdd() 208 | { 209 | for (int i = 0; i < n; i++) 210 | { 211 | f.Add(data[i]); 212 | } 213 | } 214 | 215 | [TestMethod] 216 | public void BenchmarkBloomTest() 217 | { 218 | for (int i = 0; i < n; i++) 219 | { 220 | f.Test(data[i]); 221 | } 222 | } 223 | 224 | [TestMethod] 225 | public void BenchmarkBloomTestAndAdd() 226 | { 227 | for (int i = 0; i < n; i++) 228 | { 229 | f.TestAndAdd(data[i]); 230 | } 231 | } 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestBloomFilter64.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Microsoft.VisualStudio.TestTools.UnitTesting; 3 | using ProbabilisticDataStructures; 4 | using System.Text; 5 | using System.Collections.Generic; 6 | 7 | namespace TestProbabilisticDataStructures 8 | { 9 | [TestClass] 10 | public class TestBloomFilter64 11 | { 12 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 13 | private static byte[] B_BYTES = 
Encoding.ASCII.GetBytes("b"); 14 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 15 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 16 | 17 | /// 18 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter. 19 | /// 20 | [TestMethod] 21 | public void TestBloomCapacity() 22 | { 23 | var f = new BloomFilter64(100, 0.1); 24 | var capacity = f.Capacity(); 25 | 26 | Assert.AreEqual(480u, capacity); 27 | } 28 | 29 | /// 30 | /// Ensures that K() returns the number of hash functions in the Bloom Filter. 31 | /// 32 | [TestMethod] 33 | public void TestBloom64K() 34 | { 35 | var f = new BloomFilter64(100, 0.1); 36 | var k = f.K(); 37 | 38 | Assert.AreEqual(4u, k); 39 | } 40 | 41 | /// 42 | /// Ensures that Count returns the number of items added to the filter. 43 | /// 44 | [TestMethod] 45 | public void TestBloom64Count() 46 | { 47 | var f = new BloomFilter64(100, 0.1); 48 | for (uint i = 0; i < 10; i++) 49 | { 50 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 51 | } 52 | 53 | var count = f.Count(); 54 | Assert.AreEqual(10u, count); 55 | } 56 | 57 | /// 58 | /// Ensures that EstimatedFillRatio returns the correct approximation. 59 | /// 60 | [TestMethod] 61 | public void TestBloom64EstimatedFillRatio() 62 | { 63 | var f = new BloomFilter64(100, 0.5); 64 | for (uint i = 0; i < 100; i++) 65 | { 66 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 67 | } 68 | 69 | var ratio = f.EstimatedFillRatio(); 70 | if (ratio > 0.5) 71 | { 72 | Assert.Fail("Expected less than or equal to 0.5, got {0}", ratio); 73 | } 74 | } 75 | 76 | /// 77 | /// Ensures that FillRatio returns the ratio of set bits. 78 | /// 79 | [TestMethod] 80 | public void TestBloom64FillRatio() 81 | { 82 | var f = new BloomFilter64(100, 0.1); 83 | f.Add(A_BYTES); 84 | f.Add(B_BYTES); 85 | f.Add(C_BYTES); 86 | 87 | var ratio = f.FillRatio(); 88 | Assert.AreEqual(0.025, ratio); 89 | } 90 | 91 | /// 92 | /// Ensures that Test, Add, and TestAndAdd behave correctly. 93 | /// 94 | [TestMethod] 95 | public void TestBloom64TestAndAdd() 96 | { 97 | var f = new BloomFilter64(100, 0.01); 98 | 99 | // 'a' is not in the filter. 100 | if (f.Test(A_BYTES)) 101 | { 102 | Assert.Fail("'a' should not be a member"); 103 | } 104 | 105 | var addedF = f.Add(A_BYTES); 106 | Assert.AreSame(f, addedF, "Returned BloomFilter64 should be the same instance"); 107 | 108 | // 'a' is now in the filter. 109 | if (!f.Test(A_BYTES)) 110 | { 111 | Assert.Fail("'a' should be a member"); 112 | } 113 | 114 | // 'a' is still in the filter. 115 | if (!f.TestAndAdd(A_BYTES)) 116 | { 117 | Assert.Fail("'a' should be a member"); 118 | } 119 | 120 | // 'b' is not in the filter. 121 | if (f.TestAndAdd(B_BYTES)) 122 | { 123 | Assert.Fail("'b' should not be a member"); 124 | } 125 | 126 | // 'a' is still in the filter. 127 | if (!f.Test(A_BYTES)) 128 | { 129 | Assert.Fail("'a' should be a member"); 130 | } 131 | 132 | // 'b' is now in the filter. 133 | if (!f.Test(B_BYTES)) 134 | { 135 | Assert.Fail("'b' should be a member"); 136 | } 137 | 138 | // 'c' is not in the filter. 139 | if (f.Test(C_BYTES)) 140 | { 141 | Assert.Fail("'c' should not be a member"); 142 | } 143 | 144 | for (int i = 0; i < 1000000; i++) 145 | { 146 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString())); 147 | } 148 | 149 | // 'x' should be a false positive. 150 | if (!f.Test(X_BYTES)) 151 | { 152 | Assert.Fail("'x' should be a member"); 153 | } 154 | } 155 | 156 | /// 157 | /// Ensures that Reset sets every bit to zero. 
158 | /// 159 | [TestMethod] 160 | public void TestBloom64Reset() 161 | { 162 | var f = new BloomFilter64(100, 0.1); 163 | for (int i = 0; i < 1000; i++) 164 | { 165 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 166 | } 167 | 168 | var resetF = f.Reset(); 169 | Assert.AreSame(f, resetF, "Returned BloomFilter64 should be the same instance"); 170 | 171 | for (uint i = 0; i < f.Buckets.count; i++) 172 | { 173 | if (f.Buckets.Get(i) != 0) 174 | { 175 | Assert.Fail("Expected all bits to be unset"); 176 | } 177 | } 178 | } 179 | } 180 | 181 | [TestClass] 182 | public class BenchmarkBloomFilter64 183 | { 184 | private BloomFilter64 f; 185 | private int n; 186 | private byte[][] data; 187 | 188 | [TestInitialize()] 189 | public void Testinitialize() 190 | { 191 | n = 100000; 192 | f = new BloomFilter64(100000, 0.1); 193 | data = new byte[n][]; 194 | for (int i = 0; i < n; i++) 195 | { 196 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 197 | } 198 | } 199 | 200 | [TestCleanup()] 201 | public void TestCleanup() 202 | { 203 | f = null; 204 | n = 0; 205 | data = null; 206 | } 207 | 208 | [TestMethod] 209 | public void BenchmarkBloom64Add() 210 | { 211 | for (int i = 0; i < n; i++) 212 | { 213 | f.Add(data[i]); 214 | } 215 | } 216 | 217 | [TestMethod] 218 | public void BenchmarkBloom64Test() 219 | { 220 | for (int i = 0; i < n; i++) 221 | { 222 | f.Test(data[i]); 223 | } 224 | } 225 | 226 | [TestMethod] 227 | public void BenchmarkBloom64TestAndAdd() 228 | { 229 | for (int i = 0; i < n; i++) 230 | { 231 | f.TestAndAdd(data[i]); 232 | } 233 | } 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestBuckets.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | 4 | namespace TestProbabilisticDataStructures 5 | { 6 | [TestClass] 7 | public class TestBuckets 8 | { 9 | /// 10 | /// Ensures that Max returns the correct maximum based on the bucket 11 | /// size. 12 | /// 13 | [TestMethod] 14 | public void TestMaxBucketValue() 15 | { 16 | var b = new Buckets(10, 2); 17 | 18 | var max = b.MaxBucketValue(); 19 | Assert.AreEqual(3, max); 20 | } 21 | 22 | /// 23 | /// Ensures that Count returns the number of buckets. 24 | /// 25 | [TestMethod] 26 | public void TestBucketsCount() 27 | { 28 | var b = new Buckets(10, 2); 29 | 30 | var count = b.count; 31 | Assert.AreEqual(10u, count); 32 | } 33 | 34 | /// 35 | /// Ensures that Increment increments the bucket value by the correct delta and 36 | /// clamps to zero and the maximum, Get returns the correct bucket value, and Set 37 | /// sets the bucket value correctly. 38 | /// 39 | [TestMethod] 40 | public void TestBucketsIncrementAndGetAndSet() 41 | { 42 | var b = new Buckets(5, 2); 43 | 44 | var incrementedB = b.Increment(0, 1); 45 | Assert.AreSame(b, incrementedB, "Returned Buckets should be the same instance"); 46 | 47 | var v = b.Get(0); 48 | Assert.AreEqual(1u, v); 49 | 50 | b.Increment(1u, -1); 51 | 52 | v = b.Get(1); 53 | Assert.AreEqual(0u, v); 54 | 55 | var setB = b.Set(2u, 100); 56 | Assert.AreSame(b, setB, "Returned Buckets should be the same instance"); 57 | 58 | v = b.Get(2); 59 | Assert.AreEqual(3u, v); 60 | 61 | b.Increment(3, 2); 62 | 63 | v = b.Get(3); 64 | Assert.AreEqual(2u, v); 65 | } 66 | 67 | /// 68 | /// Ensures that Reset restores the Buckets to the original state. 
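The magic numbers in TestBucketsIncrementAndGetAndSet above all follow from the 2-bit bucket size: a b-bit bucket saturates at 2^b - 1, so MaxBucketValue() is 3, Set(2, 100) clamps to 3, and Increment(1, -1) clamps at 0 rather than wrapping.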
69 | /// 70 | [TestMethod] 71 | public void TestBucketsReset() 72 | { 73 | var b = new Buckets(5, 2); 74 | 75 | for (uint i = 0; i < 5; i++) 76 | { 77 | b.Increment(i, 1); 78 | } 79 | 80 | var resetB = b.Reset(); 81 | Assert.AreSame(b, resetB, "Returned Buckets should be the same instance"); 82 | 83 | for (uint i = 0; i < 5; i++) 84 | { 85 | var c = b.Get(i); 86 | Assert.AreEqual(0u, c); 87 | } 88 | } 89 | 90 | [TestMethod] 91 | public void BenchmarkBucketsIncrement() 92 | { 93 | var buckets = new Buckets(10000, 10); 94 | for (uint i = 0; i < buckets.count; i++) 95 | { 96 | buckets.Increment(i % 10000, 1); 97 | } 98 | } 99 | 100 | [TestMethod] 101 | public void BenchmarkBucketsSet() 102 | { 103 | var buckets = new Buckets(10000, 10); 104 | for (uint i = 0; i < buckets.count; i++) 105 | { 106 | buckets.Set(i % 10000, 1); 107 | } 108 | } 109 | 110 | [TestMethod] 111 | public void BenchmarkBucketsGet() 112 | { 113 | var buckets = new Buckets(10000, 10); 114 | for (uint i = 0; i < buckets.count; i++) 115 | { 116 | buckets.Get(i % 10000); 117 | } 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestBuckets64.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | 4 | namespace TestProbabilisticDataStructures 5 | { 6 | [TestClass] 7 | public class TestBuckets64 8 | { 9 | /// 10 | /// Ensures that Max returns the correct maximum based on the bucket 11 | /// size. 12 | /// 13 | [TestMethod] 14 | public void TestMaxBucketValue() 15 | { 16 | var b = new Buckets64(10, 2); 17 | 18 | var max = b.MaxBucketValue(); 19 | Assert.AreEqual(3, max); 20 | } 21 | 22 | /// 23 | /// Ensures that Count returns the number of buckets. 24 | /// 25 | [TestMethod] 26 | public void TestBuckets64Count() 27 | { 28 | var b = new Buckets64(10, 2); 29 | 30 | var count = b.count; 31 | Assert.AreEqual(10u, count); 32 | } 33 | 34 | /// 35 | /// Ensures that Increment increments the bucket value by the correct delta and 36 | /// clamps to zero and the maximum, Get returns the correct bucket value, and Set 37 | /// sets the bucket value correctly. 38 | /// 39 | [TestMethod] 40 | public void TestBuckets64IncrementAndGetAndSet() 41 | { 42 | var b = new Buckets64(5, 2); 43 | 44 | var incrementedB = b.Increment(0, 1); 45 | Assert.AreSame(b, incrementedB, "Returned Buckets64 should be the same instance"); 46 | 47 | var v = b.Get(0); 48 | Assert.AreEqual(1u, v); 49 | 50 | b.Increment(1u, -1); 51 | 52 | v = b.Get(1); 53 | Assert.AreEqual(0u, v); 54 | 55 | var setB = b.Set(2u, 100); 56 | Assert.AreSame(b, setB, "Returned Buckets64 should be the same instance"); 57 | 58 | v = b.Get(2); 59 | Assert.AreEqual(3u, v); 60 | 61 | b.Increment(3, 2); 62 | 63 | v = b.Get(3); 64 | Assert.AreEqual(2u, v); 65 | } 66 | 67 | /// 68 | /// Ensures that Reset restores the Buckets64 to the original state. 
69 | /// 70 | [TestMethod] 71 | public void TestBuckets64Reset() 72 | { 73 | var b = new Buckets64(5, 2); 74 | 75 | for (uint i = 0; i < 5; i++) 76 | { 77 | b.Increment(i, 1); 78 | } 79 | 80 | var resetB = b.Reset(); 81 | Assert.AreSame(b, resetB, "Returned Buckets64 should be the same instance"); 82 | 83 | for (uint i = 0; i < 5; i++) 84 | { 85 | var c = b.Get(i); 86 | Assert.AreEqual(0u, c); 87 | } 88 | } 89 | 90 | [TestMethod] 91 | public void BenchmarkBuckets64Increment() 92 | { 93 | var buckets = new Buckets64(10000, 10); 94 | for (uint i = 0; i < buckets.count; i++) 95 | { 96 | buckets.Increment(i % 10000, 1); 97 | } 98 | } 99 | 100 | [TestMethod] 101 | public void BenchmarkBuckets64Set() 102 | { 103 | var buckets = new Buckets64(10000, 10); 104 | for (uint i = 0; i < buckets.count; i++) 105 | { 106 | buckets.Set(i % 10000, 1); 107 | } 108 | } 109 | 110 | [TestMethod] 111 | public void BenchmarkBuckets64Get() 112 | { 113 | var buckets = new Buckets64(10000, 10); 114 | for (uint i = 0; i < buckets.count; i++) 115 | { 116 | buckets.Get(i % 10000); 117 | } 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestCountMinSketch.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | using Microsoft.VisualStudio.TestTools.UnitTesting; 3 | using ProbabilisticDataStructures; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestCountMinSketch 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] D_BYTES = Encoding.ASCII.GetBytes("d"); 14 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 15 | 16 | /// 17 | /// Ensures that TotalCount returns the number of items added to the sketch. 18 | /// 19 | [TestMethod] 20 | public void TestCMSTotalCount() 21 | { 22 | var cms = new CountMinSketch(0.001, 0.99); 23 | 24 | for (int i = 0; i < 100; i++) 25 | { 26 | cms.Add(Encoding.ASCII.GetBytes(i.ToString())); 27 | } 28 | 29 | var count = cms.TotalCount(); 30 | Assert.AreEqual(100u, count); 31 | } 32 | 33 | /// 34 | /// Ensures that Add adds to the set and Count returns the correct approximation. 35 | /// 36 | [TestMethod] 37 | public void TestCMSAddAndCount() 38 | { 39 | var cms = new CountMinSketch(0.001, 0.99); 40 | 41 | var addedCms = cms.Add(A_BYTES); 42 | Assert.AreSame(cms, addedCms); 43 | 44 | cms.Add(B_BYTES); 45 | cms.Add(C_BYTES); 46 | cms.Add(B_BYTES); 47 | cms.Add(D_BYTES); 48 | cms.Add(A_BYTES).Add(A_BYTES); 49 | 50 | var count = cms.Count(A_BYTES); 51 | Assert.AreEqual(3u, count); 52 | 53 | count = cms.Count(B_BYTES); 54 | Assert.AreEqual(2u, count); 55 | 56 | count = cms.Count(C_BYTES); 57 | Assert.AreEqual(1u, count); 58 | 59 | count = cms.Count(D_BYTES); 60 | Assert.AreEqual(1u, count); 61 | 62 | count = cms.Count(X_BYTES); 63 | Assert.AreEqual(0u, count); 64 | } 65 | 66 | /// 67 | /// Ensures that Merge combines the two sketches. 
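/// A minimal sketch of the merge semantics verified below (hypothetical driver code; both sketches are assumed to share the same epsilon and delta): /// <code> /// var key = Encoding.ASCII.GetBytes("b"); /// var left = new CountMinSketch(0.001, 0.99); /// var right = new CountMinSketch(0.001, 0.99); /// left.Add(key); /// right.Add(key); /// left.Merge(right); // left.Count(key) now reports about 2 /// </code>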
68 | /// 69 | [TestMethod] 70 | public void TestCMSMerge() 71 | { 72 | var cms = new CountMinSketch(0.001, 0.99); 73 | cms.Add(B_BYTES); 74 | cms.Add(C_BYTES); 75 | cms.Add(B_BYTES); 76 | cms.Add(D_BYTES); 77 | cms.Add(A_BYTES).Add(A_BYTES); 78 | 79 | var other = new CountMinSketch(0.001, 0.99); 80 | other.Add(B_BYTES); 81 | other.Add(C_BYTES); 82 | other.Add(B_BYTES); 83 | 84 | var wasMerged = cms.Merge(other); 85 | Assert.IsTrue(wasMerged); 86 | 87 | var count = cms.Count(A_BYTES); 88 | Assert.AreEqual(2u, count); 89 | 90 | count = cms.Count(B_BYTES); 91 | Assert.AreEqual(4u, count); 92 | 93 | count = cms.Count(C_BYTES); 94 | Assert.AreEqual(2u, count); 95 | 96 | count = cms.Count(D_BYTES); 97 | Assert.AreEqual(1u, count); 98 | 99 | count = cms.Count(X_BYTES); 100 | Assert.AreEqual(0u, count); 101 | } 102 | 103 | /// 104 | /// Ensures that Reset restores the sketch to its original state. 105 | /// 106 | [TestMethod] 107 | public void TestCMSReset() 108 | { 109 | var cms = new CountMinSketch(0.001, 0.99); 110 | cms.Add(B_BYTES); 111 | cms.Add(C_BYTES); 112 | cms.Add(B_BYTES); 113 | cms.Add(D_BYTES); 114 | cms.Add(A_BYTES).Add(A_BYTES); 115 | 116 | var resetCms = cms.Reset(); 117 | Assert.AreSame(cms, resetCms); 118 | 119 | for (uint i = 0; i < cms.Depth; i++) 120 | { 121 | for (int j = 0; j < cms.Width; j++) 122 | { 123 | if (cms.Matrix[i][j] != 0) 124 | { 125 | Assert.Fail("Expected matrix to be completely empty."); 126 | } 127 | } 128 | } 129 | } 130 | 131 | [TestMethod] 132 | public void BenchmarkCMSAdd() 133 | { 134 | var n = 100000; 135 | var cms = new CountMinSketch(0.001, 0.99); 136 | var data = new byte[n][]; 137 | for (int i = 0; i < n; i++) 138 | { 139 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 140 | } 141 | 142 | for (int i = 0; i < n; i++) 143 | { 144 | cms.Add(data[i]); 145 | } 146 | } 147 | 148 | [TestMethod] 149 | public void BenchmarkCMSCount() 150 | { 151 | var n = 100000; 152 | var cms = new CountMinSketch(0.001, 0.99); 153 | var data = new byte[n][]; 154 | for (int i = 0; i < n; i++) 155 | { 156 | var byteArray = Encoding.ASCII.GetBytes(i.ToString()); 157 | data[i] = byteArray; 158 | cms.Add(byteArray); 159 | } 160 | 161 | for (int i = 0; i < n; i++) 162 | { 163 | cms.Count(data[i]); 164 | } 165 | } 166 | 167 | // TODO: Implement these later. 168 | // TestCMSSerialization 169 | // BenchmarkCMSWriteDataTo 170 | // BenchmarkCMSReadDataFrom 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestCountingBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestCountingBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 14 | 15 | /// 16 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter.
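/// The 480 asserted below is a worked instance of the usual optimal-size formula (my arithmetic, not code from this library): m = ceil(-n * ln(p) / (ln 2)^2) = ceil(-100 * ln(0.1) / 0.4805) = ceil(479.3) = 480.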
17 | /// 18 | [TestMethod] 19 | public void TestCountingCapacity() 20 | { 21 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1); 22 | var capacity = f.Capacity(); 23 | 24 | Assert.AreEqual(480u, capacity); 25 | } 26 | 27 | /// 28 | /// Ensures that K() returns the number of hash functions in the Bloom Filter. 29 | /// 30 | [TestMethod] 31 | public void TestCountingK() 32 | { 33 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1); 34 | var k = f.K(); 35 | 36 | Assert.AreEqual(4u, k); 37 | } 38 | 39 | /// 40 | /// Ensures that Count returns the number of items added to the filter. 41 | /// 42 | [TestMethod] 43 | public void TestCountingCount() 44 | { 45 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1); 46 | for (uint i = 0; i < 10; i++) 47 | { 48 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 49 | } 50 | 51 | for (int i = 0; i < 5; i++) 52 | { 53 | f.TestAndRemove(Encoding.ASCII.GetBytes(i.ToString())); 54 | } 55 | 56 | var count = f.Count(); 57 | Assert.AreEqual(5u, count); 58 | } 59 | 60 | /// 61 | /// Ensures that Test, Add, and TestAndAdd behave correctly. 62 | /// 63 | [TestMethod] 64 | public void TestCountingTestAndAdd() 65 | { 66 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.01); 67 | 68 | // 'a' is not in the filter. 69 | if (f.Test(A_BYTES)) 70 | { 71 | Assert.Fail("'a' should not be a member"); 72 | } 73 | 74 | var addedF = f.Add(A_BYTES); 75 | Assert.AreSame(f, addedF, "Returned BloomFilter should be the same instance"); 76 | 77 | // 'a' is now in the filter. 78 | if (!f.Test(A_BYTES)) 79 | { 80 | Assert.Fail("'a' should be a member"); 81 | } 82 | 83 | // 'a' is still in the filter. 84 | if (!f.TestAndAdd(A_BYTES)) 85 | { 86 | Assert.Fail("'a' should be a member"); 87 | } 88 | 89 | // 'b' is not in the filter. 90 | if (f.TestAndAdd(B_BYTES)) 91 | { 92 | Assert.Fail("'b' should not be a member"); 93 | } 94 | 95 | // 'a' is still in the filter. 96 | if (!f.Test(A_BYTES)) 97 | { 98 | Assert.Fail("'a' should be a member"); 99 | } 100 | 101 | // 'b' is now in the filter. 102 | if (!f.Test(B_BYTES)) 103 | { 104 | Assert.Fail("'b' should be a member"); 105 | } 106 | 107 | // 'c' is not in the filter. 108 | if (f.Test(C_BYTES)) 109 | { 110 | Assert.Fail("'c' should not be a member"); 111 | } 112 | 113 | for (int i = 0; i < 1000000; i++) 114 | { 115 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString())); 116 | } 117 | 118 | // 'x' should be a false positive. 119 | if (!f.Test(X_BYTES)) 120 | { 121 | Assert.Fail("'x' should be a member"); 122 | } 123 | } 124 | 125 | /// 126 | /// Ensures that TestAndRemove behaves correctly. 127 | /// 128 | [TestMethod] 129 | public void TestCountingTestAndRemove() 130 | { 131 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.01); 132 | 133 | // 'a' is not in the filter. 134 | if (f.TestAndRemove(A_BYTES)) 135 | { 136 | Assert.Fail("'a' should not be a member"); 137 | } 138 | 139 | f.Add(Encoding.ASCII.GetBytes("a")); 140 | 141 | // 'a' is now in the filter. 142 | if (!f.TestAndRemove(A_BYTES)) 143 | { 144 | Assert.Fail("'a' should be a member"); 145 | } 146 | 147 | // 'a' is no longer in the filter. 148 | if (f.TestAndRemove(A_BYTES)) 149 | { 150 | Assert.Fail("'a' should not be a member"); 151 | } 152 | } 153 | 154 | /// 155 | /// Ensures that Reset sets every bit to zero and the count is zero. 
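/// A minimal sketch (hypothetical driver code, limited to members these tests already exercise): /// <code> /// var cbf = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1); /// cbf.Add(Encoding.ASCII.GetBytes("a")); /// cbf.Reset(); // now cbf.Count() == 0 and every bucket reads 0 /// </code>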
156 | /// 157 | [TestMethod] 158 | public void TestCountingReset() 159 | { 160 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1); 161 | for (int i = 0; i < 1000; i++) 162 | { 163 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 164 | } 165 | 166 | var resetF = f.Reset(); 167 | Assert.AreSame(f, resetF, "Returned CountingBloomFilter should be the same instance"); 168 | 169 | for (uint i = 0; i < f.Buckets.count; i++) 170 | { 171 | if (f.Buckets.Get(i) != 0) 172 | { 173 | Assert.Fail("Expected all bits to be unset"); 174 | } 175 | } 176 | 177 | Assert.AreEqual(0u, f.Count()); 178 | } 179 | 180 | [TestMethod] 181 | public void BenchmarkCountingAdd() 182 | { 183 | var n = 100000; 184 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100000, 0.1); 185 | var data = new byte[n][]; 186 | for (int i = 0; i < n; i++) 187 | { 188 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 189 | } 190 | 191 | for (int i = 0; i < n; i++) 192 | { 193 | f.Add(data[i]); 194 | } 195 | } 196 | 197 | [TestMethod] 198 | public void BenchmarkCountingTest() 199 | { 200 | var n = 100000; 201 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100000, 0.1); 202 | var data = new byte[n][]; 203 | for (int i = 0; i < n; i++) 204 | { 205 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 206 | } 207 | 208 | for (int i = 0; i < n; i++) 209 | { 210 | f.Test(data[i]); 211 | } 212 | } 213 | 214 | [TestMethod] 215 | public void BenchmarkCountingTestAndAdd() 216 | { 217 | var n = 100000; 218 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100000, 0.1); 219 | var data = new byte[n][]; 220 | for (int i = 0; i < n; i++) 221 | { 222 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 223 | } 224 | 225 | for (int i = 0; i < n; i++) 226 | { 227 | f.TestAndAdd(data[i]); 228 | } 229 | } 230 | 231 | [TestMethod] 232 | public void BenchmarkCountingTestAndRemove() 233 | { 234 | var n = 100000; 235 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100000, 0.1); 236 | var data = new byte[n][]; 237 | for (int i = 0; i < n; i++) 238 | { 239 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 240 | } 241 | 242 | for (int i = 0; i < n; i++) 243 | { 244 | f.TestAndRemove(data[i]); 245 | } 246 | } 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestCuckooBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestCuckooBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 14 | 15 | /// 16 | /// Ensures that Buckets returns the number of buckets, m, in the Cuckoo Filter. 17 | /// 18 | [TestMethod] 19 | public void TestCuckooBuckets() 20 | { 21 | var f = new CuckooBloomFilter(100, 0.1); 22 | var buckets = f.BucketCount(); 23 | 24 | Assert.AreEqual(1024u, buckets); 25 | } 26 | 27 | /// 28 | /// Ensures that Capacity returns the expected filter capacity. 
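/// Note the distinction these two tests probe: Capacity() echoes the requested item capacity (100), while BucketCount() reports the larger table backing it (1024 buckets here).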
29 | /// 30 | [TestMethod] 31 | public void TestCuckooCapacity() 32 | { 33 | var f = new CuckooBloomFilter(100, 0.1); 34 | var capacity = f.Capacity(); 35 | 36 | Assert.AreEqual(100u, capacity); 37 | } 38 | 39 | /// 40 | /// Ensures that Count returns the number of items added to the filter. 41 | /// 42 | [TestMethod] 43 | public void TestCuckooCount() 44 | { 45 | var f = new CuckooBloomFilter(100, 0.1); 46 | for (int i = 0; i < 10; i++) 47 | { 48 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 49 | } 50 | 51 | for (int i = 0; i < 5; i++) 52 | { 53 | f.TestAndRemove(Encoding.ASCII.GetBytes(i.ToString())); 54 | } 55 | 56 | var count = f.Count(); 57 | Assert.AreEqual(5u, count); 58 | } 59 | 60 | /// 61 | /// Ensures that Test, Add, and TestAndAdd behave correctly. 62 | /// 63 | [TestMethod] 64 | public void TestCuckooTestAndAdd() 65 | { 66 | var f = new CuckooBloomFilter(100, 0.1); 67 | 68 | // 'a' is not in the filter. 69 | if (f.Test(A_BYTES)) 70 | { 71 | Assert.Fail("'a' should not be a member"); 72 | } 73 | 74 | if (!f.Add(A_BYTES)) 75 | { 76 | Assert.Fail("Should return true"); 77 | } 78 | 79 | // 'a' is now in the filter. 80 | if (!f.Test(A_BYTES)) 81 | { 82 | Assert.Fail("'a' should be a member"); 83 | } 84 | 85 | // 'a' is still in the filter. 86 | var testAndAdd = f.TestAndAdd(A_BYTES); 87 | if (!testAndAdd.WasAlreadyAMember) 88 | { 89 | Assert.Fail("'a' should be a member"); 90 | } 91 | // Should not have added 92 | Assert.IsFalse(testAndAdd.Added); 93 | 94 | // 'b' is not in the filter. 95 | testAndAdd = f.TestAndAdd(B_BYTES); 96 | if (testAndAdd.WasAlreadyAMember) 97 | { 98 | Assert.Fail("'b' should not be a member"); 99 | } 100 | // Should add 101 | Assert.IsTrue(testAndAdd.Added); 102 | 103 | // 'a' is still in the filter. 104 | if (!f.Test(A_BYTES)) 105 | { 106 | Assert.Fail("'a' should be a member"); 107 | } 108 | 109 | // 'b' is now in the filter. 110 | if (!f.Test(B_BYTES)) 111 | { 112 | Assert.Fail("'b' should be a member"); 113 | } 114 | 115 | // 'c' is not in the filter. 116 | if (f.Test(C_BYTES)) 117 | { 118 | Assert.Fail("'c' should not be a member"); 119 | } 120 | 121 | for (int i = 0; i < 10000; i++) 122 | { 123 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 124 | } 125 | 126 | // Filter should be full. 127 | testAndAdd = f.TestAndAdd(X_BYTES); 128 | // Make sure not there 129 | Assert.IsFalse(testAndAdd.WasAlreadyAMember); 130 | // Make sure didn't add 131 | Assert.IsFalse(testAndAdd.Added); 132 | } 133 | 134 | /// 135 | /// Ensures that TestAndRemove behaves correctly. 136 | /// 137 | [TestMethod] 138 | public void TestCuckooTestAndRemove() 139 | { 140 | var f = new CuckooBloomFilter(100, 0.1); 141 | 142 | // 'a' is not in the filter. 143 | if (f.Test(A_BYTES)) 144 | { 145 | Assert.Fail("'a' should not be a member"); 146 | } 147 | 148 | f.Add(A_BYTES); 149 | 150 | // 'a' is now in the filter. 151 | if (!f.TestAndRemove(A_BYTES)) 152 | { 153 | Assert.Fail("'a' should be a member"); 154 | } 155 | 156 | // 'a' is no longer in the filter. 157 | if (f.Test(A_BYTES)) 158 | { 159 | Assert.Fail("'a' should not be a member"); 160 | } 161 | } 162 | 163 | /// 164 | /// Ensures that Reset clears all buckets and the count is zero. 
165 | /// 166 | [TestMethod] 167 | public void TestCuckooReset() 168 | { 169 | var f = new CuckooBloomFilter(100, 0.1); 170 | for (int i = 0; i < 1000; i++) 171 | { 172 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 173 | } 174 | 175 | var resetFilter = f.Reset(); 176 | Assert.AreSame(f, resetFilter); 177 | 178 | for (int i = 0; i < f.BucketCount(); i++) 179 | { 180 | for (uint j = 0; j < f.B; j++) 181 | { 182 | if (f.Buckets[i][j] != null) 183 | { 184 | Assert.Fail("Expected all buckets to be cleared"); 185 | } 186 | } 187 | } 188 | 189 | Assert.AreEqual(0u, f.Count()); 190 | } 191 | 192 | [TestMethod] 193 | public void BenchmarkCuckooAdd() 194 | { 195 | var n = 100000u; 196 | var f = new CuckooBloomFilter(n, 0.1); 197 | var data = new byte[n][]; 198 | for (int i = 0; i < n; i++) 199 | { 200 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 201 | } 202 | 203 | for (int i = 0; i < n; i++) 204 | { 205 | f.Add(data[i]); 206 | } 207 | } 208 | 209 | [TestMethod] 210 | public void BenchmarkCuckooTest() 211 | { 212 | var n = 100000u; 213 | var f = new CuckooBloomFilter(n, 0.1); 214 | var data = new byte[n][]; 215 | for (int i = 0; i < n; i++) 216 | { 217 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 218 | } 219 | 220 | for (int i = 0; i < n; i++) 221 | { 222 | f.Test(data[i]); 223 | } 224 | } 225 | 226 | [TestMethod] 227 | public void BenchmarkCuckooTestAndAdd() 228 | { 229 | var n = 100000u; 230 | var f = new CuckooBloomFilter(n, 0.1); 231 | var data = new byte[n][]; 232 | for (int i = 0; i < n; i++) 233 | { 234 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 235 | } 236 | 237 | for (int i = 0; i < n; i++) 238 | { 239 | f.TestAndAdd(data[i]); 240 | } 241 | } 242 | 243 | [TestMethod] 244 | public void BenchmarkCuckooTestAndRemove() 245 | { 246 | var n = 100000u; 247 | var f = new CuckooBloomFilter(n, 0.1); 248 | var data = new byte[n][]; 249 | for (int i = 0; i < n; i++) 250 | { 251 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 252 | } 253 | 254 | for (int i = 0; i < n; i++) 255 | { 256 | f.TestAndRemove(data[i]); 257 | } 258 | } 259 | } 260 | } 261 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestDeletableBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestDeletableBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 14 | 15 | /// 16 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter. 17 | /// 18 | [TestMethod] 19 | public void TestDeletableCapacity() 20 | { 21 | var d = new DeletableBloomFilter(100, 10, 0.1); 22 | var capacity = d.Capacity(); 23 | 24 | Assert.AreEqual(470u, capacity); 25 | } 26 | 27 | /// 28 | /// Ensures that K() returns the number of hash functions in the Bloom Filter. 29 | /// 30 | [TestMethod] 31 | public void TestDeletableK() 32 | { 33 | var d = new DeletableBloomFilter(100, 10, 0.1); 34 | var k = d.K(); 35 | 36 | Assert.AreEqual(4u, k); 37 | } 38 | 39 | /// 40 | /// Ensures that Count returns the number of items added to the filter.
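/// Concretely, Count() tracks net membership: the test below performs 10 adds followed by 5 removes, so the asserted count is 10 - 5 == 5.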
41 | /// 42 | [TestMethod] 43 | public void TestDeletableCount() 44 | { 45 | var d = new DeletableBloomFilter(100, 10, 0.1); 46 | for (uint i = 0; i < 10; i++) 47 | { 48 | d.Add(Encoding.ASCII.GetBytes(i.ToString())); 49 | } 50 | 51 | for (int i = 0; i < 5; i++) 52 | { 53 | d.TestAndRemove(Encoding.ASCII.GetBytes(i.ToString())); 54 | } 55 | 56 | var count = d.Count(); 57 | Assert.AreEqual(5u, count); 58 | } 59 | 60 | /// 61 | /// Ensures that Test, Add, and TestAndAdd behave correctly. 62 | /// 63 | [TestMethod] 64 | public void TestDeletableTestAndAdd() 65 | { 66 | var d = new DeletableBloomFilter(100, 10, 0.1); 67 | 68 | // 'a' is not in the filter. 69 | if (d.Test(A_BYTES)) 70 | { 71 | Assert.Fail("'a' should not be a member"); 72 | } 73 | 74 | var addedF = d.Add(A_BYTES); 75 | Assert.AreSame(d, addedF, "Returned DeletableBloomFilter should be the same instance"); 76 | 77 | // 'a' is now in the filter. 78 | if (!d.Test(A_BYTES)) 79 | { 80 | Assert.Fail("'a' should be a member"); 81 | } 82 | 83 | // 'a' is still in the filter. 84 | if (!d.TestAndAdd(A_BYTES)) 85 | { 86 | Assert.Fail("'a' should be a member"); 87 | } 88 | 89 | // 'b' is not in the filter. 90 | if (d.TestAndAdd(B_BYTES)) 91 | { 92 | Assert.Fail("'b' should not be a member"); 93 | } 94 | 95 | // 'a' is still in the filter. 96 | if (!d.Test(A_BYTES)) 97 | { 98 | Assert.Fail("'a' should be a member"); 99 | } 100 | 101 | // 'b' is now in the filter. 102 | if (!d.Test(B_BYTES)) 103 | { 104 | Assert.Fail("'b' should be a member"); 105 | } 106 | 107 | // 'c' is not in the filter. 108 | if (d.Test(C_BYTES)) 109 | { 110 | Assert.Fail("'c' should not be a member"); 111 | } 112 | 113 | for (int i = 0; i < 1000000; i++) 114 | { 115 | d.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString())); 116 | } 117 | 118 | // 'x' should be a false positive. 119 | if (!d.Test(X_BYTES)) 120 | { 121 | Assert.Fail("'x' should be a member"); 122 | } 123 | } 124 | 125 | /// 126 | /// Ensures that TestAndRemove behaves correctly. 127 | /// 128 | [TestMethod] 129 | public void TestDeletableTestAndRemove() 130 | { 131 | var d = new DeletableBloomFilter(100, 10, 0.1); 132 | 133 | // 'a' is not in the filter. 134 | if (d.TestAndRemove(A_BYTES)) 135 | { 136 | Assert.Fail("'a' should not be a member"); 137 | } 138 | 139 | d.Add(A_BYTES); 140 | 141 | // 'a' is now in the filter. 142 | if (!d.TestAndRemove(A_BYTES)) 143 | { 144 | Assert.Fail("'a' should be a member"); 145 | } 146 | 147 | // 'a' is no longer in the filter. 148 | if (d.TestAndRemove(A_BYTES)) 149 | { 150 | Assert.Fail("'a' should not be a member"); 151 | } 152 | } 153 | 154 | /// 155 | /// Ensures that Reset sets every bit to zero.
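/// The reset test that follows also walks d.Collisions, the companion bitmap a deletable Bloom filter keeps for bit regions where collisions have made safe removal impossible (my reading of the structure); both arrays must come back zeroed.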
156 | /// 157 | [TestMethod] 158 | public void TestDeletableReset() 159 | { 160 | var d = new DeletableBloomFilter(100, 10, 0.1); 161 | for (int i = 0; i < 1000; i++) 162 | { 163 | d.Add(Encoding.ASCII.GetBytes(i.ToString())); 164 | } 165 | 166 | var resetF = d.Reset(); 167 | Assert.AreSame(d, resetF, "Returned DeletableBloomFilter should be the same instance"); 168 | 169 | for (uint i = 0; i < d.Buckets.count; i++) 170 | { 171 | if (d.Buckets.Get(i) != 0) 172 | { 173 | Assert.Fail("Expected all bits to be unset"); 174 | } 175 | } 176 | 177 | for (uint i = 0; i < d.Collisions.count; i++) 178 | { 179 | if (d.Collisions.Get(i) != 0) 180 | { 181 | Assert.Fail("Expected all bits to be unset"); 182 | } 183 | } 184 | 185 | var count = d.Count(); 186 | Assert.AreEqual(0u, count); 187 | } 188 | 189 | [TestMethod] 190 | public void BenchmarkDeletableAdd() 191 | { 192 | var n = 100000; 193 | var d = new DeletableBloomFilter(100, 10, 0.1); 194 | var data = new byte[n][]; 195 | for (int i = 0; i < n; i++) 196 | { 197 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 198 | } 199 | 200 | for (int i = 0; i < n; i++) 201 | { 202 | d.Add(data[i]); 203 | } 204 | } 205 | 206 | [TestMethod] 207 | public void BenchmarkDeletableTest() 208 | { 209 | var n = 100000; 210 | var d = new DeletableBloomFilter(100, 10, 0.1); 211 | var data = new byte[n][]; 212 | for (int i = 0; i < n; i++) 213 | { 214 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 215 | } 216 | 217 | for (int i = 0; i < n; i++) 218 | { 219 | d.Test(data[i]); 220 | } 221 | } 222 | 223 | [TestMethod] 224 | public void BenchmarkDeletableTestAndAdd() 225 | { 226 | var n = 100000; 227 | var d = new DeletableBloomFilter(100, 10, 0.1); 228 | var data = new byte[n][]; 229 | for (int i = 0; i < n; i++) 230 | { 231 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 232 | } 233 | 234 | for (int i = 0; i < n; i++) 235 | { 236 | d.TestAndAdd(data[i]); 237 | } 238 | } 239 | 240 | [TestMethod] 241 | public void BenchmarkDeletableTestAndRemove() 242 | { 243 | var n = 100000; 244 | var d = new DeletableBloomFilter(100, 10, 0.1); 245 | var data = new byte[n][]; 246 | for (int i = 0; i < n; i++) 247 | { 248 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 249 | } 250 | 251 | for (int i = 0; i < n; i++) 252 | { 253 | d.TestAndRemove(data[i]); 254 | } 255 | } 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestHyperLogLog.cs: -------------------------------------------------------------------------------- 1 | /* 2 | Original work Copyright 2013 Eric Lesh 3 | Modified work Copyright 2015 Tyler Treat 4 | Modified work Copyright 2015 Matthew Lorimor 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 
16 | */ 17 | 18 | using System; 19 | using Microsoft.VisualStudio.TestTools.UnitTesting; 20 | using ProbabilisticDataStructures; 21 | using System.Text; 22 | using System.Threading.Tasks; 23 | 24 | namespace TestProbabilisticDataStructures 25 | { 26 | [TestClass] 27 | public class TestHyperLogLog 28 | { 29 | private double GetError(UInt64 actual, UInt64 estimate) 30 | { 31 | return ((double)estimate - (double)actual) / (double)actual; 32 | } 33 | 34 | private void testHyperLogLog(int n, int lowB, int highB) 35 | { 36 | var words = Words.Dictionary(n); 37 | var bad = 0; 38 | var nWords = (UInt64)words.LongLength; 39 | 40 | var options = new ParallelOptions(); 41 | options.MaxDegreeOfParallelism = 4; 42 | Parallel.For(lowB, highB, options, i => 43 | { 44 | var m = (uint)Math.Pow(2, i); 45 | 46 | HyperLogLog h = null; 47 | try 48 | { 49 | h = new HyperLogLog(m); 50 | } 51 | catch (Exception) 52 | { 53 | Assert.Fail(string.Format("Can't make HyperLogLog({0})", m)); 54 | } 55 | 56 | foreach (var word in words) 57 | { 58 | h.Add(Encoding.ASCII.GetBytes(word)); 59 | } 60 | 61 | var expectedError = 1.04 / Math.Sqrt(m); 62 | var actualError = Math.Abs(this.GetError(nWords, h.Count())); 63 | 64 | if (actualError > expectedError) 65 | { 66 | bad++; 67 | //Assert.Fail(string.Format("Expected: {0}, Actual: {1}", expectedError, actualError)); 68 | } 69 | }); 70 | } 71 | 72 | private void benchmarkCount(int registers) 73 | { 74 | var n = 100000; 75 | var words = Words.Dictionary(0); 76 | var m = (uint)Math.Pow(2, registers); 77 | 78 | var h = new HyperLogLog(m); 79 | 80 | foreach (var word in words) 81 | { 82 | h.Add(Encoding.ASCII.GetBytes(word)); 83 | } 84 | 85 | for (int i = 0; i < n; i++) 86 | { 87 | h.Count(); 88 | } 89 | } 90 | 91 | [TestMethod] 92 | public void TestHyperLogLogSmall() 93 | { 94 | this.testHyperLogLog(5, 4, 17); 95 | } 96 | 97 | [TestMethod] 98 | public void TestHyperLogLogBig() 99 | { 100 | this.testHyperLogLog(0, 4, 17); 101 | } 102 | 103 | [TestMethod] 104 | public void TestNewDefaultHyperLogLog() 105 | { 106 | var hll = HyperLogLog.NewDefaultHyperLogLog(0.1); 107 | 108 | Assert.AreEqual(128u, hll.M); 109 | } 110 | 111 | [TestMethod] 112 | public void BenchmarkHLLCount4() 113 | { 114 | this.benchmarkCount(4); 115 | } 116 | 117 | [TestMethod] 118 | public void BenchmarkHLLCount5() 119 | { 120 | this.benchmarkCount(5); 121 | } 122 | 123 | [TestMethod] 124 | public void BenchmarkHLLCount6() 125 | { 126 | this.benchmarkCount(6); 127 | } 128 | 129 | [TestMethod] 130 | public void BenchmarkHLLCount7() 131 | { 132 | this.benchmarkCount(7); 133 | } 134 | 135 | [TestMethod] 136 | public void BenchmarkHLLCount8() 137 | { 138 | this.benchmarkCount(8); 139 | } 140 | 141 | [TestMethod] 142 | public void BenchmarkHLLCount9() 143 | { 144 | this.benchmarkCount(9); 145 | } 146 | 147 | [TestMethod] 148 | public void BenchmarkHLLCount10() 149 | { 150 | this.benchmarkCount(10); 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestInverseBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using System.Text; 3 | using ProbabilisticDataStructures; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestInverseBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private
static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] D_BYTES = Encoding.ASCII.GetBytes("d"); 14 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 15 | 16 | /// 17 | /// Ensures that Capacity returns the correct filter size. 18 | /// 19 | [TestMethod] 20 | public void TestInverseCapacity() 21 | { 22 | var f = new InverseBloomFilter(100); 23 | 24 | var capacity = f.Capacity(); 25 | Assert.AreEqual(100u, capacity); 26 | } 27 | 28 | /// 29 | /// Ensures that TestAndAdd behaves correctly. 30 | /// 31 | [TestMethod] 32 | public void TestInverseTestAndAdd() 33 | { 34 | var f = new InverseBloomFilter(3); 35 | 36 | if (f.TestAndAdd(A_BYTES)) 37 | { 38 | Assert.Fail("'a' should not be a member"); 39 | } 40 | 41 | if (!f.Test(A_BYTES)) 42 | { 43 | Assert.Fail("'a' should be a member"); 44 | } 45 | 46 | // 'd' hashes to the same index as 'a' 47 | if (f.TestAndAdd(D_BYTES)) 48 | { 49 | Assert.Fail("'d' should not be a member"); 50 | } 51 | 52 | // 'a' was swapped out. 53 | if (f.TestAndAdd(A_BYTES)) 54 | { 55 | Assert.Fail("'a' should not be a member"); 56 | } 57 | 58 | if (!f.Test(A_BYTES)) 59 | { 60 | Assert.Fail("'a' should be a member"); 61 | } 62 | 63 | // 'b' hashes to another index 64 | if (f.TestAndAdd(B_BYTES)) 65 | { 66 | Assert.Fail("'b' should not be a member"); 67 | } 68 | 69 | if (!f.Test(B_BYTES)) 70 | { 71 | Assert.Fail("'b' should be a member"); 72 | } 73 | 74 | // 'a' should still be a member. 75 | if (!f.Test(A_BYTES)) 76 | { 77 | Assert.Fail("'a' should be a member"); 78 | } 79 | 80 | if (f.Test(C_BYTES)) 81 | { 82 | Assert.Fail("'c' should not be a member"); 83 | } 84 | 85 | var addedC = f.Add(C_BYTES); 86 | Assert.AreSame(f, addedC, "Returned InverseBloomFilter should be the same instance"); 87 | 88 | if (!f.Test(C_BYTES)) 89 | { 90 | Assert.Fail("'c' should be a member"); 91 | } 92 | } 93 | } 94 | 95 | [TestClass] 96 | public class BenchmarkInverseBloomFilter 97 | { 98 | private InverseBloomFilter f; 99 | private int n; 100 | private byte[][] data; 101 | 102 | [TestInitialize()] 103 | public void TestInitialize() 104 | { 105 | n = 100000; 106 | f = new InverseBloomFilter((uint)n); 107 | data = new byte[n][]; 108 | for (int i = 0; i < n; i++) 109 | { 110 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 111 | } 112 | } 113 | 114 | [TestCleanup()] 115 | public void TestCleanup() 116 | { 117 | f = null; 118 | n = 0; 119 | data = null; 120 | } 121 | 122 | [TestMethod] 123 | public void BenchmarkInverseAdd() 124 | { 125 | for (int i = 0; i < n; i++) 126 | { 127 | f.Add(data[i]); 128 | } 129 | } 130 | 131 | [TestMethod] 132 | public void BenchmarkInverseTest() 133 | { 134 | for (int i = 0; i < n; i++) 135 | { 136 | f.Test(data[i]); 137 | } 138 | } 139 | 140 | [TestMethod] 141 | public void BenchmarkInverseTestAndAdd() 142 | { 143 | for (int i = 0; i < n; i++) 144 | { 145 | f.TestAndAdd(data[i]); 146 | } 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestMinHash.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using Microsoft.VisualStudio.TestTools.UnitTesting; 3 | using ProbabilisticDataStructures; 4 | 5 | 6 | namespace TestProbabilisticDataStructures 7 | { 8 | [TestClass] 9 | public class TestMinHash 10 | { 11 | /// 12 | /// Ensures that MinHash returns the correct similarity ratio.
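/// The ratio behaves like Jaccard similarity, |A ∩ B| / |A ∪ B| (an inference from the assertions below): identical bags give 1.0, disjoint bags give 0.0, and a 500-word subset of a 1000-word dictionary lands near 0.5.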
13 | /// 14 | [TestMethod] 15 | public void TestMinHashSimilarity() 16 | { 17 | var bag = new List<string>{ 18 | "bob", 19 | "alice", 20 | "frank", 21 | "tyler", 22 | "sara" 23 | }; 24 | 25 | var simRatio = MinHash.Similarity(bag.ToArray(), bag.ToArray()); 26 | Assert.AreEqual(1.0, simRatio); 27 | 28 | var dict = Words.Dictionary(1000); 29 | var bag2 = new List<string>(); 30 | for (int i = 0; i < 1000; i++) 31 | { 32 | bag2.Add(i.ToString()); 33 | } 34 | 35 | simRatio = MinHash.Similarity(dict, bag2.ToArray()); 36 | Assert.AreEqual(0.0, simRatio); 37 | 38 | var bag3 = Words.Dictionary(500); 39 | simRatio = MinHash.Similarity(dict, bag3); 40 | if (simRatio > 0.7 || simRatio < 0.5) 41 | { 42 | Assert.Fail(string.Format("Expected between 0.5 and 0.7, got {0}", simRatio)); 43 | } 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestPartitionedBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestPartitionedBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 14 | 15 | /// 16 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter. 17 | /// 18 | [TestMethod] 19 | public void TestPartitionedCapacity() 20 | { 21 | var f = new PartitionedBloomFilter(100, 0.1); 22 | var capacity = f.Capacity(); 23 | 24 | Assert.AreEqual(480u, capacity); 25 | } 26 | 27 | /// 28 | /// Ensures that K() returns the number of hash functions in the Bloom Filter. 29 | /// 30 | [TestMethod] 31 | public void TestPartitionedK() 32 | { 33 | var f = new PartitionedBloomFilter(100, 0.1); 34 | var k = f.K(); 35 | 36 | Assert.AreEqual(4u, k); 37 | } 38 | 39 | /// 40 | /// Ensures that Count returns the number of items added to the filter. 41 | /// 42 | [TestMethod] 43 | public void TestPartitionedCount() 44 | { 45 | var f = new PartitionedBloomFilter(100, 0.1); 46 | for (uint i = 0; i < 10; i++) 47 | { 48 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 49 | } 50 | 51 | var count = f.Count(); 52 | Assert.AreEqual(10u, count); 53 | } 54 | 55 | /// 56 | /// Ensures that EstimatedFillRatio returns the correct approximation. 57 | /// 58 | [TestMethod] 59 | public void TestPartitionedEstimatedFillRatio() 60 | { 61 | var f = new PartitionedBloomFilter(100, 0.5); 62 | for (uint i = 0; i < 100; i++) 63 | { 64 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 65 | } 66 | 67 | var ratio = f.EstimatedFillRatio(); 68 | if (ratio > 0.5) 69 | { 70 | Assert.Fail("Expected less than or equal to 0.5, got {0}", ratio); 71 | } 72 | } 73 | 74 | /// 75 | /// Ensures that FillRatio returns the ratio of set bits. 76 | /// 77 | [TestMethod] 78 | public void TestPartitionedFillRatio() 79 | { 80 | var f = new PartitionedBloomFilter(100, 0.1); 81 | f.Add(A_BYTES); 82 | f.Add(B_BYTES); 83 | f.Add(C_BYTES); 84 | f.Add(X_BYTES); 85 | 86 | var ratio = f.FillRatio(); 87 | Assert.AreEqual(0.03125, ratio); 88 | } 89 | 90 | /// 91 | /// Ensures that Test, Add, and TestAndAdd behave correctly.
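/// A minimal sketch (hypothetical driver code): /// <code> /// var pbf = new PartitionedBloomFilter(100, 0.01); /// var first = pbf.TestAndAdd(Encoding.ASCII.GetBytes("a")); // false: not yet a member /// var second = pbf.TestAndAdd(Encoding.ASCII.GetBytes("a")); // true: added by the call above /// </code>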
92 | /// 93 | [TestMethod] 94 | public void TestPartitionedBloomTestAndAdd() 95 | { 96 | var f = new PartitionedBloomFilter(100, 0.01); 97 | 98 | // 'a' is not in the filter. 99 | if (f.Test(A_BYTES)) 100 | { 101 | Assert.Fail("'a' should not be a member"); 102 | } 103 | 104 | var addedF = f.Add(A_BYTES); 105 | Assert.AreSame(f, addedF, "Returned PartitionedBloomFilter should be the same instance"); 106 | 107 | // 'a' is now in the filter. 108 | if (!f.Test(A_BYTES)) 109 | { 110 | Assert.Fail("'a' should be a member"); 111 | } 112 | 113 | // 'a' is still in the filter. 114 | if (!f.TestAndAdd(A_BYTES)) 115 | { 116 | Assert.Fail("'a' should be a member"); 117 | } 118 | 119 | // 'b' is not in the filter. 120 | if (f.TestAndAdd(B_BYTES)) 121 | { 122 | Assert.Fail("'b' should not be a member"); 123 | } 124 | 125 | // 'a' is still in the filter. 126 | if (!f.Test(A_BYTES)) 127 | { 128 | Assert.Fail("'a' should be a member"); 129 | } 130 | 131 | // 'b' is now in the filter. 132 | if (!f.Test(B_BYTES)) 133 | { 134 | Assert.Fail("'b' should be a member"); 135 | } 136 | 137 | // 'c' is not in the filter. 138 | if (f.Test(C_BYTES)) 139 | { 140 | Assert.Fail("'c' should not be a member"); 141 | } 142 | 143 | for (int i = 0; i < 1000000; i++) 144 | { 145 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString())); 146 | } 147 | 148 | // 'x' should be a false positive. 149 | if (!f.Test(X_BYTES)) 150 | { 151 | Assert.Fail("'x' should be a member"); 152 | } 153 | } 154 | 155 | /// 156 | /// Ensures that Reset sets every bit to zero. 157 | /// 158 | [TestMethod] 159 | public void TestPartitionedBloomReset() 160 | { 161 | var f = new PartitionedBloomFilter(100, 0.1); 162 | for (int i = 0; i < 1000; i++) 163 | { 164 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 165 | } 166 | 167 | var resetF = f.Reset(); 168 | Assert.AreSame(f, resetF, "Returned PartitionedBloomFilter should be the same instance"); 169 | 170 | foreach (var partition in f.Partitions) 171 | { 172 | for (uint i = 0; i < partition.count; i++) 173 | { 174 | if (partition.Get(i) != 0) 175 | { 176 | Assert.Fail("Expected all bits to be unset"); 177 | } 178 | } 179 | } 180 | } 181 | 182 | [TestMethod] 183 | public void BenchmarkPartitionedBloomAdd() 184 | { 185 | var n = 100000; 186 | var f = new PartitionedBloomFilter(100000, 0.1); 187 | var data = new byte[n][]; 188 | for (int i = 0; i < n; i++) 189 | { 190 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 191 | } 192 | 193 | for (int i = 0; i < n; i++) 194 | { 195 | f.Add(data[i]); 196 | } 197 | } 198 | 199 | [TestMethod] 200 | public void BenchmarkPartitionedBloomTest() 201 | { 202 | var n = 100000; 203 | var f = new PartitionedBloomFilter(100000, 0.1); 204 | var data = new byte[n][]; 205 | for (int i = 0; i < n; i++) 206 | { 207 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 208 | } 209 | 210 | for (int i = 0; i < n; i++) 211 | { 212 | f.Test(data[i]); 213 | } 214 | } 215 | 216 | [TestMethod] 217 | public void BenchmarkPartitionedBloomTestAndAdd() 218 | { 219 | var n = 100000; 220 | var f = new PartitionedBloomFilter(100000, 0.1); 221 | var data = new byte[n][]; 222 | for (int i = 0; i < n; i++) 223 | { 224 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 225 | } 226 | 227 | for (int i = 0; i < n; i++) 228 | { 229 | f.TestAndAdd(data[i]); 230 | } 231 | } 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestProbabilisticDataStructures.cs:
-------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Security.Cryptography; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestProbabilisticDataStructures 9 | { 10 | /// 11 | /// Ensures that correct math is performed for OptimalM(). 12 | /// 13 | [TestMethod] 14 | public void TestOptimalM() 15 | { 16 | var optimalM = OptimalM(100, 0.01); 17 | Assert.AreEqual(959u, optimalM); 18 | 19 | optimalM = OptimalM(100, 0.5); 20 | Assert.AreEqual(145u, optimalM); 21 | } 22 | 23 | /// 24 | /// Ensures that correct math is performed for OptimalM64(). 25 | /// 26 | [TestMethod] 27 | public void TestOptimalM64() 28 | { 29 | var optimalM = OptimalM64(100, 0.01); 30 | Assert.AreEqual(959ul, optimalM); 31 | 32 | optimalM = OptimalM64(100, 0.5); 33 | Assert.AreEqual(145ul, optimalM); 34 | 35 | optimalM = OptimalM64(8589934592ul, 0.0001); 36 | Assert.AreEqual(164670049045ul, optimalM); 37 | } 38 | 39 | /// 40 | /// Ensures that correct math is performed for OptimalK(). 41 | /// 42 | [TestMethod] 43 | public void TestOptimalK() 44 | { 45 | var optimalK = OptimalK(0.01); 46 | Assert.AreEqual(7u, optimalK); 47 | 48 | optimalK = OptimalK(0.0001); 49 | Assert.AreEqual(14u, optimalK); 50 | } 51 | 52 | /// 53 | /// Ensures that HashKernel() returns the same upper and lower base 54 | /// as https://github.com/tylertreat/BoomFilters does when using the 55 | /// FNV1 hash. 56 | /// 57 | [TestMethod] 58 | public void TestHashKernelFNV1() 59 | { 60 | // FNV1 hash bytes for new byte[] { 0, 1, 2, 3 } 61 | var hashBytes = 62 | new byte[] 63 | { 64 | 0x15, 65 | 0x54, 66 | 0xe0, 67 | 0x98, 68 | 0x7f, 69 | 0x32, 70 | 0x75, 71 | 0x44 72 | }; 73 | var hashKernel = ProbabilisticDataStructures 74 | .Utils.HashKernelFromHashBytes(hashBytes); 75 | // Compare against upper and lower base values gotten by 76 | // calling the HashKernel function from 77 | // https://github.com/tylertreat/BoomFilters using that library's 78 | // default FNV1 hash algorithm. 79 | Assert.AreEqual(2564838421u, hashKernel.LowerBaseHash); 80 | Assert.AreEqual(1148531327u, hashKernel.UpperBaseHash); 81 | } 82 | 83 | /// 84 | /// Ensures that HashKernel() returns the proper upper and lower base when using 85 | /// MD5. 86 | /// 87 | [TestMethod] 88 | public void TestHashKernelMD5() 89 | { 90 | var data = new byte[] { 0, 1, 2, 3 }; 91 | var hashAlgorithm = HashAlgorithm.Create("MD5"); 92 | var hashKernel = ProbabilisticDataStructures 93 | .Utils.HashKernel(data, hashAlgorithm); 94 | 95 | Assert.AreEqual(4254774583u, hashKernel.LowerBaseHash); 96 | Assert.AreEqual(4179961689u, hashKernel.UpperBaseHash); 97 | } 98 | 99 | /// 100 | /// Ensures that HashKernel() returns the proper upper and lower base when using 101 | /// SHA256. 102 | /// 103 | [TestMethod] 104 | public void TestHashKernelSHA256() 105 | { 106 | var data = new byte[] { 0, 1, 2, 3 }; 107 | var hashAlgorithm = HashAlgorithm.Create("SHA256"); 108 | var hashKernel = ProbabilisticDataStructures 109 | .Utils.HashKernel(data, hashAlgorithm); 110 | 111 | Assert.AreEqual(3252571653u, hashKernel.LowerBaseHash); 112 | Assert.AreEqual(1646207440u, hashKernel.UpperBaseHash); 113 | } 114 | 115 | /// 116 | /// Ensures that HashKernel() returns the proper upper and lower base when using 117 | /// MD5. 
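/// (As the HashBytesToUInt32/HashBytesToUInt64 tests further down illustrate, the kernel reads the digest little-endian: the lower base hash comes from the first word of the digest and the upper base hash from the word that follows it.)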
118 | /// 119 | [TestMethod] 120 | public void TestHashKernel128MD5() 121 | { 122 | var data = new byte[] { 0, 1, 2, 3 }; 123 | var hashAlgorithm = HashAlgorithm.Create("MD5"); 124 | var hashKernel = ProbabilisticDataStructures 125 | .Utils.HashKernel128(data, hashAlgorithm); 126 | 127 | Assert.AreEqual(17952798757042697527ul, hashKernel.LowerBaseHash); 128 | Assert.AreEqual(7516929291713011248ul, hashKernel.UpperBaseHash); 129 | } 130 | 131 | /// 132 | /// Ensures that HashKernel128() returns the proper upper and lower base when using 133 | /// SHA256. 134 | /// 135 | [TestMethod] 136 | public void TestHashKernel128SHA256() 137 | { 138 | var data = new byte[] { 0, 1, 2, 3 }; 139 | var hashAlgorithm = HashAlgorithm.Create("SHA256"); 140 | var hashKernel = ProbabilisticDataStructures 141 | .Utils.HashKernel128(data, hashAlgorithm); 142 | 143 | Assert.AreEqual(7070407120484453893ul, hashKernel.LowerBaseHash); 144 | Assert.AreEqual(4682007113097866575ul, hashKernel.UpperBaseHash); 145 | } 146 | 147 | /// 148 | /// Helper method to get OptimalM(). 149 | /// 150 | /// 151 | /// 152 | /// 153 | private uint OptimalM(uint n, double fpRate) 154 | { 155 | return ProbabilisticDataStructures 156 | .Utils.OptimalM(n, fpRate); 157 | } 158 | 159 | /// 160 | /// Helper method to get OptimalM64(). 161 | /// 162 | /// 163 | /// 164 | /// 165 | private ulong OptimalM64(ulong n, double fpRate) 166 | { 167 | return ProbabilisticDataStructures 168 | .Utils.OptimalM64(n, fpRate); 169 | } 170 | 171 | /// 172 | /// Helper method to get OptimalK(). 173 | /// 174 | /// 175 | /// 176 | private uint OptimalK(double fpRate) 177 | { 178 | return ProbabilisticDataStructures 179 | .Utils.OptimalK(fpRate); 180 | } 181 | 182 | [TestMethod] 183 | public void TestHashBytesToUInt32() 184 | { 185 | var hashBytes = 186 | new byte[] 187 | { 188 | 0x40, 189 | 0x51, 190 | 0x62, 191 | 0x73, 192 | 0x84, 193 | 0x95, 194 | 0xa6, 195 | 0xb7, 196 | 0xc8, 197 | 0xd9, 198 | 0xea, 199 | 0xfb 200 | }; 201 | Assert.AreEqual(0x73625140u, Utils.HashBytesToUInt32(hashBytes, 0)); 202 | Assert.AreEqual(0xb7a69584u, Utils.HashBytesToUInt32(hashBytes, 4)); 203 | Assert.AreEqual(0xfbead9c8u, Utils.HashBytesToUInt32(hashBytes, 8)); 204 | } 205 | 206 | [TestMethod] 207 | public void TestHashBytesToUInt64() 208 | { 209 | var hashBytes = 210 | new byte[] 211 | { 212 | 0x40, 213 | 0x51, 214 | 0x62, 215 | 0x73, 216 | 0x84, 217 | 0x95, 218 | 0xa6, 219 | 0xb7, 220 | 0xc8, 221 | 0xd9, 222 | 0xea, 223 | 0xfb 224 | }; 225 | Assert.AreEqual(0xb7a6958473625140ul, Utils.HashBytesToUInt64(hashBytes, 0)); 226 | Assert.AreEqual(0xfbead9c8b7a69584ul, Utils.HashBytesToUInt64(hashBytes, 4)); 227 | } 228 | 229 | [TestMethod] 230 | public void TestComputeHashAsStringMD5() 231 | { 232 | var data = new byte[] { 0, 1, 2, 3 }; 233 | var hashingAlgorithm = HashAlgorithm.Create("MD5"); 234 | var hashString = Utils.ComputeHashAsString(data, hashingAlgorithm); 235 | Assert.AreEqual("37B59AFD592725F9305E484A5D7F5168", hashString); 236 | } 237 | 238 | [TestMethod] 239 | public void TestComputeHashAsStringSHA256() 240 | { 241 | var data = new byte[] { 0, 1, 2, 3 }; 242 | var hashingAlgorithm = HashAlgorithm.Create("SHA256"); 243 | var hashString = Utils.ComputeHashAsString(data, hashingAlgorithm); 244 | Assert.AreEqual("054EDEC1D0211F624FED0CBCA9D4F9400B0E491C43742AF2C5B0ABEBF0C990D8", hashString); 245 | } 246 | } 247 | } 248 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestProbabilisticDataStructures.csproj:
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Debug 5 | AnyCPU 6 | {8212EFDE-5134-4914-96D3-C550FD9432F1} 7 | Library 8 | Properties 9 | TestProbabilisticDataStructures 10 | TestProbabilisticDataStructures 11 | v4.7 12 | 512 13 | {3AC096D0-A1C2-E12C-1390-A8335801FDAB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} 14 | 10.0 15 | $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion) 16 | $(ProgramFiles)\Common Files\microsoft shared\VSTT\$(VisualStudioVersion)\UITestExtensionPackages 17 | False 18 | UnitTest 19 | 20 | 21 | 22 | true 23 | full 24 | false 25 | bin\Debug\ 26 | DEBUG;TRACE 27 | prompt 28 | 4 29 | 30 | 31 | pdbonly 32 | true 33 | bin\Release\ 34 | TRACE 35 | prompt 36 | 4 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | {bf43f4a8-a892-413c-8e11-9a53d2249bf4} 76 | ProbabilisticDataStructures 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | False 85 | 86 | 87 | False 88 | 89 | 90 | False 91 | 92 | 93 | False 94 | 95 | 96 | 97 | 98 | 99 | 100 | 107 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestScalableBloomFilter.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | 5 | namespace TestProbabilisticDataStructures 6 | { 7 | [TestClass] 8 | public class TestScalableBloomFilter 9 | { 10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a"); 11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b"); 12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c"); 13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x"); 14 | 15 | [TestMethod] 16 | public void TestNewDefaultScalableBloomFilter() 17 | { 18 | var f = ScalableBloomFilter.NewDefaultScalableBloomFilter(0.1); 19 | 20 | Assert.AreEqual(0.1, f.FP); 21 | Assert.AreEqual(10000u, f.Hint); 22 | Assert.AreEqual(0.8, f.R); 23 | } 24 | 25 | [TestMethod] 26 | public void TestScalableBloomCapacity() 27 | { 28 | var f = new ScalableBloomFilter(1, 0.1, 1); 29 | f.AddFilter(); 30 | f.AddFilter(); 31 | 32 | var capacity = f.Capacity(); 33 | Assert.AreEqual(15u, capacity); 34 | } 35 | 36 | // Ensures that K returns the number of hash functions used in each Bloom filter. 37 | [TestMethod] 38 | public void TestScalableBloomK() 39 | { 40 | var f = new ScalableBloomFilter(10, 0.1, 0.8); 41 | 42 | var k = f.K(); 43 | Assert.AreEqual(4u, k); 44 | } 45 | 46 | /// 47 | /// Ensures that FillRatio returns the average fill ratio of the contained 48 | /// filters. 49 | /// 50 | [TestMethod] 51 | public void TestScalableFillRatio() 52 | { 53 | var f = new ScalableBloomFilter(100, 0.1, 0.8); 54 | f.SetHash(ProbabilisticDataStructures.Defaults.GetDefaultHashAlgorithm()); 55 | for (int i = 0; i < 200; i++) 56 | { 57 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 58 | } 59 | 60 | var fillRatio = f.FillRatio(); 61 | if (fillRatio > 0.5) 62 | { 63 | Assert.Fail(string.Format("Expected less than or equal to 0.5, got {0}", fillRatio)); 64 | } 65 | } 66 | 67 | /// 68 | /// Ensures that Test, Add, and TestAndAdd behave correctly. 69 | /// 70 | [TestMethod] 71 | public void TestScalableBloomTestAndAdd() 72 | { 73 | var f = new ScalableBloomFilter(1000, 0.01, 0.8); 74 | 75 | // 'a' is not in the filter. 
76 | if (f.Test(A_BYTES)) 77 | { 78 | Assert.Fail("'a' should not be a member"); 79 | } 80 | 81 | var addedF = f.Add(A_BYTES); 82 | Assert.AreSame(f, addedF, "Returned ScalableBloomFilter should be the same instance"); 83 | 84 | // 'a' is now in the filter. 85 | if (!f.Test(A_BYTES)) 86 | { 87 | Assert.Fail("'a' should be a member"); 88 | } 89 | 90 | // 'a' is still in the filter. 91 | if (!f.TestAndAdd(A_BYTES)) 92 | { 93 | Assert.Fail("'a' should be a member"); 94 | } 95 | 96 | // 'b' is not in the filter. 97 | if (f.TestAndAdd(B_BYTES)) 98 | { 99 | Assert.Fail("'b' should not be a member"); 100 | } 101 | 102 | // 'a' is still in the filter. 103 | if (!f.Test(A_BYTES)) 104 | { 105 | Assert.Fail("'a' should be a member"); 106 | } 107 | 108 | // 'b' is now in the filter. 109 | if (!f.Test(B_BYTES)) 110 | { 111 | Assert.Fail("'b' should be a member"); 112 | } 113 | 114 | // 'c' is not in the filter. 115 | if (f.Test(C_BYTES)) 116 | { 117 | Assert.Fail("'c' should not be a member"); 118 | } 119 | 120 | for (int i = 0; i < 10000; i++) 121 | { 122 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString())); 123 | } 124 | 125 | // 'x' should not be a false positive. 126 | if (f.Test(X_BYTES)) 127 | { 128 | Assert.Fail("'x' should not be a member"); 129 | } 130 | } 131 | 132 | /// 133 | /// Ensures that Reset sets every bit to zero. 134 | /// 135 | [TestMethod] 136 | public void TestScalableBloomReset() 137 | { 138 | var f = new ScalableBloomFilter(10, 0.1, 0.8); 139 | for (int i = 0; i < 1000; i++) 140 | { 141 | f.Add(Encoding.ASCII.GetBytes(i.ToString())); 142 | } 143 | 144 | var count = f.Filters.Count; 145 | Assert.IsTrue(count > 1, string.Format("Expected more than 1 filter, got {0}", count)); 146 | 147 | var resetF = f.Reset(); 148 | Assert.AreSame(f, resetF, "Returned ScalableBloomFilter should be the same instance"); 149 | 150 | count = f.Filters.Count; 151 | Assert.IsTrue(count == 1, string.Format("Expected 1 filter, got {0}", count)); 152 | 153 | foreach(var partition in f.Filters[0].Partitions) 154 | { 155 | for (uint i = 0; i < partition.count; i++) 156 | { 157 | if (partition.Get(i) != 0) 158 | { 159 | Assert.Fail("Expected all bits to be unset"); 160 | } 161 | } 162 | } 163 | } 164 | 165 | [TestMethod] 166 | public void BenchmarkScalableBloomAdd() 167 | { 168 | var n = 100000; 169 | var f = new ScalableBloomFilter(100000, 0.1, 0.8); 170 | var data = new byte[n][]; 171 | for (int i = 0; i < n; i++) 172 | { 173 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 174 | } 175 | 176 | for (int i = 0; i < n; i++) 177 | { 178 | f.Add(data[i]); 179 | } 180 | } 181 | 182 | [TestMethod] 183 | public void BenchmarkScalableBloomTest() 184 | { 185 | var n = 100000; 186 | var f = new ScalableBloomFilter(100000, 0.1, 0.8); 187 | var data = new byte[n][]; 188 | for (int i = 0; i < n; i++) 189 | { 190 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 191 | } 192 | 193 | for (int i = 0; i < n; i++) 194 | { 195 | f.Test(data[i]); 196 | } 197 | } 198 | 199 | [TestMethod] 200 | public void BenchmarkScalableBloomTestAndAdd() 201 | { 202 | var n = 100000; 203 | var f = new ScalableBloomFilter(100000, 0.1, 0.8); 204 | var data = new byte[n][]; 205 | for (int i = 0; i < n; i++) 206 | { 207 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 208 | } 209 | 210 | for (int i = 0; i < n; i++) 211 | { 212 | f.TestAndAdd(data[i]); 213 | } 214 | } 215 | } 216 | } 217 | -------------------------------------------------------------------------------- /TestProbabilisticDataStructures/TestTopK.cs:
-------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using ProbabilisticDataStructures; 3 | using System.Text; 4 | using System.Linq; 5 | 6 | namespace TestProbabilisticDataStructures 7 | { 8 | [TestClass] 9 | public class TestTopK 10 | { 11 | private static byte[] BOB_BYTES = Encoding.ASCII.GetBytes("bob"); 12 | private static byte[] TYLER_BYTES = Encoding.ASCII.GetBytes("tyler"); 13 | private static byte[] FRED_BYTES = Encoding.ASCII.GetBytes("fred"); 14 | private static byte[] ALICE_BYTES = Encoding.ASCII.GetBytes("alice"); 15 | private static byte[] JAMES_BYTES = Encoding.ASCII.GetBytes("james"); 16 | private static byte[] SARA_BYTES = Encoding.ASCII.GetBytes("sara"); 17 | private static byte[] BILL_BYTES = Encoding.ASCII.GetBytes("bill"); 18 | 19 | /// 20 | /// Ensures that TopK returns the top-k most frequent elements. 21 | /// 22 | [TestMethod] 23 | public void TestTopk() 24 | { 25 | var topK = new TopK(0.001, 0.99, 5); 26 | 27 | topK.Add(BOB_BYTES).Add(BOB_BYTES).Add(BOB_BYTES); 28 | topK.Add(TYLER_BYTES).Add(TYLER_BYTES).Add(TYLER_BYTES).Add(TYLER_BYTES).Add(TYLER_BYTES); 29 | topK.Add(FRED_BYTES); 30 | topK.Add(ALICE_BYTES).Add(ALICE_BYTES).Add(ALICE_BYTES).Add(ALICE_BYTES); 31 | topK.Add(JAMES_BYTES); 32 | topK.Add(FRED_BYTES); 33 | topK.Add(SARA_BYTES).Add(SARA_BYTES); 34 | 35 | var addedK = topK.Add(BILL_BYTES); 36 | Assert.AreSame(topK, addedK); 37 | // The most recently added element, 'bill', is expected to appear as well. 38 | var expected = new ProbabilisticDataStructures.Element[]{ 39 | new ProbabilisticDataStructures.Element{Data=BILL_BYTES, Freq=1}, 40 | new ProbabilisticDataStructures.Element{Data=SARA_BYTES, Freq=2}, 41 | new ProbabilisticDataStructures.Element{Data=BOB_BYTES, Freq=3}, 42 | new ProbabilisticDataStructures.Element{Data=ALICE_BYTES, Freq=4}, 43 | new ProbabilisticDataStructures.Element{Data=TYLER_BYTES, Freq=5}, 44 | }; 45 | 46 | var actual = topK.Elements(); 47 | 48 | Assert.AreEqual(5, actual.Length); 49 | 50 | for (int i = 0; i < actual.Length; i++) 51 | { 52 | var element = actual[i]; 53 | Assert.IsTrue(Enumerable.SequenceEqual(element.Data, expected[i].Data)); 54 | // Frequencies should match as well. 55 | Assert.AreEqual(expected[i].Freq, element.Freq); 56 | } 57 | 58 | var resetK = topK.Reset(); 59 | Assert.AreSame(topK, resetK); 60 | 61 | Assert.AreEqual(0, topK.Elements().Length); 62 | Assert.AreEqual(0u, topK.N); 63 | } 64 | 65 | [TestMethod] 66 | public void BenchmarkTopKAdd() 67 | { 68 | var n = 100000; 69 | var topK = new TopK(0.001, 0.99, 5); 70 | var data = new byte[n][]; 71 | for (int i = 0; i < n; i++) 72 | { 73 | data[i] = Encoding.ASCII.GetBytes(i.ToString()); 74 | } 75 | 76 | for (int i = 0; i < n; i++) 77 | { 78 | topK.Add(data[i]); 79 | } 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | version: 1.0.{build} 2 | configuration: Release 3 | test: on 4 | skip_tags: true 5 | pull_requests: 6 | do_not_increment_build_number: true 7 | build: 8 | verbosity: minimal 9 | assembly_info: 10 | patch: true 11 | file: '**\AssemblyInfo.*' 12 | assembly_version: '{version}' 13 | assembly_file_version: '{version}' 14 | assembly_informational_version: '{version}' 15 | artifacts: 16 | - path: ProbabilisticDataStructures\bin\Release 17 | name: ProbabilisticDataStructures-v$(appveyor_build_version) 18 | --------------------------------------------------------------------------------