├── .github
│   └── ISSUE_TEMPLATE
│       ├── bug_report.md
│       └── feature_request.md
├── .gitignore
├── CONTRIBUTING.md
├── Default.testsettings
├── LICENSE.txt
├── PULL_REQUEST_TEMPLATE.md
├── ProbabilisticDataStructures.sln
├── ProbabilisticDataStructures.vsmdi
├── ProbabilisticDataStructures
│   ├── BloomFilter.cs
│   ├── BloomFilter64.cs
│   ├── Buckets.cs
│   ├── Buckets64.cs
│   ├── CountMinSketch.cs
│   ├── CountingBloomFilter.cs
│   ├── CuckooBloomFilter.cs
│   ├── Defaults.cs
│   ├── DeletableBloomFilter.cs
│   ├── Element.cs
│   ├── ElementHeap.cs
│   ├── HyperLogLog.cs
│   ├── IFilter.cs
│   ├── InverseBloomFilter.cs
│   ├── MinHash.cs
│   ├── PartitionedBloomFilter.cs
│   ├── ProbabilisticDataStructures.csproj
│   ├── ScalableBloomFilter.cs
│   ├── StableBloomFilter.cs
│   ├── TopK.cs
│   └── Utils.cs
├── README.md
├── TestProbabilisticDataStructures
│   ├── Properties
│   │   └── AssemblyInfo.cs
│   ├── TestBloomFilter.cs
│   ├── TestBloomFilter64.cs
│   ├── TestBuckets.cs
│   ├── TestBuckets64.cs
│   ├── TestCountMinSketch.cs
│   ├── TestCountingBloomFilter.cs
│   ├── TestCuckooBloomFilter.cs
│   ├── TestDeletableBloomFilter.cs
│   ├── TestHyperLogLog.cs
│   ├── TestInverseBloomFilter.cs
│   ├── TestMinHash.cs
│   ├── TestPartitionedBloomFilter.cs
│   ├── TestProbabilisticDataStructures.cs
│   ├── TestProbabilisticDataStructures.csproj
│   ├── TestScalableBloomFilter.cs
│   ├── TestStableBloomFilter.cs
│   ├── TestTopK.cs
│   └── Words.cs
└── appveyor.yml
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 |
5 | ---
6 |
7 | **Describe the bug**
8 | A clear and concise description of what the bug is.
9 |
10 | **To Reproduce**
11 | Steps to reproduce the behavior:
12 | 1. Go to '...'
13 | 2. Click on '....'
14 | 3. Scroll down to '....'
15 | 4. See error
16 |
17 | **Expected behavior**
18 | A clear and concise description of what you expected to happen.
19 |
20 | **Screenshots**
21 | If applicable, add screenshots to help explain your problem.
22 |
23 | **Desktop (please complete the following information):**
24 | - OS: [e.g. iOS]
25 | - Browser [e.g. chrome, safari]
26 | - Version [e.g. 22]
27 |
28 | **Smartphone (please complete the following information):**
29 | - Device: [e.g. iPhone6]
30 | - OS: [e.g. iOS8.1]
31 | - Browser [e.g. stock browser, safari]
32 | - Version [e.g. 22]
33 |
34 | **Additional context**
35 | Add any other context about the problem here.
36 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 |
5 | ---
6 |
7 | **Is your feature request related to a problem? Please describe.**
8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
9 |
10 | **Describe the solution you'd like**
11 | A clear and concise description of what you want to happen.
12 |
13 | **Describe alternatives you've considered**
14 | A clear and concise description of any alternative solutions or features you've considered.
15 |
16 | **Additional context**
17 | Add any other context or screenshots about the feature request here.
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ProbabilisticDataStructures.v11.suo
2 | ProbabilisticDataStructures/bin/
3 | ProbabilisticDataStructures/obj/
4 | TestProbabilisticDataStructures/bin/
5 | TestProbabilisticDataStructures/obj/
6 | TestResults/ProbabilisticDataStructures.TE.Tests.mdf
7 | TestResults/ProbabilisticDataStructures.TE.Tests_log.ldf
8 | TestResults/
9 | .vs/
10 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | If you think a change would be useful, make a PR! I would only ask that you inquire about a change before writing it if it seems like an oh-man-this-is-changing-everything type of change.
2 |
--------------------------------------------------------------------------------
/Default.testsettings:
--------------------------------------------------------------------------------
3 | These are default test settings for a local test run.
--------------------------------------------------------------------------------
/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ### What is this PR?
2 | THIS_IS_A_PR_THAT_DOES_X_Y_Z
3 |
4 | ### Things to consider:
5 | - [ ] I added tests for my changes
6 | - [ ] I ran the tests locally and they all passed
7 | - [x] I am awesome for making a contribution
8 |
--------------------------------------------------------------------------------
/ProbabilisticDataStructures.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 15
4 | VisualStudioVersion = 15.0.27428.2043
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ProbabilisticDataStructures", "ProbabilisticDataStructures\ProbabilisticDataStructures.csproj", "{4775E89C-C139-43B0-8436-B456C035C9D9}"
7 | EndProject
8 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TestProbabilisticDataStructures", "TestProbabilisticDataStructures\TestProbabilisticDataStructures.csproj", "{8212EFDE-5134-4914-96D3-C550FD9432F1}"
9 | EndProject
10 | Global
11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
12 | Debug|Any CPU = Debug|Any CPU
13 | Release|Any CPU = Release|Any CPU
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {4775E89C-C139-43B0-8436-B456C035C9D9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
17 | {4775E89C-C139-43B0-8436-B456C035C9D9}.Debug|Any CPU.Build.0 = Debug|Any CPU
18 | {4775E89C-C139-43B0-8436-B456C035C9D9}.Release|Any CPU.ActiveCfg = Release|Any CPU
19 | {4775E89C-C139-43B0-8436-B456C035C9D9}.Release|Any CPU.Build.0 = Release|Any CPU
20 | {8212EFDE-5134-4914-96D3-C550FD9432F1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
21 | {8212EFDE-5134-4914-96D3-C550FD9432F1}.Debug|Any CPU.Build.0 = Debug|Any CPU
22 | {8212EFDE-5134-4914-96D3-C550FD9432F1}.Release|Any CPU.ActiveCfg = Release|Any CPU
23 | {8212EFDE-5134-4914-96D3-C550FD9432F1}.Release|Any CPU.Build.0 = Release|Any CPU
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | GlobalSection(ExtensibilityGlobals) = postSolution
29 | SolutionGuid = {DD9C9C10-6340-471D-BF9D-A6823302D332}
30 | EndGlobalSection
31 | EndGlobal
32 |
--------------------------------------------------------------------------------
/ProbabilisticDataStructures.vsmdi:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ProbabilisticDataStructures/BloomFilter.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Security.Cryptography;
3 |
4 | namespace ProbabilisticDataStructures
5 | {
6 | ///
7 | /// BloomFilter implements a classic Bloom filter. A Bloom filter has a non-zero
8 | /// probability of false positives and a zero probability of false negatives.
9 | ///
10 | public class BloomFilter : IFilter
11 | {
12 | ///
13 | /// Filter data
14 | ///
15 | internal Buckets Buckets { get; set; }
16 | ///
17 | /// Hash algorithm
18 | ///
19 | private HashAlgorithm Hash { get; set; }
20 | ///
21 | /// Filter size
22 | ///
23 | private uint m { get; set; }
24 | ///
25 | /// Number of hash functions
26 | ///
27 | private uint k { get; set; }
28 | ///
29 | /// Number of items added
30 | ///
31 | private uint count { get; set; }
32 |
33 | ///
34 | /// Creates a new Bloom filter optimized to store n items with a specified target
35 | /// false-positive rate.
36 | ///
37 | /// Number of items to store.
38 | /// Desired false positive rate.
39 | public BloomFilter(uint n, double fpRate)
40 | {
41 | var m = Utils.OptimalM(n, fpRate);
42 | var k = Utils.OptimalK(fpRate);
43 | Buckets = new Buckets(m, 1);
44 | Hash = Defaults.GetDefaultHashAlgorithm();
45 | this.m = m;
46 | this.k = k;
47 | }
48 |
49 | ///
50 | /// Returns the Bloom filter capacity, m.
51 | ///
52 | /// The Bloom filter capacity, m.
53 | public uint Capacity()
54 | {
55 | return this.m;
56 | }
57 |
58 | ///
59 | /// Returns the number of hash functions.
60 | ///
61 | /// The number of hash functions.
62 | public uint K()
63 | {
64 | return this.k;
65 | }
66 |
67 | ///
68 | /// Returns the number of items in the filter.
69 | ///
70 | ///
71 | public uint Count()
72 | {
73 | return this.count;
74 | }
75 |
76 | ///
77 | /// Returns the current estimated ratio of set bits.
78 | ///
79 | /// The current estimated ratio of set bits.
80 | public double EstimatedFillRatio()
81 | {
82 | return 1 - Math.Exp((-(double)this.count * (double)this.k) / (double)this.m);
83 | }
84 |
85 | ///
86 | /// Returns the ratio of set bits.
87 | ///
88 | /// The ratio of set bits.
89 | public double FillRatio()
90 | {
91 | uint sum = 0;
92 | for (uint i = 0; i < this.Buckets.count; i++)
93 | {
94 | sum += this.Buckets.Get(i);
95 | }
96 | return (double)sum / (double)this.m;
97 | }
98 |
99 | ///
100 | /// Will test for membership of the data and returns true if it is a member,
101 | /// false if not. This is a probabilistic test, meaning there is a non-zero
102 | /// probability of false positives but a zero probability of false negatives.
103 | ///
104 | /// The data to search for.
105 | /// Whether or not the data is maybe contained in the filter.
106 | public bool Test(byte[] data)
107 | {
108 | var hashKernel = Utils.HashKernel(data, this.Hash);
109 | var lower = hashKernel.LowerBaseHash;
110 | var upper = hashKernel.UpperBaseHash;
111 |
112 | // If any of the K bits are not set, then it's not a member.
113 | for (uint i = 0; i < this.k; i++)
114 | {
115 | if (this.Buckets.Get((lower + upper * i) % this.m) == 0)
116 | {
117 | return false;
118 | }
119 | }
120 | return true;
121 | }
122 |
123 | ///
124 | /// Will add the data to the Bloom filter. It returns the filter to allow
125 | /// for chaining.
126 | ///
127 | /// The data to add.
128 | /// The filter.
129 | public IFilter Add(byte[] data)
130 | {
131 | var hashKernel = Utils.HashKernel(data, this.Hash);
132 | var lower = hashKernel.LowerBaseHash;
133 | var upper = hashKernel.UpperBaseHash;
134 |
135 | // Set the K bits.
136 | for (uint i = 0; i < this.k; i++)
137 | {
138 | this.Buckets.Set((lower + upper * i) % this.m, 1);
139 | }
140 |
141 | this.count++;
142 | return this;
143 | }
144 |
145 | ///
146 | /// Is equivalent to calling Test followed by Add. It returns true if the data is
147 | /// a member, false if not.
148 | ///
149 | /// The data to test for and add if it doesn't exist.
150 | /// Whether or not the data was probably contained in the filter.
151 | public bool TestAndAdd(byte[] data)
152 | {
153 | var hashKernel = Utils.HashKernel(data, this.Hash);
154 | var lower = hashKernel.LowerBaseHash;
155 | var upper = hashKernel.UpperBaseHash;
156 | var member = true;
157 |
158 | // If any of the K bits are not set, then it's not a member.
159 | for (uint i = 0; i < this.k; i++)
160 | {
161 | var idx = (lower + upper * i) % this.m;
162 | if (this.Buckets.Get(idx) == 0)
163 | {
164 | member = false;
165 | }
166 | this.Buckets.Set(idx, 1);
167 | }
168 |
169 | this.count++;
170 | return member;
171 | }
172 |
173 | ///
174 | /// Restores the Bloom filter to its original state. It returns the filter to
175 | /// allow for chaining.
176 | ///
177 | /// The reset bloom filter.
178 | public BloomFilter Reset()
179 | {
180 | this.Buckets.Reset();
181 | return this;
182 | }
183 |
184 | ///
185 | /// Sets the hashing function used in the filter.
186 | ///
187 | /// The HashAlgorithm to use.
188 | // TODO: Add SetHash to the IFilter interface?
189 | public void SetHash(HashAlgorithm h)
190 | {
191 | this.Hash = h;
192 | }
193 | }
194 | }
195 |
--------------------------------------------------------------------------------
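
A minimal usage sketch of the BloomFilter above, using only the public API in this file (the constructor, Add, Test, and TestAndAdd); the item count, false-positive rate, and keys are illustrative assumptions:

    using System;
    using System.Text;
    using ProbabilisticDataStructures;

    class BloomFilterExample
    {
        static void Main()
        {
            // Sized for ~1,000 items at a 1% target false-positive rate; the
            // constructor derives m (bits) and k (hashes) via Utils.OptimalM/OptimalK.
            var filter = new BloomFilter(1000, 0.01);

            filter.Add(Encoding.UTF8.GetBytes("alice"));

            // Never a false negative; occasionally a false positive.
            Console.WriteLine(filter.Test(Encoding.UTF8.GetBytes("alice")));   // True
            Console.WriteLine(filter.Test(Encoding.UTF8.GetBytes("mallory"))); // False (with high probability)

            // TestAndAdd reports prior membership and inserts in one pass.
            Console.WriteLine(filter.TestAndAdd(Encoding.UTF8.GetBytes("bob"))); // False, now added
        }
    }
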
/ProbabilisticDataStructures/BloomFilter64.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 | using System.Threading.Tasks;
6 | using ProbabilisticDataStructures;
7 | using System.Security.Cryptography;
8 |
9 | namespace ProbabilisticDataStructures
10 | {
11 | ///
12 | /// BloomFilter64 implements a classic Bloom filter. A Bloom filter has a non-zero
13 | /// probability of false positives and a zero probability of false negatives.
14 | ///
15 | public class BloomFilter64 : IFilter
16 | {
17 | ///
18 | /// Filter data
19 | ///
20 | internal Buckets64 Buckets { get; set; }
21 | ///
22 | /// Hash algorithm
23 | ///
24 | private HashAlgorithm Hash { get; set; }
25 | ///
26 | /// Filter size
27 | ///
28 | private ulong m { get; set; }
29 | ///
30 | /// Number of hash functions
31 | ///
32 | private uint k { get; set; }
33 | ///
34 | /// Number of items added
35 | ///
36 | private ulong count { get; set; }
37 |
38 | ///
39 | /// Creates a new Bloom filter optimized to store n items with a specified target
40 | /// false-positive rate.
41 | ///
42 | /// Number of items to store.
43 | /// Desired false positive rate.
44 | public BloomFilter64(ulong n, double fpRate)
45 | {
46 | var m = Utils.OptimalM64(n, fpRate);
47 | var k = Utils.OptimalK(fpRate);
48 | Buckets = new Buckets64(m, 1);
49 | Hash = Defaults.GetDefaultHashAlgorithm();
50 | this.m = m;
51 | this.k = k;
52 | }
53 |
54 | ///
55 | /// Returns the Bloom filter capacity, m.
56 | ///
57 | /// The Bloom filter capacity, m.
58 | public ulong Capacity()
59 | {
60 | return this.m;
61 | }
62 |
63 | ///
64 | /// Returns the number of hash functions.
65 | ///
66 | /// The number of hash functions.
67 | public uint K()
68 | {
69 | return this.k;
70 | }
71 |
72 | ///
73 | /// Returns the number of items in the filter.
74 | ///
75 | ///
76 | public ulong Count()
77 | {
78 | return this.count;
79 | }
80 |
81 | ///
82 | /// Returns the current estimated ratio of set bits.
83 | ///
84 | /// The current estimated ratio of set bits.
85 | public double EstimatedFillRatio()
86 | {
87 | return 1 - Math.Exp((-(double)this.count * (double)this.k) / (double)this.m);
88 | }
89 |
90 | ///
91 | /// Returns the ratio of set bits.
92 | ///
93 | /// The ratio of set bits.
94 | public double FillRatio()
95 | {
96 | ulong sum = 0;
97 | for (ulong i = 0; i < this.Buckets.count; i++)
98 | {
99 | sum += this.Buckets.Get(i);
100 | }
101 | return (double)sum / (double)this.m;
102 | }
103 |
104 | ///
105 | /// Will test for membership of the data and returns true if it is a member,
106 | /// false if not. This is a probabilistic test, meaning there is a non-zero
107 | /// probability of false positives but a zero probability of false negatives.
108 | ///
109 | /// The data to search for.
110 | /// Whether or not the data is maybe contained in the filter.
111 | public bool Test(byte[] data)
112 | {
113 | var hashKernel = Utils.HashKernel128(data, this.Hash);
114 | var lower = hashKernel.LowerBaseHash;
115 | var upper = hashKernel.UpperBaseHash;
116 |
117 | // If any of the K bits are not set, then it's not a member.
118 | for (uint i = 0; i < this.k; i++)
119 | {
120 | if (this.Buckets.Get((lower + upper * i) % this.m) == 0)
121 | {
122 | return false;
123 | }
124 | }
125 | return true;
126 | }
127 |
128 | ///
129 | /// Will add the data to the Bloom filter. It returns the filter to allow
130 | /// for chaining.
131 | ///
132 | /// The data to add.
133 | /// The filter.
134 | public IFilter Add(byte[] data)
135 | {
136 | var hashKernel = Utils.HashKernel128(data, this.Hash);
137 | var lower = hashKernel.LowerBaseHash;
138 | var upper = hashKernel.UpperBaseHash;
139 |
140 | // Set the K bits.
141 | for (uint i = 0; i < this.k; i++)
142 | {
143 | this.Buckets.Set((lower + upper * i) % this.m, 1);
144 | }
145 |
146 | this.count++;
147 | return this;
148 | }
149 |
150 | ///
151 | /// Is equivalent to calling Test followed by Add. It returns true if the data is
152 | /// a member, false if not.
153 | ///
154 | /// The data to test for and add if it doesn't exist.
155 | /// Whether or not the data was probably contained in the filter.
156 | public bool TestAndAdd(byte[] data)
157 | {
158 | var hashKernel = Utils.HashKernel128(data, this.Hash);
159 | var lower = hashKernel.LowerBaseHash;
160 | var upper = hashKernel.UpperBaseHash;
161 | var member = true;
162 |
163 | // If any of the K bits are not set, then it's not a member.
164 | for (uint i = 0; i < this.k; i++)
165 | {
166 | var idx = (lower + upper * i) % this.m;
167 | if (this.Buckets.Get(idx) == 0)
168 | {
169 | member = false;
170 | }
171 | this.Buckets.Set(idx, 1);
172 | }
173 |
174 | this.count++;
175 | return member;
176 | }
177 |
178 | ///
179 | /// Restores the Bloom filter to its original state. It returns the filter to
180 | /// allow for chaining.
181 | ///
182 | /// The reset bloom filter.
183 | public BloomFilter64 Reset()
184 | {
185 | this.Buckets.Reset();
186 | return this;
187 | }
188 |
189 | ///
190 | /// Sets the hashing function used in the filter.
191 | ///
192 | /// The HashAlgorithm to use.
193 | // TODO: Add SetHash to the IFilter interface?
194 | public void SetHash(HashAlgorithm h)
195 | {
196 | this.Hash = h;
197 | }
198 | }
199 | }
200 |
--------------------------------------------------------------------------------
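
BloomFilter64 mirrors the filter above but sizes and indexes with 64-bit values, so it can exceed the uint bit-array limit of BloomFilter. A sketch; the item count is an illustration only (a filter this size allocates a multi-gigabyte bit array):

    // Five billion items at a 1% false-positive rate would overflow a
    // uint-sized filter, so the 64-bit variant is used instead.
    var big = new BloomFilter64(5_000_000_000UL, 0.01);
    big.Add(System.Text.Encoding.UTF8.GetBytes("event-42"));
    bool seen = big.Test(System.Text.Encoding.UTF8.GetBytes("event-42")); // true
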
/ProbabilisticDataStructures/Buckets.cs:
--------------------------------------------------------------------------------
1 | namespace ProbabilisticDataStructures
2 | {
3 | ///
4 | /// Buckets is a fast, space-efficient array of buckets where each bucket can store
5 | /// up to a configured maximum value.
6 | ///
7 | public class Buckets
8 | {
9 | private byte[] Data { get; set; }
10 | private byte bucketSize { get; set; }
11 | private byte _max;
12 | private int Max
13 | {
14 | get
15 | {
16 | return _max;
17 | }
18 | set
19 | {
20 | // TODO: Figure out this truncation thing.
21 | // I'm not sure if MaxValue is always supposed to be capped at 255 via
22 | // a byte conversion or not...
23 | if (value > byte.MaxValue)
24 | _max = byte.MaxValue;
25 | else
26 | _max = (byte)value;
27 | }
28 | }
29 | internal uint count { get; set; }
30 |
31 | ///
32 | /// Creates a new Buckets with the provided number of buckets where each bucket
33 | /// is the specified number of bits.
34 | ///
35 | /// Number of buckets.
36 | /// Number of bits per bucket.
37 | internal Buckets(uint count, byte bucketSize)
38 | {
39 | this.count = count;
40 | this.Data = new byte[(count * bucketSize + 7) / 8];
41 | this.bucketSize = bucketSize;
42 | this.Max = (1 << bucketSize) - 1;
43 | }
44 |
45 | ///
46 | /// Returns the maximum value that can be stored in a bucket.
47 | ///
48 | /// The bucket max value.
49 | internal byte MaxBucketValue()
50 | {
51 | return this._max;
52 | }
53 |
54 | ///
55 | /// Increment the value in the specified bucket by the provided delta. A bucket
56 | /// can be decremented by providing a negative delta.
57 | ///
58 | /// The value is clamped to zero and the maximum bucket value. Returns itself
59 | /// to allow for chaining.
60 | ///
61 | ///
62 | /// The bucket to increment.
63 | /// The amount to increment the bucket by.
64 | /// The modified bucket.
65 | internal Buckets Increment(uint bucket, int delta)
66 | {
67 | int val = (int)(GetBits(bucket * this.bucketSize, this.bucketSize) + delta);
68 |
69 | if (val > this.Max)
70 | val = this.Max;
71 | else if (val < 0)
72 | val = 0;
73 |
74 | SetBits((uint)bucket * (uint)this.bucketSize, this.bucketSize, (uint)val);
75 | return this;
76 | }
77 |
78 | ///
79 | /// Set the bucket value. The value is clamped to zero and the maximum bucket
80 | /// value. Returns itself to allow for chaining.
81 | ///
82 | /// The bucket to change the value of.
83 | /// The value to set.
84 | /// The modified bucket.
85 | internal Buckets Set(uint bucket, byte value)
86 | {
87 | if (value > this._max)
88 | value = this._max;
89 |
90 | SetBits(bucket * this.bucketSize, this.bucketSize, value);
91 | return this;
92 | }
93 |
94 | ///
95 | /// Returns the value in the specified bucket.
96 | ///
97 | /// The bucket to get.
98 | /// The specified bucket.
99 | internal uint Get(uint bucket)
100 | {
101 | return GetBits(bucket * this.bucketSize, this.bucketSize);
102 | }
103 |
104 | ///
105 | /// Restores the Buckets to the original state. Returns itself to allow for
106 | /// chaining.
107 | ///
108 | /// The Buckets object the reset operation was performed on.
109 | internal Buckets Reset()
110 | {
111 | this.Data = new byte[(this.count * this.bucketSize + 7) / 8];
112 | return this;
113 | }
114 |
115 | ///
116 | /// Returns the bits at the specified offset and length.
117 | ///
118 | /// The position to start reading at.
119 | /// The distance to read from the offset.
120 | /// The bits at the specified offset and length.
121 | internal uint GetBits(uint offset, int length)
122 | {
123 | uint byteIndex = offset / 8;
124 | int byteOffset = (int)(offset % 8);
125 |
126 | if ((byteOffset + length) > 8)
127 | {
128 | int rem = 8 - byteOffset;
129 | return GetBits(offset, rem)
130 | | (GetBits((uint)(offset + rem), length - rem) << rem);
131 | }
132 |
133 | int bitMask = (1 << length) - 1;
134 | return (uint)((this.Data[byteIndex] & (bitMask << byteOffset)) >> byteOffset);
135 | }
136 |
137 | ///
138 | /// Sets bits at the specified offset and length.
139 | ///
140 | /// The position to start writing at.
141 | /// The distance to write from the offset.
142 | /// The bits to write.
143 | internal void SetBits(uint offset, int length, uint bits)
144 | {
145 | uint byteIndex = offset / 8;
146 | int byteOffset = (int)(offset % 8);
147 |
148 | if ((byteOffset + length) > 8)
149 | {
150 | int rem = 8 - byteOffset;
151 | SetBits(offset, (byte)rem, bits);
152 | SetBits((uint)(offset + rem), length - rem, bits >> rem);
153 | return;
154 | }
155 |
156 | int bitMask = (1 << length) - 1;
157 | this.Data[byteIndex] =
158 | (byte)((this.Data[byteIndex]) & ~(bitMask << byteOffset));
159 | this.Data[byteIndex] =
160 | (byte)((this.Data[byteIndex]) | ((bits & bitMask) << byteOffset));
161 | }
162 | }
163 | }
164 |
--------------------------------------------------------------------------------
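
The GetBits/SetBits pair above is the heart of Buckets: counters of bucketSize bits are packed back-to-back into a byte array, and accesses that straddle a byte boundary are stitched together recursively. A distilled, read-only restatement of that logic (a standalone sketch, not library code):

    // Read `length` bits starting at absolute bit position `offset`.
    static uint GetBits(byte[] data, uint offset, int length)
    {
        uint byteIndex = offset / 8;
        int byteOffset = (int)(offset % 8);

        if (byteOffset + length > 8)
        {
            // The value crosses a byte boundary: take the low piece from
            // this byte, the high piece from the next, and OR them together.
            int rem = 8 - byteOffset;
            return GetBits(data, offset, rem)
                | (GetBits(data, offset + (uint)rem, length - rem) << rem);
        }

        int mask = (1 << length) - 1;
        return (uint)((data[byteIndex] >> byteOffset) & mask);
    }
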
/ProbabilisticDataStructures/Buckets64.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 | using System.Threading.Tasks;
6 |
7 | namespace ProbabilisticDataStructures
8 | {
9 | ///
10 | /// Buckets64 is a fast, space-efficient array of buckets where each bucket can store
11 | /// up to a configured maximum value.
12 | ///
13 | public class Buckets64
14 | {
15 | // Bytes per backing array: the largest power-of-two length that fits safely under the CLR's maximum array size.
16 | private const uint maxArraySize = 1U << 30;
17 | private byte[][] Data { get; set; }
18 | private int arrayCount { get; set; }
19 | private byte bucketSize { get; set; }
20 | private byte _max;
21 | private int Max
22 | {
23 | get
24 | {
25 | return _max;
26 | }
27 | set
28 | {
29 | // TODO: Figure out this truncation thing.
30 | // I'm not sure if MaxValue is always supposed to be capped at 255 via
31 | // a byte conversion or not...
32 | if (value > byte.MaxValue)
33 | _max = byte.MaxValue;
34 | else
35 | _max = (byte)value;
36 | }
37 | }
38 | internal ulong count { get; set; }
39 |
40 | ///
41 | /// Creates a new Buckets64 with the provided number of buckets where each bucket
42 | /// is the specified number of bits.
43 | ///
44 | /// Number of buckets.
45 | /// Number of bits per bucket.
46 | internal Buckets64(ulong count, byte bucketSize)
47 | {
48 | this.count = count;
49 | this.bucketSize = bucketSize;
50 | AllocateArray(count, bucketSize);
51 | this.Max = (1 << bucketSize) - 1;
52 | }
53 |
54 | private void AllocateArray(ulong count, byte bucketSize)
55 | {
56 | this.arrayCount = (int)(count / maxArraySize + 1);
57 | this.Data = new byte[this.arrayCount][];
58 | var bytesToAllocate = (count * bucketSize + 7) / 8;
59 | for (int i = 0; i < this.arrayCount; i++)
60 | {
61 | var arraySize = Math.Min(bytesToAllocate, maxArraySize);
62 | this.Data[i] = new byte[arraySize];
63 | bytesToAllocate -= arraySize;
64 | }
65 | }
66 |
67 | ///
68 | /// Returns the maximum value that can be stored in a bucket.
69 | ///
70 | /// The bucket max value.
71 | internal byte MaxBucketValue()
72 | {
73 | return this._max;
74 | }
75 |
76 | ///
77 | /// Increment the value in the specified bucket by the provided delta. A bucket
78 | /// can be decremented by providing a negative delta.
79 | ///
80 | /// The value is clamped to zero and the maximum bucket value. Returns itself
81 | /// to allow for chaining.
82 | ///
83 | ///
84 | /// The bucket to increment.
85 | /// The amount to increment the bucket by.
86 | /// The modified bucket.
87 | internal Buckets64 Increment(uint bucket, int delta)
88 | {
89 | int val = (int)(GetBits(bucket * this.bucketSize, this.bucketSize) + delta);
90 |
91 | if (val > this.Max)
92 | val = this.Max;
93 | else if (val < 0)
94 | val = 0;
95 |
96 | SetBits((uint)bucket * (uint)this.bucketSize, this.bucketSize, (uint)val);
97 | return this;
98 | }
99 |
100 | ///
101 | /// Set the bucket value. The value is clamped to zero and the maximum bucket
102 | /// value. Returns itself to allow for chaining.
103 | ///
104 | /// The bucket to change the value of.
105 | /// The value to set.
106 | /// The modified bucket.
107 | internal Buckets64 Set(ulong bucket, byte value)
108 | {
109 | if (value > this._max)
110 | value = this._max;
111 |
112 | SetBits(bucket * this.bucketSize, this.bucketSize, value);
113 | return this;
114 | }
115 |
116 | ///
117 | /// Returns the value in the specified bucket.
118 | ///
119 | /// The bucket to get.
120 | /// The specified bucket.
121 | internal uint Get(ulong bucket)
122 | {
123 | return GetBits(bucket * this.bucketSize, this.bucketSize);
124 | }
125 |
126 | ///
127 | /// Restores the Buckets64 to the original state. Returns itself to allow for
128 | /// chaining.
129 | ///
130 | /// The Buckets64 object the reset operation was performed on.
131 | internal Buckets64 Reset()
132 | {
133 | AllocateArray(this.count, this.bucketSize);
134 | return this;
135 | }
136 |
137 | ///
138 | /// Returns the bits at the specified offset and length.
139 | ///
140 | /// The position to start reading at.
141 | /// The distance to read from the offset.
142 | /// The bits at the specified offset and length.
143 | internal uint GetBits(ulong offset, int length)
144 | {
145 | ulong byteIndex = offset / 8;
146 | int byteOffset = (int)(offset % 8);
147 |
148 | if ((byteOffset + length) > 8)
149 | {
150 | int rem = 8 - byteOffset;
151 | return GetBits(offset, rem)
152 | | (GetBits(offset + (ulong)rem, length - rem) << rem);
153 | }
154 |
155 | var dataArray = this.Data[byteIndex / maxArraySize];
156 | var dataArrayByteIndex = byteIndex % maxArraySize;
157 | int bitMask = (1 << length) - 1;
158 | return (uint)((dataArray[dataArrayByteIndex] & (bitMask << byteOffset)) >> byteOffset);
159 | }
160 |
161 | ///
162 | /// Sets bits at the specified offset and length.
163 | ///
164 | /// The position to start writing at.
165 | /// The distance to write from the offset.
166 | /// The bits to write.
167 | internal void SetBits(ulong offset, int length, uint bits)
168 | {
169 | ulong byteIndex = offset / 8;
170 | int byteOffset = (int)(offset % 8);
171 |
172 | if ((byteOffset + length) > 8)
173 | {
174 | int rem = 8 - byteOffset;
175 | SetBits(offset, (byte)rem, bits);
176 | SetBits(offset + (ulong)rem, length - rem, bits >> rem);
177 | return;
178 | }
179 |
180 | var dataArray = this.Data[(uint)(byteIndex / maxArraySize)];
181 | var dataArrayByteIndex = (uint)(byteIndex % maxArraySize);
182 | int bitMask = (1 << length) - 1;
183 | dataArray[dataArrayByteIndex] =
184 | (byte)((dataArray[dataArrayByteIndex]) & ~(bitMask << byteOffset));
185 | dataArray[dataArrayByteIndex] =
186 | (byte)((dataArray[dataArrayByteIndex]) | ((bits & bitMask) << byteOffset));
187 | }
188 | }
189 | }
190 |
--------------------------------------------------------------------------------
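
Because a single C# array cannot span the full 64-bit index range, Buckets64 shards its storage into 2^30-byte chunks and maps every absolute byte index to a (chunk, offset) pair. The mapping reduces to one division and one modulus (a worked sketch with hypothetical names):

    const uint maxArraySize = 1U << 30; // bytes per chunk, as in Buckets64

    // Map an absolute byte index into (which chunk, offset within that chunk).
    static (ulong chunk, ulong offset) Locate(ulong byteIndex)
    {
        return (byteIndex / maxArraySize, byteIndex % maxArraySize);
    }

    // Example: byteIndex = 3 * (1UL << 30) + 17 lands in chunk 3 at offset 17.
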
/ProbabilisticDataStructures/CountMinSketch.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Security.Cryptography;
3 |
4 | namespace ProbabilisticDataStructures
5 | {
6 | ///
7 | /// CountMinSketch implements a Count-Min Sketch as described by Cormode and
8 | /// Muthukrishnan in An Improved Data Stream Summary: The Count-Min Sketch and its
9 | /// Applications:
10 | ///
11 | /// http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
12 | ///
13 | /// A Count-Min Sketch (CMS) is a probabilistic data structure which approximates
14 | /// the frequency of events in a data stream. Unlike a hash map, a CMS uses
15 | /// sub-linear space at the expense of a configurable error factor. Similar to
16 | /// Counting Bloom filters, items are hashed to a series of buckets, which increment
17 | /// a counter. The frequency of an item is estimated by taking the minimum of each of
18 | /// the item's respective counter values.
19 | ///
20 | /// Count-Min Sketches are useful for counting the frequency of events in massive
21 | /// data sets or unbounded streams online. In these situations, storing the entire
22 | /// data set or allocating counters for every event in memory is impractical. It may
23 | /// be possible for offline processing, but real-time processing requires fast,
24 | /// space-efficient solutions like the CMS. For approximating set cardinality, refer
25 | /// to the HyperLogLog.
26 | ///
27 | public class CountMinSketch
28 | {
29 | ///
30 | /// Count matrix
31 | ///
32 | internal UInt64[][] Matrix { get; set; }
33 | ///
34 | /// Matrix width
35 | ///
36 | internal uint Width { get; set; }
37 | ///
38 | /// Matrix depth
39 | ///
40 | internal uint Depth { get; set; }
41 | ///
42 | /// Number of items added
43 | ///
44 | private UInt64 count { get; set; }
45 | ///
46 | /// Relative-accuracy factor
47 | ///
48 | private double epsilon { get; set; }
49 | ///
50 | /// Relative-accuracy probability
51 | ///
52 | private double delta { get; set; }
53 | ///
54 | /// Hash function
55 | ///
56 | private HashAlgorithm Hash { get; set; }
57 |
58 | ///
59 | /// Creates a new Count-Min Sketch whose relative accuracy is within a factor of
60 | /// epsilon with probability 1 - delta. Both of these parameters affect the space and
61 | /// time complexity.
62 | ///
63 | /// Relative-accuracy factor
64 | /// Relative-accuracy probability
65 | public CountMinSketch(double epsilon, double delta)
66 | {
67 | var width = (uint)(Math.Ceiling(Math.E / epsilon));
68 | var depth = (uint)(Math.Ceiling(Math.Log(1 / delta)));
69 | this.Matrix = new UInt64[depth][];
70 |
71 | for (int i = 0; i < depth; i++)
72 | {
73 | this.Matrix[i] = new UInt64[width];
74 | }
75 |
76 | this.Width = width;
77 | this.Depth = depth;
78 | this.epsilon = epsilon;
79 | this.delta = delta;
80 | this.Hash = Defaults.GetDefaultHashAlgorithm();
81 | }
82 |
83 | ///
84 | /// Returns the relative-accuracy factor, epsilon.
85 | ///
86 | /// The relative-accuracy factor, epsilon
87 | public double Epsilon()
88 | {
89 | return this.epsilon;
90 | }
91 |
92 | ///
93 | /// Returns the relative-accuracy probability, delta.
94 | ///
95 | /// The relative-accuracy probability, delta
96 | public double Delta()
97 | {
98 | return this.delta;
99 | }
100 |
101 | ///
102 | /// Returns the number of items added to the sketch.
103 | ///
104 | /// The number of items added to the sketch.
105 | public UInt64 TotalCount()
106 | {
107 | return this.count;
108 | }
109 |
110 | ///
111 | /// Add the data to the set. Returns the CountMinSketch to allow for chaining.
112 | ///
113 | /// The data to add.
114 | /// The CountMinSketch
115 | public CountMinSketch Add(byte[] data)
116 | {
117 | var hashKernel = Utils.HashKernel(data, this.Hash);
118 | var lower = hashKernel.LowerBaseHash;
119 | var upper = hashKernel.UpperBaseHash;
120 |
121 | // Increment count in each row.
122 | for (uint i = 0; i < this.Depth; i++)
123 | {
124 | this.Matrix[i][(lower + upper * i) % this.Width]++;
125 | }
126 |
127 | this.count++;
128 | return this;
129 | }
130 |
131 | ///
132 | /// Returns the approximate count for the specified item, correct within
133 | /// epsilon * total count with a probability of 1 - delta.
134 | ///
135 | ///
136 | /// The data to count.
137 | public UInt64 Count(byte[] data)
138 | {
139 | var hashKernel = Utils.HashKernel(data, this.Hash);
140 | var lower = hashKernel.LowerBaseHash;
141 | var upper = hashKernel.UpperBaseHash;
142 | var count = UInt64.MaxValue;
143 |
144 | for (uint i = 0; i < this.Depth; i++)
145 | {
146 | count = Math.Min(count, this.Matrix[i][(lower + upper * i) % this.Width]);
147 | }
148 |
149 | return count;
150 | }
151 |
152 | ///
153 | /// Combines this CountMinSketch with another. Returns true if the merge was
154 | /// successful. Throws an exception if the matrix widths and depths do not match.
155 | ///
156 | /// The CountMinSketch to merge with the current
157 | /// instance.
158 | /// True if successful.
159 | public bool Merge(CountMinSketch other)
160 | {
161 | if (this.Depth != other.Depth)
162 | {
163 | throw new Exception("Matrix depth must match.");
164 | }
165 |
166 | if (this.Width != other.Width)
167 | {
168 | throw new Exception("Matrix width must match.");
169 | }
170 |
171 | for (uint i = 0; i < this.Depth; i++)
172 | {
173 | for (int j = 0; j < this.Width; j++)
174 | {
175 | this.Matrix[i][j] += other.Matrix[i][j];
176 | }
177 | }
178 |
179 | this.count += other.count;
180 | return true;
181 | }
182 |
183 | ///
184 | /// Restores the CountMinSketch to its original state. It returns itself to allow
185 | /// for chaining.
186 | ///
187 | /// The CountMinSketch
188 | public CountMinSketch Reset()
189 | {
190 | this.Matrix = new UInt64[this.Depth][];
191 | for (uint i = 0; i < this.Depth; i++)
192 | {
193 | this.Matrix[i] = new UInt64[this.Width];
194 | }
195 |
196 | this.count = 0;
197 | return this;
198 | }
199 |
200 | ///
201 | /// Sets the hashing function used in the filter.
202 | ///
203 | /// The HashAlgorithm to use.
204 | public void SetHash(HashAlgorithm h)
205 | {
206 | this.Hash = h;
207 | }
208 |
209 | // TODO: Implement these later.
210 | // WriteDataTo()
211 | // ReadDataFrom()
212 | }
213 | }
214 |
--------------------------------------------------------------------------------
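
From the constructor above, the sketch is sized as width w = ceil(e / epsilon) and depth d = ceil(ln(1 / delta)), so epsilon = 0.001 and delta = 0.01 give a matrix 5 rows deep and 2719 counters wide. A usage sketch with invented stream data:

    var cms = new CountMinSketch(epsilon: 0.001, delta: 0.01); // w = 2719, d = 5

    foreach (var word in new[] { "to", "be", "or", "not", "to", "be" })
        cms.Add(System.Text.Encoding.UTF8.GetBytes(word));

    // The estimate never undercounts; with probability 1 - delta its
    // overcount is at most epsilon * TotalCount().
    ulong freq = cms.Count(System.Text.Encoding.UTF8.GetBytes("be")); // 2
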
/ProbabilisticDataStructures/CountingBloomFilter.cs:
--------------------------------------------------------------------------------
1 | using System.Security.Cryptography;
2 |
3 | namespace ProbabilisticDataStructures
4 | {
5 | ///
6 | /// CountingBloomFilter implements a Counting Bloom Filter as described by Fan,
7 | /// Cao, Almeida, and Broder in Summary Cache: A Scalable Wide-Area Web Cache
8 | /// Sharing Protocol:
9 | ///
10 | /// http://pages.cs.wisc.edu/~jussara/papers/00ton.pdf
11 | ///
12 | /// A Counting Bloom Filter (CBF) provides a way to remove elements by using an
13 | /// array of n-bit buckets. When an element is added, the respective buckets are
14 | /// incremented. To remove an element, the respective buckets are decremented. A
15 | /// query checks that each of the respective buckets are non-zero. Because CBFs
16 | /// allow elements to be removed, they introduce a non-zero probability of false
17 | /// negatives in addition to the possibility of false positives.
18 | ///
19 | /// Counting Bloom Filters are useful for cases where elements are both added
20 | /// and removed from the data set. Since they use n-bit buckets, CBFs use
21 | /// roughly n-times more memory than traditional Bloom filters.
22 | ///
23 | public class CountingBloomFilter : IFilter
24 | {
25 | ///
26 | /// Filter data
27 | ///
28 | internal Buckets Buckets { get; set; }
29 | ///
30 | /// Hash algorithm
31 | ///
32 | private HashAlgorithm Hash { get; set; }
33 | ///
34 | /// Filter size
35 | ///
36 | private uint m { get; set; }
37 | ///
38 | /// Number of hash functions
39 | ///
40 | private uint k { get; set; }
41 | ///
42 | /// Number of items added
43 | ///
44 | private uint count { get; set; }
45 | ///
46 | /// Buffer used to cache indices
47 | ///
48 | private uint[] indexBuffer { get; set; }
49 |
50 | ///
51 | /// Creates a new Counting Bloom Filter optimized to store n items with a
52 | /// specified target false-positive rate and bucket size. If you don't know how
53 | /// many bits to use for buckets, use NewDefaultCountingBloomFilter for a
54 | /// sensible default.
55 | ///
56 | /// Number of items to store.
57 | /// Bucket size.
58 | /// Desired false positive rate.
59 | public CountingBloomFilter(uint n, byte b, double fpRate)
60 | {
61 | var m = Utils.OptimalM(n, fpRate);
62 | var k = Utils.OptimalK(fpRate);
63 | this.Buckets = new Buckets(m, b);
64 | this.Hash = Defaults.GetDefaultHashAlgorithm();
65 | this.m = m;
66 | this.k = k;
67 | this.indexBuffer = new uint[k];
68 | }
69 |
70 | ///
71 | /// Creates a new Counting Bloom Filter optimized to store n items with a
72 | /// specified target false-positive rate. Buckets are allocated four bits.
73 | ///
74 | /// Number of items to store.
75 | /// Desired false positive rate.
76 | /// Default CountingBloomFilter
77 | public static CountingBloomFilter NewDefaultCountingBloomFilter(
78 | uint n,
79 | double fpRate)
80 | {
81 | return new CountingBloomFilter(n, 4, fpRate);
82 | }
83 |
84 | ///
85 | /// Returns the Bloom filter capacity, m.
86 | ///
87 | /// The Bloom filter capacity, m.
88 | public uint Capacity()
89 | {
90 | return this.m;
91 | }
92 |
93 | ///
94 | /// Returns the number of hash functions.
95 | ///
96 | /// The number of hash functions.
97 | public uint K()
98 | {
99 | return this.k;
100 | }
101 |
102 | ///
103 | /// Returns the number of items in the filter.
104 | ///
105 | ///
106 | public uint Count()
107 | {
108 | return this.count;
109 | }
110 |
111 | ///
112 | /// Will test for membership of the data and returns true if it is a member,
113 | /// false if not. This is a probabilistic test, meaning there is a non-zero
114 | /// probability of false positives but a zero probability of false negatives.
115 | ///
116 | /// The data to search for.
117 | /// Whether or not the data is maybe contained in the filter.
118 | public bool Test(byte[] data)
119 | {
120 | var hashKernel = Utils.HashKernel(data, this.Hash);
121 | var lower = hashKernel.LowerBaseHash;
122 | var upper = hashKernel.UpperBaseHash;
123 |
124 | // If any of the K bits are not set, then it's not a member.
125 | for (uint i = 0; i < this.k; i++)
126 | {
127 | if (this.Buckets.Get((lower + upper * i) % this.m) == 0)
128 | {
129 | return false;
130 | }
131 | }
132 | return true;
133 | }
134 |
135 | ///
136 | /// Will add the data to the Bloom filter. It returns the filter to allow
137 | /// for chaining.
138 | ///
139 | /// The data to add.
140 | /// The filter.
141 | public IFilter Add(byte[] data)
142 | {
143 | var hashKernel = Utils.HashKernel(data, this.Hash);
144 | var lower = hashKernel.LowerBaseHash;
145 | var upper = hashKernel.UpperBaseHash;
146 |
147 | // Set the K bits.
148 | for (uint i = 0; i < this.k; i++)
149 | {
150 | this.Buckets.Increment((lower + upper * i) % this.m, 1);
151 | }
152 |
153 | this.count++;
154 | return this;
155 | }
156 |
157 | ///
158 | /// Is equivalent to calling Test followed by Add. It returns true if the data is
159 | /// a member, false if not.
160 | ///
161 | /// The data to test for and add if it doesn't exist.
162 | /// Whether or not the data was probably contained in the filter.
163 | public bool TestAndAdd(byte[] data)
164 | {
165 | var hashKernel = Utils.HashKernel(data, this.Hash);
166 | var lower = hashKernel.LowerBaseHash;
167 | var upper = hashKernel.UpperBaseHash;
168 | var member = true;
169 |
170 | // If any of the K bits are not set, then it's not a member.
171 | for (uint i = 0; i < this.k; i++)
172 | {
173 | var idx = (lower + upper * i) % this.m;
174 | if (this.Buckets.Get(idx) == 0)
175 | {
176 | member = false;
177 | }
178 | this.Buckets.Increment(idx, 1);
179 | }
180 |
181 | this.count++;
182 | return member;
183 | }
184 |
185 | ///
186 | /// Will test for membership of the data and remove it from the filter if it
187 | /// exists. Returns true if the data was a member, false if not.
188 | ///
189 | /// The data to check for and remove.
190 | /// Whether or not the data was in the filter before removal.
191 | public bool TestAndRemove(byte[] data)
192 | {
193 | var hashKernel = Utils.HashKernel(data, this.Hash);
194 | var lower = hashKernel.LowerBaseHash;
195 | var upper = hashKernel.UpperBaseHash;
196 | var member = true;
197 |
198 | // Set the K bits.
199 | for (uint i = 0; i < this.k; i++)
200 | {
201 | this.indexBuffer[i] = (lower + upper * i) % this.m;
202 | if (this.Buckets.Get(this.indexBuffer[i]) == 0)
203 | {
204 | member = false;
205 | }
206 | }
207 |
208 | if (member)
209 | {
210 | foreach (var idx in this.indexBuffer)
211 | {
212 | this.Buckets.Increment(idx, -1);
213 | }
214 | this.count--;
215 | }
216 |
217 | return member;
218 | }
219 |
220 | ///
221 | /// Restores the Bloom filter to its original state. It returns the filter to
222 | /// allow for chaining.
223 | ///
224 | /// The reset bloom filter.
225 | public CountingBloomFilter Reset()
226 | {
227 | this.Buckets.Reset();
228 | this.count = 0;
229 | return this;
230 | }
231 |
232 | ///
233 | /// Sets the hashing function used in the filter.
234 | ///
235 | /// The HashAlgorithm to use.
236 | // TODO: Add SetHash to the IFilter interface?
237 | public void SetHash(HashAlgorithm h)
238 | {
239 | this.Hash = h;
240 | }
241 | }
242 | }
243 |
--------------------------------------------------------------------------------
/ProbabilisticDataStructures/Defaults.cs:
--------------------------------------------------------------------------------
1 | using System.Security.Cryptography;
2 | using System.Runtime.CompilerServices;
3 | [assembly: InternalsVisibleTo("TestProbabilisticDataStructures")]
4 |
5 | namespace ProbabilisticDataStructures
6 | {
7 | public static class Defaults
8 | {
9 | public const double FILL_RATIO = 0.5;
10 |
11 | ///
12 | /// Returns the default hashing algorithm for the library.
13 | ///
14 | /// The default hashing algorithm for the library
15 | internal static HashAlgorithm GetDefaultHashAlgorithm()
16 | {
17 | return HashAlgorithm.Create("MD5");
18 | }
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/ProbabilisticDataStructures/DeletableBloomFilter.cs:
--------------------------------------------------------------------------------
1 | using System.Security.Cryptography;
2 |
3 | namespace ProbabilisticDataStructures
4 | {
5 | ///
6 | /// DeletableBloomFilter implements a Deletable Bloom Filter as described by
7 | /// Rothenberg, Macapuna, Verdi, Magalhaes in The Deletable Bloom filter - A new
8 | /// member of the Bloom family:
9 | ///
10 | /// http://arxiv.org/pdf/1005.0352.pdf
11 | ///
12 | /// A Deletable Bloom Filter compactly stores information on collisions when
13 | /// inserting elements. This information is used to determine if elements are
14 | /// deletable. This design enables false-negative-free deletions at a fraction
15 | /// of the cost in memory consumption.
16 | ///
17 | /// Deletable Bloom Filters are useful for cases which require removing elements
18 | /// but cannot allow false negatives. This means they can be safely swapped in
19 | /// place of traditional Bloom filters.
20 | ///
21 | public class DeletableBloomFilter : IFilter
22 | {
23 | ///
24 | /// Filter data
25 | ///
26 | internal Buckets Buckets { get; set; }
27 | ///
28 | /// Filter collision data
29 | ///
30 | internal Buckets Collisions { get; set; }
31 | ///
32 | /// Hash algorithm
33 | ///
34 | private HashAlgorithm Hash { get; set; }
35 | ///
36 | /// Filter size
37 | ///
38 | private uint M { get; set; }
39 | ///
40 | /// Number of bits in a region
41 | ///
42 | private uint RegionSize { get; set; }
43 | ///
44 | /// Number of hash functions
45 | ///
46 | private uint k { get; set; }
47 | ///
48 | /// Number of items in the filter
49 | ///
50 | private uint count { get; set; }
51 | ///
52 | /// Buffer used to cache indices
53 | ///
54 | private uint[] IndexBuffer { get; set; }
55 |
56 | ///
57 | /// Creates a new DeletableBloomFilter optimized to store
58 | /// n items with a specified target false-positive rate. The r value determines
59 | /// the number of bits to use to store collision information. This controls the
60 | /// deletability of an element. Refer to the paper for selecting an optimal value.
61 | ///
62 | /// Number of items
63 | /// Number of bits to use to store collision information
64 | /// Desired false positive rate
65 | public DeletableBloomFilter(uint n, uint r, double fpRate)
66 | {
67 | var m = Utils.OptimalM(n, fpRate);
68 | var k = Utils.OptimalK(fpRate);
69 |
70 | this.Buckets = new Buckets(m - r, 1);
71 | this.Collisions = new Buckets(r, 1);
72 | this.Hash = Defaults.GetDefaultHashAlgorithm();
73 | this.M = m - r;
74 | this.RegionSize = (m - r) / r;
75 | this.k = k;
76 | this.IndexBuffer = new uint[k];
77 | }
78 |
79 | ///
80 | /// Returns the Bloom filter capacity, m.
81 | ///
82 | /// The Bloom filter capacity, m
83 | public uint Capacity()
84 | {
85 | return this.M;
86 | }
87 |
88 | ///
89 | /// Returns the number of hash functions.
90 | ///
91 | /// The number of hash functions
92 | public uint K()
93 | {
94 | return this.k;
95 | }
96 |
97 | ///
98 | /// Returns the number of items added to the filter.
99 | ///
100 | /// The number of items added to the filter
101 | public uint Count()
102 | {
103 | return this.count;
104 | }
105 |
106 | ///
107 | /// Will test for membership of the data and returns true if it is a member,
108 | /// false if not. This is a probabilistic test, meaning there is a non-zero
109 | /// probability of false positives but a zero probability of false negatives.
110 | ///
111 | /// The data to search for.
112 | /// Whether or not the data is maybe contained in the filter.
113 | public bool Test(byte[] data)
114 | {
115 | var hashKernel = Utils.HashKernel(data, this.Hash);
116 | var lower = hashKernel.LowerBaseHash;
117 | var upper = hashKernel.UpperBaseHash;
118 |
119 | // If any of the K bits are not set, then it's not a member.
120 | for (uint i = 0; i < this.k; i++)
121 | {
122 | if (this.Buckets.Get((lower + upper * i) % this.M) == 0)
123 | {
124 | return false;
125 | }
126 | }
127 | return true;
128 | }
129 |
130 | ///
131 | /// Will add the data to the Bloom filter. It returns the filter to allow
132 | /// for chaining.
133 | ///
134 | /// The data to add.
135 | /// The filter.
136 | public IFilter Add(byte[] data)
137 | {
138 | var hashKernel = Utils.HashKernel(data, this.Hash);
139 | var lower = hashKernel.LowerBaseHash;
140 | var upper = hashKernel.UpperBaseHash;
141 |
142 | // Set the K bits.
143 | for (uint i = 0; i < this.k; i++)
144 | {
145 | var idx = (lower + upper * i) % this.M;
146 | if (this.Buckets.Get(idx) != 0)
147 | {
148 | // Collision, set corresponding region bit.
149 | this.Collisions.Set(idx / this.RegionSize, 1);
150 | }
151 | else
152 | {
153 | this.Buckets.Set(idx, 1);
154 | }
155 | }
156 |
157 | this.count++;
158 | return this;
159 | }
160 |
161 | ///
162 | /// Is equivalent to calling Test followed by Add. It returns true if the data is
163 | /// a member, false if not.
164 | ///
165 | /// The data to test for and add if it doesn't exist.
166 | /// Whether or not the data was probably contained in the filter.
167 | public bool TestAndAdd(byte[] data)
168 | {
169 | var hashKernel = Utils.HashKernel(data, this.Hash);
170 | var lower = hashKernel.LowerBaseHash;
171 | var upper = hashKernel.UpperBaseHash;
172 | var member = true;
173 |
174 | // If any of the K bits are not set, then it's not a member.
175 | for (uint i = 0; i < this.k; i++)
176 | {
177 | var idx = (lower + upper * i) % this.M;
178 | if (this.Buckets.Get(idx) == 0)
179 | {
180 | member = false;
181 | }
182 | else
183 | {
184 | // Collision, set corresponding region bit.
185 | this.Collisions.Set(idx / this.RegionSize, 1);
186 | }
187 | this.Buckets.Set(idx, 1);
188 | }
189 |
190 | this.count++;
191 | return member;
192 | }
193 |
194 | ///
195 | /// Will test for membership of the data and remove it from the filter if it
196 | /// exists. Returns true if the data was a member, false if not.
197 | ///
198 | /// The data to test for and remove
199 | /// Whether or not the data was a member before this call
200 | public bool TestAndRemove(byte[] data)
201 | {
202 | var hashKernel = Utils.HashKernel(data, this.Hash);
203 | var lower = hashKernel.LowerBaseHash;
204 | var upper = hashKernel.UpperBaseHash;
205 | var member = true;
206 |
207 | // Set the K bits.
208 | for (uint i = 0; i < this.k; i++)
209 | {
210 | var idx = (lower + upper * i) % this.M;
211 | this.IndexBuffer[i] = idx;
212 | if (this.Buckets.Get(idx) == 0)
213 | {
214 | member = false;
215 | }
216 | }
217 |
218 | if (member)
219 | {
220 | foreach (var idx in this.IndexBuffer)
221 | {
222 | if (this.Collisions.Get(idx / this.RegionSize) == 0)
223 | {
224 | // Clear only bits located in collision-free zones.
225 | this.Buckets.Set(idx, 0);
226 | }
227 | }
228 | this.count--;
229 | }
230 |
231 | return member;
232 | }
233 |
234 | ///
235 | /// Restores the Bloom filter to its original state. It returns the filter to
236 | /// allow for chaining.
237 | ///
238 | /// The reset bloom filter.
239 | public DeletableBloomFilter Reset()
240 | {
241 | this.Buckets.Reset();
242 | this.Collisions.Reset();
243 | this.count = 0;
244 | return this;
245 | }
246 |
247 | ///
248 | /// Sets the hashing function used in the filter.
249 | ///
250 | /// The HashAlgorithm to use.
251 | // TODO: Add SetHash to the IFilter interface?
252 | public void SetHash(HashAlgorithm h)
253 | {
254 | this.Hash = h;
255 | }
256 | }
257 | }
258 |
--------------------------------------------------------------------------------
/ProbabilisticDataStructures/Element.cs:
--------------------------------------------------------------------------------
1 | using System;
2 |
3 | namespace ProbabilisticDataStructures
4 | {
5 | public class Element
6 | {
7 | public byte[] Data { get; set; }
8 | public UInt64 Freq { get; set; }
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/ProbabilisticDataStructures/ElementHeap.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 |
5 | namespace ProbabilisticDataStructures
6 | {
7 | internal class ElementHeap
8 | {
9 | internal List<Element> Heap { get; set; }
10 |
11 | ///
12 | /// Create a new ElementHeap that can store the top-k elements.
13 | ///
14 | /// The number of top elements to track
15 | internal ElementHeap(int k)
16 | {
17 | this.Heap = new List<Element>(k);
18 | }
19 |
20 | ///
21 | /// Get the count of the number of items on the heap.
22 | ///
23 | /// The number of items on the heap
24 | internal int Len()
25 | {
26 | return this.Heap.Count;
27 | }
28 |
29 | ///
30 | /// Return whether or not the item at i-position on the heap is less than the
31 | /// item at j-position.
32 | ///
33 | /// Item 1
34 | /// Item 2
35 | ///
36 | /// Whether or not the item at i-position on the heap is less than the item at
37 | /// j-position.
38 | ///
39 | internal bool Less(int i, int j)
40 | {
41 | return this.Heap[i].Freq < this.Heap[j].Freq;
42 | }
43 |
44 | ///
45 | /// Swap the items at i-position and j-position on the heap.
46 | ///
47 | /// Item 1
48 | /// Item 2
49 | internal void Swap(int i, int j)
50 | {
51 | var temp = this.Heap[i];
52 | Heap[i] = Heap[j];
53 | Heap[j] = temp;
54 | }
55 |
56 | ///
57 | /// Push an Element onto the heap.
58 | ///
59 | /// The Element to push onto the heap
60 | internal void Push(Element e)
61 | {
62 | this.Heap.Add(e);
63 | this.Up(this.Len() - 1);
64 | }
65 |
66 | ///
67 | /// Remove the Element at the top of the heap.
68 | ///
69 | /// The Element that was removed
70 | internal Element Pop()
71 | {
72 | var elementToRemove = this.Heap[0];
73 | this.Heap.Remove(elementToRemove);
74 | return elementToRemove;
75 | }
76 |
77 | internal void Up(int j)
78 | {
79 | while (true)
80 | {
81 | var i = (j - 1) / 2; // parent
82 | if (i == j || !this.Less(j, i))
83 | {
84 | break;
85 | }
86 | this.Swap(i, j);
87 | j = i;
88 | }
89 | }
90 |
91 | internal void Down(int i, int n)
92 | {
93 | while (true)
94 | {
95 | var j1 = 2 * i + 1;
96 | if (j1 >= n || j1 < 0)
97 | {
98 | // j1 < 0 after int overflow
99 | break;
100 | }
101 | var j = j1; // left child
102 | var j2 = j1 + 1;
103 | if (j2 < n && !this.Less(j1, j2))
104 | {
105 | j = j2; // 2*i + 2 // right child
106 | }
107 | if (!this.Less(j, i))
108 | {
109 | break;
110 | }
111 | this.Swap(i, j);
112 | i = j;
113 | }
114 | }
115 |
116 | ///
117 | /// Returns the top-k elements from lowest to highest frequency.
118 | ///
119 | /// The top-k elements from lowest to highest frequency
120 | internal Element[] Elements()
121 | {
122 | if (this.Len() == 0)
123 | {
124 | return new Element[0];
125 | }
126 |
127 | return this.Heap
128 | .OrderBy(x => x.Freq)
129 | .ToArray();
130 | }
131 |
132 | ///
133 | /// Adds the data to the top-k heap. If the data is already an element, the
134 | /// frequency is updated. If the heap already has k elements, the element with
135 | /// the minimum frequency is removed.
136 | ///
137 | /// The data to insert
138 | /// The frequency to associate with the data
139 | internal void insert(byte[] data, UInt64 freq, uint k)
140 | {
141 | for (int i = 0; i < this.Len(); i++)
142 | {
143 | var element = this.Heap[i];
144 | if (Enumerable.SequenceEqual(data, element.Data))
145 | {
146 | // Element already in top-k.
147 | element.Freq = freq;
148 | return;
149 | }
150 | }
151 |
152 | if (this.Len() == k)
153 | {
154 | // Remove minimum-frequency element.
155 | this.Pop();
156 | }
157 |
158 | // Add element to top-k.
159 | this.Push(new Element
160 | {
161 | Data = data,
162 | Freq = freq,
163 | });
164 | }
165 |
166 | ///
167 | /// Indicates if the given frequency falls within the top-k heap.
168 | ///
169 | /// The frequency to check
170 | /// Whether or not the frequency falls within the top-k heap
171 | internal bool isTop(UInt64 freq, uint k)
172 | {
173 | if (this.Len() < k)
174 | {
175 | return true;
176 | }
177 |
178 | return freq >= this.Heap[0].Freq;
179 | }
180 | }
181 | }
182 |
--------------------------------------------------------------------------------
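
ElementHeap is internal, but the test assembly can exercise it via the InternalsVisibleTo attribute in Defaults.cs. A sketch of the top-k flow (frequencies invented; assumes using System and System.Text): with k = 3, a fourth insert evicts the minimum-frequency element, and Elements() returns the survivors ordered by frequency:

    var heap = new ElementHeap(3); // track the top 3 elements

    heap.insert(Encoding.UTF8.GetBytes("a"), 5, 3);
    heap.insert(Encoding.UTF8.GetBytes("b"), 2, 3);
    heap.insert(Encoding.UTF8.GetBytes("c"), 9, 3);
    heap.insert(Encoding.UTF8.GetBytes("d"), 7, 3); // heap full: evicts "b" (freq 2)

    foreach (var e in heap.Elements()) // a:5, d:7, c:9
        Console.WriteLine($"{Encoding.UTF8.GetString(e.Data)}: {e.Freq}");
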
/ProbabilisticDataStructures/HyperLogLog.cs:
--------------------------------------------------------------------------------
1 | /*
2 | Original work Copyright 2013 Eric Lesh
3 | Modified work Copyright 2015 Tyler Treat
4 | Modified work Copyright 2015 Matthew Lorimor
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining
7 | a copy of this software and associated documentation files (the
8 | "Software"), to deal in the Software without restriction, including
9 | without limitation the rights to use, copy, modify, merge, publish,
10 | distribute, sublicense, and/or sell copies of the Software, and to
11 | permit persons to whom the Software is furnished to do so, subject to
12 | the following conditions:
13 |
14 | The above copyright notice and this permission notice shall be
15 | included in all copies or substantial portions of the Software.
16 | */
17 |
18 | using System;
19 | using System.Linq;
20 | using System.Security.Cryptography;
21 |
22 | namespace ProbabilisticDataStructures
23 | {
24 | ///
25 | /// HyperLogLog implements the HyperLogLog cardinality estimation algorithm as
26 | /// described by Flajolet, Fusy, Gandouet, and Meunier in HyperLogLog: the
27 | /// analysis of a near-optimal cardinality estimation algorithm:
28 | ///
29 | /// http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
30 | ///
31 | /// HyperLogLog is a probabilistic algorithm which approximates the number of
32 | /// distinct elements in a multiset. It works by hashing values and calculating
33 | /// the maximum number of leading zeros in the binary representation of each
34 | /// hash. If the maximum number of leading zeros is n, the estimated number of
35 | /// distinct elements in the set is 2^n. To minimize variance, the multiset is
36 | /// split into a configurable number of registers, the maximum number of leading
37 | /// zeros is calculated in the numbers in each register, and a harmonic mean is
38 | /// used to combine the estimates.
39 | ///
40 | /// For large or unbounded data sets, calculating the exact cardinality is
41 | /// impractical. HyperLogLog uses a fraction of the memory while providing an
42 | /// accurate approximation. For counting element frequency, refer to the
43 | /// Count-Min Sketch.
44 | ///
45 | public class HyperLogLog
46 | {
47 | private static double Exp32 = Math.Pow(2, 32);
48 |
49 | ///
50 | /// Counter registers
51 | ///
52 | private byte[] Registers { get; set; }
53 | ///
54 | /// Number of registers
55 | ///
56 | internal uint M { get; set; }
57 | ///
58 | /// Number of bits to calculate register
59 | ///
60 | private uint B { get; set; }
61 | ///
62 | /// Bias-correction constant
63 | ///
64 | private double Alpha { get; set; }
65 | ///
66 | /// Hash algorithm
67 | ///
68 | private HashAlgorithm Hash { get; set; }
69 |
70 | ///
71 | /// Creates a new HyperLogLog with m registers. Throws an ArgumentException if
72 | /// m isn't a power of two.
73 | ///
74 | /// Number of registers (must be a power of two)
75 | public HyperLogLog(uint m)
76 | {
77 | if ((m & (m - 1)) != 0)
78 | {
79 | throw new ArgumentException(String.Format("{0} is not a power of two", m));
80 | }
81 |
82 | this.Registers = new byte[m];
83 | this.M = m;
84 | this.B = (uint)Math.Ceiling(Math.Log(m, 2));
85 | this.Alpha = CalculateAlpha(m);
86 | this.Hash = Defaults.GetDefaultHashAlgorithm();
87 | }
88 |
89 | ///
90 | /// Creates a new HyperLogLog optimized for the specified standard error.
91 | /// Throws an ArgumentException if the number of registers can't be calculated
92 | /// for the provided accuracy.
93 | ///
94 | /// Desired standard error
95 | /// The HyperLogLog optimized for the standard error
96 | public static HyperLogLog NewDefaultHyperLogLog(double e)
97 | {
98 | var m = Math.Pow(1.04 / e, 2);
99 | return new HyperLogLog((uint)Math.Pow(2, Math.Ceiling(Math.Log(m, 2))));
100 | }
101 |
102 | ///
103 | /// Will add the data to the set. Returns the HyperLogLog to allow for chaining.
104 | ///
105 | /// The data to add
106 | /// The HyperLogLog
107 | public HyperLogLog Add(byte[] data)
108 | {
109 | var hash = CalculateHash(data);
110 | var k = 32 - this.B;
111 | var r = CalculateRho(hash << (int)this.B, k);
112 | var j = hash >> (int)k;
113 |
114 | if (r > this.Registers[j])
115 | {
116 | this.Registers[j] = r;
117 | }
118 |
119 | return this;
120 | }
121 |
122 | ///
123 | /// Returns the approximated cardinality of the set.
124 | ///
125 | /// The approximated cardinality of the set
126 | public UInt64 Count()
127 | {
128 | var sum = 0.0;
129 | var m = (double)this.M;
130 | foreach (var val in this.Registers)
131 | {
132 | sum += 1.0 / Math.Pow(2.0, val);
133 | }
134 | var estimate = this.Alpha * m * m / sum;
135 | if (estimate <= 5.0 / 2.0 * m)
136 | {
137 | // Small range correction
138 | var v = 0;
139 | foreach (var r in this.Registers)
140 | {
141 | if (r == 0)
142 | {
143 | v++;
144 | }
145 | }
146 | if (v > 0)
147 | {
148 | estimate = m * Math.Log(m / v);
149 | }
150 | }
151 | else if (estimate > 1.0 / 30.0 * Exp32)
152 | {
153 | // Large range correction
154 | estimate = -Exp32 * Math.Log(1 - estimate / Exp32);
155 | }
156 | return (UInt64)estimate;
157 | }
158 |
159 | ///
160 | /// Combines this HyperLogLog with another. Throws an ArgumentException if the
161 | /// number of registers in the two HyperLogLogs is not equal.
162 | ///
163 | /// The HyperLogLog to merge
164 | /// Whether or not the merge was successful
165 | public bool Merge(HyperLogLog other)
166 | {
167 | if (this.M != other.M)
168 | {
169 | throw new ArgumentException("Number of registers must match");
170 | }
171 |
172 | for (int i = 0; i < other.Registers.Count(); i++)
173 | {
174 | var r = other.Registers[i];
175 | if (r > this.Registers[i])
176 | {
177 | this.Registers[i] = r;
178 | }
179 | }
180 |
181 | return true;
182 | }
183 |
184 | ///
185 | /// Restores the HyperLogLog to its original state. It returns itself to allow
186 | /// for chaining.
187 | ///
188 | /// The HyperLogLog
189 | public HyperLogLog Reset()
190 | {
191 | this.Registers = new byte[this.M];
192 | return this;
193 | }
194 |
195 | ///
196 | /// Sets the hashing function used in the filter.
197 | ///
198 | /// The HashAlgorithm to use.
199 | public void SetHash(HashAlgorithm h)
200 | {
201 | this.Hash = h;
202 | }
203 |
204 | ///
205 | /// Returns a 32-bit hash value for the given data.
206 | ///
207 | /// Data
208 | /// 32-bit hash value
209 | private uint CalculateHash(byte[] data)
210 | {
211 | var sum = Hash.ComputeHash(data);
212 | return Utils.HashBytesToUInt32(sum);
213 | }
214 |
215 | ///
216 | /// Calculates the bias-correction constant alpha based on the number of
217 | /// registers, m.
218 | ///
219 | /// Number of registers
220 | /// Calculated bias-correction constant, alpha
221 | private static double CalculateAlpha(uint m)
222 | {
223 | switch (m)
224 | {
225 | case 16:
226 | return 0.673;
227 | case 32:
228 | return 0.697;
229 | case 64:
230 | return 0.709;
231 | default:
232 | return 0.7213 / (1.0 + 1.079 / m);
233 | }
234 | }
235 |
236 | ///
237 | /// Calculates the position of the leftmost 1-bit.
238 | ///
239 | /// The value to check
240 | /// The maximum number of bits to check
241 | /// The position of the leftmost 1-bit
242 | private static byte CalculateRho(uint val, uint max)
243 | {
244 | var r = 1;
245 | while ((val & 0x80000000) == 0 && r <= max)
246 | {
247 | r++;
248 | val <<= 1;
249 | }
250 | return (byte)r;
251 | }
252 |
253 | // TODO: Implement these later.
254 | // WriteDataTo
255 | // ReadDataFrom
256 | }
257 | }
258 |
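A rough usage sketch of the API above (names like HllExample are illustrative, not part of the library):

    using System;
    using System.Text;
    using ProbabilisticDataStructures;

    class HllExample // illustrative name, not part of the library
    {
        static void Main()
        {
            var hll = new HyperLogLog(64); // 64 registers; must be a power of two
            foreach (var word in new[] { "a", "b", "c", "a", "b" })
            {
                hll.Add(Encoding.ASCII.GetBytes(word));
            }
            Console.WriteLine(hll.Count()); // approximate distinct count (~3 here)

            // Merge requires an equal register count, otherwise it throws.
            var other = new HyperLogLog(64).Add(Encoding.ASCII.GetBytes("d"));
            hll.Merge(other);
            Console.WriteLine(hll.Count()); // ~4
        }
    }
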
--------------------------------------------------------------------------------
/ProbabilisticDataStructures/IFilter.cs:
--------------------------------------------------------------------------------
1 | namespace ProbabilisticDataStructures
2 | {
3 | public interface IFilter
4 | {
5 | ///
6 | /// Will test for membership of the data and returns true if it is a member,
7 | /// false if not.
8 | ///
9 | /// The data to test for.
10 | /// Whether or not the data is probably contained in the filter.
11 | bool Test(byte[] data);
12 | ///
13 | /// Add will add the data to the Bloom filter. It returns the filter to allow
14 | /// for chaining.
15 | ///
16 | /// The data to add.
17 | /// The filter.
18 | IFilter Add(byte[] data);
19 | ///
20 | /// Is equivalent to calling Test followed by Add. It returns true if the data is
21 | /// a member, false if not.
22 | ///
23 | /// The data to test for and add if it doesn't exist.
24 | /// Whether or not the data was probably contained in the filter.
25 | bool TestAndAdd(byte[] data);
26 | }
27 | }
28 |
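Because every filter exposes the same Test/Add/TestAndAdd contract, calling code can deduplicate a stream against any IFilter implementation. A minimal sketch, assuming BloomFilter implements IFilter as the other filters in this project do (names like PrintNewItems are illustrative):

    using System;
    using System.Text;
    using ProbabilisticDataStructures;

    static class DedupeExample // illustrative, not part of the library
    {
        // Prints only items that were (probably) not seen before.
        static void PrintNewItems(IFilter filter, string[] items)
        {
            foreach (var item in items)
            {
                if (!filter.TestAndAdd(Encoding.ASCII.GetBytes(item)))
                {
                    Console.WriteLine(item);
                }
            }
        }

        static void Main()
        {
            // Prints "a" and "b" once; the repeated "a" is filtered out.
            PrintNewItems(new BloomFilter(1000, 0.01), new[] { "a", "b", "a" });
        }
    }
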
--------------------------------------------------------------------------------
/ProbabilisticDataStructures/InverseBloomFilter.cs:
--------------------------------------------------------------------------------
1 | /*
2 | Original work Copyright (c) 2012 Jeff Hodges. All rights reserved.
3 | Modified work Copyright (c) 2015 Tyler Treat. All rights reserved.
4 | Modified work Copyright (c) 2015 Matthew Lorimor. All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are
8 | met:
9 |
10 | * Redistributions of source code must retain the above copyright
11 | notice, this list of conditions and the following disclaimer.
12 | * Redistributions in binary form must reproduce the above
13 | copyright notice, this list of conditions and the following disclaimer
14 | in the documentation and/or other materials provided with the
15 | distribution.
16 | * Neither the name of Jeff Hodges nor the names of this project's
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | */
32 |
33 | using System.Linq;
34 | using System.Security.Cryptography;
35 |
36 | namespace ProbabilisticDataStructures
37 | {
38 | ///
39 | /// InverseBloomFilter is a concurrent "inverse" Bloom filter, which is
40 | /// effectively the opposite of a classic Bloom filter. This was originally
41 | /// described and written by Jeff Hodges:
42 | ///
43 | /// http://www.somethingsimilar.com/2012/05/21/the-opposite-of-a-bloom-filter/
44 | ///
45 | /// The InverseBloomFilter may report a false negative but can never report a
46 | /// false positive. That is, it may report that an item has not been seen when
47 | /// it actually has, but it will never report an item as seen which it hasn't
48 | /// come across. This behaves in a similar manner to a fixed-size hashmap which
49 | /// does not handle conflicts.
50 | ///
51 | /// An example use case is deduplicating events while processing a stream of
52 | /// data. Ideally, duplicate events are relatively close together.
53 | ///
54 | public class InverseBloomFilter : IFilter
55 | {
56 | private byte[][] Array { get; set; }
57 | internal HashAlgorithm Hash { get; set; }
58 | private uint capacity { get; set; }
59 |
60 | ///
61 | /// Instantiates an InverseBloomFilter with the specified capacity.
62 | ///
63 | /// The capacity of the filter
64 | public InverseBloomFilter(uint capacity)
65 | {
66 | this.Array = new byte[capacity][];
67 | this.Hash = Defaults.GetDefaultHashAlgorithm();
68 | this.capacity = capacity;
69 | }
70 |
71 |
72 | ///
73 | /// Will test for membership of the data and returns true if it is a
74 | /// member, false if not. This is a probabilistic test, meaning there is a
75 | /// non-zero probability of false negatives but a zero probability of false
76 | /// positives. That is, it may return false even though the data was added, but
77 | /// it will never return true for data that hasn't been added.
78 | ///
79 | /// The data to test for
80 | /// Whether or not the data is present
81 | public bool Test(byte[] data)
82 | {
83 | var index = this.Index(data);
84 | var val = this.Array[index];
85 | if (val == null)
86 | {
87 | return false;
88 | }
89 | return Enumerable.SequenceEqual(val, data);
90 | }
91 |
92 | ///
93 | /// Will add the data to the filter. It returns the filter to allow for chaining.
94 | ///
95 | /// The data to add
96 | /// The filter
97 | public IFilter Add(byte[] data)
98 | {
99 | var index = this.Index(data);
100 | this.GetAndSet(index, data);
101 | return this;
102 | }
103 |
104 | ///
105 | /// Equivalent to calling Test followed by Add atomically. It returns true if
106 | /// the data is a member, false if not.
107 | ///
108 | /// The data to test and add
109 | /// Whether the data was already a member
110 | public bool TestAndAdd(byte[] data)
111 | {
112 | var index = this.Index(data);
113 | var oldId = this.GetAndSet(index, data);
114 | if (oldId == null)
115 | {
116 | return false;
117 | }
118 | return Enumerable.SequenceEqual(oldId, data);
119 | }
120 |
121 | ///
122 | /// Returns the filter capacity.
123 | ///
124 | /// The filter capacity
125 | public uint Capacity()
126 | {
127 | return this.capacity;
128 | }
129 |
130 | ///
131 | /// Returns the data that was in the array at the given index after putting the
132 | /// new data in the array at that index, atomically.
133 | ///
134 | /// The index to get and set
135 | /// The data to set
136 | ///
137 | /// The data that was in the array at the index before setting it
138 | ///
139 | private byte[] GetAndSet(uint index, byte[] data)
140 | {
141 | var oldData = this.Array[index];
142 | this.Array[index] = data;
143 | return oldData;
144 | }
145 |
146 | ///
147 | /// Returns the array index for the given data.
148 | ///
149 | /// The data to find the index for
150 | /// The array index for the given data
151 | private uint Index(byte[] data)
152 | {
153 | var index = this.ComputeHashSum32(data) % this.capacity;
154 | return index;
155 | }
156 |
157 | ///
158 | /// Returns a 32-bit hash value for the given data.
159 | ///
160 | /// Data
161 | /// 32-bit hash value
162 | private uint ComputeHashSum32(byte[] data)
163 | {
164 | var sum = Hash.ComputeHash(data);
165 | return Utils.HashBytesToUInt32(sum);
166 | }
167 |
168 | ///
169 | /// Sets the hashing function used in the filter.
170 | ///
171 | /// The HashAlgorithm to use.
172 | // TODO: Add SetHash to the IFilter interface?
173 | public void SetHash(HashAlgorithm h)
174 | {
175 | this.Hash = h;
176 | }
177 | }
178 | }
179 |
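A short sketch of the no-false-positive / possible-false-negative behavior (illustrative names; the API calls are the ones defined above):

    using System;
    using System.Text;
    using ProbabilisticDataStructures;

    class InverseExample // illustrative, not part of the library
    {
        static void Main()
        {
            var f = new InverseBloomFilter(10000);
            var e = Encoding.ASCII.GetBytes("event-42");

            Console.WriteLine(f.TestAndAdd(e)); // False: first sighting, now recorded
            Console.WriteLine(f.TestAndAdd(e)); // True: an immediate duplicate

            // If a later event hashes to the same slot, it overwrites the entry,
            // and "event-42" would test False again -- the false-negative trade-off.
        }
    }
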
--------------------------------------------------------------------------------
/ProbabilisticDataStructures/MinHash.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Threading.Tasks;
5 |
6 | namespace ProbabilisticDataStructures
7 | {
8 | ///
9 | /// MinHash is a variation of the technique for estimating similarity between
10 | /// two sets as presented by Broder in On the resemblance and containment of
11 | /// documents:
12 | ///
13 | /// http://gatekeeper.dec.com/ftp/pub/dec/SRC/publications/broder/positano-final-wpnums.pdf
14 | ///
15 | /// This can be used to cluster or compare documents by splitting the corpus
16 | /// into a bag of words. MinHash returns the approximated similarity ratio of
17 | /// the two bags. The similarity is less accurate for very small bags of words.
18 | ///
19 | public static class MinHash
20 | {
21 | private static Random random = new Random();
22 |
23 | ///
24 | /// Returns the similarity between two bags.
25 | ///
26 | /// The first bag
27 | /// The second bag
28 | /// The similarity between the bags
29 | public static float Similarity(string[] bag1, string[] bag2)
30 | {
31 | var k = bag1.Length + bag2.Length;
32 | var hashes = new int[k];
33 | for (int i = 0; i < k; i++)
34 | {
35 | var a = random.Next();
36 | var b = random.Next();
37 | var c = random.Next();
38 | var x = computeHash((uint)(a * b * c), (uint)a, (uint)b, c);
39 | hashes[i] = (int)x;
40 | }
41 |
42 | var bMap = bitMap(bag1, bag2);
43 | var minHashValues = hashBuckets(2, k);
44 | minHash(bag1, 0, minHashValues, bMap, k, hashes);
45 | minHash(bag2, 1, minHashValues, bMap, k, hashes);
46 | return similarity(minHashValues, k);
47 | }
48 |
49 | private static void minHash(
50 | string[] bag,
51 | int bagIndex,
52 | int[][] minHashValues,
53 | Dictionary<string, bool[]> bitArray,
54 | int k,
55 | int[] hashes)
56 | {
57 | var options = new ParallelOptions();
58 | options.MaxDegreeOfParallelism = 4;
59 | var index = 0;
60 |
61 | foreach (var element in bitArray)
62 | {
63 | Parallel.For(0, k, options, (i, loopState) =>
64 | {
65 | if (bag.Contains(element.Key))
66 | {
67 | var hindex = hashes[index];
68 | if (hindex < minHashValues[bagIndex][index])
69 | {
70 | minHashValues[bagIndex][index] = hindex;
71 | }
72 | }
73 | });
74 | index++;
75 | }
76 | }
77 |
78 | private static Dictionary<string, bool[]> bitMap(string[] bag1, string[] bag2)
79 | {
80 | var bitArray = new Dictionary<string, bool[]>();
81 | foreach (var element in bag1)
82 | {
83 | bitArray[element] = new bool[] { true, false };
84 | }
85 |
86 | foreach (var element in bag2)
87 | {
88 | if (bitArray.ContainsKey(element))
89 | {
90 | bitArray[element] = new bool[] { true, true };
91 | }
92 | else
93 | {
94 | bitArray[element] = new bool[] { false, true };
95 | }
96 | }
97 |
98 | return bitArray;
99 | }
100 |
101 | private static int[][] hashBuckets(int numSets, int k)
102 | {
103 | var minHashValues = new int[numSets][];
104 | for (int i = 0; i < numSets; i++)
105 | {
106 | minHashValues[i] = new int[k];
107 | }
108 |
109 | for (int i = 0; i < numSets; i++)
110 | {
111 | for (int j = 0; j < k; j++)
112 | {
113 | minHashValues[i][j] = int.MaxValue;
114 | }
115 | }
116 | return minHashValues;
117 | }
118 |
119 | private static uint computeHash(uint x, uint a, uint b, int u)
120 | {
121 | return (a * x + b) >> (32 - u);
122 | }
123 |
124 | private static float similarity(int[][] minHashValues, int k)
125 | {
126 | var identicalMinHashes = 0;
127 | for (int i = 0; i < k; i++)
128 | {
129 | if (minHashValues[0][i] == minHashValues[1][i])
130 | {
131 | identicalMinHashes++;
132 | }
133 | }
134 |
135 | return identicalMinHashes / (float)k;
136 | }
137 | }
138 | }
139 |
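A minimal usage sketch (results vary between runs because the hash parameters are drawn from Random; names are illustrative):

    using System;
    using ProbabilisticDataStructures;

    class MinHashExample // illustrative, not part of the library
    {
        static void Main()
        {
            var doc1 = "the quick brown fox".Split(' ');
            var doc2 = "the quick brown dog".Split(' ');

            // Approximate similarity ratio of the two bags of words.
            float s = MinHash.Similarity(doc1, doc2);
            Console.WriteLine(s);
        }
    }
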
--------------------------------------------------------------------------------
/ProbabilisticDataStructures/PartitionedBloomFilter.cs:
--------------------------------------------------------------------------------
1 | /*
2 | Original work Copyright (c) 2013 zhenjl
3 | Modified work Copyright (c) 2015 Tyler Treat
4 | Modified work Copyright (c) 2015 Matthew Lorimor
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy of
7 | this software and associated documentation files (the "Software"), to deal in
8 | the Software without restriction, including without limitation the rights to
9 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
10 | of the Software, and to permit persons to whom the Software is furnished to do
11 | so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | */
16 |
17 | using System;
18 | using System.Security.Cryptography;
19 |
20 | namespace ProbabilisticDataStructures
21 | {
22 | ///
23 | /// PartitionedBloomFilter implements a variation of a classic Bloom filter as
24 | /// described by Almeida, Baquero, Preguica, and Hutchison in Scalable Bloom
25 | /// Filters:
26 | ///
27 | /// http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf
28 | ///
29 | /// This filter works by partitioning the M-sized bit array into k slices of
30 | /// size m = M/k bits. Each hash function produces an index over m for its
31 | /// respective slice. Thus, each element is described by exactly k bits, meaning
32 | /// the distribution of false positives is uniform across all elements.
33 | ///
34 | public class PartitionedBloomFilter : IFilter
35 | {
36 | ///
37 | /// Partitioned filter data
38 | ///
39 | internal Buckets[] Partitions { get; set; }
40 | ///
41 | /// Hash algorithm
42 | ///
43 | internal HashAlgorithm Hash { get; set; }
44 | ///
45 | /// Filter size (divided into k partitions)
46 | ///
47 | private uint M { get; set; }
48 | ///
49 | /// Number of hash functions (and partitions)
50 | ///
51 | private uint k { get; set; }
52 | ///
53 | /// Partition size (m / k)
54 | ///
55 | private uint S { get; set; }
56 | ///
57 | /// Number of items added
58 | ///
59 | private uint count { get; set; }
60 |
61 | ///
62 | /// Creates a new partitioned Bloom filter optimized to store n items with a
63 | /// specified target false-positive rate.
64 | ///
65 | /// Number of items
66 | /// Desired false-positive rate
67 | public PartitionedBloomFilter(uint n, double fpRate)
68 | {
69 | var m = Utils.OptimalM(n, fpRate);
70 | var k = Utils.OptimalK(fpRate);
71 | var partitions = new Buckets[k];
72 | var s = (uint)Math.Ceiling((double)m / (double)k);
73 |
74 | for (uint i = 0; i < k; i++)
75 | {
76 | partitions[i] = new Buckets(s, 1);
77 | }
78 |
79 | this.Partitions = partitions;
80 | this.Hash = Defaults.GetDefaultHashAlgorithm();
81 | this.M = m;
82 | this.k = k;
83 | this.S = s;
84 | }
85 |
86 | ///
87 | /// Returns the Bloom filter capacity, m.
88 | ///
89 | /// The Bloom filter capacity, m
90 | public uint Capacity()
91 | {
92 | return this.M;
93 | }
94 |
95 | ///
96 | /// Returns the number of hash functions.
97 | ///
98 | /// The number of hash functions
99 | public uint K()
100 | {
101 | return this.k;
102 | }
103 |
104 | ///
105 | /// Returns the number of items in the filter.
106 | ///
107 | /// The number of items in the filter
108 | public uint Count()
109 | {
110 | return this.count;
111 | }
112 |
113 | ///
114 | /// Returns the current estimated ratio of set bits.
115 | ///
116 | /// The current estimated ratio of set bits
117 | public double EstimatedFillRatio()
118 | {
119 | return 1 - Math.Exp(-(double)this.count / (double)this.S);
120 | }
121 |
122 | ///
123 | /// Returns the average ratio of set bits across all partitions.
124 | ///
125 | /// The average ratio of set bits across all partitions
126 | public double FillRatio()
127 | {
128 | var t = (double)0;
129 | for (uint i = 0; i < this.k; i++)
130 | {
131 | uint sum = 0;
132 | for (uint j = 0; j < this.Partitions[i].count; j++)
133 | {
134 | sum += this.Partitions[i].Get(j);
135 | }
136 | t += ((double)sum / (double)this.S);
137 | }
138 | return (double)t / (double)this.k;
139 | }
140 |
141 | ///
142 | /// Will test for membership of the data and returns true if it is a
143 | /// member, false if not. This is a probabilistic test, meaning there is a
144 | /// non-zero probability of false positives but a zero probability of false
145 | /// negatives. Due to the way the filter is partitioned, the probability of
146 | /// false positives is uniformly distributed across all elements.
147 | ///
148 | /// The data to test for
149 | /// Whether or not the data was found
150 | public bool Test(byte[] data)
151 | {
152 | var hashKernel = Utils.HashKernel(data, this.Hash);
153 | var lower = hashKernel.LowerBaseHash;
154 | var upper = hashKernel.UpperBaseHash;
155 |
156 | // If any of the k partition bits are not set, then it's not a member.
157 | for (uint i = 0; i < this.k; i++)
158 | {
159 | if (this.Partitions[i].Get((lower + upper * i) % this.S) == 0)
160 | {
161 | return false;
162 | }
163 | }
164 |
165 | return true;
166 | }
167 |
168 | ///
169 | /// Will add the data to the Bloom filter. It returns the filter to allow for
170 | /// chaining.
171 | ///
172 | /// The data to add
173 | /// The PartitionedBloomFilter
174 | public IFilter Add(byte[] data)
175 | {
176 | var hashKernel = Utils.HashKernel(data, this.Hash);
177 | var lower = hashKernel.LowerBaseHash;
178 | var upper = hashKernel.UpperBaseHash;
179 |
180 | // Set the K partition bits.
181 | for (uint i = 0; i < this.k; i++)
182 | {
183 | this.Partitions[i].Set((lower + upper * i) % this.S, 1);
184 | }
185 |
186 | this.count++;
187 | return this;
188 | }
189 |
190 | ///
191 | /// Equivalent to calling Test followed by Add. It returns true if the data is a
192 | /// member, false if not.
193 | ///
194 | /// The data to test for and add
195 | ///
196 | /// Whether the data was present in the filter prior to adding it
197 | ///
198 | public bool TestAndAdd(byte[] data)
199 | {
200 | var hashKernel = Utils.HashKernel(data, this.Hash);
201 | var lower = hashKernel.LowerBaseHash;
202 | var upper = hashKernel.UpperBaseHash;
203 | var member = true;
204 |
205 | // If any K partition bits are not set, then it's not a member.
206 | for (uint i = 0; i < this.k; i++)
207 | {
208 | var idx = (lower + upper * i) % this.S;
209 | if (this.Partitions[i].Get(idx) == 0)
210 | {
211 | member = false;
212 | }
213 | this.Partitions[i].Set(idx, 1);
214 | }
215 |
216 | this.count++;
217 | return member;
218 | }
219 |
220 | ///
221 | /// Restores the Bloom filter to its original state. It returns the filter
222 | /// to allow for chaining.
223 | ///
224 | /// The PartitionedBloomFilter
225 | public PartitionedBloomFilter Reset()
226 | {
227 | foreach (var partition in this.Partitions)
228 | {
229 | partition.Reset();
230 | }
231 | return this;
232 | }
233 |
234 | ///
235 | /// Sets the hashing function used in the filter.
236 | ///
237 | /// The HashAlgorithm to use.
238 | // TODO: Add SetHash to the IFilter interface?
239 | public void SetHash(HashAlgorithm h)
240 | {
241 | this.Hash = h;
242 | }
243 | }
244 | }
245 |
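A sketch of how the sizing plays out in practice (illustrative names; OptimalM and OptimalK are defined in Utils.cs later in this repository):

    using System;
    using System.Text;
    using ProbabilisticDataStructures;

    class PartitionedExample // illustrative, not part of the library
    {
        static void Main()
        {
            // m = Utils.OptimalM(n, fpRate) bits are split into
            // k = Utils.OptimalK(fpRate) slices of ceil(m/k) bits each;
            // every element sets exactly one bit per slice.
            var f = new PartitionedBloomFilter(1000, 0.01);
            Console.WriteLine(f.Capacity()); // m
            Console.WriteLine(f.K());        // k = ceil(log2(1/0.01)) = 7

            f.Add(Encoding.ASCII.GetBytes("a"));
            Console.WriteLine(f.Test(Encoding.ASCII.GetBytes("a"))); // True
        }
    }
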
--------------------------------------------------------------------------------
/ProbabilisticDataStructures/ProbabilisticDataStructures.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 |
3 |   <PropertyGroup>
4 |     <TargetFrameworks>netstandard2.0;net45</TargetFrameworks>
5 |   </PropertyGroup>
6 |
7 | </Project>
8 |
--------------------------------------------------------------------------------
/ProbabilisticDataStructures/ScalableBloomFilter.cs:
--------------------------------------------------------------------------------
1 | /*
2 | Original work Copyright (c) 2013 zhenjl
3 | Modified work Copyright (c) 2015 Tyler Treat
4 | Modified work Copyright (c) 2015 Matthew Lorimor
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy of
7 | this software and associated documentation files (the "Software"), to deal in
8 | the Software without restriction, including without limitation the rights to
9 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
10 | of the Software, and to permit persons to whom the Software is furnished to do
11 | so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | */
16 |
17 | using System;
18 | using System.Collections.Generic;
19 | using System.Linq;
20 | using System.Security.Cryptography;
21 |
22 | namespace ProbabilisticDataStructures
23 | {
24 | ///
25 | /// ScalableBloomFilter implements a Scalable Bloom Filter as described by
26 | /// Almeida, Baquero, Preguica, and Hutchison in Scalable Bloom Filters:
27 | ///
28 | /// http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf
29 | ///
30 | /// A Scalable Bloom Filter dynamically adapts to the number of elements in the
31 | /// data set while enforcing a tight upper bound on the false-positive rate.
32 | /// This works by adding Bloom filters with geometrically decreasing
33 | /// false-positive rates as filters become full. The tightening ratio, r,
34 | /// controls the filter growth. The compounded probability over the whole series
35 | /// converges to a target value, even accounting for an infinite series.
36 | ///
37 | /// Scalable Bloom Filters are useful for cases where the size of the data set
38 | /// isn't known a priori and memory constraints aren't of particular concern.
39 | /// For situations where memory is bounded, consider using Inverse or Stable
40 | /// Bloom Filters.
41 | ///
42 | public class ScalableBloomFilter : IFilter
43 | {
44 | ///
45 | /// Filters with geometrically decreasing error rates
46 | ///
47 | internal List<PartitionedBloomFilter> Filters { get; set; }
48 | ///
49 | /// Tightening ratio
50 | ///
51 | internal double R { get; set; }
52 | ///
53 | /// Target false-positive rate
54 | ///
55 | internal double FP { get; set; }
56 | ///
57 | /// Partition fill ratio
58 | ///
59 | private double P { get; set; }
60 | ///
61 | /// Filter size hint
62 | ///
63 | internal uint Hint { get; set; }
64 |
65 | ///
66 | /// Creates a new Scalable Bloom Filter with the specified target false-positive
67 | /// rate and tightening ratio. Use NewDefaultScalableBloomFilter if you don't
68 | /// want to calculate all these parameters.
69 | ///
70 | /// The filter size hint
71 | /// Target false-positive rate
72 | /// Tightening ratio
73 | public ScalableBloomFilter(uint hint, double fpRate, double r)
74 | {
75 | this.Filters = new List<PartitionedBloomFilter>();
76 | this.R = r;
77 | this.FP = fpRate;
78 | this.P = Defaults.FILL_RATIO;
79 | this.Hint = hint;
80 |
81 | this.AddFilter();
82 | }
83 |
84 | ///
85 | /// Creates a new Scalable Bloom Filter with the specified target false-positive
86 | /// rate and an optimal tightening ratio.
87 | ///
88 | /// Target false-positive rate
89 | public static ScalableBloomFilter NewDefaultScalableBloomFilter(double fpRate)
90 | {
91 | return new ScalableBloomFilter(10000, fpRate, 0.8);
92 | }
93 |
94 | ///
95 | /// Returns the current Scalable Bloom Filter capacity, which is the sum of the
96 | /// capacities for the contained series of Bloom filters.
97 | ///
98 | /// The current Scalable Bloom Filter capacity
99 | public uint Capacity()
100 | {
101 | var capacity = 0u;
102 | foreach (var filter in this.Filters)
103 | {
104 | capacity += filter.Capacity();
105 | }
106 | return capacity;
107 | }
108 |
109 | ///
110 | /// Returns the number of hash functions used in each Bloom filter.
111 | ///
112 | /// The number of hash functions used in each Bloom filter
113 | public uint K()
114 | {
115 | return this.Filters[0].K();
116 | }
117 |
118 | ///
119 | /// Returns the average ratio of set bits across every filter.
120 | ///
121 | /// The average ratio of set bits across every filter
122 | public double FillRatio()
123 | {
124 | var sum = 0.0;
125 | foreach (var filter in this.Filters)
126 | {
127 | sum += filter.FillRatio();
128 | }
129 | return (double)sum / this.Filters.Count();
130 | }
131 |
132 | ///
133 | /// Will test for membership of the data and returns true if it is a member,
134 | /// false if not. This is a probabilistic test, meaning there is a non-zero
135 | /// probability of false positives but a zero probability of false negatives.
136 | ///
137 | /// The data to search for.
138 | /// Whether or not the data is maybe contained in the filter.
139 | public bool Test(byte[] data)
140 | {
141 | // Querying is done by testing for presence in each filter.
142 | foreach (var filter in this.Filters)
143 | {
144 | if (filter.Test(data))
145 | {
146 | return true;
147 | }
148 | }
149 |
150 | return false;
151 | }
152 |
153 | ///
154 | /// Add will add the data to the Bloom filter. It returns the filter to allow
155 | /// for chaining.
156 | ///
157 | /// The data to add
158 | /// The ScalableBloomFilter
159 | public IFilter Add(byte[] data)
160 | {
161 | var idx = this.Filters.Count() - 1;
162 |
163 | // If the last filter has reached its fill ratio, add a new one.
164 | if (this.Filters[idx].EstimatedFillRatio() >= this.P)
165 | {
166 | this.AddFilter();
167 | idx++;
168 | }
169 |
170 | this.Filters[idx].Add(data);
171 | return this;
172 | }
173 |
174 | ///
175 | /// Is equivalent to calling Test followed by Add. It returns true if the data
176 | /// is a member, false if not.
177 | ///
178 | /// The data to test for and add
179 | /// Whether or not the data was present before adding it
180 | public bool TestAndAdd(byte[] data)
181 | {
182 | var member = this.Test(data);
183 | this.Add(data);
184 | return member;
185 | }
186 |
187 | ///
188 | /// Sets the hashing function used in the filter.
189 | ///
190 | /// The HashAlgorithm to use.
191 | // TODO: Add SetHash to the IFilter interface?
192 | public void SetHash(HashAlgorithm h)
193 | {
194 | foreach (var filter in this.Filters)
195 | {
196 | filter.SetHash(h);
197 | }
198 | }
199 |
200 | ///
201 | /// Restores the Bloom filter to its original state. It returns the filter to
202 | /// allow for chaining.
203 | ///
204 | /// The reset bloom filter.
205 | public ScalableBloomFilter Reset()
206 | {
207 | this.Filters = new List<PartitionedBloomFilter>();
208 | this.AddFilter();
209 | return this;
210 | }
211 |
212 | ///
213 | /// Adds a new Bloom filter with a restricted false-positive rate to the
214 | /// Scalable Bloom Filter
215 | ///
216 | internal void AddFilter()
217 | {
218 | var fpRate = this.FP * Math.Pow(this.R, this.Filters.Count());
219 | var p = new PartitionedBloomFilter(this.Hint, fpRate);
220 | if (this.Filters.Count() > 0)
221 | {
222 | p.SetHash(this.Filters[0].Hash);
223 | }
224 | this.Filters.Add(p);
225 | }
226 | }
227 | }
228 |
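Each new filter i is created with false-positive rate FP * R^i, so the compounded rate over the whole series is bounded by the geometric sum FP / (1 - R); with the default tightening ratio R = 0.8 used above, that bound is 5 * fpRate. A minimal usage sketch (illustrative names):

    using System;
    using System.Text;
    using ProbabilisticDataStructures;

    class ScalableExample // illustrative, not part of the library
    {
        static void Main()
        {
            var f = ScalableBloomFilter.NewDefaultScalableBloomFilter(0.01);
            for (int i = 0; i < 100000; i++)
            {
                f.Add(Encoding.ASCII.GetBytes(i.ToString())); // grows as needed
            }
            // Capacity() sums the capacities of the underlying filter series.
            Console.WriteLine(f.Capacity());
            Console.WriteLine(f.Test(Encoding.ASCII.GetBytes("42"))); // True
        }
    }
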
--------------------------------------------------------------------------------
/ProbabilisticDataStructures/TopK.cs:
--------------------------------------------------------------------------------
1 | namespace ProbabilisticDataStructures
2 | {
3 | ///
4 | /// TopK uses a Count-Min Sketch to calculate the top-k most frequent elements
5 | /// in a stream.
6 | ///
7 | public class TopK
8 | {
9 | private CountMinSketch Cms { get; set; }
10 | private uint K { get; set; }
11 | internal uint N { get; set; }
12 | private ElementHeap elements { get; set; }
13 |
14 | ///
15 | /// Creates a new TopK backed by a Count-Min sketch whose relative accuracy is
16 | /// within a factor of epsilon with probability delta. It tracks the k-most
17 | /// frequent elements.
18 | ///
19 | /// Relative-accuracy factor
20 | /// Relative-accuracy probability
21 | /// Number of top elements to track
22 | ///
23 | public TopK(double epsilon, double delta, uint k)
24 | {
25 | this.Cms = new CountMinSketch(epsilon, delta);
26 | this.K = k;
27 | this.elements = new ElementHeap((int)k);
28 | }
29 |
30 | ///
31 | /// Will add the data to the Count-Min Sketch and update the top-k heap if
32 | /// applicable. Returns the TopK to allow for chaining.
33 | ///
34 | /// The data to add
35 | /// The TopK
36 | public TopK Add(byte[] data)
37 | {
38 | this.Cms.Add(data);
39 | this.N++;
40 |
41 | var freq = this.Cms.Count(data);
42 | if (this.elements.isTop(freq, this.K))
43 | {
44 | elements.insert(data, freq, this.K);
45 | }
46 |
47 | return this;
48 | }
49 |
50 | ///
51 | /// Returns the top-k elements from lowest to highest frequency.
52 | ///
53 | /// The top-k elements from lowest to highest frequency
54 | public Element[] Elements()
55 | {
56 | return elements.Elements();
57 | }
58 |
59 | ///
60 | /// Restores the TopK to its original state. It returns itself to allow for
61 | /// chaining.
62 | ///
63 | /// The TopK
64 | public TopK Reset()
65 | {
66 | this.Cms.Reset();
67 | this.elements = new ElementHeap((int)K);
68 | this.N = 0;
69 | return this;
70 | }
71 | }
72 | }
73 |
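A minimal usage sketch (illustrative names; note that Elements() reports the frequencies recorded in the heap, ordered from lowest to highest):

    using System;
    using System.Text;
    using ProbabilisticDataStructures;

    class TopKExample // illustrative, not part of the library
    {
        static void Main()
        {
            // Track the 2 most frequent items with a CMS whose relative
            // accuracy is within a factor of 0.001 with probability 0.99.
            var topK = new TopK(0.001, 0.99, 2);
            foreach (var w in new[] { "a", "b", "a", "c", "a", "b" })
            {
                topK.Add(Encoding.ASCII.GetBytes(w));
            }
            foreach (var e in topK.Elements())
            {
                Console.WriteLine(Encoding.ASCII.GetString(e.Data) + ": " + e.Freq);
            }
        }
    }
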
--------------------------------------------------------------------------------
/ProbabilisticDataStructures/Utils.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Linq;
3 | using System.Security.Cryptography;
4 | using System.Text;
5 |
6 | namespace ProbabilisticDataStructures
7 | {
8 | public static class Utils
9 | {
10 | ///
11 | /// Calculates the optimal Bloom filter size, m, based on the number of items and
12 | /// the desired rate of false positives.
13 | ///
14 | /// Number of items.
15 | /// Desired false positive rate.
16 | /// The optimal BloomFilter size, m.
17 | public static uint OptimalM(uint n, double fpRate)
18 | {
19 | var optimalM = Math.Ceiling((double)n / ((Math.Log(Defaults.FILL_RATIO) *
20 | Math.Log(1 - Defaults.FILL_RATIO)) / Math.Abs(Math.Log(fpRate))));
21 | return Convert.ToUInt32(optimalM);
22 | }
23 |
24 | ///
25 | /// Calculates the optimal Bloom filter size, m, based on the number of items and
26 | /// the desired rate of false positives.
27 | ///
28 | /// Number of items.
29 | /// Desired false positive rate.
30 | /// The optimal BloomFilter size, m.
31 | public static ulong OptimalM64(ulong n, double fpRate)
32 | {
33 | var optimalM = Math.Ceiling((double)n / ((Math.Log(Defaults.FILL_RATIO) *
34 | Math.Log(1 - Defaults.FILL_RATIO)) / Math.Abs(Math.Log(fpRate))));
35 | return Convert.ToUInt64(optimalM);
36 | }
37 |
38 | ///
39 | /// Calculates the optimal number of hash functions to use for a Bloom filter
40 | /// based on the desired rate of false positives.
41 | ///
42 | /// Desired false positive rate.
43 | /// The optimal number of hash functions, k.
44 | public static uint OptimalK(double fpRate)
45 | {
46 | var optimalK = Math.Ceiling(Math.Log(1 / fpRate, 2));
47 | return Convert.ToUInt32(optimalK);
48 | }
49 |
50 | ///
51 | /// Returns the upper and lower base hash values from which the k hashes are
52 | /// derived. The result will be the same regardless of the endianness of the
53 | /// architecture.
54 | ///
55 | /// The data bytes to hash.
56 | /// The hashing algorithm to use.
57 | /// A HashKernel
58 | public static HashKernelReturnValue HashKernel(byte[] data, HashAlgorithm algorithm)
59 | {
60 | var sum = algorithm.ComputeHash(data);
61 | return HashKernelFromHashBytes(sum);
62 | }
63 |
64 | ///
65 | /// Returns the upper and lower base hash values from which the k hashes are
66 | /// derived using the given hash bytes directly. The result will be the
67 | /// same regardless of the endianness of the architecture. Used by a unit
68 | /// test to confirm the calculation is compatible with the HashKernel from
69 | /// https://github.com/tylertreat/BoomFilters running in Go.
70 | ///
71 | /// The hash bytes.
72 | /// A HashKernel
73 | public static HashKernelReturnValue HashKernelFromHashBytes(byte[] hashBytes)
74 | {
75 | return HashKernelReturnValue.Create(
76 | HashBytesToUInt32(hashBytes, 0),
77 | HashBytesToUInt32(hashBytes, 4)
78 | );
79 | }
80 |
81 | ///
82 | /// Returns the upper and lower base hash values from which the k hashes are
83 | /// derived.
84 | ///
85 | /// The data bytes to hash.
86 | /// The hashing algorithm to use.
87 | /// A HashKernel
88 | public static HashKernel128ReturnValue HashKernel128(byte[] data, HashAlgorithm algorithm)
89 | {
90 | var sum = algorithm.ComputeHash(data);
91 | return HashKernel128ReturnValue.Create(
92 | HashBytesToUInt64(sum, 0),
93 | HashBytesToUInt64(sum, 8)
94 | );
95 | }
96 |
97 | ///
98 | /// Returns the uint represented by the given hash bytes, starting at
99 | /// byte offset. The result will be the same
100 | /// regardless of the endianness of the architecture.
101 | ///
102 | ///
103 | ///
104 | ///
105 | public static uint HashBytesToUInt32(byte[] hashBytes, int offset = 0)
106 | {
107 | return
108 | ((uint)hashBytes[offset]) |
109 | ((uint)hashBytes[offset + 1]) << 8 |
110 | ((uint)hashBytes[offset + 2]) << 16 |
111 | ((uint)hashBytes[offset + 3]) << 24;
112 | }
113 |
114 | ///
115 | /// Returns the ulong represented by the given hash bytes, starting at
116 | /// byte offset. The result will be the same
117 | /// regardless of the endianness of the architecture.
118 | ///
119 | ///
120 | ///
121 | ///
122 | public static ulong HashBytesToUInt64(byte[] hashBytes, int offset = 0)
123 | {
124 | return
125 | ((ulong)hashBytes[offset]) |
126 | ((ulong)hashBytes[offset + 1]) << 8 |
127 | ((ulong)hashBytes[offset + 2]) << 16 |
128 | ((ulong)hashBytes[offset + 3]) << 24 |
129 | ((ulong)hashBytes[offset + 4]) << 32 |
130 | ((ulong)hashBytes[offset + 5]) << 40 |
131 | ((ulong)hashBytes[offset + 6]) << 48 |
132 | ((ulong)hashBytes[offset + 7]) << 56;
133 | }
134 |
135 | ///
136 | /// Compute the hash for the provided bytes.
137 | ///
138 | /// The bytes to hash.
139 | /// The hash string of the bytes.
140 | public static string ComputeHashAsString(byte[] inputBytes, HashAlgorithm hashAlgorithm)
141 | {
142 | // Compute the hash of the input byte array.
143 | byte[] data = hashAlgorithm.ComputeHash(inputBytes);
144 |
145 | // Create a new StringBuilder to collect the bytes and create a string.
146 | StringBuilder sb = new StringBuilder();
147 |
148 | // Loop through each byte of the hashed data and format each one as a
149 | // hexadecimal string.
150 | for (int i = 0; i < data.Length; i++)
151 | {
152 | sb.Append(data[i].ToString("X2"));
153 | }
154 |
155 | // Return the hexadecimal string.
156 | return sb.ToString();
157 | }
158 | }
159 |
160 | public struct HashKernelReturnValue
161 | {
162 | public uint UpperBaseHash { get; private set; }
163 | public uint LowerBaseHash { get; private set; }
164 |
165 | public static HashKernelReturnValue Create(uint lowerBaseHash, uint upperBaseHash)
166 | {
167 | return new HashKernelReturnValue
168 | {
169 | UpperBaseHash = upperBaseHash,
170 | LowerBaseHash = lowerBaseHash
171 | };
172 | }
173 | }
174 |
175 | public struct HashKernel128ReturnValue
176 | {
177 | public ulong UpperBaseHash { get; private set; }
178 | public ulong LowerBaseHash { get; private set; }
179 | public static HashKernel128ReturnValue Create(ulong lowerBaseHash, ulong upperBaseHash)
180 | {
181 | return new HashKernel128ReturnValue
182 | {
183 | UpperBaseHash = upperBaseHash,
184 | LowerBaseHash = lowerBaseHash,
185 | };
186 | }
187 | }
188 | }
189 |
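Two worked examples of the helpers above (illustrative names; the 7 follows directly from ceil(log2(1/0.01)) = 7, and the byte order is the low-byte-first combination spelled out in HashBytesToUInt32):

    using System;
    using ProbabilisticDataStructures;

    class UtilsExample // illustrative, not part of the library
    {
        static void Main()
        {
            // Optimal number of hash functions for a 1% false-positive rate:
            // k = ceil(log2(1 / 0.01)) = ceil(6.64) = 7.
            Console.WriteLine(Utils.OptimalK(0.01)); // 7

            // Bytes combine low-byte-first, independent of the platform's
            // endianness: 0x01 | 0x02<<8 | 0x03<<16 | 0x04<<24 = 0x04030201.
            var h = Utils.HashBytesToUInt32(new byte[] { 0x01, 0x02, 0x03, 0x04 });
            Console.WriteLine(h.ToString("X")); // 4030201
        }
    }
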
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.InteropServices;
4 |
5 | // General Information about an assembly is controlled through the following
6 | // set of attributes. Change these attribute values to modify the information
7 | // associated with an assembly.
8 | [assembly: AssemblyTitle("TestProbabilisticDataStructures")]
9 | [assembly: AssemblyDescription("")]
10 | [assembly: AssemblyConfiguration("")]
11 | [assembly: AssemblyCompany("")]
12 | [assembly: AssemblyProduct("TestProbabilisticDataStructures")]
13 | [assembly: AssemblyCopyright("Copyright © 2015")]
14 | [assembly: AssemblyTrademark("")]
15 | [assembly: AssemblyCulture("")]
16 |
17 | // Setting ComVisible to false makes the types in this assembly not visible
18 | // to COM components. If you need to access a type in this assembly from
19 | // COM, set the ComVisible attribute to true on that type.
20 | [assembly: ComVisible(false)]
21 |
22 | // The following GUID is for the ID of the typelib if this project is exposed to COM
23 | [assembly: Guid("df071d43-8650-491c-a572-4329e4cf8e5f")]
24 |
25 | // Version information for an assembly consists of the following four values:
26 | //
27 | // Major Version
28 | // Minor Version
29 | // Build Number
30 | // Revision
31 | //
32 | // You can specify all the values or you can default the Build and Revision Numbers
33 | // by using the '*' as shown below:
34 | // [assembly: AssemblyVersion("1.0.*")]
35 | [assembly: AssemblyVersion("1.0.0.0")]
36 | [assembly: AssemblyFileVersion("1.0.0.0")]
37 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestBloomFilter.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using ProbabilisticDataStructures;
3 | using System.Text;
4 |
5 | namespace TestProbabilisticDataStructures
6 | {
7 | [TestClass]
8 | public class TestBloomFilter
9 | {
10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a");
11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b");
12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c");
13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x");
14 |
15 | ///
16 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter.
17 | ///
18 | [TestMethod]
19 | public void TestBloomCapacity()
20 | {
21 | var f = new BloomFilter(100, 0.1);
22 | var capacity = f.Capacity();
23 |
24 | Assert.AreEqual(480u, capacity);
25 | }
26 |
27 | ///
28 | /// Ensures that K() returns the number of hash functions in the Bloom Filter.
29 | ///
30 | [TestMethod]
31 | public void TestBloomK()
32 | {
33 | var f = new BloomFilter(100, 0.1);
34 | var k = f.K();
35 |
36 | Assert.AreEqual(4u, k);
37 | }
38 |
39 | ///
40 | /// Ensures that Count returns the number of items added to the filter.
41 | ///
42 | [TestMethod]
43 | public void TestBloomCount()
44 | {
45 | var f = new BloomFilter(100, 0.1);
46 | for (uint i = 0; i < 10; i++)
47 | {
48 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
49 | }
50 |
51 | var count = f.Count();
52 | Assert.AreEqual(10u, count);
53 | }
54 |
55 | ///
56 | /// Ensures that EstimatedFillRatio returns the correct approximation.
57 | ///
58 | [TestMethod]
59 | public void TestBloomEstimatedFillRatio()
60 | {
61 | var f = new BloomFilter(100, 0.5);
62 | for (uint i = 0; i < 100; i++)
63 | {
64 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
65 | }
66 |
67 | var ratio = f.EstimatedFillRatio();
68 | if (ratio > 0.5)
69 | {
70 | Assert.Fail("Expected less than or equal to 0.5, got {0}", ratio);
71 | }
72 | }
73 |
74 | ///
75 | /// Ensures that FillRatio returns the ratio of set bits.
76 | ///
77 | [TestMethod]
78 | public void TestBloomFillRatio()
79 | {
80 | var f = new BloomFilter(100, 0.1);
81 | f.Add(A_BYTES);
82 | f.Add(B_BYTES);
83 | f.Add(C_BYTES);
84 |
85 | var ratio = f.FillRatio();
86 | Assert.AreEqual(0.025, ratio);
87 | }
88 |
89 | ///
90 | /// Ensures that Test, Add, and TestAndAdd behave correctly.
91 | ///
92 | [TestMethod]
93 | public void TestBloomTestAndAdd()
94 | {
95 | var f = new BloomFilter(100, 0.01);
96 |
97 | // 'a' is not in the filter.
98 | if (f.Test(A_BYTES))
99 | {
100 | Assert.Fail("'a' should not be a member");
101 | }
102 |
103 | var addedF = f.Add(A_BYTES);
104 | Assert.AreSame(f, addedF, "Returned BloomFilter should be the same instance");
105 |
106 | // 'a' is now in the filter.
107 | if (!f.Test(A_BYTES))
108 | {
109 | Assert.Fail("'a' should be a member");
110 | }
111 |
112 | // 'a' is still in the filter.
113 | if (!f.TestAndAdd(A_BYTES))
114 | {
115 | Assert.Fail("'a' should be a member");
116 | }
117 |
118 | // 'b' is not in the filter.
119 | if (f.TestAndAdd(B_BYTES))
120 | {
121 | Assert.Fail("'b' should not be a member");
122 | }
123 |
124 | // 'a' is still in the filter.
125 | if (!f.Test(A_BYTES))
126 | {
127 | Assert.Fail("'a' should be a member");
128 | }
129 |
130 | // 'b' is now in the filter.
131 | if (!f.Test(B_BYTES))
132 | {
133 | Assert.Fail("'b' should be a member");
134 | }
135 |
136 | // 'c' is not in the filter.
137 | if (f.Test(C_BYTES))
138 | {
139 | Assert.Fail("'c' should not be a member");
140 | }
141 |
142 | for (int i = 0; i < 1000000; i++)
143 | {
144 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString()));
145 | }
146 |
147 | // 'x' should be a false positive.
148 | if (!f.Test(X_BYTES))
149 | {
150 | Assert.Fail("'x' should be a member");
151 | }
152 | }
153 |
154 | ///
155 | /// Ensures that Reset sets every bit to zero.
156 | ///
157 | [TestMethod]
158 | public void TestBloomReset()
159 | {
160 | var f = new BloomFilter(100, 0.1);
161 | for (int i = 0; i < 1000; i++)
162 | {
163 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
164 | }
165 |
166 | var resetF = f.Reset();
167 | Assert.AreSame(f, resetF, "Returned BloomFilter should be the same instance");
168 |
169 | for (uint i = 0; i < f.Buckets.count; i++)
170 | {
171 | if (f.Buckets.Get(i) != 0)
172 | {
173 | Assert.Fail("Expected all bits to be unset");
174 | }
175 | }
176 | }
177 | }
178 |
179 | [TestClass]
180 | public class BenchmarkBloomFilter
181 | {
182 | private BloomFilter f;
183 | private int n;
184 | private byte[][] data;
185 |
186 | [TestInitialize()]
187 | public void Testinitialize()
188 | {
189 | n = 100000;
190 | f = new BloomFilter(100000, 0.1);
191 | data = new byte[n][];
192 | for (int i = 0; i < n; i++)
193 | {
194 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
195 | }
196 | }
197 |
198 | [TestCleanup()]
199 | public void TestCleanup()
200 | {
201 | f = null;
202 | n = 0;
203 | data = null;
204 | }
205 |
206 | [TestMethod]
207 | public void BenchmarkBloomAdd()
208 | {
209 | for (int i = 0; i < n; i++)
210 | {
211 | f.Add(data[i]);
212 | }
213 | }
214 |
215 | [TestMethod]
216 | public void BenchmarkBloomTest()
217 | {
218 | for (int i = 0; i < n; i++)
219 | {
220 | f.Test(data[i]);
221 | }
222 | }
223 |
224 | [TestMethod]
225 | public void BenchmarkBloomTestAndAdd()
226 | {
227 | for (int i = 0; i < n; i++)
228 | {
229 | f.TestAndAdd(data[i]);
230 | }
231 | }
232 | }
233 | }
234 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestBloomFilter64.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using Microsoft.VisualStudio.TestTools.UnitTesting;
3 | using ProbabilisticDataStructures;
4 | using System.Text;
5 | using System.Collections.Generic;
6 |
7 | namespace TestProbabilisticDataStructures
8 | {
9 | [TestClass]
10 | public class TestBloomFilter64
11 | {
12 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a");
13 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b");
14 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c");
15 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x");
16 |
17 | ///
18 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter.
19 | ///
20 | [TestMethod]
21 | public void TestBloomCapacity()
22 | {
23 | var f = new BloomFilter64(100, 0.1);
24 | var capacity = f.Capacity();
25 |
26 | Assert.AreEqual(480u, capacity);
27 | }
28 |
29 | ///
30 | /// Ensures that K() returns the number of hash functions in the Bloom Filter.
31 | ///
32 | [TestMethod]
33 | public void TestBloom64K()
34 | {
35 | var f = new BloomFilter64(100, 0.1);
36 | var k = f.K();
37 |
38 | Assert.AreEqual(4u, k);
39 | }
40 |
41 | ///
42 | /// Ensures that Count returns the number of items added to the filter.
43 | ///
44 | [TestMethod]
45 | public void TestBloom64Count()
46 | {
47 | var f = new BloomFilter64(100, 0.1);
48 | for (uint i = 0; i < 10; i++)
49 | {
50 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
51 | }
52 |
53 | var count = f.Count();
54 | Assert.AreEqual(10u, count);
55 | }
56 |
57 | ///
58 | /// Ensures that EstimatedFillRatio returns the correct approximation.
59 | ///
60 | [TestMethod]
61 | public void TestBloom64EstimatedFillRatio()
62 | {
63 | var f = new BloomFilter64(100, 0.5);
64 | for (uint i = 0; i < 100; i++)
65 | {
66 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
67 | }
68 |
69 | var ratio = f.EstimatedFillRatio();
70 | if (ratio > 0.5)
71 | {
72 | Assert.Fail("Expected less than or equal to 0.5, got {0}", ratio);
73 | }
74 | }
75 |
76 | ///
77 | /// Ensures that FillRatio returns the ratio of set bits.
78 | ///
79 | [TestMethod]
80 | public void TestBloom64FillRatio()
81 | {
82 | var f = new BloomFilter64(100, 0.1);
83 | f.Add(A_BYTES);
84 | f.Add(B_BYTES);
85 | f.Add(C_BYTES);
86 |
87 | var ratio = f.FillRatio();
88 | Assert.AreEqual(0.025, ratio);
89 | }
90 |
91 | ///
92 | /// Ensures that Test, Add, and TestAndAdd behave correctly.
93 | ///
94 | [TestMethod]
95 | public void TestBloom64TestAndAdd()
96 | {
97 | var f = new BloomFilter64(100, 0.01);
98 |
99 | // 'a' is not in the filter.
100 | if (f.Test(A_BYTES))
101 | {
102 | Assert.Fail("'a' should not be a member");
103 | }
104 |
105 | var addedF = f.Add(A_BYTES);
106 | Assert.AreSame(f, addedF, "Returned BloomFilter64 should be the same instance");
107 |
108 | // 'a' is now in the filter.
109 | if (!f.Test(A_BYTES))
110 | {
111 | Assert.Fail("'a' should be a member");
112 | }
113 |
114 | // 'a' is still in the filter.
115 | if (!f.TestAndAdd(A_BYTES))
116 | {
117 | Assert.Fail("'a' should be a member");
118 | }
119 |
120 | // 'b' is not in the filter.
121 | if (f.TestAndAdd(B_BYTES))
122 | {
123 | Assert.Fail("'b' should not be a member");
124 | }
125 |
126 | // 'a' is still in the filter.
127 | if (!f.Test(A_BYTES))
128 | {
129 | Assert.Fail("'a' should be a member");
130 | }
131 |
132 | // 'b' is now in the filter.
133 | if (!f.Test(B_BYTES))
134 | {
135 | Assert.Fail("'b' should be a member");
136 | }
137 |
138 | // 'c' is not in the filter.
139 | if (f.Test(C_BYTES))
140 | {
141 | Assert.Fail("'c' should not be a member");
142 | }
143 |
144 | for (int i = 0; i < 1000000; i++)
145 | {
146 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString()));
147 | }
148 |
149 | // 'x' should be a false positive.
150 | if (!f.Test(X_BYTES))
151 | {
152 | Assert.Fail("'x' should be a member");
153 | }
154 | }
155 |
156 | ///
157 | /// Ensures that Reset sets every bit to zero.
158 | ///
159 | [TestMethod]
160 | public void TestBloom64Reset()
161 | {
162 | var f = new BloomFilter64(100, 0.1);
163 | for (int i = 0; i < 1000; i++)
164 | {
165 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
166 | }
167 |
168 | var resetF = f.Reset();
169 | Assert.AreSame(f, resetF, "Returned BloomFilter64 should be the same instance");
170 |
171 | for (uint i = 0; i < f.Buckets.count; i++)
172 | {
173 | if (f.Buckets.Get(i) != 0)
174 | {
175 | Assert.Fail("Expected all bits to be unset");
176 | }
177 | }
178 | }
179 | }
180 |
181 | [TestClass]
182 | public class BenchmarkBloomFilter64
183 | {
184 | private BloomFilter64 f;
185 | private int n;
186 | private byte[][] data;
187 |
188 | [TestInitialize()]
189 | public void Testinitialize()
190 | {
191 | n = 100000;
192 | f = new BloomFilter64(100000, 0.1);
193 | data = new byte[n][];
194 | for (int i = 0; i < n; i++)
195 | {
196 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
197 | }
198 | }
199 |
200 | [TestCleanup()]
201 | public void TestCleanup()
202 | {
203 | f = null;
204 | n = 0;
205 | data = null;
206 | }
207 |
208 | [TestMethod]
209 | public void BenchmarkBloom64Add()
210 | {
211 | for (int i = 0; i < n; i++)
212 | {
213 | f.Add(data[i]);
214 | }
215 | }
216 |
217 | [TestMethod]
218 | public void BenchmarkBloom64Test()
219 | {
220 | for (int i = 0; i < n; i++)
221 | {
222 | f.Test(data[i]);
223 | }
224 | }
225 |
226 | [TestMethod]
227 | public void BenchmarkBloom64TestAndAdd()
228 | {
229 | for (int i = 0; i < n; i++)
230 | {
231 | f.TestAndAdd(data[i]);
232 | }
233 | }
234 | }
235 | }
236 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestBuckets.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using ProbabilisticDataStructures;
3 |
4 | namespace TestProbabilisticDataStructures
5 | {
6 | [TestClass]
7 | public class TestBuckets
8 | {
9 | ///
10 | /// Ensures that Max returns the correct maximum based on the bucket
11 | /// size.
12 | ///
13 | [TestMethod]
14 | public void TestMaxBucketValue()
15 | {
16 | var b = new Buckets(10, 2);
17 |
18 | var max = b.MaxBucketValue();
19 | Assert.AreEqual(3, max);
20 | }
21 |
22 | ///
23 | /// Ensures that Count returns the number of buckets.
24 | ///
25 | [TestMethod]
26 | public void TestBucketsCount()
27 | {
28 | var b = new Buckets(10, 2);
29 |
30 | var count = b.count;
31 | Assert.AreEqual(10u, count);
32 | }
33 |
34 | ///
35 | /// Ensures that Increment increments the bucket value by the correct delta and
36 | /// clamps to zero and the maximum, Get returns the correct bucket value, and Set
37 | /// sets the bucket value correctly.
38 | ///
39 | [TestMethod]
40 | public void TestBucketsIncrementAndGetAndSet()
41 | {
42 | var b = new Buckets(5, 2);
43 |
44 | var incrementedB = b.Increment(0, 1);
45 | Assert.AreSame(b, incrementedB, "Returned Buckets should be the same instance");
46 |
47 | var v = b.Get(0);
48 | Assert.AreEqual(1u, v);
49 |
50 | b.Increment(1u, -1);
51 |
52 | v = b.Get(1);
53 | Assert.AreEqual(0u, v);
54 |
55 | var setB = b.Set(2u, 100);
56 | Assert.AreSame(b, setB, "Returned Buckets should be the same instance");
57 |
58 | v = b.Get(2);
59 | Assert.AreEqual(3u, v);
60 |
61 | b.Increment(3, 2);
62 |
63 | v = b.Get(3);
64 | Assert.AreEqual(2u, v);
65 | }
66 |
67 | ///
68 | /// Ensures that Reset restores the Buckets to the original state.
69 | ///
70 | [TestMethod]
71 | public void TestBucketsReset()
72 | {
73 | var b = new Buckets(5, 2);
74 |
75 | for (uint i = 0; i < 5; i++)
76 | {
77 | b.Increment(i, 1);
78 | }
79 |
80 | var resetB = b.Reset();
81 | Assert.AreSame(b, resetB, "Returned Buckets should be the same instance");
82 |
83 | for (uint i = 0; i < 5; i++)
84 | {
85 | var c = b.Get(i);
86 | Assert.AreEqual(0u, c);
87 | }
88 | }
89 |
90 | [TestMethod]
91 | public void BenchmarkBucketsIncrement()
92 | {
93 | var buckets = new Buckets(10000, 10);
94 | for (uint i = 0; i < buckets.count; i++)
95 | {
96 | buckets.Increment(i % 10000, 1);
97 | }
98 | }
99 |
100 | [TestMethod]
101 | public void BenchmarkBucketsSet()
102 | {
103 | var buckets = new Buckets(10000, 10);
104 | for (uint i = 0; i < buckets.count; i++)
105 | {
106 | buckets.Set(i % 10000, 1);
107 | }
108 | }
109 |
110 | [TestMethod]
111 | public void BenchmarkBucketsGet()
112 | {
113 | var buckets = new Buckets(10000, 10);
114 | for (uint i = 0; i < buckets.count; i++)
115 | {
116 | buckets.Get(i % 10000);
117 | }
118 | }
119 | }
120 | }
121 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestBuckets64.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using ProbabilisticDataStructures;
3 |
4 | namespace TestProbabilisticDataStructures
5 | {
6 | [TestClass]
7 | public class TestBuckets64
8 | {
9 | /// <summary>
10 | /// Ensures that Max returns the correct maximum based on the bucket
11 | /// size.
12 | /// </summary>
13 | [TestMethod]
14 | public void TestMaxBucketValue()
15 | {
16 | var b = new Buckets64(10, 2);
17 |
18 | var max = b.MaxBucketValue();
19 | Assert.AreEqual(3, max);
20 | }
21 |
22 | /// <summary>
23 | /// Ensures that Count returns the number of buckets.
24 | /// </summary>
25 | [TestMethod]
26 | public void TestBuckets64Count()
27 | {
28 | var b = new Buckets64(10, 2);
29 |
30 | var count = b.count;
31 | Assert.AreEqual(10u, count);
32 | }
33 |
34 | /// <summary>
35 | /// Ensures that Increment increments the bucket value by the correct delta and
36 | /// clamps to zero and the maximum, Get returns the correct bucket value, and Set
37 | /// sets the bucket value correctly.
38 | /// </summary>
39 | [TestMethod]
40 | public void TestBuckets64IncrementAndGetAndSet()
41 | {
42 | var b = new Buckets64(5, 2);
43 |
44 | var incrementedB = b.Increment(0, 1);
45 | Assert.AreSame(b, incrementedB, "Returned Buckets64 should be the same instance");
46 |
47 | var v = b.Get(0);
48 | Assert.AreEqual(1u, v);
49 |
50 | b.Increment(1u, -1);
51 |
52 | v = b.Get(1);
53 | Assert.AreEqual(0u, v);
54 |
55 | var setB = b.Set(2u, 100);
56 | Assert.AreSame(b, setB, "Returned Buckets64 should be the same instance");
57 |
58 | v = b.Get(2);
59 | Assert.AreEqual(3u, v);
60 |
61 | b.Increment(3, 2);
62 |
63 | v = b.Get(3);
64 | Assert.AreEqual(2u, v);
65 | }
66 |
67 | /// <summary>
68 | /// Ensures that Reset restores the Buckets64 to the original state.
69 | /// </summary>
70 | [TestMethod]
71 | public void TestBuckets64Reset()
72 | {
73 | var b = new Buckets64(5, 2);
74 |
75 | for (uint i = 0; i < 5; i++)
76 | {
77 | b.Increment(i, 1);
78 | }
79 |
80 | var resetB = b.Reset();
81 | Assert.AreSame(b, resetB, "Returned Buckets64 should be the same instance");
82 |
83 | for (uint i = 0; i < 5; i++)
84 | {
85 | var c = b.Get(i);
86 | Assert.AreEqual(0u, c);
87 | }
88 | }
89 |
90 | [TestMethod]
91 | public void BenchmarkBuckets64Increment()
92 | {
93 | var buckets = new Buckets64(10000, 10);
94 | for (uint i = 0; i < buckets.count; i++)
95 | {
96 | buckets.Increment(i % 10000, 1);
97 | }
98 | }
99 |
100 | [TestMethod]
101 | public void BenchmarkBuckets64Set()
102 | {
103 | var buckets = new Buckets64(10000, 10);
104 | for (uint i = 0; i < buckets.count; i++)
105 | {
106 | buckets.Set(i % 10000, 1);
107 | }
108 | }
109 |
110 | [TestMethod]
111 | public void BenchmarkBuckets64Get()
112 | {
113 | var buckets = new Buckets64(10000, 10);
114 | for (uint i = 0; i < buckets.count; i++)
115 | {
116 | buckets.Get(i % 10000);
117 | }
118 | }
119 | }
120 | }
121 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestCountMinSketch.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 | using Microsoft.VisualStudio.TestTools.UnitTesting;
3 | using ProbabilisticDataStructures;
4 |
5 | namespace TestProbabilisticDataStructures
6 | {
7 | [TestClass]
8 | public class TestCountMinSketch
9 | {
10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a");
11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b");
12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c");
13 | private static byte[] D_BYTES = Encoding.ASCII.GetBytes("d");
14 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x");
15 |
16 | /// <summary>
17 | /// Ensures that TotalCount returns the number of items added to the sketch.
18 | /// </summary>
19 | [TestMethod]
20 | public void TestCMSTotalCount()
21 | {
22 | var cms = new CountMinSketch(0.001, 0.99);
23 |
24 | for (int i = 0; i < 100; i++)
25 | {
26 | cms.Add(Encoding.ASCII.GetBytes(i.ToString()));
27 | }
28 |
29 | var count = cms.TotalCount();
30 | Assert.AreEqual(100u, count);
31 | }
32 |
33 | /// <summary>
34 | /// Ensures that Add adds to the set and Count returns the correct approximation.
35 | /// </summary>
36 | [TestMethod]
37 | public void TestCMSAddAndCount()
38 | {
39 | var cms = new CountMinSketch(0.001, 0.99);
40 |
41 | var addedCms = cms.Add(A_BYTES);
42 | Assert.AreSame(cms, addedCms);
43 |
44 | cms.Add(B_BYTES);
45 | cms.Add(C_BYTES);
46 | cms.Add(B_BYTES);
47 | cms.Add(D_BYTES);
48 | cms.Add(A_BYTES).Add(A_BYTES);
49 |
50 | var count = cms.Count(A_BYTES);
51 | Assert.AreEqual(3u, count);
52 |
53 | count = cms.Count(B_BYTES);
54 | Assert.AreEqual(2u, count);
55 |
56 | count = cms.Count(C_BYTES);
57 | Assert.AreEqual(1u, count);
58 |
59 | count = cms.Count(D_BYTES);
60 | Assert.AreEqual(1u, count);
61 |
62 | count = cms.Count(X_BYTES);
63 | Assert.AreEqual(0u, count);
64 | }
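
        // Count-min estimates are one-sided: Count never returns less than an
        // item's true frequency, only more, since colliding keys can inflate a
        // counter. A minimal sketch of that guarantee, assuming the
        // (epsilon, confidence) reading of the constructor arguments used
        // throughout this file:
        [TestMethod]
        public void TestCMSNeverUndercountsSketch()
        {
            var cms = new CountMinSketch(0.001, 0.99);
            // Add each of ten distinct keys exactly 100 times.
            for (int i = 0; i < 1000; i++)
            {
                cms.Add(Encoding.ASCII.GetBytes((i % 10).ToString()));
            }
            for (int i = 0; i < 10; i++)
            {
                var count = cms.Count(Encoding.ASCII.GetBytes(i.ToString()));
                Assert.IsTrue(count >= 100u, "count-min estimates never undercount");
            }
        }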
65 |
66 | /// <summary>
67 | /// Ensures that Merge combines the two sketches.
68 | /// </summary>
69 | [TestMethod]
70 | public void TestCMSMerge()
71 | {
72 | var cms = new CountMinSketch(0.001, 0.99);
73 | cms.Add(B_BYTES);
74 | cms.Add(C_BYTES);
75 | cms.Add(B_BYTES);
76 | cms.Add(D_BYTES);
77 | cms.Add(A_BYTES).Add(A_BYTES);
78 |
79 | var other = new CountMinSketch(0.001, 0.99);
80 | other.Add(B_BYTES);
81 | other.Add(C_BYTES);
82 | other.Add(B_BYTES);
83 |
84 | var wasMerged = cms.Merge(other);
85 | Assert.IsTrue(wasMerged);
86 |
87 | var count = cms.Count(A_BYTES);
88 | Assert.AreEqual(2u, count);
89 |
90 | count = cms.Count(B_BYTES);
91 | Assert.AreEqual(4u, count);
92 |
93 | count = cms.Count(C_BYTES);
94 | Assert.AreEqual(2u, count);
95 |
96 | count = cms.Count(D_BYTES);
97 | Assert.AreEqual(1u, count);
98 |
99 | count = cms.Count(X_BYTES);
100 | Assert.AreEqual(0u, count);
101 | }
102 |
103 | /// <summary>
104 | /// Ensures that Reset restores the sketch to its original state.
105 | /// </summary>
106 | [TestMethod]
107 | public void TestCMSReset()
108 | {
109 | var cms = new CountMinSketch(0.001, 0.99);
110 | cms.Add(B_BYTES);
111 | cms.Add(C_BYTES);
112 | cms.Add(B_BYTES);
113 | cms.Add(D_BYTES);
114 | cms.Add(A_BYTES).Add(A_BYTES);
115 |
116 | var resetCms = cms.Reset();
117 | Assert.AreSame(cms, resetCms);
118 |
119 | for (uint i = 0; i < cms.Depth; i++)
120 | {
121 | for (int j = 0; j < cms.Width; j++)
122 | {
123 | if (cms.Matrix[i][j] != 0)
124 | {
125 | Assert.Fail("Expected matrix to be completely empty.");
126 | }
127 | }
128 | }
129 | }
130 |
131 | [TestMethod]
132 | public void BenchmarkCMSAdd()
133 | {
134 | var n = 100000;
135 | var cms = new CountMinSketch(0.001, 0.99);
136 | var data = new byte[n][];
137 | for (int i = 0; i < n; i++)
138 | {
139 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
140 | }
141 |
142 | for (int i = 0; i < n; i++)
143 | {
144 | cms.Add(data[i]);
145 | }
146 | }
147 |
148 | [TestMethod]
149 | public void BenchmarkCMSCount()
150 | {
151 | var n = 100000;
152 | var cms = new CountMinSketch(0.001, 0.99);
153 | var data = new byte[n][];
154 | for (int i = 0; i < n; i++)
155 | {
156 | var byteArray = Encoding.ASCII.GetBytes(i.ToString());
157 | data[i] = byteArray;
158 | cms.Add(byteArray);
159 | }
160 |
161 | for (int i = 0; i < n; i++)
162 | {
163 | cms.Add(data[i]);
164 | }
165 | }
166 |
167 | // TODO: Implement these later.
168 | // TestCMSSerialization
169 | // BenchmarkCMSWriteDataTo
170 | // BenchmarkCMSReadDataFrom
171 | }
172 | }
173 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestCountingBloomFilter.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using ProbabilisticDataStructures;
3 | using System.Text;
4 |
5 | namespace TestProbabilisticDataStructures
6 | {
7 | [TestClass]
8 | public class TestCountingBloomFilter
9 | {
10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a");
11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b");
12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c");
13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x");
14 |
15 | /// <summary>
16 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter.
17 | /// </summary>
18 | [TestMethod]
19 | public void TestCountingCapacity()
20 | {
21 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1);
22 | var capacity = f.Capacity();
23 |
24 | Assert.AreEqual(480u, capacity);
25 | }
26 |
27 | /// <summary>
28 | /// Ensures that K() returns the number of hash functions in the Bloom Filter.
29 | /// </summary>
30 | [TestMethod]
31 | public void TestCountingK()
32 | {
33 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1);
34 | var k = f.K();
35 |
36 | Assert.AreEqual(4u, k);
37 | }
38 |
39 | /// <summary>
40 | /// Ensures that Count returns the number of items added to the filter.
41 | /// </summary>
42 | [TestMethod]
43 | public void TestCountingCount()
44 | {
45 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1);
46 | for (uint i = 0; i < 10; i++)
47 | {
48 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
49 | }
50 |
51 | for (int i = 0; i < 5; i++)
52 | {
53 | f.TestAndRemove(Encoding.ASCII.GetBytes(i.ToString()));
54 | }
55 |
56 | var count = f.Count();
57 | Assert.AreEqual(5u, count);
58 | }
59 |
60 | /// <summary>
61 | /// Ensures that Test, Add, and TestAndAdd behave correctly.
62 | /// </summary>
63 | [TestMethod]
64 | public void TestCountingTestAndAdd()
65 | {
66 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.01);
67 |
68 | // 'a' is not in the filter.
69 | if (f.Test(A_BYTES))
70 | {
71 | Assert.Fail("'a' should not be a member");
72 | }
73 |
74 | var addedF = f.Add(A_BYTES);
75 | Assert.AreSame(f, addedF, "Returned BloomFilter should be the same instance");
76 |
77 | // 'a' is now in the filter.
78 | if (!f.Test(A_BYTES))
79 | {
80 | Assert.Fail("'a' should be a member");
81 | }
82 |
83 | // 'a' is still in the filter.
84 | if (!f.TestAndAdd(A_BYTES))
85 | {
86 | Assert.Fail("'a' should be a member");
87 | }
88 |
89 | // 'b' is not in the filter.
90 | if (f.TestAndAdd(B_BYTES))
91 | {
92 | Assert.Fail("'b' should not be a member");
93 | }
94 |
95 | // 'a' is still in the filter.
96 | if (!f.Test(A_BYTES))
97 | {
98 | Assert.Fail("'a' should be a member");
99 | }
100 |
101 | // 'b' is now in the filter.
102 | if (!f.Test(B_BYTES))
103 | {
104 | Assert.Fail("'b' should be a member");
105 | }
106 |
107 | // 'c' is not in the filter.
108 | if (f.Test(C_BYTES))
109 | {
110 | Assert.Fail("'c' should not be a member");
111 | }
112 |
113 | for (int i = 0; i < 1000000; i++)
114 | {
115 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString()));
116 | }
117 |
118 | // 'x' should be a false positive: after a million adds to a filter sized for 100 items, the filter is saturated.
119 | if (!f.Test(X_BYTES))
120 | {
121 | Assert.Fail("'x' should be a member");
122 | }
123 | }
124 |
125 | /// <summary>
126 | /// Ensures that TestAndRemove behaves correctly.
127 | /// </summary>
128 | [TestMethod]
129 | public void TestCountingTestAndRemove()
130 | {
131 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.01);
132 |
133 | // 'a' is not in the filter.
134 | if (f.TestAndRemove(A_BYTES))
135 | {
136 | Assert.Fail("'a' should not be a member");
137 | }
138 |
139 | f.Add(Encoding.ASCII.GetBytes("a"));
140 |
141 | // 'a' is now in the filter.
142 | if (!f.TestAndRemove(A_BYTES))
143 | {
144 | Assert.Fail("'a' should be a member");
145 | }
146 |
147 | // 'a' is no longer in the filter.
148 | if (f.TestAndRemove(A_BYTES))
149 | {
150 | Assert.Fail("'a' should not be a member");
151 | }
152 | }
153 |
154 | /// <summary>
155 | /// Ensures that Reset sets every bit to zero and the count is zero.
156 | /// </summary>
157 | [TestMethod]
158 | public void TestCountingReset()
159 | {
160 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100, 0.1);
161 | for (int i = 0; i < 1000; i++)
162 | {
163 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
164 | }
165 |
166 | var resetF = f.Reset();
167 | Assert.AreSame(f, resetF, "Returned CountingBloomFilter should be the same instance");
168 |
169 | for (uint i = 0; i < f.Buckets.count; i++)
170 | {
171 | if (f.Buckets.Get(i) != 0)
172 | {
173 | Assert.Fail("Expected all bits to be unset");
174 | }
175 | }
176 |
177 | Assert.AreEqual(0u, f.Count());
178 | }
179 |
180 | [TestMethod]
181 | public void BenchmarkCountingAdd()
182 | {
183 | var n = 100000;
184 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100000, 0.1);
185 | var data = new byte[n][];
186 | for (int i = 0; i < n; i++)
187 | {
188 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
189 | }
190 |
191 | for (int i = 0; i < n; i++)
192 | {
193 | f.Add(data[i]);
194 | }
195 | }
196 |
197 | [TestMethod]
198 | public void BenchmarkCountingTest()
199 | {
200 | var n = 100000;
201 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100000, 0.1);
202 | var data = new byte[n][];
203 | for (int i = 0; i < n; i++)
204 | {
205 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
206 | }
207 |
208 | for (int i = 0; i < n; i++)
209 | {
210 | f.Test(data[i]);
211 | }
212 | }
213 |
214 | [TestMethod]
215 | public void BenchmarkCountingTestAndAdd()
216 | {
217 | var n = 100000;
218 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100000, 0.1);
219 | var data = new byte[n][];
220 | for (int i = 0; i < n; i++)
221 | {
222 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
223 | }
224 |
225 | for (int i = 0; i < n; i++)
226 | {
227 | f.TestAndAdd(data[i]);
228 | }
229 | }
230 |
231 | [TestMethod]
232 | public void BenchmarkCountingTestAndRemove()
233 | {
234 | var n = 100000;
235 | var f = CountingBloomFilter.NewDefaultCountingBloomFilter(100000, 0.1);
236 | var data = new byte[n][];
237 | for (int i = 0; i < n; i++)
238 | {
239 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
240 | }
241 |
242 | for (int i = 0; i < n; i++)
243 | {
244 | f.TestAndRemove(data[i]);
245 | }
246 | }
247 | }
248 | }
249 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestCuckooBloomFilter.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using ProbabilisticDataStructures;
3 | using System.Text;
4 |
5 | namespace TestProbabilisticDataStructures
6 | {
7 | [TestClass]
8 | public class TestCuckooBloomFilter
9 | {
10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a");
11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b");
12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c");
13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x");
14 |
15 | /// <summary>
16 | /// Ensures that Buckets returns the number of buckets, m, in the Cuckoo Filter.
17 | /// </summary>
18 | [TestMethod]
19 | public void TestCuckooBuckets()
20 | {
21 | var f = new CuckooBloomFilter(100, 0.1);
22 | var buckets = f.BucketCount();
23 |
24 | Assert.AreEqual(1024u, buckets);
25 | }
26 |
27 | /// <summary>
28 | /// Ensures that Capacity returns the expected filter capacity.
29 | /// </summary>
30 | [TestMethod]
31 | public void TestCuckooCapacity()
32 | {
33 | var f = new CuckooBloomFilter(100, 0.1);
34 | var capacity = f.Capacity();
35 |
36 | Assert.AreEqual(100u, capacity);
37 | }
38 |
39 | /// <summary>
40 | /// Ensures that Count returns the number of items added to the filter.
41 | /// </summary>
42 | [TestMethod]
43 | public void TestCuckooCount()
44 | {
45 | var f = new CuckooBloomFilter(100, 0.1);
46 | for (int i = 0; i < 10; i++)
47 | {
48 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
49 | }
50 |
51 | for (int i = 0; i < 5; i++)
52 | {
53 | f.TestAndRemove(Encoding.ASCII.GetBytes(i.ToString()));
54 | }
55 |
56 | var count = f.Count();
57 | Assert.AreEqual(5u, count);
58 | }
59 |
60 | /// <summary>
61 | /// Ensures that Test, Add, and TestAndAdd behave correctly.
62 | /// </summary>
63 | [TestMethod]
64 | public void TestCuckooTestAndAdd()
65 | {
66 | var f = new CuckooBloomFilter(100, 0.1);
67 |
68 | // 'a' is not in the filter.
69 | if (f.Test(A_BYTES))
70 | {
71 | Assert.Fail("'a' should not be a member");
72 | }
73 |
74 | if (!f.Add(A_BYTES))
75 | {
76 | Assert.Fail("Should return true");
77 | }
78 |
79 | // 'a' is now in the filter.
80 | if (!f.Test(A_BYTES))
81 | {
82 | Assert.Fail("'a' should be a member");
83 | }
84 |
85 | // 'a' is still in the filter.
86 | var testAndAdd = f.TestAndAdd(A_BYTES);
87 | if (!testAndAdd.WasAlreadyAMember)
88 | {
89 | Assert.Fail("'a' should be a member");
90 | }
91 | // Should not have added
92 | Assert.IsFalse(testAndAdd.Added);
93 |
94 | // 'b' is not in the filter.
95 | testAndAdd = f.TestAndAdd(B_BYTES);
96 | if (testAndAdd.WasAlreadyAMember)
97 | {
98 | Assert.Fail("'b' should not be a member");
99 | }
100 | // Should add
101 | Assert.IsTrue(testAndAdd.Added);
102 |
103 | // 'a' is still in the filter.
104 | if (!f.Test(A_BYTES))
105 | {
106 | Assert.Fail("'a' should be a member");
107 | }
108 |
109 | // 'b' is now in the filter.
110 | if (!f.Test(B_BYTES))
111 | {
112 | Assert.Fail("'b' should be a member");
113 | }
114 |
115 | // 'c' is not in the filter.
116 | if (f.Test(C_BYTES))
117 | {
118 | Assert.Fail("'c' should not be a member");
119 | }
120 |
121 | for (int i = 0; i < 10000; i++)
122 | {
123 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
124 | }
125 |
126 | // Filter should be full.
127 | testAndAdd = f.TestAndAdd(X_BYTES);
128 | // Make sure not there
129 | Assert.IsFalse(testAndAdd.WasAlreadyAMember);
130 | // Make sure didn't add
131 | Assert.IsFalse(testAndAdd.Added);
132 | }
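
        // Note the contrast with the Bloom filters in this suite: a cuckoo filter
        // can refuse an insert outright. Once a fingerprint's candidate buckets
        // (and the relocation chain) are exhausted, Add fails instead of silently
        // degrading, which is what Added = false on the saturated filter checks.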
133 |
134 | /// <summary>
135 | /// Ensures that TestAndRemove behaves correctly.
136 | /// </summary>
137 | [TestMethod]
138 | public void TestCuckooTestAndRemove()
139 | {
140 | var f = new CuckooBloomFilter(100, 0.1);
141 |
142 | // 'a' is not in the filter.
143 | if (f.Test(A_BYTES))
144 | {
145 | Assert.Fail("'a' should not be a member");
146 | }
147 |
148 | f.Add(A_BYTES);
149 |
150 | // 'a' is now in the filter.
151 | if (!f.TestAndRemove(A_BYTES))
152 | {
153 | Assert.Fail("'a' should be a member");
154 | }
155 |
156 | // 'a' is no longer in the filter.
157 | if (f.Test(A_BYTES))
158 | {
159 | Assert.Fail("'a' should not be a member");
160 | }
161 | }
162 |
163 | /// <summary>
164 | /// Ensures that Reset clears all buckets and the count is zero.
165 | /// </summary>
166 | [TestMethod]
167 | public void TestCuckooReset()
168 | {
169 | var f = new CuckooBloomFilter(100, 0.1);
170 | for (int i = 0; i < 1000; i++)
171 | {
172 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
173 | }
174 |
175 | var resetFilter = f.Reset();
176 | Assert.AreSame(f, resetFilter);
177 |
178 | for (int i = 0; i < f.BucketCount(); i++)
179 | {
180 | for (uint j = 0; j < f.B; j++)
181 | {
182 | if (f.Buckets[i][j] != null)
183 | {
184 | Assert.Fail("Expected all buckets to be cleared");
185 | }
186 | }
187 | }
188 |
189 | Assert.AreEqual(0u, f.Count());
190 | }
191 |
192 | [TestMethod]
193 | public void BenchmarkCuckooAdd()
194 | {
195 | var n = 100000u;
196 | var f = new CuckooBloomFilter(n, 0.1);
197 | var data = new byte[n][];
198 | for (int i = 0; i < n; i++)
199 | {
200 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
201 | }
202 |
203 | for (int i = 0; i < n; i++)
204 | {
205 | f.Add(data[i]);
206 | }
207 | }
208 |
209 | [TestMethod]
210 | public void BenchmarkCuckooTest()
211 | {
212 | var n = 100000u;
213 | var f = new CuckooBloomFilter(n, 0.1);
214 | var data = new byte[n][];
215 | for (int i = 0; i < n; i++)
216 | {
217 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
218 | }
219 |
220 | for (int i = 0; i < n; i++)
221 | {
222 | f.Test(data[i]);
223 | }
224 | }
225 |
226 | [TestMethod]
227 | public void BenchmarkCuckooTestAndAdd()
228 | {
229 | var n = 100000u;
230 | var f = new CuckooBloomFilter(n, 0.1);
231 | var data = new byte[n][];
232 | for (int i = 0; i < n; i++)
233 | {
234 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
235 | }
236 |
237 | for (int i = 0; i < n; i++)
238 | {
239 | f.TestAndAdd(data[i]);
240 | }
241 | }
242 |
243 | [TestMethod]
244 | public void BenchmarkCuckooTestAndRemove()
245 | {
246 | var n = 100000u;
247 | var f = new CuckooBloomFilter(n, 0.1);
248 | var data = new byte[n][];
249 | for (int i = 0; i < n; i++)
250 | {
251 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
252 | }
253 |
254 | for (int i = 0; i < n; i++)
255 | {
256 | f.TestAndRemove(data[i]);
257 | }
258 | }
259 | }
260 | }
261 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestDeletableBloomFilter.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using ProbabilisticDataStructures;
3 | using System.Text;
4 |
5 | namespace TestProbabilisticDataStructures
6 | {
7 | [TestClass]
8 | public class TestDeletableBloomFilter
9 | {
10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a");
11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b");
12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c");
13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x");
14 |
15 | /// <summary>
16 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter.
17 | /// </summary>
18 | [TestMethod]
19 | public void TestDeletableCapacity()
20 | {
21 | var d = new DeletableBloomFilter(100, 10, 0.1);
22 | var capacity = d.Capacity();
23 |
24 | Assert.AreEqual(470u, capacity);
25 | }
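
        // 470 = OptimalM(100, 0.1) - 10: the filter sizes itself at the optimal
        // 480 bits, then apparently dedicates one bit per region (10 here) to the
        // collision bitmap, leaving 470 for membership. (Inferred from the
        // numbers; the actual sizing lives in DeletableBloomFilter.cs.)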
26 |
27 | /// <summary>
28 | /// Ensures that K() returns the number of hash functions in the Bloom Filter.
29 | /// </summary>
30 | [TestMethod]
31 | public void TestDeletableK()
32 | {
33 | var d = new DeletableBloomFilter(100, 10, 0.1);
34 | var k = d.K();
35 |
36 | Assert.AreEqual(4u, k);
37 | }
38 |
39 | /// <summary>
40 | /// Ensures that Count returns the number of items added to the filter.
41 | /// </summary>
42 | [TestMethod]
43 | public void TestDeletableCount()
44 | {
45 | var d = new DeletableBloomFilter(100, 10, 0.1);
46 | for (uint i = 0; i < 10; i++)
47 | {
48 | d.Add(Encoding.ASCII.GetBytes(i.ToString()));
49 | }
50 |
51 | for (int i = 0; i < 5; i++)
52 | {
53 | d.TestAndRemove(Encoding.ASCII.GetBytes(i.ToString()));
54 | }
55 |
56 | var count = d.Count();
57 | Assert.AreEqual(5u, count);
58 | }
59 |
60 | /// <summary>
61 | /// Ensures that Test, Add, and TestAndAdd behave correctly.
62 | /// </summary>
63 | [TestMethod]
64 | public void TestDeletableTestAndAdd()
65 | {
66 | var d = new DeletableBloomFilter(100, 10, 0.1);
67 |
68 | // 'a' is not in the filter.
69 | if (d.Test(A_BYTES))
70 | {
71 | Assert.Fail("'a' should not be a member");
72 | }
73 |
74 | var addedF = d.Add(A_BYTES);
75 | Assert.AreSame(d, addedF, "Returned CountingBloomFilter should be the same instance");
76 |
77 | // 'a' is now in the filter.
78 | if (!d.Test(A_BYTES))
79 | {
80 | Assert.Fail("'a' should be a member");
81 | }
82 |
83 | // 'a' is still in the filter.
84 | if (!d.TestAndAdd(A_BYTES))
85 | {
86 | Assert.Fail("'a' should be a member");
87 | }
88 |
89 | // 'b' is not in the filter.
90 | if (d.TestAndAdd(B_BYTES))
91 | {
92 | Assert.Fail("'b' should not be a member");
93 | }
94 |
95 | // 'a' is still in the filter.
96 | if (!d.Test(A_BYTES))
97 | {
98 | Assert.Fail("'a' should be a member");
99 | }
100 |
101 | // 'b' is now in the filter.
102 | if (!d.Test(B_BYTES))
103 | {
104 | Assert.Fail("'b' should be a member");
105 | }
106 |
107 | // 'c' is not in the filter.
108 | if (d.Test(C_BYTES))
109 | {
110 | Assert.Fail("'c' should not be a member");
111 | }
112 |
113 | for (int i = 0; i < 1000000; i++)
114 | {
115 | d.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString()));
116 | }
117 |
118 | // 'x' should be a false positive: after a million adds to a filter sized for 100 items, the filter is saturated.
119 | if (!d.Test(X_BYTES))
120 | {
121 | Assert.Fail("'x' should be a member");
122 | }
123 | }
124 |
125 | /// <summary>
126 | /// Ensures that TestAndRemove behaves correctly.
127 | /// </summary>
128 | [TestMethod]
129 | public void TestDeletableTestAndRemove()
130 | {
131 | var d = new DeletableBloomFilter(100, 10, 0.1);
132 |
133 | // 'a' is not in the filter.
134 | if (d.TestAndRemove(A_BYTES))
135 | {
136 | Assert.Fail("'a' should not be a member");
137 | }
138 |
139 | d.Add(A_BYTES);
140 |
141 | // 'a' is now in the filter.
142 | if (!d.TestAndRemove(A_BYTES))
143 | {
144 | Assert.Fail("'a' should be a member");
145 | }
146 |
147 | // 'a' is no longer in the filter.
148 | if (d.TestAndRemove(A_BYTES))
149 | {
150 | Assert.Fail("'a' should not be a member");
151 | }
152 | }
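
        // A deletable Bloom filter can only remove items whose bits all fall in
        // collision-free regions; the Collisions buckets inspected by the Reset
        // test below track regions where bits were set more than once and removal
        // is therefore no longer safe.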
153 |
154 | /// <summary>
155 | /// Ensures that Reset sets every bit to zero.
156 | /// </summary>
157 | [TestMethod]
158 | public void TestDeletableReset()
159 | {
160 | var d = new DeletableBloomFilter(100, 10, 0.1);
161 | for (int i = 0; i < 1000; i++)
162 | {
163 | d.Add(Encoding.ASCII.GetBytes(i.ToString()));
164 | }
165 |
166 | var resetF = d.Reset();
167 | Assert.AreSame(d, resetF, "Returned DeletableBloomFilter should be the same instance");
168 |
169 | for (uint i = 0; i < d.Buckets.count; i++)
170 | {
171 | if (d.Buckets.Get(i) != 0)
172 | {
173 | Assert.Fail("Expected all bits to be unset");
174 | }
175 | }
176 |
177 | for (uint i = 0; i < d.Collisions.count; i++)
178 | {
179 | if (d.Collisions.Get(i) != 0)
180 | {
181 | Assert.Fail("Expected all bits to be unset");
182 | }
183 | }
184 |
185 | var count = d.Count();
186 | Assert.AreEqual(0u, count);
187 | }
188 |
189 | [TestMethod]
190 | public void BenchmarkDeletableAdd()
191 | {
192 | var n = 100000;
193 | var d = new DeletableBloomFilter(100, 10, 0.1);
194 | var data = new byte[n][];
195 | for (int i = 0; i < n; i++)
196 | {
197 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
198 | }
199 |
200 | for (int i = 0; i < n; i++)
201 | {
202 | d.Add(data[i]);
203 | }
204 | }
205 |
206 | [TestMethod]
207 | public void BenchmarkDeletableTest()
208 | {
209 | var n = 100000;
210 | var d = new DeletableBloomFilter(100, 10, 0.1);
211 | var data = new byte[n][];
212 | for (int i = 0; i < n; i++)
213 | {
214 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
215 | }
216 |
217 | for (int i = 0; i < n; i++)
218 | {
219 | d.Test(data[i]);
220 | }
221 | }
222 |
223 | [TestMethod]
224 | public void BenchmarkDeletableTestAndAdd()
225 | {
226 | var n = 100000;
227 | var d = new DeletableBloomFilter(100, 10, 0.1);
228 | var data = new byte[n][];
229 | for (int i = 0; i < n; i++)
230 | {
231 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
232 | }
233 |
234 | for (int i = 0; i < n; i++)
235 | {
236 | d.TestAndAdd(data[i]);
237 | }
238 | }
239 |
240 | [TestMethod]
241 | public void BenchmarkDeletableTestAndRemove()
242 | {
243 | var n = 100000;
244 | var d = new DeletableBloomFilter(100, 10, 0.1);
245 | var data = new byte[n][];
246 | for (int i = 0; i < n; i++)
247 | {
248 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
249 | }
250 |
251 | for (int i = 0; i < n; i++)
252 | {
253 | d.TestAndRemove(data[i]);
254 | }
255 | }
256 | }
257 | }
258 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestHyperLogLog.cs:
--------------------------------------------------------------------------------
1 | /*
2 | Original work Copyright 2013 Eric Lesh
3 | Modified work Copyright 2015 Tyler Treat
4 | Modified work Copyright 2015 Matthew Lorimor
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining
7 | a copy of this software and associated documentation files (the
8 | "Software"), to deal in the Software without restriction, including
9 | without limitation the rights to use, copy, modify, merge, publish,
10 | distribute, sublicense, and/or sell copies of the Software, and to
11 | permit persons to whom the Software is furnished to do so, subject to
12 | the following conditions:
13 |
14 | The above copyright notice and this permission notice shall be
15 | included in all copies or substantial portions of the Software.
16 | */
17 |
18 | using System;
19 | using Microsoft.VisualStudio.TestTools.UnitTesting;
20 | using ProbabilisticDataStructures;
21 | using System.Text;
22 | using System.Threading.Tasks;
23 |
24 | namespace TestProbabilisticDataStructures
25 | {
26 | [TestClass]
27 | public class TestHyperLogLog
28 | {
29 | private double geterror(UInt64 actual, UInt64 estimate)
30 | {
31 | return ((float)estimate - (float)actual) / (float)actual;
32 | }
33 |
34 | private void testHyperLogLog(int n, int lowB, int highB)
35 | {
36 | var words = Words.Dictionary(n);
37 | var bad = 0;
38 | var nWords = (UInt64)words.LongLength;
39 |
40 | var options = new ParallelOptions();
41 | options.MaxDegreeOfParallelism = 4;
42 | Parallel.For(lowB, highB, options, i =>
43 | {
44 | var m = (uint)Math.Pow(2, i);
45 |
46 | HyperLogLog h = null;
47 | try
48 | {
49 | h = new HyperLogLog(m);
50 | }
51 | catch (Exception)
52 | {
53 | Assert.Fail(string.Format("Can't make HyperLogLog({0})", m));
54 | }
55 |
56 | foreach (var word in words)
57 | {
58 | h.Add(Encoding.ASCII.GetBytes(word));
59 | }
60 |
61 | var expectedError = 1.04 / Math.Sqrt(m);
62 | var actualError = Math.Abs(this.geterror(nWords, h.Count()));
63 |
64 | if (actualError > expectedError)
65 | {
66 | System.Threading.Interlocked.Increment(ref bad); // atomic: iterations run in parallel
67 | //Assert.Fail(string.Format("Expected: {0}, Actual: {1}", expectedError, actualError));
68 | }
69 | });
70 | }
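
        // The expectedError used above is the standard HyperLogLog bound: with
        // m = 2^b registers, the relative error of Count() is roughly
        // 1.04 / sqrt(m). A minimal sketch of the arithmetic (values
        // hand-computed from the formula, not taken from the implementation):
        [TestMethod]
        public void TestHLLErrorBoundArithmeticSketch()
        {
            Assert.AreEqual(0.065, 1.04 / Math.Sqrt(Math.Pow(2, 8)), 1e-3);
            Assert.AreEqual(0.0081, 1.04 / Math.Sqrt(Math.Pow(2, 14)), 1e-4);
        }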
71 |
72 | private void benchmarkCount(int registers)
73 | {
74 | var n = 100000;
75 | var words = Words.Dictionary(0);
76 | var m = (uint)Math.Pow(2, registers);
77 |
78 | var h = new HyperLogLog(m);
79 |
80 | foreach (var word in words)
81 | {
82 | h.Add(Encoding.ASCII.GetBytes(word));
83 | }
84 |
85 | for (int i = 0; i < n; i++)
86 | {
87 | h.Count();
88 | }
89 | }
90 |
91 | [TestMethod]
92 | public void TestHyperLogLogSmall()
93 | {
94 | this.testHyperLogLog(5, 4, 17);
95 | }
96 |
97 | [TestMethod]
98 | public void TestHyperLogLogBig()
99 | {
100 | this.testHyperLogLog(0, 4, 17);
101 | }
102 |
103 | [TestMethod]
104 | public void TestNewDefaultHyperLogLog()
105 | {
106 | var hll = HyperLogLog.NewDefaultHyperLogLog(0.1);
107 |
108 | Assert.AreEqual(128u, hll.M);
109 | }
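
        // 128 is consistent with inverting the error bound: a target error of 0.1
        // needs m >= (1.04 / 0.1)^2 = 108.16 registers, rounded up to the next
        // power of two. (Inferred from the bound; the exact sizing rule is in
        // HyperLogLog.cs.)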
110 |
111 | [TestMethod]
112 | public void BenchmarkHLLCount4()
113 | {
114 | this.benchmarkCount(4);
115 | }
116 |
117 | [TestMethod]
118 | public void BenchmarkHLLCount5()
119 | {
120 | this.benchmarkCount(5);
121 | }
122 |
123 | [TestMethod]
124 | public void BenchmarkHLLCount6()
125 | {
126 | this.benchmarkCount(6);
127 | }
128 |
129 | [TestMethod]
130 | public void BenchmarkHLLCount7()
131 | {
132 | this.benchmarkCount(7);
133 | }
134 |
135 | [TestMethod]
136 | public void BenchmarkHLLCount8()
137 | {
138 | this.benchmarkCount(8);
139 | }
140 |
141 | [TestMethod]
142 | public void BenchmarkHLLCount9()
143 | {
144 | this.benchmarkCount(9);
145 | }
146 |
147 | [TestMethod]
148 | public void BenchmarkHLLCount10()
149 | {
150 | this.benchmarkCount(10);
151 | }
152 | }
153 | }
154 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestInverseBloomFilter.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using System.Text;
3 | using ProbabilisticDataStructures;
4 |
5 | namespace TestProbabilisticDataStructures
6 | {
7 | [TestClass]
8 | public class TestInverseBloomFilter
9 | {
10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a");
11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b");
12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c");
13 | private static byte[] D_BYTES = Encoding.ASCII.GetBytes("d");
14 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x");
15 |
16 | /// <summary>
17 | /// Ensures that Capacity returns the correct filter size.
18 | /// </summary>
19 | [TestMethod]
20 | public void TestInverseCapacity()
21 | {
22 | var f = new InverseBloomFilter(100);
23 |
24 | var capacity = f.Capacity();
25 | Assert.AreEqual(100u, capacity);
26 | }
27 |
28 | /// <summary>
29 | /// Ensures that TestAndAdd behaves correctly.
30 | /// </summary>
31 | [TestMethod]
32 | public void TestInverseTestAndAdd()
33 | {
34 | var f = new InverseBloomFilter(3);
35 |
36 | if (f.TestAndAdd(A_BYTES))
37 | {
38 | Assert.Fail("'a' should not be a member");
39 | }
40 |
41 | if (!f.Test(A_BYTES))
42 | {
43 | Assert.Fail("'a' should be a member");
44 | }
45 |
46 | // 'd' hashes to the same index as 'a'
47 | if (f.TestAndAdd(D_BYTES))
48 | {
49 | Assert.Fail("'d' should not be a member");
50 | }
51 |
52 | // 'a' was swapped out.
53 | if (f.TestAndAdd(A_BYTES))
54 | {
55 | Assert.Fail("'a' should not be a member");
56 | }
57 |
58 | if (!f.Test(A_BYTES))
59 | {
60 | Assert.Fail("'a' should be a member");
61 | }
62 |
63 | // 'b' hashes to another index
64 | if (f.TestAndAdd(B_BYTES))
65 | {
66 | Assert.Fail("'b' should not be a member");
67 | }
68 |
69 | if (!f.Test(B_BYTES))
70 | {
71 | Assert.Fail("'b' should be a member");
72 | }
73 |
74 | // 'a' should still be a member.
75 | if (!f.Test(A_BYTES))
76 | {
77 | Assert.Fail("'a' should be a member");
78 | }
79 |
80 | if (f.Test(C_BYTES))
81 | {
82 | Assert.Fail("'c' should not be a member");
83 | }
84 |
85 | var addedC = f.Add(C_BYTES);
86 | Assert.AreSame(f, addedC, "Returned InverseBloomFilter should be the same instance");
87 |
88 | if (!f.Test(C_BYTES))
89 | {
90 | Assert.Fail("'c' should be a member");
91 | }
92 | }
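
        // The behavior above is the "inverse" property: colliding items evict one
        // another ('d' displaced 'a'), so the filter can report false negatives
        // but never false positives -- Test returns true only while the queried
        // item itself occupies its slot.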
93 | }
94 |
95 | [TestClass]
96 | public class BenchmarkInverseBloomFilter
97 | {
98 | private InverseBloomFilter f;
99 | private int n;
100 | private byte[][] data;
101 |
102 | [TestInitialize()]
103 | public void TestInitialize()
104 | {
105 | n = 100000;
106 | f = new InverseBloomFilter((uint)n);
107 | data = new byte[n][];
108 | for (int i = 0; i < n; i++)
109 | {
110 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
111 | }
112 | }
113 |
114 | [TestCleanup()]
115 | public void TestCleanup()
116 | {
117 | f = null;
118 | n = 0;
119 | data = null;
120 | }
121 |
122 | [TestMethod]
123 | public void BenchmarkInverseAdd()
124 | {
125 | for (int i = 0; i < n; i++)
126 | {
127 | f.Add(data[i]);
128 | }
129 | }
130 |
131 | [TestMethod]
132 | public void BenchmarkInverseTest()
133 | {
134 | for (int i = 0; i < n; i++)
135 | {
136 | f.Test(data[i]);
137 | }
138 | }
139 |
140 | [TestMethod]
141 | public void BenchmarkInverseTestAndAdd()
142 | {
143 | for (int i = 0; i < n; i++)
144 | {
145 | f.TestAndAdd(data[i]);
146 | }
147 | }
148 | }
149 | }
150 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestMinHash.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using Microsoft.VisualStudio.TestTools.UnitTesting;
3 | using ProbabilisticDataStructures;
4 |
5 |
6 | namespace TestProbabilisticDataStructures
7 | {
8 | [TestClass]
9 | public class TestMinHash
10 | {
11 | /// <summary>
12 | /// Ensures that MinHash returns the correct similarity ratio.
13 | /// </summary>
14 | [TestMethod]
15 | public void TestMinHashSimilarity()
16 | {
17 | var bag = new List<string>{
18 | "bob",
19 | "alice",
20 | "frank",
21 | "tyler",
22 | "sara"
23 | };
24 |
25 | var simRatio = MinHash.Similarity(bag.ToArray(), bag.ToArray());
26 | Assert.AreEqual(1.0, simRatio);
27 |
28 | var dict = Words.Dictionary(1000);
29 | var bag2 = new List<string>();
30 | for (int i = 0; i < 1000; i++)
31 | {
32 | bag2.Add(i.ToString());
33 | }
34 |
35 | simRatio = MinHash.Similarity(dict, bag2.ToArray());
36 | Assert.AreEqual(0.0, simRatio);
37 |
38 | var bag3 = Words.Dictionary(500);
39 | simRatio = MinHash.Similarity(dict, bag3);
40 | if (simRatio > 0.7 || simRatio < 0.5)
41 | {
42 | Assert.Fail(string.Format("Expected between 0.5 and 0.7, got {0}", simRatio));
43 | }
44 | }
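
        // Similarity approximates the Jaccard index |A intersect B| / |A union B|:
        // identical bags give 1.0 and disjoint bags 0.0. In the last case the
        // 500-word dictionary is a subset of the 1000-word one, so the true index
        // is 500/1000 = 0.5; the 0.5..0.7 window allows for the estimator's noise.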
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestPartitionedBloomFilter.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using ProbabilisticDataStructures;
3 | using System.Text;
4 |
5 | namespace TestProbabilisticDataStructures
6 | {
7 | [TestClass]
8 | public class TestPartitionedBloomFilter
9 | {
10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a");
11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b");
12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c");
13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x");
14 |
15 | /// <summary>
16 | /// Ensures that Capacity() returns the number of bits, m, in the Bloom filter.
17 | /// </summary>
18 | [TestMethod]
19 | public void TestPartitionedCapacity()
20 | {
21 | var f = new PartitionedBloomFilter(100, 0.1);
22 | var capacity = f.Capacity();
23 |
24 | Assert.AreEqual(480u, capacity);
25 | }
26 |
27 | /// <summary>
28 | /// Ensures that K() returns the number of hash functions in the Bloom Filter.
29 | /// </summary>
30 | [TestMethod]
31 | public void TestPartitionedK()
32 | {
33 | var f = new PartitionedBloomFilter(100, 0.1);
34 | var k = f.K();
35 |
36 | Assert.AreEqual(4u, k);
37 | }
38 |
39 | /// <summary>
40 | /// Ensures that Count returns the number of items added to the filter.
41 | /// </summary>
42 | [TestMethod]
43 | public void TestPartitionedCount()
44 | {
45 | var f = new PartitionedBloomFilter(100, 0.1);
46 | for (uint i = 0; i < 10; i++)
47 | {
48 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
49 | }
50 |
51 | var count = f.Count();
52 | Assert.AreEqual(10u, count);
53 | }
54 |
55 | /// <summary>
56 | /// Ensures that EstimatedFillRatio returns the correct approximation.
57 | /// </summary>
58 | [TestMethod]
59 | public void TestPartitionedEstimatedFillRatio()
60 | {
61 | var f = new PartitionedBloomFilter(100, 0.5);
62 | for (uint i = 0; i < 100; i++)
63 | {
64 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
65 | }
66 |
67 | var ratio = f.EstimatedFillRatio();
68 | if (ratio > 0.5)
69 | {
70 | Assert.Fail("Expected less than or equal to 0.5, got {0}", ratio);
71 | }
72 | }
73 |
74 | /// <summary>
75 | /// Ensures that FillRatio returns the ratio of set bits.
76 | /// </summary>
77 | [TestMethod]
78 | public void TestPartitionedFillRatio()
79 | {
80 | var f = new PartitionedBloomFilter(100, 0.1);
81 | f.Add(A_BYTES);
82 | f.Add(B_BYTES);
83 | f.Add(C_BYTES);
84 | f.Add(X_BYTES);
85 |
86 | var ratio = f.FillRatio();
87 | Assert.AreEqual(0.03125, ratio);
88 | }
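
        // 0.03125 = 15/480: the four adds set at most k * 4 = 16 of the filter's
        // 480 bits, and with the default hash two of those positions coincide,
        // leaving 15 distinct set bits. (The 480 comes from OptimalM(100, 0.1),
        // as the capacity test above shows.)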
89 |
90 | /// <summary>
91 | /// Ensures that Test, Add, and TestAndAdd behave correctly.
92 | /// </summary>
93 | [TestMethod]
94 | public void TestPartitionedBloomTestAndAdd()
95 | {
96 | var f = new PartitionedBloomFilter(100, 0.01);
97 |
98 | // 'a' is not in the filter.
99 | if (f.Test(A_BYTES))
100 | {
101 | Assert.Fail("'a' should not be a member");
102 | }
103 |
104 | var addedF = f.Add(A_BYTES);
105 | Assert.AreSame(f, addedF, "Returned PartitionedBloomFilter should be the same instance");
106 |
107 | // 'a' is now in the filter.
108 | if (!f.Test(A_BYTES))
109 | {
110 | Assert.Fail("'a' should be a member");
111 | }
112 |
113 | // 'a' is still in the filter.
114 | if (!f.TestAndAdd(A_BYTES))
115 | {
116 | Assert.Fail("'a' should be a member");
117 | }
118 |
119 | // 'b' is not in the filter.
120 | if (f.TestAndAdd(B_BYTES))
121 | {
122 | Assert.Fail("'b' should not be a member");
123 | }
124 |
125 | // 'a' is still in the filter.
126 | if (!f.Test(A_BYTES))
127 | {
128 | Assert.Fail("'a' should be a member");
129 | }
130 |
131 | // 'b' is now in the filter.
132 | if (!f.Test(B_BYTES))
133 | {
134 | Assert.Fail("'b' should be a member");
135 | }
136 |
137 | // 'c' is not in the filter.
138 | if (f.Test(C_BYTES))
139 | {
140 | Assert.Fail("'c' should not be a member");
141 | }
142 |
143 | for (int i = 0; i < 1000000; i++)
144 | {
145 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString()));
146 | }
147 |
148 | // 'x' should be a false positive: after a million adds to a filter sized for 100 items, the filter is saturated.
149 | if (!f.Test(X_BYTES))
150 | {
151 | Assert.Fail("'x' should be a member");
152 | }
153 | }
154 |
155 | /// <summary>
156 | /// Ensures that Reset sets every bit to zero.
157 | /// </summary>
158 | [TestMethod]
159 | public void TestPartitionedBloomReset()
160 | {
161 | var f = new PartitionedBloomFilter(100, 0.1);
162 | for (int i = 0; i < 1000; i++)
163 | {
164 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
165 | }
166 |
167 | var resetF = f.Reset();
168 | Assert.AreSame(f, resetF, "Returned PartitionedBloomFilter should be the same instance");
169 |
170 | foreach (var partition in f.Partitions)
171 | {
172 | for (uint i = 0; i < partition.count; i++)
173 | {
174 | if (partition.Get(i) != 0)
175 | {
176 | Assert.Fail("Expected all bits to be unset");
177 | }
178 | }
179 | }
180 | }
181 |
182 | [TestMethod]
183 | public void BenchmarkPartitionedBloomAdd()
184 | {
185 | var n = 100000;
186 | var f = new PartitionedBloomFilter(100000, 0.1);
187 | var data = new byte[n][];
188 | for (int i = 0; i < n; i++)
189 | {
190 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
191 | }
192 |
193 | for (int i = 0; i < n; i++)
194 | {
195 | f.Add(data[i]);
196 | }
197 | }
198 |
199 | [TestMethod]
200 | public void BenchmarkPartitionedBloomTest()
201 | {
202 | var n = 100000;
203 | var f = new PartitionedBloomFilter(100000, 0.1);
204 | var data = new byte[n][];
205 | for (int i = 0; i < n; i++)
206 | {
207 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
208 | }
209 |
210 | for (int i = 0; i < n; i++)
211 | {
212 | f.Test(data[i]);
213 | }
214 | }
215 |
216 | [TestMethod]
217 | public void BenchmarkPartitionedBloomTestAndAdd()
218 | {
219 | var n = 100000;
220 | var f = new PartitionedBloomFilter(100000, 0.1);
221 | var data = new byte[n][];
222 | for (int i = 0; i < n; i++)
223 | {
224 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
225 | }
226 |
227 | for (int i = 0; i < n; i++)
228 | {
229 | f.TestAndAdd(data[i]);
230 | }
231 | }
232 | }
233 | }
234 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestProbabilisticDataStructures.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using ProbabilisticDataStructures;
3 | using System.Security.Cryptography;
4 |
5 | namespace TestProbabilisticDataStructures
6 | {
7 | [TestClass]
8 | public class TestProbabilisticDataStructures
9 | {
10 | /// <summary>
11 | /// Ensures that correct math is performed for OptimalM().
12 | /// </summary>
13 | [TestMethod]
14 | public void TestOptimalM()
15 | {
16 | var optimalM = OptimalM(100, 0.01);
17 | Assert.AreEqual(959u, optimalM);
18 |
19 | optimalM = OptimalM(100, 0.5);
20 | Assert.AreEqual(145u, optimalM);
21 | }
22 |
23 | /// <summary>
24 | /// Ensures that correct math is performed for OptimalM64().
25 | /// </summary>
26 | [TestMethod]
27 | public void TestOptimalM64()
28 | {
29 | var optimalM = OptimalM64(100, 0.01);
30 | Assert.AreEqual(959ul, optimalM);
31 |
32 | optimalM = OptimalM64(100, 0.5);
33 | Assert.AreEqual(145ul, optimalM);
34 |
35 | optimalM = OptimalM64(8589934592ul, 0.0001);
36 | Assert.AreEqual(164670049045ul, optimalM);
37 | }
38 |
39 | /// <summary>
40 | /// Ensures that correct math is performed for OptimalK().
41 | /// </summary>
42 | [TestMethod]
43 | public void TestOptimalK()
44 | {
45 | var optimalK = OptimalK(0.01);
46 | Assert.AreEqual(7u, optimalK);
47 |
48 | optimalK = OptimalK(0.0001);
49 | Assert.AreEqual(14u, optimalK);
50 | }
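
        // The expected values are the textbook Bloom filter optima:
        //   m = ceil(n * ln(p) / ln(1 / 2^ln(2)))   bits
        //   k = ceil(log2(1 / p))                   hash functions
        // e.g. OptimalM(100, 0.01) = ceil(100 * 4.605 / 0.4805) = ceil(958.5) = 959
        // and OptimalK(0.01) = ceil(6.64) = 7, matching the assertions above.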
51 |
52 | /// <summary>
53 | /// Ensures that HashKernel() returns the same upper and lower base
54 | /// as https://github.com/tylertreat/BoomFilters does when using the
55 | /// FNV1 hash.
56 | /// </summary>
57 | [TestMethod]
58 | public void TestHashKernelFNV1()
59 | {
60 | // FNV1 hash bytes for new byte[] { 0, 1, 2, 3 }
61 | var hashBytes =
62 | new byte[]
63 | {
64 | 0x15,
65 | 0x54,
66 | 0xe0,
67 | 0x98,
68 | 0x7f,
69 | 0x32,
70 | 0x75,
71 | 0x44
72 | };
73 | var hashKernel = ProbabilisticDataStructures
74 | .Utils.HashKernelFromHashBytes(hashBytes);
75 | // Compare against upper and lower base values gotten by
76 | // calling the HashKernel function from
77 | // https://github.com/tylertreat/BoomFilters using that library's
78 | // default FNV1 hash algorithm.
79 | Assert.AreEqual(2564838421u, hashKernel.LowerBaseHash);
80 | Assert.AreEqual(1148531327u, hashKernel.UpperBaseHash);
81 | }
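
        // The kernel is simply the first eight digest bytes read as two
        // little-endian 32-bit words; the filters combine them as
        // lowerBase + i * upperBase to derive k indexes (Kirsch-Mitzenmacher
        // double hashing -- see the filter implementations for the exact use).
        // A minimal sketch tying the kernel to HashBytesToUInt32, which is
        // tested directly further down:
        [TestMethod]
        public void TestHashKernelIsLittleEndianSplitSketch()
        {
            var hashBytes = new byte[] { 0x15, 0x54, 0xe0, 0x98, 0x7f, 0x32, 0x75, 0x44 };
            var kernel = ProbabilisticDataStructures
                .Utils.HashKernelFromHashBytes(hashBytes);
            Assert.AreEqual(Utils.HashBytesToUInt32(hashBytes, 0), kernel.LowerBaseHash);
            Assert.AreEqual(Utils.HashBytesToUInt32(hashBytes, 4), kernel.UpperBaseHash);
        }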
82 |
83 | /// <summary>
84 | /// Ensures that HashKernel() returns the proper upper and lower base when using
85 | /// MD5.
86 | /// </summary>
87 | [TestMethod]
88 | public void TestHashKernelMD5()
89 | {
90 | var data = new byte[] { 0, 1, 2, 3 };
91 | var hashAlgorithm = HashAlgorithm.Create("MD5");
92 | var hashKernel = ProbabilisticDataStructures
93 | .Utils.HashKernel(data, hashAlgorithm);
94 |
95 | Assert.AreEqual(4254774583u, hashKernel.LowerBaseHash);
96 | Assert.AreEqual(4179961689u, hashKernel.UpperBaseHash);
97 | }
98 |
99 | /// <summary>
100 | /// Ensures that HashKernel() returns the proper upper and lower base when using
101 | /// SHA256.
102 | /// </summary>
103 | [TestMethod]
104 | public void TestHashKernelSHA256()
105 | {
106 | var data = new byte[] { 0, 1, 2, 3 };
107 | var hashAlgorithm = HashAlgorithm.Create("SHA256");
108 | var hashKernel = ProbabilisticDataStructures
109 | .Utils.HashKernel(data, hashAlgorithm);
110 |
111 | Assert.AreEqual(3252571653u, hashKernel.LowerBaseHash);
112 | Assert.AreEqual(1646207440u, hashKernel.UpperBaseHash);
113 | }
114 |
115 | /// <summary>
116 | /// Ensures that HashKernel128() returns the proper upper and lower base when
117 | /// using MD5.
118 | /// </summary>
119 | [TestMethod]
120 | public void TestHashKernel128MD5()
121 | {
122 | var data = new byte[] { 0, 1, 2, 3 };
123 | var hashAlgorithm = HashAlgorithm.Create("MD5");
124 | var hashKernel = ProbabilisticDataStructures
125 | .Utils.HashKernel128(data, hashAlgorithm);
126 |
127 | Assert.AreEqual(17952798757042697527ul, hashKernel.LowerBaseHash);
128 | Assert.AreEqual(7516929291713011248ul, hashKernel.UpperBaseHash);
129 | }
130 |
131 | /// <summary>
132 | /// Ensures that HashKernel128() returns the proper upper and lower base when
133 | /// using SHA256.
134 | /// </summary>
135 | [TestMethod]
136 | public void TestHashKernel128SHA256()
137 | {
138 | var data = new byte[] { 0, 1, 2, 3 };
139 | var hashAlgorithm = HashAlgorithm.Create("SHA256");
140 | var hashKernel = ProbabilisticDataStructures
141 | .Utils.HashKernel128(data, hashAlgorithm);
142 |
143 | Assert.AreEqual(7070407120484453893ul, hashKernel.LowerBaseHash);
144 | Assert.AreEqual(4682007113097866575ul, hashKernel.UpperBaseHash);
145 | }
146 |
147 | /// <summary>
148 | /// Helper method to get OptimalM().
149 | /// </summary>
150 | /// <param name="n"></param>
151 | /// <param name="fpRate"></param>
152 | /// <returns></returns>
153 | private uint OptimalM(uint n, double fpRate)
154 | {
155 | return ProbabilisticDataStructures
156 | .Utils.OptimalM(n, fpRate);
157 | }
158 |
159 | /// <summary>
160 | /// Helper method to get OptimalM64().
161 | /// </summary>
162 | /// <param name="n"></param>
163 | /// <param name="fpRate"></param>
164 | /// <returns></returns>
165 | private ulong OptimalM64(ulong n, double fpRate)
166 | {
167 | return ProbabilisticDataStructures
168 | .Utils.OptimalM64(n, fpRate);
169 | }
170 |
171 | /// <summary>
172 | /// Helper method to get OptimalK().
173 | /// </summary>
174 | /// <param name="fpRate"></param>
175 | /// <returns></returns>
176 | private uint OptimalK(double fpRate)
177 | {
178 | return ProbabilisticDataStructures
179 | .Utils.OptimalK(fpRate);
180 | }
181 |
182 | [TestMethod]
183 | public void TestHashBytesToUInt32()
184 | {
185 | var hashBytes =
186 | new byte[]
187 | {
188 | 0x40,
189 | 0x51,
190 | 0x62,
191 | 0x73,
192 | 0x84,
193 | 0x95,
194 | 0xa6,
195 | 0xb7,
196 | 0xc8,
197 | 0xd9,
198 | 0xea,
199 | 0xfb
200 | };
201 | Assert.AreEqual(0x73625140u, Utils.HashBytesToUInt32(hashBytes, 0));
202 | Assert.AreEqual(0xb7a69584u, Utils.HashBytesToUInt32(hashBytes, 4));
203 | Assert.AreEqual(0xfbead9c8u, Utils.HashBytesToUInt32(hashBytes, 8));
204 | }
205 |
206 | [TestMethod]
207 | public void TestHashBytesToUInt64()
208 | {
209 | var hashBytes =
210 | new byte[]
211 | {
212 | 0x40,
213 | 0x51,
214 | 0x62,
215 | 0x73,
216 | 0x84,
217 | 0x95,
218 | 0xa6,
219 | 0xb7,
220 | 0xc8,
221 | 0xd9,
222 | 0xea,
223 | 0xfb
224 | };
225 | Assert.AreEqual(0xb7a6958473625140ul, Utils.HashBytesToUInt64(hashBytes, 0));
226 | Assert.AreEqual(0xfbead9c8b7a69584ul, Utils.HashBytesToUInt64(hashBytes, 4));
227 | }
228 |
229 | [TestMethod]
230 | public void TestComputeHashAsStringMD5()
231 | {
232 | var data = new byte[] { 0, 1, 2, 3 };
233 | var hashingAlgorithm = HashAlgorithm.Create("MD5");
234 | var hashString = Utils.ComputeHashAsString(data, hashingAlgorithm);
235 | Assert.AreEqual("37B59AFD592725F9305E484A5D7F5168", hashString);
236 | }
237 |
238 | [TestMethod]
239 | public void TestComputeHashAsStringSHA256()
240 | {
241 | var data = new byte[] { 0, 1, 2, 3 };
242 | var hashingAlgorithm = HashAlgorithm.Create("SHA256");
243 | var hashString = Utils.ComputeHashAsString(data, hashingAlgorithm);
244 | Assert.AreEqual("054EDEC1D0211F624FED0CBCA9D4F9400B0E491C43742AF2C5B0ABEBF0C990D8", hashString);
245 | }
246 | }
247 | }
248 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestProbabilisticDataStructures.csproj:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup>
4 |     <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
5 |     <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
6 |     <ProjectGuid>{8212EFDE-5134-4914-96D3-C550FD9432F1}</ProjectGuid>
7 |     <OutputType>Library</OutputType>
8 |     <AppDesignerFolder>Properties</AppDesignerFolder>
9 |     <RootNamespace>TestProbabilisticDataStructures</RootNamespace>
10 |     <AssemblyName>TestProbabilisticDataStructures</AssemblyName>
11 |     <TargetFrameworkVersion>v4.7</TargetFrameworkVersion>
12 |     <FileAlignment>512</FileAlignment>
13 |     <ProjectTypeGuids>{3AC096D0-A1C2-E12C-1390-A8335801FDAB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
14 |     <VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">10.0</VisualStudioVersion>
15 |     <VSToolsPath Condition="'$(VSToolsPath)' == ''">$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)</VSToolsPath>
16 |     <ReferencePath>$(ProgramFiles)\Common Files\microsoft shared\VSTT\$(VisualStudioVersion)\UITestExtensionPackages</ReferencePath>
17 |     <IsCodedUITest>False</IsCodedUITest>
18 |     <TestProjectType>UnitTest</TestProjectType>
19 |   </PropertyGroup>
20 |   <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
21 |     <DebugSymbols>true</DebugSymbols>
22 |     <DebugType>full</DebugType>
23 |     <Optimize>false</Optimize>
24 |     <OutputPath>bin\Debug\</OutputPath>
25 |     <DefineConstants>DEBUG;TRACE</DefineConstants>
26 |     <ErrorReport>prompt</ErrorReport>
27 |     <WarningLevel>4</WarningLevel>
28 |   </PropertyGroup>
29 |   <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
30 |     <DebugType>pdbonly</DebugType>
31 |     <Optimize>true</Optimize>
32 |     <OutputPath>bin\Release\</OutputPath>
33 |     <DefineConstants>TRACE</DefineConstants>
34 |     <ErrorReport>prompt</ErrorReport>
35 |     <WarningLevel>4</WarningLevel>
36 |   </PropertyGroup>
37 |   <ItemGroup>
38 |     <Reference Include="System" />
39 |     <Reference Include="Microsoft.VisualStudio.QualityTools.UnitTestFramework" />
40 |   </ItemGroup>
41 |   <ItemGroup>
42 |     <Compile Include="Properties\AssemblyInfo.cs" />
43 |     <Compile Include="TestBloomFilter.cs" />
44 |     <Compile Include="TestBloomFilter64.cs" />
45 |     <Compile Include="TestBuckets.cs" />
46 |     <Compile Include="TestBuckets64.cs" />
47 |     <Compile Include="TestCountMinSketch.cs" />
48 |     <Compile Include="TestCountingBloomFilter.cs" />
49 |     <Compile Include="TestCuckooBloomFilter.cs" />
50 |     <Compile Include="TestDeletableBloomFilter.cs" />
51 |     <Compile Include="TestHyperLogLog.cs" />
52 |     <Compile Include="TestInverseBloomFilter.cs" />
53 |     <Compile Include="TestMinHash.cs" />
54 |     <Compile Include="TestPartitionedBloomFilter.cs" />
55 |     <Compile Include="TestProbabilisticDataStructures.cs" />
56 |     <Compile Include="TestScalableBloomFilter.cs" />
57 |     <Compile Include="TestStableBloomFilter.cs" />
58 |     <Compile Include="TestTopK.cs" />
59 |     <Compile Include="Words.cs" />
60 |   </ItemGroup>
61 |   <ItemGroup>
62 |     <ProjectReference Include="..\ProbabilisticDataStructures\ProbabilisticDataStructures.csproj">
63 |       <Project>{bf43f4a8-a892-413c-8e11-9a53d2249bf4}</Project>
64 |       <Name>ProbabilisticDataStructures</Name>
65 |     </ProjectReference>
66 |   </ItemGroup>
67 |   <Import Project="$(VSToolsPath)\TeamTest\Microsoft.TestTools.targets" Condition="Exists('$(VSToolsPath)\TeamTest\Microsoft.TestTools.targets')" />
68 |   <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
69 | </Project>
70 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestScalableBloomFilter.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using ProbabilisticDataStructures;
3 | using System.Text;
4 |
5 | namespace TestProbabilisticDataStructures
6 | {
7 | [TestClass]
8 | public class TestScalableBloomFilter
9 | {
10 | private static byte[] A_BYTES = Encoding.ASCII.GetBytes("a");
11 | private static byte[] B_BYTES = Encoding.ASCII.GetBytes("b");
12 | private static byte[] C_BYTES = Encoding.ASCII.GetBytes("c");
13 | private static byte[] X_BYTES = Encoding.ASCII.GetBytes("x");
14 |
15 | [TestMethod]
16 | public void TestNewDefaultScalableBloomFilter()
17 | {
18 | var f = ScalableBloomFilter.NewDefaultScalableBloomFilter(0.1);
19 |
20 | Assert.AreEqual(0.1, f.FP);
21 | Assert.AreEqual(10000u, f.Hint);
22 | Assert.AreEqual(0.8, f.R);
23 | }
24 |
25 | [TestMethod]
26 | public void TestScalableBloomCapacity()
27 | {
28 | var f = new ScalableBloomFilter(1, 0.1, 1);
29 | f.AddFilter();
30 | f.AddFilter();
31 |
32 | var capacity = f.Capacity();
33 | Assert.AreEqual(15u, capacity);
34 | }
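
        // Why 15: the initial filter is sized at OptimalM(1, 0.1) = 5 bits, and
        // the two AddFilter() calls append two more of the same size, so the
        // summed capacity is 3 * 5 = 15. (This assumes appended filters match the
        // first, which holds here because the tightening ratio r is 1.)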
35 |
36 | // Ensures that K returns the number of hash functions used in each Bloom filter.
37 | [TestMethod]
38 | public void TestScalableBloomK()
39 | {
40 | var f = new ScalableBloomFilter(10, 0.1, 0.8);
41 |
42 | var k = f.K();
43 | Assert.AreEqual(4u, k);
44 | }
45 |
46 | /// <summary>
47 | /// Ensures that FillRatio returns the average fill ratio of the contained
48 | /// filters.
49 | /// </summary>
50 | [TestMethod]
51 | public void TestScalableFillRatio()
52 | {
53 | var f = new ScalableBloomFilter(100, 0.1, 0.8);
54 | f.SetHash(ProbabilisticDataStructures.Defaults.GetDefaultHashAlgorithm());
55 | for (int i = 0; i < 200; i++)
56 | {
57 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
58 | }
59 |
60 | var fillRatio = f.FillRatio();
61 | if (fillRatio > 0.5)
62 | {
63 | Assert.Fail(string.Format("Expected less than or equal to 0.5, got {0}", fillRatio));
64 | }
65 | }
66 |
67 | /// <summary>
68 | /// Ensures that Test, Add, and TestAndAdd behave correctly.
69 | /// </summary>
70 | [TestMethod]
71 | public void TestScalableBloomTestAndAdd()
72 | {
73 | var f = new ScalableBloomFilter(1000, 0.01, 0.8);
74 |
75 | // 'a' is not in the filter.
76 | if (f.Test(A_BYTES))
77 | {
78 | Assert.Fail("'a' should not be a member");
79 | }
80 |
81 | var addedF = f.Add(A_BYTES);
82 | Assert.AreSame(f, addedF, "Returned ScalableBloomFilter should be the same instance");
83 |
84 | // 'a' is now in the filter.
85 | if (!f.Test(A_BYTES))
86 | {
87 | Assert.Fail("'a' should be a member");
88 | }
89 |
90 | // 'a' is still in the filter.
91 | if (!f.TestAndAdd(A_BYTES))
92 | {
93 | Assert.Fail("'a' should be a member");
94 | }
95 |
96 | // 'b' is not in the filter.
97 | if (f.TestAndAdd(B_BYTES))
98 | {
99 | Assert.Fail("'b' should not be a member");
100 | }
101 |
102 | // 'a' is still in the filter.
103 | if (!f.Test(A_BYTES))
104 | {
105 | Assert.Fail("'a' should be a member");
106 | }
107 |
108 | // 'b' is now in the filter.
109 | if (!f.Test(B_BYTES))
110 | {
111 | Assert.Fail("'b' should be a member");
112 | }
113 |
114 | // 'c' is not in the filter.
115 | if (f.Test(C_BYTES))
116 | {
117 | Assert.Fail("'c' should not be a member");
118 | }
119 |
120 | for (int i = 0; i < 10000; i++)
121 | {
122 | f.TestAndAdd(Encoding.ASCII.GetBytes(i.ToString()));
123 | }
124 |
125 | // 'x' should not be a false positive: the filter scales by adding new filters, keeping the false-positive rate bounded.
126 | if (f.Test(X_BYTES))
127 | {
128 | Assert.Fail("'x' should not be a member");
129 | }
130 | }
131 |
132 | /// <summary>
133 | /// Ensures that Reset sets every bit to zero.
134 | /// </summary>
135 | [TestMethod]
136 | public void TestScalableBloomReset()
137 | {
138 | var f = new ScalableBloomFilter(10, 0.1, 0.8);
139 | for (int i = 0; i < 1000; i++)
140 | {
141 | f.Add(Encoding.ASCII.GetBytes(i.ToString()));
142 | }
143 |
144 | var count = f.Filters.Count;
145 | Assert.IsTrue(count > 1, string.Format("Expected more than 1 filter, got {0}", count));
146 |
147 | var resetF = f.Reset();
148 | Assert.AreSame(f, resetF, "Returned ScalableBloomFilter should be the same instance");
149 |
150 | count = f.Filters.Count;
151 | Assert.IsTrue(count == 1, string.Format("Expected 1 filter, got {0}", count));
152 |
153 | foreach(var partition in f.Filters[0].Partitions)
154 | {
155 | for (uint i = 0; i < partition.count; i++)
156 | {
157 | if (partition.Get(i) != 0)
158 | {
159 | Assert.Fail("Expected all bits to be unset");
160 | }
161 | }
162 | }
163 | }
164 |
165 | [TestMethod]
166 | public void BenchmarkScalableBloomAdd()
167 | {
168 | var n = 100000;
169 | var f = new ScalableBloomFilter(100000, 0.1, 0.8);
170 | var data = new byte[n][];
171 | for (int i = 0; i < n; i++)
172 | {
173 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
174 | }
175 |
176 | for (int i = 0; i < n; i++)
177 | {
178 | f.Add(data[i]);
179 | }
180 | }
181 |
182 | [TestMethod]
183 | public void BenchmarkScalableBloomTest()
184 | {
185 | var n = 100000;
186 | var f = new ScalableBloomFilter(100000, 0.1, 0.8);
187 | var data = new byte[n][];
188 | for (int i = 0; i < n; i++)
189 | {
190 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
191 | }
192 |
193 | for (int i = 0; i < n; i++)
194 | {
195 | f.Test(data[i]);
196 | }
197 | }
198 |
199 | [TestMethod]
200 | public void BenchmarkScalableBloomTestAndAdd()
201 | {
202 | var n = 100000;
203 | var f = new ScalableBloomFilter(100000, 0.1, 0.8);
204 | var data = new byte[n][];
205 | for (int i = 0; i < n; i++)
206 | {
207 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
208 | }
209 |
210 | for (int i = 0; i < n; i++)
211 | {
212 | f.TestAndAdd(data[i]);
213 | }
214 | }
215 | }
216 | }
217 |
--------------------------------------------------------------------------------
/TestProbabilisticDataStructures/TestTopK.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using ProbabilisticDataStructures;
3 | using System.Text;
4 | using System.Linq;
5 |
6 | namespace TestProbabilisticDataStructures
7 | {
8 | [TestClass]
9 | public class TestTopK
10 | {
11 | private static byte[] BOB_BYTES = Encoding.ASCII.GetBytes("bob");
12 | private static byte[] TYLER_BYTES = Encoding.ASCII.GetBytes("tyler");
13 | private static byte[] FRED_BYTES = Encoding.ASCII.GetBytes("fred");
14 | private static byte[] ALICE_BYTES = Encoding.ASCII.GetBytes("alice");
15 | private static byte[] JAMES_BYTES = Encoding.ASCII.GetBytes("james");
16 | private static byte[] SARA_BYTES = Encoding.ASCII.GetBytes("sara");
17 | private static byte[] BILL_BYTES = Encoding.ASCII.GetBytes("bill");
18 |
19 | /// <summary>
20 | /// Ensures that TopK returns the top-k most frequent elements.
21 | /// </summary>
22 | [TestMethod]
23 | public void TestTopk()
24 | {
25 | var topK = new TopK(0.001, 0.99, 5);
26 |
27 | topK.Add(BOB_BYTES).Add(BOB_BYTES).Add(BOB_BYTES);
28 | topK.Add(TYLER_BYTES).Add(TYLER_BYTES).Add(TYLER_BYTES).Add(TYLER_BYTES).Add(TYLER_BYTES);
29 | topK.Add(FRED_BYTES);
30 | topK.Add(ALICE_BYTES).Add(ALICE_BYTES).Add(ALICE_BYTES).Add(ALICE_BYTES);
31 | topK.Add(JAMES_BYTES);
32 | topK.Add(FRED_BYTES);
33 | topK.Add(SARA_BYTES).Add(SARA_BYTES);
34 |
35 | var addedK = topK.Add(BILL_BYTES);
36 | Assert.AreSame(topK, addedK);
37 | // 'bill', the latest add, is expected in the results as well.
38 | var expected = new ProbabilisticDataStructures.Element[]{
39 | new ProbabilisticDataStructures.Element{Data=BILL_BYTES, Freq=1},
40 | new ProbabilisticDataStructures.Element{Data=SARA_BYTES, Freq=2},
41 | new ProbabilisticDataStructures.Element{Data=BOB_BYTES, Freq=3},
42 | new ProbabilisticDataStructures.Element{Data=ALICE_BYTES, Freq=4},
43 | new ProbabilisticDataStructures.Element{Data=TYLER_BYTES, Freq=5},
44 | };
45 |
46 | var actual = topK.Elements();
47 |
48 | Assert.AreEqual(5, actual.Length);
49 |
50 | for (int i = 0; i < actual.Length; i++)
51 | {
52 | var element = actual[i];
53 | Assert.IsTrue(Enumerable.SequenceEqual(element.Data, expected[i].Data));
54 | // The estimated frequency should match as well.
55 | Assert.AreEqual(expected[i].Freq, element.Freq);
56 | }
57 |
58 | var resetK = topK.Reset();
59 | Assert.AreSame(topK, resetK);
60 |
61 | Assert.AreEqual(0, topK.Elements().Length);
62 | Assert.AreEqual(0u, topK.N);
63 | }
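
        // Elements() returns the top-k entries in ascending frequency order. Note
        // that the lowest slot holds 'bill' (frequency 1), the most recent add,
        // rather than 'fred' (frequency 2) -- a quirk of the admission rule in
        // TopK.cs worth knowing when reading the expectations above.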
64 |
65 | [TestMethod]
66 | public void BenchmarkTopKAdd()
67 | {
68 | var n = 100000;
69 | var topK = new TopK(0.001, 0.99, 5);
70 | var data = new byte[n][];
71 | for (int i = 0; i < n; i++)
72 | {
73 | data[i] = Encoding.ASCII.GetBytes(i.ToString());
74 | }
75 |
76 | for (int i = 0; i < n; i++)
77 | {
78 | topK.Add(data[i]);
79 | }
80 | }
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
1 | version: 1.0.{build}
2 | configuration: Release
3 | test: on
4 | skip_tags: true
5 | pull_requests:
6 | do_not_increment_build_number: true
7 | build:
8 | verbosity: minimal
9 | assembly_info:
10 | patch: true
11 | file: '**\AssemblyInfo.*'
12 | assembly_version: '{version}'
13 | assembly_file_version: '{version}'
14 | assembly_informational_version: '{version}'
15 | artifacts:
16 | - path: ProbabilisticDataStructures\bin\Release
17 | name: ProbabilisticDataStructures-v$(appveyor_build_version)
18 |
--------------------------------------------------------------------------------