├── .document ├── .gitignore ├── .rspec ├── Gemfile ├── Gemfile.lock ├── HISTORY.md ├── LICENSE.txt ├── README.md ├── Rakefile ├── VERSION ├── hyperloglog-redis.gemspec ├── lib ├── algorithm.rb ├── counter.rb ├── hyperloglog-redis.rb └── time_series_counter.rb └── spec ├── hyper_log_log_spec.rb ├── spec_helper.rb └── time_series_counter_spec.rb /.document: -------------------------------------------------------------------------------- 1 | lib/**/*.rb 2 | bin/* 3 | - 4 | features/**/*.feature 5 | LICENSE.txt 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *~ 3 | *# 4 | .#* 5 | .yardoc/* 6 | .pt 7 | .rvmrc 8 | .redcar 9 | .powrc 10 | .irbrc 11 | .bundle 12 | .pt 13 | docs/* 14 | log/* 15 | pkg/* 16 | .autotest 17 | *.watchr 18 | *.sublime-project 19 | *.sublime-workspace 20 | rspec.failures 21 | 22 | 23 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --color 2 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "http://rubygems.org" 2 | 3 | gem 'murmurhash3', '~> 0.1.3' 4 | gem 'redis', '~> 3.0.1' 5 | 6 | group :development, :test do 7 | gem 'jeweler', '~> 1.8.4' 8 | gem 'rake', '~> 0.9.2.2' 9 | gem 'rspec', '~> 2.11.0' 10 | gem 'timecop', '~> 0.5.3' 11 | end -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: http://rubygems.org/ 3 | specs: 4 | diff-lcs (1.1.3) 5 | git (1.2.5) 6 | jeweler (1.8.4) 7 | bundler (~> 1.0) 8 | git (>= 1.2.5) 9 | rake 10 | rdoc 11 | json (1.7.5) 12 | murmurhash3 (0.1.3) 13 | rake (0.9.2.2) 14 
| rdoc (3.12) 15 | json (~> 1.4) 16 | redis (3.0.1) 17 | rspec (2.11.0) 18 | rspec-core (~> 2.11.0) 19 | rspec-expectations (~> 2.11.0) 20 | rspec-mocks (~> 2.11.0) 21 | rspec-core (2.11.1) 22 | rspec-expectations (2.11.3) 23 | diff-lcs (~> 1.1.3) 24 | rspec-mocks (2.11.2) 25 | timecop (0.5.3) 26 | 27 | PLATFORMS 28 | ruby 29 | 30 | DEPENDENCIES 31 | jeweler (~> 1.8.4) 32 | murmurhash3 (~> 0.1.3) 33 | rake (~> 0.9.2.2) 34 | redis (~> 3.0.1) 35 | rspec (~> 2.11.0) 36 | timecop (~> 0.5.3) 37 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | ## 2.0.0 (11/30/2012) 2 | 3 | * Changed the underlying storage from Redis hashes to bitstrings [simonkro](https://github.com/simonkro) 4 | If you have existing counters stored from version 1.0.0, you can upgrade them with 5 | the following method: 6 | 7 | def upgrade_1_2(counter, redis) 8 | return if redis.type(counter) == "string" 9 | sketch = redis.hgetall(counter) 10 | redis.del(counter) 11 | sketch.each{ |key, value| redis.setrange(counter, key.to_i, value.to_i.chr) } 12 | end 13 | 14 | * Moved main counter implementation from `HyperLogLog` to the class `HyperLogLog::Counter` 15 | 16 | * Added `HyperLogLog::TimeSeriesCounter` a counter type that can estimate cardinalities 17 | for all events from a particular point in the past until the present. 18 | 19 | 20 | ## 1.0.0 (10/26/2012) 21 | 22 | * Changed the underlying storage from Redis sorted sets to Redis hashes. 
This 23 | is a breaking change, if you have existing counters stored from earlier 24 | versions of this library, you can upgrade them with something like the 25 | following method: 26 | 27 | def upgrade(counter, redis) 28 | return if redis.type(counter) == "hash" 29 | values = redis.zrange(counter, 0, -1, {withscores: true}) 30 | redis.del(counter) 31 | values.each { |key, value| redis.hset(counter, key, value.to_i) } 32 | end 33 | 34 | * Added union_store command, which stores the results of a union for querying 35 | or combining with other sets later 36 | 37 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 Art.sy, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | hyperloglog-redis 2 | ================= 3 | 4 | This gem is a pure Ruby implementation of the HyperLogLog algorithm for estimating 5 | cardinalities of sets observed via a stream of events. A [Redis](http://redis.io) 6 | instance is used for storing the counters. A minimal example: 7 | 8 | require 'redis' 9 | require 'hyperloglog-redis' 10 | 11 | counter = HyperLogLog::Counter.new(Redis.new) 12 | ['john', 'paul', 'george', 'ringo', 'john', 'paul'].each do |beatle| 13 | counter.add('beatles', beatle) 14 | end 15 | 16 | puts "There are approximately #{counter.count('beatles')} distinct Beatles" 17 | 18 | Each HyperLogLog counter uses a small, fixed amount of space but can 19 | estimate the cardinality of any set of up to around a billion values with 20 | relative error of 1.04 / Math.sqrt(2 ** b) with high probability, where b is a 21 | parameter passed to the `HyperLogLog::Counter` initializer that defaults to 10. 22 | With b = 10, each counter is represented by a 1 KB string in Redis and we get 23 | an expected relative error of 3%. Contrast this with the amount of space needed 24 | to compute set cardinality exactly, which is over 100 MB for even a bit vector 25 | representing a set with a billion values. 26 | 27 | The basic idea of HyperLogLog (and its predecessors PCSA, LogLog, and others) is 28 | to apply a good hash function to each value observed in the stream and record the longest 29 | run of zeros seen as a prefix of any hashed value. If the hash 30 | function is good, the bits in any hashed value should be close to statistically independent, 31 | so seeing a value that starts with exactly X zeros should happen with probability close to 32 | 2 ** -(X + 1). 
So, if you've seen a run of 5 zeros in one of your hash values, 33 | you're likely to have around 2 ** 6 = 64 values in the underlying set. The actual 34 | implementation and analysis are much more advanced than this, but that's the idea. 35 | 36 | This gem implements a few useful extensions to the basic HyperLogLog algorithm 37 | which allow you to estimate unions and intersections of counters as well as 38 | counts within specific time ranges. These extensions are described in detail below. 39 | 40 | The HyperLogLog algorithm is described and analyzed in the paper 41 | ["HyperLogLog: the analysis of a near-optimal cardinality estimation 42 | algorithm"](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf) 43 | by Flajolet, Fusy, Gandouet, and Meunier. Our implementation closely 44 | follows the program described in Section 4 of that paper. 45 | 46 | Unions and intersections 47 | ======================== 48 | 49 | You can also ask for an estimate of the union from multiple counters: 50 | 51 | ['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wings_member| 52 | counter.add('wings', wings_member) 53 | end 54 | 55 | puts "There are approximately #{counter.union(['beatles', 'wings'])} people who were in the Beatles or Wings" 56 | 57 | The same relative error guarantee above applies to unions: a union of 58 | size N can be estimated to within +/- N * (1.04 / Math.sqrt(2 ** b)) elements, 59 | regardless of how many HyperLogLog counters that union spans. 
You can store 60 | a unioned counter for querying or combining later with `union_store`: 61 | 62 | counter.union_store('all_beatles_and_wings_members', ['beatles', 'wings']) 63 | 64 | puts "There are approximately #{counter.count('all_beatles_and_wings_members')} people who were in the Beatles or Wings" 65 | 66 | Intersections can also be estimated: 67 | 68 | puts "There are approximately #{counter.intersection(['beatles', 'wings'])} people who were in both the Beatles and Wings" 69 | 70 | However, intersections of HyperLogLog counters are calculated indirectly via the 71 | [inclusion/exclusion principle](http://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle) 72 | as a sum of unions and there aren't good theoretical bounds on the error of that sum. In 73 | practice, the estimates that come out of small intersections tend to follow the 74 | same relative error patterns, but beware using this type of estimation on intersections 75 | of large numbers of sets, both because the errors can be much larger than those guaranteed 76 | for unions and the complexity of computing intersections grows exponentially with 77 | the number of sets in the intersection. 78 | 79 | Set cardinality within a time interval 80 | ====================================== 81 | 82 | All examples up until now use `HyperLogLog::Counter`, which stores HyperLogLog 83 | counters as (2 ** b)-byte Redis strings. hyperloglog-redis also contains the counter implementation 84 | `HyperLogLog::TimeSeriesCounter`, which uses a little more space (Redis strings of up to 85 | 4 * (32 - b) * (2 ** b) bytes) but allows you to estimate the cardinality of sets during 86 | certain time windows. 87 | 88 | Using `HyperLogLog::TimeSeriesCounter`, you can get estimates of the number of distinct 89 | elements added to a set in the past X seconds, for any value of X.
A `HyperLogLog::TimeSeriesCounter` 90 | is initialized with the same arguments as a regular `Counter` but implements a 91 | superset of `HyperLogLog::Counter`'s interface. Namely, each of the methods `add`, 92 | `count`, `union`, `intersection`, and `union_store` take an optional final time argument, 93 | either a Ruby `Time` or an integer representing seconds since the epoch. 94 | 95 | When passed a time argument t, `add` registers an addition to the set at time t. When no 96 | time is passed, the current system time is used. The methods `count`, `union`, 97 | `intersection`, and `union_store` all estimate set cardinality for the time interval 98 | consisting of all events that happened after time t when t is passed as a final argument. 99 | 100 | For example, to get the number of distinct user logins within the 101 | past week, we might call: 102 | 103 | one_week = 60 * 60 * 24 * 7 104 | logins_in_past_week = counter.count('user_logins', Time.now - one_week) 105 | 106 | A note about relative errors 107 | ============================ 108 | 109 | With a parameter `b` in the range [4..16], HyperLogLog counters provide a relative 110 | error of 1.04/sqrt(2 ** b) with high probability. When unions, intersections, and 111 | time range queries are involved, it's sometimes not clear what the relative error 112 | is relative to, so here is some clarification: 113 | 114 | * For a union of counters, the relative error applies to the size of the union. Taking 115 | the union of counters is lossless in the sense that you end up with the same counter 116 | you would have arrived at had you observed the union of all of the individual events. 117 | 118 | * For an intersection of counters, there's no good theoretical bound on the relative 119 | error. In practice, and especially for intersections involving a small number of sets, 120 | the relative error you obtain tends to be in relation to the size of the union of the 121 | sets involved. 
For example, if you have two sets, each of cardinality 5000 and observe 122 | both sets through HyperLogLog counters with parameter b=10 (3% relative error), you can 123 | expect the intersection estimate to be within 10000 * 0.03 = 300 of the actual intersection 124 | size. 125 | 126 | * For time queries, the relative error applies to the size of the set within the time 127 | range you've queried. For example, given a set of cardinality 1,000,000 that has had 128 | 100 distinct additions within the last 10 minutes, if you observe such a set with a 129 | HyperLogLog counter with parameter b=10 (3% relative error), you can expect the count 130 | returned from a query about the last 10 minutes to be within 3 of 100. 131 | 132 | Comparison to other approaches 133 | ============================== 134 | 135 | When trying to optimize for space, two well-known alternatives to HyperLogLog exist: 136 | 137 | * Bit vectors: you provide some near-perfect hash function between keys in your domain 138 | and an interval of integers, then represent that interval of integers with bits. 139 | * Bloom filters with counters: use a [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter) 140 | to keep track of items seen; on insert, when the Bloom filter tells you that the item 141 | seen is not in the set, increment the counter. 142 | 143 | Both bit vectors and bloom filters can be augmented to hold timestamps for entries in the 144 | data structures and simulate counters for time-ranges like `HyperLogLog::TimeSeriesCounter`. 145 | 146 | Bit vectors give exact counts, but the space complexity is linear with the size of 147 | the set, and you must either allocate a large bit vector upfront or cope with the complexity 148 | of dynamically resizing your bit vector as the set grows. Providing a manual mapping from 149 | members of your set to an interval of integers is sometimes a non-trivial task. 
Counts, 150 | unions, and intersections are all linear-time operations in the size of the universe of 151 | the set being represented. 152 | 153 | Bloom filters can be much more compact than bit vectors, but the actual count associated 154 | with a Bloom filter is an artifact of the construction of the data structure, so the cost 155 | of estimating a union or intersection is linear in the size of the Bloom filter. Getting 156 | high probability guarantees on the quality of the estimate of Bloom filter counts requires 157 | several "good" hash functions that have some degree of independence from each other; in 158 | practice, coming up with several independent implementations of good hash functions is 159 | difficult. Bloom filters require that all of their space be allocated upfront (re-hashing 160 | isn't possible without replaying all events), so in practice you need some estimate of 161 | how large the counters are going to be before allocating the counter. 162 | 163 | HyperLogLog counters take up less space than either of the above approaches and provide 164 | constant-time implementations (in the size of the sets being represented) of unions, 165 | intersections, and time range queries. A `HyperLogLog::Counter` with parameter b will 166 | be stored in a Redis string of length at most 2 ** b bytes, whereas a `HyperLogLog::TimeSeriesCounter` with parameter 167 | b will be stored in a Redis string of length at most 4 * (32 - b) * (2 ** b) bytes. For counters representing smaller sets, 168 | the size taken up by a `HyperLogLog::TimeSeriesCounter` can be significantly less. Here 169 | are some examples for specific values of b: 170 | 171 | * With b = 7, a `HyperLogLog::Counter` uses at most 128 bytes and a `HyperLogLog::TimeSeriesCounter` uses at most 13 KB while providing a relative error of 9%. 
172 | * With b = 11, a `HyperLogLog::Counter` uses at most 2 KB and a `HyperLogLog::TimeSeriesCounter` uses at most 168 KB while providing a relative error of 2% 173 | * With b = 16, a `HyperLogLog::Counter` uses at most 64 KB and a `HyperLogLog::TimeSeriesCounter` uses at most 4 MB while providing a relative error of less than half a percent. 174 | 175 | Installation 176 | ============ 177 | 178 | gem install hyperloglog-redis 179 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'rubygems' 4 | require 'bundler' 5 | begin 6 | Bundler.setup(:default, :development) 7 | rescue Bundler::BundlerError => e 8 | $stderr.puts e.message 9 | $stderr.puts "Run `bundle install` to install missing gems" 10 | exit e.status_code 11 | end 12 | require 'rake' 13 | 14 | require 'jeweler' 15 | Jeweler::Tasks.new do |gem| 16 | # gem is a Gem::Specification... 
see http://docs.rubygems.org/read/chapter/20 for more options 17 | gem.name = "hyperloglog-redis" 18 | gem.homepage = "http://github.com/aaw/hyperloglog-redis" 19 | gem.license = "MIT" 20 | gem.summary = %Q{An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end} 21 | gem.description = %Q{An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end} 22 | gem.email = "aaron.windsor@gmail.com" 23 | gem.authors = ["Aaron Windsor"] 24 | # dependencies defined in Gemfile 25 | end 26 | Jeweler::RubygemsDotOrgTasks.new 27 | 28 | require 'rspec/core' 29 | require 'rspec/core/rake_task' 30 | RSpec::Core::RakeTask.new(:spec) do |spec| 31 | spec.pattern = FileList['spec/**/*_spec.rb'] 32 | end 33 | 34 | task :default => :spec 35 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 2.0.0 -------------------------------------------------------------------------------- /hyperloglog-redis.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec' 4 | # -*- encoding: utf-8 -*- 5 | 6 | Gem::Specification.new do |s| 7 | s.name = "hyperloglog-redis" 8 | s.version = "2.0.0" 9 | 10 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? 
:required_rubygems_version= 11 | s.authors = ["Aaron Windsor"] 12 | s.date = "2012-11-30" 13 | s.description = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end" 14 | s.email = "aaron.windsor@gmail.com" 15 | s.extra_rdoc_files = [ 16 | "LICENSE.txt", 17 | "README.md" 18 | ] 19 | s.files = [ 20 | ".document", 21 | ".rspec", 22 | "Gemfile", 23 | "Gemfile.lock", 24 | "HISTORY.md", 25 | "LICENSE.txt", 26 | "README.md", 27 | "Rakefile", 28 | "VERSION", 29 | "hyperloglog-redis.gemspec", 30 | "lib/algorithm.rb", 31 | "lib/counter.rb", 32 | "lib/hyperloglog-redis.rb", 33 | "lib/time_series_counter.rb", 34 | "spec/hyper_log_log_spec.rb", 35 | "spec/spec_helper.rb", 36 | "spec/time_series_counter_spec.rb" 37 | ] 38 | s.homepage = "http://github.com/aaw/hyperloglog-redis" 39 | s.licenses = ["MIT"] 40 | s.require_paths = ["lib"] 41 | s.rubygems_version = "1.8.10" 42 | s.summary = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end" 43 | 44 | if s.respond_to? 
:specification_version then 45 | s.specification_version = 3 46 | 47 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then 48 | s.add_runtime_dependency(%q, ["~> 0.1.3"]) 49 | s.add_runtime_dependency(%q, ["~> 3.0.1"]) 50 | s.add_development_dependency(%q, ["~> 1.8.4"]) 51 | s.add_development_dependency(%q, ["~> 0.9.2.2"]) 52 | s.add_development_dependency(%q, ["~> 2.11.0"]) 53 | s.add_development_dependency(%q, ["~> 0.5.3"]) 54 | else 55 | s.add_dependency(%q, ["~> 0.1.3"]) 56 | s.add_dependency(%q, ["~> 3.0.1"]) 57 | s.add_dependency(%q, ["~> 1.8.4"]) 58 | s.add_dependency(%q, ["~> 0.9.2.2"]) 59 | s.add_dependency(%q, ["~> 2.11.0"]) 60 | s.add_dependency(%q, ["~> 0.5.3"]) 61 | end 62 | else 63 | s.add_dependency(%q, ["~> 0.1.3"]) 64 | s.add_dependency(%q, ["~> 3.0.1"]) 65 | s.add_dependency(%q, ["~> 1.8.4"]) 66 | s.add_dependency(%q, ["~> 0.9.2.2"]) 67 | s.add_dependency(%q, ["~> 2.11.0"]) 68 | s.add_dependency(%q, ["~> 0.5.3"]) 69 | end 70 | end 71 | 72 | -------------------------------------------------------------------------------- /lib/algorithm.rb: -------------------------------------------------------------------------------- 1 | require 'redis' 2 | require 'murmurhash3' 3 | 4 | module HyperLogLog 5 | module Algorithm 6 | 7 | def initialize(redis, b=10) 8 | raise "Accuracy not supported. Please choose a value of b between 4 and 16" if b < 4 || b > 16 9 | @redis = redis 10 | @bits_in_hash = 32 - b 11 | @m = (2 ** b).to_i 12 | if @m == 16 13 | @alpha = 0.673 14 | elsif @m == 32 15 | @alpha = 0.697 16 | elsif @m == 64 17 | @alpha = 0.709 18 | else 19 | @alpha = 0.7213/(1 + 1.079/@m) 20 | end 21 | end 22 | 23 | # Estimate the cardinality of the intersection of several sets. We do this by 24 | # using the principle of inclusion and exclusion to represent the size of the 25 | # intersection as the alternating sum of an exponential number of 26 | # cardinalities of unions of smaller sets. 
27 | def intersection(counter_names, time=0) 28 | icount = (1..counter_names.length).map do |k| 29 | counter_names.combination(k).map do |group| 30 | ((k % 2 == 0) ? -1 : 1) * union_helper(group, time) 31 | end.inject(0, :+) 32 | end.inject(0, :+) 33 | [icount, 0].max 34 | end 35 | 36 | private 37 | 38 | def hash_info(value) 39 | hash = MurmurHash3::V32.murmur3_32_str_hash(value) 40 | [hash, hash % @m, rho(hash / @m)] 41 | end 42 | 43 | def union_helper(counter_names, time=0) 44 | all_estimates = raw_union(counter_names, time).select{ |i| i > 0 } 45 | estimate_sum = all_estimates.reduce(0.0){ |a, score| a + 2.0 ** -score } 46 | estimate = @alpha * @m * @m / (estimate_sum + @m - all_estimates.length) 47 | if estimate <= 2.5 * @m 48 | if all_estimates.length == @m 49 | estimate.round 50 | else # Correction for small sets 51 | (@m * Math.log(Float(@m)/(@m - all_estimates.length))).round 52 | end 53 | elsif estimate <= 2 ** 32 / 30.0 54 | estimate.round 55 | else # Correction for large sets 56 | (-2**32 * Math.log(1 - estimate/(2.0**32))).round 57 | end 58 | end 59 | 60 | # rho(i) is the position of the first 1 in the binary representation of i, 61 | # reading from most significant to least significant bits. Some examples: 62 | # rho(1...) = 1, rho(001...) = 3, rho(000...0) = @bits_in_hash + 1 63 | def rho(i) 64 | return @bits_in_hash + 1 if i == 0 65 | @bits_in_hash - Math.log(i, 2).floor 66 | end 67 | 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /lib/counter.rb: -------------------------------------------------------------------------------- 1 | module HyperLogLog 2 | class Counter 3 | include Algorithm 4 | 5 | # This is the implementation of the standard HyperLogLog algorithm, storing 6 | # counts in each byte of a string of length 2 ** b. 
7 | 8 | def add(counter_name, value) 9 | hash, function_name, new_value = hash_info(value) 10 | existing_value = @redis.getrange(counter_name, function_name, function_name).unpack('C').first.to_i 11 | @redis.setrange(counter_name, function_name, new_value.chr) if new_value > existing_value 12 | end 13 | 14 | # Estimate the cardinality of a single set 15 | def count(counter_name) 16 | union_helper([counter_name]) 17 | end 18 | 19 | # Estimate the cardinality of the union of several sets 20 | def union(counter_names) 21 | union_helper(counter_names) 22 | end 23 | 24 | # Store the union of several sets in *destination* so that it can be used as 25 | # a HyperLogLog counter later. 26 | def union_store(destination, counter_names) 27 | @redis.set(destination, raw_union(counter_names).inject('') {|a, e| a << e.chr}) 28 | end 29 | 30 | private 31 | 32 | def raw_union(counter_names, time=nil) 33 | counters = @redis.mget(*counter_names).compact 34 | return [] if counters.none? 35 | return counters.first.each_byte if counters.one? 36 | counters.map{|c| c.unpack("C#{@m}")}.transpose.map {|e| e.compact.max.to_i} 37 | end 38 | 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /lib/hyperloglog-redis.rb: -------------------------------------------------------------------------------- 1 | require "algorithm" 2 | require "counter" 3 | require "time_series_counter" 4 | -------------------------------------------------------------------------------- /lib/time_series_counter.rb: -------------------------------------------------------------------------------- 1 | module HyperLogLog 2 | class TimeSeriesCounter 3 | include Algorithm 4 | 5 | # This is an implementation of HyperLogLog that allows for querying counts 6 | # within time ranges of the form (t, current_time] with second-level 7 | # granularity. 
The standard implementation of HyperLogLog stores the max 8 | # number of leading zeros seen in the image of each of 2 ** b hash 9 | # functions. These counts can naturally be stored in a string of length 10 | # 2 ** b by allocating one byte per leading zero count. 11 | # 12 | # To provide counts within a time range, we alter the standard 13 | # implementation to store a mapping of pairs of the form (hash function, 14 | # leading zero count) -> timestamp, where the mapping (h,z) -> t represents 15 | # the fact that we observed z leading zeros in the image of hash function h 16 | # most recently at time t. This mapping is stored in a string by packing 17 | # 4-byte words (timestamps, represented in seconds since the epoch) into 18 | # a matrix indexed by hash function and zero count stored in row-major 19 | # order. Since the max zero count for a counter with parameter b is (32-b), 20 | # this representation takes up at most 4 * (32-b) * (2 ** b) bytes (and 21 | # usually much less, since we don't allocate space for rows corresponding 22 | # to higher leading zero counts until they're actually observed.) 23 | # 24 | # To convert this representation to a HyperLogLog counter for the time 25 | # range (t, current_time], we simply filter out all timestamps less than t 26 | # in the matrix and then find, for each hash function, the maximum z for 27 | # which that hash function has a non-zero timestamp. 28 | 29 | def add(counter_name, value, time=nil) 30 | hash, function_name, new_value = hash_info(value) 31 | index = 4 * (function_name + (new_value.to_i * @m)) 32 | if time.nil? 33 | @redis.setrange(counter_name, index, [Time.now.to_i].pack('N')) 34 | else 35 | existing_time = @redis.getrange(counter_name, index, index + 3) 36 | existing_val = existing_time.empty? ?
0 : existing_time.unpack('N').first 37 | @redis.setrange(counter_name, index, [time.to_i].pack('N')) if time.to_i > existing_val 38 | end 39 | end 40 | 41 | # Estimate the cardinality of a single set 42 | def count(counter_name, time=0) 43 | union_helper([counter_name], time) 44 | end 45 | 46 | # Estimate the cardinality of the union of several sets 47 | def union(counter_names, time=0) 48 | union_helper(counter_names, time) 49 | end 50 | 51 | # Store the union of several sets in *destination* so that it can be used as 52 | # a HyperLogLog counter later. 53 | def union_store(destination, counter_names, time=0) 54 | raw_counters = @redis.mget(*counter_names).compact.map{ |c| c.unpack('N*').map{ |x| x > time ? x : 0 } } 55 | combined_counters = jagged_transpose(raw_counters).map{ |x| x.max.to_i } 56 | @redis.set(destination, combined_counters.pack('N*')) 57 | end 58 | 59 | private 60 | 61 | def raw_union(counter_names, time=0) 62 | raw_counters = @redis.mget(*counter_names).compact 63 | return [] if raw_counters.none? 64 | hyperloglog_counters = raw_counters.map do |counter| 65 | jagged_transpose(counter.unpack('N*').each_slice(@m).to_a).map{ |x| x.rindex{ |c| c > time } || 0 } 66 | end 67 | return hyperloglog_counters.first if hyperloglog_counters.one? 68 | jagged_transpose(hyperloglog_counters).map{ |x| x.max.to_i } 69 | end 70 | 71 | # Given an array of non-uniform length arrays, right-pad all arrays with 72 | # zeros so they're the same size, then transpose the array. 
This is a 73 | # destructive operation: the zero-padding modifies the array-of-arrays 74 | def jagged_transpose(arrays) 75 | max_length = arrays.map{ |a| a.length }.max 76 | arrays.map{ |a| a.fill(0, a.length, max_length - a.length) }.transpose 77 | end 78 | 79 | end 80 | end 81 | -------------------------------------------------------------------------------- /spec/hyper_log_log_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper') 2 | 3 | describe HyperLogLog do 4 | 5 | [HyperLogLog::Counter, HyperLogLog::TimeSeriesCounter].each do |counter_type| 6 | 7 | it "doesn't change its count when it sees values that it's already seen" do 8 | redis = Redis.new 9 | counter = counter_type.new(redis, 10) 10 | test_set = (1..100).map{ |x| x.to_s } 11 | test_set.each{ |value| counter.add("mycounter", value) } 12 | original_estimate = counter.count("mycounter") 13 | 5.times do 14 | test_set.each do |value| 15 | counter.add("mycounter", value) 16 | counter.count("mycounter").should == original_estimate 17 | end 18 | end 19 | end 20 | 21 | it "can maintain more than one logically distinct counter" do 22 | redis = Redis.new 23 | counter = counter_type.new(redis, 10) 24 | other_estimate = counter.count("counter2") 25 | (1..100).each do |i| 26 | counter.add("counter1", i.to_s) 27 | counter.count("counter2").should == other_estimate 28 | end 29 | other_estimate = counter.count("counter1") 30 | (101..200).each do |i| 31 | counter.add("counter2", i.to_s) 32 | counter.count("counter1").should == other_estimate 33 | end 34 | other_estimate = counter.count("counter2") 35 | (201..300).each do |i| 36 | counter.add("counter1", i.to_s) 37 | counter.count("counter2").should == other_estimate 38 | end 39 | counter.count("counter1").should > 100 40 | counter.count("counter2").should > 50 41 | counter.count("counter1").should > counter.count("counter2") 42 | end 43 | 44 | it "can exactly count 
small sets" do 45 | redis = Redis.new 46 | counter = counter_type.new(redis, 11) 47 | 10.times { |i| counter.add("mycounter", i.to_s) } 48 | counter.count("mycounter").should == 10 49 | end 50 | 51 | it "can exactly count small unions" do 52 | redis = Redis.new 53 | counter = counter_type.new(redis, 11) 54 | (1..8).each { |i| counter.add("mycounter1", i.to_s) } 55 | (5..12).each { |i| counter.add("mycounter2", i.to_s) } 56 | counter.union(["mycounter1", "mycounter2"]).should == 12 57 | end 58 | 59 | it "can exactly count small intersections" do 60 | redis = Redis.new 61 | counter = counter_type.new(redis, 11) 62 | (1..8).each { |i| counter.add("mycounter1", i.to_s) } 63 | (5..12).each { |i| counter.add("mycounter2", i.to_s) } 64 | counter.intersection(["mycounter1", "mycounter2"]).should == 4 65 | end 66 | 67 | it "can store unions for querying later" do 68 | redis = Redis.new 69 | counter = counter_type.new(redis, 11) 70 | (1..10).each { |i| counter.add("mycounter1", i.to_s) } 71 | (5..15).each { |i| counter.add("mycounter2", i.to_s) } 72 | (15..25).each { |i| counter.add("mycounter3", i.to_s) } 73 | (20..50).each { |i| counter.add("mycounter4", i.to_s) } 74 | counter.union_store("aggregate_counter", ["mycounter1", "mycounter2", "mycounter3", "mycounter4"]) 75 | counter.union(["mycounter1", "mycounter2", "mycounter3", "mycounter4"]).should == counter.count("aggregate_counter") 76 | end 77 | 78 | # With parameter b, HyperLogLog should produce estimates that have 79 | # relative error of 1.04 / Math.sqrt(2 ** b). Of course, this analysis 80 | # is based on assumptions that aren't necessarily true in practice and 81 | # the observed relative error will depend on the distribution of data 82 | # we receive as well as the interaction of the murmur hash implementation 83 | # with that data. 
Keeping that in mind, the following spec makes sure 84 | # that in the process of adding 1000 values to a set, HyperLogLog only 85 | # gives bad estimates (more than twice the expected relative error) in 86 | # less than 1% of the cases and never gives very bad estimates (more than 87 | # three times the expected relative error.) 88 | # 89 | # It's fine to fudge these numbers a little if the implementation changes, 90 | # since you can clearly find a different set of values that make this test 91 | # fail even without changing the implementation. But it should serve as a 92 | # good indication that there aren't any logical errors in the HyperLogLog 93 | # implementation, since it exercises all of the cases in HyperLogLog's 94 | # count method except for the correction for very large set sizes. 95 | 96 | it "produces acceptable estimates for counts" do 97 | max_items = 1000 98 | redis = Redis.new 99 | (6..16).each do |b| 100 | counter = counter_type.new(redis, b) 101 | redis.del('mycounter') 102 | bad_estimates = 0 103 | very_bad_estimates = 0 104 | expected_relative_error = 1.04 / Math.sqrt(2 ** b) 105 | max_items.times do |i| 106 | value = Digest::MD5.hexdigest("value#{i}") 107 | counter.add("mycounter", value) 108 | actual = i + 1 109 | approximate = counter.count("mycounter") 110 | relative_error = (actual - approximate).abs / Float(actual) 111 | bad_estimates += 1 if relative_error > expected_relative_error * 2 112 | very_bad_estimates += 1 if relative_error > expected_relative_error * 3 113 | end 114 | bad_estimates.should < max_items / 100.00 115 | very_bad_estimates.should == 0 116 | end 117 | end 118 | 119 | it "produces acceptable estimates for unions with few elements in common" do 120 | b, max_items = 10, 2000 121 | counter = counter_type.new(Redis.new, b) 122 | bad_estimates = 0 123 | very_bad_estimates = 0 124 | expected_relative_error = 1.04 / Math.sqrt(2 ** b) 125 | max_items.times do |i| 126 | value1 = Digest::MD5.hexdigest("value#{i}") 127 | 
counter.add("mycounter1", value1) 128 | value2 = Digest::MD5.hexdigest("value#{i}incounter2") 129 | counter.add("mycounter2", value2) 130 | value3 = Digest::MD5.hexdigest("this is value#{i}") 131 | counter.add("mycounter3", value3) 132 | actual = 3 * (i + 1) 133 | approximate = counter.union(["mycounter1", "mycounter2", "mycounter3"]) 134 | relative_error = (actual - approximate).abs / Float(actual) 135 | bad_estimates += 1 if relative_error > expected_relative_error * 2 136 | very_bad_estimates += 1 if relative_error > expected_relative_error * 3 137 | end 138 | bad_estimates.should < (3 * max_items) / 100.00 139 | very_bad_estimates.should == 0 140 | end 141 | 142 | it "produces acceptable estimates for unions with many elements in common" do 143 | b, max_items, intersection_size = 10, 1000, 2000 144 | counter = counter_type.new(Redis.new, b) 145 | bad_estimates = 0 146 | very_bad_estimates = 0 147 | expected_relative_error = 1.04 / Math.sqrt(2 ** b) 148 | 149 | intersection_size.times do |i| 150 | value = Digest::MD5.hexdigest("test#{i}value") 151 | ['mycounter1', 'mycounter2', 'mycounter3'].each do |counter_name| 152 | counter.add(counter_name, value) 153 | end 154 | end 155 | 156 | max_items.times do |i| 157 | value1 = Digest::MD5.hexdigest("value#{i}") 158 | counter.add("mycounter1", value1) 159 | value2 = Digest::MD5.hexdigest("value#{i}isincounter2") 160 | counter.add("mycounter2", value2) 161 | value3 = Digest::MD5.hexdigest("this is value#{i}") 162 | counter.add("mycounter3", value3) 163 | actual = 3 * (i + 1) + intersection_size 164 | approximate = counter.union(["mycounter1", "mycounter2", "mycounter3"]) 165 | relative_error = (actual - approximate).abs / Float(actual) 166 | bad_estimates += 1 if relative_error > expected_relative_error * 2 167 | very_bad_estimates += 1 if relative_error > expected_relative_error * 3 168 | end 169 | 170 | bad_estimates.should < ((3 * max_items) + intersection_size) / 100.00 171 | very_bad_estimates.should == 0 172 | end 
173 | 174 | # There are no good theoretical guarantees that I know of for arbitrary 175 | # intersection estimation, since it's expessed as the sum of unions of 176 | # HyperLogLog counters, but it tends to work okay in practice, as seen below. 177 | 178 | it "produces decent estimates for intersections" do 179 | b, max_items = 6, 1000 180 | counter = counter_type.new(Redis.new, b) 181 | expected_relative_error = 1.04 / Math.sqrt(2 ** b) 182 | 183 | max_items.times do |i| 184 | value1 = Digest::MD5.hexdigest("first-value#{i}") 185 | value2 = Digest::MD5.hexdigest("second-value#{i}") 186 | value3 = Digest::MD5.hexdigest("third-value#{i}") 187 | value4 = Digest::MD5.hexdigest("fourth-value#{i}") 188 | counter.add("mycounter1", value1) 189 | counter.add("mycounter2", value2) 190 | counter.add("mycounter3", value3) 191 | counter.add("mycounter4", value4) 192 | [value1, value2, value3, value4].each{ |value| counter.add("mycounter5", value) } 193 | end 194 | 195 | small_counters = ['mycounter1', 'mycounter2', 'mycounter3', 'mycounter4'] 196 | 197 | small_counters.each do |counter_name| 198 | intersection_estimate = counter.intersection([counter_name, 'mycounter5']) 199 | intersection_estimate.should > 0 200 | (intersection_estimate - counter.count(counter_name)).abs.should < max_items * expected_relative_error 201 | end 202 | 203 | [2,3].each do |intersection_size| 204 | small_counters.combination(intersection_size).each do |counter_names| 205 | intersection_estimate = counter.intersection(counter_names) 206 | intersection_estimate.should >= 0 207 | intersection_estimate.should < intersection_size * max_items * expected_relative_error 208 | end 209 | end 210 | 211 | 100.times do |i| 212 | value = Digest::MD5.hexdigest("somethingintheintersection#{i}") 213 | small_counters.each { |counter_name| counter.add(counter_name, value) } 214 | end 215 | 216 | [2,3,4].each do |intersection_size| 217 | small_counters.combination(intersection_size).each do |counter_names| 218 | 
intersection_estimate = counter.intersection(counter_names) 219 | intersection_estimate.should >= 0 220 | (intersection_estimate - 100).abs.should < intersection_size * (max_items + 100) * expected_relative_error 221 | end 222 | end 223 | 224 | end 225 | end 226 | end 227 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) 2 | $LOAD_PATH.unshift(File.dirname(__FILE__)) 3 | require 'rspec' 4 | require 'redis' 5 | require 'hyperloglog-redis' 6 | 7 | db_number = ENV['REDIS_TEST_DATABASE'] || '15' 8 | ENV['REDIS_URL'] = "redis://localhost:6379/#{db_number}" 9 | redis = Redis.new 10 | if redis.keys('*').length > 0 11 | puts "Warning! These specs use database #{db_number} on your local redis instance" 12 | puts "running on port 6379. Your database #{db_number} seems to have keys in it." 13 | puts "Please clear them before running the specs or set the environment" 14 | puts "variable REDIS_TEST_DATABASE to use a different database number." 15 | raise SystemExit 16 | end 17 | 18 | # Requires supporting files with custom matchers and macros, etc, 19 | # in ./support/ and its subdirectories. 
20 | Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f} 21 | 22 | RSpec.configure do |config| 23 | config.before(:each) do 24 | Redis.new.flushdb 25 | end 26 | config.after(:each) do 27 | Redis.new.flushdb 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /spec/time_series_counter_spec.rb: -------------------------------------------------------------------------------- 1 | require 'securerandom' 2 | require 'timecop' 3 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper') 4 | 5 | MINUTES=60 6 | HOURS=MINUTES*60 7 | DAYS=HOURS*24 8 | WEEKS=DAYS*7 9 | 10 | describe HyperLogLog::TimeSeriesCounter do 11 | 12 | before(:each) do 13 | @b = 11 14 | @redis = Redis.new 15 | @counter = HyperLogLog::TimeSeriesCounter.new(@redis, @b) 16 | @expected_relative_error = 1.04 / Math.sqrt(2 ** @b) 17 | 18 | def counter_should_equal(counter_val, expected_val, relative_error_base=nil) 19 | (counter_val - expected_val).abs.should <= (relative_error_base || expected_val) * @expected_relative_error 20 | end 21 | end 22 | 23 | it "can estimate cardinalities from any particular point in time until the present" do 24 | Timecop.travel(Time.now - 2 * WEEKS) do 25 | (0..100).each { |i| @counter.add('mycounter', "item#{i}") } 26 | end 27 | Timecop.travel(Time.now - 1 * WEEKS) do 28 | (100..200).each { |i| @counter.add('mycounter', "item#{i}") } 29 | end 30 | Timecop.travel(Time.now - 6 * DAYS) do 31 | (0..100).each { |i| @counter.add('mycounter', "item#{i}") } 32 | end 33 | Timecop.travel(Time.now - 5 * DAYS) do 34 | (100..200).each { |i| @counter.add('mycounter', "item#{i}") } 35 | end 36 | Timecop.travel(Time.now - 4 * DAYS) do 37 | (200..250).each { |i| @counter.add('mycounter', "item#{i}") } 38 | end 39 | 40 | counter_should_equal(@counter.count('mycounter'), 250) 41 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 3 * WEEKS), 250) 42 | counter_should_equal(@counter.count('mycounter', 
Time.now.to_i - 1 * WEEKS - 3 * DAYS), 250) 43 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * WEEKS), 250) 44 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 5 * DAYS - 12 * HOURS), 150, 250) 45 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 4 * DAYS - 12 * HOURS), 50, 250) 46 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 3 * DAYS), 0, 250) 47 | end 48 | 49 | it "can estimate unions from any particular point in time until the present" do 50 | Timecop.travel(Time.now - 2 * WEEKS) do 51 | (0..100).each { |i| @counter.add('mycounter1', "item#{i}") } 52 | end 53 | Timecop.travel(Time.now - 1 * WEEKS) do 54 | (100..200).each { |i| @counter.add('mycounter2', "item#{i}") } 55 | end 56 | Timecop.travel(Time.now - 6 * DAYS) do 57 | (0..100).each { |i| @counter.add('mycounter1', "item#{i}") } 58 | end 59 | Timecop.travel(Time.now - 5 * DAYS) do 60 | (100..200).each { |i| @counter.add('mycounter2', "item#{i}") } 61 | end 62 | Timecop.travel(Time.now - 4 * DAYS) do 63 | (200..250).each { |i| @counter.add('mycounter1', "item#{i}") } 64 | end 65 | 66 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2']), 250) 67 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2'], Time.now.to_i - 3 * WEEKS), 250) 68 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2'], Time.now.to_i - 1 * WEEKS - 3 * DAYS), 250) 69 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2'], Time.now.to_i - 1 * WEEKS), 250) 70 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2'], Time.now.to_i - 5 * DAYS - 12 * HOURS), 150, 250) 71 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2'], Time.now.to_i - 4 * DAYS - 12 * HOURS), 50, 250) 72 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2'], Time.now.to_i - 3 * DAYS), 0, 250) 73 | end 74 | 75 | it "can estimate intersections from any particular point in time until the present" do 76 | 
Timecop.travel(Time.now - 2 * WEEKS) do 77 | (0..100).each { |i| @counter.add('mycounter1', "item#{i}") } 78 | end 79 | Timecop.travel(Time.now - 1 * WEEKS) do 80 | (100..200).each { |i| @counter.add('mycounter2', "item#{i}") } 81 | end 82 | Timecop.travel(Time.now - 6 * DAYS) do 83 | (0..100).each { |i| @counter.add('mycounter2', "item#{i}") } 84 | end 85 | Timecop.travel(Time.now - 5 * DAYS) do 86 | (100..200).each { |i| @counter.add('mycounter1', "item#{i}") } 87 | end 88 | Timecop.travel(Time.now - 4 * DAYS) do 89 | (200..250).each { |i| @counter.add('mycounter1', "item#{i}") } 90 | end 91 | Timecop.travel(Time.now - 3 * DAYS) do 92 | (200..250).each { |i| @counter.add('mycounter2', "item#{i}") } 93 | end 94 | 95 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2']), 250) 96 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 3 * WEEKS), 250) 97 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 1 * WEEKS - 3 * DAYS), 150, 250) 98 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 6 * DAYS - 12 * HOURS), 50, 250) 99 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 5 * DAYS - 12 * HOURS), 50, 250) 100 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 4 * DAYS - 12 * HOURS), 50, 250) 101 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 3 * DAYS - 12 * HOURS), 0, 250) 102 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 2 * DAYS), 0, 250) 103 | end 104 | 105 | it "can use union_store to store snapshots of counters at particular points in time" do 106 | Timecop.travel(Time.now - 2 * WEEKS) do 107 | (0..100).each { |i| @counter.add('mycounter1', "item#{i}") } 108 | end 109 | Timecop.travel(Time.now - 1 * WEEKS) do 110 | (100..200).each { |i| 
@counter.add('mycounter2', "item#{i}") } 111 | end 112 | Timecop.travel(Time.now - 6 * DAYS) do 113 | (0..100).each { |i| @counter.add('mycounter2', "item#{i}") } 114 | end 115 | Timecop.travel(Time.now - 5 * DAYS) do 116 | (100..200).each { |i| @counter.add('mycounter1', "item#{i}") } 117 | end 118 | Timecop.travel(Time.now - 4 * DAYS) do 119 | (200..250).each { |i| @counter.add('mycounter1', "item#{i}") } 120 | end 121 | Timecop.travel(Time.now - 3 * DAYS) do 122 | (200..250).each { |i| @counter.add('mycounter2', "item#{i}") } 123 | end 124 | 125 | @counter.union_store('counter1_1_week_ago', ['mycounter1'], Time.now.to_i - 1 * WEEKS) 126 | @counter.union_store('counter2_5_days_ago', ['mycounter2'], Time.now.to_i - 5 * DAYS) 127 | counter_should_equal(@counter.union(['counter1_1_week_ago', 'counter2_5_days_ago']), 150, 250) 128 | end 129 | 130 | it "allows you to override the time an event is registered when it's added" do 131 | (0..1000).each { |i| @counter.add('mycounter', "item#{i}", Time.now.to_i - 3 * WEEKS) } 132 | (1000..2000).each { |i| @counter.add('mycounter', "item#{i}", Time.now.to_i - 2 * WEEKS) } 133 | (2000..3000).each { |i| @counter.add('mycounter', "item#{i}", Time.now.to_i - 1 * WEEKS) } 134 | (3000..4000).each { |i| @counter.add('mycounter', "item#{i}") } 135 | 136 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 4 * WEEKS), 4000) 137 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 2 * WEEKS - 3 * DAYS), 3000) 138 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * WEEKS - 3 * DAYS), 2000) 139 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 3 * DAYS), 1000) 140 | end 141 | 142 | it "doesn't screw up more recent counts when items are injected with earlier timestamp overrides" do 143 | Timecop.travel(Time.now - 3 * WEEKS) do 144 | (0..1000).each { |i| @counter.add('mycounter', "item#{i}") } 145 | end 146 | 147 | Timecop.travel(Time.now - 2 * WEEKS) do 148 | 
(1000..2000).each { |i| @counter.add('mycounter', "item#{i}") } 149 | end 150 | 151 | Timecop.travel(Time.now - 1 * WEEKS) do 152 | (2000..3000).each { |i| @counter.add('mycounter', "item#{i}") } 153 | end 154 | 155 | Timecop.travel(Time.now - 2 * DAYS) do 156 | (1000..2000).each { |i| @counter.add('mycounter', "item#{i}") } 157 | end 158 | 159 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 4 * WEEKS), 3000) 160 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 2 * WEEKS - 3 * DAYS), 2000) 161 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * WEEKS - 3 * DAYS), 2000) 162 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 3 * DAYS), 1000) 163 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * DAYS), 0) 164 | 165 | # Shouldn't change counts, since they're updates to counts that happen later 166 | # than the time we're trying to inject 167 | (1000..2000).each { |i| @counter.add('mycounter', "item#{i}", Time.now.to_i - 1 * WEEKS) } 168 | 169 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 4 * WEEKS), 3000) 170 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 2 * WEEKS - 3 * DAYS), 2000) 171 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * WEEKS - 3 * DAYS), 2000) 172 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 3 * DAYS), 1000) 173 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * DAYS), 0) 174 | 175 | # Should change counts, since they're updates to counts for items we've never 176 | # seen before in the past 177 | (3000..4000).each { |i| @counter.add('mycounter', "item#{i}", Time.now.to_i - 1 * WEEKS) } 178 | 179 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 4 * WEEKS), 4000) 180 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 2 * WEEKS - 3 * DAYS), 3000) 181 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * WEEKS 
- 3 * DAYS), 3000) 182 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 3 * DAYS), 1000) 183 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * DAYS), 0) 184 | end 185 | 186 | it "can compute deltas over time on events correctly" do 187 | # A larger-scale test that simulates user join events and tests that we can get 188 | # week-by-week deltas. Generate new user counts according to the following 189 | # weekly schedule: 55780 during the first week, 300 more during the next week, 190 | # 10 more the next week, etc. 191 | 192 | schedule = [55780, 300, 10, 4000, 1000, 1000, 5000, 15000, 30000, 3000] 193 | schedule.each_with_index do |num_users, i| 194 | Timecop.travel(Time.now - (schedule.length * WEEKS) + (i * WEEKS)) do 195 | num_users.times do |i| 196 | Timecop.travel(Time.now + 2 * HOURS + i) do 197 | @counter.add("users", "user#{SecureRandom.uuid}") 198 | end 199 | end 200 | end 201 | end 202 | 203 | actual_total = schedule.reduce(:+) 204 | estimated_total = @counter.count("users") 205 | (actual_total - estimated_total).abs.should < @expected_relative_error * actual_total 206 | 207 | # Go through the schedule, computing week-by-week deltas and comparing them to the 208 | # scheduled additions. 209 | 210 | schedule.each_with_index do |users_joined, i| 211 | week = schedule.length - 1 - i 212 | c = @counter.count('users', Time.now.to_i - (week+1) * WEEKS) - @counter.count('users', Time.now.to_i - week * WEEKS) 213 | (users_joined - c).abs.should < @expected_relative_error * schedule.reduce(:+) 214 | end 215 | end 216 | end 217 | --------------------------------------------------------------------------------