├── .document ├── .gitignore ├── .rspec ├── Gemfile ├── Gemfile.lock ├── HISTORY.md ├── LICENSE.txt ├── README.md ├── Rakefile ├── VERSION ├── hyperloglog-redis.gemspec ├── lib ├── algorithm.rb ├── counter.rb ├── hyperloglog-redis.rb └── time_series_counter.rb └── spec ├── hyper_log_log_spec.rb ├── spec_helper.rb └── time_series_counter_spec.rb /.document: -------------------------------------------------------------------------------- 1 | lib/**/*.rb 2 | bin/* 3 | - 4 | features/**/*.feature 5 | LICENSE.txt 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *~ 3 | *# 4 | .#* 5 | .yardoc/* 6 | .pt 7 | .rvmrc 8 | .redcar 9 | .powrc 10 | .irbrc 11 | .bundle 12 | .pt 13 | docs/* 14 | log/* 15 | pkg/* 16 | .autotest 17 | *.watchr 18 | *.sublime-project 19 | *.sublime-workspace 20 | rspec.failures 21 | 22 | 23 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --color 2 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "http://rubygems.org" 2 | 3 | gem 'murmurhash3', '~> 0.1.3' 4 | gem 'redis', '~> 3.0.1' 5 | 6 | group :development, :test do 7 | gem 'jeweler', '~> 1.8.4' 8 | gem 'rake', '~> 0.9.2.2' 9 | gem 'rspec', '~> 2.11.0' 10 | gem 'timecop', '~> 0.5.3' 11 | end -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: http://rubygems.org/ 3 | specs: 4 | diff-lcs (1.1.3) 5 | git (1.2.5) 6 | jeweler (1.8.4) 7 | bundler (~> 1.0) 8 | git (>= 1.2.5) 9 | rake 10 | rdoc 11 | json (1.7.5) 12 | murmurhash3 (0.1.3) 13 | rake (0.9.2.2) 14 
| rdoc (3.12) 15 | json (~> 1.4) 16 | redis (3.0.1) 17 | rspec (2.11.0) 18 | rspec-core (~> 2.11.0) 19 | rspec-expectations (~> 2.11.0) 20 | rspec-mocks (~> 2.11.0) 21 | rspec-core (2.11.1) 22 | rspec-expectations (2.11.3) 23 | diff-lcs (~> 1.1.3) 24 | rspec-mocks (2.11.2) 25 | timecop (0.5.3) 26 | 27 | PLATFORMS 28 | ruby 29 | 30 | DEPENDENCIES 31 | jeweler (~> 1.8.4) 32 | murmurhash3 (~> 0.1.3) 33 | rake (~> 0.9.2.2) 34 | redis (~> 3.0.1) 35 | rspec (~> 2.11.0) 36 | timecop (~> 0.5.3) 37 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | ## 2.0.0 (11/30/2012) 2 | 3 | * Changed the underlying storage from Redis hashes to bitstrings [simonkro](https://github.com/simonkro) 4 | If you have existing counters stored from version 1.0.0, you can upgrade them with 5 | the following method: 6 | 7 | def upgrade_1_2(counter, redis) 8 | return if redis.type(counter) == "string" 9 | sketch = redis.hgetall(counter) 10 | redis.del(counter) 11 | sketch.each{ |key, value| redis.setrange(counter, key.to_i, value.to_i.chr) } 12 | end 13 | 14 | * Moved main counter implementation from `HyperLogLog` to the class `HyperLogLog::Counter` 15 | 16 | * Added `HyperLogLog::TimeSeriesCounter` a counter type that can estimate cardinalities 17 | for all events from a particular point in the past until the present. 18 | 19 | 20 | ## 1.0.0 (10/26/2012) 21 | 22 | * Changed the underlying storage from Redis sorted sets to Redis hashes. 
This 23 | is a breaking change, if you have existing counters stored from earlier 24 | versions of this library, you can upgrade them with something like the 25 | following method: 26 | 27 | def upgrade(counter, redis) 28 | return if redis.type(counter) == "hash" 29 | values = redis.zrange(counter, 0, -1, {withscores: true}) 30 | redis.del(counter) 31 | values.each { |key, value| redis.hset(counter, key, value.to_i) } 32 | end 33 | 34 | * Added union_store command, which stores the results of a union for querying 35 | or combining with other sets later 36 | 37 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 Art.sy, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | hyperloglog-redis 2 | ================= 3 | 4 | This gem is a pure Ruby implementation of the HyperLogLog algorithm for estimating 5 | cardinalities of sets observed via a stream of events. A [Redis](http://redis.io) 6 | instance is used for storing the counters. A minimal example: 7 | 8 | require 'redis' 9 | require 'hyperloglog-redis' 10 | 11 | counter = HyperLogLog::Counter.new(Redis.new) 12 | ['john', 'paul', 'george', 'ringo', 'john', 'paul'].each do |beatle| 13 | counter.add('beatles', beatle) 14 | end 15 | 16 | puts "There are approximately #{counter.count('beatles')} distinct Beatles" 17 | 18 | Each HyperLogLog counter uses a small, fixed amount of space but can 19 | estimate the cardinality of any set of up to around a billion values with 20 | relative error of 1.04 / Math.sqrt(2 ** b) with high probability, where b is a 21 | parameter passed to the `HyperLogLog::Counter` initializer that defaults to 10. 22 | With b = 10, each counter is represented by a 1 KB string in Redis and we get 23 | an expected relative error of 3%. Contrast this with the amount of space needed 24 | to compute set cardinality exactly, which is over 100 MB for even a bit vector 25 | representing a set with a billion values. 26 | 27 | The basic idea of HyperLogLog (and its predecessors PCSA, LogLog, and others) is 28 | to apply a good hash function to each value observed in the stream and record the longest 29 | run of zeros seen as a prefix of any hashed value. If the hash 30 | function is good, the bits in any hashed value should be close to statistically independent, 31 | so seeing a value that starts with exactly X zeros should happen with probability close to 32 | 2 ** -(X + 1). 
So, if you've seen a run of 5 zeros in one of your hash values, 33 | you're likely to have around 2 ** 6 = 64 values in the underlying set. The actual 34 | implementation and analysis are much more advanced than this, but that's the idea. 35 | 36 | This gem implements a few useful extensions to the basic HyperLogLog algorithm 37 | which allow you to estimate unions and intersections of counters as well as 38 | counts within specific time ranges. These extensions are described in detail below. 39 | 40 | The HyperLogLog algorithm is described and analyzed in the paper 41 | ["HyperLogLog: the analysis of a near-optimal cardinality estimation 42 | algorithm"](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf) 43 | by Flajolet, Fusy, Gandouet, and Meunier. Our implementation closely 44 | follows the program described in Section 4 of that paper. 45 | 46 | Unions and intersections 47 | ======================== 48 | 49 | You can also ask for an estimate of the union from multiple counters: 50 | 51 | ['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wings_member| 52 | counter.add('wings', wings_member) 53 | end 54 | 55 | puts "There are approximately #{counter.union(['beatles', 'wings'])} people who were in the Beatles or Wings" 56 | 57 | The same relative error guarantee above applies to unions: a union of 58 | size N can be estimated to within +/- N * (1.04 / Math.sqrt(2 ** b)) elements, 59 | regardless of how many HyperLogLog counters that union spans. 
You can store 60 | a unioned counter for querying or combining later with `union_store`: 61 | 62 | counter.union_store('all_beatles_and_wings_members', ['beatles', 'wings']) 63 | 64 | puts "There are approximately #{counter.count('all_beatles_and_wings_members')} people who were in the Beatles or Wings" 65 | 66 | Intersections can also be estimated: 67 | 68 | puts "There are approximately #{counter.intersection(['beatles', 'wings'])} people who were in both the Beatles and Wings" 69 | 70 | However, intersections of HyperLogLog counters are calculated indirectly via the 71 | [inclusion/exclusion principle](http://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle) 72 | as a sum of unions and there aren't good theoretical bounds on the error of that sum. In 73 | practice, the estimates that come out of small intersections tend to follow the 74 | same relative error patterns, but beware using this type of estimation on intersections 75 | of large numbers of sets, both because the errors can be much larger than those guaranteed 76 | for unions and the complexity of computing intersections grows exponentially with 77 | the number of sets in the intersection. 78 | 79 | Set cardinality within a time interval 80 | ====================================== 81 | 82 | All examples up until now use `HyperLogLog::Counter`, which stores HyperLogLog 83 | counters as (2 ** b)-byte Redis strings. hyperloglog-redis also contains the counter implementation 84 | `HyperLogLog::TimeSeriesCounter`, which uses a little more space (Redis strings of up to 85 | 4 * (32 - b) * (2 ** b) bytes) but allows you to estimate the cardinality of sets during 86 | certain time windows. 87 | 88 | Using `HyperLogLog::TimeSeriesCounter`, you can get estimates of the number of distinct 89 | elements added to a set in the past X seconds, for any value of X.
A `HyperLogLog::TimeSeriesCounter` 90 | is initialized with the same arguments as a regular `Counter` but implements a 91 | superset of `HyperLogLog::Counter`'s interface. Namely, each of the methods `add`, 92 | `count`, `union`, `intersection`, and `union_store` take an optional final time argument, 93 | either a Ruby `Time` or an integer representing seconds since the epoch. 94 | 95 | When passed a time argument t, `add` registers an addition to the set at time t. When no 96 | time is passed, the current system time is used. The methods `count`, `union`, 97 | `intersection`, and `union_store` all estimate set cardinality for the time interval 98 | consisting of all events that happened after time t when t is passed as a final argument. 99 | 100 | For example, to get the number of distinct user logins within the 101 | past week, we might call: 102 | 103 | one_week = 60 * 60 * 24 * 7 104 | logins_in_past_week = counter.count('user_logins', Time.now - one_week) 105 | 106 | A note about relative errors 107 | ============================ 108 | 109 | With a parameter `b` in the range [4..16], HyperLogLog counters provide a relative 110 | error of 1.04/sqrt(2 ** b) with high probability. When unions, intersections, and 111 | time range queries are involved, it's sometimes not clear what the relative error 112 | is relative to, so here is some clarification: 113 | 114 | * For a union of counters, the relative error applies to the size of the union. Taking 115 | the union of counters is lossless in the sense that you end up with the same counter 116 | you would have arrived at had you observed the union of all of the individual events. 117 | 118 | * For an intersection of counters, there's no good theoretical bound on the relative 119 | error. In practice, and especially for intersections involving a small number of sets, 120 | the relative error you obtain tends to be in relation to the size of the union of the 121 | sets involved. 
For example, if you have two sets, each of cardinality 5000 and observe 122 | both sets through HyperLogLog counters with parameter b=10 (3% relative error), you can 123 | expect the intersection estimate to be within 10000 * 0.03 = 300 of the actual intersection 124 | size. 125 | 126 | * For time queries, the relative error applies to the size of the set within the time 127 | range you've queried. For example, given a set of cardinality 1,000,000 that has had 128 | 100 distinct additions within the last 10 minutes, if you observe such a set with a 129 | HyperLogLog counter with parameter b=10 (3% relative error), you can expect the count 130 | returned from a query about the last 10 minutes to be within 3 of 100. 131 | 132 | Comparison to other approaches 133 | ============================== 134 | 135 | When trying to optimize for space, two well-known alternatives to HyperLogLog exist: 136 | 137 | * Bit vectors: you provide some near-perfect hash function between keys in your domain 138 | and an interval of integers, then represent that interval of integers with bits. 139 | * Bloom filters with counters: use a [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter) 140 | to keep track of items seen; on insert, when the Bloom filter tells you that the item 141 | seen is not in the set, increment the counter. 142 | 143 | Both bit vectors and bloom filters can be augmented to hold timestamps for entries in the 144 | data structures and simulate counters for time-ranges like `HyperLogLog::TimeSeriesCounter`. 145 | 146 | Bit vectors give exact counts, but the space complexity is linear with the size of 147 | the set, and you must either allocate a large bit vector upfront or cope with the complexity 148 | of dynamically resizing your bit vector as the set grows. Providing a manual mapping from 149 | members of your set to an interval of integers is sometimes a non-trivial task. 
Counts, 150 | unions, and intersections are all linear-time operations in the size of the universe of 151 | the set being represented. 152 | 153 | Bloom filters can be much more compact than bit vectors, but the actual count associated 154 | with a Bloom filter is an artifact of the construction of the data structure, so the cost 155 | of estimating a union or intersection is linear in the size of the Bloom filter. Getting 156 | high probability guarantees on the quality of the estimate of Bloom filter counts requires 157 | several "good" hash functions that have some degree of independence from each other; in 158 | practice, coming up with several independent implementations of good hash functions is 159 | difficult. Bloom filters require that all of their space be allocated upfront (re-hashing 160 | isn't possible without replaying all events), so in practice you need some estimate of 161 | how large the counters are going to be before allocating the counter. 162 | 163 | HyperLogLog counters take up less space than either of the above approaches and provide 164 | constant-time implementations (in the size of the sets being represented) of unions, 165 | intersections, and time range queries. A `HyperLogLog::Counter` with parameter b will 166 | be stored in a Redis string of length at most 2 ** b bytes, whereas a `HyperLogLog::TimeSeriesCounter` with parameter 167 | b will be stored in a Redis string of length at most 4 * (32 - b) * (2 ** b) bytes. For counters representing smaller sets, 168 | the size taken up by a `HyperLogLog::TimeSeriesCounter` can be significantly less. Here 169 | are some examples for specific values of b: 170 | 171 | * With b = 7, a `HyperLogLog::Counter` uses at most 128 bytes and a `HyperLogLog::TimeSeriesCounter` uses at most 13 KB while providing a relative error of 9%. 
172 | * With b = 11, a `HyperLogLog::Counter` uses at most 2 KB and a `HyperLogLog::TimeSeriesCounter` uses at most 168 KB while providing a relative error of 2% 173 | * With b = 16, a `HyperLogLog::Counter` uses at most 64 KB and a `HyperLogLog::TimeSeriesCounter` uses at most 4 MB while providing a relative error of less than half a percent. 174 | 175 | Installation 176 | ============ 177 | 178 | gem install hyperloglog-redis 179 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'rubygems' 4 | require 'bundler' 5 | begin 6 | Bundler.setup(:default, :development) 7 | rescue Bundler::BundlerError => e 8 | $stderr.puts e.message 9 | $stderr.puts "Run `bundle install` to install missing gems" 10 | exit e.status_code 11 | end 12 | require 'rake' 13 | 14 | require 'jeweler' 15 | Jeweler::Tasks.new do |gem| 16 | # gem is a Gem::Specification... 
see http://docs.rubygems.org/read/chapter/20 for more options 17 | gem.name = "hyperloglog-redis" 18 | gem.homepage = "http://github.com/aaw/hyperloglog-redis" 19 | gem.license = "MIT" 20 | gem.summary = %Q{An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end} 21 | gem.description = %Q{An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end} 22 | gem.email = "aaron.windsor@gmail.com" 23 | gem.authors = ["Aaron Windsor"] 24 | # dependencies defined in Gemfile 25 | end 26 | Jeweler::RubygemsDotOrgTasks.new 27 | 28 | require 'rspec/core' 29 | require 'rspec/core/rake_task' 30 | RSpec::Core::RakeTask.new(:spec) do |spec| 31 | spec.pattern = FileList['spec/**/*_spec.rb'] 32 | end 33 | 34 | task :default => :spec 35 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 2.0.0 -------------------------------------------------------------------------------- /hyperloglog-redis.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec' 4 | # -*- encoding: utf-8 -*- 5 | 6 | Gem::Specification.new do |s| 7 | s.name = "hyperloglog-redis" 8 | s.version = "2.0.0" 9 | 10 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? 
:required_rubygems_version= 11 | s.authors = ["Aaron Windsor"] 12 | s.date = "2012-11-30" 13 | s.description = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end" 14 | s.email = "aaron.windsor@gmail.com" 15 | s.extra_rdoc_files = [ 16 | "LICENSE.txt", 17 | "README.md" 18 | ] 19 | s.files = [ 20 | ".document", 21 | ".rspec", 22 | "Gemfile", 23 | "Gemfile.lock", 24 | "HISTORY.md", 25 | "LICENSE.txt", 26 | "README.md", 27 | "Rakefile", 28 | "VERSION", 29 | "hyperloglog-redis.gemspec", 30 | "lib/algorithm.rb", 31 | "lib/counter.rb", 32 | "lib/hyperloglog-redis.rb", 33 | "lib/time_series_counter.rb", 34 | "spec/hyper_log_log_spec.rb", 35 | "spec/spec_helper.rb", 36 | "spec/time_series_counter_spec.rb" 37 | ] 38 | s.homepage = "http://github.com/aaw/hyperloglog-redis" 39 | s.licenses = ["MIT"] 40 | s.require_paths = ["lib"] 41 | s.rubygems_version = "1.8.10" 42 | s.summary = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end" 43 | 44 | if s.respond_to? 
:specification_version then 45 | s.specification_version = 3 46 | 47 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then 48 | s.add_runtime_dependency(%q, ["~> 0.1.3"]) 49 | s.add_runtime_dependency(%q, ["~> 3.0.1"]) 50 | s.add_development_dependency(%q, ["~> 1.8.4"]) 51 | s.add_development_dependency(%q, ["~> 0.9.2.2"]) 52 | s.add_development_dependency(%q, ["~> 2.11.0"]) 53 | s.add_development_dependency(%q, ["~> 0.5.3"]) 54 | else 55 | s.add_dependency(%q, ["~> 0.1.3"]) 56 | s.add_dependency(%q, ["~> 3.0.1"]) 57 | s.add_dependency(%q, ["~> 1.8.4"]) 58 | s.add_dependency(%q, ["~> 0.9.2.2"]) 59 | s.add_dependency(%q, ["~> 2.11.0"]) 60 | s.add_dependency(%q, ["~> 0.5.3"]) 61 | end 62 | else 63 | s.add_dependency(%q, ["~> 0.1.3"]) 64 | s.add_dependency(%q, ["~> 3.0.1"]) 65 | s.add_dependency(%q, ["~> 1.8.4"]) 66 | s.add_dependency(%q, ["~> 0.9.2.2"]) 67 | s.add_dependency(%q, ["~> 2.11.0"]) 68 | s.add_dependency(%q, ["~> 0.5.3"]) 69 | end 70 | end 71 | 72 | -------------------------------------------------------------------------------- /lib/algorithm.rb: -------------------------------------------------------------------------------- 1 | require 'redis' 2 | require 'murmurhash3' 3 | 4 | module HyperLogLog 5 | module Algorithm 6 | 7 | def initialize(redis, b=10) 8 | raise "Accuracy not supported. Please choose a value of b between 4 and 16" if b < 4 || b > 16 9 | @redis = redis 10 | @bits_in_hash = 32 - b 11 | @m = (2 ** b).to_i 12 | if @m == 16 13 | @alpha = 0.673 14 | elsif @m == 32 15 | @alpha = 0.697 16 | elsif @m == 64 17 | @alpha = 0.709 18 | else 19 | @alpha = 0.7213/(1 + 1.079/@m) 20 | end 21 | end 22 | 23 | # Estimate the cardinality of the intersection of several sets. We do this by 24 | # using the principle of inclusion and exclusion to represent the size of the 25 | # intersection as the alternating sum of an exponential number of 26 | # cardinalities of unions of smaller sets. 
27 | def intersection(counter_names, time=0) 28 | icount = (1..counter_names.length).map do |k| 29 | counter_names.combination(k).map do |group| 30 | ((k % 2 == 0) ? -1 : 1) * union_helper(group, time) 31 | end.inject(0, :+) 32 | end.inject(0, :+) 33 | [icount, 0].max 34 | end 35 | 36 | private 37 | 38 | def hash_info(value) 39 | hash = MurmurHash3::V32.murmur3_32_str_hash(value) 40 | [hash, hash % @m, rho(hash / @m)] 41 | end 42 | 43 | def union_helper(counter_names, time=0) 44 | all_estimates = raw_union(counter_names, time).select{ |i| i > 0 } 45 | estimate_sum = all_estimates.reduce(0.0){ |a, score| a + 2.0 ** -score } 46 | estimate = @alpha * @m * @m / (estimate_sum + @m - all_estimates.length) 47 | if estimate <= 2.5 * @m 48 | if all_estimates.length == @m 49 | estimate.round 50 | else # Correction for small sets 51 | (@m * Math.log(Float(@m)/(@m - all_estimates.length))).round 52 | end 53 | elsif estimate <= 2 ** 32 / 30.0 54 | estimate.round 55 | else # Correction for large sets 56 | (-2**32 * Math.log(1 - estimate/(2.0**32))).round 57 | end 58 | end 59 | 60 | # rho(i) is the position of the first 1 in the binary representation of i, 61 | # reading from most significant to least significant bits. Some examples: 62 | # rho(1...) = 1, rho(001...) = 3, rho(000...0) = @bits_in_hash + 1 63 | def rho(i) 64 | return @bits_in_hash + 1 if i == 0 65 | @bits_in_hash - Math.log(i, 2).floor 66 | end 67 | 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /lib/counter.rb: -------------------------------------------------------------------------------- 1 | module HyperLogLog 2 | class Counter 3 | include Algorithm 4 | 5 | # This is the implementation of the standard HyperLogLog algorithm, storing 6 | # counts in each byte of a string of length 2 ** b. 
7 | 8 | def add(counter_name, value) 9 | hash, function_name, new_value = hash_info(value) 10 | existing_value = @redis.getrange(counter_name, function_name, function_name).unpack('C').first.to_i 11 | @redis.setrange(counter_name, function_name, new_value.chr) if new_value > existing_value 12 | end 13 | 14 | # Estimate the cardinality of a single set 15 | def count(counter_name) 16 | union_helper([counter_name]) 17 | end 18 | 19 | # Estimate the cardinality of the union of several sets 20 | def union(counter_names) 21 | union_helper(counter_names) 22 | end 23 | 24 | # Store the union of several sets in *destination* so that it can be used as 25 | # a HyperLogLog counter later. 26 | def union_store(destination, counter_names) 27 | @redis.set(destination, raw_union(counter_names).inject('') {|a, e| a << e.chr}) 28 | end 29 | 30 | private 31 | 32 | def raw_union(counter_names, time=nil) 33 | counters = @redis.mget(*counter_names).compact 34 | return [] if counters.none? 35 | return counters.first.each_byte if counters.one? 36 | counters.map{|c| c.unpack("C#{@m}")}.transpose.map {|e| e.compact.max.to_i} 37 | end 38 | 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /lib/hyperloglog-redis.rb: -------------------------------------------------------------------------------- 1 | require "algorithm" 2 | require "counter" 3 | require "time_series_counter" 4 | -------------------------------------------------------------------------------- /lib/time_series_counter.rb: -------------------------------------------------------------------------------- 1 | module HyperLogLog 2 | class TimeSeriesCounter 3 | include Algorithm 4 | 5 | # This is an implementation of HyperLogLog that allows for querying counts 6 | # within time ranges of the form (t, current_time] with second-level 7 | # granularity. 
The standard implementation of HyperLogLog stores the max 8 | # number of leading zeros seen in the image of each of 2 ** b hash 9 | # functions. These counts can naturally be stored in a string of length 10 | # 2 ** b by allocating one byte per leading zero count. 11 | # 12 | # To provide counts within a time range, we alter the standard 13 | # implementation to store a mapping of pairs of the form (hash function, 14 | # leading zero count) -> timestamp, where the mapping (h,z) -> t represents 15 | # the fact that we observed z leading zeros in the image of hash function h 16 | # most recently at time t. This mapping is stored in a string by packing 17 | # 4-byte words (timestamps, represented in seconds since the epoch) into 18 | # a matrix indexed by hash function and zero count stored in row-major 19 | # order. Since the max zero count for a counter with parameter b is (32-b), 20 | # this representation takes up at most 4 * (32-b) * (2 ** b) bytes (and 21 | # usually much less, since we don't allocate space for rows corresponding 22 | # to higher leading zero counts until they're actually observed.) 23 | # 24 | # To convert this representation to a HyperLogLog counter for the time 25 | # range (t, current_time], we simply filter out all timestamps less than t 26 | # in the matrix and then find, for each hash function, the maximum z for 27 | # which that hash function has a non-zero timestamp. 28 | 29 | def add(counter_name, value, time=nil) 30 | hash, function_name, new_value = hash_info(value) 31 | index = 4 * (function_name + (new_value.to_i * @m)) 32 | if time.nil? 33 | @redis.setrange(counter_name, index, [Time.now.to_i].pack('N')) 34 | else 35 | existing_time = @redis.getrange(counter_name, index, index + 3) 36 | existing_val = existing_time.empty? ?
0 : existing_time.unpack('N').first 37 | @redis.setrange(counter_name, index, [time.to_i].pack('N')) if time.to_i > existing_val 38 | end 39 | end 40 | 41 | # Estimate the cardinality of a single set 42 | def count(counter_name, time=0) 43 | union_helper([counter_name], time) 44 | end 45 | 46 | # Estimate the cardinality of the union of several sets 47 | def union(counter_names, time=0) 48 | union_helper(counter_names, time) 49 | end 50 | 51 | # Store the union of several sets in *destination* so that it can be used as 52 | # a HyperLogLog counter later. 53 | def union_store(destination, counter_names, time=0) 54 | raw_counters = @redis.mget(*counter_names).compact.map{ |c| c.unpack('N*').map{ |x| x > time ? x : 0 } } 55 | combined_counters = jagged_transpose(raw_counters).map{ |x| x.max.to_i } 56 | @redis.set(destination, combined_counters.pack('N*')) 57 | end 58 | 59 | private 60 | 61 | def raw_union(counter_names, time=0) 62 | raw_counters = @redis.mget(*counter_names).compact 63 | return [] if raw_counters.none? 64 | hyperloglog_counters = raw_counters.map do |counter| 65 | jagged_transpose(counter.unpack('N*').each_slice(@m).to_a).map{ |x| x.rindex{ |c| c > time } || 0 } 66 | end 67 | return hyperloglog_counters.first if hyperloglog_counters.one? 68 | jagged_transpose(hyperloglog_counters).map{ |x| x.max.to_i } 69 | end 70 | 71 | # Given an array of non-uniform length arrays, right-pad all arrays with 72 | # zeros so they're the same size, then transpose the array. 
This is a 73 | # destructive operation: the zero-padding modifies the array-of-arrays 74 | def jagged_transpose(arrays) 75 | max_length = arrays.map{ |a| a.length }.max 76 | arrays.map{ |a| a.fill(0, a.length, max_length - a.length) }.transpose 77 | end 78 | 79 | end 80 | end 81 | -------------------------------------------------------------------------------- /spec/hyper_log_log_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper') 2 | 3 | describe HyperLogLog do 4 | 5 | [HyperLogLog::Counter, HyperLogLog::TimeSeriesCounter].each do |counter_type| 6 | 7 | it "doesn't change its count when it sees values that it's already seen" do 8 | redis = Redis.new 9 | counter = counter_type.new(redis, 10) 10 | test_set = (1..100).map{ |x| x.to_s } 11 | test_set.each{ |value| counter.add("mycounter", value) } 12 | original_estimate = counter.count("mycounter") 13 | 5.times do 14 | test_set.each do |value| 15 | counter.add("mycounter", value) 16 | counter.count("mycounter").should == original_estimate 17 | end 18 | end 19 | end 20 | 21 | it "can maintain more than one logically distinct counter" do 22 | redis = Redis.new 23 | counter = counter_type.new(redis, 10) 24 | other_estimate = counter.count("counter2") 25 | (1..100).each do |i| 26 | counter.add("counter1", i.to_s) 27 | counter.count("counter2").should == other_estimate 28 | end 29 | other_estimate = counter.count("counter1") 30 | (101..200).each do |i| 31 | counter.add("counter2", i.to_s) 32 | counter.count("counter1").should == other_estimate 33 | end 34 | other_estimate = counter.count("counter2") 35 | (201..300).each do |i| 36 | counter.add("counter1", i.to_s) 37 | counter.count("counter2").should == other_estimate 38 | end 39 | counter.count("counter1").should > 100 40 | counter.count("counter2").should > 50 41 | counter.count("counter1").should > counter.count("counter2") 42 | end 43 | 44 | it "can exactly count 
small sets" do 45 | redis = Redis.new 46 | counter = counter_type.new(redis, 11) 47 | 10.times { |i| counter.add("mycounter", i.to_s) } 48 | counter.count("mycounter").should == 10 49 | end 50 | 51 | it "can exactly count small unions" do 52 | redis = Redis.new 53 | counter = counter_type.new(redis, 11) 54 | (1..8).each { |i| counter.add("mycounter1", i.to_s) } 55 | (5..12).each { |i| counter.add("mycounter2", i.to_s) } 56 | counter.union(["mycounter1", "mycounter2"]).should == 12 57 | end 58 | 59 | it "can exactly count small intersections" do 60 | redis = Redis.new 61 | counter = counter_type.new(redis, 11) 62 | (1..8).each { |i| counter.add("mycounter1", i.to_s) } 63 | (5..12).each { |i| counter.add("mycounter2", i.to_s) } 64 | counter.intersection(["mycounter1", "mycounter2"]).should == 4 65 | end 66 | 67 | it "can store unions for querying later" do 68 | redis = Redis.new 69 | counter = counter_type.new(redis, 11) 70 | (1..10).each { |i| counter.add("mycounter1", i.to_s) } 71 | (5..15).each { |i| counter.add("mycounter2", i.to_s) } 72 | (15..25).each { |i| counter.add("mycounter3", i.to_s) } 73 | (20..50).each { |i| counter.add("mycounter4", i.to_s) } 74 | counter.union_store("aggregate_counter", ["mycounter1", "mycounter2", "mycounter3", "mycounter4"]) 75 | counter.union(["mycounter1", "mycounter2", "mycounter3", "mycounter4"]).should == counter.count("aggregate_counter") 76 | end 77 | 78 | # With parameter b, HyperLogLog should produce estimates that have 79 | # relative error of 1.04 / Math.sqrt(2 ** b). Of course, this analysis 80 | # is based on assumptions that aren't necessarily true in practice and 81 | # the observed relative error will depend on the distribution of data 82 | # we receive as well as the interaction of the murmur hash implementation 83 | # with that data. 
Keeping that in mind, the following spec makes sure 84 | # that in the process of adding 1000 values to a set, HyperLogLog only 85 | # gives bad estimates (more than twice the expected relative error) in 86 | # less than 1% of the cases and never gives very bad estimates (more than 87 | # three times the expected relative error.) 88 | # 89 | # It's fine to fudge these numbers a little if the implementation changes, 90 | # since you can clearly find a different set of values that make this test 91 | # fail even without changing the implementation. But it should serve as a 92 | # good indication that there aren't any logical errors in the HyperLogLog 93 | # implementation, since it exercises all of the cases in HyperLogLog's 94 | # count method except for the correction for very large set sizes. 95 | 96 | it "produces acceptable estimates for counts" do 97 | max_items = 1000 98 | redis = Redis.new 99 | (6..16).each do |b| 100 | counter = counter_type.new(redis, b) 101 | redis.del('mycounter') 102 | bad_estimates = 0 103 | very_bad_estimates = 0 104 | expected_relative_error = 1.04 / Math.sqrt(2 ** b) 105 | max_items.times do |i| 106 | value = Digest::MD5.hexdigest("value#{i}") 107 | counter.add("mycounter", value) 108 | actual = i + 1 109 | approximate = counter.count("mycounter") 110 | relative_error = (actual - approximate).abs / Float(actual) 111 | bad_estimates += 1 if relative_error > expected_relative_error * 2 112 | very_bad_estimates += 1 if relative_error > expected_relative_error * 3 113 | end 114 | bad_estimates.should < max_items / 100.00 115 | very_bad_estimates.should == 0 116 | end 117 | end 118 | 119 | it "produces acceptable estimates for unions with few elements in common" do 120 | b, max_items = 10, 2000 121 | counter = counter_type.new(Redis.new, b) 122 | bad_estimates = 0 123 | very_bad_estimates = 0 124 | expected_relative_error = 1.04 / Math.sqrt(2 ** b) 125 | max_items.times do |i| 126 | value1 = Digest::MD5.hexdigest("value#{i}") 127 | 
counter.add("mycounter1", value1) 128 | value2 = Digest::MD5.hexdigest("value#{i}incounter2") 129 | counter.add("mycounter2", value2) 130 | value3 = Digest::MD5.hexdigest("this is value#{i}") 131 | counter.add("mycounter3", value3) 132 | actual = 3 * (i + 1) 133 | approximate = counter.union(["mycounter1", "mycounter2", "mycounter3"]) 134 | relative_error = (actual - approximate).abs / Float(actual) 135 | bad_estimates += 1 if relative_error > expected_relative_error * 2 136 | very_bad_estimates += 1 if relative_error > expected_relative_error * 3 137 | end 138 | bad_estimates.should < (3 * max_items) / 100.00 139 | very_bad_estimates.should == 0 140 | end 141 | 142 | it "produces acceptable estimates for unions with many elements in common" do 143 | b, max_items, intersection_size = 10, 1000, 2000 144 | counter = counter_type.new(Redis.new, b) 145 | bad_estimates = 0 146 | very_bad_estimates = 0 147 | expected_relative_error = 1.04 / Math.sqrt(2 ** b) 148 | 149 | intersection_size.times do |i| 150 | value = Digest::MD5.hexdigest("test#{i}value") 151 | ['mycounter1', 'mycounter2', 'mycounter3'].each do |counter_name| 152 | counter.add(counter_name, value) 153 | end 154 | end 155 | 156 | max_items.times do |i| 157 | value1 = Digest::MD5.hexdigest("value#{i}") 158 | counter.add("mycounter1", value1) 159 | value2 = Digest::MD5.hexdigest("value#{i}isincounter2") 160 | counter.add("mycounter2", value2) 161 | value3 = Digest::MD5.hexdigest("this is value#{i}") 162 | counter.add("mycounter3", value3) 163 | actual = 3 * (i + 1) + intersection_size 164 | approximate = counter.union(["mycounter1", "mycounter2", "mycounter3"]) 165 | relative_error = (actual - approximate).abs / Float(actual) 166 | bad_estimates += 1 if relative_error > expected_relative_error * 2 167 | very_bad_estimates += 1 if relative_error > expected_relative_error * 3 168 | end 169 | 170 | bad_estimates.should < ((3 * max_items) + intersection_size) / 100.00 171 | very_bad_estimates.should == 0 172 | end 
173 | 174 | # There are no good theoretical guarantees that I know of for arbitrary 175 | # intersection estimation, since it's expessed as the sum of unions of 176 | # HyperLogLog counters, but it tends to work okay in practice, as seen below. 177 | 178 | it "produces decent estimates for intersections" do 179 | b, max_items = 6, 1000 180 | counter = counter_type.new(Redis.new, b) 181 | expected_relative_error = 1.04 / Math.sqrt(2 ** b) 182 | 183 | max_items.times do |i| 184 | value1 = Digest::MD5.hexdigest("first-value#{i}") 185 | value2 = Digest::MD5.hexdigest("second-value#{i}") 186 | value3 = Digest::MD5.hexdigest("third-value#{i}") 187 | value4 = Digest::MD5.hexdigest("fourth-value#{i}") 188 | counter.add("mycounter1", value1) 189 | counter.add("mycounter2", value2) 190 | counter.add("mycounter3", value3) 191 | counter.add("mycounter4", value4) 192 | [value1, value2, value3, value4].each{ |value| counter.add("mycounter5", value) } 193 | end 194 | 195 | small_counters = ['mycounter1', 'mycounter2', 'mycounter3', 'mycounter4'] 196 | 197 | small_counters.each do |counter_name| 198 | intersection_estimate = counter.intersection([counter_name, 'mycounter5']) 199 | intersection_estimate.should > 0 200 | (intersection_estimate - counter.count(counter_name)).abs.should < max_items * expected_relative_error 201 | end 202 | 203 | [2,3].each do |intersection_size| 204 | small_counters.combination(intersection_size).each do |counter_names| 205 | intersection_estimate = counter.intersection(counter_names) 206 | intersection_estimate.should >= 0 207 | intersection_estimate.should < intersection_size * max_items * expected_relative_error 208 | end 209 | end 210 | 211 | 100.times do |i| 212 | value = Digest::MD5.hexdigest("somethingintheintersection#{i}") 213 | small_counters.each { |counter_name| counter.add(counter_name, value) } 214 | end 215 | 216 | [2,3,4].each do |intersection_size| 217 | small_counters.combination(intersection_size).each do |counter_names| 218 | 
intersection_estimate = counter.intersection(counter_names) 219 | intersection_estimate.should >= 0 220 | (intersection_estimate - 100).abs.should < intersection_size * (max_items + 100) * expected_relative_error 221 | end 222 | end 223 | 224 | end 225 | end 226 | end 227 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) 2 | $LOAD_PATH.unshift(File.dirname(__FILE__)) 3 | require 'rspec' 4 | require 'redis' 5 | require 'hyperloglog-redis' 6 | 7 | db_number = ENV['REDIS_TEST_DATABASE'] || '15' 8 | ENV['REDIS_URL'] = "redis://localhost:6379/#{db_number}" 9 | redis = Redis.new 10 | if redis.keys('*').length > 0 11 | puts "Warning! These specs use database #{db_number} on your local redis instance" 12 | puts "running on port 6379. Your database #{db_number} seems to have keys in it." 13 | puts "Please clear them before running the specs or set the environment" 14 | puts "variable REDIS_TEST_DATABASE to use a different database number." 15 | raise SystemExit 16 | end 17 | 18 | # Requires supporting files with custom matchers and macros, etc, 19 | # in ./support/ and its subdirectories. 
20 | Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f} 21 | 22 | RSpec.configure do |config| 23 | config.before(:each) do 24 | Redis.new.flushdb 25 | end 26 | config.after(:each) do 27 | Redis.new.flushdb 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /spec/time_series_counter_spec.rb: -------------------------------------------------------------------------------- 1 | require 'securerandom' 2 | require 'timecop' 3 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper') 4 | 5 | MINUTES=60 6 | HOURS=MINUTES*60 7 | DAYS=HOURS*24 8 | WEEKS=DAYS*7 9 | 10 | describe HyperLogLog::TimeSeriesCounter do 11 | 12 | before(:each) do 13 | @b = 11 14 | @redis = Redis.new 15 | @counter = HyperLogLog::TimeSeriesCounter.new(@redis, @b) 16 | @expected_relative_error = 1.04 / Math.sqrt(2 ** @b) 17 | 18 | def counter_should_equal(counter_val, expected_val, relative_error_base=nil) 19 | (counter_val - expected_val).abs.should <= (relative_error_base || expected_val) * @expected_relative_error 20 | end 21 | end 22 | 23 | it "can estimate cardinalities from any particular point in time until the present" do 24 | Timecop.travel(Time.now - 2 * WEEKS) do 25 | (0..100).each { |i| @counter.add('mycounter', "item#{i}") } 26 | end 27 | Timecop.travel(Time.now - 1 * WEEKS) do 28 | (100..200).each { |i| @counter.add('mycounter', "item#{i}") } 29 | end 30 | Timecop.travel(Time.now - 6 * DAYS) do 31 | (0..100).each { |i| @counter.add('mycounter', "item#{i}") } 32 | end 33 | Timecop.travel(Time.now - 5 * DAYS) do 34 | (100..200).each { |i| @counter.add('mycounter', "item#{i}") } 35 | end 36 | Timecop.travel(Time.now - 4 * DAYS) do 37 | (200..250).each { |i| @counter.add('mycounter', "item#{i}") } 38 | end 39 | 40 | counter_should_equal(@counter.count('mycounter'), 250) 41 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 3 * WEEKS), 250) 42 | counter_should_equal(@counter.count('mycounter', 
Time.now.to_i - 1 * WEEKS - 3 * DAYS), 250) 43 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * WEEKS), 250) 44 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 5 * DAYS - 12 * HOURS), 150, 250) 45 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 4 * DAYS - 12 * HOURS), 50, 250) 46 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 3 * DAYS), 0, 250) 47 | end 48 | 49 | it "can estimate unions from any particular point in time until the present" do 50 | Timecop.travel(Time.now - 2 * WEEKS) do 51 | (0..100).each { |i| @counter.add('mycounter1', "item#{i}") } 52 | end 53 | Timecop.travel(Time.now - 1 * WEEKS) do 54 | (100..200).each { |i| @counter.add('mycounter2', "item#{i}") } 55 | end 56 | Timecop.travel(Time.now - 6 * DAYS) do 57 | (0..100).each { |i| @counter.add('mycounter1', "item#{i}") } 58 | end 59 | Timecop.travel(Time.now - 5 * DAYS) do 60 | (100..200).each { |i| @counter.add('mycounter2', "item#{i}") } 61 | end 62 | Timecop.travel(Time.now - 4 * DAYS) do 63 | (200..250).each { |i| @counter.add('mycounter1', "item#{i}") } 64 | end 65 | 66 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2']), 250) 67 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2'], Time.now.to_i - 3 * WEEKS), 250) 68 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2'], Time.now.to_i - 1 * WEEKS - 3 * DAYS), 250) 69 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2'], Time.now.to_i - 1 * WEEKS), 250) 70 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2'], Time.now.to_i - 5 * DAYS - 12 * HOURS), 150, 250) 71 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2'], Time.now.to_i - 4 * DAYS - 12 * HOURS), 50, 250) 72 | counter_should_equal(@counter.union(['mycounter1', 'mycounter2'], Time.now.to_i - 3 * DAYS), 0, 250) 73 | end 74 | 75 | it "can estimate intersections from any particular point in time until the present" do 76 | 
Timecop.travel(Time.now - 2 * WEEKS) do 77 | (0..100).each { |i| @counter.add('mycounter1', "item#{i}") } 78 | end 79 | Timecop.travel(Time.now - 1 * WEEKS) do 80 | (100..200).each { |i| @counter.add('mycounter2', "item#{i}") } 81 | end 82 | Timecop.travel(Time.now - 6 * DAYS) do 83 | (0..100).each { |i| @counter.add('mycounter2', "item#{i}") } 84 | end 85 | Timecop.travel(Time.now - 5 * DAYS) do 86 | (100..200).each { |i| @counter.add('mycounter1', "item#{i}") } 87 | end 88 | Timecop.travel(Time.now - 4 * DAYS) do 89 | (200..250).each { |i| @counter.add('mycounter1', "item#{i}") } 90 | end 91 | Timecop.travel(Time.now - 3 * DAYS) do 92 | (200..250).each { |i| @counter.add('mycounter2', "item#{i}") } 93 | end 94 | 95 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2']), 250) 96 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 3 * WEEKS), 250) 97 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 1 * WEEKS - 3 * DAYS), 150, 250) 98 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 6 * DAYS - 12 * HOURS), 50, 250) 99 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 5 * DAYS - 12 * HOURS), 50, 250) 100 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 4 * DAYS - 12 * HOURS), 50, 250) 101 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 3 * DAYS - 12 * HOURS), 0, 250) 102 | counter_should_equal(@counter.intersection(['mycounter1', 'mycounter2'], Time.now.to_i - 2 * DAYS), 0, 250) 103 | end 104 | 105 | it "can use union_store to store snapshots of counters at particular points in time" do 106 | Timecop.travel(Time.now - 2 * WEEKS) do 107 | (0..100).each { |i| @counter.add('mycounter1', "item#{i}") } 108 | end 109 | Timecop.travel(Time.now - 1 * WEEKS) do 110 | (100..200).each { |i| 
@counter.add('mycounter2', "item#{i}") } 111 | end 112 | Timecop.travel(Time.now - 6 * DAYS) do 113 | (0..100).each { |i| @counter.add('mycounter2', "item#{i}") } 114 | end 115 | Timecop.travel(Time.now - 5 * DAYS) do 116 | (100..200).each { |i| @counter.add('mycounter1', "item#{i}") } 117 | end 118 | Timecop.travel(Time.now - 4 * DAYS) do 119 | (200..250).each { |i| @counter.add('mycounter1', "item#{i}") } 120 | end 121 | Timecop.travel(Time.now - 3 * DAYS) do 122 | (200..250).each { |i| @counter.add('mycounter2', "item#{i}") } 123 | end 124 | 125 | @counter.union_store('counter1_1_week_ago', ['mycounter1'], Time.now.to_i - 1 * WEEKS) 126 | @counter.union_store('counter2_5_days_ago', ['mycounter2'], Time.now.to_i - 5 * DAYS) 127 | counter_should_equal(@counter.union(['counter1_1_week_ago', 'counter2_5_days_ago']), 150, 250) 128 | end 129 | 130 | it "allows you to override the time an event is registered when it's added" do 131 | (0..1000).each { |i| @counter.add('mycounter', "item#{i}", Time.now.to_i - 3 * WEEKS) } 132 | (1000..2000).each { |i| @counter.add('mycounter', "item#{i}", Time.now.to_i - 2 * WEEKS) } 133 | (2000..3000).each { |i| @counter.add('mycounter', "item#{i}", Time.now.to_i - 1 * WEEKS) } 134 | (3000..4000).each { |i| @counter.add('mycounter', "item#{i}") } 135 | 136 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 4 * WEEKS), 4000) 137 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 2 * WEEKS - 3 * DAYS), 3000) 138 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * WEEKS - 3 * DAYS), 2000) 139 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 3 * DAYS), 1000) 140 | end 141 | 142 | it "doesn't screw up more recent counts when items are injected with earlier timestamp overrides" do 143 | Timecop.travel(Time.now - 3 * WEEKS) do 144 | (0..1000).each { |i| @counter.add('mycounter', "item#{i}") } 145 | end 146 | 147 | Timecop.travel(Time.now - 2 * WEEKS) do 148 | 
(1000..2000).each { |i| @counter.add('mycounter', "item#{i}") } 149 | end 150 | 151 | Timecop.travel(Time.now - 1 * WEEKS) do 152 | (2000..3000).each { |i| @counter.add('mycounter', "item#{i}") } 153 | end 154 | 155 | Timecop.travel(Time.now - 2 * DAYS) do 156 | (1000..2000).each { |i| @counter.add('mycounter', "item#{i}") } 157 | end 158 | 159 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 4 * WEEKS), 3000) 160 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 2 * WEEKS - 3 * DAYS), 2000) 161 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * WEEKS - 3 * DAYS), 2000) 162 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 3 * DAYS), 1000) 163 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * DAYS), 0) 164 | 165 | # Shouldn't change counts, since they're updates to counts that happen later 166 | # than the time we're trying to inject 167 | (1000..2000).each { |i| @counter.add('mycounter', "item#{i}", Time.now.to_i - 1 * WEEKS) } 168 | 169 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 4 * WEEKS), 3000) 170 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 2 * WEEKS - 3 * DAYS), 2000) 171 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * WEEKS - 3 * DAYS), 2000) 172 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 3 * DAYS), 1000) 173 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * DAYS), 0) 174 | 175 | # Should change counts, since they're updates to counts for items we've never 176 | # seen before in the past 177 | (3000..4000).each { |i| @counter.add('mycounter', "item#{i}", Time.now.to_i - 1 * WEEKS) } 178 | 179 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 4 * WEEKS), 4000) 180 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 2 * WEEKS - 3 * DAYS), 3000) 181 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * WEEKS 
- 3 * DAYS), 3000) 182 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 3 * DAYS), 1000) 183 | counter_should_equal(@counter.count('mycounter', Time.now.to_i - 1 * DAYS), 0) 184 | end 185 | 186 | it "can compute deltas over time on events correctly" do 187 | # A larger-scale test that simulates user join events and tests that we can get 188 | # week-by-week deltas. Generate new user counts according to the following 189 | # weekly schedule: 55780 during the first week, 300 more during the next week, 190 | # 10 more the next week, etc. 191 | 192 | schedule = [55780, 300, 10, 4000, 1000, 1000, 5000, 15000, 30000, 3000] 193 | schedule.each_with_index do |num_users, i| 194 | Timecop.travel(Time.now - (schedule.length * WEEKS) + (i * WEEKS)) do 195 | num_users.times do |i| 196 | Timecop.travel(Time.now + 2 * HOURS + i) do 197 | @counter.add("users", "user#{SecureRandom.uuid}") 198 | end 199 | end 200 | end 201 | end 202 | 203 | actual_total = schedule.reduce(:+) 204 | estimated_total = @counter.count("users") 205 | (actual_total - estimated_total).abs.should < @expected_relative_error * actual_total 206 | 207 | # Go through the schedule, computing week-by-week deltas and comparing them to the 208 | # scheduled additions. 209 | 210 | schedule.each_with_index do |users_joined, i| 211 | week = schedule.length - 1 - i 212 | c = @counter.count('users', Time.now.to_i - (week+1) * WEEKS) - @counter.count('users', Time.now.to_i - week * WEEKS) 213 | (users_joined - c).abs.should < @expected_relative_error * schedule.reduce(:+) 214 | end 215 | end 216 | end 217 | --------------------------------------------------------------------------------