├── LICENSE
├── README.md
├── hyperloglog.go
├── hyperloglog_test.go
├── murmur.go
└── murmur_test.go


/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2013 Eric Lesh
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | hyperloglog
 2 | ===========
 3 | 
 4 | Package hyperloglog implements the HyperLogLog algorithm for
 5 | cardinality estimation. In English: it counts things. It counts things
 6 | using very small amounts of memory compared to the number of objects
 7 | it is counting.
 8 | 
 9 | For a full description of the algorithm, see the paper HyperLogLog:
10 | the analysis of a near-optimal cardinality estimation algorithm by
11 | Flajolet et al. at http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
12 | 
13 | For documentation see http://godoc.org/github.com/DataDog/hyperloglog
14 | 
15 | Included is a set of fast murmurhash implementations suitable for use
16 | on 32-bit and 64-bit integers on little-endian machines.
17 | 
18 | Quick start
19 | ===========
20 | 
21 | 	$ go get github.com/DataDog/hyperloglog
22 | 	$ cd $GOPATH/src/github.com/DataDog/hyperloglog
23 | 	$ go test -test.v
24 | 	$ go test -bench=.
25 | 
26 | License
27 | =======
28 | 
29 | hyperloglog is licensed under the MIT license.
30 | 


--------------------------------------------------------------------------------
/hyperloglog.go:
--------------------------------------------------------------------------------
  1 | // Package hyperloglog implements the HyperLogLog algorithm for
  2 | // cardinality estimation. In English: it counts things. It counts
  3 | // things using very small amounts of memory compared to the number of
  4 | // objects it is counting.
  5 | //
  6 | // For a full description of the algorithm, see the paper HyperLogLog:
  7 | // the analysis of a near-optimal cardinality estimation algorithm by
  8 | // Flajolet et al.
  9 | package hyperloglog
 10 | 
 11 | import (
 12 | 	"fmt"
 13 | 	"math"
 14 | 	"math/bits"
 15 | )
 16 | 
 17 | const (
 18 | 	exp32 = 1 << 32 // 2^32
 19 | )
 20 | 
// A HyperLogLog is a deterministic cardinality estimator. This version
// exports its fields so that it is suitable for saving, e.g. to a database.
//
// New fills the fields consistently (M registers, B = log2(M), Alpha from
// getAlpha). The methods read the fields as stored and do not re-validate
// them.
type HyperLogLog struct {
	M         uint    // Number of registers
	B         uint32  // Number of bits used to determine register index
	Alpha     float64 // Bias correction constant
	Registers []uint8 // Per-register maximum rank recorded by Add; 0 means untouched
}
 29 | 
 30 | // Compute bias correction alpha_m.
 31 | func getAlpha(m uint) (result float64) {
 32 | 	switch m {
 33 | 	case 16:
 34 | 		result = 0.673
 35 | 	case 32:
 36 | 		result = 0.697
 37 | 	case 64:
 38 | 		result = 0.709
 39 | 	default:
 40 | 		result = 0.7213 / (1.0 + 1.079/float64(m))
 41 | 	}
 42 | 	return result
 43 | }
 44 | 
 45 | // New creates a HyperLogLog with the given number of registers. More
 46 | // registers leads to lower error in your estimated count, at the
 47 | // expense of memory.
 48 | //
 49 | // Choose a power of two number of registers, depending on the amount
 50 | // of memory you're willing to use and the error you're willing to
 51 | // tolerate. Each register uses one byte of memory.
 52 | //
 53 | // Standard error will be: σ ≈ 1.04 / sqrt(registers)
 54 | // The estimates provided by hyperloglog are expected to be within σ, 2σ, 3σ
 55 | // of the exact count in respectively 65%, 95%, 99% of all the cases.
 56 | func New(registers uint) (*HyperLogLog, error) {
 57 | 	if registers == 0 {
 58 | 		panic("cannot have zero registers")
 59 | 	}
 60 | 	if (registers & (registers - 1)) != 0 {
 61 | 		return nil, fmt.Errorf("number of registers %d not a power of two", registers)
 62 | 	}
 63 | 	h := &HyperLogLog{}
 64 | 	h.M = registers
 65 | 	h.B = uint32(math.Log2(float64(registers)))
 66 | 	h.Alpha = getAlpha(registers)
 67 | 	h.Registers = make([]uint8, h.M)
 68 | 	return h, nil
 69 | }
 70 | 
 71 | // Reset all internal variables and set the count to zero.
 72 | func (h *HyperLogLog) Reset() {
 73 | 	for i := range h.Registers {
 74 | 		h.Registers[i] = 0
 75 | 	}
 76 | }
 77 | 
 78 | // Add to the count. val should be a 32 bit unsigned integer from a
 79 | // good hash function.
 80 | func (h *HyperLogLog) Add(val uint32) {
 81 | 	k := 32 - h.B
 82 | 	slice := (val << h.B) | (1 << (h.B - 1))
 83 | 	r := uint8(bits.LeadingZeros32(slice) + 1)
 84 | 	j := val >> uint(k)
 85 | 	if r > h.Registers[j] {
 86 | 		h.Registers[j] = r
 87 | 	}
 88 | }
 89 | 
 90 | // Count returns the estimated cardinality.
 91 | func (h *HyperLogLog) Count() uint64 {
 92 | 	return h.count(true)
 93 | }
 94 | 
 95 | // CountWithoutLargeRangeCorrection returns the estimated cardinality, without applying
 96 | // the large range correction proposed by Flajolet et al. as it can lead to significant
 97 | // overcounting.
 98 | //
 99 | // See https://github.com/DataDog/hyperloglog/pull/15
100 | func (h *HyperLogLog) CountWithoutLargeRangeCorrection() uint64 {
101 | 	return h.count(false)
102 | }
103 | 
104 | func (h *HyperLogLog) count(withLargeRangeCorrection bool) uint64 {
105 | 	sum := 0.0
106 | 	m := float64(h.M)
107 | 	for _, val := range h.Registers {
108 | 		sum += 1.0 / float64(int(1)<<val)
109 | 	}
110 | 	estimate := h.Alpha * m * m / sum
111 | 	if estimate <= 5.0/2.0*m {
112 | 		// Small range correction
113 | 		v := 0
114 | 		for _, r := range h.Registers {
115 | 			if r == 0 {
116 | 				v++
117 | 			}
118 | 		}
119 | 		if v > 0 {
120 | 			estimate = m * math.Log(m/float64(v))
121 | 		}
122 | 	} else if estimate > 1.0/30.0*exp32 && withLargeRangeCorrection {
123 | 		// Large range correction
124 | 		estimate = -exp32 * math.Log(1-estimate/exp32)
125 | 	}
126 | 	return uint64(estimate)
127 | }
128 | 
129 | // Merge another HyperLogLog into this one. The number of registers in
130 | // each must be the same.
131 | func (h *HyperLogLog) Merge(other *HyperLogLog) error {
132 | 	if h.M != other.M {
133 | 		return fmt.Errorf("number of registers doesn't match: %d != %d",
134 | 			h.M, other.M)
135 | 	}
136 | 
137 | 	// Trigger boundary check once for h.Registers
138 | 	registers := h.Registers
139 | 	_ = registers[len(other.Registers)-1]
140 | 
141 | 	for j, r := range other.Registers {
142 | 		if r > registers[j] {
143 | 			registers[j] = r
144 | 		}
145 | 	}
146 | 	return nil
147 | }
148 | 


--------------------------------------------------------------------------------
/hyperloglog_test.go:
--------------------------------------------------------------------------------
  1 | package hyperloglog
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"fmt"
  6 | 	"hash/fnv"
  7 | 	"io"
  8 | 	"math"
  9 | 	"math/rand"
 10 | 	"os"
 11 | 	"testing"
 12 | )
 13 | 
 14 | // Return a dictionary up to n words. If n is zero, return the entire
 15 | // dictionary.
 16 | func dictionary(n int) []string {
 17 | 	var words []string
 18 | 	dict := "/usr/share/dict/words"
 19 | 	f, err := os.Open(dict)
 20 | 	if err != nil {
 21 | 		fmt.Printf("can't open dictionary file '%s': %v\n", dict, err)
 22 | 		os.Exit(1)
 23 | 	}
 24 | 	count := 0
 25 | 	buf := bufio.NewReader(f)
 26 | 	for {
 27 | 		if n != 0 && count >= n {
 28 | 			break
 29 | 		}
 30 | 		word, err := buf.ReadString('\n')
 31 | 		if err != nil {
 32 | 			if err == io.EOF {
 33 | 				break
 34 | 			}
 35 | 			continue
 36 | 		}
 37 | 		words = append(words, word)
 38 | 		count++
 39 | 	}
 40 | 	f.Close()
 41 | 	return words
 42 | }
 43 | 
 44 | func geterror(actual uint64, estimate uint64) (result float64) {
 45 | 	return (float64(estimate) - float64(actual)) / float64(actual)
 46 | }
 47 | 
 48 | func testHyperLogLog(t *testing.T, n, lowB, highB int) {
 49 | 	words := dictionary(n)
 50 | 	bad := 0
 51 | 	nWords := uint64(len(words))
 52 | 	for i := lowB; i < highB; i++ {
 53 | 		m := uint(math.Pow(2, float64(i)))
 54 | 
 55 | 		h, err := New(m)
 56 | 		if err != nil {
 57 | 			t.Fatalf("can't make New(%d): %v", m, err)
 58 | 		}
 59 | 
 60 | 		hash := fnv.New32()
 61 | 		for _, word := range words {
 62 | 			hash.Write([]byte(word))
 63 | 			h.Add(hash.Sum32())
 64 | 			hash.Reset()
 65 | 		}
 66 | 
 67 | 		expectedError := 1.04 / math.Sqrt(float64(m))
 68 | 		actualError := math.Abs(geterror(nWords, h.Count()))
 69 | 
 70 | 		if actualError > expectedError {
 71 | 			bad++
 72 | 			t.Logf("m=%d: error=%.5f, expected <%.5f; actual=%d, estimated=%d\n",
 73 | 				m, actualError, expectedError, nWords, h.Count())
 74 | 		}
 75 | 
 76 | 	}
 77 | 	t.Logf("%d of %d tests exceeded estimated error", bad, highB-lowB)
 78 | }
 79 | 
 80 | func TestHyperLogLogSmall(t *testing.T) {
 81 | 	testHyperLogLog(t, 5, 4, 17)
 82 | }
 83 | 
 84 | func TestHyperLogLogBig(t *testing.T) {
 85 | 	testHyperLogLog(t, 0, 4, 17)
 86 | }
 87 | 
 88 | func testReset(t *testing.T, m uint, numObjects, runs int) {
 89 | 	rand.Seed(101)
 90 | 
 91 | 	h, err := New(m)
 92 | 	if err != nil {
 93 | 		t.Fatalf("can't make New(%d): %v", m, err)
 94 | 	}
 95 | 
 96 | 	for i := 0; i < runs; i++ {
 97 | 		for j := 0; j < numObjects; j++ {
 98 | 			h.Add(rand.Uint32())
 99 | 		}
100 | 
101 | 		oldRegisters := &h.Registers
102 | 		h.Reset()
103 | 		if oldRegisters != &h.Registers {
104 | 			t.Error("registers were reallocated")
105 | 		}
106 | 		for _, r := range h.Registers {
107 | 			if r != 0 {
108 | 				t.Error("register is not zeroed out after reset")
109 | 			}
110 | 		}
111 | 	}
112 | }
113 | 
114 | func TestReset(t *testing.T) {
115 | 	testReset(t, 512, 1_000_000, 10)
116 | }
117 | 
118 | func TestMerge(t *testing.T) {
119 | 	trueDisinctPerHll := uint64(100000)
120 | 	m := uint(math.Pow(2, float64(11)))
121 | 
122 | 	h, err := New(m)
123 | 	h2, err := New(m)
124 | 	if err != nil {
125 | 		return
126 | 	}
127 | 
128 | 	for i := uint64(0); i < trueDisinctPerHll; i++ {
129 | 		h.Add(Murmur64(i))
130 | 	}
131 | 
132 | 	h2.Merge(h)
133 | 
134 | 	if h.Count() != h2.Count() {
135 | 		t.Errorf("Estimate mismatch after merge, %d != %d", h.Count(), h2.Count())
136 | 	}
137 | }
138 | 
139 | func BenchmarkReset(b *testing.B) {
140 | 	m := uint(256)
141 | 	numObjects := 1000
142 | 
143 | 	h, err := New(m)
144 | 	if err != nil {
145 | 		b.Fatalf("can't make New(%d): %v", m, err)
146 | 	}
147 | 
148 | 	b.ResetTimer()
149 | 
150 | 	for n := 0; n < b.N; n++ {
151 | 		for i := 0; i < numObjects; i++ {
152 | 			h.Add(uint32(i))
153 | 		}
154 | 		h.Reset()
155 | 	}
156 | }
157 | 
158 | func benchmarkCount(b *testing.B, registers int) {
159 | 	words := dictionary(0)
160 | 	m := uint(math.Pow(2, float64(registers)))
161 | 
162 | 	h, err := New(m)
163 | 	if err != nil {
164 | 		return
165 | 	}
166 | 
167 | 	hash := fnv.New32()
168 | 	for _, word := range words {
169 | 		hash.Write([]byte(word))
170 | 		h.Add(hash.Sum32())
171 | 		hash.Reset()
172 | 	}
173 | 
174 | 	b.ResetTimer()
175 | 	for n := 0; n < b.N; n++ {
176 | 		h.Count()
177 | 	}
178 | }
179 | 
180 | func BenchmarkCount4(b *testing.B) {
181 | 	benchmarkCount(b, 4)
182 | }
183 | 
184 | func BenchmarkCount5(b *testing.B) {
185 | 	benchmarkCount(b, 5)
186 | }
187 | 
188 | func BenchmarkCount6(b *testing.B) {
189 | 	benchmarkCount(b, 6)
190 | }
191 | 
192 | func BenchmarkCount7(b *testing.B) {
193 | 	benchmarkCount(b, 7)
194 | }
195 | 
196 | func BenchmarkCount8(b *testing.B) {
197 | 	benchmarkCount(b, 8)
198 | }
199 | 
200 | func BenchmarkCount9(b *testing.B) {
201 | 	benchmarkCount(b, 9)
202 | }
203 | 
204 | func BenchmarkCount10(b *testing.B) {
205 | 	benchmarkCount(b, 10)
206 | }
207 | 
208 | func BenchmarkMerge(b *testing.B) {
209 | 	words := dictionary(0)
210 | 	m := uint(math.Pow(2, float64(11)))
211 | 
212 | 	h, err := New(m)
213 | 	h2, err := New(m)
214 | 	if err != nil {
215 | 		return
216 | 	}
217 | 
218 | 	hash := fnv.New32()
219 | 	for _, word := range words {
220 | 		hash.Write([]byte(word))
221 | 		h.Add(hash.Sum32())
222 | 		hash.Reset()
223 | 	}
224 | 
225 | 	b.ResetTimer()
226 | 	for n := 0; n < b.N; n++ {
227 | 		h2.Merge(h)
228 | 	}
229 | }
230 | 


--------------------------------------------------------------------------------
/murmur.go:
--------------------------------------------------------------------------------
  1 | package hyperloglog
  2 | 
  3 | import (
  4 | 	"math"
  5 | 	"math/bits"
  6 | 	"reflect"
  7 | 	"unsafe"
  8 | )
  9 | 
 10 | // This file implements the murmur3 32-bit hash on 32bit and 64bit integers
 11 | // for little endian machines only with no heap allocation.  If you are using
 12 | // HLL to count integer IDs on intel machines, this is your huckleberry.
 13 | 
 14 | // MurmurString implements a fast version of the murmur hash function for strings
 15 | // for little endian machines.  Suitable for adding strings to HLL counter.
 16 | func MurmurString(key string) uint32 {
 17 | 	if len(key) == 0 {
 18 | 		return MurmurBytes(nil)
 19 | 	}
 20 | 	// Reinterpret the string as bytes. This is safe because we don't write into the byte array.
 21 | 	sh := (*reflect.StringHeader)(unsafe.Pointer(&key))
 22 | 	byteSlice := (*[math.MaxInt32 - 1]byte)(unsafe.Pointer(sh.Data))[:sh.Len:sh.Len]
 23 | 	return MurmurBytes(byteSlice)
 24 | }
 25 | 
 26 | // MurmurBytes implements a fast version of the murmur hash function for bytes
 27 | // for little endian machines.  Suitable for adding strings to HLL counter.
 28 | func MurmurBytes(bkey []byte) uint32 {
 29 | 	var c1, c2 uint32 = 0xcc9e2d51, 0x1b873593
 30 | 	var h uint32
 31 | 
 32 | 	blen := len(bkey)
 33 | 	chunks := blen / 4 // chunk length
 34 | 
 35 | 	values := (*(*[]uint32)(unsafe.Pointer(&bkey)))[:chunks:chunks]
 36 | 
 37 | 	for _, k := range values {
 38 | 		k *= c1
 39 | 		k = bits.RotateLeft32(k, 15)
 40 | 		k *= c2
 41 | 
 42 | 		h ^= k
 43 | 		h = bits.RotateLeft32(h, 13)
 44 | 		h = (h * 5) + 0xe6546b64
 45 | 	}
 46 | 
 47 | 	var k uint32
 48 | 	tailLength := blen % 4
 49 | 	tailStart := blen - tailLength
 50 | 	// remainder
 51 | 	switch tailLength {
 52 | 	case 3:
 53 | 		k ^= uint32(bkey[tailStart+2]) << 16
 54 | 		fallthrough
 55 | 	case 2:
 56 | 		k ^= uint32(bkey[tailStart+1]) << 8
 57 | 		fallthrough
 58 | 	case 1:
 59 | 		k ^= uint32(bkey[tailStart])
 60 | 		k *= c1
 61 | 		k = bits.RotateLeft32(k, 15)
 62 | 		k *= c2
 63 | 		h ^= k
 64 | 	}
 65 | 
 66 | 	h ^= uint32(blen)
 67 | 	h ^= h >> 16
 68 | 	h *= 0x85ebca6b
 69 | 	h ^= h >> 13
 70 | 	h *= 0xc2b2ae35
 71 | 	h ^= h >> 16
 72 | 
 73 | 	return h
 74 | }
 75 | 
 76 | // Murmur32 implements a fast version of the murmur hash function for uint32 for
 77 | // little endian machines.  Suitable for adding 32bit integers to a HLL counter.
 78 | func Murmur32(i uint32) uint32 {
 79 | 	var c1, c2 uint32 = 0xcc9e2d51, 0x1b873593
 80 | 	var h, k uint32
 81 | 	k = i
 82 | 	k *= c1
 83 | 	k = (k << 15) | (k >> (32 - 15))
 84 | 	k *= c2
 85 | 	h ^= k
 86 | 	h = (h << 13) | (h >> (32 - 13))
 87 | 	h = (h * 5) + 0xe6546b64
 88 | 	// second part
 89 | 	h ^= 4
 90 | 	h ^= h >> 16
 91 | 	h *= 0x85ebca6b
 92 | 	h ^= h >> 13
 93 | 	h *= 0xc2b2ae35
 94 | 	h ^= h >> 16
 95 | 	return h
 96 | }
 97 | 
 98 | // Murmur64 implements a fast version of the murmur hash function for uint64 for
 99 | // little endian machines.  Suitable for adding 64bit integers to a HLL counter.
100 | func Murmur64(i uint64) uint32 {
101 | 	var c1, c2 uint32 = 0xcc9e2d51, 0x1b873593
102 | 	var h, k uint32
103 | 	//first 4-byte chunk
104 | 	k = uint32(i)
105 | 	k *= c1
106 | 	k = (k << 15) | (k >> (32 - 15))
107 | 	k *= c2
108 | 	h ^= k
109 | 	h = (h << 13) | (h >> (32 - 13))
110 | 	h = (h * 5) + 0xe6546b64
111 | 	// second 4-byte chunk
112 | 	k = uint32(i >> 32)
113 | 	k *= c1
114 | 	k = (k << 15) | (k >> (32 - 15))
115 | 	k *= c2
116 | 	h ^= k
117 | 	h = (h << 13) | (h >> (32 - 13))
118 | 	h = (h * 5) + 0xe6546b64
119 | 	// second part
120 | 	h ^= 8
121 | 	h ^= h >> 16
122 | 	h *= 0x85ebca6b
123 | 	h ^= h >> 13
124 | 	h *= 0xc2b2ae35
125 | 	h ^= h >> 16
126 | 	return h
127 | }
128 | 
// Murmur128 implements a fast version of the murmur hash function for two uint64s
// for little endian machines.  Suitable for adding a 128bit value to an HLL counter.
//
// It hashes i followed by j as four 4-byte blocks, each value low half first,
// with seed 0.
func Murmur128(i, j uint64) uint32 {
	const (
		c1 uint32 = 0xcc9e2d51
		c2 uint32 = 0x1b873593
	)

	// first block: low half of i (the state starts at zero, so h ^ k == k)
	k := uint32(i) * c1
	k = (k<<15 | k>>17) * c2
	h := k<<13 | k>>19
	h = h*5 + 0xe6546b64

	// second block: high half of i
	k = uint32(i>>32) * c1
	k = (k<<15 | k>>17) * c2
	h ^= k
	h = h<<13 | h>>19
	h = h*5 + 0xe6546b64

	// third block: low half of j
	k = uint32(j) * c1
	k = (k<<15 | k>>17) * c2
	h ^= k
	h = h<<13 | h>>19
	h = h*5 + 0xe6546b64

	// fourth block: high half of j
	k = uint32(j>>32) * c1
	k = (k<<15 | k>>17) * c2
	h ^= k
	h = h<<13 | h>>19
	h = h*5 + 0xe6546b64

	// finalization: fold in the length (16 bytes) and avalanche
	h ^= 16
	h ^= h >> 16
	h *= 0x85ebca6b
	h ^= h >> 13
	h *= 0xc2b2ae35
	h ^= h >> 16
	return h
}
176 | 


--------------------------------------------------------------------------------
/murmur_test.go:
--------------------------------------------------------------------------------
  1 | package hyperloglog
  2 | 
  3 | import (
  4 | 	"encoding/binary"
  5 | 	"math/rand"
  6 | 	"testing"
  7 | 	"unsafe"
  8 | 
  9 | 	"github.com/DataDog/mmh3"
 10 | 	"github.com/dustin/randbo"
 11 | )
 12 | 
 13 | var buf32 = make([]byte, 4)
 14 | var buf64 = make([]byte, 8)
 15 | var buf128 = make([]byte, 16)
 16 | 
 17 | // Test that our abbreviated murmur hash works the same as upstream
 18 | func TestMurmur(t *testing.T) {
 19 | 	for i := 0; i < 100; i++ {
 20 | 		x := rand.Int31()
 21 | 		binary.LittleEndian.PutUint32(buf32, uint32(x))
 22 | 		hash := mmh3.Hash32(buf32)
 23 | 		m := Murmur32(uint32(x))
 24 | 		if hash != m {
 25 | 			t.Errorf("Hash mismatch on 32 bit %d: expected 0x%X, got 0x%X\n", x, hash, m)
 26 | 		}
 27 | 	}
 28 | 
 29 | 	for i := 0; i < 100; i++ {
 30 | 		x := rand.Int63()
 31 | 		binary.LittleEndian.PutUint64(buf64, uint64(x))
 32 | 		hash := mmh3.Hash32(buf64)
 33 | 		m := Murmur64(uint64(x))
 34 | 		if hash != m {
 35 | 			t.Errorf("Hash mismatch on 64 bit %d: expected 0x%X, got 0x%X\n", x, hash, m)
 36 | 		}
 37 | 	}
 38 | 
 39 | 	for i := 0; i < 100; i++ {
 40 | 		x := rand.Int63()
 41 | 		y := rand.Int63()
 42 | 		binary.LittleEndian.PutUint64(buf128, uint64(x))
 43 | 		binary.LittleEndian.PutUint64(buf128[8:], uint64(y))
 44 | 		hash := mmh3.Hash32(buf128)
 45 | 		m := Murmur128(uint64(x), uint64(y))
 46 | 		if hash != m {
 47 | 			t.Errorf("Hash mismatch on 128 bit %d,%d: expected 0x%X, got 0x%X\n", x, y, hash, m)
 48 | 		}
 49 | 	}
 50 | 
 51 | 	for i := 0; i < 100; i++ {
 52 | 		key := randString((i % 15) + 5)
 53 | 		hash := mmh3.Hash32([]byte(key))
 54 | 		m := MurmurString(key)
 55 | 		if hash != m {
 56 | 			t.Errorf("Hash mismatch on key %s: expected 0x%X, got 0x%X\n", key, hash, m)
 57 | 		}
 58 | 	}
 59 | }
 60 | 
 61 | func TestMurmurBytes(t *testing.T) {
 62 | 	b := []byte("hello")
 63 | 	v := MurmurBytes(b)
 64 | 	if v != 613153351 {
 65 | 		t.Fatalf("MurmurBytes failed for %s: %v != %v", b, v, 613153351)
 66 | 	}
 67 | }
 68 | 
 69 | func TestMurmurString(t *testing.T) {
 70 | 	s := "hello"
 71 | 	v := MurmurString(s)
 72 | 	if v != 613153351 {
 73 | 		t.Fatalf("MurmurString failed for %s: %v != %v", s, v, 613153351)
 74 | 	}
 75 | }
 76 | 
 77 | func TestMurmurStringZero(t *testing.T) {
 78 | 	s := ""
 79 | 	v := MurmurString(s)
 80 | 	if v != 0 {
 81 | 		t.Fatalf("MurmurString failed for %s: %v != %v", s, v, 0)
 82 | 	}
 83 | }
 84 | 
 85 | func randString(n int) string {
 86 | 	rand.Seed(10)
 87 | 	letterRunes := []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 88 | 	b := make([]rune, n)
 89 | 	for i := range b {
 90 | 		b[i] = letterRunes[rand.Intn(len(letterRunes))]
 91 | 	}
 92 | 	return string(b)
 93 | }
 94 | 
 95 | // Benchmarks
 96 | func benchmarkMurmurBytes(b *testing.B, input [][]byte) {
 97 | 	b.ResetTimer()
 98 | 	for n := 0; n < b.N; n++ {
 99 | 		for _, x := range input {
100 | 			MurmurBytes(x)
101 | 		}
102 | 	}
103 | }
104 | 
105 | func benchmarkMurmur64(b *testing.B, input []uint64) {
106 | 	b.ResetTimer()
107 | 	for n := 0; n < b.N; n++ {
108 | 		for _, x := range input {
109 | 			Murmur64(x)
110 | 		}
111 | 	}
112 | }
113 | 
114 | func benchmarkMurmurString(b *testing.B, input []string) {
115 | 	b.ResetTimer()
116 | 	for n := 0; n < b.N; n++ {
117 | 		for _, x := range input {
118 | 			MurmurString(x)
119 | 		}
120 | 	}
121 | }
122 | 
123 | func benchmarkHash32(b *testing.B, input []string) {
124 | 	b.ResetTimer()
125 | 	for n := 0; n < b.N; n++ {
126 | 		for _, x := range input {
127 | 			b := *(*[]byte)(unsafe.Pointer(&x))
128 | 			mmh3.Hash32(b)
129 | 		}
130 | 	}
131 | }
132 | 
133 | func Benchmark100MurmurBytes(b *testing.B) {
134 | 	rand.Seed(10)
135 | 	input := make([][]byte, 100)
136 | 	for i := 0; i < 100; i++ {
137 | 		x := make([]byte, 1000)
138 | 		rand.Read(x)
139 | 		input[i] = x
140 | 	}
141 | 	benchmarkMurmurBytes(b, input)
142 | }
143 | 
144 | func Benchmark100Murmur64(b *testing.B) {
145 | 	rand.Seed(10)
146 | 	input := make([]uint64, 100)
147 | 	for i := 0; i < 100; i++ {
148 | 		input[i] = uint64(rand.Int63())
149 | 	}
150 | 	benchmarkMurmur64(b, input)
151 | }
152 | 
153 | func Benchmark100MurmurString(b *testing.B) {
154 | 	rand.Seed(10)
155 | 	input := make([]string, 100)
156 | 	for i := 0; i < 100; i++ {
157 | 		input[i] = randString((i % 15) + 5)
158 | 	}
159 | 	benchmarkMurmurString(b, input)
160 | }
161 | 
162 | func Benchmark100Hash32(b *testing.B) {
163 | 	rand.Seed(10)
164 | 	input := make([]string, 100)
165 | 	for i := 0; i < 100; i++ {
166 | 		input[i] = randString((i % 15) + 5)
167 | 	}
168 | 	benchmarkHash32(b, input)
169 | }
170 | 
171 | func BenchmarkMurmurStringBig(b *testing.B) {
172 | 	// Make a 100Mb string and use that as a benchmark
173 | 	r := randbo.New()
174 | 	slice := make([]byte, 100*1024*1024)
175 | 	_, err := r.Read(slice)
176 | 	if err != nil {
177 | 		b.Fatalf("Failed to create benchmark data: %s", err)
178 | 	}
179 | 	s := string(slice)
180 | 	b.ResetTimer()
181 | 	for i := 0; i < b.N; i++ {
182 | 		MurmurString(s)
183 | 	}
184 | }
185 | 


--------------------------------------------------------------------------------