├── .gitignore ├── LICENSE ├── README.md ├── cmd └── main.go ├── data └── bible.txt ├── go.mod ├── go.sum ├── kmv.go └── kmv_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, build with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # vscode 15 | .vscode 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Jordi Montes 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-kmv 2 | 3 | **go-kmv** is an adaptive version of the *K-minimum values algorithm for cardinality estimation*. 4 | 5 | This repository provides: 6 | - A **library** for your own Go programs 7 | - A **cmd tool** which estimates the cardinality of the data it reads from stdin (so you can use it with the Linux pipe operator `|`) 8 | 9 | The formula used for estimating the cardinality is exactly the one described in the paper [Counting distinct elements in a data stream](http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&ved=0CEwQFjAA&url=http%3A%2F%2Fwww.cs.umd.edu%2F~samir%2F498%2Fdistinct.ps&ei=h-3IT5GPBfD16AG0q70v&usg=AFQjCNG4nYiSedl6W3r73ZCXNtnaOancnQ&sig2=E8KzKp4qkLiWMQk690Moyw). What makes this implementation interesting is the use of an adaptive table which grows in order to provide better estimations. 
The implementation of the adaptive-table can be found [here](https://github.com/positiveblue/adaptive-table) 10 | 11 | # Examples 12 | 13 | After compiling `cmd/main.go` we can run the algorithm from our terminal 14 | 15 | ```bash 16 | $ go build -o go-kmv main.go 17 | 18 | # Output 19 | # ${CardinalityEstimation} ${ProcessedElements} ${TableSize} 20 | $ ./go-kmv < ../data/bible.txt 21 | 33938 824036 465 22 | 23 | # If we (really) count them 24 | $ tr ' ' '\n' < ../data/bible.txt | sort | uniq -c | wc -l 25 | 34040 26 | ``` 27 | 28 | If what you want is to use it as a dependency for your project 29 | 30 | ```go 31 | package main 32 | 33 | import gokmv "github.com/positiveblue/go-kmv" 34 | 35 | func main() { 36 | // Get dataStream 37 | dataStream := myDataStream() 38 | 39 | // Create the estimator 40 | initialSize := 64 41 | estimator := gokmv.NewKMV(initialSize) 42 | for element := range dataStream { 43 | // element has to be a uint64 44 | estimator.InsertUint64(element) 45 | } 46 | 47 | estimator.Size() // returns the table size 48 | estimator.ElementsAdded() // returns the total elements that we processed 49 | estimator.EstimateCardinality() // returns the cardinality estimation 50 | } 51 | ``` 52 | 53 | Because of the lack of generics in Go, go-kmv only provides `Insert` functions for `uint64` and `string` values. If you want to use your own hash functions or add new types you can just create your own function: 54 | 55 | ```go 56 | // Insert my type to the table 57 | // Using my hash function 58 | func (kmv *KMV) InsertMyType(s string) { 59 | // Remember to use the internal seed to have reproducible results 60 | hash := myHashFunction.Sum64([]byte(s), kmv.Seed()) 61 | // The hash function has to return a uint64 62 | kmv.InsertUint64(hash) 63 | } 64 | ``` 65 | 66 | # Cardinality Estimation 67 | 68 | Cardinality estimation is considered a solved problem for all practical purposes. 
Nowadays computers have enough memory for computing the cardinality of small sets, and for extreme cases (big data) algorithms like HyperLogLog and KMV already give an accuracy of ~98% using a few bytes of memory. 69 | 70 | In real life what people usually use is an implementation of [HyperLogLog](http://static.googleusercontent.com/external_content/untrusted_dlcp/research.google.com/en/us/pubs/archive/40671.pdf) with a table size from about 128 to 4096. HyperLogLog and all the algorithms of its family can only use tables of size `2^k` where `k` is a positive integer. **go-kmv** does not have that limitation and automatically provides a good trade-off without knowing in advance the order of magnitude of the number of distinct elements that we have to estimate. 71 | 72 | The table size of the current implementation grows as `k log(n)`, where `k` is the initial table size and `n` is the number of distinct elements in the stream. That means that running go-kmv with an `initialSize` of 64 and processing a stream of 10^6 elements, the final table size will be about 600 and the accuracy of the estimation will be ~98.00%. 73 | 74 | # Hash Functions 75 | A critical part of achieving meaningful results is to use a good hash function (where good = few collisions). Hash functions like **FNV**, from the Go stdlib, are not good enough to ensure the theoretical results. Other algorithms like **AES** provide the best results but are slower, which seems a bit overkill for this implementation. [Murmur3](https://github.com/spaolacci/murmur3) provides the best trade-off between result quality and processing time, and it is the one used in this implementation. 
76 | 77 | -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "flag" 6 | "fmt" 7 | "os" 8 | "strings" 9 | 10 | gokmv "github.com/positiveblue/go-kmv" 11 | ) 12 | 13 | func getScanner(fileName string) *bufio.Scanner { 14 | if fileName != "" { 15 | f, err := os.Open(fileName) 16 | if err != nil { 17 | panic(err) 18 | } 19 | return bufio.NewScanner(f) 20 | } else { 21 | return bufio.NewScanner(os.Stdin) 22 | } 23 | } 24 | 25 | func fmtMessage(estimator *gokmv.KMV) string { 26 | distinct := estimator.EstimateCardinality() 27 | total := estimator.ElementsAdded() 28 | size := estimator.Size() 29 | return fmt.Sprintf("%d %d %d", distinct, total, size) 30 | } 31 | 32 | func main() { 33 | sizePtr := flag.Int("size", 64, "initial size for the kmv data structure") 34 | fileNamePtr := flag.String("filename", "", "File name to process (otherwhise will read from StdIn") 35 | 36 | flag.Parse() 37 | 38 | scanner := getScanner(*fileNamePtr) 39 | kmv := gokmv.NewKMV(*sizePtr) 40 | for scanner.Scan() { 41 | for _, word := range strings.Fields(scanner.Text()) { 42 | kmv.InsertString(word) 43 | } 44 | } 45 | if err := scanner.Err(); err != nil { 46 | fmt.Fprintln(os.Stderr, "reading standard input:", err) 47 | } else { 48 | fmt.Println(fmtMessage(kmv)) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/positiveblue/go-kmv 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/positiveblue/adaptive-table v0.0.0-20190428201714-2e1c6ba242ca 7 | github.com/spaolacci/murmur3 v1.1.0 8 | ) 9 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | 
github.com/positiveblue/adaptive-table v0.0.0-20190428201714-2e1c6ba242ca h1:Whh1Ll7vqIqj3IVIiFMbcygCYNkUU6Choi3ymQmjuN4= 2 | github.com/positiveblue/adaptive-table v0.0.0-20190428201714-2e1c6ba242ca/go.mod h1:YqVscn86Gujuh+8sLLHF8uWLnOixJ7qIGo8C/ko/WNY= 3 | github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= 4 | github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= 5 | -------------------------------------------------------------------------------- /kmv.go: -------------------------------------------------------------------------------- 1 | package gokmv 2 | 3 | import ( 4 | "math" 5 | "math/rand" 6 | "time" 7 | 8 | adaptivetable "github.com/positiveblue/adaptive-table" 9 | murmur3 "github.com/spaolacci/murmur3" 10 | ) 11 | 12 | type KMV struct { 13 | table adaptivetable.AdaptiveTable 14 | initialSize int 15 | seed uint32 16 | totalCounter uint64 17 | } 18 | 19 | func NewKMV(size int) *KMV { 20 | rand.Seed(time.Now().UnixNano()) 21 | return NewKMVWithSeed(size, rand.Uint32()) 22 | } 23 | 24 | func NewKMVWithSeed(size int, seed uint32) *KMV { 25 | return &KMV{ 26 | table: adaptivetable.NewAdaptiveTableComplete(size, math.MaxInt64, size), 27 | initialSize: size, 28 | seed: seed, 29 | totalCounter: 0, 30 | } 31 | } 32 | 33 | func (kmv *KMV) ElementsAdded() uint64 { 34 | return kmv.totalCounter 35 | } 36 | 37 | func (kmv *KMV) Size() int { 38 | return kmv.table.Size() 39 | } 40 | 41 | func (kmv *KMV) Seed() uint32 { 42 | return kmv.seed 43 | } 44 | 45 | func (kmv *KMV) InsertUint64(hash uint64) { 46 | kmv.totalCounter++ 47 | kmv.table.Insert(hash) 48 | } 49 | 50 | func (kmv *KMV) InsertString(s string) { 51 | hash := murmur3.Sum64WithSeed([]byte(s), kmv.seed) 52 | kmv.InsertUint64(hash) 53 | } 54 | 55 | func (kmv *KMV) EstimateCardinality() uint64 { 56 | if kmv.Size() < kmv.initialSize { 57 | return uint64(kmv.table.Size()) 58 | } 59 | 60 | meanDistance := kmv.table.Max() / uint64(kmv.table.Size()) 
61 | return uint64(math.MaxUint64 / meanDistance) 62 | } 63 | -------------------------------------------------------------------------------- /kmv_test.go: -------------------------------------------------------------------------------- 1 | package gokmv 2 | 3 | import ( 4 | "fmt" 5 | "math/rand" 6 | "testing" 7 | ) 8 | 9 | func TestKMVSize(t *testing.T) { 10 | kmv := NewKMV(2) 11 | 12 | if kmv.Size() != 0 { 13 | t.Error("An empty table should have size 0") 14 | } 15 | 16 | kmv.InsertUint64(1) 17 | if kmv.Size() != 1 { 18 | t.Error("We only added one element") 19 | } 20 | 21 | for i := 0; i < 10; i++ { 22 | kmv.InsertUint64(uint64(i)) 23 | } 24 | 25 | if kmv.Size() != 2 { 26 | t.Error("Size should not be bigger than maxSize") 27 | } 28 | } 29 | 30 | func TestKMVSeed(t *testing.T) { 31 | kmv := NewKMV(2) 32 | if kmv.Seed() == 0 { 33 | t.Error("Seed should be a random number when it is not passed as a parameter") 34 | } 35 | 36 | kmv = NewKMVWithSeed(2, 12) 37 | if kmv.Seed() != 12 { 38 | t.Error("Seed was specified in the constructor") 39 | } 40 | } 41 | 42 | func TestKMVElementsAdded(t *testing.T) { 43 | kmv := NewKMV(2) 44 | 45 | if kmv.ElementsAdded() != 0 { 46 | t.Error("We did not add any element") 47 | } 48 | 49 | kmv.InsertUint64(1) 50 | if kmv.ElementsAdded() != 1 { 51 | t.Error("We added one element") 52 | } 53 | 54 | for i := 0; i < 10; i++ { 55 | kmv.InsertUint64(uint64(i)) 56 | } 57 | 58 | if kmv.ElementsAdded() != 11 { 59 | t.Error("We added 11 elements") 60 | } 61 | } 62 | 63 | func TestKMVInsertString(t *testing.T) { 64 | kmv := NewKMV(2) 65 | 66 | kmv.InsertString("Golang") 67 | 68 | if kmv.ElementsAdded() != 1 || kmv.Size() != 1 { 69 | t.Error("We added element one") 70 | } 71 | } 72 | 73 | func inBounds(relativeError float64, approximation, real int) bool { 74 | fApprox := float64(approximation) 75 | fReal := float64(real) 76 | 77 | if fApprox < (1-relativeError)*fReal { 78 | return false 79 | } 80 | 81 | if fApprox > (1+relativeError)*fReal { 82 
| return false 83 | } 84 | 85 | return true 86 | } 87 | func TestKMVEstimateCardinality(t *testing.T) { 88 | data := make(map[uint64]bool) 89 | dataSize := 1000000 90 | 91 | rand.Seed(42) 92 | for len(data) != dataSize { 93 | n := rand.Uint64() 94 | data[n] = true 95 | } 96 | 97 | // We have a sample of `dataSize` random uint64 98 | // We will estimate the carinality of the sample 99 | // `iterations` times and check that the `avgEstimation` is 100 | // not off by more than a factor of `relativeErr` 101 | avgEstimation := 0 102 | avgSize := 0 103 | iterations := 10 104 | for i := 0; i < iterations; i++ { 105 | kmv := NewKMV(64) 106 | 107 | for key := range data { 108 | kmv.InsertString(fmt.Sprint(key)) 109 | } 110 | avgEstimation += int(kmv.EstimateCardinality()) 111 | avgSize += kmv.Size() 112 | } 113 | avgEstimation /= iterations 114 | avgSize /= iterations 115 | 116 | relativeErr := 0.04 117 | 118 | if !inBounds(relativeErr, avgEstimation, dataSize) { 119 | errMsg := fmt.Sprintf("The kmv estimation was not in the theoretical bounds: %d out of %d", avgEstimation, dataSize) 120 | t.Error(errMsg) 121 | } 122 | } 123 | --------------------------------------------------------------------------------