├── .gitignore ├── LICENSE ├── README.md ├── benchmark ├── main.go └── zipf.csv ├── doc.go ├── pmc.go └── pmc_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Seif Lotfy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Probabilistic Multiplicity Counting Sketch (PMC) 2 | 3 | [![GoDoc](https://godoc.org/github.com/seiflotfy/pmc?status.svg)](https://godoc.org/github.com/seiflotfy/pmc) 4 | 5 | PMC to Count-Min is as HyperLogLog to Bloomfilter 6 | 7 | Package pmc provides a Probabilistic Multiplicity Counting Sketch, a novel data structure that is capable of accounting traffic per flow probabilistically, that can be used as an alternative to Count-min sketch. 8 | The stream processing algorithm — Probabilistic Multiplicity Counting (PMC) — uses probabilistic counting techniques to determine the approximate multiplicity of each element in large streams. It is particularly well suited for traffic measurements on high-speed communication links and likewise applicable for many other purposes. 9 | 10 | Count-Min Sketches hold counters in a matrix-like organization. A big caveat for both Spectral Bloom Filters and Count-Min Sketches is that the maximum multiplicity has to be known a priori quite accurately, to provide large enough counters without wasting too much memory. PMC does not need to know the maximum frequency beforehand, and its counting operation is much simpler. 11 | 12 | For details about the algorithm and citations please use this article: 13 | 14 | ["High-Speed Per-Flow Traffic Measurement with Probabilistic Multiplicity Counting" by Peter Lieven & Björn Scheuermann] 15 | (https://wwwcn.cs.uni-duesseldorf.de/publications/publications/library/Lieven2010a.pdf) 16 | 17 | ## Example Usage 18 | ```go 19 | import "github.com/seiflotfy/pmc" 20 | 21 | sketch, err := pmc.NewSketchForMaxFlows(1000000) 22 | 23 | //increment a flow 'flow1' 1000000 times 24 | for i:=0; i<1000000; i++ { 25 | sketch.Increment([]byte("flow1")) 26 | } 27 | 28 | count := sketch.GetEstimate([]byte("flow1")) 29 | // count ==> 994623 (its an approximation) 30 | ``` -------------------------------------------------------------------------------- /benchmark/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/csv" 5 | "fmt" 6 | "io" 7 | "log" 8 | "os" 9 | "strconv" 10 | "time" 11 | 12 | "github.com/seiflotfy/pmc" 13 | ) 14 | 15 | func main() { 16 | file, _ := os.Open("zipf.csv") 17 | 18 | r := csv.NewReader(file) 19 | var expected []uint 20 | r.Comma = ';' 21 | 22 | pmc, _ := pmc.New(8000000, 64, 64) 23 | //cml, _ := cml.NewSketch16ForEpsilonDelta(0.00000543657, 0.99) 24 | 25 | dur := time.Duration(0) 26 | x := 0 27 | for { 28 | record, err := r.Read() 29 | x++ 30 | if x == 1 { 31 | continue 32 | } 33 | if err == io.EOF { 34 | break 35 | } 36 | if err != nil { 37 | log.Fatal(err) 38 | } 39 | 40 | id := fmt.Sprintf("flow-%s", record[0]) 41 | counts, _ := strconv.ParseFloat(record[1], 64) 42 | expected = append(expected, uint(counts)) 43 | for i := 0.0; i < counts; i++ { 44 | start := time.Now() 45 | pmc.Increment([]byte(id)) 46 | //cml.IncreaseCount([]byte(id)) 47 | dur += time.Since(start) 48 | } 49 | } 50 | 51 | for i := range expected { 52 | id := fmt.Sprintf("flow-%d", i) 53 | // flow id, expected, estimation 54 | est := pmc.GetEstimate([]byte(id)) 55 | //est := cml.Frequency([]byte(id)) 56 | fmt.Println(id, expected[i], uint(est), est/float64(expected[i])) 57 | 58 | if i > 10 { 59 | break 60 | } 61 | } 62 | fmt.Println("fill rate:", pmc.GetFillRate()) 63 | //fmt.Println("fill rate:", cml.GetFillRate(), dur) 64 | 65 | } 66 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Permission is hereby granted, free of charge, to any person obtaining a copy 3 | of this software and associated documentation files (the "Software"), to deal 4 | in the Software without restriction, including without limitation the rights 5 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 6 | copies of the Software, and to permit persons to whom the Software is 7 | furnished to do so, subject to the following conditions: 8 | The above copyright notice and this permission notice shall be included in all 9 | copies or substantial portions of the Software. 10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 11 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 12 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 13 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 14 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 15 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 16 | SOFTWARE. 17 | */ 18 | 19 | /* 20 | Package pmc provides a Probabilistic Multiplicity Counting Sketch, a novel data structure that is capable of accounting traffic per flow probabilistically, that can be used as an alternative to Count-min sketch. 21 | The stream processing algorithm — Probabilistic Multiplicity Counting (PMC) — uses probabilistic counting techniques to determine the approximate multiplicity of each element in large streams. It is particularly well suited for traffic measurements on high-speed communication links and likewise applicable for many other purposes. 22 | 23 | Count-Min Sketches hold counters in a matrix-like organization. A big caveat for both Spectral Bloom Filters and Count-Min Sketches is that the maximum multiplicity has to be known a priori quite accurately, to provide large enough counters without wasting too much memory. PMC does not need to know the maximum frequency beforehand, and its counting operation is much simpler. 24 | 25 | For details about the algorithm and citations please use this article: 26 | "High-Speed Per-Flow Traffic Measurement with Probabilistic Multiplicity Counting" by Peter Lieven & Björn Scheuermann 27 | (https://wwwcn.cs.uni-duesseldorf.de/publications/publications/library/Lieven2010a.pdf) 28 | */ 29 | package pmc 30 | -------------------------------------------------------------------------------- /pmc.go: -------------------------------------------------------------------------------- 1 | package pmc 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "math" 7 | 8 | "github.com/dgryski/go-bits" 9 | "github.com/dgryski/go-farm" 10 | "github.com/lazybeaver/xorshift" 11 | "github.com/willf/bitset" 12 | 13 | random "math/rand" 14 | ) 15 | 16 | var rnd = xorshift.NewXorShift64Star(42) 17 | 18 | // non-receiver methods 19 | func georand(w uint) uint { 20 | val := rnd.Next() 21 | // Calculate the position of the leftmost 1-bit. 22 | res := uint(bits.Clz(uint64(val) ^ 0)) 23 | if res >= w { 24 | res = w - 1 25 | } 26 | return res 27 | } 28 | 29 | func rand(m uint) uint { 30 | return uint(rnd.Next()) % m 31 | } 32 | 33 | /* 34 | We start with the probability qk(n) that at least the first k bits in a sketch row are set after n additions as given in (4). 35 | We observe that qk is now also a function of p, and obtain a modified version of (4) as follows: 36 | */ 37 | func qk(k, n, p float64) float64 { 38 | result := 1.0 39 | for i := 1.0; i <= k; i++ { 40 | result *= (1.0 - math.Pow(1.0-math.Pow(2, -i), n)*(1.0-p)) 41 | } 42 | return result 43 | } 44 | 45 | /* 46 | Sketch is a Probabilistic Multiplicity Counting Sketch, a novel data structure 47 | that is capable of accounting traffic per flow probabilistically, that can be 48 | used as an alternative to Count-min sketch. 49 | */ 50 | type Sketch struct { 51 | l float64 52 | m float64 53 | w float64 54 | bitmap *bitset.BitSet // FIXME: Get Rid of bitmap and use uint32 array 55 | p float64 56 | n uint 57 | } 58 | 59 | /* 60 | New returns a PMC Sketch with the properties: 61 | l = total number of bits for sketch 62 | m = total number of rows for each flow 63 | w = total number of columns for each flow 64 | */ 65 | func New(l uint, m uint, w uint) (*Sketch, error) { 66 | if l == 0 { 67 | return nil, errors.New("Expected l > 0, got 0") 68 | } 69 | if m == 0 { 70 | return nil, errors.New("Expected m > 0, got 0") 71 | } 72 | if w == 0 { 73 | return nil, errors.New("Expected w > 0, got 0") 74 | } 75 | return &Sketch{l: float64(l), m: float64(m), w: float64(w), 76 | bitmap: bitset.New(l), n: 0}, nil 77 | } 78 | 79 | /* 80 | NewForMaxFlows returns a PMC Sketch adapted to the size of the max number of 81 | flows expected. 82 | */ 83 | func NewForMaxFlows(maxFlows uint) (*Sketch, error) { 84 | l := maxFlows * 32 85 | return New(l, 256, 32) 86 | } 87 | 88 | func (sketch *Sketch) printVirtualMatrix(flow []byte) { 89 | for i := 0.0; i < sketch.m; i++ { 90 | for j := 0.0; j < sketch.w; j++ { 91 | pos := sketch.getPos(flow, i, j) 92 | if sketch.bitmap.Test(pos) == false { 93 | fmt.Print(0) 94 | } else { 95 | fmt.Print(1) 96 | } 97 | } 98 | fmt.Println("") 99 | } 100 | } 101 | 102 | /* 103 | GetFillRate ... 104 | */ 105 | func (sketch *Sketch) GetFillRate() float64 { 106 | return sketch.getP() * 100 107 | } 108 | 109 | /* 110 | It is straightforward to use any uniformly distributed hash function with 111 | sufficiently random output in the role of H: the input parameters can 112 | simply be concatenated to a single bit string. 113 | */ 114 | func (sketch *Sketch) getPos(f []byte, i, j float64) uint { 115 | hash := farm.Hash64WithSeeds(f, uint64(i), uint64(j)) 116 | return uint(hash) % uint(sketch.l) 117 | } 118 | 119 | /* 120 | Increment the count of the flow by 1 121 | */ 122 | func (sketch *Sketch) Increment(flow []byte) { 123 | sketch.p = 0 124 | i := rand(uint(sketch.m)) 125 | j := georand(uint(sketch.w)) 126 | 127 | pos := sketch.getPos(flow, float64(i), float64(j)) 128 | 129 | sketch.n++ 130 | if random.Float64() < float64(j)/float64(sketch.l) { 131 | return 132 | } 133 | 134 | sketch.bitmap.Set(pos) 135 | } 136 | 137 | func (sketch *Sketch) getZSum(flow []byte) float64 { 138 | z := 0.0 139 | for i := 0.0; i < sketch.m; i++ { 140 | for j := 0.0; j < sketch.w; j++ { 141 | pos := sketch.getPos(flow, i, j) 142 | if sketch.bitmap.Test(pos) == false { 143 | z += j 144 | break 145 | } 146 | } 147 | } 148 | return z 149 | } 150 | 151 | func (sketch *Sketch) getEmptyRows(flow []byte) float64 { 152 | k := 0.0 153 | for i := 0.0; i < sketch.m; i++ { 154 | pos := sketch.getPos(flow, i, 0) 155 | if sketch.bitmap.Test(pos) == false { 156 | k++ 157 | } 158 | } 159 | return k 160 | } 161 | 162 | func (sketch *Sketch) getP() float64 { 163 | ones := 0.0 164 | for i := uint(0); i < uint(sketch.l); i++ { 165 | if sketch.bitmap.Test(i) == true { 166 | ones++ 167 | } 168 | } 169 | return ones / sketch.l 170 | } 171 | 172 | func (sketch *Sketch) getE(n, p float64) float64 { 173 | result := 0.0 174 | for k := 1.0; k <= sketch.w; k++ { 175 | result += (k * (qk(k, n, p) - qk(k+1, n, p))) 176 | } 177 | return result 178 | } 179 | 180 | func (sketch *Sketch) phi(n, p float64) float64 { 181 | return math.Pow(2, sketch.getE(n, p)) / n 182 | } 183 | 184 | /* 185 | GetEstimate returns the estimated count of a given flow 186 | */ 187 | func (sketch *Sketch) GetEstimate(flow []byte) float64 { 188 | if sketch.p == 0 { 189 | sketch.p = sketch.getP() 190 | } 191 | k := sketch.getEmptyRows(flow) 192 | n := float64(sketch.n) 193 | m := sketch.m 194 | 195 | e := 0.0 196 | // Dealing with small multiplicities 197 | if kp := k / (1 - sketch.p); kp > 0.3*sketch.m { 198 | e = -2 * sketch.m * math.Log(kp/sketch.m) 199 | } else { 200 | z := sketch.getZSum(flow) 201 | e = m * math.Pow(2, z/m) / sketch.phi(n, sketch.p) 202 | } 203 | return math.Abs(e) 204 | } 205 | -------------------------------------------------------------------------------- /pmc_test.go: -------------------------------------------------------------------------------- 1 | package pmc 2 | 3 | import ( 4 | "math" 5 | random "math/rand" 6 | "strconv" 7 | "testing" 8 | ) 9 | 10 | func TestPMCHash(t *testing.T) { 11 | s, _ := New(1024, 4, 4) 12 | dist := make(map[uint]uint) 13 | for k := 0; k < 100000; k++ { 14 | i := float64(rand(uint(s.m))) 15 | j := float64(georand(uint(s.w))) 16 | pos := s.getPos([]byte("pmc"), i, j) 17 | dist[pos]++ 18 | } 19 | if len(dist) > 16 { 20 | t.Error("Expected maximum 16 different positions, got ", len(dist)) 21 | } 22 | } 23 | 24 | func TestPMCHashAdd(t *testing.T) { 25 | flows := make([]string, 100, 100) 26 | 27 | for i := 0; i < len(flows); i++ { 28 | flows[i] = strconv.Itoa(random.Int()) + "-flow-" + strconv.Itoa(random.Int()) 29 | } 30 | 31 | s, _ := New(8000000, 256, 64) 32 | for j := range flows { 33 | for i := 0; i < 1000000; i++ { 34 | if i%(j+1) == 0 { 35 | s.Increment([]byte(flows[j])) 36 | } 37 | } 38 | } 39 | 40 | for i, v := range flows { 41 | fCount := s.GetEstimate([]byte(v)) 42 | fErr := math.Abs(100 * (1 - float64(fCount)/(1000000/float64(i+1)))) 43 | if math.Abs(fErr) > 15 { 44 | t.Errorf("Expected error for flow %d '%s' <= 15%%, got %f", i, v, math.Abs(fErr)) 45 | } 46 | } 47 | } 48 | 49 | func TestRand(t *testing.T) { 50 | for i := 0; i < 10000; i++ { 51 | r := rand(32) 52 | if r >= 32 { 53 | t.Error("Expected rand to return r < 32, got", r) 54 | } 55 | } 56 | } 57 | --------------------------------------------------------------------------------