├── LICENSE ├── README.md ├── cmd └── topk-fss │ └── main.go ├── testdata └── domains.txt ├── topk.go └── topk_test.go /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Damian Gryski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | go-topk: "filtered space saving" streaming topk algorithm 2 | 3 | godoc: http://godoc.org/github.com/dgryski/go-topk 4 | -------------------------------------------------------------------------------- /cmd/topk-fss/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "flag" 6 | "fmt" 7 | "io" 8 | "log" 9 | "os" 10 | "strconv" 11 | "strings" 12 | 13 | "github.com/dgryski/go-topk" 14 | ) 15 | 16 | func main() { 17 | 18 | k := flag.Int("n", 500, "k") 19 | f := flag.String("f", "", "file to read") 20 | counts := flag.Bool("c", false, "each item has a count associated with it") 21 | 22 | flag.Parse() 23 | 24 | var r io.Reader 25 | 26 | if *f == "" { 27 | r = os.Stdin 28 | } else { 29 | var err error 30 | r, err = os.Open(*f) 31 | if err != nil { 32 | log.Fatal(err) 33 | } 34 | } 35 | 36 | tk := topk.New(*k) 37 | sc := bufio.NewScanner(r) 38 | 39 | for sc.Scan() { 40 | line := sc.Text() 41 | 42 | var count int 43 | var item string 44 | 45 | if *counts { 46 | fields := strings.Fields(line) 47 | cint, err := strconv.Atoi(fields[1]) 48 | if err != nil { 49 | log.Println("failed to parse count: ", fields[1], ":", err) 50 | continue 51 | } 52 | item = fields[0] 53 | count = cint 54 | } else { 55 | item = line 56 | count = 1 57 | } 58 | 59 | tk.Insert(item, count) 60 | } 61 | 62 | if err := sc.Err(); err != nil { 63 | log.Fatal(err) 64 | } 65 | 66 | for _, v := range tk.Keys() { 67 | fmt.Println(v.Key, v.Count, v.Error) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /topk.go: -------------------------------------------------------------------------------- 1 | // Package topk implements the Filtered Space-Saving TopK streaming algorithm 2 | /* 3 | 4 | The original Space-Saving algorithm: 5 | https://icmi.cs.ucsb.edu/research/tech_reports/reports/2005-23.pdf 6 | 7 | The Filtered Space-Saving enhancement: 8 | http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf 9 | 10 | This implementation follows the algorithm of the FSS paper, but not the 11 | suggested implementation. Specifically, we use a heap instead of a sorted list 12 | of monitored items, and since we are also using a map to provide O(1) access on 13 | update also don't need the c_i counters in the hash table. 14 | 15 | Licensed under the MIT license. 16 | 17 | */ 18 | package topk 19 | 20 | import ( 21 | "bytes" 22 | "container/heap" 23 | "encoding/gob" 24 | "sort" 25 | 26 | "github.com/dgryski/go-sip13" 27 | ) 28 | 29 | // Element is a TopK item 30 | type Element struct { 31 | Key string 32 | Count int 33 | Error int 34 | } 35 | 36 | type elementsByCountDescending []Element 37 | 38 | func (elts elementsByCountDescending) Len() int { return len(elts) } 39 | func (elts elementsByCountDescending) Less(i, j int) bool { 40 | return (elts[i].Count > elts[j].Count) || (elts[i].Count == elts[j].Count && elts[i].Key < elts[j].Key) 41 | } 42 | func (elts elementsByCountDescending) Swap(i, j int) { elts[i], elts[j] = elts[j], elts[i] } 43 | 44 | type keys struct { 45 | m map[string]int 46 | elts []Element 47 | } 48 | 49 | // Implement the container/heap interface 50 | 51 | func (tk *keys) Len() int { return len(tk.elts) } 52 | func (tk *keys) Less(i, j int) bool { 53 | return (tk.elts[i].Count < tk.elts[j].Count) || (tk.elts[i].Count == tk.elts[j].Count && tk.elts[i].Error > tk.elts[j].Error) 54 | } 55 | func (tk *keys) Swap(i, j int) { 56 | 57 | tk.elts[i], tk.elts[j] = tk.elts[j], tk.elts[i] 58 | 59 | tk.m[tk.elts[i].Key] = i 60 | tk.m[tk.elts[j].Key] = j 61 | } 62 | 63 | func (tk *keys) Push(x interface{}) { 64 | e := x.(Element) 65 | tk.m[e.Key] = len(tk.elts) 66 | tk.elts = append(tk.elts, e) 67 | } 68 | 69 | func (tk *keys) Pop() interface{} { 70 | var e Element 71 | e, tk.elts = tk.elts[len(tk.elts)-1], tk.elts[:len(tk.elts)-1] 72 | 73 | delete(tk.m, e.Key) 74 | 75 | return e 76 | } 77 | 78 | // Stream calculates the TopK elements for a stream 79 | type Stream struct { 80 | n int 81 | k keys 82 | alphas []int 83 | } 84 | 85 | // New returns a Stream estimating the top n most frequent elements 86 | func New(n int) *Stream { 87 | return &Stream{ 88 | n: n, 89 | k: keys{m: make(map[string]int), elts: make([]Element, 0, n)}, 90 | alphas: make([]int, n*6), // 6 is the multiplicative constant from the paper 91 | } 92 | } 93 | 94 | func reduce(x uint64, n int) uint32 { 95 | return uint32(uint64(uint32(x)) * uint64(n) >> 32) 96 | } 97 | 98 | // Insert adds an element to the stream to be tracked 99 | // It returns an estimation for the just inserted element 100 | func (s *Stream) Insert(x string, count int) Element { 101 | 102 | xhash := reduce(sip13.Sum64Str(0, 0, x), len(s.alphas)) 103 | 104 | // are we tracking this element? 105 | if idx, ok := s.k.m[x]; ok { 106 | s.k.elts[idx].Count += count 107 | e := s.k.elts[idx] 108 | heap.Fix(&s.k, idx) 109 | return e 110 | } 111 | 112 | // can we track more elements? 113 | if len(s.k.elts) < s.n { 114 | // there is free space 115 | e := Element{Key: x, Count: count} 116 | heap.Push(&s.k, e) 117 | return e 118 | } 119 | 120 | if s.alphas[xhash]+count < s.k.elts[0].Count { 121 | e := Element{ 122 | Key: x, 123 | Error: s.alphas[xhash], 124 | Count: s.alphas[xhash] + count, 125 | } 126 | s.alphas[xhash] += count 127 | return e 128 | } 129 | 130 | // replace the current minimum element 131 | minKey := s.k.elts[0].Key 132 | 133 | mkhash := reduce(sip13.Sum64Str(0, 0, minKey), len(s.alphas)) 134 | s.alphas[mkhash] = s.k.elts[0].Count 135 | 136 | e := Element{ 137 | Key: x, 138 | Error: s.alphas[xhash], 139 | Count: s.alphas[xhash] + count, 140 | } 141 | s.k.elts[0] = e 142 | 143 | // we're not longer monitoring minKey 144 | delete(s.k.m, minKey) 145 | // but 'x' is as array position 0 146 | s.k.m[x] = 0 147 | 148 | heap.Fix(&s.k, 0) 149 | return e 150 | } 151 | 152 | // Keys returns the current estimates for the most frequent elements 153 | func (s *Stream) Keys() []Element { 154 | elts := append([]Element(nil), s.k.elts...) 155 | sort.Sort(elementsByCountDescending(elts)) 156 | return elts 157 | } 158 | 159 | // Estimate returns an estimate for the item x 160 | func (s *Stream) Estimate(x string) Element { 161 | xhash := reduce(sip13.Sum64Str(0, 0, x), len(s.alphas)) 162 | 163 | // are we tracking this element? 164 | if idx, ok := s.k.m[x]; ok { 165 | e := s.k.elts[idx] 166 | return e 167 | } 168 | count := s.alphas[xhash] 169 | e := Element{ 170 | Key: x, 171 | Error: count, 172 | Count: count, 173 | } 174 | return e 175 | } 176 | 177 | func (s *Stream) GobEncode() ([]byte, error) { 178 | buf := bytes.Buffer{} 179 | enc := gob.NewEncoder(&buf) 180 | if err := enc.Encode(s.n); err != nil { 181 | return nil, err 182 | } 183 | if err := enc.Encode(s.k.m); err != nil { 184 | return nil, err 185 | } 186 | if err := enc.Encode(s.k.elts); err != nil { 187 | return nil, err 188 | } 189 | if err := enc.Encode(s.alphas); err != nil { 190 | return nil, err 191 | } 192 | return buf.Bytes(), nil 193 | } 194 | 195 | func (s *Stream) GobDecode(b []byte) error { 196 | dec := gob.NewDecoder(bytes.NewBuffer(b)) 197 | if err := dec.Decode(&s.n); err != nil { 198 | return err 199 | } 200 | if err := dec.Decode(&s.k.m); err != nil { 201 | return err 202 | } 203 | if err := dec.Decode(&s.k.elts); err != nil { 204 | return err 205 | } 206 | if err := dec.Decode(&s.alphas); err != nil { 207 | return err 208 | } 209 | return nil 210 | } 211 | -------------------------------------------------------------------------------- /topk_test.go: -------------------------------------------------------------------------------- 1 | package topk 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "encoding/gob" 7 | "log" 8 | "os" 9 | "reflect" 10 | "sort" 11 | "testing" 12 | ) 13 | 14 | type freqs struct { 15 | keys []string 16 | counts map[string]int 17 | } 18 | 19 | func (f freqs) Len() int { return len(f.keys) } 20 | 21 | // Actually 'Greater', since we want decreasing 22 | func (f *freqs) Less(i, j int) bool { 23 | return f.counts[f.keys[i]] > f.counts[f.keys[j]] || f.counts[f.keys[i]] == f.counts[f.keys[j]] && f.keys[i] < f.keys[j] 24 | } 25 | 26 | func (f *freqs) Swap(i, j int) { f.keys[i], f.keys[j] = f.keys[j], f.keys[i] } 27 | 28 | func TestTopK(t *testing.T) { 29 | 30 | f, err := os.Open("testdata/domains.txt") 31 | 32 | if err != nil { 33 | t.Fatal(err) 34 | } 35 | 36 | scanner := bufio.NewScanner(f) 37 | 38 | tk := New(100) 39 | exact := make(map[string]int) 40 | 41 | for scanner.Scan() { 42 | 43 | item := scanner.Text() 44 | 45 | exact[item]++ 46 | e := tk.Insert(item, 1) 47 | if e.Count < exact[item] { 48 | t.Errorf("estimate lower than exact: key=%v, exact=%v, estimate=%v", e.Key, exact[item], e.Count) 49 | } 50 | if e.Count-e.Error > exact[item] { 51 | t.Errorf("error bounds too large: key=%v, count=%v, error=%v, exact=%v", e.Key, e.Count, e.Error, exact[item]) 52 | } 53 | } 54 | 55 | if err := scanner.Err(); err != nil { 56 | log.Println("error during scan: ", err) 57 | } 58 | 59 | var keys []string 60 | 61 | for k, _ := range exact { 62 | keys = append(keys, k) 63 | } 64 | 65 | freq := &freqs{keys: keys, counts: exact} 66 | 67 | sort.Sort(freq) 68 | 69 | top := tk.Keys() 70 | 71 | // at least the top 25 must be in order 72 | for i := 0; i < 25; i++ { 73 | if top[i].Key != freq.keys[i] { 74 | t.Errorf("key mismatch: idx=%d top=%s (%d) exact=%s (%d)", i, top[i].Key, top[i].Count, freq.keys[i], freq.counts[freq.keys[i]]) 75 | } 76 | } 77 | for k, v := range exact { 78 | e := tk.Estimate(k) 79 | if e.Count < v { 80 | t.Errorf("estimate lower than exact: key=%v, exact=%v, estimate=%v", e.Key, v, e.Count) 81 | } 82 | if e.Count-e.Error > v { 83 | t.Errorf("error bounds too large: key=%v, count=%v, error=%v, exact=%v", e.Key, e.Count, e.Error, v) 84 | } 85 | } 86 | for _, k := range top { 87 | e := tk.Estimate(k.Key) 88 | if e != k { 89 | t.Errorf("estimate differs from top keys: key=%v, estimate=%v(-%v) top=%v(-%v)", e.Key, e.Count, e.Error, k.Count, k.Error) 90 | } 91 | } 92 | 93 | // gob 94 | var buf bytes.Buffer 95 | enc := gob.NewEncoder(&buf) 96 | if err := enc.Encode(tk); err != nil { 97 | t.Error(err) 98 | } 99 | 100 | decoded := New(100) 101 | dec := gob.NewDecoder(&buf) 102 | if err := dec.Decode(decoded); err != nil { 103 | t.Error(err) 104 | } 105 | 106 | if !reflect.DeepEqual(tk, decoded) { 107 | t.Error("they are not equal.") 108 | } 109 | } 110 | --------------------------------------------------------------------------------