├── .gitignore
├── README.md
├── goforget
│   ├── .gitignore
│   ├── README.md
│   ├── decay.go
│   ├── distribution.go
│   ├── forget.go
│   ├── http_utils.go
│   ├── redis_utils.go
│   └── redis_utils_test.go
└── pyforget
    ├── distribution.py
    ├── forget_table.py
    └── readme.md

/.gitignore:
--------------------------------------------------------------------------------
1 | *.sw[op]
2 | *.rdb
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Forget-Table
2 | ============
3 | 
4 | Forget-Table is a database for storing non-stationary categorical distributions
5 | that forget old observations responsibly. It has been designed to store
6 | millions of distributions and can be written to at high volume.
7 | 
8 | This repo includes two implementations of the forget-table concept, both using
9 | [redis](http://redis.io) as a backend. They are:
10 | 
11 | * `pyforget` - a quick and dirty implementation intended to be used as a playground
12 | * `goforget` - written in Go for speed and scalability. It has a much
13 |   stricter API and is much more stable.
14 | 
15 | For additional documentation, see the README of the specific implementation.
16 | 
17 | Created by [Micha Gorelick](http://micha.gd/) and [Mike
18 | Dewar](http://twitter.com/mikedewar), with the help of [Dan
19 | Frank](http://www.danielhfrank.com/) and all the amazing engineers and
20 | scientists at [bitly](https://bitly.com/pages/about).
21 | 
22 | 
--------------------------------------------------------------------------------
/goforget/.gitignore:
--------------------------------------------------------------------------------
1 | goforget
--------------------------------------------------------------------------------
/goforget/README.md:
--------------------------------------------------------------------------------
1 | # GoForget
2 | 
3 | ## Building
4 | 
5 | Simply build with `go build` and install with `go install`!
6 | 
7 | Requirements:
8 | 
9 | * [redigo](http://github.com/garyburd/redigo) (must be at or after commit 69e1a27a, where redis.Pool.TestOnBorrow was introduced)
10 | * [redis](http://redis.io/) v2.7 or higher
11 | 
12 | 
13 | ## Running
14 | 
15 | To start the service run `goforget -redis-host=localhost:6379:1 -http=:8080`.
16 | This will start an instance of `goforget` on port 8080, connected to
17 | a local redis server on port 6379 using database 1. To see other valid
18 | options, run `goforget --help`!
19 | 
20 | ## Endpoints
21 | 
22 | * Increment
23 |   * `/incr?distribution=colors&field=red`
24 |   * This will create the distribution if necessary and increase the probability
25 |     of the field red by increasing its count by one. There is an optional
26 |     parameter, `N`, to increase the count by an arbitrary amount.
27 | 
28 | * Access the entire distribution
29 |   * `/dist?distribution=colors`
30 | 
31 | * Access a single field
32 |   * `/get?distribution=colors&field=blue`
33 | 
34 | * Access the N fields with the highest probability
35 |   * `/nmostprobable?distribution=colors&N=10`
36 | 
37 | * Get the number of distributions currently stored
38 |   * `/dbsize`
39 | 
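All of these endpoints are plain HTTP GETs returning JSON (except `/incr`, which answers with a bare `OK`/`FAIL` body), so any HTTP client works. The snippet below is a minimal, hypothetical Go client sketch — it is not part of this repo — that increments a field and reads the decayed distribution back; it assumes a `goforget` instance listening on `localhost:8080` as started above, and the `call` helper and `response` struct are illustrative names only.

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"net/url"
)

// response mirrors the JSON envelope written by http_utils.go:
// {"status_code": ..., "status_txt": ..., "data": ...}
type response struct {
	StatusCode int             `json:"status_code"`
	StatusTxt  string          `json:"status_txt"`
	Data       json.RawMessage `json:"data"`
}

// call performs a GET against one of the JSON endpoints and decodes the envelope.
func call(base, path string, params url.Values) (*response, error) {
	resp, err := http.Get(base + path + "?" + params.Encode())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	var r response
	if err := json.NewDecoder(resp.Body).Decode(&r); err != nil {
		return nil, err
	}
	return &r, nil
}

func main() {
	base := "http://localhost:8080" // assumed goforget address

	// Increment the field "red" of the "colors" distribution by 5.
	// /incr answers with a plain-text "OK" or "FAIL" body rather than JSON.
	incr, err := http.Get(base + "/incr?" + url.Values{
		"distribution": {"colors"},
		"field":        {"red"},
		"N":            {"5"},
	}.Encode())
	if err != nil {
		log.Fatal(err)
	}
	incr.Body.Close()

	// Read the whole (decayed, renormalized) distribution back.
	dist, err := call(base, "/dist", url.Values{"distribution": {"colors"}})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("status=%d data=%s\n", dist.StatusCode, dist.Data)
}
```

For `/dist`, the `data` payload is the distribution object itself: its `data` array holds one `{bin, count, p}` entry per field (see `distribution.go`).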
--------------------------------------------------------------------------------
/goforget/decay.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 |     "log"
5 |     "math"
6 |     "math/rand"
7 |     "time"
8 | )
9 | 
10 | var MAX_ITER = 1000
11 | 
12 | func Poisson(lambda float64) int {
13 |     if lambda == 0.0 {
14 |         return 0
15 |     }
16 |     e := math.Exp(-1.0 * lambda)
17 |     if e < 1e-8 {
18 |         return math.MaxInt32
19 |     }
20 | 
21 |     counter := MAX_ITER
22 |     r := rand.Float64()
23 |     k := 0
24 |     p := e
25 |     for p < r {
26 |         k += 1
27 |         e *= lambda / float64(k)
28 |         p += e
29 |         counter -= 1
30 |         if counter == 0 {
31 |             return -1
32 |         }
33 |     }
34 |     return k
35 | }
36 | 
37 | func Decay(count, Z, t int, rate float64) int {
38 |     return DecayTime(count, Z, t, rate, time.Now())
39 | }
40 | 
41 | func DecayTime(count, Z, t int, rate float64, now time.Time) int {
42 |     if count < 1 {
43 |         return 0
44 |     }
45 | 
46 |     dt := int(now.Unix()) - t
47 | 
48 |     lambda := rate * float64(dt)
49 |     k := Poisson(lambda)
50 | 
51 |     if k == -1 {
52 |         log.Printf("Poisson simulation did not converge with rate = %f => lambda = %f", rate, lambda)
53 |         return 0
54 |     }
55 | 
56 |     return k
57 | }
58 | 
--------------------------------------------------------------------------------
/goforget/distribution.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 |     "encoding/json"
5 |     "fmt"
6 |     "github.com/garyburd/redigo/redis"
7 |     "log"
8 |     "time"
9 | )
10 | 
11 | type Value struct {
12 |     Count int     `json:"count"`
13 |     P     float64 `json:"p"`
14 | }
15 | 
16 | type ValueMap map[string]*Value
17 | 
18 | func (vm ValueMap) MarshalJSON() ([]byte, error) {
19 |     result := make([]map[string]interface{}, 0, len(vm))
20 |     for bin, k := range vm {
21 |         r := make(map[string]interface{})
22 |         r["bin"] = bin
23 |         r["count"] = k.Count
24 |         r["p"] = k.P
25 |         result = append(result, r)
26 |     }
27 |     return json.Marshal(result)
28 | }
29 | 
30 | type Distribution struct {
31 |     Name  string   `json:"distribution"`
32 |     Z     int      `json:"Z"`
33 |     T     int
34 |     Data  ValueMap `json:"data"`
35 |     Rate  float64  `json:"rate"`
36 |     Prune bool     `json:"prune"`
37 | 
38 |     isFull     bool
39 |     hasDecayed bool
40 | }
41 | 
42 | func (d *Distribution) GetNMostProbable(N int) error {
43 |     data, err := GetNMostProbable(d.Name, N)
44 |     if err != nil || len(data) != 3 {
45 |         return fmt.Errorf("Could not fetch data for %s: %s", d.Name, err)
46 |     }
47 | 
48 |     d.Z, _ = redis.Int(data[1], nil)
49 |     d.T, _ = redis.Int(data[2], nil)
50 |     d.Data = make(map[string]*Value)
51 | 
52 |     d.addMultiBulkCounts(data[0])
53 |     return nil
54 | }
55 | 
56 | func (d *Distribution) GetField(fields ...string) error {
57 |     data, err := GetField(d.Name, fields...)
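    // Reply layout (see GetField in redis_utils.go): that helper issues one ZSCORE
    // per requested field followed by GETs on "<name>._Z" and "<name>._T" inside a
    // MULTI/EXEC, so on success data holds len(fields) counts, then the normalizing
    // constant Z, then the last-update timestamp T -- exactly what is unpacked below.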
58 | 59 | N := len(fields) 60 | if err != nil || len(data) != 2+N { 61 | return fmt.Errorf("Could not retrieve field") 62 | } 63 | 64 | Z, _ := redis.Int(data[N], nil) 65 | T, _ := redis.Int(data[N+1], nil) 66 | 67 | d.Z = Z 68 | d.T = T 69 | d.Data = make(map[string]*Value) 70 | var count int 71 | for i, field := range fields { 72 | count, _ = redis.Int(data[i], nil) 73 | d.Data[field] = &Value{Count: count} 74 | } 75 | d.calcProbabilities() 76 | 77 | return nil 78 | } 79 | 80 | func (d *Distribution) Fill() error { 81 | data, err := GetDistribution(d.Name) 82 | if err != nil { 83 | return fmt.Errorf("Could not fetch data for %s: %s", d.Name, err) 84 | } 85 | if data[0] == nil { 86 | return nil 87 | } 88 | 89 | T, err := redis.Int(data[0], nil) 90 | if err != nil { 91 | log.Printf("Could not read _T from distribution %s: %s", d.Name, err) 92 | } 93 | d.T = T 94 | 95 | // TODO: don't use the dist map to speed things up! 96 | d.Data = make(map[string]*Value) 97 | d.Rate = *defaultRate 98 | 99 | d.addMultiBulkCounts(data[1]) 100 | d.Normalize() 101 | d.calcProbabilities() 102 | 103 | d.isFull = true 104 | return nil 105 | } 106 | 107 | func (d *Distribution) addMultiBulkCounts(data interface{}) error { 108 | distData, _ := redis.MultiBulk(data, nil) 109 | for i := 0; i < len(distData); i += 2 { 110 | k, err := redis.String(distData[i], nil) 111 | if err != nil || k == "" { 112 | log.Printf("Could not read %s from distribution %s: %s", distData[i], d.Name, err) 113 | } 114 | v, err := redis.Int(distData[i+1], nil) 115 | if err != nil { 116 | log.Printf("Could not read %s from distribution %s: %s", distData[i+1], d.Name, err) 117 | } 118 | d.Data[k] = &Value{Count: v} 119 | } 120 | 121 | return nil 122 | } 123 | 124 | func (d *Distribution) Full() bool { 125 | return d.isFull 126 | } 127 | 128 | func (d *Distribution) HasDecayed() bool { 129 | return d.hasDecayed 130 | } 131 | 132 | func (d *Distribution) Normalize() { 133 | newZ := 0 134 | for _, v := range d.Data { 135 | newZ += v.Count 136 | } 137 | 138 | d.Z = newZ 139 | d.calcProbabilities() 140 | } 141 | 142 | func (d *Distribution) calcProbabilities() { 143 | fZ := float64(d.Z) 144 | for idx, _ := range d.Data { 145 | if fZ == 0 { 146 | d.Data[idx].P = 0 147 | } else { 148 | d.Data[idx].P = float64(d.Data[idx].Count) / fZ 149 | } 150 | } 151 | } 152 | 153 | func (d *Distribution) Decay() { 154 | startingZ := d.Z 155 | now := time.Now() 156 | for k, v := range d.Data { 157 | l := DecayTime(v.Count, d.Z, d.T, d.Rate, now) 158 | if l >= d.Data[k].Count { 159 | if d.Prune { 160 | l = d.Data[k].Count 161 | } else { 162 | l = d.Data[k].Count - 1 163 | } 164 | } 165 | d.Data[k].Count -= l 166 | d.Z -= l 167 | } 168 | 169 | if !d.hasDecayed && startingZ != d.Z { 170 | d.hasDecayed = true 171 | } 172 | 173 | d.T = int(time.Now().Unix()) 174 | d.calcProbabilities() 175 | } 176 | -------------------------------------------------------------------------------- /goforget/forget.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "log" 7 | "math/rand" 8 | "net/http" 9 | "net/url" 10 | "strconv" 11 | "strings" 12 | "sync" 13 | "time" 14 | ) 15 | 16 | var ( 17 | VERSION = "0.4.5" 18 | showVersion = flag.Bool("version", false, "print version string") 19 | httpAddress = flag.String("http", ":8080", "HTTP service address (e.g., ':8080')") 20 | redisHost = flag.String("redis-host", "", "Redis host in the form host:port:db.") 21 | redisUri = flag.String("redis-uri", 
"", "Redis URI in the form redis://:password@hostname:port/db_number") 22 | defaultRate = flag.Float64("default-rate", 0.5, "Default rate to decay distributions with") 23 | nWorkers = flag.Int("nworkers", 1, "Number of update workers that update the redis DB") 24 | pruneDist = flag.Bool("prune", true, "Whether or not to decay distributional fields out") 25 | expirSigma = flag.Float64("expire-sigma", 2, "Confidence level that a distribution will be empty when set to expire") 26 | ) 27 | 28 | var updateChan chan *Distribution 29 | var redisServer *RedisServer 30 | 31 | func IncrHandler(w http.ResponseWriter, r *http.Request) { 32 | reqParams, err := url.ParseQuery(r.URL.RawQuery) 33 | if err != nil { 34 | HttpError(w, 500, "INVALID_URI") 35 | return 36 | } 37 | distribution := reqParams.Get("distribution") 38 | if distribution == "" { 39 | HttpError(w, 500, "MISSING_ARG_DISTRIBUTION") 40 | return 41 | } 42 | fields, ok := reqParams["field"] 43 | if !ok || len(fields) == 0 { 44 | HttpError(w, 500, "MISSING_ARG_FIELD") 45 | return 46 | } 47 | N_raw := reqParams.Get("N") 48 | var N int 49 | if N_raw == "" { 50 | N = 1 51 | } else { 52 | N, err = strconv.Atoi(N_raw) 53 | if err != nil { 54 | HttpError(w, 500, "COULDNT_PARSE_N") 55 | return 56 | } 57 | } 58 | 59 | err = IncrField(distribution, fields, N) 60 | if err == nil { 61 | w.WriteHeader(200) 62 | fmt.Fprintf(w, "OK") 63 | } else { 64 | log.Printf("Failed to incr: %s", err) 65 | w.WriteHeader(500) 66 | fmt.Fprintf(w, "FAIL") 67 | } 68 | updateChan <- &Distribution{Name: distribution} 69 | } 70 | 71 | func DistHandler(w http.ResponseWriter, r *http.Request) { 72 | reqParams, err := url.ParseQuery(r.URL.RawQuery) 73 | if err != nil { 74 | HttpError(w, 500, "INVALID_URI") 75 | return 76 | } 77 | distribution := reqParams.Get("distribution") 78 | if distribution == "" { 79 | HttpError(w, 500, "MISSING_ARG_DISTRIBUTION") 80 | return 81 | } 82 | var rate float64 83 | rate_raw := reqParams.Get("rate") 84 | if rate_raw == "" { 85 | rate = *defaultRate 86 | } else { 87 | n, err := fmt.Fscan(strings.NewReader(rate_raw), &rate) 88 | if n == 0 || err != nil { 89 | HttpError(w, 500, "CANNOT_PARSE_RATE") 90 | return 91 | } 92 | } 93 | 94 | dist := Distribution{ 95 | Name: distribution, 96 | Prune: *pruneDist, 97 | } 98 | err = dist.Fill() 99 | if err != nil { 100 | HttpError(w, 500, "COULD_NOT_RETRIEVE_DISTRIBUTION") 101 | return 102 | } 103 | 104 | if len(dist.Data) != 0 { 105 | if dist.Rate == *defaultRate { 106 | dist.Rate = rate 107 | } 108 | 109 | dist.Decay() 110 | dist.Normalize() 111 | updateChan <- &dist 112 | } 113 | 114 | HttpResponse(w, 200, dist) 115 | } 116 | 117 | func GetHandler(w http.ResponseWriter, r *http.Request) { 118 | reqParams, err := url.ParseQuery(r.URL.RawQuery) 119 | if err != nil { 120 | HttpError(w, 500, "INVALID_URI") 121 | return 122 | } 123 | distribution := reqParams.Get("distribution") 124 | if distribution == "" { 125 | HttpError(w, 500, "MISSING_ARG_DISTRIBUTION") 126 | return 127 | } 128 | fields, ok := reqParams["field"] 129 | if !ok || len(fields) == 0 { 130 | HttpError(w, 500, "MISSING_ARG_FIELD") 131 | return 132 | } 133 | var rate float64 134 | rate_raw := reqParams.Get("rate") 135 | if rate_raw == "" { 136 | rate = *defaultRate 137 | } else { 138 | n, err := fmt.Fscan(strings.NewReader(rate_raw), &rate) 139 | if n == 0 || err != nil { 140 | HttpError(w, 500, "CANNOT_PARSE_RATE") 141 | return 142 | } 143 | } 144 | 145 | result := Distribution{ 146 | Name: distribution, 147 | Rate: rate, 148 | Prune: *pruneDist, 
149 | } 150 | err = result.GetField(fields...) 151 | if err != nil { 152 | HttpError(w, 500, "COULD_NOT_RETRIEVE_FIELD") 153 | return 154 | } 155 | 156 | result.Decay() 157 | 158 | HttpResponse(w, 200, result) 159 | updateChan <- &result 160 | } 161 | 162 | func DBSizeHandler(w http.ResponseWriter, r *http.Request) { 163 | size, err := DBSize() 164 | if err != nil { 165 | HttpError(w, 500, "COULD_NOT_READ_SIZE") 166 | return 167 | } 168 | HttpResponse(w, 200, size/3) 169 | } 170 | 171 | func NMostProbableHandler(w http.ResponseWriter, r *http.Request) { 172 | reqParams, err := url.ParseQuery(r.URL.RawQuery) 173 | if err != nil { 174 | HttpError(w, 500, "INVALID_URI") 175 | return 176 | } 177 | distribution := reqParams.Get("distribution") 178 | if distribution == "" { 179 | HttpError(w, 500, "MISSING_ARG_DISTRIBUTION") 180 | return 181 | } 182 | var rate float64 183 | rate_raw := reqParams.Get("rate") 184 | if rate_raw == "" { 185 | rate = *defaultRate 186 | } else { 187 | n, err := fmt.Fscan(strings.NewReader(rate_raw), &rate) 188 | if n == 0 || err != nil { 189 | HttpError(w, 500, "CANNOT_PARSE_RATE") 190 | return 191 | } 192 | } 193 | N_raw := reqParams.Get("N") 194 | var N int 195 | if N_raw == "" { 196 | N = 10 197 | } else { 198 | N, err = strconv.Atoi(N_raw) 199 | if err != nil { 200 | HttpError(w, 500, "INVALID_ARG_N") 201 | return 202 | } 203 | } 204 | 205 | result := Distribution{ 206 | Name: distribution, 207 | Rate: rate, 208 | Prune: *pruneDist, 209 | } 210 | result.GetNMostProbable(N) 211 | result.Decay() 212 | 213 | HttpResponse(w, 200, result) 214 | updateChan <- &result 215 | } 216 | 217 | func ExitHandler(w http.ResponseWriter, r *http.Request) { 218 | fmt.Fprintf(w, "OK") 219 | Exit() 220 | } 221 | 222 | func Exit() { 223 | close(updateChan) 224 | } 225 | 226 | func main() { 227 | flag.Parse() 228 | 229 | if *showVersion { 230 | fmt.Printf("goforget: v%s\n", VERSION) 231 | return 232 | } 233 | 234 | rand.Seed(time.Now().UnixNano()) 235 | redisServer = NewRedisServerFromUri("redis://localhost:6379/1") 236 | if *redisUri != "" { 237 | // if a redis URI exists was specified, parse it 238 | redisServer = NewRedisServerFromUri(*redisUri) 239 | } else if *redisHost != "" { 240 | // for legacy mode 241 | redisServer = NewRedisServerFromRaw(*redisHost) 242 | } 243 | 244 | // create the connection pool 245 | redisServer.Connect(*nWorkers * 2) 246 | 247 | log.Printf("Starting %d update worker(s)", *nWorkers) 248 | workerWaitGroup := sync.WaitGroup{} 249 | updateChan = make(chan *Distribution, *nWorkers) 250 | for i := 0; i < *nWorkers; i++ { 251 | workerWaitGroup.Add(1) 252 | go func(idx int) { 253 | UpdateRedis(updateChan, idx) 254 | workerWaitGroup.Done() 255 | }(i) 256 | } 257 | 258 | http.HandleFunc("/get", GetHandler) 259 | http.HandleFunc("/incr", IncrHandler) 260 | http.HandleFunc("/dist", DistHandler) 261 | http.HandleFunc("/nmostprobable", NMostProbableHandler) 262 | http.HandleFunc("/dbsize", DBSizeHandler) 263 | http.HandleFunc("/exit", ExitHandler) 264 | go func() { 265 | log.Fatal(http.ListenAndServe(*httpAddress, nil)) 266 | }() 267 | 268 | workerWaitGroup.Wait() 269 | } 270 | -------------------------------------------------------------------------------- /goforget/http_utils.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "log" 7 | "net/http" 8 | ) 9 | 10 | type HttpResponseJson struct { 11 | StatusCode int `json:"status_code"` 12 | StatusTxt string 
`json:"status_txt"` 13 | Data interface{} `json:"data"` 14 | } 15 | 16 | var ( 17 | ERROR_RESPONSE = `{"status_code": 500,"data": null,"status_txt": "COULD_NOT_FORMAT_RESULT"}` 18 | ) 19 | 20 | func HttpError(w http.ResponseWriter, statusCode int, statusTxt string) bool { 21 | w.WriteHeader(statusCode) 22 | 23 | response := HttpResponseJson{StatusCode: statusCode, StatusTxt: statusTxt} 24 | j, err := json.Marshal(response) 25 | if err != nil { 26 | fmt.Fprintf(w, ERROR_RESPONSE) 27 | log.Printf("Could not format response: %s", err) 28 | return false 29 | } 30 | fmt.Fprintf(w, "%s", j) 31 | return true 32 | } 33 | 34 | func HttpResponse(w http.ResponseWriter, statusCode int, data interface{}) bool { 35 | w.WriteHeader(statusCode) 36 | 37 | response := HttpResponseJson{StatusCode: statusCode, Data: data} 38 | j, err := json.Marshal(response) 39 | if err != nil { 40 | fmt.Fprintf(w, ERROR_RESPONSE) 41 | log.Printf("Could not format response: %s", err) 42 | return false 43 | } 44 | fmt.Fprintf(w, "%s", j) 45 | return true 46 | } 47 | -------------------------------------------------------------------------------- /goforget/redis_utils.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "github.com/garyburd/redigo/redis" 6 | "log" 7 | "math" 8 | "net/url" 9 | "strings" 10 | "time" 11 | ) 12 | 13 | var ( 14 | DistributionEmpty = fmt.Errorf("Distribution already empty, not updating") 15 | ) 16 | 17 | type RedisServer struct { 18 | Host string 19 | Port string 20 | Db string 21 | Pass string 22 | 23 | hostname string 24 | pool *redis.Pool 25 | } 26 | 27 | func NewRedisServerFromRaw(rawString string) *RedisServer { 28 | parts := strings.Split(rawString, ":") 29 | if len(parts) != 3 { 30 | log.Fatal("redis-host must be in the form host:port:db") 31 | } 32 | rs := &RedisServer{ 33 | Host: parts[0], 34 | Port: parts[1], 35 | Db: parts[2], 36 | hostname: parts[0] + ":" + parts[1], 37 | } 38 | return rs 39 | } 40 | 41 | func NewRedisServerFromUri(uriString string) *RedisServer { 42 | url, err := url.Parse(uriString) 43 | if err != nil { 44 | log.Fatal("redis-uri must be in the form redis://[:password@]hostname:port[/db_number]") 45 | } 46 | 47 | // host and port (for nil-case port, set default: 6379) 48 | parts := strings.Split(url.Host, ":") 49 | host := parts[0] 50 | port := "6379" //default case 51 | if len(parts) > 1 { 52 | port = parts[1] 53 | } 54 | hostname := host + ":" + port 55 | 56 | // database number (for nil-case db, set default: 0) 57 | db := "0" 58 | if url.Path != "" { 59 | db = strings.Split(url.Path, "/")[1] 60 | } 61 | 62 | // check for password 63 | password := "" 64 | if url.User != nil { 65 | password, _ = url.User.Password() 66 | } 67 | 68 | rs := &RedisServer{ 69 | Host: host, 70 | Port: port, 71 | hostname: hostname, 72 | Pass: password, 73 | Db: db, 74 | } 75 | return rs 76 | } 77 | 78 | func (rs *RedisServer) GetConnection() redis.Conn { 79 | return rs.pool.Get() 80 | } 81 | 82 | func (rs *RedisServer) Connect(maxIdle int) { 83 | // set up the connection pool 84 | rs.connectPool(maxIdle) 85 | 86 | // verify the connection pool is valid before allowing program to continue 87 | conn := rs.GetConnection() 88 | _, err := conn.Do("PING") 89 | if err != nil { 90 | log.Fatal("Could not connect to Redis!") 91 | } 92 | conn.Close() 93 | 94 | } 95 | 96 | func (rs *RedisServer) connectPool(maxIdle int) { 97 | rs.pool = &redis.Pool{ 98 | MaxIdle: maxIdle, 99 | IdleTimeout: 240 * time.Second, 100 | Dial: 
func() (redis.Conn, error) { 101 | c, err := redis.Dial("tcp", rs.hostname) 102 | if err != nil { 103 | return nil, err 104 | } 105 | if rs.Pass != "" { 106 | if _, err := c.Do("AUTH", rs.Pass); err != nil { 107 | c.Close() 108 | return nil, err 109 | } 110 | } 111 | if _, err := c.Do("SELECT", rs.Db); err != nil { 112 | c.Close() 113 | return nil, err 114 | } 115 | return c, err 116 | }, 117 | TestOnBorrow: func(c redis.Conn, t time.Time) error { 118 | _, err := c.Do("PING") 119 | return err 120 | }, 121 | } 122 | } 123 | 124 | func UpdateRedis(readChan chan *Distribution, id int) error { 125 | var redisConn redis.Conn 126 | for dist := range readChan { 127 | log.Printf("[%d] Updating distribution: %s", id, dist.Name) 128 | 129 | redisConn = redisServer.GetConnection() 130 | err := UpdateDistribution(redisConn, dist) 131 | if err != nil { 132 | log.Printf("[%d] Failed to update: %s: %v: %s", id, dist.Name, redisConn.Err(), err.Error()) 133 | } 134 | redisConn.Close() 135 | } 136 | return nil 137 | } 138 | 139 | func UpdateDistribution(rconn redis.Conn, dist *Distribution) error { 140 | ZName := fmt.Sprintf("%s.%s", dist.Name, "_Z") 141 | TName := fmt.Sprintf("%s.%s", dist.Name, "_T") 142 | 143 | rconn.Send("WATCH", ZName) 144 | defer rconn.Send("UNWATCH") 145 | 146 | if dist.Full() == false { 147 | err := dist.Fill() 148 | if err != nil { 149 | return fmt.Errorf("Could not fill: %s", err) 150 | } 151 | dist.Decay() 152 | dist.Normalize() 153 | } 154 | 155 | maxCount := 0 156 | rconn.Send("MULTI") 157 | if dist.HasDecayed() == true { 158 | if dist.Z == 0 { 159 | rconn.Send("DISCARD") 160 | return DistributionEmpty 161 | } 162 | 163 | for k, v := range dist.Data { 164 | if v.Count == 0 { 165 | rconn.Send("ZREM", dist.Name, k) 166 | } else { 167 | rconn.Send("ZADD", dist.Name, v.Count, k) 168 | if v.Count > maxCount { 169 | maxCount = v.Count 170 | } 171 | } 172 | } 173 | 174 | rconn.Send("SET", ZName, dist.Z) 175 | rconn.Send("SET", TName, dist.T) 176 | } else { 177 | for _, v := range dist.Data { 178 | if v.Count != 0 && v.Count > maxCount { 179 | maxCount = v.Count 180 | } 181 | } 182 | } 183 | 184 | eta := math.Sqrt(float64(maxCount) / dist.Rate) 185 | expTime := int(((*expirSigma) + eta) * eta) 186 | 187 | rconn.Send("EXPIRE", dist.Name, expTime) 188 | rconn.Send("EXPIRE", ZName, expTime) 189 | rconn.Send("EXPIRE", TName, expTime) 190 | 191 | _, err := rconn.Do("EXEC") 192 | if err != nil { 193 | return fmt.Errorf("Could not update %s: %s", dist.Name, err) 194 | } 195 | return nil 196 | } 197 | 198 | func GetField(distribution string, fields ...string) ([]interface{}, error) { 199 | rdb := redisServer.GetConnection() 200 | 201 | rdb.Send("MULTI") 202 | for _, field := range fields { 203 | rdb.Send("ZSCORE", distribution, field) 204 | } 205 | rdb.Send("GET", fmt.Sprintf("%s.%s", distribution, "_Z")) 206 | rdb.Send("GET", fmt.Sprintf("%s.%s", distribution, "_T")) 207 | data, err := redis.MultiBulk(rdb.Do("EXEC")) 208 | return data, err 209 | } 210 | 211 | func GetNMostProbable(distribution string, N int) ([]interface{}, error) { 212 | rdb := redisServer.GetConnection() 213 | 214 | rdb.Send("MULTI") 215 | rdb.Send("ZREVRANGEBYSCORE", distribution, "+INF", "-INF", "WITHSCORES", "LIMIT", 0, N) 216 | rdb.Send("GET", fmt.Sprintf("%s.%s", distribution, "_Z")) 217 | rdb.Send("GET", fmt.Sprintf("%s.%s", distribution, "_T")) 218 | data, err := redis.MultiBulk(rdb.Do("EXEC")) 219 | return data, err 220 | } 221 | 222 | func IncrField(distribution string, fields []string, N int) error { 223 | rdb := 
redisServer.GetConnection() 224 | 225 | rdb.Send("MULTI") 226 | for _, field := range fields { 227 | rdb.Send("ZINCRBY", distribution, N, field) 228 | } 229 | rdb.Send("INCRBY", fmt.Sprintf("%s.%s", distribution, "_Z"), N*len(fields)) 230 | rdb.Send("SETNX", fmt.Sprintf("%s.%s", distribution, "_T"), int(time.Now().Unix())) 231 | _, err := rdb.Do("EXEC") 232 | return err 233 | } 234 | 235 | func GetDistribution(distribution string) ([]interface{}, error) { 236 | rdb := redisServer.GetConnection() 237 | 238 | rdb.Send("MULTI") 239 | rdb.Send("GET", fmt.Sprintf("%s.%s", distribution, "_T")) 240 | rdb.Send("ZRANGE", distribution, 0, -1, "WITHSCORES") 241 | data, err := redis.MultiBulk(rdb.Do("EXEC")) 242 | return data, err 243 | } 244 | 245 | func DBSize() (int, error) { 246 | rdb := redisServer.GetConnection() 247 | 248 | data, err := redis.Int(rdb.Do("DBSIZE")) 249 | return data, err 250 | } 251 | -------------------------------------------------------------------------------- /goforget/redis_utils_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | var testUriCases = []struct { 9 | uriString string 10 | expected *RedisServer 11 | description string 12 | }{ 13 | { 14 | uriString: "redis://localhost:6379", 15 | expected: &RedisServer{ 16 | Host: "localhost", 17 | Port: "6379", 18 | hostname: "localhost:6379", 19 | Db: "0", 20 | Pass: "", 21 | }, 22 | description: "typical case", 23 | }, 24 | { 25 | uriString: "redis://redisproviders.com:12345/4", 26 | expected: &RedisServer{ 27 | Host: "redisproviders.com", 28 | Port: "12345", 29 | hostname: "redisproviders.com:12345", 30 | Db: "4", 31 | Pass: "", 32 | }, 33 | description: "host, port, db", 34 | }, 35 | { 36 | uriString: "redis://10.0.0.1", 37 | expected: &RedisServer{ 38 | Host: "10.0.0.1", 39 | Port: "6379", 40 | hostname: "10.0.0.1:6379", 41 | Db: "0", 42 | Pass: "", 43 | }, 44 | description: "host-only ip (guess default port and db)", 45 | }, 46 | { 47 | uriString: "redis://10.0.0.1", 48 | expected: &RedisServer{ 49 | Host: "10.0.0.1", 50 | Port: "6379", 51 | hostname: "10.0.0.1:6379", 52 | Db: "0", 53 | Pass: "", 54 | }, 55 | description: "with password (no username)", 56 | }, 57 | { 58 | uriString: "redis://oakland:ratchets@redis.bitly.com:999/1", 59 | expected: &RedisServer{ 60 | Host: "redis.bitly.com", 61 | Port: "999", 62 | hostname: "redis.bitly.com:999", 63 | Db: "1", 64 | Pass: "ratchets", 65 | }, 66 | description: "everything URI (username should be ignored)", 67 | }, 68 | } 69 | 70 | var testRawCases = []struct { 71 | rawString string 72 | expected *RedisServer 73 | description string 74 | }{ 75 | { 76 | rawString: "localhost:6379:1", 77 | expected: &RedisServer{ 78 | Host: "localhost", 79 | Port: "6379", 80 | hostname: "localhost:6379", 81 | Db: "1", 82 | Pass: "", 83 | }, 84 | description: "typical case (all fields)", 85 | }, 86 | } 87 | 88 | func TestNewRedisServerFromUri(t *testing.T) { 89 | for _, tt := range testUriCases { 90 | expected := tt.expected 91 | actual := NewRedisServerFromUri(tt.uriString) 92 | if reflect.DeepEqual(expected, actual) { 93 | t.Logf("PASS: %s", tt.description) 94 | } else { 95 | t.Errorf("FAIL: %s, expected: %+v, actual: %+v", tt.description, expected, actual) 96 | } 97 | } 98 | } 99 | 100 | func TestNewRedisServerFromRaw(t *testing.T) { 101 | for _, tt := range testRawCases { 102 | expected := tt.expected 103 | actual := NewRedisServerFromRaw(tt.rawString) 104 | if 
reflect.DeepEqual(expected, actual) { 105 | t.Logf("PASS: %s", tt.description) 106 | } else { 107 | t.Errorf("FAIL: %s, expected: %+v, actual: %+v", tt.description, expected, actual) 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /pyforget/distribution.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import logging 3 | import time 4 | import redis 5 | 6 | r = redis.StrictRedis( 7 | 'localhost', 8 | port=6379, 9 | db=2 10 | ) 11 | 12 | def interleave_izip(*iterables): 13 | # interleave_izip('ABCD', 'xy') --> A x B y 14 | iterators = map(iter, iterables) 15 | while iterators: 16 | for i in iterators: 17 | yield i.next() 18 | 19 | class Distribution(object): 20 | def __init__(self,k): 21 | self.k = k 22 | 23 | def decay(self, rate=0.02): 24 | """ 25 | returns the amount to decay each bin by 26 | """ 27 | t = int(time.time()) 28 | tau = t-self.last_updated 29 | rates = [v * rate * tau for v in self.values] 30 | y = np.random.poisson(rates) 31 | return y,t 32 | 33 | def incr(self,bin): 34 | """ 35 | on an event, update the sorted set and the normalizing constant 36 | """ 37 | r.zincrby(self.k, bin) 38 | a = r.incr(self.k+"_z") 39 | if a == 1: 40 | # this catches the situtation where we've never seen the 41 | # the key before, setting t to the time of the initial write 42 | r.set(self.k+'_t', int(time.time())) 43 | 44 | def __str__(self): 45 | return str(dict(zip(self.keys,self.values))) 46 | 47 | def decrement(self): 48 | # check this distribution exists to decrement 49 | if not r.exists(self.k): 50 | raise KeyError('Cannot find distribution in Redis') 51 | # get the currently stored data 52 | self.keys, self.values = zip(*r.zrevrange(self.k,0,-1,withscores=True)) 53 | self.z = r.get(self.k+"_z") 54 | self.n = len(self.values) 55 | self.last_updated = int(r.get(self.k+"_t")) 56 | # get the amount to decay by 57 | y,t = self.decay() 58 | # decay values by y 59 | self.values -= y 60 | self.values[self.values <= 0] = 1 61 | # normalizing constant 62 | self.z = int(self.values.sum()) 63 | # build multi call 64 | pipeline = r.pipeline() 65 | pipeline.watch(self.k, self.k+'_t', self.k+'_z') 66 | pipeline.multi() 67 | pipeline.zadd(self.k, *interleave_izip(self.values, self.keys)) 68 | pipeline.set(self.k+'_t', t) 69 | pipeline.set(self.k+'_z', self.z) 70 | try: 71 | # try to excute 72 | pipeline.execute() 73 | except redis.WatchError: 74 | pass 75 | 76 | def get_dist(self): 77 | self.decrement() 78 | normalised = dict([(k, v/self.z) for k,v in zip(self.keys, self.values)]) 79 | return normalised 80 | 81 | def get_bin(self, bin): 82 | self.decrement() 83 | try: 84 | out = self.values[self.keys.index(bin)] / self.z 85 | except ValueError: 86 | raise ValueError('bin not in distribution') 87 | return out 88 | 89 | -------------------------------------------------------------------------------- /pyforget/forget_table.py: -------------------------------------------------------------------------------- 1 | import tornado.options 2 | import tornado.web 3 | import tornado.httpserver 4 | import tornado.ioloop 5 | from distribution import Distribution 6 | 7 | class Application(tornado.web.Application): 8 | def __init__(self): 9 | 10 | app_settings = { 11 | 'debug': True, 12 | "autoescape" : None, 13 | } 14 | 15 | handlers = [ 16 | (r"/ping$", PingHandler), 17 | (r"/incr$", IncrHandler), 18 | (r"/get$", GetHandler), 19 | (r"/dist$", DistHandler), 20 | ] 21 | 
        tornado.web.Application.__init__(self, handlers, **app_settings)
22 | 
23 | class PingHandler(tornado.web.RequestHandler):
24 |     def get(self):
25 |         self.finish('OK')
26 |     def head(self):
27 |         self.finish('OK')
28 | 
29 | class IncrHandler(tornado.web.RequestHandler):
30 |     def get(self):
31 |         key = self.get_argument('key')
32 |         bin = self.get_argument('bin')
33 |         Distribution(key).incr(bin)
34 | 
35 | class GetHandler(tornado.web.RequestHandler):
36 |     def get(self):
37 |         key = self.get_argument('key')
38 |         bin = self.get_argument('bin')
39 |         try:
40 |             self.finish({
41 |                 "status_code": 200,
42 |                 "data": [{
43 |                     "bin": bin,
44 |                     "probability": Distribution(key).get_bin(bin)
45 |                 }]
46 |             })
47 |         except ValueError:
48 |             self.finish({
49 |                 "status_code": 404,
50 |                 "data": [],
51 |                 "error_message": "Could not find bin in distribution"
52 |             })
53 |         except KeyError:
54 |             self.finish({
55 |                 "status_code": 404,
56 |                 "data": [],
57 |                 "error_message": "Could not find distribution in Forget Table"
58 |             })
59 | 
60 | class DistHandler(tornado.web.RequestHandler):
61 |     def get(self):
62 |         key = self.get_argument('key')
63 |         try:
64 |             dist = Distribution(key).get_dist()
65 |         except KeyError:
66 |             return self.finish({
67 |                 "status_code": 404,
68 |                 "data": [],
69 |                 "error_message": "Could not find distribution in Forget Table"
70 |             })
71 |         return self.finish({
72 |             "status_code": 200,
73 |             "data": [{
74 |                 "bin": bin,
75 |                 "probability": probability
76 |             } for bin, probability in dist.iteritems()]
77 |         })
78 | 
79 | if __name__ == "__main__":
80 |     tornado.options.define("port", default=8000, help="Listen on port", type=int)
81 |     tornado.options.parse_command_line()
82 |     http_server = tornado.httpserver.HTTPServer(request_callback=Application())
83 |     http_server.listen(tornado.options.options.port, address="0.0.0.0")
84 |     tornado.ioloop.IOLoop.instance().start()
85 | 
--------------------------------------------------------------------------------
/pyforget/readme.md:
--------------------------------------------------------------------------------
1 | # This is the Python implementation of Forget Table
2 | 
3 | Written by [Mike Dewar](http://twitter.com/mikedewar) and [Micha Gorelick](http://micha.gd/).
4 | 
5 | To start the service run `python forget_table.py --port=8080`, which will start the wrapper. Note that you will need a Redis database running locally on port 6379; Forget Table will write into db 2 by default.
6 | 
7 | Upon receiving an event, to increment a bin in a distribution call
8 | 
9 |     localhost:8080/incr?key=colours&bin=red
10 | 
11 | where `colours` is the name of the distribution and `red` is the name of the bin you want to increment.
12 | The distribution and bin will be created if they don't already exist.
13 | 
14 | To query the whole distribution call
15 | 
16 |     localhost:8080/dist?key=colours
17 | 
18 | This will return a JSON blob with the normalised distribution in it. To query a specific bin of a distribution call
19 | 
20 |     localhost:8080/get?key=colours&bin=blue
21 | 
22 | 
--------------------------------------------------------------------------------
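Both implementations forget the same way: on every read they subtract a Poisson-distributed number of counts whose rate grows with the time elapsed since the distribution was last updated. `goforget` draws one Poisson(rate·Δt) deduction per bin (`decay.go`), while `pyforget` scales the rate by each bin's current count (`distribution.py`). The sketch below is a hypothetical, self-contained Go illustration of the `goforget`-style rule — not code from this repo — with a simplified Poisson sampler (no iteration cap) mirroring the inverse-CDF loop in `decay.go`.

```go
package main

import (
	"fmt"
	"math"
	"math/rand"
	"time"
)

// samplePoisson draws from Poisson(lambda) by inverting the CDF, the same
// approach decay.go takes (minus its MAX_ITER safety cap).
func samplePoisson(lambda float64) int {
	if lambda <= 0 {
		return 0
	}
	p := math.Exp(-lambda)
	cdf := p
	r := rand.Float64()
	k := 0
	for cdf < r {
		k++
		p *= lambda / float64(k)
		cdf += p
	}
	return k
}

// decayCounts applies the goforget-style rule: every bin independently loses
// Poisson(rate * elapsed seconds) counts, floored at zero.
func decayCounts(counts map[string]int, rate float64, elapsed time.Duration) {
	lambda := rate * elapsed.Seconds()
	for bin, c := range counts {
		loss := samplePoisson(lambda)
		if loss > c {
			loss = c
		}
		counts[bin] = c - loss
	}
}

func main() {
	rand.Seed(time.Now().UnixNano())
	counts := map[string]int{"red": 40, "blue": 10, "green": 1}

	// Pretend 30 seconds have passed since the last update, with rate 0.5:
	// each bin loses about 15 counts on average, so rarely-seen bins vanish
	// while popular ones merely shrink.
	decayCounts(counts, 0.5, 30*time.Second)
	fmt.Println(counts)
}
```

This forgetting rule is also what lets whole distributions disappear: `goforget` puts a TTL on each distribution's Redis keys sized so that, with confidence controlled by `-expire-sigma`, the counts should have decayed to nothing before the keys expire (see `UpdateDistribution` in `redis_utils.go`).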