├── .whitesource ├── 1091025177.pdf ├── README.md ├── regress.go └── sample-data ├── 001.csv └── 002.csv /.whitesource: -------------------------------------------------------------------------------- 1 | { 2 | "settingsInheritedFrom": "VividCortex/whitesource-config@master" 3 | } -------------------------------------------------------------------------------- /1091025177.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VividCortex/wlr/ee45d4a83b0f58593364be33c03d5158029e5b71/1091025177.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | wlr 2 | === 3 | 4 | This repository holds samples of code and data for our weighted linear regression technique. A white paper explaining the technology is available at https://www.vividcortex.com/resources/query-regression-white-paper/ 5 | -------------------------------------------------------------------------------- /regress.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | /* 4 | Copyright (c) 2014 VividCortex, Inc. All rights reserved. 5 | Certain inventions disclosed in this program may be claimed within 6 | patents owned or patent applications filed by VividCortex, Inc. 
// */ // closes the license comment that opens at the end of the preceding dump line

import (
	"encoding/csv"
	"fmt"
	"io"
	"log"
	"math"
	"os"
	"sort"
	"strconv"
)

// csvFloater reads CSV data whose first record is a header row and whose
// remaining records are numeric. Each data record is returned as a map from
// column name to parsed value.
type csvFloater struct {
	headers []string // column names from the first record; nil until it is read
	*csv.Reader
}

// newCsvFloater returns a csvFloater that reads CSV records from r.
func newCsvFloater(r io.Reader) *csvFloater {
	return &csvFloater{Reader: csv.NewReader(r)}
}

// Read returns the next data record as a column-name -> value map. The header
// record is consumed on the first call. Errors from the underlying csv.Reader
// (including io.EOF at end of input) and from strconv.ParseFloat are returned
// unchanged, so callers can still compare against io.EOF.
func (r *csvFloater) Read() (map[string]float64, error) {
	if r.headers == nil {
		headers, err := r.Reader.Read()
		if err != nil {
			return nil, err
		}
		r.headers = headers
	}

	line, err := r.Reader.Read()
	if err != nil {
		return nil, err
	}
	// csv.Reader enforces a constant field count by default, but the embedded
	// Reader is reachable by callers who may relax that; never index past the
	// header slice.
	if len(line) > len(r.headers) {
		return nil, fmt.Errorf("csv record has %d fields but header has %d", len(line), len(r.headers))
	}

	row := make(map[string]float64, len(line))
	for i, str := range line {
		f, err := strconv.ParseFloat(str, 64)
		if err != nil {
			return nil, err
		}
		row[r.headers[i]] = f
	}
	return row, nil
}

// MultiSimple holds one simple linear regression per named input variable.
// The zero value is ready to use.
type MultiSimple struct {
	Vars map[string]*Simple
}

// Add trains the model with one observation. It computes a single slope
// yvalue / sum(xvalues) for the row and then, for every variable whose x is
// positive, feeds the point (x, slope*x) into that variable's regression.
// Rows with yvalue == 0 or with x values summing to 0 are ignored. When
// verbose is true a TRAIN line is printed for every point that is added.
func (r *MultiSimple) Add(xvalues map[string]float64, yvalue float64, verbose bool) {
	if yvalue == 0.0 {
		return
	}

	sum := 0.0
	for _, xvalue := range xvalues {
		sum += xvalue
	}
	if sum == 0.0 {
		return
	}
	slope := yvalue / sum

	// Writing to a nil map panics; initialise lazily so the zero value works.
	if r.Vars == nil {
		r.Vars = make(map[string]*Simple)
	}

	for name, xValue := range xvalues {
		if xValue <= 0 {
			continue
		}
		v, present := r.Vars[name]
		if !present {
			v = &Simple{}
			r.Vars[name] = v
		}
		if verbose {
			fmt.Printf("TRAIN %s %.5g %.5g %.5g %.5g\n", name, xValue, yvalue, slope, slope*xValue)
		}
		v.Add(xValue, slope*xValue)
	}
}

// Predict returns the sum of intercept + x*slope over every variable in
// xvalues that has a positive x, has been trained, and has a positive fitted
// slope. A negative intercept is treated as 0.
func (r *MultiSimple) Predict(xvalues map[string]float64) float64 {
	result := 0.0
	for name, xValue := range xvalues {
		if xValue <= 0 {
			continue
		}
		v, present := r.Vars[name]
		if !present {
			continue
		}
		slope, intercept := v.Slope(), v.Intercept()
		if slope > 0 {
			if intercept < 0 {
				intercept = 0
			}
			result += intercept + xValue*slope
		}
	}
	return result
}

// Simple accumulates the running sums needed for an ordinary least-squares
// fit of y on x: the point count and the sums of x, y, x*x, x*y and y*y.
type Simple struct {
	n, sx, sy, sxx, sxy, syy float64
}

// Add records the point (x, y).
func (r *Simple) Add(x, y float64) {
	r.n++
	r.sx += x
	r.sy += y
	r.sxx += x * x
	r.sxy += x * y
	r.syy += y * y
}

// Count returns the number of points added so far.
func (r *Simple) Count() float64 {
	return r.n
}

// scaledSums returns the corrected sums of squares and cross products, each
// multiplied by n: n*SSxx, n*SSxy and n*SSyy. The common factor cancels in
// ratios such as the slope and R^2, but must be divided out wherever one of
// these sums is used on its own.
func (r *Simple) scaledSums() (ssXX, ssXY, ssYY float64) {
	return r.n*r.sxx - r.sx*r.sx,
		r.n*r.sxy - r.sx*r.sy,
		r.n*r.syy - r.sy*r.sy
}

// Slope returns the fitted slope. With a single point it returns sy/sx, the
// slope of the line through the origin. It returns 0 when the slope is
// undefined: no points, or no variation in x.
func (r *Simple) Slope() float64 {
	switch {
	case r.n == 0:
		return 0
	case r.n == 1:
		if r.sx == 0 {
			return 0
		}
		return r.sy / r.sx
	}
	ssXX, ssXY, _ := r.scaledSums()
	if ssXX == 0 {
		return 0
	}
	return ssXY / ssXX
}

// Intercept returns the fitted intercept, or 0 with fewer than two points.
func (r *Simple) Intercept() float64 {
	if r.n < 2 {
		return 0
	}
	return (r.sy - r.Slope()*r.sx) / r.n
}

// Rsq returns the coefficient of determination. It returns 0 with fewer than
// two points or when x or y has no variation.
func (r *Simple) Rsq() float64 {
	if r.n < 2 {
		return 0
	}
	ssXX, ssXY, ssYY := r.scaledSums()
	if ssXX == 0 || ssYY == 0 {
		return 0
	}
	return ssXY * ssXY / ssXX / ssYY
}

// residualStd returns s = sqrt(RSS / (n-2)), the residual standard deviation
// of the fit. Callers must ensure n > 2 and that x has variation.
func (r *Simple) residualStd() float64 {
	ssXX, ssXY, ssYY := r.scaledSums()
	// Divide the factor of n back out to obtain the true residual sum of squares.
	rss := (ssYY - ssXY*ssXY/ssXX) / r.n
	// Rounding can push a perfect fit slightly below zero; clamp before Sqrt.
	return math.Sqrt(math.Max(rss, 0) / (r.n - 2))
}

// SlopeStderr returns the standard error of the slope, s / sqrt(SSxx). It
// returns 0 with two or fewer points or when x has no variation.
func (r *Simple) SlopeStderr() float64 {
	if r.n <= 2 {
		return 0
	}
	ssXX, _, _ := r.scaledSums()
	if ssXX == 0 {
		return 0
	}
	return r.residualStd() / math.Sqrt(ssXX/r.n)
}

// InterceptStderr returns the standard error of the intercept,
// s * sqrt(1/n + mean(x)^2 / SSxx). It returns 0 with two or fewer points or
// when x has no variation.
func (r *Simple) InterceptStderr() float64 {
	if r.n <= 2 {
		return 0
	}
	ssXX, _, _ := r.scaledSums()
	if ssXX == 0 {
		return 0
	}
	meanX := r.sx / r.n
	// ssXX is n*SSxx, so mean(x)^2/SSxx is meanX*meanX*n/ssXX.
	return r.residualStd() * math.Sqrt(1.0/r.n+meanX*meanX*r.n/ssXX)
}

// tStat returns the t-statistic estimate/stderr. It returns 0 when stderr is
// 0, which is how Simple reports an undefined standard error.
func tStat(estimate, stderr float64) float64 {
	if stderr == 0 {
		return 0
	}
	return estimate / stderr
}

// eachRow calls fn for every data record read from r and stops at end of
// input. Any error other than io.EOF is returned.
func eachRow(r io.Reader, fn func(row map[string]float64)) error {
	c := newCsvFloater(r)
	for {
		row, err := c.Read()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
		fn(row)
	}
}

// printResults prints the per-variable regression table, sorted by variable
// name so the output is stable between runs, followed by the summary of the
// actual-versus-predicted fit.
func printResults(ms *MultiSimple, fit *Simple, mape float64) {
	names := make([]string, 0, len(ms.Vars))
	for name := range ms.Vars {
		names = append(names, name)
	}
	sort.Strings(names)

	fmt.Println()
	fmt.Println("================== RESULTS: VARIABLES ====================")
	fmt.Println()
	fmt.Println("variable count R^2 slope (t-stat) intercept (t-stat)")
	for _, name := range names {
		v := ms.Vars[name]
		fmt.Printf("%-20s %5.f %4.2f %9.3g %9.3g %9.3g %9.3g\n", name, v.Count(), v.Rsq(),
			v.Slope(), tStat(v.Slope(), v.SlopeStderr()),
			v.Intercept(), math.Abs(tStat(v.Intercept(), v.InterceptStderr())))
	}

	fmt.Println()
	fmt.Println("================= RESULTS: ACTUAL-VS-PRED ===================")
	fmt.Printf("Slope: %.2g T-stat: %.2g Intercept: %.2g T-stat: %.2g R^2 %.2g MAPE: %.2g\n",
		fit.Slope(), tStat(fit.Slope(), fit.SlopeStderr()),
		fit.Intercept(), math.Abs(tStat(fit.Intercept(), fit.InterceptStderr())),
		fit.Rsq(), mape)
}

// run trains a MultiSimple model on the CSV file train, predicts column yvar
// for every row of the CSV file predict, and prints the training trace, the
// predictions and the result tables to standard output. Returning errors
// instead of exiting lets the deferred Close calls run.
func run(train, predict, yvar string) error {
	tfh, err := os.Open(train)
	if err != nil {
		return err
	}
	defer tfh.Close()
	pfh, err := os.Open(predict)
	if err != nil {
		return err
	}
	defer pfh.Close()

	ms := MultiSimple{Vars: map[string]*Simple{}}

	// train
	fmt.Println("TRAIN name xValue yValue slope contrib")
	err = eachRow(tfh, func(row map[string]float64) {
		yvalue := row[yvar]
		delete(row, yvar)
		ms.Add(row, yvalue, true)
	})
	if err != nil {
		return err
	}

	// predict
	var (
		fit               Simple // regression of predicted on actual values
		totalError, count float64
	)
	fmt.Println("PREDICT actual predicted")
	err = eachRow(pfh, func(row map[string]float64) {
		yvalue := row[yvar]
		if yvalue == 0 {
			// The relative error is undefined for a zero actual value.
			return
		}
		delete(row, yvar)
		pred := ms.Predict(row)
		count++
		totalError += math.Abs((yvalue - pred) / yvalue)
		fit.Add(yvalue, pred)
		fmt.Printf("PREDICT %.5g %.5g\n", yvalue, pred)
	})
	if err != nil {
		return err
	}

	printResults(&ms, &fit, totalError/count)
	return nil
}

// main parses the command line: the first argument is the training CSV and
// the optional second argument is the CSV to predict (defaulting to the
// training file). The predicted column is "user_us".
func main() {
	if len(os.Args) < 2 || len(os.Args) > 3 {
		log.Fatalln("Usage: go run regress.go <train.csv> [<predict.csv>]")
	}
	train := os.Args[1]
	predict := train
	if len(os.Args) == 3 {
		predict = os.Args[2]
	}

	if err := run(train, predict, "user_us"); err != nil {
		log.Fatalln(err)
	}
}