├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── crosses.go ├── crosses_test.go ├── examples ├── DetroitDataDoc.txt ├── chevy-mechanics.csv └── murders-poverty.txt ├── regression.go └── regression_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .DS_Store 3 | # IDEA config files 4 | .idea -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: go 3 | go: 4 | - 1.8.x 5 | - 1.9.x 6 | - 1.10.x 7 | - tip 8 | 9 | notifications: 10 | email: 11 | - infra@sajari.com 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Sajari Pty Ltd 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | regression 2 | ======= 3 | [![GoDoc](https://godoc.org/github.com/sajari/regression?status.svg)](https://godoc.org/github.com/sajari/regression) 4 | [![Go Report Card](https://goreportcard.com/badge/sajari/regression)](https://goreportcard.com/report/sajari/regression) 5 | [![Build Status](https://travis-ci.org/sajari/regression.svg?branch=master)](https://travis-ci.org/sajari/regression) 6 | [![License][license-image]][license-url] 7 | 8 | [license-image]: http://img.shields.io/badge/license-MIT-green.svg?style=flat-square 9 | [license-url]: LICENSE.txt 10 | 11 | Multivariable Linear Regression in Go (golang) 12 | 13 | installation 14 | ------------ 15 | 16 | $ go get github.com/sajari/regression 17 | 18 | Supports Go 1.8+ 19 | 20 | example usage 21 | ------------- 22 | 23 | Import the package, create a regression and add data to it. You can use as many variables as you like, in the below example there are 3 variables for each observation. 
24 | 25 | ```go 26 | package main 27 | 28 | import ( 29 | "fmt" 30 | 31 | "github.com/sajari/regression" 32 | ) 33 | 34 | func main() { 35 | r := new(regression.Regression) 36 | r.SetObserved("Murders per annum per 1,000,000 inhabitants") 37 | r.SetVar(0, "Inhabitants") 38 | r.SetVar(1, "Percent with incomes below $5000") 39 | r.SetVar(2, "Percent unemployed") 40 | r.Train( 41 | regression.DataPoint(11.2, []float64{587000, 16.5, 6.2}), 42 | regression.DataPoint(13.4, []float64{643000, 20.5, 6.4}), 43 | regression.DataPoint(40.7, []float64{635000, 26.3, 9.3}), 44 | regression.DataPoint(5.3, []float64{692000, 16.5, 5.3}), 45 | regression.DataPoint(24.8, []float64{1248000, 19.2, 7.3}), 46 | regression.DataPoint(12.7, []float64{643000, 16.5, 5.9}), 47 | regression.DataPoint(20.9, []float64{1964000, 20.2, 6.4}), 48 | regression.DataPoint(35.7, []float64{1531000, 21.3, 7.6}), 49 | regression.DataPoint(8.7, []float64{713000, 17.2, 4.9}), 50 | regression.DataPoint(9.6, []float64{749000, 14.3, 6.4}), 51 | regression.DataPoint(14.5, []float64{7895000, 18.1, 6}), 52 | regression.DataPoint(26.9, []float64{762000, 23.1, 7.4}), 53 | regression.DataPoint(15.7, []float64{2793000, 19.1, 5.8}), 54 | regression.DataPoint(36.2, []float64{741000, 24.7, 8.6}), 55 | regression.DataPoint(18.1, []float64{625000, 18.6, 6.5}), 56 | regression.DataPoint(28.9, []float64{854000, 24.9, 8.3}), 57 | regression.DataPoint(14.9, []float64{716000, 17.9, 6.7}), 58 | regression.DataPoint(25.8, []float64{921000, 22.4, 8.6}), 59 | regression.DataPoint(21.7, []float64{595000, 20.2, 8.4}), 60 | regression.DataPoint(25.7, []float64{3353000, 16.9, 6.7}), 61 | ) 62 | r.Run() 63 | 64 | fmt.Printf("Regression formula:\n%v\n", r.Formula) 65 | fmt.Printf("Regression:\n%s\n", r) 66 | } 67 | ``` 68 | 69 | Note: You can also add data points one by one. 70 | 71 | Once calculated you can print the data, look at the R^2, Variance, residuals, etc. You can also access the coefficients directly to use elsewhere, e.g. 
72 | 73 | ```go 74 | // Get the coefficient for the "Inhabitants" variable 0: 75 | c := r.Coeff(0) 76 | ``` 77 | 78 | You can also use the model to predict new data points 79 | 80 | ```go 81 | prediction, err := r.Predict([]float64{587000, 16.5, 6.2}) 82 | ``` 83 | 84 | Feature crosses are supported so your model can capture fixed non-linear relationships 85 | 86 | ```go 87 | 88 | r.Train( 89 | regression.DataPoint(11.2, []float64{587000, 16.5, 6.2}), 90 | ) 91 | //Add a new feature which is the first variable (index 0) to the power of 2 92 | r.AddCross(PowCross(0, 2)) 93 | r.Run() 94 | 95 | ``` 96 | -------------------------------------------------------------------------------- /crosses.go: -------------------------------------------------------------------------------- 1 | package regression 2 | 3 | import ( 4 | "math" 5 | "strconv" 6 | ) 7 | 8 | type featureCross interface { 9 | Calculate([]float64) []float64 //must return the same number of features each run 10 | ExtendNames(map[int]string, int) int 11 | } 12 | 13 | type functionalCross struct { 14 | functionName string 15 | boundVars []int 16 | crossFn func([]float64) []float64 17 | } 18 | 19 | func (c *functionalCross) Calculate(input []float64) []float64 { 20 | return c.crossFn(input) 21 | } 22 | 23 | func (c *functionalCross) ExtendNames(input map[int]string, initialSize int) int { 24 | for i, varIndex := range c.boundVars { 25 | if input[varIndex] != "" { 26 | input[initialSize+i] = "(" + input[varIndex] + ")" + c.functionName 27 | } 28 | } 29 | return len(c.boundVars) 30 | } 31 | 32 | // Feature cross based on computing the power of an input. 
33 | func PowCross(i int, power float64) featureCross { 34 | return &functionalCross{ 35 | functionName: "^" + strconv.FormatFloat(power, 'f', -1, 64), 36 | boundVars: []int{i}, 37 | crossFn: func(vars []float64) []float64 { 38 | 39 | return []float64{math.Pow(vars[i], power)} 40 | }, 41 | } 42 | } 43 | 44 | // Feature cross based on the multiplication of multiple inputs. 45 | func MultiplierCross(vars ...int) featureCross { 46 | name := "" 47 | for i, v := range vars { 48 | name += strconv.Itoa(v) 49 | if i < (len(vars) - 1) { 50 | name += "*" 51 | } 52 | } 53 | 54 | return &functionalCross{ 55 | functionName: name, 56 | boundVars: vars, 57 | crossFn: func(input []float64) []float64 { 58 | var output float64 = 1 59 | for _, variableIndex := range vars { 60 | output *= input[variableIndex] 61 | } 62 | return []float64{output} 63 | }, 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /crosses_test.go: -------------------------------------------------------------------------------- 1 | package regression 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestPowCrosses(t *testing.T) { 8 | cross1 := PowCross(0, 2) //cross of the variable at index 0 9 | if cross1.Calculate([]float64{2})[0] != 4 { 10 | t.Error("Incorrect value") 11 | } 12 | 13 | cross2 := PowCross(1, 2) //cross of the variable at index 1 14 | if cross2.Calculate([]float64{2, -3})[0] != 9 { 15 | t.Error("Incorrect value, got", cross2.Calculate([]float64{2, -3})) 16 | } 17 | } 18 | 19 | func TestMultiplicationCrosses(t *testing.T) { 20 | cross1 := MultiplierCross(0, 1, 3) 21 | if cross1.Calculate([]float64{2, 3, 4, 5})[0] != 30 { 22 | t.Errorf("Incorrect value, expected 30 got %.2f", cross1.Calculate([]float64{2, 3, 4, 5})[0]) 23 | } 24 | 25 | cross2 := MultiplierCross(0, 1) 26 | if cross2.Calculate([]float64{2, 3})[0] != 6 { 27 | t.Errorf("Incorrect value, expected 6 got %.2f", cross1.Calculate([]float64{2, 3, 4, 5})[0]) 28 | } 29 | } 30 | 31 | func 
TestFunctionalCrossExtendNames(t *testing.T) { 32 | varNames := map[int]string{1: "Number of cars", 0: "fgsd"} 33 | cross := PowCross(1, 2) //cross of the variable at index 0 34 | newVars := cross.ExtendNames(varNames, len(varNames)) 35 | 36 | if len(varNames) != 3 { 37 | t.Error("Expected another name") 38 | } 39 | if varNames[2] != "(Number of cars)^2" { 40 | t.Error("Expected '(Number of cars)^2'") 41 | } 42 | if newVars != 1 { 43 | t.Error("Expected 1 new var") 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /examples/DetroitDataDoc.txt: -------------------------------------------------------------------------------- 1 | This is the data set called `DETROIT' in the book `Subset selection in 2 | regression' by Alan J. Miller published in the Chapman & Hall series of 3 | monographs on Statistics & Applied Probability, no. 40. The data are 4 | unusual in that a subset of three predictors can be found which gives a 5 | very much better fit to the data than the subsets found from the Efroymson 6 | stepwise algorithm, or from forward selection or backward elimination. 7 | 8 | The original data were given in appendix A of `Regression analysis and its 9 | application: A data-oriented approach' by Gunst & Mason, Statistics 10 | textbooks and monographs no. 24, Marcel Dekker. It has caused problems 11 | because some copies of the Gunst & Mason book do not contain all of the data, 12 | and because Miller does not say which variables he used as predictors and 13 | which is the dependent variable. (HOM was the dependent variable, and the 14 | predictors were FTP ... WE) 15 | 16 | The data were collected by J.C. Fisher and used in his paper: "Homicide in 17 | Detroit: The Role of Firearms", Criminology, vol.14, 387-400 (1976) 18 | 19 | The data are on the homicide rate in Detroit for the years 1961-1973. 
20 | FTP - Full-time police per 100,000 population 21 | UEMP - % unemployed in the population 22 | MAN - number of manufacturing workers in thousands 23 | LIC - Number of handgun licences per 100,000 population 24 | GR - Number of handgun registrations per 100,000 population 25 | CLEAR - % homicides cleared by arrests 26 | WM - Number of white males in the population 27 | NMAN - Number of non-manufacturing workers in thousands 28 | GOV - Number of government workers in thousands 29 | HE - Average hourly earnings 30 | WE - Average weekly earnings 31 | 32 | HOM - Number of homicides per 100,000 of population 33 | ACC - Death rate in accidents per 100,000 population 34 | ASR - Number of assaults per 100,000 population 35 | 36 | N.B. Each case takes two lines. 37 | 38 | FTP UEMP MAN LIC GR CLEAR WM NMAN GOV HE 39 | WE HOM ACC ASR 40 | 260.35 11.0 455.5 178.15 215.98 93.4 558724. 538.1 133.9 2.98 41 | 117.18 8.60 39.17 306.18 42 | 269.80 7.0 480.2 156.41 180.48 88.5 538584. 547.6 137.6 3.09 43 | 134.02 8.90 40.27 315.16 44 | 272.04 5.2 506.1 198.02 209.57 94.4 519171. 562.8 143.6 3.23 45 | 141.68 8.52 45.31 277.53 46 | 272.96 4.3 535.8 222.10 231.67 92.0 500457. 591.0 150.3 3.33 47 | 147.98 8.89 49.51 234.07 48 | 272.51 3.5 576.0 301.92 297.65 91.0 482418. 626.1 164.3 3.46 49 | 159.85 13.07 55.05 230.84 50 | 261.34 3.2 601.7 391.22 367.62 87.4 465029. 659.8 179.5 3.60 51 | 157.19 14.57 53.90 217.99 52 | 268.89 4.1 577.3 665.56 616.54 88.3 448267. 686.2 187.5 3.73 53 | 155.29 21.36 50.62 286.11 54 | 295.99 3.9 596.9 1131.21 1029.75 86.1 432109. 699.6 195.4 2.91 55 | 131.75 28.03 51.47 291.59 56 | 319.87 3.6 613.5 837.60 786.23 79.0 416533. 729.9 210.3 4.25 57 | 178.74 31.49 49.16 320.39 58 | 341.43 7.1 569.3 794.90 713.77 73.9 401518. 757.8 223.8 4.47 59 | 178.30 37.39 45.80 323.03 60 | 356.59 8.4 548.8 817.74 750.43 63.4 387046. 755.3 227.7 5.04 61 | 209.54 46.26 44.54 357.38 62 | 376.69 7.7 563.4 583.17 1027.38 62.5 373095. 
787.0 230.9 5.47 63 | 240.05 47.24 41.03 422.07 64 | 390.19 6.3 609.3 709.59 666.50 58.9 359647. 819.8 230.2 5.76 65 | 258.05 52.33 44.17 473.01 66 | 67 | -------------------------------------------------------------------------------- /examples/chevy-mechanics.csv: -------------------------------------------------------------------------------- 1 | Job perf,Mech Apt,Consc 2 | 2,45,20 3 | 1,38,30 4 | 3,50,30 5 | 2,48,28 6 | 3,55,30 7 | 3,53,34 8 | 4,55,36 9 | 4,58,32 10 | 3,40,34 11 | 5,55,38 12 | 3,48,28 13 | 3,45,30 14 | 2,55,36 15 | 4,60,34 16 | 5,60,38 17 | 5,60,42 18 | 5,65,38 19 | 4,50,34 20 | 3,58,38 -------------------------------------------------------------------------------- /examples/murders-poverty.txt: -------------------------------------------------------------------------------- 1 | # x08.txt 2 | # 3 | # Reference: 4 | # 5 | # Helmut Spaeth, 6 | # Mathematical Algorithms for Linear Regression, 7 | # Academic Press, 1991, 8 | # ISBN 0-12-656460-4. 9 | # 10 | # D G Kleinbaum and L L Kupper, 11 | # Applied Regression Analysis and Other Multivariable Methods, 12 | # Duxbury Press, 1978, page 150. 13 | # 14 | # Discussion: 15 | # 16 | # Measurements were made of poverty, unemployment, and murder rates. 17 | # 18 | # There are 20 rows of data. The data include: 19 | # 20 | # I, the index, 21 | # A1, the inhabitants, 22 | # A2, the percentage of families incomes below $5000, 23 | # A3, the percentage unemployed, 24 | # B, the number of murders per 1,000,000 inhabitants per annum. 
25 | # 26 | # We seek a model of the form: 27 | # 28 | # B = A1 * X1 + A2 * X2 + A3 * X3 29 | # 30 | 5 columns 31 | 20 rows 32 | Index 33 | Inhabitants 34 | Percent with incomes below $5000 35 | Percent unemployed 36 | Murders per annum per 1,000,000 inhabitants 37 | 1 587000 16.5 6.2 11.2 38 | 2 643000 20.5 6.4 13.4 39 | 3 635000 26.3 9.3 40.7 40 | 4 692000 16.5 5.3 5.3 41 | 5 1248000 19.2 7.3 24.8 42 | 6 643000 16.5 5.9 12.7 43 | 7 1964000 20.2 6.4 20.9 44 | 8 1531000 21.3 7.6 35.7 45 | 9 713000 17.2 4.9 8.7 46 | 10 749000 14.3 6.4 9.6 47 | 11 7895000 18.1 6.0 14.5 48 | 12 762000 23.1 7.4 26.9 49 | 13 2793000 19.1 5.8 15.7 50 | 14 741000 24.7 8.6 36.2 51 | 15 625000 18.6 6.5 18.1 52 | 16 854000 24.9 8.3 28.9 53 | 17 716000 17.9 6.7 14.9 54 | 18 921000 22.4 8.6 25.8 55 | 19 595000 20.2 8.4 21.7 56 | 20 3353000 16.9 6.7 25.7 57 | 58 | -------------------------------------------------------------------------------- /regression.go: -------------------------------------------------------------------------------- 1 | package regression 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "math" 7 | "strconv" 8 | "strings" 9 | 10 | "gonum.org/v1/gonum/mat" 11 | ) 12 | 13 | var ( 14 | // ErrNotEnoughData signals that there weren't enough datapoint to train the model. 15 | ErrNotEnoughData = errors.New("not enough data points") 16 | // ErrTooManyVars signals that there are too many variables for the number of observations being made. 17 | ErrTooManyVars = errors.New("not enough observations to support this many variables") 18 | // ErrRegressionRun signals that the Run method has already been called on the trained dataset. 19 | ErrRegressionRun = errors.New("regression has already been run") 20 | ) 21 | 22 | // Regression is the exposed data structure for interacting with the API. 
23 | type Regression struct { 24 | names describe 25 | data []*dataPoint 26 | coeff map[int]float64 27 | R2 float64 28 | Varianceobserved float64 29 | VariancePredicted float64 30 | initialised bool 31 | Formula string 32 | crosses []featureCross 33 | hasRun bool 34 | } 35 | 36 | type dataPoint struct { 37 | Observed float64 38 | Variables []float64 39 | Predicted float64 40 | Error float64 41 | } 42 | 43 | type describe struct { 44 | obs string 45 | vars map[int]string 46 | } 47 | 48 | // DataPoints is a slice of *dataPoint 49 | // This type allows for easier construction of training data points. 50 | type DataPoints []*dataPoint 51 | 52 | // DataPoint creates a well formed *datapoint used for training. 53 | func DataPoint(obs float64, vars []float64) *dataPoint { 54 | return &dataPoint{Observed: obs, Variables: vars} 55 | } 56 | 57 | // Predict updates the "Predicted" value for the inputed features. 58 | func (r *Regression) Predict(vars []float64) (float64, error) { 59 | if !r.initialised { 60 | return 0, ErrNotEnoughData 61 | } 62 | 63 | // apply any features crosses to vars 64 | for _, cross := range r.crosses { 65 | vars = append(vars, cross.Calculate(vars)...) 66 | } 67 | 68 | p := r.Coeff(0) 69 | for j := 1; j < len(r.data[0].Variables)+1; j++ { 70 | p += r.Coeff(j) * vars[j-1] 71 | } 72 | return p, nil 73 | } 74 | 75 | // SetObserved sets the name of the observed value. 76 | func (r *Regression) SetObserved(name string) { 77 | r.names.obs = name 78 | } 79 | 80 | // GetObserved gets the name of the observed value. 81 | func (r *Regression) GetObserved() string { 82 | return r.names.obs 83 | } 84 | 85 | // SetVar sets the name of variable i. 
86 | func (r *Regression) SetVar(i int, name string) { 87 | if len(r.names.vars) == 0 { 88 | r.names.vars = make(map[int]string, 5) 89 | } 90 | r.names.vars[i] = name 91 | } 92 | 93 | // GetVar gets the name of variable i 94 | func (r *Regression) GetVar(i int) string { 95 | x := r.names.vars[i] 96 | if x == "" { 97 | s := []string{"X", strconv.Itoa(i)} 98 | return strings.Join(s, "") 99 | } 100 | return x 101 | } 102 | 103 | // AddCross registers a feature cross to be applied to the data points. 104 | func (r *Regression) AddCross(cross featureCross) { 105 | r.crosses = append(r.crosses, cross) 106 | } 107 | 108 | // Train the regression with some data points. 109 | func (r *Regression) Train(d ...*dataPoint) { 110 | r.data = append(r.data, d...) 111 | if len(r.data) > 2 { 112 | r.initialised = true 113 | } 114 | } 115 | 116 | // Apply any feature crosses, generating new observations and updating the data points, as well as 117 | // populating variable names for the feature crosses. 118 | // this should only be run once, as part of Run(). 119 | func (r *Regression) applyCrosses() { 120 | unusedVariableIndexCursor := len(r.data[0].Variables) 121 | for _, point := range r.data { 122 | for _, cross := range r.crosses { 123 | point.Variables = append(point.Variables, cross.Calculate(point.Variables)...) 124 | } 125 | } 126 | 127 | if len(r.names.vars) == 0 { 128 | r.names.vars = make(map[int]string, 5) 129 | } 130 | for _, cross := range r.crosses { 131 | unusedVariableIndexCursor += cross.ExtendNames(r.names.vars, unusedVariableIndexCursor) 132 | } 133 | } 134 | 135 | // Run determines if there is enough data present to run the regression 136 | // and whether or not the training has already been completed. 137 | // Once the above checks have passed feature crosses are applied if any 138 | // and the model is trained using QR decomposition. 
139 | func (r *Regression) Run() error { 140 | if !r.initialised { 141 | return ErrNotEnoughData 142 | } 143 | if r.hasRun { 144 | return ErrRegressionRun 145 | } 146 | 147 | //apply any features crosses 148 | r.applyCrosses() 149 | r.hasRun = true 150 | 151 | observations := len(r.data) 152 | numOfvars := len(r.data[0].Variables) 153 | 154 | if observations < (numOfvars + 1) { 155 | return ErrTooManyVars 156 | } 157 | 158 | // Create some blank variable space 159 | observed := mat.NewDense(observations, 1, nil) 160 | variables := mat.NewDense(observations, numOfvars+1, nil) 161 | 162 | for i := 0; i < observations; i++ { 163 | observed.Set(i, 0, r.data[i].Observed) 164 | for j := 0; j < numOfvars+1; j++ { 165 | if j == 0 { 166 | variables.Set(i, 0, 1) 167 | } else { 168 | variables.Set(i, j, r.data[i].Variables[j-1]) 169 | } 170 | } 171 | } 172 | 173 | // Now run the regression 174 | _, n := variables.Dims() // cols 175 | qr := new(mat.QR) 176 | qr.Factorize(variables) 177 | q := new(mat.Dense) 178 | reg := new(mat.Dense) 179 | qr.QTo(q) 180 | qr.RTo(reg) 181 | 182 | qtr := q.T() 183 | qty := new(mat.Dense) 184 | qty.Mul(qtr, observed) 185 | 186 | c := make([]float64, n) 187 | for i := n - 1; i >= 0; i-- { 188 | c[i] = qty.At(i, 0) 189 | for j := i + 1; j < n; j++ { 190 | c[i] -= c[j] * reg.At(i, j) 191 | } 192 | c[i] /= reg.At(i, i) 193 | } 194 | 195 | // Output the regression results 196 | r.coeff = make(map[int]float64, numOfvars) 197 | for i, val := range c { 198 | r.coeff[i] = val 199 | if i == 0 { 200 | r.Formula = fmt.Sprintf("Predicted = %.4f", val) 201 | } else { 202 | r.Formula += fmt.Sprintf(" + %v*%.4f", r.GetVar(i-1), val) 203 | } 204 | } 205 | 206 | r.calcPredicted() 207 | r.calcVariance() 208 | r.calcR2() 209 | return nil 210 | } 211 | 212 | // Coeff returns the calculated coefficient for variable i. 
213 | func (r *Regression) Coeff(i int) float64 { 214 | if len(r.coeff) == 0 { 215 | return 0 216 | } 217 | return r.coeff[i] 218 | } 219 | 220 | // GetCoeffs returns the calculated coefficients. The element at index 0 is the offset. 221 | func (r *Regression) GetCoeffs() []float64 { 222 | if len(r.coeff) == 0 { 223 | return nil 224 | } 225 | coeffs := make([]float64, len(r.coeff)) 226 | for i := range coeffs { 227 | coeffs[i] = r.coeff[i] 228 | } 229 | return coeffs 230 | } 231 | 232 | func (r *Regression) calcPredicted() string { 233 | observations := len(r.data) 234 | var predicted float64 235 | var output string 236 | for i := 0; i < observations; i++ { 237 | r.data[i].Predicted, _ = r.Predict(r.data[i].Variables) 238 | r.data[i].Error = r.data[i].Predicted - r.data[i].Observed 239 | 240 | output += fmt.Sprintf("%v. observed = %v, Predicted = %v, Error = %v", i, r.data[i].Observed, predicted, r.data[i].Error) 241 | } 242 | return output 243 | } 244 | 245 | func (r *Regression) calcVariance() string { 246 | observations := len(r.data) 247 | var obtotal, prtotal, obvar, prvar float64 248 | for i := 0; i < observations; i++ { 249 | obtotal += r.data[i].Observed 250 | prtotal += r.data[i].Predicted 251 | } 252 | obaverage := obtotal / float64(observations) 253 | praverage := prtotal / float64(observations) 254 | 255 | for i := 0; i < observations; i++ { 256 | obvar += math.Pow(r.data[i].Observed-obaverage, 2) 257 | prvar += math.Pow(r.data[i].Predicted-praverage, 2) 258 | } 259 | r.Varianceobserved = obvar / float64(observations) 260 | r.VariancePredicted = prvar / float64(observations) 261 | return fmt.Sprintf("N = %v\nVariance observed = %v\nVariance Predicted = %v\n", observations, r.Varianceobserved, r.VariancePredicted) 262 | } 263 | 264 | func (r *Regression) calcR2() string { 265 | r.R2 = r.VariancePredicted / r.Varianceobserved 266 | return fmt.Sprintf("R2 = %.2f", r.R2) 267 | } 268 | 269 | func (r *Regression) calcResiduals() string { 270 | str := 
fmt.Sprintf("Residuals:\nobserved|\tPredicted|\tResidual\n") 271 | for _, d := range r.data { 272 | str += fmt.Sprintf("%.2f|\t%.2f|\t%.2f\n", d.Observed, d.Predicted, d.Observed-d.Predicted) 273 | } 274 | str += "\n" 275 | return str 276 | } 277 | 278 | // String satisfies the stringer interface to display a dataPoint as a string. 279 | func (d *dataPoint) String() string { 280 | str := fmt.Sprintf("%.2f", d.Observed) 281 | for _, v := range d.Variables { 282 | str += fmt.Sprintf("|\t%.2f", v) 283 | } 284 | return str 285 | } 286 | 287 | // String satisfies the stringer interface to display a regression as a string. 288 | func (r *Regression) String() string { 289 | if !r.initialised { 290 | return ErrNotEnoughData.Error() 291 | } 292 | str := fmt.Sprintf("%v", r.GetObserved()) 293 | for i := 0; i < len(r.names.vars); i++ { 294 | str += fmt.Sprintf("|\t%v", r.GetVar(i)) 295 | } 296 | str += "\n" 297 | for _, d := range r.data { 298 | str += fmt.Sprintf("%v\n", d) 299 | } 300 | fmt.Println(r.calcResiduals()) 301 | str += fmt.Sprintf("\nN = %v\nVariance observed = %v\nVariance Predicted = %v", len(r.data), r.Varianceobserved, r.VariancePredicted) 302 | str += fmt.Sprintf("\nR2 = %v\n", r.R2) 303 | return str 304 | } 305 | 306 | // MakeDataPoints makes a `[]*dataPoint` from a `[][]float64`. The expected fomat for the input is a row-major [][]float64. 307 | // That is to say the first slice represents a row, and the second represents the cols. 308 | // Furthermore it is expected that all the col slices are of the same length. 
// The obsIndex parameter indicates which column should be used as the observed value.
6.4}), 23 | DataPoint(35.7, []float64{1531000, 21.3, 7.6}), 24 | DataPoint(8.7, []float64{713000, 17.2, 4.9}), 25 | DataPoint(9.6, []float64{749000, 14.3, 6.4}), 26 | DataPoint(14.5, []float64{7895000, 18.1, 6}), 27 | DataPoint(26.9, []float64{762000, 23.1, 7.4}), 28 | DataPoint(15.7, []float64{2793000, 19.1, 5.8}), 29 | DataPoint(36.2, []float64{741000, 24.7, 8.6}), 30 | DataPoint(18.1, []float64{625000, 18.6, 6.5}), 31 | DataPoint(28.9, []float64{854000, 24.9, 8.3}), 32 | DataPoint(14.9, []float64{716000, 17.9, 6.7}), 33 | DataPoint(25.8, []float64{921000, 22.4, 8.6}), 34 | DataPoint(21.7, []float64{595000, 20.2, 8.4}), 35 | DataPoint(25.7, []float64{3353000, 16.9, 6.7}), 36 | ) 37 | r.Run() 38 | 39 | fmt.Printf("Regression formula:\n%v\n", r.Formula) 40 | fmt.Printf("Regression:\n%s\n", r) 41 | 42 | // All vars are known to positively correlate with the murder rate 43 | for i, c := range r.coeff { 44 | if i == 0 { 45 | // This is the offset and not a coeff 46 | continue 47 | } 48 | if c < 0 { 49 | t.Errorf("Coefficient is negative, but shouldn't be: %.2f", c) 50 | } 51 | } 52 | 53 | // We know this set has an R^2 above 80 54 | if r.R2 < 0.8 { 55 | t.Errorf("R^2 was %.2f, but we expected > 80", r.R2) 56 | } 57 | } 58 | 59 | func TestCrossApply(t *testing.T) { 60 | r := new(Regression) 61 | r.SetObserved("Input-Squared plus Input") 62 | r.SetVar(0, "Input") 63 | r.Train( 64 | DataPoint(6, []float64{2}), 65 | DataPoint(20, []float64{4}), 66 | DataPoint(30, []float64{5}), 67 | DataPoint(72, []float64{8}), 68 | DataPoint(156, []float64{12}), 69 | ) 70 | r.AddCross(PowCross(0, 2)) 71 | r.AddCross(PowCross(0, 7)) 72 | err := r.Run() 73 | if err != nil { 74 | t.Error(err) 75 | } 76 | 77 | fmt.Printf("Regression formula:\n%v\n", r.Formula) 78 | fmt.Printf("Regression:\n%s\n", r) 79 | if r.names.vars[1] != "(Input)^2" { 80 | t.Error("Name incorrect") 81 | } 82 | 83 | for i, c := range r.coeff { 84 | if i == 0 { 85 | // This is the offset and not a coeff 86 | continue 87 | 
} 88 | if c < 0 { 89 | t.Errorf("Coefficient is negative, but shouldn't be: %.2f", c) 90 | } 91 | } 92 | 93 | // We know this set has an R^2 above 80 94 | if r.R2 < 0.8 { 95 | t.Errorf("R^2 was %.2f, but we expected > 80", r.R2) 96 | } 97 | 98 | // Test that predict uses the cross as well 99 | val, err := r.Predict([]float64{6}) 100 | if err != nil { 101 | t.Error(err) 102 | } 103 | if val <= 41.999 && val >= 42.001 { 104 | t.Errorf("Expected 42, got %.2f", val) 105 | } 106 | } 107 | 108 | func TestMakeDataPoints(t *testing.T) { 109 | a := [][]float64{ 110 | {1, 2, 3, 4}, 111 | {2, 2, 3, 4}, 112 | {3, 2, 3, 4}, 113 | } 114 | correct := []float64{2, 3, 4} 115 | 116 | dps := MakeDataPoints(a, 0) 117 | for i, dp := range dps { 118 | for i, v := range dp.Variables { 119 | if correct[i] != v { 120 | t.Errorf("Expected variables to be %v. Got %v instead", correct, dp.Variables) 121 | } 122 | } 123 | if dp.Observed != float64(i+1) { 124 | t.Error("Expected observed to be the same as the index") 125 | } 126 | } 127 | 128 | a = [][]float64{ 129 | {1, 2, 3, 4}, 130 | {1, 2, 3, 4}, 131 | {1, 2, 3, 4}, 132 | } 133 | correct = []float64{1, 3, 4} 134 | dps = MakeDataPoints(a, 1) 135 | for _, dp := range dps { 136 | for i, v := range dp.Variables { 137 | if correct[i] != v { 138 | t.Errorf("Expected variables to be %v. Got %v instead", correct, dp.Variables) 139 | } 140 | } 141 | if dp.Observed != 2.0 { 142 | t.Error("Expected observed to be the same as the index") 143 | } 144 | } 145 | 146 | correct = []float64{1, 2, 3} 147 | dps = MakeDataPoints(a, 3) 148 | for _, dp := range dps { 149 | for i, v := range dp.Variables { 150 | if correct[i] != v { 151 | t.Errorf("Expected variables to be %v. 
Got %v instead", correct, dp.Variables) 152 | } 153 | } 154 | if dp.Observed != 4.0 { 155 | t.Error("Expected observed to be the same as the index") 156 | } 157 | } 158 | } 159 | 160 | func TestGetCoeffs(t *testing.T) { 161 | a := [][]float64{ 162 | {651, 1, 23}, 163 | {762, 2, 26}, 164 | {856, 3, 30}, 165 | {1063, 4, 34}, 166 | {1190, 5, 43}, 167 | {1298, 6, 48}, 168 | {1421, 7, 52}, 169 | {1440, 8, 57}, 170 | {1518, 9, 58}, 171 | } 172 | 173 | r := new(Regression) 174 | r.Train(MakeDataPoints(a, 0)...) 175 | r.Run() 176 | 177 | coeffs := r.GetCoeffs() 178 | if len(coeffs) != 3 { 179 | t.Errorf("Expected 3 coefficients. Got %v instead", len(coeffs)) 180 | } 181 | 182 | expected := []float64{323.54, 46.60, 13.99} 183 | for i := range expected { 184 | if math.Abs(expected[i]-coeffs[i]) > 0.01 { 185 | t.Errorf("Expected coefficient %v to be %v. Got %v instead", i, expected[i], coeffs[i]) 186 | } 187 | } 188 | } 189 | --------------------------------------------------------------------------------