├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── crosses.go ├── crosses_test.go ├── examples ├── DetroitDataDoc.txt ├── chevy-mechanics.csv └── murders-poverty.txt ├── regression.go └── regression_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .DS_Store 3 | # IDEA config files 4 | .idea -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: go 3 | go: 4 | - 1.8.x 5 | - 1.9.x 6 | - 1.10.x 7 | - tip 8 | 9 | notifications: 10 | email: 11 | - infra@sajari.com 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Sajari Pty Ltd 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | regression 2 | ======= 3 | [![GoDoc](https://godoc.org/github.com/sajari/regression?status.svg)](https://godoc.org/github.com/sajari/regression) 4 | [![Go Report Card](https://goreportcard.com/badge/sajari/regression)](https://goreportcard.com/report/sajari/regression) 5 | [![Build Status](https://travis-ci.org/sajari/regression.svg?branch=master)](https://travis-ci.org/sajari/regression) 6 | [![License][license-image]][license-url] 7 | 8 | [license-image]: http://img.shields.io/badge/license-MIT-green.svg?style=flat-square 9 | [license-url]: LICENSE.txt 10 | 11 | Multivariable Linear Regression in Go (golang) 12 | 13 | installation 14 | ------------ 15 | 16 | $ go get github.com/sajari/regression 17 | 18 | Supports Go 1.8+ 19 | 20 | example usage 21 | ------------- 22 | 23 | Import the package, create a regression and add data to it. You can use as many variables as you like, in the below example there are 3 variables for each observation. 
24 | 25 | ```go 26 | package main 27 | 28 | import ( 29 | "fmt" 30 | 31 | "github.com/sajari/regression" 32 | ) 33 | 34 | func main() { 35 | r := new(regression.Regression) 36 | r.SetObserved("Murders per annum per 1,000,000 inhabitants") 37 | r.SetVar(0, "Inhabitants") 38 | r.SetVar(1, "Percent with incomes below $5000") 39 | r.SetVar(2, "Percent unemployed") 40 | r.Train( 41 | regression.DataPoint(11.2, []float64{587000, 16.5, 6.2}), 42 | regression.DataPoint(13.4, []float64{643000, 20.5, 6.4}), 43 | regression.DataPoint(40.7, []float64{635000, 26.3, 9.3}), 44 | regression.DataPoint(5.3, []float64{692000, 16.5, 5.3}), 45 | regression.DataPoint(24.8, []float64{1248000, 19.2, 7.3}), 46 | regression.DataPoint(12.7, []float64{643000, 16.5, 5.9}), 47 | regression.DataPoint(20.9, []float64{1964000, 20.2, 6.4}), 48 | regression.DataPoint(35.7, []float64{1531000, 21.3, 7.6}), 49 | regression.DataPoint(8.7, []float64{713000, 17.2, 4.9}), 50 | regression.DataPoint(9.6, []float64{749000, 14.3, 6.4}), 51 | regression.DataPoint(14.5, []float64{7895000, 18.1, 6}), 52 | regression.DataPoint(26.9, []float64{762000, 23.1, 7.4}), 53 | regression.DataPoint(15.7, []float64{2793000, 19.1, 5.8}), 54 | regression.DataPoint(36.2, []float64{741000, 24.7, 8.6}), 55 | regression.DataPoint(18.1, []float64{625000, 18.6, 6.5}), 56 | regression.DataPoint(28.9, []float64{854000, 24.9, 8.3}), 57 | regression.DataPoint(14.9, []float64{716000, 17.9, 6.7}), 58 | regression.DataPoint(25.8, []float64{921000, 22.4, 8.6}), 59 | regression.DataPoint(21.7, []float64{595000, 20.2, 8.4}), 60 | regression.DataPoint(25.7, []float64{3353000, 16.9, 6.7}), 61 | ) 62 | r.Run() 63 | 64 | fmt.Printf("Regression formula:\n%v\n", r.Formula) 65 | fmt.Printf("Regression:\n%s\n", r) 66 | } 67 | ``` 68 | 69 | Note: You can also add data points one by one. 70 | 71 | Once calculated you can print the data, look at the R^2, Variance, residuals, etc. You can also access the coefficients directly to use elsewhere, e.g. 
72 | 73 | ```go 74 | // Get the coefficient for the "Inhabitants" variable 0: 75 | c := r.Coeff(0) 76 | ``` 77 | 78 | You can also use the model to predict new data points 79 | 80 | ```go 81 | prediction, err := r.Predict([]float64{587000, 16.5, 6.2}) 82 | ``` 83 | 84 | Feature crosses are supported so your model can capture fixed non-linear relationships 85 | 86 | ```go 87 | 88 | r.Train( 89 | regression.DataPoint(11.2, []float64{587000, 16.5, 6.2}), 90 | ) 91 | //Add a new feature which is the first variable (index 0) to the power of 2 92 | r.AddCross(PowCross(0, 2)) 93 | r.Run() 94 | 95 | ``` 96 | -------------------------------------------------------------------------------- /crosses.go: -------------------------------------------------------------------------------- 1 | package regression 2 | 3 | import ( 4 | "math" 5 | "strconv" 6 | ) 7 | 8 | type featureCross interface { 9 | Calculate([]float64) []float64 //must return the same number of features each run 10 | ExtendNames(map[int]string, int) int 11 | } 12 | 13 | type functionalCross struct { 14 | functionName string 15 | boundVars []int 16 | crossFn func([]float64) []float64 17 | } 18 | 19 | func (c *functionalCross) Calculate(input []float64) []float64 { 20 | return c.crossFn(input) 21 | } 22 | 23 | func (c *functionalCross) ExtendNames(input map[int]string, initialSize int) int { 24 | for i, varIndex := range c.boundVars { 25 | if input[varIndex] != "" { 26 | input[initialSize+i] = "(" + input[varIndex] + ")" + c.functionName 27 | } 28 | } 29 | return len(c.boundVars) 30 | } 31 | 32 | // Feature cross based on computing the power of an input. 
33 | func PowCross(i int, power float64) featureCross { 34 | return &functionalCross{ 35 | functionName: "^" + strconv.FormatFloat(power, 'f', -1, 64), 36 | boundVars: []int{i}, 37 | crossFn: func(vars []float64) []float64 { 38 | 39 | return []float64{math.Pow(vars[i], power)} 40 | }, 41 | } 42 | } 43 | 44 | // Feature cross based on the multiplication of multiple inputs. 45 | func MultiplierCross(vars ...int) featureCross { 46 | name := "" 47 | for i, v := range vars { 48 | name += strconv.Itoa(v) 49 | if i < (len(vars) - 1) { 50 | name += "*" 51 | } 52 | } 53 | 54 | return &functionalCross{ 55 | functionName: name, 56 | boundVars: vars, 57 | crossFn: func(input []float64) []float64 { 58 | var output float64 = 1 59 | for _, variableIndex := range vars { 60 | output *= input[variableIndex] 61 | } 62 | return []float64{output} 63 | }, 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /crosses_test.go: -------------------------------------------------------------------------------- 1 | package regression 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestPowCrosses(t *testing.T) { 8 | cross1 := PowCross(0, 2) //cross of the variable at index 0 9 | if cross1.Calculate([]float64{2})[0] != 4 { 10 | t.Error("Incorrect value") 11 | } 12 | 13 | cross2 := PowCross(1, 2) //cross of the variable at index 1 14 | if cross2.Calculate([]float64{2, -3})[0] != 9 { 15 | t.Error("Incorrect value, got", cross2.Calculate([]float64{2, -3})) 16 | } 17 | } 18 | 19 | func TestMultiplicationCrosses(t *testing.T) { 20 | cross1 := MultiplierCross(0, 1, 3) 21 | if cross1.Calculate([]float64{2, 3, 4, 5})[0] != 30 { 22 | t.Errorf("Incorrect value, expected 30 got %.2f", cross1.Calculate([]float64{2, 3, 4, 5})[0]) 23 | } 24 | 25 | cross2 := MultiplierCross(0, 1) 26 | if cross2.Calculate([]float64{2, 3})[0] != 6 { 27 | t.Errorf("Incorrect value, expected 6 got %.2f", cross1.Calculate([]float64{2, 3, 4, 5})[0]) 28 | } 29 | } 30 | 31 | func 
TestFunctionalCrossExtendNames(t *testing.T) { 32 | varNames := map[int]string{1: "Number of cars", 0: "fgsd"} 33 | cross := PowCross(1, 2) //cross of the variable at index 0 34 | newVars := cross.ExtendNames(varNames, len(varNames)) 35 | 36 | if len(varNames) != 3 { 37 | t.Error("Expected another name") 38 | } 39 | if varNames[2] != "(Number of cars)^2" { 40 | t.Error("Expected '(Number of cars)^2'") 41 | } 42 | if newVars != 1 { 43 | t.Error("Expected 1 new var") 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /examples/DetroitDataDoc.txt: -------------------------------------------------------------------------------- 1 | This is the data set called `DETROIT' in the book `Subset selection in 2 | regression' by Alan J. Miller published in the Chapman & Hall series of 3 | monographs on Statistics & Applied Probability, no. 40. The data are 4 | unusual in that a subset of three predictors can be found which gives a 5 | very much better fit to the data than the subsets found from the Efroymson 6 | stepwise algorithm, or from forward selection or backward elimination. 7 | 8 | The original data were given in appendix A of `Regression analysis and its 9 | application: A data-oriented approach' by Gunst & Mason, Statistics 10 | textbooks and monographs no. 24, Marcel Dekker. It has caused problems 11 | because some copies of the Gunst & Mason book do not contain all of the data, 12 | and because Miller does not say which variables he used as predictors and 13 | which is the dependent variable. (HOM was the dependent variable, and the 14 | predictors were FTP ... WE) 15 | 16 | The data were collected by J.C. Fisher and used in his paper: "Homicide in 17 | Detroit: The Role of Firearms", Criminology, vol.14, 387-400 (1976) 18 | 19 | The data are on the homicide rate in Detroit for the years 1961-1973. 
20 | FTP - Full-time police per 100,000 population 21 | UEMP - % unemployed in the population 22 | MAN - number of manufacturing workers in thousands 23 | LIC - Number of handgun licences per 100,000 population 24 | GR - Number of handgun registrations per 100,000 population 25 | CLEAR - % homicides cleared by arrests 26 | WM - Number of white males in the population 27 | NMAN - Number of non-manufacturing workers in thousands 28 | GOV - Number of government workers in thousands 29 | HE - Average hourly earnings 30 | WE - Average weekly earnings 31 | 32 | HOM - Number of homicides per 100,000 of population 33 | ACC - Death rate in accidents per 100,000 population 34 | ASR - Number of assaults per 100,000 population 35 | 36 | N.B. Each case takes two lines. 37 | 38 | FTP UEMP MAN LIC GR CLEAR WM NMAN GOV HE 39 | WE HOM ACC ASR 40 | 260.35 11.0 455.5 178.15 215.98 93.4 558724. 538.1 133.9 2.98 41 | 117.18 8.60 39.17 306.18 42 | 269.80 7.0 480.2 156.41 180.48 88.5 538584. 547.6 137.6 3.09 43 | 134.02 8.90 40.27 315.16 44 | 272.04 5.2 506.1 198.02 209.57 94.4 519171. 562.8 143.6 3.23 45 | 141.68 8.52 45.31 277.53 46 | 272.96 4.3 535.8 222.10 231.67 92.0 500457. 591.0 150.3 3.33 47 | 147.98 8.89 49.51 234.07 48 | 272.51 3.5 576.0 301.92 297.65 91.0 482418. 626.1 164.3 3.46 49 | 159.85 13.07 55.05 230.84 50 | 261.34 3.2 601.7 391.22 367.62 87.4 465029. 659.8 179.5 3.60 51 | 157.19 14.57 53.90 217.99 52 | 268.89 4.1 577.3 665.56 616.54 88.3 448267. 686.2 187.5 3.73 53 | 155.29 21.36 50.62 286.11 54 | 295.99 3.9 596.9 1131.21 1029.75 86.1 432109. 699.6 195.4 2.91 55 | 131.75 28.03 51.47 291.59 56 | 319.87 3.6 613.5 837.60 786.23 79.0 416533. 729.9 210.3 4.25 57 | 178.74 31.49 49.16 320.39 58 | 341.43 7.1 569.3 794.90 713.77 73.9 401518. 757.8 223.8 4.47 59 | 178.30 37.39 45.80 323.03 60 | 356.59 8.4 548.8 817.74 750.43 63.4 387046. 755.3 227.7 5.04 61 | 209.54 46.26 44.54 357.38 62 | 376.69 7.7 563.4 583.17 1027.38 62.5 373095. 
787.0 230.9 5.47 63 | 240.05 47.24 41.03 422.07 64 | 390.19 6.3 609.3 709.59 666.50 58.9 359647. 819.8 230.2 5.76 65 | 258.05 52.33 44.17 473.01 66 | 67 | -------------------------------------------------------------------------------- /examples/chevy-mechanics.csv: -------------------------------------------------------------------------------- 1 | Job perf,Mech Apt,Consc 2 | 2,45,20 3 | 1,38,30 4 | 3,50,30 5 | 2,48,28 6 | 3,55,30 7 | 3,53,34 8 | 4,55,36 9 | 4,58,32 10 | 3,40,34 11 | 5,55,38 12 | 3,48,28 13 | 3,45,30 14 | 2,55,36 15 | 4,60,34 16 | 5,60,38 17 | 5,60,42 18 | 5,65,38 19 | 4,50,34 20 | 3,58,38 -------------------------------------------------------------------------------- /examples/murders-poverty.txt: -------------------------------------------------------------------------------- 1 | # x08.txt 2 | # 3 | # Reference: 4 | # 5 | # Helmut Spaeth, 6 | # Mathematical Algorithms for Linear Regression, 7 | # Academic Press, 1991, 8 | # ISBN 0-12-656460-4. 9 | # 10 | # D G Kleinbaum and L L Kupper, 11 | # Applied Regression Analysis and Other Multivariable Methods, 12 | # Duxbury Press, 1978, page 150. 13 | # 14 | # Discussion: 15 | # 16 | # Measurements were made of poverty, unemployment, and murder rates. 17 | # 18 | # There are 20 rows of data. The data include: 19 | # 20 | # I, the index, 21 | # A1, the inhabitants, 22 | # A2, the percentage of families incomes below $5000, 23 | # A3, the percentage unemployed, 24 | # B, the number of murders per 1,000,000 inhabitants per annum. 
25 | # 26 | # We seek a model of the form: 27 | # 28 | # B = A1 * X1 + A2 * X2 + A3 * X3 29 | # 30 | 5 columns 31 | 20 rows 32 | Index 33 | Inhabitants 34 | Percent with incomes below $5000 35 | Percent unemployed 36 | Murders per annum per 1,000,000 inhabitants 37 | 1 587000 16.5 6.2 11.2 38 | 2 643000 20.5 6.4 13.4 39 | 3 635000 26.3 9.3 40.7 40 | 4 692000 16.5 5.3 5.3 41 | 5 1248000 19.2 7.3 24.8 42 | 6 643000 16.5 5.9 12.7 43 | 7 1964000 20.2 6.4 20.9 44 | 8 1531000 21.3 7.6 35.7 45 | 9 713000 17.2 4.9 8.7 46 | 10 749000 14.3 6.4 9.6 47 | 11 7895000 18.1 6.0 14.5 48 | 12 762000 23.1 7.4 26.9 49 | 13 2793000 19.1 5.8 15.7 50 | 14 741000 24.7 8.6 36.2 51 | 15 625000 18.6 6.5 18.1 52 | 16 854000 24.9 8.3 28.9 53 | 17 716000 17.9 6.7 14.9 54 | 18 921000 22.4 8.6 25.8 55 | 19 595000 20.2 8.4 21.7 56 | 20 3353000 16.9 6.7 25.7 57 | 58 | -------------------------------------------------------------------------------- /regression.go: -------------------------------------------------------------------------------- 1 | package regression 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "math" 7 | "strconv" 8 | "strings" 9 | 10 | "gonum.org/v1/gonum/mat" 11 | ) 12 | 13 | var ( 14 | // ErrNotEnoughData signals that there weren't enough datapoint to train the model. 15 | ErrNotEnoughData = errors.New("not enough data points") 16 | // ErrTooManyVars signals that there are too many variables for the number of observations being made. 17 | ErrTooManyVars = errors.New("not enough observations to support this many variables") 18 | // ErrRegressionRun signals that the Run method has already been called on the trained dataset. 19 | ErrRegressionRun = errors.New("regression has already been run") 20 | ) 21 | 22 | // Regression is the exposed data structure for interacting with the API. 
23 | type Regression struct { 24 | names describe 25 | data []*dataPoint 26 | coeff map[int]float64 27 | R2 float64 28 | Varianceobserved float64 29 | VariancePredicted float64 30 | initialised bool 31 | Formula string 32 | crosses []featureCross 33 | hasRun bool 34 | } 35 | 36 | type dataPoint struct { 37 | Observed float64 38 | Variables []float64 39 | Predicted float64 40 | Error float64 41 | } 42 | 43 | type describe struct { 44 | obs string 45 | vars map[int]string 46 | } 47 | 48 | // DataPoints is a slice of *dataPoint 49 | // This type allows for easier construction of training data points. 50 | type DataPoints []*dataPoint 51 | 52 | // DataPoint creates a well formed *datapoint used for training. 53 | func DataPoint(obs float64, vars []float64) *dataPoint { 54 | return &dataPoint{Observed: obs, Variables: vars} 55 | } 56 | 57 | // Predict updates the "Predicted" value for the inputed features. 58 | func (r *Regression) Predict(vars []float64) (float64, error) { 59 | if !r.initialised { 60 | return 0, ErrNotEnoughData 61 | } 62 | 63 | // apply any features crosses to vars 64 | for _, cross := range r.crosses { 65 | vars = append(vars, cross.Calculate(vars)...) 66 | } 67 | 68 | p := r.Coeff(0) 69 | for j := 1; j < len(r.data[0].Variables)+1; j++ { 70 | p += r.Coeff(j) * vars[j-1] 71 | } 72 | return p, nil 73 | } 74 | 75 | // SetObserved sets the name of the observed value. 76 | func (r *Regression) SetObserved(name string) { 77 | r.names.obs = name 78 | } 79 | 80 | // GetObserved gets the name of the observed value. 81 | func (r *Regression) GetObserved() string { 82 | return r.names.obs 83 | } 84 | 85 | // SetVar sets the name of variable i. 
86 | func (r *Regression) SetVar(i int, name string) { 87 | if len(r.names.vars) == 0 { 88 | r.names.vars = make(map[int]string, 5) 89 | } 90 | r.names.vars[i] = name 91 | } 92 | 93 | // GetVar gets the name of variable i 94 | func (r *Regression) GetVar(i int) string { 95 | x := r.names.vars[i] 96 | if x == "" { 97 | s := []string{"X", strconv.Itoa(i)} 98 | return strings.Join(s, "") 99 | } 100 | return x 101 | } 102 | 103 | // AddCross registers a feature cross to be applied to the data points. 104 | func (r *Regression) AddCross(cross featureCross) { 105 | r.crosses = append(r.crosses, cross) 106 | } 107 | 108 | // Train the regression with some data points. 109 | func (r *Regression) Train(d ...*dataPoint) { 110 | r.data = append(r.data, d...) 111 | if len(r.data) > 2 { 112 | r.initialised = true 113 | } 114 | } 115 | 116 | // Apply any feature crosses, generating new observations and updating the data points, as well as 117 | // populating variable names for the feature crosses. 118 | // this should only be run once, as part of Run(). 119 | func (r *Regression) applyCrosses() { 120 | unusedVariableIndexCursor := len(r.data[0].Variables) 121 | for _, point := range r.data { 122 | for _, cross := range r.crosses { 123 | point.Variables = append(point.Variables, cross.Calculate(point.Variables)...) 124 | } 125 | } 126 | 127 | if len(r.names.vars) == 0 { 128 | r.names.vars = make(map[int]string, 5) 129 | } 130 | for _, cross := range r.crosses { 131 | unusedVariableIndexCursor += cross.ExtendNames(r.names.vars, unusedVariableIndexCursor) 132 | } 133 | } 134 | 135 | // Run determines if there is enough data present to run the regression 136 | // and whether or not the training has already been completed. 137 | // Once the above checks have passed feature crosses are applied if any 138 | // and the model is trained using QR decomposition. 
139 | func (r *Regression) Run() error { 140 | if !r.initialised { 141 | return ErrNotEnoughData 142 | } 143 | if r.hasRun { 144 | return ErrRegressionRun 145 | } 146 | 147 | //apply any features crosses 148 | r.applyCrosses() 149 | r.hasRun = true 150 | 151 | observations := len(r.data) 152 | numOfvars := len(r.data[0].Variables) 153 | 154 | if observations < (numOfvars + 1) { 155 | return ErrTooManyVars 156 | } 157 | 158 | // Create some blank variable space 159 | observed := mat.NewDense(observations, 1, nil) 160 | variables := mat.NewDense(observations, numOfvars+1, nil) 161 | 162 | for i := 0; i < observations; i++ { 163 | observed.Set(i, 0, r.data[i].Observed) 164 | for j := 0; j < numOfvars+1; j++ { 165 | if j == 0 { 166 | variables.Set(i, 0, 1) 167 | } else { 168 | variables.Set(i, j, r.data[i].Variables[j-1]) 169 | } 170 | } 171 | } 172 | 173 | // Now run the regression 174 | _, n := variables.Dims() // cols 175 | qr := new(mat.QR) 176 | qr.Factorize(variables) 177 | q := new(mat.Dense) 178 | reg := new(mat.Dense) 179 | qr.QTo(q) 180 | qr.RTo(reg) 181 | 182 | qtr := q.T() 183 | qty := new(mat.Dense) 184 | qty.Mul(qtr, observed) 185 | 186 | c := make([]float64, n) 187 | for i := n - 1; i >= 0; i-- { 188 | c[i] = qty.At(i, 0) 189 | for j := i + 1; j < n; j++ { 190 | c[i] -= c[j] * reg.At(i, j) 191 | } 192 | c[i] /= reg.At(i, i) 193 | } 194 | 195 | // Output the regression results 196 | r.coeff = make(map[int]float64, numOfvars) 197 | for i, val := range c { 198 | r.coeff[i] = val 199 | if i == 0 { 200 | r.Formula = fmt.Sprintf("Predicted = %.4f", val) 201 | } else { 202 | r.Formula += fmt.Sprintf(" + %v*%.4f", r.GetVar(i-1), val) 203 | } 204 | } 205 | 206 | r.calcPredicted() 207 | r.calcVariance() 208 | r.calcR2() 209 | return nil 210 | } 211 | 212 | // Coeff returns the calculated coefficient for variable i. 
213 | func (r *Regression) Coeff(i int) float64 { 214 | if len(r.coeff) == 0 { 215 | return 0 216 | } 217 | return r.coeff[i] 218 | } 219 | 220 | // GetCoeffs returns the calculated coefficients. The element at index 0 is the offset. 221 | func (r *Regression) GetCoeffs() []float64 { 222 | if len(r.coeff) == 0 { 223 | return nil 224 | } 225 | coeffs := make([]float64, len(r.coeff)) 226 | for i := range coeffs { 227 | coeffs[i] = r.coeff[i] 228 | } 229 | return coeffs 230 | } 231 | 232 | func (r *Regression) calcPredicted() string { 233 | observations := len(r.data) 234 | var predicted float64 235 | var output string 236 | for i := 0; i < observations; i++ { 237 | r.data[i].Predicted, _ = r.Predict(r.data[i].Variables) 238 | r.data[i].Error = r.data[i].Predicted - r.data[i].Observed 239 | 240 | output += fmt.Sprintf("%v. observed = %v, Predicted = %v, Error = %v", i, r.data[i].Observed, predicted, r.data[i].Error) 241 | } 242 | return output 243 | } 244 | 245 | func (r *Regression) calcVariance() string { 246 | observations := len(r.data) 247 | var obtotal, prtotal, obvar, prvar float64 248 | for i := 0; i < observations; i++ { 249 | obtotal += r.data[i].Observed 250 | prtotal += r.data[i].Predicted 251 | } 252 | obaverage := obtotal / float64(observations) 253 | praverage := prtotal / float64(observations) 254 | 255 | for i := 0; i < observations; i++ { 256 | obvar += math.Pow(r.data[i].Observed-obaverage, 2) 257 | prvar += math.Pow(r.data[i].Predicted-praverage, 2) 258 | } 259 | r.Varianceobserved = obvar / float64(observations) 260 | r.VariancePredicted = prvar / float64(observations) 261 | return fmt.Sprintf("N = %v\nVariance observed = %v\nVariance Predicted = %v\n", observations, r.Varianceobserved, r.VariancePredicted) 262 | } 263 | 264 | func (r *Regression) calcR2() string { 265 | r.R2 = r.VariancePredicted / r.Varianceobserved 266 | return fmt.Sprintf("R2 = %.2f", r.R2) 267 | } 268 | 269 | func (r *Regression) calcResiduals() string { 270 | str := 
fmt.Sprintf("Residuals:\nobserved|\tPredicted|\tResidual\n") 271 | for _, d := range r.data { 272 | str += fmt.Sprintf("%.2f|\t%.2f|\t%.2f\n", d.Observed, d.Predicted, d.Observed-d.Predicted) 273 | } 274 | str += "\n" 275 | return str 276 | } 277 | 278 | // String satisfies the stringer interface to display a dataPoint as a string. 279 | func (d *dataPoint) String() string { 280 | str := fmt.Sprintf("%.2f", d.Observed) 281 | for _, v := range d.Variables { 282 | str += fmt.Sprintf("|\t%.2f", v) 283 | } 284 | return str 285 | } 286 | 287 | // String satisfies the stringer interface to display a regression as a string. 288 | func (r *Regression) String() string { 289 | if !r.initialised { 290 | return ErrNotEnoughData.Error() 291 | } 292 | str := fmt.Sprintf("%v", r.GetObserved()) 293 | for i := 0; i < len(r.names.vars); i++ { 294 | str += fmt.Sprintf("|\t%v", r.GetVar(i)) 295 | } 296 | str += "\n" 297 | for _, d := range r.data { 298 | str += fmt.Sprintf("%v\n", d) 299 | } 300 | fmt.Println(r.calcResiduals()) 301 | str += fmt.Sprintf("\nN = %v\nVariance observed = %v\nVariance Predicted = %v", len(r.data), r.Varianceobserved, r.VariancePredicted) 302 | str += fmt.Sprintf("\nR2 = %v\n", r.R2) 303 | return str 304 | } 305 | 306 | // MakeDataPoints makes a `[]*dataPoint` from a `[][]float64`. The expected fomat for the input is a row-major [][]float64. 307 | // That is to say the first slice represents a row, and the second represents the cols. 308 | // Furthermore it is expected that all the col slices are of the same length. 
// The obsIndex parameter indicates which column should be used as the observed value.
6.4}), 23 | DataPoint(35.7, []float64{1531000, 21.3, 7.6}), 24 | DataPoint(8.7, []float64{713000, 17.2, 4.9}), 25 | DataPoint(9.6, []float64{749000, 14.3, 6.4}), 26 | DataPoint(14.5, []float64{7895000, 18.1, 6}), 27 | DataPoint(26.9, []float64{762000, 23.1, 7.4}), 28 | DataPoint(15.7, []float64{2793000, 19.1, 5.8}), 29 | DataPoint(36.2, []float64{741000, 24.7, 8.6}), 30 | DataPoint(18.1, []float64{625000, 18.6, 6.5}), 31 | DataPoint(28.9, []float64{854000, 24.9, 8.3}), 32 | DataPoint(14.9, []float64{716000, 17.9, 6.7}), 33 | DataPoint(25.8, []float64{921000, 22.4, 8.6}), 34 | DataPoint(21.7, []float64{595000, 20.2, 8.4}), 35 | DataPoint(25.7, []float64{3353000, 16.9, 6.7}), 36 | ) 37 | r.Run() 38 | 39 | fmt.Printf("Regression formula:\n%v\n", r.Formula) 40 | fmt.Printf("Regression:\n%s\n", r) 41 | 42 | // All vars are known to positively correlate with the murder rate 43 | for i, c := range r.coeff { 44 | if i == 0 { 45 | // This is the offset and not a coeff 46 | continue 47 | } 48 | if c < 0 { 49 | t.Errorf("Coefficient is negative, but shouldn't be: %.2f", c) 50 | } 51 | } 52 | 53 | // We know this set has an R^2 above 80 54 | if r.R2 < 0.8 { 55 | t.Errorf("R^2 was %.2f, but we expected > 80", r.R2) 56 | } 57 | } 58 | 59 | func TestCrossApply(t *testing.T) { 60 | r := new(Regression) 61 | r.SetObserved("Input-Squared plus Input") 62 | r.SetVar(0, "Input") 63 | r.Train( 64 | DataPoint(6, []float64{2}), 65 | DataPoint(20, []float64{4}), 66 | DataPoint(30, []float64{5}), 67 | DataPoint(72, []float64{8}), 68 | DataPoint(156, []float64{12}), 69 | ) 70 | r.AddCross(PowCross(0, 2)) 71 | r.AddCross(PowCross(0, 7)) 72 | err := r.Run() 73 | if err != nil { 74 | t.Error(err) 75 | } 76 | 77 | fmt.Printf("Regression formula:\n%v\n", r.Formula) 78 | fmt.Printf("Regression:\n%s\n", r) 79 | if r.names.vars[1] != "(Input)^2" { 80 | t.Error("Name incorrect") 81 | } 82 | 83 | for i, c := range r.coeff { 84 | if i == 0 { 85 | // This is the offset and not a coeff 86 | continue 87 | 
} 88 | if c < 0 { 89 | t.Errorf("Coefficient is negative, but shouldn't be: %.2f", c) 90 | } 91 | } 92 | 93 | // We know this set has an R^2 above 80 94 | if r.R2 < 0.8 { 95 | t.Errorf("R^2 was %.2f, but we expected > 80", r.R2) 96 | } 97 | 98 | // Test that predict uses the cross as well 99 | val, err := r.Predict([]float64{6}) 100 | if err != nil { 101 | t.Error(err) 102 | } 103 | if val <= 41.999 && val >= 42.001 { 104 | t.Errorf("Expected 42, got %.2f", val) 105 | } 106 | } 107 | 108 | func TestMakeDataPoints(t *testing.T) { 109 | a := [][]float64{ 110 | {1, 2, 3, 4}, 111 | {2, 2, 3, 4}, 112 | {3, 2, 3, 4}, 113 | } 114 | correct := []float64{2, 3, 4} 115 | 116 | dps := MakeDataPoints(a, 0) 117 | for i, dp := range dps { 118 | for i, v := range dp.Variables { 119 | if correct[i] != v { 120 | t.Errorf("Expected variables to be %v. Got %v instead", correct, dp.Variables) 121 | } 122 | } 123 | if dp.Observed != float64(i+1) { 124 | t.Error("Expected observed to be the same as the index") 125 | } 126 | } 127 | 128 | a = [][]float64{ 129 | {1, 2, 3, 4}, 130 | {1, 2, 3, 4}, 131 | {1, 2, 3, 4}, 132 | } 133 | correct = []float64{1, 3, 4} 134 | dps = MakeDataPoints(a, 1) 135 | for _, dp := range dps { 136 | for i, v := range dp.Variables { 137 | if correct[i] != v { 138 | t.Errorf("Expected variables to be %v. Got %v instead", correct, dp.Variables) 139 | } 140 | } 141 | if dp.Observed != 2.0 { 142 | t.Error("Expected observed to be the same as the index") 143 | } 144 | } 145 | 146 | correct = []float64{1, 2, 3} 147 | dps = MakeDataPoints(a, 3) 148 | for _, dp := range dps { 149 | for i, v := range dp.Variables { 150 | if correct[i] != v { 151 | t.Errorf("Expected variables to be %v. 
Got %v instead", correct, dp.Variables) 152 | } 153 | } 154 | if dp.Observed != 4.0 { 155 | t.Error("Expected observed to be the same as the index") 156 | } 157 | } 158 | } 159 | 160 | func TestGetCoeffs(t *testing.T) { 161 | a := [][]float64{ 162 | {651, 1, 23}, 163 | {762, 2, 26}, 164 | {856, 3, 30}, 165 | {1063, 4, 34}, 166 | {1190, 5, 43}, 167 | {1298, 6, 48}, 168 | {1421, 7, 52}, 169 | {1440, 8, 57}, 170 | {1518, 9, 58}, 171 | } 172 | 173 | r := new(Regression) 174 | r.Train(MakeDataPoints(a, 0)...) 175 | r.Run() 176 | 177 | coeffs := r.GetCoeffs() 178 | if len(coeffs) != 3 { 179 | t.Errorf("Expected 3 coefficients. Got %v instead", len(coeffs)) 180 | } 181 | 182 | expected := []float64{323.54, 46.60, 13.99} 183 | for i := range expected { 184 | if math.Abs(expected[i]-coeffs[i]) > 0.01 { 185 | t.Errorf("Expected coefficient %v to be %v. Got %v instead", i, expected[i], coeffs[i]) 186 | } 187 | } 188 | } 189 | --------------------------------------------------------------------------------