├── README ├── change.go ├── change_test.go └── example └── main.go /README: -------------------------------------------------------------------------------- 1 | go-change: An online change detection algorithm. 2 | 3 | For package documentation, please see http://godoc.org/github.com/dgryski/go-change 4 | -------------------------------------------------------------------------------- /change.go: -------------------------------------------------------------------------------- 1 | // Package change implements an online change detection algorithm 2 | /* 3 | https://sites.cs.ucsb.edu/~yfwang/papers/as06.pdf 4 | 5 | How effective is this algorithm? The authors state 6 | "[T]he estimation algorithm can be shown theoretically to give the correct 7 | prediction and approach an unbiased estimator of the true changeover point if 8 | the window length approaches infinity and the two distributions are 9 | sufficiently dissimilar." 10 | 11 | The algorithm works by examining the distributions on either side of a 12 | suspected change point, and finding the index in the window where the two 13 | distributions are most dissimilar. As there is not guaranteed to be a change 14 | point in the window, this implementation also performs a Student's t-test on 15 | the two distributions to reduce the rate of false positives. 16 | 17 | */ 18 | package change 19 | 20 | import ( 21 | "math" 22 | 23 | "github.com/dgryski/go-onlinestats" 24 | ) 25 | 26 | // Stats are some descriptive statistics for a block of items. It implements the interface needed by the t-test method of onlinestats. 27 | type Stats struct { 28 | mean float64 29 | variance float64 30 | n int 31 | } 32 | 33 | // Mean returns the mean of the data set 34 | func (s Stats) Mean() float64 { return s.mean } 35 | 36 | // Var returns the variance of the data set 37 | func (s Stats) Var() float64 { return s.variance } 38 | 39 | // Len returns the number of items in the data set 40 | func (s Stats) Len() int { return s.n } 41 | 42 | // Stddev returns the standard deviation of the sample 43 | func (s Stats) Stddev() float64 { return math.Sqrt(s.variance) } 44 | 45 | // ChangePoint is a potential change point found by Check(). 46 | type ChangePoint struct { 47 | // Index is the offset into the data set of the suspected change point 48 | Index int 49 | 50 | // Difference is the difference in distribution means found by the Student's t-test 51 | Difference float64 52 | 53 | // Confidence is the confidence returned by a Student's t-test 54 | Confidence float64 55 | 56 | // Before is the statistics of the distribution before the change point 57 | Before Stats 58 | 59 | // After is the statistics of the distribution after the change point 60 | After Stats 61 | } 62 | 63 | // DefaultMinSampleSize is the minimum sample size to consider from the window being checked 64 | const DefaultMinSampleSize = 30 65 | 66 | // Detector is a change detector. 67 | type Detector struct { 68 | MinSampleSize int 69 | MinConfidence float64 70 | } 71 | 72 | // Check returns the index of a potential change point 73 | func (d *Detector) Check(window []float64) *ChangePoint { 74 | 75 | n := len(window) 76 | 77 | // The paper provides recursive formulas for computing the means and 78 | // standard deviations as we slide along the window. This 79 | // implementation uses alternate math based on cumulative sums. 80 | 81 | // cumsum contains the cumulative sum of all elements <= i 82 | // cumsumsq contains the cumulative sum of squares of all elements <= i 83 | // TODO(dgryski): move this to a move numerically stable algorithm 84 | cumsum := make([]float64, n) 85 | cumsumsq := make([]float64, n) 86 | 87 | var sum, sumsq float64 88 | for i, v := range window { 89 | sum += v 90 | sumsq += v * v 91 | cumsum[i] = sum 92 | cumsumsq[i] = sumsq 93 | } 94 | 95 | // sb is our between-class scatter, the degree of dissimilarity of the 96 | // two distributions. This value is always positive, so we can set 0 97 | // as the minimum and know that any valid value will be larger 98 | var maxsb float64 99 | var maxsbIdx int 100 | 101 | // The paper also provides a metric sw, for 'within-class scatter', 102 | // which depends on the standard-deviation of the samples. It suggests 103 | // finding the point that minimizes the ratio sw/sb. However, it then 104 | // proves that this is equivalent to maximizing sb. The calculation of 105 | // sb depends only on the means of the two samples, and not of the 106 | // variances. However, we calculate the variances so that we can pass 107 | // them to the T test later on. 108 | 109 | var before, after Stats 110 | 111 | // sane default 112 | minSampleSize := d.MinSampleSize 113 | if minSampleSize == 0 { 114 | minSampleSize = DefaultMinSampleSize 115 | } 116 | 117 | for l := minSampleSize; l < (n - minSampleSize + 1); l++ { 118 | lidx := l - 1 119 | n1 := float64(l) 120 | mean1 := cumsum[lidx] / n1 121 | 122 | n2 := float64(n - l) 123 | sum2 := (sum - cumsum[lidx]) 124 | mean2 := sum2 / n2 125 | 126 | sb := ((n1 * n2) / (n1 + n2)) * (mean1 - mean2) * (mean1 - mean2) 127 | if maxsb < sb { 128 | maxsb = sb 129 | maxsbIdx = l 130 | 131 | // The variances are calculated only if needed to 132 | // reduce the math in the main loop 133 | var1 := (cumsumsq[lidx] - (cumsum[lidx]*cumsum[lidx])/(n1)) / (n1 - 1) 134 | var2 := ((sumsq - cumsumsq[lidx]) - (sum2*sum2)/(n2)) / (n2 - 1) 135 | 136 | before.mean, before.variance, before.n = mean1, var1, l 137 | after.mean, after.variance, after.n = mean2, var2, n-l 138 | } 139 | } 140 | 141 | var conf float64 142 | if before.n > 0 { 143 | // we found a difference 144 | conf = onlinestats.Welch(before, after) 145 | } 146 | 147 | // not above our threshold 148 | if conf <= d.MinConfidence { 149 | return nil 150 | } 151 | 152 | cp := &ChangePoint{ 153 | Index: maxsbIdx, 154 | Difference: after.Mean() - before.Mean(), 155 | Confidence: conf, 156 | Before: before, 157 | After: after, 158 | } 159 | 160 | return cp 161 | } 162 | 163 | // Stream monitors a stream of floats for changes 164 | type Stream struct { 165 | windowSize int 166 | blockSize int 167 | 168 | data []float64 169 | 170 | items int 171 | 172 | buffer []float64 173 | bufidx int 174 | 175 | detector *Detector 176 | } 177 | 178 | // NewStream constructs a new stream detector 179 | func NewStream(windowSize int, minSample int, blockSize int, confidence float64) *Stream { 180 | return &Stream{ 181 | windowSize: windowSize, 182 | blockSize: blockSize, 183 | data: make([]float64, windowSize), 184 | buffer: make([]float64, blockSize), 185 | 186 | detector: &Detector{ 187 | MinSampleSize: minSample, 188 | MinConfidence: confidence, 189 | }, 190 | } 191 | } 192 | 193 | // Push adds a float to the stream and calls the change detector 194 | func (s *Stream) Push(item float64) *ChangePoint { 195 | s.buffer[s.bufidx] = item 196 | s.bufidx++ 197 | s.items++ 198 | 199 | if s.bufidx < s.blockSize { 200 | return nil 201 | } 202 | 203 | copy(s.data[0:], s.data[s.blockSize:]) 204 | copy(s.data[s.windowSize-s.blockSize:], s.buffer) 205 | s.bufidx = 0 206 | 207 | if s.items < s.windowSize { 208 | return nil 209 | } 210 | 211 | return s.detector.Check(s.data) 212 | } 213 | 214 | // Window returns the current data window. This should be treated as read-only 215 | func (s *Stream) Window() []float64 { return s.data } 216 | -------------------------------------------------------------------------------- /change_test.go: -------------------------------------------------------------------------------- 1 | package change 2 | 3 | import "testing" 4 | 5 | func TestDetectChange(t *testing.T) { 6 | 7 | var tests = []struct { 8 | w []float64 9 | idx int 10 | }{ 11 | { 12 | []float64{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, 13 | 0, // no change point found 14 | }, 15 | 16 | { 17 | []float64{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, 18 | 10, // the first 2 19 | }, 20 | { 21 | []float64{1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 3, 0, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2}, 22 | 0, // change occurs but not statistically significant 23 | }, 24 | } 25 | 26 | var detector = Detector{ 27 | MinSampleSize: 5, 28 | } 29 | 30 | for _, tt := range tests { 31 | r := detector.Check(tt.w) 32 | if (r == nil || r.Confidence < 0.95) && tt.idx == 0 { 33 | // no difference found and no difference expected -- good 34 | } else if r.Confidence >= 0.95 && r.Index == tt.idx { 35 | // difference found at expected location -- good 36 | } else { 37 | t.Errorf("DetectChange confidence=%f index=%d, wanted %d", r.Confidence, r.Index, tt.idx) 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /example/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "flag" 6 | "fmt" 7 | "html/template" 8 | "io" 9 | "log" 10 | "math" 11 | "os" 12 | "sort" 13 | "strconv" 14 | 15 | "github.com/dgryski/go-change" 16 | ) 17 | 18 | func main() { 19 | windowSize := flag.Int("w", 120, "window size") 20 | minSample := flag.Int("ms", 30, "min sample size") 21 | blockSize := flag.Int("bs", 10, "block size") 22 | compressPoints := flag.Int("cp", 10, "compress points for graph display") 23 | fname := flag.String("f", "", "file name") 24 | ymin := flag.Int("ymin", 0, "minimum y value for graph") 25 | 26 | flag.Parse() 27 | 28 | var f io.Reader 29 | 30 | if *fname == "" { 31 | log.Println("reading from stdin") 32 | f = os.Stdin 33 | } else { 34 | var err error 35 | f, err = os.Open(*fname) 36 | if err != nil { 37 | fmt.Println("open failed:", err) 38 | return 39 | } 40 | } 41 | 42 | scanner := bufio.NewScanner(f) 43 | 44 | s := change.NewStream(*windowSize, *minSample, *blockSize, 0.995) 45 | 46 | type graphPoints [2]float64 47 | var graphData []graphPoints 48 | var last []float64 49 | 50 | var changePoints []int 51 | 52 | var items int 53 | 54 | for scanner.Scan() { 55 | item, err := strconv.ParseFloat(scanner.Text(), 64) 56 | if err != nil { 57 | fmt.Printf("error parsing <%s>: %s\n", scanner.Text(), err) 58 | continue 59 | } 60 | 61 | last = append(last, item) 62 | items++ 63 | if items > 0 && items%*compressPoints == 0 { 64 | sort.Float64s(last) 65 | median := last[*compressPoints/2] 66 | last = last[:0] 67 | 68 | graphData = append(graphData, [2]float64{float64(items), median}) 69 | } 70 | 71 | r := s.Push(item) 72 | 73 | if r != nil { 74 | diff := math.Abs(r.Difference / r.Before.Mean()) 75 | if r.Difference != 0 && diff > 0.06 { 76 | log.Printf("difference found at offset=%d: %f %v\n", items-*windowSize+r.Index, diff, r) 77 | changePoints = append(changePoints, items-*windowSize+r.Index) 78 | } 79 | } 80 | } 81 | 82 | if err := scanner.Err(); err != nil { 83 | fmt.Printf("Error during scan: %v", err) 84 | } 85 | 86 | reportTmpl.Execute(os.Stdout, struct { 87 | YMin int 88 | GraphData []graphPoints 89 | ChangePoints []int 90 | }{ 91 | *ymin, 92 | graphData, 93 | changePoints, 94 | }) 95 | } 96 | 97 | var reportTmpl = template.Must(template.New("report").Parse(` 98 | 99 | 100 | 101 | 102 | 119 | 120 |
121 | 122 | 123 | 124 | 125 | 126 | `)) 127 | --------------------------------------------------------------------------------