├── .travis.yml
├── LICENSE
├── README.md
├── detect.go
├── detect_test.go
├── example_test.go
├── split.go
└── split_test.go


/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go
2 | script: go test -race -cpu 1,2,4 -v ./...
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2015 Lytics
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
203 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Impact
 2 | ======
 3 | 
 4 | [![Build Status](https://travis-ci.org/lytics/impact.svg?branch=master)](https://travis-ci.org/lytics/impact) [![GoDoc](https://godoc.org/github.com/lytics/impact?status.svg)](https://godoc.org/github.com/lytics/impact)
 5 | 
 6 | Testing for change point detection and causal impact to timeseries in [Go](https://golang.org).
 7 | 
 8 | ## Purpose
 9 | 
10 | **Impact detects significant changes to the location of a time series**.
11 | 
12 | For a candidate point in time for change detection, it uses the structure of the preceding data to determine the probability of the subsequent data arriving at its final location.  A low probability indicates a significant departure in location, or a *causal impact* to the series.
13 | 
14 | Because of the nature of the underlying Monte Carlo simulation, Impact is free of any distributional assumptions, and fit for use in any processes whose likelihood function is either unknown or dynamic &mdash; **it is location, scale and distribution free**.
15 | 
16 | ## Design
17 | 
18 | ### Changepoint Detection
19 | 
20 | Impact implements Matteson and James' [divisive entropy decomposition algorithm](http://arxiv.org/pdf/1306.4933.pdf) for nonparametric changepoint detection.  
21 | 
22 | ### Causal Inference
23 | 
24 | Once changepoints are detected and used to divide the series into a set of disjoint subsets, adjacent subsets can be used to determine whether the series between two changepoints implies a causal change in the location of the series.
25 | 
26 | Given that a series contains one or more changepoints, preceding subsets, called *reference series*, serve to provide evidence for or against the location change in the subsequent subset, called a *candidate* series.
27 | 
28 | Impact performs Monte Carlo simulation (via bootstrap resampling) to create a set of alternative [random walks](http://en.wikipedia.org/wiki/Random_walk) to compare against the realized candidate series.  The destination of each simulated walk is compared against the realized value from the *candidate* subseries, and the percentage of destinations considered as or more extreme (in terms of absolute deviation in location) create the "p-value" for the test.
29 | 
30 | ## Example
31 | 
32 | Consider the following downward trending process, which is divided into two disjoint series &mdash; the *Reference* series in solid black, and the *Candidate* series in dotted black.
33 | 
34 | ![negativewalk](https://cloud.githubusercontent.com/assets/3698679/6422052/3c21eb06-be89-11e4-889f-f1718207d53a.png)
35 | 
36 | In order to determine if the start of the *Candidate* series indicates a causal disruption to the sequence, we simulate a large number of alternatives.  Since the observed *Candidate* series is more extreme (in terms of final destination) than any of the simulations, we deem that the start of the *Candidate* series indicates a causal disruption.
37 | 
38 | ![negativewalk_20](https://cloud.githubusercontent.com/assets/3698679/6422055/3c231cec-be89-11e4-966e-265bcd50766f.png)
39 | 
40 | Alternatively, consider the following upward trending process and its corresponding *Reference* and *Candidate* sub-series.
41 | 
42 | ![positivewalk](https://cloud.githubusercontent.com/assets/3698679/6422053/3c224d58-be89-11e4-96e7-219acda4691e.png)
43 | 
44 | We likewise simulate a large number of alternatives.  Since the realized *Candidate* sub-series lies well within the range of simulated alternatives, there's no evidence of a causal disruption at this point in the series.
45 | 
46 | ![positivewalk_20](https://cloud.githubusercontent.com/assets/3698679/6422054/3c22e862-be89-11e4-8513-18a06925f772.png)
47 | 
48 | *Note that although only 20 simulated alternatives are shown in each figure, in practice the number of bootstrap resamples should be large enough to yield conclusive results &mdash; definitely upwards of 1,000.*
49 | 
50 | ## Usage
51 | 
52 | ```go
53 | package main
54 | 
55 | import (
56 | 	"fmt"
57 | 
58 | 	"github.com/lytics/impact"
59 | )
60 | 
61 | func main() {
62 | niter := 1000
63 | 	series := []float64{0.2, 0, 0.4, 0, 0.1, 0.5, 0.2, 0.4, 0, 0, 0.1, 0.6, 0.1, 0.3, 0.1, 0.1, 0.2, 0.3, 0.1, 0.1}
64 | 
65 | 	// detect changepoints
66 | 	significance := 0.05
67 | 	minSize := 3
68 | 	changes, _ := DetectChanges(series, significance, niter, minSize)
69 | 	fmt.Println(changes)
70 | 	// Output: [0 8 20]
71 | 
72 | 	// detect impact
73 | 	p, op := DetectImpact(series[changes[0]:changes[1]], series[changes[1]:changes[2]], niter)
74 | 
75 | 	// Note that because of the nature of bootstrapping, the p-value from the test is subject to minor fluctuations.
76 | 	// To get a more accurate/consistent p-value, increase the number of iterations in the detection.
77 | }
78 | 
79 | ```
80 | 


--------------------------------------------------------------------------------
/detect.go:
--------------------------------------------------------------------------------
  1 | package impact
  2 | 
  3 | import (
  4 | 	"math/rand"
  5 | 	"os"
  6 | 	"runtime"
  7 | 	"strconv"
  8 | 	"sync"
  9 | 	"time"
 10 | 
 11 | 	"github.com/drewlanenga/govector"
 12 | )
 13 | 
// Operator indicates whether the candidate series has increased,
// decreased or stayed largely the same; it is the second value returned
// by DetectImpact.
type Operator int

// series is a package-local name for govector.Vector so that methods such as
// walk can be declared on it.
type series govector.Vector

// Outcomes reported by DetectImpact.
const (
	EQUALS       Operator = 0 // both tails hold the same share of simulated destinations
	GREATER_THAN Operator = 1 // fewer simulated destinations lie above the observed one than below it
	LESS_THAN    Operator = 2 // fewer simulated destinations lie below the observed one than above it
)

var (
	smoother uint = 2 // the amount of smoothing on either side
	// rnd is a package-level random source seeded from the clock at start-up.
	// NOTE(review): it is not referenced in the part of the package visible
	// here; presumably rndMutex guards it in other files — confirm.
	rnd           = rand.New(rand.NewSource(time.Now().UnixNano()))
	rndMutex      = &sync.Mutex{}
)
 31 | 
 32 | func walks(niter, nsteps, ncpu int, start float64, history govector.Vector) series {
 33 | 	destinations := make(series, niter)
 34 | 
 35 | 	steps := history.Diff()
 36 | 
 37 | 	c := make(chan int, ncpu)
 38 | 	for i := 0; i < niter; i++ {
 39 | 		go destinations.walk(i, nsteps, start, steps, c)
 40 | 	}
 41 | 
 42 | 	// drain the channel
 43 | 	for i := 0; i < ncpu; i++ {
 44 | 		<-c // wait for one task to complete
 45 | 	}
 46 | 
 47 | 	// all done
 48 | 	return destinations
 49 | }
 50 | 
 51 | // take random steps in a walk based on the `diff`.  (`diff` is a bunch of steps.)
 52 | func (s series) walk(i, nsteps int, start float64, diff govector.Vector, c chan int) {
 53 | 	walkrnd := rand.New(rand.NewSource(time.Now().UnixNano()))
 54 | 
 55 | 	n := len(diff)
 56 | 	dest := start
 57 | 	for i := 0; i < nsteps; i++ {
 58 | 		which := walkrnd.Intn(n)
 59 | 		dest += diff[which]
 60 | 	}
 61 | 
 62 | 	s[i] = dest
 63 | 	c <- 1 // signal that the walk has finished
 64 | }
 65 | 
 66 | // DetectImpact performs Monte Carlo based changepoint detection between two disjoint
 67 | // and adjacent subseries of a larger time series.  Increase `niter` to improve
 68 | // accuracy of the detection.
 69 | func DetectImpact(x1, x2 []float64, niter int) (float64, Operator, error) {
 70 | 	v1, err := govector.AsVector(x1)
 71 | 	if err != nil {
 72 | 		return 0.0, EQUALS, err
 73 | 	}
 74 | 
 75 | 	v2, err := govector.AsVector(x2)
 76 | 	if err != nil {
 77 | 		return 0.0, EQUALS, err
 78 | 	}
 79 | 
 80 | 	x1smooth := v1.Smooth(smoother, smoother)
 81 | 	x2smooth := v2.Smooth(smoother, smoother)
 82 | 
 83 | 	x1diff := x1smooth.Diff()
 84 | 
 85 | 	ncpu, _ := strconv.Atoi(os.Getenv("GOMAXPROCS"))
 86 | 	if ncpu == 0 {
 87 | 		ncpu = runtime.NumCPU()
 88 | 	}
 89 | 	runtime.GOMAXPROCS(ncpu)
 90 | 
 91 | 	// the final destinations of a bunch of random walks
 92 | 	simDest := walks(niter, len(x2), ncpu, x1smooth[len(x1)-1], x1diff)
 93 | 
 94 | 	realDest := x2smooth[len(x2)-1]
 95 | 
 96 | 	plower := float64(lt(realDest, simDest)) / float64(niter)
 97 | 	pupper := float64(gt(realDest, simDest)) / float64(niter)
 98 | 
 99 | 	p := 1.0
100 | 	op := EQUALS
101 | 
102 | 	if plower < pupper {
103 | 		p = plower
104 | 		op = LESS_THAN
105 | 	} else if pupper < plower {
106 | 		p = pupper
107 | 		op = GREATER_THAN
108 | 	}
109 | 
110 | 	return p, op, nil
111 | }
112 | 
// gt reports how many elements of xs are strictly greater than x.
func gt(x float64, xs []float64) int {
	n := 0
	for i := range xs {
		if xs[i] > x {
			n++
		}
	}
	return n
}
124 | 
// lt reports how many elements of xs are strictly less than x.
func lt(x float64, xs []float64) int {
	n := 0
	for i := range xs {
		if xs[i] < x {
			n++
		}
	}
	return n
}
136 | 


--------------------------------------------------------------------------------
/detect_test.go:
--------------------------------------------------------------------------------
 1 | package impact
 2 | 
 3 | import (
 4 | 	"math"
 5 | 	"testing"
 6 | 
 7 | 	"github.com/bmizerany/assert"
 8 | )
 9 | 
var (
	// mockShort is a 20-point fixture shared by the tests in this file.
	mockShort = []float64{0.2, 0, 0.4, 0, 0.1, 0.5, 0.2, 0.4, 0, 0, 0.1, 0.6, 0.1, 0.3, 0.1, 0.1, 0.2, 0.3, 0.1, 0.1}
)
13 | 
14 | func TestComparisons(t *testing.T) {
15 | 	greaterCount := gt(0.3, mockShort)
16 | 	assert.Equal(t, 4, greaterCount)
17 | 
18 | 	lessCount := lt(0.3, mockShort)
19 | 	assert.Equal(t, 14, lessCount)
20 | }
21 | 
22 | func TestDetect(t *testing.T) {
23 | 	// get the pvalue and operator for the test
24 | 	p, op, err := DetectImpact(mockShort[0:14], mockShort[14:20], 1000)
25 | 	assert.Equal(t, nil, err)
26 | 	assert.Tf(t, p > 0.1 && p < 0.25, "pvalue for Detect should be within [0.1, 0.2]")
27 | 	assert.Equalf(t, LESS_THAN, op, "the series detection should show a decrease")
28 | }
29 | 
// equal reports whether b lies within one floating-point step of a, where
// the step is the distance from a to its neighbour in the direction of 1.
func equal(a, b float64) bool {
	ulp := math.Abs(a - math.Nextafter(a, 1))
	return math.Abs(b-a) <= ulp
}
36 | 


--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
 1 | package impact
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | )
 6 | 
// Example shows the two-stage workflow: detect changepoints in a series, then
// test the segment after the first changepoint against the segment before it.
//
// NOTE(review): go test appears to check an "Output:" comment only when it is
// the last comment in the function body; "// detect impact" follows it here,
// so the expected output below is likely not being verified — confirm.
func Example() {
	niter := 1000
	series := []float64{0.2, 0, 0.4, 0, 0.1, 0.5, 0.2, 0.4, 0, 0, 0.1, 0.6, 0.1, 0.3, 0.1, 0.1, 0.2, 0.3, 0.1, 0.1}

	// detect changepoints (the error is discarded: the arguments are fixed and valid)
	significance := 0.05
	minSize := 3
	changes, _ := DetectChanges(series, significance, niter, minSize)
	fmt.Println(changes)
	// Output: [0 8 20]

	// detect impact; indexing changes[2] relies on at least three boundaries being returned
	_, _, _ = DetectImpact(series[changes[0]:changes[1]], series[changes[1]:changes[2]], niter)
}
21 | 


--------------------------------------------------------------------------------
/split.go:
--------------------------------------------------------------------------------
  1 | package impact
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"math"
  6 | 	"sort"
  7 | 
  8 | 	"github.com/bobhancock/gomatrix/matrix"
  9 | )
 10 | 
 11 | type splitter struct {
 12 | 	Index  int     // location of the changepoint
 13 | 	Energy float64 // "energy" released when cluster splits
 14 | }
 15 | 
 16 | func newSplitter() *splitter {
 17 | 	return &splitter{-1, math.Inf(-1)}
 18 | }
 19 | 
 20 | func newSplitters(n int) []*splitter {
 21 | 	splitters := make([]*splitter, n)
 22 | 	for i := 0; i < n; i++ {
 23 | 		splitters[i] = newSplitter()
 24 | 	}
 25 | 	return splitters
 26 | }
 27 | 
// splitSummary is the result of one eSplit call.
type splitSummary struct {
	Changes []int   // the changepoints passed in, with the newly proposed one appended last
	Best    float64 // energy of the proposed split
}

// permSummary is the result of the permutation test run by sigTest.
type permSummary struct {
	P float64 // p-value of the observed split
	R int     // number of permutations run (-1 when the test was skipped)
}
 37 | 
 38 | // DetectChanges implements divisive changepoint detection to identify
 39 | // structural changes to x.  `sig` determines significance level, `R` determines
 40 | // number of permutations to run during permutation testing, and `minSize`
 41 | // determines the minimum size of the series to detect.
 42 | func DetectChanges(x []float64, sig float64, R int, minSize int) ([]int, error) {
 43 | 	if sig < 0 || sig > 1.0 {
 44 | 		return nil, fmt.Errorf("sig (%v) should be bound [0, 1]", sig)
 45 | 	}
 46 | 
 47 | 	if minSize < 2 {
 48 | 		return nil, fmt.Errorf("minSize (%d) must be greater than 1", minSize)
 49 | 	}
 50 | 
 51 | 	n := len(x)
 52 | 
 53 | 	// initialize k to changepoints
 54 | 	k := n
 55 | 
 56 | 	// assume changes occur at beginning and end of series
 57 | 	changes := []int{0, n}
 58 | 	splitters := newSplitters(n)
 59 | 
 60 | 	distance := vectorDistance(x)
 61 | 
 62 | 	for k > 0 {
 63 | 		split := eSplit(changes, distance, minSize, false, splitters)
 64 | 		newestChangePoint := split.Changes[len(split.Changes)-1]
 65 | 
 66 | 		// not able to meet minimum size constraint
 67 | 		if newestChangePoint == -1 {
 68 | 			break
 69 | 		}
 70 | 
 71 | 		result := sigTest(distance, R, changes, minSize, split.Best, splitters)
 72 | 
 73 | 		// change point not significant
 74 | 		if result.P > sig {
 75 | 			break
 76 | 		}
 77 | 
 78 | 		// update set of change points
 79 | 		changes = split.Changes
 80 | 		k--
 81 | 	}
 82 | 
 83 | 	// remove the last (insignificant) changepoint
 84 | 	significant := changes[0 : len(changes)-1]
 85 | 
 86 | 	// sort them in sequential order (ordered natively by discovery)
 87 | 	sort.Sort(sort.IntSlice(significant))
 88 | 
 89 | 	return significant, nil
 90 | }
 91 | 
 92 | func eSplit(changes []int, distance *matrix.DenseMatrix, minSize int, forSim bool, splitters []*splitter) splitSummary {
 93 | 	// copy changes into splits
 94 | 	splits := copyAndSort(changes)
 95 | 
 96 | 	best := newSplitter()
 97 | 
 98 | 	ii := -1
 99 | 
100 | 	// if the procedure is being used for a significance test
101 | 	if forSim {
102 | 		for i := 1; i < len(splits); i++ {
103 | 			split := splitPoint(splits[i-1], splits[i]-1, distance, minSize)
104 | 			if split.Energy > best.Energy {
105 | 				ii = splits[i-1]
106 | 
107 | 				best = split // best split found so far
108 | 			}
109 | 		}
110 | 
111 | 		changes = append(changes, best.Index)
112 | 		return splitSummary{changes, best.Energy}
113 | 	}
114 | 
115 | 	for i := 1; i < len(splits); i++ {
116 | 		isplitter := splitters[splits[i-1]]
117 | 		if isplitter.Index == -1 {
118 | 			isplitter = splitPoint(splits[i-1], splits[i]-1, distance, minSize)
119 | 		}
120 | 
121 | 		if isplitter.Energy > best.Energy {
122 | 			ii = splits[i-1]
123 | 			best = isplitter
124 | 		}
125 | 	}
126 | 
127 | 	changes = append(changes, best.Index)
128 | 	splitters[ii].Index = 0    // update to account for newly proposed changepoint
129 | 	splitters[ii].Energy = 0.0 // update to account for newly proposed changepoint
130 | 
131 | 	return splitSummary{changes, best.Energy}
132 | }
133 | 
134 | // this implementation is of complexity O(n^2) to find each change point
135 | // so if k change points are found, complexity is O(kn^2)
136 | func splitPoint(start int, end int, distance *matrix.DenseMatrix, minSize int) *splitter {
137 | 	// interval is too small to split
138 | 	if (end - start + 1) < 2*minSize {
139 | 		return newSplitter()
140 | 	}
141 | 	best := newSplitter()
142 | 
143 | 	dist := distance.Copy()
144 | 
145 | 	// now represents number of data points
146 | 	end = end - start + 1
147 | 
148 | 	tau1 := minSize
149 | 	tau2 := minSize << 1
150 | 
151 | 	cut1 := subsetMatrix(dist, numericRange(0, tau1-1), numericRange(0, tau1-1))
152 | 	cut2 := subsetMatrix(dist, numericRange(tau1, tau2-1), numericRange(tau1, tau2-1))
153 | 	cut3 := subsetMatrix(dist, numericRange(0, tau1-1), numericRange(tau1, tau2-1))
154 | 
155 | 	// within distance for left cluster
156 | 	a := matrixSum(cut1) / 2.0
157 | 
158 | 	// within distance for right cluster
159 | 	b1 := matrixSum(cut2) / 2.0
160 | 
161 | 	// between distance for both clusters
162 | 	ab1 := matrixSum(cut3)
163 | 
164 | 	energy := calculateEnergy(a, b1, ab1, tau1, tau2)
165 | 	if energy > best.Energy {
166 | 		best.Index = tau1 + start
167 | 		best.Energy = energy
168 | 	}
169 | 
170 | 	// shift right cluster
171 | 	tau2 += 1
172 | 
173 | 	b := initVector(end+1, b1)
174 | 	ab := initVector(end+1, ab1)
175 | 
176 | 	for tau2 <= end {
177 | 		b[tau2] = b[tau2-1] + matrixSum(subsetMatrix(distance, numericRange(tau2-1, tau2-1), numericRange(tau1, tau2-2)))
178 | 		ab[tau2] = ab[tau2-1] + matrixSum(subsetMatrix(distance, numericRange(tau2-1, tau2-1), numericRange(0, tau1-1)))
179 | 
180 | 		energy = calculateEnergy(a, b[tau2], ab[tau2], tau1, tau2)
181 | 		if energy > best.Energy {
182 | 			best.Index = tau1 + start
183 | 			best.Energy = energy
184 | 		}
185 | 		tau2++
186 | 	}
187 | 
188 | 	// shift left cluster
189 | 	tau1 += 1
190 | 
191 | 	for {
192 | 		tau2 = tau1 + minSize
193 | 		if tau2 > end {
194 | 			break
195 | 		}
196 | 
197 | 		addA := matrixSum(subsetMatrix(distance, numericRange(tau1-1, tau1-1), numericRange(0, tau1-2)))
198 | 		addB := matrixSum(subsetMatrix(distance, numericRange(tau1-1, tau1-1), numericRange(tau1, tau2-2)))
199 | 
200 | 		// update within distance for left cluster
201 | 		a += addA
202 | 
203 | 		// iterate over possible endings for right cluster (tau2)
204 | 		for tau2 <= end {
205 | 			// update within distance for right cluster
206 | 			addB += distance.Get(tau1-1, tau2-1)
207 | 			b[tau2] -= addB
208 | 
209 | 			// update between cluster distance
210 | 			ab[tau2] += addB - addA
211 | 			energy = calculateEnergy(a, b[tau2], ab[tau2], tau1, tau2)
212 | 			if energy > best.Energy {
213 | 				best.Index = tau1 + start
214 | 				best.Energy = energy
215 | 			}
216 | 			tau2++
217 | 		}
218 | 		tau1++
219 | 	}
220 | 	return best
221 | }
222 | 
223 | func calculateEnergy(a, b, ab float64, tau1, tau2 int) float64 {
224 | 	info := scaleAB(ab, tau1, tau2) - scaleB(b, tau1, tau2) - scaleA(a, tau1, tau2)
225 | 	tau := float64(tau1*(tau2-tau1)) / float64(tau2)
226 | 	return info * tau
227 | }
228 | 
// scaleA divides twice the left within-cluster sum a by tau1*(tau1-1).
// tau2 is accepted for symmetry with scaleB and scaleAB and is not used.
func scaleA(a float64, tau1, tau2 int) float64 {
	pairs := float64(tau1 * (tau1 - 1))
	return 2.0 * a / pairs
}
232 | 
// scaleB divides twice the right within-cluster sum b by m*(m-1), where
// m = tau2 - tau1 is the size of the right cluster.
func scaleB(b float64, tau1, tau2 int) float64 {
	pairs := float64((tau2 - tau1 - 1) * (tau2 - tau1))
	return 2.0 * b / pairs
}
236 | 
// scaleAB divides twice the between-cluster sum ab by the product of the two
// cluster sizes, (tau2-tau1)*tau1.
func scaleAB(ab float64, tau1, tau2 int) float64 {
	pairs := float64((tau2 - tau1) * tau1)
	return 2.0 * ab / pairs
}
240 | 
// numericRange returns the consecutive integers start, start+1, ..., end
// (both ends included). The result is empty when end == start-1.
func numericRange(start, end int) []int {
	out := make([]int, end-start+1)
	for offset := range out {
		out[offset] = start + offset
	}
	return out
}
250 | 
251 | // return a subset of m given vectors of row and column indeces
252 | // TODO: we might be able to get away with a subset of the row/col for the single col/row extraction
253 | func subsetMatrix(m *matrix.DenseMatrix, rows []int, cols []int) *matrix.DenseMatrix {
254 | 	elements := make([]float64, len(rows)*len(cols))
255 | 	subset := matrix.MakeDenseMatrix(elements, len(rows), len(cols))
256 | 
257 | 	for newRowIndex, oldRowIndex := range rows {
258 | 		for newColIndex, oldColIndex := range cols {
259 | 			subset.Set(newRowIndex, newColIndex, m.Get(oldRowIndex, oldColIndex))
260 | 		}
261 | 	}
262 | 
263 | 	return subset
264 | }
265 | 
// initVector returns a new slice of length n with every element set to value.
func initVector(n int, value float64) []float64 {
	filled := make([]float64, n)
	for i := range filled {
		filled[i] = value
	}
	return filled
}
274 | 
275 | // distance between each point
276 | func vectorDistance(x []float64) *matrix.DenseMatrix {
277 | 	elements := make([]float64, 0, len(x)*len(x))
278 | 	for _, xi := range x {
279 | 		xdist := make([]float64, len(x))
280 | 		for j, xj := range x {
281 | 			xdist[j] = math.Abs(xi - xj)
282 | 		}
283 | 		elements = append(elements, xdist...)
284 | 	}
285 | 
286 | 	return matrix.MakeDenseMatrix(elements, len(x), len(x))
287 | }
288 | 
289 | func matrixSum(m *matrix.DenseMatrix) float64 {
290 | 	sum := 0.0
291 | 	for i := 0; i < m.Cols(); i++ {
292 | 		sum += m.SumCol(i)
293 | 	}
294 | 	return sum
295 | }
296 | 
297 | func sigTest(distance *matrix.DenseMatrix, R int, changes []int, minSize int, obs float64, splitters []*splitter) permSummary {
298 | 	if R == 0 {
299 | 		return permSummary{0.0, -1}
300 | 	}
301 | 
302 | 	over := 0
303 | 	for f := 0; f < R; f++ {
304 | 		D1 := permCluster(distance, changes)
305 | 		split := eSplit(changes, D1, minSize, true, splitters)
306 | 		if split.Best > obs {
307 | 			over++
308 | 		}
309 | 	}
310 | 
311 | 	// pad the pvalue by 1 success
312 | 	p := float64(over+1) / float64(R+1)
313 | 	return permSummary{p, R}
314 | }
315 | 
316 | func permCluster(d *matrix.DenseMatrix, changes []int) *matrix.DenseMatrix {
317 | 	points := copyAndSort(changes)
318 | 
319 | 	for i := 0; i < len(points)-1; i++ { // number of clusters
320 | 		index := numericRange(points[i], points[i+1]-1) // shuffle within clusters by permuting matrix columns and rows
321 | 		u := shuffle(index)
322 | 		for ii, ui := range u {
323 | 			d.Set(ii, ii, d.Get(ui, ui))
324 | 		}
325 | 	}
326 | 	return d
327 | }
328 | 
// copyAndSort returns the elements of x in ascending order in a newly
// allocated slice; x itself is left untouched.
func copyAndSort(x []int) []int {
	sorted := make([]int, len(x))
	copy(sorted, x)

	// sort the current set of change points
	sort.Ints(sorted)
	return sorted
}
340 | 
341 | func shuffle(x []int) []int {
342 | 	// make it safe
343 | 	rndMutex.Lock()
344 | 	index := rnd.Perm(len(x))
345 | 	rndMutex.Unlock()
346 | 
347 | 	y := make([]int, len(x))
348 | 	for xindex, yindex := range index {
349 | 		y[yindex] = x[xindex]
350 | 	}
351 | 
352 | 	return y
353 | }
354 | 


--------------------------------------------------------------------------------
/split_test.go:
--------------------------------------------------------------------------------
 1 | package impact
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/bmizerany/assert"
 7 | )
 8 | 
var (
	// mockLong is the fixture series shared by the tests in this file.
	// NOTE(review): presumably 300 observations with regime changes at
	// indices 100 and 200, judging by the change points {0, 100, 200, 300}
	// expected in TestChangepoints and the (0, 299) range passed to
	// splitPoint in TestSplit — confirm by counting.
	mockLong = []float64{0.2819, -0.2685, -0.8953, -1.5097, 1.3481, 1.531, 2.8379, -1.4015, 0.6418, 0.6624, 0.0099, 1.0109, -0.4435, -0.82, -1.2299, -0.7309, 0.1284, 1.5323, 0.1371, -1.3134, -1.1574, 0.7684, -1.3807, 1.0011, 0.859, 0.5829, 1.6613, 0.9224, 0.3749, -0.2098, -0.4113, 1.5754, -0.5595, -1.1993, -0.115, 1.7752, -0.44, -0.8884, 0.0012, -1.3897, -1.1599, -1.3463, 0.6435, 0.1057, 0.6183, -0.1178, 0.4337, 0.0117, -1.4005, -0.4688, 0.8254, -0.4336, 0.2166, -1.4799, 0.1862, -0.1225, -0.7959, 0.619, 0.3448, -0.4218, -0.4955, 0.7085, 0.4557, 0.163, -0.9863, -1.1137, -0.2781, 0.89, -0.0805, -1.0104, -0.5461, 0.3711, 0.4303, 1.5358, -0.0145, 0.9244, 0.72, 0.2396, 0.3121, -0.5711, 1.7378, -1.0393, 2.8768, -1.1489, 2.8247, 0.6668, -2.4452, -0.2248, -1.4506, 0.2512, -0.6104, -0.7831, -0.9317, 0.9776, -0.2541, 0.3408, 1.9379, 0.435, 0.1587, -0.2107, 3.3023, 4.9829, 3.9371, 3.3873, 3.8503, 4.4646, 2.5079, 3.603, 2.902, 2.0079, 5.3926, 2.0144, 4.8468, 2.9475, 3.1235, 3.4594, 2.714, 1.4039, 5.7932, 2.8378, 3.6907, 2.8599, 0.8576, 3.1705, 1.9901, 3.2576, 2.1397, 2.0252, 2.0436, 2.2332, 2.5238, 3.6306, 2.2374, 3.9664, 4.5682, 2.0718, 3.4245, 3.8075, 2.8893, 4.7976, 2.2743, 3.2846, 2.5925, 2.158, 5.356, 3.1835, 2.7202, 2.8468, 1.0398, 3.3804, 6.2629, 3.5585, 2.7804, 4.4909, 2.997, 2.3899, 3.3283, 2.4281, 3.1302, 1.796, 2.4216, 4.5802, 1.9428, 2.2514, 2.543, 3.302, 4.4632, 2.7864, 3.8564, 2.2068, 4.9702, 2.9449, 3.404, 2.3057, 2.6701, 4.383, 2.4721, 2.3338, 3.3218, 3.5748, 2.5871, 3.1313, 2.1113, 3.4409, 3.6108, 1.8909, 0.7175, 2.7964, 2.9346, 2.4974, 3.2543, 3.6477, 2.2657, 2.7274, 4.4656, 3.1189, 2.2356, 3.1112, 2.0153, 3.8531, -2.2742, 0.5466, -3.5621, 0.5682, 0.0692, -0.4327, -1.0646, -1.547, -0.0711, -3.0897, -1.8727, 3.7229, 0.7642, 0.1332, 0.9649, -1.7761, -2.9089, 1.3173, 2.0166, 0.1236, 0.1901, 1.0666, 1.015, 2.2579, 0.5469, 0.4192, 2.5579, 0.2985, -0.4318, -0.2802, -3.4489, 1.8516, -0.2341, -1.4547, 0.1322, -1.1368, 1.92, 1.3704, -2.1235, -2.4244, -1.8488, 1.446, 
1.4911, 3.3764, 0.002, 2.6589, -0.1781, 1.6332, 3.0993, 0.4452, 1.9963, -1.0558, 2.4251, 1.7624, -0.9189, 1.2277, -0.4107, -0.7209, -1.2883, -0.2033, -1.035, 2.1153, -0.4164, 1.4586, -0.4343, -1.2503, 0.3594, 1.7229, 1.0777, -1.6411, 1.1449, 1.0892, 0.6068, 1.109, -1.7486, 3.4536, -1.7635, 1.3976, -3.5496, -4.0665, -1.624, -0.1121, 1.2669, -1.4786, 1.4366, 0.0345, 0.5709, -0.4645, 0.6889, 1.1458, 2.628, 3.8735, -1.2663, 1.1235, 1.2226, -2.4448, 1.5101, -1.2581, -2.268, 1.6008}
)
12 | 
13 | func TestChangepoints(t *testing.T) {
14 | 	sig := 0.05
15 | 	R := 99
16 | 	minSize := 30
17 | 
18 | 	changes, err := DetectChanges(mockLong, sig, R, minSize)
19 | 	assert.Equal(t, nil, err)
20 | 	assert.Equalf(t, []int{0, 100, 200, 300}, changes, "incorrect changepoint detection")
21 | }
22 | 
23 | // the the accuracy of a single split
24 | func TestSplit(t *testing.T) {
25 | 	d := vectorDistance(mockLong)
26 | 
27 | 	split := splitPoint(0, 299, d, 30)
28 | 	assert.Equalf(t, 100, split.Index, "incorrect series split")
29 | }
30 | 


--------------------------------------------------------------------------------