├── .idea
│   ├── .name
│   ├── scopes
│   │   └── scope_settings.xml
│   ├── encodings.xml
│   ├── vcs.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── bck_stats.iml
├── bck_stats
│   ├── __init__.py
│   ├── multiclass_triangle_plot.py
│   ├── gcv_smoother.py
│   ├── avg_pred_comp.py
│   ├── react.py
│   ├── dynamic_linear_model.py
│   ├── super_pca.py
│   ├── dba.py
│   └── sklearn_estimator_suite.py
├── setup.py
├── LICENSE.txt
├── README.md
├── test_estimator_suite.py
└── .gitignore
/.idea/.name:
--------------------------------------------------------------------------------
1 | bck_stats
--------------------------------------------------------------------------------
/bck_stats/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'brandonkelly'
2 |
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | __author__ = 'brandonkelly'
2 |
3 | from distutils.core import setup
4 | setup(name='bck_stats', version='0.1', author='Brandon C. Kelly',
5 | description='Routines for various statistical and machine learning techniques.',
6 | packages=['bck_stats'])
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2014 Brandon C. Kelly
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
22 |
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | bck_stats
2 | =========
3 |
4 | Routines for implementing various statistical and machine learning techniques.
5 |
6 | Description of routines:
7 |
8 | * `super_pca`: Class for performing supervised principal components regression (Bair, E., et al. *Prediction by supervised principal components.* J. Am. Stat. Assoc. 101, 473, 2006)
9 | * `sklearn_estimator_suite`: Classes for running through a set of scikit-learn estimators, using cross-validation to choose the tuning parameters.
10 | * `react`: Classes for performing non-parametric regression in one or two dimensions based on the REACT technique (Beran, R. *REACT scatterplot smoothers: Superefficiency through basis economy.* J. Am. Stat. Assoc. 95, 449, 2000)
11 | * `multiclass_triangle_plot`: Plot the lower triangle of a scatterplot matrix, color-coding according to class label. A modified version of Dan Foreman-Mackey's triangle.py routine.
12 | * `gcv_smoother`: Perform exponential smoothing of a time series. The e-folding time scale is chosen using generalized cross-validation.
13 | * `dynamic_linear_model`: Class to perform dynamic linear regression via least-squares (Montana, G., et al. *Flexible least squares for temporal data mining and statistical arbitrage.* Expert Systems with Applications 36, 2819, 2009).
14 | * `dba`: Compute the dynamic time warping barycentric average of a set of time series (Petitjean, F., et al. *A global averaging method for dynamic time warping, with applications to clustering.* Pattern Recognition, 44, 678, 2011). Also contains a function to compute the dynamic time warping distance.
15 |
16 | -------------
17 | Installation
18 | -------------
19 |
20 | From the base directory type `python setup.py install` in a terminal.
21 |
--------------------------------------------------------------------------------
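A quick-start example for one of the modules (an editorial sketch that mirrors the `__main__` block at the bottom of `bck_stats/gcv_smoother.py`); the `bck_stats.gcv_smoother` import path assumes the package has been installed as described in the README.

```python
import numpy as np
from bck_stats.gcv_smoother import GcvExpSmoother  # assumes `python setup.py install` has been run

# noisy cosine signal, as in the module's own __main__ example
x = np.arange(500)
y = np.cos(x / 15.0) + 0.1 * np.random.standard_normal(500)

smoother = GcvExpSmoother(lookback=30)
smoother.choose_efold(y, verbose=True)  # pick the e-folding length by generalized cross-validation
ysmooth = smoother.smooth(y)            # exponentially-weighted smoothed estimate of y
```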
/test_estimator_suite.py:
--------------------------------------------------------------------------------
1 | __author__ = 'brandonkelly'
2 |
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from bck_stats.sklearn_estimator_suite import ClassificationSuite
6 | from sklearn.datasets import make_classification
7 | from sklearn.cross_validation import train_test_split
8 | from sklearn.metrics import accuracy_score
9 |
10 | n_samples = 2000
11 | n_classes = 3
12 | X, y = make_classification(n_samples, n_classes=n_classes, n_informative=10)
13 |
14 | X, X_test, y, y_test = train_test_split(X, y, train_size=0.5)
15 |
16 | # suite = ClassificationSuite(n_features=X.shape[1])
17 | #
18 | # suite.fit(X, y)
19 | # names = suite.best_scores.keys()
20 | # scores = suite.best_scores.values()
21 | #
22 | # fig, ax1 = plt.subplots()
23 | # plt.bar(np.arange(0, len(names)), scores)
24 | # xtickNames = plt.setp(ax1, xticklabels=names)
25 | # plt.setp(xtickNames, rotation=45)
26 | # plt.ylabel('Accuracy')
27 | # plt.xlabel('Model')
28 | # plt.show()
29 |
30 | # now make sure things work in parallel
31 | suite = ClassificationSuite(n_features=X.shape[1], njobs=7)
32 |
33 | suite.fit(X, y)
34 |
35 | names = suite.best_scores.keys()
36 | scores = suite.best_scores.values()
37 |
38 | # get predictions
39 | y_predict_uniform = suite.predict(X_test, weights='uniform') # uniform weightings
40 | y_predict_stacked = suite.predict(X_test)
41 |
42 | uniform_accuracy = accuracy_score(y_test, y_predict_uniform)
43 | stacked_accuracy = accuracy_score(y_test, y_predict_stacked)
44 | y_predict_all = suite.predict_all(X_test)
45 |
46 | print ''
47 | print '---'
48 | print 'Test accuracy for uniform weighting:', uniform_accuracy
49 | print 'Test accuracy for validation score weighting:', stacked_accuracy
50 | for name in y_predict_all:
51 | print 'Test accuracy for', name, ':', accuracy_score(y_test, y_predict_all[name])
52 | print '---'
53 | print ''
54 |
55 | fig, ax1 = plt.subplots()
56 | plt.bar(np.arange(0, len(names)), scores)
57 | xtickNames = plt.setp(ax1, xticklabels=names)
58 | plt.setp(xtickNames, rotation=45)
59 | plt.ylabel('Accuracy')
60 | plt.xlabel('Model')
61 | plt.show()
62 |
63 | # try using different number of grid refinements for the models
64 | n_refinements = {name: 1 for name in suite.model_names}
65 | n_refinements['GbcAutoNtrees'] = 0
66 |
67 | suite.fit(X, y, n_refinements=n_refinements)
68 |
69 | names = suite.best_scores.keys()
70 | scores = suite.best_scores.values()
71 |
72 | fig, ax1 = plt.subplots()
73 | plt.bar(np.arange(0, len(names)), scores)
74 | xtickNames = plt.setp(ax1, xticklabels=names)
75 | plt.setp(xtickNames, rotation=45)
76 | plt.ylabel('Accuracy')
77 | plt.xlabel('Model')
78 | plt.tight_layout()
79 | plt.show()
80 |
81 | tuning_ranges = {'LogisticRegression': {'C': list(np.logspace(-3.0, 0.0, 5))},
82 | 'DecisionTreeClassifier': {'max_depth': [5, 10, 20, 50, 100]},
83 | 'LinearSVC': {'C': list(np.logspace(-3.0, 0.0, 5))}}
84 |
85 | suite = ClassificationSuite(tuning_ranges=tuning_ranges, njobs=7)
86 |
87 | suite.fit(X, y, n_refinements=3)
88 |
89 | names = suite.best_scores.keys()
90 | scores = suite.best_scores.values()
91 |
92 | fig, ax1 = plt.subplots()
93 | plt.bar(np.arange(0, len(names)), scores)
94 | xtickNames = plt.setp(ax1, xticklabels=names)
95 | plt.setp(xtickNames, rotation=45)
96 | plt.ylabel('Accuracy')
97 | plt.xlabel('Model')
98 | plt.show()
99 |
100 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | #########################
2 | # .gitignore file for Xcode4 / OS X Source projects
3 | #
4 | # Version 2.0
5 | # For latest version, see: http://stackoverflow.com/questions/49478/git-ignore-file-for-xcode-projects
6 | #
7 | # 2013 updates:
8 | # - fixed the broken "save personal Schemes"
9 | #
10 | # NB: if you are storing "built" products, this WILL NOT WORK,
11 | # and you should use a different .gitignore (or none at all)
12 | # This file is for SOURCE projects, where there are many extra
13 | # files that we want to exclude
14 | #
15 | #########################
16 |
17 | #####
18 | # OS X temporary files that should never be committed
19 |
20 | .DS_Store
21 | *.swp
22 | *.lock
23 | profile
24 |
25 |
26 | ####
27 | # Xcode temporary files that should never be committed
28 | #
29 | # NB: NIB/XIB files still exist even on Storyboard projects, so we want this...
30 |
31 | *~.nib
32 |
33 |
34 | ####
35 | # Xcode build files -
36 | #
37 | # NB: slash on the end, so we only remove the FOLDER, not any files that were badly named "DerivedData"
38 |
39 | DerivedData/
40 |
41 | # NB: slash on the end, so we only remove the FOLDER, not any files that were badly named "build"
42 |
43 | build/
44 |
45 |
46 | #####
47 | # Xcode private settings (window sizes, bookmarks, breakpoints, custom executables, smart groups)
48 | #
49 | # This is complicated:
50 | #
51 | # SOMETIMES you need to put this file in version control.
52 | # Apple designed it poorly - if you use "custom executables", they are
53 | # saved in this file.
54 | # 99% of projects do NOT use those, so they do NOT want to version control this file.
55 | # ..but if you're in the 1%, comment out the line "*.pbxuser"
56 |
57 | *.pbxuser
58 | *.mode1v3
59 | *.mode2v3
60 | *.perspectivev3
61 | # NB: also, whitelist the default ones, some projects need to use these
62 | !default.pbxuser
63 | !default.mode1v3
64 | !default.mode2v3
65 | !default.perspectivev3
66 |
67 |
68 | ####
69 | # Xcode 4 - semi-personal settings
70 | #
71 | #
72 | # OPTION 1: ---------------------------------
73 | # throw away ALL personal settings (including custom schemes!
74 | # - unless they are "shared")
75 | #
76 | # NB: this is exclusive with OPTION 2 below
77 | xcuserdata
78 | *.xcworkspacedata
79 |
80 | # OPTION 2: ---------------------------------
81 | # get rid of ALL personal settings, but KEEP SOME OF THEM
82 | # - NB: you must manually uncomment the bits you want to keep
83 | #
84 | # NB: this is exclusive with OPTION 1 above
85 | #
86 | #xcuserdata/**/*
87 |
88 | # (requires option 2 above): Personal Schemes
89 | #
90 | #!xcuserdata/**/xcschemes/*
91 |
92 | ####
93 | # XCode 4 workspaces - more detailed
94 | #
95 | # Workspaces are important! They are a core feature of Xcode - don't exclude them :)
96 | #
97 | # Workspace layout is quite spammy. For reference:
98 | #
99 | # /(root)/
100 | # /(project-name).xcodeproj/
101 | # project.pbxproj
102 | # /project.xcworkspace/
103 | # contents.xcworkspacedata
104 | # /xcuserdata/
105 | # /(your name)/xcuserdatad/
106 | # UserInterfaceState.xcuserstate
107 | # /xcsshareddata/
108 | # /xcschemes/
109 | # (shared scheme name).xcscheme
110 | # /xcuserdata/
111 | # /(your name)/xcuserdatad/
112 | # (private scheme).xcscheme
113 | # xcschememanagement.plist
114 | #
115 | #
116 |
117 | ####
118 | # Xcode 4 - Deprecated classes
119 | #
120 | # Allegedly, if you manually "deprecate" your classes, they get moved here.
121 | #
122 | # We're using source-control, so this is a "feature" that we do not want!
123 |
124 | *.moved-aside
125 |
126 | ####
127 | # Cocoapods: cocoapods.org
128 | #
129 | # Ignoring these files means that whoever uses the code will first have to run:
130 | # pod install
131 | # in the App.xcodeproj directory.
132 | # This ensures the latest dependencies are used.
133 | Pods/
134 | Podfile.lock
135 |
136 | ######
137 | # Ignore .pyc files
138 | *.pyc
139 |
140 | ####
141 | # Ignore PyCharm files
142 | .idea/
143 | __pycache__/
144 |
145 | # ignore csv files
146 | # *.csv
147 |
148 | #ignore data
149 | #data/
150 | #plots/
151 |
152 | #ignore pickle files
153 | *.pickle
154 | *.p
155 | *.h5
--------------------------------------------------------------------------------
/bck_stats/multiclass_triangle_plot.py:
--------------------------------------------------------------------------------
1 | __author__ = 'brandonkelly'
2 | __notes__ = "Adapted from Dan Foreman-Mackey triangle.py module."
3 |
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | from matplotlib.ticker import MaxNLocator
7 |
8 |
9 | def multiclass_triangle(xs, classes, labels=None, verbose=True, fig=None, **kwargs):
10 | # Deal with 1D sample lists.
11 | xs = np.atleast_1d(xs)
12 | if len(xs.shape) == 1:
13 | xs = np.atleast_2d(xs)
14 | else:
15 | assert len(xs.shape) == 2, "The input sample array must be 1- or 2-D."
16 | xs = xs.T
17 | assert xs.shape[0] <= xs.shape[1], "I don't believe that you want more " \
18 | "dimensions than samples!"
19 |
20 | K = len(xs)
21 | factor = 2.0 # size of one side of one panel
22 | lbdim = 0.5 * factor # size of left/bottom margin
23 | trdim = 0.05 * factor # size of top/right margin
24 | whspace = 0.05 # w/hspace size
25 | plotdim = factor * K + factor * (K - 1.) * whspace
26 | dim = lbdim + plotdim + trdim
27 |
28 | if fig is None:
29 | fig, axes = plt.subplots(K, K, figsize=(dim, dim))
30 | else:
31 | try:
32 | axes = np.array(fig.axes).reshape((K, K))
33 | except ValueError:
34 | raise ValueError("Provided figure has {0} axes, but data has "
35 | "dimensions K={1}".format(len(fig.axes), K))
36 | lb = lbdim / dim
37 | tr = (lbdim + plotdim) / dim
38 | fig.subplots_adjust(left=lb, bottom=lb, right=tr, top=tr,
39 | wspace=whspace, hspace=whspace)
40 |
41 | extents = [[x.min(), x.max()] for x in xs]
42 |
43 | # Check for parameters that never change.
44 | m = np.array([e[0] == e[1] for e in extents], dtype=bool)
45 | if np.any(m):
46 | raise ValueError(("It looks like the parameter(s) in column(s) "
47 | "{0} have no dynamic range. Please provide an "
48 | "`extent` argument.")
49 | .format(", ".join(map("{0}".format,
50 | np.arange(len(m))[m]))))
51 |
52 | class_labels = np.unique(classes)
53 | nclasses = len(class_labels)
54 |
55 | color_list = ["Black", "DodgerBlue", "DarkOrange", "Green", "Magenta", "Red", "Brown", "Cyan"] * 10
56 |
57 | for i, x in enumerate(xs):
58 | ax = axes[i, i]
59 | # Plot the histograms.
60 | n = []
61 | for l, k in enumerate(class_labels):
62 | n_k, b_k, p_k = ax.hist(x[classes == k], bins=kwargs.get("bins", 50),
63 | range=extents[i], histtype="step",
64 | color=color_list[l], lw=2, normed=True)
65 | n.append(n_k)
66 |
67 | # Set up the axes.
68 | ax.set_xlim(extents[i])
69 | ax.set_ylim(0, 1.1 * np.max(n))
70 | ax.set_yticklabels([])
71 | ax.xaxis.set_major_locator(MaxNLocator(5))
72 |
73 | # Not so DRY.
74 | if i < K - 1:
75 | ax.set_xticklabels([])
76 | else:
77 | [l.set_rotation(45) for l in ax.get_xticklabels()]
78 | if labels is not None:
79 | ax.set_xlabel(labels[i])
80 | ax.xaxis.set_label_coords(0.5, -0.3)
81 |
82 | for j, y in enumerate(xs):
83 | ax = axes[i, j]
84 | if j > i:
85 | ax.set_visible(False)
86 | ax.set_frame_on(False)
87 | continue
88 | elif j == i:
89 | continue
90 |
91 | for l, k in enumerate(class_labels):
92 | ax.plot(y[classes == k], x[classes == k], 'o', ms=1.5, color=color_list[l], rasterized=True, alpha=0.25)
93 |
94 | extent = [[y.min(), y.max()], [x.min(), x.max()]]
95 | ax.set_xlim(extent[0])
96 | ax.set_ylim(extent[1])
97 | ax.xaxis.set_major_locator(MaxNLocator(5))
98 | ax.yaxis.set_major_locator(MaxNLocator(5))
99 |
100 | if i < K - 1:
101 | ax.set_xticklabels([])
102 | else:
103 | [l.set_rotation(45) for l in ax.get_xticklabels()]
104 | if labels is not None:
105 | ax.set_xlabel(labels[j])
106 | ax.xaxis.set_label_coords(0.5, -0.3)
107 |
108 | if j > 0:
109 | ax.set_yticklabels([])
110 | else:
111 | [l.set_rotation(45) for l in ax.get_yticklabels()]
112 | if labels is not None:
113 | ax.set_ylabel(labels[i])
114 | ax.yaxis.set_label_coords(-0.3, 0.5)
115 |
116 | return fig
--------------------------------------------------------------------------------
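A hypothetical usage sketch for `multiclass_triangle` (not part of the module): it builds a two-class, three-dimensional sample and plots the lower triangle of the scatterplot matrix, color-coded by class. It assumes the Python 2 / matplotlib stack this repository targets (the histograms use the old `normed` keyword).

```python
import numpy as np
from bck_stats.multiclass_triangle_plot import multiclass_triangle

# two Gaussian classes in three dimensions
n = 500
class0 = np.random.multivariate_normal([0.0, 0.0, 0.0], np.eye(3), n)
class1 = np.random.multivariate_normal([2.0, 1.0, -1.0], np.eye(3), n)

xs = np.vstack((class0, class1))   # shape (2n, 3): samples along rows, dimensions along columns
classes = np.repeat([0, 1], n)     # class label for each sample

fig = multiclass_triangle(xs, classes, labels=['x1', 'x2', 'x3'])
fig.savefig('triangle.png')
```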
/bck_stats/gcv_smoother.py:
--------------------------------------------------------------------------------
1 | __author__ = 'brandonkelly'
2 |
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 |
6 |
7 | class GcvExpSmoother(object):
8 | def __init__(self, lookback=30):
9 | """
10 | Constructor for class to perform exponentially-weighted average smoothing of a 1-D data set.
11 |
12 | @param lookback: The maximum look-back length to use in the smoothing. Only the data points in
13 | y[idx - lookback:idx] are used to compute the smoothed estimate of y[idx+1].
14 | """
15 | self.lookback = int(lookback) # support of exponential smoother, only use this many data points in computation
16 | self.efold = 1.0
17 | self.gcv_grid = np.zeros(2 * self.lookback)
18 | self.efold_grid = np.zeros(2 * self.lookback)
19 |
20 | def smooth(self, y):
21 | """
22 | Return a smoothed estimate of y, using the current value of self.efold for the e-folding length.
23 |
24 | @param y: The data, a 1-D array.
25 | @return: The smoothed estimate of y, a 1-D numpy array.
26 | """
27 | ysmooth, peff = self._smooth(self.efold, y)
28 | return ysmooth
29 |
30 | def weights(self, efold, lookback=None):
31 | if lookback is None:
32 | lookback = self.lookback
33 | xvalues = np.arange(0.0, lookback)
34 | weights = np.exp(-xvalues / efold)
35 | return weights[::-1] / np.sum(weights)
36 |
37 | def choose_efold(self, y, verbose=False):
38 | """
39 | Choose the optimal e-folding length of the exponential smoothing kernel using generalized cross-validation.
40 |
41 | @param y: The training set, a 1-D array.
42 | @param verbose: If true, then print the chosen smoothing length.
43 | """
44 | ngrid = 20
45 | efold_grid = np.logspace(-1.0, np.log10(self.lookback * 2.0), ngrid)
46 | gcv_grid = np.zeros(efold_grid.size)
47 | for i in xrange(efold_grid.size):
48 | smoothed_y, peffective = self._smooth(efold_grid[i], y)
49 | gcv_grid[i] = gcv_error(y, smoothed_y, peffective)
50 |
51 | # choose e-folding length of smoother to minimize the generalized cross-validation error
52 | self.efold = efold_grid[gcv_grid.argmin()]
53 | if verbose:
54 | print 'E-folding length chosen to be', self.efold
55 |
56 | # save the grids
57 | self.efold_grid = efold_grid
58 | self.gcv_grid = gcv_grid
59 |
60 | def _smooth(self, efold, y):
61 | if y.size <= self.lookback:
62 |     raise ValueError('y must have more than self.lookback (= %d) elements.' % self.lookback)
65 |
66 | ysmooth = np.zeros(y.size)
67 | ysmooth[0] = y[0]
68 |
69 | peffective = 0.0 # trace of the smoothing matrix, the effective number of parameters
70 |
71 | # treat the first self.lookback data points separately, since the baseline is shorter
72 | for i in xrange(1, self.lookback):
73 | weights = self.weights(efold, lookback=i)
74 | ysmooth[i] = weights.dot(y[0:i])
75 | peffective += weights[-1]
76 |
77 | weights = self.weights(efold)
78 | for i in xrange(y.size - self.lookback - 1):
79 | idx = self.lookback + i
80 | # estimate current y as exponentially-weighted average of previous self.lookback y-values
81 | ysmooth[idx] = weights.dot(y[idx - self.lookback:idx])
82 | peffective += weights[-1]
83 |
84 | ysmooth[-1] = weights.dot(y[y.size - self.lookback - 1:-1])
85 | peffective += weights[-1]
86 |
87 | return ysmooth, peffective
88 |
89 |
90 | def gcv_error(y, ysmooth, peffective):
91 | """
92 | Compute generalized cross-validation error.
93 |
94 | @param y: The numpy array of y-values.
95 | @param ysmooth: The smoothed numpy array of y-values.
96 | @param peffective: The effective number of parameters of the smoothing matrix, given by its trace.
97 | @return: The generalized cross-validation error (L2-loss function).
98 | """
99 | gcv = np.mean((y - ysmooth) ** 2) / (1.0 - peffective / y.size) ** 2
100 | return gcv
101 |
102 |
103 | if __name__ == "__main__":
104 | # example usage
105 | x = np.arange(500)
106 | y = np.cos(x / 15.0) + 0.1 * np.random.standard_normal(500)
107 |
108 | gcv = GcvExpSmoother()
109 | gcv.choose_efold(y, verbose=True)
110 | ysmooth = gcv.smooth(y)
111 |
112 | plt.semilogy(gcv.efold_grid, gcv.gcv_grid)
113 | plt.xlabel('E-folding length')
114 | plt.ylabel('GCV Error')
115 | plt.show()
116 |
117 | plt.clf()
118 | plt.plot(x, y, '.', label='Data')
119 | plt.plot(x, ysmooth, label='Smoothed', lw=2)
120 | plt.legend()
121 | plt.show()
122 |
--------------------------------------------------------------------------------
/bck_stats/avg_pred_comp.py:
--------------------------------------------------------------------------------
1 | __author__ = 'brandonkelly'
2 |
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from sklearn.neighbors import NearestNeighbors
6 | from scipy.spatial.distance import cdist
7 | from scipy import linalg
8 | import multiprocessing
9 |
10 |
11 | def distance_matrix(Xvals):
12 | covar = np.cov(Xvals, rowvar=0)
13 | covar_inv = linalg.inv(covar)
14 | Dmat = cdist(Xvals, Xvals, metric='mahalanobis', VI=covar_inv)
15 |
16 | return Dmat
17 |
18 |
19 | def impact_single_theta(args):
20 | predict, theta, X, p_idx, weights, predict_args = args
21 | # first compute the matrix of model predictions:
22 | # y_predict[i, j] = E(y|u_i, v_j, theta)
23 | ndata = X.shape[0]
24 | X_copy = X.copy()
25 | u = X[:, p_idx] # the active predictor
26 | y_predict = np.zeros((ndata, ndata))
27 | for i in range(ndata):
28 | X_copy[:, p_idx] = u[i]
29 | y_predict[i] = predict(X_copy, theta, *predict_args)
30 |
31 | # get matrix of signs of transitions
32 | transition_sign = np.zeros((ndata, ndata))
33 | for j in range(ndata):
34 | transition_sign[:, j] = np.sign(u - u[j])
35 |
36 | u1, u2 = np.meshgrid(u, u)
37 | transition_sign = np.sign(u2 - u1)
38 | y_predict_diff = y_predict - np.outer(np.ones(ndata), y_predict.diagonal())
39 | numer = np.sum(weights * y_predict_diff * transition_sign) # signed version
40 | abs_numer = np.sum(weights * np.abs(y_predict_diff)) # absolute version
41 | # denom = np.sum(weights * (u2 - u1) * np.sign(u2 - u1))
42 | denom = np.sum(weights)
43 |
44 | return numer / denom, abs_numer / denom
45 |
46 |
47 | def impact(predict, theta, X, predictors=None, predict_args=None, nneighbors=None, nx=None, ntheta=None,
48 | mahalanobis_constant=1.0, n_jobs=1):
49 |
50 | if n_jobs < 0:
51 | n_jobs = multiprocessing.cpu_count()
52 | if n_jobs > 1:
53 | pool = multiprocessing.Pool(n_jobs)
54 |
55 | if predictors is None:
56 | # calculate the impact for all the predictors
57 | predictors = np.arange(X.shape[1])
58 |
59 | if nx is not None:
60 | # use only a subset of the data points
61 | subset_idx = np.random.permutation(X.shape[0])[:nx]
62 | X = X[subset_idx]
63 | else:
64 | nx = X.shape[0]
65 | if ntheta is not None:
66 | # use only a subset of the theta samples
67 | subset_idx = np.random.permutation(theta.shape[0])[:ntheta]
68 | theta = theta[subset_idx]
69 | else:
70 | ntheta = theta.shape[0]
71 | if nneighbors is None:
72 | # use all of the neighbors when computing the weights
73 | nneighbors = X.shape[0]
74 |
75 | # first compute the distance matrix
76 | Dmat = distance_matrix(X)
77 | weights0 = 1.0 / (mahalanobis_constant + Dmat)
78 |
79 | # get the sets of nearest neighbors
80 | knn = NearestNeighbors(n_neighbors=nneighbors)
81 | knn.fit(X)
82 | nn_idx = knn.kneighbors(X, return_distance=False)
83 |
84 | weights = np.zeros_like(weights0)
85 | for i in range(weights.shape[0]):
86 | # data points outside of K nearest neighbors have weight of zero
87 | weights[nn_idx[i], i] = weights0[nn_idx[i], i]
88 |
89 | weights /= weights.sum(axis=0)  # normalize weights so that each data point contributes equally to the impact
90 |
91 | impacts = np.zeros(len(predictors))
92 | abs_impacts = np.zeros_like(impacts)
93 | impact_sigmas = np.zeros_like(impacts)
94 | abs_impact_sigma = np.zeros_like(impacts)
95 | print 'Doing predictor'
96 | for p_idx in predictors:
97 | print p_idx, '...'
98 | args = []
99 | for s in range(ntheta):
100 | args.append([predict, theta[s], X, p_idx, weights, predict_args])
101 | if n_jobs == 1:
102 | results = map(impact_single_theta, args)
103 | else:
104 | results = pool.map(impact_single_theta, args)
105 | results = np.array(results)
106 | impacts[p_idx] = np.mean(results[:, 0])
107 | impact_sigmas[p_idx] = np.std(results[:, 0])
108 | abs_impacts[p_idx] = np.mean(results[:, 1])
109 | abs_impact_sigma[p_idx] = np.std(results[:, 1])
110 |
111 | # impact_theta = np.zeros(theta.shape)
112 | # impact_theta_abs = np.zeros_like(impact_theta)
113 | # for s in range(ntheta):
114 | # impact_s, abs_impact_s = impact_single_theta(predict, theta[s], X, p_idx, weights, predict_args=predict_args)
115 | # impact_theta[s] = impact_s
116 | # impact_theta_abs[s] = abs_impact_s
117 | # impacts[p_idx] = np.mean(impact_theta)
118 | # impact_sigmas[p_idx] = np.std(impact_theta)
119 | # abs_impacts[p_idx] = np.mean(impact_theta_abs)
120 | # abs_impact_sigma[p_idx] = np.std(impact_theta_abs)
121 |
122 | return impacts, impact_sigmas, abs_impacts, abs_impact_sigma
123 |
124 |
125 | if __name__ == "__main__":
126 | # test and example usage
127 | ndata = 200
128 | beta = np.array([1.0, 2.0, -0.6, 0.1])
129 | sigma = 0.1
130 | X = np.column_stack((np.ones(ndata), np.random.standard_normal(ndata), np.random.uniform(0.0, 5.0, ndata),
131 | np.random.standard_cauchy(ndata)))
132 | y = X.dot(beta) + sigma * np.random.standard_normal(ndata)
133 |
134 | XX_inv = linalg.inv(X.T.dot(X))
135 | bhat = XX_inv.dot(X.T.dot(y))
136 | bcov = XX_inv * sigma * sigma
137 |
138 | nsamples = 100
139 | betas = np.random.multivariate_normal(bhat, bcov, nsamples)
140 | betas = betas[:, 1:] # ignore constant term
141 |
142 | def linear_mean(X, beta, constant):
143 | ymean = X.dot(beta) + constant
144 | return ymean
145 |
146 | # don't include constant term
147 | impacts, isigmas, abs_impacts, aisigmas = \
148 | impact(linear_mean, betas, X[:, 1:], predict_args=(bhat[0],), nneighbors=20, n_jobs=4)
149 | print impacts
150 | sorted_idx = np.argsort(np.abs(impacts))
151 |
152 | labels = np.array(['x1', 'x2', 'x3'])[sorted_idx]
153 |
154 | pos = np.arange(sorted_idx.shape[0]) + .5
155 | plt.barh(pos, impacts[sorted_idx], align='center', xerr=isigmas[sorted_idx], alpha=0.5)
156 | plt.yticks(pos, labels)
157 | plt.xlabel('Impact')
158 | plt.show()
--------------------------------------------------------------------------------
/bck_stats/react.py:
--------------------------------------------------------------------------------
1 | __author__ = 'brandonkelly'
2 |
3 | import numpy as np
4 | from sklearn.isotonic import IsotonicRegression
5 |
6 |
7 | class REACT(object):
8 |
9 | def __init__(self, basis='DCT', n_components=None, method='monotone'):
10 | if basis.lower() not in ['dct', 'manual']:
11 |     raise ValueError('Input basis must be either DCT or manual.')
12 |
13 | if method.lower() not in ['monotone', 'nss']:
14 |     raise ValueError('method must be either monotone or nss.')
19 |
20 | self.basis = basis
21 | self.nfreq = 1
22 | self.ncomp = 1
23 | self.n_components = n_components
24 | self.method = method
25 | self.coefs = np.zeros(1)
26 | self.shrinkage_factors = np.zeros(1)
27 |
28 | def fit(self, y, X=None, sigsqr=None):
29 |
30 | # check inputs
31 | if X is None:
32 | # build the discrete cosine basis
33 | if self.n_components is None:
34 | n_components = len(y)
35 | else:
36 | n_components = self.n_components
37 | X = self.build_dct(len(y), n_components)
38 | self.nfreq = len(y)
39 | self.ncomp = n_components
40 | else:
41 | if self.n_components is None:
42 | n_components = X.shape[1]
43 | else:
44 | n_components = self.n_components
45 | self.ncomp = n_components
46 |
47 | if n_components > len(y):
48 |     raise ValueError('Number of components cannot exceed the length of y.')
51 |
52 | self.coefs = np.dot(X.T, y)
53 |
54 | if sigsqr is None:
55 | # estimate noise variance using first difference estimator
56 | sigsqr = np.sum((y[1:] - y[:-1]) ** 2) / (2.0 * (len(y) - 1))
57 |
58 | if self.method == 'monotone':
59 | # use monotone shrinkage on the basis coefficients
60 | self._set_shrinkage_factors(sigsqr)
61 | else:
62 | # use nested subset selection to choose the order of the basis expansion
63 | self._set_nss_order(sigsqr)
64 |
65 | self.coefs *= self.shrinkage_factors
66 |
67 | ysmooth = X.dot(self.coefs)
68 | return ysmooth
69 |
70 | @staticmethod
71 | def build_dct(n, p):
72 | rows, columns = np.mgrid[:n, :p]
73 | U = np.cos(np.pi * rows * columns / (n - 1.0))
74 | row_norm = 2 * np.ones(n)
75 | row_norm[0] = 1.0
76 | row_norm[-1] = 1.0
77 | col_norm = 2 * np.ones(p)
78 | col_norm[0] = 1.0
79 | if p == n:
80 | col_norm[-1] = 1.0
81 | U *= 0.5 * np.sqrt(2.0 * np.outer(row_norm, col_norm) / (n - 1))
82 |
83 | return U
84 |
85 | def interpolate(self, x_idx):
86 | if self.basis.lower() != 'dct':
87 |     raise ValueError('Interpolation only available for the DCT basis.')
90 |
91 | n = self.nfreq
92 | p = self.ncomp
93 | cols = np.arange(p)
94 | row_norm = 2 * np.ones(n)
95 | row_norm[0] = 1.0
96 | row_norm[-1] = 1.0
97 | col_norm = 2 * np.ones(p)
98 | col_norm[0] = 1.0
99 | U = np.cos(np.pi * np.outer(x_idx / n, cols))
100 | U *= 0.5 * np.sqrt(2.0 * np.outer(row_norm, col_norm) / (n - 1))
101 | y_interp = U.dot(self.coefs)
102 | return y_interp
103 |
104 | def _set_shrinkage_factors(self, sigsqr):
105 | coefs_snr = (self.coefs ** 2 - sigsqr) / self.coefs ** 2 # signal-to-noise ratio of the coefficients
106 | coefs_snr[coefs_snr < 0] = 0.0
107 | x = np.arange(len(coefs_snr))
108 | weights = self.coefs ** 2
109 | self.shrinkage_factors = \
110 | IsotonicRegression(y_min=0.0, y_max=1.0, increasing=False).fit_transform(x, coefs_snr, weights)
111 |
112 | def _set_nss_order(self, sigsqr):
113 | coefs_snr = (self.coefs ** 2 - sigsqr) / self.coefs ** 2 # signal-to-noise ratio of the coefficients
114 | coefs_snr[coefs_snr < 0] = 0.0
115 | risk = np.empty(len(coefs_snr))
116 | shrinkage_factor = np.zeros(len(coefs_snr))
117 | for j in xrange(len(risk)):
118 | shrinkage_factor[:j+1] = 1.0
119 | risk[j] = np.mean((shrinkage_factor - coefs_snr) ** 2 * self.coefs ** 2)
120 | best_order = risk.argmin()
121 | self.shrinkage_factors = np.ones(len(coefs_snr))
122 | self.shrinkage_factors[best_order:] = 0.0 # only keep first best_order basis coefficients
123 |
124 |
125 | class REACT2D(REACT):
126 | def __init__(self, max_order=None, method='monotone'):
127 | # currently only support the DCT for 2-D data
128 | super(REACT2D, self).__init__('DCT', max_order, method)
129 | self.row_order = np.zeros(1)
130 | self.col_order = np.zeros(1)
131 |
132 | def interpolate(self, x_idx):
133 | print 'Interpolation not currently available for REACT2D'
137 |
138 | @staticmethod
139 | def build_dct(nrows, ncols, p):
140 | # first build 1-D basis vectors
141 | Urows = super(REACT2D, REACT2D).build_dct(nrows, p)
142 | Ucols = super(REACT2D, REACT2D).build_dct(ncols, p)
143 | # now build 2-d basis as outer products of 1-d basis vectors
144 | row_order, col_order = np.mgrid[:p, :p]
145 | row_order = row_order.ravel() + 1
146 | col_order = col_order.ravel() + 1
147 | # sort the basis images by the sum of squares of their orders
148 | sqr_order = row_order ** 2 + col_order ** 2
149 | s_idx = np.argsort(sqr_order)
150 | row_order = row_order[s_idx]
151 | col_order = col_order[s_idx]
152 | U = np.empty((nrows * ncols, len(row_order)))
153 | for j in xrange(len(row_order)):
154 | U[:, j] = np.outer(Urows[:, row_order[j]-1], Ucols[:, col_order[j]-1]).ravel()
155 |
156 | return U
157 |
158 | def fit(self, y, sigsqr):
159 | # build the discrete cosine basis
160 | if self.n_components is None:
161 | components_from_y = True
162 | self.n_components = min(y.shape)
163 | else:
164 | components_from_y = False
165 |
166 | if self.n_components > min(y.shape):
167 |     raise ValueError('Number of components cannot exceed the smallest dimension of y.')
170 |
171 | # build the 2-D DCT here and then feed into REACT.fit()
172 | X = self.build_dct(y.shape[0], y.shape[1], self.n_components)
173 |
174 | ysmooth = super(REACT2D, self).fit(y.ravel(), X, sigsqr)
175 |
176 | # save the orders of the basis functions
177 | row_order, col_order = np.mgrid[:self.n_components, :self.n_components]
178 | row_order = row_order.ravel() + 1
179 | col_order = col_order.ravel() + 1
180 | # sort the basis images by the sum of squares of their orders
181 | sqr_order = row_order ** 2 + col_order ** 2
182 | s_idx = np.argsort(sqr_order)
183 | self.row_order = row_order[s_idx]
184 | self.col_order = col_order[s_idx]
185 |
186 | if components_from_y:
187 | # return n_components to value from constructor
188 | self.n_components = None
189 |
190 | return np.reshape(ysmooth, y.shape)
191 |
--------------------------------------------------------------------------------
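The module above does not ship a `__main__` example, so here is a hypothetical sketch (not part of the original file) showing `REACT` smoothing a noisy signal with the discrete cosine basis and monotone shrinkage. When `sigsqr` is not given, the noise variance is estimated internally from first differences; the sketch assumes the Python 2 / scikit-learn stack this repository targets.

```python
import numpy as np
from bck_stats.react import REACT

# noisy sinusoid sampled on an even grid
n = 512
t = np.arange(n)
y = np.sin(t / 40.0) + 0.2 * np.random.standard_normal(n)

model = REACT(basis='DCT', method='monotone')
ysmooth = model.fit(y)  # DCT basis built internally; sigsqr estimated from first differences
```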
/bck_stats/dynamic_linear_model.py:
--------------------------------------------------------------------------------
1 | __author__ = 'brandonkelly'
2 |
3 | import numpy as np
4 | import pykalman
5 | import matplotlib.pyplot as plt
6 | import multiprocessing
7 |
8 |
9 | def mae_loss(y, yfit):
10 | return np.mean(np.abs(y - yfit))
11 |
12 |
13 | def _train_predict_dlm(args):
14 | """
15 | Helper function to train and predict the dynamic linear model for a train and test set. Separated from the main
16 | class to enable the use of the multiprocessing module. This should not be called directly.
17 | """
18 | delta, X, y, ntrain, loss = args
19 | print delta
20 | dlm = DynamicLinearModel(include_constant=False)
21 |
22 | # first fit using the training data
23 | dlm.fit(X[:ntrain], y[:ntrain], delta=delta, method='filter')
24 |
25 | # now run the filter on the whole data set
26 | ntime, pfeat = X.shape
27 | observation_matrix = X.reshape((ntime, 1, pfeat))
28 | k = dlm.kalman
29 | kalman = pykalman.KalmanFilter(transition_matrices=k.transition_matrices,
30 | observation_matrices=observation_matrix,
31 | observation_offsets=k.observation_offsets,
32 | transition_offsets=k.transition_offsets,
33 | observation_covariance=k.observation_covariance,
34 | transition_covariance=k.transition_covariance,
35 | initial_state_mean=k.initial_state_mean,
36 | initial_state_covariance=k.initial_state_covariance)
37 |
38 | beta, bcov = kalman.filter(y)
39 |
40 | # predict the y-values in the test set
41 | yfit = np.sum(beta[ntrain-1:-1] * X[ntrain-1:-1], axis=1)
42 |
43 | test_error = loss(y[ntrain:], yfit)
44 |
45 | return test_error
46 |
47 |
48 | class DynamicLinearModel(object):
49 | def __init__(self, include_constant=True):
50 | """
51 | Constructor for linear regression model with dynamic coefficients.
52 | """
53 | self.delta_grid = np.zeros(10)
54 | self.test_grid = np.zeros(10)
55 | self.delta = 1e-4
56 | self.test_error_ = 1.0
57 | self.kalman = pykalman.KalmanFilter()
58 | self.beta = np.zeros(2)
59 | self.beta_cov = np.identity(2)
60 | self.current_beta = np.zeros(2)
61 | self.current_bcov = np.identity(2)
62 | self.include_constant = include_constant
63 |
64 | @staticmethod
65 | def add_constant_(X):
66 | """
67 | Add a constant to the linear model by prepending a column of ones to the feature array.
68 |
69 | @param X: The feature array. Note that it will be overwritten, and the overwritten array will be returned.
70 | """
71 | if X.ndim == 1:
72 | # treat vector-valued X differently
73 | X = np.insert(X[:, np.newaxis], 0, np.ones(len(X)), axis=1)
74 | else:
75 | X = np.insert(X, 0, np.ones(X.shape[0]), axis=1)
76 |
77 | return X
78 |
79 | def fit(self, X, y, method='smoother', delta=None, include_constant=None):
80 | """
81 | Fit the coefficients for the dynamic linear model.
82 |
83 | @param method: The method used to estimate the dynamic coefficients, either 'smoother' or 'filter'. If
84 | 'smoother', then the Kalman Smoother is used, otherwise the Kalman Filter will be used. The two differ
85 | in the fact that the Kalman Smoother uses both future and past data, while the Kalman Filter only uses
86 | past data.
87 | @param X: The time-varying covariates, an (ntime, pfeat) array.
88 | @param y: The time-varying response, a 1-D array with ntime elements.
89 | @param delta: The regularization parameters on the time variation of the coefficients. Default is
90 | self.delta.
91 | @param include_constant: Boolean, if true then include a constant in the regression model.
92 | """
93 | if method.lower() not in ['smoother', 'filter']:
94 |     raise ValueError("method must be either 'smoother' or 'filter'.")
97 |
98 | if delta is None:
99 | delta = self.delta
100 | else:
101 | self.delta = delta
102 |
103 | if include_constant is None:
104 | include_constant = self.include_constant
105 | else:
106 | self.include_constant = include_constant
107 |
108 | if include_constant:
109 | Xtemp = self.add_constant_(X.copy())
110 | else:
111 | Xtemp = X.copy()
112 |
113 | ntime, pfeat = Xtemp.shape
114 |
115 | observation_matrix = Xtemp.reshape((ntime, 1, pfeat))
116 | observation_offset = np.array([0.0])
117 |
118 | transition_matrix = np.identity(pfeat)
119 | transition_offset = np.zeros(pfeat)
120 |
121 | mu = (1.0 - delta) / delta
122 | # Var(beta_t - beta_{t-1}) = 1.0 / mu
123 | transition_covariance = np.identity(pfeat) / mu
124 |
125 | # parameters to be estimated using MLE
126 | em_vars = ['initial_state_mean', 'initial_state_covariance']
127 | kalman = pykalman.KalmanFilter(transition_matrices=transition_matrix, em_vars=em_vars,
128 | observation_matrices=observation_matrix,
129 | observation_offsets=observation_offset, transition_offsets=transition_offset,
130 | observation_covariance=np.array([1.0]),
131 | transition_covariance=transition_covariance)
132 |
133 | kalman.em(y)
134 | if method == 'smoother':
135 | beta, beta_covar = kalman.smooth(y)
136 | else:
137 | beta, beta_covar = kalman.filter(y)
138 |
139 | self.beta = beta
140 | self.beta_cov = beta_covar
141 | self.current_beta = beta[-1]
142 | self.current_bcov = beta_covar[-1]
143 | self.kalman = kalman
144 |
145 | def update(self, y, x):
146 | """
147 | Update the linear regression coefficients given the new values of the response and features.
148 |
149 | @param y: The new response value, a scalar.
150 | @param x: The new feature vector.
151 | """
152 | if self.include_constant:
153 | observation_matrix = np.insert(x, 0, 1.0)
154 | else:
155 | observation_matrix = x.copy()
156 |
157 | pfeat = observation_matrix.size
158 | observation_matrix = observation_matrix.reshape((1, pfeat))
159 |
160 | self.current_beta, self.current_bcov = \
161 | self.kalman.filter_update(self.current_beta, self.current_bcov, observation=y,
162 | observation_matrix=observation_matrix)
163 |
164 | self.beta = np.vstack((self.beta, self.current_beta))
165 | self.beta_cov = np.dstack((self.beta_cov.T, self.current_bcov)).T
166 |
167 | def predict(self, x):
168 | """
169 | Predict a value of the response given the input feature array and the current value of the coefficients.
170 |
171 | @param x: The input feature array.
172 | """
173 | if self.include_constant:
174 | xpredict = np.insert(x, 0, 1.0)
175 | else:
176 | xpredict = x
177 |
178 | return np.sum(self.current_beta * xpredict)
179 |
180 | def choose_delta(self, X, y, test_fraction=0.5, verbose=False, ndeltas=20, include_constant=True, loss=mae_loss,
181 | njobs=1):
182 | """
183 | Choose the optimal regularization parameters for the linear smoother coefficients by minimizing an input loss
184 | function on a test set.
185 |
186 | @param X: The time-varying covariates, an (ntime, pfeat) array.
187 | @param y: The training set, a 1-D array.
188 | @param ndeltas: The number of grid points to use for the regularization parameter.
189 | @param test_fraction: The fraction of the input data to use as the test set, default is half.
190 | @param verbose: If true, then print the chosen regularization parameter and test error.
191 | @param include_constant: Boolean, include a constant in the linear model?
192 | @param loss: The loss function to use for evaluating the test error when choosing the regularization parameter.
193 | Must be of the form result = loss(ytest, yfit).
194 | @param njobs: The number of processors to use when doing the search over delta. If njobs = -1, all processors
195 | will be used.
196 | """
197 |
198 | if include_constant is None:
199 | include_constant = self.include_constant
200 | else:
201 | self.include_constant = include_constant
202 |
203 | if njobs < 0:
204 | njobs = multiprocessing.cpu_count()
205 |
206 | pool = multiprocessing.Pool(njobs)
207 | pool.map(int, range(njobs)) # warm up the pool
208 |
209 | # split y into training and test sets
210 | ntime = y.size
211 | ntest = int(ntime * test_fraction)
212 | ntrain = ntime - ntest
213 | if X.ndim == 1:
214 | XX = X.reshape((X.size, 1))
215 | else:
216 | XX = X.copy()
217 |
218 | if include_constant:
219 | # add column of ones to feature array
220 | XX = self.add_constant_(XX)
221 |
222 | # grid of delta (regularization) values, between 1e-4 and 1.0.
223 | delta_grid = np.logspace(-4.0, np.log10(0.95), ndeltas)
224 |
225 | args = []
226 | for d in xrange(ndeltas):
227 | args.append((delta_grid[d], XX, y, ntrain, loss))
228 |
229 | if verbose:
230 | print 'Computing test errors...'
231 |
232 | if njobs == 1:
233 | test_grid = map(_train_predict_dlm, args)
234 | else:
235 | test_grid = pool.map(_train_predict_dlm, args)
236 |
237 | test_grid = np.array(test_grid)
238 | self.delta = delta_grid[test_grid.argmin()]
239 | self.test_error_ = test_grid.min()
240 |
241 | if verbose:
242 | print 'Best delta is', self.delta, 'and has a test error of', test_grid.min()
243 |
244 | self.delta_grid = delta_grid
245 | self.test_grid = test_grid
246 |
247 |
248 | if __name__ == "__main__":
249 | # run test from Montana et al. (2009)
250 | nx = 1000
251 | x = np.zeros(nx)
252 | x[0] = np.random.uniform(-2.0, 2.0)
253 | for i in xrange(1, nx):
254 | x[i] = 0.8 * x[i-1] + np.random.uniform(-2.0, 2.0)
255 |
256 | y = np.zeros(x.size)
257 | beta = np.zeros(x.size)
258 | beta[0] = 2.0
259 | for i in xrange(1, x.size):
260 | if i < 300:
261 | beta[i] = beta[i-1] + 0.1 * np.random.standard_normal()
262 | elif i == 300:
263 | beta[i] = beta[i-1] + 4.0
264 | elif (i > 300) and (i < 600):
265 | beta[i] = beta[i-1] + 0.001 * np.random.standard_normal()
266 | else:
267 | beta[i] = 5.0 * np.sin(i / 10.0) + np.random.uniform(-2.0, 2.0)
268 |
269 | y = 2.0 + beta * x + 2.0 * np.random.standard_normal(nx)
270 |
271 | plt.plot(beta)
272 | plt.ylabel(r'$\beta$')
273 | plt.show()
274 | plt.clf()
275 |
276 | plt.plot(x, y, '.')
277 | plt.ylabel('y')
278 | plt.xlabel('x')
279 | plt.show()
280 | plt.clf()
281 |
282 | plt.plot(y)
283 | plt.ylabel('y')
284 | plt.show()
285 | plt.clf()
286 |
287 | dynamic = DynamicLinearModel(include_constant=False)
288 | dynamic.choose_delta(np.ones(len(y)), y, test_fraction=0.5, verbose=True, ndeltas=20, njobs=5)
289 | dynamic.fit(np.ones(len(y)), y)
290 |
291 | plt.semilogx(dynamic.delta_grid, dynamic.test_grid)
292 | plt.xlabel('Regularization (delta)')
293 | plt.ylabel('Mean Absolute Test Error')
294 | plt.show()
295 |
296 | plt.clf()
297 | for i in xrange(1):
298 | plt.subplot(2, 1, i + 1)
299 | plt.plot(y, '.')
300 | plt.plot(dynamic.beta[:, i])
301 | plt.ylabel(r"$\beta_" + str(i) + '$')
302 | if i == 1:
303 | plt.plot(beta, 'k')
304 | plt.show()
--------------------------------------------------------------------------------
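The `__main__` block above exercises `choose_delta` and `fit`; the sketch below (hypothetical, not part of the module) shows the streaming `update`/`predict` API for one-step-ahead prediction after an initial fit.

```python
import numpy as np
from bck_stats.dynamic_linear_model import DynamicLinearModel

# simulated regression with a slowly drifting coefficient
n = 400
x = np.random.standard_normal(n)
beta = np.cumsum(0.05 * np.random.standard_normal(n))
y = 1.0 + beta * x + 0.1 * np.random.standard_normal(n)

dlm = DynamicLinearModel(include_constant=True)
dlm.fit(x[:300], y[:300], method='filter', delta=0.01)  # fit on the first 300 observations

yhat = np.zeros(100)
for t in range(300, 400):
    x_t = np.atleast_1d(x[t])          # feature "vector" with a single covariate
    yhat[t - 300] = dlm.predict(x_t)   # predict with the current coefficients
    dlm.update(y[t], x_t)              # then fold in the new observation
```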
/bck_stats/super_pca.py:
--------------------------------------------------------------------------------
1 | __author__ = 'brandonkelly'
2 |
3 | import numpy as np
4 | from sklearn import cross_validation, metrics
5 | from sklearn.decomposition import PCA
6 | import multiprocessing
7 | import copy
8 | import matplotlib.pyplot as plt
9 |
10 |
11 | class SupervisedPCABase(object):
12 |
13 | def __init__(self, regressor, max_components=None, n_components=1, whiten=True):
14 | """
15 | Base class for performing supervised principal component regression. This is useful for cases where the number
16 | of inputs (features) is greater than the number of data points.
17 |
18 | @param regressor: The object that will perform the regression. The following members must be defined for this
19 | object:
20 |
21 | regressor.fit(X, y) : Fits the regression model y = f(X).
22 | regressor.predict(X) : Compute the prediction y = f(X).
23 | regressor.coef_score_ : The score of each parameter, used for ranking the most important features when
24 | computing the reduced feature space. In general this will be the absolute value of
25 | the coefficient value divided by its standard error. Note that this should *not*
26 | include the intercept.
27 |
28 | @param max_components: Maximum number of components to search over. The default is p.
29 | @param n_components: The number of reduced data matrix PCA components to use in the regression.
30 | @param whiten: Remove differences in variance among the components, i.e., principal components will have unit
31 | variance
32 | """
33 | self.regressor = regressor
34 | self.max_components = max_components
35 | self.pca_object = PCA(n_components=n_components, whiten=whiten)
36 | self.n_components = n_components
37 | self.whiten = whiten
38 | self.n_reduced = 0
39 | self.sort_idx = np.zeros(1)
40 |
41 | def _compute_stnd_coefs(self, X, y):
42 | """
43 | Compute the standardized regression coefficients, up to a common scaling factor.
44 |
45 | @param X: The matrix of inputs, shape (n,p).
46 | @param y: The array of response values, size n.
47 | @return: The standardized regression coefficients, size p.
48 | """
49 | p = X.shape[1]
50 | scoefs = np.zeros(p)
51 | for j in xrange(p):
52 | thisX = X[:, j]
53 | self.regressor.fit(thisX[:, np.newaxis], y)
54 | scoefs[j] = self.regressor.coef_score_
55 |
56 | return scoefs
57 |
58 | def _get_reduced_features(self, X, coefs, pmax):
59 | """
60 | Return the data projected onto the first n_components principal components computed using the reduced feature
61 | space.
62 |
63 | @param X: The array of inputs, shape (n, p).
64 | @param coefs: The array of standardized coefficients, size p.
65 | @param pmax: The maximum number of features to use in the reduced feature space PCA.
66 | @return: The data projected onto the reduced feature space PCA, shape (n, self.n_components).
67 | """
68 | sort_idx = np.argsort(coefs)[::-1]
69 | sort_idx = sort_idx[:pmax]
70 | self.pca_object.fit(X[:, sort_idx])
71 | X_reduced = self.pca_object.transform(X[:, sort_idx])
72 |
73 | return X_reduced, sort_idx
74 |
75 | def fit(self, X, y, n_reduced):
76 | """
77 | Perform the regression using the first self.n_components principal components from the reduced feature space.
78 | Note that this will call self.regressor.fit(X,y) to perform the regression.
79 |
80 | @param X: The array of inputs, shape (n, p).
81 | @param y: The array of response values, size n.
82 | @param n_reduced: The number of features to use in the reduced feature space.
83 | """
84 | scoefs = self._compute_stnd_coefs(X, y)
85 | X_reduced, sort_idx = self._get_reduced_features(X, scoefs, n_reduced)
86 | self.sort_idx = sort_idx
87 | self.regressor.fit(X_reduced, y)
88 |
89 | def predict(self, X):
90 | """
91 | Predict the value y = f(X) based on the PCA using the reduced feature space, based on the most recent call to
92 | self.fit(X, y, n_reduced).
93 |
94 | @param X: The array of inputs, shape (n, p).
95 | @return: The predicted values of the response.
96 | """
97 | X_reduced = self.pca_object.transform(X[:, self.sort_idx])
98 | y_predict = self.regressor.predict(X_reduced)
99 | return y_predict
100 |
101 |
102 | def launch_coef_scores(args):
103 | """
104 | Wrapper to compute the standardized scores of the regression coefficients, used when computing the number of
105 | features in the reduced parameter set.
106 |
107 | @param args: Tuple containing the instance of SupervisedPCABase, feature matrix and response array.
108 | @return: The standardized scores of the coefficients.
109 | """
110 | spca, X, y = args
111 | scoefs = spca._compute_stnd_coefs(X, y)
112 | return scoefs
113 |
114 |
115 | def compute_cv_prediction(args):
116 | """
117 | Internal method to get predictions based on supervised PCA regression for each cross-validation fold. Need this
118 | format in order to compute the predictions for the CV folds in parallel.
119 | """
120 | spca, X_train, y_train, X_test, n_reduced, scoef = args
121 | SPCA = SupervisedPCABase(copy.deepcopy(spca.regressor), spca.max_components, spca.n_components, spca.whiten)
122 | X_reduced, sort_idx = SPCA._get_reduced_features(X_train, scoef, n_reduced)
123 | SPCA.regressor.fit(X_reduced, y_train)
124 | X_test_reduced = SPCA.pca_object.transform(X_test[:, sort_idx])
125 | y_predict = SPCA.regressor.predict(X_test_reduced)
126 | return y_predict
127 |
128 |
129 | class SupervisedPCA(SupervisedPCABase):
130 | def __init__(self, regressor, max_components=None, n_components=1, whiten=True, n_jobs=1):
131 | """
132 | Class for performing supervised principal component regression. This is useful for cases where the number of
133 | inputs (features) is greater than the number of data points.
134 |
135 | @param regressor: The object that will perform the regression. The following members must be defined for this
136 | object:
137 |
138 | regressor.fit(X, y) : Fits the regression model y = f(X).
139 | regressor.predict(X) : Compute the prediction y = f(X).
140 | regressor.coef_score_ : The score of each parameter, used for ranking the most important features when
141 | computing the reduced feature space. In general this will be the absolute value of
142 | the coefficient value divided by its standard error. Note that this should *not*
143 | include the intercept.
144 |
145 | @param max_components: Maximum number of components to search over. The default is p.
146 | @param n_components: The number of reduced data matrix PCA components to use in the regression.
147 | @param whiten: Remove differences in variance among the components, i.e., principal components will have unit
148 | variance
149 | @param n_jobs: The number of threads to use for parallel processing. If n_jobs = -1 then use maximum number
150 | available.
151 | """
152 | super(SupervisedPCA, self).__init__(regressor, max_components, n_components, whiten)
153 | if n_jobs < 0:
154 | n_jobs = multiprocessing.cpu_count()
155 | self.n_jobs = n_jobs
156 |
157 | def _compute_cv_prediction(self, args):
158 | """
159 | Internal method to get predictions based on supervised PCA regression for each cross-validation fold. Need this
160 | format in order to compute the predictions for the CV folds in parallel.
161 | """
162 | X_train, y_train, X_test, n_reduced, scoef = args
163 | SPCA = SupervisedPCABase(copy.deepcopy(self.regressor), self.max_components, self.n_components, self.whiten)
164 | X_reduced, sort_idx = SPCA._get_reduced_features(X_train, scoef, n_reduced)
165 | SPCA.regressor.fit(X_reduced, y_train)
166 | X_test_reduced = SPCA.pca_object.transform(X_test[:, sort_idx])
167 | y_predict = SPCA.regressor.predict(X_test_reduced)
168 | return y_predict
169 |
170 | def _launch_coef_scores(self, args):
171 | """
172 | Wrapper to compute the standardized scores of the regression coefficients, used when computing the number of
173 | features in the reduced parameter set.
174 |
175 | @param args: Tuple containing the feature matrix and response array.
176 | @return: The standardized scores of the coefficients.
177 | """
178 | X, y = args
179 | scoefs = self._compute_stnd_coefs(X, y)
180 | return scoefs
181 |
182 | def choose_nreduced(self, X, y, lossfunc=None, cv=None, verbose=False, cvplot=False):
183 | """
184 | Choose the number of features to use in the reduced feature set by minimizing the cross-validation error.
185 |
186 | @param X: The feature matrix, shape (n,p)
187 | @param y: The vector of response values, size n.
188 | @param lossfunc: The loss function to use for the CV error, callable. The default is mean squared error.
189 | @param cv: Number of CV folds (if int), or cross-validation iterator.
190 | @param verbose: Print helpful information.
191 | @param cvplot: Plot the CV error as a function of the number features in the reduced feature set.
192 | @return: The number of features in the reduced feature set that minimized the CV error.
193 | """
194 | if self.n_jobs > 1:
195 | pool = multiprocessing.Pool(self.n_jobs)
196 | pool.map(int, range(self.n_jobs)) # Trick to "warm up" the Pool
197 |
198 | # setup cross-validation iterator
199 | if cv is None:
200 |     cv = 8
201 | if isinstance(cv, int):
202 |     K_folds, cv = cv, cross_validation.KFold(y.size, n_folds=cv)
203 | else:
204 |     K_folds = len(cv)  # a cross-validation iterator was supplied directly
205 |
206 | if lossfunc is None:
207 | lossfunc = metrics.mean_squared_error
208 |
209 | if self.max_components is None:
210 | self.max_components = X.shape[1]
211 |
212 | if verbose:
213 | print 'Searching over', self.max_components, ' features to include in the reduced feature space.'
214 | print 'Computing univariate regression tests statistics for each feature...'
215 |
216 | # first compute coefficients scores
217 | sargs = []
218 | for train_idx, test_idx in cv:
219 | if self.n_jobs == 1:
220 | sargs.append((X[train_idx, :], y[train_idx]))
221 | else:
222 | sargs.append((self, X[train_idx, :], y[train_idx]))
223 |
224 | if self.n_jobs == 1:
225 | scoefs = map(self._launch_coef_scores, sargs)
226 | else:
227 | scoefs = pool.map(launch_coef_scores, sargs)
228 |
229 | # find optimal number of features to use in PCA on reduced feature set, do this by minimizing cross-validation
230 | # error on a grid.
231 | cverrors = np.zeros(self.max_components)
232 |
233 | if verbose:
234 | print 'Computing cross-validation errors on a grid of up to', self.max_components, 'features used in the', \
235 | 'reduced feature space...'
236 |
237 | for k in xrange(self.max_components):
238 | cverror_args = []
239 | ytest = []
240 | fold_idx = 0
241 | for train_idx, test_idx in cv:
242 | if self.n_jobs == 1:
243 | cverror_args.append((X[train_idx, :], y[train_idx], X[test_idx, :], k + 1, scoefs[fold_idx]))
244 | else:
245 | cverror_args.append((self, X[train_idx, :], y[train_idx], X[test_idx, :], k + 1, scoefs[fold_idx]))
246 | ytest.append(y[test_idx])
247 | fold_idx += 1
248 |
249 | if self.n_jobs == 1:
250 | ypredictions = map(self._compute_cv_prediction, cverror_args)
251 | else:
252 | ypredictions = pool.map(compute_cv_prediction, cverror_args)
253 |
254 | cverror_k = 0.0
255 | for yt, yp in zip(ytest, ypredictions):
256 | cverror_k += lossfunc(yt, yp) / K_folds
257 | cverrors[k] = cverror_k
258 |
259 | if cvplot:
260 | plt.plot(np.arange(1, self.max_components + 1), cverrors)
261 | plt.xlabel('# of features in reduced set')
262 | plt.ylabel('CV Loss Function')
263 | plt.show()
264 |
265 | n_reduced = cverrors.argmin() + 1
266 |
267 | if verbose:
268 | print 'Selected', n_reduced, 'features to use in the reduced feature set.'
269 |
270 | return n_reduced
--------------------------------------------------------------------------------
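A hypothetical usage sketch for `SupervisedPCA`, not part of the module. `ScoredLinearRegression` is an illustrative wrapper supplying the `coef_score_` attribute required by the regressor contract (|slope| divided by a rough standard error); `n_jobs=1` keeps everything in a single process so the locally defined wrapper never needs to be pickled.

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from bck_stats.super_pca import SupervisedPCA


class ScoredLinearRegression(LinearRegression):
    """Linear regression that also reports |slope| / SE of its first coefficient (illustrative)."""

    def fit(self, X, y):
        super(ScoredLinearRegression, self).fit(X, y)
        resid = y - super(ScoredLinearRegression, self).predict(X)
        dof = max(len(y) - X.shape[1] - 1, 1)
        sigma = np.sqrt(np.sum(resid ** 2) / dof)
        se = sigma / np.sqrt(np.sum((X[:, 0] - X[:, 0].mean()) ** 2))
        self.coef_score_ = np.abs(self.coef_[0]) / se
        return self


# toy data: n < p, with only the first 5 of 200 features informative
n, p = 100, 200
X = np.random.standard_normal((n, p))
y = X[:, :5].dot(np.ones(5)) + 0.5 * np.random.standard_normal(n)

spca = SupervisedPCA(ScoredLinearRegression(), max_components=20, n_components=1, n_jobs=1)
n_reduced = spca.choose_nreduced(X, y, verbose=True)  # pick the reduced feature set size by CV
spca.fit(X, y, n_reduced)
yhat = spca.predict(X)
```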
/bck_stats/dba.py:
--------------------------------------------------------------------------------
1 | __author__ = 'brandonkelly'
2 |
3 | import numpy as np
4 | from numba import jit
5 | import matplotlib.pyplot as plt
6 | from scipy.interpolate import interp1d
7 | import time
8 |
9 |
10 | @jit  # if you don't have Numba, then comment out this line, but this routine will be slow!
11 | def dynamic_time_warping(tseries1, tseries2):
12 | """
13 | Compute the dynamic time warping (DTW) distance between two time series. It is assumed that the time series are
14 | evenly sampled, but they can have different lengths. Numba is used to speed up the computation, so you must have
15 |     Numba installed. Note that the time series can be multivariate, but they must be passed as 2-D arrays.
16 |
17 |     :param tseries1: The first time series, a 2-D numpy array of shape (ntime1, nfeatures).
18 |     :param tseries2: The second time series, a 2-D numpy array of shape (ntime2, nfeatures).
19 | :return: A tuple containing the DTW distance, the DTW matrix, and the path matrix taken by the algorithm.
20 | """
21 | ntime1, nfeatures = tseries1.shape
22 | ntime2 = tseries2.shape[0]
23 | dtw = np.zeros((ntime1, ntime2), dtype=np.float) # matrix of coordinate distances
24 | path = np.zeros((ntime1, ntime2), dtype=np.int) # path of algorithm
25 |
26 | # initialize the first row and column
27 | for k in range(nfeatures):
28 | dtw[0, 0] += (tseries1[0, k] - tseries2[0, k]) ** 2
29 | path[0, 0] = -1
30 |
31 | for i in range(1, ntime1):
32 | dist = 0.0
33 | for k in range(nfeatures):
34 | dist += (tseries1[i, k] - tseries2[0, k]) ** 2
35 | dtw[i, 0] = dtw[i-1, 0] + dist
36 | path[i, 0] = 2
37 |
38 | for j in range(1, ntime2):
39 | dist = 0.0
40 | for k in range(nfeatures):
41 | dist += (tseries1[0, k] - tseries2[j, k]) ** 2
42 | dtw[0, j] = dtw[0, j-1] + dist
43 | path[0, j] = 1
44 |
45 | # main loop of the DTW algorithm
46 | for i in range(1, len(tseries1)):
47 | for j in range(1, len(tseries2)):
48 | a = dtw[i-1, j-1]
49 | b = dtw[i, j-1]
50 | c = dtw[i-1, j]
51 | if a < b:
52 | if a < c:
53 | idx = 0 # a is the minimum
54 | delta = a
55 | else:
56 | idx = 2 # c is the minimum
57 | delta = c
58 | else:
59 | if b < c:
60 | idx = 1 # b is the minimum
61 | delta = b
62 | else:
63 | idx = 2 # c is the minimum
64 | delta = c
65 | # neighbors = np.array([dtw[i-1, j-1], dtw[i, j-1], dtw[i-1, j]])
66 | # idx = np.argmin(neighbors)
67 | # delta = neighbors[idx]
68 | dist = 0.0
69 | for k in range(nfeatures):
70 | dist += (tseries1[i, k] - tseries2[j, k]) ** 2
71 | dtw[i, j] = dist + delta
72 | path[i, j] = idx
73 |
74 | return dtw[-1, -1], dtw, path
75 |
76 |
77 | class DBA(object):
78 |
79 | def __init__(self, max_iter, tol=1e-4, verbose=False):
80 | """
81 | Constructor for the DBA class. This class computes the dynamic time warping (DTW) barycenter averaging (DBA)
82 | strategy for averaging a set of time series. The method is described in
83 |
84 | "A global averaging method for dynamic time warping, with applications to clustering." Petitjean, F.,
85 | Ketterlin, A., & Gancarski, P. 2011, Pattern Recognition, 44, 678-693.
86 |
87 | :param max_iter: The maximum number of iterations for the DBA algorithm.
88 | :param tol: The tolerance level for the algorithm. The algorithm terminates once the fractional difference in
89 | the within-group sum of squares between successive iterations is less than tol. The algorithm will also
90 | terminate if the maximum number of iterations is exceeded, or if the sum of squares increases.
91 | :param verbose: If true, then provide helpful output.
92 | """
93 | self.max_iter = max_iter
94 | self.tol = tol
95 | self.average = np.zeros(1)
96 | self.wgss = 0.0 # the within-group sum of squares, called the inertia in the clustering literature
97 | self.verbose = verbose
98 |
99 | def compute_average(self, tseries, nstarts=1, initial_value=None, dba_length=None):
100 | """
101 | Perform the DBA algorithm to compute the average for a set of time series. The algorithm is a local optimization
102 | strategy and thus depends on the initial guess for the average. Improved results can be obtained by using
103 | multiple random initial starts.
104 |
105 | :param tseries: The list of time series, a list of numpy arrays. Can be multivariate time series.
106 | :param nstarts: The number of random starts to use for the DBA algorithm. The average time series that minimizes
107 | the within-group sum of squares over the random starts is returned and saved.
108 | :param initial_value: The initial value for the DBA algorithm, a numpy array. If None, then the initial values
109 |             will be drawn randomly from the set of input time series (recommended). Note that if an initial guess is
110 | supplied, then the nstarts parameter is ignored.
111 | :param dba_length: The length of the DBA average time series. If None, this will be set to the length of the
112 | initial_value array. Otherwise, the initial value array will be linearly interpolated to this length.
113 | :return: The estimated average of the time series, defined to minimize the within-group sum of squares of the
114 | input set of time series.
115 | """
116 | if initial_value is not None:
117 | nstarts = 1
118 |
119 | if initial_value is None:
120 | # initialize the average as a random draw from the set of inputs
121 | start_idx = np.random.permutation(len(tseries))[:nstarts]
122 |
123 | best_wgss = 1e300
124 | if self.verbose:
125 | print 'Doing initialization iteration:'
126 | for i in range(nstarts):
127 |             if self.verbose: print i, '...'
128 | if initial_value is None:
129 | iseries = tseries[start_idx[i]]
130 | else:
131 | iseries = initial_value
132 | if dba_length is not None:
133 | # linearly interpolate initial average value to the requested length
134 | iseries0 = np.atleast_2d(iseries)
135 | if iseries0.shape[0] == 1:
136 | iseries0 = iseries0.T # vector, so transpose to shape (ntime, 1)
137 | nfeatures = iseries0.shape[1]
138 | iseries = np.zeros((dba_length, nfeatures))
139 | for k in range(nfeatures):
140 | lininterp = interp1d(np.arange(iseries0.shape[0]), iseries0[:, k])
141 | iseries[:, k] = lininterp(np.linspace(0.0, iseries0.shape[0]-1.01, num=dba_length))
142 |
143 | self._run_dba(tseries, iseries)
144 |
145 | if self.wgss < best_wgss:
146 | # found better average, save it
147 | if self.verbose:
148 | print 'New best estimate found for random start', i
149 | best_wgss = self.wgss
150 | best_average = self.average
151 |
152 | self.wgss = best_wgss
153 | self.average = best_average
154 |
155 | return best_average
156 |
157 | def associate_segments(self, tseries):
158 | """
159 |         Identify the indices of the input time series that are associated with each element of the average time series.
160 | 
161 |         :param tseries: The time series for which the indices associated with the average are desired. A 2-D numpy array.
162 | :return: A list-of-lists containing the indices of the input time series that are associated with the elements
163 | of the DBA average. Call this assoc_table. Then assoc_table[i] will return a list of the indices of the
164 | input time series that are associated with the element i of the DBA average (i.e., self.average[i]).
165 | """
166 | dtw_dist, dtw, path = dynamic_time_warping(self.average, tseries)
167 |
168 | # table telling us which elements of the time series are identified with a specific element of the DBA average
169 | assoc_table = []
170 | for i in range(self.average.shape[0]):
171 | assoc_table.append([])
172 |
173 | i = self.average.shape[0] - 1
174 | j = tseries.shape[0] - 1
175 |
176 | while i >= 0 and j >= 0:
177 | assoc_table[i].append(j)
178 | if path[i, j] == 0:
179 | i -= 1
180 | j -= 1
181 | elif path[i, j] == 1:
182 | j -= 1
183 | elif path[i, j] == 2:
184 | i -= 1
185 | else:
186 | # should not happen, but just in case make sure we bail once path[i, j] = -1
187 | break
188 |
189 | return assoc_table
190 |
191 | def _run_dba(self, tseries, initial_value):
192 | """ Perform the DBA algorithm. """
193 | nseries = len(tseries)
194 |
195 |         self.average = initial_value.copy()  # work on a copy so that the input initial value is not modified in place
196 |
197 | # first iteration: get initial within-group sum of squares
198 | if self.verbose:
199 | print 'Doing iteration'
200 | print ' ', '0', '...'
201 | wgss = self._dba_iteration(tseries)
202 |
203 | # main DBA loop
204 | for i in range(1, self.max_iter):
205 | if self.verbose:
206 | print ' ', i, '...', 'WGSS:', wgss
207 | wgss_old = wgss
208 | # WGSS is actually from previous iteration, but don't compute again because it is expensive
209 | wgss = self._dba_iteration(tseries)
210 | if wgss > wgss_old:
211 | # sum of squares should be non-increasing
212 | print 'Warning! Within-group sum of squares increased at iteration', i, 'terminating algorithm.'
213 | break
214 | elif np.abs(wgss - wgss_old) / wgss_old < self.tol:
215 | # convergence
216 | break
217 |
218 | # compute final within-group sum of squares
219 | wgss = 0.0
220 | for k in range(nseries):
221 | wgss += dynamic_time_warping(tseries[k], self.average)[0]
222 | self.wgss = wgss
223 |
224 | def _dba_iteration(self, tseries):
225 | """ Perform a single iteration of the DBA algorithm. """
226 | ntime = self.average.shape[0]
227 |
228 | # table telling us which elements of the time series are identified with a specific element of the DBA average
229 | assoc_table = []
230 | for i in range(ntime):
231 | assoc_table.append([])
232 |
233 | wgss = 0.0 # within group sum of squares from previous iteration, compute here so we don't have to repeat
234 | for series in tseries:
235 |             if series.ndim == 1:
236 |                 series = series[:, np.newaxis]  # convert a univariate series to shape (ntime, 1)
237 | dtw_dist, dtw, path = dynamic_time_warping(self.average, series)
238 | wgss += dtw_dist
239 | i = ntime - 1
240 | j = series.shape[0] - 1
241 | while i >= 0 and j >= 0:
242 | assoc_table[i].append(series[j])
243 | if path[i, j] == 0:
244 | i -= 1
245 | j -= 1
246 | elif path[i, j] == 1:
247 | j -= 1
248 | elif path[i, j] == 2:
249 | i -= 1
250 | else:
251 | # should not happen, but just in case make sure we bail once path[i, j] = -1
252 | break
253 |
254 | # update the average
255 | for i, cell in enumerate(assoc_table):
256 | cell_array = np.array(cell)
257 | self.average[i] = cell_array.mean(axis=0)
258 |
259 | return wgss
260 |
261 |
262 | if __name__ == "__main__":
263 | # run on some test data
264 | nseries = 40
265 | ntime0 = 1000
266 | phase1 = 0.1 + 0.2 * np.random.uniform(0.0, 1.0, nseries) - 0.1
267 | period1 = np.pi / 4.0 + np.pi / 100.0 * np.random.standard_normal(nseries)
268 |
269 | phase2 = np.pi / 2 + 0.2 * np.random.uniform(0.0, 1.0, nseries) - 0.1
270 | period2 = np.pi / 2.0 + np.pi / 100.0 * np.random.standard_normal(nseries)
271 |
272 | noise_amplitude = 0.0
273 |
274 | t_list = []
275 | ts_list = []
276 | for i in range(nseries):
277 | ntime = np.random.random_integers(ntime0 * 0.9, ntime0 * 1.1)
278 | t = np.linspace(0.0, 10.0, ntime)
279 | t_list.append(t)
280 | tseries = np.zeros((ntime, 2))
281 | tseries[:, 0] = np.sin(t / period1[i] + phase1[i]) + noise_amplitude * np.random.standard_normal(ntime)
282 | tseries[:, 1] = np.sin(t / period2[i] + phase2[i]) + noise_amplitude * np.random.standard_normal(ntime)
283 | ts_list.append(tseries)
284 |
285 | niter = 30
286 | dba = DBA(niter, verbose=True, tol=1e-4)
287 | t1 = time.clock()
288 | dba_avg = dba.compute_average(ts_list, nstarts=5, dba_length=10)
289 | t2 = time.clock()
290 |
291 | print 'DBA algorithm took', t2 - t1, 'seconds.'
292 |
293 | plt.subplot(221)
294 | for i in range(nseries):
295 | plt.plot(t_list[i], ts_list[i][:, 0], '.', ms=2)
296 | t = np.linspace(0.0, 10.0, len(dba_avg))
297 | plt.plot(t, dba_avg[:, 0], 'ko')
298 | plt.subplot(222)
299 | for i in range(nseries):
300 | plt.plot(t_list[i], ts_list[i][:, 1], '.', ms=2)
301 | t = np.linspace(0.0, 10.0, len(dba_avg))
302 | plt.plot(t, dba_avg[:, 1], 'ko')
303 | plt.subplot(223)
304 | for ts in ts_list:
305 | plt.plot(ts[:, 0], ts[:, 1], '.', ms=2)
306 | plt.plot(dba_avg[:, 0], dba_avg[:, 1], 'ko')
307 | plt.show()
308 | plt.close()
309 |
310 | # find the segments of the first time series identified with each element of the average
311 | assoc = dba.associate_segments(ts_list[0])
312 | plt.subplot(221)
313 | t = t_list[0]
314 | ts = ts_list[0]
315 | for i, a in enumerate(assoc):
316 | plt.plot(t[a], ts[a, 0], '.', label=str(i))
317 | plt.plot(np.median(t[a]), dba_avg[i, 0], 'ko')
318 | plt.subplot(222)
319 | for i, a in enumerate(assoc):
320 | plt.plot(t[a], ts[a, 1], '.', label=str(i))
321 | plt.plot(np.median(t[a]), dba_avg[i, 1], 'ko')
322 | plt.subplot(223)
323 | for i, a in enumerate(assoc):
324 | plt.plot(ts[a, 0], ts[a, 1], '.', label=str(i))
325 | plt.plot(dba_avg[i, 0], dba_avg[i, 1], 'ko')
326 | plt.show()
--------------------------------------------------------------------------------
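A short usage sketch for dba.py, assuming the bck_stats package and Numba are installed (dba.py imports numba unconditionally). The two toy series are invented for illustration; it calls dynamic_time_warping directly on two bivariate series of different lengths and then computes their DBA average resampled to 40 points.

import numpy as np
from bck_stats.dba import dynamic_time_warping, DBA

# two bivariate series of different lengths; dynamic_time_warping expects 2-D arrays of shape (ntime, nfeatures)
t1 = np.linspace(0.0, 1.0, 50)
t2 = np.linspace(0.0, 1.0, 60)
series1 = np.column_stack((np.sin(2.0 * np.pi * t1), np.cos(2.0 * np.pi * t1)))
series2 = np.column_stack((np.sin(2.0 * np.pi * t2 + 0.3), np.cos(2.0 * np.pi * t2 + 0.3)))

# DTW distance between the two series, along with the accumulated-cost and path matrices
distance, dtw_matrix, path = dynamic_time_warping(series1, series2)
print(distance)

# DBA average of the two series, resampled to 40 points, using two random starts
dba = DBA(max_iter=20, tol=1e-4, verbose=False)
average = dba.compute_average([series1, series2], nstarts=2, dba_length=40)
print(average.shape)  # (40, 2)
print(dba.wgss)       # within-group sum of squares of the returned average
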
/bck_stats/sklearn_estimator_suite.py:
--------------------------------------------------------------------------------
1 | __author__ = 'brandonkelly'
2 |
3 | import numpy as np
4 | import abc
5 |
6 | from sklearn.linear_model import LogisticRegression
7 | from sklearn.grid_search import GridSearchCV, ParameterGrid
8 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
9 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor, \
10 | RandomForestRegressor
11 | from sklearn.svm import SVC, LinearSVC
12 | from sklearn.metrics import accuracy_score, make_scorer, mean_absolute_error, mean_squared_error
13 | from sklearn.cross_validation import KFold
14 | from sklearn.base import clone
15 |
16 | float_types = (float, np.float, np.float32, np.float64, np.float_, np.float128, np.float16)
17 | int_types = (int, np.int, np.int8, np.int16, np.int32, np.int64)
18 |
19 |
20 | class GbcAutoNtrees(GradientBoostingClassifier):
21 | """
22 | Same as GradientBoostingClassifier, but the number of estimators is chosen automatically by maximizing the
23 | out-of-bag score.
24 | """
25 | def __init__(self, subsample, loss='deviance', learning_rate=0.01, n_estimators=500, min_samples_split=2,
26 | min_samples_leaf=1, max_depth=3, init=None, random_state=None, max_features=None, verbose=0):
27 | super(GbcAutoNtrees, self).__init__(loss, learning_rate, n_estimators, subsample, min_samples_split,
28 | min_samples_leaf, max_depth, init, random_state, max_features, verbose)
29 |
30 | def fit(self, X, y):
31 |
32 | super(GbcAutoNtrees, self).fit(X, y)
33 | oob_score = np.cumsum(self.oob_improvement_)
34 | ntrees = oob_score.argmax() + 1
35 | if self.verbose:
36 | print 'Chose', ntrees, 'based on the OOB score.'
37 | self.n_estimators = ntrees
38 | self.estimators_ = self.estimators_[:ntrees]
39 |
40 | # plt.plot(oob_score)
41 | # plt.show()
42 |
43 | return self
44 |
45 |
46 | class GbrAutoNtrees(GradientBoostingRegressor):
47 | """
48 | Same as GradientBoostingRegressor, but the number of estimators is chosen automatically by maximizing the
49 | out-of-bag score.
50 | """
51 |
52 | def __init__(self, subsample, loss='ls', learning_rate=0.1, n_estimators=100, min_samples_split=2,
53 | min_samples_leaf=1, max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9,
54 | verbose=0):
55 | super(GbrAutoNtrees, self).__init__(loss, learning_rate, n_estimators, subsample, min_samples_split,
56 | min_samples_leaf, max_depth, init, random_state, max_features, alpha,
57 | verbose)
58 |
59 | def fit(self, X, y):
60 |
61 | super(GbrAutoNtrees, self).fit(X, y)
62 | oob_score = np.cumsum(self.oob_improvement_)
63 | ntrees = oob_score.argmax() + 1
64 | if self.verbose:
65 | print 'Chose', ntrees, 'based on the OOB score.'
66 | self.n_estimators = ntrees
67 | self.estimators_ = self.estimators_[:ntrees]
68 |
69 | # plt.plot(oob_score)
70 | # plt.show()
71 |
72 | return self
73 |
74 |
75 | class BasePredictorSuite(object):
76 | """ Base class for running a suite of estimators from scikit-learn. """
77 | __metaclass__ = abc.ABCMeta
78 |
79 | @abc.abstractmethod
80 | def __init__(self, tuning_ranges=None, models=None, cv=None, njobs=1, pre_dispatch='2*n_jobs', stack=True,
81 | verbose=False):
82 | """
83 | Initialize a pipeline to run a suite of scikit-learn estimators. The tuning parameters are chosen through
84 |         cross-validation or the out-of-bag score (for Random Forests) as part of the fitting process.
85 |
86 | :param tuning_ranges: A nested dictionary containing the ranges of the tuning parameters. It should be of the
87 | format {model name 1: {parameter name 1: list(value range 1), parameter name 2: list(value range 2), ...} }.
88 | :param models: A list of instantiated scikit-learn estimator classes to fit. If None, these are taken from
89 |             the models listed in tuning_ranges.
90 | :param cv: The number of CV folds to use, or a CV generator.
91 | :param njobs: The number of processes to run in parallel.
92 | :param pre_dispatch: Passed to sklearn.grid_search.GridSearchCV, see documentation for GridSearchCV for further
93 | details.
94 | :param stack: If true, then the predict() method will return a stacked (averaged) value over the estimators.
95 | Otherwise, if false, then predict() will return the predictions for each estimator.
96 | :param verbose: If true, print out helpful information.
97 | """
98 | super(BasePredictorSuite, self).__init__()
99 | self.verbose = verbose
100 | if tuning_ranges is None:
101 | tuning_ranges = dict()
102 | self.tuning_ranges = tuning_ranges
103 | if models is None:
104 | models = []
105 | self.models = models
106 | self.model_names = []
107 | for model in self.models:
108 | # store the names of the sklearn classes used
109 | self.model_names.append(model.__class__.__name__)
110 | # make sure the model names are in the dictionary of tuning parameters
111 | if model.__class__.__name__ not in tuning_ranges:
112 | raise ValueError('Could not find tuning parameters for', model.__class__.__name__)
113 |
114 | if cv is None:
115 | cv = 3
116 | self.cv = cv
117 | self.njobs = njobs
118 | self.pre_dispatch = pre_dispatch
119 | self.scorer = None
120 | self.stack = stack
121 | self.best_scores = dict()
122 | self.nfeatures = None
123 |
124 | def refine_grid(self, best_params, model_name):
125 | """
126 | Refine the tuning parameter grid to zoom in on the region near the current maximum.
127 |
128 | :param best_params: A dictionary containing the set of best tuning parameter names and their values. Should be
129 |             of the form {'parameter 1': value1, 'parameter 2': value2, ... }. The tuning parameter grid will be refined
130 | in the region of these parameter values.
131 | :param model_name: The name of the estimator corresponding to the tuning parameters in best_params.
132 | """
133 | for param_name in best_params:
134 | pvalue_list = self.tuning_ranges[model_name][param_name]
135 | best_value = best_params[param_name]
136 |             # find the position of the best value on the current grid
137 | idx = pvalue_list.index(best_value)
138 | ngrid = len(pvalue_list)
139 | if idx == 0:
140 | # first element of grid, so expand below it
141 | if type(pvalue_list[0]) in int_types:
142 | pv_min = pvalue_list[0] / 2 # reduce minimum grid value by a factor of 2
143 | pv_min = max(1, pv_min) # assume integer tuning parameters are never less than 1.
144 | pv_max = pvalue_list[1]
145 | self.tuning_ranges[model_name][param_name] = \
146 | list(np.unique(np.linspace(pv_min, pv_max, ngrid).astype(np.int)))
147 | else:
148 | # use logarithmic grids for floats
149 | dp = np.log10(pvalue_list[1]) - np.log10(pvalue_list[0])
150 | pv_min = np.log10(pvalue_list[0]) - dp
151 | pv_max = np.log10(pvalue_list[1])
152 | self.tuning_ranges[model_name][param_name] = list(np.logspace(pv_min, pv_max, ngrid))
153 | if self.verbose:
154 | print self.tuning_ranges[model_name][param_name]
155 | elif idx == ngrid - 1:
156 | # last element of grid, so expand above it
157 | if pvalue_list[idx] is None:
158 | # special situation for some estimators, like the DecisionTreeClassifier
159 |                     pv_min = pvalue_list[idx-1]  # largest finite grid value; expand above it by a factor of 2
160 | pv_max = 2 * pv_min
161 | self.tuning_ranges[model_name][param_name] = \
162 | list(np.unique(np.linspace(pv_min, pv_max, ngrid-1).astype(np.int)))
163 | # make sure we keep None as the last value in the list
164 | self.tuning_ranges[model_name][param_name].append(None)
165 | elif type(pvalue_list[idx]) in int_types:
166 | pv_min = np.log10(pvalue_list[idx-1])
167 | pv_max = np.log10(2 * pvalue_list[idx]) # increase the maximum grid value by a factor of 2
168 | if param_name == 'max_features':
169 | # can't have max_features > nfeatures
170 | pv_max = min(2 * pvalue_list[idx], self.nfeatures)
171 | pv_max = np.log10(pv_max)
172 | self.tuning_ranges[model_name][param_name] = \
173 | list(np.unique(np.logspace(pv_min, pv_max, ngrid).astype(np.int)))
174 | else:
175 | # use logarithmic grids for floats
176 | dp = np.log10(pvalue_list[idx]) - np.log10(pvalue_list[idx-1])
177 | pv_min = np.log10(pvalue_list[idx-1])
178 | pv_max = np.log10(pvalue_list[idx]) + dp
179 | self.tuning_ranges[model_name][param_name] = list(np.logspace(pv_min, pv_max, ngrid))
180 | if self.verbose:
181 | print self.tuning_ranges[model_name][param_name]
182 | else:
183 | # inner element of grid
184 | if pvalue_list[idx + 1] is None:
185 | # special situation for some estimators, like the DecisionTreeClassifier
186 |                     pv_min = pvalue_list[idx-1]  # next grid value is None, so expand numerically up to twice the best value
187 | pv_max = 2 * pvalue_list[idx]
188 | self.tuning_ranges[model_name][param_name] = \
189 | list(np.unique(np.linspace(pv_min, pv_max, ngrid-1).astype(np.int)))
190 | # make sure we keep None as the last value in the list
191 | self.tuning_ranges[model_name][param_name].append(None)
192 | elif type(pvalue_list[idx]) in int_types:
193 | pv_min = pvalue_list[idx-1]
194 | pv_max = pvalue_list[idx+1]
195 | # switch to linear spacing for interior integer grid values
196 | self.tuning_ranges[model_name][param_name] = \
197 | list(np.unique(np.linspace(pv_min, pv_max, ngrid).astype(np.int)))
198 | else:
199 | # use logarithmic grids for floats
200 | pv_min = np.log10(pvalue_list[idx-1])
201 | pv_max = np.log10(pvalue_list[idx+1])
202 | self.tuning_ranges[model_name][param_name] = list(np.logspace(pv_min, pv_max, ngrid))
203 | if self.verbose:
204 | print self.tuning_ranges[model_name][param_name]
205 |
206 | # print 'New Grid:', self.tuning_ranges[model_name][param_name]
207 |
208 | def cross_validate(self, X, model_idx, y):
209 | """
210 | Fit the tuning parameters for an estimator on a grid using cross-validation.
211 |
212 | :param X: The array of predictors, shape (n_samples, n_features).
213 | :param model_idx: The index of the estimator to fit.
214 | :param y: The array of response values, shape (n_samples) or (n_samples, n_outputs) depending on the estimator.
215 | :return: A tuple containing the scikit-learn estimator object with the best tuning parameters, the score
216 | corresponding to the best tuning parameters, and a dictionary containing the best tuning parameter values.
217 | """
218 | if self.verbose:
219 | print 'Doing cross-validation for model', self.model_names[model_idx], '...'
220 | grid = GridSearchCV(self.models[model_idx], self.tuning_ranges[self.model_names[model_idx]],
221 | scoring=self.scorer, n_jobs=self.njobs, cv=self.cv, pre_dispatch=self.pre_dispatch)
222 | grid.fit(X, y)
223 | if self.verbose:
224 | print 'Best', self.model_names[model_idx], 'has:'
225 | for tuning_parameter in self.tuning_ranges[self.model_names[model_idx]]:
226 | print ' ', tuning_parameter, '=', grid.best_params_[tuning_parameter]
227 | print ' CV Score of', grid.best_score_
228 | return grid.best_estimator_, grid.best_score_, grid.best_params_
229 |
230 | def oob_validate(self, X, model_idx, y):
231 | """
232 | Fit the tuning parameters for a Random Forest estimator on a grid by maximizing the score of the out-of-bag
233 | samples. This is faster than using cross-validation.
234 |
235 | :param X: The array of predictors, shape (n_samples, n_features).
236 | :param model_idx: The index of the estimator to fit.
237 | :param y: The array of response values, shape (n_samples) or (n_samples, n_outputs) depending on the estimator.
238 | :return: A tuple containing the scikit-learn estimator object with the best tuning parameters, the score
239 | corresponding to the best tuning parameters, and a dictionary containing the best tuning parameter values.
240 | """
241 | if self.verbose:
242 | print 'Doing OOB-validation for model', self.model_names[model_idx], '...'
243 |
244 | tune_grid = list(ParameterGrid(self.tuning_ranges[self.model_names[model_idx]]))
245 |
246 | best_estimator = None
247 | best_score = -1e30
248 |
249 | # fit random forest
250 | for point in tune_grid:
251 | estimator = clone(self.models[model_idx])
252 | for tpar in point:
253 | # set the tuning parameters
254 | estimator.__setattr__(tpar, point[tpar])
255 | estimator.fit(X, y)
256 |
257 | if estimator.oob_score_ > best_score:
258 | # new best values, save them
259 | best_score = estimator.oob_score_
260 | best_estimator = estimator
261 | best_params = estimator.get_params()
262 |
263 | best_tparams = dict()
264 | for tpar in self.tuning_ranges[self.model_names[model_idx]]:
265 | best_tparams[tpar] = best_params[tpar] # only grab the values of the best tuning parameter
266 |
267 | if self.verbose:
268 | print 'Best', self.model_names[model_idx], 'has:'
269 | for tuning_parameter in self.tuning_ranges[self.model_names[model_idx]]:
270 | print ' ', tuning_parameter, '=', best_tparams[tuning_parameter]
271 | print ' OOB Score of', best_score
272 |
273 | return best_estimator, best_score, best_tparams
274 |
275 | def fit(self, X, y, n_refinements=1):
276 | """
277 | Fit the suite of estimators. The tuning parameters are estimated using cross-validation.
278 |
279 | :param X: The array of predictors, shape (n_samples, n_features).
280 | :param y: The array of response values, shape (n_samples) or (n_samples, n_outputs), depending on the estimator.
281 |         :param n_refinements: The number of times to refine the grid of tuning parameter values. Must be an integer or
282 | dictionary. If an integer, the grid for all models will be refined this many times. If a dictionary, should
283 | have (key value) pairs given by (estimator name, n_refinements).
284 | :return: Returns self.
285 | """
286 | self.nfeatures = X.shape[1]
287 | ndata = len(y)
288 | if X.shape[0] != ndata:
289 | raise ValueError('X and y must have same number of rows.')
290 |
291 | if np.isscalar(n_refinements):
292 | # use same number of refinements for all models
293 | n_refinements = {name: n_refinements for name in self.model_names}
294 |
295 | if type(self.cv) in int_types:
296 | # construct cross-validation iterator
297 | self.cv = KFold(ndata, n_folds=self.cv)
298 | elif self.cv.n != ndata:
299 | # need to reconstruct cross-validation iterator since we have different data
300 | self.cv = KFold(ndata, n_folds=self.cv.n_folds)
301 |
302 | for k in range(len(self.models)):
303 | if 'RandomForest' in self.model_names[k]:
304 | # use out-of-bag error for validation error
305 | best_estimator, best_score, best_params = self.oob_validate(X, k, y)
306 | else:
307 | # use cross-validation for validation error
308 | best_estimator, best_score, best_params = self.cross_validate(X, k, y)
309 |
310 | self.models[k] = best_estimator
311 | self.best_scores[self.model_names[k]] = best_score
312 |
313 | for i in range(n_refinements[self.model_names[k]]):
314 | if self.verbose:
315 | print 'Refining Grid...'
316 | old_score = best_score
317 | # now refine the grid and refit
318 | self.refine_grid(best_params, self.model_names[k])
319 |
320 | if 'RandomForest' in self.model_names[k]:
321 | # use out-of-bag error for validation error
322 | best_estimator, best_score, best_params = self.oob_validate(X, k, y)
323 | else:
324 | # use cross-validation for validation error
325 | best_estimator, best_score, best_params = self.cross_validate(X, k, y)
326 | if self.verbose:
327 | print ' New Validation Score of', best_score, 'is an improvement of', \
328 | 100.0 * (best_score - old_score) / np.abs(old_score), '%.'
329 |
330 | self.models[k] = best_estimator
331 | self.best_scores[self.model_names[k]] = best_score
332 |
333 | return self
334 |
335 | def predict_all(self, X):
336 | """
337 | Predict the outputs as a function of the inputs for each model.
338 |
339 | :param X: The array of predictor values, shape (n_samples, n_features).
340 | :return: A dictionary containing the values of the response predicted at the input values for each model.
341 | """
342 | y_predict_all = {name: model.predict(X) for name, model in zip(self.model_names, self.models)}
343 |
344 | return y_predict_all
345 |
346 | @abc.abstractmethod
347 | def predict(self, X, weights='auto'):
348 | return self.predict_all(X)
349 |
350 |
351 | class ClassificationSuite(BasePredictorSuite):
352 |
353 | def __init__(self, n_features=None, tuning_ranges=None, models=None, cv=None, njobs=1, pre_dispatch='2*n_jobs',
354 | stack=True, verbose=False):
355 | """
356 | Initialize a pipeline to run a suite of scikit-learn classifiers. The tuning parameters are chosen through
357 |         cross-validation or the out-of-bag score (for Random Forests) as part of the fitting process. The score
358 | function used is the accuracy score (fraction of correct classifications).
359 |
360 | :param verbose: Provide helpful output.
361 | :param n_features: The number of features that will be used when performing the fit. Must supply either
362 | n_features or tuning_ranges. This is necessary because the tuning parameter for the RandomForestClassifier
363 |             is max_features, and max_features must not exceed the number of features in the input array. So, in order
364 | to automatically construct the tuning_ranges dictionary it is necessary to know n_features in order to
365 | ensure max_features <= n_features.
366 | :param tuning_ranges: A nested dictionary containing the ranges of the tuning parameters. It should be of the
367 | format {model name 1: {parameter name 1: list(value range 1), parameter name 2: list(value range 2), ...} }.
368 | If n_features is not supplied, then tuning_ranges must be provided.
369 | :param models: A list of instantiated scikit-learn classifier classes to fit. If None, these are taken from
370 |             the models listed in tuning_ranges.
371 | :param cv: The number of CV folds to use, or a CV generator.
372 | :param njobs: The number of processes to run in parallel.
373 | :param pre_dispatch: Passed to sklearn.grid_search.GridSearchCV, see documentation for GridSearchCV for further
374 | details.
375 | :param stack: If true, then the predict() method will return a stacked (averaged) value over the estimators.
376 | Otherwise, if false, then predict() will return the predictions for each estimator.
377 | """
378 |         if tuning_ranges is None:
379 |             if n_features is None:
380 |                 # the default tuning grids below cannot be built without knowing the number of features
381 |                 raise ValueError('Must supply one of n_features or tuning_ranges.')
382 | 
383 | # use default values for grid search over tuning parameters for all models
384 | tuning_ranges = {'LogisticRegression': {'C': list(np.logspace(-2.0, 1.0, 5))},
385 | 'DecisionTreeClassifier': {'max_depth': [5, 10, 20, 50, None]},
386 | 'LinearSVC': {'C': list(np.logspace(-2.0, 1.0, 5))},
387 | 'SVC': {'C': list(np.logspace(-2.0, 1.0, 5)),
388 | 'gamma': list(np.logspace(np.log10(1.0 / n_features),
389 | np.log10(1000.0 / n_features), 5))},
390 | 'RandomForestClassifier': {'max_features':
391 | list(np.unique(np.linspace(2, n_features, 5).astype(np.int)))},
392 | 'GbcAutoNtrees': {'max_depth': [1, 2, 3, 5, 10]}}
393 | if models is None:
394 | # initialize the list of sklearn objects corresponding to different statistical models
395 | models = []
396 | if 'LogisticRegression' in tuning_ranges:
397 | models.append(LogisticRegression(penalty='l1', class_weight='auto'))
398 | if 'DecisionTreeClassifier' in tuning_ranges:
399 | models.append(DecisionTreeClassifier())
400 | if 'LinearSVC' in tuning_ranges:
401 | models.append(LinearSVC(penalty='l1', loss='l2', dual=False, class_weight='auto'))
402 | if 'SVC' in tuning_ranges:
403 | models.append(SVC(class_weight='auto'))
404 | if 'RandomForestClassifier' in tuning_ranges:
405 | models.append(RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=njobs))
406 | if 'GbcAutoNtrees' in tuning_ranges:
407 | models.append(GbcAutoNtrees(subsample=0.75, n_estimators=500, learning_rate=0.01))
408 |
409 | super(ClassificationSuite, self).__init__(tuning_ranges=tuning_ranges, models=models, cv=cv, njobs=njobs,
410 | pre_dispatch=pre_dispatch, stack=stack, verbose=verbose)
411 |
412 | self.scorer = make_scorer(accuracy_score)
413 | self.nfeatures = n_features
414 | self.classes = None
415 |
416 | def predict(self, X, weights='auto'):
417 | """
418 | Predict the classes as a function of the inputs. If self.stack is true, then the predictions for each data point
419 | are computed based on a weighted majority vote of the estimators. Otherwise, a dictionary containing the
420 |         predictions for each estimator is returned.
421 |
422 | :param X: The array of predictor values, shape (n_samples, n_features).
423 | :param weights: The weights to use when combining the predictions for the individual estimators. If 'auto', then
424 | the weights are given by the validation scores. If 'uniform', then uniform weights are used. Otherwise
425 | weights must be a dictionary with (model name, weight) as the (key, value) pair.
426 | No effect if self.stack = False.
427 | :return: The values of the response predicted at the input values.
428 | """
429 | y_predict_all = super(ClassificationSuite, self).predict_all(X)
430 |
431 |         if weights == 'uniform':
432 | # just use uniform weighting
433 | weights = {name: 1.0 for name in self.model_names}
434 |
435 |         if weights == 'auto':
436 | # weight based on validation score
437 | weights = self.best_scores
438 |
439 | if self.stack:
440 | # combine the model outputs
441 |             y_votes = np.zeros((X.shape[0], len(self.classes)))  # one column of weighted votes per class
442 | for name in y_predict_all:
443 | vote = y_predict_all[name]
444 |                 idx_1d = vote + np.arange(len(vote)) * y_votes.shape[1]  # flat index of (sample, predicted class)
445 | # compute weighted vote for each class
446 | y_votes[np.unravel_index(idx_1d, y_votes.shape)] += weights[name]
447 |
448 | y_predict = self.classes[y_votes.argmax(axis=1)] # output is winner of majority vote
449 |
450 | else:
451 | y_predict = y_predict_all
452 |
453 | return y_predict
454 |
455 | def fit(self, X, y, n_refinements=1):
456 | classes, y = np.unique(y, return_inverse=True)
457 | self.classes = classes
458 | return super(ClassificationSuite, self).fit(X, y, n_refinements)
459 |
460 |
461 | class RegressionSuite(BasePredictorSuite):
462 |
463 | def __init__(self, n_features=None, tuning_ranges=None, models=None, cv=None, njobs=1, pre_dispatch='2*n_jobs',
464 | stack=True, verbose=False, metric='lad'):
465 | if metric.lower() not in ['lad', 'mse']:
466 | raise ValueError('Metric must be either lad or mse.')
467 |
468 |         if tuning_ranges is None:
469 |             if n_features is None:
470 |                 # the default tuning grids below cannot be built without knowing the number of features
471 |                 raise ValueError('Must supply one of n_features or tuning_ranges.')
472 | 
473 | # use default values for grid search over tuning parameters for all models
474 |             tuning_ranges = {'DecisionTreeRegressor': {'max_depth': [5, 10, 20, 50, None]},
475 | 'RandomForestRegressor': {'max_features':
476 | list(np.unique(np.linspace(2, n_features, 5).astype(np.int)))},
477 | 'GbrAutoNtrees': {'max_depth': [1, 2, 3, 5, 10]}}
478 | if models is None:
479 | # initialize the list of sklearn objects corresponding to different statistical models
480 | models = []
481 | if 'DecisionTreeRegressor' in tuning_ranges:
482 | models.append(DecisionTreeRegressor())
483 | if 'RandomForestRegressor' in tuning_ranges:
484 | models.append(RandomForestRegressor(n_estimators=500, oob_score=True, n_jobs=njobs))
485 | if 'GbrAutoNtrees' in tuning_ranges:
486 | models.append(GbrAutoNtrees(subsample=0.75, n_estimators=500, learning_rate=0.01))
487 |
488 | super(RegressionSuite, self).__init__(tuning_ranges, models, cv, njobs, pre_dispatch, stack, verbose)
489 |
490 |         self.scorer = None  # set below based on the requested regression metric
491 | self.nfeatures = n_features
492 | self.metric = metric.lower()
493 | if self.metric == 'lad':
494 | self.scorer = make_scorer(mean_absolute_error, greater_is_better=False)
495 | elif self.metric == 'mse':
496 | self.scorer = make_scorer(mean_squared_error, greater_is_better=False)
497 |
498 | def predict(self, X, weights='auto'):
499 |
500 | y_predict_all = super(RegressionSuite, self).predict_all(X)
501 |
502 |         if weights == 'uniform':
503 | # just use uniform weighting
504 | weights = {name: 1.0 for name in self.model_names}
505 |
506 |         if weights == 'auto':
507 | # weight based on validation score
508 | weights = self.best_scores
509 |
510 | if self.stack:
511 | # combine the model outputs
512 | y_predict = 0.0
513 | wsum = 0.0
514 | for name in y_predict_all:
515 | y_predict += weights[name] * y_predict_all[name]
516 | wsum += weights[name]
517 | y_predict /= wsum
518 | else:
519 | y_predict = y_predict_all
520 |
521 | return y_predict
--------------------------------------------------------------------------------
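A usage sketch for ClassificationSuite, assuming bck_stats is installed against the scikit-learn 0.15-era API imported at the top of sklearn_estimator_suite.py (sklearn.grid_search and sklearn.cross_validation were removed in later scikit-learn releases). The toy data and the restricted two-model tuning grid are invented for illustration.

import numpy as np
from bck_stats.sklearn_estimator_suite import ClassificationSuite

# toy three-class problem with 5 features
rng = np.random.RandomState(0)
X = rng.standard_normal((300, 5))
y = np.where(X[:, 0] + X[:, 1] > 0.5, 'a', np.where(X[:, 2] > 0.0, 'b', 'c'))

# restrict the suite to two fast models so the example runs quickly; the grids mirror the built-in defaults
tuning_ranges = {'LogisticRegression': {'C': list(np.logspace(-2.0, 1.0, 5))},
                 'DecisionTreeClassifier': {'max_depth': [5, 10, 20, 50, None]}}
suite = ClassificationSuite(tuning_ranges=tuning_ranges, cv=3, njobs=1, verbose=False)
suite.fit(X, y, n_refinements=1)

y_hat = suite.predict(X)          # stacked prediction: weighted majority vote across the fitted models
per_model = suite.predict_all(X)  # per-model predictions (encoded class indices), keyed by estimator class name
print(suite.best_scores)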