├── .idea ├── .name ├── scopes │ └── scope_settings.xml ├── encodings.xml ├── vcs.xml ├── misc.xml ├── modules.xml └── bck_stats.iml ├── bck_stats ├── __init__.py ├── multiclass_triangle_plot.py ├── gcv_smoother.py ├── avg_pred_comp.py ├── react.py ├── dynamic_linear_model.py ├── super_pca.py ├── dba.py └── sklearn_estimator_suite.py ├── setup.py ├── LICENSE.txt ├── README.md ├── test_estimator_suite.py └── .gitignore /.idea/.name: -------------------------------------------------------------------------------- 1 | bck_stats -------------------------------------------------------------------------------- /bck_stats/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'brandonkelly' 2 | 3 | -------------------------------------------------------------------------------- /.idea/scopes/scope_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | __author__ = 'brandonkelly' 2 | 3 | from distutils.core import setup 4 | setup(name='bck_stats', version='0.1', author='Brandon C. Kelly', 5 | description='Routines for various statistical and machine learning techniques.', 6 | packages=['bck_stats']) -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /.idea/bck_stats.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Brandon C. Kelly 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 
22 | 
23 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | bck_stats
2 | =========
3 | 
4 | Routines for implementing various statistical and machine learning techniques.
5 | 
6 | Description of routines:
7 | 
8 | * `super_pca`: Class for performing supervised principal components regression (Bair, E., et al. *Prediction by supervised principal components.* J. Am. Stat. Assoc. 101, 473, 2006).
9 | * `sklearn_estimator_suite`: Classes for running through a set of scikit-learn estimators, using cross-validation to choose the tuning parameters.
10 | * `react`: Classes for performing non-parametric regression in one or two dimensions based on the REACT technique (Beran, R. *REACT scatterplot smoothers: Superefficiency through basis economy.* J. Am. Stat. Assoc. 95, 449, 2000).
11 | * `multiclass_triangle_plot`: Plot the lower triangle of a scatterplot matrix, color-coding according to class label. A modified version of Dan Foreman-Mackey's triangle.py routine.
12 | * `gcv_smoother`: Perform exponential smoothing of a time series. The e-folding time scale is chosen using generalized cross-validation.
13 | * `dynamic_linear_model`: Class to perform dynamic linear regression via least squares (Montana, G., et al. *Flexible least squares for temporal data mining and statistical arbitrage.* Expert Systems with Applications 36, 2819, 2009).
14 | * `dba`: Compute the dynamic time warping barycentric average of a set of time series (Petitjean, F., et al. *A global averaging method for dynamic time warping, with applications to clustering.* Pattern Recognition, 44, 678, 2011). Also contains a function to compute the dynamic time warping distance.
15 | 
16 | -------------
17 | Installation
18 | -------------
19 | 
20 | From the base directory, type `python setup.py install` in a terminal.
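
-------------
Example
-------------

The snippet below is a minimal usage sketch, adapted from the `__main__` example in `gcv_smoother.py` and assuming the package has been installed as `bck_stats` (note that the package code targets Python 2). It smooths a noisy cosine with `GcvExpSmoother`, with the e-folding length chosen by generalized cross-validation.

```python
import numpy as np
from bck_stats.gcv_smoother import GcvExpSmoother

# noisy cosine test signal
x = np.arange(500)
y = np.cos(x / 15.0) + 0.1 * np.random.standard_normal(500)

smoother = GcvExpSmoother(lookback=30)  # lookback: maximum number of past points used
smoother.choose_efold(y, verbose=True)  # pick the e-folding length by GCV
ysmooth = smoother.smooth(y)            # smoothed estimate of y
```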
21 | -------------------------------------------------------------------------------- /test_estimator_suite.py: -------------------------------------------------------------------------------- 1 | __author__ = 'brandonkelly' 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from sklearn_estimator_suite import ClassificationSuite 6 | from sklearn.datasets import make_classification 7 | from sklearn.cross_validation import train_test_split 8 | from sklearn.metrics import accuracy_score 9 | 10 | n_samples = 2000 11 | n_classes = 3 12 | X, y = make_classification(n_samples, n_classes=n_classes, n_informative=10) 13 | 14 | X, X_test, y, y_test = train_test_split(X, y, train_size=0.5) 15 | 16 | # suite = ClassificationSuite(n_features=X.shape[1]) 17 | # 18 | # suite.fit(X, y) 19 | # names = suite.best_scores.keys() 20 | # scores = suite.best_scores.values() 21 | # 22 | # fig, ax1 = plt.subplots() 23 | # plt.bar(np.arange(0, len(names)), scores) 24 | # xtickNames = plt.setp(ax1, xticklabels=names) 25 | # plt.setp(xtickNames, rotation=45) 26 | # plt.ylabel('Accuracy') 27 | # plt.xlabel('Model') 28 | # plt.show() 29 | 30 | # now make sure things work in parallel 31 | suite = ClassificationSuite(n_features=X.shape[1], njobs=7) 32 | 33 | suite.fit(X, y) 34 | 35 | names = suite.best_scores.keys() 36 | scores = suite.best_scores.values() 37 | 38 | # get predictions 39 | y_predict_uniform = suite.predict(X_test, weights='uniform') # uniform weightings 40 | y_predict_stacked = suite.predict(X_test) 41 | 42 | uniform_accuracy = accuracy_score(y_test, y_predict_uniform) 43 | stacked_accuracy = accuracy_score(y_test, y_predict_stacked) 44 | y_predict_all = suite.predict_all(X_test) 45 | 46 | print '' 47 | print '---' 48 | print 'Test accuracy for uniform weighting:', uniform_accuracy 49 | print 'Test accuracy for validation score weighting:', stacked_accuracy 50 | for name in y_predict_all: 51 | print 'Test accuracy for', name, ':', accuracy_score(y_test, y_predict_all[name]) 52 | print '---' 53 | print '' 54 | 55 | fig, ax1 = plt.subplots() 56 | plt.bar(np.arange(0, len(names)), scores) 57 | xtickNames = plt.setp(ax1, xticklabels=names) 58 | plt.setp(xtickNames, rotation=45) 59 | plt.ylabel('Accuracy') 60 | plt.xlabel('Model') 61 | plt.show() 62 | 63 | # try using different number of grid refinements for the models 64 | n_refinements = {name: 1 for name in suite.model_names} 65 | n_refinements['GbcAutoNtrees'] = 0 66 | 67 | suite.fit(X, y, n_refinements=n_refinements) 68 | 69 | names = suite.best_scores.keys() 70 | scores = suite.best_scores.values() 71 | 72 | fig, ax1 = plt.subplots() 73 | plt.bar(np.arange(0, len(names)), scores) 74 | xtickNames = plt.setp(ax1, xticklabels=names) 75 | plt.setp(xtickNames, rotation=45) 76 | plt.ylabel('Accuracy') 77 | plt.xlabel('Model') 78 | plt.tight_layout() 79 | plt.show() 80 | 81 | tuning_ranges = {'LogisticRegression': {'C': list(np.logspace(-3.0, 0.0, 5))}, 82 | 'DecisionTreeClassifier': {'max_depth': [5, 10, 20, 50, 100]}, 83 | 'LinearSVC': {'C': list(np.logspace(-3.0, 0.0, 5))}} 84 | 85 | suite = ClassificationSuite(tuning_ranges=tuning_ranges, njobs=7) 86 | 87 | suite.fit(X, y, n_refinements=3) 88 | 89 | names = suite.best_scores.keys() 90 | scores = suite.best_scores.values() 91 | 92 | fig, ax1 = plt.subplots() 93 | plt.bar(np.arange(0, len(names)), scores) 94 | xtickNames = plt.setp(ax1, xticklabels=names) 95 | plt.setp(xtickNames, rotation=45) 96 | plt.ylabel('Accuracy') 97 | plt.xlabel('Model') 98 | plt.show() 99 | 100 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ######################### 2 | # .gitignore file for Xcode4 / OS X Source projects 3 | # 4 | # Version 2.0 5 | # For latest version, see: http://stackoverflow.com/questions/49478/git-ignore-file-for-xcode-projects 6 | # 7 | # 2013 updates: 8 | # - fixed the broken "save personal Schemes" 9 | # 10 | # NB: if you are storing "built" products, this WILL NOT WORK, 11 | # and you should use a different .gitignore (or none at all) 12 | # This file is for SOURCE projects, where there are many extra 13 | # files that we want to exclude 14 | # 15 | ######################### 16 | 17 | ##### 18 | # OS X temporary files that should never be committed 19 | 20 | .DS_Store 21 | *.swp 22 | *.lock 23 | profile 24 | 25 | 26 | #### 27 | # Xcode temporary files that should never be committed 28 | # 29 | # NB: NIB/XIB files still exist even on Storyboard projects, so we want this... 30 | 31 | *~.nib 32 | 33 | 34 | #### 35 | # Xcode build files - 36 | # 37 | # NB: slash on the end, so we only remove the FOLDER, not any files that were badly named "DerivedData" 38 | 39 | DerivedData/ 40 | 41 | # NB: slash on the end, so we only remove the FOLDER, not any files that were badly named "build" 42 | 43 | build/ 44 | 45 | 46 | ##### 47 | # Xcode private settings (window sizes, bookmarks, breakpoints, custom executables, smart groups) 48 | # 49 | # This is complicated: 50 | # 51 | # SOMETIMES you need to put this file in version control. 52 | # Apple designed it poorly - if you use "custom executables", they are 53 | # saved in this file. 54 | # 99% of projects do NOT use those, so they do NOT want to version control this file. 55 | # ..but if you're in the 1%, comment out the line "*.pbxuser" 56 | 57 | *.pbxuser 58 | *.mode1v3 59 | *.mode2v3 60 | *.perspectivev3 61 | # NB: also, whitelist the default ones, some projects need to use these 62 | !default.pbxuser 63 | !default.mode1v3 64 | !default.mode2v3 65 | !default.perspectivev3 66 | 67 | 68 | #### 69 | # Xcode 4 - semi-personal settings 70 | # 71 | # 72 | # OPTION 1: --------------------------------- 73 | # throw away ALL personal settings (including custom schemes! 74 | # - unless they are "shared") 75 | # 76 | # NB: this is exclusive with OPTION 2 below 77 | xcuserdata 78 | *.xcworkspacedata 79 | 80 | # OPTION 2: --------------------------------- 81 | # get rid of ALL personal settings, but KEEP SOME OF THEM 82 | # - NB: you must manually uncomment the bits you want to keep 83 | # 84 | # NB: this is exclusive with OPTION 1 above 85 | # 86 | #xcuserdata/**/* 87 | 88 | # (requires option 2 above): Personal Schemes 89 | # 90 | #!xcuserdata/**/xcschemes/* 91 | 92 | #### 93 | # XCode 4 workspaces - more detailed 94 | # 95 | # Workspaces are important! They are a core feature of Xcode - don't exclude them :) 96 | # 97 | # Workspace layout is quite spammy. 
For reference: 98 | # 99 | # /(root)/ 100 | # /(project-name).xcodeproj/ 101 | # project.pbxproj 102 | # /project.xcworkspace/ 103 | # contents.xcworkspacedata 104 | # /xcuserdata/ 105 | # /(your name)/xcuserdatad/ 106 | # UserInterfaceState.xcuserstate 107 | # /xcsshareddata/ 108 | # /xcschemes/ 109 | # (shared scheme name).xcscheme 110 | # /xcuserdata/ 111 | # /(your name)/xcuserdatad/ 112 | # (private scheme).xcscheme 113 | # xcschememanagement.plist 114 | # 115 | # 116 | 117 | #### 118 | # Xcode 4 - Deprecated classes 119 | # 120 | # Allegedly, if you manually "deprecate" your classes, they get moved here. 121 | # 122 | # We're using source-control, so this is a "feature" that we do not want! 123 | 124 | *.moved-aside 125 | 126 | #### 127 | # Cocoapods: cocoapods.org 128 | # 129 | # Ignoring these files means that whoever uses the code will first have to run: 130 | # pod install 131 | # in the App.xcodeproj directory. 132 | # This ensures the latest dependencies are used. 133 | Pods/ 134 | Podfile.lock 135 | 136 | ###### 137 | # Ignore .pyc files 138 | *.pyc 139 | 140 | #### 141 | # Ignore PyCharm files 142 | .idea/ 143 | __pychache__/ 144 | 145 | # ignore csv files 146 | # *.csv 147 | 148 | #ignore data 149 | #data/ 150 | #plots/ 151 | 152 | #ignore pickle files 153 | *.pickle 154 | *.p 155 | *.h5 -------------------------------------------------------------------------------- /bck_stats/multiclass_triangle_plot.py: -------------------------------------------------------------------------------- 1 | __author__ = 'brandonkelly' 2 | __notes__ = "Adapted from Dan Foreman-Mackey triangle.py module." 3 | 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from matplotlib.ticker import MaxNLocator 7 | 8 | 9 | def multiclass_triangle(xs, classes, labels=None, verbose=True, fig=None, **kwargs): 10 | # Deal with 1D sample lists. 11 | xs = np.atleast_1d(xs) 12 | if len(xs.shape) == 1: 13 | xs = np.atleast_2d(xs) 14 | else: 15 | assert len(xs.shape) == 2, "The input sample array must be 1- or 2-D." 16 | xs = xs.T 17 | assert xs.shape[0] <= xs.shape[1], "I don't believe that you want more " \ 18 | "dimensions than samples!" 19 | 20 | K = len(xs) 21 | factor = 2.0 # size of one side of one panel 22 | lbdim = 0.5 * factor # size of left/bottom margin 23 | trdim = 0.05 * factor # size of top/right margin 24 | whspace = 0.05 # w/hspace size 25 | plotdim = factor * K + factor * (K - 1.) * whspace 26 | dim = lbdim + plotdim + trdim 27 | 28 | if fig is None: 29 | fig, axes = plt.subplots(K, K, figsize=(dim, dim)) 30 | else: 31 | try: 32 | axes = np.array(fig.axes).reshape((K, K)) 33 | except: 34 | raise ValueError("Provided figure has {0} axes, but data has " 35 | "dimensions K={1}".format(len(fig.axes), K)) 36 | lb = lbdim / dim 37 | tr = (lbdim + plotdim) / dim 38 | fig.subplots_adjust(left=lb, bottom=lb, right=tr, top=tr, 39 | wspace=whspace, hspace=whspace) 40 | 41 | extents = [[x.min(), x.max()] for x in xs] 42 | 43 | # Check for parameters that never change. 44 | m = np.array([e[0] == e[1] for e in extents], dtype=bool) 45 | if np.any(m): 46 | raise ValueError(("It looks like the parameter(s) in column(s) " 47 | "{0} have no dynamic range. 
Please provide an " 48 | "`extent` argument.") 49 | .format(", ".join(map("{0}".format, 50 | np.arange(len(m))[m])))) 51 | 52 | class_labels = np.unique(classes) 53 | nclasses = len(class_labels) 54 | 55 | color_list = ["Black", "DodgerBlue", "DarkOrange", "Green", "Magenta", "Red", "Brown", "Cyan"] * 10 56 | 57 | for i, x in enumerate(xs): 58 | ax = axes[i, i] 59 | # Plot the histograms. 60 | n = [] 61 | for l, k in enumerate(class_labels): 62 | n_k, b_k, p_k = ax.hist(x[classes == k], bins=kwargs.get("bins", 50), 63 | range=extents[i], histtype="step", 64 | color=color_list[l], lw=2, normed=True) 65 | n.append(n_k) 66 | 67 | # Set up the axes. 68 | ax.set_xlim(extents[i]) 69 | ax.set_ylim(0, 1.1 * np.max(n)) 70 | ax.set_yticklabels([]) 71 | ax.xaxis.set_major_locator(MaxNLocator(5)) 72 | 73 | # Not so DRY. 74 | if i < K - 1: 75 | ax.set_xticklabels([]) 76 | else: 77 | [l.set_rotation(45) for l in ax.get_xticklabels()] 78 | if labels is not None: 79 | ax.set_xlabel(labels[i]) 80 | ax.xaxis.set_label_coords(0.5, -0.3) 81 | 82 | for j, y in enumerate(xs): 83 | ax = axes[i, j] 84 | if j > i: 85 | ax.set_visible(False) 86 | ax.set_frame_on(False) 87 | continue 88 | elif j == i: 89 | continue 90 | 91 | for l, k in enumerate(class_labels): 92 | ax.plot(y[classes == k], x[classes == k], 'o', ms=1.5, color=color_list[l], rasterized=True, alpha=0.25) 93 | 94 | extent = [[y.min(), y.max()], [x.min(), x.max()]] 95 | ax.set_xlim(extent[0]) 96 | ax.set_ylim(extent[1]) 97 | ax.xaxis.set_major_locator(MaxNLocator(5)) 98 | ax.yaxis.set_major_locator(MaxNLocator(5)) 99 | 100 | if i < K - 1: 101 | ax.set_xticklabels([]) 102 | else: 103 | [l.set_rotation(45) for l in ax.get_xticklabels()] 104 | if labels is not None: 105 | ax.set_xlabel(labels[j]) 106 | ax.xaxis.set_label_coords(0.5, -0.3) 107 | 108 | if j > 0: 109 | ax.set_yticklabels([]) 110 | else: 111 | [l.set_rotation(45) for l in ax.get_yticklabels()] 112 | if labels is not None: 113 | ax.set_ylabel(labels[i]) 114 | ax.yaxis.set_label_coords(-0.3, 0.5) 115 | 116 | return fig -------------------------------------------------------------------------------- /bck_stats/gcv_smoother.py: -------------------------------------------------------------------------------- 1 | __author__ = 'brandonkelly' 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | class GcvExpSmoother(object): 8 | def __init__(self, lookback=30): 9 | """ 10 | Constructor for class to perform exponentially-weighted average smoothing of a 1-D data set. 11 | 12 | @param lookback: The maximum look-back length to use in the smoothing. Only the data points in 13 | y[idx - lookback:idx] are used to compute the smoothed estimate of y[idx+1]. 14 | """ 15 | self.lookback = int(lookback) # support of exponential smoother, only use this many data points in computation 16 | self.efold = 1.0 17 | self.gcv_grid = np.zeros(2.0 * self.lookback) 18 | self.efold_grid = np.zeros(2.0 * self.lookback) 19 | 20 | def smooth(self, y): 21 | """ 22 | Return a smoothed estimate of y, using the current value of self.efold for the e-folding length. 23 | 24 | @param y: The data, a 1-D array. 25 | @return: The smoothed estimate of y, a 1-D numpy array. 
26 |         """
27 |         ysmooth, peff = self._smooth(self.efold, y)
28 |         return ysmooth
29 | 
30 |     def weights(self, efold, lookback=None):
31 |         if lookback is None:
32 |             lookback = self.lookback
33 |         xvalues = np.arange(0.0, lookback)
34 |         weights = np.exp(-xvalues / efold)
35 |         return weights[::-1] / np.sum(weights)
36 | 
37 |     def choose_efold(self, y, verbose=False):
38 |         """
39 |         Choose the optimal e-folding length of the exponential smoothing kernel using generalized cross-validation.
40 | 
41 |         @param y: The training set, a 1-D array.
42 |         @param verbose: If true, then print the chosen smoothing length.
43 |         """
44 |         ngrid = 20
45 |         efold_grid = np.logspace(-1.0, np.log10(self.lookback * 2.0), ngrid)
46 |         gcv_grid = np.zeros(efold_grid.size)
47 |         for i in xrange(efold_grid.size):
48 |             smoothed_y, peffective = self._smooth(efold_grid[i], y)
49 |             gcv_grid[i] = gcv_error(y, smoothed_y, peffective)
50 | 
51 |         # choose e-folding length of smoother to minimize the generalized cross-validation error
52 |         self.efold = efold_grid[gcv_grid.argmin()]
53 |         if verbose:
54 |             print 'E-folding length chosen to be', self.efold
55 | 
56 |         # save the grids
57 |         self.efold_grid = efold_grid
58 |         self.gcv_grid = gcv_grid
59 | 
60 |     def _smooth(self, efold, y):
61 |         if y.size <= self.lookback:
62 |             raise ValueError('y must have more than self.lookback elements.')
65 | 
66 |         ysmooth = np.zeros(y.size)
67 |         ysmooth[0] = y[0]
68 | 
69 |         peffective = 0.0  # trace of the smoothing matrix, the effective number of parameters
70 | 
71 |         # treat the first self.lookback data points separately, since the baseline is shorter
72 |         for i in xrange(1, self.lookback):
73 |             weights = self.weights(efold, lookback=i)
74 |             ysmooth[i] = weights.dot(y[0:i])
75 |             peffective += weights[-1]
76 | 
77 |         weights = self.weights(efold)
78 |         for i in xrange(y.size - self.lookback - 1):
79 |             idx = self.lookback + i
80 |             # estimate current y as exponentially-weighted average of previous self.lookback y-values
81 |             ysmooth[idx] = weights.dot(y[idx - self.lookback:idx])
82 |             peffective += weights[-1]
83 | 
84 |         ysmooth[-1] = weights.dot(y[y.size - self.lookback - 1:-1])
85 |         peffective += weights[-1]
86 | 
87 |         return ysmooth, peffective
88 | 
89 | 
90 | def gcv_error(y, ysmooth, peffective):
91 |     """
92 |     Compute generalized cross-validation error.
93 | 
94 |     @param y: The numpy array of y-values.
95 |     @param ysmooth: The smoothed numpy array of y-values.
96 |     @param peffective: The effective number of parameters of the smoothing matrix, given by its trace.
97 |     @return: The generalized cross-validation error (L2-loss function).
98 | """ 99 | gcv = np.mean((y - ysmooth) ** 2) / (1.0 - peffective / y.size) ** 2 100 | return gcv 101 | 102 | 103 | if __name__ == "__main__": 104 | # example usage 105 | x = np.arange(500) 106 | y = np.cos(x / 15.0) + 0.1 * np.random.standard_normal(500) 107 | 108 | gcv = GcvExpSmoother() 109 | gcv.choose_efold(y, verbose=True) 110 | ysmooth = gcv.smooth(y) 111 | 112 | plt.semilogy(gcv.efold_grid, gcv.gcv_grid) 113 | plt.xlabel('E-folding length') 114 | plt.ylabel('GCV Error') 115 | plt.show() 116 | 117 | plt.clf() 118 | plt.plot(x, y, '.', label='Data') 119 | plt.plot(x, ysmooth, label='Smoothed', lw=2) 120 | plt.legend() 121 | plt.show() 122 | -------------------------------------------------------------------------------- /bck_stats/avg_pred_comp.py: -------------------------------------------------------------------------------- 1 | __author__ = 'brandonkelly' 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from sklearn.neighbors import NearestNeighbors 6 | from scipy.spatial.distance import cdist 7 | from scipy import linalg 8 | import multiprocessing 9 | 10 | 11 | def distance_matrix(Xvals): 12 | covar = np.cov(Xvals, rowvar=0) 13 | covar_inv = linalg.inv(covar) 14 | Dmat = cdist(Xvals, Xvals, metric='mahalanobis', VI=covar_inv) 15 | 16 | return Dmat 17 | 18 | 19 | def impact_single_theta(args): 20 | predict, theta, X, p_idx, weights, predict_args = args 21 | # first compute the matrix of model predictions: 22 | # y_predict[i, j] = E(y|u_i, v_j, theta) 23 | ndata = X.shape[0] 24 | X_copy = X.copy() 25 | u = X[:, p_idx] # the active predictor 26 | y_predict = np.zeros((ndata, ndata)) 27 | for i in range(ndata): 28 | X_copy[:, p_idx] = u[i] 29 | y_predict[i] = predict(X_copy, theta, *predict_args) 30 | 31 | # get matrix of signs of transitions 32 | transition_sign = np.zeros((ndata, ndata)) 33 | for j in range(ndata): 34 | transition_sign[:, j] = np.sign(u - u[j]) 35 | 36 | u1, u2 = np.meshgrid(u, u) 37 | transition_sign = np.sign(u2 - u1) 38 | y_predict_diff = y_predict - np.outer(np.ones(ndata), y_predict.diagonal()) 39 | numer = np.sum(weights * y_predict_diff * transition_sign) # signed version 40 | abs_numer = np.sum(weights * np.abs(y_predict_diff)) # absolute version 41 | # denom = np.sum(weights * (u2 - u1) * np.sign(u2 - u1)) 42 | denom = np.sum(weights) 43 | 44 | return numer / denom, abs_numer / denom 45 | 46 | 47 | def impact(predict, theta, X, predictors=None, predict_args=None, nneighbors=None, nx=None, ntheta=None, 48 | mahalanobis_constant=1.0, n_jobs=1): 49 | 50 | if n_jobs < 0: 51 | n_jobs = multiprocessing.cpu_count() 52 | if n_jobs > 1: 53 | pool = multiprocessing.Pool(n_jobs) 54 | 55 | if predictors is None: 56 | # calculate the impact for all the predictors 57 | predictors = np.arange(X.shape[1]) 58 | 59 | if nx is not None: 60 | # use only a subset of the data points 61 | subset_idx = np.random.permutation(X.shape[0])[:nx] 62 | X = X[subset_idx] 63 | else: 64 | nx = X.shape[0] 65 | if ntheta is not None: 66 | # use only a subset of the theta samples 67 | subset_idx = np.random.permutation(theta.shape[0]) 68 | theta = theta[subset_idx] 69 | else: 70 | ntheta = theta.shape[0] 71 | if nneighbors is None: 72 | # use all of the neighbors when computing the weights 73 | nneighbors = X.shape[0] 74 | 75 | # first compute the distance matrix 76 | Dmat = distance_matrix(X) 77 | weights0 = 1.0 / (mahalanobis_constant + Dmat) 78 | 79 | # get the sets of nearest neighbors 80 | knn = NearestNeighbors(n_neighbors=nneighbors) 81 | knn.fit(X) 82 | nn_idx = 
knn.kneighbors(X, return_distance=False) 83 | 84 | weights = np.zeros_like(weights0) 85 | for i in range(weights.shape[0]): 86 | # data points outside of K nearest neighbors have weight of zero 87 | weights[nn_idx[i], i] = weights0[nn_idx[i], i] 88 | 89 | weights /= weights.sum(axis=0) # normalize weights to contribution to impact for each data point is the same 90 | 91 | impacts = np.zeros(len(predictors)) 92 | abs_impacts = np.zeros_like(impacts) 93 | impact_sigmas = np.zeros_like(impacts) 94 | abs_impact_sigma = np.zeros_like(impacts) 95 | print 'Doing predictor' 96 | for p_idx in predictors: 97 | print p_idx, '...' 98 | args = [] 99 | for s in range(ntheta): 100 | args.append([predict, theta[s], X, p_idx, weights, predict_args]) 101 | if n_jobs == 1: 102 | results = map(impact_single_theta, args) 103 | else: 104 | results = pool.map(impact_single_theta, args) 105 | results = np.array(results) 106 | impacts[p_idx] = np.mean(results[:, 0]) 107 | impact_sigmas[p_idx] = np.std(results[:, 0]) 108 | abs_impacts[p_idx] = np.mean(results[:, 1]) 109 | abs_impact_sigma[p_idx] = np.std(results[:, 1]) 110 | 111 | # impact_theta = np.zeros(theta.shape) 112 | # impact_theta_abs = np.zeros_like(impact_theta) 113 | # for s in range(ntheta): 114 | # impact_s, abs_impact_s = impact_single_theta(predict, theta[s], X, p_idx, weights, predict_args=predict_args) 115 | # impact_theta[s] = impact_s 116 | # impact_theta_abs[s] = abs_impact_s 117 | # impacts[p_idx] = np.mean(impact_theta) 118 | # impact_sigmas[p_idx] = np.std(impact_theta) 119 | # abs_impacts[p_idx] = np.mean(impact_theta_abs) 120 | # abs_impact_sigma[p_idx] = np.std(impact_theta_abs) 121 | 122 | return impacts, impact_sigmas, abs_impacts, abs_impact_sigma 123 | 124 | 125 | if __name__ == "__main__": 126 | # test and example usage 127 | ndata = 200 128 | beta = np.array([1.0, 2.0, -0.6, 0.1]) 129 | sigma = 0.1 130 | X = np.column_stack((np.ones(ndata), np.random.standard_normal(ndata), np.random.uniform(0.0, 5.0, ndata), 131 | np.random.standard_cauchy(ndata))) 132 | y = X.dot(beta) + sigma * np.random.standard_normal(ndata) 133 | 134 | XX_inv = linalg.inv(X.T.dot(X)) 135 | bhat = XX_inv.dot(X.T.dot(y)) 136 | bcov = XX_inv * sigma * sigma 137 | 138 | nsamples = 100 139 | betas = np.random.multivariate_normal(bhat, bcov, nsamples) 140 | betas = betas[:, 1:] # ignore constant term 141 | 142 | def linear_mean(X, beta, constant): 143 | ymean = X.dot(beta) + constant 144 | return ymean 145 | 146 | # don't include constant term 147 | impacts, isigmas, abs_impacts, aisigmas = \ 148 | impact(linear_mean, betas, X[:, 1:], predict_args=(bhat[0],), nneighbors=20, n_jobs=4) 149 | print impacts 150 | sorted_idx = np.argsort(np.abs(impacts)) 151 | 152 | labels = np.array(['x1', 'x2', 'x3'])[sorted_idx] 153 | 154 | pos = np.arange(sorted_idx.shape[0]) + .5 155 | plt.barh(pos, impacts[sorted_idx], align='center', xerr=isigmas[sorted_idx], alpha=0.5) 156 | plt.yticks(pos, labels) 157 | plt.xlabel('Impact') 158 | plt.show() -------------------------------------------------------------------------------- /bck_stats/react.py: -------------------------------------------------------------------------------- 1 | __author__ = 'brandonkelly' 2 | 3 | import numpy as np 4 | from sklearn.isotonic import IsotonicRegression 5 | 6 | 7 | class REACT(object): 8 | 9 | def __init__(self, basis='DCT', n_components=None, method='monotone'): 10 | try: 11 | basis.lower() in ['dct', 'manual'] 12 | except ValueError: 13 | 'Input basis must be either DCT or manual.' 
14 | 
15 |         if method.lower() not in ['monotone', 'nss']:
16 |             raise ValueError('method must be either monotone or nss.')
19 | 
20 |         self.basis = basis
21 |         self.nfreq = 1
22 |         self.ncomp = 1
23 |         self.n_components = n_components
24 |         self.method = method
25 |         self.coefs = np.zeros(1)
26 |         self.shrinkage_factors = np.zeros(1)
27 | 
28 |     def fit(self, y, X=None, sigsqr=None):
29 | 
30 |         # check inputs
31 |         if X is None:
32 |             # build the discrete cosine basis
33 |             if self.n_components is None:
34 |                 n_components = len(y)
35 |             else:
36 |                 n_components = self.n_components
37 |             X = self.build_dct(len(y), n_components)
38 |             self.nfreq = len(y)
39 |             self.ncomp = n_components
40 |         else:
41 |             if self.n_components is None:
42 |                 n_components = X.shape[1]
43 |             else:
44 |                 n_components = self.n_components
45 |             self.ncomp = n_components
46 | 
47 |         if n_components > len(y):
48 |             raise ValueError('Number of components cannot exceed the length of y.')
51 | 
52 |         self.coefs = np.dot(X.T, y)
53 | 
54 |         if sigsqr is None:
55 |             # estimate noise variance using first difference estimator
56 |             sigsqr = np.sum((y[1:] - y[:-1]) ** 2) / (2.0 * (len(y) - 1))
57 | 
58 |         if self.method == 'monotone':
59 |             # use monotone shrinkage on the basis coefficients
60 |             self._set_shrinkage_factors(sigsqr)
61 |         else:
62 |             # use nested subset selection to choose the order of the basis expansion
63 |             self._set_nss_order(sigsqr)
64 | 
65 |         self.coefs *= self.shrinkage_factors
66 | 
67 |         ysmooth = X.dot(self.coefs)
68 |         return ysmooth
69 | 
70 |     @staticmethod
71 |     def build_dct(n, p):
72 |         rows, columns = np.mgrid[:n, :p]
73 |         U = np.cos(np.pi * rows * columns / (n - 1.0))
74 |         row_norm = 2 * np.ones(n)
75 |         row_norm[0] = 1.0
76 |         row_norm[-1] = 1.0
77 |         col_norm = 2 * np.ones(p)
78 |         col_norm[0] = 1.0
79 |         if p == n:
80 |             col_norm[-1] = 1.0
81 |         U *= 0.5 * np.sqrt(2.0 * np.outer(row_norm, col_norm) / (n - 1))
82 | 
83 |         return U
84 | 
85 |     def interpolate(self, x_idx):
86 |         if self.basis.lower() != 'dct':
87 |             raise ValueError('Interpolation only available for DCT basis.')
90 | 91 | n = self.nfreq 92 | p = self.ncomp 93 | cols = np.arange(p) 94 | row_norm = 2 * np.ones(n) 95 | row_norm[0] = 1.0 96 | row_norm[-1] = 1.0 97 | col_norm = 2 * np.ones(p) 98 | col_norm[0] = 1.0 99 | U = np.cos(np.pi * np.outer(x_idx / n, cols)) 100 | U *= 0.5 * np.sqrt(2.0 * np.outer(row_norm, col_norm) / (n - 1)) 101 | y_interp = U.dot(self.coefs) 102 | return y_interp 103 | 104 | def _set_shrinkage_factors(self, sigsqr): 105 | coefs_snr = (self.coefs ** 2 - sigsqr) / self.coefs ** 2 # signal-to-noise ratio of the coefficients 106 | coefs_snr[coefs_snr < 0] = 0.0 107 | x = np.arange(len(coefs_snr)) 108 | weights = self.coefs ** 2 109 | self.shrinkage_factors = \ 110 | IsotonicRegression(y_min=0.0, y_max=1.0, increasing=False).fit_transform(x, coefs_snr, weights) 111 | 112 | def _set_nss_order(self, sigsqr): 113 | coefs_snr = (self.coefs ** 2 - sigsqr) / self.coefs ** 2 # signal-to-noise ratio of the coefficients 114 | coefs_snr[coefs_snr < 0] = 0.0 115 | risk = np.empty(len(coefs_snr)) 116 | shrinkage_factor = np.zeros(len(coefs_snr)) 117 | for j in xrange(len(risk)): 118 | shrinkage_factor[:j+1] = 1.0 119 | risk[j] = np.mean((shrinkage_factor - coefs_snr) ** 2 * self.coefs ** 2) 120 | best_order = risk.argmin() 121 | self.shrinkage_factors = np.ones(len(coefs_snr)) 122 | self.shrinkage_factors[best_order:] = 0.0 # only keep first best_order basis coefficients 123 | 124 | 125 | class REACT2D(REACT): 126 | def __init__(self, max_order=None, method='monotone'): 127 | # currently only support the DCT for 2-D data 128 | super(REACT2D, self).__init__('DCT', max_order, method) 129 | self.row_order = np.zeros(1) 130 | self.col_order = np.zeros(1) 131 | 132 | def interpolate(self, x_idx): 133 | if True: 134 | print 'Interpolation not currently available for REACT2D' 135 | else: 136 | super(REACT2D, self).interpolate(x_idx) 137 | 138 | @staticmethod 139 | def build_dct(nrows, ncols, p): 140 | # first build 1-D basis vectors 141 | Urows = super(REACT2D, REACT2D).build_dct(nrows, p) 142 | Ucols = super(REACT2D, REACT2D).build_dct(ncols, p) 143 | # now build 2-d basis as outer products of 1-d basis vectors 144 | row_order, col_order = np.mgrid[:p, :p] 145 | row_order = row_order.ravel() + 1 146 | col_order = col_order.ravel() + 1 147 | # sort the basis images by the sum of squares of their orders 148 | sqr_order = row_order ** 2 + col_order ** 2 149 | s_idx = np.argsort(sqr_order) 150 | row_order = row_order[s_idx] 151 | col_order = col_order[s_idx] 152 | U = np.empty((nrows * ncols, len(row_order))) 153 | for j in xrange(len(row_order)): 154 | U[:, j] = np.outer(Urows[:, row_order[j]-1], Ucols[:, col_order[j]-1]).ravel() 155 | 156 | return U 157 | 158 | def fit(self, y, sigsqr): 159 | # build the discrete cosine basis 160 | if self.n_components is None: 161 | components_from_y = True 162 | self.n_components = min(y.shape) 163 | else: 164 | components_from_y = False 165 | 166 | try: 167 | self.n_components <= min(y.shape) 168 | except ValueError: 169 | 'Number of components must be less than the length of y.' 
170 | 171 | # build the 2-D DCT here and then feed into REACT.fit() 172 | X = self.build_dct(y.shape[0], y.shape[1], self.n_components) 173 | 174 | ysmooth = super(REACT2D, self).fit(y.ravel(), X, sigsqr) 175 | 176 | # save the orders of the basis functions 177 | row_order, col_order = np.mgrid[:self.n_components, :self.n_components] 178 | row_order = row_order.ravel() + 1 179 | col_order = col_order.ravel() + 1 180 | # sort the basis images by the sum of squares of their orders 181 | sqr_order = row_order ** 2 + col_order ** 2 182 | s_idx = np.argsort(sqr_order) 183 | self.row_order = row_order[s_idx] 184 | self.col_order = col_order[s_idx] 185 | 186 | if components_from_y: 187 | # return n_components to value from constructor 188 | self.n_components = None 189 | 190 | return np.reshape(ysmooth, y.shape) 191 | -------------------------------------------------------------------------------- /bck_stats/dynamic_linear_model.py: -------------------------------------------------------------------------------- 1 | __author__ = 'brandonkelly' 2 | 3 | import numpy as np 4 | import pykalman 5 | import matplotlib.pyplot as plt 6 | import multiprocessing 7 | 8 | 9 | def mae_loss(y, yfit): 10 | return np.mean(np.abs(y - yfit)) 11 | 12 | 13 | def _train_predict_dlm(args): 14 | """ 15 | Helper function to train and predict the dynamic linear model for a train and test set. Seperated from the main 16 | class to enable the use of the multiprocessing module. This should not be called directly. 17 | """ 18 | delta, X, y, ntrain, loss = args 19 | print delta 20 | dlm = DynamicLinearModel(include_constant=False) 21 | 22 | # first fit using the training data 23 | dlm.fit(X[:ntrain], y[:ntrain], delta=delta, method='filter') 24 | 25 | # now run the filter on the whole data set 26 | ntime, pfeat = X.shape 27 | observation_matrix = X.reshape((ntime, 1, pfeat)) 28 | k = dlm.kalman 29 | kalman = pykalman.KalmanFilter(transition_matrices=k.transition_matrices, 30 | observation_matrices=observation_matrix, 31 | observation_offsets=k.observation_offsets, 32 | transition_offsets=k.transition_offsets, 33 | observation_covariance=k.observation_covariance, 34 | transition_covariance=k.transition_covariance, 35 | initial_state_mean=k.initial_state_mean, 36 | initial_state_covariance=k.initial_state_covariance) 37 | 38 | beta, bcov = kalman.filter(y) 39 | 40 | # predict the y-values in the test set 41 | yfit = np.sum(beta[ntrain-1:-1] * X[ntrain-1:-1], axis=1) 42 | 43 | test_error = loss(y[ntrain:], yfit) 44 | 45 | return test_error 46 | 47 | 48 | class DynamicLinearModel(object): 49 | def __init__(self, include_constant=True): 50 | """ 51 | Constructor for linear regression model with dynamic coefficients. 52 | """ 53 | self.delta_grid = np.zeros(10) 54 | self.test_grid = np.zeros(10) 55 | self.delta = 1e-4 56 | self.test_error_ = 1.0 57 | self.kalman = pykalman.KalmanFilter() 58 | self.beta = np.zeros(2) 59 | self.beta_cov = np.identity(2) 60 | self.current_beta = np.zeros(2) 61 | self.current_bcov = np.identity(2) 62 | self.include_constant = include_constant 63 | 64 | @staticmethod 65 | def add_constant_(X): 66 | """ 67 | Add a constant to the linear model by prepending a column of ones to the feature array. 68 | 69 | @param X: The feature array. Note that it will be overwritten, and the overwritten array will be returned. 
70 | """ 71 | if X.ndim == 1: 72 | # treat vector-valued X differently 73 | X = np.insert(X[:, np.newaxis], 0, np.ones(len(X)), axis=1) 74 | else: 75 | X = np.insert(X, 0, np.ones(X.shape[0]), axis=1) 76 | 77 | return X 78 | 79 | def fit(self, X, y, method='smoother', delta=None, include_constant=None): 80 | """ 81 | Fit the coefficients for the dynamic linear model. 82 | 83 | @param method: The method used to estimate the dynamic coefficients, either 'smoother' or 'filter'. If 84 | 'smoother', then the Kalman Smoother is used, otherwise the Kalman Filter will be used. The two differ 85 | in the fact that the Kalman Smoother uses both future and past data, while the Kalman Filter only uses 86 | past data. 87 | @param X: The time-varying covariates, and (ntime, pfeat) array. 88 | @param y: The time-varying response, a 1-D array with ntime elements. 89 | @param delta: The regularization parameters on the time variation of the coefficients. Default is 90 | self.delta. 91 | @param include_constant: Boolean, if true then include a constant in the regression model. 92 | """ 93 | try: 94 | method.lower() in ['smoother', 'filter'] 95 | except ValueError: 96 | "method must be either 'smoother' or 'filter'." 97 | 98 | if delta is None: 99 | delta = self.delta 100 | else: 101 | self.delta = delta 102 | 103 | if include_constant is None: 104 | include_constant = self.include_constant 105 | else: 106 | self.include_constant = include_constant 107 | 108 | if include_constant: 109 | Xtemp = self.add_constant_(X.copy()) 110 | else: 111 | Xtemp = X.copy() 112 | 113 | ntime, pfeat = Xtemp.shape 114 | 115 | observation_matrix = Xtemp.reshape((ntime, 1, pfeat)) 116 | observation_offset = np.array([0.0]) 117 | 118 | transition_matrix = np.identity(pfeat) 119 | transition_offset = np.zeros(pfeat) 120 | 121 | mu = (1.0 - delta) / delta 122 | # Var(beta_t - beta_{t-1}) = 1.0 / mu 123 | transition_covariance = np.identity(pfeat) / mu 124 | 125 | # parameters to be estimated using MLE 126 | em_vars = ['initial_state_mean', 'initial_state_covariance'] 127 | kalman = pykalman.KalmanFilter(transition_matrices=transition_matrix, em_vars=em_vars, 128 | observation_matrices=observation_matrix, 129 | observation_offsets=observation_offset, transition_offsets=transition_offset, 130 | observation_covariance=np.array([1.0]), 131 | transition_covariance=transition_covariance) 132 | 133 | kalman.em(y) 134 | if method is 'smoother': 135 | beta, beta_covar = kalman.smooth(y) 136 | else: 137 | beta, beta_covar = kalman.filter(y) 138 | 139 | self.beta = beta 140 | self.beta_cov = beta_covar 141 | self.current_beta = beta[-1] 142 | self.current_bcov = beta_covar[-1] 143 | self.kalman = kalman 144 | 145 | def update(self, y, x): 146 | """ 147 | Update the linear regression coefficients given the new values of the response and features. 148 | 149 | @param y: The new response value, a scalar. 150 | @param x: The new feature vector. 
151 | """ 152 | if self.include_constant: 153 | observation_matrix = np.insert(x, 0, 1.0) 154 | else: 155 | observation_matrix = x.copy() 156 | 157 | pfeat = observation_matrix.size 158 | observation_matrix = observation_matrix.reshape((1, pfeat)) 159 | 160 | self.current_beta, self.current_bcov = \ 161 | self.kalman.filter_update(self.current_beta, self.current_bcov, observation=y, 162 | observation_matrix=observation_matrix) 163 | 164 | self.beta = np.vstack((self.beta, self.current_beta)) 165 | self.beta_cov = np.dstack((self.beta_cov.T, self.current_bcov)).T 166 | 167 | def predict(self, x): 168 | """ 169 | Predict a value of the response given the input feature array and the current value of the coefficients. 170 | 171 | @param x: The input feature array. 172 | """ 173 | if self.include_constant: 174 | xpredict = np.insert(x, 0, 1.0) 175 | else: 176 | xpredict = x 177 | 178 | return np.sum(self.current_beta * xpredict) 179 | 180 | def choose_delta(self, X, y, test_fraction=0.5, verbose=False, ndeltas=20, include_constant=True, loss=mae_loss, 181 | njobs=1): 182 | """ 183 | Choose the optimal regularization parameters for the linear smoother coefficients by minimizing an input loss 184 | function on a test set. 185 | 186 | @param X: The time-varying covariates, and (ntime, pfeat) array. 187 | @param y: The training set, a 1-D array. 188 | @param ndeltas: The number of grid points to use for the regularization parameter. 189 | @param test_fraction: The fraction of the input data to use as the test set, default is half. 190 | @param verbose: If true, then print the chosen regularization parameter and test error. 191 | @param include_constant: Boolean, include a constant in the linear model? 192 | @param loss: The loss function to use for evaluating the test error when choosing the regularization parameter. 193 | Must be of the form result = loss(ytest, yfit). 194 | @param njobs: The number of processors to use when doing the search over delta. If njobs = -1, all processors 195 | will be used. 196 | """ 197 | 198 | if include_constant is None: 199 | include_constant = self.include_constant 200 | else: 201 | self.include_constant = include_constant 202 | 203 | if njobs < 0: 204 | njobs = multiprocessing.cpu_count() 205 | 206 | pool = multiprocessing.Pool(njobs) 207 | pool.map(int, range(njobs)) # warm up the pool 208 | 209 | # split y into training and test sets 210 | ntime = y.size 211 | ntest = int(ntime * test_fraction) 212 | ntrain = ntime - ntest 213 | if X.ndim == 1: 214 | XX = X.reshape((X.size, 1)) 215 | else: 216 | XX = X.copy() 217 | 218 | if include_constant: 219 | # add column of ones to feature array 220 | XX = self.add_constant_(XX) 221 | 222 | # grid of delta (regularization) values, between 1e-4 and 1.0. 223 | delta_grid = np.logspace(-4.0, np.log10(0.95), ndeltas) 224 | 225 | args = [] 226 | for d in xrange(ndeltas): 227 | args.append((delta_grid[d], XX, y, ntrain, loss)) 228 | 229 | if verbose: 230 | print 'Computing test errors...' 231 | 232 | if njobs == 1: 233 | test_grid = map(_train_predict_dlm, args) 234 | else: 235 | test_grid = pool.map(_train_predict_dlm, args) 236 | 237 | test_grid = np.array(test_grid) 238 | self.delta = delta_grid[test_grid.argmin()] 239 | self.test_error_ = test_grid.min() 240 | 241 | if verbose: 242 | print 'Best delta is', self.delta, 'and has a test error of', test_grid.min() 243 | 244 | self.delta_grid = delta_grid 245 | self.test_grid = test_grid 246 | 247 | 248 | if __name__ == "__main__": 249 | # run test from Montana et al. 
(2009) 250 | nx = 1000 251 | x = np.zeros(nx) 252 | x[0] = np.random.uniform(-2.0, 2.0) 253 | for i in xrange(1, nx): 254 | x[i] = 0.8 * x[i-1] + np.random.uniform(-2.0, 2.0) 255 | 256 | y = np.zeros(x.size) 257 | beta = np.zeros(x.size) 258 | beta[0] = 2.0 259 | for i in xrange(1, x.size): 260 | if i < 300: 261 | beta[i] = beta[i-1] + 0.1 * np.random.standard_normal() 262 | elif i == 300: 263 | beta[i] = beta[i-1] + 4.0 264 | elif (i > 300) and (i < 600): 265 | beta[i] = beta[i-1] + 0.001 * np.random.standard_normal() 266 | else: 267 | beta[i] = 5.0 * np.sin(i / 10.0) + np.random.uniform(-2.0, 2.0) 268 | 269 | y = 2.0 + beta * x + 2.0 * np.random.standard_normal(nx) 270 | 271 | plt.plot(beta) 272 | plt.ylabel(r'$\beta$') 273 | plt.show() 274 | plt.clf() 275 | 276 | plt.plot(x, y, '.') 277 | plt.ylabel('y') 278 | plt.xlabel('x') 279 | plt.show() 280 | plt.clf() 281 | 282 | plt.plot(y) 283 | plt.ylabel('y') 284 | plt.show() 285 | plt.clf() 286 | 287 | dynamic = DynamicLinearModel(include_constant=False) 288 | dynamic.choose_delta(np.ones(len(y)), y, test_fraction=0.5, verbose=True, ndeltas=20, njobs=5) 289 | dynamic.fit(np.ones(len(y)), y) 290 | 291 | plt.semilogx(dynamic.delta_grid, dynamic.test_grid) 292 | plt.xlabel('Regularization (delta)') 293 | plt.ylabel('Mean Absolute Test Error') 294 | plt.show() 295 | 296 | plt.clf() 297 | for i in xrange(1): 298 | plt.subplot(2, 1, i + 1) 299 | plt.plot(y, '.') 300 | plt.plot(dynamic.beta[:, i]) 301 | plt.ylabel(r"$\beta_" + str(i) + '$') 302 | if i == 1: 303 | plt.plot(beta, 'k') 304 | plt.show() -------------------------------------------------------------------------------- /bck_stats/super_pca.py: -------------------------------------------------------------------------------- 1 | __author__ = 'brandonkelly' 2 | 3 | import numpy as np 4 | from sklearn import cross_validation, metrics 5 | from sklearn.decomposition import PCA 6 | import multiprocessing 7 | import copy 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | class SupervisedPCABase(object): 12 | 13 | def __init__(self, regressor, max_components=None, n_components=1, whiten=True): 14 | """ 15 | Base class for performing supervised principal component regression. This is useful for cases where the number 16 | of inputs (features) is greater than the number of data points. 17 | 18 | @param regressor: The object that will perform the regression. The following members must be defined for this 19 | object: 20 | 21 | regressor.fit(X, y) : Fits the regression model y = f(X). 22 | regressor.predict(X) : Compute the prediction y = f(X). 23 | regressor.coef_score_ : The score of each parameter, used for ranking the most important features when 24 | computing the reduced feature space. In general this will be the absolute value of 25 | the coefficient value divided by its standard error. Note that this should *not* 26 | include the intercept. 27 | 28 | @param max_components: Maximum number of components to search over. The default is p. 29 | @param n_components: The number of reduced data matrix PCA components to use in the regression. 
30 | @param whiten: Remove differences in variance among the components, i.e., principal components will have unit 31 | variance 32 | """ 33 | self.regressor = regressor 34 | self.max_components = max_components 35 | self.pca_object = PCA(n_components=n_components, whiten=whiten) 36 | self.n_components = n_components 37 | self.whiten = whiten 38 | self.n_reduced = 0 39 | self.sort_idx = np.zeros(1) 40 | 41 | def _compute_stnd_coefs(self, X, y): 42 | """ 43 | Compute the standardized regression coefficients, up to a common scaling factor. 44 | 45 | @param X: The matrix of inputs, shape (n,p). 46 | @param y: The array of response values, size n. 47 | @return: The standardized regression coefficients, size p. 48 | """ 49 | p = X.shape[1] 50 | scoefs = np.zeros(p) 51 | for j in xrange(p): 52 | thisX = X[:, j] 53 | self.regressor.fit(thisX[:, np.newaxis], y) 54 | scoefs[j] = self.regressor.coef_score_ 55 | 56 | return scoefs 57 | 58 | def _get_reduced_features(self, X, coefs, pmax): 59 | """ 60 | Return the data projected onto the first n_components principal components computed using the reduced feature 61 | space. 62 | 63 | @param X: The array of inputs, shape (n, p). 64 | @param coefs: The array of standardized coefficients, size p. 65 | @param pmax: The maximum number of features to use in the reduced feature space PCA. 66 | @return: The data projected onto the reduced feature space PCA, shape (n, self.n_components). 67 | """ 68 | sort_idx = np.argsort(coefs)[::-1] 69 | sort_idx = sort_idx[:pmax] 70 | self.pca_object.fit(X[:, sort_idx]) 71 | X_reduced = self.pca_object.transform(X[:, sort_idx]) 72 | 73 | return X_reduced, sort_idx 74 | 75 | def fit(self, X, y, n_reduced): 76 | """ 77 | Perform the regression using the first self.n_components principal components from the reduced feature space. 78 | Note that this will call self.regressor.fit(X,y) to perform the regression. 79 | 80 | @param X: The array of inputs, shape (n, p). 81 | @param y: The array of response values, size n. 82 | @param n_reduced: The number of features to use in the reduced feature space. 83 | """ 84 | scoefs = self._compute_stnd_coefs(X, y) 85 | X_reduced, sort_idx = self._get_reduced_features(X, scoefs, n_reduced) 86 | self.sort_idx = sort_idx 87 | self.regressor.fit(X_reduced, y) 88 | 89 | def predict(self, X): 90 | """ 91 | Predict the value y = f(X) based on the PCA using the reduced feature space, based on the most recent call to 92 | self.fit(X, y, n_reduced). 93 | 94 | @param X: The array of inputs, shape (n, p). 95 | @return: The predicted values of the response. 96 | """ 97 | X_reduced = self.pca_object.transform(X[:, self.sort_idx]) 98 | y_predict = self.regressor.predict(X_reduced) 99 | return y_predict 100 | 101 | 102 | def launch_coef_scores(args): 103 | """ 104 | Wrapper to compute the standardized scores of the regression coefficients, used when computing the number of 105 | features in the reduced parameter set. 106 | 107 | @param args: Tuple containing the instance of SupervisedPCABase, feature matrix and response array. 108 | @return: The standardzed scores of the coefficients. 109 | """ 110 | spca, X, y = args 111 | scoefs = spca._compute_stnd_coefs(X, y) 112 | return scoefs 113 | 114 | 115 | def compute_cv_prediction(args): 116 | """ 117 | Internal method to get predictions based on supervised PCA regression for each cross-validation fold. Need this 118 | format in order to compute the predictions for the CV folds in parallel. 
119 | """ 120 | spca, X_train, y_train, X_test, n_reduced, scoef = args 121 | SPCA = SupervisedPCABase(copy.deepcopy(spca.regressor), spca.max_components, spca.n_components, spca.whiten) 122 | X_reduced, sort_idx = SPCA._get_reduced_features(X_train, scoef, n_reduced) 123 | SPCA.regressor.fit(X_reduced, y_train) 124 | X_test_reduced = SPCA.pca_object.transform(X_test[:, sort_idx]) 125 | y_predict = SPCA.regressor.predict(X_test_reduced) 126 | return y_predict 127 | 128 | 129 | class SupervisedPCA(SupervisedPCABase): 130 | def __init__(self, regressor, max_components=None, n_components=1, whiten=True, n_jobs=1): 131 | """ 132 | Class for performing supervised principal component regression. This is useful for cases where the number of 133 | inputs (features) is greater than the number of data points. 134 | 135 | @param regressor: The object that will perform the regression. The following members must be defined for this 136 | object: 137 | 138 | regressor.fit(X, y) : Fits the regression model y = f(X). 139 | regressor.predict(X) : Compute the prediction y = f(X). 140 | regressor.coef_score_ : The score of each parameter, used for ranking the most important features when 141 | computing the reduced feature space. In general this will be the absolute value of 142 | the coefficient value divided by its standard error. Note that this should *not* 143 | include the intercept. 144 | 145 | @param max_components: Maximum number of components to search over. The default is p. 146 | @param n_components: The number of reduced data matrix PCA components to use in the regression. 147 | @param whiten: Remove differences in variance among the components, i.e., principal components will have unit 148 | variance 149 | @param n_jobs: The number of threads to use for parallel processing. If n_jobs = -1 then use maximum number 150 | available. 151 | """ 152 | super(SupervisedPCA, self).__init__(regressor, max_components, n_components, whiten) 153 | if n_jobs < 0: 154 | n_jobs = multiprocessing.cpu_count() 155 | self.n_jobs = n_jobs 156 | 157 | def _compute_cv_prediction(self, args): 158 | """ 159 | Internal method to get predictions based on supervised PCA regression for each cross-validation fold. Need this 160 | format in order to compute the predictions for the CV folds in parallel. 161 | """ 162 | X_train, y_train, X_test, n_reduced, scoef = args 163 | SPCA = SupervisedPCABase(copy.deepcopy(self.regressor), self.max_components, self.n_components, self.whiten) 164 | X_reduced, sort_idx = SPCA._get_reduced_features(X_train, scoef, n_reduced) 165 | SPCA.regressor.fit(X_reduced, y_train) 166 | X_test_reduced = SPCA.pca_object.transform(X_test[:, sort_idx]) 167 | y_predict = SPCA.regressor.predict(X_test_reduced) 168 | return y_predict 169 | 170 | def _launch_coef_scores(self, args): 171 | """ 172 | Wrapper to compute the standardized scores of the regression coefficients, used when computing the number of 173 | features in the reduced parameter set. 174 | 175 | @param args: Tuple containing the feature matrix and response array. 176 | @return: The standardzed scores of the coefficients. 177 | """ 178 | X, y = args 179 | scoefs = self._compute_stnd_coefs(X, y) 180 | return scoefs 181 | 182 | def choose_nreduced(self, X, y, lossfunc=None, cv=None, verbose=False, cvplot=False): 183 | """ 184 | Choose the number of features to use in the reduced feature set by minimizing the cross-validation error. 185 | 186 | @param X: The feature matrix, shape (n,p) 187 | @param y: The vector of response values, size n. 
188 | @param lossfunc: The loss function to use for the CV error, callable. The default is mean squared error. 189 | @param cv: Number of CV folds (if int), or cross-validation iterator. 190 | @param verbose: Print helpful information. 191 | @param cvplot: Plot the CV error as a function of the number features in the reduced feature set. 192 | @return: The number of features in the reduced feature set that minimized the CV error. 193 | """ 194 | if self.n_jobs > 1: 195 | pool = multiprocessing.Pool(self.n_jobs) 196 | pool.map(int, range(self.n_jobs)) # Trick to "warm up" the Pool 197 | 198 | # setup cross-validation iterator 199 | if cv is None: 200 | K_folds = 8 201 | if isinstance(cv, int): 202 | K_folds = cv 203 | 204 | cv = cross_validation.KFold(y.size, n_folds=K_folds) 205 | 206 | if lossfunc is None: 207 | lossfunc = metrics.mean_squared_error 208 | 209 | if self.max_components is None: 210 | self.max_components = X.shape[1] 211 | 212 | if verbose: 213 | print 'Searching over', self.max_components, ' features to include in the reduced feature space.' 214 | print 'Computing univariate regression tests statistics for each feature...' 215 | 216 | # first compute coefficients scores 217 | sargs = [] 218 | for train_idx, test_idx in cv: 219 | if self.n_jobs == 1: 220 | sargs.append((X[train_idx, :], y[train_idx])) 221 | else: 222 | sargs.append((self, X[train_idx, :], y[train_idx])) 223 | 224 | if self.n_jobs == 1: 225 | scoefs = map(self._launch_coef_scores, sargs) 226 | else: 227 | scoefs = pool.map(launch_coef_scores, sargs) 228 | 229 | # find optimal number of features to use in PCA on reduced feature set, do this by minimizing cross-validation 230 | # error on a grid. 231 | cverrors = np.zeros(self.max_components) 232 | 233 | if verbose: 234 | print 'Computing cross-validation errors on a grid of up to', self.max_components, 'features used in the', \ 235 | 'reduced feature space...' 236 | 237 | for k in xrange(self.max_components): 238 | cverror_args = [] 239 | ytest = [] 240 | fold_idx = 0 241 | for train_idx, test_idx in cv: 242 | if self.n_jobs == 1: 243 | cverror_args.append((X[train_idx, :], y[train_idx], X[test_idx, :], k + 1, scoefs[fold_idx])) 244 | else: 245 | cverror_args.append((self, X[train_idx, :], y[train_idx], X[test_idx, :], k + 1, scoefs[fold_idx])) 246 | ytest.append(y[test_idx]) 247 | fold_idx += 1 248 | 249 | if self.n_jobs == 1: 250 | ypredictions = map(self._compute_cv_prediction, cverror_args) 251 | else: 252 | ypredictions = pool.map(compute_cv_prediction, cverror_args) 253 | 254 | cverror_k = 0.0 255 | for yt, yp in zip(ytest, ypredictions): 256 | cverror_k += lossfunc(yt, yp) / K_folds 257 | cverrors[k] = cverror_k 258 | 259 | if cvplot: 260 | plt.plot(np.arange(1, self.max_components + 1), cverrors) 261 | plt.xlabel('# of features in reduced set') 262 | plt.ylabel('CV Loss Function') 263 | plt.show() 264 | 265 | n_reduced = cverrors.argmin() + 1 266 | 267 | if verbose: 268 | print 'Selected', n_reduced, 'features to use in the reduced feature set.' 269 | 270 | return n_reduced -------------------------------------------------------------------------------- /bck_stats/dba.py: -------------------------------------------------------------------------------- 1 | __author__ = 'brandonkelly' 2 | 3 | import numpy as np 4 | from numba import jit 5 | import matplotlib.pyplot as plt 6 | from scipy.interpolate import interp1d 7 | import time 8 | 9 | 10 | @jit # if you don't have number, then comment out this line but this routine will be slow! 
11 | def dynamic_time_warping(tseries1, tseries2): 12 | """ 13 | Compute the dynamic time warping (DTW) distance between two time series. It is assumed that the time series are 14 | evenly sampled, but they can have different lengths. Numba is used to speed up the computation, so you must have 15 | Numba installed. Note that the time series can be multivariate. 16 | 17 | :param tseries1: The first time series, a 1-D or 2-D numpy array. 18 | :param tseries2: The second time series, a 1-D or 2-D numpy array. 19 | :return: A tuple containing the DTW distance, the DTW matrix, and the path matrix taken by the algorithm. 20 | """ 21 | ntime1, nfeatures = tseries1.shape 22 | ntime2 = tseries2.shape[0] 23 | dtw = np.zeros((ntime1, ntime2), dtype=np.float) # matrix of coordinate distances 24 | path = np.zeros((ntime1, ntime2), dtype=np.int) # path of algorithm 25 | 26 | # initialize the first row and column 27 | for k in range(nfeatures): 28 | dtw[0, 0] += (tseries1[0, k] - tseries2[0, k]) ** 2 29 | path[0, 0] = -1 30 | 31 | for i in range(1, ntime1): 32 | dist = 0.0 33 | for k in range(nfeatures): 34 | dist += (tseries1[i, k] - tseries2[0, k]) ** 2 35 | dtw[i, 0] = dtw[i-1, 0] + dist 36 | path[i, 0] = 2 37 | 38 | for j in range(1, ntime2): 39 | dist = 0.0 40 | for k in range(nfeatures): 41 | dist += (tseries1[0, k] - tseries2[j, k]) ** 2 42 | dtw[0, j] = dtw[0, j-1] + dist 43 | path[0, j] = 1 44 | 45 | # main loop of the DTW algorithm 46 | for i in range(1, len(tseries1)): 47 | for j in range(1, len(tseries2)): 48 | a = dtw[i-1, j-1] 49 | b = dtw[i, j-1] 50 | c = dtw[i-1, j] 51 | if a < b: 52 | if a < c: 53 | idx = 0 # a is the minimum 54 | delta = a 55 | else: 56 | idx = 2 # c is the minimum 57 | delta = c 58 | else: 59 | if b < c: 60 | idx = 1 # b is the minimum 61 | delta = b 62 | else: 63 | idx = 2 # c is the minimum 64 | delta = c 65 | # neighbors = np.array([dtw[i-1, j-1], dtw[i, j-1], dtw[i-1, j]]) 66 | # idx = np.argmin(neighbors) 67 | # delta = neighbors[idx] 68 | dist = 0.0 69 | for k in range(nfeatures): 70 | dist += (tseries1[i, k] - tseries2[j, k]) ** 2 71 | dtw[i, j] = dist + delta 72 | path[i, j] = idx 73 | 74 | return dtw[-1, -1], dtw, path 75 | 76 | 77 | class DBA(object): 78 | 79 | def __init__(self, max_iter, tol=1e-4, verbose=False): 80 | """ 81 | Constructor for the DBA class. This class computes the dynamic time warping (DTW) barycenter averaging (DBA) 82 | strategy for averaging a set of time series. The method is described in 83 | 84 | "A global averaging method for dynamic time warping, with applications to clustering." Petitjean, F., 85 | Ketterlin, A., & Gancarski, P. 2011, Pattern Recognition, 44, 678-693. 86 | 87 | :param max_iter: The maximum number of iterations for the DBA algorithm. 88 | :param tol: The tolerance level for the algorithm. The algorithm terminates once the fractional difference in 89 | the within-group sum of squares between successive iterations is less than tol. The algorithm will also 90 | terminate if the maximum number of iterations is exceeded, or if the sum of squares increases. 91 | :param verbose: If true, then provide helpful output. 92 | """ 93 | self.max_iter = max_iter 94 | self.tol = tol 95 | self.average = np.zeros(1) 96 | self.wgss = 0.0 # the within-group sum of squares, called the inertia in the clustering literature 97 | self.verbose = verbose 98 | 99 | def compute_average(self, tseries, nstarts=1, initial_value=None, dba_length=None): 100 | """ 101 | Perform the DBA algorithm to compute the average for a set of time series. 
The algorithm is a local optimization strategy and thus depends on the initial guess for the average.
        Improved results can be obtained by using multiple random initial starts.

        :param tseries: The list of time series, a list of numpy arrays. Can be multivariate time series.
        :param nstarts: The number of random starts to use for the DBA algorithm. The average time series that
            minimizes the within-group sum of squares over the random starts is returned and saved.
        :param initial_value: The initial value for the DBA algorithm, a numpy array. If None, then the initial values
            will be drawn randomly from the set of input time series (recommended). Note that if an initial guess is
            supplied, then the nstarts parameter is ignored.
        :param dba_length: The length of the DBA average time series. If None, this will be set to the length of the
            initial_value array. Otherwise, the initial value array will be linearly interpolated to this length.
        :return: The estimated average of the time series, defined to minimize the within-group sum of squares of the
            input set of time series.
        """
        if initial_value is not None:
            nstarts = 1

        if initial_value is None:
            # initialize the average as a random draw from the set of inputs
            start_idx = np.random.permutation(len(tseries))[:nstarts]

        best_wgss = 1e300
        if self.verbose:
            print 'Doing initialization iteration:'
        for i in range(nstarts):
            print i, '...'
            if initial_value is None:
                iseries = tseries[start_idx[i]]
            else:
                iseries = initial_value
            if dba_length is not None:
                # linearly interpolate the initial average value to the requested length
                iseries0 = np.atleast_2d(iseries)
                if iseries0.shape[0] == 1:
                    iseries0 = iseries0.T  # vector, so transpose to shape (ntime, 1)
                nfeatures = iseries0.shape[1]
                iseries = np.zeros((dba_length, nfeatures))
                for k in range(nfeatures):
                    lininterp = interp1d(np.arange(iseries0.shape[0]), iseries0[:, k])
                    iseries[:, k] = lininterp(np.linspace(0.0, iseries0.shape[0] - 1.01, num=dba_length))

            self._run_dba(tseries, iseries)

            if self.wgss < best_wgss:
                # found a better average, save it
                if self.verbose:
                    print 'New best estimate found for random start', i
                best_wgss = self.wgss
                best_average = self.average

        self.wgss = best_wgss
        self.average = best_average

        return best_average

    def associate_segments(self, tseries):
        """
        Identify the indices of the input time series that are associated with each element of the average time
        series.

        :param tseries: The time series for which the indices associated with the average are desired. A numpy array.
        :return: A list-of-lists containing the indices of the input time series that are associated with the elements
            of the DBA average. Call this assoc_table. Then assoc_table[i] will return a list of the indices of the
            input time series that are associated with element i of the DBA average (i.e., self.average[i]).
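            For example, assoc_table[3] == [10, 11, 12] means that elements 10 through 12 of the input time series
            were aligned with element 3 of the DBA average by the DTW warping path.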
165 | """ 166 | dtw_dist, dtw, path = dynamic_time_warping(self.average, tseries) 167 | 168 | # table telling us which elements of the time series are identified with a specific element of the DBA average 169 | assoc_table = [] 170 | for i in range(self.average.shape[0]): 171 | assoc_table.append([]) 172 | 173 | i = self.average.shape[0] - 1 174 | j = tseries.shape[0] - 1 175 | 176 | while i >= 0 and j >= 0: 177 | assoc_table[i].append(j) 178 | if path[i, j] == 0: 179 | i -= 1 180 | j -= 1 181 | elif path[i, j] == 1: 182 | j -= 1 183 | elif path[i, j] == 2: 184 | i -= 1 185 | else: 186 | # should not happen, but just in case make sure we bail once path[i, j] = -1 187 | break 188 | 189 | return assoc_table 190 | 191 | def _run_dba(self, tseries, initial_value): 192 | """ Perform the DBA algorithm. """ 193 | nseries = len(tseries) 194 | 195 | self.average = initial_value 196 | 197 | # first iteration: get initial within-group sum of squares 198 | if self.verbose: 199 | print 'Doing iteration' 200 | print ' ', '0', '...' 201 | wgss = self._dba_iteration(tseries) 202 | 203 | # main DBA loop 204 | for i in range(1, self.max_iter): 205 | if self.verbose: 206 | print ' ', i, '...', 'WGSS:', wgss 207 | wgss_old = wgss 208 | # WGSS is actually from previous iteration, but don't compute again because it is expensive 209 | wgss = self._dba_iteration(tseries) 210 | if wgss > wgss_old: 211 | # sum of squares should be non-increasing 212 | print 'Warning! Within-group sum of squares increased at iteration', i, 'terminating algorithm.' 213 | break 214 | elif np.abs(wgss - wgss_old) / wgss_old < self.tol: 215 | # convergence 216 | break 217 | 218 | # compute final within-group sum of squares 219 | wgss = 0.0 220 | for k in range(nseries): 221 | wgss += dynamic_time_warping(tseries[k], self.average)[0] 222 | self.wgss = wgss 223 | 224 | def _dba_iteration(self, tseries): 225 | """ Perform a single iteration of the DBA algorithm. 
""" 226 | ntime = self.average.shape[0] 227 | 228 | # table telling us which elements of the time series are identified with a specific element of the DBA average 229 | assoc_table = [] 230 | for i in range(ntime): 231 | assoc_table.append([]) 232 | 233 | wgss = 0.0 # within group sum of squares from previous iteration, compute here so we don't have to repeat 234 | for series in tseries: 235 | if self.average.shape[1] == 1: 236 | series = series[:, np.newaxis] 237 | dtw_dist, dtw, path = dynamic_time_warping(self.average, series) 238 | wgss += dtw_dist 239 | i = ntime - 1 240 | j = series.shape[0] - 1 241 | while i >= 0 and j >= 0: 242 | assoc_table[i].append(series[j]) 243 | if path[i, j] == 0: 244 | i -= 1 245 | j -= 1 246 | elif path[i, j] == 1: 247 | j -= 1 248 | elif path[i, j] == 2: 249 | i -= 1 250 | else: 251 | # should not happen, but just in case make sure we bail once path[i, j] = -1 252 | break 253 | 254 | # update the average 255 | for i, cell in enumerate(assoc_table): 256 | cell_array = np.array(cell) 257 | self.average[i] = cell_array.mean(axis=0) 258 | 259 | return wgss 260 | 261 | 262 | if __name__ == "__main__": 263 | # run on some test data 264 | nseries = 40 265 | ntime0 = 1000 266 | phase1 = 0.1 + 0.2 * np.random.uniform(0.0, 1.0, nseries) - 0.1 267 | period1 = np.pi / 4.0 + np.pi / 100.0 * np.random.standard_normal(nseries) 268 | 269 | phase2 = np.pi / 2 + 0.2 * np.random.uniform(0.0, 1.0, nseries) - 0.1 270 | period2 = np.pi / 2.0 + np.pi / 100.0 * np.random.standard_normal(nseries) 271 | 272 | noise_amplitude = 0.0 273 | 274 | t_list = [] 275 | ts_list = [] 276 | for i in range(nseries): 277 | ntime = np.random.random_integers(ntime0 * 0.9, ntime0 * 1.1) 278 | t = np.linspace(0.0, 10.0, ntime) 279 | t_list.append(t) 280 | tseries = np.zeros((ntime, 2)) 281 | tseries[:, 0] = np.sin(t / period1[i] + phase1[i]) + noise_amplitude * np.random.standard_normal(ntime) 282 | tseries[:, 1] = np.sin(t / period2[i] + phase2[i]) + noise_amplitude * np.random.standard_normal(ntime) 283 | ts_list.append(tseries) 284 | 285 | niter = 30 286 | dba = DBA(niter, verbose=True, tol=1e-4) 287 | t1 = time.clock() 288 | dba_avg = dba.compute_average(ts_list, nstarts=5, dba_length=10) 289 | t2 = time.clock() 290 | 291 | print 'DBA algorithm took', t2 - t1, 'seconds.' 
292 | 293 | plt.subplot(221) 294 | for i in range(nseries): 295 | plt.plot(t_list[i], ts_list[i][:, 0], '.', ms=2) 296 | t = np.linspace(0.0, 10.0, len(dba_avg)) 297 | plt.plot(t, dba_avg[:, 0], 'ko') 298 | plt.subplot(222) 299 | for i in range(nseries): 300 | plt.plot(t_list[i], ts_list[i][:, 1], '.', ms=2) 301 | t = np.linspace(0.0, 10.0, len(dba_avg)) 302 | plt.plot(t, dba_avg[:, 1], 'ko') 303 | plt.subplot(223) 304 | for ts in ts_list: 305 | plt.plot(ts[:, 0], ts[:, 1], '.', ms=2) 306 | plt.plot(dba_avg[:, 0], dba_avg[:, 1], 'ko') 307 | plt.show() 308 | plt.close() 309 | 310 | # find the segments of the first time series identified with each element of the average 311 | assoc = dba.associate_segments(ts_list[0]) 312 | plt.subplot(221) 313 | t = t_list[0] 314 | ts = ts_list[0] 315 | for i, a in enumerate(assoc): 316 | plt.plot(t[a], ts[a, 0], '.', label=str(i)) 317 | plt.plot(np.median(t[a]), dba_avg[i, 0], 'ko') 318 | plt.subplot(222) 319 | for i, a in enumerate(assoc): 320 | plt.plot(t[a], ts[a, 1], '.', label=str(i)) 321 | plt.plot(np.median(t[a]), dba_avg[i, 1], 'ko') 322 | plt.subplot(223) 323 | for i, a in enumerate(assoc): 324 | plt.plot(ts[a, 0], ts[a, 1], '.', label=str(i)) 325 | plt.plot(dba_avg[i, 0], dba_avg[i, 1], 'ko') 326 | plt.show() -------------------------------------------------------------------------------- /bck_stats/sklearn_estimator_suite.py: -------------------------------------------------------------------------------- 1 | __author__ = 'brandonkelly' 2 | 3 | import numpy as np 4 | import abc 5 | 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.grid_search import GridSearchCV, ParameterGrid 8 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 9 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor, \ 10 | RandomForestRegressor 11 | from sklearn.svm import SVC, LinearSVC 12 | from sklearn.metrics import accuracy_score, make_scorer, mean_absolute_error, mean_squared_error 13 | from sklearn.cross_validation import KFold 14 | from sklearn.base import clone 15 | 16 | float_types = (float, np.float, np.float32, np.float64, np.float_, np.float128, np.float16) 17 | int_types = (int, np.int, np.int8, np.int16, np.int32, np.int64) 18 | 19 | 20 | class GbcAutoNtrees(GradientBoostingClassifier): 21 | """ 22 | Same as GradientBoostingClassifier, but the number of estimators is chosen automatically by maximizing the 23 | out-of-bag score. 24 | """ 25 | def __init__(self, subsample, loss='deviance', learning_rate=0.01, n_estimators=500, min_samples_split=2, 26 | min_samples_leaf=1, max_depth=3, init=None, random_state=None, max_features=None, verbose=0): 27 | super(GbcAutoNtrees, self).__init__(loss, learning_rate, n_estimators, subsample, min_samples_split, 28 | min_samples_leaf, max_depth, init, random_state, max_features, verbose) 29 | 30 | def fit(self, X, y): 31 | 32 | super(GbcAutoNtrees, self).fit(X, y) 33 | oob_score = np.cumsum(self.oob_improvement_) 34 | ntrees = oob_score.argmax() + 1 35 | if self.verbose: 36 | print 'Chose', ntrees, 'based on the OOB score.' 37 | self.n_estimators = ntrees 38 | self.estimators_ = self.estimators_[:ntrees] 39 | 40 | # plt.plot(oob_score) 41 | # plt.show() 42 | 43 | return self 44 | 45 | 46 | class GbrAutoNtrees(GradientBoostingRegressor): 47 | """ 48 | Same as GradientBoostingRegressor, but the number of estimators is chosen automatically by maximizing the 49 | out-of-bag score. 
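    The number of trees is set to the argmax of the cumulative sum of oob_improvement_, so the subsample fraction
    must be less than 1 for the out-of-bag estimates to be available.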
50 | """ 51 | 52 | def __init__(self, subsample, loss='ls', learning_rate=0.1, n_estimators=100, min_samples_split=2, 53 | min_samples_leaf=1, max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, 54 | verbose=0): 55 | super(GbrAutoNtrees, self).__init__(loss, learning_rate, n_estimators, subsample, min_samples_split, 56 | min_samples_leaf, max_depth, init, random_state, max_features, alpha, 57 | verbose) 58 | 59 | def fit(self, X, y): 60 | 61 | super(GbrAutoNtrees, self).fit(X, y) 62 | oob_score = np.cumsum(self.oob_improvement_) 63 | ntrees = oob_score.argmax() + 1 64 | if self.verbose: 65 | print 'Chose', ntrees, 'based on the OOB score.' 66 | self.n_estimators = ntrees 67 | self.estimators_ = self.estimators_[:ntrees] 68 | 69 | # plt.plot(oob_score) 70 | # plt.show() 71 | 72 | return self 73 | 74 | 75 | class BasePredictorSuite(object): 76 | """ Base class for running a suite of estimators from scikit-learn. """ 77 | __metaclass__ = abc.ABCMeta 78 | 79 | @abc.abstractmethod 80 | def __init__(self, tuning_ranges=None, models=None, cv=None, njobs=1, pre_dispatch='2*n_jobs', stack=True, 81 | verbose=False): 82 | """ 83 | Initialize a pipeline to run a suite of scikit-learn estimators. The tuning parameters are chosen through 84 | cross-validation or the out-of-bags score (for Random Forests) as part of the fitting process. 85 | 86 | :param tuning_ranges: A nested dictionary containing the ranges of the tuning parameters. It should be of the 87 | format {model name 1: {parameter name 1: list(value range 1), parameter name 2: list(value range 2), ...} }. 88 | :param models: A list of instantiated scikit-learn estimator classes to fit. If None, these are taken from 89 | the models listed in tuning_range. 90 | :param cv: The number of CV folds to use, or a CV generator. 91 | :param njobs: The number of processes to run in parallel. 92 | :param pre_dispatch: Passed to sklearn.grid_search.GridSearchCV, see documentation for GridSearchCV for further 93 | details. 94 | :param stack: If true, then the predict() method will return a stacked (averaged) value over the estimators. 95 | Otherwise, if false, then predict() will return the predictions for each estimator. 96 | :param verbose: If true, print out helpful information. 97 | """ 98 | super(BasePredictorSuite, self).__init__() 99 | self.verbose = verbose 100 | if tuning_ranges is None: 101 | tuning_ranges = dict() 102 | self.tuning_ranges = tuning_ranges 103 | if models is None: 104 | models = [] 105 | self.models = models 106 | self.model_names = [] 107 | for model in self.models: 108 | # store the names of the sklearn classes used 109 | self.model_names.append(model.__class__.__name__) 110 | # make sure the model names are in the dictionary of tuning parameters 111 | if model.__class__.__name__ not in tuning_ranges: 112 | raise ValueError('Could not find tuning parameters for', model.__class__.__name__) 113 | 114 | if cv is None: 115 | cv = 3 116 | self.cv = cv 117 | self.njobs = njobs 118 | self.pre_dispatch = pre_dispatch 119 | self.scorer = None 120 | self.stack = stack 121 | self.best_scores = dict() 122 | self.nfeatures = None 123 | 124 | def refine_grid(self, best_params, model_name): 125 | """ 126 | Refine the tuning parameter grid to zoom in on the region near the current maximum. 127 | 128 | :param best_params: A dictionary containing the set of best tuning parameter names and their values. Should be 129 | of the form {'parameter 1': value1, 'parameter 2', value2, ... }. 
The tuning parameter grid will be refined 130 | in the region of these parameter values. 131 | :param model_name: The name of the estimator corresponding to the tuning parameters in best_params. 132 | """ 133 | for param_name in best_params: 134 | pvalue_list = self.tuning_ranges[model_name][param_name] 135 | best_value = best_params[param_name] 136 | # find the values corresponding to 137 | idx = pvalue_list.index(best_value) 138 | ngrid = len(pvalue_list) 139 | if idx == 0: 140 | # first element of grid, so expand below it 141 | if type(pvalue_list[0]) in int_types: 142 | pv_min = pvalue_list[0] / 2 # reduce minimum grid value by a factor of 2 143 | pv_min = max(1, pv_min) # assume integer tuning parameters are never less than 1. 144 | pv_max = pvalue_list[1] 145 | self.tuning_ranges[model_name][param_name] = \ 146 | list(np.unique(np.linspace(pv_min, pv_max, ngrid).astype(np.int))) 147 | else: 148 | # use logarithmic grids for floats 149 | dp = np.log10(pvalue_list[1]) - np.log10(pvalue_list[0]) 150 | pv_min = np.log10(pvalue_list[0]) - dp 151 | pv_max = np.log10(pvalue_list[1]) 152 | self.tuning_ranges[model_name][param_name] = list(np.logspace(pv_min, pv_max, ngrid)) 153 | if self.verbose: 154 | print self.tuning_ranges[model_name][param_name] 155 | elif idx == ngrid - 1: 156 | # last element of grid, so expand above it 157 | if pvalue_list[idx] is None: 158 | # special situation for some estimators, like the DecisionTreeClassifier 159 | pv_min = pvalue_list[idx-1] # increase the maximum grid value by a factor of 2 160 | pv_max = 2 * pv_min 161 | self.tuning_ranges[model_name][param_name] = \ 162 | list(np.unique(np.linspace(pv_min, pv_max, ngrid-1).astype(np.int))) 163 | # make sure we keep None as the last value in the list 164 | self.tuning_ranges[model_name][param_name].append(None) 165 | elif type(pvalue_list[idx]) in int_types: 166 | pv_min = np.log10(pvalue_list[idx-1]) 167 | pv_max = np.log10(2 * pvalue_list[idx]) # increase the maximum grid value by a factor of 2 168 | if param_name == 'max_features': 169 | # can't have max_features > nfeatures 170 | pv_max = min(2 * pvalue_list[idx], self.nfeatures) 171 | pv_max = np.log10(pv_max) 172 | self.tuning_ranges[model_name][param_name] = \ 173 | list(np.unique(np.logspace(pv_min, pv_max, ngrid).astype(np.int))) 174 | else: 175 | # use logarithmic grids for floats 176 | dp = np.log10(pvalue_list[idx]) - np.log10(pvalue_list[idx-1]) 177 | pv_min = np.log10(pvalue_list[idx-1]) 178 | pv_max = np.log10(pvalue_list[idx]) + dp 179 | self.tuning_ranges[model_name][param_name] = list(np.logspace(pv_min, pv_max, ngrid)) 180 | if self.verbose: 181 | print self.tuning_ranges[model_name][param_name] 182 | else: 183 | # inner element of grid 184 | if pvalue_list[idx + 1] is None: 185 | # special situation for some estimators, like the DecisionTreeClassifier 186 | pv_min = pvalue_list[idx-1] # increase the maximum grid value by a factor of 2 187 | pv_max = 2 * pvalue_list[idx] 188 | self.tuning_ranges[model_name][param_name] = \ 189 | list(np.unique(np.linspace(pv_min, pv_max, ngrid-1).astype(np.int))) 190 | # make sure we keep None as the last value in the list 191 | self.tuning_ranges[model_name][param_name].append(None) 192 | elif type(pvalue_list[idx]) in int_types: 193 | pv_min = pvalue_list[idx-1] 194 | pv_max = pvalue_list[idx+1] 195 | # switch to linear spacing for interior integer grid values 196 | self.tuning_ranges[model_name][param_name] = \ 197 | list(np.unique(np.linspace(pv_min, pv_max, ngrid).astype(np.int))) 198 | else: 199 | # 
use logarithmic grids for floats 200 | pv_min = np.log10(pvalue_list[idx-1]) 201 | pv_max = np.log10(pvalue_list[idx+1]) 202 | self.tuning_ranges[model_name][param_name] = list(np.logspace(pv_min, pv_max, ngrid)) 203 | if self.verbose: 204 | print self.tuning_ranges[model_name][param_name] 205 | 206 | # print 'New Grid:', self.tuning_ranges[model_name][param_name] 207 | 208 | def cross_validate(self, X, model_idx, y): 209 | """ 210 | Fit the tuning parameters for an estimator on a grid using cross-validation. 211 | 212 | :param X: The array of predictors, shape (n_samples, n_features). 213 | :param model_idx: The index of the estimator to fit. 214 | :param y: The array of response values, shape (n_samples) or (n_samples, n_outputs) depending on the estimator. 215 | :return: A tuple containing the scikit-learn estimator object with the best tuning parameters, the score 216 | corresponding to the best tuning parameters, and a dictionary containing the best tuning parameter values. 217 | """ 218 | if self.verbose: 219 | print 'Doing cross-validation for model', self.model_names[model_idx], '...' 220 | grid = GridSearchCV(self.models[model_idx], self.tuning_ranges[self.model_names[model_idx]], 221 | scoring=self.scorer, n_jobs=self.njobs, cv=self.cv, pre_dispatch=self.pre_dispatch) 222 | grid.fit(X, y) 223 | if self.verbose: 224 | print 'Best', self.model_names[model_idx], 'has:' 225 | for tuning_parameter in self.tuning_ranges[self.model_names[model_idx]]: 226 | print ' ', tuning_parameter, '=', grid.best_params_[tuning_parameter] 227 | print ' CV Score of', grid.best_score_ 228 | return grid.best_estimator_, grid.best_score_, grid.best_params_ 229 | 230 | def oob_validate(self, X, model_idx, y): 231 | """ 232 | Fit the tuning parameters for a Random Forest estimator on a grid by maximizing the score of the out-of-bag 233 | samples. This is faster than using cross-validation. 234 | 235 | :param X: The array of predictors, shape (n_samples, n_features). 236 | :param model_idx: The index of the estimator to fit. 237 | :param y: The array of response values, shape (n_samples) or (n_samples, n_outputs) depending on the estimator. 238 | :return: A tuple containing the scikit-learn estimator object with the best tuning parameters, the score 239 | corresponding to the best tuning parameters, and a dictionary containing the best tuning parameter values. 240 | """ 241 | if self.verbose: 242 | print 'Doing OOB-validation for model', self.model_names[model_idx], '...' 
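        # each point in the tuning grid is fit once on the full data set; the fitted forest's oob_score_, computed
        # from the bootstrap samples left out of each tree, serves as the validation score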
243 | 244 | tune_grid = list(ParameterGrid(self.tuning_ranges[self.model_names[model_idx]])) 245 | 246 | best_estimator = None 247 | best_score = -1e30 248 | 249 | # fit random forest 250 | for point in tune_grid: 251 | estimator = clone(self.models[model_idx]) 252 | for tpar in point: 253 | # set the tuning parameters 254 | estimator.__setattr__(tpar, point[tpar]) 255 | estimator.fit(X, y) 256 | 257 | if estimator.oob_score_ > best_score: 258 | # new best values, save them 259 | best_score = estimator.oob_score_ 260 | best_estimator = estimator 261 | best_params = estimator.get_params() 262 | 263 | best_tparams = dict() 264 | for tpar in self.tuning_ranges[self.model_names[model_idx]]: 265 | best_tparams[tpar] = best_params[tpar] # only grab the values of the best tuning parameter 266 | 267 | if self.verbose: 268 | print 'Best', self.model_names[model_idx], 'has:' 269 | for tuning_parameter in self.tuning_ranges[self.model_names[model_idx]]: 270 | print ' ', tuning_parameter, '=', best_tparams[tuning_parameter] 271 | print ' OOB Score of', best_score 272 | 273 | return best_estimator, best_score, best_tparams 274 | 275 | def fit(self, X, y, n_refinements=1): 276 | """ 277 | Fit the suite of estimators. The tuning parameters are estimated using cross-validation. 278 | 279 | :param X: The array of predictors, shape (n_samples, n_features). 280 | :param y: The array of response values, shape (n_samples) or (n_samples, n_outputs), depending on the estimator. 281 | :param n_refinements: The number of time to refine the grid of tuning parameter values. Must be an integer or 282 | dictionary. If an integer, the grid for all models will be refined this many times. If a dictionary, should 283 | have (key value) pairs given by (estimator name, n_refinements). 284 | :return: Returns self. 285 | """ 286 | self.nfeatures = X.shape[1] 287 | ndata = len(y) 288 | if X.shape[0] != ndata: 289 | raise ValueError('X and y must have same number of rows.') 290 | 291 | if np.isscalar(n_refinements): 292 | # use same number of refinements for all models 293 | n_refinements = {name: n_refinements for name in self.model_names} 294 | 295 | if type(self.cv) in int_types: 296 | # construct cross-validation iterator 297 | self.cv = KFold(ndata, n_folds=self.cv) 298 | elif self.cv.n != ndata: 299 | # need to reconstruct cross-validation iterator since we have different data 300 | self.cv = KFold(ndata, n_folds=self.cv.n_folds) 301 | 302 | for k in range(len(self.models)): 303 | if 'RandomForest' in self.model_names[k]: 304 | # use out-of-bag error for validation error 305 | best_estimator, best_score, best_params = self.oob_validate(X, k, y) 306 | else: 307 | # use cross-validation for validation error 308 | best_estimator, best_score, best_params = self.cross_validate(X, k, y) 309 | 310 | self.models[k] = best_estimator 311 | self.best_scores[self.model_names[k]] = best_score 312 | 313 | for i in range(n_refinements[self.model_names[k]]): 314 | if self.verbose: 315 | print 'Refining Grid...' 
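                # zoom the tuning grid in around the current best parameters and redo the validation search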
                old_score = best_score
                # now refine the grid and refit
                self.refine_grid(best_params, self.model_names[k])

                if 'RandomForest' in self.model_names[k]:
                    # use out-of-bag error for validation error
                    best_estimator, best_score, best_params = self.oob_validate(X, k, y)
                else:
                    # use cross-validation for validation error
                    best_estimator, best_score, best_params = self.cross_validate(X, k, y)
                if self.verbose:
                    print ' New Validation Score of', best_score, 'is an improvement of', \
                        100.0 * (best_score - old_score) / np.abs(old_score), '%.'

                self.models[k] = best_estimator
                self.best_scores[self.model_names[k]] = best_score

        return self

    def predict_all(self, X):
        """
        Predict the outputs as a function of the inputs for each model.

        :param X: The array of predictor values, shape (n_samples, n_features).
        :return: A dictionary containing the values of the response predicted at the input values for each model.
        """
        y_predict_all = {name: model.predict(X) for name, model in zip(self.model_names, self.models)}

        return y_predict_all

    @abc.abstractmethod
    def predict(self, X, weights='auto'):
        return self.predict_all(X)


class ClassificationSuite(BasePredictorSuite):

    def __init__(self, n_features=None, tuning_ranges=None, models=None, cv=None, njobs=1, pre_dispatch='2*n_jobs',
                 stack=True, verbose=False):
        """
        Initialize a pipeline to run a suite of scikit-learn classifiers. The tuning parameters are chosen through
        cross-validation or the out-of-bag score (for Random Forests) as part of the fitting process. The score
        function used is the accuracy score (fraction of correct classifications).

        :param verbose: Provide helpful output.
        :param n_features: The number of features that will be used when performing the fit. Must supply either
            n_features or tuning_ranges. This is necessary because the tuning parameter for the RandomForestClassifier
            is max_features, and max_features must not exceed the number of features in the input array. So, in order
            to automatically construct the tuning_ranges dictionary it is necessary to know n_features in order to
            ensure max_features <= n_features.
        :param tuning_ranges: A nested dictionary containing the ranges of the tuning parameters. It should be of the
            format {model name 1: {parameter name 1: list(value range 1), parameter name 2: list(value range 2), ...} }.
            If n_features is not supplied, then tuning_ranges must be provided.
        :param models: A list of instantiated scikit-learn classifier classes to fit. If None, these are taken from
            the models listed in tuning_ranges.
        :param cv: The number of CV folds to use, or a CV generator.
        :param njobs: The number of processes to run in parallel.
        :param pre_dispatch: Passed to sklearn.grid_search.GridSearchCV, see documentation for GridSearchCV for further
            details.
        :param stack: If true, then the predict() method will return a stacked (averaged) value over the estimators.
            Otherwise, if false, then predict() will return the predictions for each estimator.
        """
        if tuning_ranges is None:
            if n_features is None:
                raise ValueError('Must supply one of n_features or tuning_ranges.')
383 | # use default values for grid search over tuning parameters for all models 384 | tuning_ranges = {'LogisticRegression': {'C': list(np.logspace(-2.0, 1.0, 5))}, 385 | 'DecisionTreeClassifier': {'max_depth': [5, 10, 20, 50, None]}, 386 | 'LinearSVC': {'C': list(np.logspace(-2.0, 1.0, 5))}, 387 | 'SVC': {'C': list(np.logspace(-2.0, 1.0, 5)), 388 | 'gamma': list(np.logspace(np.log10(1.0 / n_features), 389 | np.log10(1000.0 / n_features), 5))}, 390 | 'RandomForestClassifier': {'max_features': 391 | list(np.unique(np.linspace(2, n_features, 5).astype(np.int)))}, 392 | 'GbcAutoNtrees': {'max_depth': [1, 2, 3, 5, 10]}} 393 | if models is None: 394 | # initialize the list of sklearn objects corresponding to different statistical models 395 | models = [] 396 | if 'LogisticRegression' in tuning_ranges: 397 | models.append(LogisticRegression(penalty='l1', class_weight='auto')) 398 | if 'DecisionTreeClassifier' in tuning_ranges: 399 | models.append(DecisionTreeClassifier()) 400 | if 'LinearSVC' in tuning_ranges: 401 | models.append(LinearSVC(penalty='l1', loss='l2', dual=False, class_weight='auto')) 402 | if 'SVC' in tuning_ranges: 403 | models.append(SVC(class_weight='auto')) 404 | if 'RandomForestClassifier' in tuning_ranges: 405 | models.append(RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=njobs)) 406 | if 'GbcAutoNtrees' in tuning_ranges: 407 | models.append(GbcAutoNtrees(subsample=0.75, n_estimators=500, learning_rate=0.01)) 408 | 409 | super(ClassificationSuite, self).__init__(tuning_ranges=tuning_ranges, models=models, cv=cv, njobs=njobs, 410 | pre_dispatch=pre_dispatch, stack=stack, verbose=verbose) 411 | 412 | self.scorer = make_scorer(accuracy_score) 413 | self.nfeatures = n_features 414 | self.classes = None 415 | 416 | def predict(self, X, weights='auto'): 417 | """ 418 | Predict the classes as a function of the inputs. If self.stack is true, then the predictions for each data point 419 | are computed based on a weighted majority vote of the estimators. Otherwise, a dictionary containing the 420 | predictions for each estimator are returns. 421 | 422 | :param X: The array of predictor values, shape (n_samples, n_features). 423 | :param weights: The weights to use when combining the predictions for the individual estimators. If 'auto', then 424 | the weights are given by the validation scores. If 'uniform', then uniform weights are used. Otherwise 425 | weights must be a dictionary with (model name, weight) as the (key, value) pair. 426 | No effect if self.stack = False. 427 | :return: The values of the response predicted at the input values. 
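            When stacking, this is the class with the largest total weighted vote, where each estimator votes for its
            predicted class with the weight described above.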
428 | """ 429 | y_predict_all = super(ClassificationSuite, self).predict_all(X) 430 | 431 | if weights is 'uniform': 432 | # just use uniform weighting 433 | weights = {name: 1.0 for name in self.model_names} 434 | 435 | if weights is 'auto': 436 | # weight based on validation score 437 | weights = self.best_scores 438 | 439 | if self.stack: 440 | # combine the model outputs 441 | y_votes = np.zeros((X.shape[0], len(self.model_names))) 442 | for name in y_predict_all: 443 | vote = y_predict_all[name] 444 | idx_1d = vote + np.arange(len(vote)) * y_votes.shape[1] 445 | # compute weighted vote for each class 446 | y_votes[np.unravel_index(idx_1d, y_votes.shape)] += weights[name] 447 | 448 | y_predict = self.classes[y_votes.argmax(axis=1)] # output is winner of majority vote 449 | 450 | else: 451 | y_predict = y_predict_all 452 | 453 | return y_predict 454 | 455 | def fit(self, X, y, n_refinements=1): 456 | classes, y = np.unique(y, return_inverse=True) 457 | self.classes = classes 458 | return super(ClassificationSuite, self).fit(X, y, n_refinements) 459 | 460 | 461 | class RegressionSuite(BasePredictorSuite): 462 | 463 | def __init__(self, n_features=None, tuning_ranges=None, models=None, cv=None, njobs=1, pre_dispatch='2*n_jobs', 464 | stack=True, verbose=False, metric='lad'): 465 | if metric.lower() not in ['lad', 'mse']: 466 | raise ValueError('Metric must be either lad or mse.') 467 | 468 | if tuning_ranges is None: 469 | try: 470 | n_features is not None 471 | except ValueError: 472 | 'Must supply one of n_features or tuning_ranges.' 473 | # use default values for grid search over tuning parameters for all models 474 | tuning_ranges = {'DecisionTreeClassifier': {'max_depth': [5, 10, 20, 50, None]}, 475 | 'RandomForestRegressor': {'max_features': 476 | list(np.unique(np.linspace(2, n_features, 5).astype(np.int)))}, 477 | 'GbrAutoNtrees': {'max_depth': [1, 2, 3, 5, 10]}} 478 | if models is None: 479 | # initialize the list of sklearn objects corresponding to different statistical models 480 | models = [] 481 | if 'DecisionTreeRegressor' in tuning_ranges: 482 | models.append(DecisionTreeRegressor()) 483 | if 'RandomForestRegressor' in tuning_ranges: 484 | models.append(RandomForestRegressor(n_estimators=500, oob_score=True, n_jobs=njobs)) 485 | if 'GbrAutoNtrees' in tuning_ranges: 486 | models.append(GbrAutoNtrees(subsample=0.75, n_estimators=500, learning_rate=0.01)) 487 | 488 | super(RegressionSuite, self).__init__(tuning_ranges, models, cv, njobs, pre_dispatch, stack, verbose) 489 | 490 | self.scorer = make_scorer(accuracy_score) 491 | self.nfeatures = n_features 492 | self.metric = metric.lower() 493 | if self.metric == 'lad': 494 | self.scorer = make_scorer(mean_absolute_error, greater_is_better=False) 495 | elif self.metric == 'mse': 496 | self.scorer = make_scorer(mean_squared_error, greater_is_better=False) 497 | 498 | def predict(self, X, weights='auto'): 499 | 500 | y_predict_all = super(RegressionSuite, self).predict_all(X) 501 | 502 | if weights is 'uniform': 503 | # just use uniform weighting 504 | weights = {name: 1.0 for name in self.model_names} 505 | 506 | if weights is 'auto': 507 | # weight based on validation score 508 | weights = self.best_scores 509 | 510 | if self.stack: 511 | # combine the model outputs 512 | y_predict = 0.0 513 | wsum = 0.0 514 | for name in y_predict_all: 515 | y_predict += weights[name] * y_predict_all[name] 516 | wsum += weights[name] 517 | y_predict /= wsum 518 | else: 519 | y_predict = y_predict_all 520 | 521 | return y_predict 
--------------------------------------------------------------------------------