├── tests ├── __init__.py └── test_resampling.py ├── bootstrap ├── __init__.py └── bootstrap.py ├── .gitignore ├── requirements.txt ├── .travis.yml ├── setup.py └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bootstrap/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | build/* 4 | dist/* 5 | *.egg-info/ 6 | *.cache 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.8 2 | scipy>=0.8 3 | pytest==3.0.7 4 | pytest-cov==2.5.1 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | # command to install dependencies 5 | install: 6 | - pip install -r requirements.txt 7 | - pip install coveralls 8 | # command to run tests 9 | script: pytest --cov bootstrap 10 | after_success: 11 | - coveralls 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup, find_packages 3 | 4 | DISTNAME = 'bootstrap' 5 | DESCRIPTION = 'Library for bootstrapping statistics' 6 | MAINTAINER = 'Christopher Jenness' 7 | URL = 'https://github.com/christopherjenness/bootstrap' 8 | 9 | classifiers = ['Programming Language :: Python', 10 | 'Programming Language :: Python :: 2', 11 | 'Programming Language :: Python :: 3', 
12 | 'Programming Language :: Python :: 2.7', 13 | 'Programming Language :: Python :: 3.3'] 14 | 15 | with open('requirements.txt') as f: 16 | install_reqs = f.read().splitlines() 17 | 18 | if __name__ == "__main__": 19 | setup(name=DISTNAME, 20 | maintainer=MAINTAINER, 21 | description=DESCRIPTION, 22 | packages=find_packages(), 23 | url=URL, 24 | classifiers=classifiers, 25 | install_requires=install_reqs) 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bootstrap 2 | 3 | ![TRAVIS](https://travis-ci.org/christopherjenness/bootstrap.svg?branch=master) [![Coverage Status](http://coveralls.io/repos/github/christopherjenness/bootstrap/badge.svg?branch=master)](https://coveralls.io/github/christopherjenness/bootstrap?branch=master) 4 | 5 | A library for bootstrapping statistics. 6 | 7 | ## Features 8 | 9 | While incomplete, the library already includes a number of features: 10 | * Bootstrap samples 11 | * Bootstrap matrices 12 | * Bootstrap statistics 13 | * Provides SEM and confidence intervals for statistics 14 | * Jackknife samples and statistics 15 | * Two sample testing 16 | 17 | ## Installation 18 | 19 | ```python 20 | python setup.py install 21 | ``` 22 | 23 | ## Usage 24 | 25 | Here, we document some of the library features using the University of Wisconsin breast cancer data set. [Available here](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)). For simplicity, only the first dimension will be looked at. 26 | 27 | ```python 28 | import numpy as np 29 | from sklearn.datasets import load_breast_cancer 30 | data = load_breast_cancer() 31 | ``` 32 | 33 | First, we will look at how the data are distributed. 
34 | 35 | ```python 36 | import matplotlib.pyplot as plt 37 | import seaborn as sns 38 | plt.hist(data.data[:,0], bins=40) 39 | plt.title('Measurements') 40 | ``` 41 | 42 | ![Data](http://i.imgur.com/5Qm0wn4.png) 43 | 44 | Next, we will bootstrap 10,000 samples, to bootstrap the mean and 95% confidence interval for the mean. Below, the mean of each bootstrapped sample is plotted, with the estimated mean and confidence intervals shown. 45 | 46 | ```python 47 | results = bootstrap_statistic(data.data[:,0], func=np.mean, n_samples=10000) 48 | 49 | # Make plot of bootstrapped mean 50 | plt.hist(results.statistics, bins=40) 51 | plt.title('Bootstrapped Means') 52 | plt.xlabel('Mean') 53 | plt.ylabel('Counts') 54 | ax = plt.gca() 55 | ax.axvline(x=results.ci[0], color='red', linestyle='dashed', linewidth=2) 56 | ax.axvline(x=results.ci[1], color='red', linestyle='dashed', linewidth=2) 57 | ax.axvline(x=results.statistic, color='black', linewidth=5) 58 | ``` 59 | 60 | ![Mean](http://i.imgur.com/GkMnLtQ.png) 61 | 62 | An advantage of the bootstrap method is its adaptability. For example, you can bootstrap an estimate of the 95th percentile of the data. 63 | 64 | ```python 65 | def percentile(data): 66 | """returns 95th percentile of data""" 67 | return np.percentile(data, 95) 68 | 69 | # Bootstrap the 95th percentile 70 | results = bootstrap_statistic(data.data[:,0], func=percentile, n_samples=10000) 71 | 72 | # Make plot of bootstrapped 95th percentile 73 | plt.hist(results.statistics, bins=40) 74 | plt.title('Bootstrapped 95th Percentiles') 75 | plt.xlabel('95th Percentile') 76 | plt.ylabel('Counts') 77 | ax = plt.gca() 78 | ax.axvline(x=results.ci[0], color='red', linestyle='dashed', linewidth=2) 79 | ax.axvline(x=results.ci[1], color='red', linestyle='dashed', linewidth=2) 80 | ax.axvline(x=results.statistic, color='black', linewidth=5) 81 | ``` 82 | ![Percentile](http://i.imgur.com/SJkAh4l.png) 83 | 84 | Additionally, the library can perform two sample testing. 
First, let's view the distribution of the same data, but broken up by tumor type. 85 | 86 | ```python 87 | benign = data.data[data.target == 0] 88 | malignant = data.data[data.target == 1] 89 | 90 | # Plot benign and malignant samples 91 | plt.hist(benign[:,0], bins=30, alpha=0.5, label='benign') 92 | plt.hist(malignant[:,0], bins=30, alpha=0.5, label='malignant') 93 | plt.legend() 94 | plt.xlabel('Measurement') 95 | plt.ylabel('Counts') 96 | ``` 97 | 98 | ![split](http://i.imgur.com/rsVrDJT.png) 99 | 100 | It appears there is a difference in the groups' distributions. The level of significance can be computed via the bootstrap method. 101 | 102 | ```python 103 | significance = two_sample_testing(benign[:, 0], malignant[:, 0], 104 | statistic_func=compare_means, 105 | n_samples=5000) 106 | print(significance) # prints 0.0 107 | ``` 108 | Hmmm, with 5,000 random bootstrapped samples, not a single one had a difference of means larger than that of the observed samples. 109 | 110 | What about a feature that is less predictive? Below, we look at feature 9. 111 | 112 | ```python 113 | plt.hist(benign[:,9], bins=30, alpha=0.5, label='benign') 114 | plt.hist(malignant[:,9], bins=30, alpha=0.5, label='malignant') 115 | plt.legend() 116 | plt.xlabel('Measurement') 117 | plt.ylabel('Counts') 118 | ``` 119 | 120 | ![Feature9](http://i.imgur.com/tCt1rnV.png) 121 | 122 | If we then bootstrap the difference between the two means, we get a non-significant difference. 
123 | 124 | ```python 125 | significance = two_sample_testing(malignant[:, 9], benign[:, 9], 126 | statistic_func=compare_means, 127 | n_samples=5000) 128 | print(significance) # prints 0.387 129 | ``` 130 | -------------------------------------------------------------------------------- /tests/test_resampling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import coveralls 4 | from bootstrap.bootstrap import bootstrap_sample, jackknife_sample, \ 5 | compare_means, t_test_statistic, two_sample_testing, \ 6 | bootstrap_matrixsample, bootstrap_statistic, jackknife_statistic 7 | 8 | 9 | class BootstrapInit(unittest.TestCase): 10 | def setUp(self): 11 | np.random.seed(0) 12 | self.normal_data = np.random.normal(100, 10, size=100) 13 | np.random.seed(0) 14 | self.normal_data2 = np.random.normal(200, 10, size=100) 15 | np.random.seed(0) 16 | self.uniform_data = np.random.uniform(0, 100, size=100) 17 | np.random.seed(0) 18 | self.poisson_data = np.random.poisson(10, size=(100)) 19 | np.random.seed(0) 20 | self.matrix_data = np.random.normal(100, 10, size=(100, 100)) 21 | 22 | class ResamplingTestCase(BootstrapInit): 23 | def testNonparametric(self): 24 | bootstrap_data = bootstrap_sample(self.normal_data) 25 | self.assertAlmostEqual(np.mean(self.normal_data)/10000, 26 | np.mean(bootstrap_data)/10000, 3) 27 | self.assertEqual(len(bootstrap_data), len(self.normal_data)) 28 | 29 | def testNormalParametric(self): 30 | bootstrap_data = bootstrap_sample(self.normal_data, 31 | parametric='normal') 32 | self.assertAlmostEqual(np.mean(self.normal_data)/10000, 33 | np.mean(bootstrap_data)/10000, 3) 34 | self.assertEqual(len(bootstrap_data), len(self.normal_data)) 35 | 36 | def testUniformParametric(self): 37 | bootstrap_data = bootstrap_sample(self.uniform_data, 38 | parametric='uniform') 39 | self.assertAlmostEqual(np.mean(self.uniform_data)/10000, 40 | np.mean(bootstrap_data)/10000, 3) 41 | 
self.assertEqual(len(bootstrap_data), len(self.uniform_data)) 42 | 43 | def testPoissonParametric(self): 44 | bootstrap_data = bootstrap_sample(self.poisson_data, 45 | parametric='poisson') 46 | self.assertAlmostEqual(np.mean(self.poisson_data)/10000, 47 | np.mean(bootstrap_data)/10000, 3) 48 | self.assertEqual(len(bootstrap_data), len(self.poisson_data)) 49 | 50 | def testJackknifeSample(self): 51 | jackknife_data = jackknife_sample(self.uniform_data, 10) 52 | self.assertAlmostEqual(np.mean(self.uniform_data)/10000, 53 | np.mean(jackknife_data)/10000, 3) 54 | self.assertEqual(len(jackknife_data) + 1, len(self.uniform_data)) 55 | 56 | 57 | class TwoSampleTestCase(BootstrapInit): 58 | def testMeanDifference(self): 59 | mean_difference = compare_means(self.normal_data2, 60 | self.normal_data) 61 | self.assertAlmostEqual(mean_difference/100000, 0.001, 3) 62 | 63 | def testTStatisticBig(self): 64 | t_statistic = t_test_statistic(self.normal_data2, 65 | self.normal_data) 66 | self.assertAlmostEqual(t_statistic / 10000, 0.007, 3) 67 | 68 | def testTStatisticZero(self): 69 | t_statistic = t_test_statistic(self.normal_data, 70 | self.normal_data) 71 | self.assertEqual(t_statistic, 0) 72 | 73 | def testTwoSampleZero(self): 74 | ASL = two_sample_testing(self.normal_data2, self.normal_data) 75 | self.assertEqual(ASL, 0) 76 | 77 | def testTwoSampleBig(self): 78 | ASL = two_sample_testing(self.normal_data, self.normal_data) 79 | self.assertGreater(ASL, 0.05) 80 | 81 | def testTwoSampleTTest(self): 82 | ASL = two_sample_testing(self.normal_data2, self.normal_data, 83 | statistic_func=t_test_statistic) 84 | self.assertEqual(ASL, 0) 85 | 86 | def testTwoSampleTTestBit(self): 87 | ASL = two_sample_testing(self.normal_data, self.normal_data, 88 | statistic_func=t_test_statistic) 89 | self.assertGreater(ASL, 0.05) 90 | 91 | 92 | class MatrixTestCase(BootstrapInit): 93 | 94 | def testMatrixResamplingCol(self): 95 | matrix_sample = np.matrix(bootstrap_matrixsample(self.matrix_data, 96 
| axis=1)) 97 | self.assertEqual(np.shape(matrix_sample), np.shape(self.matrix_data)) 98 | self.assertAlmostEqual(np.average(matrix_sample)/100000, 99 | np.mean(self.matrix_data)/100000, 3) 100 | 101 | 102 | class StatisticsTestCast(BootstrapInit): 103 | def testBootstrapMean(self): 104 | statistics, statistic, bias, sem, confidence_interval = \ 105 | bootstrap_statistic(self.normal_data) 106 | self.assertAlmostEqual(statistic/100000, 100/100000, 2) 107 | self.assertAlmostEqual(np.abs(bias/100), 0.1/100, 3) 108 | self.assertAlmostEqual(sem/100, 0.1/100, 3) 109 | self.assertEqual(len(confidence_interval), 2) 110 | self.assertTrue(confidence_interval[0] < confidence_interval[1]) 111 | self.assertTrue(confidence_interval[1] - confidence_interval[0] < 5) 112 | 113 | def testBoostrapMedian(self): 114 | statistics, statistic, bias, sem, confidence_interval = \ 115 | bootstrap_statistic(self.normal_data, func=np.median) 116 | self.assertAlmostEqual(statistic/100000, 100/100000, 2) 117 | self.assertAlmostEqual(np.abs(bias/100), 0.3/100, 3) 118 | self.assertAlmostEqual(sem/100, 0.1/100, 3) 119 | self.assertEqual(len(confidence_interval), 2) 120 | self.assertTrue(confidence_interval[0] < confidence_interval[1]) 121 | self.assertTrue(confidence_interval[1] - confidence_interval[0] < 5) 122 | 123 | def testBootstrapBCa(self): 124 | statistics, statistic, bias, sem, confidence_interval = \ 125 | bootstrap_statistic(self.normal_data, bca=True) 126 | self.assertAlmostEqual(statistic/100000, 100/100000, 2) 127 | self.assertAlmostEqual(np.abs(bias/100), 0.1/100, 3) 128 | self.assertAlmostEqual(sem/100, 0.1/100, 3) 129 | self.assertEqual(len(confidence_interval), 2) 130 | self.assertTrue(confidence_interval[0] < confidence_interval[1]) 131 | self.assertTrue(confidence_interval[1] - confidence_interval[0] < 5) 132 | 133 | def testBoostrapParams(self): 134 | statistics, statistic, bias, sem, confidence_interval = \ 135 | bootstrap_statistic(self.normal_data, parametric='normal', 136 | 
bias_correction=True, alpha=0.1) 137 | self.assertAlmostEqual(statistic/100000, 100/100000, 2) 138 | self.assertAlmostEqual(np.abs(bias/100), 0.1/100, 3) 139 | self.assertAlmostEqual(sem/100, 0.2/100, 3) 140 | self.assertEqual(len(confidence_interval), 2) 141 | self.assertTrue(confidence_interval[0] < confidence_interval[1]) 142 | self.assertTrue(confidence_interval[1] - confidence_interval[0] < 5) 143 | 144 | def testBootstrapMatrix(self): 145 | statistics, statistic, bias, sem, confidence_interval = \ 146 | bootstrap_statistic(self.normal_data) 147 | self.assertAlmostEqual(statistic/100000, 100/100000, 2) 148 | self.assertAlmostEqual(np.abs(bias/100), 0.1/100, 3) 149 | self.assertAlmostEqual(sem/100, 0.1/100, 3) 150 | self.assertEqual(len(confidence_interval), 2) 151 | self.assertTrue(confidence_interval[0] < confidence_interval[1]) 152 | self.assertTrue(confidence_interval[1] - confidence_interval[0] < 5) 153 | 154 | def testJackknifeMean(self): 155 | statistic, sem, statistics = jackknife_statistic(self.normal_data) 156 | self.assertAlmostEqual(statistic/100000, 157 | np.mean(self.normal_data)/100000, 3) 158 | self.assertAlmostEqual(sem/10, 0.1/100, 3) 159 | self.assertEqual(len(statistics), len(self.normal_data)) 160 | 161 | if __name__ == '__main__': 162 | unittest.main() 163 | -------------------------------------------------------------------------------- /bootstrap/bootstrap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library for boostraping statistics 3 | 4 | Citation: 5 | Efron, Bradley, and Robert J. Tibshirani. 6 | An introduction to the bootstrap. CRC press, 1994. 
from collections import namedtuple
import numpy as np
from scipy import stats


# Container returned by bootstrap_statistic. It is a tuple subclass, so
# callers can unpack it positionally or read the fields by name.
_BootstrapResults = namedtuple('bootstrap_results',
                               'statistics statistic bias sem ci')

# Distribution names accepted by the ``parametric`` argument.
_PARAMETRIC_DISTS = ('normal', 'uniform', 'poisson')


def bootstrap_sample(data, parametric=False):
    """
    Resamples data by random sample with replacement

    Args:
    ---------
    data : 1d array or sequence
        Data to resample
    parametric : str in ['normal', 'uniform', 'poisson']
        parametric distribution to resample from,
        if False, use nonparametric bootstrap sampling

    Returns:
    ---------
    resamples : array
        bootstrap resampled data, same length as ``data``

    Raises:
    ---------
    ValueError
        if ``parametric`` is truthy but not a supported distribution name
    """
    if parametric and parametric not in _PARAMETRIC_DISTS:
        raise ValueError("Invalid parametric argument.")

    # Accept plain sequences as well as arrays; fancy indexing below needs
    # an array. asanyarray keeps ndarray subclasses untouched.
    data = np.asanyarray(data)
    sample_size = len(data)

    if parametric == 'normal':
        return np.random.normal(np.mean(data), np.std(data),
                                size=sample_size)
    if parametric == 'uniform':
        return np.random.uniform(np.min(data), np.max(data),
                                 size=sample_size)
    if parametric == 'poisson':
        return np.random.poisson(lam=np.mean(data), size=sample_size)

    if sample_size == 0:
        # Nothing to draw from; randint cannot sample an empty range.
        return data
    # Draw all indices in one vectorised call instead of one at a time.
    inds = np.random.randint(0, sample_size, size=sample_size)
    return data[inds]


def bootstrap_matrixsample(data, axis=0):
    """
    Resamples a matrix by rows or columns

    Args:
    ---------
    data : np.matrix or 2d array
        matrix of data to resample
    axis : (int) in [0, 1]
        axis to resample by
        if 0, then resample rows
        if 1, then resample columns

    Returns:
    ---------
    resamples : matrix
        bootstrap resampled data, same shape as ``data``

    Raises:
    ---------
    ValueError
        if ``axis`` is neither 0 nor 1
    """
    if axis not in (0, 1):
        raise ValueError("Invalid axis argument.")

    n_items = np.shape(data)[axis]
    samples = np.random.randint(n_items, size=n_items)
    if axis == 0:
        return data[samples, :]
    return data[:, samples]


def jackknife_sample(data, index):
    """
    Single jackknife sample of data

    Args:
    ---------
    data : np.array
        array of data to resample
    index : int
        Index of array to leave out in jackknife sample

    Returns:
    ---------
    resamples : array
        copy of ``data`` with the element at ``index`` removed
    """
    return np.delete(data, index)


def bootstrap_statistic(data, func=np.mean, n_samples=50,
                        parametric=False, bias_correction=False,
                        alpha=0.05, bca=False, axis=0):
    """
    Bootstraps a statistic and calculates the standard error of the statistic

    Args:
    ---------
    data : array or matrix
        array or matrix of data to calculate statistic and SE of statistic
    func : function
        statistical function to calculate on data
        examples: np.mean, np.median
    n_samples : int
        number of bootstrap samples to calculate statistic for
    parametric : (str) in ['normal', 'uniform', 'poisson']
        parametric distribution to resample from,
        If False, use nonparametric bootstrap sampling.
        Ignored when data is an np.matrix.
    bias_correction : bool
        If True, return the bias-corrected estimate
        (plug-in estimate minus the bootstrap estimate of bias)
        instead of the mean of the bootstrap replicates
    alpha : float
        tail probability passed to calculate_ci for each bound of the
        confidence interval
    bca : bool
        If true, use bias correction and accelerated (BCa) method to
        calculate the confidence interval
    axis : int in [0, 1]
        if data is an np.matrix, axis to resample by
            if 0: resample rows
            if 1: resample columns

    Returns:
    ---------
    results : namedtuple (statistics, statistic, bias, sem, ci)
        statistics - list with the statistic of every bootstrap sample
        statistic - mean of ``statistics`` (bias corrected if requested)
        bias - mean of ``statistics`` minus func(data)
        sem - scipy.stats.sem of ``statistics``
        ci - (ci_low, ci_high) as returned by calculate_ci
    """
    plugin_estimate = func(data)
    is_matrix = isinstance(data, np.matrix)

    # Compute the statistic on every resample
    statistics = []
    for _ in range(n_samples):
        if is_matrix:
            resample = bootstrap_matrixsample(data, axis=axis)
        else:
            resample = bootstrap_sample(data, parametric=parametric)
        statistics.append(func(resample))
    statistic = np.mean(statistics)

    # CI for the statistic
    confidence_interval = calculate_ci(data, statistics, func=func,
                                       alpha=alpha, bca=bca)

    # Bootstrap estimate of bias: mean of replicates minus plug-in estimate
    bias = statistic - plugin_estimate
    if bias_correction:
        # The corrected estimate subtracts the bias from the plug-in
        # estimate; subtracting it from the replicate mean would merely
        # reproduce the plug-in estimate.
        statistic = plugin_estimate - bias

    sem = stats.sem(statistics)

    return _BootstrapResults(statistics=statistics, statistic=statistic,
                             bias=bias, sem=sem, ci=confidence_interval)


def jackknife_statistic(data, func=np.mean):
    """
    Jackknifes a statistic and calculates the standard error of the statistic

    Args:
    ---------
    data : array
        array of data to calculate statistic and SE of statistic
    func : function
        statistical function to calculate on data
        examples: np.mean, np.median

    Returns:
    ---------
    jackknifed_stat : (float, float, list)
        (statistic, sem, statistics)
        statistic - mean of the leave-one-out statistics
        sem - scipy.stats.sem of the leave-one-out statistics
        statistics - the leave-one-out statistics, one per element of data
    """
    statistics = [func(jackknife_sample(data, index))
                  for index in range(len(data))]
    return (np.mean(statistics), stats.sem(statistics), statistics)


def _bca_levels(data, statistics, func, alpha):
    """
    Returns the BCa-adjusted (lower, upper) percentile levels, or None when
    the adjustment is undefined for the given replicates.
    """
    # Bias term z0: normal quantile of the fraction of replicates that fall
    # below the plug-in estimate.
    plugin_estimate = func(data)
    bias_frac = np.sum(statistics < plugin_estimate) / float(len(statistics))
    z0 = stats.norm.ppf(bias_frac)
    if not np.isfinite(z0):
        # Every (or no) replicate lies below the plug-in estimate.
        return None

    # Acceleration term from the jackknife values; deviations are taken as
    # (jackknife mean - leave-one-out value).
    j_statistic, _, j_values = jackknife_statistic(data, func)
    deviations = j_statistic - np.asarray(j_values)
    denominator = np.sum(deviations ** 2)
    accel = 0.0
    if denominator > 0:
        accel = np.sum(deviations ** 3) / (6 * denominator ** 1.5)

    # Each tail is adjusted separately.
    levels = []
    for tail in (alpha, 1 - alpha):
        z_tail = z0 + stats.norm.ppf(tail)
        levels.append(stats.norm.cdf(z0 + z_tail / (1 - accel * z_tail)))
    if not np.all(np.isfinite(levels)):
        return None
    return tuple(levels)


def _clamp_index(index, size):
    """Clamps index into the valid range [0, size - 1]."""
    return max(0, min(index, size - 1))


def calculate_ci(data, statistics, func=np.mean,
                 alpha=0.05, bca=False):
    """
    Calculates bootstrapped confidence interval using percentile
    intervals.

    Args:
    ---------
    data : array
        original data; only used by the BCa method
    statistics : array
        array of bootstrapped statistics to calculate
        confidence interval for
    func : function
        statistic that produced ``statistics``; only used by the BCa method
    alpha : float
        tail probability used for both bounds: the interval runs from the
        ``alpha`` to the ``1 - alpha`` percentile of ``statistics``, so it
        covers ``1 - 2 * alpha`` of the replicates.
    bca : bool
        If true, use bias correction and accelerated (BCa) method.
        When the adjustment is undefined (for example every replicate lies
        on one side of func(data)), the plain percentile levels are used.

    Returns:
    ---------
    confidence_interval : (float, float)
        (ci_low, ci_high)
        ci_low - lower bound on confidence interval
        ci_high - upper bound on confidence interval
    """
    statistics = np.asarray(statistics)
    n_stats = len(statistics)
    lower_level, upper_level = alpha, 1 - alpha

    if bca:
        adjusted = _bca_levels(data, statistics, func, alpha)
        if adjusted is not None:
            lower_level, upper_level = adjusted

    sorted_statistics = np.sort(statistics)
    # Convert the levels to 0-based positions and keep them inside the
    # array, so few replicates or a small alpha cannot index past the end.
    low_index = _clamp_index(int(np.floor(lower_level * n_stats)) - 1,
                             n_stats)
    high_index = _clamp_index(int(np.ceil(upper_level * n_stats)) - 1,
                              n_stats)
    return (sorted_statistics[low_index], sorted_statistics[high_index])


def two_sample_testing(sampleA, sampleB,
                       statistic_func=None, n_samples=50):
    """
    Compares two samples via bootstrapping to determine if they came from
    the same distribution.

    Args:
    ---------
    sampleA : np.array
        Array of data from sample A
    sampleB : np.array
        Array of data from sample B
    statistic_func : function
        Function that compares two data sets and returns a statistic.
        Function must accept two args, (np.array, np.array), where each
        array is a sample. Defaults to compare_means.
        Example statistic_func that compares the mean of two data sets:
            lambda data1, data2: np.mean(data1) - np.mean(data2)
    n_samples : int
        number of bootstrap samples to generate

    Returns:
    ---------
    sig_lvl : float
        bootstrapped achieved significance level: the fraction of bootstrap
        samples whose statistic is strictly greater than the observed one
    """
    if statistic_func is None:
        statistic_func = compare_means

    observed_statistic = statistic_func(sampleA, sampleB)
    combined_sample = np.append(sampleA, sampleB)

    # Resample the pooled data and split it at the size of sample A
    m = len(sampleA)
    counter = 0
    for _ in range(n_samples):
        boot_sample = bootstrap_sample(combined_sample)
        boot_statistic = statistic_func(boot_sample[:m], boot_sample[m:])
        if boot_statistic > observed_statistic:
            counter += 1

    return counter / float(n_samples)


def compare_means(sampleA, sampleB):
    """
    Compares the mean of two samples

    Args:
    ---------
    sampleA : np.array
        Array of data from sample A
    sampleB : np.array
        Array of data from sample B

    Returns:
    ---------
    difference : float
        mean of sample A minus mean of sample B
    """
    return np.mean(sampleA) - np.mean(sampleB)


def t_test_statistic(sampleA, sampleB):
    """
    Computes the t test statistic of two samples

    Args:
    ---------
    sampleA : np.array
        Array of data from sample A
    sampleB : np.array
        Array of data from sample B

    Returns:
    ---------
    t_stat : float
        difference in means divided by sqrt(var(A)/n + var(B)/m), where
        the variances are computed with np.var
    """
    difference = compare_means(sampleA, sampleB)
    n = len(sampleA)
    m = len(sampleB)
    stdev = (np.var(sampleA) / n + np.var(sampleB) / m) ** 0.5
    return difference / stdev