├── tests
    ├── __init__.py
    └── test_resampling.py
├── bootstrap
    ├── __init__.py
    └── bootstrap.py
├── .gitignore
├── requirements.txt
├── .travis.yml
├── setup.py
└── README.md


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/bootstrap/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.pyc
3 | build/*
4 | dist/*
5 | *.egg-info/
6 | *.cache
7 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.8
2 | scipy>=0.8
3 | pytest==3.0.7
4 | pytest-cov==2.5.1
5 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "3.6"
 4 | # command to install dependencies
 5 | install:
 6 |     - pip install -r requirements.txt
 7 |     - pip install coveralls
 8 | # command to run tests
 9 | script: pytest --cov bootstrap
10 | after_success:
11 |     - coveralls
12 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from setuptools import setup, find_packages
 3 | 
 4 | DISTNAME = 'bootstrap'
 5 | DESCRIPTION = 'Library for bootstrapping statistics'
 6 | MAINTAINER = 'Christopher Jenness'
 7 | URL = 'https://github.com/christopherjenness/bootstrap'
 8 | 
 9 | classifiers = ['Programming Language :: Python',
10 |                'Programming Language :: Python :: 2',
11 |                'Programming Language :: Python :: 3',
12 |                'Programming Language :: Python :: 2.7',
13 |                'Programming Language :: Python :: 3.3']
14 | 
15 | with open('requirements.txt') as f:
16 |     install_reqs = f.read().splitlines()
17 | 
18 | if __name__ == "__main__":
19 |     setup(name=DISTNAME,
20 |           maintainer=MAINTAINER,
21 |           description=DESCRIPTION,
22 |           packages=find_packages(),
23 |           url=URL,
24 |           classifiers=classifiers,
25 |           install_requires=install_reqs)
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Bootstrap
  2 | 
  3 | ![TRAVIS](https://travis-ci.org/christopherjenness/bootstrap.svg?branch=master) [![Coverage Status](http://coveralls.io/repos/github/christopherjenness/bootstrap/badge.svg?branch=master)](https://coveralls.io/github/christopherjenness/bootstrap?branch=master)
  4 | 
  5 | A library for bootstrapping statistics.
  6 | 
  7 | ## Features
  8 | 
  9 | While incomplete, the library already includes a number of features:
 10 | * Bootstrap samples
 11 | * Bootstrap matrices
 12 | * Bootstrap statistics
 13 |   * Provides SEM and confidence intervals for statistics
 14 | * Jackknife samples and statistics
 15 | * Two sample testing
 16 | 
 17 | ## Installation
 18 | 
 19 | ```python
 20 | python setup.py install
 21 | ```
 22 | 
 23 | ## Usage
 24 | 
 25 | Here, we document some of the library features using the University of Wisconsin breast cancer data set. [Available here](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)).  For simplicity, only the first dimension will be looked at.
 26 | 
 27 | ```python
 28 | import numpy as np
 29 | from sklearn.datasets import load_breast_cancer
 30 | data = load_breast_cancer()
 31 | ```
 32 | 
 33 | First, we will look at how the data are distributed.
 34 | 
 35 | ```python
 36 | import matplotlib.pyplot as plt
 37 | import seaborn as sns
 38 | plt.hist(data.data[:,0], bins=40)
 39 | plt.title('Measurements')
 40 | ```
 41 | 
 42 | ![Data](http://i.imgur.com/5Qm0wn4.png)
 43 | 
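 44 | Next, we will draw 10,000 bootstrap samples to estimate the mean and a confidence interval for it.  Note that `alpha` (default 0.05) is applied to each tail, so the default interval spans the 5th to 95th percentiles of the bootstrapped means.  Below, the mean of each bootstrapped sample is plotted, with the estimated mean and confidence interval shown.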
 45 | 
 46 | ```python
 47 | results = bootstrap_statistic(data.data[:,0], func=np.mean, n_samples=10000)
 48 | 
 49 | # Make plot of bootstrapped mean
 50 | plt.hist(results.statistics, bins=40)
 51 | plt.title('Bootstrapped Means')
 52 | plt.xlabel('Mean')
 53 | plt.ylabel('Counts')
 54 | ax = plt.gca()
 55 | ax.axvline(x=results.ci[0], color='red', linestyle='dashed', linewidth=2)
 56 | ax.axvline(x=results.ci[1], color='red', linestyle='dashed', linewidth=2)
 57 | ax.axvline(x=results.statistic, color='black', linewidth=5)
 58 | ```
 59 | 
 60 | ![Mean](http://i.imgur.com/GkMnLtQ.png)
 61 | 
 62 | An advantage of the bootstrap method is its adaptability.  For example, you can bootstrap an estimate of the 95th percentile of the data.
 63 | 
 64 | ```python
 65 | def percentile(data):
 66 |     """returns 95th percentile of data"""
 67 |     return np.percentile(data, 95)
 68 |     
 69 | # Bootstrap the 95th percentile
 70 | results = bootstrap_statistic(data.data[:,0], func=percentile, n_samples=10000)
 71 | 
 72 | # Make plot of bootstrapped 95th percentile
 73 | plt.hist(results.statistics, bins=40)
 74 | plt.title('Bootstrapped 95th Percentiles')
 75 | plt.xlabel('95th Percentile')
 76 | plt.ylabel('Counts')
 77 | ax = plt.gca()
 78 | ax.axvline(x=results.ci[0], color='red', linestyle='dashed', linewidth=2)
 79 | ax.axvline(x=results.ci[1], color='red', linestyle='dashed', linewidth=2)
 80 | ax.axvline(x=results.statistic, color='black', linewidth=5)
 81 | ```
 82 | ![Percentile](http://i.imgur.com/SJkAh4l.png)
 83 | 
 84 | Additionally, the library can perform two-sample testing.  First, let's view the distribution of the same data, but broken up by tumor type.
 85 | 
 86 | ```python
 87 | benign = data.data[data.target == 0]
 88 | malignant = data.data[data.target == 1]
 89 | 
 90 | # Plot benign and malignant samples
 91 | plt.hist(benign[:,0], bins=30, alpha=0.5, label='benign')
 92 | plt.hist(malignant[:,0], bins=30, alpha=0.5, label='malignant')
 93 | plt.legend()
 94 | plt.xlabel('Measurement')
 95 | plt.ylabel('Counts')
 96 | ```
 97 | 
 98 | ![split](http://i.imgur.com/rsVrDJT.png)
 99 | 
100 | It appears there is a difference between the groups' distributions.  The level of significance can be computed via the bootstrap method.
101 | 
102 | ```python
103 | significance = two_sample_testing(benign[:, 0], malignant[:, 0],
104 |                                   statistic_func=compare_means,
105 |                                   n_samples=5000)
106 | print(significance) # prints 0.0
107 | ```
108 | Hmmm, with 5,000 random bootstrapped samples, not a single one had a difference of means larger than that of the observed samples.
109 | 
110 | What about a feature that is less predictive?  Below, we look at feature 9.
111 | 
112 | ```python
113 | plt.hist(benign[:,9], bins=30, alpha=0.5, label='benign')
114 | plt.hist(malignant[:,9], bins=30, alpha=0.5, label='malignant')
115 | plt.legend()
116 | plt.xlabel('Measurement')
117 | plt.ylabel('Counts')
118 | ```
119 | 
120 | ![Feature9](http://i.imgur.com/tCt1rnV.png)
121 | 
122 | If we then bootstrap the difference between the two means, we get a non-significant difference.
123 | 
124 | ```python
125 | significance = two_sample_testing(malignant[:, 9], benign[:, 9],
126 |                                   statistic_func=compare_means,
127 |                                   n_samples=5000)
128 | print(significance) # prints 0.387
129 | ```
130 | 


--------------------------------------------------------------------------------
/tests/test_resampling.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import numpy as np
  3 | import coveralls
  4 | from bootstrap.bootstrap import bootstrap_sample, jackknife_sample, \
  5 |         compare_means, t_test_statistic, two_sample_testing, \
  6 |         bootstrap_matrixsample, bootstrap_statistic, jackknife_statistic
  7 | 
  8 | 
  9 | class BootstrapInit(unittest.TestCase):
 10 |     def setUp(self):
 11 |         np.random.seed(0)
 12 |         self.normal_data = np.random.normal(100, 10, size=100)
 13 |         np.random.seed(0)
 14 |         self.normal_data2 = np.random.normal(200, 10, size=100)
 15 |         np.random.seed(0)
 16 |         self.uniform_data = np.random.uniform(0, 100, size=100)
 17 |         np.random.seed(0)
 18 |         self.poisson_data = np.random.poisson(10, size=(100))        
 19 |         np.random.seed(0)
 20 |         self.matrix_data = np.random.normal(100, 10, size=(100, 100))
 21 | 
 22 | class ResamplingTestCase(BootstrapInit):
 23 |     def testNonparametric(self):
 24 |         bootstrap_data = bootstrap_sample(self.normal_data)
 25 |         self.assertAlmostEqual(np.mean(self.normal_data)/10000,
 26 |                                np.mean(bootstrap_data)/10000, 3)
 27 |         self.assertEqual(len(bootstrap_data), len(self.normal_data))
 28 | 
 29 |     def testNormalParametric(self):
 30 |         bootstrap_data = bootstrap_sample(self.normal_data,
 31 |                                           parametric='normal')
 32 |         self.assertAlmostEqual(np.mean(self.normal_data)/10000,
 33 |                                np.mean(bootstrap_data)/10000, 3)
 34 |         self.assertEqual(len(bootstrap_data), len(self.normal_data))
 35 | 
 36 |     def testUniformParametric(self):
 37 |         bootstrap_data = bootstrap_sample(self.uniform_data,
 38 |                                           parametric='uniform')
 39 |         self.assertAlmostEqual(np.mean(self.uniform_data)/10000,
 40 |                                np.mean(bootstrap_data)/10000, 3)
 41 |         self.assertEqual(len(bootstrap_data), len(self.uniform_data))
 42 | 
 43 |     def testPoissonParametric(self):
 44 |         bootstrap_data = bootstrap_sample(self.poisson_data,
 45 |                                           parametric='poisson')
 46 |         self.assertAlmostEqual(np.mean(self.poisson_data)/10000,
 47 |                                np.mean(bootstrap_data)/10000, 3)
 48 |         self.assertEqual(len(bootstrap_data), len(self.poisson_data))
 49 | 
 50 |     def testJackknifeSample(self):
 51 |         jackknife_data = jackknife_sample(self.uniform_data, 10)
 52 |         self.assertAlmostEqual(np.mean(self.uniform_data)/10000,
 53 |                                np.mean(jackknife_data)/10000, 3)
 54 |         self.assertEqual(len(jackknife_data) + 1, len(self.uniform_data))
 55 | 
 56 | 
 57 | class TwoSampleTestCase(BootstrapInit):
 58 |     def testMeanDifference(self):
 59 |         mean_difference = compare_means(self.normal_data2,
 60 |                                         self.normal_data)
 61 |         self.assertAlmostEqual(mean_difference/100000, 0.001, 3)
 62 | 
 63 |     def testTStatisticBig(self):
 64 |         t_statistic = t_test_statistic(self.normal_data2,
 65 |                                        self.normal_data)
 66 |         self.assertAlmostEqual(t_statistic / 10000, 0.007, 3)
 67 | 
 68 |     def testTStatisticZero(self):
 69 |         t_statistic = t_test_statistic(self.normal_data,
 70 |                                        self.normal_data)
 71 |         self.assertEqual(t_statistic, 0)
 72 | 
 73 |     def testTwoSampleZero(self):
 74 |         ASL = two_sample_testing(self.normal_data2, self.normal_data)
 75 |         self.assertEqual(ASL, 0)
 76 | 
 77 |     def testTwoSampleBig(self):
 78 |         ASL = two_sample_testing(self.normal_data, self.normal_data)
 79 |         self.assertGreater(ASL, 0.05)
 80 | 
 81 |     def testTwoSampleTTest(self):
 82 |         ASL = two_sample_testing(self.normal_data2, self.normal_data,
 83 |                                  statistic_func=t_test_statistic)
 84 |         self.assertEqual(ASL, 0)
 85 | 
 86 |     def testTwoSampleTTestBit(self):
 87 |         ASL = two_sample_testing(self.normal_data, self.normal_data,
 88 |                                  statistic_func=t_test_statistic)
 89 |         self.assertGreater(ASL, 0.05)
 90 | 
 91 | 
 92 | class MatrixTestCase(BootstrapInit):
 93 | 
 94 |     def testMatrixResamplingCol(self):
 95 |         matrix_sample = np.matrix(bootstrap_matrixsample(self.matrix_data,
 96 |                                                          axis=1))
 97 |         self.assertEqual(np.shape(matrix_sample), np.shape(self.matrix_data))
 98 |         self.assertAlmostEqual(np.average(matrix_sample)/100000,
 99 |                                np.mean(self.matrix_data)/100000, 3)
100 | 
101 | 
class StatisticsTestCast(BootstrapInit):
    """Bootstrapped and jackknifed statistics on the normal fixture."""

    def _check_bootstrap(self, results, expected_bias, expected_sem):
        # Shared assertions for every bootstrap_statistic configuration:
        # estimate near 100, bias/SEM near the expected magnitudes, and a
        # well-ordered interval narrower than 5.
        self.assertAlmostEqual(results.statistic/100000, 100/100000, 2)
        self.assertAlmostEqual(np.abs(results.bias/100),
                               expected_bias/100, 3)
        self.assertAlmostEqual(results.sem/100, expected_sem/100, 3)
        interval = results.ci
        self.assertEqual(len(interval), 2)
        self.assertTrue(interval[0] < interval[1])
        self.assertTrue(interval[1] - interval[0] < 5)

    def testBootstrapMean(self):
        results = bootstrap_statistic(self.normal_data)
        self._check_bootstrap(results, 0.1, 0.1)

    def testBoostrapMedian(self):
        results = bootstrap_statistic(self.normal_data, func=np.median)
        self._check_bootstrap(results, 0.3, 0.1)

    def testBootstrapBCa(self):
        results = bootstrap_statistic(self.normal_data, bca=True)
        self._check_bootstrap(results, 0.1, 0.1)

    def testBoostrapParams(self):
        results = bootstrap_statistic(self.normal_data, parametric='normal',
                                      bias_correction=True, alpha=0.1)
        self._check_bootstrap(results, 0.1, 0.2)

    def testBootstrapMatrix(self):
        # NOTE(review): despite its name this test feeds the 1d normal
        # fixture, not self.matrix_data - confirm whether that is intended.
        results = bootstrap_statistic(self.normal_data)
        self._check_bootstrap(results, 0.1, 0.1)

    def testJackknifeMean(self):
        estimate, sem, replicates = jackknife_statistic(self.normal_data)
        self.assertAlmostEqual(estimate/100000,
                               np.mean(self.normal_data)/100000, 3)
        self.assertAlmostEqual(sem/10, 0.1/100, 3)
        self.assertEqual(len(replicates), len(self.normal_data))
160 | 
# Allow running this test module directly through the unittest runner.
if __name__ == '__main__':
    unittest.main()
163 | 


--------------------------------------------------------------------------------
/bootstrap/bootstrap.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Library for bootstrapping statistics
  3 | 
  4 | Citation:
  5 | Efron, Bradley, and Robert J. Tibshirani.
  6 | An introduction to the bootstrap. CRC press, 1994.
  7 | """
  8 | 
  9 | from collections import namedtuple
 10 | import numpy as np
 11 | from scipy import stats
 12 | 
 13 | 
 14 | def bootstrap_sample(data, parametric=False):
 15 |     """
 16 |     Resamples data by random sample with replacement
 17 | 
 18 |     Args
 19 |     ---------
 20 |     data : 1d array
 21 |         Data to resample
 22 |     parametric : str in ['normal', 'uniform', 'poisson']
 23 |         parametric distribution to resample from,
 24 |         if False, use nonparametric bootstrap sampling
 25 | 
 26 |     Returns:
 27 |     ---------
 28 |     resamples : array
 29 |         bootstrap resampled data
 30 |     """
 31 |     dists = ['normal', 'uniform', 'poisson']
 32 |     if parametric and parametric not in dists:
 33 |         raise ValueError("Invalid parametric argument.")
 34 | 
 35 |     sample_size = len(data)
 36 |     if parametric == 'normal':
 37 |         mean_estimate = np.mean(data)
 38 |         std_estimate = np.std(data)
 39 |         return np.random.normal(mean_estimate, std_estimate, size=sample_size)
 40 |     elif parametric == 'uniform':
 41 |         min_estimate, max_estimate = np.min(data), np.max(data)
 42 |         return np.random.uniform(min_estimate, max_estimate, size=sample_size)
 43 |     elif parametric == 'poisson':
 44 |         lambda_estimate = np.mean(data)
 45 |         return np.random.poisson(lam=lambda_estimate, size=sample_size)
 46 |     else:
 47 |         inds = [np.random.randint(0, sample_size) for i in range(sample_size)]
 48 |         return data[inds]
 49 | 
 50 | 
 51 | def bootstrap_matrixsample(data, axis=0):
 52 |     """
 53 |     Resamples a matrix by rows or columns
 54 | 
 55 |     Args:
 56 |     ---------
 57 |     data : np.matrix
 58 |         matrix of data to resample
 59 |     axis : (int) in [0, 1]
 60 |         axis to resample by
 61 |         if 0, then resample rows
 62 |         if 1, then resample columns
 63 | 
 64 |     Returns:
 65 |     ---------
 66 |     resamples : matrix
 67 |         bootstrap resampled data
 68 |     """
 69 | 
 70 |     if axis == 0:
 71 |         n_rows = np.shape(data)[0]
 72 |         samples = np.random.randint(n_rows, size=n_rows)
 73 |         bootstrap_matrix = data[samples, :]
 74 |     elif axis == 1:
 75 |         n_cols = np.shape(data)[1]
 76 |         samples = np.random.randint(n_cols, size=n_cols)
 77 |         bootstrap_matrix = data[:, samples]
 78 |     return bootstrap_matrix
 79 | 
 80 | 
 81 | def jackknife_sample(data, index):
 82 |     """
 83 |     Single jackknife sample of data
 84 | 
 85 |     Args:
 86 |     ---------
 87 |     data : np.array
 88 |         array of data to resample
 89 |     index : int
 90 |         Index of array to leave out in jackknife sample
 91 | 
 92 |     Returns:
 93 |     ---------
 94 |     resamples : array
 95 |         jackknife resampled data
 96 |     """
 97 |     jackknife = np.delete(data, index)
 98 |     return jackknife
 99 | 
100 | 
def bootstrap_statistic(data, func=np.mean, n_samples=50,
                        parametric=False, bias_correction=False,
                        alpha=0.05, bca=False, axis=0):
    """
    Bootstraps a statistic and calculates the standard error of the statistic

    Args:
    ---------
    data : array or matrix
        array or matrix of data to calculate statistic and SE of statistic
    func : function
        statistical function to calculate on data
        examples: np.mean, np.median
    n_samples : int
        number of bootstrap samples to calculate statistic for
    parametric : (str) in ['normal', 'uniform', 'poisson']
        parametric distribution to resample from,
        If False, use nonparametric bootstrap sampling.
        Ignored when data is an np.matrix.
    bias_correction : bool
        If True, return the bias-corrected estimate
        (plug-in estimate minus the bootstrap estimate of bias)
    alpha : float
        tail percentile forwarded to calculate_ci
    bca : bool
        If true, use bias correction and (BCa) method to calculate bootstrap
    axis : int in [0, 1]
        if type(data) == np.matrix, axis to resample by
            if 0: resample rows
            if 1: resample columns

    Returns:
    ---------
    results : bootstrap_results (namedtuple)
        statistics - list with the statistic of every bootstrap sample
        statistic  - mean of statistics (bias corrected if requested)
        bias       - mean of statistics minus func(data)
        sem        - scipy.stats.sem of statistics
        ci         - (ci_low, ci_high) from calculate_ci
    """
    plugin_estimate = func(data)

    # Matrices are resampled by whole rows/columns; everything else is
    # treated as a 1d sample.
    is_matrix = isinstance(data, np.matrix)
    statistics = []
    for _ in range(n_samples):
        if is_matrix:
            resample = bootstrap_matrixsample(data, axis=axis)
        else:
            resample = bootstrap_sample(data, parametric=parametric)
        statistics.append(func(resample))
    statistic = np.mean(statistics)

    # CI for the statistic
    confidence_interval = calculate_ci(data, statistics, func=func,
                                       alpha=alpha, bca=bca)

    # Bootstrap estimate of bias: average replicate minus plug-in estimate.
    bias = statistic - plugin_estimate
    if bias_correction:
        # Subtracting the bias from the bootstrap mean would only reproduce
        # the plug-in estimate; the corrected estimate removes the bias from
        # the plug-in estimate itself.
        statistic = plugin_estimate - bias

    # NOTE(review): stats.sem divides the spread of the replicates by
    # sqrt(n_samples); the usual bootstrap standard error is the standard
    # deviation of the replicates. Left unchanged - confirm the intent.
    sem = stats.sem(statistics)

    # Pack together the results
    bootstrap_results = namedtuple('bootstrap_results',
                                   'statistics statistic bias sem ci')
    return bootstrap_results(statistics=statistics, statistic=statistic,
                             bias=bias, sem=sem, ci=confidence_interval)
164 | 
165 | 
def jackknife_statistic(data, func=np.mean):
    """
    Jackknifes a statistic: evaluates func on every leave-one-out sample

    Args:
    ---------
    data : array
        array of data to calculate statistic and SE of statistic
    func : function
        statistical function to calculate on data
        examples: np.mean, np.median

    Returns:
    ---------
    jackknifed_stat : (float, float, list)
        (statistic, sem, statistics)
        statistic  - mean of the leave-one-out values
        sem        - scipy.stats.sem of the leave-one-out values
        statistics - the leave-one-out values, one per observation
    """
    # NOTE(review): stats.sem is not the textbook jackknife standard error,
    # which scales the spread of these values by (n - 1) / n - confirm.
    leave_one_out = [func(jackknife_sample(data, left_out))
                     for left_out in range(len(data))]
    return (np.mean(leave_one_out), stats.sem(leave_one_out), leave_one_out)
192 | 
193 | 
def calculate_ci(data, statistics, func=np.mean,
                 alpha=0.05, bca=False):
    """
    Calculates bootstrapped confidence interval using percentile
    intervals, optionally with the BCa adjustment.

    Args:
    ---------
    data (array): original data; only used when bca is True
    statistics (array): array of bootstrapped statistics to calculate
          confidence interval for
    func (function): statistic that produced `statistics`; only used when
            bca is True (plug-in estimate and jackknife values)
    alpha (float): percentile used for upper and lower bounds of confidence
            interval.  NOTE: alpha is applied to each tail, so the interval
            runs from the alpha to the (1 - alpha) percentile.
    bca (bool): If true, use bias correction and accelerated (BCa) method

    Returns: tuple (ci_low, ci_high)
    ---------
    confidence_interval : (float, float)
        (ci_low, ci_high)
        ci_low - lower bound on confidence interval
        ci_high - upper bound on confidence interval

    Raises:
    ---------
    ValueError
        If the BCa adjusted percentiles cannot be computed (e.g. every
        bootstrapped statistic lies on one side of the plug-in estimate
        while the acceleration is non-zero)
    """
    sorted_statistics = np.sort(statistics)
    n_statistics = len(sorted_statistics)
    alpha_low, alpha_high = alpha, 1 - alpha

    if bca:
        # Bias-correction term z: normal quantile of the share of
        # bootstrapped statistics below the plug-in estimate.
        plugin_estimate = func(data)
        num_below_plugin_est = np.sum(sorted_statistics < plugin_estimate)
        bias_frac = num_below_plugin_est / float(n_statistics)
        z = stats.norm.ppf(bias_frac)

        # Acceleration term a, from the leave-one-out values of func.
        jack_values = np.asarray(
            [func(np.delete(data, left_out))
             for left_out in range(len(data))], dtype=float)
        deviations = np.mean(jack_values) - jack_values
        denominator = np.sum(deviations ** 2)
        if denominator > 0:
            a = np.sum(deviations ** 3) / (6 * denominator ** 1.5)
        else:
            # All leave-one-out values agree: no skew to correct for.
            a = 0.0

        def adjusted(level):
            # BCa percentile for a nominal tail level.
            z_level = z + stats.norm.ppf(level)
            return stats.norm.cdf(z + z_level / (1 - a * z_level))

        # Each bound gets its own adjusted percentile.
        alpha_low, alpha_high = adjusted(alpha), adjusted(1 - alpha)
        if np.isnan(alpha_low) or np.isnan(alpha_high):
            raise ValueError("BCa percentiles are undefined for these "
                             "bootstrapped statistics.")

    low_index = int(np.floor(alpha_low * n_statistics))
    high_index = int(np.ceil(alpha_high * n_statistics))

    # Correct for 0 based indexing and keep both positions inside the array
    low_index = min(max(low_index - 1, 0), n_statistics - 1)
    high_index = min(max(high_index - 1, 0), n_statistics - 1)
    low_value = sorted_statistics[low_index]
    high_value = sorted_statistics[high_index]
    return (low_value, high_value)
246 | 
247 | 
def two_sample_testing(sampleA, sampleB,
                       statistic_func=None, n_samples=50):
    """
    Bootstrap test of whether two samples share a distribution.

    Both samples are pooled, the pool is resampled n_samples times and
    split back into groups of the original sizes; the result is the share
    of resamples whose statistic is strictly greater than the observed one.

    Args:
    ---------
    sampleA : np.array
        Array of data from sample A
    sampleB : np.array
        Array of data from sample B
    statistic_func : function
        Function taking two arrays (one per sample) and returning a
        statistic.  Defaults to compare_means, i.e. the equivalent of
            lambda data1, data2: np.mean(data1) - np.mean(data2)
    n_samples : int
        number of bootstrap samples to generate

    Returns:
    ---------
    sig_lvl : float
        bootstrapped achieved significance level
    """
    compare = compare_means if statistic_func is None else statistic_func

    observed = compare(sampleA, sampleB)
    pooled = np.append(sampleA, sampleB)
    split = len(sampleA)

    def exceeds_observed():
        # One pooled resample, cut into groups shaped like the originals.
        draw = bootstrap_sample(pooled)
        return compare(draw[:split], draw[split:]) > observed

    hits = sum(1 for _ in range(n_samples) if exceeds_observed())
    return hits / float(n_samples)
293 | 
294 | 
def compare_means(sampleA, sampleB):
    """
    Difference between the means of two samples (A minus B)

    Args:
    ---------
    sampleA (np.array): Array of data from sample A
    sampleB (np.array): Array of data from sample B

    Returns:
    ---------
    difference : float
        mean of sampleA minus mean of sampleB
    """
    mean_a, mean_b = np.mean(sampleA), np.mean(sampleB)
    return mean_a - mean_b
311 | 
312 | 
def t_test_statistic(sampleA, sampleB):
    """
    Computes the t test statistic of two samples

    The mean difference (A minus B) is divided by the square root of the
    summed per-sample variances, each scaled by its sample size.  np.var is
    used with its defaults.

    Args:
    ---------
    sampleA : np.array
        Array of data from sample A
    sampleB : np.array
        Array of data from sample B

    Returns:
    ---------
    t_stat : float
        t test statistic of two samples
    """
    mean_gap = compare_means(sampleA, sampleB)
    size_a, size_b = len(sampleA), len(sampleB)
    scaled_variance = np.var(sampleA)/size_a + np.var(sampleB)/size_b
    return mean_gap / scaled_variance**0.5
336 | 


--------------------------------------------------------------------------------