├── .gitignore ├── bayesian_bozo ├── __init__.py └── bayesian_bootstrap.py ├── README.rst ├── setup.py └── tests └── bernoulli_tests.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.tar.gz 3 | *.egg-* 4 | dist 5 | MANIFEST -------------------------------------------------------------------------------- /bayesian_bozo/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | from bayesian_bootstrap import bayesian_bootstrap 3 | from bayesian_bootstrap import test_difference_of_proportions, bayesian_bootstrap_diff, bayesian_bootstrap_lift 4 | from bayesian_bootstrap import _create_unimodal_hpd -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | My implementation of Rubin's Bayesian Bootstrap (detailed here: http://projecteuclid.org/download/pdf_1/euclid.aos/1176345338) 2 | 3 | # Personal reminder on how to update git repo 4 | 5 | Run this to create a new pip install 6 | 7 | tar czf BayesianBozo.tar.gz bayesian_bozo 8 | 9 | then 10 | 11 | pip install -e . 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup(name='BayesianBozo', 4 | version='0.1.0', 5 | author_email='darkxanthos@gmail.com', 6 | description='Bayesian statistical functions for Python', 7 | long_description='Bayesian statistical functions and algorithms Ive needed to build as I delve deeper into Bayesian Statistics', 8 | author='Justin Bozonier', 9 | url='https://github.com/jcbozonier/BayesianBozo', 10 | packages=['bayesian_bozo'], 11 | keywords='bayes statistics probability inferrence bootstrap') -------------------------------------------------------------------------------- /bayesian_bozo/bayesian_bootstrap.py: -------------------------------------------------------------------------------- 1 | import numpy, random, itertools, math 2 | 3 | def hdp_for(rounded_values, level=.95): 4 | groupings = {} 5 | for value in rounded_values: 6 | if not value in groupings: 7 | groupings[value] = 0 8 | groupings[value] += 1 9 | sorted_lifts = sorted(groupings.items(), key=lambda x: x[1], reverse=False) 10 | count = len(rounded_values) 11 | target_hdp_count = int(level*count) 12 | current_hdp_count = 0 13 | hdp_list = [] 14 | while current_hdp_count < target_hdp_count: 15 | pair = sorted_lifts.pop() 16 | hdp_list.append(pair[0]) 17 | current_hdp_count += pair[1] 18 | return [min(hdp_list), max(hdp_list)] 19 | 20 | def fast_mean_sample(hypotheses, observations): 21 | p_hypotheses = numpy.random.dirichlet(observations) 22 | return (p_hypotheses*hypotheses).sum() 23 | 24 | def bayesian_bootstrap(numbers, sample_count=2500): 25 | for i in numbers: 26 | if not type(i) is int: 27 | raise TypeError('All data must be integers.') 28 | histogram = list((k, len(list(g))) for k, g in itertools.groupby(sorted(numbers))) 29 | keys = map(lambda x: x[0], histogram) 30 | counts = map(lambda x: x[1], histogram) 31 
| mean_samples = [fast_mean_sample(keys, counts) for i in range(0,sample_count)] 32 | mean_mean = numpy.mean(mean_samples) 33 | return {'mean_samples': mean_samples, 'expected_value':mean_mean, 'hdp_interval':hdp_for(mean_samples)} 34 | 35 | def bayesian_bootstrap_diff(control_numbers, variant_numbers, sample_count=2500): 36 | if len(control_numbers) == 0 or len(variant_numbers) == 0: 37 | raise RuntimeError('Must have at least one data point for control data') 38 | 39 | control_sampled_data = bayesian_bootstrap(control_numbers) 40 | variant_sampled_data = bayesian_bootstrap(variant_numbers) 41 | 42 | sampled_mean_lifts = [] 43 | 44 | for i in range(0,sample_count): 45 | sampled_control_mean = random.choice(control_sampled_data['mean_samples']) 46 | sampled_variant_mean = random.choice(variant_sampled_data['mean_samples']) 47 | sampled_mean_lifts.append(sampled_variant_mean - sampled_control_mean) 48 | 49 | hdp = hdp_for(sampled_mean_lifts) 50 | 51 | return { 52 | 'mean_diff':numpy.mean(sampled_mean_lifts), 53 | 'diff_samples': sampled_mean_lifts, 54 | 'is_significant': 0. < hdp[0] and 0. < hdp[1], 55 | 'hdp': hdp 56 | } 57 | 58 | def _compute_bootstrapped_lift_data(control_successes, control_population, variant_successes, variant_population): 59 | samples = [] 60 | for i in range(0,2500): 61 | control_rate_sample = numpy.random.beta(1 + control_successes, 1 + control_population - control_successes) 62 | variant_rate_sample = numpy.random.beta(1 + variant_successes, 1 + variant_population - variant_successes) 63 | if control_rate_sample == 0: 64 | if variant_rate_sample == 0: 65 | samples.append(0.) 
66 | else: 67 | samples.append(float('inf')) 68 | else: 69 | samples.append(variant_rate_sample/control_rate_sample - 1) 70 | return dict((k, len(list(g))) for k, g in itertools.groupby(sorted(samples))) 71 | 72 | def _create_unimodal_hpd(distribution): 73 | total_observations = 1.*sum(distribution.values()) 74 | sorted_bins = [(x[0], x[1]/total_observations) for x in sorted(distribution.items(), key=lambda x: x[1], reverse=True)] 75 | min_value = None 76 | max_value = None 77 | current_level = 0 78 | current_index = 0 79 | while current_level < .95: 80 | i = sorted_bins[current_index] 81 | if min_value == None: 82 | min_value = i[0] 83 | if max_value == None: 84 | max_value = i[0] 85 | if i[0] < min_value: 86 | min_value = i[0] 87 | if i[0] > max_value: 88 | max_value = i[0] 89 | current_level += i[1] 90 | current_index += 1 91 | return [min_value, max_value] 92 | 93 | def test_difference_of_proportions(control_successes, control_population, variant_successes, variant_population): 94 | if control_population == 0 or variant_population == 0: 95 | raise RuntimeError('There must be at least one observation in both control and variant populations.') 96 | lift_distribution = _compute_bootstrapped_lift_data(control_successes, control_population, variant_successes, variant_population) 97 | unimodal_hpd = _create_unimodal_hpd(lift_distribution) 98 | return { 99 | 'is_significant':0 < unimodal_hpd[0] and 0 < unimodal_hpd[1], 100 | 'lift':{ 101 | 'lower_bound':unimodal_hpd[0], 102 | 'upper_bound':unimodal_hpd[1] 103 | } 104 | } 105 | 106 | def bayesian_bootstrap_lift(control_numbers, variant_numbers, sample_count=2500): 107 | if len(control_numbers) == 0 or len(variant_numbers) == 0: 108 | raise RuntimeError('Must have at least one data point for control data') 109 | control_sampled_data = bayesian_bootstrap(control_numbers) 110 | variant_sampled_data = bayesian_bootstrap(variant_numbers) 111 | 112 | sampled_mean_lifts = [] 113 | 114 | for i in range(0,sample_count): 115 | 
sampled_control_mean = random.choice(control_sampled_data['mean_samples']) 116 | sampled_variant_mean = random.choice(variant_sampled_data['mean_samples']) 117 | if sampled_control_mean == 0. and sampled_variant_mean != 0.: 118 | sampled_mean_lifts.append(float('inf')) 119 | elif sampled_control_mean == 0. and sampled_variant_mean == 0.: 120 | sampled_mean_lifts.append(0.) 121 | else: 122 | sampled_mean_lifts.append((sampled_variant_mean-sampled_control_mean)/(1.*sampled_control_mean)) 123 | 124 | hdp = hdp_for(sampled_mean_lifts) 125 | 126 | return { 127 | 'mean_lift':numpy.mean(sampled_mean_lifts), 128 | 'lift_samples': sampled_mean_lifts, 129 | 'is_significant': 0. < hdp[0] and 0. < hdp[1], 130 | 'hdp': hdp 131 | } 132 | -------------------------------------------------------------------------------- /tests/bernoulli_tests.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | 3 | import sys, numpy, math 4 | sys.path.append("/Users/justin/Documents/Code/BayesianBozo") 5 | 6 | import bayesian_bozo 7 | import itertools 8 | 9 | @raises(RuntimeError) 10 | def no_observations_for_control_and_variant_test(): 11 | bayesian_bozo.test_difference_of_proportions(0,0,0,0) 12 | 13 | @raises(RuntimeError) 14 | def no_observations_for_variant_test(): 15 | bayesian_bozo.test_difference_of_proportions(0,1,0,0) 16 | 17 | @raises(RuntimeError) 18 | def no_observations_for_control_test(): 19 | bayesian_bozo.test_difference_of_proportions(0,0,0,1) 20 | 21 | def one_observation_for_each_no_success_test(): 22 | result = bayesian_bozo.test_difference_of_proportions(0,1,0,1) 23 | assert 'is_significant' in result 24 | assert result['is_significant'] == False 25 | assert 'lift' in result 26 | assert 'lower_bound' in result['lift'] 27 | assert 'upper_bound' in result['lift'] 28 | assert result['lift']['lower_bound'] < 0 and 0 < result['lift']['upper_bound'] 29 | 30 | def obvious_increased_lift_test(): 31 | result = 
bayesian_bozo.test_difference_of_proportions(0,10,10,10) 32 | assert result['is_significant'] == True 33 | assert 0 < result['lift']['lower_bound'] and 0 < result['lift']['upper_bound'], "should provide an HPD interval with zero excluded." 34 | 35 | def ambiguous_lift_test(): 36 | result = bayesian_bozo.test_difference_of_proportions(10,10,10,10) 37 | assert result['is_significant'] == False, 'Should never be statistically significant.' 38 | assert result['lift']['lower_bound'] < 0 and 0 < result['lift']['upper_bound'], "should provide an HPD interval with zero included." 39 | 40 | @raises(RuntimeError) 41 | def bayesian_bootstrap_diff_with_no_control_data_test(): 42 | bayesian_bozo.bayesian_bootstrap_diff([],[1,2,3,4]) 43 | 44 | @raises(RuntimeError) 45 | def bayesian_bootstrap_diff_with_no_variant_data_test(): 46 | bayesian_bozo.bayesian_bootstrap_diff([1,2,3,4],[]) 47 | 48 | def bayesian_bootstrap_diff_with_single_data_point_test(): 49 | sample_count = 2500 50 | result = bayesian_bozo.bayesian_bootstrap_diff([1], [2], sample_count=sample_count) 51 | assert result['mean_diff'] == 1.0 52 | assert sum(result['diff_samples']) == sample_count, 'Should only see a lift of 1: {0}'.format(sum(result['diff_samples'])) 53 | 54 | def bayesian_bootstrap_diff_with_two_data_point_test(): 55 | result = bayesian_bozo.bayesian_bootstrap_diff([1,2], [3,4]) 56 | assert result['mean_diff'] > 1.0 57 | assert not 0 in result['diff_samples'], 'Should be impossible to see a lift of zero with this data.' 
58 | assert result['is_significant'] == True 59 | 60 | def bayesian_bootstrap_diff_with_two_discrete_normal_distributions_test(): 61 | control_data = [int(math.floor(i)) for i in numpy.random.normal(0,20,100)] 62 | variant_data = [int(math.floor(i)) for i in numpy.random.normal(0,20,100)] 63 | result = bayesian_bozo.bayesian_bootstrap_diff(control_data, variant_data) 64 | assert result['is_significant'] == False 65 | assert not float('-inf') in result['hdp'] and not float('inf') in result['hdp'], 'Should have somewhat of an idea of a range.' 66 | assert result['hdp'][0] < 0 67 | 68 | def bayesian_bootstrap_diff_with_two_discrete_normal_distributions_more_samples_test(): 69 | control_data = [int(round(i)) for i in numpy.random.normal(0,20,2000)] 70 | variant_data = [int(round(i)) for i in numpy.random.normal(0,20,2000)] 71 | result = bayesian_bozo.bayesian_bootstrap_diff(control_data, variant_data) 72 | assert result['is_significant'] == False 73 | assert not float('-inf') in result['hdp'] and not float('inf') in result['hdp'], 'Should have somewhat of an idea of a range.' 74 | assert result['hdp'][0] > -5, 'Should be smaller range than this.' 75 | assert result['hdp'][1] < 5, 'Should be smaller range than this.' 
# Continuation of /tests/bernoulli_tests.py.  These tests rely on the names
# brought into scope at the top of that file: ``raises``, ``numpy`` and
# ``bayesian_bozo``.


def bayesian_bootstrap_diff_with_two_discrete_normals_shifted_by_5_test():
    control_data = [int(round(i)) for i in numpy.random.normal(0, 20, 1000)]
    variant_data = [int(round(i)) for i in numpy.random.normal(5, 20, 1000)]
    result = bayesian_bozo.bayesian_bootstrap_diff(control_data, variant_data)
    assert result['is_significant']
    # The true shift of 5 should fall inside the reported interval.
    assert result['hdp'][0] <= 5 and 5 <= result['hdp'][1]


@raises(RuntimeError)
def bayesian_bootstrap_lift_with_no_control_data_test():
    bayesian_bozo.bayesian_bootstrap_lift([], [1, 2, 3, 4])


@raises(RuntimeError)
def bayesian_bootstrap_lift_with_no_variant_data_test():
    bayesian_bozo.bayesian_bootstrap_lift([1, 2, 3, 4], [])


def bayesian_bootstrap_lift_with_single_data_point_test():
    sample_count = 2500
    result = bayesian_bozo.bayesian_bootstrap_lift([1], [2], sample_count=sample_count)
    assert result['mean_lift'] == 1.0
    assert len(result['lift_samples']) == sample_count, 'Should only see a lift of 1: {0}'.format(len(result['lift_samples']))


def bayesian_bootstrap_lift_with_two_data_point_test():
    result = bayesian_bozo.bayesian_bootstrap_lift([1, 2], [3, 4])
    assert result['mean_lift'] > 1.0
    assert 0 not in result['lift_samples'], 'Should be impossible to see a lift of zero with this data.'
    assert result['is_significant']


def bayesian_bootstrap_lift_with_two_discrete_poisson_distributions_quite_different_test():
    # Build real lists: on Python 3 ``map`` returns a one-shot iterator,
    # which has no len() and can only be traversed once.
    control_data = [int(i) for i in numpy.random.poisson(5, 100)]
    variant_data = [int(i) for i in numpy.random.poisson(10, 100)]
    result = bayesian_bozo.bayesian_bootstrap_lift(control_data, variant_data)
    print(result)
    assert result['is_significant']
    assert float('-inf') not in result['hdp'] and float('inf') not in result['hdp'], 'Should have somewhat of an idea of a range.'
    assert result['hdp'][0] > 0., 'Should be smaller range than this.'
    assert result['hdp'][1] < 3., 'Should be smaller range than this.'


def bayesian_bootstrap_lift_with_two_discrete_poisson_distributions_not_different_test():
    control_data = [int(i) for i in numpy.random.poisson(5, 100)]
    variant_data = [int(i) for i in numpy.random.poisson(5, 100)]
    result = bayesian_bozo.bayesian_bootstrap_lift(control_data, variant_data)
    print(result)
    assert not result['is_significant']
    assert float('-inf') not in result['hdp'] and float('inf') not in result['hdp'], 'Should have somewhat of an idea of a range.'
    assert result['hdp'][0] <= 0., 'Should be smaller range than this.'
    assert result['hdp'][1] >= 0., 'Should be smaller range than this.'


def bayesian_bootstrap_lift_with_limited_data_no_lift_test():
    control_data = [0, 1]
    variant_data = [0, 1, 2]
    result = bayesian_bozo.bayesian_bootstrap_lift(control_data, variant_data)
    print(result)
    assert not result['is_significant']


def bayesian_bootstrap_lift_with_limited_data_no_lift_both_zero_test():
    control_data = [0]
    variant_data = [0]
    result = bayesian_bozo.bayesian_bootstrap_lift(control_data, variant_data)
    print(result)
    assert not result['is_significant']