"""
Estimate the minimal sample size for approximating a population mean.

This simple Python program computes an estimate for the minimal
sample size to use in order to approximate the population mean
with a certain level of confidence and error tolerance.

The script can be run from the command line. For basic usage help run

    python samplesize.py -h
"""

import argparse


# Two-sided z-scores for common confidence levels, i.e. the values of
# scipy.stats.norm.ppf(1 - alpha/2) with alpha = 1 - confidence_level,
# rounded to three decimals. Keeping the table lets the common cases
# work without importing scipy.
_Z_SCORES = {
    .90: 1.645,
    .91: 1.695,
    .92: 1.751,
    .93: 1.812,
    .94: 1.881,
    .95: 1.96,
    .96: 2.054,
    .97: 2.17,
    .98: 2.326,
    .99: 2.576,
}


def sampleSize(
    population_size,
    margin_error=.05,
    confidence_level=.99,
    sigma=1/2
):
    """
    Calculate the minimal sample size to use to achieve a certain
    margin of error and confidence level for a sample estimate
    of the population mean.

    Inputs
    -------
    population_size: integer
        Total size of the population that the sample is to be drawn from.

    margin_error: number
        Maximum expected difference between the true population parameter,
        such as the mean, and the sample estimate.

    confidence_level: number in the interval (0, 1)
        If we were to draw a large number of equal-size samples
        from the population, the true population parameter
        should lie within this percentage
        of the intervals (sample_parameter - e, sample_parameter + e)
        where e is the margin_error.

    sigma: number
        The standard deviation of the population. For the case
        of estimating a parameter in the interval [0, 1], sigma=1/2
        should be sufficient.

    Returns
    -------
    number
        The (generally non-integer) estimated minimal sample size.

    Raises
    ------
    ValueError
        If confidence_level is not strictly between 0 and 1.
    """
    if not 0 < confidence_level < 1:
        raise ValueError(
            'confidence_level must lie strictly between 0 and 1, got %r'
            % (confidence_level,)
        )

    z = _Z_SCORES.get(confidence_level)
    if z is None:
        # Only pay for the scipy import when the level is not tabulated.
        from scipy.stats import norm
        alpha = 1 - confidence_level
        z = norm.ppf(1 - (alpha / 2))

    spread = z**2 * sigma**2
    # Same quantity as
    #     (spread * N / (N - 1)) / (M**2 + spread / (N - 1))
    # with numerator and denominator multiplied by (N - 1), so that a
    # population of size 1 yields 1.0 instead of dividing by zero.
    return (population_size * spread) / (
        (population_size - 1) * margin_error**2 + spread
    )


def _build_parser():
    """Create the command-line argument parser for this script."""
    parser = argparse.ArgumentParser(
        description=(
            'Estimate the minimal sample size needed to approximate '
            'the population mean.'
        )
    )
    parser.add_argument(
        '-s',
        '--size',
        type=int,
        required=True,
        help='Population size the sample is drawn from.'
    )
    parser.add_argument(
        '-e',
        '--error',
        type=float,
        default=.05,
        help='Margin of error.'
    )
    parser.add_argument(
        '-c',
        '--confidence',
        type=float,
        default=.99,
        help='Confidence level.'
    )
    parser.add_argument(
        '-d',
        '--deviation',
        type=float,
        default=1/2,
        help='Population standard deviation, if known.'
    )
    return parser


def main(argv=None):
    """
    Parse command-line options, print the estimated sample size and
    return it.

    argv: list of strings or None
        Arguments to parse; None means the arguments of the running
        process (sys.argv[1:]), which is argparse's default.
    """
    args = _build_parser().parse_args(argv)
    n = sampleSize(args.size, args.error, args.confidence, args.deviation)
    print(n)
    return n


if __name__ == '__main__':
    main()