# Repository layout:
#   ├── analyze_sip.py
#   └── log_normal.py
#
# /analyze_sip.py:
# --------------------------------------------------------------------------------
"""Compare estimated and actual task hours from the SiP task dataset.

The script reads ``Sip-task-info.csv``, draws a log-log scatter plot of
estimate vs. actual, plots the distribution of log(actual / estimated),
fits a zero-centred Student's t-distribution to those log-ratios by maximum
likelihood, and plots an inverse Gamma distribution derived from the fitted
parameters.  Summary statistics are printed along the way.
"""
import csv
import math

import matplotlib
import numpy
import scipy.optimize
import scipy.stats
import seaborn
from matplotlib import pyplot

# https://raw.githubusercontent.com/Derek-Jones/SiP_dataset/master/Sip-task-info.csv
DATA_PATH = 'Sip-task-info.csv'

# Only rows whose estimate exceeds this many hours contribute a log-ratio.
MIN_ESTIMATE_HOURS = 7

actuals = []
estimates = []
deltas = []
# newline='' is what the csv module expects for files handed to csv.reader.
with open(DATA_PATH, newline='') as f:
    reader = csv.reader(f)
    header = next(reader)
    while True:
        try:
            row = next(reader)
        except UnicodeDecodeError:
            # Best effort: skip input that cannot be decoded and keep reading.
            continue
        except StopIteration:
            break

        row = dict(zip(header, row))
        estimate, actual = row.get('HoursEstimate'), row.get('HoursActual')
        if estimate is None or actual is None:
            continue
        estimate, actual = float(estimate), float(actual)
        actuals.append(actual)
        estimates.append(estimate)
        # math.log raises ValueError for non-positive input, so a row with a
        # non-positive actual is left out of the log-ratio sample.
        if estimate > MIN_ESTIMATE_HOURS and actual > 0:
            deltas.append(math.log(actual) - math.log(estimate))


matplotlib.style.use('ggplot')

# Scatter plot of estimate vs. actual on log-log axes, with the diagonal
# marking estimate == actual.
pyplot.figure(figsize=(9, 6))
pyplot.scatter(estimates, actuals, alpha=0.05)
pyplot.xscale('log')
pyplot.yscale('log')
pyplot.xlabel('Estimated number of hours')
pyplot.ylabel('Actual number of hours')
pyplot.xlim([1e-1, 1e3])
pyplot.ylim([1e-1, 1e3])
pyplot.plot([1e-1, 1e3], [1e-1, 1e3], color='C1', alpha=0.5, lw=3, label='Estimated=actual')
pyplot.legend()
pyplot.tight_layout()
pyplot.savefig('scatter.png')

# Empirical statistics of the ratio actual / estimated.
ratios = numpy.exp(deltas)
print('mean:', numpy.mean(ratios))
print('median:', numpy.median(ratios))
print('p99:', numpy.percentile(ratios, 99))

# Histogram of the log-ratios.
# NOTE(review): seaborn.distplot is deprecated in recent seaborn releases --
# confirm against the installed version before upgrading.
pyplot.figure(figsize=(9, 6))
seaborn.distplot(deltas, kde=False, norm_hist=True, bins=numpy.arange(-5, 5, 0.2) + 0.1)
pyplot.xlabel('log(actual / estimated)')
pyplot.xlim([-5, 5])
pyplot.ylabel('Probability distribution')
pyplot.tight_layout()
pyplot.savefig('distribution.png')


def neg_ll(params):
    """Return the negative log-likelihood of ``deltas`` under a Student's t.

    ``params`` holds (log(df), log(scale)); both are exponentiated so the
    optimizer can search an unconstrained space while df and scale stay
    positive.  The location of the distribution is fixed at 0.
    """
    df, scale = numpy.exp(params)
    return -numpy.sum(scipy.stats.t.logpdf(deltas, df, 0, scale))


params = scipy.optimize.minimize(neg_ll, (0, 0)).x
df, scale = numpy.exp(params)
print('nu:', df)
print('scale:', scale)

# Overlay the fitted density on the histogram figure created above.
xs = numpy.linspace(-10, 10, 1000)
ys = scipy.stats.t.pdf(xs, df, 0, scale)
pyplot.plot(xs, ys, color='C1', lw=3, label='$ \\nu = %.2f, \\sigma = %.2f $' % (df, scale))
pyplot.xlim([-5, 5])
pyplot.legend()
pyplot.title('Best fit of a non-standardized Student\'s t-distribution')
pyplot.tight_layout()
pyplot.savefig('distribution_plus_t.png')

# zs = scipy.stats.t.rvs(df=df, scale=scale, size=1000000)
d = scipy.stats.t(df=df, scale=scale)
xs = numpy.linspace(-30, 30, 1000000)
# Riemann sum of exp(x) * pdf(x) over [-30, 30].  The sum must be scaled by
# the grid step; a plain mean of the integrand would be the integral divided
# by the interval width.
dx = xs[1] - xs[0]
print('mean:', numpy.sum(d.pdf(xs) * numpy.exp(xs)) * dx)
print('median:', numpy.exp(d.ppf(0.5)))
print('p99:', numpy.exp(d.ppf(0.99)))
print('p99.9:', numpy.exp(d.ppf(0.999)))
print('p99.99:', numpy.exp(d.ppf(0.9999)))

# Inverse Gamma distribution built from the fitted t parameters.
# NOTE(review): the mapping from (df, scale) to (a, b) is kept exactly as
# written -- confirm it against the intended derivation.
a = 2*df - 1
b = scale * a
print(a, b)
d = scipy.stats.invgamma(a=a, scale=b)
xs = numpy.linspace(0, 10, 10000)
pyplot.figure(figsize=(9, 6))
pyplot.fill_between(xs, 0*xs, d.pdf(xs), alpha=0.2, color='C0', label='$ \\alpha = %.2f, \\beta = %.2f $' % (a, b))
pyplot.plot(xs, d.pdf(xs), lw=3, color='C0')
pyplot.xlabel('$ \\sigma $')
pyplot.ylabel('Probability distribution')
pyplot.xlim([0, 5])
pyplot.legend()
# The backslash is escaped explicitly; '\s' is an invalid escape sequence.
pyplot.title('Inferred inverse Gamma distribution of $ \\sigma $')
pyplot.tight_layout()
pyplot.savefig('sigma_distribution.png')

# pyplot.figure(figsize=(9, 6))
# ss = scipy.stats.invgamma.rvs(a=a, scale=b, size=10000)
# zs = scipy.stats.norm.rvs(scale=ss)
# zs = numpy.clip(zs, -5, 5)
# seaborn.distplot(zs)
# pyplot.show()
# --------------------------------------------------------------------------------
# /log_normal.py:
# --------------------------------------------------------------------------------
"""Illustrate log-normal blowup factors (actual / estimated).

The script plots a log-normal density with its median and mean, plots normal
densities of the log blowup factor for several standard deviations, prints
median / mean / 99th percentile for individual and summed log-normal tasks,
and plots how median, mean and 99th percentile vary with sigma.
"""
import matplotlib
import numpy
import scipy.stats
from matplotlib import pyplot

matplotlib.style.use('ggplot')


# Plot log-normal distribution
pyplot.figure(figsize=(9, 6))
std = 1
xs = numpy.linspace(0, 12, 1000)
ys = scipy.stats.lognorm.pdf(xs, s=std)
pyplot.plot(xs, ys, color='C0', lw=3)
pyplot.fill_between(xs, ys*0, ys, color='C0', alpha=0.3, label='Distribution')
pyplot.axvline(x=1, color='C1', lw=3, label='Median: 1.00')
# Mean of a log-normal whose underlying normal has mu = 0 and sigma = std;
# computed once and used for both the line position and its label.
mean = numpy.exp(0 + std**2/2)
pyplot.axvline(x=mean, color='C2', lw=3, label='Mean : %.2f' % mean)
pyplot.xlabel('Blowup factor (actual/estimated)')
pyplot.ylabel('Probability distribution')
pyplot.xlim([0, 10])
pyplot.legend()
pyplot.title('Standard deviation $ \\sigma = %.2f $' % std)
pyplot.tight_layout()
pyplot.savefig('log_normal.png')

# Plot normal distributions
pyplot.figure(figsize=(9, 6))
xs = numpy.linspace(-10, 10, 1000)
for std in [0.5, 1, 2]:
    ys = scipy.stats.norm.pdf(xs, scale=std)
    pyplot.plot(xs, ys, lw=3, label='Standard deviation $ \\sigma = %.2f $' % std)
pyplot.xlabel('Logarithm of blowup factor: log(actual/estimated)')
pyplot.ylabel('Probability distribution')
pyplot.legend()
pyplot.xlim([-5, 5])
pyplot.tight_layout()
pyplot.savefig('normal.png')


def add(sizes, stds, n_samples=1000000, seed=None):
    """Summarize log-normal tasks individually and as a sum.

    Each task i is modelled as exp(N(log(sizes[i]), stds[i])).

    Args:
        sizes: Sequence of task sizes; their logarithms are the means of the
            underlying normal distributions.
        stds: Sequence of standard deviations of the underlying normals,
            paired element-wise with ``sizes``.
        n_samples: Number of Monte Carlo draws used for the sum row.
        seed: Optional ``random_state`` forwarded to ``scipy.stats.norm.rvs``
            to make the sum row reproducible; ``None`` keeps the draw
            unseeded.

    Returns:
        A list of ``(median, mean, p99)`` tuples: one closed-form row per
        task, followed by one sampled row for the sum of all tasks.
    """
    mus = numpy.log(sizes)
    rows = [
        (
            numpy.exp(mu),
            numpy.exp(mu + std**2/2),
            numpy.exp(scipy.stats.norm.ppf(0.99, mu, std)),
        )
        for mu, std in zip(mus, stds)
    ]

    # The sum row is estimated by sampling: one column per task, one row per draw.
    rvs = scipy.stats.norm.rvs(mus, stds, size=(n_samples, len(mus)), random_state=seed)
    sums = numpy.sum(numpy.exp(rvs), axis=1)
    rows.append((numpy.median(sums), numpy.mean(sums), numpy.percentile(sums, 99)))

    return rows


for sizes, stds in [([1, 1, 1], [1, 1, 1]),
                    ([1, 1, 1], [0.5, 1, 2]),
                    ([1, 1, 1, 1, 1, 1, 1], [0.5, 0.5, 0.5, 1, 1, 1, 2])]:
    for median, mean, p99 in add(sizes, stds):
        print('%9.2f %9.2f %9.2f' % (median, mean, p99))
    print()


pyplot.figure(figsize=(9, 6))
sigmas = numpy.linspace(0, 5.5, 10000)
medians = numpy.exp(sigmas * 0)
means = numpy.exp(sigmas**2/2)
# The 99th percentile of N(0, sigma) is sigma times the standard normal
# quantile.  Writing it this way avoids passing scale=0 to scipy, which
# yields NaN for the first grid point.
p99s = numpy.exp(sigmas * scipy.stats.norm.ppf(0.99))
print(p99s)
pyplot.plot(sigmas, medians, lw=3, label='Median')
pyplot.plot(sigmas, means, lw=3, label='Mean')
pyplot.plot(sigmas, p99s, lw=3, label='99th percentile')
pyplot.yscale('log')
pyplot.xlabel('$ \\sigma $')
pyplot.ylabel('Blowup factor')
pyplot.xlim([0, 5])
pyplot.legend()
pyplot.tight_layout()
pyplot.savefig('sigmas.png')
# --------------------------------------------------------------------------------