├── analyze_sip.py
└── log_normal.py


/analyze_sip.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | import math
  3 | import matplotlib
  4 | import numpy
  5 | import scipy.optimize
  6 | import scipy.stats
  7 | import seaborn
  8 | from matplotlib import pyplot
  9 | 
 10 | # https://raw.githubusercontent.com/Derek-Jones/SiP_dataset/master/Sip-task-info.csv
 11 | actuals = []
 12 | estimates = []
 13 | deltas = []
 14 | with open('Sip-task-info.csv') as f:
 15 |     reader = csv.reader(f)
 16 |     header = next(reader)
 17 |     while True:
 18 |         try:
 19 |             row = next(reader)
 20 |         except UnicodeDecodeError:
 21 |             continue
 22 |         except StopIteration:
 23 |             break
 24 | 
 25 |         row = dict(zip(header, row))
 26 |         estimate, actual = row.get('HoursEstimate'), row.get('HoursActual')
 27 |         if estimate is None or actual is None:
 28 |             continue
 29 |         estimate, actual = float(estimate), float(actual)
 30 |         actuals.append(actual)
 31 |         estimates.append(estimate)
 32 |         if estimate > 7:
 33 |             deltas.append(math.log(actual) - math.log(estimate))
 34 | 
 35 | 
 36 | matplotlib.style.use('ggplot')
 37 | 
 38 | pyplot.figure(figsize=(9, 6))
 39 | pyplot.scatter(estimates, actuals, alpha=0.05)
 40 | pyplot.xscale('log')
 41 | pyplot.yscale('log')
 42 | pyplot.xlabel('Estimated number of hours')
 43 | pyplot.ylabel('Actual number of hours')
 44 | pyplot.xlim([1e-1, 1e3])
 45 | pyplot.ylim([1e-1, 1e3])
 46 | pyplot.plot([1e-1, 1e3], [1e-1, 1e3], color='C1', alpha=0.5, lw=3, label='Estimated=actual')
 47 | pyplot.legend()
 48 | pyplot.tight_layout()
 49 | pyplot.savefig('scatter.png')
 50 | 
 51 | print('mean:', numpy.mean(numpy.exp(deltas)))
 52 | print('median:', numpy.median(numpy.exp(deltas)))
 53 | print('p99:', numpy.percentile(numpy.exp(deltas), 99))
 54 | 
 55 | pyplot.figure(figsize=(9, 6))
 56 | seaborn.distplot(deltas, kde=False, norm_hist=True, bins=numpy.arange(-5, 5, 0.2) + 0.1)
 57 | pyplot.xlabel('log(actual / estimated)')
 58 | pyplot.xlim([-5, 5])
 59 | pyplot.ylabel('Probability distribution')
 60 | pyplot.tight_layout()
 61 | pyplot.savefig('distribution.png')
 62 | 
 63 | def neg_ll(params):
 64 |     df, scale = numpy.exp(params)
 65 |     return -numpy.sum(scipy.stats.t.logpdf(deltas, df, 0, scale))
 66 | 
 67 | params = scipy.optimize.minimize(neg_ll, (0, 0)).x
 68 | df, scale = numpy.exp(params)
 69 | print('nu:', df)
 70 | print('scale:', scale)
 71 | 
 72 | xs = numpy.linspace(-10, 10, 1000)
 73 | std = 0.5
 74 | ys = scipy.stats.t.pdf(xs, df, 0, scale)
 75 | pyplot.plot(xs, ys, color='C1', lw=3, label='$ \\nu = %.2f, \\sigma = %.2f $' % (df, scale))
 76 | pyplot.xlim([-5, 5])
 77 | pyplot.legend()
 78 | pyplot.title('Best fit of a non-standardized Student\'s t-distribution')
 79 | pyplot.tight_layout()
 80 | pyplot.savefig('distribution_plus_t.png')
 81 | 
 82 | # zs = scipy.stats.t.rvs(df=df, scale=scale, size=1000000)
 83 | d = scipy.stats.t(df=df, scale=scale)
 84 | xs = numpy.linspace(-30, 30, 1000000)
 85 | print('mean:', numpy.mean(d.pdf(xs) * numpy.exp(xs)))
 86 | print('median:', numpy.exp(d.ppf(0.5)))
 87 | print('p99:', numpy.exp(d.ppf(0.99)))
 88 | print('p99.9:', numpy.exp(d.ppf(0.999)))
 89 | print('p99.99:', numpy.exp(d.ppf(0.9999)))
 90 | 
 91 | a = 2*df - 1
 92 | b = scale * a
 93 | print(a, b)
 94 | d = scipy.stats.invgamma(a=a, scale=b)
 95 | xs = numpy.linspace(0, 10, 10000)
 96 | pyplot.figure(figsize=(9, 6))
 97 | pyplot.fill_between(xs, 0*xs, d.pdf(xs), alpha=0.2, color='C0', label='$ \\alpha = %.2f, \\beta = %.2f $' % (a, b))
 98 | pyplot.plot(xs, d.pdf(xs), lw=3, color='C0')
 99 | pyplot.xlabel('$ \\sigma $')
100 | pyplot.ylabel('Probability distribution')
101 | pyplot.xlim([0, 5])
102 | pyplot.legend()
103 | pyplot.title('Inferred inverse Gamma distribution of $ \sigma $')
104 | pyplot.tight_layout()
105 | pyplot.savefig('sigma_distribution.png')
106 | 
107 | #pyplot.figure(figsize=(9, 6))
108 | #ss = scipy.stats.invgamma.rvs(a=a, scale=b, size=10000)
109 | #zs = scipy.stats.norm.rvs(scale=ss)
110 | #zs = numpy.clip(zs, -5, 5)
111 | #seaborn.distplot(zs)
112 | #pyplot.show()
113 | 


--------------------------------------------------------------------------------
/log_normal.py:
--------------------------------------------------------------------------------
 1 | import matplotlib
 2 | import numpy
 3 | import scipy.stats
 4 | from matplotlib import pyplot
 5 | 
 6 | matplotlib.style.use('ggplot')
 7 | 
 8 | 
 9 | # Plot log-normal distribution
10 | pyplot.figure(figsize=(9, 6))
11 | std = 1
12 | xs = numpy.linspace(0, 12, 1000)
13 | ys = scipy.stats.lognorm.pdf(xs, s=std)
14 | pyplot.plot(xs, ys, color='C0', lw=3)
15 | pyplot.fill_between(xs, ys*0, ys, color='C0', alpha=0.3, label='Distribution')
16 | pyplot.axvline(x=1, color='C1', lw=3, label='Median: 1.00')
17 | mean = numpy.exp(0 + std**2/2)
18 | pyplot.axvline(x=numpy.exp(0 + std**2/2), color='C2', lw=3, label='Mean : %.2f' % mean)
19 | pyplot.xlabel('Blowup factor (actual/estimated)')
20 | pyplot.ylabel('Probability distribution')
21 | pyplot.xlim([0, 10])
22 | pyplot.legend()
23 | pyplot.title('Standard deviation $ \\sigma = %.2f $' % std)
24 | pyplot.tight_layout()
25 | pyplot.savefig('log_normal.png')
26 | 
# Zero-mean normal densities for a few standard deviations, one curve each
pyplot.figure(figsize=(9, 6))
log_blowup = numpy.linspace(-10, 10, 1000)
for std in (0.5, 1, 2):
    pyplot.plot(
        log_blowup,
        scipy.stats.norm.pdf(log_blowup, scale=std),
        lw=3,
        label='Standard deviation $ \\sigma = %.2f $' % std,
    )
pyplot.xlabel('Logarithm of blowup factor: log(actual/estimated)')
pyplot.ylabel('Probability distribution')
pyplot.legend()
pyplot.xlim([-5, 5])
pyplot.tight_layout()
pyplot.savefig('normal.png')
39 | 
40 | 
def add(sizes, stds, n_samples=1000000, random_state=None):
    """Summarize log-normal task durations and their simulated total.

    Each task ``i`` is modelled as ``exp(N(log(sizes[i]), stds[i]))``.

    Args:
        sizes: Median duration of each task (must be positive, since the
            logarithm is taken).
        stds: Standard deviation of the log-duration of each task, aligned
            with ``sizes``.
        n_samples: Number of Monte-Carlo draws used for the total. Defaults
            to the previously hard-coded 1000000.
        random_state: Optional seed or generator forwarded to
            ``scipy.stats.norm.rvs`` to make the simulated row reproducible.

    Returns:
        A list of ``(median, mean, p99)`` tuples: one closed-form row per
        task, followed by one row estimated from the sampled sums.
    """
    mus = numpy.log(sizes)
    rows = []
    for mu, std in zip(mus, stds):
        # Closed-form log-normal mean and 99th percentile.
        mean = numpy.exp(mu + std**2/2)
        p99 = numpy.exp(scipy.stats.norm.ppf(0.99, mu, std))
        rows.append((numpy.exp(mu), mean, p99))

    # The sum of log-normals has no closed form here, so it is sampled:
    # one column per task, one row per draw.
    rvs = scipy.stats.norm.rvs(
        mus, stds, size=(n_samples, len(mus)), random_state=random_state)
    sums = numpy.sum(numpy.exp(rvs), axis=1)
    rows.append((numpy.median(sums), numpy.mean(sums), numpy.percentile(sums, 99)))

    return rows
54 | 
55 | 
56 | 
# Each scenario pairs task medians with the std of their log-durations.
scenarios = [
    ([1, 1, 1], [1, 1, 1]),
    ([1, 1, 1], [0.5, 1, 2]),
    ([1, 1, 1, 1, 1, 1, 1], [0.5, 0.5, 0.5, 1, 1, 1, 2]),
]
for sizes, stds in scenarios:
    # One line per (median, mean, p99) row, then a blank separator line.
    for row in add(sizes, stds):
        print('%9.2f %9.2f %9.2f' % row)
    print()
63 | 
64 | 
# Median, mean and 99th percentile of exp(N(0, sigma)) as a function of sigma.
pyplot.figure(figsize=(9, 6))
sigmas = numpy.linspace(0, 5.5, 10000)
medians = numpy.exp(sigmas * 0)
means = numpy.exp(sigmas**2/2)
# The 99th percentile of N(0, sigma) is sigma times the standard-normal one.
# Scaling by hand keeps the sigma = 0 grid point well defined (factor 1);
# passing scale=0 to scipy.stats.norm.ppf produces nan there instead.
p99s = numpy.exp(sigmas * scipy.stats.norm.ppf(0.99))
print(p99s)
pyplot.plot(sigmas, medians, lw=3, label='Median')
pyplot.plot(sigmas, means, lw=3, label='Mean')
pyplot.plot(sigmas, p99s, lw=3, label='99th percentile')
pyplot.yscale('log')
pyplot.xlabel('$ \\sigma $')
pyplot.ylabel('Blowup factor')
pyplot.xlim([0, 5])
pyplot.legend()
pyplot.tight_layout()
pyplot.savefig('sigmas.png')
82 | 


--------------------------------------------------------------------------------