├── BayesDataAnalysisWithPymc
│   ├── Data
│   │   ├── McIntyre1994data.csv
│   │   └── McDonaldSK1991data.txt
│   ├── short_hdi.py
│   ├── YmetricXsinglePyMC.py
│   ├── BernTwoPyMC.py
│   ├── plot_post.py
│   ├── BernBetaMuKappaPyMC.py
│   ├── normalize.py
│   ├── SimpleLinearRegressionPyMC.py
│   └── ANOVAOnewayPyMC.py
├── LICENSE.md
└── README.md

/BayesDataAnalysisWithPymc/Data/McIntyre1994data.csv:
--------------------------------------------------------------------------------
1 | Brand,Tar,Nic,Wt,CO
2 | Alpine,14.1,0.86,0.9853,13.6
3 | BensonAndHedges,16.0,1.06,1.0938,16.6
4 | BullDurham,29.8,2.03,1.1650,23.5
5 | CamelLights,8.0,0.67,0.9280,10.2
6 | Carlton,4.1,0.40,0.9462,5.4
7 | Chesterfield,15.0,1.04,0.8885,15.0
8 | GoldenLights,8.8,0.76,1.0267,9.0
9 | Kent,12.4,0.95,0.9225,12.3
10 | Kool,16.6,1.12,0.9372,16.3
11 | LandM,14.9,1.02,0.8858,15.4
12 | LarkLights,13.7,1.01,0.9643,13.0
13 | Marlboro,15.1,0.90,0.9316,14.4
14 | Merit,7.8,0.57,0.9705,10.0
15 | MultiFilter,11.4,0.78,1.1240,10.2
16 | NewportLights,9.0,0.74,0.8517,9.5
17 | Now,1.0,0.13,0.7851,1.5
18 | OldGold,17.0,1.26,0.9186,18.5
19 | PallMallLight,12.8,1.08,1.0395,12.6
20 | Raleigh,15.8,0.96,0.9573,17.5
21 | SalemUltra,4.5,0.42,0.9106,4.9
22 | Tareyton,14.5,1.01,1.0070,15.9
23 | True,7.3,0.61,0.9806,8.5
24 | ViceroyRichLight,8.6,0.69,0.9693,10.6
25 | VirginiaSlims,15.2,1.02,0.9496,13.9
26 | WinstonLights,12.0,0.82,1.1184,14.9
27 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2013, Erikson Kaszubowski
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 
--------------------------------------------------------------------------------
/BayesDataAnalysisWithPymc/short_hdi.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''Algorithm to calculate the shortest Highest Density Interval
3 | (HDI). Adaptation of the R code from "Doing Bayesian Data Analysis",
4 | by John K. Kruschke.
5 | More info: http://doingbayesiandataanalysis.blogspot.com.br/
6 | 
7 | '''
8 | 
9 | 
10 | def short_hdi(sample, cred=0.95):
11 |     '''Calculate the shortest Highest Density Interval from
12 |     the posterior distribution sampled via MCMC.
13 | 
14 |     :Arguments:
15 |     sample: A list with the values of the posterior distribution.
16 |     cred: The mass of the posterior for which the interval is computed.
17 | Default is 95%, should be a float from 0.0 to 1.0. 18 | 19 | Returns a tuple with the limits of the HDI. 20 | 21 | PyMC has a 95% HDI algorithm, but it uses quantiles. 22 | 23 | ''' 24 | sorted_sample = sorted(sample) 25 | ci_index = int(cred * len(sorted_sample)) # Uses 'int()' for R's 'floor()' 26 | num_ci = len(sorted_sample) - ci_index 27 | 28 | ci_width = [] 29 | for i in range(num_ci): 30 | width = sorted_sample[i + ci_index] - sorted_sample[i] 31 | ci_width.append(width) 32 | 33 | hdi_min = sorted_sample[ci_width.index(min(ci_width))] 34 | hdi_max = sorted_sample[ci_width.index(min(ci_width)) + ci_index] 35 | hdi_lim = (hdi_min, hdi_max) 36 | return hdi_lim 37 | -------------------------------------------------------------------------------- /BayesDataAnalysisWithPymc/YmetricXsinglePyMC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Hierarchical Model for inferring the mean (mu) 3 | and precision (tau) of normal likelihood data via MCMC. 4 | Python (PyMC) adaptation of the R code from "Doing Bayesian Data Analysis", 5 | by John K. Krushcke. 6 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 7 | 8 | ''' 9 | from __future__ import division 10 | 11 | import pymc 12 | import numpy as np 13 | from matplotlib import pyplot as plot 14 | from plot_post import plot_post 15 | 16 | # For simplicity's sake, I will generate random data just like 17 | # the R code in the book. 18 | 19 | t_mean = 100 20 | t_sd = 15 21 | N = 20 22 | 23 | # Generate N samples, no rounding. 24 | 25 | y = np.random.normal(t_mean, t_sd, N) 26 | 27 | # Defining the priors for mu and tau. 28 | 29 | mu = pymc.Normal('mu', 0.0, 1.0e-10) # Mean: 0.0, SD: 100000 30 | tau = pymc.Gamma('tau', 0.01, 0.01) # Mean: 1.0, SD: 10 31 | 32 | # Now the likelihood function. 33 | 34 | like = pymc.Normal('like', mu, tau, value=y, observed=True) 35 | 36 | # Create the model, generate initialization values and sample its posterior. 37 | 38 | model = pymc.Model([like, mu, tau]) 39 | map_ = pymc.MAP(model) 40 | map_.fit() 41 | mcmc = pymc.MCMC(model) 42 | mcmc.sample(iter=60000, burn=40000, thin=2) 43 | 44 | # Sample the posterior for the parameter estimates. 45 | 46 | mu_sample = mcmc.trace('mu')[:] 47 | tau_sample = mcmc.trace('tau')[:] 48 | 49 | # Keeping the same idea as the book: convert the posterior samples to SD. 50 | 51 | sigma_sample = 1 / np.sqrt(tau_sample) 52 | 53 | # Plot the results. 54 | 55 | plot.figure(figsize=(8.0, 8.0)) 56 | 57 | plot.subplot(211) 58 | plot_post(mu_sample, title=r'$\mu$ posterior distribution') 59 | 60 | plot.subplot(212) 61 | plot_post(sigma_sample, title=r'$\sigma$ posterior distribution') 62 | 63 | plot.subplots_adjust(wspace=0.2, hspace=0.2) 64 | plot.show() 65 | -------------------------------------------------------------------------------- /BayesDataAnalysisWithPymc/BernTwoPyMC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' Model for inferring two binomial proportions via MCMC. 3 | Python (PyMC) adaptation of the R code from "Doing Bayesian Data Analysis", 4 | by John K. Krushcke. 5 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 6 | 7 | ''' 8 | from __future__ import division 9 | 10 | import pymc 11 | from matplotlib import pyplot as plot 12 | from plot_post import plot_post 13 | 14 | # TODO: It would be good to import data from CSV files. 
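# A rough sketch of how that could look with NumPy (untested; it assumes a
# hypothetical 'Data/BernTwoData.csv' with a header row and one 0/1 column
# per coin, which is not part of this repository):
#
#     import numpy as np
#     flips1, flips2 = np.genfromtxt('Data/BernTwoData.csv', delimiter=',',
#                                    skip_header=1, unpack=True)
#     data = [flips1.astype(int).tolist(), flips2.astype(int).tolist()]
#
# For now, the hard-coded lists below keep the script self-contained.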
15 | 16 | # Model specification in PyMC goes backwards, in comparison to JAGS: 17 | # first the prior are specified, THEN the likelihood function. 18 | 19 | # TODO: With PyMC, it´s possible to define many stochastic variables 20 | # in just one variable name using the 'size' function parameter. 21 | 22 | # But for now, I will use multiple variable names for simplicity. 23 | 24 | theta1 = pymc.Beta('theta1', alpha=3, beta=3) 25 | theta2 = pymc.Beta('theta2', alpha=3, beta=3) 26 | 27 | # Define the observed data. 28 | 29 | data = [[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1], 30 | [1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]] 31 | 32 | # Define the likelihood function for the observed data. 33 | 34 | like1 = pymc.Bernoulli('like1', theta1, observed=True, value=data[0]) 35 | like2 = pymc.Bernoulli('like2', theta2, observed=True, value=data[1]) 36 | 37 | # Use the PyMC 'Model' class to collect all the variables we are interested in. 38 | 39 | model = pymc.Model([theta1, theta2]) 40 | 41 | # And instantiate the MCMC class to sample the posterior. 42 | 43 | mcmc = pymc.MCMC(model) 44 | mcmc.sample(40000, 10000, 1) 45 | 46 | # Use PyMC built-in plot function to show graphs of the samples. 47 | 48 | # pymc.Matplot.plot(mcmc) 49 | # plot.show() 50 | 51 | # Let's try plotting using Matplotlib's 'pyplot'. 52 | # First, we extract the traces for the parameters of interest. 53 | 54 | theta1_samples = mcmc.trace('theta1')[:] 55 | theta2_samples = mcmc.trace('theta2')[:] 56 | theta_diff = theta2_samples - theta1_samples 57 | 58 | # Then, we plot a histogram of their individual sample values. 59 | 60 | plot.figure(figsize=(8.0, 10)) 61 | 62 | plot.subplot(311) 63 | plot_post(theta1_samples, title=r'Posterior of $\theta_1$') 64 | 65 | plot.subplot(312) 66 | plot_post(theta2_samples, title=r'Posterior of $\theta_2$') 67 | 68 | plot.subplot(313) 69 | plot_post(theta_diff, title=r'Posterior of $\Delta\theta$', comp=0.0) 70 | 71 | plot.subplots_adjust(hspace=0.5) 72 | plot.show() 73 | -------------------------------------------------------------------------------- /BayesDataAnalysisWithPymc/plot_post.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Plot the histogram of the posterior distribution sample, 3 | with the mean and the 95% HDI. 4 | Adaptation of the R code from "Doing Bayesian Data Analysis", 5 | by John K. Krushcke. 6 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 7 | 8 | Histogram code based on (copied from!) 'Probabilistic Programming and 9 | Bayesian Methods for Hackers', by Cameron Davidson-Pilon. 10 | More info: https://github.com/CamDavidsonPilon/ 11 | Probabilistic-Programming-and-Bayesian-Methods-for-Hackers 12 | 13 | ''' 14 | 15 | from __future__ import division 16 | 17 | from short_hdi import short_hdi 18 | from matplotlib import pyplot as plot 19 | 20 | 21 | def plot_post(sample, title='Posterior', 22 | cred=0.95, comp=None, *args, **kwargs): 23 | '''Plot the histogram of the posterior distribution sample, 24 | with the mean and the HDI. 25 | 26 | :Arguments: 27 | sample: array of sample values. 28 | cred: credible interval (default: 95%) 29 | comp: value for comparison (default: None) 30 | title: String value for graph title. 31 | 32 | ''' 33 | # First we compute the shortest HDI using Krushcke's algorithm. 34 | 35 | sample_hdi = short_hdi(sample) 36 | 37 | # Then we plot the histogram of the sample. 
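    # (Note: 'normed=True' is the pre-Matplotlib-2.x way of requesting a
    # probability-density histogram; newer releases call this 'density'.)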
38 |     ax = plot.hist(sample,
39 |                    bins=25,
40 |                    alpha=0.85,
41 |                    label='',
42 |                    normed=True)
43 | 
44 |     # Force the y-axis to be limited to 1.1 times the max probability density.
45 |     maxy = 1.1 * max(ax[0])
46 |     plot.ylim(0.0, maxy)
47 | 
48 |     # No y-axis ticks; they are not important here.
49 |     plot.yticks([])
50 | 
51 |     # Should we plot a vertical line on the mean?
52 |     #plot.vlines(sample.mean(), 0, maxy, linestyle='--',
53 |     #            label=r'Mean (%0.3f)' % sample.mean())
54 |     # But we keep the mean value in its right place.
55 | 
56 |     plot.text(sample.mean(), 0.9 * max(ax[0]), 'Mean: %0.3f' % sample.mean())
57 | 
58 |     #plot.legend(loc='upper right') #Legends are cumbersome!
59 |     plot.title(title)
60 | 
61 |     # Plot the HDI as a horizontal line with its limit values.
62 |     plot.hlines(y=0, xmin=sample_hdi[0], xmax=sample_hdi[1], linewidth=6)
63 |     plot.text(sample_hdi[0], max(ax[0]) / 20, '%0.3f' % sample_hdi[0],
64 |               horizontalalignment='center')
65 |     plot.text(sample_hdi[1], max(ax[0]) / 20, '%0.3f' % sample_hdi[1],
66 |               horizontalalignment='center')
67 | 
68 |     # In case there is a comparison value, plot it and
69 |     # compute how much of the posterior falls at each side.
70 |     if comp is not None:
71 |         loc = max(ax[0]) / 2.0
72 |         plot.vlines(comp, 0, loc, color='green', linestyle='--')
73 |         less = 100 * (sum(sample < comp)) / len(sample)
74 |         more = 100 * (sum(sample > comp)) / len(sample)
75 |         print less, more
76 |         plot.text(comp, loc, '%0.1f%% < %0.1f < %0.1f%%' % (less, comp, more),
77 |                   color='green', horizontalalignment='center')
78 | 
79 |     #return ax # I thought the function should return something. It's not needed.
80 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Bayesian Data Analysis *with PyMC*
2 | ======================
3 | 
4 | ###For those in a hurry
5 | 
6 | My first attempt to adapt to Python (using PyMC) the R code from "Doing Bayesian Data Analysis",
7 | by John K. Kruschke.
8 | 
9 | ####Models done so far:
10 | - Inferring two binomial proportions and their difference;
11 | - Hierarchical prior for Bernoulli likelihood;
12 | - Metric variable for a single group;
13 | - Simple linear regression;
14 | - Oneway ANOVA.
15 | 
16 | ###Quick References
17 | >1. "Doing Bayesian Data Analysis", by John K. Kruschke
18 | >[http://doingbayesiandataanalysis.blogspot.com.br/](http://doingbayesiandataanalysis.blogspot.com.br/)
19 | >
20 | >2. "Probabilistic Programming and Bayesian Methods for Hackers", by Cam Davidson-Pilon
21 | >[https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/](https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/)
22 | >
23 | >3. "PyMC", by Christopher Fonnesbeck, Anand Patil and David Huard
24 | >[http://pymc-devs.github.io/pymc/](http://pymc-devs.github.io/pymc/)
25 | 
26 | 
27 | ###For those with some time to spare
28 | 
29 | After many years avoiding any book on statistics, I found myself interested in Bayesian methods
30 | when I first heard about them in the NLTK (Natural Language ToolKit for Python) Book.
31 | I read some introductory texts on Bayes' Theorem, but even though the concepts were somewhat clear, I still
32 | wasn't able to understand how useful it was.
33 | 
34 | Then I found Kruschke's "Doing Bayesian Data Analysis". It was love at first few-pages-glance. Seriously,
35 | the book is great, especially for those who know nothing about statistics, like myself. The language is
36 | accessible, there are tons of examples, the hierarchical models are cleverly illustrated using a unique style...
37 | What are you still doing here? Go buy the book!
38 | 
39 | Ahem, where was I? Oh, yeah. Kruschke's book. Have I mentioned that it uses R and JAGS (or BUGS) to implement
40 | the concepts and models? It's really a great deal: you learn Bayesian statistics, general probability AND R.
41 | But I had problems with R syntax. It's so clumsy, and weird and... Well, I don't like it. Great packages,
42 | but I really prefer Python.
43 | 
44 | Being a Python fan, I had to find a way to adapt the original code. It became possible after I discovered
45 | PyMC in Cam Davidson-Pilon's "Probabilistic Programming and Bayesian Methods for Hackers". Also a great book,
46 | published in an original medium: IPython Notebooks - you read the book, you read (and edit!) the code and run
47 | it all in your favorite browser. Did I mention it is free?
48 | 
49 | So, as an exercise in Bayesian statistics and in Python programming, I adapted some models used by Kruschke
50 | in his book to Python, using PyMC (and NumPy and Matplotlib). This is just a first attempt, and I have only
51 | "translated" Kruschke's models to Python - and in a very non-Pythonic way. There's plenty of room for improvement,
52 | but the code shouldn't break if the data is entered correctly. The code is also heavily commented - after all,
53 | the point here is to exercise the concepts and programming skills.
54 | 
55 | Feel free to fork, modify and send a pull request - all suggestions are welcome!
56 | 
--------------------------------------------------------------------------------
/BayesDataAnalysisWithPymc/Data/McDonaldSK1991data.txt:
--------------------------------------------------------------------------------
1 | # From http://udel.edu/~mcdonald/statanovasig.html
2 | # "Here are some data on a shell measurement (the length of the anterior
3 | # adductor muscle scar, standardized by dividing by length) in the mussel
4 | # Mytilus trossulus from five locations: Tillamook, Oregon; Newport, Oregon;
5 | # Petersburg, Alaska; Magadan, Russia; and Tvarminne, Finland,
6 | # taken from a much larger data set used in McDonald et al. (1991)."
7 | #
8 | # McDonald, J. H., R. Seed and R. K. Koehn. 1991.
9 | # Allozymes and morphometric characters of three species of Mytilus
10 | # in the Northern and Southern Hemispheres.
11 | # Mar. Biol. 111:323-333.
12 | # 13 | # Group code: 14 | # 1=Tillamook,Oregon 15 | # 2=Newport,Oregon 16 | # 3=Petersburg,Alaska 17 | # 4=Magadan,Russia 18 | # 5=Tvarminne,Finland 19 | Group Size Site 20 | 1 0.0571 OregonT 21 | 1 0.0813 OregonT 22 | 1 0.0831 OregonT 23 | 1 0.0976 OregonT 24 | 1 0.0817 OregonT 25 | 1 0.0859 OregonT 26 | 1 0.0735 OregonT 27 | 1 0.0659 OregonT 28 | 1 0.0923 OregonT 29 | 1 0.0836 OregonT 30 | 2 0.0873 OregonN 31 | 2 0.0662 OregonN 32 | 2 0.0672 OregonN 33 | 2 0.0819 OregonN 34 | 2 0.0749 OregonN 35 | 2 0.0649 OregonN 36 | 2 0.0835 OregonN 37 | 2 0.0725 OregonN 38 | 3 0.0974 Alaska 39 | 3 0.1352 Alaska 40 | 3 0.0817 Alaska 41 | 3 0.1016 Alaska 42 | 3 0.0968 Alaska 43 | 3 0.1064 Alaska 44 | 3 0.1050 Alaska 45 | 4 0.1033 Russia 46 | 4 0.0915 Russia 47 | 4 0.0781 Russia 48 | 4 0.0685 Russia 49 | 4 0.0677 Russia 50 | 4 0.0697 Russia 51 | 4 0.0764 Russia 52 | 4 0.0689 Russia 53 | 5 0.0703 Finland 54 | 5 0.1026 Finland 55 | 5 0.0956 Finland 56 | 5 0.0973 Finland 57 | 5 0.1039 Finland 58 | 5 0.1045 Finland 59 | # 60 | # http://udel.edu/~mcdonald/statanovaunplanned.html 61 | # shows that Tukey-Kramer method of unplanned comparisons 62 | # groups 63 | # Newport/Magadan/Tillamook (2/4/1), 64 | # Magadan/Tillamook/Tvarminne (4/1/5), 65 | # and Tvarminne/Petersburg (5/3). 66 | # 67 | # From http://udel.edu/~mcdonald/statanovaplanned.html: 68 | # Really important note about planned comparisons 69 | # Planned comparisons must be planned before you look at the data. If you 70 | # look at some data, pick out an interesting comparison, then analyze it as 71 | # if it were a planned comparison, you will be committing scientific fraud. 72 | # For example, if you look at the mean arch heights for the nine sports, see 73 | # that cross-country has the lowest mean and swimming has the highest mean, 74 | # then compare just those two means, your P-value will be much too low. This 75 | # is because there are 36 possible pairwise comparisons in a set of 9 means. 76 | # You expect 5 percent, or 1 out of 20, tests to be "significant" at the 77 | # P<0.05 level, even if all the data really fit the null hypothesis, so 78 | # there's a good chance that the most extreme comparison in a set of 36 79 | # will have a P-value less than 0.05. 80 | # It would be acceptable to run a pilot experiment and plan your planned 81 | # comparisons based on the results of the pilot experiment. However, if you 82 | # do this you could not include the data from the pilot experiment in the 83 | # analysis; you would have to limit your anova to the new data. -------------------------------------------------------------------------------- /BayesDataAnalysisWithPymc/BernBetaMuKappaPyMC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Hierarchical Model for inferring the mean (mu) 3 | and sample size (kappa) of various Bernoulli trials via MCMC. 4 | Python (PyMC) adaptation of the R code from "Doing Bayesian Data Analysis", 5 | by John K. Krushcke. 6 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 7 | 8 | ''' 9 | from __future__ import division 10 | 11 | import pymc 12 | from matplotlib import pyplot as plot 13 | from plot_post import plot_post 14 | 15 | # For better code flow, we define the data first. 16 | # Based on the original code's 'Therapeutic touch data'. 17 | 18 | z = [1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 19 | 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8] 20 | N = 10 # Number of trials for each z. 
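# The list comprehension below expands each subject's z successes out of N
# trials into explicit Bernoulli outcomes; e.g. z = 3 with N = 10 becomes
# [0, 0, 0, 0, 0, 0, 0, 1, 1, 1] (seven misses followed by three hits).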
21 | 22 | data = [[0] * (N - i) + [1] * i for i in z] # Build the Bernoulli trial data. 23 | 24 | # Again, with PyMC we design the model from top to bottom. 25 | # Let's start defining the hierarchical prior's constants. 26 | # A,B constant for the overall beta distribution for mu. 27 | 28 | a_mu = 2.0 29 | b_mu = 2.0 30 | 31 | # Shape and rate constants for the overall gamma distribution for kappa. 32 | # They are reparametrized as mean and standard deviation. 33 | 34 | s_kappa = 10**2 / 10**2 35 | r_kappa = 10 / 10**2 36 | 37 | # Then, we define the overall beta and gamma distributions. 38 | 39 | mu = pymc.Beta('mu', a_mu, b_mu) 40 | kappa = pymc.Gamma('kappa', s_kappa, r_kappa) 41 | 42 | # Instead of using a 'for' loop for multiple stochastic variables, 43 | # we use the 'size' parameter of PyMC. This is why we defined the data first. 44 | # We could use a '@deterministic' wrapper, but operations already generate it. 45 | 46 | a = mu * kappa 47 | b = (1.0 - mu) * kappa 48 | 49 | theta = pymc.Beta('theta', a, b, size=len(data)) # One beta for each subject. 50 | 51 | # The priors are defined. Now we need to set the likelihood of our data. 52 | # The likelihood can't be defined the same way. We need a 'for' loop. 53 | # Or the 'Lambda()' class. 54 | # For more info: https://github.com/pymc-devs/pymc/issues/319 55 | # 56 | # for i in range(len(data)): 57 | # like_i = pymc.Bernoulli('like_%i' % i, p=theta[i], value=data[i], 58 | # observed=True) 59 | # 60 | # The code above works nicely (the posterior result is the same, and each theta 61 | # is updated with its data. But how does looping the declaration of the same 62 | # variable works? I prefer the following code, since it makes more sense. 63 | 64 | like = [] 65 | for i in range(len(data)): 66 | like.append(pymc.Bernoulli('like_%i' % i, p=theta[i], 67 | value=data[i], observed=True)) 68 | 69 | # Done! Now we need to collect the variables and fit our model. 70 | 71 | model = pymc.Model([theta, mu, kappa]) 72 | 73 | map_ = pymc.MAP(model) 74 | map_.fit() 75 | 76 | mcmc = pymc.MCMC(model) 77 | mcmc.sample(iter=60000, burn=10000, thin=2) 78 | 79 | # Extracting the parameter samples. 80 | 81 | mu_sample = mcmc.trace('mu')[:] 82 | kappa_sample = mcmc.trace('kappa')[:] 83 | theta_sample = mcmc.trace('theta')[:] 84 | 85 | # And plot them. 86 | 87 | plot.figure(figsize=(8.0, 8.0)) 88 | 89 | plot.subplot(221) 90 | plot_post(mu_sample, comp=0.5, title=r'$\mu$ posterior distribution') 91 | 92 | plot.subplot(222) 93 | plot_post(kappa_sample, title=r'$\kappa$ posterior distribution') 94 | 95 | plot.subplot(223) 96 | plot_post(theta_sample[:, 0], title=r'$\theta_1$ posterior distribution') 97 | 98 | plot.subplot(224) 99 | plot_post(theta_sample[:, 27], title=r'$\theta_{28}$ posterior distribution') 100 | 101 | plot.subplots_adjust(wspace=0.2, hspace=0.2) 102 | plot.show() 103 | -------------------------------------------------------------------------------- /BayesDataAnalysisWithPymc/normalize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Function to normalize data and convert parameter back to original scale. 3 | Python (PyMC) adaptation of the R code from "Doing Bayesian Data Analysis", 4 | by John K. Krushcke. 5 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 6 | 7 | ''' 8 | from __future__ import division 9 | import numpy as np 10 | 11 | 12 | def normalize(data): 13 | '''Normalizes a set of data. 
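    Returns the z-scores (data - mean) / sd, where sd is the population
    standard deviation; e.g. normalize([1.0, 2.0, 3.0]) is approximately
    array([-1.2247, 0., 1.2247]).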
14 | 
15 |     '''
16 | 
17 |     mean = np.mean(data)
18 |     sd = np.sqrt(np.var(data))
19 |     z_data = (data - mean) / sd
20 |     return z_data
21 | 
22 | 
23 | def convert_slope(x_data, y_data, zb1_sample):
24 |     '''Converts normalized b1 sample back to original scale.
25 | 
26 |     :Arguments:
27 |     x_data: original predictor data list.
28 |     y_data: original predicted data list.
29 |     zb1_sample: normalized parameter samples.
30 | 
31 |     '''
32 | 
33 |     y_sd = np.sqrt(np.var(y_data))
34 |     x_sd = np.sqrt(np.var(x_data))
35 |     b1 = zb1_sample * (y_sd / x_sd)
36 |     return b1
37 | 
38 | 
39 | def convert_intercept(x_data, y_data, zb0_sample, zb1_sample):
40 |     '''Converts normalized b0 sample back to original scale.
41 | 
42 |     :Arguments:
43 |     x_data: original predictor data list.
44 |     y_data: original predicted data list.
45 |     zb0_sample: normalized parameter samples.
46 |     zb1_sample: normalized parameter samples.
47 | 
48 |     '''
49 | 
50 |     y_sd = np.sqrt(np.var(y_data))
51 |     y_mean = np.mean(y_data)
52 | 
53 |     x_sd = np.sqrt(np.var(x_data))
54 |     x_mean = np.mean(x_data)
55 | 
56 |     b0 = zb0_sample * y_sd + y_mean - zb1_sample * (y_sd * x_mean) / x_sd
57 |     return b0
58 | 
59 | 
60 | def convert_tau_sigma(y_data, ztau_sample):
61 |     '''Converts normalized tau samples back to original scale SD.
62 | 
63 |     :Arguments:
64 |     y_data: original predicted data list.
65 |     ztau_sample: normalized tau parameter samples.
66 | 
67 |     '''
68 |     z_sigma = 1 / np.sqrt(ztau_sample)
69 |     y_sd = np.sqrt(np.var(y_data))
70 |     sigma = z_sigma * y_sd
71 |     return sigma
72 | 
73 | def convert_sigma(y_data, zsigma_sample):
74 |     '''Converts normalized sigma samples back to original scale SD.
75 | 
76 |     :Arguments:
77 |     y_data: original predicted data list.
78 |     zsigma_sample: normalized sigma parameter samples.
79 | 
80 |     '''
81 |     y_sd = np.sqrt(np.var(y_data))
82 |     sigma = zsigma_sample * y_sd
83 |     return sigma
84 | 
85 | 
86 | def convert_baseline(a0_sample, a_sample, x_levels, y_data):
87 |     '''Convert normalized ANOVA baseline back to original scale.
88 | 
89 |     :Arguments:
90 |     a0_sample: normalized baseline samples.
91 |     a_sample: normalized deflection samples.
92 |     x_levels: integer, levels of categorical variable.
93 |     y_data: original predicted data list.
94 | 
95 |     '''
96 |     m_sample = a0_sample.repeat(x_levels).reshape(len(a0_sample), x_levels) \
97 |         + a_sample
98 |     b0_sample = m_sample.mean(axis=1)
99 |     b0_sample = b0_sample * np.sqrt(np.var(y_data)) + np.mean(y_data)
100 |     return b0_sample
101 | 
102 | 
103 | def convert_deflection(a0_sample, a_sample, x_levels, y_data):
104 |     '''Convert normalized ANOVA deflections back to original scale.
105 | 
106 |     :Arguments:
107 |     a0_sample: normalized baseline samples.
108 |     a_sample: normalized deflection samples.
109 |     x_levels: integer, levels of categorical variable.
110 |     y_data: original predicted data list.
111 | 
112 |     '''
113 |     m_sample = a0_sample.repeat(x_levels).reshape(len(a0_sample), x_levels) \
114 |         + a_sample
115 |     b0_sample = m_sample.mean(axis=1)
116 |     b_sample = (m_sample -
117 |                 b0_sample.repeat(x_levels).reshape(len(b0_sample), x_levels))
118 |     b_sample = b_sample * np.sqrt(np.var(y_data))
119 |     return b_sample
120 | 
--------------------------------------------------------------------------------
/BayesDataAnalysisWithPymc/SimpleLinearRegressionPyMC.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''Hierarchical Model for estimation of simple linear regression
3 | parameters via MCMC.
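Both variables are standardized before sampling, and the intercept, slope
and scale samples are converted back to the original data scale afterwards
(see normalize.py).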
4 | Python (PyMC) adaptation of the R code from "Doing Bayesian Data Analysis", 5 | by John K. Krushcke. 6 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 7 | 8 | ''' 9 | from __future__ import division 10 | 11 | import pymc 12 | import numpy as np 13 | from matplotlib import pyplot as plot 14 | from plot_post import plot_post 15 | from normalize import (normalize, convert_intercept, 16 | convert_slope, convert_tau_sigma) 17 | from os import path 18 | 19 | # Code to find the data path. 20 | 21 | scr_dir = path.dirname(__file__) 22 | file_name = 'McIntyre1994data.csv' 23 | comp_dir = path.join(scr_dir, 'Data', file_name) 24 | 25 | # So, let's be lazy: the data are from McIntyre cigarette weight. 26 | # Use numpy to load the data we want directly in the appropriate variables. 27 | 28 | y, x = np.genfromtxt(comp_dir, delimiter=',', 29 | skip_header=1, usecols=(1, 3), unpack=True) 30 | 31 | # Let's try normalizing, as suggested by Krushcke. 32 | 33 | zy = normalize(y) 34 | zx = normalize(x) 35 | 36 | # Define the priors for the model. 37 | # First, normal priors for the slope and intercept. 38 | 39 | beta0 = pymc.Normal('b0', 0.0, 1.0e-10) 40 | beta1 = pymc.Normal('b1', 0.0, 1.0e-10) 41 | 42 | # Then, gamma and uniform prior for precision and DoF. 43 | # Krushcke suggests the use of a Student's t distribution for the likelihood. 44 | # It makes the estimation more robust in the presence of outliers. 45 | # We will use Krushcke's DoF transformation using a gain constant. 46 | 47 | tau = pymc.Gamma('tau', 0.01, 0.01) 48 | udf = pymc.Uniform('udf', 0.0, 1.0) 49 | tdf_gain = 1 50 | 51 | 52 | @pymc.deterministic 53 | def tdf(udf=udf, tdf_gain=tdf_gain): 54 | return 1 - tdf_gain * np.log(1 - udf) 55 | 56 | # Defining the linear relationship between variables. 57 | 58 | 59 | @pymc.deterministic 60 | def mu(beta0=beta0, beta1=beta1, x=zx): 61 | mu = beta0 + beta1 * x 62 | return mu 63 | 64 | 65 | # Finally, the likelihood using Student's t distribution. 66 | 67 | like = pymc.NoncentralT('like', mu=mu, lam=tau, nu=tdf, 68 | value=zy, observed=True) 69 | 70 | # For those who want a more traditional linear model: 71 | #like = pymc.Normal('like', mu=mu, tau=tau, value=zy, observed=True) 72 | 73 | # The model is ready! Sampling code below. 74 | 75 | model = pymc.Model([beta0, beta1, tau, tdf]) 76 | fit = pymc.MAP(model) 77 | fit.fit() 78 | mcmc = pymc.MCMC(model) 79 | mcmc.sample(iter=100000, burn=50000, thin=10) 80 | 81 | # Collect the sample values for the parameters. 82 | 83 | z0_sample = mcmc.trace('b0')[:] 84 | z1_sample = mcmc.trace('b1')[:] 85 | ztau_sample = mcmc.trace('tau')[:] 86 | tdf_sample = mcmc.trace('tdf')[:] 87 | 88 | # Convert the data back to scale. 89 | 90 | b0_sample = convert_intercept(x, y, z0_sample, z1_sample) 91 | b1_sample = convert_slope(x, y, z1_sample) 92 | sigma_sample = convert_tau_sigma(y, ztau_sample) 93 | 94 | # Plot the results 95 | 96 | plot.figure(figsize=(8.0, 8.0)) 97 | 98 | plot.subplot(221) 99 | plot_post(b0_sample, title=r'$\beta_0$ posterior') 100 | 101 | plot.subplot(222) 102 | plot_post(b1_sample, title=r'$\beta_1$ posterior') 103 | 104 | plot.subplot(223) 105 | plot_post(sigma_sample, title=r'$\sigma$ posterior') 106 | 107 | plot.subplot(224) 108 | plot_post(tdf_sample, title=r'tDF posterior') 109 | 110 | plot.subplots_adjust(wspace=0.2, hspace=0.2) 111 | 112 | # Plot the data with some credible regression lines. 
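# (Roughly 50 evenly spaced posterior draws of the intercept/slope pair are
# drawn as straight lines over the scatter plot, giving a quick visual
# impression of the uncertainty around the regression line.)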
113 | 114 | plot.figure(figsize=(8.0, 8.0)) 115 | 116 | plot.scatter(x, y, c='k', s=60) 117 | plot.title('Data points with credible regression lines') 118 | 119 | x1 = plot.axis()[0] 120 | x2 = plot.axis()[1] 121 | 122 | plot.autoscale(enable=False) 123 | 124 | for line in range(0, len(b1_sample), len(b1_sample) // 50): 125 | plot.plot([x1, x2], [b0_sample[line] + b1_sample[line] * x1, 126 | b0_sample[line] + b1_sample[line] * x2], 127 | c='#348ABD', lw=1) 128 | 129 | plot.show() 130 | -------------------------------------------------------------------------------- /BayesDataAnalysisWithPymc/ANOVAOnewayPyMC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Hierarchical Model for estimation of oneway ANOVA parameters via MCMC. 3 | Python (PyMC) adaptation of the R code from "Doing Bayesian Data Analysis", 4 | by John K. Krushcke. 5 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 6 | 7 | ''' 8 | from __future__ import division 9 | 10 | import pymc 11 | import numpy as np 12 | from matplotlib import pyplot as plot 13 | from plot_post import plot_post 14 | from normalize import (normalize, convert_baseline, convert_deflection, 15 | convert_sigma) 16 | from math import ceil 17 | from os import path 18 | 19 | # Code to find the data path. 20 | 21 | scr_dir = path.dirname(__file__) 22 | file_name = 'McDonaldSK1991data.txt' 23 | comp_dir = path.join(scr_dir, 'Data', file_name) 24 | 25 | # Using data from the book for easier comparison. 26 | # Data from McDonald (1991) study about geographical location and muscle size 27 | # in mussels. 28 | # Again we use Numpy to assign the data to variables. 29 | 30 | x, y = np.genfromtxt(comp_dir, delimiter=' ', 31 | skip_header=19, usecols=(0, 1), unpack=True) 32 | 33 | # Define the contrasts. 34 | # TODO: use dictionary for easier retrieval of contrast description. 35 | 36 | contrasts = np.array(((-1/3, -1/3, 1/2, -1/3, 1/2), 37 | (1, -1, 0, 0, 0), 38 | (-1/2, -1/2, 1, 0, 0), 39 | (-1/2, -1/2, 1/2, 1/2, 0), 40 | (1/3, 1/3, 1/3, -1, 0), 41 | (-1/4, -1/4, -1/4, -1/4, 1), 42 | (1/3, 1/3, 1/3, -1/2, -1/2), 43 | (0, 0, 0, -1, 1))) 44 | # Random data, for test purposes: 45 | 46 | # y_truesd = 4.0 47 | # a0_true = 100 48 | # atrue = [15, -10, -7, 8, -6] 49 | 50 | #x = [1] * 3 + [2] * 4 + [3] * 3 + [4] * 5 + [5] * 3 51 | #y = [a0_true + atrue[i - 1] + np.random.normal(0, y_truesd) for i in x] 52 | 53 | 54 | # Normalize the data for better MCMC performance. 55 | # And define the total number of levels in our categorical variable. 56 | 57 | zy = normalize(y) 58 | x_levels = len(set(x)) 59 | y_mean = np.mean(y) 60 | y_sd = np.sqrt(np.var(y)) 61 | 62 | # Begin the definition of the model. 63 | # First, we define a Gamma distribution for the precision of 64 | # the deflection parameters. 65 | 66 | a_sd = pymc.Gamma('a_sd', 1.01005, 0.1005) 67 | 68 | @pymc.deterministic 69 | def a_tau(a_sd=a_sd): 70 | return 1.0 / a_sd**2 71 | 72 | # Then we define a normal prior on the baseline and deflection parameters. 73 | 74 | a0 = pymc.Normal('a0', mu=0.0, tau=0.001) 75 | a = pymc.Normal('a', mu=0.0, tau=a_tau, size=x_levels) 76 | 77 | # Almost there! We still need to set the prior on the data variance. 78 | 79 | sigma = pymc.Uniform('sigma', 0, 10) 80 | 81 | 82 | @pymc.deterministic 83 | def tau(sigma=sigma): 84 | return 1.0 / sigma**2 85 | 86 | # The priors are all set! Now we can define the linear model. 87 | # Maybe it can be clearly defined using the 'Lambda()' class. 
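# (An untested sketch of that alternative, assuming x is the NumPy array of
# 1-based group codes loaded above:
#
#     mu = pymc.Lambda('mu', lambda a0=a0, a=a: a0 + a[x.astype(int) - 1])
# )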
88 | # But we will use a 'for' loop for easier readability. 89 | 90 | mu = [] 91 | for i in x: 92 | mu.append(a0 + a[int(i - 1)]) 93 | 94 | # And the likelihood. 95 | 96 | like_y = pymc.Normal('like_y', mu=mu, tau=tau, value=zy, observed=True) 97 | 98 | # Now we build the model, set the MAP and sample the posterior distribution. 99 | 100 | model = pymc.Model([like_y, a0, a, sigma, a_tau, a_sd]) 101 | map_ = pymc.MAP(model) 102 | map_.fit() 103 | mcmc = pymc.MCMC(model) 104 | mcmc.sample(iter=80000, burn=20000, thin=10) 105 | 106 | # Extract the samples. 107 | 108 | a0_sample = mcmc.trace('a0')[:] 109 | a_sample = mcmc.trace('a')[:] 110 | sigma_sample = mcmc.trace('sigma')[:] 111 | a_sd_sample = mcmc.trace('a_sd')[:] 112 | 113 | # Convert the values. 114 | 115 | b0_sample = convert_baseline(a0_sample, a_sample, x_levels, y) 116 | b_sample = convert_deflection(a0_sample, a_sample, x_levels, y) 117 | 118 | sig_sample = convert_sigma(y, sigma_sample) 119 | b_sd_sample = convert_sigma(y, a_sd_sample) 120 | 121 | # Plot the results. 122 | 123 | plot.figure(figsize=(6.0, 4.0)) 124 | 125 | plot.subplot(211) 126 | plot_post(sig_sample, title=r'$\sigma$ (cell SD) posterior') 127 | 128 | plot.subplot(212) 129 | plot_post(b_sd_sample, title=r'$aSD$ posterior') 130 | 131 | plot.subplots_adjust(wspace=0.2, hspace=0.5) 132 | 133 | plot.figure(figsize=(18.0, 3.0)) 134 | total_subplot = len(b_sample[0, :]) 135 | plot_n = 100 + (total_subplot + 1) * 10 + 1 136 | 137 | plot.subplot(plot_n) 138 | plot_post(b0_sample, title=r'$\beta_0$ posterior') 139 | 140 | for i in range(total_subplot): 141 | plot.subplot(plot_n + i + 1) 142 | plot_post(b_sample[:, i], title=r'$\beta_{1%i}$ posterior' % (i + 1)) 143 | 144 | plot.subplots_adjust(wspace=0.2) 145 | 146 | n_cons = len(contrasts) 147 | if n_cons > 0: 148 | plot_per_rows = 5 149 | plot_rows = ceil(n_cons / plot_per_rows) 150 | plot_cols = ceil(n_cons / plot_rows) 151 | 152 | plot.figure(figsize=(3.75 * plot_cols, 2.5 * plot_rows)) 153 | 154 | for i in range(n_cons): 155 | contrast = contrasts[i, :] 156 | comp = np.dot(b_sample, contrast) 157 | plot.subplot(plot_rows, plot_cols, i + 1) 158 | plot_post(comp, title='Contrast %i' % (i + 1), comp=0.0) 159 | 160 | plot.subplots_adjust(wspace=0.2, hspace=0.5) 161 | plot.show() 162 | --------------------------------------------------------------------------------